diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 447387a958655b0e7e71002bb41018918fb81bdc..9ff3069504e05b1b791fc81351a53ea7be527008 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -701,8 +701,6 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc, static void rm_redo_error_callback(void *arg); static int get_sync_bit(int method); -static void fsync_pgdata(char *datadir); - /* * Insert an XLOG record having the specified RMID and info bytes, * with the body of the record being the data chunk(s) described by @@ -6418,18 +6416,6 @@ StartupXLOG(void) (errmsg("database system was interrupted; last known up at %s", str_time(ControlFile->time)))); - /* - * If we previously crashed, there might be data which we had written, - * intending to fsync it, but which we had not actually fsync'd yet. - * Therefore, a power failure in the near future might cause earlier - * unflushed writes to be lost, even though more recent data written to - * disk from here on would be persisted. To avoid that, fsync the entire - * data directory. - */ - if (ControlFile->state != DB_SHUTDOWNED && - ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) - fsync_pgdata(data_directory); - /* This is just to allow attaching to startup process with a debugger */ #ifdef XLOG_REPLAY_DELAY if (ControlFile->state != DB_SHUTDOWNED) @@ -6453,6 +6439,18 @@ StartupXLOG(void) */ RelationCacheInitFileRemove(); + /* + * If we previously crashed, there might be data which we had written, + * intending to fsync it, but which we had not actually fsync'd yet. + * Therefore, a power failure in the near future might cause earlier + * unflushed writes to be lost, even though more recent data written to + * disk from here on would be persisted. To avoid that, fsync the entire + * data directory. + */ + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + SyncDataDirectory(); + /* * Initialize on the assumption we want to recover to the same timeline * that's active according to pg_control. @@ -11027,31 +11025,3 @@ SetWalWriterSleeping(bool sleeping) xlogctl->WalWriterSleeping = sleeping; SpinLockRelease(&xlogctl->info_lck); } - -/* - * Issue fsync recursively on PGDATA and all its contents. - */ -static void -fsync_pgdata(char *datadir) -{ - if (!enableFsync) - return; - - /* - * If possible, hint to the kernel that we're soon going to fsync - * the data directory and its contents. - */ -#if defined(HAVE_SYNC_FILE_RANGE) || \ - (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)) - walkdir(datadir, pre_sync_fname); -#endif - - /* - * Now we do the fsync()s in the same order. - * - * It's important to fsync the destination directory itself as individual - * file fsyncs don't guarantee that the directory entry for the file is - * synced. - */ - walkdir(datadir, fsync_fname); -} diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index fea0998e85882f1ab47aefca9be2504798bde50b..03788f61c2873b2c07577d115ff67e6fd4c756d3 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -60,6 +60,11 @@ #include "utils/resowner.h" +/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#define PG_FLUSH_DATA_WORKS 1 +#endif + /* * We must leave some file descriptors free for system(), the dynamic loader, * and other code that tries to open files without consulting fd.c. This @@ -261,6 +266,8 @@ static int FileAccess(File file); static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); static bool reserveAllocatedDesc(void); static int FreeDesc(AllocateDesc *desc); +static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel); + static void AtProcExit_Files(int code, Datum arg); static void CleanupTempFiles(bool isProcExit); static void RemovePgTempFilesInDir(const char *tmpdirname); @@ -268,6 +275,15 @@ static void RemovePgTempRelationFiles(const char *tsdirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static bool looks_like_temp_rel_name(const char *name); +static void walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel); +#ifdef PG_FLUSH_DATA_WORKS +static void pre_sync_fname(const char *fname, bool isdir, int elevel); +#endif +static void fsync_fname_ext(const char *fname, bool isdir, int elevel); + /* * pg_fsync --- do fsync with or without writethrough @@ -343,15 +359,24 @@ pg_fdatasync(int fd) * pg_flush_data --- advise OS that the data described won't be needed soon * * Not all platforms have posix_fadvise; treat as noop if not available. + * Also, treat as no-op if enableFsync is off; this is because the call isn't + * free, and some platforms such as Linux will actually block the requestor + * until the write is scheduled. */ int pg_flush_data(int fd, off_t offset, off_t amount) { +#ifdef PG_FLUSH_DATA_WORKS + if (enableFsync) + { #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); + return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); #else - return 0; +#error PG_FLUSH_DATA_WORKS should not have been defined #endif + } +#endif + return 0; } @@ -1787,15 +1812,28 @@ TryAgain: */ struct dirent * ReadDir(DIR *dir, const char *dirname) +{ + return ReadDirExtended(dir, dirname, ERROR); +} + +/* + * Alternate version that allows caller to specify the elevel for any + * error report. If elevel < ERROR, returns NULL on any error. + */ +static struct dirent * +ReadDirExtended(DIR *dir, const char *dirname, int elevel) { struct dirent *dent; /* Give a generic message for AllocateDir failure, if caller didn't */ if (dir == NULL) - ereport(ERROR, + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", dirname))); + return NULL; + } errno = 0; if ((dent = readdir(dir)) != NULL) @@ -1808,7 +1846,7 @@ ReadDir(DIR *dir, const char *dirname) #endif if (errno) - ereport(ERROR, + ereport(elevel, (errcode_for_file_access(), errmsg("could not read directory \"%s\": %m", dirname))); @@ -2266,54 +2304,121 @@ looks_like_temp_rel_name(const char *name) return true; } + /* - * Hint to the OS that it should get ready to fsync() this file. + * Issue fsync recursively on PGDATA and all its contents. + * + * We fsync regular files and directories wherever they are, but we + * follow symlinks only for pg_xlog and immediately under pg_tblspc. + * Other symlinks are presumed to point at files we're not responsible + * for fsyncing, and might not have privileges to write at all. + * + * Errors are logged but not considered fatal; that's because this is used + * only during database startup, to deal with the possibility that there are + * issued-but-unsynced writes pending against the data directory. We want to + * ensure that such writes reach disk before anything that's done in the new + * run. However, aborting on error would result in failure to start for + * harmless cases such as read-only files in the data directory, and that's + * not good either. * - * Adapted from pre_sync_fname in initdb.c + * Note we assume we're chdir'd into PGDATA to begin with. */ void -pre_sync_fname(char *fname, bool isdir) +SyncDataDirectory(void) { - int fd; + bool xlog_is_symlink; - fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0); + /* We can skip this whole thing if fsync is disabled. */ + if (!enableFsync) + return; /* - * Some OSs don't allow us to open directories at all (Windows returns - * EACCES) + * If pg_xlog is a symlink, we'll need to recurse into it separately, + * because the first walkdir below will ignore it. */ - if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) - return; + xlog_is_symlink = false; - if (fd < 0) - ereport(FATAL, - (errmsg("could not open file \"%s\": %m", - fname))); +#ifndef WIN32 + { + struct stat st; - pg_flush_data(fd, 0, 0); + if (lstat("pg_xlog", &st) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + "pg_xlog"))); + else if (S_ISLNK(st.st_mode)) + xlog_is_symlink = true; + } +#else + if (pgwin32_is_junction("pg_xlog")) + xlog_is_symlink = true; +#endif - close(fd); + /* + * If possible, hint to the kernel that we're soon going to fsync the data + * directory and its contents. Errors in this step are even less + * interesting than normal, so log them only at DEBUG1. + */ +#ifdef PG_FLUSH_DATA_WORKS + walkdir(".", pre_sync_fname, false, DEBUG1); + if (xlog_is_symlink) + walkdir("pg_xlog", pre_sync_fname, false, DEBUG1); + walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1); +#endif + + /* + * Now we do the fsync()s in the same order. + * + * The main call ignores symlinks, so in addition to specially processing + * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with + * process_symlinks = true. Note that if there are any plain directories + * in pg_tblspc, they'll get fsync'd twice. That's not an expected case + * so we don't worry about optimizing it. + */ + walkdir(".", fsync_fname_ext, false, LOG); + if (xlog_is_symlink) + walkdir("pg_xlog", fsync_fname_ext, false, LOG); + walkdir("pg_tblspc", fsync_fname_ext, true, LOG); } /* * walkdir: recursively walk a directory, applying the action to each - * regular file and directory (including the named directory itself) - * and following symbolic links. + * regular file and directory (including the named directory itself). + * + * If process_symlinks is true, the action and recursion are also applied + * to regular files and directories that are pointed to by symlinks in the + * given directory; otherwise symlinks are ignored. Symlinks are always + * ignored in subdirectories, ie we intentionally don't pass down the + * process_symlinks flag to recursive calls. * - * NB: There is another version of walkdir in initdb.c, but that version - * behaves differently with respect to symbolic links. Caveat emptor! + * Errors are reported at level elevel, which might be ERROR or less. + * + * See also walkdir in initdb.c, which is a frontend version of this logic. */ -void -walkdir(char *path, void (*action) (char *fname, bool isdir)) +static void +walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel) { DIR *dir; struct dirent *de; dir = AllocateDir(path); - while ((de = ReadDir(dir, path)) != NULL) + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", path))); + return; + } + + while ((de = ReadDirExtended(dir, path, elevel)) != NULL) { char subpath[MAXPGPATH]; struct stat fst; + int sret; CHECK_FOR_INTERRUPTS(); @@ -2323,60 +2428,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir)) snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); - if (lstat(subpath, &fst) < 0) - ereport(ERROR, + if (process_symlinks) + sret = stat(subpath, &fst); + else + sret = lstat(subpath, &fst); + + if (sret < 0) + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", subpath))); + continue; + } if (S_ISREG(fst.st_mode)) - (*action) (subpath, false); + (*action) (subpath, false, elevel); else if (S_ISDIR(fst.st_mode)) - walkdir(subpath, action); -#ifndef WIN32 - else if (S_ISLNK(fst.st_mode)) -#else - else if (pgwin32_is_junction(subpath)) + walkdir(subpath, action, false, elevel); + } + + FreeDir(dir); /* we ignore any error here */ + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. + */ + (*action) (path, true, elevel); +} + + +/* + * Hint to the OS that it should get ready to fsync() this file. + * + * Ignores errors trying to open unreadable files, and logs other errors at a + * caller-specified level. + */ +#ifdef PG_FLUSH_DATA_WORKS + +static void +pre_sync_fname(const char *fname, bool isdir, int elevel) +{ + int fd; + + fd = BasicOpenFile((char *) fname, O_RDONLY | PG_BINARY, 0); + + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + +#ifdef ETXTBSY + if (errno == ETXTBSY) + return; #endif - { -#if defined(HAVE_READLINK) || defined(WIN32) - char linkpath[MAXPGPATH]; - int len; - struct stat lst; - len = readlink(subpath, linkpath, sizeof(linkpath)-1); - if (len < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read symbolic link \"%s\": %m", - subpath))); + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; + } - if (len >= sizeof(linkpath)-1) - ereport(ERROR, - (errmsg("symbolic link \"%s\" target is too long", - subpath))); + (void) pg_flush_data(fd, 0, 0); - linkpath[len] = '\0'; + (void) close(fd); +} - if (lstat(linkpath, &lst) == 0) - { - if (S_ISREG(lst.st_mode)) - (*action) (linkpath, false); - else if (S_ISDIR(lst.st_mode)) - walkdir(subpath, action); - } - else if (errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", linkpath))); -#else - ereport(WARNING, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("this platform does not support symbolic links; ignoring \"%s\"", - subpath))); +#endif /* PG_FLUSH_DATA_WORKS */ + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * Ignores errors trying to open unreadable files, or trying to fsync + * directories on systems where that isn't allowed/required, and logs other + * errors at a caller-specified level. + */ +static void +fsync_fname_ext(const char *fname, bool isdir, int elevel) +{ + int fd; + int flags; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here. Using O_RDWR will cause us to fail to fsync files that are + * not writable by our userid, but we assume that's OK. + */ + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + /* + * Open the file, silently ignoring errors about unreadable files (or + * unsupported operations, e.g. opening a directory under Windows), and + * logging others. + */ + fd = BasicOpenFile((char *) fname, flags, 0); + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + +#ifdef ETXTBSY + if (errno == ETXTBSY) + return; #endif - } + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; } - FreeDir(dir); - (*action) (path, true); + returncode = pg_fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be logged. + */ + if (returncode != 0 && !(isdir && errno == EBADF)) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); + + (void) close(fd); } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 8a37ea9d1a2939cde28f03559ce4829674c57dd7..bb2957989db551d9e8f0882d131db5f2892bae04 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -105,8 +105,7 @@ extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); extern void fsync_fname(char *fname, bool isdir); -extern void pre_sync_fname(char *fname, bool isdir); -extern void walkdir(char *path, void (*action) (char *fname, bool isdir)); +extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */ #define PG_TEMP_FILES_DIR "pgsql_tmp"