From aa8377e64ff0ddac5342979d0afb050eb178ff8a Mon Sep 17 00:00:00 2001 From: Tom Lane <tgl@sss.pgh.pa.us> Date: Thu, 28 May 2015 17:33:03 -0400 Subject: [PATCH] Fix fsync-at-startup code to not treat errors as fatal. Commit 2ce439f3379aed857517c8ce207485655000fc8e introduced a rather serious regression, namely that if its scan of the data directory came across any un-fsync-able files, it would fail and thereby prevent database startup. Worse yet, symlinks to such files also caused the problem, which meant that crash restart was guaranteed to fail on certain common installations such as older Debian. After discussion, we agreed that (1) failure to start is worse than any consequence of not fsync'ing is likely to be, therefore treat all errors in this code as nonfatal; (2) we should not chase symlinks other than those that are expected to exist, namely pg_xlog/ and tablespace links under pg_tblspc/. The latter restriction avoids possibly fsync'ing a much larger part of the filesystem than intended, if the user has left random symlinks hanging about in the data directory. This commit takes care of that and also does some code beautification, mainly moving the relevant code into fd.c, which seems a much better place for it than xlog.c, and making sure that the conditional compilation for the pre_sync_fname pass has something to do with whether pg_flush_data works. I also relocated the call site in xlog.c down a few lines; it seems a bit silly to be doing this before ValidateXLOGDirectoryStructure(). The similar logic in initdb.c ought to be made to match this, but that change is noncritical and will be dealt with separately. Back-patch to all active branches, like the prior commit. Abhijit Menon-Sen and Tom Lane --- src/backend/access/transam/xlog.c | 54 ++--- src/backend/storage/file/fd.c | 315 +++++++++++++++++++++++------- src/include/storage/fd.h | 3 +- 3 files changed, 259 insertions(+), 113 deletions(-) diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 447387a9586..9ff3069504e 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -701,8 +701,6 @@ static bool read_backup_label(XLogRecPtr *checkPointLoc, static void rm_redo_error_callback(void *arg); static int get_sync_bit(int method); -static void fsync_pgdata(char *datadir); - /* * Insert an XLOG record having the specified RMID and info bytes, * with the body of the record being the data chunk(s) described by @@ -6418,18 +6416,6 @@ StartupXLOG(void) (errmsg("database system was interrupted; last known up at %s", str_time(ControlFile->time)))); - /* - * If we previously crashed, there might be data which we had written, - * intending to fsync it, but which we had not actually fsync'd yet. - * Therefore, a power failure in the near future might cause earlier - * unflushed writes to be lost, even though more recent data written to - * disk from here on would be persisted. To avoid that, fsync the entire - * data directory. - */ - if (ControlFile->state != DB_SHUTDOWNED && - ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) - fsync_pgdata(data_directory); - /* This is just to allow attaching to startup process with a debugger */ #ifdef XLOG_REPLAY_DELAY if (ControlFile->state != DB_SHUTDOWNED) @@ -6453,6 +6439,18 @@ StartupXLOG(void) */ RelationCacheInitFileRemove(); + /* + * If we previously crashed, there might be data which we had written, + * intending to fsync it, but which we had not actually fsync'd yet. + * Therefore, a power failure in the near future might cause earlier + * unflushed writes to be lost, even though more recent data written to + * disk from here on would be persisted. To avoid that, fsync the entire + * data directory. + */ + if (ControlFile->state != DB_SHUTDOWNED && + ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY) + SyncDataDirectory(); + /* * Initialize on the assumption we want to recover to the same timeline * that's active according to pg_control. @@ -11027,31 +11025,3 @@ SetWalWriterSleeping(bool sleeping) xlogctl->WalWriterSleeping = sleeping; SpinLockRelease(&xlogctl->info_lck); } - -/* - * Issue fsync recursively on PGDATA and all its contents. - */ -static void -fsync_pgdata(char *datadir) -{ - if (!enableFsync) - return; - - /* - * If possible, hint to the kernel that we're soon going to fsync - * the data directory and its contents. - */ -#if defined(HAVE_SYNC_FILE_RANGE) || \ - (defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)) - walkdir(datadir, pre_sync_fname); -#endif - - /* - * Now we do the fsync()s in the same order. - * - * It's important to fsync the destination directory itself as individual - * file fsyncs don't guarantee that the directory entry for the file is - * synced. - */ - walkdir(datadir, fsync_fname); -} diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index fea0998e858..03788f61c28 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -60,6 +60,11 @@ #include "utils/resowner.h" +/* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) +#define PG_FLUSH_DATA_WORKS 1 +#endif + /* * We must leave some file descriptors free for system(), the dynamic loader, * and other code that tries to open files without consulting fd.c. This @@ -261,6 +266,8 @@ static int FileAccess(File file); static File OpenTemporaryFileInTablespace(Oid tblspcOid, bool rejectError); static bool reserveAllocatedDesc(void); static int FreeDesc(AllocateDesc *desc); +static struct dirent *ReadDirExtended(DIR *dir, const char *dirname, int elevel); + static void AtProcExit_Files(int code, Datum arg); static void CleanupTempFiles(bool isProcExit); static void RemovePgTempFilesInDir(const char *tmpdirname); @@ -268,6 +275,15 @@ static void RemovePgTempRelationFiles(const char *tsdirname); static void RemovePgTempRelationFilesInDbspace(const char *dbspacedirname); static bool looks_like_temp_rel_name(const char *name); +static void walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel); +#ifdef PG_FLUSH_DATA_WORKS +static void pre_sync_fname(const char *fname, bool isdir, int elevel); +#endif +static void fsync_fname_ext(const char *fname, bool isdir, int elevel); + /* * pg_fsync --- do fsync with or without writethrough @@ -343,15 +359,24 @@ pg_fdatasync(int fd) * pg_flush_data --- advise OS that the data described won't be needed soon * * Not all platforms have posix_fadvise; treat as noop if not available. + * Also, treat as no-op if enableFsync is off; this is because the call isn't + * free, and some platforms such as Linux will actually block the requestor + * until the write is scheduled. */ int pg_flush_data(int fd, off_t offset, off_t amount) { +#ifdef PG_FLUSH_DATA_WORKS + if (enableFsync) + { #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); + return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); #else - return 0; +#error PG_FLUSH_DATA_WORKS should not have been defined #endif + } +#endif + return 0; } @@ -1787,15 +1812,28 @@ TryAgain: */ struct dirent * ReadDir(DIR *dir, const char *dirname) +{ + return ReadDirExtended(dir, dirname, ERROR); +} + +/* + * Alternate version that allows caller to specify the elevel for any + * error report. If elevel < ERROR, returns NULL on any error. + */ +static struct dirent * +ReadDirExtended(DIR *dir, const char *dirname, int elevel) { struct dirent *dent; /* Give a generic message for AllocateDir failure, if caller didn't */ if (dir == NULL) - ereport(ERROR, + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not open directory \"%s\": %m", dirname))); + return NULL; + } errno = 0; if ((dent = readdir(dir)) != NULL) @@ -1808,7 +1846,7 @@ ReadDir(DIR *dir, const char *dirname) #endif if (errno) - ereport(ERROR, + ereport(elevel, (errcode_for_file_access(), errmsg("could not read directory \"%s\": %m", dirname))); @@ -2266,54 +2304,121 @@ looks_like_temp_rel_name(const char *name) return true; } + /* - * Hint to the OS that it should get ready to fsync() this file. + * Issue fsync recursively on PGDATA and all its contents. + * + * We fsync regular files and directories wherever they are, but we + * follow symlinks only for pg_xlog and immediately under pg_tblspc. + * Other symlinks are presumed to point at files we're not responsible + * for fsyncing, and might not have privileges to write at all. + * + * Errors are logged but not considered fatal; that's because this is used + * only during database startup, to deal with the possibility that there are + * issued-but-unsynced writes pending against the data directory. We want to + * ensure that such writes reach disk before anything that's done in the new + * run. However, aborting on error would result in failure to start for + * harmless cases such as read-only files in the data directory, and that's + * not good either. * - * Adapted from pre_sync_fname in initdb.c + * Note we assume we're chdir'd into PGDATA to begin with. */ void -pre_sync_fname(char *fname, bool isdir) +SyncDataDirectory(void) { - int fd; + bool xlog_is_symlink; - fd = BasicOpenFile(fname, O_RDONLY | PG_BINARY, 0); + /* We can skip this whole thing if fsync is disabled. */ + if (!enableFsync) + return; /* - * Some OSs don't allow us to open directories at all (Windows returns - * EACCES) + * If pg_xlog is a symlink, we'll need to recurse into it separately, + * because the first walkdir below will ignore it. */ - if (fd < 0 && isdir && (errno == EISDIR || errno == EACCES)) - return; + xlog_is_symlink = false; - if (fd < 0) - ereport(FATAL, - (errmsg("could not open file \"%s\": %m", - fname))); +#ifndef WIN32 + { + struct stat st; - pg_flush_data(fd, 0, 0); + if (lstat("pg_xlog", &st) < 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat file \"%s\": %m", + "pg_xlog"))); + else if (S_ISLNK(st.st_mode)) + xlog_is_symlink = true; + } +#else + if (pgwin32_is_junction("pg_xlog")) + xlog_is_symlink = true; +#endif - close(fd); + /* + * If possible, hint to the kernel that we're soon going to fsync the data + * directory and its contents. Errors in this step are even less + * interesting than normal, so log them only at DEBUG1. + */ +#ifdef PG_FLUSH_DATA_WORKS + walkdir(".", pre_sync_fname, false, DEBUG1); + if (xlog_is_symlink) + walkdir("pg_xlog", pre_sync_fname, false, DEBUG1); + walkdir("pg_tblspc", pre_sync_fname, true, DEBUG1); +#endif + + /* + * Now we do the fsync()s in the same order. + * + * The main call ignores symlinks, so in addition to specially processing + * pg_xlog if it's a symlink, pg_tblspc has to be visited separately with + * process_symlinks = true. Note that if there are any plain directories + * in pg_tblspc, they'll get fsync'd twice. That's not an expected case + * so we don't worry about optimizing it. + */ + walkdir(".", fsync_fname_ext, false, LOG); + if (xlog_is_symlink) + walkdir("pg_xlog", fsync_fname_ext, false, LOG); + walkdir("pg_tblspc", fsync_fname_ext, true, LOG); } /* * walkdir: recursively walk a directory, applying the action to each - * regular file and directory (including the named directory itself) - * and following symbolic links. + * regular file and directory (including the named directory itself). + * + * If process_symlinks is true, the action and recursion are also applied + * to regular files and directories that are pointed to by symlinks in the + * given directory; otherwise symlinks are ignored. Symlinks are always + * ignored in subdirectories, ie we intentionally don't pass down the + * process_symlinks flag to recursive calls. * - * NB: There is another version of walkdir in initdb.c, but that version - * behaves differently with respect to symbolic links. Caveat emptor! + * Errors are reported at level elevel, which might be ERROR or less. + * + * See also walkdir in initdb.c, which is a frontend version of this logic. */ -void -walkdir(char *path, void (*action) (char *fname, bool isdir)) +static void +walkdir(const char *path, + void (*action) (const char *fname, bool isdir, int elevel), + bool process_symlinks, + int elevel) { DIR *dir; struct dirent *de; dir = AllocateDir(path); - while ((de = ReadDir(dir, path)) != NULL) + if (dir == NULL) + { + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open directory \"%s\": %m", path))); + return; + } + + while ((de = ReadDirExtended(dir, path, elevel)) != NULL) { char subpath[MAXPGPATH]; struct stat fst; + int sret; CHECK_FOR_INTERRUPTS(); @@ -2323,60 +2428,132 @@ walkdir(char *path, void (*action) (char *fname, bool isdir)) snprintf(subpath, MAXPGPATH, "%s/%s", path, de->d_name); - if (lstat(subpath, &fst) < 0) - ereport(ERROR, + if (process_symlinks) + sret = stat(subpath, &fst); + else + sret = lstat(subpath, &fst); + + if (sret < 0) + { + ereport(elevel, (errcode_for_file_access(), errmsg("could not stat file \"%s\": %m", subpath))); + continue; + } if (S_ISREG(fst.st_mode)) - (*action) (subpath, false); + (*action) (subpath, false, elevel); else if (S_ISDIR(fst.st_mode)) - walkdir(subpath, action); -#ifndef WIN32 - else if (S_ISLNK(fst.st_mode)) -#else - else if (pgwin32_is_junction(subpath)) + walkdir(subpath, action, false, elevel); + } + + FreeDir(dir); /* we ignore any error here */ + + /* + * It's important to fsync the destination directory itself as individual + * file fsyncs don't guarantee that the directory entry for the file is + * synced. + */ + (*action) (path, true, elevel); +} + + +/* + * Hint to the OS that it should get ready to fsync() this file. + * + * Ignores errors trying to open unreadable files, and logs other errors at a + * caller-specified level. + */ +#ifdef PG_FLUSH_DATA_WORKS + +static void +pre_sync_fname(const char *fname, bool isdir, int elevel) +{ + int fd; + + fd = BasicOpenFile((char *) fname, O_RDONLY | PG_BINARY, 0); + + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + +#ifdef ETXTBSY + if (errno == ETXTBSY) + return; #endif - { -#if defined(HAVE_READLINK) || defined(WIN32) - char linkpath[MAXPGPATH]; - int len; - struct stat lst; - len = readlink(subpath, linkpath, sizeof(linkpath)-1); - if (len < 0) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not read symbolic link \"%s\": %m", - subpath))); + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; + } - if (len >= sizeof(linkpath)-1) - ereport(ERROR, - (errmsg("symbolic link \"%s\" target is too long", - subpath))); + (void) pg_flush_data(fd, 0, 0); - linkpath[len] = '\0'; + (void) close(fd); +} - if (lstat(linkpath, &lst) == 0) - { - if (S_ISREG(lst.st_mode)) - (*action) (linkpath, false); - else if (S_ISDIR(lst.st_mode)) - walkdir(subpath, action); - } - else if (errno != ENOENT) - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not stat file \"%s\": %m", linkpath))); -#else - ereport(WARNING, - (errcode(ERRCODE_FEATURE_NOT_SUPPORTED), - errmsg("this platform does not support symbolic links; ignoring \"%s\"", - subpath))); +#endif /* PG_FLUSH_DATA_WORKS */ + +/* + * fsync_fname_ext -- Try to fsync a file or directory + * + * Ignores errors trying to open unreadable files, or trying to fsync + * directories on systems where that isn't allowed/required, and logs other + * errors at a caller-specified level. + */ +static void +fsync_fname_ext(const char *fname, bool isdir, int elevel) +{ + int fd; + int flags; + int returncode; + + /* + * Some OSs require directories to be opened read-only whereas other + * systems don't allow us to fsync files opened read-only; so we need both + * cases here. Using O_RDWR will cause us to fail to fsync files that are + * not writable by our userid, but we assume that's OK. + */ + flags = PG_BINARY; + if (!isdir) + flags |= O_RDWR; + else + flags |= O_RDONLY; + + /* + * Open the file, silently ignoring errors about unreadable files (or + * unsupported operations, e.g. opening a directory under Windows), and + * logging others. + */ + fd = BasicOpenFile((char *) fname, flags, 0); + if (fd < 0) + { + if (errno == EACCES || (isdir && errno == EISDIR)) + return; + +#ifdef ETXTBSY + if (errno == ETXTBSY) + return; #endif - } + + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", fname))); + return; } - FreeDir(dir); - (*action) (path, true); + returncode = pg_fsync(fd); + + /* + * Some OSes don't allow us to fsync directories at all, so we can ignore + * those errors. Anything else needs to be logged. + */ + if (returncode != 0 && !(isdir && errno == EBADF)) + ereport(elevel, + (errcode_for_file_access(), + errmsg("could not fsync file \"%s\": %m", fname))); + + (void) close(fd); } diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 8a37ea9d1a2..bb2957989db 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -105,8 +105,7 @@ extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); extern int pg_flush_data(int fd, off_t offset, off_t amount); extern void fsync_fname(char *fname, bool isdir); -extern void pre_sync_fname(char *fname, bool isdir); -extern void walkdir(char *path, void (*action) (char *fname, bool isdir)); +extern void SyncDataDirectory(void); /* Filename components for OpenTemporaryFile */ #define PG_TEMP_FILES_DIR "pgsql_tmp" -- GitLab