diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c index dd87e430434845b087c67b17bd9db286dbf9cf00..40e655f685555c832b4fbc6b4cd4fdf97fa6bd56 100644 --- a/src/backend/postmaster/postmaster.c +++ b/src/backend/postmaster/postmaster.c @@ -1372,9 +1372,10 @@ ServerLoop(void) fd_set readmask; int nSockets; time_t now, + last_lockfile_recheck_time, last_touch_time; - last_touch_time = time(NULL); + last_lockfile_recheck_time = last_touch_time = time(NULL); nSockets = initMasks(&readmask); @@ -1520,27 +1521,56 @@ ServerLoop(void) kill(AutoVacPID, SIGUSR2); } +#ifdef HAVE_PTHREAD_IS_THREADED_NP + + /* + * With assertions enabled, check regularly for appearance of + * additional threads. All builds check at start and exit. + */ + Assert(pthread_is_threaded_np() == 0); +#endif + + /* + * Lastly, check to see if it's time to do some things that we don't + * want to do every single time through the loop, because they're a + * bit expensive. Note that there's up to a minute of slop in when + * these tasks will be performed, since DetermineSleepTime() will let + * us sleep at most that long. + */ + now = time(NULL); + /* - * Touch the socket and lock file every 58 minutes, to ensure that + * Once a minute, verify that postmaster.pid hasn't been removed or + * overwritten. If it has, we force a shutdown. This avoids having + * postmasters and child processes hanging around after their database + * is gone, and maybe causing problems if a new database cluster is + * created in the same place. It also provides some protection + * against a DBA foolishly removing postmaster.pid and manually + * starting a new postmaster. Data corruption is likely to ensue from + * that anyway, but we can minimize the damage by aborting ASAP. + */ + if (now - last_lockfile_recheck_time >= 1 * SECS_PER_MINUTE) + { + if (!RecheckDataDirLockFile()) + { + ereport(LOG, + (errmsg("performing immediate shutdown because data directory lock file is invalid"))); + kill(MyProcPid, SIGQUIT); + } + last_lockfile_recheck_time = now; + } + + /* + * Touch Unix socket and lock file every 58 minutes, to ensure that * they are not removed by overzealous /tmp-cleaning tasks. We assume * no one runs cleaners with cutoff times of less than an hour ... */ - now = time(NULL); if (now - last_touch_time >= 58 * SECS_PER_MINUTE) { TouchSocketFile(); TouchSocketLockFile(); last_touch_time = now; } - -#ifdef HAVE_PTHREAD_IS_THREADED_NP - - /* - * With assertions enabled, check regularly for appearance of - * additional threads. All builds check at start and exit. - */ - Assert(pthread_is_threaded_np() == 0); -#endif } } diff --git a/src/backend/utils/init/miscinit.c b/src/backend/utils/init/miscinit.c index f994af64b4a37b74c55e1cffbc805533d9034949..ca5adc336286798993d7801b6245bcda944496a6 100644 --- a/src/backend/utils/init/miscinit.c +++ b/src/backend/utils/init/miscinit.c @@ -1108,6 +1108,76 @@ AddToDataDirLockFile(int target_line, const char *str) } +/* + * Recheck that the data directory lock file still exists with expected + * content. Return TRUE if the lock file appears OK, FALSE if it isn't. + * + * We call this periodically in the postmaster. The idea is that if the + * lock file has been removed or replaced by another postmaster, we should + * do a panic database shutdown. Therefore, we should return TRUE if there + * is any doubt: we do not want to cause a panic shutdown unnecessarily. + * Transient failures like EINTR or ENFILE should not cause us to fail. + * (If there really is something wrong, we'll detect it on a future recheck.) + */ +bool +RecheckDataDirLockFile(void) +{ + int fd; + int len; + long file_pid; + char buffer[BLCKSZ]; + + fd = open(DIRECTORY_LOCK_FILE, O_RDWR | PG_BINARY, 0); + if (fd < 0) + { + /* + * There are many foreseeable false-positive error conditions. For + * safety, fail only on enumerated clearly-something-is-wrong + * conditions. + */ + switch (errno) + { + case ENOENT: + case ENOTDIR: + /* disaster */ + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m", + DIRECTORY_LOCK_FILE))); + return false; + default: + /* non-fatal, at least for now */ + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not open file \"%s\": %m; continuing anyway", + DIRECTORY_LOCK_FILE))); + return true; + } + } + len = read(fd, buffer, sizeof(buffer) - 1); + if (len < 0) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read from file \"%s\": %m", + DIRECTORY_LOCK_FILE))); + close(fd); + return true; /* treat read failure as nonfatal */ + } + buffer[len] = '\0'; + close(fd); + file_pid = atol(buffer); + if (file_pid == getpid()) + return true; /* all is well */ + + /* Trouble: someone's overwritten the lock file */ + ereport(LOG, + (errmsg("lock file \"%s\" contains wrong PID: %ld instead of %ld", + DIRECTORY_LOCK_FILE, file_pid, (long) getpid()))); + return false; +} + + /*------------------------------------------------------------------------- * Version checking support *------------------------------------------------------------------------- diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h index db6271e41e100f3d1c3fb2465e0d9c8cad61dd57..859e40c2ca9bc850d3b8d85672b9153fdbd5f937 100644 --- a/src/include/miscadmin.h +++ b/src/include/miscadmin.h @@ -446,6 +446,7 @@ extern void CreateDataDirLockFile(bool amPostmaster); extern void CreateSocketLockFile(const char *socketfile, bool amPostmaster); extern void TouchSocketLockFile(void); extern void AddToDataDirLockFile(int target_line, const char *str); +extern bool RecheckDataDirLockFile(void); extern void ValidatePgVersion(const char *path); extern void process_shared_preload_libraries(void); extern void process_local_preload_libraries(void);