From 20d1878b6a7f78a4a9ce3668c4588d92bc0af78d Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 2 Aug 2015 14:54:44 -0400
Subject: [PATCH] Fix incorrect order of lock file removal and failure to close() sockets.

Commit c9b0cbe98bd783e24a8c4d8d8ac472a494b81292 accidentally broke the
order of operations during postmaster shutdown: it resulted in removing
the per-socket lockfiles after, not before, postmaster.pid. This creates
a race-condition hazard for a new postmaster that's started immediately
after observing that postmaster.pid has disappeared; if it sees the
socket lockfile still present, it will quite properly refuse to start.
This error appears to be the explanation for at least some of the
intermittent buildfarm failures we've seen in the pg_upgrade test.

Another problem, which has been there all along, is that the postmaster
has never bothered to close() its listen sockets, but has just allowed
them to close at process death. This creates a different race condition
for an incoming postmaster: it might be unable to bind to the desired
listen address because the old postmaster is still incumbent. This might
explain some odd failures we've seen in the past, too. (Note: this is
not related to the fact that individual backends don't close their
client communication sockets. That behavior is intentional and is not
changed by this patch.)

Fix by adding an on_proc_exit function that closes the postmaster's
ports explicitly, and (in 9.3 and up) reshuffling the responsibility for
where to unlink the Unix socket files. Lock file unlinking can stay
where it is, but teach it to unlink the lock files in reverse order of
creation.
---
 src/backend/postmaster/postmaster.c | 39 +++++++++++++++++++++++++++++
 1 file changed, 39 insertions(+)

diff --git a/src/backend/postmaster/postmaster.c b/src/backend/postmaster/postmaster.c
index f091ec74212..aff9a5e1159 100644
--- a/src/backend/postmaster/postmaster.c
+++ b/src/backend/postmaster/postmaster.c
@@ -338,6 +338,7 @@ static DNSServiceRef bonjour_sdref = NULL;
 /*
  * postmaster.c - function prototypes
  */
+static void CloseServerPorts(int status, Datum arg);
 static void getInstallationPaths(const char *argv0);
 static void checkDataDir(void);
 static Port *ConnCreate(int serverFd);
@@ -991,6 +992,14 @@ PostmasterMain(int argc, char *argv[])
 		ereport(FATAL,
 				(errmsg("no socket created for listening")));
 
+	/*
+	 * Set up an on_proc_exit function that's charged with closing the sockets
+	 * again at postmaster shutdown. You might think we should have done this
+	 * earlier, but we want it to run before, not after, the proc_exit callback
+	 * that will remove the Unix socket file.
+	 */
+	on_proc_exit(CloseServerPorts, 0);
+
 	/*
 	 * If no valid TCP ports, write an empty line for listen address,
 	 * indicating the Unix socket must be used. Note that this line is not
@@ -1161,6 +1170,36 @@ PostmasterMain(int argc, char *argv[])
 }
 
 
+/*
+ * on_proc_exit callback to close server's listen sockets (status/arg unused)
+ */
+static void
+CloseServerPorts(int status, Datum arg)
+{
+	int			i;
+
+	/*
+	 * First, explicitly close all the socket FDs. We used to just let this
+	 * happen implicitly at postmaster exit, but it's better to close them
+	 * before we remove the postmaster.pid lockfile; otherwise there's a race
+	 * condition if a new postmaster wants to re-use the TCP port number.
+	 */
+	for (i = 0; i < MAXLISTEN; i++)
+	{
+		if (ListenSocket[i] != PGINVALID_SOCKET)
+		{
+			StreamClose(ListenSocket[i]);
+			ListenSocket[i] = PGINVALID_SOCKET;
+		}
+	}
+
+	/*
+	 * Removal of the Unix socket file and socket lockfile will happen in
+	 * later on_proc_exit callbacks.
+	 */
+}
+
+
 /*
  * Compute and check the directory paths to files that are part of the
  * installation (as deduced from the postgres executable's own location)
-- 
GitLab