From 2ada6779c5d3fcc31568ba263f8a0cc9bb8318c1 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 16 Aug 2011 13:11:54 -0400
Subject: [PATCH] Fix race condition in relcache init file invalidation.

The previous code tried to synchronize by unlinking the init file twice,
but that doesn't actually work: it leaves a window wherein a third process
could read the already-stale init file but miss the SI messages that would
tell it the data is stale.  The result would be bizarre failures in catalog
accesses, typically "could not read block 0 in file ..." later during
startup.

Instead, hold RelCacheInitLock across both the unlink and the sending of
the SI messages.  This is more straightforward, and might even be a bit
faster since only one unlink call is needed.

This has been wrong since it was put in (in 2002!), so back-patch to all
supported releases.
---
 src/backend/access/transam/twophase.c |  4 +-
 src/backend/utils/cache/inval.c       | 33 +++++++-------
 src/backend/utils/cache/relcache.c    | 66 +++++++++++++++------------
 src/include/utils/relcache.h          |  3 +-
 4 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 281268120ef..54176ee9df9 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -1356,10 +1356,10 @@ FinishPreparedTransaction(const char *gid, bool isCommit)
 	 * after we send the SI messages. See AtEOXact_Inval()
 	 */
 	if (hdr->initfileinval)
-		RelationCacheInitFileInvalidate(true);
+		RelationCacheInitFilePreInvalidate();
 	SendSharedInvalidMessages(invalmsgs, hdr->ninvalmsgs);
 	if (hdr->initfileinval)
-		RelationCacheInitFileInvalidate(false);
+		RelationCacheInitFilePostInvalidate();
 
 	/* And now do the callbacks */
 	if (isCommit)
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index a5580bd92fb..4249bd33765 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -854,24 +854,12 @@ xactGetCommittedInvalidationMessages(SharedInvalidationMessage **msgs,
 	return numSharedInvalidMessagesArray;
 }
 
-#define RecoveryRelationCacheInitFileInvalidate(dbo, tbo, tf) \
-{ \
-	DatabasePath = GetDatabasePath(dbo, tbo); \
-	elog(trace_recovery(DEBUG4), "removing relcache init file in %s", DatabasePath); \
-	RelationCacheInitFileInvalidate(tf); \
-	pfree(DatabasePath); \
-}
-
 /*
  * ProcessCommittedInvalidationMessages is executed by xact_redo_commit()
  * to process invalidation messages added to commit records.
  *
  * Relcache init file invalidation requires processing both
  * before and after we send the SI messages. See AtEOXact_Inval()
- *
- * We deliberately avoid SetDatabasePath() since it is intended to be used
- * only once by normal backends, so we set DatabasePath directly then
- * pfree after use. See RecoveryRelationCacheInitFileInvalidate() macro.
  */
 void
 ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
@@ -885,12 +873,25 @@ ProcessCommittedInvalidationMessages(SharedInvalidationMessage *msgs,
 		 (RelcacheInitFileInval ? " and relcache file invalidation" : ""));
 
 	if (RelcacheInitFileInval)
-		RecoveryRelationCacheInitFileInvalidate(dbid, tsid, true);
+	{
+		/*
+		 * RelationCacheInitFilePreInvalidate requires DatabasePath to be set,
+		 * but we should not use SetDatabasePath during recovery, since it is
+		 * intended to be used only once by normal backends.  Hence, a quick
+		 * hack: set DatabasePath directly then unset after use.
+		 */
+		DatabasePath = GetDatabasePath(dbid, tsid);
+		elog(trace_recovery(DEBUG4), "removing relcache init file in \"%s\"",
+			 DatabasePath);
+		RelationCacheInitFilePreInvalidate();
+		pfree(DatabasePath);
+		DatabasePath = NULL;
+	}
 
 	SendSharedInvalidMessages(msgs, nmsgs);
 
 	if (RelcacheInitFileInval)
-		RecoveryRelationCacheInitFileInvalidate(dbid, tsid, false);
+		RelationCacheInitFilePostInvalidate();
 }
 
 /*
@@ -931,7 +932,7 @@ AtEOXact_Inval(bool isCommit)
 		 * unless we committed.
 		 */
 		if (transInvalInfo->RelcacheInitFileInval)
-			RelationCacheInitFileInvalidate(true);
+			RelationCacheInitFilePreInvalidate();
 
 		AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
 								   &transInvalInfo->CurrentCmdInvalidMsgs);
@@ -940,7 +941,7 @@ AtEOXact_Inval(bool isCommit)
 										 SendSharedInvalidMessages);
 
 		if (transInvalInfo->RelcacheInitFileInval)
-			RelationCacheInitFileInvalidate(false);
+			RelationCacheInitFilePostInvalidate();
 	}
 	else if (transInvalInfo != NULL)
 	{
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index 809222bfe95..dfc758c1695 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -4377,8 +4377,8 @@ write_relcache_init_file(bool shared)
 	 * updated by SI message processing, but we can't be sure whether what we
 	 * wrote out was up-to-date.)
 	 *
-	 * This mustn't run concurrently with RelationCacheInitFileInvalidate, so
-	 * grab a serialization lock for the duration.
+	 * This mustn't run concurrently with the code that unlinks an init file
+	 * and sends SI messages, so grab a serialization lock for the duration.
 	 */
 	LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
 
@@ -4442,19 +4442,22 @@ RelationIdIsInInitFile(Oid relationId)
  * changed one or more of the relation cache entries that are kept in the
  * local init file.
  *
- * We actually need to remove the init file twice: once just before sending
- * the SI messages that include relcache inval for such relations, and once
- * just after sending them.  The unlink before ensures that a backend that's
- * currently starting cannot read the now-obsolete init file and then miss
- * the SI messages that will force it to update its relcache entries.  (This
- * works because the backend startup sequence gets into the PGPROC array before
- * trying to load the init file.)  The unlink after is to synchronize with a
- * backend that may currently be trying to write an init file based on data
- * that we've just rendered invalid.  Such a backend will see the SI messages,
- * but we can't leave the init file sitting around to fool later backends.
+ * To be safe against concurrent inspection or rewriting of the init file,
+ * we must take RelCacheInitLock, then remove the old init file, then send
+ * the SI messages that include relcache inval for such relations, and then
+ * release RelCacheInitLock.  This serializes the whole affair against
+ * write_relcache_init_file, so that we can be sure that any other process
+ * that's concurrently trying to create a new init file won't move an
+ * already-stale version into place after we unlink.  Also, because we unlink
+ * before sending the SI messages, a backend that's currently starting cannot
+ * read the now-obsolete init file and then miss the SI messages that will
+ * force it to update its relcache entries.  (This works because the backend
+ * startup sequence gets into the sinval array before trying to load the init
+ * file.)
  *
- * Ignore any failure to unlink the file, since it might not be there if
- * no backend has been started since the last removal.
+ * We take the lock and do the unlink in RelationCacheInitFilePreInvalidate,
+ * then release the lock in RelationCacheInitFilePostInvalidate.  Caller must
+ * send any pending SI messages between those calls.
  *
  * Notice this deals only with the local init file, not the shared init file.
  * The reason is that there can never be a "significant" change to the
@@ -4464,34 +4467,37 @@ RelationIdIsInInitFile(Oid relationId)
  * be invalid enough to make it necessary to remove it.
  */
 void
-RelationCacheInitFileInvalidate(bool beforeSend)
+RelationCacheInitFilePreInvalidate(void)
 {
 	char		initfilename[MAXPGPATH];
 
 	snprintf(initfilename, sizeof(initfilename), "%s/%s",
 			 DatabasePath, RELCACHE_INIT_FILENAME);
 
-	if (beforeSend)
-	{
-		/* no interlock needed here */
-		unlink(initfilename);
-	}
-	else
+	LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
+
+	if (unlink(initfilename) < 0)
 	{
 		/*
-		 * We need to interlock this against write_relcache_init_file, to
-		 * guard against possibility that someone renames a new-but-
-		 * already-obsolete init file into place just after we unlink. With
-		 * the interlock, it's certain that write_relcache_init_file will
-		 * notice our SI inval message before renaming into place, or else
-		 * that we will execute second and successfully unlink the file.
+		 * The file might not be there if no backend has been started since
+		 * the last removal.  But complain about failures other than ENOENT.
+		 * Fortunately, it's not too late to abort the transaction if we
+		 * can't get rid of the would-be-obsolete init file.
 		 */
-		LWLockAcquire(RelCacheInitLock, LW_EXCLUSIVE);
-		unlink(initfilename);
-		LWLockRelease(RelCacheInitLock);
+		if (errno != ENOENT)
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not remove cache file \"%s\": %m",
+							initfilename)));
 	}
 }
 
+void
+RelationCacheInitFilePostInvalidate(void)
+{
+	LWLockRelease(RelCacheInitLock);
+}
+
 /*
  * Remove the init files during postmaster startup.
  *
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index ef1692cfa84..fc49ee3acc0 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -98,7 +98,8 @@ extern void AtEOSubXact_RelationCache(bool isCommit, SubTransactionId mySubid,
  * Routines to help manage rebuilding of relcache init files
  */
 extern bool RelationIdIsInInitFile(Oid relationId);
-extern void RelationCacheInitFileInvalidate(bool beforeSend);
+extern void RelationCacheInitFilePreInvalidate(void);
+extern void RelationCacheInitFilePostInvalidate(void);
 extern void RelationCacheInitFileRemove(void);
 
 /* should be used only by relcache.c and catcache.c */
-- 
GitLab