From b50991eedb458a81d875d049f48fb62ab1685c0d Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Thu, 7 Jun 2012 12:25:41 -0400
Subject: [PATCH] Fix more crash-safe visibility map bugs, and improve
 comments.

In lazy_scan_heap, we could issue bogus warnings about incorrect
information in the visibility map, because we checked the visibility map
bit before locking the heap page, creating a race condition.  Fix by
rechecking the visibility map bit before we complain.  Rejigger some
related logic so that we rely on the possibly-outdated
all_visible_according_to_vm value as little as possible.

In heap_multi_insert, it's not safe to clear the visibility map bit
before beginning the critical section.  The visibility map is not
crash-safe unless we treat clearing the bit as a critical operation.
Specifically, if the transaction were to error out after we clear the
bit and before entering the critical section, we could end up writing
the heap page to disk (with the bit cleared) and crashing before the
visibility map page made it to disk.  That would be bad.  heap_insert
has this correct, but somehow the order of operations got rearranged
when heap_multi_insert was added.

Also, add some more comments to visibilitymap_test, lazy_scan_heap, and
IndexOnlyNext, expounding on concurrency issues.

Per extensive code review by Andres Freund, and further review by Tom
Lane, who also made the original report about the bogus warnings.
---
 src/backend/access/heap/heapam.c          | 18 ++++++-------
 src/backend/access/heap/visibilitymap.c   | 11 +++++++-
 src/backend/commands/vacuumlazy.c         | 34 ++++++++++++++++++++----
 src/backend/executor/nodeIndexonlyscan.c  | 13 +++++++++
 4 files changed, 61 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index ce52e2dd48e..2d81383ae8a 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -2149,15 +2149,6 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
                                            &vmbuffer, NULL);
         page = BufferGetPage(buffer);
 
-        if (PageIsAllVisible(page))
-        {
-            all_visible_cleared = true;
-            PageClearAllVisible(page);
-            visibilitymap_clear(relation,
-                                BufferGetBlockNumber(buffer),
-                                vmbuffer);
-        }
-
         /* NO EREPORT(ERROR) from here till changes are logged */
         START_CRIT_SECTION();
 
@@ -2172,6 +2163,15 @@ heap_multi_insert(Relation relation, HeapTuple *tuples, int ntuples,
             RelationPutHeapTuple(relation, buffer, heaptup);
         }
 
+        if (PageIsAllVisible(page))
+        {
+            all_visible_cleared = true;
+            PageClearAllVisible(page);
+            visibilitymap_clear(relation,
+                                BufferGetBlockNumber(buffer),
+                                vmbuffer);
+        }
+
         /*
          * XXX Should we set PageSetPrunable on this page ? See heap_insert()
          */
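The heapam.c hunks above are the crux of the heap_multi_insert fix: the
PageIsAllVisible block moves from before START_CRIT_SECTION() to after the
tuples have been placed, so that clearing the page-level bit and the map bit
is covered by the critical section.  To make the failure mode concrete, here
is a standalone sketch; it is not PostgreSQL code, and the bit variables,
error_out(), reset(), and the simulated flush are inventions for
illustration.  It models the scenario from the commit message: an error
between the clears and the critical section, a background flush of the dirty
heap page, and a crash before the visibility map page reaches disk.

    /* sketch.c: model of the heap_multi_insert ordering bug (illustrative) */
    #include <setjmp.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* In-memory and on-disk copies of the two bits in question. */
    static bool heap_bit_mem, heap_bit_disk;    /* page-level PD_ALL_VISIBLE */
    static bool vm_bit_mem, vm_bit_disk;        /* visibility map bit */

    static jmp_buf on_error;

    static void reset(void)
    {
        heap_bit_mem = heap_bit_disk = true;
        vm_bit_mem = vm_bit_disk = true;
    }

    /* Stand-in for elog(ERROR): abort the "transaction" via longjmp. */
    static void error_out(void)
    {
        longjmp(on_error, 1);
    }

    /* Buggy ordering: both bits cleared, then an error fires before the
     * critical section is ever entered. */
    static void insert_clearing_too_early(void)
    {
        heap_bit_mem = false;       /* PageClearAllVisible() */
        vm_bit_mem = false;         /* visibilitymap_clear() */
        error_out();                /* error before START_CRIT_SECTION() */
    }

    /* Fixed ordering: nothing is touched before the critical section, so
     * the same error leaves both bits alone.  (Inside the critical section
     * no error is allowed, so the clears and the WAL record are atomic.) */
    static void insert_clearing_in_crit_section(void)
    {
        error_out();                /* error before START_CRIT_SECTION() */
        /* START_CRIT_SECTION(); put tuples; clear both bits; log; end. */
    }

    /* The invariant: the VM bit may be set only if the page-level bit is. */
    static bool invariant_holds_on_disk(void)
    {
        return !vm_bit_disk || heap_bit_disk;
    }

    static void run(void (*insert)(void), const char *label)
    {
        reset();
        if (setjmp(on_error) == 0)
            insert();
        /* Background writer flushes the dirty heap page, then we crash
         * before the visibility map page reaches disk. */
        heap_bit_disk = heap_bit_mem;
        printf("%s: invariant %s\n", label,
               invariant_holds_on_disk() ? "holds" : "VIOLATED");
    }

    int main(void)
    {
        run(insert_clearing_too_early, "clear before critical section");
        run(insert_clearing_in_crit_section, "clear inside critical section");
        return 0;
    }

Built with "cc sketch.c", the first line reports the invariant VIOLATED
while the second holds: once nothing is cleared until the critical section,
an aborted transaction cannot leave a half-cleared pair of bits behind.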
diff --git a/src/backend/access/heap/visibilitymap.c b/src/backend/access/heap/visibilitymap.c
index 5696abe4d2a..9152c7d1511 100644
--- a/src/backend/access/heap/visibilitymap.c
+++ b/src/backend/access/heap/visibilitymap.c
@@ -293,6 +293,13 @@ visibilitymap_set(Relation rel, BlockNumber heapBlk, XLogRecPtr recptr,
  * relation. On return, *buf is a valid buffer with the map page containing
  * the bit for heapBlk, or InvalidBuffer. The caller is responsible for
  * releasing *buf after it's done testing and setting bits.
+ *
+ * NOTE: This function is typically called without a lock on the heap page,
+ * so somebody else could change the bit just after we look at it.  In fact,
+ * since we don't lock the visibility map page either, it's even possible
+ * that someone else could have changed the bit just before we look at it,
+ * yet we might see the old value.  It is the caller's responsibility to
+ * deal with all concurrency issues!
  */
 bool
 visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
@@ -327,7 +334,9 @@ visibilitymap_test(Relation rel, BlockNumber heapBlk, Buffer *buf)
     map = PageGetContents(BufferGetPage(*buf));
 
     /*
-     * We don't need to lock the page, as we're only looking at a single bit.
+     * A single-bit read is atomic.  There could be memory-ordering effects
+     * here, but for performance reasons we make it the caller's job to worry
+     * about that.
      */
     result = (map[mapByte] & (1 << mapBit)) ? true : false;

diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 0e0193d40e1..3ff56a73664 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -419,6 +419,15 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
      * Note: if scan_all is true, we won't actually skip any pages; but we
      * maintain next_not_all_visible_block anyway, so as to set up the
      * all_visible_according_to_vm flag correctly for each page.
+     *
+     * Note: The value returned by visibilitymap_test could be slightly
+     * out-of-date, since we make this test before reading the corresponding
+     * heap page or locking the buffer.  This is OK.  If we mistakenly think
+     * that the page is all-visible when in fact the flag's just been
+     * cleared, we might fail to vacuum the page.  But it's OK to skip pages
+     * when scan_all is not set, so no great harm done; the next vacuum will
+     * find them.  If we make the reverse mistake and vacuum a page
+     * unnecessarily, it'll just be a no-op.
      */
     for (next_not_all_visible_block = 0;
          next_not_all_visible_block < nblocks;
@@ -852,22 +861,37 @@ lazy_scan_heap(Relation onerel, LVRelStats *vacrelstats,
         freespace = PageGetHeapFreeSpace(page);
 
         /* mark page all-visible, if appropriate */
-        if (all_visible && !all_visible_according_to_vm)
+        if (all_visible)
         {
             if (!PageIsAllVisible(page))
             {
                 PageSetAllVisible(page);
                 MarkBufferDirty(buf);
+                visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
+                                  visibility_cutoff_xid);
+            }
+            else if (!all_visible_according_to_vm)
+            {
+                /*
+                 * It should never be the case that the visibility map bit
+                 * is set while the page-level bit is clear, but the reverse
+                 * is allowed.  Set the visibility map bit as well so that
+                 * we get back in sync.
+                 */
+                visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
+                                  visibility_cutoff_xid);
             }
-            visibilitymap_set(onerel, blkno, InvalidXLogRecPtr, vmbuffer,
-                              visibility_cutoff_xid);
         }
 
         /*
          * As of PostgreSQL 9.2, the visibility map bit should never be set
-         * if the page-level bit is clear.
+         * if the page-level bit is clear.  However, it's possible that the
+         * bit got cleared after we checked it and before we took the buffer
+         * content lock, so we must recheck before jumping to the conclusion
+         * that something bad has happened.
          */
-        else if (all_visible_according_to_vm && !PageIsAllVisible(page))
+        else if (all_visible_according_to_vm && !PageIsAllVisible(page)
+                 && visibilitymap_test(onerel, blkno, &vmbuffer))
         {
             elog(WARNING, "page is not marked all-visible but visibility map bit is set in relation \"%s\" page %u",
                  relname, blkno);
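The vacuumlazy.c changes above all flow from one rule: the result of an
unlocked visibilitymap_test is only a hint, and must be rechecked once the
buffer content lock is held before anything irreversible (here, a WARNING
implying corruption) is done with it.  Below is a minimal standalone
pthreads sketch of that recheck pattern, not PostgreSQL code: page_lock,
concurrent_insert(), and the two flags are inventions standing in for the
buffer content lock, a concurrent backend, and the two visibility bits.

    /* recheck.c: check-then-recheck-under-lock pattern (illustrative) */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    static pthread_mutex_t page_lock = PTHREAD_MUTEX_INITIALIZER;
    static bool vm_bit = true;    /* peeked at unlocked, like the VM bit */
    static bool page_bit = true;  /* PD_ALL_VISIBLE, protected by page_lock */

    /* A concurrent backend clears both bits under the content lock. */
    static void *concurrent_insert(void *arg)
    {
        (void) arg;
        pthread_mutex_lock(&page_lock);
        page_bit = false;
        vm_bit = false;
        pthread_mutex_unlock(&page_lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        /* Unlocked single-bit peek, like all_visible_according_to_vm. */
        bool hint = vm_bit;

        /* Before we take the lock, another backend clears both bits. */
        pthread_create(&t, NULL, concurrent_insert, NULL);
        pthread_join(t, NULL);

        pthread_mutex_lock(&page_lock);
        if (hint && !page_bit && vm_bit)
            printf("WARNING: page not all-visible but VM bit is set\n");
        else if (hint && !page_bit)
            printf("stale hint; recheck suppressed the bogus warning\n");
        pthread_mutex_unlock(&page_lock);
        return 0;
    }

Built with "cc -pthread recheck.c", the unlocked peek goes stale while the
other thread clears both bits under the lock; the recheck of vm_bit after
pthread_mutex_lock() is what suppresses the bogus warning, which is
precisely the race the rechecking hunk closes.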
diff --git a/src/backend/executor/nodeIndexonlyscan.c b/src/backend/executor/nodeIndexonlyscan.c
index 4abd805aa31..af31671b3eb 100644
--- a/src/backend/executor/nodeIndexonlyscan.c
+++ b/src/backend/executor/nodeIndexonlyscan.c
@@ -82,6 +82,19 @@ IndexOnlyNext(IndexOnlyScanState *node)
          * We can skip the heap fetch if the TID references a heap page on
          * which all tuples are known visible to everybody. In any case,
          * we'll use the index tuple not the heap tuple as the data source.
+         *
+         * Note on Memory Ordering Effects: visibilitymap_test does not lock
+         * the visibility map buffer, and therefore the result we read here
+         * could be slightly stale.  However, it can't be stale enough to
+         * matter.  It suffices to show that (1) there is a read barrier
+         * between the time we read the index TID and the time we test the
+         * visibility map; and (2) there is a write barrier between the time
+         * some other concurrent process clears the visibility map bit and
+         * the time it inserts the index TID.  Since acquiring or releasing
+         * a LWLock interposes a full barrier, this is easy to show: (1) is
+         * satisfied by the release of the index buffer content lock after
+         * reading the TID; and (2) is satisfied by the acquisition of the
+         * buffer content lock in order to insert the TID.
          */
         if (!visibilitymap_test(scandesc->heapRelation,
                                 ItemPointerGetBlockNumber(tid),
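The barrier argument in the comment above can be restated in C11 atomics.
In this standalone sketch, not PostgreSQL code and with invented names
throughout, atomic_thread_fence() stands in for the full barriers that
LWLock acquisition and release provide in the server: the writer clears the
visibility bit before publishing the index TID, and the reader loads the
TID before testing the bit.

    /* barriers.c: fence pairing behind index-only scans (illustrative) */
    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdio.h>

    static atomic_bool vm_bit = true;   /* visibility map bit */
    static atomic_int index_tid = -1;   /* -1 means no TID published yet */

    /* Writer: clear the VM bit, then insert the index TID.  The fence
     * stands in for acquiring the buffer content lock to insert the TID
     * (write barrier (2) in the comment above). */
    static void insert_tuple(int tid)
    {
        atomic_store_explicit(&vm_bit, false, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        atomic_store_explicit(&index_tid, tid, memory_order_relaxed);
    }

    /* Reader: read the TID, then test the VM bit.  The fence stands in
     * for releasing the index buffer content lock after reading the TID
     * (read barrier (1) in the comment above). */
    static bool tid_page_is_all_visible(int *tid)
    {
        *tid = atomic_load_explicit(&index_tid, memory_order_relaxed);
        atomic_thread_fence(memory_order_seq_cst);
        return atomic_load_explicit(&vm_bit, memory_order_relaxed);
    }

    int main(void)
    {
        int tid;

        insert_tuple(42);
        if (!tid_page_is_all_visible(&tid))
            printf("TID %d: VM bit clear, heap fetch required\n", tid);
        return 0;
    }

Because the fences pair up, a reader that observes the published TID is
guaranteed to also observe the cleared bit, so it never skips a heap fetch
that it actually needs; that is the only kind of staleness that could
matter to IndexOnlyNext.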