From fafa374f2d1e04ab265d56cdadb634124364646f Mon Sep 17 00:00:00 2001
From: Simon Riggs <simon@2ndQuadrant.com>
Date: Sat, 13 Feb 2010 00:59:58 +0000
Subject: [PATCH] Introduce WAL records to log reuse of btree pages, allowing
 conflict resolution during Hot Standby. Page reuse interlock requested by
 Tom. Analysis and patch by me.

---
 src/backend/access/nbtree/nbtpage.c | 58 +++++++++++++++++++++++++++-
 src/backend/access/nbtree/nbtxlog.c | 60 ++++++++++++++++++++---------
 src/include/access/nbtree.h         | 15 +++++++-
 3 files changed, 111 insertions(+), 22 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index b0eff770d0b..5df975e4ec5 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.118 2010/02/08 04:33:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.119 2010/02/13 00:59:58 sriggs Exp $
  *
  *	NOTES
  *	   Postgres btree pages look like ordinary relation pages.	The opaque
@@ -446,6 +446,48 @@ _bt_checkpage(Relation rel, Buffer buf)
 				 errhint("Please REINDEX it.")));
 }
 
+/*
+ * Log the reuse of a page from the FSM (nothing is logged if rel->rd_istemp).
+ */
+static void
+_bt_log_reuse_page(Relation rel, BlockNumber blkno, TransactionId latestRemovedXid)
+{
+	if (rel->rd_istemp)
+		return;
+
+	/* No ereport(ERROR) until changes are logged */
+	START_CRIT_SECTION();
+
+	/*
+	 * We don't do MarkBufferDirty here because we're about to initialise
+	 * the page, and nobody else can see it yet.
+	 */
+
+	/* XLOG stuff: one rdata element carrying the record, no buffer reference */
+	{
+		XLogRecPtr	recptr;
+		XLogRecData rdata[1];
+		xl_btree_reuse_page xlrec_reuse;
+
+		xlrec_reuse.node = rel->rd_node;
+		xlrec_reuse.block = blkno;
+		xlrec_reuse.latestRemovedXid = latestRemovedXid;
+		rdata[0].data = (char *) &xlrec_reuse;
+		rdata[0].len = SizeOfBtreeReusePage;
+		rdata[0].buffer = InvalidBuffer;
+		rdata[0].next = NULL;
+
+		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_REUSE_PAGE, rdata);
+
+		/*
+		 * We don't do PageSetLSN or PageSetTLI here because we're about
+		 * to initialise the page, so there is no need (recptr is unused).
+		 */
+	}
+
+	END_CRIT_SECTION();
+}
+
 /*
  *	_bt_getbuf() -- Get a buffer by block number for read or write.
  *
@@ -510,7 +552,19 @@ _bt_getbuf(Relation rel, BlockNumber blkno, int access)
 			{
 				page = BufferGetPage(buf);
 				if (_bt_page_recyclable(page))
-				{
+				{					
+					/*
+					 * If we are generating WAL for Hot Standby then create
+					 * a WAL record that will allow us to conflict with
+					 * queries running on standby.
+					 */
+					if (XLogStandbyInfoActive())
+					{
+						BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+						_bt_log_reuse_page(rel, blkno, opaque->btpo.xact);
+					}
+
 					/* Okay to use page.  Re-initialize and return it */
 					_bt_pageinit(page, BufferGetPageSize(buf));
 					return buf;
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 83a7c98c14e..f5320fb1039 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.60 2010/02/08 04:33:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.61 2010/02/13 00:59:58 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -814,26 +814,48 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
 {
 	uint8		info = record->xl_info & ~XLR_INFO_MASK;
 
-	/*
-	 * Btree delete records can conflict with standby queries. You might
-	 * think that vacuum records would conflict as well, but we've handled
-	 * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
-	 * cleaned by the vacuum of the heap and so we can resolve any conflicts
-	 * just once when that arrives. After that any we know that no conflicts
-	 * exist from individual btree vacuum records on that index.
-	 */
-	if (InHotStandby && info == XLOG_BTREE_DELETE)
+	if (InHotStandby)
 	{
-		xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
+		switch (info)
+		{
+			case XLOG_BTREE_DELETE:
+				/*
+				 * Btree delete records can conflict with standby queries. You might
+				 * think that vacuum records would conflict as well, but we've handled
+				 * that already. XLOG_HEAP2_CLEANUP_INFO records provide the highest xid
+				 * cleaned by the vacuum of the heap and so we can resolve any conflicts
+				 * just once when that arrives. After that we know that no conflicts
+				 * exist from individual btree vacuum records on that index.
+				 */
+				{
+					xl_btree_delete *xlrec = (xl_btree_delete *) XLogRecGetData(record);
 
-		/*
-		 * XXX Currently we put everybody on death row, because
-		 * currently _bt_delitems() supplies InvalidTransactionId.
-		 * This can be fairly painful, so providing a better value
-		 * here is worth some thought and possibly some effort to
-		 * improve.
-		 */
-		ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+					/*
+					 * XXX Currently we put everybody on death row, because
+					 * currently _bt_delitems() supplies InvalidTransactionId.
+					 * This can be fairly painful, so providing a better value
+					 * here is worth some thought and possibly some effort to
+					 * improve.
+					 */
+					ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+				}
+				break;
+
+			case XLOG_BTREE_REUSE_PAGE:
+				/*
+				 * Btree reuse page records exist to provide a conflict point when we
+				 * reuse pages in the index via the FSM. That's all they do though.
+				 */
+				{
+					xl_btree_reuse_page *xlrec = (xl_btree_reuse_page *) XLogRecGetData(record);
+
+					ResolveRecoveryConflictWithSnapshot(xlrec->latestRemovedXid, xlrec->node);
+				}
+				return;
+
+			default:
+				break;
+		}
 	}
 
 	/*
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index acbb0cbc7d7..f3898a41408 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.128 2010/02/08 04:33:54 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.129 2010/02/13 00:59:58 sriggs Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -221,6 +221,7 @@ typedef struct BTMetaPageData
 #define XLOG_BTREE_DELETE_PAGE_HALF 0xB0		/* page deletion that makes
 												 * parent half-dead */
 #define XLOG_BTREE_VACUUM		0xC0	/* delete entries on a page during vacuum */
+#define XLOG_BTREE_REUSE_PAGE	0xD0	/* old page is about to be reused from FSM */
 
 /*
  * All that we need to find changed index tuple
@@ -321,6 +322,18 @@ typedef struct xl_btree_delete
 
 #define SizeOfBtreeDelete	(offsetof(xl_btree_delete, latestRemovedXid) + sizeof(TransactionId))
 
+/*
+ * Payload of an XLOG_BTREE_REUSE_PAGE record: page reuse within btree.
+ */
+typedef struct xl_btree_reuse_page
+{
+	RelFileNode node;			/* filled from rel->rd_node of the index */
+	BlockNumber block;			/* block number of the page being reused */
+	TransactionId	latestRemovedXid;	/* xid given to standby conflict resolution */
+} xl_btree_reuse_page;
+
+#define SizeOfBtreeReusePage	(sizeof(xl_btree_reuse_page))
+
 /*
  * This is what we need to know about vacuum of individual leaf index tuples.
  * The WAL record can represent deletion of any number of index tuples on a
-- 
GitLab