From cc59049daf78c3d351c1ec78fb319b5fdeb20d53 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 21 Sep 2007 21:25:42 +0000
Subject: [PATCH] Improve handling of prune/no-prune decisions by storing a
 page's oldest unpruned XMAX in its header.  At the cost of 4 bytes per page,
 this keeps us from performing heap_page_prune when there's no chance of
 pruning anything. Seems to be necessary per Heikki's preliminary performance
 testing.

---
 contrib/pageinspect/README.pageinspect | 12 +++----
 contrib/pageinspect/pageinspect.sql.in |  3 +-
 contrib/pageinspect/rawpage.c          |  7 ++--
 doc/src/sgml/storage.sgml              | 16 ++++++---
 src/backend/access/heap/README.HOT     |  6 ++--
 src/backend/access/heap/heapam.c       | 26 ++++++++-------
 src/backend/access/heap/pruneheap.c    | 27 +++++++++++-----
 src/backend/storage/page/bufpage.c     |  5 +--
 src/include/catalog/catversion.h       |  4 +--
 src/include/storage/bufpage.h          | 45 +++++++++++++++-----------
 10 files changed, 92 insertions(+), 59 deletions(-)

diff --git a/contrib/pageinspect/README.pageinspect b/contrib/pageinspect/README.pageinspect
index 88cc0209549..fc4991db641 100644
--- a/contrib/pageinspect/README.pageinspect
+++ b/contrib/pageinspect/README.pageinspect
@@ -23,14 +23,14 @@ only by superusers.
 
     A page image obtained with get_raw_page should be passed as argument:
 
-        test=# SELECT * FROM page_header(get_raw_page('pg_class',0));
-           lsn    | tli | flags | lower | upper | special | pagesize | version
-        ----------+-----+-------+-------+-------+---------+----------+---------
-         0/3C5614 |   1 |     1 |   216 |   256 |    8192 |     8192 |       4
+        regression=# SELECT * FROM page_header(get_raw_page('pg_class',0));
+            lsn    | tli | flags | lower | upper | special | pagesize | version | prune_xid 
+        -----------+-----+-------+-------+-------+---------+----------+---------+-----------
+         0/24A1B50 |   1 |     1 |   232 |   368 |    8192 |     8192 |       4 |         0
         (1 row)
 
-    The returned columns correspond to the fields in the PageHeaderData-struct,
-    see src/include/storage/bufpage.h for more details.
+    The returned columns correspond to the fields in the PageHeaderData struct.
+    See src/include/storage/bufpage.h for details.
 
     heap_page_items
     ---------------
diff --git a/contrib/pageinspect/pageinspect.sql.in b/contrib/pageinspect/pageinspect.sql.in
index 40b75dcbc02..4821f8c3a3e 100644
--- a/contrib/pageinspect/pageinspect.sql.in
+++ b/contrib/pageinspect/pageinspect.sql.in
@@ -20,7 +20,8 @@ CREATE OR REPLACE FUNCTION page_header(IN page bytea,
     OUT upper smallint,
     OUT special smallint,
     OUT pagesize smallint,
-    OUT version smallint)
+    OUT version smallint,
+    OUT prune_xid xid)
 AS 'MODULE_PATHNAME', 'page_header'
 LANGUAGE C STRICT;
 
diff --git a/contrib/pageinspect/rawpage.c b/contrib/pageinspect/rawpage.c
index 4aba08e7800..80632be9fb5 100644
--- a/contrib/pageinspect/rawpage.c
+++ b/contrib/pageinspect/rawpage.c
@@ -8,7 +8,7 @@
  * Copyright (c) 2007, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/contrib/pageinspect/rawpage.c,v 1.1 2007/05/17 19:11:24 momjian Exp $
+ *	  $PostgreSQL: pgsql/contrib/pageinspect/rawpage.c,v 1.2 2007/09/21 21:25:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -110,8 +110,8 @@ page_header(PG_FUNCTION_ARGS)
 
 	Datum		result;
 	HeapTuple	tuple;
-	Datum		values[8];
-	bool		nulls[8];
+	Datum		values[9];
+	bool		nulls[9];
 
 	PageHeader	page;
 	XLogRecPtr	lsn;
@@ -152,6 +152,7 @@ page_header(PG_FUNCTION_ARGS)
 	values[5] = UInt16GetDatum(page->pd_special);
 	values[6] = UInt16GetDatum(PageGetPageSize(page));
 	values[7] = UInt16GetDatum(PageGetPageLayoutVersion(page));
+	values[8] = TransactionIdGetDatum(page->pd_prune_xid);
 
     /* Build and return the tuple. */
 
diff --git a/doc/src/sgml/storage.sgml b/doc/src/sgml/storage.sgml
index a66aeb2584e..93950dde297 100644
--- a/doc/src/sgml/storage.sgml
+++ b/doc/src/sgml/storage.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.18 2007/06/03 17:05:53 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/storage.sgml,v 1.19 2007/09/21 21:25:42 tgl Exp $ -->
 
 <chapter id="storage">
 
@@ -416,7 +416,7 @@ Item
 
 <row>
  <entry>PageHeaderData</entry>
- <entry>20 bytes long. Contains general information about the page, including
+ <entry>24 bytes long. Contains general information about the page, including
 free space pointers.</entry>
 </row>
 
@@ -449,7 +449,7 @@ data. Empty in ordinary tables.</entry>
 
  <para>
 
-  The first 20 bytes of each page consists of a page header
+  The first 24 bytes of each page consists of a page header
   (PageHeaderData). Its format is detailed in <xref
   linkend="pageheaderdata-table">. The first two fields track the most
   recent WAL entry related to this page. Next is a 2-byte field
@@ -459,7 +459,7 @@ data. Empty in ordinary tables.</entry>
   from the page start to the start
   of unallocated space, to the end of unallocated space, and to the start of
   the special space. 
-  The last 2 bytes of the page header,
+  The next 2 bytes of the page header,
   <structfield>pd_pagesize_version</structfield>, store both the page size
   and a version indicator.  Beginning with
   <productname>PostgreSQL</productname> 8.3 the version number is 4;
@@ -471,6 +471,8 @@ data. Empty in ordinary tables.</entry>
   versions, but the layout of heap row headers has.)  The page size
   is basically only present as a cross-check; there is no support for having
   more than one page size in an installation.
+  The last field is a hint that shows whether pruning the page is likely
+  to be profitable: it tracks the oldest un-pruned XMAX on the page.
   
  </para>
  
@@ -530,6 +532,12 @@ data. Empty in ordinary tables.</entry>
    <entry>2 bytes</entry>
    <entry>Page size and layout version number information</entry>
   </row>
+  <row>
+   <entry>pd_prune_xid</entry>
+   <entry>TransactionId</entry>
+   <entry>4 bytes</entry>
+   <entry>Oldest unpruned XMAX on page, or zero if none</entry>
+  </row>
  </tbody>
  </tgroup>
  </table>
diff --git a/src/backend/access/heap/README.HOT b/src/backend/access/heap/README.HOT
index 8cf0fa44de6..82984efc756 100644
--- a/src/backend/access/heap/README.HOT
+++ b/src/backend/access/heap/README.HOT
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.1 2007/09/20 17:56:30 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/heap/README.HOT,v 1.2 2007/09/21 21:25:42 tgl Exp $
 
                            Heap Only Tuples (HOT)
 
@@ -233,8 +233,8 @@ subsequent index searches.  However it is unclear that this gain is
 large enough to accept any extra maintenance burden for.
 
 The currently planned heuristic is to prune and defrag when first accessing
-a page that potentially has prunable tuples (flagged by the PD_PRUNABLE
-page hint bit) and that either has free space less than MAX(fillfactor
+a page that potentially has prunable tuples (as flagged by the pd_prune_xid
+page hint field) and that either has free space less than MAX(fillfactor
 target free space, BLCKSZ/10) *or* has recently had an UPDATE fail to
 find enough free space to store an updated tuple version.  (These rules
 are subject to change.)
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index d5a2f9a43d1..0f8d52bd0c6 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.241 2007/09/20 17:56:30 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.242 2007/09/21 21:25:42 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -2068,11 +2068,12 @@ l1:
 
 	/*
 	 * If this transaction commits, the tuple will become DEAD sooner or
-	 * later. Set hint bit that this page is a candidate for pruning.  If
-	 * the transaction finally aborts, the subsequent page pruning will be
-	 * a no-op and the hint will be cleared.
+	 * later.  Set flag that this page is a candidate for pruning once our xid
+	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
+	 * the subsequent page pruning will be a no-op and the hint will be
+	 * cleared.
 	 */
-	PageSetPrunable((Page) dp);
+	PageSetPrunable(dp, xid);
 
 	/* store transaction information of xact deleting the tuple */
 	tp.t_data->t_infomask &= ~(HEAP_XMAX_COMMITTED |
@@ -2571,16 +2572,17 @@ l2:
 
 	/*
 	 * If this transaction commits, the old tuple will become DEAD sooner or
-	 * later. Set hint bit that this page is a candidate for pruning.  If
-	 * the transaction finally aborts, the subsequent page pruning will be
-	 * a no-op and the hint will be cleared.
+	 * later.  Set flag that this page is a candidate for pruning once our xid
+	 * falls below the OldestXmin horizon.  If the transaction finally aborts,
+	 * the subsequent page pruning will be a no-op and the hint will be
+	 * cleared.
 	 *
 	 * XXX Should we set hint on newbuf as well?  If the transaction
 	 * aborts, there would be a prunable tuple in the newbuf; but for now
 	 * we choose not to optimize for aborts.  Note that heap_xlog_update
-	 * must be kept in sync if this changes.
+	 * must be kept in sync if this decision changes.
 	 */
-	PageSetPrunable(dp);
+	PageSetPrunable(dp, xid);
 
 	if (use_hot_update)
 	{
@@ -4108,7 +4110,7 @@ heap_xlog_delete(XLogRecPtr lsn, XLogRecord *record)
 	HeapTupleHeaderSetCmax(htup, FirstCommandId, false);
 
 	/* Mark the page as a candidate for pruning */
-	PageSetPrunable(page);
+	PageSetPrunable(page, record->xl_xid);
 
 	/* Make sure there is no forward chain link in t_ctid */
 	htup->t_ctid = xlrec->target.tid;
@@ -4284,7 +4286,7 @@ heap_xlog_update(XLogRecPtr lsn, XLogRecord *record, bool move, bool hot_update)
 	}
 
 	/* Mark the page as a candidate for pruning */
-	PageSetPrunable(page);
+	PageSetPrunable(page, record->xl_xid);
 
 	/*
 	 * this test is ugly, but necessary to avoid thinking that insert change
diff --git a/src/backend/access/heap/pruneheap.c b/src/backend/access/heap/pruneheap.c
index d5496689003..5a1e82216bd 100644
--- a/src/backend/access/heap/pruneheap.c
+++ b/src/backend/access/heap/pruneheap.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.1 2007/09/20 17:56:30 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/pruneheap.c,v 1.2 2007/09/21 21:25:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -63,9 +63,10 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
 	/*
 	 * Let's see if we really need pruning.
 	 *
-	 * Forget it if page is not hinted to contain something prunable
+	 * Forget it if page is not hinted to contain something prunable that's
+	 * older than OldestXmin.
 	 */
-	if (!PageIsPrunable(dp))
+	if (!PageIsPrunable(dp, OldestXmin))
 		return;
 
 	/*
@@ -93,6 +94,8 @@ heap_page_prune_opt(Relation relation, Buffer buffer, TransactionId OldestXmin)
 		/*
 		 * Now that we have buffer lock, get accurate information about the
 		 * page's free space, and recheck the heuristic about whether to prune.
+		 * (We needn't recheck PageIsPrunable, since no one else could have
+		 * pruned while we hold pin.)
 		 */
 		if (PageIsFull(dp) || PageGetHeapFreeSpace((Page) dp) < minfree)
 		{
@@ -147,7 +150,7 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
 	START_CRIT_SECTION();
 
 	/*
-	 * Mark the page as clear of prunable tuples. If we find a tuple which
+	 * Mark the page as clear of prunable tuples.  If we find a tuple which
 	 * may soon become prunable, we shall set the hint again.  Also clear
 	 * the "page is full" flag, since there's no point in repeating the
 	 * prune/defrag process until something else happens to the page.
@@ -203,6 +206,14 @@ heap_page_prune(Relation relation, Buffer buffer, TransactionId OldestXmin,
 			PageSetLSN(BufferGetPage(buffer), recptr);
 		}
 	}
+	else
+	{
+		/*
+		 * If we didn't prune anything, we have nonetheless updated the
+		 * pd_prune_xid field; treat this as a non-WAL-logged hint.
+		 */
+		SetBufferCommitInfoNeedsSave(buffer);
+	}
 
 	END_CRIT_SECTION();
 
@@ -392,18 +403,18 @@ heap_prune_chain(Relation relation, Buffer buffer, OffsetNumber rootoffnum,
 			case HEAPTUPLE_RECENTLY_DEAD:
 				recent_dead = true;
 				/*
-				 * This tuple may soon become DEAD. Re-set the hint bit so
+				 * This tuple may soon become DEAD.  Update the hint field so
 				 * that the page is reconsidered for pruning in future.
 				 */
-				PageSetPrunable(dp);
+				PageSetPrunable(dp, HeapTupleHeaderGetXmax(htup));
 				break;
 
 			case HEAPTUPLE_DELETE_IN_PROGRESS:
 				/*
-				 * This tuple may soon become DEAD. Re-set the hint bit so
+				 * This tuple may soon become DEAD.  Update the hint field so
 				 * that the page is reconsidered for pruning in future.
 				 */
-				PageSetPrunable(dp);
+				PageSetPrunable(dp, HeapTupleHeaderGetXmax(htup));
 				break;
 
 			case HEAPTUPLE_LIVE:
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index b382e4d0240..ca5ea020747 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.74 2007/09/20 17:56:31 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.75 2007/09/21 21:25:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -40,11 +40,12 @@ PageInit(Page page, Size pageSize, Size specialSize)
 	/* Make sure all fields of page are zero, as well as unused space */
 	MemSet(p, 0, pageSize);
 
-	/* p->pd_flags = 0; 					done by above MemSet */
+	/* p->pd_flags = 0;								done by above MemSet */
 	p->pd_lower = SizeOfPageHeaderData;
 	p->pd_upper = pageSize - specialSize;
 	p->pd_special = pageSize - specialSize;
 	PageSetPageSizeAndVersion(page, pageSize, PG_PAGE_LAYOUT_VERSION);
+	/* p->pd_prune_xid = InvalidTransactionId;		done by above MemSet */
 }
 
 
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index c295aab857c..4c1b84bd62b 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.426 2007/09/20 17:56:32 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.427 2007/09/21 21:25:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200709201
+#define CATALOG_VERSION_NO	200709211
 
 #endif
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 8ca2dd8e38f..e3e84bfab56 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.74 2007/09/20 17:56:32 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.75 2007/09/21 21:25:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -48,7 +48,7 @@
  *
  * EXCEPTIONS:
  *
- * obviously, a page is not formatted before it is initialized with by
+ * obviously, a page is not formatted before it is initialized by
  * a call to PageInit.
  *
  * NOTES:
@@ -95,6 +95,7 @@ typedef uint16 LocationIndex;
  *		pd_upper	- offset to end of free space.
  *		pd_special	- offset to start of special space.
  *		pd_pagesize_version - size in bytes and page layout version number.
+ *		pd_prune_xid - oldest XID among potentially prunable tuples on page.
  *
  * The LSN is used by the buffer manager to enforce the basic rule of WAL:
  * "thou shalt write xlog before data".  A dirty buffer cannot be dumped
@@ -103,6 +104,9 @@ typedef uint16 LocationIndex;
  * purposes (it is not clear that this is actually necessary, but it seems
  * like a good idea).
  *
+ * pd_prune_xid is a hint field that helps determine whether pruning will be
+ * useful.  It is currently unused in index pages.
+ *
  * The page version number and page size are packed together into a single
  * uint16 field.  This is for historical reasons: before PostgreSQL 7.3,
  * there was no concept of a page version number, and doing it this way
@@ -128,6 +132,7 @@ typedef struct PageHeaderData
 	LocationIndex pd_upper;		/* offset to end of free space */
 	LocationIndex pd_special;	/* offset to start of special space */
 	uint16		pd_pagesize_version;
+	TransactionId pd_prune_xid;	/* oldest prunable XID, or zero if none */
 	ItemIdData	pd_linp[1];		/* beginning of line pointer array */
 } PageHeaderData;
 
@@ -141,20 +146,14 @@ typedef PageHeaderData *PageHeader;
  * pd_lower.  This should be considered a hint rather than the truth, since
  * changes to it are not WAL-logged.
  *
- * PD_PRUNABLE is set if there are any prunable tuples in the page.
- * This should be considered a hint rather than the truth, since
- * the transaction which generates a prunable tuple may or may not commit.
- * Also there is a lag before a tuple is declared dead.
- *
  * PD_PAGE_FULL is set if an UPDATE doesn't find enough free space in the
  * page for its new tuple version; this suggests that a prune is needed.
  * Again, this is just a hint.
  */
 #define PD_HAS_FREE_LINES	0x0001	/* are there any unused line pointers? */
-#define PD_PRUNABLE			0x0002	/* are there any prunable tuples? */
-#define PD_PAGE_FULL		0x0004	/* not enough free space for new tuple? */
+#define PD_PAGE_FULL		0x0002	/* not enough free space for new tuple? */
 
-#define PD_VALID_FLAG_BITS	0x0007	/* OR of all valid pd_flags bits */
+#define PD_VALID_FLAG_BITS	0x0003	/* OR of all valid pd_flags bits */
 
 /*
  * Page layout version number 0 is for pre-7.3 Postgres releases.
@@ -162,7 +161,8 @@ typedef PageHeaderData *PageHeader;
  * Release 8.0 uses 2; it changed the HeapTupleHeader layout again.
  * Release 8.1 uses 3; it redefined HeapTupleHeader infomask bits.
  * Release 8.3 uses 4; it changed the HeapTupleHeader layout again, and
- * added the pd_flags field (by stealing some bits from pd_tli).
+ *		added the pd_flags field (by stealing some bits from pd_tli),
+ *		as well as adding the pd_prune_xid field (which enlarges the header).
  */
 #define PG_PAGE_LAYOUT_VERSION		4
 
@@ -348,13 +348,6 @@ typedef PageHeaderData *PageHeader;
 #define PageClearHasFreeLinePointers(page) \
 	(((PageHeader) (page))->pd_flags &= ~PD_HAS_FREE_LINES)
 
-#define PageIsPrunable(page) \
-	(((PageHeader) (page))->pd_flags & PD_PRUNABLE)
-#define PageSetPrunable(page) \
-	(((PageHeader) (page))->pd_flags |= PD_PRUNABLE)
-#define PageClearPrunable(page) \
-	(((PageHeader) (page))->pd_flags &= ~PD_PRUNABLE)
-
 #define PageIsFull(page) \
 	(((PageHeader) (page))->pd_flags & PD_PAGE_FULL)
 #define PageSetFull(page) \
@@ -362,6 +355,22 @@ typedef PageHeaderData *PageHeader;
 #define PageClearFull(page) \
 	(((PageHeader) (page))->pd_flags &= ~PD_PAGE_FULL)
 
+#define PageIsPrunable(page, oldestxmin) \
+( \
+	AssertMacro(TransactionIdIsNormal(oldestxmin)), \
+	TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) && \
+	TransactionIdPrecedes(((PageHeader) (page))->pd_prune_xid, oldestxmin) \
+)
+#define PageSetPrunable(page, xid) \
+do { \
+	Assert(TransactionIdIsNormal(xid)); \
+	if (!TransactionIdIsValid(((PageHeader) (page))->pd_prune_xid) || \
+		TransactionIdPrecedes(xid, ((PageHeader) (page))->pd_prune_xid)) \
+		((PageHeader) (page))->pd_prune_xid = (xid); \
+} while (0)
+#define PageClearPrunable(page) \
+	(((PageHeader) (page))->pd_prune_xid = InvalidTransactionId)
+
 
 /* ----------------------------------------------------------------
  *		extern declarations
-- 
GitLab