From 06da3c570f21394003fc392d80f54862f7dec19f Mon Sep 17 00:00:00 2001
From: Alvaro Herrera <alvherre@alvh.no-ip.org>
Date: Mon, 20 Oct 2008 19:18:18 +0000
Subject: [PATCH] Rework subtransaction commit protocol for hot standby.

This patch eliminates the marking of subtransactions as SUBCOMMITTED in pg_clog
during their commit; instead they remain in-progress until main transaction
commit.  At main transaction commit, the commit protocol is atomic-by-page
instead of one transaction at a time.  To avoid a race condition with some
subtransactions appearing committed before others in the case where they span
more than one pg_clog page, we conserve the logic that marks them subcommitted
before marking the parent committed.

Simon Riggs with minor help from me
---
 src/backend/access/transam/README     |  21 ++-
 src/backend/access/transam/clog.c     | 229 ++++++++++++++++++++++++--
 src/backend/access/transam/transam.c  | 144 +++-------------
 src/backend/access/transam/twophase.c |   9 +-
 src/backend/access/transam/xact.c     |  72 ++------
 src/include/access/clog.h             |   5 +-
 src/include/access/transam.h          |  11 +-
 7 files changed, 279 insertions(+), 212 deletions(-)

diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 2288102d40c..2edac9d088f 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.11 2008/03/21 13:23:28 momjian Exp $
+$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.12 2008/10/20 19:18:18 alvherre Exp $
 
 The Transaction System
 ======================
@@ -341,11 +341,20 @@ from disk.  They also allow information to be permanent across server restarts.
 pg_clog records the commit status for each transaction that has been assigned
 an XID.  A transaction can be in progress, committed, aborted, or
 "sub-committed".  This last state means that it's a subtransaction that's no
-longer running, but its parent has not updated its state yet (either it is
-still running, or the backend crashed without updating its status).  A
-sub-committed transaction's status will be updated again to the final value as
-soon as the parent commits or aborts, or when the parent is detected to be
-aborted.
+longer running, but its parent has not updated its state yet.  It is not
+necessary to update a subtransaction's transaction status to subcommit, so we
+can just defer it until main transaction commit.  The main role of marking
+transactions as sub-committed is to provide an atomic commit protocol when
+transaction status is spread across multiple clog pages. As a result, whenever
+transaction status spreads across multiple pages we must use a two-phase commit
+protocol: the first phase is to mark the subtransactions as sub-committed, then
+we mark the top level transaction and all its subtransactions committed (in
+that order).  Thus, subtransactions that have not aborted appear as in-progress
+even when they have already finished, and the subcommit status appears as a
+very short transitory state during main transaction commit.  Subtransaction
+abort is always marked in clog as soon as it occurs.  When the transaction
+status all fit in a single CLOG page, we atomically mark them all as committed
+without bothering with the intermediate sub-commit state.
 
 Savepoints are implemented using subtransactions.  A subtransaction is a
 transaction inside a transaction; its commit or abort status is not only
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index f30ef3a2262..fcff3ea3cfd 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -26,7 +26,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.47 2008/08/01 13:16:08 alvherre Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.48 2008/10/20 19:18:18 alvherre Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -80,32 +80,182 @@ static int	ZeroCLOGPage(int pageno, bool writeXlog);
 static bool CLOGPagePrecedes(int page1, int page2);
 static void WriteZeroPageXlogRec(int pageno);
 static void WriteTruncateXlogRec(int pageno);
+static void TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
+					   	   TransactionId *subxids, XidStatus status,
+						   XLogRecPtr lsn, int pageno);
+static void TransactionIdSetStatusBit(TransactionId xid, XidStatus status,
+						  XLogRecPtr lsn, int slotno);
+static void set_status_by_pages(int nsubxids, TransactionId *subxids,
+					XidStatus status, XLogRecPtr lsn);
 
 
 /*
- * Record the final state of a transaction in the commit log.
+ * TransactionIdSetTreeStatus
+ *
+ * Record the final state of transaction entries in the commit log for
+ * a transaction and its subtransaction tree. Take care to ensure this is
+ * efficient, and as atomic as possible.
+ *
+ * xid is a single xid to set status for. This will typically be
+ * the top level transactionid for a top level commit or abort. It can
+ * also be a subtransaction when we record transaction aborts.
+ *
+ * subxids is an array of xids of length nsubxids, representing subtransactions
+ * in the tree of xid. In various cases nsubxids may be zero.
  *
  * lsn must be the WAL location of the commit record when recording an async
  * commit.	For a synchronous commit it can be InvalidXLogRecPtr, since the
  * caller guarantees the commit record is already flushed in that case.  It
  * should be InvalidXLogRecPtr for abort cases, too.
  *
+ * In the commit case, atomicity is limited by whether all the subxids are in
+ * the same CLOG page as xid.  If they all are, then the lock will be grabbed
+ * only once, and the status will be set to committed directly.  Otherwise
+ * we must
+ *   1. set sub-committed all subxids that are not on the same page as the
+ *      main xid
+ *   2. atomically set committed the main xid and the subxids on the same page
+ *   3. go over the first bunch again and set them committed
+ * Note that as far as concurrent checkers are concerned, main transaction
+ * commit as a whole is still atomic.
+ *
+ * Example:
+ *		TransactionId t commits and has subxids t1, t2, t3, t4
+ *		t is on page p1, t1 is also on p1, t2 and t3 are on p2, t4 is on p3
+ *		1. update pages2-3:
+ *					page2: set t2,t3 as sub-committed
+ *					page3: set t4 as sub-committed
+ *		2. update page1:
+ *					set t1 as sub-committed, 
+ *					then set t as committed,
+					then set t1 as committed
+ *		3. update pages2-3:
+ *					page2: set t2,t3 as committed
+ *					page3: set t4 as committed
+ * 
  * NB: this is a low-level routine and is NOT the preferred entry point
- * for most uses; TransactionLogUpdate() in transam.c is the intended caller.
+ * for most uses; functions in transam.c are the intended callers.
+ *
+ * XXX Think about issuing FADVISE_WILLNEED on pages that we will need,
+ * but aren't yet in cache, as well as hinting pages not to fall out of
+ * cache yet.
  */
 void
-TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
+TransactionIdSetTreeStatus(TransactionId xid, int nsubxids,
+				TransactionId *subxids, XidStatus status, XLogRecPtr lsn)
+{
+	int		pageno = TransactionIdToPage(xid); /* get page of parent */
+	int 	i;
+
+	Assert(status == TRANSACTION_STATUS_COMMITTED ||
+		   status == TRANSACTION_STATUS_ABORTED);
+
+	/*
+	 * See how many subxids, if any, are on the same page as the parent, if any.
+	 */
+	for (i = 0; i < nsubxids; i++)
+	{
+		if (TransactionIdToPage(subxids[i]) != pageno)
+			break;
+	}
+
+	/*
+	 * Do all items fit on a single page?
+	 */
+	if (i == nsubxids)
+	{
+		/*
+		 * Set the parent and all subtransactions in a single call
+		 */
+		TransactionIdSetPageStatus(xid, nsubxids, subxids, status, lsn,
+								   pageno);
+	}
+	else
+	{
+		int		nsubxids_on_first_page = i;
+
+		/*
+		 * If this is a commit then we care about doing this correctly (i.e.
+		 * using the subcommitted intermediate status).  By here, we know we're
+		 * updating more than one page of clog, so we must mark entries that
+		 * are *not* on the first page so that they show as subcommitted before
+		 * we then return to update the status to fully committed.
+		 *
+		 * To avoid touching the first page twice, skip marking subcommitted
+		 * for the subxids on that first page.
+		 */
+		if (status == TRANSACTION_STATUS_COMMITTED)
+			set_status_by_pages(nsubxids - nsubxids_on_first_page,
+								subxids + nsubxids_on_first_page,
+								TRANSACTION_STATUS_SUB_COMMITTED, lsn);
+
+		/*
+		 * Now set the parent and subtransactions on same page as the parent,
+		 * if any
+		 */
+		pageno = TransactionIdToPage(xid);
+		TransactionIdSetPageStatus(xid, nsubxids_on_first_page, subxids, status,
+								   lsn, pageno);
+
+		/*
+		 * Now work through the rest of the subxids one clog page at a time,
+		 * starting from the second page onwards, like we did above.
+		 */
+		set_status_by_pages(nsubxids - nsubxids_on_first_page,
+							subxids + nsubxids_on_first_page,
+							status, lsn);
+	}
+}
+
+/*
+ * Helper for TransactionIdSetTreeStatus: set the status for a bunch of
+ * transactions, chunking in the separate CLOG pages involved. We never
+ * pass the whole transaction tree to this function, only subtransactions
+ * that are on different pages to the top level transaction id.
+ */
+static void
+set_status_by_pages(int nsubxids, TransactionId *subxids,
+					XidStatus status, XLogRecPtr lsn)
+{
+	int		pageno = TransactionIdToPage(subxids[0]);
+	int		offset = 0;
+	int		i = 0;
+
+	while (i < nsubxids)
+	{
+		int		num_on_page = 0;
+
+		while (TransactionIdToPage(subxids[i]) == pageno && i < nsubxids)
+		{
+			num_on_page++;
+			i++;
+		}
+
+		TransactionIdSetPageStatus(InvalidTransactionId,
+								   num_on_page, subxids + offset,
+								   status, lsn, pageno);
+		offset = i;
+		pageno = TransactionIdToPage(subxids[offset]);
+	}
+}
+
+/*
+ * Record the final state of transaction entries in the commit log for
+ * all entries on a single page.  Atomic only on this page.
+ *
+ * Otherwise API is same as TransactionIdSetTreeStatus()
+ */
+static void
+TransactionIdSetPageStatus(TransactionId xid, int nsubxids,
+						   TransactionId *subxids, XidStatus status,
+						   XLogRecPtr lsn, int pageno)
 {
-	int			pageno = TransactionIdToPage(xid);
-	int			byteno = TransactionIdToByte(xid);
-	int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
 	int			slotno;
-	char	   *byteptr;
-	char		byteval;
+	int 		i;
 
 	Assert(status == TRANSACTION_STATUS_COMMITTED ||
 		   status == TRANSACTION_STATUS_ABORTED ||
-		   status == TRANSACTION_STATUS_SUB_COMMITTED);
+		   (status == TRANSACTION_STATUS_SUB_COMMITTED && !TransactionIdIsValid(xid)));
 
 	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
@@ -116,9 +266,62 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
 	 * mustn't let it reach disk until we've done the appropriate WAL flush.
 	 * But when lsn is invalid, it's OK to scribble on a page while it is
 	 * write-busy, since we don't care if the update reaches disk sooner than
-	 * we think.  Hence, pass write_ok = XLogRecPtrIsInvalid(lsn).
+	 * we think.
 	 */
 	slotno = SimpleLruReadPage(ClogCtl, pageno, XLogRecPtrIsInvalid(lsn), xid);
+
+	/*
+	 * Set the main transaction id, if any.
+	 *
+	 * If we update more than one xid on this page while it is being written
+	 * out, we might find that some of the bits go to disk and others don't.
+	 * If we are updating commits on the page with the top-level xid that could
+	 * break atomicity, so we subcommit the subxids first before we mark the
+	 * top-level commit.
+	 */
+	if (TransactionIdIsValid(xid))
+	{
+		/* Subtransactions first, if needed ... */
+		if (status == TRANSACTION_STATUS_COMMITTED)
+		{
+			for (i = 0; i < nsubxids; i++)
+			{
+				Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+				TransactionIdSetStatusBit(subxids[i],
+										  TRANSACTION_STATUS_SUB_COMMITTED,
+										  lsn, slotno);
+			}
+		}
+
+		/* ... then the main transaction */
+		TransactionIdSetStatusBit(xid, status, lsn, slotno);
+	}
+
+	/* Set the subtransactions */
+	for (i = 0; i < nsubxids; i++)
+	{
+		Assert(ClogCtl->shared->page_number[slotno] == TransactionIdToPage(subxids[i]));
+		TransactionIdSetStatusBit(subxids[i], status, lsn, slotno);
+	}
+
+	ClogCtl->shared->page_dirty[slotno] = true;
+
+	LWLockRelease(CLogControlLock);
+}
+
+/*
+ * Sets the commit status of a single transaction.
+ *
+ * Must be called with CLogControlLock held
+ */
+static void
+TransactionIdSetStatusBit(TransactionId xid, XidStatus status, XLogRecPtr lsn, int slotno)
+{
+	int			byteno = TransactionIdToByte(xid);
+	int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
+	char	   *byteptr;
+	char		byteval;
+
 	byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
 
 	/* Current state should be 0, subcommitted or target state */
@@ -132,8 +335,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
 	byteval |= (status << bshift);
 	*byteptr = byteval;
 
-	ClogCtl->shared->page_dirty[slotno] = true;
-
 	/*
 	 * Update the group LSN if the transaction completion LSN is higher.
 	 *
@@ -149,8 +350,6 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn)
 		if (XLByteLT(ClogCtl->shared->group_lsn[lsnindex], lsn))
 			ClogCtl->shared->group_lsn[lsnindex] = lsn;
 	}
-
-	LWLockRelease(CLogControlLock);
 }
 
 /*
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index 6fbaf7158c2..0e74dd446e7 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.76 2008/03/26 18:48:59 alvherre Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.77 2008/10/20 19:18:18 alvherre Exp $
  *
  * NOTES
  *	  This file contains the high level access-method interface to the
@@ -40,15 +40,12 @@ static const XLogRecPtr InvalidXLogRecPtr = {0, 0};
 
 /* Local functions */
 static XidStatus TransactionLogFetch(TransactionId transactionId);
-static void TransactionLogUpdate(TransactionId transactionId,
-					 XidStatus status, XLogRecPtr lsn);
 
 
 /* ----------------------------------------------------------------
  *		Postgres log access method interface
  *
  *		TransactionLogFetch
- *		TransactionLogUpdate
  * ----------------------------------------------------------------
  */
 
@@ -100,41 +97,6 @@ TransactionLogFetch(TransactionId transactionId)
 	return xidstatus;
 }
 
-/*
- *		TransactionLogUpdate
- *
- * Store the new status of a transaction.  The commit record LSN must be
- * passed when recording an async commit; else it should be InvalidXLogRecPtr.
- */
-static inline void
-TransactionLogUpdate(TransactionId transactionId,
-					 XidStatus status, XLogRecPtr lsn)
-{
-	/*
-	 * update the commit log
-	 */
-	TransactionIdSetStatus(transactionId, status, lsn);
-}
-
-/*
- * TransactionLogMultiUpdate
- *
- * Update multiple transaction identifiers to a given status.
- * Don't depend on this being atomic; it's not.
- */
-static inline void
-TransactionLogMultiUpdate(int nxids, TransactionId *xids,
-						  XidStatus status, XLogRecPtr lsn)
-{
-	int			i;
-
-	Assert(nxids != 0);
-
-	for (i = 0; i < nxids; i++)
-		TransactionIdSetStatus(xids[i], status, lsn);
-}
-
-
 /* ----------------------------------------------------------------
  *						Interface functions
  *
@@ -144,11 +106,12 @@ TransactionLogMultiUpdate(int nxids, TransactionId *xids,
  *		   these functions test the transaction status of
  *		   a specified transaction id.
  *
- *		TransactionIdCommit
- *		TransactionIdAbort
+ *		TransactionIdCommitTree
+ *		TransactionIdAsyncCommitTree
+ *		TransactionIdAbortTree
  *		========
- *		   these functions set the transaction status
- *		   of the specified xid.
+ *		   these functions set the transaction status of the specified
+ *		   transaction tree.
  *
  * See also TransactionIdIsInProgress, which once was in this module
  * but now lives in procarray.c.
@@ -287,76 +250,22 @@ TransactionIdIsKnownCompleted(TransactionId transactionId)
 	return false;
 }
 
-
-/*
- * TransactionIdCommit
- *		Commits the transaction associated with the identifier.
- *
- * Note:
- *		Assumes transaction identifier is valid.
- */
-void
-TransactionIdCommit(TransactionId transactionId)
-{
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED,
-						 InvalidXLogRecPtr);
-}
-
-/*
- * TransactionIdAsyncCommit
- *		Same as above, but for async commits.  The commit record LSN is needed.
- */
-void
-TransactionIdAsyncCommit(TransactionId transactionId, XLogRecPtr lsn)
-{
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_COMMITTED, lsn);
-}
-
-/*
- * TransactionIdAbort
- *		Aborts the transaction associated with the identifier.
- *
- * Note:
- *		Assumes transaction identifier is valid.
- *		No async version of this is needed.
- */
-void
-TransactionIdAbort(TransactionId transactionId)
-{
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_ABORTED,
-						 InvalidXLogRecPtr);
-}
-
-/*
- * TransactionIdSubCommit
- *		Marks the subtransaction associated with the identifier as
- *		sub-committed.
- *
- * Note:
- *		No async version of this is needed.
- */
-void
-TransactionIdSubCommit(TransactionId transactionId)
-{
-	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_SUB_COMMITTED,
-						 InvalidXLogRecPtr);
-}
-
 /*
  * TransactionIdCommitTree
- *		Marks all the given transaction ids as committed.
+ *		Marks the given transaction and children as committed
  *
- * The caller has to be sure that this is used only to mark subcommitted
- * subtransactions as committed, and only *after* marking the toplevel
- * parent as committed.  Otherwise there is a race condition against
- * TransactionIdDidCommit.
+ * "xid" is a toplevel transaction commit, and the xids array contains its
+ * committed subtransactions.
+ *
+ * This commit operation is not guaranteed to be atomic, but if not, subxids
+ * are correctly marked subcommit first.
  */
 void
-TransactionIdCommitTree(int nxids, TransactionId *xids)
+TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids)
 {
-	if (nxids > 0)
-		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED,
-								  InvalidXLogRecPtr);
+	return TransactionIdSetTreeStatus(xid, nxids, xids,
+									  TRANSACTION_STATUS_COMMITTED,
+									  InvalidXLogRecPtr);
 }
 
 /*
@@ -364,29 +273,30 @@ TransactionIdCommitTree(int nxids, TransactionId *xids)
  *		Same as above, but for async commits.  The commit record LSN is needed.
  */
 void
-TransactionIdAsyncCommitTree(int nxids, TransactionId *xids, XLogRecPtr lsn)
+TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids,
+							 XLogRecPtr lsn)
 {
-	if (nxids > 0)
-		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED,
-								  lsn);
+	return TransactionIdSetTreeStatus(xid, nxids, xids,
+									  TRANSACTION_STATUS_COMMITTED, lsn);
 }
 
 /*
  * TransactionIdAbortTree
- *		Marks all the given transaction ids as aborted.
+ *		Marks the given transaction and children as aborted.
+ *
+ * "xid" is a toplevel transaction commit, and the xids array contains its
+ * committed subtransactions.
  *
  * We don't need to worry about the non-atomic behavior, since any onlookers
  * will consider all the xacts as not-yet-committed anyway.
  */
 void
-TransactionIdAbortTree(int nxids, TransactionId *xids)
+TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids)
 {
-	if (nxids > 0)
-		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_ABORTED,
-								  InvalidXLogRecPtr);
+	TransactionIdSetTreeStatus(xid, nxids, xids,
+							   TRANSACTION_STATUS_ABORTED, InvalidXLogRecPtr);
 }
 
-
 /*
  * TransactionIdPrecedes --- is id1 logically < id2?
  */
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index b86bdc36677..d24ea6b2a10 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.45 2008/08/11 11:05:10 heikki Exp $
+ *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.46 2008/10/20 19:18:18 alvherre Exp $
  *
  * NOTES
  *		Each global transaction is associated with a global transaction
@@ -1745,9 +1745,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	XLogFlush(recptr);
 
 	/* Mark the transaction committed in pg_clog */
-	TransactionIdCommit(xid);
-	/* to avoid race conditions, the parent must commit first */
-	TransactionIdCommitTree(nchildren, children);
+	TransactionIdCommitTree(xid, nchildren, children);
 
 	/* Checkpoint can proceed now */
 	MyProc->inCommit = false;
@@ -1822,8 +1820,7 @@ RecordTransactionAbortPrepared(TransactionId xid,
 	 * Mark the transaction aborted in clog.  This is not absolutely necessary
 	 * but we may as well do it while we are here.
 	 */
-	TransactionIdAbort(xid);
-	TransactionIdAbortTree(nchildren, children);
+	TransactionIdAbortTree(xid, nchildren, children);
 
 	END_CRIT_SECTION();
 }
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 5f6c9df677a..49737dfb8b4 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.265 2008/08/11 11:05:10 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.266 2008/10/20 19:18:18 alvherre Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -254,7 +254,6 @@ static void CommitTransaction(void);
 static TransactionId RecordTransactionAbort(bool isSubXact);
 static void StartTransaction(void);
 
-static void RecordSubTransactionCommit(void);
 static void StartSubTransaction(void);
 static void CommitSubTransaction(void);
 static void AbortSubTransaction(void);
@@ -952,11 +951,7 @@ RecordTransactionCommit(void)
 		 * Now we may update the CLOG, if we wrote a COMMIT record above
 		 */
 		if (markXidCommitted)
-		{
-			TransactionIdCommit(xid);
-			/* to avoid race conditions, the parent must commit first */
-			TransactionIdCommitTree(nchildren, children);
-		}
+			TransactionIdCommitTree(xid, nchildren, children);
 	}
 	else
 	{
@@ -974,11 +969,7 @@ RecordTransactionCommit(void)
 		 * flushed before the CLOG may be updated.
 		 */
 		if (markXidCommitted)
-		{
-			TransactionIdAsyncCommit(xid, XactLastRecEnd);
-			/* to avoid race conditions, the parent must commit first */
-			TransactionIdAsyncCommitTree(nchildren, children, XactLastRecEnd);
-		}
+			TransactionIdAsyncCommitTree(xid, nchildren, children, XactLastRecEnd);
 	}
 
 	/*
@@ -1156,36 +1147,6 @@ AtSubCommit_childXids(void)
 	s->maxChildXids = 0;
 }
 
-/*
- * RecordSubTransactionCommit
- */
-static void
-RecordSubTransactionCommit(void)
-{
-	TransactionId xid = GetCurrentTransactionIdIfAny();
-
-	/*
-	 * We do not log the subcommit in XLOG; it doesn't matter until the
-	 * top-level transaction commits.
-	 *
-	 * We must mark the subtransaction subcommitted in the CLOG if it had a
-	 * valid XID assigned.	If it did not, nobody else will ever know about
-	 * the existence of this subxact.  We don't have to deal with deletions
-	 * scheduled for on-commit here, since they'll be reassigned to our parent
-	 * (who might still abort).
-	 */
-	if (TransactionIdIsValid(xid))
-	{
-		/* XXX does this really need to be a critical section? */
-		START_CRIT_SECTION();
-
-		/* Record subtransaction subcommit */
-		TransactionIdSubCommit(xid);
-
-		END_CRIT_SECTION();
-	}
-}
-
 /* ----------------------------------------------------------------
  *						AbortTransaction stuff
  * ----------------------------------------------------------------
@@ -1288,14 +1249,8 @@ RecordTransactionAbort(bool isSubXact)
 	 * waiting for already-aborted subtransactions.  It is OK to do it without
 	 * having flushed the ABORT record to disk, because in event of a crash
 	 * we'd be assumed to have aborted anyway.
-	 *
-	 * The ordering here isn't critical but it seems best to mark the parent
-	 * first.  This assures an atomic transition of all the subtransactions to
-	 * aborted state from the point of view of concurrent
-	 * TransactionIdDidAbort calls.
 	 */
-	TransactionIdAbort(xid);
-	TransactionIdAbortTree(nchildren, children);
+	TransactionIdAbortTree(xid, nchildren, children);
 
 	END_CRIT_SECTION();
 
@@ -3791,8 +3746,11 @@ CommitSubTransaction(void)
 	/* Must CCI to ensure commands of subtransaction are seen as done */
 	CommandCounterIncrement();
 
-	/* Mark subtransaction as subcommitted */
-	RecordSubTransactionCommit();
+	/* 
+	 * Prior to 8.4 we marked subcommit in clog at this point.  We now only
+	 * perform that step, if required, as part of the atomic update of the
+	 * whole transaction tree at top level commit or abort.
+	 */
 
 	/* Post-commit cleanup */
 	if (TransactionIdIsValid(s->transactionId))
@@ -4259,11 +4217,9 @@ xact_redo_commit(xl_xact_commit *xlrec, TransactionId xid)
 	TransactionId max_xid;
 	int			i;
 
-	TransactionIdCommit(xid);
-
-	/* Mark committed subtransactions as committed */
+	/* Mark the transaction committed in pg_clog */
 	sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
-	TransactionIdCommitTree(xlrec->nsubxacts, sub_xids);
+	TransactionIdCommitTree(xid, xlrec->nsubxacts, sub_xids);
 
 	/* Make sure nextXid is beyond any XID mentioned in the record */
 	max_xid = xid;
@@ -4299,11 +4255,9 @@ xact_redo_abort(xl_xact_abort *xlrec, TransactionId xid)
 	TransactionId max_xid;
 	int			i;
 
-	TransactionIdAbort(xid);
-
-	/* Mark subtransactions as aborted */
+	/* Mark the transaction aborted in pg_clog */
 	sub_xids = (TransactionId *) &(xlrec->xnodes[xlrec->nrels]);
-	TransactionIdAbortTree(xlrec->nsubxacts, sub_xids);
+	TransactionIdAbortTree(xid, xlrec->nsubxacts, sub_xids);
 
 	/* Make sure nextXid is beyond any XID mentioned in the record */
 	max_xid = xid;
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 8b640a39d6d..ac74c471320 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.21 2008/01/01 19:45:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.22 2008/10/20 19:18:18 alvherre Exp $
  */
 #ifndef CLOG_H
 #define CLOG_H
@@ -32,7 +32,8 @@ typedef int XidStatus;
 #define NUM_CLOG_BUFFERS	8
 
 
-extern void TransactionIdSetStatus(TransactionId xid, XidStatus status, XLogRecPtr lsn);
+extern void TransactionIdSetTreeStatus(TransactionId xid, int nsubxids, 
+					TransactionId *subxids, XidStatus status, XLogRecPtr lsn);
 extern XidStatus TransactionIdGetStatus(TransactionId xid, XLogRecPtr *lsn);
 
 extern Size CLOGShmemSize(void);
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 232c59cad3c..6b37723cb9b 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.65 2008/03/11 20:20:35 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.66 2008/10/20 19:18:18 alvherre Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -139,13 +139,10 @@ extern VariableCache ShmemVariableCache;
 extern bool TransactionIdDidCommit(TransactionId transactionId);
 extern bool TransactionIdDidAbort(TransactionId transactionId);
 extern bool TransactionIdIsKnownCompleted(TransactionId transactionId);
-extern void TransactionIdCommit(TransactionId transactionId);
-extern void TransactionIdAsyncCommit(TransactionId transactionId, XLogRecPtr lsn);
 extern void TransactionIdAbort(TransactionId transactionId);
-extern void TransactionIdSubCommit(TransactionId transactionId);
-extern void TransactionIdCommitTree(int nxids, TransactionId *xids);
-extern void TransactionIdAsyncCommitTree(int nxids, TransactionId *xids, XLogRecPtr lsn);
-extern void TransactionIdAbortTree(int nxids, TransactionId *xids);
+extern void TransactionIdCommitTree(TransactionId xid, int nxids, TransactionId *xids);
+extern void TransactionIdAsyncCommitTree(TransactionId xid, int nxids, TransactionId *xids, XLogRecPtr lsn);
+extern void TransactionIdAbortTree(TransactionId xid, int nxids, TransactionId *xids);
 extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2);
 extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2);
 extern bool TransactionIdFollows(TransactionId id1, TransactionId id2);
-- 
GitLab