From ed5c4e4a14e9f9f4b818521b943bace0d5cd1e01 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Mon, 16 Jul 2001 22:43:34 +0000
Subject: [PATCH] Improve documentation about reasoning behind the order of
 operations in GetSnapshotData, GetNewTransactionId, CommitTransaction,
 AbortTransaction, etc.  Correct race condition in transaction status testing
 in HeapTupleSatisfiesVacuum --- this wasn't important for old VACUUM with
 exclusive lock on its table, but it sure is important now.  All per pghackers
 discussion 7/11/01 and 7/12/01.

---
 src/backend/access/transam/varsup.c | 14 +++++--
 src/backend/access/transam/xact.c   | 54 +++++++++++++++-----------
 src/backend/storage/ipc/sinval.c    | 60 +++++++++++++++++++++++------
 src/backend/utils/time/tqual.c      | 31 +++++++++------
 4 files changed, 110 insertions(+), 49 deletions(-)

diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 2b253fc5855..857f2c3d3e8 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2000, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/varsup.c,v 1.41 2001/07/12 04:11:13 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/varsup.c,v 1.42 2001/07/16 22:43:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,8 +60,16 @@ GetNewTransactionId(TransactionId *xid)
 	 * XXX by storing xid into MyProc without acquiring SInvalLock, we are
 	 * relying on fetch/store of an xid to be atomic, else other backends
 	 * might see a partially-set xid here.  But holding both locks at once
-	 * would be a nasty concurrency hit (and at this writing, could cause a
-	 * deadlock against GetSnapshotData).  So for now, assume atomicity.
+	 * would be a nasty concurrency hit (and in fact could cause a deadlock
+	 * against GetSnapshotData).  So for now, assume atomicity.  Note that
+	 * readers of PROC xid field should be careful to fetch the value only
+	 * once, rather than assume they can read it multiple times and get the
+	 * same answer each time.
+	 *
+	 * A solution to the atomic-store problem would be to give each PROC
+	 * its own spinlock used only for fetching/storing that PROC's xid.
+	 * (SInvalLock would then mean primarily that PROCs couldn't be added/
+	 * removed while holding the lock.)
 	 */
 	if (MyProc != (PROC *) NULL)
 		MyProc->xid = *xid;
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index d32a6dda978..f35e8d9203e 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.107 2001/07/15 22:48:16 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.108 2001/07/16 22:43:33 tgl Exp $
  *
  * NOTES
  *		Transaction aborts can now occur two ways:
@@ -1018,16 +1018,21 @@ CommitTransaction(void)
 
 	CloseSequences();
 	AtEOXact_portals();
+
+	/* Here is where we really truly commit. */
 	RecordTransactionCommit();
 
 	/*
 	 * Let others know about no transaction in progress by me. Note that
-	 * this must be done _before_ releasing locks we hold and
+	 * this must be done _before_ releasing locks we hold and _after_
+	 * RecordTransactionCommit.
+	 *
 	 * SpinAcquire(SInvalLock) is required: UPDATE with xid 0 is blocked
 	 * by xid 1' UPDATE, xid 1 is doing commit while xid 2 gets snapshot -
 	 * if xid 2' GetSnapshotData sees xid 1 as running then it must see
 	 * xid 0 as running as well or it will see two tuple versions - one
-	 * deleted by xid 1 and one inserted by xid 0.
+	 * deleted by xid 1 and one inserted by xid 0.  See notes in
+	 * GetSnapshotData.
 	 */
 	if (MyProc != (PROC *) NULL)
 	{
@@ -1038,6 +1043,12 @@ CommitTransaction(void)
 		SpinRelease(SInvalLock);
 	}
 
+	/*
+	 * This is all post-commit cleanup.  Note that if an error is raised
+	 * here, it's too late to abort the transaction.  This should be just
+	 * noncritical resource releasing.
+	 */
+
 	RelationPurgeLocalRelation(true);
 	AtEOXact_temp_relations(true);
 	smgrDoPendingDeletes(true);
@@ -1080,23 +1091,6 @@ AbortTransaction(void)
 	/* Prevent cancel/die interrupt while cleaning up */
 	HOLD_INTERRUPTS();
 
-	/*
-	 * Let others to know about no transaction in progress - vadim
-	 * 11/26/96
-	 *
-	 * XXX it'd be nice to acquire SInvalLock for this, but too much risk of
-	 * lockup: what if we were holding SInvalLock when we elog'd?  Net effect
-	 * is that we are relying on fetch/store of an xid to be atomic, else
-	 * other backends might see a partially-zeroed xid here.  Would it be
-	 * safe to release spins before we reset xid/xmin?  But see also 
-	 * GetNewTransactionId, which does the same thing.
-	 */
-	if (MyProc != (PROC *) NULL)
-	{
-		MyProc->xid = InvalidTransactionId;
-		MyProc->xmin = InvalidTransactionId;
-	}
-
 	/*
 	 * Release any spinlocks or buffer context locks we might be holding
 	 * as quickly as possible.	(Real locks, however, must be held till we
@@ -1143,10 +1137,23 @@ AbortTransaction(void)
 	AtAbort_Notify();
 	CloseSequences();
 	AtEOXact_portals();
+
+	/* Advertise the fact that we aborted in pg_log. */
 	RecordTransactionAbort();
 
-	/* Count transaction abort in statistics collector */
-	pgstat_count_xact_rollback();
+	/*
+	 * Let others know about no transaction in progress by me. Note that
+	 * this must be done _before_ releasing locks we hold and _after_
+	 * RecordTransactionAbort.
+	 */
+	if (MyProc != (PROC *) NULL)
+	{
+		/* Lock SInvalLock because that's what GetSnapshotData uses. */
+		SpinAcquire(SInvalLock);
+		MyProc->xid = InvalidTransactionId;
+		MyProc->xmin = InvalidTransactionId;
+		SpinRelease(SInvalLock);
+	}
 
 	RelationPurgeLocalRelation(false);
 	AtEOXact_temp_relations(false);
@@ -1165,6 +1172,9 @@ AbortTransaction(void)
 
 	SharedBufferChanged = false;/* safest place to do it */
 
+	/* Count transaction abort in statistics collector */
+	pgstat_count_xact_rollback();
+
 	/*
 	 * State remains TRANS_ABORT until CleanupTransaction().
 	 */
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
index a9b9046702c..a93e91237f6 100644
--- a/src/backend/storage/ipc/sinval.c
+++ b/src/backend/storage/ipc/sinval.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.36 2001/07/12 04:11:13 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.37 2001/07/16 22:43:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -193,8 +193,10 @@ TransactionIdIsInProgress(TransactionId xid)
 		if (pOffset != INVALID_OFFSET)
 		{
 			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
+			/* Fetch xid just once - see GetNewTransactionId */
+			TransactionId pxid = proc->xid;
 
-			if (TransactionIdEquals(proc->xid, xid))
+			if (TransactionIdEquals(pxid, xid))
 			{
 				result = true;
 				break;
@@ -236,9 +238,9 @@ GetXmaxRecent(TransactionId *XmaxRecent)
 		if (pOffset != INVALID_OFFSET)
 		{
 			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
-			TransactionId xid;
+			/* Fetch xid just once - see GetNewTransactionId */
+			TransactionId xid = proc->xid;
 
-			xid = proc->xid;
 			if (! TransactionIdIsSpecial(xid))
 			{
 				if (TransactionIdPrecedes(xid, result))
@@ -256,8 +258,19 @@ GetXmaxRecent(TransactionId *XmaxRecent)
 	*XmaxRecent = result;
 }
 
-/*
+/*----------
  * GetSnapshotData -- returns information about running transactions.
+ *
+ * The returned snapshot includes xmin (lowest still-running xact ID),
+ * xmax (next xact ID to be assigned), and a list of running xact IDs
+ * in the range xmin <= xid < xmax.  It is used as follows:
+ *		All xact IDs < xmin are considered finished.
+ *		All xact IDs >= xmax are considered still running.
+ *		For an xact ID xmin <= xid < xmax, consult list to see whether
+ *		it is considered running or not.
+ * This ensures that the set of transactions seen as "running" by the
+ * current xact will not change after it takes the snapshot.
+ *----------
  */
 Snapshot
 GetSnapshotData(bool serializable)
@@ -287,12 +300,33 @@ GetSnapshotData(bool serializable)
 		elog(ERROR, "Memory exhausted in GetSnapshotData");
 	}
 
-	/*
-	 * Unfortunately, we have to call ReadNewTransactionId() after
-	 * acquiring SInvalLock above. It's not good because
-	 * ReadNewTransactionId() does SpinAcquire(XidGenLockId) but
-	 * _necessary_.
+	/*--------------------
+	 * Unfortunately, we have to call ReadNewTransactionId() after acquiring
+	 * SInvalLock above.  It's not good because ReadNewTransactionId() does
+	 * SpinAcquire(XidGenLockId), but *necessary*.  We need to be sure that
+	 * no transactions exit the set of currently-running transactions
+	 * between the time we fetch xmax and the time we finish building our
+	 * snapshot.  Otherwise we could have a situation like this:
+	 *
+	 *		1. Tx Old is running (in Read Committed mode).
+	 *		2. Tx S reads new transaction ID into xmax, then
+	 *		   is swapped out before acquiring SInvalLock.
+	 *		3. Tx New gets new transaction ID (>= S' xmax),
+	 *		   makes changes and commits.
+	 *		4. Tx Old changes some row R changed by Tx New and commits.
+	 *		5. Tx S finishes getting its snapshot data.  It sees Tx Old as
+	 *		   done, but sees Tx New as still running (since New >= xmax).
+	 *
+	 * Now S will see R changed by both Tx Old and Tx New, *but* does not
+	 * see other changes made by Tx New.  If S is supposed to be in
+	 * Serializable mode, this is wrong.
+	 *
+	 * By locking SInvalLock before we read xmax, we ensure that TX Old
+	 * cannot exit the set of running transactions seen by Tx S.  Therefore
+	 * both Old and New will be seen as still running => no inconsistency.
+	 *--------------------
 	 */
+
 	ReadNewTransactionId(&(snapshot->xmax));
 
 	for (index = 0; index < segP->lastBackend; index++)
@@ -302,6 +336,7 @@ GetSnapshotData(bool serializable)
 		if (pOffset != INVALID_OFFSET)
 		{
 			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
+			/* Fetch xid just once - see GetNewTransactionId */
 			TransactionId xid = proc->xid;
 
 			/*
@@ -325,11 +360,12 @@ GetSnapshotData(bool serializable)
 
 	if (serializable)
 		MyProc->xmin = snapshot->xmin;
-	/* Serializable snapshot must be computed before any other... */
-	Assert(MyProc->xmin != InvalidTransactionId);
 
 	SpinRelease(SInvalLock);
 
+	/* Serializable snapshot must be computed before any other... */
+	Assert(MyProc->xmin != InvalidTransactionId);
+
 	snapshot->xcnt = count;
 	return snapshot;
 }
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index 2dd56b6f08e..35113a36228 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/time/tqual.c,v 1.38 2001/07/12 04:11:13 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/time/tqual.c,v 1.39 2001/07/16 22:43:34 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -610,6 +610,13 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId XmaxRecent)
 	 *
 	 * If the inserting transaction aborted, then the tuple was never visible
 	 * to any other transaction, so we can delete it immediately.
+	 *
+	 * NOTE: must check TransactionIdIsInProgress (which looks in shared mem)
+	 * before TransactionIdDidCommit/TransactionIdDidAbort (which look in
+	 * pg_log).  Otherwise we have a race condition where we might decide
+	 * that a just-committed transaction crashed, because none of the tests
+	 * succeed.  xact.c is careful to record commit/abort in pg_log before
+	 * it unsets MyProc->xid in shared memory.
 	 */
 	if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED))
 	{
@@ -636,19 +643,19 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId XmaxRecent)
 			}
 			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 		}
+		else if (TransactionIdIsInProgress(tuple->t_xmin))
+			return HEAPTUPLE_INSERT_IN_PROGRESS;
+		else if (TransactionIdDidCommit(tuple->t_xmin))
+			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
 		else if (TransactionIdDidAbort(tuple->t_xmin))
 		{
 			tuple->t_infomask |= HEAP_XMIN_INVALID;
 			return HEAPTUPLE_DEAD;
 		}
-		else if (TransactionIdDidCommit(tuple->t_xmin))
-			tuple->t_infomask |= HEAP_XMIN_COMMITTED;
-		else if (TransactionIdIsInProgress(tuple->t_xmin))
-			return HEAPTUPLE_INSERT_IN_PROGRESS;
 		else
 		{
 			/*
-			 * Not Aborted, Not Committed, Not in Progress -
+			 * Not in Progress, Not Committed, Not Aborted -
 			 * so it's from crashed process. - vadim 11/26/96
 			 */
 			tuple->t_infomask |= HEAP_XMIN_INVALID;
@@ -667,19 +674,19 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId XmaxRecent)
 
 	if (!(tuple->t_infomask & HEAP_XMAX_COMMITTED))
 	{
-		if (TransactionIdDidAbort(tuple->t_xmax))
+		if (TransactionIdIsInProgress(tuple->t_xmax))
+			return HEAPTUPLE_DELETE_IN_PROGRESS;
+		else if (TransactionIdDidCommit(tuple->t_xmax))
+			tuple->t_infomask |= HEAP_XMAX_COMMITTED;
+		else if (TransactionIdDidAbort(tuple->t_xmax))
 		{
 			tuple->t_infomask |= HEAP_XMAX_INVALID;
 			return HEAPTUPLE_LIVE;
 		}
-		else if (TransactionIdDidCommit(tuple->t_xmax))
-			tuple->t_infomask |= HEAP_XMAX_COMMITTED;
-		else if (TransactionIdIsInProgress(tuple->t_xmax))
-			return HEAPTUPLE_DELETE_IN_PROGRESS;
 		else
 		{
 			/*
-			 * Not Aborted, Not Committed, Not in Progress -
+			 * Not in Progress, Not Committed, Not Aborted -
 			 * so it's from crashed process. - vadim 06/02/97
 			 */
 			tuple->t_infomask |= HEAP_XMAX_INVALID;
-- 
GitLab