From 295e63983d7596ccc5717ff4a0a235ba241a2614 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 5 Sep 2007 18:10:48 +0000
Subject: [PATCH] Implement lazy XID allocation: transactions that do not
 modify any database rows will normally never obtain an XID at all.  We
 already did things this way for subtransactions, but this patch extends the
 concept to top-level transactions.  In applications where there are lots of
 short read-only transactions, this should improve performance noticeably; not
 so much from removal of the actual XID-assignments, as from reduction of
 overhead that's driven by the rate of XID consumption.  We add a concept of a
 "virtual transaction ID" so that active transactions can be uniquely
 identified even if they don't have a regular XID.  This is a much
 lighter-weight concept: uniqueness of VXIDs is only guaranteed over the short
 term, and no on-disk record is made about them.

Florian Pflug, with some editorialization by Tom.
---
 doc/src/sgml/catalogs.sgml                    |  37 +-
 doc/src/sgml/config.sgml                      |  11 +-
 src/backend/access/heap/heapam.c              |  24 +-
 src/backend/access/transam/README             |  36 +-
 src/backend/access/transam/clog.c             |  13 +-
 src/backend/access/transam/multixact.c        |   7 +-
 src/backend/access/transam/twophase.c         |  16 +-
 src/backend/access/transam/xact.c             | 769 +++++++++---------
 src/backend/access/transam/xlog.c             |  48 +-
 src/backend/catalog/system_views.sql          |   6 +-
 src/backend/commands/indexcmds.c              |  57 +-
 src/backend/commands/sequence.c               |  21 +-
 src/backend/commands/vacuum.c                 |  29 +-
 src/backend/commands/vacuumlazy.c             |   7 +-
 src/backend/storage/ipc/procarray.c           | 189 +++--
 src/backend/storage/ipc/sinvaladt.c           |  54 +-
 src/backend/storage/lmgr/lmgr.c               |  77 +-
 src/backend/storage/lmgr/lock.c               |  66 +-
 src/backend/storage/lmgr/proc.c               |  12 +-
 src/backend/storage/smgr/smgr.c               |  26 +-
 src/backend/utils/adt/lockfuncs.c             |  86 +-
 src/backend/utils/error/elog.c                |  21 +-
 src/backend/utils/misc/postgresql.conf.sample |   3 +-
 src/include/access/xact.h                     |   3 +-
 src/include/access/xlog.h                     |  13 +-
 src/include/c.h                               |   4 +-
 src/include/catalog/catversion.h              |   4 +-
 src/include/storage/lmgr.h                    |   7 +-
 src/include/storage/lock.h                    |  47 +-
 src/include/storage/proc.h                    |  12 +-
 src/include/storage/procarray.h               |   3 +-
 src/include/storage/sinvaladt.h               |  11 +-
 src/include/storage/smgr.h                    |   5 +-
 src/test/regress/expected/rules.out           |   2 +-
 34 files changed, 987 insertions(+), 739 deletions(-)

diff --git a/doc/src/sgml/catalogs.sgml b/doc/src/sgml/catalogs.sgml
index 525d24feace..68ff0921481 100644
--- a/doc/src/sgml/catalogs.sgml
+++ b/doc/src/sgml/catalogs.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.156 2007/09/03 00:39:11 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.157 2007/09/05 18:10:47 tgl Exp $ -->
 <!--
  Documentation of the system catalogs, directed toward PostgreSQL developers
  -->
@@ -5147,7 +5147,7 @@
    There are several distinct types of lockable objects:
    whole relations (e.g., tables), individual pages of relations,
    individual tuples of relations,
-   transaction IDs,
+   transaction IDs (both virtual and permanent IDs),
    and general database objects (identified by class OID and object OID,
    in the same way as in <structname>pg_description</structname> or
    <structname>pg_depend</structname>).  Also, the right to extend a
@@ -5178,6 +5178,7 @@
        <literal>page</>,
        <literal>tuple</>,
        <literal>transactionid</>,
+       <literal>virtualxid</>,
        <literal>object</>,
        <literal>userlock</>, or
        <literal>advisory</>
@@ -5219,6 +5220,15 @@
        Tuple number within the page, or NULL if the object is not a tuple
       </entry>
      </row>
+     <row>
+      <entry><structfield>virtualxid</structfield></entry>
+      <entry><type>text</type></entry>
+      <entry></entry>
+      <entry>
+       Virtual ID of a transaction, or NULL if the object is not a
+       virtual transaction ID
+      </entry>
+     </row>
      <row>
       <entry><structfield>transactionid</structfield></entry>
       <entry><type>xid</type></entry>
@@ -5257,11 +5267,11 @@
       </entry>
      </row>
      <row>
-      <entry><structfield>transaction</structfield></entry>
-      <entry><type>xid</type></entry>
+      <entry><structfield>virtualtransaction</structfield></entry>
+      <entry><type>text</type></entry>
       <entry></entry>
       <entry>
-       ID of the transaction that is holding or awaiting this lock
+       Virtual ID of the transaction that is holding or awaiting this lock
       </entry>
      </row>
      <row>
@@ -5301,10 +5311,14 @@
   </para>
 
   <para>
-   Every transaction holds an exclusive lock on its transaction ID for its
-   entire duration. If one transaction finds it necessary to wait specifically
+   Every transaction holds an exclusive lock on its virtual transaction ID for
+   its entire duration.  If a permanent ID is assigned to the transaction
+   (which normally happens only if the transaction changes the state of the
+   database), it also holds an exclusive lock on its permanent transaction ID
+   until it ends.  When one transaction finds it necessary to wait specifically
    for another transaction, it does so by attempting to acquire share lock on
-   the other transaction ID. That will succeed only when the other transaction
+   the other transaction ID (either virtual or permanent ID depending on the
+   situation). That will succeed only when the other transaction
    terminates and releases its locks. 
   </para>
 
@@ -5314,7 +5328,7 @@
    and therefore row-level locks normally do not appear in this view.
    If a transaction is waiting for a
    row-level lock, it will usually appear in the view as waiting for the
-   transaction ID of the current holder of that row lock.
+   permanent transaction ID of the current holder of that row lock.
   </para>
 
   <para>
@@ -5350,11 +5364,10 @@
   </para>
 
   <para>
-   If you have enabled the statistics collector, the
-   <structfield>pid</structfield> column can be joined to the
+   The <structfield>pid</structfield> column can be joined to the
    <structfield>procpid</structfield> column of the
    <structname>pg_stat_activity</structname> view to get more
-   information on the session holding or waiting to hold the lock.
+   information on the session holding or waiting to hold each lock.
    Also, if you are using prepared transactions, the
    <structfield>transaction</> column can be joined to the
    <structfield>transaction</structfield> column of the
diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 5ef230a4fe3..d770c4606f2 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.141 2007/08/22 04:45:20 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.142 2007/09/05 18:10:47 tgl Exp $ -->
 
 <chapter Id="runtime-config">
   <title>Server Configuration</title>
@@ -2939,10 +2939,15 @@ SELECT * FROM parent WHERE key = 2400;
              <entry>Process start time stamp</entry>
              <entry>no</entry>
             </row>
+            <row>
+             <entry><literal>%v</literal></entry>
+             <entry>Virtual transaction ID (backendID/localXID)</entry>
+             <entry>no</entry>
+            </row>
             <row>
              <entry><literal>%x</literal></entry>
-             <entry>Transaction ID</entry>
-             <entry>yes</entry>
+             <entry>Transaction ID (0 if none is assigned)</entry>
+             <entry>no</entry>
             </row>
             <row>
              <entry><literal>%q</literal></entry>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 3f44bd7d948..3f23378b8fa 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.237 2007/08/14 17:35:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.238 2007/09/05 18:10:47 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1632,12 +1632,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid,
 	MarkBufferDirty(buffer);
 
 	/* XLOG stuff */
-	if (relation->rd_istemp)
-	{
-		/* No XLOG record, but still need to flag that XID exists on disk */
-		MyXactMadeTempRelUpdate = true;
-	}
-	else if (use_wal)
+	if (use_wal && !relation->rd_istemp)
 	{
 		xl_heap_insert xlrec;
 		xl_heap_header xlhdr;
@@ -1947,11 +1942,6 @@ l1:
 		PageSetLSN(dp, recptr);
 		PageSetTLI(dp, ThisTimeLineID);
 	}
-	else
-	{
-		/* No XLOG record, but still need to flag that XID exists on disk */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 
@@ -2403,11 +2393,6 @@ l2:
 		PageSetLSN(BufferGetPage(buffer), recptr);
 		PageSetTLI(BufferGetPage(buffer), ThisTimeLineID);
 	}
-	else
-	{
-		/* No XLOG record, but still need to flag that XID exists on disk */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 
@@ -2924,11 +2909,6 @@ l3:
 		PageSetLSN(dp, recptr);
 		PageSetTLI(dp, ThisTimeLineID);
 	}
-	else
-	{
-		/* No XLOG record, but still need to flag that XID exists on disk */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index 6e7e132acab..87b40591702 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.6 2007/08/01 22:45:07 tgl Exp $
+$PostgreSQL: pgsql/src/backend/access/transam/README,v 1.7 2007/09/05 18:10:47 tgl Exp $
 
 The Transaction System
 ----------------------
@@ -187,16 +187,29 @@ Another difference is that BeginInternalSubtransaction is allowed when no
 explicit transaction block has been established, while DefineSavepoint is not.
 
 
-Subtransaction numbering
-------------------------
+Transaction and subtransaction numbering
+----------------------------------------
 
-A top-level transaction is always given a TransactionId (XID) as soon as it is
-created.  This is necessary for a number of reasons, notably XMIN bookkeeping
-for VACUUM.  However, a subtransaction doesn't need its own XID unless it
-(or one of its child subxacts) writes tuples into the database.  Therefore,
-we postpone assigning XIDs to subxacts until and unless they call
-GetCurrentTransactionId.  The subsidiary actions of obtaining a lock on the
-XID and and entering it into pg_subtrans and PG_PROC are done at the same time.
+Transactions and subtransactions are assigned permanent XIDs only when/if
+they first do something that requires one --- typically, insert/update/delete
+a tuple, though there are a few other places that need an XID assigned.
+If a subtransaction requires an XID, we always first assign one to its
+parent.  This maintains the invariant that child transactions have XIDs later
+than their parents, which is assumed in a number of places.
+
+The subsidiary actions of obtaining a lock on the XID and entering it into
+pg_subtrans and PG_PROC are done at the time it is assigned.
+
+A transaction that has no XID still needs to be identified for various
+purposes, notably holding locks.  For this purpose we assign a "virtual
+transaction ID" or VXID to each top-level transaction.  VXIDs are formed from
+two fields, the backendID and a backend-local counter; this arrangement allows
+assignment of a new VXID at transaction start without any contention for
+shared memory.  To ensure that a VXID isn't re-used too soon after backend
+exit, we store the last local counter value into shared memory at backend
+exit, and initialize it from the previous value for the same backendID slot
+at backend start.  All these counters go back to zero at shared memory
+re-initialization, but that's OK because VXIDs never appear anywhere on-disk.
 
 Internally, a backend needs a way to identify subtransactions whether or not
 they have XIDs; but this need only lasts as long as the parent top transaction
@@ -204,7 +217,8 @@ endures.  Therefore, we have SubTransactionId, which is somewhat like
 CommandId in that it's generated from a counter that we reset at the start of
 each top transaction.  The top-level transaction itself has SubTransactionId 1,
 and subtransactions have IDs 2 and up.  (Zero is reserved for
-InvalidSubTransactionId.)
+InvalidSubTransactionId.)  Note that subtransactions do not have their
+own VXIDs; they use the parent top transaction's VXID.
 
 
 pg_clog and pg_subtrans
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 9665d129541..419c8656065 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -26,7 +26,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.43 2007/08/01 22:45:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.44 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -423,10 +423,6 @@ CLOGPagePrecedes(int page1, int page2)
 
 /*
  * Write a ZEROPAGE xlog record
- *
- * Note: xlog record is marked as outside transaction control, since we
- * want it to be redone whether the invoking transaction commits or not.
- * (Besides which, this is normally done just before entering a transaction.)
  */
 static void
 WriteZeroPageXlogRec(int pageno)
@@ -437,7 +433,7 @@ WriteZeroPageXlogRec(int pageno)
 	rdata.len = sizeof(int);
 	rdata.buffer = InvalidBuffer;
 	rdata.next = NULL;
-	(void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE | XLOG_NO_TRAN, &rdata);
+	(void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE, &rdata);
 }
 
 /*
@@ -445,9 +441,6 @@ WriteZeroPageXlogRec(int pageno)
  *
  * We must flush the xlog record to disk before returning --- see notes
  * in TruncateCLOG().
- *
- * Note: xlog record is marked as outside transaction control, since we
- * want it to be redone whether the invoking transaction commits or not.
  */
 static void
 WriteTruncateXlogRec(int pageno)
@@ -459,7 +452,7 @@ WriteTruncateXlogRec(int pageno)
 	rdata.len = sizeof(int);
 	rdata.buffer = InvalidBuffer;
 	rdata.next = NULL;
-	recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE | XLOG_NO_TRAN, &rdata);
+	recptr = XLogInsert(RM_CLOG_ID, CLOG_TRUNCATE, &rdata);
 	XLogFlush(recptr);
 }
 
diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c
index 3ce6f14bcf6..b34fa9be785 100644
--- a/src/backend/access/transam/multixact.c
+++ b/src/backend/access/transam/multixact.c
@@ -42,7 +42,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.24 2007/08/01 22:45:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/multixact.c,v 1.25 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1842,9 +1842,6 @@ MultiXactOffsetPrecedes(MultiXactOffset offset1, MultiXactOffset offset2)
 /*
  * Write an xlog record reflecting the zeroing of either a MEMBERs or
  * OFFSETs page (info shows which)
- *
- * Note: xlog record is marked as outside transaction control, since we
- * want it to be redone whether the invoking transaction commits or not.
  */
 static void
 WriteMZeroPageXlogRec(int pageno, uint8 info)
@@ -1855,7 +1852,7 @@ WriteMZeroPageXlogRec(int pageno, uint8 info)
 	rdata.len = sizeof(int);
 	rdata.buffer = InvalidBuffer;
 	rdata.next = NULL;
-	(void) XLogInsert(RM_MULTIXACT_ID, info | XLOG_NO_TRAN, &rdata);
+	(void) XLogInsert(RM_MULTIXACT_ID, info, &rdata);
 }
 
 /*
diff --git a/src/backend/access/transam/twophase.c b/src/backend/access/transam/twophase.c
index 2ae81e823d5..3e7e8435029 100644
--- a/src/backend/access/transam/twophase.c
+++ b/src/backend/access/transam/twophase.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.32 2007/08/01 22:45:07 tgl Exp $
+ *		$PostgreSQL: pgsql/src/backend/access/transam/twophase.c,v 1.33 2007/09/05 18:10:47 tgl Exp $
  *
  * NOTES
  *		Each global transaction is associated with a global transaction
@@ -274,9 +274,11 @@ MarkAsPreparing(TransactionId xid, const char *gid,
 	MemSet(&gxact->proc, 0, sizeof(PGPROC));
 	SHMQueueElemInit(&(gxact->proc.links));
 	gxact->proc.waitStatus = STATUS_OK;
+	gxact->proc.lxid = InvalidLocalTransactionId;
 	gxact->proc.xid = xid;
 	gxact->proc.xmin = InvalidTransactionId;
 	gxact->proc.pid = 0;
+	gxact->proc.backendId = InvalidBackendId;
 	gxact->proc.databaseId = databaseid;
 	gxact->proc.roleId = owner;
 	gxact->proc.inCommit = false;
@@ -813,8 +815,8 @@ StartPrepare(GlobalTransaction gxact)
 	hdr.prepared_at = gxact->prepared_at;
 	hdr.owner = gxact->owner;
 	hdr.nsubxacts = xactGetCommittedChildren(&children);
-	hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels);
-	hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels);
+	hdr.ncommitrels = smgrGetPendingDeletes(true, &commitrels, NULL);
+	hdr.nabortrels = smgrGetPendingDeletes(false, &abortrels, NULL);
 	StrNCpy(hdr.gid, gxact->gid, GIDSIZE);
 
 	save_state_data(&hdr, sizeof(TwoPhaseFileHeader));
@@ -1702,9 +1704,7 @@ RecordTransactionCommitPrepared(TransactionId xid,
 	}
 	rdata[lastrdata].next = NULL;
 
-	recptr = XLogInsert(RM_XACT_ID,
-						XLOG_XACT_COMMIT_PREPARED | XLOG_NO_TRAN,
-						rdata);
+	recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT_PREPARED, rdata);
 
 	/*
 	 * We don't currently try to sleep before flush here ... nor is there
@@ -1784,9 +1784,7 @@ RecordTransactionAbortPrepared(TransactionId xid,
 	}
 	rdata[lastrdata].next = NULL;
 
-	recptr = XLogInsert(RM_XACT_ID,
-						XLOG_XACT_ABORT_PREPARED | XLOG_NO_TRAN,
-						rdata);
+	recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT_PREPARED, rdata);
 
 	/* Always flush, since we're about to remove the 2PC state file */
 	XLogFlush(recptr);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 18787d17770..2e972d56f60 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -10,7 +10,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.247 2007/09/03 00:39:13 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.248 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,6 +37,7 @@
 #include "storage/fd.h"
 #include "storage/lmgr.h"
 #include "storage/procarray.h"
+#include "storage/sinvaladt.h"
 #include "storage/smgr.h"
 #include "utils/combocid.h"
 #include "utils/flatfiles.h"
@@ -216,7 +217,7 @@ static SubXactCallbackItem *SubXact_callbacks = NULL;
 
 
 /* local function prototypes */
-static void AssignSubTransactionId(TransactionState s);
+static void AssignTransactionId(TransactionState s);
 static void AbortTransaction(void);
 static void AtAbort_Memory(void);
 static void AtCleanup_Memory(void);
@@ -232,7 +233,7 @@ static void CallSubXactCallbacks(SubXactEvent event,
 					 SubTransactionId parentSubid);
 static void CleanupTransaction(void);
 static void CommitTransaction(void);
-static void RecordTransactionAbort(void);
+static void RecordTransactionAbort(bool isSubXact);
 static void StartTransaction(void);
 
 static void RecordSubTransactionCommit(void);
@@ -304,25 +305,36 @@ IsAbortedTransactionBlockState(void)
 /*
  *	GetTopTransactionId
  *
- * Get the ID of the main transaction, even if we are currently inside
- * a subtransaction.  If we are not in a transaction at all, or if we
- * are in transaction startup and haven't yet assigned an XID,
- * InvalidTransactionId is returned.
+ * This will return the XID of the main transaction, assigning one if
+ * it's not yet set.  Be careful to call this only inside a valid xact.
  */
 TransactionId
 GetTopTransactionId(void)
 {
+	if (!TransactionIdIsValid(TopTransactionStateData.transactionId))
+		AssignTransactionId(&TopTransactionStateData);
 	return TopTransactionStateData.transactionId;
 }
 
+/*
+ *	GetTopTransactionIdIfAny
+ *
+ * This will return the XID of the main transaction, if one is assigned.
+ * It will return InvalidTransactionId if we are not currently inside a
+ * transaction, or inside a transaction that hasn't yet been assigned an XID.
+ */
+TransactionId
+GetTopTransactionIdIfAny(void)
+{
+	return TopTransactionStateData.transactionId;
+}
 
 /*
  *	GetCurrentTransactionId
  *
- * We do not assign XIDs to subtransactions until/unless this is called.
- * When we do assign an XID to a subtransaction, recursively make sure
- * its parent has one as well (this maintains the invariant that a child
- * transaction has an XID following its parent's).
+ * This will return the XID of the current transaction (main or sub
+ * transaction), assigning one if it's not yet set.  Be careful to call this
+ * only inside a valid xact.
  */
 TransactionId
 GetCurrentTransactionId(void)
@@ -330,20 +342,49 @@ GetCurrentTransactionId(void)
 	TransactionState s = CurrentTransactionState;
 
 	if (!TransactionIdIsValid(s->transactionId))
-		AssignSubTransactionId(s);
-
+		AssignTransactionId(s);
 	return s->transactionId;
 }
 
+/*
+ *	GetCurrentTransactionIdIfAny
+ *
+ * This will return the XID of the current xact (main or sub), if one is assigned.
+ * It will return InvalidTransactionId if we are not currently inside a
+ * transaction, or inside a transaction that hasn't been assigned an XID yet.
+ */
+TransactionId
+GetCurrentTransactionIdIfAny(void)
+{
+	return CurrentTransactionState->transactionId;
+}
+
+
+/*
+ * AssignTransactionId
+ *
+ * Assigns a new permanent XID to the given TransactionState.
+ * We do not assign XIDs to transactions until/unless this is called.
+ * Also, any parent TransactionStates that don't yet have XIDs are assigned
+ * one; this maintains the invariant that a child transaction has an XID
+ * following its parent's.
+ */
 static void
-AssignSubTransactionId(TransactionState s)
+AssignTransactionId(TransactionState s)
 {
+	bool isSubXact = (s->parent != NULL);
 	ResourceOwner currentOwner;
 
-	Assert(s->parent != NULL);
+	/* Assert that caller didn't screw up */
+	Assert(!TransactionIdIsValid(s->transactionId));
 	Assert(s->state == TRANS_INPROGRESS);
-	if (!TransactionIdIsValid(s->parent->transactionId))
-		AssignSubTransactionId(s->parent);
+
+	/*
+	 * Ensure parent(s) have XIDs, so that a child always has an XID later
+	 * than its parent.
+	 */
+	if (isSubXact && !TransactionIdIsValid(s->parent->transactionId))
+		AssignTransactionId(s->parent);
 
 	/*
 	 * Generate a new Xid and record it in PG_PROC and pg_subtrans.
@@ -353,20 +394,20 @@ AssignSubTransactionId(TransactionState s)
 	 * PG_PROC, the subtrans entry is needed to ensure that other backends see
 	 * the Xid as "running".  See GetNewTransactionId.
 	 */
-	s->transactionId = GetNewTransactionId(true);
+	s->transactionId = GetNewTransactionId(isSubXact);
 
-	SubTransSetParent(s->transactionId, s->parent->transactionId);
+	if (isSubXact)
+		SubTransSetParent(s->transactionId, s->parent->transactionId);
 
 	/*
-	 * Acquire lock on the transaction XID.  (We assume this cannot block.) We
-	 * have to be sure that the lock is assigned to the transaction's
-	 * ResourceOwner.
+	 * Acquire lock on the transaction XID.  (We assume this cannot block.)
+	 * We have to ensure that the lock is assigned to the transaction's
+	 * own ResourceOwner.
 	 */
 	currentOwner = CurrentResourceOwner;
 	PG_TRY();
 	{
 		CurrentResourceOwner = s->curTransactionOwner;
-
 		XactLockTableInsert(s->transactionId);
 	}
 	PG_CATCH();
@@ -380,22 +421,6 @@ AssignSubTransactionId(TransactionState s)
 }
 
 
-/*
- *	GetCurrentTransactionIdIfAny
- *
- * Unlike GetCurrentTransactionId, this will return InvalidTransactionId
- * if we are currently not in a transaction, or in a transaction or
- * subtransaction that has not yet assigned itself an XID.
- */
-TransactionId
-GetCurrentTransactionIdIfAny(void)
-{
-	TransactionState s = CurrentTransactionState;
-
-	return s->transactionId;
-}
-
-
 /*
  *	GetCurrentSubTransactionId
  */
@@ -726,192 +751,188 @@ AtSubStart_ResourceOwner(void)
 void
 RecordTransactionCommit(void)
 {
+	TransactionId xid = GetTopTransactionIdIfAny();
+	bool 		markXidCommitted = TransactionIdIsValid(xid);
 	int			nrels;
 	RelFileNode *rels;
+	bool		haveNonTemp;
 	int			nchildren;
 	TransactionId *children;
 
 	/* Get data needed for commit record */
-	nrels = smgrGetPendingDeletes(true, &rels);
+	nrels = smgrGetPendingDeletes(true, &rels, &haveNonTemp);
 	nchildren = xactGetCommittedChildren(&children);
 
 	/*
-	 * If we made neither any XLOG entries nor any temp-rel updates, and have
-	 * no files to be deleted, we can omit recording the transaction commit at
-	 * all.  (This test includes the effects of subtransactions, so the
-	 * presence of committed subxacts need not alone force a write.)
+	 * If we haven't been assigned an XID yet, we neither can, nor do we
+	 * want to write a COMMIT record.
 	 */
-	if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate || nrels > 0)
+	if (!markXidCommitted)
 	{
-		TransactionId xid = GetCurrentTransactionId();
-		bool		madeTCentries;
-		bool		isAsyncCommit = false;
-		XLogRecPtr	recptr;
+		/*
+		 * We expect that every smgrscheduleunlink is followed by a catalog
+		 * update, and hence XID assignment, so we shouldn't get here with
+		 * any pending deletes.  Use a real test not just an Assert to check
+		 * this, since it's a bit fragile.
+		 */
+		if (nrels != 0)
+			elog(ERROR, "cannot commit a transaction that deleted files but has no xid");
+
+		/* Can't have child XIDs either; AssignTransactionId enforces this */
+		Assert(nchildren == 0);
+		
+		/*
+		 * If we didn't create XLOG entries, we're done here; otherwise we
+		 * should flush those entries the same as a commit record.  (An
+		 * example of a possible record that wouldn't cause an XID to be
+		 * assigned is a sequence advance record due to nextval() --- we
+		 * want to flush that to disk before reporting commit.)
+		 */
+		if (XactLastRecEnd.xrecoff == 0)
+			goto cleanup;
+	}
+	else
+	{
+		/*
+		 * Begin commit critical section and insert the commit XLOG record.
+		 */
+		XLogRecData 	rdata[3];
+		int				lastrdata = 0;
+		xl_xact_commit	xlrec;
 
 		/* Tell bufmgr and smgr to prepare for commit */
 		BufmgrCommit();
 
-		START_CRIT_SECTION();
-
 		/*
-		 * We only need to log the commit in XLOG if the transaction made any
-		 * transaction-controlled XLOG entries or will delete files.
+		 * Mark ourselves as within our "commit critical section".  This
+		 * forces any concurrent checkpoint to wait until we've updated
+		 * pg_clog.  Without this, it is possible for the checkpoint to
+		 * set REDO after the XLOG record but fail to flush the pg_clog
+		 * update to disk, leading to loss of the transaction commit if
+		 * the system crashes a little later.
+		 *
+		 * Note: we could, but don't bother to, set this flag in
+		 * RecordTransactionAbort.  That's because loss of a transaction
+		 * abort is noncritical; the presumption would be that it aborted,
+		 * anyway.
+		 *
+		 * It's safe to change the inCommit flag of our own backend
+		 * without holding the ProcArrayLock, since we're the only one
+		 * modifying it.  This makes checkpoint's determination of which
+		 * xacts are inCommit a bit fuzzy, but it doesn't matter.
 		 */
-		madeTCentries = (MyLastRecPtr.xrecoff != 0);
-		if (madeTCentries || nrels > 0)
+		START_CRIT_SECTION();
+		MyProc->inCommit = true;
+
+		SetCurrentTransactionStopTimestamp();
+		xlrec.xact_time = xactStopTimestamp;
+		xlrec.nrels = nrels;
+		xlrec.nsubxacts = nchildren;
+		rdata[0].data = (char *) (&xlrec);
+		rdata[0].len = MinSizeOfXactCommit;
+		rdata[0].buffer = InvalidBuffer;
+		/* dump rels to delete */
+		if (nrels > 0)
 		{
-			XLogRecData rdata[3];
-			int			lastrdata = 0;
-			xl_xact_commit xlrec;
-
-			/*
-			 * Mark ourselves as within our "commit critical section".  This
-			 * forces any concurrent checkpoint to wait until we've updated
-			 * pg_clog.  Without this, it is possible for the checkpoint to
-			 * set REDO after the XLOG record but fail to flush the pg_clog
-			 * update to disk, leading to loss of the transaction commit if
-			 * the system crashes a little later.
-			 *
-			 * Note: we could, but don't bother to, set this flag in
-			 * RecordTransactionAbort.  That's because loss of a transaction
-			 * abort is noncritical; the presumption would be that it aborted,
-			 * anyway.
-			 *
-			 * It's safe to change the inCommit flag of our own backend
-			 * without holding the ProcArrayLock, since we're the only one
-			 * modifying it.  This makes checkpoint's determination of which
-			 * xacts are inCommit a bit fuzzy, but it doesn't matter.
-			 */
-			MyProc->inCommit = true;
-
-			SetCurrentTransactionStopTimestamp();
-			xlrec.xact_time = xactStopTimestamp;
-			xlrec.nrels = nrels;
-			xlrec.nsubxacts = nchildren;
-			rdata[0].data = (char *) (&xlrec);
-			rdata[0].len = MinSizeOfXactCommit;
-			rdata[0].buffer = InvalidBuffer;
-			/* dump rels to delete */
-			if (nrels > 0)
-			{
-				rdata[0].next = &(rdata[1]);
-				rdata[1].data = (char *) rels;
-				rdata[1].len = nrels * sizeof(RelFileNode);
-				rdata[1].buffer = InvalidBuffer;
-				lastrdata = 1;
-			}
-			/* dump committed child Xids */
-			if (nchildren > 0)
-			{
-				rdata[lastrdata].next = &(rdata[2]);
-				rdata[2].data = (char *) children;
-				rdata[2].len = nchildren * sizeof(TransactionId);
-				rdata[2].buffer = InvalidBuffer;
-				lastrdata = 2;
-			}
-			rdata[lastrdata].next = NULL;
-
-			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
+			rdata[0].next = &(rdata[1]);
+			rdata[1].data = (char *) rels;
+			rdata[1].len = nrels * sizeof(RelFileNode);
+			rdata[1].buffer = InvalidBuffer;
+			lastrdata = 1;
 		}
-		else
+		/* dump committed child Xids */
+		if (nchildren > 0)
 		{
-			/* Just flush through last record written by me */
-			recptr = ProcLastRecEnd;
+			rdata[lastrdata].next = &(rdata[2]);
+			rdata[2].data = (char *) children;
+			rdata[2].len = nchildren * sizeof(TransactionId);
+			rdata[2].buffer = InvalidBuffer;
+			lastrdata = 2;
 		}
+		rdata[lastrdata].next = NULL;
+
+		(void) XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
+	}
 
+	/*
+	 * Check if we want to commit asynchronously.  If the user has set
+	 * synchronous_commit = off, and we're not doing cleanup of any non-temp
+	 * rels nor committing any command that wanted to force sync commit, then
+	 * we can defer flushing XLOG.  (We must not allow asynchronous commit if
+	 * there are any non-temp tables to be deleted, because we might delete
+	 * the files before the COMMIT record is flushed to disk.  We do allow
+	 * asynchronous commit if all to-be-deleted tables are temporary though,
+	 * since they are lost anyway if we crash.)
+	 */
+	if (XactSyncCommit || forceSyncCommit || haveNonTemp)
+	{
 		/*
-		 * We must flush our XLOG entries to disk if we made any XLOG entries,
-		 * whether in or out of transaction control.  For example, if we
-		 * reported a nextval() result to the client, this ensures that any
-		 * XLOG record generated by nextval will hit the disk before we report
-		 * the transaction committed.
+		 * Synchronous commit case.
 		 *
-		 * Note: if we generated a commit record above, MyXactMadeXLogEntry
-		 * will certainly be set now.
+		 * Sleep before flush! So we can flush more than one commit
+		 * record per single fsync.  (The idea is some other backend
+		 * may do the XLogFlush while we're sleeping.  This needs work
+		 * still, because on most Unixen, the minimum select() delay
+		 * is 10msec or more, which is way too long.)
+		 *
+		 * We do not sleep if enableFsync is not turned on, nor if
+		 * there are fewer than CommitSiblings other backends with
+		 * active transactions.
 		 */
-		if (MyXactMadeXLogEntry)
-		{
-			/*
-			 * If the user has set synchronous_commit = off, and we're
-			 * not doing cleanup of any rels nor committing any command
-			 * that wanted to force sync commit, then we can defer fsync.
-			 */
-			if (XactSyncCommit || forceSyncCommit || nrels > 0)
-			{
-				/*
-				 * Synchronous commit case.
-				 *
-				 * Sleep before flush! So we can flush more than one commit
-				 * records per single fsync.  (The idea is some other backend
-				 * may do the XLogFlush while we're sleeping.  This needs work
-				 * still, because on most Unixen, the minimum select() delay
-				 * is 10msec or more, which is way too long.)
-				 *
-				 * We do not sleep if enableFsync is not turned on, nor if
-				 * there are fewer than CommitSiblings other backends with
-				 * active transactions.
-				 */
-				if (CommitDelay > 0 && enableFsync &&
-					CountActiveBackends() >= CommitSiblings)
-					pg_usleep(CommitDelay);
+		if (CommitDelay > 0 && enableFsync &&
+			CountActiveBackends() >= CommitSiblings)
+			pg_usleep(CommitDelay);
 
-				XLogFlush(recptr);
-			}
-			else
-			{
-				/*
-				 * Asynchronous commit case.
-				 */
-				isAsyncCommit = true;
+		XLogFlush(XactLastRecEnd);
 
-				/*
-				 * Report the latest async commit LSN, so that
-				 * the WAL writer knows to flush this commit.
-				 */
-				XLogSetAsyncCommitLSN(recptr);
-			}
+		/*
+		 * Now we may update the CLOG, if we wrote a COMMIT record above
+		 */
+		if (markXidCommitted)
+		{
+			TransactionIdCommit(xid);
+			/* to avoid race conditions, the parent must commit first */
+			TransactionIdCommitTree(nchildren, children);
 		}
-
+	}
+	else
+	{
 		/*
-		 * We must mark the transaction committed in clog if its XID appears
-		 * either in permanent rels or in local temporary rels. We test this
-		 * by seeing if we made transaction-controlled entries *OR* local-rel
-		 * tuple updates.  Note that if we made only the latter, we have not
-		 * emitted an XLOG record for our commit, and so in the event of a
-		 * crash the clog update might be lost.  This is okay because no one
-		 * else will ever care whether we committed.
+		 * Asynchronous commit case.
 		 *
-		 * The recptr here refers to the last xlog entry by this transaction
-		 * so is the correct value to use for setting the clog.
+		 * Report the latest async commit LSN, so that
+		 * the WAL writer knows to flush this commit.
 		 */
-		if (madeTCentries || MyXactMadeTempRelUpdate)
+		XLogSetAsyncCommitLSN(XactLastRecEnd);
+
+		/*
+		 * We must not immediately update the CLOG, since we didn't
+		 * flush the XLOG. Instead, we store the LSN up to which
+		 * the XLOG must be flushed before the CLOG may be updated.
+		 */
+		if (markXidCommitted)
 		{
-			if (isAsyncCommit)
-			{
-				TransactionIdAsyncCommit(xid, recptr);
-				/* to avoid race conditions, the parent must commit first */
-				TransactionIdAsyncCommitTree(nchildren, children, recptr);
-			}
-			else
-			{
-				TransactionIdCommit(xid);
-				/* to avoid race conditions, the parent must commit first */
-				TransactionIdCommitTree(nchildren, children);
-			}
+			TransactionIdAsyncCommit(xid, XactLastRecEnd);
+			/* to avoid race conditions, the parent must commit first */
+			TransactionIdAsyncCommitTree(nchildren, children, XactLastRecEnd);
 		}
+	}
 
-		/* Checkpoint can proceed now */
+	/*
+	 * If we entered a commit critical section, leave it now, and
+	 * let checkpoints proceed.
+	 */
+	if (markXidCommitted)
+	{
 		MyProc->inCommit = false;
-
 		END_CRIT_SECTION();
 	}
 
-	/* Break the chain of back-links in the XLOG records I output */
-	MyLastRecPtr.xrecoff = 0;
-	MyXactMadeXLogEntry = false;
-	MyXactMadeTempRelUpdate = false;
+	/* Reset XactLastRecEnd until the next transaction writes something */
+	XactLastRecEnd.xrecoff = 0;
 
-	/* And clean up local data */
+cleanup:
+	/* Clean up local data */
 	if (rels)
 		pfree(rels);
 	if (children)
@@ -1030,23 +1051,20 @@ AtSubCommit_childXids(void)
 static void
 RecordSubTransactionCommit(void)
 {
+	TransactionId xid = GetCurrentTransactionIdIfAny();
+
 	/*
 	 * We do not log the subcommit in XLOG; it doesn't matter until the
 	 * top-level transaction commits.
 	 *
-	 * We must mark the subtransaction subcommitted in clog if its XID appears
-	 * either in permanent rels or in local temporary rels. We test this by
-	 * seeing if we made transaction-controlled entries *OR* local-rel tuple
-	 * updates.  (The test here actually covers the entire transaction tree so
-	 * far, so it may mark subtransactions that don't really need it, but it's
-	 * probably not worth being tenser. Note that if a prior subtransaction
-	 * dirtied these variables, then RecordTransactionCommit will have to do
-	 * the full pushup anyway...)
+	 * We must mark the subtransaction subcommitted in the CLOG if
+	 * it had a valid XID assigned.  If it did not, nobody else will
+	 * ever know about the existence of this subxact.  We don't
+	 * have to deal with deletions scheduled for on-commit here, since
+	 * they'll be reassigned to our parent (who might still abort).
 	 */
-	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
+	if (TransactionIdIsValid(xid))
 	{
-		TransactionId xid = GetCurrentTransactionId();
-
 		/* XXX does this really need to be a critical section? */
 		START_CRIT_SECTION();
 
@@ -1066,108 +1084,118 @@ RecordSubTransactionCommit(void)
  *	RecordTransactionAbort
  */
 static void
-RecordTransactionAbort(void)
+RecordTransactionAbort(bool isSubXact)
 {
+	TransactionId xid = GetCurrentTransactionIdIfAny();
 	int			nrels;
 	RelFileNode *rels;
 	int			nchildren;
 	TransactionId *children;
-
-	/* Get data needed for abort record */
-	nrels = smgrGetPendingDeletes(false, &rels);
-	nchildren = xactGetCommittedChildren(&children);
+	XLogRecData 	rdata[3];
+	int				lastrdata = 0;
+	xl_xact_abort	xlrec;
 
 	/*
-	 * If we made neither any transaction-controlled XLOG entries nor any
-	 * temp-rel updates, and are not going to delete any files, we can omit
-	 * recording the transaction abort at all.	No one will ever care that it
-	 * aborted.  (These tests cover our whole transaction tree.)
+	 * If we haven't been assigned an XID, nobody will care whether we
+	 * aborted or not.  Hence, we're done in that case.  It does not matter
+	 * if we have rels to delete (note that this routine is not responsible
+	 * for actually deleting 'em).  We cannot have any child XIDs, either.
 	 */
-	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
+	if (!TransactionIdIsValid(xid))
 	{
-		TransactionId xid = GetCurrentTransactionId();
+		/* Reset XactLastRecEnd until the next transaction writes something */
+		if (!isSubXact)
+			XactLastRecEnd.xrecoff = 0;
+		return;
+	}
 
-		/*
-		 * Catch the scenario where we aborted partway through
-		 * RecordTransactionCommit ...
-		 */
-		if (TransactionIdDidCommit(xid))
-			elog(PANIC, "cannot abort transaction %u, it was already committed", xid);
+	/*
+	 * We have a valid XID, so we should write an ABORT record for it.
+	 *
+	 * We do not flush XLOG to disk here, since the default assumption after a
+	 * crash would be that we aborted, anyway.  For the same reason, we don't
+	 * need to worry about interlocking against checkpoint start.
+	 */
 
-		START_CRIT_SECTION();
+	/*
+	 * Check that we haven't aborted halfway through RecordTransactionCommit.
+	 */
+	if (TransactionIdDidCommit(xid))
+		elog(PANIC, "cannot abort transaction %u, it was already committed",
+			 xid);
 
-		/*
-		 * We only need to log the abort in XLOG if the transaction made any
-		 * transaction-controlled XLOG entries or will delete files. (If it
-		 * made no transaction-controlled XLOG entries, its XID appears
-		 * nowhere in permanent storage, so no one else will ever care if it
-		 * committed.)
-		 *
-		 * We do not flush XLOG to disk unless deleting files, since the
-		 * default assumption after a crash would be that we aborted, anyway.
-		 * For the same reason, we don't need to worry about interlocking
-		 * against checkpoint start.
-		 */
-		if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
-		{
-			XLogRecData rdata[3];
-			int			lastrdata = 0;
-			xl_xact_abort xlrec;
-			XLogRecPtr	recptr;
-
-			SetCurrentTransactionStopTimestamp();
-			xlrec.xact_time = xactStopTimestamp;
-			xlrec.nrels = nrels;
-			xlrec.nsubxacts = nchildren;
-			rdata[0].data = (char *) (&xlrec);
-			rdata[0].len = MinSizeOfXactAbort;
-			rdata[0].buffer = InvalidBuffer;
-			/* dump rels to delete */
-			if (nrels > 0)
-			{
-				rdata[0].next = &(rdata[1]);
-				rdata[1].data = (char *) rels;
-				rdata[1].len = nrels * sizeof(RelFileNode);
-				rdata[1].buffer = InvalidBuffer;
-				lastrdata = 1;
-			}
-			/* dump committed child Xids */
-			if (nchildren > 0)
-			{
-				rdata[lastrdata].next = &(rdata[2]);
-				rdata[2].data = (char *) children;
-				rdata[2].len = nchildren * sizeof(TransactionId);
-				rdata[2].buffer = InvalidBuffer;
-				lastrdata = 2;
-			}
-			rdata[lastrdata].next = NULL;
+	/* Fetch the data we need for the abort record */
+	nrels = smgrGetPendingDeletes(false, &rels, NULL);
+	nchildren = xactGetCommittedChildren(&children);
 
-			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
+	/* XXX do we really need a critical section here? */
+	START_CRIT_SECTION();
 
-			/* Must flush if we are deleting files... */
-			if (nrels > 0)
-				XLogFlush(recptr);
-		}
+	/* Write the ABORT record */
+	if (isSubXact)
+		xlrec.xact_time = GetCurrentTimestamp();
+	else
+	{
+		SetCurrentTransactionStopTimestamp();
+		xlrec.xact_time = xactStopTimestamp;
+	}
+	xlrec.nrels = nrels;
+	xlrec.nsubxacts = nchildren;
+	rdata[0].data = (char *) (&xlrec);
+	rdata[0].len = MinSizeOfXactAbort;
+	rdata[0].buffer = InvalidBuffer;
+	/* dump rels to delete */
+	if (nrels > 0)
+	{
+		rdata[0].next = &(rdata[1]);
+		rdata[1].data = (char *) rels;
+		rdata[1].len = nrels * sizeof(RelFileNode);
+		rdata[1].buffer = InvalidBuffer;
+		lastrdata = 1;
+	}
+	/* dump committed child Xids */
+	if (nchildren > 0)
+	{
+		rdata[lastrdata].next = &(rdata[2]);
+		rdata[2].data = (char *) children;
+		rdata[2].len = nchildren * sizeof(TransactionId);
+		rdata[2].buffer = InvalidBuffer;
+		lastrdata = 2;
+	}
+	rdata[lastrdata].next = NULL;
 
-		/*
-		 * Mark the transaction aborted in clog.  This is not absolutely
-		 * necessary but we may as well do it while we are here.
-		 *
-		 * The ordering here isn't critical but it seems best to mark the
-		 * parent first.  This assures an atomic transition of all the
-		 * subtransactions to aborted state from the point of view of
-		 * concurrent TransactionIdDidAbort calls.
-		 */
-		TransactionIdAbort(xid);
-		TransactionIdAbortTree(nchildren, children);
+	(void) XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
 
-		END_CRIT_SECTION();
-	}
+	/*
+	 * Mark the transaction aborted in clog.  This is not absolutely necessary
+	 * but we may as well do it while we are here; also, in the subxact case
+	 * it is helpful because XactLockTableWait makes use of it to avoid
+	 * waiting for already-aborted subtransactions.  It is OK to do it without
+	 * having flushed the ABORT record to disk, because in event of a crash
+	 * we'd be assumed to have aborted anyway.
+	 *
+	 * The ordering here isn't critical but it seems best to mark the
+	 * parent first.  This assures an atomic transition of all the
+	 * subtransactions to aborted state from the point of view of
+	 * concurrent TransactionIdDidAbort calls.
+	 */
+	TransactionIdAbort(xid);
+	TransactionIdAbortTree(nchildren, children);
 
-	/* Break the chain of back-links in the XLOG records I output */
-	MyLastRecPtr.xrecoff = 0;
-	MyXactMadeXLogEntry = false;
-	MyXactMadeTempRelUpdate = false;
+	END_CRIT_SECTION();
+
+	/*
+	 * If we're aborting a subtransaction, we can immediately remove failed
+	 * XIDs from PGPROC's cache of running child XIDs.  We do that here for
+	 * subxacts, because we already have the child XID array at hand.  For
+	 * main xacts, the equivalent happens just after this function returns.
+	 */
+	if (isSubXact)
+		XidCacheRemoveRunningXids(xid, nchildren, children);
+
+	/* Reset XactLastRecEnd until the next transaction writes something */
+	if (!isSubXact)
+		XactLastRecEnd.xrecoff = 0;
 
 	/* And clean up local data */
 	if (rels)
@@ -1251,108 +1279,6 @@ AtSubAbort_childXids(void)
 	s->childXids = NIL;
 }
 
-/*
- * RecordSubTransactionAbort
- */
-static void
-RecordSubTransactionAbort(void)
-{
-	int			nrels;
-	RelFileNode *rels;
-	TransactionId xid = GetCurrentTransactionId();
-	int			nchildren;
-	TransactionId *children;
-
-	/* Get data needed for abort record */
-	nrels = smgrGetPendingDeletes(false, &rels);
-	nchildren = xactGetCommittedChildren(&children);
-
-	/*
-	 * If we made neither any transaction-controlled XLOG entries nor any
-	 * temp-rel updates, and are not going to delete any files, we can omit
-	 * recording the transaction abort at all.	No one will ever care that it
-	 * aborted.  (These tests cover our whole transaction tree, and therefore
-	 * may mark subxacts that don't really need it, but it's probably not
-	 * worth being tenser.)
-	 *
-	 * In this case we needn't worry about marking subcommitted children as
-	 * aborted, because they didn't mark themselves as subcommitted in the
-	 * first place; see the optimization in RecordSubTransactionCommit.
-	 */
-	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
-	{
-		START_CRIT_SECTION();
-
-		/*
-		 * We only need to log the abort in XLOG if the transaction made any
-		 * transaction-controlled XLOG entries or will delete files.
-		 */
-		if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
-		{
-			XLogRecData rdata[3];
-			int			lastrdata = 0;
-			xl_xact_abort xlrec;
-			XLogRecPtr	recptr;
-
-			xlrec.xact_time = GetCurrentTimestamp();
-			xlrec.nrels = nrels;
-			xlrec.nsubxacts = nchildren;
-			rdata[0].data = (char *) (&xlrec);
-			rdata[0].len = MinSizeOfXactAbort;
-			rdata[0].buffer = InvalidBuffer;
-			/* dump rels to delete */
-			if (nrels > 0)
-			{
-				rdata[0].next = &(rdata[1]);
-				rdata[1].data = (char *) rels;
-				rdata[1].len = nrels * sizeof(RelFileNode);
-				rdata[1].buffer = InvalidBuffer;
-				lastrdata = 1;
-			}
-			/* dump committed child Xids */
-			if (nchildren > 0)
-			{
-				rdata[lastrdata].next = &(rdata[2]);
-				rdata[2].data = (char *) children;
-				rdata[2].len = nchildren * sizeof(TransactionId);
-				rdata[2].buffer = InvalidBuffer;
-				lastrdata = 2;
-			}
-			rdata[lastrdata].next = NULL;
-
-			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
-
-			/* Must flush if we are deleting files... */
-			if (nrels > 0)
-				XLogFlush(recptr);
-		}
-
-		/*
-		 * Mark the transaction aborted in clog.  This is not absolutely
-		 * necessary but XactLockTableWait makes use of it to avoid waiting
-		 * for already-aborted subtransactions.
-		 */
-		TransactionIdAbort(xid);
-		TransactionIdAbortTree(nchildren, children);
-
-		END_CRIT_SECTION();
-	}
-
-	/*
-	 * We can immediately remove failed XIDs from PGPROC's cache of running
-	 * child XIDs. It's easiest to do it here while we have the child XID
-	 * array at hand, even though in the main-transaction case the equivalent
-	 * work happens just after return from RecordTransactionAbort.
-	 */
-	XidCacheRemoveRunningXids(xid, nchildren, children);
-
-	/* And clean up local data */
-	if (rels)
-		pfree(rels);
-	if (children)
-		pfree(children);
-}
-
 /* ----------------------------------------------------------------
  *						CleanupTransaction stuff
  * ----------------------------------------------------------------
@@ -1436,6 +1362,7 @@ static void
 StartTransaction(void)
 {
 	TransactionState s;
+	VirtualTransactionId vxid;
 
 	/*
 	 * Let's just make sure the state stack is empty
@@ -1479,13 +1406,25 @@ StartTransaction(void)
 	AtStart_ResourceOwner();
 
 	/*
-	 * generate a new transaction id
+	 * Assign a new LocalTransactionId, and combine it with the backendId to
+	 * form a virtual transaction id.
+	 */
+	vxid.backendId = MyBackendId;
+	vxid.localTransactionId = GetNextLocalTransactionId();
+
+	/*
+	 * Lock the virtual transaction id before we announce it in the proc array
 	 */
-	s->transactionId = GetNewTransactionId(false);
+	VirtualXactLockTableInsert(vxid);
 
-	XactLockTableInsert(s->transactionId);
+	/*
+	 * Advertise it in the proc array.  We assume assignment of
+	 * LocalTransactionId is atomic, and the backendId should be set already.
+	 */
+	Assert(MyProc->backendId == vxid.backendId);
+	MyProc->lxid = vxid.localTransactionId;
 
-	PG_TRACE1(transaction__start, s->transactionId);
+	PG_TRACE1(transaction__start, vxid.localTransactionId);
 
 	/*
 	 * set transaction_timestamp() (a/k/a now()).  We want this to be the same
@@ -1631,9 +1570,17 @@ CommitTransaction(void)
 	 */
 	if (MyProc != NULL)
 	{
-		/* Lock ProcArrayLock because that's what GetSnapshotData uses. */
+		/*
+		 * Lock ProcArrayLock because that's what GetSnapshotData uses.
+		 * You might assume that we can skip this step if we had no
+		 * transaction id assigned, because the failure case outlined
+		 * in GetSnapshotData cannot happen in that case. This is true,
+		 * but we *still* need the lock guarantee that two concurrent
+		 * computations of the *oldest* xmin will get the same result.
+		 */
 		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 		MyProc->xid = InvalidTransactionId;
+		MyProc->lxid = InvalidLocalTransactionId;
 		MyProc->xmin = InvalidTransactionId;
 		MyProc->inVacuum = false;		/* must be cleared with xid/xmin */
 
@@ -1861,10 +1808,8 @@ PrepareTransaction(void)
 	 * Now we clean up backend-internal state and release internal resources.
 	 */
 
-	/* Break the chain of back-links in the XLOG records I output */
-	MyLastRecPtr.xrecoff = 0;
-	MyXactMadeXLogEntry = false;
-	MyXactMadeTempRelUpdate = false;
+	/* Reset XactLastRecEnd until the next transaction writes something */
+	XactLastRecEnd.xrecoff = 0;
 
 	/*
 	 * Let others know about no transaction in progress by me.	This has to be
@@ -1872,9 +1817,17 @@ PrepareTransaction(void)
 	 * someone may think it is unlocked and recyclable.
 	 */
 
-	/* Lock ProcArrayLock because that's what GetSnapshotData uses. */
+	/*
+	 * Lock ProcArrayLock because that's what GetSnapshotData uses.
+	 * You might assume that we can skip this step if we have no
+	 * transaction id assigned, because the failure case outlined
+	 * in GetSnapshotData cannot happen in that case. This is true,
+	 * but we *still* need the lock guarantee that two concurrent
+	 * computations of the *oldest* xmin will get the same result.
+	 */
 	LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 	MyProc->xid = InvalidTransactionId;
+	MyProc->lxid = InvalidLocalTransactionId;
 	MyProc->xmin = InvalidTransactionId;
 	MyProc->inVacuum = false;	/* must be cleared with xid/xmin */
 
@@ -2032,8 +1985,7 @@ AbortTransaction(void)
 	 * Advertise the fact that we aborted in pg_clog (assuming that we got as
 	 * far as assigning an XID to advertise).
 	 */
-	if (TransactionIdIsValid(s->transactionId))
-		RecordTransactionAbort();
+	RecordTransactionAbort(false);
 
 	/*
 	 * Let others know about no transaction in progress by me. Note that this
@@ -2042,9 +1994,17 @@ AbortTransaction(void)
 	 */
 	if (MyProc != NULL)
 	{
-		/* Lock ProcArrayLock because that's what GetSnapshotData uses. */
+		/*
+		 * Lock ProcArrayLock because that's what GetSnapshotData uses.
+		 * You might assume that we can skip this step if we have no
+		 * transaction id assigned, because the failure case outlined
+		 * in GetSnapshotData cannot happen in that case. This is true,
+		 * but we *still* need the lock guarantee that two concurrent
+		 * computations of the *oldest* xmin will get the same result.
+		 */
 		LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
 		MyProc->xid = InvalidTransactionId;
+		MyProc->lxid = InvalidLocalTransactionId;
 		MyProc->xmin = InvalidTransactionId;
 		MyProc->inVacuum = false;		/* must be cleared with xid/xmin */
 		MyProc->inCommit = false;		/* be sure this gets cleared */
@@ -3752,13 +3712,11 @@ CommitSubTransaction(void)
 	CommandCounterIncrement();
 
 	/* Mark subtransaction as subcommitted */
-	if (TransactionIdIsValid(s->transactionId))
-	{
-		RecordSubTransactionCommit();
-		AtSubCommit_childXids();
-	}
+	RecordSubTransactionCommit();
 
 	/* Post-commit cleanup */
+	if (TransactionIdIsValid(s->transactionId))
+		AtSubCommit_childXids();
 	AfterTriggerEndSubXact(true);
 	AtSubCommit_Portals(s->subTransactionId,
 						s->parent->subTransactionId,
@@ -3884,13 +3842,12 @@ AbortSubTransaction(void)
 									s->parent->subTransactionId);
 
 		/* Advertise the fact that we aborted in pg_clog. */
+		RecordTransactionAbort(true);
+
+		/* Post-abort cleanup */
 		if (TransactionIdIsValid(s->transactionId))
-		{
-			RecordSubTransactionAbort();
 			AtSubAbort_childXids();
-		}
 
-		/* Post-abort cleanup */
 		CallSubXactCallbacks(SUBXACT_EVENT_ABORT_SUB, s->subTransactionId,
 							 s->parent->subTransactionId);
 
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 1db33fb26da..5474a91c247 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.279 2007/08/28 23:17:47 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.280 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -154,38 +154,16 @@ static TimeLineID recoveryTargetTLI;
 static List *expectedTLIs;
 static TimeLineID curFileTLI;
 
-/*
- * MyLastRecPtr points to the start of the last XLOG record inserted by the
- * current transaction.  If MyLastRecPtr.xrecoff == 0, then the current
- * xact hasn't yet inserted any transaction-controlled XLOG records.
- *
- * Note that XLOG records inserted outside transaction control are not
- * reflected into MyLastRecPtr.  They do, however, cause MyXactMadeXLogEntry
- * to be set true.	The latter can be used to test whether the current xact
- * made any loggable changes (including out-of-xact changes, such as
- * sequence updates).
- *
- * When we insert/update/delete a tuple in a temporary relation, we do not
- * make any XLOG record, since we don't care about recovering the state of
- * the temp rel after a crash.	However, we will still need to remember
- * whether our transaction committed or aborted in that case.  So, we must
- * set MyXactMadeTempRelUpdate true to indicate that the XID will be of
- * interest later.
- */
-XLogRecPtr	MyLastRecPtr = {0, 0};
-
-bool		MyXactMadeXLogEntry = false;
-
-bool		MyXactMadeTempRelUpdate = false;
-
 /*
  * ProcLastRecPtr points to the start of the last XLOG record inserted by the
- * current backend.  It is updated for all inserts, transaction-controlled
- * or not.	ProcLastRecEnd is similar but points to end+1 of last record.
+ * current backend.  It is updated for all inserts.  XactLastRecEnd points to
+ * end+1 of the last record, and is reset when we end a top-level transaction,
+ * or start a new one; so it can be used to tell if the current transaction has
+ * created any XLOG records.
  */
 static XLogRecPtr ProcLastRecPtr = {0, 0};
 
-XLogRecPtr	ProcLastRecEnd = {0, 0};
+XLogRecPtr	XactLastRecEnd = {0, 0};
 
 /*
  * RedoRecPtr is this backend's local copy of the REDO record pointer
@@ -488,15 +466,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		updrqst;
 	bool		doPageWrites;
 	bool		isLogSwitch = (rmid == RM_XLOG_ID && info == XLOG_SWITCH);
-	bool		no_tran = (rmid == RM_XLOG_ID);
 
+	/* info's high bits are reserved for use by me */
 	if (info & XLR_INFO_MASK)
-	{
-		if ((info & XLR_INFO_MASK) != XLOG_NO_TRAN)
-			elog(PANIC, "invalid xlog info mask %02X", (info & XLR_INFO_MASK));
-		no_tran = true;
-		info &= ~XLR_INFO_MASK;
-	}
+		elog(PANIC, "invalid xlog info mask %02X", info);
 
 	/*
 	 * In bootstrap mode, we don't actually log anything but XLOG resources;
@@ -856,11 +829,8 @@ begin:;
 #endif
 
 	/* Record begin of record in appropriate places */
-	if (!no_tran)
-		MyLastRecPtr = RecPtr;
 	ProcLastRecPtr = RecPtr;
 	Insert->PrevRecord = RecPtr;
-	MyXactMadeXLogEntry = true;
 
 	Insert->currpos += SizeOfXLogRecord;
 	freespace -= SizeOfXLogRecord;
@@ -1018,7 +988,7 @@ begin:;
 		SpinLockRelease(&xlogctl->info_lck);
 	}
 
-	ProcLastRecEnd = RecPtr;
+	XactLastRecEnd = RecPtr;
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 74735248659..30ea87d5b7a 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 1996-2007, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.41 2007/08/25 17:47:44 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/catalog/system_views.sql,v 1.42 2007/09/05 18:10:47 tgl Exp $
  */
 
 CREATE VIEW pg_roles AS 
@@ -145,8 +145,8 @@ CREATE VIEW pg_locks AS
     SELECT * 
     FROM pg_lock_status() AS L
     (locktype text, database oid, relation oid, page int4, tuple int2,
-     transactionid xid, classid oid, objid oid, objsubid int2,
-     transaction xid, pid int4, mode text, granted boolean);
+     virtualxid text, transactionid xid, classid oid, objid oid, objsubid int2,
+     virtualtransaction text, pid int4, mode text, granted boolean);
 
 CREATE VIEW pg_cursors AS
     SELECT C.name, C.statement, C.is_holdable, C.is_binary,
diff --git a/src/backend/commands/indexcmds.c b/src/backend/commands/indexcmds.c
index d79e73f59d8..ac56b583f17 100644
--- a/src/backend/commands/indexcmds.c
+++ b/src/backend/commands/indexcmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.162 2007/08/25 19:08:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/indexcmds.c,v 1.163 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -38,6 +38,7 @@
 #include "parser/parse_expr.h"
 #include "parser/parse_func.h"
 #include "parser/parsetree.h"
+#include "storage/procarray.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/fmgroids.h"
@@ -126,9 +127,8 @@ DefineIndex(RangeVar *heapRelation,
 	int16	   *coloptions;
 	IndexInfo  *indexInfo;
 	int			numberOfAttributes;
-	List	   *old_xact_list;
-	ListCell   *lc;
-	uint32		ixcnt;
+	VirtualTransactionId *old_lockholders;
+	VirtualTransactionId *old_snapshots;
 	LockRelId	heaprelid;
 	LOCKTAG		heaplocktag;
 	Snapshot	snapshot;
@@ -484,24 +484,36 @@ DefineIndex(RangeVar *heapRelation,
 	 * xacts that open the table for writing after this point; they will see
 	 * the new index when they open it.
 	 *
+	 * Note: the reason we use actual lock acquisition here, rather than
+	 * just checking the ProcArray and sleeping, is that deadlock is possible
+	 * if one of the transactions in question is blocked trying to acquire
+	 * an exclusive lock on our table.  The lock code will detect deadlock
+	 * and error out properly.
+	 *
 	 * Note: GetLockConflicts() never reports our own xid, hence we need not
-	 * check for that.
+	 * check for that.  Also, prepared xacts are not reported, which is
+	 * fine since they certainly aren't going to do anything more.
 	 */
 	SET_LOCKTAG_RELATION(heaplocktag, heaprelid.dbId, heaprelid.relId);
-	old_xact_list = GetLockConflicts(&heaplocktag, ShareLock);
+	old_lockholders = GetLockConflicts(&heaplocktag, ShareLock);
 
-	foreach(lc, old_xact_list)
+	while (VirtualTransactionIdIsValid(*old_lockholders))
 	{
-		TransactionId xid = lfirst_xid(lc);
-
-		XactLockTableWait(xid);
+		VirtualXactLockTableWait(*old_lockholders);
+		old_lockholders++;
 	}
 
 	/*
 	 * Now take the "reference snapshot" that will be used by validate_index()
-	 * to filter candidate tuples.	All other transactions running at this
-	 * time will have to be out-waited before we can commit, because we can't
-	 * guarantee that tuples deleted just before this will be in the index.
+	 * to filter candidate tuples.  Beware!  There might still be snapshots
+	 * in use that treat some transaction as in-progress that our reference
+	 * snapshot treats as committed.  If such a recently-committed transaction
+	 * deleted tuples in the table, we will not include them in the index; yet
+	 * those transactions which see the deleting one as still-in-progress will
+	 * expect them to be there once we mark the index as valid.
+	 *
+	 * We solve this by waiting for all endangered transactions to exit before
+	 * we mark the index as valid.
 	 *
 	 * We also set ActiveSnapshot to this snap, since functions in indexes may
 	 * need a snapshot.
@@ -518,14 +530,21 @@ DefineIndex(RangeVar *heapRelation,
 	 * The index is now valid in the sense that it contains all currently
 	 * interesting tuples.	But since it might not contain tuples deleted just
 	 * before the reference snap was taken, we have to wait out any
-	 * transactions older than the reference snap.	We can do this by waiting
-	 * for each xact explicitly listed in the snap.
+	 * transactions that might have older snapshots.  Obtain a list of
+	 * VXIDs of such transactions, and wait for them individually.
 	 *
-	 * Note: GetSnapshotData() never stores our own xid into a snap, hence we
-	 * need not check for that.
+	 * We can exclude any running transactions that have xmin >= the xmax of
+	 * our reference snapshot, since they are clearly not interested in any
+	 * missing older tuples.  Also, GetCurrentVirtualXIDs never reports our
+	 * own vxid, so we need not check for that.
 	 */
-	for (ixcnt = 0; ixcnt < snapshot->xcnt; ixcnt++)
-		XactLockTableWait(snapshot->xip[ixcnt]);
+	old_snapshots = GetCurrentVirtualXIDs(ActiveSnapshot->xmax);
+
+	while (VirtualTransactionIdIsValid(*old_snapshots))
+	{
+		VirtualXactLockTableWait(*old_snapshots);
+		old_snapshots++;
+	}
 
 	/*
 	 * Index can now be marked valid -- update its pg_index entry
diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c
index bd06bfb5da3..cb2a1380caf 100644
--- a/src/backend/commands/sequence.c
+++ b/src/backend/commands/sequence.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.143 2007/02/01 19:10:26 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/sequence.c,v 1.144 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -25,6 +25,7 @@
 #include "commands/tablecmds.h"
 #include "miscadmin.h"
 #include "nodes/makefuncs.h"
+#include "storage/proc.h"
 #include "utils/acl.h"
 #include "utils/builtins.h"
 #include "utils/lsyscache.h"
@@ -63,7 +64,7 @@ typedef struct SeqTableData
 {
 	struct SeqTableData *next;	/* link to next SeqTable object */
 	Oid			relid;			/* pg_class OID of this sequence */
-	TransactionId xid;			/* xact in which we last did a seq op */
+	LocalTransactionId lxid;	/* xact in which we last did a seq op */
 	int64		last;			/* value last returned by nextval */
 	int64		cached;			/* last value already cached for nextval */
 	/* if last != cached, we have not used up all the cached values */
@@ -282,7 +283,7 @@ DefineSequence(CreateSeqStmt *seq)
 		rdata[1].buffer = InvalidBuffer;
 		rdata[1].next = NULL;
 
-		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG | XLOG_NO_TRAN, rdata);
+		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata);
 
 		PageSetLSN(page, recptr);
 		PageSetTLI(page, ThisTimeLineID);
@@ -366,7 +367,7 @@ AlterSequence(AlterSeqStmt *stmt)
 		rdata[1].buffer = InvalidBuffer;
 		rdata[1].next = NULL;
 
-		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG | XLOG_NO_TRAN, rdata);
+		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata);
 
 		PageSetLSN(page, recptr);
 		PageSetTLI(page, ThisTimeLineID);
@@ -594,7 +595,7 @@ nextval_internal(Oid relid)
 		rdata[1].buffer = InvalidBuffer;
 		rdata[1].next = NULL;
 
-		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG | XLOG_NO_TRAN, rdata);
+		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata);
 
 		PageSetLSN(page, recptr);
 		PageSetTLI(page, ThisTimeLineID);
@@ -764,7 +765,7 @@ do_setval(Oid relid, int64 next, bool iscalled)
 		rdata[1].buffer = InvalidBuffer;
 		rdata[1].next = NULL;
 
-		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG | XLOG_NO_TRAN, rdata);
+		recptr = XLogInsert(RM_SEQ_ID, XLOG_SEQ_LOG, rdata);
 
 		PageSetLSN(page, recptr);
 		PageSetTLI(page, ThisTimeLineID);
@@ -825,10 +826,10 @@ setval3_oid(PG_FUNCTION_ARGS)
 static Relation
 open_share_lock(SeqTable seq)
 {
-	TransactionId thisxid = GetTopTransactionId();
+	LocalTransactionId thislxid = MyProc->lxid;
 
 	/* Get the lock if not already held in this xact */
-	if (seq->xid != thisxid)
+	if (seq->lxid != thislxid)
 	{
 		ResourceOwner currentOwner;
 
@@ -848,7 +849,7 @@ open_share_lock(SeqTable seq)
 		CurrentResourceOwner = currentOwner;
 
 		/* Flag that we have a lock in the current xact */
-		seq->xid = thisxid;
+		seq->lxid = thislxid;
 	}
 
 	/* We now know we have AccessShareLock, and can safely open the rel */
@@ -891,7 +892,7 @@ init_sequence(Oid relid, SeqTable *p_elm, Relation *p_rel)
 					(errcode(ERRCODE_OUT_OF_MEMORY),
 					 errmsg("out of memory")));
 		elm->relid = relid;
-		elm->xid = InvalidTransactionId;
+		elm->lxid = InvalidLocalTransactionId;
 		/* increment is set to 0 until we do read_info (see currval) */
 		elm->last = elm->cached = elm->increment = 0;
 		elm->next = seqtab;
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 358e9a5ad99..87cf57daec3 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.355 2007/08/13 19:08:26 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.356 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -2601,14 +2601,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel,
 				PageSetLSN(page, recptr);
 				PageSetTLI(page, ThisTimeLineID);
 			}
-			else
-			{
-				/*
-				 * No XLOG record, but still need to flag that XID exists on
-				 * disk
-				 */
-				MyXactMadeTempRelUpdate = true;
-			}
 
 			END_CRIT_SECTION();
 
@@ -2761,13 +2753,6 @@ move_chain_tuple(Relation rel,
 		PageSetLSN(dst_page, recptr);
 		PageSetTLI(dst_page, ThisTimeLineID);
 	}
-	else
-	{
-		/*
-		 * No XLOG record, but still need to flag that XID exists on disk
-		 */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 
@@ -2868,13 +2853,6 @@ move_plain_tuple(Relation rel,
 		PageSetLSN(dst_page, recptr);
 		PageSetTLI(dst_page, ThisTimeLineID);
 	}
-	else
-	{
-		/*
-		 * No XLOG record, but still need to flag that XID exists on disk
-		 */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 
@@ -3070,11 +3048,6 @@ vacuum_page(Relation onerel, Buffer buffer, VacPage vacpage)
 		PageSetLSN(page, recptr);
 		PageSetTLI(page, ThisTimeLineID);
 	}
-	else
-	{
-		/* No XLOG record, but still need to flag that XID exists on disk */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 }
diff --git a/src/backend/commands/vacuumlazy.c b/src/backend/commands/vacuumlazy.c
index 3ac097388b2..ecc0ee78074 100644
--- a/src/backend/commands/vacuumlazy.c
+++ b/src/backend/commands/vacuumlazy.c
@@ -36,7 +36,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.90 2007/05/30 20:11:57 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuumlazy.c,v 1.91 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -658,11 +658,6 @@ lazy_vacuum_page(Relation onerel, BlockNumber blkno, Buffer buffer,
 		PageSetLSN(page, recptr);
 		PageSetTLI(page, ThisTimeLineID);
 	}
-	else
-	{
-		/* No XLOG record, but still need to flag that XID exists on disk */
-		MyXactMadeTempRelUpdate = true;
-	}
 
 	END_CRIT_SECTION();
 
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 51da9679f35..577f73a31f1 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -23,7 +23,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.28 2007/07/01 02:22:23 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/procarray.c,v 1.29 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -404,7 +404,7 @@ TransactionIdIsActive(TransactionId xid)
  * This is also used to determine where to truncate pg_subtrans.  allDbs
  * must be TRUE for that case, and ignoreVacuum FALSE.
  *
- * Note: we include the currently running xids in the set of considered xids.
+ * Note: we include all currently running xids in the set of considered xids.
  * This ensures that if a just-started xact has not yet set its snapshot,
  * when it does set the snapshot it cannot set xmin less than what we compute.
  */
@@ -416,15 +416,19 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 	int			index;
 
 	/*
-	 * Normally we start the min() calculation with our own XID.  But if
-	 * called by checkpointer, we will not be inside a transaction, so use
-	 * next XID as starting point for min() calculation.  (Note that if there
-	 * are no xacts running at all, that will be the subtrans truncation
-	 * point!)
+	 * We need to initialize the MIN() calculation with something.
+	 * ReadNewTransactionId() is guaranteed to work, but is relatively
+	 * expensive due to locking; so first we try a couple of shortcuts.
+	 * If we have a valid xmin in our own PGPROC entry, that will do;
+	 * or if we have assigned ourselves an XID, that will do.
 	 */
-	result = GetTopTransactionId();
+	result = MyProc ? MyProc->xmin : InvalidTransactionId;
 	if (!TransactionIdIsValid(result))
-		result = ReadNewTransactionId();
+	{
+		result = GetTopTransactionIdIfAny();
+		if (!TransactionIdIsValid(result))
+			result = ReadNewTransactionId();
+	}
 
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -440,23 +444,22 @@ GetOldestXmin(bool allDbs, bool ignoreVacuum)
 			/* Fetch xid just once - see GetNewTransactionId */
 			TransactionId xid = proc->xid;
 
-			if (TransactionIdIsNormal(xid))
-			{
-				/* First consider the transaction own's Xid */
-				if (TransactionIdPrecedes(xid, result))
-					result = xid;
-
-				/*
-				 * Also consider the transaction's Xmin, if set.
-				 *
-				 * We must check both Xid and Xmin because there is a window
-				 * where an xact's Xid is set but Xmin isn't yet.
-				 */
-				xid = proc->xmin;
-				if (TransactionIdIsNormal(xid))
-					if (TransactionIdPrecedes(xid, result))
-						result = xid;
-			}
+			/* First consider the transaction's own Xid, if any */
+			if (TransactionIdIsNormal(xid) &&
+				TransactionIdPrecedes(xid, result))
+				result = xid;
+
+			/*
+			 * Also consider the transaction's Xmin, if set.
+			 *
+			 * We must check both Xid and Xmin because a transaction might
+			 * have an Xmin but not (yet) an Xid; conversely, if it has
+			 * an Xid, that could determine some not-yet-set Xmin.
+			 */
+			xid = proc->xmin;	/* Fetch just once */
+			if (TransactionIdIsNormal(xid) &&
+				TransactionIdPrecedes(xid, result))
+				result = xid;
 		}
 	}
 
@@ -545,8 +548,6 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 					 errmsg("out of memory")));
 	}
 
-	globalxmin = xmin = GetTopTransactionId();
-
 	/*
 	 * It is sufficient to get shared lock on ProcArrayLock, even if we are
 	 * computing a serializable snapshot and therefore will be setting
@@ -557,6 +558,19 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 	 * discussion just below).	So it doesn't matter whether another backend
 	 * concurrently doing GetSnapshotData or GetOldestXmin sees our xmin as
 	 * set or not; he'd compute the same xmin for himself either way.
+	 * (We are assuming here that xmin can be set and read atomically,
+	 * just like xid.)
+	 *
+	 * There is a corner case in which the above argument doesn't work: if
+	 * there isn't any oldest xact, ie, all xids in the array are invalid.
+	 * In that case we will compute xmin as the result of ReadNewTransactionId,
+	 * and since GetNewTransactionId doesn't take the ProcArrayLock, it's not
+	 * so obvious that two backends with overlapping shared locks will get
+	 * the same answer.  But GetNewTransactionId is required to store the XID
+	 * it assigned into the ProcArray before releasing XidGenLock.  Therefore
+	 * the backend that did ReadNewTransactionId later will see that XID in
+	 * the array, and will compute the same xmin as the earlier one that saw
+	 * no XIDs in the array.
 	 */
 	LWLockAcquire(ProcArrayLock, LW_SHARED);
 
@@ -589,6 +603,9 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 
 	xmax = ReadNewTransactionId();
 
+	/* initialize xmin calculation with xmax */
+	globalxmin = xmin = xmax;
+
 	/*
 	 * Spin over procArray checking xid, xmin, and subxids.  The goal is
 	 * to gather all active xids, find the lowest xmin, and try to record
@@ -597,34 +614,40 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 	for (index = 0; index < arrayP->numProcs; index++)
 	{
 		PGPROC	   *proc = arrayP->procs[index];
+		TransactionId xid;
+
+		/* Ignore procs running LAZY VACUUM */
+		if (proc->inVacuum)
+			continue;
+
+		/* Update globalxmin to be the smallest valid xmin */
+		xid = proc->xmin;		/* fetch just once */
+		if (TransactionIdIsNormal(xid) &&
+			TransactionIdPrecedes(xid, globalxmin))
+			globalxmin = xid;
 
 		/* Fetch xid just once - see GetNewTransactionId */
-		TransactionId xid = proc->xid;
+		xid = proc->xid;
 
 		/*
-		 * Ignore my own proc (dealt with my xid above), procs not running a
-		 * transaction, xacts started since we read the next transaction ID,
-		 * and xacts executing LAZY VACUUM. There's no need to store XIDs
-		 * above what we got from ReadNewTransactionId, since we'll treat them
-		 * as running anyway.  We also assume that such xacts can't compute an
-		 * xmin older than ours, so they needn't be considered in computing
-		 * globalxmin.
+		 * If the transaction has been assigned an xid < xmax we add it to the
+		 * snapshot, and update xmin if necessary.  There's no need to store
+		 * XIDs above what we got from ReadNewTransactionId, since we'll treat
+		 * them as running anyway.  We don't bother to examine their subxids
+		 * either.
+		 *
+		 * We don't include our own XID (if any) in the snapshot, but we must
+		 * include it into xmin.
 		 */
-		if (proc == MyProc ||
-			!TransactionIdIsNormal(xid) ||
-			TransactionIdFollowsOrEquals(xid, xmax) ||
-			proc->inVacuum)
-			continue;
-
-		if (TransactionIdPrecedes(xid, xmin))
-			xmin = xid;
-		snapshot->xip[count++] = xid;
-
-		/* Update globalxmin to be the smallest valid xmin */
-		xid = proc->xmin;
 		if (TransactionIdIsNormal(xid))
-			if (TransactionIdPrecedes(xid, globalxmin))
-				globalxmin = xid;
+		{
+			if (TransactionIdFollowsOrEquals(xid, xmax))
+				continue;
+			if (proc != MyProc)
+				snapshot->xip[count++] = xid;
+			if (TransactionIdPrecedes(xid, xmin))
+				xmin = xid;
+		}
 
 		/*
 		 * Save subtransaction XIDs if possible (if we've already overflowed,
@@ -635,8 +658,10 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 		 * remove any.	Hence it's important to fetch nxids just once. Should
 		 * be safe to use memcpy, though.  (We needn't worry about missing any
 		 * xids added concurrently, because they must postdate xmax.)
+		 *
+		 * Again, our own XIDs are not included in the snapshot.
 		 */
-		if (subcount >= 0)
+		if (subcount >= 0 && proc != MyProc)
 		{
 			if (proc->subxids.overflowed)
 				subcount = -1;	/* overflowed */
@@ -818,6 +843,9 @@ BackendPidGetProc(int pid)
  *
  * Only main transaction Ids are considered.  This function is mainly
  * useful for determining what backend owns a lock.
+ *
+ * Beware that not every xact has an XID assigned.  However, as long as you
+ * only call this using an XID found on disk, you're safe.
  */
 int
 BackendXidGetPid(TransactionId xid)
@@ -856,6 +884,63 @@ IsBackendPid(int pid)
 	return (BackendPidGetProc(pid) != NULL);
 }
 
+
+/*
+ * GetCurrentVirtualXIDs -- returns a palloc'd array holding the valid VXIDs
+ * of other backends found in the ProcArray, terminated with an invalid VXID.
+ * The array is sized for maxProcs entries plus the terminator.
+ *
+ * If limitXmin is not InvalidTransactionId, we skip any backends
+ * with xmin >= limitXmin.  Also, our own process is always skipped.
+ */
+VirtualTransactionId *
+GetCurrentVirtualXIDs(TransactionId limitXmin)
+{
+	VirtualTransactionId *vxids;
+	ProcArrayStruct *arrayP = procArray;
+	int			count = 0;
+	int			index;
+
+	/* worst case is one VXID per proc slot, plus room for a terminator */
+	vxids = (VirtualTransactionId *)
+		palloc(sizeof(VirtualTransactionId) * (arrayP->maxProcs + 1));
+
+	LWLockAcquire(ProcArrayLock, LW_SHARED);
+
+	for (index = 0; index < arrayP->numProcs; index++)
+	{
+		PGPROC	   *proc = arrayP->procs[index];
+		/* Fetch xmin just once, in case its owner changes it under us */
+		TransactionId pxmin = proc->xmin;
+
+		if (proc == MyProc)
+			continue;
+
+		/*
+		 * Note that InvalidTransactionId precedes all other XIDs, so a
+		 * proc that hasn't set xmin yet will always be included.
+		 */
+		if (!TransactionIdIsValid(limitXmin) ||
+			TransactionIdPrecedes(pxmin, limitXmin))
+		{
+			VirtualTransactionId vxid;
+
+			GET_VXID_FROM_PGPROC(vxid, *proc);
+			if (VirtualTransactionIdIsValid(vxid))
+				vxids[count++] = vxid;
+		}
+	}
+
+	LWLockRelease(ProcArrayLock);
+
+	/* add the terminator: an invalid VXID marks the end for callers */
+	vxids[count].backendId = InvalidBackendId;
+	vxids[count].localTransactionId = InvalidLocalTransactionId;
+
+	return vxids;
+}
+
+
 /*
  * CountActiveBackends --- count backends (other than myself) that are in
  *		active transactions.  This is used as a heuristic to decide if
@@ -885,7 +970,7 @@ CountActiveBackends(void)
 		if (proc->pid == 0)
 			continue;			/* do not count prepared xacts */
 		if (proc->xid == InvalidTransactionId)
-			continue;			/* do not count if not in a transaction */
+			continue;			/* do not count if no XID assigned */
 		if (proc->waitLock != NULL)
 			continue;			/* do not count if blocked on a lock */
 		count++;
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
index 31c4a2dfad1..99690d8b36b 100644
--- a/src/backend/storage/ipc/sinvaladt.c
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.63 2007/01/05 22:19:38 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.64 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,12 +19,15 @@
 #include "storage/ipc.h"
 #include "storage/lwlock.h"
 #include "storage/pmsignal.h"
+#include "storage/proc.h"
 #include "storage/shmem.h"
 #include "storage/sinvaladt.h"
 
 
 SISeg	   *shmInvalBuffer;
 
+static LocalTransactionId nextLocalTransactionId;
+
 static void CleanupInvalidationState(int status, Datum arg);
 static void SISetProcStateInvalid(SISeg *segP);
 
@@ -40,6 +43,8 @@ SInvalShmemSize(void)
 	size = offsetof(SISeg, procState);
 	size = add_size(size, mul_size(sizeof(ProcState), MaxBackends));
 
+	size = add_size(size, mul_size(sizeof(LocalTransactionId), MaxBackends));
+
 	return size;
 }
 
@@ -51,15 +56,21 @@ void
 SIBufferInit(void)
 {
 	SISeg	   *segP;
+	Size		size;
 	int			i;
 	bool		found;
 
 	/* Allocate space in shared memory */
+	size = offsetof(SISeg, procState);
+	size = add_size(size, mul_size(sizeof(ProcState), MaxBackends));
+
 	shmInvalBuffer = segP = (SISeg *)
-		ShmemInitStruct("shmInvalBuffer", SInvalShmemSize(), &found);
+		ShmemInitStruct("shmInvalBuffer", size, &found);
 	if (found)
 		return;
 
+	segP->nextLXID = ShmemAlloc(sizeof(LocalTransactionId) * MaxBackends);
+
 	/* Clear message counters, save size of procState array */
 	segP->minMsgNum = 0;
 	segP->maxMsgNum = 0;
@@ -69,11 +80,12 @@ SIBufferInit(void)
 
 	/* The buffer[] array is initially all unused, so we need not fill it */
 
-	/* Mark all backends inactive */
+	/* Mark all backends inactive, and initialize nextLXID */
 	for (i = 0; i < segP->maxBackends; i++)
 	{
 		segP->procState[i].nextMsgNum = -1;		/* inactive */
 		segP->procState[i].resetState = false;
+		segP->nextLXID[i] = InvalidLocalTransactionId;
 	}
 }
 
@@ -128,9 +140,15 @@ SIBackendInit(SISeg *segP)
 	elog(DEBUG2, "my backend id is %d", MyBackendId);
 #endif   /* INVALIDDEBUG */
 
+	/* Advertise assigned backend ID in MyProc */
+	MyProc->backendId = MyBackendId;
+
 	/* Reduce free slot count */
 	segP->freeBackends--;
 
+	/* Fetch next local transaction ID into local memory */
+	nextLocalTransactionId = segP->nextLXID[MyBackendId - 1];
+
 	/* mark myself active, with all extant messages already read */
 	stateP->nextMsgNum = segP->maxMsgNum;
 	stateP->resetState = false;
@@ -160,6 +178,9 @@ CleanupInvalidationState(int status, Datum arg)
 
 	LWLockAcquire(SInvalLock, LW_EXCLUSIVE);
 
+	/* Update next local transaction ID for next holder of this backendID */
+	segP->nextLXID[MyBackendId - 1] = nextLocalTransactionId;
+
 	/* Mark myself inactive */
 	segP->procState[MyBackendId - 1].nextMsgNum = -1;
 	segP->procState[MyBackendId - 1].resetState = false;
@@ -352,3 +373,30 @@ SIDelExpiredDataEntries(SISeg *segP)
 		}
 	}
 }
+
+
+/*
+ * GetNextLocalTransactionId --- allocate a new LocalTransactionId
+ *
+ * We split VirtualTransactionIds into two parts so that it is possible
+ * to allocate a new one without any contention for shared memory, except
+ * for a bit of additional overhead during backend startup/shutdown.
+ * The high-order part of a VirtualTransactionId is a BackendId, and the
+ * low-order part is a LocalTransactionId, which we assign from a local
+ * counter.  To avoid the risk of a VirtualTransactionId being reused
+ * within a short interval, successive procs occupying the same backend ID
+ * slot use a consecutive sequence of local IDs: SIBackendInit loads the
+ * counter from segP->nextLXID[] and CleanupInvalidationState saves it back.
+ */
+LocalTransactionId
+GetNextLocalTransactionId(void)
+{
+	LocalTransactionId result;
+
+	/* skip InvalidLocalTransactionId (the initial value, and at wraparound) */
+	do {
+		result = nextLocalTransactionId++;
+	} while (!LocalTransactionIdIsValid(result));
+
+	return result;
+}
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 1c5db363203..f947d226fea 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.92 2007/07/25 22:16:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.93 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -421,8 +421,8 @@ UnlockTuple(Relation relation, ItemPointer tid, LOCKMODE lockmode)
  *		XactLockTableInsert
  *
  * Insert a lock showing that the given transaction ID is running ---
- * this is done during xact startup.  The lock can then be used to wait
- * for the transaction to finish.
+ * this is done when an XID is acquired by a transaction or subtransaction.
+ * The lock can then be used to wait for the transaction to finish.
  */
 void
 XactLockTableInsert(TransactionId xid)
@@ -439,8 +439,7 @@ XactLockTableInsert(TransactionId xid)
  *
  * Delete the lock showing that the given transaction ID is running.
  * (This is never used for main transaction IDs; those locks are only
- * released implicitly at transaction end.	But we do use it for subtrans
- * IDs.)
+ * released implicitly at transaction end.  But we do use it for subtrans IDs.)
  */
 void
 XactLockTableDelete(TransactionId xid)
@@ -472,7 +471,7 @@ XactLockTableWait(TransactionId xid)
 	for (;;)
 	{
 		Assert(TransactionIdIsValid(xid));
-		Assert(!TransactionIdEquals(xid, GetTopTransactionId()));
+		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
 
 		SET_LOCKTAG_TRANSACTION(tag, xid);
 
@@ -500,7 +499,7 @@ ConditionalXactLockTableWait(TransactionId xid)
 	for (;;)
 	{
 		Assert(TransactionIdIsValid(xid));
-		Assert(!TransactionIdEquals(xid, GetTopTransactionId()));
+		Assert(!TransactionIdEquals(xid, GetTopTransactionIdIfAny()));
 
 		SET_LOCKTAG_TRANSACTION(tag, xid);
 
@@ -517,6 +516,70 @@ ConditionalXactLockTableWait(TransactionId xid)
 	return true;
 }
 
+
+/*
+ * 		VirtualXactLockTableInsert
+ *
+ * Take ExclusiveLock on the given virtual transaction ID to show that it is
+ * running --- this is done at main transaction start when its VXID is
+ * assigned.  The lock can then be used to wait for the transaction to finish.
+ */
+void
+VirtualXactLockTableInsert(VirtualTransactionId vxid)
+{
+	LOCKTAG		tag;
+
+	Assert(VirtualTransactionIdIsValid(vxid));
+
+	SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+	(void) LockAcquire(&tag, ExclusiveLock, false, false);
+}
+
+/*
+ * 		VirtualXactLockTableWait
+ *
+ * Waits until the top-level transaction owning the VXID has ended: we take
+ * ShareLock on the VXID (blocked by the owner's ExclusiveLock) and drop it.
+ */
+void
+VirtualXactLockTableWait(VirtualTransactionId vxid)
+{
+	LOCKTAG		tag;
+
+	Assert(VirtualTransactionIdIsValid(vxid));
+
+	SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+	(void) LockAcquire(&tag, ShareLock, false, false);
+
+	LockRelease(&tag, ShareLock, false);
+}
+
+/*
+ * 		ConditionalVirtualXactLockTableWait
+ *
+ * Like VirtualXactLockTableWait, but never blocks.  Returns TRUE if the
+ * VXID lock was obtained (and released again), FALSE if it was unavailable.
+ */
+bool
+ConditionalVirtualXactLockTableWait(VirtualTransactionId vxid)
+{
+	LOCKTAG		tag;
+
+	Assert(VirtualTransactionIdIsValid(vxid));
+
+	SET_LOCKTAG_VIRTUALTRANSACTION(tag, vxid);
+
+	if (LockAcquire(&tag, ShareLock, false, true) == LOCKACQUIRE_NOT_AVAIL)
+		return false;
+
+	LockRelease(&tag, ShareLock, false);
+
+	return true;
+}
+
+
 /*
  *		LockDatabaseObject
  *
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index a4a0910d393..06a4f7adae5 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.177 2007/07/16 21:09:50 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.178 2007/09/05 18:10:47 tgl Exp $
  *
  * NOTES
  *	  A lock table is a shared memory hash table.  When
@@ -1681,20 +1681,24 @@ LockReassignCurrentOwner(void)
 
 /*
  * GetLockConflicts
- *		Get a list of TransactionIds of xacts currently holding locks
+ *		Get an array of VirtualTransactionIds of xacts currently holding locks
  *		that would conflict with the specified lock/lockmode.
  *		xacts merely awaiting such a lock are NOT reported.
  *
+ * The result array is palloc'd and is terminated with an invalid VXID.
+ *
  * Of course, the result could be out of date by the time it's returned,
  * so use of this function has to be thought about carefully.
  *
- * Only top-level XIDs are reported.  Note we never include the current xact
- * in the result list, since an xact never blocks itself.
+ * Note we never include the current xact's vxid in the result array,
+ * since an xact never blocks itself.  Also, prepared transactions are
+ * ignored, which is a bit more debatable but is appropriate for current
+ * uses of the result.
  */
-List *
+VirtualTransactionId *
 GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
 {
-	List	   *result = NIL;
+	VirtualTransactionId *vxids;
 	LOCKMETHODID lockmethodid = locktag->locktag_lockmethodid;
 	LockMethod	lockMethodTable;
 	LOCK	   *lock;
@@ -1703,6 +1707,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
 	PROCLOCK   *proclock;
 	uint32		hashcode;
 	LWLockId	partitionLock;
+	int			count = 0;
 
 	if (lockmethodid <= 0 || lockmethodid >= lengthof(LockMethods))
 		elog(ERROR, "unrecognized lock method: %d", lockmethodid);
@@ -1710,6 +1715,14 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
 	if (lockmode <= 0 || lockmode > lockMethodTable->numLockModes)
 		elog(ERROR, "unrecognized lock mode: %d", lockmode);
 
+	/*
+	 * Allocate memory to store results, and fill with InvalidVXID.  We
+	 * only need enough space for MaxBackends + a terminator, since
+	 * prepared xacts don't count.
+	 */
+	vxids = (VirtualTransactionId *)
+		palloc0(sizeof(VirtualTransactionId) * (MaxBackends + 1));
+
 	/*
 	 * Look up the lock object matching the tag.
 	 */
@@ -1730,7 +1743,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
 		 * on this lockable object.
 		 */
 		LWLockRelease(partitionLock);
-		return NIL;
+		return vxids;
 	}
 
 	/*
@@ -1752,18 +1765,17 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
 			/* A backend never blocks itself */
 			if (proc != MyProc)
 			{
-				/* Fetch xid just once - see GetNewTransactionId */
-				TransactionId xid = proc->xid;
+				VirtualTransactionId vxid;
+
+				GET_VXID_FROM_PGPROC(vxid, *proc);
 
 				/*
-				 * Race condition: during xact commit/abort we zero out
-				 * PGPROC's xid before we mark its locks released.  If we see
-				 * zero in the xid field, assume the xact is in process of
-				 * shutting down and act as though the lock is already
-				 * released.
+				 * If we see an invalid VXID, then either the xact has already
+				 * committed (or aborted), or it's a prepared xact.  In
+				 * either case we may ignore it.
 				 */
-				if (TransactionIdIsValid(xid))
-					result = lappend_xid(result, xid);
+				if (VirtualTransactionIdIsValid(vxid))
+					vxids[count++] = vxid;
 			}
 		}
 
@@ -1773,7 +1785,10 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
 
 	LWLockRelease(partitionLock);
 
-	return result;
+	if (count > MaxBackends)	/* should never happen */
+		elog(PANIC, "too many conflicting locks found");
+
+	return vxids;
 }
 
 
@@ -1782,7 +1797,7 @@ GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode)
  *		Do the preparatory work for a PREPARE: make 2PC state file records
  *		for all locks currently held.
  *
- * Non-transactional locks are ignored.
+ * Non-transactional locks are ignored, as are VXID locks.
  *
  * There are some special cases that we error out on: we can't be holding
  * any session locks (should be OK since only VACUUM uses those) and we
@@ -1812,6 +1827,13 @@ AtPrepare_Locks(void)
 		if (!LockMethods[LOCALLOCK_LOCKMETHOD(*locallock)]->transactional)
 			continue;
 
+		/*
+		 * Ignore VXID locks.  We don't want those to be held by prepared
+		 * transactions, since they aren't meaningful after a restart.
+		 */
+		if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+			continue;
+
 		/* Ignore it if we don't actually hold the lock */
 		if (locallock->nLocks <= 0)
 			continue;
@@ -1899,6 +1921,10 @@ PostPrepare_Locks(TransactionId xid)
 		if (!LockMethods[LOCALLOCK_LOCKMETHOD(*locallock)]->transactional)
 			continue;
 
+		/* Ignore VXID locks */
+		if (locallock->tag.lock.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+			continue;
+
 		/* We already checked there are no session locks */
 
 		/* Mark the proclock to show we need to release this lockmode */
@@ -1944,6 +1970,10 @@ PostPrepare_Locks(TransactionId xid)
 			if (!LockMethods[LOCK_LOCKMETHOD(*lock)]->transactional)
 				goto next_item;
 
+			/* Ignore VXID locks */
+			if (lock->tag.locktag_type == LOCKTAG_VIRTUALTRANSACTION)
+				goto next_item;
+
 			PROCLOCK_PRINT("PostPrepare_Locks", proclock);
 			LOCK_PRINT("PostPrepare_Locks", lock, 0);
 			Assert(lock->nRequested >= 0);
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 048fa31bccd..5441dd322de 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.192 2007/08/28 03:23:44 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.193 2007/09/05 18:10:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -282,10 +282,12 @@ InitProcess(void)
 	 */
 	SHMQueueElemInit(&(MyProc->links));
 	MyProc->waitStatus = STATUS_OK;
+	MyProc->lxid = InvalidLocalTransactionId;
 	MyProc->xid = InvalidTransactionId;
 	MyProc->xmin = InvalidTransactionId;
 	MyProc->pid = MyProcPid;
-	/* databaseId and roleId will be filled in later */
+	/* backendId, databaseId and roleId will be filled in later */
+	MyProc->backendId = InvalidBackendId;
 	MyProc->databaseId = InvalidOid;
 	MyProc->roleId = InvalidOid;
 	MyProc->inCommit = false;
@@ -359,7 +361,9 @@ InitProcessPhase2(void)
  *
  * Auxiliary processes are presently not expected to wait for real (lockmgr)
  * locks, so we need not set up the deadlock checker.  They are never added
- * to the ProcArray or the sinval messaging mechanism, either.
+ * to the ProcArray or the sinval messaging mechanism, either.  They also
+ * don't get a VXID assigned, since this is only useful when we actually
+ * hold lockmgr locks.
  */
 void
 InitAuxiliaryProcess(void)
@@ -418,8 +422,10 @@ InitAuxiliaryProcess(void)
 	 */
 	SHMQueueElemInit(&(MyProc->links));
 	MyProc->waitStatus = STATUS_OK;
+	MyProc->lxid = InvalidLocalTransactionId;
 	MyProc->xid = InvalidTransactionId;
 	MyProc->xmin = InvalidTransactionId;
+	MyProc->backendId = InvalidBackendId;
 	MyProc->databaseId = InvalidOid;
 	MyProc->roleId = InvalidOid;
 	MyProc->inCommit = false;
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 7137d2dc08c..22ac13146c8 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.105 2007/07/20 16:29:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.106 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -347,9 +347,8 @@ smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
 		return;
 
 	/*
-	 * Make a non-transactional XLOG entry showing the file creation. It's
-	 * non-transactional because we should replay it whether the transaction
-	 * commits or not; if not, the file will be dropped at abort time.
+	 * Make an XLOG entry showing the file creation.  If we abort, the file
+	 * will be dropped at abort time.
 	 */
 	xlrec.rnode = reln->smgr_rnode;
 
@@ -358,7 +357,7 @@ smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
 	rdata.buffer = InvalidBuffer;
 	rdata.next = NULL;
 
-	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLOG_NO_TRAN, &rdata);
+	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE, &rdata);
 
 	/* Add the relation to the list of stuff to delete at abort */
 	pending = (PendingRelDelete *)
@@ -554,10 +553,7 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 	if (!isTemp)
 	{
 		/*
-		 * Make a non-transactional XLOG entry showing the file truncation.
-		 * It's non-transactional because we should replay it whether the
-		 * transaction commits or not; the underlying file change is certainly
-		 * not reversible.
+		 * Make an XLOG entry showing the file truncation.
 		 */
 		XLogRecPtr	lsn;
 		XLogRecData rdata;
@@ -571,8 +567,7 @@ smgrtruncate(SMgrRelation reln, BlockNumber nblocks, bool isTemp)
 		rdata.buffer = InvalidBuffer;
 		rdata.next = NULL;
 
-		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE | XLOG_NO_TRAN,
-						 &rdata);
+		lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_TRUNCATE, &rdata);
 	}
 }
 
@@ -679,11 +674,14 @@ smgrDoPendingDeletes(bool isCommit)
  * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
  * If there are no relations to be deleted, *ptr is set to NULL.
  *
+ * If haveNonTemp isn't NULL, the bool it points to gets set to true if
+ * there is any non-temp table pending to be deleted; false if not.
+ *
  * Note that the list does not include anything scheduled for termination
  * by upper-level transactions.
  */
 int
-smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
+smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr, bool *haveNonTemp)
 {
 	int			nestLevel = GetCurrentTransactionNestLevel();
 	int			nrels;
@@ -691,6 +689,8 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 	PendingRelDelete *pending;
 
 	nrels = 0;
+	if (haveNonTemp)
+		*haveNonTemp = false;
 	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
 	{
 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
@@ -707,6 +707,8 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 	{
 		if (pending->nestLevel >= nestLevel && pending->atCommit == forCommit)
 			*rptr++ = pending->relnode;
+		if (haveNonTemp && !pending->isTemp)
+			*haveNonTemp = true;
 	}
 	return nrels;
 }
diff --git a/src/backend/utils/adt/lockfuncs.c b/src/backend/utils/adt/lockfuncs.c
index 2263a946039..e78d74f9efe 100644
--- a/src/backend/utils/adt/lockfuncs.c
+++ b/src/backend/utils/adt/lockfuncs.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2002-2007, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *		$PostgreSQL: pgsql/src/backend/utils/adt/lockfuncs.c,v 1.28 2007/01/05 22:19:41 momjian Exp $
+ *		$PostgreSQL: pgsql/src/backend/utils/adt/lockfuncs.c,v 1.29 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -27,6 +27,7 @@ static const char *const LockTagTypeNames[] = {
 	"page",
 	"tuple",
 	"transactionid",
+	"virtualxid",
 	"object",
 	"userlock",
 	"advisory"
@@ -39,6 +40,27 @@ typedef struct
 	int			currIdx;		/* current PROCLOCK index */
 } PG_Lock_Status;
 
+
+/*
+ * VXIDGetDatum - Build a text Datum holding the printable form of a VXID
+ *
+ * This is currently only used in pg_lock_status, so we put it here.
+ */
+static Datum
+VXIDGetDatum(BackendId bid, LocalTransactionId lxid)
+{
+	/*
+	 * The representation is "<bid>/<lxid>", decimal and unsigned decimal
+	 * respectively.  Keep in sync with the %v log prefix code in elog.c.
+	 */
+	char vxidstr[32];
+
+	snprintf(vxidstr, sizeof(vxidstr), "%d/%u", bid, lxid);
+
+	return DirectFunctionCall1(textin, CStringGetDatum(vxidstr));
+}
+
+
 /*
  * pg_lock_status - produce a view with one row per held or awaited lock mode
  */
@@ -64,7 +86,7 @@ pg_lock_status(PG_FUNCTION_ARGS)
 
 		/* build tupdesc for result tuples */
 		/* this had better match pg_locks view in system_views.sql */
-		tupdesc = CreateTemplateTupleDesc(13, false);
+		tupdesc = CreateTemplateTupleDesc(14, false);
 		TupleDescInitEntry(tupdesc, (AttrNumber) 1, "locktype",
 						   TEXTOID, -1, 0);
 		TupleDescInitEntry(tupdesc, (AttrNumber) 2, "database",
@@ -75,21 +97,23 @@ pg_lock_status(PG_FUNCTION_ARGS)
 						   INT4OID, -1, 0);
 		TupleDescInitEntry(tupdesc, (AttrNumber) 5, "tuple",
 						   INT2OID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 6, "transactionid",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 6, "virtualxid",
+						   TEXTOID, -1, 0);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 7, "transactionid",
 						   XIDOID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 7, "classid",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 8, "classid",
 						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 8, "objid",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 9, "objid",
 						   OIDOID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 9, "objsubid",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 10, "objsubid",
 						   INT2OID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 10, "transaction",
-						   XIDOID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 11, "pid",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 11, "virtualtransaction",
+						   TEXTOID, -1, 0);
+		TupleDescInitEntry(tupdesc, (AttrNumber) 12, "pid",
 						   INT4OID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 12, "mode",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 13, "mode",
 						   TEXTOID, -1, 0);
-		TupleDescInitEntry(tupdesc, (AttrNumber) 13, "granted",
+		TupleDescInitEntry(tupdesc, (AttrNumber) 14, "granted",
 						   BOOLOID, -1, 0);
 
 		funcctx->tuple_desc = BlessTupleDesc(tupdesc);
@@ -120,8 +144,8 @@ pg_lock_status(PG_FUNCTION_ARGS)
 		LOCKMODE	mode = 0;
 		const char *locktypename;
 		char		tnbuf[32];
-		Datum		values[13];
-		char		nulls[13];
+		Datum		values[14];
+		char		nulls[14];
 		HeapTuple	tuple;
 		Datum		result;
 
@@ -193,7 +217,6 @@ pg_lock_status(PG_FUNCTION_ARGS)
 		values[0] = DirectFunctionCall1(textin,
 										CStringGetDatum(locktypename));
 
-
 		switch (lock->tag.locktag_type)
 		{
 			case LOCKTAG_RELATION:
@@ -206,6 +229,7 @@ pg_lock_status(PG_FUNCTION_ARGS)
 				nulls[6] = 'n';
 				nulls[7] = 'n';
 				nulls[8] = 'n';
+				nulls[9] = 'n';
 				break;
 			case LOCKTAG_PAGE:
 				values[1] = ObjectIdGetDatum(lock->tag.locktag_field1);
@@ -216,6 +240,7 @@ pg_lock_status(PG_FUNCTION_ARGS)
 				nulls[6] = 'n';
 				nulls[7] = 'n';
 				nulls[8] = 'n';
+				nulls[9] = 'n';
 				break;
 			case LOCKTAG_TUPLE:
 				values[1] = ObjectIdGetDatum(lock->tag.locktag_field1);
@@ -226,9 +251,22 @@ pg_lock_status(PG_FUNCTION_ARGS)
 				nulls[6] = 'n';
 				nulls[7] = 'n';
 				nulls[8] = 'n';
+				nulls[9] = 'n';
 				break;
 			case LOCKTAG_TRANSACTION:
-				values[5] = TransactionIdGetDatum(lock->tag.locktag_field1);
+				values[6] = TransactionIdGetDatum(lock->tag.locktag_field1);
+				nulls[1] = 'n';
+				nulls[2] = 'n';
+				nulls[3] = 'n';
+				nulls[4] = 'n';
+				nulls[5] = 'n';
+				nulls[7] = 'n';
+				nulls[8] = 'n';
+				nulls[9] = 'n';
+				break;
+			case LOCKTAG_VIRTUALTRANSACTION:
+				values[5] = VXIDGetDatum(lock->tag.locktag_field1,
+										 lock->tag.locktag_field2);
 				nulls[1] = 'n';
 				nulls[2] = 'n';
 				nulls[3] = 'n';
@@ -236,31 +274,33 @@ pg_lock_status(PG_FUNCTION_ARGS)
 				nulls[6] = 'n';
 				nulls[7] = 'n';
 				nulls[8] = 'n';
+				nulls[9] = 'n';
 				break;
 			case LOCKTAG_OBJECT:
 			case LOCKTAG_USERLOCK:
 			case LOCKTAG_ADVISORY:
 			default:			/* treat unknown locktags like OBJECT */
 				values[1] = ObjectIdGetDatum(lock->tag.locktag_field1);
-				values[6] = ObjectIdGetDatum(lock->tag.locktag_field2);
-				values[7] = ObjectIdGetDatum(lock->tag.locktag_field3);
-				values[8] = Int16GetDatum(lock->tag.locktag_field4);
+				values[7] = ObjectIdGetDatum(lock->tag.locktag_field2);
+				values[8] = ObjectIdGetDatum(lock->tag.locktag_field3);
+				values[9] = Int16GetDatum(lock->tag.locktag_field4);
 				nulls[2] = 'n';
 				nulls[3] = 'n';
 				nulls[4] = 'n';
 				nulls[5] = 'n';
+				nulls[6] = 'n';
 				break;
 		}
 
-		values[9] = TransactionIdGetDatum(proc->xid);
+		values[10] = VXIDGetDatum(proc->backendId, proc->lxid);
 		if (proc->pid != 0)
-			values[10] = Int32GetDatum(proc->pid);
+			values[11] = Int32GetDatum(proc->pid);
 		else
-			nulls[10] = 'n';
-		values[11] = DirectFunctionCall1(textin,
+			nulls[11] = 'n';
+		values[12] = DirectFunctionCall1(textin,
 					  CStringGetDatum(GetLockmodeName(LOCK_LOCKMETHOD(*lock),
 													  mode)));
-		values[12] = BoolGetDatum(granted);
+		values[13] = BoolGetDatum(granted);
 
 		tuple = heap_formtuple(funcctx->tuple_desc, values, nulls);
 		result = HeapTupleGetDatum(tuple);
diff --git a/src/backend/utils/error/elog.c b/src/backend/utils/error/elog.c
index d0d024e075a..e8a3ed3db0e 100644
--- a/src/backend/utils/error/elog.c
+++ b/src/backend/utils/error/elog.c
@@ -42,7 +42,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.195 2007/08/23 01:24:43 adunstan Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/error/elog.c,v 1.196 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -66,6 +66,7 @@
 #include "postmaster/postmaster.h"
 #include "postmaster/syslogger.h"
 #include "storage/ipc.h"
+#include "storage/proc.h"
 #include "tcop/tcopprot.h"
 #include "utils/memutils.h"
 #include "utils/ps_status.h"
@@ -1592,9 +1593,14 @@ log_line_prefix(StringInfo buf)
 				if (MyProcPort == NULL)
 					i = format_len;
 				break;
+			case 'v':
+				/* keep VXID format in sync with lockfuncs.c */
+				if (MyProc != NULL)
+					appendStringInfo(buf, "%d/%u",
+									 MyProc->backendId, MyProc->lxid);
+				break;
 			case 'x':
-				if (MyProcPort)
-					appendStringInfo(buf, "%u", GetTopTransactionId());
+				appendStringInfo(buf, "%u", GetTopTransactionIdIfAny());
 				break;
 			case '%':
 				appendStringInfoChar(buf, '%');
@@ -1785,15 +1791,8 @@ write_csvlog(ErrorData *edata)
 	appendStringInfoString(&buf, formatted_start_time);
 	appendStringInfoChar(&buf, ',');
 
-
 	/* Transaction id */
-	if (MyProcPort)
-	{
-		if (IsTransactionState())
-			appendStringInfo(&buf, "%u", GetTopTransactionId());
-		else
-			appendStringInfo(&buf, "%u", InvalidTransactionId);
-	}
+	appendStringInfo(&buf, "%u", GetTopTransactionIdIfAny());
 
 	appendStringInfoChar(&buf, ',');
 
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index b22099c2fd7..7de3145aa01 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -338,7 +338,8 @@
 					#   %c = session id
 					#   %l = session line number
 					#   %s = session start timestamp
-					#   %x = transaction id
+					#   %v = virtual transaction id
+					#   %x = transaction id (0 if none)
 					#   %q = stop here in non-session 
 					#        processes
 					#   %% = '%'
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index e8e2b08de42..731269af9a0 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.88 2007/08/01 22:45:09 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.89 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -139,6 +139,7 @@ typedef struct xl_xact_abort_prepared
 extern bool IsTransactionState(void);
 extern bool IsAbortedTransactionBlockState(void);
 extern TransactionId GetTopTransactionId(void);
+extern TransactionId GetTopTransactionIdIfAny(void);
 extern TransactionId GetCurrentTransactionId(void);
 extern TransactionId GetCurrentTransactionIdIfAny(void);
 extern SubTransactionId GetCurrentSubTransactionId(void);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 2e1928dace0..372a43797a4 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.82 2007/08/01 22:45:09 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.83 2007/09/05 18:10:48 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -85,12 +85,6 @@ typedef struct XLogRecord
  */
 #define XLR_BKP_REMOVABLE		0x01
 
-/*
- * Sometimes we log records which are out of transaction control.
- * Rmgr may "or" XLOG_NO_TRAN into info passed to XLogInsert to indicate this.
- */
-#define XLOG_NO_TRAN			XLR_INFO_MASK
-
 /* Sync methods */
 #define SYNC_METHOD_FSYNC		0
 #define SYNC_METHOD_FDATASYNC	1
@@ -139,10 +133,7 @@ typedef struct XLogRecData
 
 extern TimeLineID ThisTimeLineID;		/* current TLI */
 extern bool InRecovery;
-extern XLogRecPtr MyLastRecPtr;
-extern bool MyXactMadeXLogEntry;
-extern bool MyXactMadeTempRelUpdate;
-extern XLogRecPtr ProcLastRecEnd;
+extern XLogRecPtr XactLastRecEnd;
 
 /* these variables are GUC parameters related to XLOG */
 extern int	CheckPointSegments;
diff --git a/src/include/c.h b/src/include/c.h
index 35e7bb9150f..d808609ab01 100644
--- a/src/include/c.h
+++ b/src/include/c.h
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/c.h,v 1.220 2007/07/25 12:22:52 mha Exp $
+ * $PostgreSQL: pgsql/src/include/c.h,v 1.221 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -370,6 +370,8 @@ typedef regproc RegProcedure;
 
 typedef uint32 TransactionId;
 
+typedef uint32 LocalTransactionId;
+
 typedef uint32 SubTransactionId;
 
 #define InvalidSubTransactionId		((SubTransactionId) 0)
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index e229f161f94..dcd9c90ecbb 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.422 2007/09/04 16:41:42 adunstan Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.423 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200709041
+#define CATALOG_VERSION_NO	200709042
 
 #endif
diff --git a/src/include/storage/lmgr.h b/src/include/storage/lmgr.h
index 36474cd2781..fedf6b1fffb 100644
--- a/src/include/storage/lmgr.h
+++ b/src/include/storage/lmgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lmgr.h,v 1.58 2007/06/19 20:13:22 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lmgr.h,v 1.59 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -55,6 +55,11 @@ extern void XactLockTableDelete(TransactionId xid);
 extern void XactLockTableWait(TransactionId xid);
 extern bool ConditionalXactLockTableWait(TransactionId xid);
 
+/* Lock a VXID (used to wait for a transaction to finish) */
+extern void VirtualXactLockTableInsert(VirtualTransactionId vxid);
+extern void VirtualXactLockTableWait(VirtualTransactionId vxid);
+extern bool ConditionalVirtualXactLockTableWait(VirtualTransactionId vxid);
+
 /* Lock a general object (other than a relation) of the current database */
 extern void LockDatabaseObject(Oid classid, Oid objid, uint16 objsubid,
 				   LOCKMODE lockmode);
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index e2a5bc7b6f5..30c8a3fa2bc 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.106 2007/06/19 20:13:22 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.107 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -15,6 +15,7 @@
 #define LOCK_H_
 
 #include "nodes/pg_list.h"
+#include "storage/backendid.h"
 #include "storage/itemptr.h"
 #include "storage/lwlock.h"
 #include "storage/shmem.h"
@@ -41,6 +42,37 @@ extern bool Debug_deadlocks;
 #endif   /* LOCK_DEBUG */
 
 
+/*
+ * Top-level transactions are identified by VirtualTransactionIDs comprising
+ * the BackendId of the backend running the xact, plus a locally-assigned
+ * LocalTransactionId.  These are guaranteed unique over the short term, but
+ * will be reused after a database restart; hence never store them on disk.
+ *
+ * Note that struct VirtualTransactionId cannot be assumed to be atomically
+ * assignable as a whole.  However, type LocalTransactionId is assumed to
+ * be atomically assignable, and the backend ID doesn't change often enough
+ * to be a problem, so we can fetch or assign the two fields separately.
+ * We deliberately refrain from using the struct within PGPROC, to prevent
+ * coding errors from trying to use struct assignment with it; instead use
+ * GET_VXID_FROM_PGPROC().  Note that the macro takes the PGPROC struct
+ * itself (it uses "." member access), not a pointer to it.
+ */
+typedef struct
+{
+	BackendId	backendId;		/* determined at backend startup */
+	LocalTransactionId localTransactionId;	/* backend-local transaction id */
+} VirtualTransactionId;
+
+#define InvalidLocalTransactionId		0
+#define LocalTransactionIdIsValid(lxid)	((lxid) != InvalidLocalTransactionId)
+#define VirtualTransactionIdIsValid(vxid) \
+	(((vxid).backendId != InvalidBackendId) && \
+	 LocalTransactionIdIsValid((vxid).localTransactionId))
+#define GET_VXID_FROM_PGPROC(vxid, proc) \
+	((vxid).backendId = (proc).backendId, \
+	 (vxid).localTransactionId = (proc).lxid)
+
+
 /*
  * LOCKMODE is an integer (1..N) indicating a lock type.  LOCKMASK is a bit
  * mask indicating a set of held or requested lock types (the bit 1<<mode
@@ -139,6 +171,8 @@ typedef enum LockTagType
 	/* ID info for a tuple is PAGE info + OffsetNumber */
 	LOCKTAG_TRANSACTION,		/* transaction (for waiting for xact done) */
 	/* ID info for a transaction is its TransactionId */
+	LOCKTAG_VIRTUALTRANSACTION,	/* virtual transaction (ditto) */
+	/* ID info for a virtual transaction is its VirtualTransactionId */
 	LOCKTAG_OBJECT,				/* non-relation database object */
 	/* ID info for an object is DB OID + CLASS OID + OBJECT OID + SUBID */
 
@@ -214,6 +248,14 @@ typedef struct LOCKTAG
 	 (locktag).locktag_type = LOCKTAG_TRANSACTION, \
 	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
 
+#define SET_LOCKTAG_VIRTUALTRANSACTION(locktag,vxid) \
+	((locktag).locktag_field1 = (vxid).backendId, /* BackendId */ \
+	 (locktag).locktag_field2 = (vxid).localTransactionId, /* LXID */ \
+	 (locktag).locktag_field3 = 0, /* unused */ \
+	 (locktag).locktag_field4 = 0, /* unused */ \
+	 (locktag).locktag_type = LOCKTAG_VIRTUALTRANSACTION, \
+	 (locktag).locktag_lockmethodid = DEFAULT_LOCKMETHOD)
+
 #define SET_LOCKTAG_OBJECT(locktag,dboid,classoid,objoid,objsubid) \
 	((locktag).locktag_field1 = (dboid), \
 	 (locktag).locktag_field2 = (classoid), \
@@ -431,7 +473,8 @@ extern bool LockRelease(const LOCKTAG *locktag,
 extern void LockReleaseAll(LOCKMETHODID lockmethodid, bool allLocks);
 extern void LockReleaseCurrentOwner(void);
 extern void LockReassignCurrentOwner(void);
-extern List *GetLockConflicts(const LOCKTAG *locktag, LOCKMODE lockmode);
+extern VirtualTransactionId *GetLockConflicts(const LOCKTAG *locktag,
+											  LOCKMODE lockmode);
 extern void AtPrepare_Locks(void);
 extern void PostPrepare_Locks(TransactionId xid);
 extern int LockCheckConflicts(LockMethod lockMethodTable,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 756b0ffb0e7..9fefa0a5a93 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.99 2007/07/25 12:22:53 mha Exp $
+ * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.100 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -62,8 +62,13 @@ struct PGPROC
 	PGSemaphoreData sem;		/* ONE semaphore to sleep on */
 	int			waitStatus;		/* STATUS_WAITING, STATUS_OK or STATUS_ERROR */
 
-	TransactionId xid;			/* transaction currently being executed by
-								 * this proc */
+	LocalTransactionId lxid;	/* local id of top-level transaction currently
+								 * being executed by this proc, if running;
+								 * else InvalidLocalTransactionId */
+
+	TransactionId xid;			/* id of top-level transaction currently being
+								 * executed by this proc, if running and XID
+								 * is assigned; else InvalidTransactionId */
 
 	TransactionId xmin;			/* minimal running XID as it was when we were
 								 * starting our xact, excluding LAZY VACUUM:
@@ -71,6 +76,7 @@ struct PGPROC
 								 * xid >= xmin ! */
 
 	int			pid;			/* This backend's process id, or 0 */
+	BackendId	backendId;		/* This backend's backend ID (if assigned) */
 	Oid			databaseId;		/* OID of database this backend is using */
 	Oid			roleId;			/* OID of role using this backend */
 
diff --git a/src/include/storage/procarray.h b/src/include/storage/procarray.h
index dafb83a9658..524710506a7 100644
--- a/src/include/storage/procarray.h
+++ b/src/include/storage/procarray.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.14 2007/06/01 19:38:07 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/procarray.h,v 1.15 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -33,6 +33,7 @@ extern PGPROC *BackendPidGetProc(int pid);
 extern int	BackendXidGetPid(TransactionId xid);
 extern bool IsBackendPid(int pid);
 
+extern VirtualTransactionId *GetCurrentVirtualXIDs(TransactionId limitXmin);
 extern int	CountActiveBackends(void);
 extern int	CountDBBackends(Oid databaseid);
 extern int	CountUserBackends(Oid roleid);
diff --git a/src/include/storage/sinvaladt.h b/src/include/storage/sinvaladt.h
index 778d7a4a2ba..ff0a68e25a2 100644
--- a/src/include/storage/sinvaladt.h
+++ b/src/include/storage/sinvaladt.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/sinvaladt.h,v 1.42 2007/01/05 22:19:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/sinvaladt.h,v 1.43 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -85,6 +85,13 @@ typedef struct SISeg
 	int			maxBackends;	/* size of procState array */
 	int			freeBackends;	/* number of empty procState slots */
 
+	/*
+	 * Next LocalTransactionId to use for each idle backend slot.  We keep
+	 * this here because it is indexed by BackendId and it is convenient to
+	 * copy the value to and from local memory when MyBackendId is set.
+	 */
+	LocalTransactionId *nextLXID; /* array of maxBackends entries */
+
 	/*
 	 * Circular buffer holding shared-inval messages
 	 */
@@ -114,4 +121,6 @@ extern int SIGetDataEntry(SISeg *segP, int backendId,
 			   SharedInvalidationMessage *data);
 extern void SIDelExpiredDataEntries(SISeg *segP);
 
+extern LocalTransactionId GetNextLocalTransactionId(void);
+
 #endif   /* SINVALADT_H */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 3beb14febaf..bc071e7ef05 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.58 2007/01/17 16:25:01 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.59 2007/09/05 18:10:48 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -76,7 +76,8 @@ extern void smgrtruncate(SMgrRelation reln, BlockNumber nblocks,
 			 bool isTemp);
 extern void smgrimmedsync(SMgrRelation reln);
 extern void smgrDoPendingDeletes(bool isCommit);
-extern int	smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr);
+extern int	smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr,
+								  bool *haveNonTemp);
 extern void AtSubCommit_smgr(void);
 extern void AtSubAbort_smgr(void);
 extern void PostPrepare_smgr(void);
diff --git a/src/test/regress/expected/rules.out b/src/test/regress/expected/rules.out
index 39e27a74924..ebbf8d16262 100644
--- a/src/test/regress/expected/rules.out
+++ b/src/test/regress/expected/rules.out
@@ -1282,7 +1282,7 @@ SELECT viewname, definition FROM pg_views WHERE schemaname <> 'information_schem
  pg_cursors               | SELECT c.name, c.statement, c.is_holdable, c.is_binary, c.is_scrollable, c.creation_time FROM pg_cursor() c(name text, statement text, is_holdable boolean, is_binary boolean, is_scrollable boolean, creation_time timestamp with time zone);
  pg_group                 | SELECT pg_authid.rolname AS groname, pg_authid.oid AS grosysid, ARRAY(SELECT pg_auth_members.member FROM pg_auth_members WHERE (pg_auth_members.roleid = pg_authid.oid)) AS grolist FROM pg_authid WHERE (NOT pg_authid.rolcanlogin);
  pg_indexes               | SELECT n.nspname AS schemaname, c.relname AS tablename, i.relname AS indexname, t.spcname AS tablespace, pg_get_indexdef(i.oid) AS indexdef FROM ((((pg_index x JOIN pg_class c ON ((c.oid = x.indrelid))) JOIN pg_class i ON ((i.oid = x.indexrelid))) LEFT JOIN pg_namespace n ON ((n.oid = c.relnamespace))) LEFT JOIN pg_tablespace t ON ((t.oid = i.reltablespace))) WHERE ((c.relkind = 'r'::"char") AND (i.relkind = 'i'::"char"));
- pg_locks                 | SELECT l.locktype, l.database, l.relation, l.page, l.tuple, l.transactionid, l.classid, l.objid, l.objsubid, l.transaction, l.pid, l.mode, l.granted FROM pg_lock_status() l(locktype text, database oid, relation oid, page integer, tuple smallint, transactionid xid, classid oid, objid oid, objsubid smallint, transaction xid, pid integer, mode text, granted boolean);
+ pg_locks                 | SELECT l.locktype, l.database, l.relation, l.page, l.tuple, l.virtualxid, l.transactionid, l.classid, l.objid, l.objsubid, l.virtualtransaction, l.pid, l.mode, l.granted FROM pg_lock_status() l(locktype text, database oid, relation oid, page integer, tuple smallint, virtualxid text, transactionid xid, classid oid, objid oid, objsubid smallint, virtualtransaction text, pid integer, mode text, granted boolean);
  pg_prepared_statements   | SELECT p.name, p.statement, p.prepare_time, p.parameter_types, p.from_sql FROM pg_prepared_statement() p(name text, statement text, prepare_time timestamp with time zone, parameter_types regtype[], from_sql boolean);
  pg_prepared_xacts        | SELECT p.transaction, p.gid, p.prepared, u.rolname AS owner, d.datname AS database FROM ((pg_prepared_xact() p(transaction xid, gid text, prepared timestamp with time zone, ownerid oid, dbid oid) LEFT JOIN pg_authid u ON ((p.ownerid = u.oid))) LEFT JOIN pg_database d ON ((p.dbid = d.oid)));
  pg_roles                 | SELECT pg_authid.rolname, pg_authid.rolsuper, pg_authid.rolinherit, pg_authid.rolcreaterole, pg_authid.rolcreatedb, pg_authid.rolcatupdate, pg_authid.rolcanlogin, pg_authid.rolconnlimit, '********'::text AS rolpassword, pg_authid.rolvaliduntil, pg_authid.rolconfig, pg_authid.oid FROM pg_authid;
-- 
GitLab