From 573a71a5da70d6e2503c8f53e3b4f26b3b6d738d Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 1 Jul 2004 00:52:04 +0000
Subject: [PATCH] Nested transactions.  There is still much left to do,
 especially on the performance front, but with feature freeze upon us I think
 it's time to drive a stake in the ground and say that this will be in 7.5.

Alvaro Herrera, with some help from Tom Lane.
---
 contrib/userlock/user_locks.c              |    3 +-
 src/backend/access/gist/gistscan.c         |   44 +-
 src/backend/access/hash/hashscan.c         |   44 +-
 src/backend/access/rtree/rtscan.c          |   44 +-
 src/backend/access/transam/Makefile        |    4 +-
 src/backend/access/transam/clog.c          |   71 +-
 src/backend/access/transam/rmgr.c          |    6 +-
 src/backend/access/transam/slru.c          |   57 +-
 src/backend/access/transam/subtrans.c      |  388 +++++
 src/backend/access/transam/transam.c       |  187 ++-
 src/backend/access/transam/varsup.c        |   13 +-
 src/backend/access/transam/xact.c          | 1521 +++++++++++++++++---
 src/backend/access/transam/xlog.c          |    7 +-
 src/backend/commands/async.c               |   72 +-
 src/backend/commands/tablecmds.c           |   71 +-
 src/backend/commands/trigger.c             |  577 ++++++--
 src/backend/commands/vacuum.c              |    6 +-
 src/backend/commands/variable.c            |   11 +-
 src/backend/executor/spi.c                 |  131 +-
 src/backend/postmaster/pgstat.c            |   67 +-
 src/backend/storage/buffer/bufmgr.c        |  119 +-
 src/backend/storage/ipc/ipci.c             |    8 +-
 src/backend/storage/ipc/sinval.c           |   93 +-
 src/backend/storage/lmgr/lmgr.c            |   20 +-
 src/backend/storage/lmgr/lock.c            |   45 +-
 src/backend/storage/lmgr/lwlock.c          |    6 +-
 src/backend/storage/lmgr/proc.c            |   34 +-
 src/backend/storage/smgr/smgr.c            |  115 +-
 src/backend/tcop/postgres.c                |    5 +-
 src/backend/utils/cache/catcache.c         |  187 ++-
 src/backend/utils/cache/inval.c            |  230 +--
 src/backend/utils/cache/relcache.c         |  143 +-
 src/backend/utils/init/postinit.c          |    9 +-
 src/backend/utils/misc/README              |   69 +-
 src/backend/utils/misc/guc.c               |  532 +++++--
 src/backend/utils/mmgr/README              |   37 +-
 src/backend/utils/mmgr/mcxt.c              |    3 +-
 src/backend/utils/mmgr/portalmem.c         |   93 +-
 src/backend/utils/time/tqual.c             |   34 +-
 src/bin/initdb/initdb.c                    |    4 +-
 src/include/access/clog.h                  |   15 +-
 src/include/access/gistscan.h              |    3 +-
 src/include/access/hash.h                  |    3 +-
 src/include/access/htup.h                  |   67 +-
 src/include/access/rmgr.h                  |    4 +-
 src/include/access/rtree.h                 |    3 +-
 src/include/access/slru.h                  |   11 +-
 src/include/access/subtrans.h              |   35 +
 src/include/access/transam.h               |    7 +-
 src/include/access/xact.h                  |   41 +-
 src/include/access/xlog.h                  |    4 +-
 src/include/catalog/catversion.h           |    4 +-
 src/include/commands/async.h               |    5 +-
 src/include/commands/tablecmds.h           |    7 +-
 src/include/commands/trigger.h             |   38 +-
 src/include/executor/spi.h                 |    3 +-
 src/include/executor/spi_priv.h            |    3 +-
 src/include/storage/bufmgr.h               |    4 +-
 src/include/storage/bufpage.h              |    9 +-
 src/include/storage/lock.h                 |   29 +-
 src/include/storage/proc.h                 |    5 +-
 src/include/storage/smgr.h                 |    5 +-
 src/include/utils/catcache.h               |   10 +-
 src/include/utils/guc.h                    |    6 +-
 src/include/utils/guc_tables.h             |   52 +-
 src/include/utils/inval.h                  |   12 +-
 src/include/utils/memutils.h               |    3 +-
 src/include/utils/portal.h                 |    5 +-
 src/include/utils/rel.h                    |    5 +-
 src/include/utils/relcache.h               |    6 +-
 src/test/regress/expected/transactions.out |   67 +
 src/test/regress/expected/without_oid.out  |   23 +-
 src/test/regress/sql/transactions.sql      |   45 +
 src/test/regress/sql/without_oid.sql       |   21 +-
 74 files changed, 4521 insertions(+), 1149 deletions(-)
 create mode 100644 src/backend/access/transam/subtrans.c
 create mode 100644 src/include/access/subtrans.h

diff --git a/contrib/userlock/user_locks.c b/contrib/userlock/user_locks.c
index e1ee603f80e..0996970a9f4 100644
--- a/contrib/userlock/user_locks.c
+++ b/contrib/userlock/user_locks.c
@@ -75,8 +75,7 @@ user_write_unlock_oid(Oid oid)
 int
 user_unlock_all(void)
 {
-	return LockReleaseAll(USER_LOCKMETHOD, MyProc, false,
-						  InvalidTransactionId);
+	return LockReleaseAll(USER_LOCKMETHOD, MyProc, ReleaseAll, 0, NULL);
 }
 
 /* end of file */
diff --git a/src/backend/access/gist/gistscan.c b/src/backend/access/gist/gistscan.c
index 30bb9b810af..dc424a6773d 100644
--- a/src/backend/access/gist/gistscan.c
+++ b/src/backend/access/gist/gistscan.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/gist/gistscan.c,v 1.51 2004/01/07 18:56:23 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/gist/gistscan.c,v 1.52 2004/07/01 00:49:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -41,6 +41,7 @@ static void adjustiptr(IndexScanDesc s, ItemPointer iptr,
 typedef struct GISTScanListData
 {
 	IndexScanDesc gsl_scan;
+	TransactionId gsl_creatingXid;
 	struct GISTScanListData *gsl_next;
 } GISTScanListData;
 
@@ -223,6 +224,7 @@ gistregscan(IndexScanDesc s)
 
 	l = (GISTScanList) palloc(sizeof(GISTScanListData));
 	l->gsl_scan = s;
+	l->gsl_creatingXid = GetCurrentTransactionId();
 	l->gsl_next = GISTScans;
 	GISTScans = l;
 }
@@ -271,6 +273,46 @@ AtEOXact_gist(void)
 	GISTScans = NULL;
 }
 
+/*
+ * AtEOSubXact_gist() --- clean up gist subsystem at subxact abort or commit.
+ *
+ * This is here because it needs to touch this module's static var GISTScans.
+ */
+void
+AtEOSubXact_gist(TransactionId childXid)
+{
+	GISTScanList l;
+	GISTScanList prev;
+	GISTScanList next;
+
+	/*
+	 * Note: these actions should only be necessary during xact abort; but
+	 * they can't hurt during a commit.
+	 */
+
+	/*
+	 * Forget active scans that were started in this subtransaction.
+	 */
+	prev = NULL;
+
+	for (l = GISTScans; l != NULL; l = next)
+	{
+		next = l->gsl_next;
+		if (l->gsl_creatingXid == childXid)
+		{
+			if (prev == NULL)
+				GISTScans = next;
+			else
+				prev->gsl_next = next;
+
+			pfree(l);
+			/* prev does not change */
+		}
+		else
+			prev = l;
+	}
+}
+
 void
 gistadjscans(Relation rel, int op, BlockNumber blkno, OffsetNumber offnum)
 {
diff --git a/src/backend/access/hash/hashscan.c b/src/backend/access/hash/hashscan.c
index fcf2a01cddb..d107596c750 100644
--- a/src/backend/access/hash/hashscan.c
+++ b/src/backend/access/hash/hashscan.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashscan.c,v 1.33 2004/01/07 18:56:23 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashscan.c,v 1.34 2004/07/01 00:49:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,6 +21,7 @@
 typedef struct HashScanListData
 {
 	IndexScanDesc hashsl_scan;
+	TransactionId hashsl_creatingXid;
 	struct HashScanListData *hashsl_next;
 } HashScanListData;
 
@@ -50,6 +51,46 @@ AtEOXact_hash(void)
 	HashScans = NULL;
 }
 
+/*
+ * AtEOSubXact_hash() --- clean up hash subsystem at subxact abort or commit.
+ *
+ * This is here because it needs to touch this module's static var HashScans.
+ */
+void
+AtEOSubXact_hash(TransactionId childXid)
+{
+	HashScanList l;
+	HashScanList prev;
+	HashScanList next;
+
+	/*
+	 * Note: these actions should only be necessary during xact abort; but
+	 * they can't hurt during a commit.
+	 */
+
+	/*
+	 * Forget active scans that were started in this subtransaction.
+	 */
+	prev = NULL;
+
+	for (l = HashScans; l != NULL; l = next)
+	{
+		next = l->hashsl_next;
+		if (l->hashsl_creatingXid == childXid)
+		{
+			if (prev == NULL)
+				HashScans = next;
+			else
+				prev->hashsl_next = next;
+
+			pfree(l);
+			/* prev does not change */
+		}
+		else
+			prev = l;
+	}
+}
+
 /*
  *	_Hash_regscan() -- register a new scan.
  */
@@ -60,6 +101,7 @@ _hash_regscan(IndexScanDesc scan)
 
 	new_el = (HashScanList) palloc(sizeof(HashScanListData));
 	new_el->hashsl_scan = scan;
+	new_el->hashsl_creatingXid = GetCurrentTransactionId();
 	new_el->hashsl_next = HashScans;
 	HashScans = new_el;
 }
diff --git a/src/backend/access/rtree/rtscan.c b/src/backend/access/rtree/rtscan.c
index 9dac2a15c06..d3530966e6d 100644
--- a/src/backend/access/rtree/rtscan.c
+++ b/src/backend/access/rtree/rtscan.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.51 2004/01/07 18:56:24 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/rtree/rtscan.c,v 1.52 2004/07/01 00:49:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -42,6 +42,7 @@ static void adjustiptr(IndexScanDesc s, ItemPointer iptr,
 typedef struct RTScanListData
 {
 	IndexScanDesc rtsl_scan;
+	TransactionId rtsl_creatingXid;
 	struct RTScanListData *rtsl_next;
 } RTScanListData;
 
@@ -240,6 +241,7 @@ rtregscan(IndexScanDesc s)
 
 	l = (RTScanList) palloc(sizeof(RTScanListData));
 	l->rtsl_scan = s;
+	l->rtsl_creatingXid = GetCurrentTransactionId();
 	l->rtsl_next = RTScans;
 	RTScans = l;
 }
@@ -290,6 +292,46 @@ AtEOXact_rtree(void)
 	RTScans = NULL;
 }
 
+/*
+ * AtEOSubXact_rtree() --- clean up rtree subsystem at subxact abort or commit.
+ *
+ * This is here because it needs to touch this module's static var RTScans.
+ */
+void
+AtEOSubXact_rtree(TransactionId childXid)
+{
+	RTScanList l;
+	RTScanList prev;
+	RTScanList next;
+
+	/*
+	 * Note: these actions should only be necessary during xact abort; but
+	 * they can't hurt during a commit.
+	 */
+
+	/*
+	 * Forget active scans that were started in this subtransaction.
+	 */
+	prev = NULL;
+
+	for (l = RTScans; l != NULL; l = next)
+	{
+		next = l->rtsl_next;
+		if (l->rtsl_creatingXid == childXid)
+		{
+			if (prev == NULL)
+				RTScans = next;
+			else
+				prev->rtsl_next = next;
+
+			pfree(l);
+			/* prev does not change */
+		}
+		else
+			prev = l;
+	}
+}
+
 void
 rtadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum)
 {
diff --git a/src/backend/access/transam/Makefile b/src/backend/access/transam/Makefile
index 762ecf0ab7f..fe740a045f8 100644
--- a/src/backend/access/transam/Makefile
+++ b/src/backend/access/transam/Makefile
@@ -4,7 +4,7 @@
 #    Makefile for access/transam
 #
 # IDENTIFICATION
-#    $PostgreSQL: pgsql/src/backend/access/transam/Makefile,v 1.18 2003/11/29 19:51:40 pgsql Exp $
+#    $PostgreSQL: pgsql/src/backend/access/transam/Makefile,v 1.19 2004/07/01 00:49:42 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,7 +12,7 @@ subdir = src/backend/access/transam
 top_builddir = ../../../..
 include $(top_builddir)/src/Makefile.global
 
-OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o
+OBJS = clog.o transam.o varsup.o xact.o xlog.o xlogutils.o rmgr.o slru.o subtrans.o
 
 all: SUBSYS.o
 
diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 97f887d0a06..54514a24e71 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.21 2004/07/01 00:49:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,14 +21,13 @@
 
 #include <fcntl.h>
 #include <dirent.h>
-#include <errno.h>
 #include <sys/stat.h>
 #include <unistd.h>
 
 #include "access/clog.h"
 #include "access/slru.h"
-#include "storage/lwlock.h"
 #include "miscadmin.h"
+#include "storage/lwlock.h"
 
 
 /*
@@ -65,7 +64,7 @@
  * is guaranteed flushed through the XLOG commit record before we are called
  * to log a commit, so the WAL rule "write xlog before data" is satisfied
  * automatically for commits, and we don't really care for aborts.  Therefore,
- * we don't need to mark XLOG pages with LSN information; we have enough
+ * we don't need to mark CLOG pages with LSN information; we have enough
  * synchronization already.
  *----------
  */
@@ -95,20 +94,22 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 	char	   *byteptr;
 
 	Assert(status == TRANSACTION_STATUS_COMMITTED ||
-		   status == TRANSACTION_STATUS_ABORTED);
+		   status == TRANSACTION_STATUS_ABORTED ||
+		   status == TRANSACTION_STATUS_SUB_COMMITTED);
 
 	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
 	byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
 	byteptr += byteno;
 
-	/* Current state should be 0 or target state */
+	/* Current state should be 0, subcommitted or target state */
 	Assert(((*byteptr >> bshift) & CLOG_XACT_BITMASK) == 0 ||
+		   ((*byteptr >> bshift) & CLOG_XACT_BITMASK) == TRANSACTION_STATUS_SUB_COMMITTED ||
 		   ((*byteptr >> bshift) & CLOG_XACT_BITMASK) == status);
 
 	*byteptr |= (status << bshift);
 
-	/* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
+	/* ...->page_status[slotno] = SLRU_PAGE_DIRTY; already done */
 
 	LWLockRelease(ClogCtl->ControlLock);
 }
@@ -117,7 +118,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
  * Interrogate the state of a transaction in the commit log.
  *
  * NB: this is a low-level routine and is NOT the preferred entry point
- * for most uses; TransactionLogTest() in transam.c is the intended caller.
+ * for most uses; TransactionLogFetch() in transam.c is the intended caller.
  */
 XidStatus
 TransactionIdGetStatus(TransactionId xid)
@@ -176,7 +177,7 @@ BootStrapCLOG(void)
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(ClogCtl, slotno, NULL);
-	/* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
+	/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
 
 	LWLockRelease(ClogCtl->ControlLock);
 }
@@ -211,7 +212,8 @@ StartupCLOG(void)
 	/*
 	 * Initialize our idea of the latest page number.
 	 */
-	SimpleLruSetLatestPage(ClogCtl, TransactionIdToPage(ShmemVariableCache->nextXid));
+	SimpleLruSetLatestPage(ClogCtl,
+						   TransactionIdToPage(ShmemVariableCache->nextXid));
 }
 
 /*
@@ -333,51 +335,20 @@ WriteZeroPageXlogRec(int pageno)
 	rdata.data = (char *) (&pageno);
 	rdata.len = sizeof(int);
 	rdata.next = NULL;
-	(void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE | XLOG_NO_TRAN, &rdata);
-}
-
-/*
- * CLOG resource manager's routines
- */
-void
-clog_redo(XLogRecPtr lsn, XLogRecord *record)
-{
-	uint8		info = record->xl_info & ~XLR_INFO_MASK;
-
-	if (info == CLOG_ZEROPAGE)
-	{
-		int			pageno;
-		int			slotno;
-
-		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
-
-		LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
-
-		slotno = ZeroCLOGPage(pageno, false);
-		SimpleLruWritePage(ClogCtl, slotno, NULL);
-		/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
-
-		LWLockRelease(ClogCtl->ControlLock);
-	}
+	(void) XLogInsert(RM_SLRU_ID, CLOG_ZEROPAGE | XLOG_NO_TRAN, &rdata);
 }
 
+/* Redo a ZEROPAGE action during WAL replay */
 void
-clog_undo(XLogRecPtr lsn, XLogRecord *record)
+clog_zeropage_redo(int pageno)
 {
-}
+	int			slotno;
 
-void
-clog_desc(char *buf, uint8 xl_info, char *rec)
-{
-	uint8		info = xl_info & ~XLR_INFO_MASK;
+	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
-	if (info == CLOG_ZEROPAGE)
-	{
-		int			pageno;
+	slotno = ZeroCLOGPage(pageno, false);
+	SimpleLruWritePage(ClogCtl, slotno, NULL);
+	/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
 
-		memcpy(&pageno, rec, sizeof(int));
-		sprintf(buf + strlen(buf), "zeropage: %d", pageno);
-	}
-	else
-		strcat(buf, "UNKNOWN");
+	LWLockRelease(ClogCtl->ControlLock);
 }
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index 112363bf291..d6c8c93ca6e 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,16 +3,16 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.12 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.13 2004/07/01 00:49:42 tgl Exp $
  */
 #include "postgres.h"
 
-#include "access/clog.h"
 #include "access/gist.h"
 #include "access/hash.h"
 #include "access/heapam.h"
 #include "access/nbtree.h"
 #include "access/rtree.h"
+#include "access/slru.h"
 #include "access/xact.h"
 #include "access/xlog.h"
 #include "storage/smgr.h"
@@ -23,7 +23,7 @@ RmgrData	RmgrTable[RM_MAX_ID + 1] = {
 	{"XLOG", xlog_redo, xlog_undo, xlog_desc, NULL, NULL},
 	{"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL},
 	{"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL},
-	{"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL},
+	{"SLRU", slru_redo, slru_undo, slru_desc, NULL, NULL},
 	{"Reserved 4", NULL, NULL, NULL, NULL, NULL},
 	{"Reserved 5", NULL, NULL, NULL, NULL, NULL},
 	{"Reserved 6", NULL, NULL, NULL, NULL, NULL},
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 58798d0f07f..0181e2d6260 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.17 2004/07/01 00:49:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,8 +16,9 @@
 #include <sys/stat.h>
 #include <unistd.h>
 
+#include "access/clog.h"
 #include "access/slru.h"
-#include "access/clog.h"		/* only for NUM_CLOG_BUFFERS */
+#include "access/subtrans.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/lwlock.h"
@@ -1025,3 +1026,55 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
 
 	return found;
 }
+
+/*
+ * SLRU resource manager's routines
+ */
+void
+slru_redo(XLogRecPtr lsn, XLogRecord *record)
+{
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
+	int			pageno;
+
+	memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+	switch (info)
+	{
+		case CLOG_ZEROPAGE:
+			clog_zeropage_redo(pageno);
+			break;
+		case SUBTRANS_ZEROPAGE:
+			subtrans_zeropage_redo(pageno);
+			break;
+		default:
+			elog(PANIC, "slru_redo: unknown op code %u", info);
+	}
+}
+
+void
+slru_undo(XLogRecPtr lsn, XLogRecord *record)
+{
+}
+
+void
+slru_desc(char *buf, uint8 xl_info, char *rec)
+{
+	uint8		info = xl_info & ~XLR_INFO_MASK;
+
+	if (info == CLOG_ZEROPAGE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, rec, sizeof(int));
+		sprintf(buf + strlen(buf), "clog zeropage: %d", pageno);
+	}
+	else if (info == SUBTRANS_ZEROPAGE)
+	{
+		int			pageno;
+
+		memcpy(&pageno, rec, sizeof(int));
+		sprintf(buf + strlen(buf), "subtrans zeropage: %d", pageno);
+	}
+	else
+		strcat(buf, "UNKNOWN");
+}
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
new file mode 100644
index 00000000000..1babedbe590
--- /dev/null
+++ b/src/backend/access/transam/subtrans.c
@@ -0,0 +1,388 @@
+/*-------------------------------------------------------------------------
+ *
+ * subtrans.c
+ *		PostgreSQL subtrans-log manager
+ *
+ * The pg_subtrans manager is a pg_clog-like manager which stores the parent
+ * transaction Id for each transaction.  It is a fundamental part of the
+ * nested transactions implementation.  A main transaction has a parent
+ * of InvalidTransactionId, and each subtransaction has its immediate parent.
+ * The tree can easily be walked from child to parent, but not in the
+ * opposite direction.
+ *
+ * This code is mostly derived from clog.c.
+ *
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.1 2004/07/01 00:49:42 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
+ */
+#include "postgres.h"
+
+#include <fcntl.h>
+#include <dirent.h>
+#include <sys/stat.h>
+#include <unistd.h>
+
+#include "access/slru.h"
+#include "access/subtrans.h"
+#include "miscadmin.h"
+#include "storage/lwlock.h"
+
+
+/*
+ * Defines for SubTrans page and segment sizes.  A page is the same BLCKSZ
+ * as is used everywhere else in Postgres.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * SubTrans page numbering also wraps around at
+ * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at
+ * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need take no
+ * explicit notice of that fact in this module, except when comparing segment
+ * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes).
+ */
+
+/* We need four bytes per xact */
+#define SUBTRANS_XACTS_PER_PAGE (BLCKSZ / sizeof(TransactionId))
+
+#define TransactionIdToPage(xid) ((xid) / (TransactionId) SUBTRANS_XACTS_PER_PAGE)
+#define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE)
+
+
+/*----------
+ * Shared-memory data structures for SUBTRANS control
+ *
+ * XLOG interactions: this module generates an XLOG record whenever a new
+ * SUBTRANS page is initialized to zeroes.	Other writes of SUBTRANS come from
+ * recording of transaction commit or abort in xact.c, which generates its
+ * own XLOG records for these events and will re-perform the status update
+ * on redo; so we need make no additional XLOG entry here.	Also, the XLOG
+ * is guaranteed flushed through the XLOG commit record before we are called
+ * to log a commit, so the WAL rule "write xlog before data" is satisfied
+ * automatically for commits, and we don't really care for aborts.  Therefore,
+ * we don't need to mark SUBTRANS pages with LSN information; we have enough
+ * synchronization already.
+ *----------
+ */
+
+
+static SlruCtlData SubTransCtlData;
+static SlruCtl SubTransCtl = &SubTransCtlData;
+
+
+static int	ZeroSUBTRANSPage(int pageno, bool writeXlog);
+static bool SubTransPagePrecedes(int page1, int page2);
+static void WriteZeroPageXlogRec(int pageno);
+
+
+/*
+ * Record the parent of a subtransaction in the subtrans log.
+ */
+void
+SubTransSetParent(TransactionId xid, TransactionId parent)
+{
+	int			pageno = TransactionIdToPage(xid);
+	int			entryno = TransactionIdToEntry(xid);
+	TransactionId *ptr;
+
+	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+
+	ptr = (TransactionId *) SimpleLruReadPage(SubTransCtl, pageno, xid, true);
+	ptr += entryno;
+
+	/* Current state should be 0 or target state */
+	Assert(*ptr == InvalidTransactionId || *ptr == parent);
+
+	*ptr = parent;
+
+	/* ...->page_status[slotno] = SLRU_PAGE_DIRTY; already done */
+
+	LWLockRelease(SubTransCtl->ControlLock);
+}
+
+/*
+ * Interrogate the parent of a transaction in the subtrans log.
+ */
+TransactionId
+SubTransGetParent(TransactionId xid)
+{
+	int			pageno = TransactionIdToPage(xid);
+	int			entryno = TransactionIdToEntry(xid);
+	TransactionId *ptr;
+	TransactionId	parent;
+
+	/* Bootstrap and frozen XIDs have no parent */
+	if (!TransactionIdIsNormal(xid))
+		return InvalidTransactionId;
+
+	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+
+	ptr = (TransactionId *) SimpleLruReadPage(SubTransCtl, pageno, xid, false);
+	ptr += entryno;
+
+	parent = *ptr;
+
+	LWLockRelease(SubTransCtl->ControlLock);
+
+	return parent;
+}
+
+/*
+ * SubTransGetTopmostTransaction
+ *
+ * Returns the topmost transaction of the given transaction id.
+ */
+TransactionId
+SubTransGetTopmostTransaction(TransactionId xid)
+{
+	TransactionId parentXid = xid,
+				  previousXid = xid;
+
+	while (TransactionIdIsValid(parentXid))
+	{
+		previousXid = parentXid;
+		parentXid = SubTransGetParent(parentXid);
+	}
+
+	Assert(TransactionIdIsValid(previousXid));
+
+	return previousXid;
+}
+
+/*
+ * SubTransXidsHaveCommonAncestor
+ *
+ * Returns true iff the Xids have a common ancestor
+ */
+bool
+SubTransXidsHaveCommonAncestor(TransactionId xid1, TransactionId xid2)
+{
+	if (TransactionIdEquals(xid1, xid2))
+		return true;
+
+	while (TransactionIdIsValid(xid1) && TransactionIdIsValid(xid2))
+	{
+		if (TransactionIdPrecedes(xid2, xid1))
+			xid1 = SubTransGetParent(xid1);
+		else
+			xid2 = SubTransGetParent(xid2);
+
+		if (TransactionIdEquals(xid1, xid2))
+			return true;
+	}
+
+	return false;
+}
+
+/*
+ * Initialization of shared memory for Subtrans
+ */
+
+int
+SUBTRANSShmemSize(void)
+{
+	return SimpleLruShmemSize();
+}
+
+void
+SUBTRANSShmemInit(void)
+{
+	SimpleLruInit(SubTransCtl, "SUBTRANS Ctl", "pg_subtrans");
+	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
+}
+
+/*
+ * This func must be called ONCE on system install.  It creates
+ * the initial SubTrans segment.  (The SubTrans directory is assumed to
+ * have been created by initdb, and SUBTRANSShmemInit must have been called
+ * already.)
+ */
+void
+BootStrapSUBTRANS(void)
+{
+	int			slotno;
+
+	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+
+	/* Create and zero the first page of the subtrans log */
+	slotno = ZeroSUBTRANSPage(0, false);
+
+	/* Make sure it's written out */
+	SimpleLruWritePage(SubTransCtl, slotno, NULL);
+	/* Assert(SubTransCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
+
+	LWLockRelease(SubTransCtl->ControlLock);
+}
+
+/*
+ * Initialize (or reinitialize) a page of SubTrans to zeroes.
+ * If writeXlog is TRUE, also emit an XLOG record saying we did this.
+ *
+ * The page is not actually written, just set up in shared memory.
+ * The slot number of the new page is returned.
+ *
+ * Control lock must be held at entry, and will be held at exit.
+ */
+static int
+ZeroSUBTRANSPage(int pageno, bool writeXlog)
+{
+	int			slotno = SimpleLruZeroPage(SubTransCtl, pageno);
+
+	if (writeXlog)
+		WriteZeroPageXlogRec(pageno);
+
+	return slotno;
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend startup,
+ * after StartupXLOG has initialized ShmemVariableCache->nextXid.
+ */
+void
+StartupSUBTRANS(void)
+{
+	/*
+	 * Initialize our idea of the latest page number.
+	 */
+	SimpleLruSetLatestPage(SubTransCtl,
+						   TransactionIdToPage(ShmemVariableCache->nextXid));
+}
+
+/*
+ * This must be called ONCE during postmaster or standalone-backend shutdown
+ */
+void
+ShutdownSUBTRANS(void)
+{
+	SimpleLruFlush(SubTransCtl, false);
+}
+
+/*
+ * Perform a checkpoint --- either during shutdown, or on-the-fly
+ */
+void
+CheckPointSUBTRANS(void)
+{
+	SimpleLruFlush(SubTransCtl, true);
+}
+
+
+/*
+ * Make sure that SubTrans has room for a newly-allocated XID.
+ *
+ * NB: this is called while holding XidGenLock.  We want it to be very fast
+ * most of the time; even when it's not so fast, no actual I/O need happen
+ * unless we're forced to write out a dirty subtrans or xlog page to make room
+ * in shared memory.
+ */
+void
+ExtendSUBTRANS(TransactionId newestXact)
+{
+	int			pageno;
+
+	/*
+	 * No work except at first XID of a page.  But beware: just after
+	 * wraparound, the first XID of page zero is FirstNormalTransactionId.
+	 */
+	if (TransactionIdToEntry(newestXact) != 0 &&
+		!TransactionIdEquals(newestXact, FirstNormalTransactionId))
+		return;
+
+	pageno = TransactionIdToPage(newestXact);
+
+	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+
+	/* Zero the page and make an XLOG entry about it */
+	ZeroSUBTRANSPage(pageno, true);
+
+	LWLockRelease(SubTransCtl->ControlLock);
+}
+
+
+/*
+ * Remove all SubTrans segments before the one holding the passed transaction ID
+ *
+ * When this is called, we know that the database logically contains no
+ * reference to transaction IDs older than oldestXact.	However, we must
+ * not truncate the SubTrans until we have performed a checkpoint, to ensure
+ * that no such references remain on disk either; else a crash just after
+ * the truncation might leave us with a problem.  Since SubTrans segments hold
+ * a large number of transactions, the opportunity to actually remove a
+ * segment is fairly rare, and so it seems best not to do the checkpoint
+ * unless we have confirmed that there is a removable segment.	Therefore
+ * we issue the checkpoint command here, not in higher-level code as might
+ * seem cleaner.
+ */
+void
+TruncateSUBTRANS(TransactionId oldestXact)
+{
+	int			cutoffPage;
+
+	/*
+	 * The cutoff point is the start of the segment containing oldestXact.
+	 * We pass the *page* containing oldestXact to SimpleLruTruncate.
+	 */
+	cutoffPage = TransactionIdToPage(oldestXact);
+	SimpleLruTruncate(SubTransCtl, cutoffPage);
+}
+
+
+/*
+ * Decide which of two SubTrans page numbers is "older" for truncation purposes.
+ *
+ * We need to use comparison of TransactionIds here in order to do the right
+ * thing with wraparound XID arithmetic.  However, if we are asked about
+ * page number zero, we don't want to hand InvalidTransactionId to
+ * TransactionIdPrecedes: it'll get weird about permanent xact IDs.  So,
+ * offset both xids by FirstNormalTransactionId to avoid that.
+ */
+static bool
+SubTransPagePrecedes(int page1, int page2)
+{
+	TransactionId xid1;
+	TransactionId xid2;
+
+	xid1 = ((TransactionId) page1) * SUBTRANS_XACTS_PER_PAGE;
+	xid1 += FirstNormalTransactionId;
+	xid2 = ((TransactionId) page2) * SUBTRANS_XACTS_PER_PAGE;
+	xid2 += FirstNormalTransactionId;
+
+	return TransactionIdPrecedes(xid1, xid2);
+}
+
+
+/*
+ * Write a ZEROPAGE xlog record
+ *
+ * Note: xlog record is marked as outside transaction control, since we
+ * want it to be redone whether the invoking transaction commits or not.
+ * (Besides which, this is normally done just before entering a transaction.)
+ */
+static void
+WriteZeroPageXlogRec(int pageno)
+{
+	XLogRecData rdata;
+
+	rdata.buffer = InvalidBuffer;
+	rdata.data = (char *) (&pageno);
+	rdata.len = sizeof(int);
+	rdata.next = NULL;
+	(void) XLogInsert(RM_SLRU_ID, SUBTRANS_ZEROPAGE | XLOG_NO_TRAN, &rdata);
+}
+
+/* Redo a ZEROPAGE action during WAL replay */
+void
+subtrans_zeropage_redo(int pageno)
+{
+	int			slotno;
+
+	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+
+	slotno = ZeroSUBTRANSPage(pageno, false);
+	SimpleLruWritePage(SubTransCtl, slotno, NULL);
+	/* Assert(SubTransCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
+
+	LWLockRelease(SubTransCtl->ControlLock);
+}
diff --git a/src/backend/access/transam/transam.c b/src/backend/access/transam/transam.c
index bbd4f08bf06..34d281de587 100644
--- a/src/backend/access/transam/transam.c
+++ b/src/backend/access/transam/transam.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.56 2003/11/29 19:51:40 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/transam.c,v 1.57 2004/07/01 00:49:42 tgl Exp $
  *
  * NOTES
  *	  This file contains the high level access-method interface to the
@@ -20,6 +20,7 @@
 #include "postgres.h"
 
 #include "access/clog.h"
+#include "access/subtrans.h"
 #include "access/transam.h"
 
 
@@ -35,44 +36,40 @@
 bool		AMI_OVERRIDE = false;
 
 
-static bool TransactionLogTest(TransactionId transactionId, XidStatus status);
+static XidStatus TransactionLogFetch(TransactionId transactionId);
 static void TransactionLogUpdate(TransactionId transactionId,
 					 XidStatus status);
 
 /* ----------------
- *		Single-item cache for results of TransactionLogTest.
+ *		Single-item cache for results of TransactionLogFetch.
  * ----------------
  */
-static TransactionId cachedTestXid = InvalidTransactionId;
-static XidStatus cachedTestXidStatus;
+static TransactionId cachedFetchXid = InvalidTransactionId;
+static XidStatus cachedFetchXidStatus;
 
 
 /* ----------------------------------------------------------------
  *		postgres log access method interface
  *
- *		TransactionLogTest
+ *		TransactionLogFetch
  *		TransactionLogUpdate
  * ----------------------------------------------------------------
  */
 
-/* --------------------------------
- *		TransactionLogTest
- * --------------------------------
+/*
+ * TransactionLogFetch --- fetch commit status of specified transaction id
  */
-
-static bool						/* true/false: does transaction id have
-								 * specified status? */
-TransactionLogTest(TransactionId transactionId, /* transaction id to test */
-				   XidStatus status)	/* transaction status */
+static XidStatus
+TransactionLogFetch(TransactionId transactionId)
 {
-	XidStatus	xidstatus;		/* recorded status of xid */
+	XidStatus	xidstatus;
 
 	/*
 	 * Before going to the commit log manager, check our single item cache
 	 * to see if we didn't just check the transaction status a moment ago.
 	 */
-	if (TransactionIdEquals(transactionId, cachedTestXid))
-		return (status == cachedTestXidStatus);
+	if (TransactionIdEquals(transactionId, cachedFetchXid))
+		return cachedFetchXidStatus;
 
 	/*
 	 * Also, check to see if the transaction ID is a permanent one.
@@ -80,10 +77,10 @@ TransactionLogTest(TransactionId transactionId, /* transaction id to test */
 	if (!TransactionIdIsNormal(transactionId))
 	{
 		if (TransactionIdEquals(transactionId, BootstrapTransactionId))
-			return (status == TRANSACTION_STATUS_COMMITTED);
+			return TRANSACTION_STATUS_COMMITTED;
 		if (TransactionIdEquals(transactionId, FrozenTransactionId))
-			return (status == TRANSACTION_STATUS_COMMITTED);
-		return (status == TRANSACTION_STATUS_ABORTED);
+			return TRANSACTION_STATUS_COMMITTED;
+		return TRANSACTION_STATUS_ABORTED;
 	}
 
 	/*
@@ -92,15 +89,17 @@ TransactionLogTest(TransactionId transactionId, /* transaction id to test */
 	xidstatus = TransactionIdGetStatus(transactionId);
 
 	/*
-	 * DO NOT cache status for unfinished transactions!
+	 * DO NOT cache status for unfinished or sub-committed transactions!
+	 * We only cache status that is guaranteed not to change.
 	 */
-	if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS)
+	if (xidstatus != TRANSACTION_STATUS_IN_PROGRESS &&
+		xidstatus != TRANSACTION_STATUS_SUB_COMMITTED)
 	{
-		TransactionIdStore(transactionId, &cachedTestXid);
-		cachedTestXidStatus = xidstatus;
+		TransactionIdStore(transactionId, &cachedFetchXid);
+		cachedFetchXidStatus = xidstatus;
 	}
 
-	return (status == xidstatus);
+	return xidstatus;
 }
 
 /* --------------------------------
@@ -115,12 +114,23 @@ TransactionLogUpdate(TransactionId transactionId,		/* trans id to update */
 	 * update the commit log
 	 */
 	TransactionIdSetStatus(transactionId, status);
+}
 
-	/*
-	 * update (invalidate) our single item TransactionLogTest cache.
-	 */
-	TransactionIdStore(transactionId, &cachedTestXid);
-	cachedTestXidStatus = status;
+/*
+ * TransactionLogMultiUpdate
+ *
+ * Set each of the nxids transaction ids in xids[] to the given status.
+ * Don't depend on this being atomic; it's not (one update call per xid).
+ */
+static void
+TransactionLogMultiUpdate(int nxids, TransactionId *xids, XidStatus status)
+{
+	int i;
+
+	Assert(nxids != 0);		/* callers skip the empty case */
+
+	for (i = 0; i < nxids; i++)
+		TransactionIdSetStatus(xids[i], status);
 }
 
 /* --------------------------------
@@ -171,13 +181,38 @@ AmiTransactionOverride(bool flag)
 bool							/* true if given transaction committed */
 TransactionIdDidCommit(TransactionId transactionId)
 {
+	XidStatus	xidstatus;
+
 	if (AMI_OVERRIDE)
 	{
 		Assert(transactionId == BootstrapTransactionId);
 		return true;
 	}
 
-	return TransactionLogTest(transactionId, TRANSACTION_STATUS_COMMITTED);
+	xidstatus = TransactionLogFetch(transactionId);
+
+	/*
+	 * If it's marked committed, it's committed.
+	 */
+	if (xidstatus == TRANSACTION_STATUS_COMMITTED)
+		return true;
+
+	/*
+	 * If it's marked subcommitted, we have to check the parent recursively.
+	 */
+	if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
+	{
+		TransactionId parentXid;
+	   
+		parentXid = SubTransGetParent(transactionId);
+		Assert(TransactionIdIsValid(parentXid));
+		return TransactionIdDidCommit(parentXid);
+	}
+
+	/* 
+	 * It's not committed.
+	 */
+	return false;
 }
 
 /*
@@ -190,35 +225,49 @@ TransactionIdDidCommit(TransactionId transactionId)
 bool							/* true if given transaction aborted */
 TransactionIdDidAbort(TransactionId transactionId)
 {
+	XidStatus	xidstatus;
+
 	if (AMI_OVERRIDE)
 	{
 		Assert(transactionId == BootstrapTransactionId);
 		return false;
 	}
 
-	return TransactionLogTest(transactionId, TRANSACTION_STATUS_ABORTED);
-}
+	xidstatus = TransactionLogFetch(transactionId);
 
-/*
- * Now this func in shmem.c and gives quality answer by scanning
- * PGPROC structures of all running backend. - vadim 11/26/96
- *
- * Old comments:
- * true if given transaction has neither committed nor aborted
- */
-#ifdef NOT_USED
-bool
-TransactionIdIsInProgress(TransactionId transactionId)
-{
-	if (AMI_OVERRIDE)
+	/*
+	 * If it's marked aborted, it's aborted.
+	 */
+	if (xidstatus == TRANSACTION_STATUS_ABORTED)
+		return true;
+
+	/*
+	 * If it's marked subcommitted, we have to check the parent recursively.
+	 * 
+	 * If we detect that the parent has aborted, update pg_clog to show the
+	 * subtransaction as aborted.  This is only needed when the parent
+	 * crashed before either committing or aborting.  We want to clean up
+	 * pg_clog so future visitors don't need to make this check again.
+	 */
+	if (xidstatus == TRANSACTION_STATUS_SUB_COMMITTED)
 	{
-		Assert(transactionId == BootstrapTransactionId);
-		return false;
+		TransactionId parentXid;
+		bool parentAborted;
+	   
+		parentXid = SubTransGetParent(transactionId);
+		parentAborted = TransactionIdDidAbort(parentXid);
+
+		if (parentAborted)
+			TransactionIdAbort(transactionId);
+
+		return parentAborted;
 	}
 
-	return TransactionLogTest(transactionId, TRANSACTION_STATUS_IN_PROGRESS);
+	/*
+	 * It's not aborted.
+	 */
+	return false;
 }
-#endif   /* NOT_USED */
 
 /* --------------------------------
  *		TransactionId Commit
@@ -252,6 +301,46 @@ TransactionIdAbort(TransactionId transactionId)
 	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_ABORTED);
 }
 
+/*
+ * TransactionIdSubCommit
+ *		Marks the subtransaction associated with the identifier as
+ *		sub-committed.
+ */
+void
+TransactionIdSubCommit(TransactionId transactionId)
+{
+	TransactionLogUpdate(transactionId, TRANSACTION_STATUS_SUB_COMMITTED);
+}
+
+/*
+ * TransactionIdCommitTree
+ *		Marks all the given transaction ids as committed.
+ *
+ * The caller has to be sure that this is used only to mark subcommitted
+ * subtransactions as committed, and only *after* marking the toplevel
+ * parent as committed.  Otherwise there is a race condition against
+ * TransactionIdDidCommit.
+ */
+void
+TransactionIdCommitTree(int nxids, TransactionId *xids)
+{
+	if (nxids > 0)
+		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_COMMITTED);
+}
+
+/*
+ * TransactionIdAbortTree
+ *		Marks all the given transaction ids as aborted.
+ *
+ * We don't need to worry about the non-atomic behavior, since any onlookers
+ * will consider all the xacts as not-yet-committed anyway.
+ */
+void
+TransactionIdAbortTree(int nxids, TransactionId *xids)
+{
+	if (nxids > 0)
+		TransactionLogMultiUpdate(nxids, xids, TRANSACTION_STATUS_ABORTED);
+}
 
 /*
  * TransactionIdPrecedes --- is id1 logically < id2?
diff --git a/src/backend/access/transam/varsup.c b/src/backend/access/transam/varsup.c
index 617c7d19c43..9d3b0b323aa 100644
--- a/src/backend/access/transam/varsup.c
+++ b/src/backend/access/transam/varsup.c
@@ -6,7 +6,7 @@
  * Copyright (c) 2000-2003, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.55 2004/01/26 19:15:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/varsup.c,v 1.56 2004/07/01 00:49:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -14,6 +14,7 @@
 #include "postgres.h"
 
 #include "access/clog.h"
+#include "access/subtrans.h"
 #include "access/transam.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
@@ -30,7 +31,7 @@ VariableCache ShmemVariableCache = NULL;
  * Allocate the next XID for my new transaction.
  */
 TransactionId
-GetNewTransactionId(void)
+GetNewTransactionId(bool isSubXact)
 {
 	TransactionId xid;
 
@@ -52,8 +53,11 @@ GetNewTransactionId(void)
 	 * commit a later XID before we zero the page.	Fortunately, a page of
 	 * the commit log holds 32K or more transactions, so we don't have to
 	 * do this very often.
+	 *
+	 * Extend pg_subtrans too.
 	 */
 	ExtendCLOG(xid);
+	ExtendSUBTRANS(xid);
 
 	/*
 	 * Now advance the nextXid counter.  This must not happen until after
@@ -82,8 +86,11 @@ GetNewTransactionId(void)
 	 * its own spinlock used only for fetching/storing that PGPROC's xid.
 	 * (SInvalLock would then mean primarily that PGPROCs couldn't be added/
 	 * removed while holding the lock.)
+	 *
+	 * We don't want a subtransaction to update the stored Xid; we'll check
+	 * if a transaction Xid is a running subxact by checking pg_subtrans.
 	 */
-	if (MyProc != NULL)
+	if (MyProc != NULL && !isSubXact)
 		MyProc->xid = xid;
 
 	LWLockRelease(XidGenLock);
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 2ae0fc5b21d..fcf5b374453 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.168 2004/06/03 02:08:00 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.169 2004/07/01 00:49:42 tgl Exp $
  *
  * NOTES
  *		Transaction aborts can now occur two ways:
@@ -148,6 +148,7 @@
 #include "access/hash.h"
 #include "access/nbtree.h"
 #include "access/rtree.h"
+#include "access/subtrans.h"
 #include "access/xact.h"
 #include "catalog/heap.h"
 #include "catalog/index.h"
@@ -190,20 +191,53 @@ static void CommitTransaction(void);
 static void RecordTransactionAbort(void);
 static void StartTransaction(void);
 
+static void RecordSubTransactionCommit(void);
+static void StartSubTransaction(void);
+static void CommitSubTransaction(void);
+static void AbortSubTransaction(void);
+static void CleanupSubTransaction(void);
+static void PushTransaction(void);
+static void PopTransaction(void);
+
+static void AtSubAbort_Locks(void);
+static void AtSubAbort_Memory(void);
+static void AtSubCleanup_Memory(void);
+static void AtSubCommit_Memory(void);
+static void AtSubStart_Memory(void);
+
+static void ShowTransactionState(const char *str);
+static void ShowTransactionStateRec(TransactionState state);
+static const char *BlockStateAsString(TBlockState blockState);
+static const char *TransStateAsString(TransState state);
+
 /*
- *	global variables holding the current transaction state.
+ * CurrentTransactionState always points to the current transaction state
+ * block.  It will point to TopTransactionStateData when not in a
+ * transaction at all, or when in a top-level transaction.
  */
-static TransactionStateData CurrentTransactionStateData = {
+static TransactionStateData TopTransactionStateData = {
 	0,							/* transaction id */
 	FirstCommandId,				/* command id */
-	0,							/* scan command id */
-	0x0,						/* start time */
 	TRANS_DEFAULT,				/* transaction state */
-	TBLOCK_DEFAULT				/* transaction block state from the client
+	TBLOCK_DEFAULT,				/* transaction block state from the client
 								 * perspective */
+	0,							/* nesting level */
+	NULL,						/* cur transaction context */
+	NIL,						/* subcommitted child Xids */
+	0,							/* entry-time current userid */
+	NULL						/* link to parent state block */
 };
 
-static TransactionState CurrentTransactionState = &CurrentTransactionStateData;
+static TransactionState CurrentTransactionState = &TopTransactionStateData;
+
+/*
+ * These vars hold the value of now(), ie, the transaction start time.
+ * This does not change as we enter and exit subtransactions, so we don't
+ * keep it inside the TransactionState stack.
+ */
+static AbsoluteTime xactStartTime;			/* integer part */
+static int		xactStartTimeUsec;			/* microsecond part */
+
 
 /*
  *	User-tweakable parameters
@@ -282,13 +316,27 @@ IsAbortedTransactionBlockState(void)
 {
 	TransactionState s = CurrentTransactionState;
 
-	if (s->blockState == TBLOCK_ABORT)
+	if (s->blockState == TBLOCK_ABORT || 
+			s->blockState == TBLOCK_SUBABORT)
 		return true;
 
 	return false;
 }
 
 
+/*
+ *	GetTopTransactionId
+ *
+ * Get the ID of the main transaction, even if we are currently inside
+ * a subtransaction.
+ */
+TransactionId
+GetTopTransactionId(void)
+{
+	return TopTransactionStateData.transactionIdData;
+}
+
+
 /*
  *	GetCurrentTransactionId
  */
@@ -319,9 +367,7 @@ GetCurrentCommandId(void)
 AbsoluteTime
 GetCurrentTransactionStartTime(void)
 {
-	TransactionState s = CurrentTransactionState;
-
-	return s->startTime;
+	return xactStartTime;
 }
 
 
@@ -331,11 +377,23 @@ GetCurrentTransactionStartTime(void)
 AbsoluteTime
 GetCurrentTransactionStartTimeUsec(int *msec)
 {
-	TransactionState s = CurrentTransactionState;
+	*msec = xactStartTimeUsec;
+	return xactStartTime;
+}
+
 
-	*msec = s->startTimeUsec;
+/*
+ *	GetCurrentTransactionNestLevel
+ *
+ * Note: this will return zero when not inside any transaction, one when
+ * inside a top-level transaction, etc.
+ */
+int
+GetCurrentTransactionNestLevel(void)
+{
+	TransactionState s = CurrentTransactionState;
 
-	return s->startTime;
+	return s->nestingLevel;
 }
 
 
@@ -358,19 +416,27 @@ TransactionIdIsCurrentTransactionId(TransactionId xid)
 		return false;
 	}
 
-	return TransactionIdEquals(xid, s->transactionIdData);
-}
+	/*
+	 * We will return true for the Xid of the current subtransaction,
+	 * any of its subcommitted children, any of its parents, or any of
+	 * their previously subcommitted children.
+	 */
+	while (s != NULL)
+	{
+		ListCell *cell;
 
+		if (TransactionIdEquals(xid, s->transactionIdData))
+			return true;
+		foreach(cell, s->childXids)
+		{
+			if (TransactionIdEquals(xid, lfirst_int(cell)))
+				return true;
+		}
 
-/*
- *	CommandIdIsCurrentCommandId
- */
-bool
-CommandIdIsCurrentCommandId(CommandId cid)
-{
-	TransactionState s = CurrentTransactionState;
+		s = s->parent;
+	}
 
-	return (cid == s->commandId);
+	return false;
 }
 
 
@@ -437,13 +503,15 @@ AtStart_Locks(void)
 static void
 AtStart_Memory(void)
 {
+	TransactionState s = CurrentTransactionState;
+
 	/*
 	 * We shouldn't have a transaction context already.
 	 */
 	Assert(TopTransactionContext == NULL);
 
 	/*
-	 * Create a toplevel context for the transaction, and make it active.
+	 * Create a toplevel context for the transaction.
 	 */
 	TopTransactionContext =
 		AllocSetContextCreate(TopMemoryContext,
@@ -452,9 +520,47 @@ AtStart_Memory(void)
 							  ALLOCSET_DEFAULT_INITSIZE,
 							  ALLOCSET_DEFAULT_MAXSIZE);
 
-	MemoryContextSwitchTo(TopTransactionContext);
+	/*
+	 * In a top-level transaction, CurTransactionContext is the same as
+	 * TopTransactionContext.
+	 */
+	CurTransactionContext = TopTransactionContext;
+	s->curTransactionContext = CurTransactionContext;
+
+	/* Make the CurTransactionContext active. */
+	MemoryContextSwitchTo(CurTransactionContext);
 }
 
+/* ----------------------------------------------------------------
+ *						StartSubTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtSubStart_Memory --- create and switch into this subxact's memory context
+ */
+static void
+AtSubStart_Memory(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	Assert(CurTransactionContext != NULL);	/* parent level must have one */
+
+	/*
+	 * Create a CurTransactionContext, which will be used to hold data that
+	 * survives subtransaction commit but disappears on subtransaction abort.
+	 * We make it a child of the immediate parent's CurTransactionContext.
+	 */
+	CurTransactionContext = AllocSetContextCreate(CurTransactionContext,
+												  "CurTransactionContext",
+												  ALLOCSET_DEFAULT_MINSIZE,
+												  ALLOCSET_DEFAULT_INITSIZE,
+												  ALLOCSET_DEFAULT_MAXSIZE);
+	s->curTransactionContext = CurTransactionContext;
+
+	/* Make the CurTransactionContext active. */
+	MemoryContextSwitchTo(CurTransactionContext);
+}
 
 /* ----------------------------------------------------------------
  *						CommitTransaction stuff
@@ -467,13 +573,25 @@ AtStart_Memory(void)
 void
 RecordTransactionCommit(void)
 {
+	int			nrels;
+	RelFileNode *rptr;
+	int			nchildren;
+	TransactionId *children;
+
+	/* Get data needed for commit record */
+	nrels = smgrGetPendingDeletes(true, &rptr);
+	nchildren = xactGetCommittedChildren(&children, false);
+
 	/*
-	 * If we made neither any XLOG entries nor any temp-rel updates, we
-	 * can omit recording the transaction commit at all.
+	 * If we made neither any XLOG entries nor any temp-rel updates,
+	 * and have no files to be deleted, we can omit recording the transaction
+	 * commit at all.  (This test includes the effects of subtransactions,
+	 * so the presence of committed subxacts need not alone force a write.)
 	 */
-	if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate)
+	if (MyXactMadeXLogEntry || MyXactMadeTempRelUpdate || nrels > 0)
 	{
 		TransactionId xid = GetCurrentTransactionId();
+		bool		madeTCentries;
 		XLogRecPtr	recptr;
 
 		/* Tell bufmgr and smgr to prepare for commit */
@@ -482,40 +600,46 @@ RecordTransactionCommit(void)
 		START_CRIT_SECTION();
 
 		/*
-		 * We only need to log the commit in xlog if the transaction made
-		 * any transaction-controlled XLOG entries.  (Otherwise, its XID
-		 * appears nowhere in permanent storage, so no one else will ever
-		 * care if it committed.)
+		 * We only need to log the commit in XLOG if the transaction made
+		 * any transaction-controlled XLOG entries or will delete files.
+		 * (If it made no transaction-controlled XLOG entries, its XID
+		 * appears nowhere in permanent storage, so no one else will ever care
+		 * if it committed.)
 		 */
-		if (MyLastRecPtr.xrecoff != 0)
+		madeTCentries = (MyLastRecPtr.xrecoff != 0);
+		if (madeTCentries || nrels > 0)
 		{
-			/* Need to emit a commit record */
-			XLogRecData rdata[2];
+			XLogRecData rdata[3];
+			int			lastrdata = 0;
 			xl_xact_commit xlrec;
-			int			nrels;
-			RelFileNode *rptr;
-
-			nrels = smgrGetPendingDeletes(true, &rptr);
 
 			xlrec.xtime = time(NULL);
+			xlrec.nrels = nrels;
+			xlrec.nsubxacts = nchildren;
 			rdata[0].buffer = InvalidBuffer;
 			rdata[0].data = (char *) (&xlrec);
 			rdata[0].len = MinSizeOfXactCommit;
+			/* dump rels to delete */
 			if (nrels > 0)
 			{
 				rdata[0].next = &(rdata[1]);
 				rdata[1].buffer = InvalidBuffer;
 				rdata[1].data = (char *) rptr;
 				rdata[1].len = nrels * sizeof(RelFileNode);
-				rdata[1].next = NULL;
+				lastrdata = 1;
 			}
-			else
-				rdata[0].next = NULL;
+			/* dump committed child Xids */
+			if (nchildren > 0)
+			{
+				rdata[lastrdata].next = &(rdata[2]);
+				rdata[2].buffer = InvalidBuffer;
+				rdata[2].data = (char *) children;
+				rdata[2].len = nchildren * sizeof(TransactionId);
+				lastrdata = 2;
+			}
+			rdata[lastrdata].next = NULL;
 
 			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_COMMIT, rdata);
-
-			if (rptr)
-				pfree(rptr);
 		}
 		else
 		{
@@ -529,6 +653,9 @@ RecordTransactionCommit(void)
 		 * example, if we reported a nextval() result to the client, this
 		 * ensures that any XLOG record generated by nextval will hit the
 		 * disk before we report the transaction committed.
+		 *
+		 * Note: if we generated a commit record above, MyXactMadeXLogEntry
+		 * will certainly be set now.
 		 */
 		if (MyXactMadeXLogEntry)
 		{
@@ -560,8 +687,12 @@ RecordTransactionCommit(void)
 		 * is okay because no one else will ever care whether we
 		 * committed.
 		 */
-		if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
+		if (madeTCentries || MyXactMadeTempRelUpdate)
+		{
 			TransactionIdCommit(xid);
+			/* to avoid race conditions, the parent must commit first */
+			TransactionIdCommitTree(nchildren, children);
+		}
 
 		END_CRIT_SECTION();
 	}
@@ -573,6 +704,12 @@ RecordTransactionCommit(void)
 
 	/* Show myself as out of the transaction in PGPROC array */
 	MyProc->logRec.xrecoff = 0;
+
+	/* And clean up local data */
+	if (rptr)
+		pfree(rptr);
+	if (children)
+		pfree(children);
 }
 
 
@@ -590,7 +727,7 @@ AtCommit_Cache(void)
 	/*
 	 * Make catalog changes visible to all backends.
 	 */
-	AtEOXactInvalidationMessages(true);
+	AtEOXact_Inval(true);
 }
 
 /*
@@ -602,7 +739,7 @@ AtCommit_LocalCache(void)
 	/*
 	 * Make catalog changes visible to me for the next command.
 	 */
-	CommandEndInvalidationMessages(true);
+	CommandEndInvalidationMessages();
 }
 
 /*
@@ -616,7 +753,7 @@ AtCommit_Locks(void)
 	 *
 	 * Then you're up a creek! -mer 5/24/92
 	 */
-	ProcReleaseLocks(true);
+	ProcReleaseLocks(ReleaseAllExceptSession, 0, NULL);
 }
 
 /*
@@ -638,6 +775,88 @@ AtCommit_Memory(void)
 	Assert(TopTransactionContext != NULL);
 	MemoryContextDelete(TopTransactionContext);
 	TopTransactionContext = NULL;
+	CurTransactionContext = NULL;
+	CurrentTransactionState->curTransactionContext = NULL;
+}
+
+/* ----------------------------------------------------------------
+ *						CommitSubTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtSubCommit_Memory
+ *
+ * We do not throw away the child's CurTransactionContext, since the data
+ * it contains will be needed at upper commit.
+ */
+static void
+AtSubCommit_Memory(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	Assert(s->parent != NULL);
+
+	/* Return to parent transaction level's memory context. */
+	CurTransactionContext = s->parent->curTransactionContext;
+	MemoryContextSwitchTo(CurTransactionContext);
+}
+
+/*
+ * AtSubCommit_childXids
+ *
+ * Pass my own XID and my child XIDs up to my parent as committed children.
+ */
+static void
+AtSubCommit_childXids(void)
+{
+	TransactionState s = CurrentTransactionState;
+	MemoryContext old_cxt;
+
+	Assert(s->parent != NULL);
+
+	old_cxt = MemoryContextSwitchTo(s->parent->curTransactionContext);
+
+	s->parent->childXids = list_concat(s->parent->childXids, s->childXids);
+	s->childXids = NIL;			/* ensure list not doubly referenced */
+
+	s->parent->childXids = lappend_int(s->parent->childXids,
+									   s->transactionIdData);
+
+	MemoryContextSwitchTo(old_cxt);
+}
+
+/*
+ * RecordSubTransactionCommit --- mark the current subxact subcommitted in clog
+ */
+static void
+RecordSubTransactionCommit(void)
+{
+	/*
+	 * We do not log the subcommit in XLOG; it doesn't matter until
+	 * the top-level transaction commits.
+	 *
+	 * We must mark the subtransaction subcommitted in clog if its XID
+	 * appears either in permanent rels or in local temporary rels. We
+	 * test this by seeing if we made transaction-controlled entries
+	 * *OR* local-rel tuple updates.  (The test here actually covers the
+	 * entire transaction tree so far, so it may mark subtransactions that
+	 * don't really need it, but it's probably not worth being more precise.
+	 * Note that if a prior subtransaction dirtied these variables, then
+	 * RecordTransactionCommit will have to do the full pushup anyway...)
+	 */
+	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
+	{
+		TransactionId	xid = GetCurrentTransactionId();
+
+		/* XXX does this really need to be a critical section? */
+		START_CRIT_SECTION();
+
+		/* Record subtransaction subcommit */
+		TransactionIdSubCommit(xid);
+
+		END_CRIT_SECTION();
+	}
 }
 
 /* ----------------------------------------------------------------
@@ -651,14 +870,24 @@ AtCommit_Memory(void)
 static void
 RecordTransactionAbort(void)
 {
+	int			nrels;
+	RelFileNode *rptr;
+	int 			nchildren;
+	TransactionId  *children;
+
+	/* Get data needed for abort record */
+	nrels = smgrGetPendingDeletes(false, &rptr);
+	nchildren = xactGetCommittedChildren(&children, false);
+
 	/*
 	 * If we made neither any transaction-controlled XLOG entries nor any
-	 * temp-rel updates, we can omit recording the transaction abort at
-	 * all. No one will ever care that it aborted.
+	 * temp-rel updates, and are not going to delete any files, we can omit
+	 * recording the transaction abort at all.  No one will ever care that
+	 * it aborted.  (These tests cover our whole transaction tree.)
 	 */
-	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate)
+	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
 	{
-		TransactionId xid = GetCurrentTransactionId();
+		TransactionId	xid = GetCurrentTransactionId();
 
 		/*
 		 * Catch the scenario where we aborted partway through
@@ -671,50 +900,64 @@ RecordTransactionAbort(void)
 
 		/*
 		 * We only need to log the abort in XLOG if the transaction made
-		 * any transaction-controlled XLOG entries.  (Otherwise, its XID
-		 * appears nowhere in permanent storage, so no one else will ever
-		 * care if it committed.)  We do not flush XLOG to disk unless
-		 * deleting files, since the default assumption after a crash
-		 * would be that we aborted, anyway.
+		 * any transaction-controlled XLOG entries or will delete files.
+		 * (If it made no transaction-controlled XLOG entries, its XID
+		 * appears nowhere in permanent storage, so no one else will ever care
+		 * if it committed.)
+		 *
+		 * We do not flush XLOG to disk unless deleting files, since the
+		 * default assumption after a crash would be that we aborted, anyway.
 		 */
-		if (MyLastRecPtr.xrecoff != 0)
+		if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
 		{
-			XLogRecData rdata[2];
+			XLogRecData rdata[3];
+			int			lastrdata = 0;
 			xl_xact_abort xlrec;
-			int			nrels;
-			RelFileNode *rptr;
 			XLogRecPtr	recptr;
 
-			nrels = smgrGetPendingDeletes(false, &rptr);
-
 			xlrec.xtime = time(NULL);
+			xlrec.nrels = nrels;
+			xlrec.nsubxacts = nchildren;
 			rdata[0].buffer = InvalidBuffer;
 			rdata[0].data = (char *) (&xlrec);
 			rdata[0].len = MinSizeOfXactAbort;
+			/* dump rels to delete */
 			if (nrels > 0)
 			{
 				rdata[0].next = &(rdata[1]);
 				rdata[1].buffer = InvalidBuffer;
 				rdata[1].data = (char *) rptr;
 				rdata[1].len = nrels * sizeof(RelFileNode);
-				rdata[1].next = NULL;
+				lastrdata = 1;
 			}
-			else
-				rdata[0].next = NULL;
+			/* dump committed child Xids */
+			if (nchildren > 0)
+			{
+				rdata[lastrdata].next = &(rdata[2]);
+				rdata[2].buffer = InvalidBuffer;
+				rdata[2].data = (char *) children;
+				rdata[2].len = nchildren * sizeof(TransactionId);
+				lastrdata = 2;
+			}
+			rdata[lastrdata].next = NULL;
 
 			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
 
+			/* Must flush if we are deleting files... */
 			if (nrels > 0)
 				XLogFlush(recptr);
-
-			if (rptr)
-				pfree(rptr);
 		}
 
 		/*
 		 * Mark the transaction aborted in clog.  This is not absolutely
 		 * necessary but we may as well do it while we are here.
+		 *
+		 * The ordering here isn't critical but it seems best to mark the
+		 * parent last.  That reduces the chance that concurrent
+		 * TransactionIdDidAbort calls will decide they need to do redundant
+		 * work.
 		 */
+		TransactionIdAbortTree(nchildren, children);
 		TransactionIdAbort(xid);
 
 		END_CRIT_SECTION();
@@ -727,6 +970,12 @@ RecordTransactionAbort(void)
 
 	/* Show myself as out of the transaction in PGPROC array */
 	MyProc->logRec.xrecoff = 0;
+
+	/* And clean up local data */
+	if (rptr)
+		pfree(rptr);
+	if (children)
+		pfree(children);
 }
 
 /*
@@ -736,7 +985,7 @@ static void
 AtAbort_Cache(void)
 {
 	AtEOXact_RelationCache(false);
-	AtEOXactInvalidationMessages(false);
+	AtEOXact_Inval(false);
 }
 
 /*
@@ -750,7 +999,7 @@ AtAbort_Locks(void)
 	 *
 	 * Then you're up a creek without a paddle! -mer
 	 */
-	ProcReleaseLocks(false);
+	ProcReleaseLocks(ReleaseAll, 0, NULL);
 }
 
 
@@ -779,6 +1028,127 @@ AtAbort_Memory(void)
 		MemoryContextSwitchTo(TopMemoryContext);
 }
 
+/*
+ * AtSubAbort_Locks --- release locks for the XIDs of the aborting subxact
+ */
+static void
+AtSubAbort_Locks(void)
+{
+	int nxids;
+	TransactionId *xids;
+
+	nxids = xactGetCommittedChildren(&xids, true);	/* true: presumably adds own XID; confirm */
+
+	ProcReleaseLocks(ReleaseGivenXids, nxids, xids);
+
+	pfree(xids);	/* NOTE(review): unguarded; assumes xids is never NULL here */
+}
+
+
+/*
+ * AtSubAbort_Memory --- switch to TopTransactionContext for subxact abort
+ */
+static void
+AtSubAbort_Memory(void)
+{
+	Assert(TopTransactionContext != NULL);
+
+	MemoryContextSwitchTo(TopTransactionContext);
+}
+
+/*
+ * RecordSubTransactionAbort --- log a subxact abort if needed, mark it in clog
+ */
+static void
+RecordSubTransactionAbort(void)
+{
+	int			nrels;
+	RelFileNode *rptr;
+	int 			nchildren;
+	TransactionId  *children;
+
+	/* Get data needed for abort record */
+	nrels = smgrGetPendingDeletes(false, &rptr);
+	nchildren = xactGetCommittedChildren(&children, false);
+
+	/*
+	 * If we made neither any transaction-controlled XLOG entries nor any
+	 * temp-rel updates, and are not going to delete any files, we can omit
+	 * recording the transaction abort at all.  No one will ever care that
+	 * it aborted.  (These tests cover our whole transaction tree, and
+	 * therefore may mark subxacts that don't really need it, but it's
+	 * probably not worth being more precise.)
+	 *
+	 * In this case we needn't worry about marking subcommitted children as
+	 * aborted, because they didn't mark themselves as subcommitted in the
+	 * first place; see the optimization in RecordSubTransactionCommit.
+	 */
+	if (MyLastRecPtr.xrecoff != 0 || MyXactMadeTempRelUpdate || nrels > 0)
+	{
+		TransactionId	xid = GetCurrentTransactionId();
+
+		START_CRIT_SECTION();
+
+		/*
+		 * We only need to log the abort in XLOG if the transaction made
+		 * any transaction-controlled XLOG entries or will delete files.
+		 */
+		if (MyLastRecPtr.xrecoff != 0 || nrels > 0)
+		{
+			XLogRecData rdata[3];
+			int lastrdata = 0;
+			xl_xact_abort xlrec;
+			XLogRecPtr      recptr;
+
+			xlrec.xtime = time(NULL);
+			xlrec.nrels = nrels;
+			xlrec.nsubxacts = nchildren;
+			rdata[0].buffer = InvalidBuffer;
+			rdata[0].data = (char *) (&xlrec);
+			rdata[0].len = MinSizeOfXactAbort;
+			/* dump rels to delete */
+			if (nrels > 0)
+			{
+				rdata[0].next = &(rdata[1]);
+				rdata[1].buffer = InvalidBuffer;
+				rdata[1].data = (char *) rptr;
+				rdata[1].len = nrels * sizeof(RelFileNode);
+				lastrdata = 1;
+			}
+			/* dump committed child Xids */
+			if (nchildren > 0)
+			{
+				rdata[lastrdata].next = &(rdata[2]);
+				rdata[2].buffer = InvalidBuffer;
+				rdata[2].data = (char *) children;
+				rdata[2].len = nchildren * sizeof(TransactionId);
+				lastrdata = 2;
+			}
+			rdata[lastrdata].next = NULL;
+
+			recptr = XLogInsert(RM_XACT_ID, XLOG_XACT_ABORT, rdata);
+
+			/* Must flush if we are deleting files... */
+			if (nrels > 0)
+				XLogFlush(recptr);
+		}
+
+		/*
+		 * Mark the transaction aborted in clog.  This is not absolutely
+		 * necessary but we may as well do it while we are here.
+		 */
+		TransactionIdAbortTree(nchildren, children);
+		TransactionIdAbort(xid);
+
+		END_CRIT_SECTION();
+	}
+
+	/* And clean up local data */
+	if (rptr)
+		pfree(rptr);
+	if (children)
+		pfree(children);
+}
 
 /* ----------------------------------------------------------------
  *						CleanupTransaction stuff
@@ -798,15 +1168,46 @@ AtCleanup_Memory(void)
 	 */
 	MemoryContextSwitchTo(TopMemoryContext);
 
+	Assert(CurrentTransactionState->parent == NULL);
+
 	/*
 	 * Release all transaction-local memory.
 	 */
 	if (TopTransactionContext != NULL)
 		MemoryContextDelete(TopTransactionContext);
 	TopTransactionContext = NULL;
+	CurTransactionContext = NULL;
+	CurrentTransactionState->curTransactionContext = NULL;
 }
 
 
+/* ----------------------------------------------------------------
+ *						CleanupSubTransaction stuff
+ * ----------------------------------------------------------------
+ */
+
+/*
+ * AtSubCleanup_Memory
+ */
+static void
+AtSubCleanup_Memory(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	Assert(s->parent != NULL);
+
+	/* Make sure we're not in an about-to-be-deleted context */
+	MemoryContextSwitchTo(s->parent->curTransactionContext);
+	CurTransactionContext = s->parent->curTransactionContext;
+
+	/*
+	 * Delete the subxact local memory contexts. Its CurTransactionContext
+	 * can go too (note this also kills CurTransactionContexts from any
+	 * children of the subxact).
+	 */
+	MemoryContextDelete(s->curTransactionContext);
+}
+
 /* ----------------------------------------------------------------
  *						interface routines
  * ----------------------------------------------------------------
@@ -842,20 +1243,34 @@ StartTransaction(void)
 	/*
 	 * generate a new transaction id
 	 */
-	s->transactionIdData = GetNewTransactionId();
+	s->transactionIdData = GetNewTransactionId(false);
 
 	XactLockTableInsert(s->transactionIdData);
 
+	/*
+	 * set now()
+	 */
+	xactStartTime = GetCurrentAbsoluteTimeUsec(&(xactStartTimeUsec));
+
 	/*
 	 * initialize current transaction state fields
 	 */
 	s->commandId = FirstCommandId;
-	s->startTime = GetCurrentAbsoluteTimeUsec(&(s->startTimeUsec));
+	s->nestingLevel = 1;
+	s->childXids = NIL;
+
+	/*
+	 * You might expect to see "s->currentUser = GetUserId();" here, but
+	 * you won't because it doesn't work during startup; the userid isn't
+	 * set yet during a backend's first transaction start.  We only use
+	 * the currentUser field in sub-transaction state structs.
+	 */
 
 	/*
 	 * initialize the various transaction subsystems
 	 */
 	AtStart_Memory();
+	AtStart_Inval();
 	AtStart_Cache();
 	AtStart_Locks();
 
@@ -870,6 +1285,7 @@ StartTransaction(void)
 	 */
 	s->state = TRANS_INPROGRESS;
 
+	ShowTransactionState("StartTransaction");
 }
 
 /*
@@ -880,11 +1296,14 @@ CommitTransaction(void)
 {
 	TransactionState s = CurrentTransactionState;
 
+	ShowTransactionState("CommitTransaction");
+
 	/*
 	 * check the current transaction state
 	 */
 	if (s->state != TRANS_INPROGRESS)
 		elog(WARNING, "CommitTransaction and not in in-progress state");
+	Assert(s->parent == NULL);
 
 	/*
 	 * Tell the trigger manager that this transaction is about to be
@@ -970,19 +1389,22 @@ CommitTransaction(void)
 	AtCommit_Locks();
 
 	CallEOXactCallbacks(true);
-	AtEOXact_GUC(true);
+	AtEOXact_GUC(true, false);
 	AtEOXact_SPI(true);
 	AtEOXact_gist();
 	AtEOXact_hash();
 	AtEOXact_nbtree();
 	AtEOXact_rtree();
-	AtEOXact_on_commit_actions(true);
+	AtEOXact_on_commit_actions(true, s->transactionIdData);
 	AtEOXact_Namespace(true);
 	AtEOXact_CatCache(true);
 	AtEOXact_Files();
 	pgstat_count_xact_commit();
 	AtCommit_Memory();
 
+	s->nestingLevel = 0;
+	s->childXids = NIL;
+
 	/*
 	 * done with commit processing, set current transaction state back to
 	 * default
@@ -1026,6 +1448,7 @@ AbortTransaction(void)
 	 */
 	if (s->state != TRANS_INPROGRESS)
 		elog(WARNING, "AbortTransaction and not in in-progress state");
+	Assert(s->parent == NULL);
 
 	/*
 	 * set the current transaction state information appropriately during
@@ -1037,7 +1460,14 @@ AbortTransaction(void)
 	AtAbort_Memory();
 
 	/*
-	 * Reset user id which might have been changed transiently
+	 * Reset user id which might have been changed transiently.  We cannot
+	 * use s->currentUser, but must get the session userid from miscinit.c.
+	 *
+	 * (Note: it is not necessary to restore session authorization here
+	 * because that can only be changed via GUC, and GUC will take care of
+	 * rolling it back if need be.  However, an error within a SECURITY
+	 * DEFINER function could send control here with the wrong current
+	 * userid.)
 	 */
 	SetUserId(GetSessionUserId());
 
@@ -1080,13 +1510,13 @@ AbortTransaction(void)
 	AtAbort_Locks();
 
 	CallEOXactCallbacks(false);
-	AtEOXact_GUC(false);
+	AtEOXact_GUC(false, false);
 	AtEOXact_SPI(false);
 	AtEOXact_gist();
 	AtEOXact_hash();
 	AtEOXact_nbtree();
 	AtEOXact_rtree();
-	AtEOXact_on_commit_actions(false);
+	AtEOXact_on_commit_actions(false, s->transactionIdData);
 	AtEOXact_Namespace(false);
 	AtEOXact_CatCache(false);
 	AtEOXact_Files();
@@ -1119,6 +1549,9 @@ CleanupTransaction(void)
 	AtCleanup_Portals();		/* now safe to release portal memory */
 	AtCleanup_Memory();			/* and transaction memory */
 
+	s->nestingLevel = 0;
+	s->childXids = NIL;
+
 	/*
 	 * done with abort processing, set current transaction state back to
 	 * default
@@ -1145,25 +1578,6 @@ StartTransactionCommand(void)
 			s->blockState = TBLOCK_STARTED;
 			break;
 
-			/*
-			 * We should never experience this -- it means the STARTED state
-			 * was not changed in the previous CommitTransactionCommand.
-			 */
-		case TBLOCK_STARTED:
-			elog(WARNING, "StartTransactionCommand: unexpected TBLOCK_STARTED");
-			break;
-
-			/*
-			 * We should never experience this -- if we do it means the
-			 * BEGIN state was not changed in the previous
-			 * CommitTransactionCommand().	If we get it, we print a
-			 * warning and change to the in-progress state.
-			 */
-		case TBLOCK_BEGIN:
-			elog(WARNING, "StartTransactionCommand: unexpected TBLOCK_BEGIN");
-			s->blockState = TBLOCK_INPROGRESS;
-			break;
-
 			/*
 			 * This is the case when are somewhere in a transaction block
 			 * and about to start a new command.  For now we do nothing
@@ -1171,20 +1585,7 @@ StartTransactionCommand(void)
 			 * initialization.
 			 */
 		case TBLOCK_INPROGRESS:
-			break;
-
-			/*
-			 * As with BEGIN, we should never experience this if we do it
-			 * means the END state was not changed in the previous
-			 * CommitTransactionCommand().	If we get it, we print a
-			 * warning, commit the transaction, start a new transaction
-			 * and change to the default state.
-			 */
-		case TBLOCK_END:
-			elog(WARNING, "StartTransactionCommand: unexpected TBLOCK_END");
-			CommitTransaction();
-			StartTransaction();
-			s->blockState = TBLOCK_DEFAULT;
+		case TBLOCK_SUBINPROGRESS:
 			break;
 
 			/*
@@ -1194,26 +1595,30 @@ StartTransactionCommand(void)
 			 * TRANSACTION" which will set things straight.
 			 */
 		case TBLOCK_ABORT:
+		case TBLOCK_SUBABORT:
 			break;
 
-			/*
-			 * This means we somehow aborted and the last call to
-			 * CommitTransactionCommand() didn't clear the state so we
-			 * remain in the ENDABORT state and maybe next time we get to
-			 * CommitTransactionCommand() the state will get reset to
-			 * default.
-			 */
+			/* These cases are invalid. */
+		case TBLOCK_STARTED:
+		case TBLOCK_BEGIN:
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBBEGINABORT:
+		case TBLOCK_END:
+		case TBLOCK_SUBEND:
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
 		case TBLOCK_ENDABORT:
-			elog(WARNING, "StartTransactionCommand: unexpected TBLOCK_ENDABORT");
+			elog(FATAL, "StartTransactionCommand: unexpected state %s",
+				 BlockStateAsString(s->blockState));
 			break;
 	}
 
 	/*
-	 * We must switch to TopTransactionContext before returning. This is
+	 * We must switch to CurTransactionContext before returning. This is
 	 * already done if we called StartTransaction, otherwise not.
 	 */
-	Assert(TopTransactionContext != NULL);
-	MemoryContextSwitchTo(TopTransactionContext);
+	Assert(CurTransactionContext != NULL);
+	MemoryContextSwitchTo(CurTransactionContext);
 }
 
 /*
@@ -1232,7 +1637,7 @@ CommitTransactionCommand(void)
 			 * appropiately.
 			 */
 		case TBLOCK_DEFAULT:
-			elog(WARNING, "CommitTransactionCommand: unexpected TBLOCK_DEFAULT");
+			elog(FATAL, "CommitTransactionCommand: unexpected TBLOCK_DEFAULT");
 			break;
 
 			/*
@@ -1291,18 +1696,83 @@ CommitTransactionCommand(void)
 			CleanupTransaction();
 			s->blockState = TBLOCK_DEFAULT;
 			break;
-	}
-}
 
-/*
- *	AbortCurrentTransaction
- */
-void
-AbortCurrentTransaction(void)
-{
-	TransactionState s = CurrentTransactionState;
+			/*
+			 * We were just issued a BEGIN inside a transaction block.
+			 * Start a subtransaction.
+			 */
+		case TBLOCK_SUBBEGIN:
+			StartSubTransaction();
+			s->blockState = TBLOCK_SUBINPROGRESS;
+			break;
 
-	switch (s->blockState)
+			/*
+			 * We were issued a BEGIN inside an aborted transaction block.
+			 * Start a subtransaction, and put it in aborted state.
+			 */
+		case TBLOCK_SUBBEGINABORT:
+			StartSubTransaction();
+			AbortSubTransaction();
+			s->blockState = TBLOCK_SUBABORT;
+			break;
+
+			/*
+			 * Inside a subtransaction, increment the command counter.
+			 */
+		case TBLOCK_SUBINPROGRESS:
+			CommandCounterIncrement();
+			break;
+
+			/*
+			 * We were issued a COMMIT command, so we end the current
+			 * subtransaction and return to the parent transaction.
+			 */
+		case TBLOCK_SUBEND:
+			CommitSubTransaction();
+			PopTransaction();
+			s = CurrentTransactionState;		/* changed by pop */
+			break;
+
+			/*
+			 * If we are in an aborted subtransaction, do nothing.
+			 */
+		case TBLOCK_SUBABORT:
+			break;
+
+			/*
+			 * We are ending a subtransaction that aborted nicely,
+			 * so the parent can be allowed to live.
+			 */
+		case TBLOCK_SUBENDABORT_OK:
+			CleanupSubTransaction();
+			PopTransaction();
+			s = CurrentTransactionState;		/* changed by pop */
+			break;
+
+			/*
+			 * We are ending a subtransaction that aborted in an unclean
+			 * way (e.g. the user issued COMMIT in an aborted subtransaction).
+			 * Clean up the subtransaction, and abort the parent too.
+			 */
+		case TBLOCK_SUBENDABORT_ERROR:
+			CleanupSubTransaction();
+			PopTransaction();
+			s = CurrentTransactionState;		/* changed by pop */
+			Assert(s->blockState != TBLOCK_SUBENDABORT_ERROR);
+			AbortCurrentTransaction();
+			break;
+	}
+}
+
+/*
+ *	AbortCurrentTransaction
+ */
+void
+AbortCurrentTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	switch (s->blockState)
 	{
 		/*
 		 * we aren't in a transaction, so we do nothing.
@@ -1362,6 +1832,7 @@ AbortCurrentTransaction(void)
 			 * state.
 			 */
 		case TBLOCK_ABORT:
+		case TBLOCK_SUBABORT:
 			break;
 
 			/*
@@ -1374,6 +1845,53 @@ AbortCurrentTransaction(void)
 			CleanupTransaction();
 			s->blockState = TBLOCK_DEFAULT;
 			break;
+
+			/*
+			 * If we are just starting a subtransaction, put it
+			 * in aborted state.
+			 */
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBBEGINABORT:
+			PushTransaction();
+			s = CurrentTransactionState;		/* changed by push */
+			StartSubTransaction();
+			AbortSubTransaction();
+			s->blockState = TBLOCK_SUBABORT;
+			break;
+
+		case TBLOCK_SUBINPROGRESS:
+			AbortSubTransaction();
+			s->blockState = TBLOCK_SUBABORT;
+			break;
+
+			/*
+			 * If we are aborting an ending subtransaction,
+			 * we have to abort the parent transaction too.
+			 */
+		case TBLOCK_SUBEND:
+			AbortSubTransaction();
+			CleanupSubTransaction();
+			PopTransaction();
+			s = CurrentTransactionState;		/* changed by pop */
+			Assert(s->blockState != TBLOCK_SUBEND &&
+					s->blockState != TBLOCK_SUBENDABORT_OK &&
+					s->blockState != TBLOCK_SUBENDABORT_ERROR);
+			AbortCurrentTransaction();
+			break;
+
+			/*
+			 * Same as above, except the Abort() was already done.
+			 */
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
+			CleanupSubTransaction();
+			PopTransaction();
+			s = CurrentTransactionState;		/* changed by pop */
+			Assert(s->blockState != TBLOCK_SUBEND &&
+					s->blockState != TBLOCK_SUBENDABORT_OK &&
+					s->blockState != TBLOCK_SUBENDABORT_ERROR);
+			AbortCurrentTransaction();
+			break;
 	}
 }
 
@@ -1387,7 +1905,7 @@ AbortCurrentTransaction(void)
  *	If we have already started a transaction block, issue an error; also issue
  *	an error if we appear to be running inside a user-defined function (which
  *	could issue more commands and possibly cause a failure after the statement
- *	completes).
+ *	completes).  Subtransactions are verboten too.
  *
  *	stmtNode: pointer to parameter block for statement; this is used in
  *	a very klugy way to determine whether we are inside a function.
@@ -1406,6 +1924,16 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
 				 errmsg("%s cannot run inside a transaction block",
 						stmtType)));
 
+	/*
+	 * subtransaction?
+	 */
+	if (IsSubTransaction())
+		ereport(ERROR,
+				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+		/* translator: %s represents an SQL statement name */
+				 errmsg("%s cannot run inside a subtransaction",
+						stmtType)));
+
 	/*
 	 * Are we inside a function call?  If the statement's parameter block
 	 * was allocated in QueryContext, assume it is an interactive command.
@@ -1416,10 +1944,11 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
 				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
 		/* translator: %s represents an SQL statement name */
 			 errmsg("%s cannot be executed from a function", stmtType)));
+
 	/* If we got past IsTransactionBlock test, should be in default state */
 	if (CurrentTransactionState->blockState != TBLOCK_DEFAULT &&
 		CurrentTransactionState->blockState != TBLOCK_STARTED)
-		elog(ERROR, "cannot prevent transaction chain");
+		elog(FATAL, "cannot prevent transaction chain");
 	/* all okay */
 }
 
@@ -1433,8 +1962,8 @@ PreventTransactionChain(void *stmtNode, const char *stmtType)
  *
  *	If we appear to be running inside a user-defined function, we do not
  *	issue an error, since the function could issue more commands that make
- *	use of the current statement's results.  Thus this is an inverse for
- *	PreventTransactionChain.
+ *	use of the current statement's results.  Likewise subtransactions.
+ *	Thus this is an inverse for PreventTransactionChain.
  *
  *	stmtNode: pointer to parameter block for statement; this is used in
  *	a very klugy way to determine whether we are inside a function.
@@ -1449,6 +1978,12 @@ RequireTransactionChain(void *stmtNode, const char *stmtType)
 	if (IsTransactionBlock())
 		return;
 
+	/*
+	 * subtransaction?
+	 */
+	if (IsSubTransaction())
+		return;
+
 	/*
 	 * Are we inside a function call?  If the statement's parameter block
 	 * was allocated in QueryContext, assume it is an interactive command.
@@ -1483,6 +2018,9 @@ IsInTransactionChain(void *stmtNode)
 	if (IsTransactionBlock())
 		return true;
 
+	if (IsSubTransaction())
+		return true;
+
 	if (!MemoryContextContains(QueryContext, stmtNode))
 		return true;
 
@@ -1571,26 +2109,40 @@ BeginTransactionBlock(void)
 			s->blockState = TBLOCK_BEGIN;
 			break;
 
-			/* Already a transaction block in progress. */
+			/*
+			 * Already a transaction block in progress.
+			 * Start a subtransaction.
+			 */
 		case TBLOCK_INPROGRESS:
-			ereport(WARNING,
-					(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
-					 errmsg("there is already a transaction in progress")));
+		case TBLOCK_SUBINPROGRESS:
+			PushTransaction();
+			s = CurrentTransactionState;		/* changed by push */
+			s->blockState = TBLOCK_SUBBEGIN;
+			break;
 
 			/*
-			 * This shouldn't happen, because a transaction in aborted state
-			 * will not be allowed to call BeginTransactionBlock.
+			 * An aborted transaction block should be allowed to start
+			 * a subtransaction, but it must put it in aborted state.
 			 */
 		case TBLOCK_ABORT:
-			elog(WARNING, "BeginTransactionBlock: unexpected TBLOCK_ABORT");
+		case TBLOCK_SUBABORT:
+			PushTransaction();
+			s = CurrentTransactionState;		/* changed by push */
+			s->blockState = TBLOCK_SUBBEGINABORT;
 			break;
 
 			/* These cases are invalid.  Reject them altogether. */
 		case TBLOCK_DEFAULT:
 		case TBLOCK_BEGIN:
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBBEGINABORT:
 		case TBLOCK_ENDABORT:
 		case TBLOCK_END:
-			elog(FATAL, "BeginTransactionBlock: not in a user-allowed state!");
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
+		case TBLOCK_SUBEND:
+			elog(FATAL, "BeginTransactionBlock: unexpected state %s",
+				 BlockStateAsString(s->blockState));
 			break;
 	}
 }
@@ -1614,6 +2166,15 @@ EndTransactionBlock(void)
 			s->blockState = TBLOCK_END;
 			break;
 
+			/*
+			 * here we are in a subtransaction block.  Signal
+			 * CommitTransactionCommand() to end it and return to the
+			 * parent transaction.
+			 */
+		case TBLOCK_SUBINPROGRESS:
+			s->blockState = TBLOCK_SUBEND;
+			break;
+
 			/*
 			 * here, we are in a transaction block which aborted and since the
 			 * AbortTransaction() was already done, we do whatever is needed
@@ -1625,12 +2186,21 @@ EndTransactionBlock(void)
 			s->blockState = TBLOCK_ENDABORT;
 			break;
 
+			/*
+			 * here we are in an aborted subtransaction.  Signal
+			 * CommitTransactionCommand() to clean up and return to the
+			 * parent transaction.
+			 */
+		case TBLOCK_SUBABORT:
+			s->blockState = TBLOCK_SUBENDABORT_ERROR;
+			break;
+
 		case TBLOCK_STARTED:
 			/*
-			 * here, the user issued COMMIT when not inside a transaction. Issue a
-			 * WARNING and go to abort state.  The upcoming call to
-			 * CommitTransactionCommand() will then put us back into the default
-			 * state.
+			 * here, the user issued COMMIT when not inside a
+			 * transaction. Issue a WARNING and go to abort state.  The
+			 * upcoming call to CommitTransactionCommand() will then put us
+			 * back into the default state.
 			 */
 			ereport(WARNING,
 					(errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
@@ -1644,7 +2214,13 @@ EndTransactionBlock(void)
 		case TBLOCK_BEGIN:
 		case TBLOCK_ENDABORT:
 		case TBLOCK_END:
-			elog(FATAL, "EndTransactionBlock and not in a user-allowed state");
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBBEGINABORT:
+		case TBLOCK_SUBEND:
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
+			elog(FATAL, "EndTransactionBlock: unexpected state %s",
+				 BlockStateAsString(s->blockState));
 			break;
 	}
 }
@@ -1657,42 +2233,68 @@ UserAbortTransactionBlock(void)
 {
 	TransactionState s = CurrentTransactionState;
 
-	/*
-	 * if the transaction has already been automatically aborted with an
-	 * error, and the user subsequently types 'abort', allow it.  (the
-	 * behavior is the same as if they had typed 'end'.)
-	 */
-	if (s->blockState == TBLOCK_ABORT)
-	{
-		s->blockState = TBLOCK_ENDABORT;
-		return;
-	}
-
-	if (s->blockState == TBLOCK_INPROGRESS)
-	{
+	switch (s->blockState) {
 		/*
-		 * here we were inside a transaction block and we got an abort
-		 * command from the user, so we move to the ENDABORT state and
-		 * do abort processing so we will end up in the default state
-		 * after the upcoming CommitTransactionCommand().
+		 * here we are inside a failed transaction block and we got an abort
+		 * command from the user.  Abort processing is already done, we just
+		 * need to move to the ENDABORT state so we will end up in the default
+		 * state after the upcoming CommitTransactionCommand().
 		 */
-		s->blockState = TBLOCK_ABORT;
-		AbortTransaction();
-		s->blockState = TBLOCK_ENDABORT;
-		return;
+		case TBLOCK_ABORT:
+			s->blockState = TBLOCK_ENDABORT;
+			break;
+
+			/* Ditto, for a subtransaction. */
+		case TBLOCK_SUBABORT:
+			s->blockState = TBLOCK_SUBENDABORT_OK;
+			break;
+
+			/*
+			 * here we are inside a transaction block and we got an abort
+			 * command from the user, so we move to the ENDABORT state and
+			 * do abort processing so we will end up in the default state
+			 * after the upcoming CommitTransactionCommand().
+			 */
+		case TBLOCK_INPROGRESS:
+			AbortTransaction();
+			s->blockState = TBLOCK_ENDABORT;
+			break;
+
+			/* Ditto, for a subtransaction. */
+		case TBLOCK_SUBINPROGRESS:
+			AbortSubTransaction();
+			s->blockState = TBLOCK_SUBENDABORT_OK;
+			break;
+
+			/*
+			 * here, the user issued ABORT when not inside a
+			 * transaction. Issue a WARNING and go to abort state.  The
+			 * upcoming call to CommitTransactionCommand() will then put us
+			 * back into the default state.
+			 */
+		case TBLOCK_STARTED:
+			ereport(WARNING,
+					(errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
+					 errmsg("there is no transaction in progress")));
+			AbortTransaction();
+			s->blockState = TBLOCK_ENDABORT;
+			break;
+
+			/* these cases are invalid. */
+		case TBLOCK_DEFAULT:
+		case TBLOCK_BEGIN:
+		case TBLOCK_END:
+		case TBLOCK_ENDABORT:
+		case TBLOCK_SUBEND:
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBBEGINABORT:
+			elog(FATAL, "UserAbortTransactionBlock: unexpected state %s",
+				 BlockStateAsString(s->blockState));
+			break;
 	}
 
-	/*
-	 * here, the user issued ABORT when not inside a transaction. Issue a
-	 * WARNING and go to abort state.  The upcoming call to
-	 * CommitTransactionCommand() will then put us back into the default
-	 * state.
-	 */
-	ereport(WARNING,
-			(errcode(ERRCODE_NO_ACTIVE_SQL_TRANSACTION),
-			 errmsg("there is no transaction in progress")));
-	AbortTransaction();
-	s->blockState = TBLOCK_ENDABORT;
 }
 
 /*
@@ -1708,32 +2310,58 @@ AbortOutOfAnyTransaction(void)
 	TransactionState s = CurrentTransactionState;
 
 	/*
-	 * Get out of any transaction
+	 * Get out of any transaction or nested transaction
 	 */
-	switch (s->blockState)
-	{
-		case TBLOCK_DEFAULT:
-			/* Not in a transaction, do nothing */
-			break;
-		case TBLOCK_STARTED:
-		case TBLOCK_BEGIN:
-		case TBLOCK_INPROGRESS:
-		case TBLOCK_END:
-			/* In a transaction, so clean up */
-			AbortTransaction();
-			CleanupTransaction();
-			break;
-		case TBLOCK_ABORT:
-		case TBLOCK_ENDABORT:
-			/* AbortTransaction already done, still need Cleanup */
-			CleanupTransaction();
-			break;
-	}
+	do {
+		switch (s->blockState)
+		{
+			case TBLOCK_DEFAULT:
+				/* Not in a transaction, do nothing */
+				break;
+			case TBLOCK_STARTED:
+			case TBLOCK_BEGIN:
+			case TBLOCK_INPROGRESS:
+			case TBLOCK_END:
+				/* In a transaction, so clean up */
+				AbortTransaction();
+				CleanupTransaction();
+				s->blockState = TBLOCK_DEFAULT;
+				break;
+			case TBLOCK_ABORT:
+			case TBLOCK_ENDABORT:
+				/* AbortTransaction already done, still need Cleanup */
+				CleanupTransaction();
+				s->blockState = TBLOCK_DEFAULT;
+				break;
+			case TBLOCK_SUBBEGIN:
+			case TBLOCK_SUBBEGINABORT:
+				/*
+				 * Just starting a new subtransaction -- return to parent.
+				 * FIXME -- Is this correct?
+				 */
+				PopTransaction();
+				s = CurrentTransactionState;		/* changed by pop */
+				break;
+			case TBLOCK_SUBINPROGRESS:
+			case TBLOCK_SUBEND:
+				/* In a subtransaction, so clean it up and abort parent too */
+				AbortSubTransaction();
+				CleanupSubTransaction();
+				PopTransaction();
+				s = CurrentTransactionState;		/* changed by pop */
+				break;
+			case TBLOCK_SUBABORT:
+			case TBLOCK_SUBENDABORT_OK:
+			case TBLOCK_SUBENDABORT_ERROR:
+				CleanupSubTransaction();
+				PopTransaction();
+				s = CurrentTransactionState;		/* changed by pop */
+				break;
+		}
+	} while (s->blockState != TBLOCK_DEFAULT);
 
-	/*
-	 * Now reset the transaction state
-	 */
-	s->blockState = TBLOCK_DEFAULT;
+	/* Should be out of all subxacts now */
+	Assert(s->parent == NULL);
 }
 
 /*
@@ -1784,18 +2412,436 @@ TransactionBlockStatusCode(void)
 		case TBLOCK_BEGIN:
 		case TBLOCK_INPROGRESS:
 		case TBLOCK_END:
+		case TBLOCK_SUBINPROGRESS:
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBEND:
 			return 'T';			/* in transaction */
 		case TBLOCK_ABORT:
 		case TBLOCK_ENDABORT:
+		case TBLOCK_SUBABORT:
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
+		case TBLOCK_SUBBEGINABORT:
 			return 'E';			/* in failed transaction */
 	}
 
 	/* should never get here */
-	elog(ERROR, "invalid transaction block state: %d",
-		 (int) s->blockState);
+	elog(FATAL, "invalid transaction block state: %s",
+		 BlockStateAsString(s->blockState));
 	return 0;					/* keep compiler quiet */
 }
 
+/*
+ * IsSubTransaction
+ */
+bool
+IsSubTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+	
+	switch (s->blockState) {
+		case TBLOCK_DEFAULT:
+		case TBLOCK_STARTED:
+		case TBLOCK_BEGIN:
+		case TBLOCK_INPROGRESS:
+		case TBLOCK_END:
+		case TBLOCK_ABORT:
+		case TBLOCK_ENDABORT:
+			return false;
+		case TBLOCK_SUBBEGIN:
+		case TBLOCK_SUBBEGINABORT:
+		case TBLOCK_SUBINPROGRESS:
+		case TBLOCK_SUBABORT:
+		case TBLOCK_SUBEND:
+		case TBLOCK_SUBENDABORT_OK:
+		case TBLOCK_SUBENDABORT_ERROR:
+			return true;
+	}
+
+	/* should never get here */
+	elog(FATAL, "invalid transaction block state: %s",
+		 BlockStateAsString(s->blockState));
+	return false;				/* keep compiler quiet */
+}
+
+/*
+ * StartSubTransaction
+ */
+static void
+StartSubTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	if (s->state != TRANS_DEFAULT)
+		elog(WARNING, "StartSubTransaction and not in default state");
+
+	s->state = TRANS_START;
+
+	/*
+	 * Generate a new Xid and record it in pg_subtrans.
+	 */
+	s->transactionIdData = GetNewTransactionId(true);
+
+	SubTransSetParent(s->transactionIdData, s->parent->transactionIdData);
+
+	/*
+	 * Finish setup of other transaction state fields.
+	 */
+	s->currentUser = GetUserId();
+	
+	/* Initialize the various transaction subsystems */
+	AtSubStart_Memory();
+	AtSubStart_Inval();
+	AtSubStart_RelationCache();
+	AtSubStart_CatCache();
+	AtSubStart_Buffers();
+	AtSubStart_smgr();
+	AtSubStart_Notify();
+	DeferredTriggerBeginSubXact();
+
+	s->state = TRANS_INPROGRESS;
+
+	ShowTransactionState("StartSubTransaction");
+}
+
+/*
+ * CommitSubTransaction
+ */
+static void
+CommitSubTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	ShowTransactionState("CommitSubTransaction");
+
+	if (s->state != TRANS_INPROGRESS)
+		elog(WARNING, "CommitSubTransaction and not in in-progress state");
+
+	/* Pre-commit processing */
+	AtSubCommit_Portals(s->parent->transactionIdData);
+	DeferredTriggerEndSubXact(true);
+
+	/* Mark subtransaction as subcommitted */
+	CommandCounterIncrement();
+	RecordSubTransactionCommit();
+	AtSubCommit_childXids();
+
+	/* Post-commit cleanup */
+	AtSubCommit_smgr();
+
+	AtSubEOXact_Inval(true);
+	AtEOSubXact_SPI(true, s->transactionIdData);
+	AtSubCommit_Notify();
+	AtEOXact_GUC(true, true);
+	AtEOSubXact_gist(s->transactionIdData);
+	AtEOSubXact_hash(s->transactionIdData);
+	AtEOSubXact_rtree(s->transactionIdData);
+	AtEOSubXact_on_commit_actions(true, s->transactionIdData,
+								  s->parent->transactionIdData);
+
+	AtEOSubXact_CatCache(true);
+	AtEOSubXact_RelationCache(true);
+	AtEOSubXact_Buffers(true);
+	AtSubCommit_Memory();
+
+	s->state = TRANS_DEFAULT;
+}
+
+/*
+ * AbortSubTransaction
+ */
+static void
+AbortSubTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	ShowTransactionState("AbortSubTransaction");
+
+	HOLD_INTERRUPTS();
+
+	s->state = TRANS_ABORT;
+
+	/*
+	 * Release any LW locks we might be holding as quickly as possible.
+	 * (Regular locks, however, must be held till we finish aborting.)
+	 * Releasing LW locks is critical since we might try to grab them
+	 * again while cleaning up!
+	 *
+	 * FIXME This may be incorrect --- Are there some locks we should keep?
+	 * Buffer locks, for example?  I don't think so but I'm not sure.
+	 */
+	LWLockReleaseAll();
+
+	AbortBufferIO();
+	UnlockBuffers();
+
+	LockWaitCancel();
+
+	AtSubAbort_Memory();
+
+	/*
+	 * do abort processing
+	 */
+
+	RecordSubTransactionAbort();
+
+	/* Post-abort cleanup */
+	AtSubAbort_smgr();
+
+	DeferredTriggerEndSubXact(false);
+	AtSubAbort_Portals();
+	AtSubEOXact_Inval(false);
+	AtSubAbort_Locks();
+	AtEOSubXact_SPI(false, s->transactionIdData);
+	AtSubAbort_Notify();
+	AtEOXact_GUC(false, true);
+	AtEOSubXact_gist(s->transactionIdData);
+	AtEOSubXact_hash(s->transactionIdData);
+	AtEOSubXact_rtree(s->transactionIdData);
+	AtEOSubXact_on_commit_actions(false, s->transactionIdData,
+								  s->parent->transactionIdData);
+	AtEOSubXact_RelationCache(false);
+	AtEOSubXact_CatCache(false);
+	AtEOSubXact_Buffers(false);
+
+	/*
+	 * Reset user id which might have been changed transiently.  Here we
+	 * want to restore to the userid that was current at subxact entry.
+	 * (As in AbortTransaction, we need not worry about the session userid.)
+	 *
+	 * Must do this after AtEOXact_GUC to handle the case where we entered
+	 * the subxact inside a SECURITY DEFINER function (hence current and
+	 * session userids were different) and then session auth was changed
+	 * inside the subxact.  GUC will reset both current and session userids
+	 * to the entry-time session userid.  This is right in every other
+	 * scenario so it seems simplest to let GUC do that and fix it here.
+	 */
+	SetUserId(s->currentUser);
+
+	CommandCounterIncrement();
+
+	RESUME_INTERRUPTS();
+}
+
+/*
+ * CleanupSubTransaction
+ */
+static void
+CleanupSubTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	ShowTransactionState("CleanupSubTransaction");
+
+	if (s->state != TRANS_ABORT)
+		elog(WARNING, "CleanupSubTransaction and not in aborted state");
+
+	AtSubCleanup_Portals();
+	AtSubCleanup_Memory();
+
+	s->state = TRANS_DEFAULT;
+}
+
+/*
+ * PushTransaction
+ *		Set up transaction state for a subtransaction
+ */
+static void
+PushTransaction(void)
+{
+	TransactionState    p = CurrentTransactionState;
+	TransactionState    s;
+
+	/*
+	 * We keep subtransaction state nodes in TopTransactionContext.
+	 */
+	s = (TransactionState)
+		MemoryContextAllocZero(TopTransactionContext,
+							   sizeof(TransactionStateData));
+	s->parent = p;
+	s->nestingLevel = p->nestingLevel + 1;
+	s->state = TRANS_DEFAULT;
+	s->blockState = TBLOCK_SUBBEGIN;
+
+	/* Command IDs count in a continuous sequence through subtransactions */
+	s->commandId = p->commandId;
+
+	/*
+	 * Copy down some other data so that we will have valid state until
+	 * StartSubTransaction runs.
+	 */
+	s->transactionIdData = p->transactionIdData;
+	s->curTransactionContext = p->curTransactionContext;
+
+	CurrentTransactionState = s;
+}
+
+/*
+ * PopTransaction
+ *		Pop back to parent transaction state
+ */
+static void
+PopTransaction(void)
+{
+	TransactionState s = CurrentTransactionState;
+
+	if (s->state != TRANS_DEFAULT)
+		elog(WARNING, "PopTransaction and not in default state");
+
+	if (s->parent == NULL)
+		elog(FATAL, "PopTransaction with no parent");
+
+	/* Command IDs count in a continuous sequence through subtransactions */
+	s->parent->commandId = s->commandId;
+
+	CurrentTransactionState = s->parent;
+
+	/* Let's just make sure CurTransactionContext is good */
+	CurTransactionContext = s->parent->curTransactionContext;
+	MemoryContextSwitchTo(CurTransactionContext);
+
+	/* Free the old child structure */
+	pfree(s);
+}
+
+/*
+ * ShowTransactionState
+ *		Debug support
+ */
+static void
+ShowTransactionState(const char *str)
+{
+	/* skip work if message will definitely not be printed */
+	if (log_min_messages <= DEBUG2 || client_min_messages <= DEBUG2)
+	{
+		elog(DEBUG2, "%s", str);
+		ShowTransactionStateRec(CurrentTransactionState);
+	}
+}
+
+/*
+ * ShowTransactionStateRec
+ *		Recursive subroutine for ShowTransactionState
+ */
+static void
+ShowTransactionStateRec(TransactionState s)
+{
+	if (s->parent)
+		ShowTransactionStateRec(s->parent);
+
+	/* use ereport to suppress computation if msg will not be printed */
+	ereport(DEBUG2,
+			(errmsg_internal("blockState: %13s; state: %7s, xid/cid: %u/%02u, nestlvl: %d, children: %s",
+							 BlockStateAsString(s->blockState),
+							 TransStateAsString(s->state),
+							 (unsigned int) s->transactionIdData,
+							 (unsigned int) s->commandId,
+							 s->nestingLevel,
+							 nodeToString(s->childXids))));
+}
+
+/*
+ * BlockStateAsString
+ *		Debug support
+ */
+static const char *
+BlockStateAsString(TBlockState blockState)
+{
+	switch (blockState) {
+		case TBLOCK_DEFAULT:
+			return "DEFAULT";
+		case TBLOCK_STARTED:
+			return "STARTED";
+		case TBLOCK_BEGIN:
+			return "BEGIN";
+		case TBLOCK_INPROGRESS:
+			return "INPROGRESS";
+		case TBLOCK_END:
+			return "END";
+		case TBLOCK_ABORT:
+			return "ABORT";
+		case TBLOCK_ENDABORT:
+			return "ENDABORT";
+		case TBLOCK_SUBBEGIN:
+			return "SUB BEGIN";
+		case TBLOCK_SUBBEGINABORT:
+			return "SUB BEGIN AB";
+		case TBLOCK_SUBINPROGRESS:
+			return "SUB INPROGRS";
+		case TBLOCK_SUBEND:
+			return "SUB END";
+		case TBLOCK_SUBABORT:
+			return "SUB ABORT";
+		case TBLOCK_SUBENDABORT_OK:
+			return "SUB ENDAB OK";
+		case TBLOCK_SUBENDABORT_ERROR:
+			return "SUB ENDAB ERR";
+	}
+	return "UNRECOGNIZED";
+}
+
+/*
+ * TransStateAsString
+ *		Debug support
+ */
+static const char *
+TransStateAsString(TransState state)
+{
+	switch (state) {
+		case TRANS_DEFAULT:
+			return "DEFAULT";
+		case TRANS_START:
+			return "START";
+		case TRANS_COMMIT:
+			return "COMMIT";
+		case TRANS_ABORT:
+			return "ABORT";
+		case TRANS_INPROGRESS:
+			return "INPROGR";
+	}
+	return "UNRECOGNIZED";
+}
+
+/*
+ * xactGetCommittedChildren
+ *
+ * Gets the list of committed children of the current transaction.  The return
+ * value is the number of entries returned.  *ptr is set to point to a
+ * palloc'd array of TransactionIds.  If there is nothing to return, *ptr is
+ * set to NULL.
+ *
+ * If metoo is true, the current TransactionId is appended and counted too.
+ */
+int
+xactGetCommittedChildren(TransactionId **ptr, bool metoo)
+{
+	TransactionState	s = CurrentTransactionState;
+	int					nchildren;
+	TransactionId	   *children;
+	ListCell		   *p;
+
+	nchildren = list_length(s->childXids);
+	if (metoo)
+		nchildren++;
+	if (nchildren == 0)
+	{
+		*ptr = NULL;
+		return 0;
+	}
+
+	children = (TransactionId *) palloc(nchildren * sizeof(TransactionId));
+	*ptr = children;
+
+	foreach(p, s->childXids)
+	{
+		TransactionId child = lfirst_int(p);
+		*children++ = (TransactionId)child;
+	}
+	if (metoo)
+		*children = s->transactionIdData;
+
+	return nchildren;
+}
 
 /*
  *	XLOG support routines
@@ -1809,13 +2855,14 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
 	if (info == XLOG_XACT_COMMIT)
 	{
 		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
-		int		nfiles;
 		int		i;
 
 		TransactionIdCommit(record->xl_xid);
+		/* Mark committed subtransactions as committed */
+		TransactionIdCommitTree(xlrec->nsubxacts,
+								(TransactionId *) &(xlrec->xnodes[xlrec->nrels]));
 		/* Make sure files supposed to be dropped are dropped */
-		nfiles = (record->xl_len - MinSizeOfXactCommit) / sizeof(RelFileNode);
-		for (i = 0; i < nfiles; i++)
+		for (i = 0; i < xlrec->nrels; i++)
 		{
 			XLogCloseRelation(xlrec->xnodes[i]);
 			smgrdounlink(smgropen(xlrec->xnodes[i]), false, true);
@@ -1824,13 +2871,14 @@ xact_redo(XLogRecPtr lsn, XLogRecord *record)
 	else if (info == XLOG_XACT_ABORT)
 	{
 		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
-		int		nfiles;
 		int		i;
 
 		TransactionIdAbort(record->xl_xid);
+		/* mark subtransactions as aborted */
+		TransactionIdAbortTree(xlrec->nsubxacts,
+							   (TransactionId *) &(xlrec->xnodes[xlrec->nrels]));
 		/* Make sure files supposed to be dropped are dropped */
-		nfiles = (record->xl_len - MinSizeOfXactAbort) / sizeof(RelFileNode);
-		for (i = 0; i < nfiles; i++)
+		for (i = 0; i < xlrec->nrels; i++)
 		{
 			XLogCloseRelation(xlrec->xnodes[i]);
 			smgrdounlink(smgropen(xlrec->xnodes[i]), false, true);
@@ -1855,6 +2903,7 @@ void
 xact_desc(char *buf, uint8 xl_info, char *rec)
 {
 	uint8		info = xl_info & ~XLR_INFO_MASK;
+	int i;
 
 	if (info == XLOG_XACT_COMMIT)
 	{
@@ -1864,7 +2913,25 @@ xact_desc(char *buf, uint8 xl_info, char *rec)
 		sprintf(buf + strlen(buf), "commit: %04u-%02u-%02u %02u:%02u:%02u",
 				tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
 				tm->tm_hour, tm->tm_min, tm->tm_sec);
-		/* XXX can't show RelFileNodes for lack of access to record length */
+		if (xlrec->nrels > 0)
+		{
+			sprintf(buf + strlen(buf), "; rels:");
+			for (i = 0; i < xlrec->nrels; i++)
+			{
+				RelFileNode rnode = xlrec->xnodes[i];
+				sprintf(buf + strlen(buf), " %u/%u/%u",
+						rnode.spcNode, rnode.dbNode, rnode.relNode);
+			}
+		}
+		if (xlrec->nsubxacts > 0)
+		{
+			TransactionId *xacts = (TransactionId *)
+				&xlrec->xnodes[xlrec->nrels];
+
+			sprintf(buf + strlen(buf), "; subxacts:");
+			for (i = 0; i < xlrec->nsubxacts; i++)
+				sprintf(buf + strlen(buf), " %u", xacts[i]);
+		}
 	}
 	else if (info == XLOG_XACT_ABORT)
 	{
@@ -1874,7 +2941,25 @@ xact_desc(char *buf, uint8 xl_info, char *rec)
 		sprintf(buf + strlen(buf), "abort: %04u-%02u-%02u %02u:%02u:%02u",
 				tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday,
 				tm->tm_hour, tm->tm_min, tm->tm_sec);
-		/* XXX can't show RelFileNodes for lack of access to record length */
+		if (xlrec->nrels > 0)
+		{
+			sprintf(buf + strlen(buf), "; rels:");
+			for (i = 0; i < xlrec->nrels; i++)
+			{
+				RelFileNode rnode = xlrec->xnodes[i];
+				sprintf(buf + strlen(buf), " %u/%u/%u",
+						rnode.spcNode, rnode.dbNode, rnode.relNode);
+			}
+		}
+		if (xlrec->nsubxacts > 0)
+		{
+			TransactionId *xacts = (TransactionId *)
+				&xlrec->xnodes[xlrec->nrels];
+
+			sprintf(buf + strlen(buf), "; subxacts:");
+			for (i = 0; i < xlrec->nsubxacts; i++)
+				sprintf(buf + strlen(buf), " %u", xacts[i]);
+		}
 	}
 	else
 		strcat(buf, "UNKNOWN");
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index f1205640615..a6f53ba79f1 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.146 2004/06/03 02:08:00 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.147 2004/07/01 00:49:50 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,6 +22,7 @@
 #include <sys/time.h>
 
 #include "access/clog.h"
+#include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
 #include "access/xlog.h"
@@ -2755,6 +2756,7 @@ BootStrapXLOG(void)
 
 	/* Bootstrap the commit log, too */
 	BootStrapCLOG();
+	BootStrapSUBTRANS();
 }
 
 static char *
@@ -3154,6 +3156,7 @@ StartupXLOG(void)
 
 	/* Start up the commit log, too */
 	StartupCLOG();
+	StartupSUBTRANS();
 
 	ereport(LOG,
 			(errmsg("database system is ready")));
@@ -3292,6 +3295,7 @@ ShutdownXLOG(int code, Datum arg)
 	CritSectionCount++;
 	CreateCheckPoint(true, true);
 	ShutdownCLOG();
+	ShutdownSUBTRANS();
 	CritSectionCount--;
 
 	ereport(LOG,
@@ -3467,6 +3471,7 @@ CreateCheckPoint(bool shutdown, bool force)
 	END_CRIT_SECTION();
 
 	CheckPointCLOG();
+	CheckPointSUBTRANS();
 	FlushBufferPool();
 
 	START_CRIT_SECTION();
diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c
index 847f73ff06a..8e53d6af7d7 100644
--- a/src/backend/commands/async.c
+++ b/src/backend/commands/async.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/async.c,v 1.112 2004/05/26 04:41:10 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/async.c,v 1.113 2004/07/01 00:50:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -97,11 +97,17 @@
  * State for outbound notifies consists of a list of all relnames NOTIFYed
  * in the current transaction.	We do not actually perform a NOTIFY until
  * and unless the transaction commits.	pendingNotifies is NIL if no
- * NOTIFYs have been done in the current transaction.  The List nodes and
- * referenced strings are all palloc'd in TopTransactionContext.
+ * NOTIFYs have been done in the current transaction.
+ *
+ * The list is kept in CurTransactionContext.  In subtransactions, each
+ * subtransaction has its own list in its own CurTransactionContext, but
+ * successful subtransactions attach their lists to their parent's list.
+ * Failed subtransactions simply discard their lists.
  */
 static List *pendingNotifies = NIL;
 
+static List *upperPendingNotifies = NIL; /* list of upper-xact lists */
+
 /*
  * State for inbound notifies consists of two flags: one saying whether
  * the signal handler is currently allowed to call ProcessIncomingNotify
@@ -155,11 +161,11 @@ Async_Notify(char *relname)
 	{
 		/*
 		 * The name list needs to live until end of transaction, so store
-		 * it in the top transaction context.
+		 * it in the transaction context.
 		 */
 		MemoryContext oldcontext;
 
-		oldcontext = MemoryContextSwitchTo(TopTransactionContext);
+		oldcontext = MemoryContextSwitchTo(CurTransactionContext);
 
 		pendingNotifies = lcons(pstrdup(relname), pendingNotifies);
 
@@ -606,6 +612,60 @@ AtAbort_Notify(void)
 	ClearPendingNotifies();
 }
 
+/*
+ * AtSubStart_Notify() --- Take care of subtransaction start.
+ *
+ * Save the parent's pendingNotifies on the stack and start an empty list.
+ */
+void
+AtSubStart_Notify(void)
+{
+	MemoryContext	old_cxt;
+
+	/* Keep the list-of-lists in TopTransactionContext for simplicity */
+	old_cxt = MemoryContextSwitchTo(TopTransactionContext);
+
+	upperPendingNotifies = lcons(pendingNotifies, upperPendingNotifies);
+
+	pendingNotifies = NIL;		/* the new subtransaction starts with no notifies */
+
+	MemoryContextSwitchTo(old_cxt);
+}
+
+/*
+ * AtSubCommit_Notify() --- Take care of subtransaction commit.
+ *
+ * Pop the parent's saved list and merge this subxact's notifies into it.
+ */
+void
+AtSubCommit_Notify(void)
+{
+	List	*parentPendingNotifies;
+
+	parentPendingNotifies = (List *) linitial(upperPendingNotifies);
+	upperPendingNotifies = list_delete_first(upperPendingNotifies);
+
+	/*
+	 * We could try to eliminate duplicates here, but it seems not worthwhile.
+	 */
+	pendingNotifies = list_concat(parentPendingNotifies, pendingNotifies);
+}
+
+/*
+ * AtSubAbort_Notify() --- Restore the parent's pending-notifies list.
+ */
+void
+AtSubAbort_Notify(void)
+{
+	/*
+	 * All we have to do is pop the stack --- the notifies made in this
+	 * subxact are no longer interesting, and the space will be freed when
+	 * CurTransactionContext is recycled.
+	 */
+	pendingNotifies = (List *) linitial(upperPendingNotifies);
+	upperPendingNotifies = list_delete_first(upperPendingNotifies);
+}
+
 /*
  *--------------------------------------------------------------
  * NotifyInterruptHandler
@@ -951,7 +1011,7 @@ ClearPendingNotifies(void)
 	/*
 	 * We used to have to explicitly deallocate the list members and
 	 * nodes, because they were malloc'd.  Now, since we know they are
-	 * palloc'd in TopTransactionContext, we need not do that --- they'll
+	 * palloc'd in CurTransactionContext, we need not do that --- they'll
 	 * go away automatically at transaction exit.  We need only reset the
 	 * list head pointer.
 	 */
diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index cfd8bd80cc0..392822abf50 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.117 2004/06/25 21:55:53 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/tablecmds.c,v 1.118 2004/07/01 00:50:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -76,8 +76,8 @@ typedef struct OnCommitItem
 	 * entries in the list until commit so that we can roll back if
 	 * needed.
 	 */
-	bool		created_in_cur_xact;
-	bool		deleted_in_cur_xact;
+	TransactionId	creating_xid;
+	TransactionId	deleting_xid;
 } OnCommitItem;
 
 static List *on_commits = NIL;
@@ -5483,8 +5483,8 @@ register_on_commit_action(Oid relid, OnCommitAction action)
 	oc = (OnCommitItem *) palloc(sizeof(OnCommitItem));
 	oc->relid = relid;
 	oc->oncommit = action;
-	oc->created_in_cur_xact = true;
-	oc->deleted_in_cur_xact = false;
+	oc->creating_xid = GetCurrentTransactionId();
+	oc->deleting_xid = InvalidTransactionId;
 
 	on_commits = lcons(oc, on_commits);
 
@@ -5507,7 +5507,7 @@ remove_on_commit_action(Oid relid)
 
 		if (oc->relid == relid)
 		{
-			oc->deleted_in_cur_xact = true;
+			oc->deleting_xid = GetCurrentTransactionId();
 			break;
 		}
 	}
@@ -5522,6 +5522,7 @@ remove_on_commit_action(Oid relid)
 void
 PreCommit_on_commit_actions(void)
 {
+	TransactionId xid = GetCurrentTransactionId();
 	ListCell   *l;
 
 	foreach(l, on_commits)
@@ -5529,7 +5530,7 @@ PreCommit_on_commit_actions(void)
 		OnCommitItem *oc = (OnCommitItem *) lfirst(l);
 
 		/* Ignore entry if already dropped in this xact */
-		if (oc->deleted_in_cur_xact)
+		if (oc->deleting_xid == xid)
 			continue;
 
 		switch (oc->oncommit)
@@ -5556,7 +5557,7 @@ PreCommit_on_commit_actions(void)
 					 * remove_on_commit_action, so the entry should get
 					 * marked as deleted.
 					 */
-					Assert(oc->deleted_in_cur_xact);
+					Assert(oc->deleting_xid == xid);
 					break;
 				}
 		}
@@ -5572,7 +5573,7 @@ PreCommit_on_commit_actions(void)
  * during abort, remove those created during this transaction.
  */
 void
-AtEOXact_on_commit_actions(bool isCommit)
+AtEOXact_on_commit_actions(bool isCommit, TransactionId xid)
 {
 	ListCell *cur_item;
 	ListCell *prev_item;
@@ -5584,8 +5585,8 @@ AtEOXact_on_commit_actions(bool isCommit)
 	{
 		OnCommitItem *oc = (OnCommitItem *) lfirst(cur_item);
 
-		if (isCommit ? oc->deleted_in_cur_xact :
-			oc->created_in_cur_xact)
+		if (isCommit ? TransactionIdEquals(oc->deleting_xid, xid) :
+			TransactionIdEquals(oc->creating_xid, xid))
 		{
 			/* cur_item must be removed */
 			on_commits = list_delete_cell(on_commits, cur_item, prev_item);
@@ -5598,8 +5599,52 @@ AtEOXact_on_commit_actions(bool isCommit)
 		else
 		{
 			/* cur_item must be preserved */
-			oc->deleted_in_cur_xact = false;
-			oc->created_in_cur_xact = false;
+			oc->creating_xid = InvalidTransactionId;
+			oc->deleting_xid = InvalidTransactionId;
+			prev_item = cur_item;
+			cur_item = lnext(prev_item);
+		}
+	}
+}
+
+/*
+ * Post-subcommit or post-subabort cleanup for ON COMMIT management.
+ *
+ * During subabort, we can immediately remove entries created during this
+ * subtransaction.  During subcommit, just relabel entries marked during
+ * this subtransaction as being the parent's responsibility.
+ */
+void
+AtEOSubXact_on_commit_actions(bool isCommit, TransactionId childXid,
+							  TransactionId parentXid)
+{
+	ListCell *cur_item;
+	ListCell *prev_item;
+
+	prev_item = NULL;
+	cur_item = list_head(on_commits);
+
+	while (cur_item != NULL)
+	{
+		OnCommitItem *oc = (OnCommitItem *) lfirst(cur_item);
+
+		if (!isCommit && TransactionIdEquals(oc->creating_xid, childXid))
+		{
+			/* cur_item must be removed */
+			on_commits = list_delete_cell(on_commits, cur_item, prev_item);
+			pfree(oc);
+			if (prev_item)
+				cur_item = lnext(prev_item);
+			else
+				cur_item = list_head(on_commits);
+		}
+		else
+		{
+			/* cur_item must be preserved */
+			if (TransactionIdEquals(oc->creating_xid, childXid))
+				oc->creating_xid = parentXid;
+			if (TransactionIdEquals(oc->deleting_xid, childXid))
+				oc->deleting_xid = isCommit ? parentXid : InvalidTransactionId;
 			prev_item = cur_item;
 			cur_item = lnext(prev_item);
 		}
diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c
index cfbd58e4282..15f4cfa8dcb 100644
--- a/src/backend/commands/trigger.c
+++ b/src/backend/commands/trigger.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.165 2004/05/26 04:41:12 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.166 2004/07/01 00:50:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -50,9 +50,6 @@ static HeapTuple ExecCallTriggerFunc(TriggerData *trigdata,
 					MemoryContext per_tuple_context);
 static void DeferredTriggerSaveEvent(ResultRelInfo *relinfo, int event,
 				   bool row_trigger, HeapTuple oldtup, HeapTuple newtup);
-static void DeferredTriggerExecute(DeferredTriggerEvent event, int itemno,
-					Relation rel, TriggerDesc *trigdesc, FmgrInfo *finfo,
-					   MemoryContext per_tuple_context);
 
 
 /*
@@ -1639,47 +1636,130 @@ ltrmark:;
 
 /* ----------
  * Deferred trigger stuff
+ *
+ * The DeferredTriggersData struct holds data about pending deferred
+ * trigger events during the current transaction tree.  The struct and
+ * most of its subsidiary data are kept in TopTransactionContext; however
+ * the individual event records are kept in CurTransactionContext, so that
+ * they will easily go away during subtransaction abort.
+ *
+ * DeferredTriggersData has the following fields:
+ *
+ * state keeps track of the deferred state of each trigger
+ * (including the global state).  This is saved and restored across
+ * failed subtransactions.
+ *
+ * events is the head of the list of events.
+ *
+ * tail_thisxact points to the tail of the list, for the current
+ * transaction (whether main transaction or subtransaction).  We always
+ * append to the list using this pointer.
+ *
+ * events_imm points to the last element scanned by the last
+ * deferredTriggerInvokeEvents call.  We can use this to avoid rescanning
+ * unnecessarily; if it's NULL, the scan should start at the head of the
+ * list.  Its name comes from the fact that it's set to the last event fired
+ * by the last call to immediate triggers.
+ *
+ * tail_stack and imm_stack are stacks of pointers, which hold the pointers
+ * to the tail and the "immediate" events as of the start of a subtransaction.
+ * We use them to revert those pointers when aborting the subtransaction.
+ *
+ * state_stack is a stack of pointers to saved copies of the deferred-trigger
+ * state data; each subtransaction level that modifies that state first
+ * saves a copy, which we use to restore the state if we abort.
+ *
+ * numpushed and numalloc keep control of allocation and storage in the above
+ * stacks.  numpushed is essentially the current subtransaction nesting depth.
+ *
+ * XXX We need to be able to save the per-event data in a file if it grows too
+ * large.
  * ----------
  */
 
-typedef struct DeferredTriggersData
+/* Per-item data */
+typedef struct DeferredTriggerEventItem
 {
-	/* Internal data is held in a per-transaction memory context */
-	MemoryContext deftrig_cxt;
-	/* ALL DEFERRED or ALL IMMEDIATE */
-	bool		deftrig_all_isset;
-	bool		deftrig_all_isdeferred;
-	/* Per trigger state */
-	List	   *deftrig_trigstates;
-	/* List of pending deferred triggers. Previous comment below */
-	DeferredTriggerEvent deftrig_events;
-	DeferredTriggerEvent deftrig_events_imm;
-	DeferredTriggerEvent deftrig_event_tail;
-} DeferredTriggersData;
+	Oid			dti_tgoid;
+	TransactionId dti_done_xid;
+	int32		dti_state;
+} DeferredTriggerEventItem;
 
-/* ----------
- * deftrig_events, deftrig_event_tail:
- * The list of pending deferred trigger events during the current transaction.
+typedef struct DeferredTriggerEventData *DeferredTriggerEvent;
+
+/* Per-event data; events are chained into a list through dte_next */
+typedef struct DeferredTriggerEventData
+{
+	DeferredTriggerEvent dte_next;		/* list link */
+	int32		dte_event;		/* flag bits, e.g. TRIGGER_DEFERRED_DONE */
+	Oid			dte_relid;
+	TransactionId dte_done_xid;	/* xact that marked the event DONE */
+	ItemPointerData dte_oldctid;
+	ItemPointerData dte_newctid;
+	int32		dte_n_items;	/* number of entries in dte_item[] */
+	/* dte_item is actually a variable-size array, of length dte_n_items */
+	DeferredTriggerEventItem dte_item[1];
+} DeferredTriggerEventData;
+
+/* Per-trigger status data */
+typedef struct DeferredTriggerStatusData
+{
+	Oid			dts_tgoid;
+	bool		dts_tgisdeferred;
+} DeferredTriggerStatusData;
+
+typedef struct DeferredTriggerStatusData *DeferredTriggerStatus;
+
+
+/*
+ * Trigger deferral status data.
  *
- * deftrig_events is the head, deftrig_event_tail is the last entry.
- * Because this can grow pretty large, we don't use separate List nodes,
- * but instead thread the list through the dte_next fields of the member
- * nodes.  Saves just a few bytes per entry, but that adds up.
+ * We make this a single palloc'd object so it can be copied and freed easily.
  *
- * deftrig_events_imm holds the tail pointer as of the last
- * deferredTriggerInvokeEvents call; we can use this to avoid rescanning
- * entries unnecessarily.  It is NULL if deferredTriggerInvokeEvents
- * hasn't run since the last state change.
+ * all_isset and all_isdeferred are used to keep track
+ * of SET CONSTRAINTS ALL {DEFERRED, IMMEDIATE}.
  *
- * XXX Need to be able to shove this data out to a file if it grows too
- *	   large...
- * ----------
+ * trigstates[] stores per-trigger tgisdeferred settings.
  */
+typedef struct DeferredTriggerStateData
+{
+	bool	all_isset;
+	bool	all_isdeferred;
+	int		numstates;			/* number of trigstates[] entries in use */
+	int		numalloc;			/* allocated size of trigstates[] */
+	DeferredTriggerStatusData trigstates[1]; /* VARIABLE LENGTH ARRAY */
+} DeferredTriggerStateData;
+
+typedef DeferredTriggerStateData *DeferredTriggerState;
+
+/* Per-transaction data (fields are described in the comment block above) */
+typedef struct DeferredTriggersData
+{
+	DeferredTriggerState	state;			/* current deferral state */
+	DeferredTriggerEvent	events;			/* head of the event list */
+	DeferredTriggerEvent	tail_thisxact;	/* list tail; append here */
+	DeferredTriggerEvent	events_imm;		/* last event scanned, or NULL */
+	DeferredTriggerEvent   *tail_stack;		/* saved tails, per subxact */
+	DeferredTriggerEvent   *imm_stack;		/* saved events_imm, per subxact */
+	DeferredTriggerState   *state_stack;	/* saved state copies */
+	int						numpushed;		/* stack entries in use */
+	int						numalloc;		/* allocated size of the stacks */
+} DeferredTriggersData;
 
 typedef DeferredTriggersData *DeferredTriggers;
 
 static DeferredTriggers deferredTriggers;
 
+
+static void DeferredTriggerExecute(DeferredTriggerEvent event, int itemno,
+					Relation rel, TriggerDesc *trigdesc, FmgrInfo *finfo,
+					   MemoryContext per_tuple_context);
+static DeferredTriggerState DeferredTriggerStateCreate(int numalloc);
+static DeferredTriggerState DeferredTriggerStateCopy(DeferredTriggerState state);
+static DeferredTriggerState DeferredTriggerStateAddItem(DeferredTriggerState state,
+														Oid tgoid, bool tgisdeferred);
+
+
 /* ----------
  * deferredTriggerCheckState()
  *
@@ -1690,13 +1770,12 @@ static DeferredTriggers deferredTriggers;
 static bool
 deferredTriggerCheckState(Oid tgoid, int32 itemstate)
 {
-	MemoryContext oldcxt;
-	ListCell   *sl;
-	DeferredTriggerStatus trigstate;
+	bool	tgisdeferred;
+	int		i;
 
 	/*
-	 * Not deferrable triggers (i.e. normal AFTER ROW triggers and
-	 * constraints declared NOT DEFERRABLE, the state is always false.
+	 * For not-deferrable triggers (i.e. normal AFTER ROW triggers and
+	 * constraints declared NOT DEFERRABLE), the state is always false.
 	 */
 	if ((itemstate & TRIGGER_DEFERRED_DEFERRABLE) == 0)
 		return false;
@@ -1704,37 +1783,29 @@ deferredTriggerCheckState(Oid tgoid, int32 itemstate)
 	/*
 	 * Lookup if we know an individual state for this trigger
 	 */
-	foreach(sl, deferredTriggers->deftrig_trigstates)
+	for (i = 0; i < deferredTriggers->state->numstates; i++)
 	{
-		trigstate = (DeferredTriggerStatus) lfirst(sl);
-		if (trigstate->dts_tgoid == tgoid)
-			return trigstate->dts_tgisdeferred;
+		if (deferredTriggers->state->trigstates[i].dts_tgoid == tgoid)
+			return deferredTriggers->state->trigstates[i].dts_tgisdeferred;
 	}
 
 	/*
 	 * No individual state known - so if the user issued a SET CONSTRAINT
 	 * ALL ..., we return that instead of the triggers default state.
 	 */
-	if (deferredTriggers->deftrig_all_isset)
-		return deferredTriggers->deftrig_all_isdeferred;
+	if (deferredTriggers->state->all_isset)
+		return deferredTriggers->state->all_isdeferred;
 
 	/*
 	 * No ALL state known either, remember the default state as the
-	 * current and return that.
+	 * current and return that.  (XXX why do we bother making a state entry?)
 	 */
-	oldcxt = MemoryContextSwitchTo(deferredTriggers->deftrig_cxt);
+	tgisdeferred = ((itemstate & TRIGGER_DEFERRED_INITDEFERRED) != 0);
+	deferredTriggers->state =
+		DeferredTriggerStateAddItem(deferredTriggers->state,
+									tgoid, tgisdeferred);
 
-	trigstate = (DeferredTriggerStatus)
-		palloc(sizeof(DeferredTriggerStatusData));
-	trigstate->dts_tgoid = tgoid;
-	trigstate->dts_tgisdeferred =
-		((itemstate & TRIGGER_DEFERRED_INITDEFERRED) != 0);
-	deferredTriggers->deftrig_trigstates =
-		lappend(deferredTriggers->deftrig_trigstates, trigstate);
-
-	MemoryContextSwitchTo(oldcxt);
-
-	return trigstate->dts_tgisdeferred;
+	return tgisdeferred;
 }
 
 
@@ -1747,22 +1818,18 @@ deferredTriggerCheckState(Oid tgoid, int32 itemstate)
 static void
 deferredTriggerAddEvent(DeferredTriggerEvent event)
 {
-	/*
-	 * Since the event list could grow quite long, we keep track of the
-	 * list tail and append there, rather than just doing a stupid
-	 * "lappend". This avoids O(N^2) behavior for large numbers of events.
-	 */
-	event->dte_next = NULL;
-	if (deferredTriggers->deftrig_event_tail == NULL)
+	Assert(event->dte_next == NULL);
+
+	if (deferredTriggers->tail_thisxact == NULL)
 	{
 		/* first list entry */
-		deferredTriggers->deftrig_events = event;
-		deferredTriggers->deftrig_event_tail = event;
+		deferredTriggers->events = event;
+		deferredTriggers->tail_thisxact = event;
 	}
 	else
 	{
-		deferredTriggers->deftrig_event_tail->dte_next = event;
-		deferredTriggers->deftrig_event_tail = event;
+		deferredTriggers->tail_thisxact->dte_next = event;
+		deferredTriggers->tail_thisxact = event;
 	}
 }
 
@@ -1915,18 +1982,18 @@ deferredTriggerInvokeEvents(bool immediate_only)
 
 	/*
 	 * If immediate_only is true, then the only events that could need
-	 * firing are those since deftrig_events_imm.  (But if
-	 * deftrig_events_imm is NULL, we must scan the entire list.)
+	 * firing are those since events_imm.  (But if
+	 * events_imm is NULL, we must scan the entire list.)
 	 */
-	if (immediate_only && deferredTriggers->deftrig_events_imm != NULL)
+	if (immediate_only && deferredTriggers->events_imm != NULL)
 	{
-		prev_event = deferredTriggers->deftrig_events_imm;
+		prev_event = deferredTriggers->events_imm;
 		event = prev_event->dte_next;
 	}
 	else
 	{
 		prev_event = NULL;
-		event = deferredTriggers->deftrig_events;
+		event = deferredTriggers->events;
 	}
 
 	while (event != NULL)
@@ -1936,10 +2003,13 @@ deferredTriggerInvokeEvents(bool immediate_only)
 		int			i;
 
 		/*
-		 * Check if event is already completely done.
+		 * Skip executing cancelled events, and events done by transactions
+		 * that are not aborted.
 		 */
-		if (!(event->dte_event & (TRIGGER_DEFERRED_DONE |
-								  TRIGGER_DEFERRED_CANCELED)))
+		if (!((event->dte_event & TRIGGER_DEFERRED_CANCELED) ||
+			  ((event->dte_event & TRIGGER_DEFERRED_DONE) &&
+			   TransactionIdIsValid(event->dte_done_xid) &&
+			   !TransactionIdDidAbort(event->dte_done_xid))))
 		{
 			MemoryContextReset(per_tuple_context);
 
@@ -1948,7 +2018,9 @@ deferredTriggerInvokeEvents(bool immediate_only)
 			 */
 			for (i = 0; i < event->dte_n_items; i++)
 			{
-				if (event->dte_item[i].dti_state & TRIGGER_DEFERRED_DONE)
+				if (event->dte_item[i].dti_state & TRIGGER_DEFERRED_DONE &&
+						TransactionIdIsValid(event->dte_item[i].dti_done_xid) &&
+						!(TransactionIdDidAbort(event->dte_item[i].dti_done_xid)))
 					continue;
 
 				/*
@@ -2003,6 +2075,7 @@ deferredTriggerInvokeEvents(bool immediate_only)
 									   per_tuple_context);
 
 				event->dte_item[i].dti_state |= TRIGGER_DEFERRED_DONE;
+				event->dte_item[i].dti_done_xid = GetCurrentTransactionId();
 			}					/* end loop over items within event */
 		}
 
@@ -2022,23 +2095,27 @@ deferredTriggerInvokeEvents(bool immediate_only)
 		}
 		else
 		{
-			/* Done */
-			if (immediate_only)
+			/*
+			 * We can drop an item if it's done, but only if we're not
+			 * inside a subtransaction because it could abort later on.
+			 * We will want to check the item again if it does.
+			 */
+			if (immediate_only && !IsSubTransaction())
 			{
 				/* delink it from list and free it */
 				if (prev_event)
 					prev_event->dte_next = next_event;
 				else
-					deferredTriggers->deftrig_events = next_event;
+					deferredTriggers->events = next_event;
 				pfree(event);
 			}
 			else
 			{
 				/*
-				 * We will clean up later, but just for paranoia's sake,
-				 * mark the event done.
+				 * Mark the event done.
 				 */
 				event->dte_event |= TRIGGER_DEFERRED_DONE;
+				event->dte_done_xid = GetCurrentTransactionId();
 			}
 		}
 
@@ -2046,10 +2123,10 @@ deferredTriggerInvokeEvents(bool immediate_only)
 	}
 
 	/* Update list tail pointer in case we just deleted tail event */
-	deferredTriggers->deftrig_event_tail = prev_event;
+	deferredTriggers->tail_thisxact = prev_event;
 
 	/* Set the immediate event pointer for next time */
-	deferredTriggers->deftrig_events_imm = prev_event;
+	deferredTriggers->events_imm = prev_event;
 
 	/* Release working resources */
 	if (rel)
@@ -2060,23 +2137,6 @@ deferredTriggerInvokeEvents(bool immediate_only)
 	MemoryContextDelete(per_tuple_context);
 }
 
-
-/* ----------
- * DeferredTriggerInit()
- *
- *	Initialize the deferred trigger mechanism. This is called during
- *	backend startup and is guaranteed to be before the first of all
- *	transactions.
- * ----------
- */
-void
-DeferredTriggerInit(void)
-{
-	/* Nothing to do */
-	;
-}
-
-
 /* ----------
  * DeferredTriggerBeginXact()
  *
@@ -2087,34 +2147,24 @@ DeferredTriggerInit(void)
 void
 DeferredTriggerBeginXact(void)
 {
-	/*
-	 * This will be changed to a special context when the nested
-	 * transactions project moves forward.
-	 */
-	MemoryContext cxt = TopTransactionContext;
-
-	deferredTriggers = (DeferredTriggers) MemoryContextAlloc(TopTransactionContext,
-										   sizeof(DeferredTriggersData));
+	Assert(deferredTriggers == NULL);
 
-	/*
-	 * Create the per transaction memory context
-	 */
-	deferredTriggers->deftrig_cxt = AllocSetContextCreate(cxt,
-												   "DeferredTriggerXact",
-												ALLOCSET_DEFAULT_MINSIZE,
-											   ALLOCSET_DEFAULT_INITSIZE,
-											   ALLOCSET_DEFAULT_MAXSIZE);
+	deferredTriggers = (DeferredTriggers)
+		MemoryContextAlloc(TopTransactionContext,
+						   sizeof(DeferredTriggersData));
 
 	/*
 	 * If unspecified, constraints default to IMMEDIATE, per SQL
 	 */
-	deferredTriggers->deftrig_all_isdeferred = false;
-	deferredTriggers->deftrig_all_isset = false;
-
-	deferredTriggers->deftrig_trigstates = NIL;
-	deferredTriggers->deftrig_events = NULL;
-	deferredTriggers->deftrig_events_imm = NULL;
-	deferredTriggers->deftrig_event_tail = NULL;
+	deferredTriggers->state = DeferredTriggerStateCreate(8);
+	deferredTriggers->events = NULL;
+	deferredTriggers->events_imm = NULL;
+	deferredTriggers->tail_thisxact = NULL;
+	deferredTriggers->tail_stack = NULL;
+	deferredTriggers->imm_stack = NULL;
+	deferredTriggers->state_stack = NULL;
+	deferredTriggers->numalloc = 0;
+	deferredTriggers->numpushed = 0;
 }
 
 
@@ -2156,6 +2206,12 @@ DeferredTriggerEndXact(void)
 
 	deferredTriggerInvokeEvents(false);
 
+	/*
+	 * Forget everything we know about deferred triggers.
+	 *
+	 * Since all the info is in TopTransactionContext or children thereof,
+	 * we need do nothing special to reclaim memory.
+	 */
 	deferredTriggers = NULL;
 }
 
@@ -2179,10 +2235,217 @@ DeferredTriggerAbortXact(void)
 
 	/*
 	 * Forget everything we know about deferred triggers.
+	 *
+	 * Since all the info is in TopTransactionContext or children thereof,
+	 * we need do nothing special to reclaim memory.
 	 */
 	deferredTriggers = NULL;
 }
 
+/*
+ * DeferredTriggerBeginSubXact()
+ *
+ *	Start a subtransaction.
+ */
+void
+DeferredTriggerBeginSubXact(void)
+{
+	/*
+	 * Ignore call if the transaction is in aborted state.
+	 */
+	if (deferredTriggers == NULL)
+		return;
+
+	/*
+	 * Allocate more space in the stacks if needed.
+	 */
+	if (deferredTriggers->numpushed == deferredTriggers->numalloc)
+	{
+		if (deferredTriggers->numalloc == 0)
+		{
+			MemoryContext old_cxt;
+
+			old_cxt = MemoryContextSwitchTo(TopTransactionContext);
+
+#define DEFTRIG_INITALLOC 8
+			deferredTriggers->tail_stack = (DeferredTriggerEvent *)
+				palloc(DEFTRIG_INITALLOC * sizeof(DeferredTriggerEvent));
+			deferredTriggers->imm_stack = (DeferredTriggerEvent *)
+				palloc(DEFTRIG_INITALLOC * sizeof(DeferredTriggerEvent));
+			deferredTriggers->state_stack = (DeferredTriggerState *)
+				palloc(DEFTRIG_INITALLOC * sizeof(DeferredTriggerState));
+			deferredTriggers->numalloc = DEFTRIG_INITALLOC;
+
+			MemoryContextSwitchTo(old_cxt);
+		}
+		else
+		{
+			/* repalloc will keep the stacks in the same context */
+			deferredTriggers->numalloc *= 2;
+
+			deferredTriggers->tail_stack = (DeferredTriggerEvent *)
+				repalloc(deferredTriggers->tail_stack,
+						 deferredTriggers->numalloc * sizeof(DeferredTriggerEvent));
+			deferredTriggers->imm_stack = (DeferredTriggerEvent *)
+				repalloc(deferredTriggers->imm_stack,
+						deferredTriggers->numalloc * sizeof(DeferredTriggerEvent));
+			deferredTriggers->state_stack = (DeferredTriggerState *)
+				repalloc(deferredTriggers->state_stack,
+						 deferredTriggers->numalloc * sizeof(DeferredTriggerState));
+		}
+	}
+
+	/*
+	 * Push the current event-list tail and events_imm pointers onto
+	 * their stacks so a subtransaction abort can restore them.
+	 */
+	deferredTriggers->tail_stack[deferredTriggers->numpushed] =
+		deferredTriggers->tail_thisxact;
+	deferredTriggers->imm_stack[deferredTriggers->numpushed] =
+		deferredTriggers->events_imm;
+	/* State is not saved until/unless changed */
+	deferredTriggers->state_stack[deferredTriggers->numpushed] = NULL;
+
+	deferredTriggers->numpushed++;
+}
+
+/*
+ * DeferredTriggerEndSubXact()
+ *
+ *	The current subtransaction is ending.
+ */
+void
+DeferredTriggerEndSubXact(bool isCommit)
+{
+	DeferredTriggerState state;
+
+	/*
+	 * Ignore call if the transaction is in aborted state.
+	 */
+	if (deferredTriggers == NULL)
+		return;
+
+	/*
+	 * Move back the "top of the stack."
+	 */
+	Assert(deferredTriggers->numpushed > 0);
+
+	deferredTriggers->numpushed--;
+
+	if (isCommit)
+	{
+		/* If we saved a prior state, we don't need it anymore */
+		state = deferredTriggers->state_stack[deferredTriggers->numpushed];
+		if (state != NULL)
+			pfree(state);
+	}
+	else
+	{
+		/*
+		 * Aborting --- restore the pointers from the stacks.
+		 */
+		deferredTriggers->tail_thisxact =
+			deferredTriggers->tail_stack[deferredTriggers->numpushed];
+		deferredTriggers->events_imm =
+			deferredTriggers->imm_stack[deferredTriggers->numpushed];
+
+		/*
+		 * Cleanup the head and the tail of the list.
+		 */
+		if (deferredTriggers->tail_thisxact == NULL)
+			deferredTriggers->events = NULL;
+		else
+			deferredTriggers->tail_thisxact->dte_next = NULL;
+
+		/*
+		 * We don't need to free the items, since the CurTransactionContext
+		 * will be reset shortly.
+		 */
+
+		/*
+		 * Restore the trigger state.  If the saved state is NULL, then
+		 * this subxact didn't save it, so it doesn't need restoring.
+		 */
+		state = deferredTriggers->state_stack[deferredTriggers->numpushed];
+		if (state != NULL)
+		{
+			pfree(deferredTriggers->state);
+			deferredTriggers->state = state;
+		}
+	}
+}
+
+/*
+ * Create an empty DeferredTriggerState with room for numalloc trigstates
+ */
+static DeferredTriggerState
+DeferredTriggerStateCreate(int numalloc)
+{
+	DeferredTriggerState state;
+
+	/* Behave sanely with numalloc <= 0 */
+	if (numalloc <= 0)
+		numalloc = 1;
+
+	/*
+	 * We assume that zeroing will correctly initialize the state values.
+	 */
+	state = (DeferredTriggerState)
+		MemoryContextAllocZero(TopTransactionContext,
+							   sizeof(DeferredTriggerStateData) +
+							   (numalloc - 1) * sizeof(DeferredTriggerStatusData));
+
+	state->numalloc = numalloc;
+
+	return state;
+}
+
+/*
+ * Copy a DeferredTriggerState
+ */
+static DeferredTriggerState
+DeferredTriggerStateCopy(DeferredTriggerState origstate)
+{
+	DeferredTriggerState state;
+
+	state = DeferredTriggerStateCreate(origstate->numstates);
+
+	state->all_isset = origstate->all_isset;
+	state->all_isdeferred = origstate->all_isdeferred;
+	state->numstates = origstate->numstates;
+	memcpy(state->trigstates, origstate->trigstates,
+		   origstate->numstates * sizeof(DeferredTriggerStatusData));
+
+	return state;
+}
+
+/*
+ * Add a per-trigger item to a DeferredTriggerState.  Returns possibly-changed
+ * pointer to the state object (it will change if we have to repalloc).
+ */
+static DeferredTriggerState
+DeferredTriggerStateAddItem(DeferredTriggerState state,
+							Oid tgoid, bool tgisdeferred)
+{
+	if (state->numstates >= state->numalloc)
+	{
+		int		newalloc = state->numalloc * 2;
+
+		newalloc = Max(newalloc, 8); /* in case original has size 0 */
+		state = (DeferredTriggerState)
+			repalloc(state,
+					 sizeof(DeferredTriggerStateData) +
+					 (newalloc - 1) * sizeof(DeferredTriggerStatusData));
+		state->numalloc = newalloc;
+		Assert(state->numstates < state->numalloc);
+	}
+
+	state->trigstates[state->numstates].dts_tgoid = tgoid;
+	state->trigstates[state->numstates].dts_tgisdeferred = tgisdeferred;
+	state->numstates++;
+
+	return state;
+}
 
 /* ----------
  * DeferredTriggerSetState()
@@ -2193,14 +2456,23 @@ DeferredTriggerAbortXact(void)
 void
 DeferredTriggerSetState(ConstraintsSetStmt *stmt)
 {
-	ListCell	   *l;
-
 	/*
 	 * Ignore call if we aren't in a transaction.
 	 */
 	if (deferredTriggers == NULL)
 		return;
 
+	/*
+	 * If in a subtransaction, and we didn't save the current state already,
+	 * save it so it can be restored if the subtransaction aborts.
+	 */
+	if (deferredTriggers->numpushed > 0 &&
+		deferredTriggers->state_stack[deferredTriggers->numpushed - 1] == NULL)
+	{
+		deferredTriggers->state_stack[deferredTriggers->numpushed - 1] =
+			DeferredTriggerStateCopy(deferredTriggers->state);
+	}
+
 	/*
 	 * Handle SET CONSTRAINTS ALL ...
 	 */
@@ -2210,23 +2482,19 @@ DeferredTriggerSetState(ConstraintsSetStmt *stmt)
 		 * Drop all per-transaction information about individual trigger
 		 * states.
 		 */
-		list_free_deep(deferredTriggers->deftrig_trigstates);
-		deferredTriggers->deftrig_trigstates = NIL;
+		deferredTriggers->state->numstates = 0;
 
 		/*
 		 * Set the per-transaction ALL state to known.
 		 */
-		deferredTriggers->deftrig_all_isset = true;
-		deferredTriggers->deftrig_all_isdeferred = stmt->deferred;
+		deferredTriggers->state->all_isset = true;
+		deferredTriggers->state->all_isdeferred = stmt->deferred;
 	}
 	else
 	{
 		Relation	tgrel;
-		MemoryContext oldcxt;
-		bool		found;
-		DeferredTriggerStatus state;
-		ListCell   *ls;
-		List	   *loid = NIL;
+		ListCell   *l;
+		List	   *oidlist = NIL;
 
 		/* ----------
 		 * Handle SET CONSTRAINTS constraint-name [, ...]
@@ -2241,6 +2509,7 @@ DeferredTriggerSetState(ConstraintsSetStmt *stmt)
 			ScanKeyData skey;
 			SysScanDesc tgscan;
 			HeapTuple	htup;
+			bool		found;
 
 			/*
 			 * Check that only named constraints are set explicitly
@@ -2285,7 +2554,7 @@ DeferredTriggerSetState(ConstraintsSetStmt *stmt)
 									cname)));
 
 				constr_oid = HeapTupleGetOid(htup);
-				loid = lappend_oid(loid, constr_oid);
+				oidlist = lappend_oid(oidlist, constr_oid);
 				found = true;
 			}
 
@@ -2305,34 +2574,28 @@ DeferredTriggerSetState(ConstraintsSetStmt *stmt)
 		 * Inside of a transaction block set the trigger states of
 		 * individual triggers on transaction level.
 		 */
-		oldcxt = MemoryContextSwitchTo(deferredTriggers->deftrig_cxt);
-
-		foreach(l, loid)
+		foreach(l, oidlist)
 		{
-			found = false;
-			foreach(ls, deferredTriggers->deftrig_trigstates)
+			Oid			tgoid = lfirst_oid(l);
+			bool		found = false;
+			int			i;
+
+			for (i = 0; i < deferredTriggers->state->numstates; i++)
 			{
-				state = (DeferredTriggerStatus) lfirst(ls);
-				if (state->dts_tgoid == lfirst_oid(l))
+				if (deferredTriggers->state->trigstates[i].dts_tgoid == tgoid)
 				{
-					state->dts_tgisdeferred = stmt->deferred;
+					deferredTriggers->state->trigstates[i].dts_tgisdeferred = stmt->deferred;
 					found = true;
 					break;
 				}
 			}
 			if (!found)
 			{
-				state = (DeferredTriggerStatus)
-					palloc(sizeof(DeferredTriggerStatusData));
-				state->dts_tgoid = lfirst_oid(l);
-				state->dts_tgisdeferred = stmt->deferred;
-
-				deferredTriggers->deftrig_trigstates =
-					lappend(deferredTriggers->deftrig_trigstates, state);
+				deferredTriggers->state =
+					DeferredTriggerStateAddItem(deferredTriggers->state,
+												tgoid, stmt->deferred);
 			}
 		}
-
-		MemoryContextSwitchTo(oldcxt);
 	}
 
 	/*
@@ -2347,14 +2610,14 @@ DeferredTriggerSetState(ConstraintsSetStmt *stmt)
 	 * entire list, in case some deferred events are now immediately
 	 * invokable.
 	 */
-	deferredTriggers->deftrig_events_imm = NULL;
+	deferredTriggers->events_imm = NULL;
 }
 
 
 /* ----------
  * DeferredTriggerSaveEvent()
  *
- *	Called by ExecAR...Triggers() to add the event to the queue.
+ *	Called by ExecA[RS]...Triggers() to add the event to the queue.
  *
  *	NOTE: should be called only if we've determined that an event must
  *	be added to the queue.
@@ -2423,9 +2686,10 @@ DeferredTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger,
 		return;
 
 	/*
-	 * Create a new event
+	 * Create a new event.  We use the CurTransactionContext so the event
+	 * will automatically go away if the subtransaction aborts.
 	 */
-	oldcxt = MemoryContextSwitchTo(deferredTriggers->deftrig_cxt);
+	oldcxt = MemoryContextSwitchTo(CurTransactionContext);
 
 	new_size = offsetof(DeferredTriggerEventData, dte_item[0]) +
 		n_enabled_triggers * sizeof(DeferredTriggerEventItem);
@@ -2433,6 +2697,7 @@ DeferredTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger,
 	new_event = (DeferredTriggerEvent) palloc(new_size);
 	new_event->dte_next = NULL;
 	new_event->dte_event = event & TRIGGER_EVENT_OPMASK;
+	new_event->dte_done_xid = InvalidTransactionId;
 	if (row_trigger)
 		new_event->dte_event |= TRIGGER_EVENT_ROW;
 	new_event->dte_relid = rel->rd_id;
@@ -2449,6 +2714,7 @@ DeferredTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger,
 
 		ev_item = &(new_event->dte_item[i]);
 		ev_item->dti_tgoid = trigger->tgoid;
+		ev_item->dti_done_xid = InvalidTransactionId;
 		ev_item->dti_state =
 			((trigger->tgdeferrable) ?
 			 TRIGGER_DEFERRED_DEFERRABLE : 0) |
@@ -2517,6 +2783,7 @@ DeferredTriggerSaveEvent(ResultRelInfo *relinfo, int event, bool row_trigger,
 					 * the trigger at all.
 					 */
 					new_event->dte_item[i].dti_state |= TRIGGER_DEFERRED_DONE;
+					new_event->dte_item[i].dti_done_xid = GetCurrentTransactionId();
 				}
 			}
 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 80a021487f9..c62bc6eaf1e 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.281 2004/06/08 13:59:36 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.282 2004/07/01 00:50:11 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -25,6 +25,7 @@
 #include "access/clog.h"
 #include "access/genam.h"
 #include "access/heapam.h"
+#include "access/subtrans.h"
 #include "access/xlog.h"
 #include "catalog/catalog.h"
 #include "catalog/catname.h"
@@ -798,8 +799,9 @@ vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
 		return;
 	}
 
-	/* Truncate CLOG to the oldest vacuumxid */
+	/* Truncate CLOG and SUBTRANS to the oldest vacuumxid */
 	TruncateCLOG(vacuumXID);
+	TruncateSUBTRANS(vacuumXID);
 
 	/* Give warning about impending wraparound problems */
 	if (frozenAlreadyWrapped)
diff --git a/src/backend/commands/variable.c b/src/backend/commands/variable.c
index 4a58419079a..dfa3f7121ed 100644
--- a/src/backend/commands/variable.c
+++ b/src/backend/commands/variable.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/variable.c,v 1.97 2004/05/26 04:41:13 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/variable.c,v 1.98 2004/07/01 00:50:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -470,10 +470,17 @@ show_timezone(void)
 const char *
 assign_XactIsoLevel(const char *value, bool doit, GucSource source)
 {
-	if (doit && source >= PGC_S_INTERACTIVE && SerializableSnapshot != NULL)
+	if (doit && source >= PGC_S_INTERACTIVE)
+	{
+	   if (SerializableSnapshot != NULL)
 		ereport(ERROR,
 				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
 				 errmsg("SET TRANSACTION ISOLATION LEVEL must be called before any query")));
+	   if (IsSubTransaction())
+		ereport(ERROR,
+				(errcode(ERRCODE_ACTIVE_SQL_TRANSACTION),
+				 errmsg("SET TRANSACTION ISOLATION LEVEL must not be called in a subtransaction")));
+	}
 
 	if (strcmp(value, "serializable") == 0)
 	{
diff --git a/src/backend/executor/spi.c b/src/backend/executor/spi.c
index 91b633d9bd8..7534ddd7933 100644
--- a/src/backend/executor/spi.c
+++ b/src/backend/executor/spi.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.118 2004/06/11 01:08:43 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/spi.c,v 1.119 2004/07/01 00:50:26 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -29,6 +29,7 @@ int			SPI_result;
 
 static _SPI_connection *_SPI_stack = NULL;
 static _SPI_connection *_SPI_current = NULL;
+static int	_SPI_stack_depth = 0; /* allocated size of _SPI_stack */
 static int	_SPI_connected = -1;
 static int	_SPI_curid = -1;
 
@@ -59,7 +60,7 @@ static bool _SPI_checktuples(void);
 int
 SPI_connect(void)
 {
-	_SPI_connection *new_SPI_stack;
+	int		newdepth;
 
 	/*
 	 * When procedure called by Executor _SPI_curid expected to be equal
@@ -70,39 +71,46 @@ SPI_connect(void)
 
 	if (_SPI_stack == NULL)
 	{
-		if (_SPI_connected != -1)
+		if (_SPI_connected != -1 || _SPI_stack_depth != 0)
 			elog(ERROR, "SPI stack corrupted");
-		new_SPI_stack = (_SPI_connection *) malloc(sizeof(_SPI_connection));
+		newdepth = 16;
+		_SPI_stack = (_SPI_connection *)
+			MemoryContextAlloc(TopTransactionContext,
+							   newdepth * sizeof(_SPI_connection));
+		_SPI_stack_depth = newdepth;
 	}
 	else
 	{
-		if (_SPI_connected < 0)
+		if (_SPI_stack_depth <= 0 || _SPI_stack_depth <= _SPI_connected)
 			elog(ERROR, "SPI stack corrupted");
-		new_SPI_stack = (_SPI_connection *) realloc(_SPI_stack,
-						 (_SPI_connected + 2) * sizeof(_SPI_connection));
+		if (_SPI_stack_depth == _SPI_connected + 1)
+		{
+			newdepth = _SPI_stack_depth * 2;
+			_SPI_stack = (_SPI_connection *)
+				repalloc(_SPI_stack,
+						 newdepth * sizeof(_SPI_connection));
+			_SPI_stack_depth = newdepth;
+		}
 	}
 
-	if (new_SPI_stack == NULL)
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-
 	/*
-	 * We' returning to procedure where _SPI_curid == _SPI_connected - 1
+	 * We're entering procedure where _SPI_curid == _SPI_connected - 1
 	 */
-	_SPI_stack = new_SPI_stack;
 	_SPI_connected++;
+	Assert(_SPI_connected >= 0 && _SPI_connected < _SPI_stack_depth);
 
 	_SPI_current = &(_SPI_stack[_SPI_connected]);
 	_SPI_current->processed = 0;
 	_SPI_current->tuptable = NULL;
+	_SPI_current->connectXid = GetCurrentTransactionId();
 
 	/*
 	 * Create memory contexts for this procedure
 	 *
-	 * XXX it would be better to use PortalContext as the parent context, but
-	 * we may not be inside a portal (consider deferred-trigger
-	 * execution).
+	 * XXX it would be better to use PortalContext as the parent context,
+	 * but we may not be inside a portal (consider deferred-trigger
+	 * execution).  Perhaps CurTransactionContext would do?  For now it
+	 * doesn't matter because we clean up explicitly in AtEOSubXact_SPI().
 	 */
 	_SPI_current->procCxt = AllocSetContextCreate(TopTransactionContext,
 												  "SPI Proc",
@@ -152,28 +160,11 @@ SPI_finish(void)
 	_SPI_connected--;
 	_SPI_curid--;
 	if (_SPI_connected == -1)
-	{
-		free(_SPI_stack);
-		_SPI_stack = NULL;
 		_SPI_current = NULL;
-	}
 	else
-	{
-		_SPI_connection *new_SPI_stack;
-
-		new_SPI_stack = (_SPI_connection *) realloc(_SPI_stack,
-						 (_SPI_connected + 1) * sizeof(_SPI_connection));
-		/* This could only fail with a pretty stupid malloc package ... */
-		if (new_SPI_stack == NULL)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
-		_SPI_stack = new_SPI_stack;
 		_SPI_current = &(_SPI_stack[_SPI_connected]);
-	}
 
 	return SPI_OK_FINISH;
-
 }
 
 /*
@@ -187,23 +178,54 @@ AtEOXact_SPI(bool isCommit)
 	 * freed automatically, so we can ignore them here.  We just need to
 	 * restore our static variables to initial state.
 	 */
-	if (_SPI_stack != NULL)
-	{
-		free(_SPI_stack);
-		if (isCommit)
-			ereport(WARNING,
-					(errcode(ERRCODE_WARNING),
-					 errmsg("freeing non-empty SPI stack"),
-					 errhint("Check for missing \"SPI_finish\" calls")));
-	}
+	if (isCommit && _SPI_connected != -1)
+		ereport(WARNING,
+				(errcode(ERRCODE_WARNING),
+				 errmsg("transaction left non-empty SPI stack"),
+				 errhint("Check for missing \"SPI_finish\" calls")));
 
 	_SPI_current = _SPI_stack = NULL;
+	_SPI_stack_depth = 0;
 	_SPI_connected = _SPI_curid = -1;
 	SPI_processed = 0;
 	SPI_lastoid = InvalidOid;
 	SPI_tuptable = NULL;
 }
 
+/*
+ * Clean up SPI state at subtransaction commit or abort.
+ *
+ * During commit, there shouldn't be any unclosed entries remaining from
+ * the current subtransaction; we close them (and warn) if found.
+ */
+void
+AtEOSubXact_SPI(bool isCommit, TransactionId childXid)
+{
+	bool	found = false;
+
+	while (_SPI_connected >= 0)
+	{
+		_SPI_connection *connection = &(_SPI_stack[_SPI_connected]);
+		int		res;
+
+		if (connection->connectXid != childXid)
+			break;				/* couldn't be any underneath it either */
+
+		found = true;
+
+		_SPI_curid = _SPI_connected - 1; /* avoid begin_call error */
+		res = SPI_finish();
+		Assert(res == SPI_OK_FINISH);
+	}
+
+	if (found && isCommit)
+		ereport(WARNING,
+				(errcode(ERRCODE_WARNING),
+				 errmsg("subtransaction left non-empty SPI stack"),
+				 errhint("Check for missing \"SPI_finish\" calls")));
+}
+
+
 /* Pushes SPI stack to allow recursive SPI calls */
 void
 SPI_push(void)
@@ -1148,16 +1170,18 @@ _SPI_execute(const char *src, int tcount, _SPI_plan *plan)
 					res = SPI_ERROR_CURSOR;
 					goto fail;
 				}
-				else if (IsA(queryTree->utilityStmt, TransactionStmt))
-				{
-					res = SPI_ERROR_TRANSACTION;
-					goto fail;
-				}
 				res = SPI_OK_UTILITY;
 				if (plan == NULL)
 				{
 					ProcessUtility(queryTree->utilityStmt, dest, NULL);
-					CommandCounterIncrement();
+
+					if (IsA(queryTree->utilityStmt, TransactionStmt))
+					{
+						CommitTransactionCommand();
+						StartTransactionCommand();
+					}
+					else
+						CommandCounterIncrement();
 				}
 			}
 			else if (plan == NULL)
@@ -1273,7 +1297,14 @@ _SPI_execute_plan(_SPI_plan *plan, Datum *Values, const char *Nulls,
 			{
 				ProcessUtility(queryTree->utilityStmt, dest, NULL);
 				res = SPI_OK_UTILITY;
-				CommandCounterIncrement();
+
+				if (IsA(queryTree->utilityStmt, TransactionStmt))
+				{
+					CommitTransactionCommand();
+					StartTransactionCommand();
+				}
+				else
+					CommandCounterIncrement();
 			}
 			else
 			{
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 5616d0b3cd1..dbd4f15cefd 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -13,7 +13,7 @@
  *
  *	Copyright (c) 2001-2003, PostgreSQL Global Development Group
  *
- *	$PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.76 2004/06/26 16:32:02 tgl Exp $
+ *	$PostgreSQL: pgsql/src/backend/postmaster/pgstat.c,v 1.77 2004/07/01 00:50:36 tgl Exp $
  * ----------
  */
 #include "postgres.h"
@@ -167,6 +167,7 @@ static void pgstat_write_statsfile(void);
 static void pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 					  PgStat_StatBeEntry **betab,
 					  int *numbackends);
+static void backend_read_statsfile(void);
 
 static void pgstat_setheader(PgStat_MsgHdr *hdr, int mtype);
 static void pgstat_send(void *msg, int len);
@@ -786,12 +787,7 @@ pgstat_vacuum_tabstat(void)
 	 * If not done for this transaction, read the statistics collector
 	 * stats file into some hash tables.
 	 */
-	if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-	{
-		pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-							  &pgStatBeTable, &pgStatNumBackends);
-		pgStatDBHashXact = GetCurrentTransactionId();
-	}
+	backend_read_statsfile();
 
 	/*
 	 * Lookup our own database entry
@@ -1210,15 +1206,9 @@ pgstat_fetch_stat_dbentry(Oid dbid)
 
 	/*
 	 * If not done for this transaction, read the statistics collector
-	 * stats file into some hash tables. Be careful with the
-	 * read_statsfile() call below!
+	 * stats file into some hash tables.
 	 */
-	if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-	{
-		pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-							  &pgStatBeTable, &pgStatNumBackends);
-		pgStatDBHashXact = GetCurrentTransactionId();
-	}
+	backend_read_statsfile();
 
 	/*
 	 * Lookup the requested database
@@ -1250,15 +1240,9 @@ pgstat_fetch_stat_tabentry(Oid relid)
 
 	/*
 	 * If not done for this transaction, read the statistics collector
-	 * stats file into some hash tables. Be careful with the
-	 * read_statsfile() call below!
+	 * stats file into some hash tables.
 	 */
-	if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-	{
-		pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-							  &pgStatBeTable, &pgStatNumBackends);
-		pgStatDBHashXact = GetCurrentTransactionId();
-	}
+	backend_read_statsfile();
 
 	/*
 	 * Lookup our database.
@@ -1296,12 +1280,7 @@ pgstat_fetch_stat_tabentry(Oid relid)
 PgStat_StatBeEntry *
 pgstat_fetch_stat_beentry(int beid)
 {
-	if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-	{
-		pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-							  &pgStatBeTable, &pgStatNumBackends);
-		pgStatDBHashXact = GetCurrentTransactionId();
-	}
+	backend_read_statsfile();
 
 	if (beid < 1 || beid > pgStatNumBackends)
 		return NULL;
@@ -1320,12 +1299,7 @@ pgstat_fetch_stat_beentry(int beid)
 int
 pgstat_fetch_stat_numbackends(void)
 {
-	if (!TransactionIdEquals(pgStatDBHashXact, GetCurrentTransactionId()))
-	{
-		pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
-							  &pgStatBeTable, &pgStatNumBackends);
-		pgStatDBHashXact = GetCurrentTransactionId();
-	}
+	backend_read_statsfile();
 
 	return pgStatNumBackends;
 }
@@ -2759,11 +2733,32 @@ pgstat_read_statsfile(HTAB **dbhash, Oid onlydb,
 	fclose(fpin);
 }
 
+/*
+ * If not done for this transaction, read the statistics collector
+ * stats file into some hash tables.
+ *
+ * Because we store the hash tables in TopTransactionContext, the result
+ * is good for the entire current main transaction.
+ */
+static void
+backend_read_statsfile(void)
+{
+	TransactionId	topXid = GetTopTransactionId();
+
+	if (!TransactionIdEquals(pgStatDBHashXact, topXid))
+	{
+		Assert(!pgStatRunningInCollector);
+		pgstat_read_statsfile(&pgStatDBHash, MyDatabaseId,
+							  &pgStatBeTable, &pgStatNumBackends);
+		pgStatDBHashXact = topXid;
+	}
+}
+
 
 /* ----------
  * pgstat_recv_bestart() -
  *
- *	Process a backend starup message.
+ *	Process a backend startup message.
  * ----------
  */
 static void
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 725b79cad38..4a9ddc32432 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.171 2004/06/18 06:13:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.172 2004/07/01 00:50:46 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -45,6 +45,7 @@
 #include "storage/bufpage.h"
 #include "storage/proc.h"
 #include "storage/smgr.h"
+#include "utils/memutils.h"
 #include "utils/relcache.h"
 #include "pgstat.h"
 
@@ -64,9 +65,13 @@ long		NDirectFileRead;	/* some I/O's are direct file access.
 								 * bypass bufmgr */
 long		NDirectFileWrite;	/* e.g., I/O in psort and hashjoin. */
 
+/* List of upper-level-transaction buffer refcount arrays */
+static List *upperRefCounts = NIL;
+
 
 static void PinBuffer(BufferDesc *buf);
 static void UnpinBuffer(BufferDesc *buf);
+static void BufferFixLeak(Buffer bufnum, int32 shouldBe, bool emitWarning);
 static void WaitIO(BufferDesc *buf);
 static void StartBufferIO(BufferDesc *buf, bool forInput);
 static void TerminateBufferIO(BufferDesc *buf, int err_flag);
@@ -826,30 +831,104 @@ AtEOXact_Buffers(bool isCommit)
 	for (i = 0; i < NBuffers; i++)
 	{
 		if (PrivateRefCount[i] != 0)
-		{
-			BufferDesc *buf = &(BufferDescriptors[i]);
-
-			if (isCommit)
-				elog(WARNING,
-					 "buffer refcount leak: [%03d] "
-					 "(rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d)",
-					 i,
-					 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
-					 buf->tag.rnode.relNode,
-					 buf->tag.blockNum, buf->flags,
-					 buf->refcount, PrivateRefCount[i]);
-
-			PrivateRefCount[i] = 1;		/* make sure we release shared pin */
-			LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
-			UnpinBuffer(buf);
-			LWLockRelease(BufMgrLock);
-			Assert(PrivateRefCount[i] == 0);
-		}
+			BufferFixLeak(i, 0, isCommit);
 	}
 
 	AtEOXact_LocalBuffers(isCommit);
 }
 
+/*
+ * During subtransaction start, save buffer reference counts.
+ */
+void
+AtSubStart_Buffers(void)
+{
+	int32		   *copyRefCounts;
+	Size			rcSize;
+	MemoryContext	old_cxt;
+
+	/* this is probably the active context already, but be safe */
+	old_cxt = MemoryContextSwitchTo(CurTransactionContext);
+
+	/*
+	 * We need to copy the current state of PrivateRefCount[].  In the typical
+	 * scenario, few if any of the entries will be nonzero, and we could save
+	 * space by storing only the nonzero ones.  However, copying the whole
+	 * thing is lots simpler and faster both here and in AtEOSubXact_Buffers,
+	 * so it seems best to waste the space.
+	 */
+	rcSize = NBuffers * sizeof(int32);
+	copyRefCounts = (int32 *) palloc(rcSize);
+	memcpy(copyRefCounts, PrivateRefCount, rcSize);
+
+	/* Attach to list */
+	upperRefCounts = lcons(copyRefCounts, upperRefCounts);
+
+	MemoryContextSwitchTo(old_cxt);
+}
+
+/*
+ * AtEOSubXact_Buffers
+ *
+ * At subtransaction end, we restore the saved counts.  If committing, we
+ * complain if the refcounts don't match; if aborting, just restore silently.
+ */
+void
+AtEOSubXact_Buffers(bool isCommit)
+{
+	int32	   *oldRefCounts;
+	int			i;
+
+	oldRefCounts = (int32 *) linitial(upperRefCounts);
+	upperRefCounts = list_delete_first(upperRefCounts);
+
+	for (i = 0; i < NBuffers; i++)
+	{
+		if (PrivateRefCount[i] != oldRefCounts[i])
+			BufferFixLeak(i, oldRefCounts[i], isCommit);
+	}
+
+	pfree(oldRefCounts);
+}
+
+/*
+ * Fix a buffer refcount leak.
+ *
+ * The caller does not hold the BufMgrLock.
+ */
+static void
+BufferFixLeak(Buffer bufnum, int32 shouldBe, bool emitWarning)
+{
+	BufferDesc	*buf = &(BufferDescriptors[bufnum]);
+
+	if (emitWarning)
+		elog(WARNING,
+			 "buffer refcount leak: [%03d] (rel=%u/%u/%u, blockNum=%u, flags=0x%x, refcount=%u %d, should be=%d)",
+			 bufnum,
+			 buf->tag.rnode.spcNode, buf->tag.rnode.dbNode,
+			 buf->tag.rnode.relNode,
+			 buf->tag.blockNum, buf->flags,
+			 buf->refcount, PrivateRefCount[bufnum], shouldBe);
+
+	/* Count must exceed shouldBe; otherwise we're in a heap o' trouble */
+	if (PrivateRefCount[bufnum] <= shouldBe)
+		elog(FATAL, "buffer refcount was decreased by subtransaction");
+
+	if (shouldBe > 0)
+	{
+		/* We still keep the shared-memory pin */
+		PrivateRefCount[bufnum] = shouldBe;
+	}
+	else
+	{
+		PrivateRefCount[bufnum] = 1; /* make sure we release shared pin */
+		LWLockAcquire(BufMgrLock, LW_EXCLUSIVE);
+		UnpinBuffer(buf);
+		LWLockRelease(BufMgrLock);
+		Assert(PrivateRefCount[bufnum] == 0);
+	}
+}
+
 /*
  * FlushBufferPool
  *
diff --git a/src/backend/storage/ipc/ipci.c b/src/backend/storage/ipc/ipci.c
index 69c460306b4..4c759db9d8e 100644
--- a/src/backend/storage/ipc/ipci.c
+++ b/src/backend/storage/ipc/ipci.c
@@ -8,16 +8,16 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.68 2004/05/29 22:48:20 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/ipci.c,v 1.69 2004/07/01 00:50:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-
-#include "miscadmin.h"
 #include "access/clog.h"
+#include "access/subtrans.h"
 #include "access/xlog.h"
+#include "miscadmin.h"
 #include "postmaster/bgwriter.h"
 #include "storage/bufmgr.h"
 #include "storage/freespace.h"
@@ -70,6 +70,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
 		size += LockShmemSize(maxBackends);
 		size += XLOGShmemSize();
 		size += CLOGShmemSize();
+		size += SUBTRANSShmemSize();
 		size += LWLockShmemSize();
 		size += SInvalShmemSize(maxBackends);
 		size += FreeSpaceShmemSize();
@@ -133,6 +134,7 @@ CreateSharedMemoryAndSemaphores(bool makePrivate,
 	 */
 	XLOGShmemInit();
 	CLOGShmemInit();
+	SUBTRANSShmemInit();
 	InitBufferPool();
 
 	/*
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
index 856d0f0a73f..bf4eb0f6293 100644
--- a/src/backend/storage/ipc/sinval.c
+++ b/src/backend/storage/ipc/sinval.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/sinval.c,v 1.64 2004/06/02 21:29:28 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/sinval.c,v 1.65 2004/07/01 00:50:52 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -16,6 +16,8 @@
 
 #include <signal.h>
 
+#include "access/subtrans.h"
+#include "access/transam.h"
 #include "commands/async.h"
 #include "storage/ipc.h"
 #include "storage/proc.h"
@@ -428,20 +430,40 @@ DatabaseHasActiveBackends(Oid databaseId, bool ignoreMyself)
 
 /*
  * TransactionIdIsInProgress -- is given transaction running by some backend
+ *
+ * There are three possibilities for finding a running transaction:
+ *
+ * 1. the given Xid is a main transaction Id.  We will find this out cheaply
+ * by looking at the PGPROC struct for each backend.
+ *
+ * 2. the given Xid is one of the cached subxact Xids in the PGPROC array.
+ * We can find this out cheaply too.
+ *
+ * 3. Search the SubTrans tree.  This is the slowest step, but sadly it
+ * always has to be done if the other two fail.
+ *
+ * SInvalLock has to be held while we do 1 and 2.  If we save all the Xids
+ * while doing 1, we can release the SInvalLock while we do 3.  This buys back
+ * some concurrency (we can't retrieve the main Xids from PGPROC again anyway,
+ * see GetNewTransactionId)
  */
 bool
 TransactionIdIsInProgress(TransactionId xid)
 {
-	bool		result = false;
-	SISeg	   *segP = shmInvalBuffer;
-	ProcState  *stateP = segP->procState;
-	int			index;
+	bool			result = false;
+	SISeg		   *segP = shmInvalBuffer;
+	ProcState	   *stateP = segP->procState;
+	int				i;
+	int				nxids = 0;
+	TransactionId  *xids;
+
+	xids = (TransactionId *)palloc(sizeof(TransactionId) * segP->maxBackends);
 
 	LWLockAcquire(SInvalLock, LW_SHARED);
 
-	for (index = 0; index < segP->lastBackend; index++)
+	for (i = 0; i < segP->lastBackend; i++)
 	{
-		SHMEM_OFFSET pOffset = stateP[index].procStruct;
+		SHMEM_OFFSET pOffset = stateP[i].procStruct;
 
 		if (pOffset != INVALID_OFFSET)
 		{
@@ -450,16 +472,71 @@ TransactionIdIsInProgress(TransactionId xid)
 			/* Fetch xid just once - see GetNewTransactionId */
 			TransactionId pxid = proc->xid;
 
+			/*
+			 * check the main Xid (step 1 above)
+			 */
 			if (TransactionIdEquals(pxid, xid))
 			{
 				result = true;
 				break;
 			}
+
+			/*
+			 * save the main Xid for step 3.
+			 */
+			xids[nxids++] = pxid;
+
+#ifdef NOT_USED
+			FIXME -- waiting to save the Xids in PGPROC ...
+
+			/*
+			 * check the saved Xids array (step 2)
+			 */
+			for (j = 0; j < PGPROC_MAX_SAVED_XIDS; j++)
+			{
+				pxid = proc->savedxids[j];
+
+				if (!TransactionIdIsValid(pxid))
+					break;
+
+				if (TransactionIdEquals(pxid, xid))
+				{
+					result = true;
+					break;
+				}
+			}
+#endif
+
+			if (result)
+				break;
+
 		}
 	}
 
 	LWLockRelease(SInvalLock);
 
+	/*
+	 * Step 3: have to check pg_subtrans.  Use the saved Xids.
+	 *
+	 * XXX Could save the cached Xids too for further improvement.
+	 */
+	if (!result)
+	{
+		/* this is a potentially expensive call. */
+		xid = SubTransGetTopmostTransaction(xid);
+		Assert(TransactionIdIsValid(xid));
+
+		/*
+		 * We don't care if it aborted, because if it did, we won't find
+		 * it in the array.
+		 */
+		for (i = 0; i < nxids && !result; i++)
+			if (TransactionIdEquals(xids[i], xid))
+				result = true;
+	}
+
+	/* Single exit point, so the scratch array is released on every path */
+	pfree(xids);
 	return result;
 }
 
 
@@ -596,7 +673,7 @@ GetSnapshotData(Snapshot snapshot, bool serializable)
 	 * This does open a possibility for avoiding repeated malloc/free:
 	 * since MaxBackends does not change at runtime, we can simply reuse
 	 * the previous xip array if any.  (This relies on the fact that all
-	 * calls pass static SnapshotData structs.)
+	 * callers pass static SnapshotData structs.)
 	 */
 	if (snapshot->xip == NULL)
 	{
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index e4e52b16abf..45305b4dea2 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -8,13 +8,14 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.63 2004/05/28 05:13:04 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.64 2004/07/01 00:50:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
+#include "access/subtrans.h"
 #include "access/transam.h"
 #include "access/xact.h"
 #include "catalog/catalog.h"
@@ -333,19 +334,21 @@ XactLockTableInsert(TransactionId xid)
  *		XactLockTableWait
  *
  * Wait for the specified transaction to commit or abort.
+ * We actually wait on the topmost transaction of the transaction tree.
  */
 void
 XactLockTableWait(TransactionId xid)
 {
 	LOCKTAG		tag;
 	TransactionId myxid = GetCurrentTransactionId();
+	TransactionId waitXid = SubTransGetTopmostTransaction(xid);
 
-	Assert(!TransactionIdEquals(xid, myxid));
+	Assert(!SubTransXidsHaveCommonAncestor(waitXid, myxid));
 
 	MemSet(&tag, 0, sizeof(tag));
 	tag.relId = XactLockTableId;
 	tag.dbId = InvalidOid;
-	tag.objId.xid = xid;
+	tag.objId.xid = waitXid;
 
 	if (!LockAcquire(LockTableId, &tag, myxid,
 					 ShareLock, false))
@@ -355,8 +358,13 @@ XactLockTableWait(TransactionId xid)
 
 	/*
 	 * Transaction was committed/aborted/crashed - we have to update
-	 * pg_clog if transaction is still marked as running.
+	 * pg_clog if transaction is still marked as running.  If it's a
+	 * subtransaction, we can update the parent status too.
 	 */
-	if (!TransactionIdDidCommit(xid) && !TransactionIdDidAbort(xid))
-		TransactionIdAbort(xid);
+	if (!TransactionIdDidCommit(waitXid) && !TransactionIdDidAbort(waitXid))
+	{
+		TransactionIdAbort(waitXid);
+		if (waitXid != xid)
+			TransactionIdAbort(xid);
+	}
 }
diff --git a/src/backend/storage/lmgr/lock.c b/src/backend/storage/lmgr/lock.c
index c04f3b5c88a..6b7f43440e6 100644
--- a/src/backend/storage/lmgr/lock.c
+++ b/src/backend/storage/lmgr/lock.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.133 2004/06/05 19:48:08 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lock.c,v 1.134 2004/07/01 00:50:59 tgl Exp $
  *
  * NOTES
  *	  Outside modules can create a lock table and acquire/release
@@ -23,7 +23,7 @@
  *	Interface:
  *
  *	LockAcquire(), LockRelease(), LockMethodTableInit(),
- *	LockMethodTableRename(), LockReleaseAll,
+ *	LockMethodTableRename(), LockReleaseAll(),
  *	LockCheckConflicts(), GrantLock()
  *
  *-------------------------------------------------------------------------
@@ -1129,19 +1129,25 @@ LockRelease(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
 }
 
 /*
- * LockReleaseAll -- Release all locks in a process's lock list.
+ * LockReleaseAll -- Release all locks of the specified lock method that
+ *		are held by the specified process.
  *
- * Well, not really *all* locks.
+ * Well, not necessarily *all* locks.  The available behaviors are:
  *
- * If 'allxids' is TRUE, all locks of the specified lock method are
- * released, regardless of transaction affiliation.
+ * which == ReleaseAll: release all locks regardless of transaction
+ * affiliation.
  *
- * If 'allxids' is FALSE, all locks of the specified lock method and
- * specified XID are released.
+ * which == ReleaseAllExceptSession: release all locks with Xid != 0
+ * (zero is the Xid used for "session" locks).
+ *
+ * which == ReleaseGivenXids: release only locks whose Xids appear in
+ * the xids[] array (of length nxids).
+ *
+ * xids/nxids are ignored when which != ReleaseGivenXids.
  */
 bool
 LockReleaseAll(LOCKMETHODID lockmethodid, PGPROC *proc,
-			   bool allxids, TransactionId xid)
+			   LockReleaseWhich which, int nxids, TransactionId *xids)
 {
 	SHM_QUEUE  *procHolders = &(proc->procHolders);
 	PROCLOCK   *proclock;
@@ -1190,8 +1196,25 @@ LockReleaseAll(LOCKMETHODID lockmethodid, PGPROC *proc,
 		if (LOCK_LOCKMETHOD(*lock) != lockmethodid)
 			goto next_item;
 
-		/* If not allxids, ignore items that are of the wrong xid */
-		if (!allxids && !TransactionIdEquals(xid, proclock->tag.xid))
+		if (which == ReleaseGivenXids)
+		{
+			/* Ignore locks with an Xid not in the list */
+			bool release = false;
+
+			for (i = 0; i < nxids; i++)
+			{
+				if (TransactionIdEquals(proclock->tag.xid, xids[i]))
+				{
+					release = true;
+					break;
+				}
+			}
+			if (!release)
+				goto next_item;
+		}
+		/* Ignore locks with Xid=0 unless we are asked to release All locks */
+		else if (TransactionIdEquals(proclock->tag.xid, InvalidTransactionId)
+				 && which != ReleaseAll)
 			goto next_item;
 
 		PROCLOCK_PRINT("LockReleaseAll", proclock);
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index d1410d04a05..e48531c10ac 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -15,13 +15,14 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.20 2004/06/11 16:43:24 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.21 2004/07/01 00:50:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
 #include "access/clog.h"
+#include "access/subtrans.h"
 #include "storage/lwlock.h"
 #include "storage/proc.h"
 #include "storage/spin.h"
@@ -111,6 +112,9 @@ NumLWLocks(void)
 	/* clog.c needs one per CLOG buffer + one control lock */
 	numLocks += NUM_CLOG_BUFFERS + 1;
 
+	/* subtrans.c needs one per SubTrans buffer + one control lock */
+	numLocks += NUM_SUBTRANS_BUFFERS + 1;
+
 	/* Perhaps create a few more for use by user-defined modules? */
 
 	return numLocks;
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index dbf5b414153..abe44e808ad 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.148 2004/05/29 22:48:20 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/proc.c,v 1.149 2004/07/01 00:50:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -380,26 +380,34 @@ LockWaitCancel(void)
 
 /*
  * ProcReleaseLocks() -- release locks associated with current transaction
- *			at transaction commit or abort
+ *			at main transaction and subtransaction commit or abort
  *
- * At commit, we release only locks tagged with the current transaction's XID,
- * leaving those marked with XID 0 (ie, session locks) undisturbed.  At abort,
- * we release all locks including XID 0, because we need to clean up after
- * a failure.  This logic will need extension if we ever support nested
- * transactions.
+ * The options for which locks to release are the same as for the underlying
+ * LockReleaseAll() function.
  *
- * Note that user locks are not released in either case.
+ * Notes:
+ *
+ * At main transaction commit, we release all locks except session locks.
+ * At main transaction abort, we release all locks including session locks;
+ * this lets us clean up after a VACUUM FULL failure.
+ *
+ * At subtransaction commit, we don't release any locks (so this func is not
+ * called at all); we will defer the releasing to the parent transaction.
+ * At subtransaction abort, we release all locks held by the subtransaction;
+ * this is implemented by passing in the Xids of the failed subxact and its
+ * children in the xids[] array.
+ *
+ * Note that user locks are not released in any case.
  */
 void
-ProcReleaseLocks(bool isCommit)
+ProcReleaseLocks(LockReleaseWhich which, int nxids, TransactionId *xids)
 {
 	if (!MyProc)
 		return;
 	/* If waiting, get off wait queue (should only be needed after error) */
 	LockWaitCancel();
 	/* Release locks */
-	LockReleaseAll(DEFAULT_LOCKMETHOD, MyProc,
-				   !isCommit, GetCurrentTransactionId());
+	LockReleaseAll(DEFAULT_LOCKMETHOD, MyProc, which, nxids, xids);
 }
 
 
@@ -432,11 +440,11 @@ ProcKill(int code, Datum arg)
 	LockWaitCancel();
 
 	/* Remove from the standard lock table */
-	LockReleaseAll(DEFAULT_LOCKMETHOD, MyProc, true, InvalidTransactionId);
+	LockReleaseAll(DEFAULT_LOCKMETHOD, MyProc, ReleaseAll, 0, NULL);
 
 #ifdef USER_LOCKS
 	/* Remove from the user lock table */
-	LockReleaseAll(USER_LOCKMETHOD, MyProc, true, InvalidTransactionId);
+	LockReleaseAll(USER_LOCKMETHOD, MyProc, ReleaseAll, 0, NULL);
 #endif
 
 	SpinLockAcquire(ProcStructLock);
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index c7783d878f2..5c53d48f838 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.74 2004/06/18 06:13:37 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.75 2004/07/01 00:51:07 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -80,9 +80,10 @@ static HTAB *SMgrRelationHash = NULL;
  * executed immediately, but is just entered in the list.  When and if
  * the transaction commits, we can delete the physical file.
  *
- * NOTE: the list is kept in TopMemoryContext to be sure it won't disappear
- * unbetimes.  It'd probably be OK to keep it in TopTransactionContext,
- * but I'm being paranoid.
+ * The list is kept in CurTransactionContext.  In subtransactions, each
+ * subtransaction has its own list in its own CurTransactionContext, but
+ * successful subtransactions attach their lists to their parent's list.
+ * Failed subtransactions can immediately execute the abort-time actions.
  */
 
 typedef struct PendingRelDelete
@@ -91,10 +92,11 @@ typedef struct PendingRelDelete
 	int			which;			/* which storage manager? */
 	bool		isTemp;			/* is it a temporary relation? */
 	bool		atCommit;		/* T=delete at commit; F=delete at abort */
-	struct PendingRelDelete *next;		/* linked-list link */
 } PendingRelDelete;
 
-static PendingRelDelete *pendingDeletes = NULL; /* head of linked list */
+static List *pendingDeletes = NIL;		/* head of linked list */
+
+static List *upperPendingDeletes = NIL; /* list of upper-xact lists */
 
 
 /*
@@ -305,6 +307,7 @@ smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
 	XLogRecData		rdata;
 	xl_smgr_create	xlrec;
 	PendingRelDelete *pending;
+	MemoryContext	old_cxt;
 
 	if (! (*(smgrsw[reln->smgr_which].smgr_create)) (reln, isRedo))
 		ereport(ERROR,
@@ -332,14 +335,17 @@ smgrcreate(SMgrRelation reln, bool isTemp, bool isRedo)
 	lsn = XLogInsert(RM_SMGR_ID, XLOG_SMGR_CREATE | XLOG_NO_TRAN, &rdata);
 
 	/* Add the relation to the list of stuff to delete at abort */
-	pending = (PendingRelDelete *)
-		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+	old_cxt = MemoryContextSwitchTo(CurTransactionContext);
+
+	pending = (PendingRelDelete *) palloc(sizeof(PendingRelDelete));
 	pending->relnode = reln->smgr_rnode;
 	pending->which = reln->smgr_which;
 	pending->isTemp = isTemp;
 	pending->atCommit = false;	/* delete if abort */
-	pending->next = pendingDeletes;
-	pendingDeletes = pending;
+
+	pendingDeletes = lcons(pending, pendingDeletes);
+
+	MemoryContextSwitchTo(old_cxt);
 }
 
 /*
@@ -354,16 +360,20 @@ void
 smgrscheduleunlink(SMgrRelation reln, bool isTemp)
 {
 	PendingRelDelete *pending;
+	MemoryContext	 old_cxt;
 
 	/* Add the relation to the list of stuff to delete at commit */
-	pending = (PendingRelDelete *)
-		MemoryContextAlloc(TopMemoryContext, sizeof(PendingRelDelete));
+	old_cxt = MemoryContextSwitchTo(CurTransactionContext);
+
+	pending = (PendingRelDelete *) palloc(sizeof(PendingRelDelete));
 	pending->relnode = reln->smgr_rnode;
 	pending->which = reln->smgr_which;
 	pending->isTemp = isTemp;
 	pending->atCommit = true;	/* delete if commit */
-	pending->next = pendingDeletes;
-	pendingDeletes = pending;
+
+	pendingDeletes = lcons(pending, pendingDeletes);
+
+	MemoryContextSwitchTo(old_cxt);
 
 	/*
 	 * NOTE: if the relation was created in this transaction, it will now
@@ -627,18 +637,21 @@ smgrimmedsync(SMgrRelation reln)
 void
 smgrDoPendingDeletes(bool isCommit)
 {
-	while (pendingDeletes != NULL)
+	ListCell *p;
+
+	foreach(p, pendingDeletes)
 	{
-		PendingRelDelete *pending = pendingDeletes;
+		PendingRelDelete *pending = lfirst(p);
 
-		pendingDeletes = pending->next;
 		if (pending->atCommit == isCommit)
 			smgr_internal_unlink(pending->relnode,
 								 pending->which,
 								 pending->isTemp,
 								 false);
-		pfree(pending);
 	}
+
+	/* We needn't free the cells since they are in CurTransactionContext */
+	pendingDeletes = NIL;
 }
 
 /*
@@ -647,17 +660,22 @@ smgrDoPendingDeletes(bool isCommit)
  * The return value is the number of relations scheduled for termination.
  * *ptr is set to point to a freshly-palloc'd array of RelFileNodes.
  * If there are no relations to be deleted, *ptr is set to NULL.
+ *
+ * Note that the list does not include anything scheduled for termination
+ * by upper-level transactions.
  */
 int
 smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 {
 	int			nrels;
 	RelFileNode *rptr;
-	PendingRelDelete *pending;
+	ListCell	*p;
 
 	nrels = 0;
-	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
+	foreach(p, pendingDeletes)
 	{
+		PendingRelDelete *pending = lfirst(p);
+
 		if (pending->atCommit == forCommit)
 			nrels++;
 	}
@@ -668,14 +686,69 @@ smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr)
 	}
 	rptr = (RelFileNode *) palloc(nrels * sizeof(RelFileNode));
 	*ptr = rptr;
-	for (pending = pendingDeletes; pending != NULL; pending = pending->next)
+	foreach(p, pendingDeletes)
 	{
+		PendingRelDelete *pending = lfirst(p);
+
 		if (pending->atCommit == forCommit)
 			*rptr++ = pending->relnode;
 	}
 	return nrels;
 }
 
+/*
+ * AtSubStart_smgr() --- Take care of subtransaction start.
+ *
+ * Push the parent's pending-deletes list and start a new, empty one.
+ */
+void
+AtSubStart_smgr(void)
+{
+	MemoryContext	old_cxt;
+
+	/* Keep the list-of-lists in TopTransactionContext for simplicity */
+	old_cxt = MemoryContextSwitchTo(TopTransactionContext);
+
+	/* Stack the parent's list; the subtransaction starts with none */
+	upperPendingDeletes = lcons(pendingDeletes, upperPendingDeletes);
+	pendingDeletes = NIL;
+
+	MemoryContextSwitchTo(old_cxt);
+}
+
+/*
+ * AtSubCommit_smgr() --- Take care of subtransaction commit.
+ *
+ * Reassign all items in the pending deletes list to the parent transaction.
+ */
+void
+AtSubCommit_smgr(void)
+{
+	List	*parentPendingDeletes;
+
+	parentPendingDeletes = (List *) linitial(upperPendingDeletes);
+	upperPendingDeletes = list_delete_first(upperPendingDeletes);
+	/* Parent's entries come first, followed by this subtransaction's */
+	pendingDeletes = list_concat(parentPendingDeletes, pendingDeletes);
+}
+
+/*
+ * AtSubAbort_smgr() --- Take care of subtransaction abort.
+ *
+ * Delete created relations and forget about deleted relations.
+ * We can execute these operations immediately because we know this
+ * subtransaction will not commit.
+ */
+void
+AtSubAbort_smgr(void)
+{
+	smgrDoPendingDeletes(false);	/* false: perform the abort-time actions */
+
+	/* Must pop the stack, too: the parent's list becomes current again */
+	pendingDeletes = (List *) linitial(upperPendingDeletes);
+	upperPendingDeletes = list_delete_first(upperPendingDeletes);
+}
+
 /*
  *	smgrcommit() -- Prepare to commit changes made during the current
  *					transaction.
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 9b7cfcd6681..c42bd6c7bfb 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.421 2004/06/24 21:03:08 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tcop/postgres.c,v 1.422 2004/07/01 00:51:11 tgl Exp $
  *
  * NOTES
  *	  this is the "main" module of the postgres backend and
@@ -841,6 +841,7 @@ exec_simple_query(const char *query_string)
 				TransactionStmt *stmt = (TransactionStmt *) parsetree;
 
 				if (stmt->kind == TRANS_STMT_COMMIT ||
+					stmt->kind == TRANS_STMT_BEGIN ||
 					stmt->kind == TRANS_STMT_ROLLBACK)
 					allowit = true;
 			}
@@ -1161,6 +1162,7 @@ exec_parse_message(const char *query_string,	/* string to execute */
 				TransactionStmt *stmt = (TransactionStmt *) parsetree;
 
 				if (stmt->kind == TRANS_STMT_COMMIT ||
+					stmt->kind == TRANS_STMT_BEGIN ||
 					stmt->kind == TRANS_STMT_ROLLBACK)
 					allowit = true;
 			}
@@ -1623,6 +1625,7 @@ exec_execute_message(const char *portal_name, long max_rows)
 
 			is_trans_stmt = true;
 			if (stmt->kind == TRANS_STMT_COMMIT ||
+				stmt->kind == TRANS_STMT_BEGIN ||
 				stmt->kind == TRANS_STMT_ROLLBACK)
 				is_trans_exit = true;
 		}
diff --git a/src/backend/utils/cache/catcache.c b/src/backend/utils/cache/catcache.c
index 5e91a7283ec..8bfa3610bdb 100644
--- a/src/backend/utils/cache/catcache.c
+++ b/src/backend/utils/cache/catcache.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.112 2004/05/26 04:41:40 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/catcache.c,v 1.113 2004/07/01 00:51:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -360,6 +360,8 @@ CatCacheRemoveCTup(CatCache *cache, CatCTup *ct)
 	/* free associated tuple data */
 	if (ct->tuple.t_data != NULL)
 		pfree(ct->tuple.t_data);
+	if (ct->prev_refcount != NULL)
+		pfree(ct->prev_refcount);
 	pfree(ct);
 
 	--cache->cc_ntup;
@@ -394,6 +396,8 @@ CatCacheRemoveCList(CatCache *cache, CatCList *cl)
 	/* free associated tuple data */
 	if (cl->tuple.t_data != NULL)
 		pfree(cl->tuple.t_data);
+	if (cl->prev_refcount != NULL)
+		pfree(cl->prev_refcount);
 	pfree(cl);
 }
 
@@ -518,9 +522,9 @@ CreateCacheMemoryContext(void)
 	if (!CacheMemoryContext)
 		CacheMemoryContext = AllocSetContextCreate(TopMemoryContext,
 												   "CacheMemoryContext",
-												ALLOCSET_DEFAULT_MINSIZE,
-											   ALLOCSET_DEFAULT_INITSIZE,
-											   ALLOCSET_DEFAULT_MAXSIZE);
+												   ALLOCSET_DEFAULT_MINSIZE,
+												   ALLOCSET_DEFAULT_INITSIZE,
+												   ALLOCSET_DEFAULT_MAXSIZE);
 }
 
 
@@ -560,6 +564,13 @@ AtEOXact_CatCache(bool isCommit)
 				cl->refcount = 0;
 			}
 
+			/*
+			 * Reset the refcount stack.  Drop the item count to zero,
+			 * but don't deallocate the stack itself, so it can be used by
+			 * future subtransactions.
+			 */
+			cl->numpushes = 0;
+
 			/* Clean up any now-deletable dead entries */
 			if (cl->dead)
 				CatCacheRemoveCList(ccp, cl);
@@ -585,12 +596,174 @@ AtEOXact_CatCache(bool isCommit)
 			ct->refcount = 0;
 		}
 
+		/*
+		 * Reset the refcount stack.  Drop the item count to zero,
+		 * but don't deallocate the stack itself, so it can be used by
+		 * future subtransactions.
+		 */
+		ct->numpushes = 0;
+
 		/* Clean up any now-deletable dead entries */
 		if (ct->dead)
 			CatCacheRemoveCTup(ct->my_cache, ct);
 	}
 }
 
+/*
+ * AtSubStart_CatCache
+ *
+ * Saves reference counts of each entry at subtransaction start so they
+ * can be restored if the subtransaction later aborts.
+ */
+void
+AtSubStart_CatCache(void)
+{
+	CatCache   *ccp;
+	Dlelem	   *elt,
+			   *nextelt;
+	MemoryContext old_cxt;
+
+	/* Stacks outlive the transaction (see AtEOXact_CatCache) */
+	old_cxt = MemoryContextSwitchTo(CacheMemoryContext);
+
+	/*
+	 * Prepare CLists: push each list's refcount, enlarging its stack if full
+	 */
+	for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next)
+	{
+		for (elt = DLGetHead(&ccp->cc_lists); elt; elt = nextelt)
+		{
+			CatCList   *cl = (CatCList *) DLE_VAL(elt);
+
+			nextelt = DLGetSucc(elt);
+
+			if (cl->numpushes == cl->numalloc)
+			{
+				if (cl->numalloc == 0)
+				{
+					cl->numalloc = 8;
+					cl->prev_refcount = palloc(sizeof(int) * cl->numalloc);
+				}
+				else
+				{
+					cl->numalloc *= 2;
+					cl->prev_refcount = repalloc(cl->prev_refcount, cl->numalloc * sizeof(int));
+				}
+			}
+
+			cl->prev_refcount[cl->numpushes++] = cl->refcount;
+		}
+	}
+
+	/*
+	 * Prepare CTuples: same treatment for every tuple on the LRU list
+	 */
+	for (elt = DLGetHead(&CacheHdr->ch_lrulist); elt; elt = nextelt)
+	{
+		CatCTup    *ct = (CatCTup *) DLE_VAL(elt);
+
+		nextelt = DLGetSucc(elt);
+
+		if (ct->numpushes == ct->numalloc)
+		{
+			if (ct->numalloc == 0)
+			{
+				ct->numalloc = 8;
+				ct->prev_refcount = palloc(sizeof(int) * ct->numalloc);
+			}
+			else
+			{
+				ct->numalloc *= 2;
+				ct->prev_refcount = repalloc(ct->prev_refcount, sizeof(int) * ct->numalloc);
+			}
+		}
+
+		ct->prev_refcount[ct->numpushes++] = ct->refcount;
+	}
+
+	MemoryContextSwitchTo(old_cxt);
+}
+
+void
+AtEOSubXact_CatCache(bool isCommit)
+{
+	CatCache   *ccp;
+	Dlelem	   *elt,
+			   *nextelt;
+
+	/*
+	 * Restore CLists: pop the refcount saved by AtSubStart_CatCache
+	 */
+	for (ccp = CacheHdr->ch_caches; ccp; ccp = ccp->cc_next)
+	{
+		for (elt = DLGetHead(&ccp->cc_lists); elt; elt = nextelt)
+		{
+			CatCList   *cl = (CatCList *) DLE_VAL(elt);
+
+			nextelt = DLGetSucc(elt);
+
+			/*
+			 * During commit, check whether the count is what
+			 * we expect.
+			 */
+			if (isCommit)
+			{
+				int expected_refcount;
+				if (cl->numpushes > 0)
+					expected_refcount = cl->prev_refcount[cl->numpushes - 1];
+				else
+					expected_refcount = 0;
+
+				if (cl->refcount != expected_refcount)
+					elog(WARNING, "catcache reference leak");
+			}
+
+			/*
+			 * During abort we have to restore the original count;
+			 * during commit, we have to restore in case of a leak,
+			 * and it won't harm if this is the expected count.
+			 */
+			if (cl->numpushes > 0)
+				cl->refcount = cl->prev_refcount[--cl->numpushes];
+			else
+				cl->refcount = 0;
+		}
+	}
+
+	/*
+	 * Restore CTuples: likewise for every tuple on the LRU list
+	 */
+	for (elt = DLGetHead(&CacheHdr->ch_lrulist); elt; elt = nextelt)
+	{
+		CatCTup    *ct = (CatCTup *) DLE_VAL(elt);
+
+		nextelt = DLGetSucc(elt);
+
+		if (isCommit)
+		{
+			int expected_refcount;
+
+			if (ct->numpushes > 0)
+				expected_refcount = ct->prev_refcount[ct->numpushes - 1];
+			else
+				expected_refcount = 0;
+
+			if (ct->refcount != expected_refcount)
+				elog(WARNING, "catcache reference leak");
+		}
+
+		/*
+		 * During abort we have to restore the original count;
+		 * during commit, we have to restore in case of a leak,
+		 * and it won't harm if this is the expected count.
+		 */
+		if (ct->numpushes > 0)
+			ct->refcount = ct->prev_refcount[--ct->numpushes];
+		else
+			ct->refcount = 0;
+	}
+}
+
 /*
  *		ResetCatalogCache
  *
@@ -1505,6 +1678,9 @@ SearchCatCacheList(CatCache *cache,
 	cl->my_cache = cache;
 	DLInitElem(&cl->cache_elem, (void *) cl);
 	cl->refcount = 1;			/* count this first reference */
+	cl->prev_refcount = NULL;
+	cl->numpushes = 0;
+	cl->numalloc = 0;
 	cl->dead = false;
 	cl->ordered = ordered;
 	cl->nkeys = nkeys;
@@ -1603,6 +1779,9 @@ CatalogCacheCreateEntry(CatCache *cache, HeapTuple ntp,
 	ct->dead = false;
 	ct->negative = negative;
 	ct->hash_value = hashValue;
+	ct->prev_refcount = NULL;
+	ct->numpushes = 0;
+	ct->numalloc = 0;
 
 	DLAddHead(&CacheHdr->ch_lrulist, &ct->lrulist_elem);
 	DLAddHead(&cache->cc_bucket[hashIndex], &ct->cache_elem);
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index ea958a27b46..e54a74fae4b 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -33,6 +33,10 @@
  *	to record the transaction commit before sending SI messages, otherwise
  *	the other backends won't see our updated tuples as good.
  *
+ *	When a subtransaction aborts, we can process and discard any events
+ *	it has queued.  When a subtransaction commits, we just add its events
+ *	to the pending lists of the parent transaction.
+ *
  *	In short, we need to remember until xact end every insert or delete
  *	of a tuple that might be in the system caches.	Updates are treated as
  *	two events, delete + insert, for simplicity.  (There are cases where
@@ -66,15 +70,17 @@
  *	manipulating the init file is in relcache.c, but we keep track of the
  *	need for it here.
  *
- *	All the request lists are kept in TopTransactionContext memory, since
- *	they need not live beyond the end of the current transaction.
+ *	The request lists proper are kept in CurTransactionContext of their
+ *	creating (sub)transaction, since they can be forgotten on abort of that
+ *	transaction but must be kept till top-level commit otherwise.  For
+ *	simplicity we keep the controlling list-of-lists in TopTransactionContext.
  *
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.62 2004/06/18 06:13:52 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.63 2004/07/01 00:51:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -95,7 +101,7 @@
  * To minimize palloc traffic, we keep pending requests in successively-
  * larger chunks (a slightly more sophisticated version of an expansible
  * array).	All request types can be stored as SharedInvalidationMessage
- * records.
+ * records.  The ordering of requests within a list is never significant.
  */
 typedef struct InvalidationChunk
 {
@@ -112,12 +118,15 @@ typedef struct InvalidationListHeader
 } InvalidationListHeader;
 
 /*----------------
- *	Invalidation info is divided into two lists:
+ * Invalidation info is divided into two lists:
  *	1) events so far in current command, not yet reflected to caches.
  *	2) events in previous commands of current transaction; these have
  *	   been reflected to local caches, and must be either broadcast to
  *	   other backends or rolled back from local cache when we commit
  *	   or abort the transaction.
+ * Actually, we need two such lists for each level of nested transaction,
+ * so that we can discard events from an aborted subtransaction.  When
+ * a subtransaction commits, we append its lists to the parent's lists.
  *
  * The relcache-file-invalidated flag can just be a simple boolean,
  * since we only act on it at transaction commit; we don't care which
@@ -125,13 +134,22 @@ typedef struct InvalidationListHeader
  *----------------
  */
 
-/* head of current-command event list */
-static InvalidationListHeader CurrentCmdInvalidMsgs;
+typedef struct TransInvalidationInfo
+{
+	/* Back link to parent transaction's info */
+	struct TransInvalidationInfo *parent;
+
+	/* head of current-command event list */
+	InvalidationListHeader CurrentCmdInvalidMsgs;
 
-/* head of previous-commands event list */
-static InvalidationListHeader PriorCmdInvalidMsgs;
+	/* head of previous-commands event list */
+	InvalidationListHeader PriorCmdInvalidMsgs;
 
-static bool RelcacheInitFileInval;		/* init file must be invalidated? */
+	/* init file must be invalidated? */
+	bool RelcacheInitFileInval;
+} TransInvalidationInfo;
+
+static TransInvalidationInfo *transInvalInfo = NULL;
 
 /*
  * Dynamically-registered callback functions.  Current implementation
@@ -176,7 +194,7 @@ AddInvalidationMessage(InvalidationChunk **listHdr,
 		/* First time through; create initial chunk */
 #define FIRSTCHUNKSIZE 16
 		chunk = (InvalidationChunk *)
-			MemoryContextAlloc(TopTransactionContext,
+			MemoryContextAlloc(CurTransactionContext,
 							   sizeof(InvalidationChunk) +
 				(FIRSTCHUNKSIZE - 1) *sizeof(SharedInvalidationMessage));
 		chunk->nitems = 0;
@@ -190,7 +208,7 @@ AddInvalidationMessage(InvalidationChunk **listHdr,
 		int			chunksize = 2 * chunk->maxitems;
 
 		chunk = (InvalidationChunk *)
-			MemoryContextAlloc(TopTransactionContext,
+			MemoryContextAlloc(CurTransactionContext,
 							   sizeof(InvalidationChunk) +
 					 (chunksize - 1) *sizeof(SharedInvalidationMessage));
 		chunk->nitems = 0;
@@ -203,29 +221,6 @@ AddInvalidationMessage(InvalidationChunk **listHdr,
 	chunk->nitems++;
 }
 
-/*
- * Free a list of inval message chunks.
- *
- * NOTE: when we are about to commit or abort a transaction, it's
- * not really necessary to pfree the lists explicitly, since they will
- * go away anyway when TopTransactionContext is destroyed.
- */
-static void
-FreeInvalidationMessageList(InvalidationChunk **listHdr)
-{
-	InvalidationChunk *chunk = *listHdr;
-
-	*listHdr = NULL;
-
-	while (chunk != NULL)
-	{
-		InvalidationChunk *nextchunk = chunk->next;
-
-		pfree(chunk);
-		chunk = nextchunk;
-	}
-}
-
 /*
  * Append one list of invalidation message chunks to another, resetting
  * the source chunk-list pointer to NULL.
@@ -331,31 +326,6 @@ AppendInvalidationMessages(InvalidationListHeader *dest,
 	AppendInvalidationMessageList(&dest->rclist, &src->rclist);
 }
 
-/*
- * Reset an invalidation list to empty
- *
- * physicalFree may be set false if caller knows transaction is ending
- */
-static void
-DiscardInvalidationMessages(InvalidationListHeader *hdr, bool physicalFree)
-{
-	if (physicalFree)
-	{
-		/* Physically pfree the list data */
-		FreeInvalidationMessageList(&hdr->cclist);
-		FreeInvalidationMessageList(&hdr->rclist);
-	}
-	else
-	{
-		/*
-		 * Assume the storage will go away at xact end, just reset
-		 * pointers
-		 */
-		hdr->cclist = NULL;
-		hdr->rclist = NULL;
-	}
-}
-
 /*
  * Execute the given function for all the messages in an invalidation list.
  * The list is not altered.
@@ -386,7 +356,7 @@ RegisterCatcacheInvalidation(int cacheId,
 							 ItemPointer tuplePtr,
 							 Oid dbId)
 {
-	AddCatcacheInvalidationMessage(&CurrentCmdInvalidMsgs,
+	AddCatcacheInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
 								   cacheId, hashValue, tuplePtr, dbId);
 }
 
@@ -398,7 +368,7 @@ RegisterCatcacheInvalidation(int cacheId,
 static void
 RegisterRelcacheInvalidation(Oid dbId, Oid relId, RelFileNode physId)
 {
-	AddRelcacheInvalidationMessage(&CurrentCmdInvalidMsgs,
+	AddRelcacheInvalidationMessage(&transInvalInfo->CurrentCmdInvalidMsgs,
 								   dbId, relId, physId);
 
 	/*
@@ -406,7 +376,7 @@ RegisterRelcacheInvalidation(Oid dbId, Oid relId, RelFileNode physId)
 	 * relcache init file, mark that we need to zap that file at commit.
 	 */
 	if (RelationIdIsInInitFile(relId))
-		RelcacheInitFileInval = true;
+		transInvalInfo->RelcacheInitFileInval = true;
 }
 
 /*
@@ -619,8 +589,38 @@ AcceptInvalidationMessages(void)
 }
 
 /*
- * AtEOXactInvalidationMessages
- *		Process queued-up invalidation messages at end of transaction.
+ * AtStart_Inval
+ *		Initialize inval lists at start of a main transaction.
+ */
+void
+AtStart_Inval(void)
+{
+	Assert(transInvalInfo == NULL);
+	transInvalInfo = (TransInvalidationInfo *)
+		MemoryContextAllocZero(TopTransactionContext,
+							   sizeof(TransInvalidationInfo));
+}
+
+/*
+ * AtSubStart_Inval
+ *		Initialize inval lists at start of a subtransaction.
+ */
+void
+AtSubStart_Inval(void)
+{
+	TransInvalidationInfo *myInfo;
+
+	Assert(transInvalInfo != NULL);
+	myInfo = (TransInvalidationInfo *)
+		MemoryContextAllocZero(TopTransactionContext,
+							   sizeof(TransInvalidationInfo));
+	myInfo->parent = transInvalInfo;
+	transInvalInfo = myInfo;
+}
+
+/*
+ * AtEOXact_Inval
+ *		Process queued-up invalidation messages at end of main transaction.
  *
  * If isCommit, we must send out the messages in our PriorCmdInvalidMsgs list
  * to the shared invalidation message queue.  Note that these will be read
@@ -643,8 +643,11 @@ AcceptInvalidationMessages(void)
  *		This should be called as the last step in processing a transaction.
  */
 void
-AtEOXactInvalidationMessages(bool isCommit)
+AtEOXact_Inval(bool isCommit)
 {
+	/* Must be at top of stack */
+	Assert(transInvalInfo != NULL && transInvalInfo->parent == NULL);
+
 	if (isCommit)
 	{
 		/*
@@ -652,28 +655,77 @@ AtEOXactInvalidationMessages(bool isCommit)
 		 * and after we send the SI messages.  However, we need not do
 		 * anything unless we committed.
 		 */
-		if (RelcacheInitFileInval)
+		if (transInvalInfo->RelcacheInitFileInval)
 			RelationCacheInitFileInvalidate(true);
 
-		AppendInvalidationMessages(&PriorCmdInvalidMsgs,
-								   &CurrentCmdInvalidMsgs);
+		AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
+								   &transInvalInfo->CurrentCmdInvalidMsgs);
 
-		ProcessInvalidationMessages(&PriorCmdInvalidMsgs,
+		ProcessInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
 									SendSharedInvalidMessage);
 
-		if (RelcacheInitFileInval)
+		if (transInvalInfo->RelcacheInitFileInval)
 			RelationCacheInitFileInvalidate(false);
 	}
 	else
 	{
-		ProcessInvalidationMessages(&PriorCmdInvalidMsgs,
+		ProcessInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
 									LocalExecuteInvalidationMessage);
 	}
 
-	RelcacheInitFileInval = false;
+	/* Need not free anything explicitly */
+	transInvalInfo = NULL;
+}
+
+/*
+ * AtSubEOXact_Inval
+ *		Process queued-up invalidation messages at end of subtransaction.
+ *
+ * If isCommit, process CurrentCmdInvalidMsgs if any (there probably aren't),
+ * and then attach both CurrentCmdInvalidMsgs and PriorCmdInvalidMsgs to the
+ * parent's PriorCmdInvalidMsgs list.
+ *
+ * If not isCommit, we are aborting, and must locally process the messages
+ * in PriorCmdInvalidMsgs.  No messages need be sent to other backends.
+ * We can forget about CurrentCmdInvalidMsgs too, since those changes haven't
+ * touched the caches yet.
+ *
+ * In any case, pop the transaction stack.  Apart from the stack entry itself,
+ * we need not physically free memory here, since CurTransactionContext is
+ * about to be emptied anyway (if aborting).
+ */
+void
+AtSubEOXact_Inval(bool isCommit)
+{
+	TransInvalidationInfo *myInfo = transInvalInfo;
+
+	/* Must be at non-top of stack */
+	Assert(myInfo != NULL && myInfo->parent != NULL);
+
+	if (isCommit)
+	{
+		/* If CurrentCmdInvalidMsgs still has anything, fix it */
+		CommandEndInvalidationMessages();
+
+		/* Pass up my inval messages to parent */
+		AppendInvalidationMessages(&myInfo->parent->PriorCmdInvalidMsgs,
+								   &myInfo->PriorCmdInvalidMsgs);
 
-	DiscardInvalidationMessages(&PriorCmdInvalidMsgs, false);
-	DiscardInvalidationMessages(&CurrentCmdInvalidMsgs, false);
+		/* Pending relcache inval becomes parent's problem too */
+		if (myInfo->RelcacheInitFileInval)
+			myInfo->parent->RelcacheInitFileInval = true;
+	}
+	else
+	{
+		ProcessInvalidationMessages(&myInfo->PriorCmdInvalidMsgs,
+									LocalExecuteInvalidationMessage);
+	}
+
+	/* Pop the transaction state stack */
+	transInvalInfo = myInfo->parent;
+
+	/* Need not free anything else explicitly */
+	pfree(myInfo);
 }
 
 /*
@@ -687,27 +739,25 @@ AtEOXactInvalidationMessages(bool isCommit)
  * current command.  We then move the current-cmd list over to become part
  * of the prior-cmds list.
  *
- * The isCommit = false case is not currently used, but may someday be
- * needed to support rollback to a savepoint within a transaction.
- *
  * Note:
  *		This should be called during CommandCounterIncrement(),
  *		after we have advanced the command ID.
  */
 void
-CommandEndInvalidationMessages(bool isCommit)
+CommandEndInvalidationMessages(void)
 {
-	if (isCommit)
-	{
-		ProcessInvalidationMessages(&CurrentCmdInvalidMsgs,
-									LocalExecuteInvalidationMessage);
-		AppendInvalidationMessages(&PriorCmdInvalidMsgs,
-								   &CurrentCmdInvalidMsgs);
-	}
-	else
-	{
-		/* XXX what needs to be done here? */
-	}
+	/*
+	 * You might think this shouldn't be called outside any transaction,
+	 * but bootstrap does it, and also ABORT issued when not in a transaction.
+	 * So just quietly return if no state to work on.
+	 */
+	if (transInvalInfo == NULL)
+		return;
+
+	ProcessInvalidationMessages(&transInvalInfo->CurrentCmdInvalidMsgs,
+								LocalExecuteInvalidationMessage);
+	AppendInvalidationMessages(&transInvalInfo->PriorCmdInvalidMsgs,
+							   &transInvalInfo->CurrentCmdInvalidMsgs);
 }
 
 /*
diff --git a/src/backend/utils/cache/relcache.c b/src/backend/utils/cache/relcache.c
index ee8b46407e1..23428992724 100644
--- a/src/backend/utils/cache/relcache.c
+++ b/src/backend/utils/cache/relcache.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.205 2004/06/18 06:13:52 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/relcache.c,v 1.206 2004/07/01 00:51:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -273,6 +273,8 @@ static void IndexSupportInitialize(Form_pg_index iform,
 static OpClassCacheEnt *LookupOpclassInfo(Oid operatorClassOid,
 				  StrategyNumber numStrats,
 				  StrategyNumber numSupport);
+static inline void RelationPushReferenceCount(Relation rel);
+static inline void RelationPopReferenceCount(Relation rel);
 
 
 /*
@@ -1678,6 +1680,8 @@ RelationClearRelation(Relation relation, bool rebuild)
 	list_free(relation->rd_indexlist);
 	if (relation->rd_indexcxt)
 		MemoryContextDelete(relation->rd_indexcxt);
+	if (relation->rd_prevrefcnt)
+		pfree(relation->rd_prevrefcnt);
 
 	/*
 	 * If we're really done with the relcache entry, blow it away. But if
@@ -1968,7 +1972,7 @@ RelationCacheInvalidate(void)
  * we must reset refcnts before handling pending invalidations.
  */
 void
-AtEOXact_RelationCache(bool commit)
+AtEOXact_RelationCache(bool isCommit)
 {
 	HASH_SEQ_STATUS status;
 	RelIdCacheEnt *idhentry;
@@ -1993,7 +1997,7 @@ AtEOXact_RelationCache(bool commit)
 		 */
 		if (relation->rd_isnew)
 		{
-			if (commit)
+			if (isCommit)
 				relation->rd_isnew = false;
 			else
 			{
@@ -2019,7 +2023,7 @@ AtEOXact_RelationCache(bool commit)
 		 */
 		expected_refcnt = relation->rd_isnailed ? 1 : 0;
 
-		if (commit)
+		if (isCommit)
 		{
 			if (relation->rd_refcnt != expected_refcnt &&
 				!IsBootstrapProcessingMode())
@@ -2036,6 +2040,12 @@ AtEOXact_RelationCache(bool commit)
 			RelationSetReferenceCount(relation, expected_refcnt);
 		}
 
+		/*
+		 * Reset the refcount stack.  Just drop the item count; don't deallocate
+		 * the stack itself so it can be reused by future subtransactions.
+		 */
+		relation->rd_numpushed = 0;
+
 		/*
 		 * Flush any temporary index list.
 		 */
@@ -2048,6 +2058,131 @@ AtEOXact_RelationCache(bool commit)
 	}
 }
 
+/*
+ * RelationPushReferenceCount
+ *
+ * Push the current reference count into the stack.  Don't modify the
+ * reference count itself.
+ */
+static inline void
+RelationPushReferenceCount(Relation rel)
+{
+	/* Enlarge the stack if we run out of space. */
+	if (rel->rd_numpushed == rel->rd_numalloc)
+	{
+		MemoryContext	old_cxt = MemoryContextSwitchTo(CacheMemoryContext);
+
+		if (rel->rd_numalloc == 0)
+		{
+			rel->rd_numalloc = 8;
+			rel->rd_prevrefcnt = palloc(rel->rd_numalloc * sizeof(int));
+		}
+		else
+		{
+			rel->rd_numalloc *= 2;
+			rel->rd_prevrefcnt = repalloc(rel->rd_prevrefcnt, rel->rd_numalloc * sizeof(int));
+		}
+
+		MemoryContextSwitchTo(old_cxt);
+	}
+
+	rel->rd_prevrefcnt[rel->rd_numpushed++] = rel->rd_refcnt;
+}
+
+/*
+ * RelationPopReferenceCount
+ *
+ * Pop the latest stored reference count.  If there is none, reset it to zero
+ * (one if nailed); the entry was created in the current subtransaction.
+ */
+static inline void
+RelationPopReferenceCount(Relation rel)
+{
+	if (rel->rd_numpushed == 0)
+	{
+		rel->rd_refcnt = rel->rd_isnailed ? 1 : 0;
+		return;
+	}
+
+	rel->rd_refcnt = rel->rd_prevrefcnt[--rel->rd_numpushed];
+}
+
+/*
+ * AtEOSubXact_RelationCache
+ */
+void
+AtEOSubXact_RelationCache(bool isCommit)
+{
+	HASH_SEQ_STATUS status;
+	RelIdCacheEnt *idhentry;
+
+	/* We'd better not be bootstrapping. */
+	Assert(!IsBootstrapProcessingMode());
+
+	hash_seq_init(&status, RelationIdCache);
+
+	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
+	{
+		Relation	relation = idhentry->reldesc;
+
+		/*
+		 * During subtransaction commit, we first check whether the
+		 * current refcount is correct: if there is no item in the stack,
+		 * the relcache entry was created during this subtransaction, so it
+		 * should be 0 (or 1 for nailed relations).  If the stack has at least
+		 * one item, the expected count is whatever the topmost item is.
+		 */
+		if (isCommit)
+		{
+			int expected_refcnt;
+
+			if (relation->rd_numpushed == 0)
+				expected_refcnt = relation->rd_isnailed ? 1 : 0;
+			else
+				expected_refcnt = relation->rd_prevrefcnt[relation->rd_numpushed - 1];
+
+			if (relation->rd_refcnt != expected_refcnt)
+			{
+				elog(WARNING, "relcache reference leak: relation \"%s\" has refcnt %d instead of %d",
+						RelationGetRelationName(relation),
+						relation->rd_refcnt, expected_refcnt);
+			}
+		}
+
+		/*
+		 * On commit, the expected count is stored so there's no harm in
+		 * popping it (and we may need to fix if there was a leak); and during
+		 * abort, the correct refcount has to be restored.
+		 */
+		RelationPopReferenceCount(relation);
+	}
+}
+
+/*
+ * AtSubStart_RelationCache
+ *
+ * At subtransaction start, we push the current reference count into
+ * the refcount stack, so it can be restored if the subtransaction aborts.
+ */
+void
+AtSubStart_RelationCache(void)
+{
+	HASH_SEQ_STATUS status;
+	RelIdCacheEnt *idhentry;
+
+	/* We'd better not be bootstrapping. */
+	Assert(!IsBootstrapProcessingMode());
+
+	hash_seq_init(&status, RelationIdCache);
+
+	while ((idhentry = (RelIdCacheEnt *) hash_seq_search(&status)) != NULL)
+	{
+		Relation	relation = idhentry->reldesc;
+
+		RelationPushReferenceCount(relation);
+	}
+}
+
 /*
  *		RelationBuildLocalRelation
  *			Build a relcache entry for an about-to-be-created relation,
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 48d28d429f2..3caf18c5f33 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.134 2004/06/18 06:13:54 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/init/postinit.c,v 1.135 2004/07/01 00:51:20 tgl Exp $
  *
  *
  *-------------------------------------------------------------------------
@@ -27,7 +27,6 @@
 #include "catalog/pg_database.h"
 #include "catalog/pg_shadow.h"
 #include "catalog/pg_tablespace.h"
-#include "commands/trigger.h"
 #include "mb/pg_wchar.h"
 #include "miscadmin.h"
 #include "postmaster/postmaster.h"
@@ -350,12 +349,6 @@ InitPostgres(const char *dbname, const char *username)
 	/* Initialize portal manager */
 	EnablePortalManager();
 
-	/*
-	 * Initialize the deferred trigger manager --- must happen before
-	 * first transaction start.
-	 */
-	DeferredTriggerInit();
-
 	/* start a new transaction here before access to db */
 	if (!bootstrap)
 		StartTransactionCommand();
diff --git a/src/backend/utils/misc/README b/src/backend/utils/misc/README
index 12a2cdef036..3ea838b1f53 100644
--- a/src/backend/utils/misc/README
+++ b/src/backend/utils/misc/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/utils/misc/README,v 1.4 2004/01/19 19:04:40 tgl Exp $
+$PostgreSQL: pgsql/src/backend/utils/misc/README,v 1.5 2004/07/01 00:51:24 tgl Exp $
 
 
 GUC IMPLEMENTATION NOTES
@@ -68,49 +68,66 @@ SET on transaction abort, and rollback of SET LOCAL at transaction end
 would be effective had there never been any SET commands in the current
 session.
 
-To handle these cases we must keep track of as many as four distinct
-values for each variable.  They are:
+To handle these cases we must keep track of many distinct values for each
+variable.  The primary values are:
 
 * actual variable contents	always the current effective value
 
 * reset_value			the value to use for RESET
 
-* session_value			the "committed" setting for the session
-
 * tentative_value		the uncommitted result of SET
 
-During initialization we set the first three of these (actual, reset_value,
-and session_value) based on whichever non-interactive source has the
-highest priority.  All three will have the same value.
+The reason we need a tentative_value separate from the actual value is
+that when a transaction does SET followed by SET LOCAL, the actual value
+will now be the LOCAL value, but we want to remember the prior SET so that
+its value is restored at transaction commit.
+
+In addition, for each level of transaction (possibly nested) we have to
+remember the transaction-entry-time actual and tentative values, in case
+we need to restore them at transaction end.  (The RESET value is essentially
+non-transactional, so it doesn't have to be stacked.)  For efficiency these
+stack entries are not constructed until/unless the variable is actually SET
+within a particular transaction.
+
+During initialization we set the actual value and reset_value based on
+whichever non-interactive source has the highest priority.  They will
+have the same value.  The tentative_value is not meaningful at this point.
+
+A SET command starts by stacking the existing actual and tentative values
+if this hasn't already been done within the current transaction.  Then:
 
 A SET LOCAL command sets the actual variable (and nothing else).  At
-transaction end, the session_value is used to restore the actual variable
-to its pre-transaction value.
+transaction end, the stacked values are used to restore the GUC entry
+to its pre-transaction state.
 
 A SET (or SET SESSION) command sets the actual variable, and if no error,
 then sets the tentative_value.  If the transaction commits, the
-tentative_value is assigned to the session_value and the actual variable
-(which could by now be different, if the SET was followed by SET LOCAL).
-If the transaction aborts, the tentative_value is discarded and the
-actual variable is restored from the session_value.
+tentative_value is assigned again to the actual variable (which could by
+now be different, if the SET was followed by SET LOCAL).  If the
+transaction aborts, the stacked values are used to restore the GUC entry
+to its pre-transaction state.
+
+In the case of SET within nested subtransactions, at each commit the
+tentative_value propagates out to the next transaction level.  It will
+be thrown away at abort of any level, or after exiting the top transaction.
 
 RESET is executed like a SET, but using the reset_value as the desired new
 value.  (We do not provide a RESET LOCAL command, but SET LOCAL TO DEFAULT
 has the same behavior that RESET LOCAL would.)  The source associated with
-the reset_value also becomes associated with the actual and session values.
+the reset_value also becomes associated with the actual and tentative values.
 
 If SIGHUP is received, the GUC code rereads the postgresql.conf
 configuration file (this does not happen in the signal handler, but at
 next return to main loop; note that it can be executed while within a
 transaction).  New values from postgresql.conf are assigned to actual
-variable, reset_value, and session_value, but only if each of these has a
-current source priority <= PGC_S_FILE.  (It is thus possible for
-reset_value to track the config-file setting even if there is currently
-a different interactive value of the actual variable.)
+variable, reset_value, and stacked actual values, but only if each of
+these has a current source priority <= PGC_S_FILE.  (It is thus possible
+for reset_value to track the config-file setting even if there is
+currently a different interactive value of the actual variable.)
 
 Note that tentative_value is unused and undefined except between a SET
 command and the end of the transaction.  Also notice that we must track
-the source associated with each of the four values.
+the source associated with each one of the values.
 
 The assign_hook and show_hook routines work only with the actual variable,
 and are not directly aware of the additional values maintained by GUC.
@@ -129,9 +146,9 @@ pstrdup/palloc mechanisms.  We would need to keep them in a permanent
 context anyway, and strdup gives us more control over handling
 out-of-memory failures.
 
-We allow a variable's actual value, reset_val, session_val, and
-tentative_val to point at the same storage.  This makes it slightly harder
-to free space (must test that the value to be freed isn't equal to any of
-the other three pointers).  The main advantage is that we never need to
-strdup during transaction commit/abort, so cannot cause an out-of-memory
-failure there.
+We allow a string variable's actual value, reset_val, tentative_val, and
+stacked copies of same to point at the same storage.  This makes it
+slightly harder to free space (must test that a value to be freed isn't
+equal to any of the other pointers in the GUC entry or associated stack
+items).  The main advantage is that we never need to strdup during
+transaction commit/abort, so cannot cause an out-of-memory failure there.
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index f050e201e2a..f5c16de83ba 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,17 +10,16 @@
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.211 2004/06/11 03:54:54 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.212 2004/07/01 00:51:24 tgl Exp $
  *
  *--------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <errno.h>
+#include <ctype.h>
 #include <float.h>
 #include <limits.h>
 #include <unistd.h>
-#include <ctype.h>
 
 #include "utils/guc.h"
 #include "utils/guc_tables.h"
@@ -54,6 +53,7 @@
 #include "tcop/tcopprot.h"
 #include "utils/array.h"
 #include "utils/builtins.h"
+#include "utils/memutils.h"
 #include "utils/pg_locale.h"
 #include "pgstat.h"
 
@@ -105,6 +105,7 @@ static const char *assign_custom_variable_classes(const char *newval, bool doit,
 						   GucSource source);
 static bool assign_stage_log_stats(bool newval, bool doit, GucSource source);
 static bool assign_log_stats(bool newval, bool doit, GucSource source);
+static bool assign_transaction_read_only(bool newval, bool doit, GucSource source);
 
 
 /*
@@ -174,45 +175,6 @@ static int	max_identifier_length;
 static int	block_size;
 static bool integer_datetimes;
 
-/* Macros for freeing malloc'd pointers only if appropriate to do so */
-/* Some of these tests are probably redundant, but be safe ... */
-#define SET_STRING_VARIABLE(rec, newval) \
-	do { \
-		if (*(rec)->variable && \
-			*(rec)->variable != (rec)->reset_val && \
-			*(rec)->variable != (rec)->session_val && \
-			*(rec)->variable != (rec)->tentative_val) \
-			free(*(rec)->variable); \
-		*(rec)->variable = (newval); \
-	} while (0)
-#define SET_STRING_RESET_VAL(rec, newval) \
-	do { \
-		if ((rec)->reset_val && \
-			(rec)->reset_val != *(rec)->variable && \
-			(rec)->reset_val != (rec)->session_val && \
-			(rec)->reset_val != (rec)->tentative_val) \
-			free((rec)->reset_val); \
-		(rec)->reset_val = (newval); \
-	} while (0)
-#define SET_STRING_SESSION_VAL(rec, newval) \
-	do { \
-		if ((rec)->session_val && \
-			(rec)->session_val != *(rec)->variable && \
-			(rec)->session_val != (rec)->reset_val && \
-			(rec)->session_val != (rec)->tentative_val) \
-			free((rec)->session_val); \
-		(rec)->session_val = (newval); \
-	} while (0)
-#define SET_STRING_TENTATIVE_VAL(rec, newval) \
-	do { \
-		if ((rec)->tentative_val && \
-			(rec)->tentative_val != *(rec)->variable && \
-			(rec)->tentative_val != (rec)->reset_val && \
-			(rec)->tentative_val != (rec)->session_val) \
-			free((rec)->tentative_val); \
-		(rec)->tentative_val = (newval); \
-	} while (0)
-
 
 /*
  * Displayable names for context types (enum GucContext)
@@ -801,7 +763,7 @@ static struct config_bool ConfigureNamesBool[] =
 			GUC_NO_RESET_ALL | GUC_NOT_IN_SAMPLE | GUC_DISALLOW_IN_FILE
 		},
 		&XactReadOnly,
-		false, NULL, NULL
+		false, assign_transaction_read_only, NULL
 	},
 	{
 		{"add_missing_from", PGC_USERSET, COMPAT_OPTIONS_PREVIOUS,
@@ -1766,14 +1728,13 @@ static const char * const map_old_guc_names[] = {
  */
 static struct config_generic **guc_variables;
 
-/* Current number of variables contained in the vector
- */
+/* Current number of variables contained in the vector */
 static int num_guc_variables;
 
-/* Vector capacity
- */
+/* Vector capacity */
 static int size_guc_variables;
 
+
 static bool guc_dirty;			/* TRUE if need to do commit/abort work */
 
 static bool reporting_enabled;	/* TRUE to enable GUC_REPORT */
@@ -1783,14 +1744,71 @@ static char *guc_string_workspace;		/* for avoiding memory leaks */
 
 static int	guc_var_compare(const void *a, const void *b);
 static int	guc_name_compare(const char *namea, const char *nameb);
+static void push_old_value(struct config_generic *gconf);
 static void ReportGUCOption(struct config_generic * record);
 static char *_ShowOption(struct config_generic * record);
 
-struct config_generic** get_guc_variables()
+
+/*
+ * Support for assigning to a field of a string GUC item.  Free the prior
+ * value if it's not referenced anywhere else in the item (including stacked
+ * states).
+ */
+static void
+set_string_field(struct config_string *conf, char **field, char *newval)
+{
+	char	*oldval = *field;
+	GucStack *stack;
+
+	/* Do the assignment */
+	*field = newval;
+
+	/* Exit if any duplicate references, or if old value was NULL anyway */
+	if (oldval == NULL ||
+		oldval == *(conf->variable) ||
+		oldval == conf->reset_val ||
+		oldval == conf->tentative_val)
+		return;
+	for (stack = conf->gen.stack; stack; stack = stack->prev)
+	{
+		if (oldval == stack->tentative_val.stringval ||
+			oldval == stack->value.stringval)
+			return;
+	}
+
+	/* Not used anymore, so free it */
+	free(oldval);
+}
+
+/*
+ * Detect whether strval is referenced anywhere in a GUC string item
+ */
+static bool
+string_field_used(struct config_string *conf, char *strval)
+{
+	GucStack *stack;
+
+	if (strval == *(conf->variable) ||
+		strval == conf->reset_val ||
+		strval == conf->tentative_val)
+		return true;
+	for (stack = conf->gen.stack; stack; stack = stack->prev)
+	{
+		if (strval == stack->tentative_val.stringval ||
+			strval == stack->value.stringval)
+			return true;
+	}
+	return false;
+}
+
+
+struct config_generic **
+get_guc_variables(void)
 {
 	return guc_variables;
 }
 
+
 /*
  * Build the sorted array.	This is split out so that it could be
  * re-executed after startup (eg, we could allow loadable modules to
@@ -2001,14 +2019,13 @@ find_option(const char *name)
 			return find_option(map_old_guc_names[i+1]);
 	}
 
-	/* Check if the name is qualified, and if so, check if the qualifier
+	/*
+	 * Check if the name is qualified, and if so, check if the qualifier
 	 * maps to a custom variable class.
 	 */
 	dot = strchr(name, GUC_QUALIFIER_SEPARATOR);
 	if(dot != NULL && is_custom_class(name, dot - name))
-		/*
-		 * Add a placeholder variable for this name
-		 */
+		/* Add a placeholder variable for this name */
 		return (struct config_generic*)add_placeholder_variable(name);
 
 	/* Unknown name */
@@ -2081,9 +2098,9 @@ InitializeGUCOptions(void)
 
 		gconf->status = 0;
 		gconf->reset_source = PGC_S_DEFAULT;
-		gconf->session_source = PGC_S_DEFAULT;
 		gconf->tentative_source = PGC_S_DEFAULT;
 		gconf->source = PGC_S_DEFAULT;
+		gconf->stack = NULL;
 
 		switch (gconf->vartype)
 		{
@@ -2097,7 +2114,6 @@ InitializeGUCOptions(void)
 							elog(FATAL, "failed to initialize %s to %d",
 								 conf->gen.name, (int) conf->reset_val);
 					*conf->variable = conf->reset_val;
-					conf->session_val = conf->reset_val;
 					break;
 				}
 			case PGC_INT:
@@ -2119,7 +2135,6 @@ InitializeGUCOptions(void)
 							elog(FATAL, "failed to initialize %s to %d",
 								 conf->gen.name, conf->reset_val);
 					*conf->variable = conf->reset_val;
-					conf->session_val = conf->reset_val;
 					break;
 				}
 			case PGC_REAL:
@@ -2135,7 +2150,6 @@ InitializeGUCOptions(void)
 							elog(FATAL, "failed to initialize %s to %g",
 								 conf->gen.name, conf->reset_val);
 					*conf->variable = conf->reset_val;
-					conf->session_val = conf->reset_val;
 					break;
 				}
 			case PGC_STRING:
@@ -2150,7 +2164,6 @@ InitializeGUCOptions(void)
 						   conf->assign_hook == assign_log_statement);
 					*conf->variable = NULL;
 					conf->reset_val = NULL;
-					conf->session_val = NULL;
 					conf->tentative_val = NULL;
 
 					if (conf->boot_val == NULL)
@@ -2190,7 +2203,6 @@ InitializeGUCOptions(void)
 						}
 					}
 					*conf->variable = str;
-					conf->session_val = str;
 					break;
 				}
 		}
@@ -2254,6 +2266,9 @@ ResetAllOptions(void)
 		if (gconf->source <= PGC_S_OVERRIDE)
 			continue;
 
+		/* Save old value to support transaction abort */
+		push_old_value(gconf);
+
 		switch (gconf->vartype)
 		{
 			case PGC_BOOL:
@@ -2336,8 +2351,8 @@ ResetAllOptions(void)
 						}
 					}
 
-					SET_STRING_VARIABLE(conf, str);
-					SET_STRING_TENTATIVE_VAL(conf, str);
+					set_string_field(conf, conf->variable, str);
+					set_string_field(conf, &conf->tentative_val, str);
 					conf->gen.source = conf->gen.reset_source;
 					conf->gen.tentative_source = conf->gen.reset_source;
 					conf->gen.status |= GUC_HAVE_TENTATIVE;
@@ -2353,11 +2368,93 @@ ResetAllOptions(void)
 
 
 /*
- * Do GUC processing at transaction commit or abort.
+ * push_old_value
+ *		Push previous state during first assignment to a GUC variable
+ *		within a particular transaction.
+ *
+ * We have to be willing to "back-fill" the state stack if the first
+ * assignment occurs within a subtransaction nested several levels deep.
+ * This ensures that if an intermediate transaction aborts, it will have
+ * the proper value available to restore the setting to.
+ */
+static void
+push_old_value(struct config_generic *gconf)
+{
+	int			my_level = GetCurrentTransactionNestLevel();
+	GucStack   *stack;
+
+	/* If we're not inside a transaction, do nothing */
+	if (my_level == 0)
+		return;
+
+	for (;;)
+	{
+		/* Done if we already pushed it at this nesting depth */
+		if (gconf->stack && gconf->stack->nest_level >= my_level)
+			return;
+
+		/*
+		 * We keep all the stack entries in TopTransactionContext so as to
+		 * avoid allocation problems when a subtransaction back-fills stack
+		 * entries for upper transaction levels.
+		 */
+		stack = (GucStack *) MemoryContextAlloc(TopTransactionContext,
+												sizeof(GucStack));
+
+		stack->prev = gconf->stack;
+		stack->nest_level = stack->prev ? stack->prev->nest_level + 1 : 1;
+		stack->status = gconf->status;
+		stack->tentative_source = gconf->tentative_source;
+		stack->source = gconf->source;
+
+		switch (gconf->vartype)
+		{
+			case PGC_BOOL:
+				stack->tentative_val.boolval =
+					((struct config_bool *) gconf)->tentative_val;
+				stack->value.boolval =
+					*((struct config_bool *) gconf)->variable;
+				break;
+
+			case PGC_INT:
+				stack->tentative_val.intval =
+					((struct config_int *) gconf)->tentative_val;
+				stack->value.intval =
+					*((struct config_int *) gconf)->variable;
+				break;
+
+			case PGC_REAL:
+				stack->tentative_val.realval =
+					((struct config_real *) gconf)->tentative_val;
+				stack->value.realval =
+					*((struct config_real *) gconf)->variable;
+				break;
+
+			case PGC_STRING:
+				stack->tentative_val.stringval =
+					((struct config_string *) gconf)->tentative_val;
+				stack->value.stringval =
+					*((struct config_string *) gconf)->variable;
+				break;
+		}
+
+		gconf->stack = stack;
+
+		/* Set state to indicate nothing happened yet within this level */
+		gconf->status = GUC_HAVE_STACK;
+
+		/* Ensure we remember to pop at end of xact */
+		guc_dirty = true;
+	}
+}
+
+/*
+ * Do GUC processing at transaction or subtransaction commit or abort.
  */
 void
-AtEOXact_GUC(bool isCommit)
+AtEOXact_GUC(bool isCommit, bool isSubXact)
 {
+	int			my_level;
 	int			i;
 
 	/* Quick exit if nothing's changed in this transaction */
@@ -2371,15 +2468,56 @@ AtEOXact_GUC(bool isCommit)
 		guc_string_workspace = NULL;
 	}
 
+	my_level = GetCurrentTransactionNestLevel();
+	Assert(isSubXact ? (my_level > 1) : (my_level == 1));
+
 	for (i = 0; i < num_guc_variables; i++)
 	{
 		struct config_generic *gconf = guc_variables[i];
+		int			my_status = gconf->status;
+		GucStack   *stack = gconf->stack;
+		bool		useTentative;
 		bool		changed;
 
-		/* Skip if nothing's happened to this var in this transaction */
-		if (gconf->status == 0)
+		/*
+		 * Skip if nothing's happened to this var in this transaction
+		 */
+		if (my_status == 0)
+		{
+			Assert(stack == NULL);
+			continue;
+		}
+		/* Assert that we stacked old value before changing it */
+		Assert(stack != NULL && (my_status & GUC_HAVE_STACK));
+		/* However, the last change may have been at an outer xact level */
+		if (stack->nest_level < my_level)
 			continue;
+		Assert(stack->nest_level == my_level);
+
+		/*
+		 * We will pop the stack entry.  Start by restoring outer xact status
+		 * (since we may want to modify it below).  Be careful to use
+		 * my_status to reference the inner xact status below this point...
+		 */
+		gconf->status = stack->status;
+
+		/*
+		 * We have two cases:
+		 *
+		 * If commit and HAVE_TENTATIVE, set actual value to tentative
+		 * (this is to override a SET LOCAL if one occurred later than SET).
+		 * We keep the tentative value and propagate HAVE_TENTATIVE to
+		 * the parent status, allowing the SET's effect to percolate up.
+		 * (But if we're exiting the outermost transaction, we'll drop the
+		 * HAVE_TENTATIVE bit below.)
+		 *
+		 * Otherwise, we have a transaction that aborted or executed only
+		 * SET LOCAL (or no SET at all).  In either case it should have no
+		 * further effect, so restore both tentative and actual values from
+		 * the stack entry.
+		 */
 
+		useTentative = isCommit && (my_status & GUC_HAVE_TENTATIVE) != 0;
 		changed = false;
 
 		switch (gconf->vartype)
@@ -2387,126 +2525,190 @@ AtEOXact_GUC(bool isCommit)
 			case PGC_BOOL:
 				{
 					struct config_bool *conf = (struct config_bool *) gconf;
+					bool		newval;
+					GucSource	newsource;
 
-					if (isCommit && (conf->gen.status & GUC_HAVE_TENTATIVE))
+					if (useTentative)
+					{
+						newval = conf->tentative_val;
+						newsource = conf->gen.tentative_source;
+						conf->gen.status |= GUC_HAVE_TENTATIVE;
+					}
+					else
 					{
-						conf->session_val = conf->tentative_val;
-						conf->gen.session_source = conf->gen.tentative_source;
+						newval = stack->value.boolval;
+						newsource = stack->source;
+						conf->tentative_val = stack->tentative_val.boolval;
+						conf->gen.tentative_source = stack->tentative_source;
 					}
 
-					if (*conf->variable != conf->session_val)
+					if (*conf->variable != newval)
 					{
 						if (conf->assign_hook)
-							if (!(*conf->assign_hook) (conf->session_val,
+							if (!(*conf->assign_hook) (newval,
 													   true, PGC_S_OVERRIDE))
 								elog(LOG, "failed to commit %s",
 									 conf->gen.name);
-						*conf->variable = conf->session_val;
+						*conf->variable = newval;
 						changed = true;
 					}
-					conf->gen.source = conf->gen.session_source;
-					conf->gen.status = 0;
+					conf->gen.source = newsource;
 					break;
 				}
 			case PGC_INT:
 				{
 					struct config_int *conf = (struct config_int *) gconf;
+					int			newval;
+					GucSource	newsource;
 
-					if (isCommit && (conf->gen.status & GUC_HAVE_TENTATIVE))
+					if (useTentative)
+					{
+						newval = conf->tentative_val;
+						newsource = conf->gen.tentative_source;
+						conf->gen.status |= GUC_HAVE_TENTATIVE;
+					}
+					else
 					{
-						conf->session_val = conf->tentative_val;
-						conf->gen.session_source = conf->gen.tentative_source;
+						newval = stack->value.intval;
+						newsource = stack->source;
+						conf->tentative_val = stack->tentative_val.intval;
+						conf->gen.tentative_source = stack->tentative_source;
 					}
 
-					if (*conf->variable != conf->session_val)
+					if (*conf->variable != newval)
 					{
 						if (conf->assign_hook)
-							if (!(*conf->assign_hook) (conf->session_val,
+							if (!(*conf->assign_hook) (newval,
 													   true, PGC_S_OVERRIDE))
 								elog(LOG, "failed to commit %s",
 									 conf->gen.name);
-						*conf->variable = conf->session_val;
+						*conf->variable = newval;
 						changed = true;
 					}
-					conf->gen.source = conf->gen.session_source;
-					conf->gen.status = 0;
+					conf->gen.source = newsource;
 					break;
 				}
 			case PGC_REAL:
 				{
 					struct config_real *conf = (struct config_real *) gconf;
+					double		newval;
+					GucSource	newsource;
 
-					if (isCommit && (conf->gen.status & GUC_HAVE_TENTATIVE))
+					if (useTentative)
 					{
-						conf->session_val = conf->tentative_val;
-						conf->gen.session_source = conf->gen.tentative_source;
+						newval = conf->tentative_val;
+						newsource = conf->gen.tentative_source;
+						conf->gen.status |= GUC_HAVE_TENTATIVE;
+					}
+					else
+					{
+						newval = stack->value.realval;
+						newsource = stack->source;
+						conf->tentative_val = stack->tentative_val.realval;
+						conf->gen.tentative_source = stack->tentative_source;
 					}
 
-					if (*conf->variable != conf->session_val)
+					if (*conf->variable != newval)
 					{
 						if (conf->assign_hook)
-							if (!(*conf->assign_hook) (conf->session_val,
+							if (!(*conf->assign_hook) (newval,
 													   true, PGC_S_OVERRIDE))
 								elog(LOG, "failed to commit %s",
 									 conf->gen.name);
-						*conf->variable = conf->session_val;
+						*conf->variable = newval;
 						changed = true;
 					}
-					conf->gen.source = conf->gen.session_source;
-					conf->gen.status = 0;
+					conf->gen.source = newsource;
 					break;
 				}
 			case PGC_STRING:
 				{
 					struct config_string *conf = (struct config_string *) gconf;
+					char	   *newval;
+					GucSource	newsource;
 
-					if (isCommit && (conf->gen.status & GUC_HAVE_TENTATIVE))
+					if (useTentative)
 					{
-						SET_STRING_SESSION_VAL(conf, conf->tentative_val);
-						conf->gen.session_source = conf->gen.tentative_source;
-						conf->tentative_val = NULL;		/* transfer ownership */
+						newval = conf->tentative_val;
+						newsource = conf->gen.tentative_source;
+						conf->gen.status |= GUC_HAVE_TENTATIVE;
 					}
 					else
-						SET_STRING_TENTATIVE_VAL(conf, NULL);
-
-					if (*conf->variable != conf->session_val)
 					{
-						char	   *str = conf->session_val;
+						newval = stack->value.stringval;
+						newsource = stack->source;
+						set_string_field(conf, &conf->tentative_val,
+										 stack->tentative_val.stringval);
+						conf->gen.tentative_source = stack->tentative_source;
+					}
 
+					if (*conf->variable != newval)
+					{
 						if (conf->assign_hook)
 						{
 							const char *newstr;
 
-							newstr = (*conf->assign_hook) (str, true,
+							newstr = (*conf->assign_hook) (newval, true,
 														   PGC_S_OVERRIDE);
 							if (newstr == NULL)
 								elog(LOG, "failed to commit %s",
 									 conf->gen.name);
-							else if (newstr != str)
+							else if (newstr != newval)
 							{
 								/*
+								 * If newval should now be freed, it'll be
+								 * taken care of below.
+								 *
 								 * See notes in set_config_option about
 								 * casting
 								 */
-								str = (char *) newstr;
-								SET_STRING_SESSION_VAL(conf, str);
+								newval = (char *) newstr;
 							}
 						}
 
-						SET_STRING_VARIABLE(conf, str);
+						set_string_field(conf, conf->variable, newval);
 						changed = true;
 					}
-					conf->gen.source = conf->gen.session_source;
-					conf->gen.status = 0;
+					conf->gen.source = newsource;
+					/* Release stacked values if not used anymore */
+					set_string_field(conf, &stack->value.stringval,
+									 NULL);
+					set_string_field(conf, &stack->tentative_val.stringval,
+									 NULL);
+					/* Don't store tentative value separately after commit */
+					if (!isSubXact)
+						set_string_field(conf, &conf->tentative_val, NULL);
 					break;
 				}
 		}
 
+		/* Finish popping the state stack */
+		gconf->stack = stack->prev;
+		pfree(stack);
+
+		/*
+		 * If we're now out of all xact levels, forget TENTATIVE status bit;
+		 * there's nothing tentative about the value anymore.
+		 */
+		if (!isSubXact)
+		{
+			Assert(gconf->stack == NULL);
+			gconf->status = 0;
+		}
+
+		/* Report new value if we changed it */
 		if (changed && (gconf->flags & GUC_REPORT))
 			ReportGUCOption(gconf);
 	}
 
-	guc_dirty = false;
+	/*
+	 * If we're now out of all xact levels, we can clear guc_dirty.
+	 * (Note: we cannot reset guc_dirty when exiting a subtransaction,
+	 * because we know that all outer transaction levels will have stacked
+	 * values to deal with.)
+	 */
+	if (!isSubXact)
+		guc_dirty = false;
 }
 
 
@@ -2810,7 +3012,7 @@ set_config_option(const char *name, const char *value,
 	}
 
 	/*
-	 * Should we set reset/session values?	(If so, the behavior is not
+	 * Should we set reset/stacked values?	(If so, the behavior is not
 	 * transactional.)
 	 */
 	makeDefault = changeVal && (source <= PGC_S_OVERRIDE) && (value != NULL);
@@ -2820,7 +3022,7 @@ set_config_option(const char *name, const char *value,
 	 * However, if changeVal is false then plow ahead anyway since we are
 	 * trying to find out if the value is potentially good, not actually
 	 * use it. Also keep going if makeDefault is true, since we may want
-	 * to set the reset/session values even if we can't set the variable
+	 * to set the reset/stacked values even if we can't set the variable
 	 * itself.
 	 */
 	if (record->source > source)
@@ -2901,6 +3103,9 @@ set_config_option(const char *name, const char *value,
 
 				if (changeVal || makeDefault)
 				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
 					if (changeVal)
 					{
 						*conf->variable = newval;
@@ -2908,15 +3113,20 @@ set_config_option(const char *name, const char *value,
 					}
 					if (makeDefault)
 					{
+						GucStack *stack;
+
 						if (conf->gen.reset_source <= source)
 						{
 							conf->reset_val = newval;
 							conf->gen.reset_source = source;
 						}
-						if (conf->gen.session_source <= source)
+						for (stack = conf->gen.stack; stack; stack = stack->prev)
 						{
-							conf->session_val = newval;
-							conf->gen.session_source = source;
+							if (stack->source <= source)
+							{
+								stack->value.boolval = newval;
+								stack->source = source;
+							}
 						}
 					}
 					else if (isLocal)
@@ -3006,6 +3216,9 @@ set_config_option(const char *name, const char *value,
 
 				if (changeVal || makeDefault)
 				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
 					if (changeVal)
 					{
 						*conf->variable = newval;
@@ -3013,15 +3226,20 @@ set_config_option(const char *name, const char *value,
 					}
 					if (makeDefault)
 					{
+						GucStack *stack;
+
 						if (conf->gen.reset_source <= source)
 						{
 							conf->reset_val = newval;
 							conf->gen.reset_source = source;
 						}
-						if (conf->gen.session_source <= source)
+						for (stack = conf->gen.stack; stack; stack = stack->prev)
 						{
-							conf->session_val = newval;
-							conf->gen.session_source = source;
+							if (stack->source <= source)
+							{
+								stack->value.intval = newval;
+								stack->source = source;
+							}
 						}
 					}
 					else if (isLocal)
@@ -3101,6 +3319,9 @@ set_config_option(const char *name, const char *value,
 
 				if (changeVal || makeDefault)
 				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
 					if (changeVal)
 					{
 						*conf->variable = newval;
@@ -3108,15 +3329,20 @@ set_config_option(const char *name, const char *value,
 					}
 					if (makeDefault)
 					{
+						GucStack *stack;
+
 						if (conf->gen.reset_source <= source)
 						{
 							conf->reset_val = newval;
 							conf->gen.reset_source = source;
 						}
-						if (conf->gen.session_source <= source)
+						for (stack = conf->gen.stack; stack; stack = stack->prev)
 						{
-							conf->session_val = newval;
-							conf->gen.session_source = source;
+							if (stack->source <= source)
+							{
+								stack->value.realval = newval;
+								stack->source = source;
+							}
 						}
 					}
 					else if (isLocal)
@@ -3261,27 +3487,34 @@ set_config_option(const char *name, const char *value,
 
 				if (changeVal || makeDefault)
 				{
+					/* Save old value to support transaction abort */
+					if (!makeDefault)
+						push_old_value(&conf->gen);
 					if (changeVal)
 					{
-						SET_STRING_VARIABLE(conf, newval);
+						set_string_field(conf, conf->variable, newval);
 						conf->gen.source = source;
 					}
 					if (makeDefault)
 					{
+						GucStack *stack;
+
 						if (conf->gen.reset_source <= source)
 						{
-							SET_STRING_RESET_VAL(conf, newval);
+							set_string_field(conf, &conf->reset_val, newval);
 							conf->gen.reset_source = source;
 						}
-						if (conf->gen.session_source <= source)
+						for (stack = conf->gen.stack; stack; stack = stack->prev)
 						{
-							SET_STRING_SESSION_VAL(conf, newval);
-							conf->gen.session_source = source;
+							if (stack->source <= source)
+							{
+								set_string_field(conf, &stack->value.stringval,
+												 newval);
+								stack->source = source;
+							}
 						}
 						/* Perhaps we didn't install newval anywhere */
-						if (newval != *conf->variable &&
-							newval != conf->session_val &&
-							newval != conf->reset_val)
+						if (!string_field_used(conf, newval))
 							free(newval);
 					}
 					else if (isLocal)
@@ -3291,7 +3524,7 @@ set_config_option(const char *name, const char *value,
 					}
 					else
 					{
-						SET_STRING_TENTATIVE_VAL(conf, newval);
+						set_string_field(conf, &conf->tentative_val, newval);
 						conf->gen.tentative_source = source;
 						conf->gen.status |= GUC_HAVE_TENTATIVE;
 						guc_dirty = true;
@@ -3608,44 +3841,36 @@ define_custom_variable(struct config_generic* variable)
 	/* This better be a placeholder
 	 */
 	if(((*res)->flags & GUC_CUSTOM_PLACEHOLDER) == 0)
-	{
 		ereport(ERROR,
 				(errcode(ERRCODE_INTERNAL_ERROR),
 				 errmsg("attempt to redefine parameter \"%s\"", name)));
-	}
-	pHolder = (struct config_string*)*res;
+
+	Assert((*res)->vartype == PGC_STRING);
+	pHolder = (struct config_string*) *res;
 	
-	/* We have the same name, no sorting is necessary.
-	 */
+	/* We have the same name, no sorting is necessary */
 	*res = variable;
 
 	value = *pHolder->variable;
 
-	/* Assign the variable stored in the placeholder to the real
-	 * variable.
+	/*
+	 * Assign the string value stored in the placeholder to the real variable.
+	 *
+	 * XXX this is not really good enough --- it should be a nontransactional
+	 * assignment, since we don't want it to roll back if the current xact
+	 * fails later.
 	 */
 	set_config_option(name, value,
 				  pHolder->gen.context, pHolder->gen.source,
 				  false, true);
 
-	/* Free up stuff occupied by the placeholder variable
+	/*
+	 * Free up as much as we conveniently can of the placeholder structure
+	 * (this neglects any stack items...)
 	 */
-	if(value != NULL)
-		free((void*)value);
-
-	if(pHolder->reset_val != NULL && pHolder->reset_val != value)
-		free(pHolder->reset_val);
-
-	if(pHolder->session_val != NULL
-	&& pHolder->session_val != value
-	&& pHolder->session_val != pHolder->reset_val)
-		free(pHolder->session_val);
-
-	if(pHolder->tentative_val != NULL
-	&& pHolder->tentative_val != value
-	&& pHolder->tentative_val != pHolder->reset_val
-	&& pHolder->tentative_val != pHolder->session_val)
-		free(pHolder->tentative_val);
+	set_string_field(pHolder, pHolder->variable, NULL);
+	set_string_field(pHolder, &pHolder->reset_val, NULL);
+	set_string_field(pHolder, &pHolder->tentative_val, NULL);
 
 	free(pHolder);
 }
@@ -3754,7 +3979,7 @@ void DefineCustomStringVariable(
 	define_custom_variable(&var->gen);
 }
 
-extern void EmittWarningsOnPlaceholders(const char* className)
+extern void EmitWarningsOnPlaceholders(const char* className)
 {
 	struct config_generic** vars = guc_variables;
 	struct config_generic** last = vars + num_guc_variables;
@@ -5133,5 +5358,14 @@ assign_log_stats(bool newval, bool doit, GucSource source)
 	return true;
 }
 
+static bool
+assign_transaction_read_only(bool newval, bool doit, GucSource source)
+{
+	if (doit && source >= PGC_S_INTERACTIVE && IsSubTransaction())
+		ereport(ERROR,	/* real (doit) change from an interactive-or-later source */
+				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+				 errmsg("cannot set transaction read only mode inside a subtransaction")));
+	return true;		/* any other request is accepted; newval is not examined */
+}
 
 #include "guc-file.c"
diff --git a/src/backend/utils/mmgr/README b/src/backend/utils/mmgr/README
index f705827c1b6..490b781cc91 100644
--- a/src/backend/utils/mmgr/README
+++ b/src/backend/utils/mmgr/README
@@ -1,4 +1,4 @@
-$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.6 2004/06/05 19:48:09 tgl Exp $
+$PostgreSQL: pgsql/src/backend/utils/mmgr/README,v 1.7 2004/07/01 00:51:29 tgl Exp $
 
 Notes about memory allocation redesign
 --------------------------------------
@@ -90,7 +90,7 @@ context managers as discussed below.
 We could even consider getting rid of CurrentMemoryContext entirely,
 instead requiring the target memory context for allocation to be specified
 explicitly.  But I think that would be too much notational overhead ---
-we'd have to pass an apppropriate memory context to called routines in
+we'd have to pass an appropriate memory context to called routines in
 many places.  For example, the copyObject routines would need to be passed
 a context, as would function execution routines that return a
 pass-by-reference datatype.  And what of routines that temporarily
@@ -176,15 +176,30 @@ is kept separate from per-transaction and per-portal contexts because a
 query string might need to live either a longer or shorter time than any
 single transaction or portal.
 
-TopTransactionContext --- this holds everything that lives until end of
-transaction (longer than one statement within a transaction!).  An example
-of what has to be here is the list of pending NOTIFY messages to be sent
-at xact commit.  This context will be reset, and all its children deleted,
-at conclusion of each transaction cycle.  Note: this context is NOT
-cleared immediately upon error; its contents will survive until the
-transaction block is exited by COMMIT/ROLLBACK.
-(If we ever implement nested transactions, TopTransactionContext may need
-to be split into a true "top" pointer and a "current transaction" pointer.)
+TopTransactionContext --- this holds everything that lives until end of the
+top-level transaction.  This context will be reset, and all its children
+deleted, at conclusion of each top-level transaction cycle.  In most cases
+you don't want to allocate stuff directly here, but in CurTransactionContext;
+what does belong here is control information that exists explicitly to manage
+status across multiple subtransactions.  Note: this context is NOT cleared
+immediately upon error; its contents will survive until the transaction block
+is exited by COMMIT/ROLLBACK.
+
+CurTransactionContext --- this holds data that has to survive until the end
+of the current transaction, and in particular will be needed at top-level
+transaction commit.  When we are in a top-level transaction this is the same
+as TopTransactionContext, but in subtransactions it points to a child context.
+It is important to understand that if a subtransaction aborts, its
+CurTransactionContext is thrown away after finishing the abort processing;
+but a committed subtransaction's CurTransactionContext is kept until top-level
+commit (unless of course one of the intermediate levels of subtransaction
+aborts).  This ensures that we do not keep data from a failed subtransaction
+longer than necessary.  Because of this behavior, you must be careful to clean
+up properly during subtransaction abort --- the subtransaction's state must be
+delinked from any pointers or lists kept in upper transactions, or you will
+have dangling pointers leading to a crash at top-level commit.  An example of
+data kept here is pending NOTIFY messages, which are sent at top-level commit,
+but only if the generating subtransaction did not abort.
 
 QueryContext --- this is not actually a separate context, but a global
 variable pointing to the context that holds the current command's parse
diff --git a/src/backend/utils/mmgr/mcxt.c b/src/backend/utils/mmgr/mcxt.c
index c444886e140..96ffb1a8e1c 100644
--- a/src/backend/utils/mmgr/mcxt.c
+++ b/src/backend/utils/mmgr/mcxt.c
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/mmgr/mcxt.c,v 1.45 2004/06/05 19:48:09 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/mmgr/mcxt.c,v 1.46 2004/07/01 00:51:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -45,6 +45,7 @@ MemoryContext PostmasterContext = NULL;
 MemoryContext CacheMemoryContext = NULL;
 MemoryContext MessageContext = NULL;
 MemoryContext TopTransactionContext = NULL;
+MemoryContext CurTransactionContext = NULL;
 
 /* These two are transient links to contexts owned by other objects: */
 MemoryContext QueryContext = NULL;
diff --git a/src/backend/utils/mmgr/portalmem.c b/src/backend/utils/mmgr/portalmem.c
index f77125cebf5..466b2fc97bf 100644
--- a/src/backend/utils/mmgr/portalmem.c
+++ b/src/backend/utils/mmgr/portalmem.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/mmgr/portalmem.c,v 1.65 2004/05/30 23:40:39 neilc Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/mmgr/portalmem.c,v 1.66 2004/07/01 00:51:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -511,3 +511,94 @@ AtCleanup_Portals(void)
 		PortalDrop(portal, true);
 	}
 }
+
+/*
+ * Pre-subcommit processing for portals.
+ *
+ * Walk PortalHashTable and re-label every portal whose createXact equals the
+ * current transaction ID so that it carries parentXid instead.  (XXX perhaps
+ * we should reassign only holdable cursors, and drop the rest?)
+ */
+void
+AtSubCommit_Portals(TransactionId parentXid)
+{
+	HASH_SEQ_STATUS status;
+	PortalHashEnt *hentry;
+	TransactionId curXid = GetCurrentTransactionId();
+
+	hash_seq_init(&status, PortalHashTable);
+
+	while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+	{
+		Portal	portal = hentry->portal;
+
+		if (portal->createXact == curXid)
+			portal->createXact = parentXid;
+	}
+}
+
+/*
+ * Subtransaction abort handling for portals.
+ *
+ * Clear portalActive on every portal whose createXact is the current XID and
+ * run (then null out) its cleanup hook.  Thanks to the relabeling done by
+ * AtSubCommit_Portals, portals created in descendant subxacts match too.
+ */
+void
+AtSubAbort_Portals(void)
+{
+	HASH_SEQ_STATUS status;
+	PortalHashEnt *hentry;
+	TransactionId curXid = GetCurrentTransactionId();
+
+	hash_seq_init(&status, PortalHashTable);
+
+	while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+	{
+		Portal	portal = hentry->portal;
+
+		if (portal->createXact != curXid)
+			continue;
+
+		portal->portalActive = false;
+
+		/* let portalcmds.c clean up the state it knows about */
+		if (PointerIsValid(portal->cleanup))
+		{
+			(*portal->cleanup) (portal, true);
+			portal->cleanup = NULL;
+		}
+	}
+}
+
+/*
+ * Post-subabort cleanup for portals.
+ *
+ * Drop every portal whose createXact is the current XID, which includes those
+ * relabeled to this subtransaction by AtSubCommit_Portals in its descendants.
+ */
+void
+AtSubCleanup_Portals(void)
+{
+	HASH_SEQ_STATUS status;
+	PortalHashEnt *hentry;
+	TransactionId curXid = GetCurrentTransactionId();
+
+	hash_seq_init(&status, PortalHashTable);
+
+	while ((hentry = (PortalHashEnt *) hash_seq_search(&status)) != NULL)
+	{
+		Portal		portal = hentry->portal;
+
+		if (portal->createXact != curXid)
+			continue;
+
+		/*
+		 * Let's just make sure no one's active...
+		 */
+		portal->portalActive = false;
+
+		/* Zap it with prejudice. */
+		PortalDrop(portal, true);
+	}
+}
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index a56e59a3d65..446ee4b72c5 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -16,13 +16,14 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.72 2003/11/29 19:52:04 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.73 2004/07/01 00:51:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 
 #include "postgres.h"
 
+#include "access/subtrans.h"
 #include "storage/sinval.h"
 #include "utils/tqual.h"
 
@@ -115,6 +116,10 @@ HeapTupleSatisfiesItself(HeapTupleHeader tuple)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
+			/* deleting subtransaction aborted */
+			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+				return true;
+
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
 			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
@@ -261,6 +266,10 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
+			/* deleting subtransaction aborted */
+			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+				return true;
+
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
 			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
@@ -441,6 +450,10 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return HeapTupleMayBeUpdated;
 
+			/* deleting subtransaction aborted */
+			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+				return HeapTupleMayBeUpdated;
+
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
 			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
@@ -575,6 +588,10 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
+			/* deleting subtransaction aborted */
+			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+				return true;
+
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
 			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
@@ -712,6 +729,11 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return true;
 
+			/* deleting subtransaction aborted */
+			/* FIXME -- is this correct w.r.t. the cmax of the tuple? */
+			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+				return true;
+
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
 			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
@@ -747,7 +769,7 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
 
 		for (i = 0; i < snapshot->xcnt; i++)
 		{
-			if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuple),
+			if (SubTransXidsHaveCommonAncestor(HeapTupleHeaderGetXmin(tuple),
 									snapshot->xip[i]))
 				return false;
 		}
@@ -792,7 +814,7 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
 			return true;
 		for (i = 0; i < snapshot->xcnt; i++)
 		{
-			if (TransactionIdEquals(HeapTupleHeaderGetXmax(tuple), snapshot->xip[i]))
+			if (SubTransXidsHaveCommonAncestor(HeapTupleHeaderGetXmax(tuple), snapshot->xip[i]))
 				return true;
 		}
 	}
@@ -868,8 +890,8 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin)
 		{
 			if (tuple->t_infomask & HEAP_XMAX_INVALID)	/* xid invalid */
 				return HEAPTUPLE_INSERT_IN_PROGRESS;
-			Assert(HeapTupleHeaderGetXmin(tuple) ==
-				   HeapTupleHeaderGetXmax(tuple));
+			Assert(SubTransXidsHaveCommonAncestor(HeapTupleHeaderGetXmin(tuple),
+						HeapTupleHeaderGetXmax(tuple)));
 			if (tuple->t_infomask & HEAP_MARKED_FOR_UPDATE)
 				return HEAPTUPLE_INSERT_IN_PROGRESS;
 			/* inserted and then deleted by same xact */
@@ -943,7 +965,7 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin)
 	 * Deleter committed, but check special cases.
 	 */
 
-	if (TransactionIdEquals(HeapTupleHeaderGetXmin(tuple),
+	if (SubTransXidsHaveCommonAncestor(HeapTupleHeaderGetXmin(tuple),
 							HeapTupleHeaderGetXmax(tuple)))
 	{
 		/*
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 11bc08ed977..8b4ac1e29e7 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -39,7 +39,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  * Portions taken from FreeBSD.
  *
- * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.40 2004/06/24 19:26:59 tgl Exp $
+ * $PostgreSQL: pgsql/src/bin/initdb/initdb.c,v 1.41 2004/07/01 00:51:36 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1828,7 +1828,7 @@ main(int argc, char *argv[])
 	char	   *pgdenv;			/* PGDATA value got from sent to
 								 * environment */
 	char	   *subdirs[] =
-	{"global", "pg_xlog", "pg_clog", "base", "base/1", "pg_tblspc"};
+	{"global", "pg_xlog", "pg_clog", "pg_subtrans", "base", "base/1", "pg_tblspc"};
 
 	progname = get_progname(argv[0]);
 	set_pglocale_pgservice(argv[0], "initdb");
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index bd7f4152be3..2df1cedc1c9 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.8 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.9 2004/07/01 00:51:38 tgl Exp $
  */
 #ifndef CLOG_H
 #define CLOG_H
@@ -16,13 +16,16 @@
 /*
  * Possible transaction statuses --- note that all-zeroes is the initial
  * state.
+ *
+ * A "subcommitted" transaction is a committed subtransaction whose parent
+ * hasn't committed or aborted yet.
  */
 typedef int XidStatus;
 
 #define TRANSACTION_STATUS_IN_PROGRESS		0x00
 #define TRANSACTION_STATUS_COMMITTED		0x01
 #define TRANSACTION_STATUS_ABORTED			0x02
-/* 0x03 is available without changing commit log space allocation */
+#define TRANSACTION_STATUS_SUB_COMMITTED	0x03
 
 /* exported because lwlock.c needs it */
 #define NUM_CLOG_BUFFERS	8
@@ -39,12 +42,6 @@ extern void ShutdownCLOG(void);
 extern void CheckPointCLOG(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact);
-
-/* XLOG stuff */
-#define CLOG_ZEROPAGE		0x00
-
-extern void clog_redo(XLogRecPtr lsn, XLogRecord *record);
-extern void clog_undo(XLogRecPtr lsn, XLogRecord *record);
-extern void clog_desc(char *buf, uint8 xl_info, char *rec);
+extern void clog_zeropage_redo(int pageno);
 
 #endif   /* CLOG_H */
diff --git a/src/include/access/gistscan.h b/src/include/access/gistscan.h
index b8466429959..4022f542752 100644
--- a/src/include/access/gistscan.h
+++ b/src/include/access/gistscan.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/gistscan.h,v 1.22 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/gistscan.h,v 1.23 2004/07/01 00:51:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,5 +23,6 @@ extern Datum gistrestrpos(PG_FUNCTION_ARGS);
 extern Datum gistendscan(PG_FUNCTION_ARGS);
 extern void gistadjscans(Relation r, int op, BlockNumber blkno, OffsetNumber offnum);
 extern void AtEOXact_gist(void);
+extern void AtEOSubXact_gist(TransactionId childXid);
 
 #endif   /* GISTSCAN_H */
diff --git a/src/include/access/hash.h b/src/include/access/hash.h
index ffeea63417b..2088cc2f5a6 100644
--- a/src/include/access/hash.h
+++ b/src/include/access/hash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.54 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.55 2004/07/01 00:51:38 tgl Exp $
  *
  * NOTES
  *		modeled after Margo Seltzer's hash implementation for unix.
@@ -293,6 +293,7 @@ extern void _hash_regscan(IndexScanDesc scan);
 extern void _hash_dropscan(IndexScanDesc scan);
 extern bool _hash_has_active_scan(Relation rel, Bucket bucket);
 extern void AtEOXact_hash(void);
+extern void AtEOSubXact_hash(TransactionId childXid);
 
 /* hashsearch.c */
 extern bool _hash_next(IndexScanDesc scan, ScanDirection dir);
diff --git a/src/include/access/htup.h b/src/include/access/htup.h
index 3d48b5f45a3..fdcfc8dc6f1 100644
--- a/src/include/access/htup.h
+++ b/src/include/access/htup.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.65 2004/04/01 21:28:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.66 2004/07/01 00:51:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -109,18 +109,14 @@
 typedef struct HeapTupleFields
 {
 	TransactionId t_xmin;		/* inserting xact ID */
-
-	union
-	{
-		CommandId	t_cmin;		/* inserting command ID */
-		TransactionId t_xmax;	/* deleting xact ID */
-	}			t_field2;
+	CommandId	t_cmin;			/* inserting command ID */
+	TransactionId t_xmax;		/* deleting xact ID */
 
 	union
 	{
 		CommandId	t_cmax;		/* deleting command ID */
 		TransactionId t_xvac;	/* VACUUM FULL xact ID */
-	}			t_field3;
+	}			t_field4;
 } HeapTupleFields;
 
 typedef struct DatumTupleFields
@@ -172,9 +168,7 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 										 * attribute(s) */
 #define HEAP_HASEXTENDED		0x000C	/* the two above combined */
 #define HEAP_HASOID				0x0010	/* has an object-id field */
-/* bit 0x0020 is presently unused */
-#define HEAP_XMAX_IS_XMIN		0x0040	/* created and deleted in the same
-										 * transaction */
+/* bits 0x0020 and 0x0040 are presently unused */
 #define HEAP_XMAX_UNLOGGED		0x0080	/* to lock tuple for update
 										 * without logging */
 #define HEAP_XMIN_COMMITTED		0x0100	/* t_xmin committed */
@@ -211,62 +205,47 @@ typedef HeapTupleHeaderData *HeapTupleHeader;
 
 #define HeapTupleHeaderGetXmax(tup) \
 ( \
-	((tup)->t_infomask & HEAP_XMAX_IS_XMIN) ? \
-		(tup)->t_choice.t_heap.t_xmin \
-	: \
-		(tup)->t_choice.t_heap.t_field2.t_xmax \
+	(tup)->t_choice.t_heap.t_xmax \
 )
 
 #define HeapTupleHeaderSetXmax(tup, xid) \
-do { \
-	TransactionId	_newxid = (xid); \
-	if (TransactionIdEquals((tup)->t_choice.t_heap.t_xmin, _newxid)) \
-		(tup)->t_infomask |= HEAP_XMAX_IS_XMIN; \
-	else \
-	{ \
-		(tup)->t_infomask &= ~HEAP_XMAX_IS_XMIN; \
-		TransactionIdStore(_newxid, &(tup)->t_choice.t_heap.t_field2.t_xmax); \
-	} \
-} while (0)
+( \
+	TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_xmax) \
+)
 
-/*
- * Note: GetCmin will produce wrong answers after SetXmax has been executed
- * by a transaction other than the inserting one.  We could check
- * HEAP_XMAX_INVALID and return FirstCommandId if it's clear, but since that
- * bit will be set again if the deleting transaction aborts, there'd be no
- * real gain in safety from the extra test.  So, just rely on the caller not
- * to trust the value unless it's meaningful.
- */
 #define HeapTupleHeaderGetCmin(tup) \
 ( \
-	(tup)->t_choice.t_heap.t_field2.t_cmin \
+	(tup)->t_choice.t_heap.t_cmin \
 )
 
 #define HeapTupleHeaderSetCmin(tup, cid) \
-do { \
-	Assert((tup)->t_infomask & HEAP_XMAX_INVALID); \
-	(tup)->t_choice.t_heap.t_field2.t_cmin = (cid); \
-} while (0)
+( \
+	(tup)->t_choice.t_heap.t_cmin = (cid) \
+)
 
 /*
- * As with GetCmin, we can't completely ensure that GetCmax can detect whether
- * a valid command ID is available, and there's little point in a partial test.
+ * Note: GetCmax will produce wrong answers after SetXvac has been executed
+ * by a transaction other than the inserting one, because t_cmax and t_xvac
+ * share storage.  We could check HEAP_XMAX_INVALID and return FirstCommandId
+ * if it's clear, but since that bit will be set again if the deleting
+ * transaction aborts, there'd be no real gain in safety from the extra test.
+ * So, just rely on the caller not to trust the value unless it's meaningful.
  */
 #define HeapTupleHeaderGetCmax(tup) \
 ( \
-	(tup)->t_choice.t_heap.t_field3.t_cmax \
+	(tup)->t_choice.t_heap.t_field4.t_cmax \
 )
 
 #define HeapTupleHeaderSetCmax(tup, cid) \
 do { \
 	Assert(!((tup)->t_infomask & HEAP_MOVED)); \
-	(tup)->t_choice.t_heap.t_field3.t_cmax = (cid); \
+	(tup)->t_choice.t_heap.t_field4.t_cmax = (cid); \
 } while (0)
 
 #define HeapTupleHeaderGetXvac(tup) \
 ( \
 	((tup)->t_infomask & HEAP_MOVED) ? \
-		(tup)->t_choice.t_heap.t_field3.t_xvac \
+		(tup)->t_choice.t_heap.t_field4.t_xvac \
 	: \
 		InvalidTransactionId \
 )
@@ -274,7 +253,7 @@ do { \
 #define HeapTupleHeaderSetXvac(tup, xid) \
 do { \
 	Assert((tup)->t_infomask & HEAP_MOVED); \
-	TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field3.t_xvac); \
+	TransactionIdStore((xid), &(tup)->t_choice.t_heap.t_field4.t_xvac); \
 } while (0)
 
 #define HeapTupleHeaderGetDatumLength(tup) \
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index e63e3fbc31d..7ea3134031d 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.10 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.11 2004/07/01 00:51:38 tgl Exp $
  */
 #ifndef RMGR_H
 #define RMGR_H
@@ -16,7 +16,7 @@ typedef uint8 RmgrId;
 #define RM_XLOG_ID				0
 #define RM_XACT_ID				1
 #define RM_SMGR_ID				2
-#define RM_CLOG_ID				3
+#define RM_SLRU_ID				3
 #define RM_HEAP_ID				10
 #define RM_BTREE_ID				11
 #define RM_HASH_ID				12
diff --git a/src/include/access/rtree.h b/src/include/access/rtree.h
index fdb33eba06f..5b5347e9a08 100644
--- a/src/include/access/rtree.h
+++ b/src/include/access/rtree.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.32 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/rtree.h,v 1.33 2004/07/01 00:51:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -130,6 +130,7 @@ extern void rtree_desc(char *buf, uint8 xl_info, char *rec);
 extern void rtadjscans(Relation r, int op, BlockNumber blkno,
 		   OffsetNumber offnum);
 extern void AtEOXact_rtree(void);
+extern void AtEOSubXact_rtree(TransactionId childXid);
 
 /* rtstrat.c */
 extern StrategyNumber RTMapToInternalOperator(StrategyNumber strat);
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index 213cca5c216..e3245fac658 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -6,11 +6,12 @@
  * Portions Copyright (c) 2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.7 2004/07/01 00:51:38 tgl Exp $
  */
 #ifndef SLRU_H
 #define SLRU_H
 
+#include "access/xlog.h"
 #include "storage/lwlock.h"
 
 
@@ -56,4 +57,12 @@ extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
 extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
 
+/* XLOG stuff */
+#define CLOG_ZEROPAGE		0x00
+#define SUBTRANS_ZEROPAGE	0x10
+
+extern void slru_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void slru_undo(XLogRecPtr lsn, XLogRecord *record);
+extern void slru_desc(char *buf, uint8 xl_info, char *rec);
+
 #endif   /* SLRU_H */
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h
new file mode 100644
index 00000000000..2c601752d12
--- /dev/null
+++ b/src/include/access/subtrans.h
@@ -0,0 +1,35 @@
+/*
+ * subtrans.h
+ *
+ * PostgreSQL subtrans-log manager
+ *
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/include/access/subtrans.h,v 1.1 2004/07/01 00:51:38 tgl Exp $
+ */
+#ifndef SUBTRANS_H
+#define SUBTRANS_H
+
+#include "access/xlog.h"
+
+/* exported because lwlock.c needs it */
+/* cannot be different from NUM_CLOG_BUFFERS without slru.c changes */
+#define NUM_SUBTRANS_BUFFERS	NUM_CLOG_BUFFERS
+
+extern void SubTransSetParent(TransactionId xid, TransactionId parent);
+extern TransactionId SubTransGetParent(TransactionId xid);
+extern TransactionId SubTransGetTopmostTransaction(TransactionId xid);
+extern bool SubTransXidsHaveCommonAncestor(TransactionId xid1, TransactionId xid2);
+
+extern int	SUBTRANSShmemSize(void);
+extern void SUBTRANSShmemInit(void);
+extern void BootStrapSUBTRANS(void);
+extern void StartupSUBTRANS(void);
+extern void ShutdownSUBTRANS(void);
+extern void CheckPointSUBTRANS(void);
+extern void ExtendSUBTRANS(TransactionId newestXact);
+extern void TruncateSUBTRANS(TransactionId oldestXact);
+extern void subtrans_zeropage_redo(int pageno);
+
+#endif   /* SUBTRANS_H */
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index 3a2cad7bb03..44e0ff6ea75 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.48 2003/11/29 22:40:55 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/transam.h,v 1.49 2004/07/01 00:51:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -107,13 +107,16 @@ extern bool TransactionIdDidCommit(TransactionId transactionId);
 extern bool TransactionIdDidAbort(TransactionId transactionId);
 extern void TransactionIdCommit(TransactionId transactionId);
 extern void TransactionIdAbort(TransactionId transactionId);
+extern void TransactionIdSubCommit(TransactionId transactionId);
+extern void TransactionIdCommitTree(int nxids, TransactionId *xids);
+extern void TransactionIdAbortTree(int nxids, TransactionId *xids);
 extern bool TransactionIdPrecedes(TransactionId id1, TransactionId id2);
 extern bool TransactionIdPrecedesOrEquals(TransactionId id1, TransactionId id2);
 extern bool TransactionIdFollows(TransactionId id1, TransactionId id2);
 extern bool TransactionIdFollowsOrEquals(TransactionId id1, TransactionId id2);
 
 /* in transam/varsup.c */
-extern TransactionId GetNewTransactionId(void);
+extern TransactionId GetNewTransactionId(bool isSubXact);
 extern TransactionId ReadNewTransactionId(void);
 extern Oid	GetNewObjectId(void);
 extern void CheckMaxObjectId(Oid assigned_oid);
diff --git a/src/include/access/xact.h b/src/include/access/xact.h
index 53a585ec694..c5b66afd0df 100644
--- a/src/include/access/xact.h
+++ b/src/include/access/xact.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.63 2004/05/22 23:14:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xact.h,v 1.64 2004/07/01 00:51:38 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -63,7 +63,15 @@ typedef enum TBlockState
 	TBLOCK_INPROGRESS,
 	TBLOCK_END,
 	TBLOCK_ABORT,
-	TBLOCK_ENDABORT
+	TBLOCK_ENDABORT,
+
+	TBLOCK_SUBBEGIN,
+	TBLOCK_SUBBEGINABORT,
+	TBLOCK_SUBINPROGRESS,
+	TBLOCK_SUBEND,
+	TBLOCK_SUBABORT,
+	TBLOCK_SUBENDABORT_OK,
+	TBLOCK_SUBENDABORT_ERROR
 } TBlockState;
 
 /*
@@ -76,12 +84,15 @@ typedef void (*EOXactCallback) (bool isCommit, void *arg);
  */
 typedef struct TransactionStateData
 {
-	TransactionId	transactionIdData;
-	CommandId		commandId;
-	AbsoluteTime	startTime;
-	int				startTimeUsec;
-	TransState		state;
-	TBlockState		blockState;
+	TransactionId	transactionIdData;		/* my XID */
+	CommandId		commandId;				/* current CID */
+	TransState		state;					/* low-level state */
+	TBlockState		blockState;				/* high-level state */
+	int				nestingLevel;			/* nest depth */
+	MemoryContext	curTransactionContext;	/* my xact-lifetime context */
+	List		   *childXids;				/* subcommitted child XIDs */
+	AclId			currentUser;			/* subxact start current_user */
+	struct TransactionStateData *parent;	/* back link to parent */
 } TransactionStateData;
 
 typedef TransactionStateData *TransactionState;
@@ -102,9 +113,11 @@ typedef TransactionStateData *TransactionState;
 typedef struct xl_xact_commit
 {
 	time_t		xtime;
+	int			nrels;			/* number of RelFileNodes */
+	int			nsubxacts;		/* number of subtransaction XIDs */
 	/* Array of RelFileNode(s) to drop at commit */
-	/* The XLOG record length determines how many there are */
 	RelFileNode	xnodes[1];		/* VARIABLE LENGTH ARRAY */
+	/* ARRAY OF COMMITTED SUBTRANSACTION XIDs FOLLOWS */
 } xl_xact_commit;
 
 #define MinSizeOfXactCommit	offsetof(xl_xact_commit, xnodes)
@@ -112,9 +125,11 @@ typedef struct xl_xact_commit
 typedef struct xl_xact_abort
 {
 	time_t		xtime;
+	int			nrels;			/* number of RelFileNodes */
+	int			nsubxacts;		/* number of subtransaction XIDs */
 	/* Array of RelFileNode(s) to drop at abort */
-	/* The XLOG record length determines how many there are */
 	RelFileNode	xnodes[1];		/* VARIABLE LENGTH ARRAY */
+	/* ARRAY OF ABORTED SUBTRANSACTION XIDs FOLLOWS */
 } xl_xact_abort;
 
 #define MinSizeOfXactAbort offsetof(xl_xact_abort, xnodes)
@@ -126,18 +141,20 @@ typedef struct xl_xact_abort
  */
 extern bool IsTransactionState(void);
 extern bool IsAbortedTransactionBlockState(void);
+extern TransactionId GetTopTransactionId(void);
 extern TransactionId GetCurrentTransactionId(void);
 extern CommandId GetCurrentCommandId(void);
 extern AbsoluteTime GetCurrentTransactionStartTime(void);
 extern AbsoluteTime GetCurrentTransactionStartTimeUsec(int *usec);
+extern int	GetCurrentTransactionNestLevel(void);
 extern bool TransactionIdIsCurrentTransactionId(TransactionId xid);
-extern bool CommandIdIsCurrentCommandId(CommandId cid);
 extern void CommandCounterIncrement(void);
 extern void StartTransactionCommand(void);
 extern void CommitTransactionCommand(void);
 extern void AbortCurrentTransaction(void);
 extern void BeginTransactionBlock(void);
 extern void EndTransactionBlock(void);
+extern bool IsSubTransaction(void);
 extern bool IsTransactionBlock(void);
 extern bool IsTransactionOrTransactionBlock(void);
 extern char TransactionBlockStatusCode(void);
@@ -151,6 +168,8 @@ extern void UnregisterEOXactCallback(EOXactCallback callback, void *arg);
 
 extern void RecordTransactionCommit(void);
 
+extern int	xactGetCommittedChildren(TransactionId **ptr, bool metoo);
+
 extern void XactPushRollback(void (*func) (void *), void *data);
 extern void XactPopRollback(void);
 
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 0e44e77446d..1c29ab07626 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.51 2004/05/29 22:48:22 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.52 2004/07/01 00:51:38 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -111,7 +111,7 @@ typedef struct XLogContRecord
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0xD05A	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD05B	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index ac6f8a1fa0c..b6f98778628 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -37,7 +37,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.240 2004/06/25 17:20:28 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.241 2004/07/01 00:51:39 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -53,6 +53,6 @@
  */
 
 /*							yyyymmddN */
-#define CATALOG_VERSION_NO	200406251
+#define CATALOG_VERSION_NO	200406261
 
 #endif
diff --git a/src/include/commands/async.h b/src/include/commands/async.h
index 6429895fbdc..47bd91aaaa4 100644
--- a/src/include/commands/async.h
+++ b/src/include/commands/async.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/async.h,v 1.24 2004/05/23 03:50:45 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/commands/async.h,v 1.25 2004/07/01 00:51:40 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,6 +23,9 @@ extern void Async_Unlisten(char *relname, int pid);
 /* perform (or cancel) outbound notify processing at transaction commit */
 extern void AtCommit_Notify(void);
 extern void AtAbort_Notify(void);
+extern void AtSubStart_Notify(void);
+extern void AtSubCommit_Notify(void);
+extern void AtSubAbort_Notify(void);
 
 /* signal handler for inbound notifies (SIGUSR2) */
 extern void NotifyInterruptHandler(SIGNAL_ARGS);
diff --git a/src/include/commands/tablecmds.h b/src/include/commands/tablecmds.h
index f9f03c1bd03..73021fbb91f 100644
--- a/src/include/commands/tablecmds.h
+++ b/src/include/commands/tablecmds.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/tablecmds.h,v 1.16 2004/05/05 04:48:47 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/commands/tablecmds.h,v 1.17 2004/07/01 00:51:40 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -42,6 +42,9 @@ extern void register_on_commit_action(Oid relid, OnCommitAction action);
 extern void remove_on_commit_action(Oid relid);
 
 extern void PreCommit_on_commit_actions(void);
-extern void AtEOXact_on_commit_actions(bool isCommit);
+extern void AtEOXact_on_commit_actions(bool isCommit, TransactionId xid);
+extern void AtEOSubXact_on_commit_actions(bool isCommit,
+										  TransactionId childXid,
+										  TransactionId parentXid);
 
 #endif   /* TABLECMDS_H */
diff --git a/src/include/commands/trigger.h b/src/include/commands/trigger.h
index 9083c1395f2..f9e4b2a396e 100644
--- a/src/include/commands/trigger.h
+++ b/src/include/commands/trigger.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/commands/trigger.h,v 1.45 2003/11/29 22:40:59 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/commands/trigger.h,v 1.46 2004/07/01 00:51:40 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -151,44 +151,12 @@ extern void ExecARUpdateTriggers(EState *estate,
 					 ItemPointer tupleid,
 					 HeapTuple newtuple);
 
-
-/*
- * Deferred trigger stuff
- */
-typedef struct DeferredTriggerStatusData
-{
-	Oid			dts_tgoid;
-	bool		dts_tgisdeferred;
-} DeferredTriggerStatusData;
-
-typedef struct DeferredTriggerStatusData *DeferredTriggerStatus;
-
-typedef struct DeferredTriggerEventItem
-{
-	Oid			dti_tgoid;
-	int32		dti_state;
-} DeferredTriggerEventItem;
-
-typedef struct DeferredTriggerEventData *DeferredTriggerEvent;
-
-typedef struct DeferredTriggerEventData
-{
-	DeferredTriggerEvent dte_next;		/* list link */
-	int32		dte_event;
-	Oid			dte_relid;
-	ItemPointerData dte_oldctid;
-	ItemPointerData dte_newctid;
-	int32		dte_n_items;
-	/* dte_item is actually a variable-size array, of length dte_n_items */
-	DeferredTriggerEventItem dte_item[1];
-} DeferredTriggerEventData;
-
-
-extern void DeferredTriggerInit(void);
 extern void DeferredTriggerBeginXact(void);
 extern void DeferredTriggerEndQuery(void);
 extern void DeferredTriggerEndXact(void);
 extern void DeferredTriggerAbortXact(void);
+extern void DeferredTriggerBeginSubXact(void);
+extern void DeferredTriggerEndSubXact(bool isCommit);
 
 extern void DeferredTriggerSetState(ConstraintsSetStmt *stmt);
 
diff --git a/src/include/executor/spi.h b/src/include/executor/spi.h
index 2e477e70f87..e283b55cecd 100644
--- a/src/include/executor/spi.h
+++ b/src/include/executor/spi.h
@@ -2,7 +2,7 @@
  *
  * spi.h
  *
- * $PostgreSQL: pgsql/src/include/executor/spi.h,v 1.44 2004/04/01 21:28:46 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/executor/spi.h,v 1.45 2004/07/01 00:51:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -119,5 +119,6 @@ extern void SPI_cursor_move(Portal portal, bool forward, int count);
 extern void SPI_cursor_close(Portal portal);
 
 extern void AtEOXact_SPI(bool isCommit);
+extern void AtEOSubXact_SPI(bool isCommit, TransactionId childXid);
 
 #endif   /* SPI_H */
diff --git a/src/include/executor/spi_priv.h b/src/include/executor/spi_priv.h
index dcafa1ccb9a..2785f6fe281 100644
--- a/src/include/executor/spi_priv.h
+++ b/src/include/executor/spi_priv.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/spi_priv.h,v 1.18 2004/03/21 22:29:11 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/executor/spi_priv.h,v 1.19 2004/07/01 00:51:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,6 +23,7 @@ typedef struct
 	MemoryContext procCxt;		/* procedure context */
 	MemoryContext execCxt;		/* executor context */
 	MemoryContext savedcxt;
+	TransactionId connectXid;	/* Xid of connecting transaction */
 } _SPI_connection;
 
 typedef struct
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 7defaf93f88..e992751f856 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.82 2004/05/31 19:24:05 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.83 2004/07/01 00:51:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -148,6 +148,8 @@ extern void InitBufferPoolAccess(void);
 extern char *ShowBufferUsage(void);
 extern void ResetBufferUsage(void);
 extern void AtEOXact_Buffers(bool isCommit);
+extern void AtSubStart_Buffers(void);
+extern void AtEOSubXact_Buffers(bool isCommit);
 extern void FlushBufferPool(void);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
 extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
diff --git a/src/include/storage/bufpage.h b/src/include/storage/bufpage.h
index 5f8012e0ed6..727ec508a3b 100644
--- a/src/include/storage/bufpage.h
+++ b/src/include/storage/bufpage.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.58 2004/06/05 17:42:46 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufpage.h,v 1.59 2004/07/01 00:51:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -126,10 +126,11 @@ typedef struct PageHeaderData
 typedef PageHeaderData *PageHeader;
 
 /*
- * Page layout version number 0 is for pre-7.3 Postgres releases.  The
- * current version number is 1, denoting a new HeapTupleHeader layout.
+ * Page layout version number 0 is for pre-7.3 Postgres releases.
+ * Releases 7.3 and 7.4 use 1, denoting a new HeapTupleHeader layout.
+ * Release 7.5 uses 2, denoting another HeapTupleHeader layout change.
  */
-#define PG_PAGE_LAYOUT_VERSION		1
+#define PG_PAGE_LAYOUT_VERSION		2
 
 
 /* ----------------------------------------------------------------
diff --git a/src/include/storage/lock.h b/src/include/storage/lock.h
index 8c7159c0cb0..650b3269497 100644
--- a/src/include/storage/lock.h
+++ b/src/include/storage/lock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.77 2004/05/28 05:13:29 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lock.h,v 1.78 2004/07/01 00:51:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,6 +26,14 @@ typedef struct PROC_QUEUE
 	int			size;			/* number of entries in list */
 } PROC_QUEUE;
 
+/* Release options for LockReleaseAll */
+typedef enum
+{
+	ReleaseAll,					/* All my locks */
+	ReleaseAllExceptSession,	/* All except session locks (Xid = 0) */
+	ReleaseGivenXids			/* Only locks with Xids in given array */
+} LockReleaseWhich;
+
 /* struct PGPROC is declared in storage/proc.h, but must forward-reference it */
 typedef struct PGPROC PGPROC;
 
@@ -165,11 +173,12 @@ typedef struct LOCK
  *
  * There are two possible kinds of proclock tags: a transaction (identified
  * both by the PGPROC of the backend running it, and the xact's own ID) and
- * a session (identified by backend PGPROC, with xid = InvalidTransactionId).
+ * a session (identified by backend PGPROC, with XID = InvalidTransactionId).
  *
  * Currently, session proclocks are used for user locks and for cross-xact
- * locks obtained for VACUUM.  We assume that a session lock never conflicts
- * with per-transaction locks obtained by the same backend.
+ * locks obtained for VACUUM.  Note that a single backend can hold locks
+ * under several different XIDs at once (including session locks).  We treat
+ * such locks as never conflicting (a backend can never block itself).
  *
  * The holding[] array counts the granted locks (of each type) represented
  * by this proclock. Note that there will be a proclock object, possibly with
@@ -177,11 +186,11 @@ typedef struct LOCK
  * Otherwise, proclock objects whose counts have gone to zero are recycled
  * as soon as convenient.
  *
- * Each PROCLOCK object is linked into lists for both the associated LOCK object
- * and the owning PGPROC object.	Note that the PROCLOCK is entered into these
- * lists as soon as it is created, even if no lock has yet been granted.
- * A PGPROC that is waiting for a lock to be granted will also be linked into
- * the lock's waitProcs queue.
+ * Each PROCLOCK object is linked into lists for both the associated LOCK
+ * object and the owning PGPROC object.  Note that the PROCLOCK is entered
+ * into these lists as soon as it is created, even if no lock has yet been
+ * granted.  A PGPROC that is waiting for a lock to be granted will also be
+ * linked into the lock's waitProcs queue.
  */
 typedef struct PROCLOCKTAG
 {
@@ -239,7 +248,7 @@ extern bool LockAcquire(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
 extern bool LockRelease(LOCKMETHODID lockmethodid, LOCKTAG *locktag,
 			TransactionId xid, LOCKMODE lockmode);
 extern bool LockReleaseAll(LOCKMETHODID lockmethodid, PGPROC *proc,
-			   bool allxids, TransactionId xid);
+			   LockReleaseWhich which, int nxids, TransactionId *xids);
 extern int LockCheckConflicts(LockMethod lockMethodTable,
 				   LOCKMODE lockmode,
 				   LOCK *lock, PROCLOCK *proclock, PGPROC *proc,
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index c7283f374cf..1551d7568c5 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.67 2003/12/01 21:59:25 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/proc.h,v 1.68 2004/07/01 00:51:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -103,7 +103,8 @@ extern int	ProcGlobalSemas(int maxBackends);
 extern void InitProcGlobal(int maxBackends);
 extern void InitProcess(void);
 extern void InitDummyProcess(int proctype);
-extern void ProcReleaseLocks(bool isCommit);
+extern void ProcReleaseLocks(LockReleaseWhich which,
+							 int nxids, TransactionId *xids);
 
 extern void ProcQueueInit(PROC_QUEUE *queue);
 extern int ProcSleep(LockMethod lockMethodTable, LOCKMODE lockmode,
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 52040432dcc..e4f0930ef7a 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.44 2004/06/02 17:28:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.45 2004/07/01 00:51:43 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -66,6 +66,9 @@ extern BlockNumber smgrtruncate(SMgrRelation reln, BlockNumber nblocks,
 extern void smgrimmedsync(SMgrRelation reln);
 extern void smgrDoPendingDeletes(bool isCommit);
 extern int	smgrGetPendingDeletes(bool forCommit, RelFileNode **ptr);
+extern void AtSubStart_smgr(void);
+extern void AtSubCommit_smgr(void);
+extern void AtSubAbort_smgr(void);
 extern void smgrcommit(void);
 extern void smgrabort(void);
 extern void smgrsync(void);
diff --git a/src/include/utils/catcache.h b/src/include/utils/catcache.h
index 9c8d3053fec..3ce54b99a25 100644
--- a/src/include/utils/catcache.h
+++ b/src/include/utils/catcache.h
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/catcache.h,v 1.48 2003/11/29 22:41:15 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/catcache.h,v 1.49 2004/07/01 00:51:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -101,6 +101,9 @@ typedef struct catctup
 	 * and negative entries is identical.
 	 */
 	int			refcount;		/* number of active references */
+	int		   *prev_refcount;	/* refcounts for upper subtransactions */
+	int			numpushes;		/* number of used refcounts in the array */
+	int			numalloc;		/* allocated size of array */
 	bool		dead;			/* dead but not yet removed? */
 	bool		negative;		/* negative cache entry? */
 	uint32		hash_value;		/* hash value for this tuple's keys */
@@ -139,6 +142,9 @@ typedef struct catclist
 	 */
 	Dlelem		cache_elem;		/* list member of per-catcache list */
 	int			refcount;		/* number of active references */
+	int		   *prev_refcount;	/* refcounts for upper subtransactions */
+	int			numpushes;		/* number of used refcounts in the array */
+	int			numalloc;		/* allocated size of array */
 	bool		dead;			/* dead but not yet removed? */
 	bool		ordered;		/* members listed in index order? */
 	short		nkeys;			/* number of lookup keys specified */
@@ -163,6 +169,8 @@ extern DLLIMPORT MemoryContext CacheMemoryContext;
 
 extern void CreateCacheMemoryContext(void);
 extern void AtEOXact_CatCache(bool isCommit);
+extern void AtSubStart_CatCache(void);
+extern void AtEOSubXact_CatCache(bool isCommit);
 
 extern CatCache *InitCatCache(int id, const char *relname, const char *indname,
 			 int reloidattr,
diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h
index b91682af88f..0a510cd6653 100644
--- a/src/include/utils/guc.h
+++ b/src/include/utils/guc.h
@@ -7,7 +7,7 @@
  * Copyright (c) 2000-2003, PostgreSQL Global Development Group
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
- * $PostgreSQL: pgsql/src/include/utils/guc.h,v 1.47 2004/05/28 05:13:32 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/guc.h,v 1.48 2004/07/01 00:51:44 tgl Exp $
  *--------------------------------------------------------------------
  */
 #ifndef GUC_H
@@ -175,14 +175,14 @@ extern void DefineCustomStringVariable(
 	GucStringAssignHook assign_hook,
 	GucShowHook show_hook);
 
-extern void EmittWarningsOnPlaceholders(const char* className);
+extern void EmitWarningsOnPlaceholders(const char* className);
 
 extern const char *GetConfigOption(const char *name);
 extern const char *GetConfigOptionResetString(const char *name);
 extern void ProcessConfigFile(GucContext context);
 extern void InitializeGUCOptions(void);
 extern void ResetAllOptions(void);
-extern void AtEOXact_GUC(bool isCommit);
+extern void AtEOXact_GUC(bool isCommit, bool isSubXact);
 extern void BeginReportingGUCOptions(void);
 extern void ParseLongOption(const char *string, char **name, char **value);
 extern bool set_config_option(const char *name, const char *value,
diff --git a/src/include/utils/guc_tables.h b/src/include/utils/guc_tables.h
index 62d2d571b29..d522f6d5e94 100644
--- a/src/include/utils/guc_tables.h
+++ b/src/include/utils/guc_tables.h
@@ -7,12 +7,31 @@
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  *
- *	  $PostgreSQL: pgsql/src/include/utils/guc_tables.h,v 1.11 2004/05/26 15:07:41 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/include/utils/guc_tables.h,v 1.12 2004/07/01 00:51:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
-#ifndef GUC_TABLES
-#define GUC_TABLES 1
+#ifndef GUC_TABLES_H
+#define GUC_TABLES_H 1
+
+/*
+ * GUC supports these types of variables:
+ */
+enum config_type
+{
+	PGC_BOOL,					/* boolean; record type is struct config_bool */
+	PGC_INT,					/* integer; record type is struct config_int */
+	PGC_REAL,					/* floating point; record type is struct config_real */
+	PGC_STRING					/* string; record type is struct config_string */
+};
+
+union config_var_value			/* one saved value of a GUC variable (see GucStack) */
+{
+	bool		boolval;		/* presumably the member used for PGC_BOOL -- confirm in guc.c */
+	int			intval;			/* presumably the member used for PGC_INT -- confirm in guc.c */
+	double		realval;		/* presumably the member used for PGC_REAL -- confirm in guc.c */
+	char	   *stringval;		/* presumably the member used for PGC_STRING -- confirm in guc.c */
+};
 
 /*
  * Groupings to help organize all the run-time options for display
@@ -56,15 +75,19 @@ enum config_group
 };
 
 /*
- * GUC supports these types of variables:
+ * Stack entry for saving the state of a variable prior to the current
+ * transaction
  */
-enum config_type
+typedef struct guc_stack
 {
-	PGC_BOOL,
-	PGC_INT,
-	PGC_REAL,
-	PGC_STRING
-};
+	struct guc_stack *prev;		/* previous stack item, if any */
+	int			nest_level;		/* nesting depth of cur transaction */
+	int			status;			/* previous status bits (GUC_HAVE_xxx, see below) */
+	GucSource	tentative_source;		/* source of the previous tentative val */
+	GucSource	source;			/* source of the previous actual value */
+	union config_var_value tentative_val;	/* previous tentative val */
+	union config_var_value value;			/* previous actual value */
+} GucStack;
 
 /*
  * Generic fields applicable to all types of variables
@@ -86,9 +109,9 @@ struct config_generic
 	enum config_type vartype;	/* type of variable (set only at startup) */
 	int			status;			/* status bits, see below */
 	GucSource	reset_source;	/* source of the reset_value */
-	GucSource	session_source; /* source of the session_value */
 	GucSource	tentative_source;		/* source of the tentative_value */
 	GucSource	source;			/* source of the current actual value */
+	GucStack   *stack;			/* stacked outside-of-transaction states */
 };
 
 /* bit values in flags field */
@@ -104,6 +127,7 @@ struct config_generic
 /* bit values in status field */
 #define GUC_HAVE_TENTATIVE	0x0001		/* tentative value is defined */
 #define GUC_HAVE_LOCAL		0x0002		/* a SET LOCAL has been executed */
+#define GUC_HAVE_STACK		0x0004		/* we have stacked prior value(s) */
 
 
 /* GUC records for specific variable types */
@@ -118,7 +142,6 @@ struct config_bool
 	GucBoolAssignHook assign_hook;
 	GucShowHook show_hook;
 	/* variable fields, initialized at runtime: */
-	bool		session_val;
 	bool		tentative_val;
 };
 
@@ -134,7 +157,6 @@ struct config_int
 	GucIntAssignHook assign_hook;
 	GucShowHook show_hook;
 	/* variable fields, initialized at runtime: */
-	int			session_val;
 	int			tentative_val;
 };
 
@@ -150,7 +172,6 @@ struct config_real
 	GucRealAssignHook assign_hook;
 	GucShowHook show_hook;
 	/* variable fields, initialized at runtime: */
-	double		session_val;
 	double		tentative_val;
 };
 
@@ -165,7 +186,6 @@ struct config_string
 	GucShowHook show_hook;
 	/* variable fields, initialized at runtime: */
 	char	   *reset_val;
-	char	   *session_val;
 	char	   *tentative_val;
 };
 
@@ -180,4 +200,4 @@ extern struct config_generic **get_guc_variables(void);
 
 extern void build_guc_variables(void);
 
-#endif
+#endif /* GUC_TABLES_H */
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index a2bad9cd06c..add5ca83c71 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.31 2004/05/06 16:10:57 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.32 2004/07/01 00:51:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,9 +22,15 @@ typedef void (*CacheCallbackFunction) (Datum arg, Oid relid);
 
 extern void AcceptInvalidationMessages(void);
 
-extern void AtEOXactInvalidationMessages(bool isCommit);
+extern void AtStart_Inval(void);
 
-extern void CommandEndInvalidationMessages(bool isCommit);
+extern void AtSubStart_Inval(void);
+
+extern void AtEOXact_Inval(bool isCommit);
+
+extern void AtSubEOXact_Inval(bool isCommit);
+
+extern void CommandEndInvalidationMessages(void);
 
 extern void CacheInvalidateHeapTuple(Relation relation, HeapTuple tuple);
 
diff --git a/src/include/utils/memutils.h b/src/include/utils/memutils.h
index 7865859c062..d2d7d4a9093 100644
--- a/src/include/utils/memutils.h
+++ b/src/include/utils/memutils.h
@@ -10,7 +10,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/memutils.h,v 1.55 2004/06/05 19:48:09 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/memutils.h,v 1.56 2004/07/01 00:51:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -73,6 +73,7 @@ extern DLLIMPORT MemoryContext PostmasterContext;
 extern DLLIMPORT MemoryContext CacheMemoryContext;
 extern DLLIMPORT MemoryContext MessageContext;
 extern DLLIMPORT MemoryContext TopTransactionContext;
+extern DLLIMPORT MemoryContext CurTransactionContext;
 
 /* These two are transient links to contexts owned by other objects: */
 extern DLLIMPORT MemoryContext QueryContext;
diff --git a/src/include/utils/portal.h b/src/include/utils/portal.h
index 2819295e837..3437dc448a3 100644
--- a/src/include/utils/portal.h
+++ b/src/include/utils/portal.h
@@ -39,7 +39,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/portal.h,v 1.48 2003/11/29 22:41:16 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/portal.h,v 1.49 2004/07/01 00:51:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -167,6 +167,9 @@ extern void EnablePortalManager(void);
 extern void AtCommit_Portals(void);
 extern void AtAbort_Portals(void);
 extern void AtCleanup_Portals(void);
+extern void AtSubCommit_Portals(TransactionId parentXid);
+extern void AtSubAbort_Portals(void);
+extern void AtSubCleanup_Portals(void);
 extern Portal CreatePortal(const char *name, bool allowDup, bool dupSilent);
 extern Portal CreateNewPortal(void);
 extern void PortalDrop(Portal portal, bool isError);
diff --git a/src/include/utils/rel.h b/src/include/utils/rel.h
index e5008e56ea5..b7f85eda68e 100644
--- a/src/include/utils/rel.h
+++ b/src/include/utils/rel.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.74 2004/05/08 19:09:25 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/rel.h,v 1.75 2004/07/01 00:51:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -110,6 +110,9 @@ typedef struct RelationData
 	BlockNumber rd_targblock;	/* current insertion target block, or
 								 * InvalidBlockNumber */
 	int			rd_refcnt;		/* reference count */
+	int		   *rd_prevrefcnt;	/* reference count stack */
+	int			rd_numalloc;	/* stack allocated size */
+	int			rd_numpushed;	/* stack used size */
 	bool		rd_isnew;		/* rel was created in current xact */
 
 	/*
diff --git a/src/include/utils/relcache.h b/src/include/utils/relcache.h
index da82f4f6137..47f46190df7 100644
--- a/src/include/utils/relcache.h
+++ b/src/include/utils/relcache.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.40 2004/06/18 06:14:21 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/relcache.h,v 1.41 2004/07/01 00:51:45 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,7 +65,9 @@ extern void RelationCacheInvalidateEntry(Oid relationId, RelFileNode *rnode);
 
 extern void RelationCacheInvalidate(void);
 
-extern void AtEOXact_RelationCache(bool commit);
+extern void AtEOXact_RelationCache(bool isCommit);
+extern void AtSubStart_RelationCache(void);
+extern void AtEOSubXact_RelationCache(bool isCommit);
 
 /*
  * Routines to help manage rebuilding of relcache init file
diff --git a/src/test/regress/expected/transactions.out b/src/test/regress/expected/transactions.out
index b72ca5f36e5..6cc89b5c5e4 100644
--- a/src/test/regress/expected/transactions.out
+++ b/src/test/regress/expected/transactions.out
@@ -68,3 +68,70 @@ ERROR:  transaction is read-only
 START TRANSACTION READ WRITE;
 DROP TABLE writetest; -- ok
 COMMIT;
+-- Subtransactions, basic tests
+-- create & drop tables
+SET SESSION CHARACTERISTICS AS TRANSACTION READ WRITE;
+CREATE TABLE foobar (a int);
+BEGIN;
+	CREATE TABLE foo (a int);
+	BEGIN;
+		DROP TABLE foo;
+		CREATE TABLE bar (a int);
+	ROLLBACK;
+	BEGIN;
+		CREATE TABLE baz (a int);
+	COMMIT;
+	drop TABLE foobar;
+	CREATE TABLE barbaz (a int);
+COMMIT;
+-- should exist: barbaz, baz, foo
+SELECT * FROM foo;		-- should be empty
+ a 
+---
+(0 rows)
+
+SELECT * FROM bar;		-- shouldn't exist
+ERROR:  relation "bar" does not exist
+SELECT * FROM barbaz;	-- should be empty
+ a 
+---
+(0 rows)
+
+SELECT * FROM baz;		-- should be empty
+ a 
+---
+(0 rows)
+
+-- inserts
+BEGIN;
+	INSERT INTO foo VALUES (1);
+	BEGIN;
+		INSERT into bar VALUES (1);
+ERROR:  relation "bar" does not exist
+	ROLLBACK;
+	BEGIN;
+		INSERT into barbaz VALUES (1);
+	COMMIT;
+	BEGIN;
+		BEGIN;
+			INSERT INTO foo VALUES (2);
+		COMMIT;
+	ROLLBACK;
+	INSERT INTO foo VALUES (3);
+COMMIT;
+SELECT * FROM foo;		-- should have 1 and 3
+ a 
+---
+ 1
+ 3
+(2 rows)
+
+SELECT * FROM barbaz;	-- should have 1
+ a 
+---
+ 1
+(1 row)
+
+DROP TABLE foo;
+DROP TABLE baz;
+DROP TABLE barbaz;
diff --git a/src/test/regress/expected/without_oid.out b/src/test/regress/expected/without_oid.out
index ef373e6e3c3..708c4c5e94d 100644
--- a/src/test/regress/expected/without_oid.out
+++ b/src/test/regress/expected/without_oid.out
@@ -1,8 +1,18 @@
 --
 -- WITHOUT OID
 --
-CREATE TABLE wi (i INT) WITH OIDS;
-CREATE TABLE wo (i INT) WITHOUT OIDS;
+--
+-- This test tries to verify that WITHOUT OIDS actually saves space.
+-- On machines where MAXALIGN is 8, WITHOUT OIDS may or may not save any
+-- space, depending on the size of the tuple header + null bitmap.
+-- As of 7.5 we need a 9-bit null bitmap to force the difference to appear.
+--
+CREATE TABLE wi (i INT,
+                 n1 int, n2 int, n3 int, n4 int,
+                 n5 int, n6 int, n7 int, n8 int) WITH OIDS;
+CREATE TABLE wo (i INT,
+                 n1 int, n2 int, n3 int, n4 int,
+                 n5 int, n6 int, n7 int, n8 int) WITHOUT OIDS;
 INSERT INTO wi VALUES (1);  -- 1
 INSERT INTO wo SELECT i FROM wi;  -- 1
 INSERT INTO wo SELECT i+1 FROM wi;  -- 1+1=2
@@ -24,6 +34,15 @@ INSERT INTO wo SELECT i+896 FROM wi;  -- 896+2448=3344
 INSERT INTO wo SELECT i+3344 FROM wo;  -- 3344+3344=6688
 INSERT INTO wi SELECT i+2448 FROM wo;  -- 2448+6688=9136
 INSERT INTO wo SELECT i+6688 FROM wi WHERE i<=2448;  -- 6688+2448=9136
+SELECT count(oid) FROM wi;
+ count 
+-------
+  9136
+(1 row)
+
+-- should fail
+SELECT count(oid) FROM wo;
+ERROR:  column "oid" does not exist
 VACUUM ANALYZE wi;
 VACUUM ANALYZE wo;
 SELECT min(relpages) < max(relpages), min(reltuples) - max(reltuples)
diff --git a/src/test/regress/sql/transactions.sql b/src/test/regress/sql/transactions.sql
index 10ef759998b..a656c393b4f 100644
--- a/src/test/regress/sql/transactions.sql
+++ b/src/test/regress/sql/transactions.sql
@@ -54,3 +54,48 @@ CREATE TABLE test AS SELECT * FROM writetest; -- fail
 START TRANSACTION READ WRITE;
 DROP TABLE writetest; -- ok
 COMMIT;
+
+-- Subtransactions, basic tests
+-- create & drop tables
+SET SESSION CHARACTERISTICS AS TRANSACTION READ WRITE;
+CREATE TABLE foobar (a int);
+BEGIN;
+	CREATE TABLE foo (a int);
+	BEGIN;
+		DROP TABLE foo;
+		CREATE TABLE bar (a int);
+	ROLLBACK;
+	BEGIN;
+		CREATE TABLE baz (a int);
+	COMMIT;
+	drop TABLE foobar;
+	CREATE TABLE barbaz (a int);
+COMMIT;
+-- should exist: barbaz, baz, foo
+SELECT * FROM foo;		-- should be empty
+SELECT * FROM bar;		-- shouldn't exist
+SELECT * FROM barbaz;	-- should be empty
+SELECT * FROM baz;		-- should be empty
+
+-- inserts
+BEGIN;
+	INSERT INTO foo VALUES (1);
+	BEGIN;
+		INSERT into bar VALUES (1);
+	ROLLBACK;
+	BEGIN;
+		INSERT into barbaz VALUES (1);
+	COMMIT;
+	BEGIN;
+		BEGIN;
+			INSERT INTO foo VALUES (2);
+		COMMIT;
+	ROLLBACK;
+	INSERT INTO foo VALUES (3);
+COMMIT;
+SELECT * FROM foo;		-- should have 1 and 3
+SELECT * FROM barbaz;	-- should have 1
+
+DROP TABLE foo;
+DROP TABLE baz;
+DROP TABLE barbaz;
diff --git a/src/test/regress/sql/without_oid.sql b/src/test/regress/sql/without_oid.sql
index 4cb961941a9..2c176c8e3e2 100644
--- a/src/test/regress/sql/without_oid.sql
+++ b/src/test/regress/sql/without_oid.sql
@@ -2,8 +2,19 @@
 -- WITHOUT OID
 --
 
-CREATE TABLE wi (i INT) WITH OIDS;
-CREATE TABLE wo (i INT) WITHOUT OIDS;
+--
+-- This test tries to verify that WITHOUT OIDS actually saves space.
+-- On machines where MAXALIGN is 8, WITHOUT OIDS may or may not save any
+-- space, depending on the size of the tuple header + null bitmap.
+-- As of 7.5 we need a 9-bit null bitmap to force the difference to appear.
+--
+CREATE TABLE wi (i INT,
+                 n1 int, n2 int, n3 int, n4 int,
+                 n5 int, n6 int, n7 int, n8 int) WITH OIDS;
+CREATE TABLE wo (i INT,
+                 n1 int, n2 int, n3 int, n4 int,
+                 n5 int, n6 int, n7 int, n8 int) WITHOUT OIDS;
+
 INSERT INTO wi VALUES (1);  -- 1
 INSERT INTO wo SELECT i FROM wi;  -- 1
 INSERT INTO wo SELECT i+1 FROM wi;  -- 1+1=2
@@ -25,8 +36,14 @@ INSERT INTO wo SELECT i+896 FROM wi;  -- 896+2448=3344
 INSERT INTO wo SELECT i+3344 FROM wo;  -- 3344+3344=6688
 INSERT INTO wi SELECT i+2448 FROM wo;  -- 2448+6688=9136
 INSERT INTO wo SELECT i+6688 FROM wi WHERE i<=2448;  -- 6688+2448=9136
+
+SELECT count(oid) FROM wi;
+-- should fail
+SELECT count(oid) FROM wo;
+
 VACUUM ANALYZE wi;
 VACUUM ANALYZE wo;
+
 SELECT min(relpages) < max(relpages), min(reltuples) - max(reltuples)
   FROM pg_class
  WHERE relname IN ('wi', 'wo');
-- 
GitLab