diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 55d5ef9b80ac8a5cbc9f702395f8dcef82ce5867..f938cdcc5ba55f86bd24ae0c5d7e8e84091fe1a5 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.172 2004/07/27 05:10:49 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/transam/xact.c,v 1.173 2004/07/28 14:23:27 tgl Exp $
  *
  * NOTES
  *		Transaction aborts can now occur two ways:
@@ -224,6 +224,7 @@ typedef struct TransactionStateData
 	ResourceOwner	curTransactionOwner;	/* my query resources */
 	List		   *childXids;				/* subcommitted child XIDs */
 	AclId			currentUser;			/* subxact start current_user */
+	bool			prevXactReadOnly;		/* entry-time xact r/o state */
 	struct TransactionStateData *parent;	/* back link to parent */
 } TransactionStateData;
 
@@ -284,6 +285,7 @@ static TransactionStateData TopTransactionStateData = {
 	NULL,						/* cur transaction resource owner */
 	NIL,						/* subcommitted child Xids */
 	0,							/* entry-time current userid */
+	false,						/* entry-time xact r/o state */
 	NULL						/* link to parent state block */
 };
 
@@ -1242,7 +1244,8 @@ StartTransaction(void)
 	 * check the current transaction state
 	 */
 	if (s->state != TRANS_DEFAULT)
-		elog(WARNING, "StartTransaction and not in default state");
+		elog(WARNING, "StartTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	/*
 	 * set the current transaction state information appropriately during
@@ -1287,6 +1290,8 @@ StartTransaction(void)
 	 * you won't because it doesn't work during startup; the userid isn't
 	 * set yet during a backend's first transaction start.  We only use
 	 * the currentUser field in sub-transaction state structs.
+	 *
+	 * prevXactReadOnly is also valid only in sub-transactions.
 	 */
 
 	/*
@@ -1319,7 +1324,8 @@ CommitTransaction(void)
 	 * check the current transaction state
 	 */
 	if (s->state != TRANS_INPROGRESS)
-		elog(WARNING, "CommitTransaction and not in in-progress state");
+		elog(WARNING, "CommitTransaction while in %s state",
+			 TransStateAsString(s->state));
 	Assert(s->parent == NULL);
 
 	/*
@@ -1351,14 +1357,14 @@ CommitTransaction(void)
 
 	AtCommit_Portals();
 
-	/* handle commit for large objects [ PA, 7/17/98 ] */
-	/* XXX probably this does not belong here */
-	lo_commit(true);
+	/* close large objects before lower-level cleanup */
+	AtEOXact_LargeObject(true);
 
 	/* NOTIFY commit must come before lower-level cleanup */
 	AtCommit_Notify();
 
 	/* Update the flat password file if we changed pg_shadow or pg_group */
+	/* This should be the last step before commit */
 	AtEOXact_UpdatePasswordFile(true);
 
 	/*
@@ -1486,7 +1492,8 @@ AbortTransaction(void)
 	 * check the current transaction state
 	 */
 	if (s->state != TRANS_INPROGRESS)
-		elog(WARNING, "AbortTransaction and not in in-progress state");
+		elog(WARNING, "AbortTransaction while in %s state",
+			 TransStateAsString(s->state));
 	Assert(s->parent == NULL);
 
 	/*
@@ -1515,7 +1522,7 @@ AbortTransaction(void)
 	 */
 	DeferredTriggerAbortXact();
 	AtAbort_Portals();
-	lo_commit(false);			/* 'false' means it's abort */
+	AtEOXact_LargeObject(false);			/* 'false' means it's abort */
 	AtAbort_Notify();
 	AtEOXact_UpdatePasswordFile(false);
 
@@ -1870,6 +1877,9 @@ CleanupAbortedSubTransactions(bool returnName)
 		s = CurrentTransactionState;
 	}
 
+	AssertState(s->blockState == TBLOCK_SUBINPROGRESS ||
+				s->blockState == TBLOCK_INPROGRESS);
+
 	return name;
 }
 
@@ -2866,7 +2876,8 @@ StartSubTransaction(void)
 	TransactionState s = CurrentTransactionState;
 
 	if (s->state != TRANS_DEFAULT)
-		elog(WARNING, "StartSubTransaction and not in default state");
+		elog(WARNING, "StartSubTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	s->state = TRANS_START;
 
@@ -2889,6 +2900,7 @@ StartSubTransaction(void)
 	 * Finish setup of other transaction state fields.
 	 */
 	s->currentUser = GetUserId();
+	s->prevXactReadOnly = XactReadOnly;
 	
 	/*
 	 * Initialize other subsystems for new subtransaction
@@ -2913,7 +2925,8 @@ CommitSubTransaction(void)
 	ShowTransactionState("CommitSubTransaction");
 
 	if (s->state != TRANS_INPROGRESS)
-		elog(WARNING, "CommitSubTransaction and not in in-progress state");
+		elog(WARNING, "CommitSubTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	/* Pre-commit processing */
 	AtSubCommit_Portals(s->parent->transactionIdData,
@@ -2930,9 +2943,18 @@ CommitSubTransaction(void)
 	/* Post-commit cleanup */
 	AtSubCommit_smgr();
 
-	AtSubEOXact_Inval(true);
+	AtEOSubXact_Inval(true);
 	AtEOSubXact_SPI(true, s->transactionIdData);
 
+	AtEOSubXact_LargeObject(true, s->transactionIdData,
+							s->parent->transactionIdData);
+	AtEOSubXact_UpdatePasswordFile(true, s->transactionIdData,
+								   s->parent->transactionIdData);
+	AtEOSubXact_Files(true, s->transactionIdData,
+					  s->parent->transactionIdData);
+	AtEOSubXact_Namespace(true, s->transactionIdData,
+						  s->parent->transactionIdData);
+
 	/*
 	 * Note that we just release the resource owner's resources and don't
 	 * delete it.  This is because locks are not actually released here.
@@ -2953,6 +2975,13 @@ CommitSubTransaction(void)
 	AtEOSubXact_on_commit_actions(true, s->transactionIdData,
 								  s->parent->transactionIdData);
 
+	/*
+	 * We need to restore the upper transaction's read-only state,
+	 * in case the upper is read-write while the child is read-only;
+	 * GUC will incorrectly think it should leave the child state in place.
+	 */
+	XactReadOnly = s->prevXactReadOnly;
+
 	CurrentResourceOwner = s->parent->curTransactionOwner;
 	CurTransactionResourceOwner = s->parent->curTransactionOwner;
 	s->curTransactionOwner = NULL;
@@ -2973,7 +3002,8 @@ AbortSubTransaction(void)
 	ShowTransactionState("AbortSubTransaction");
 
 	if (s->state != TRANS_INPROGRESS)
-		elog(WARNING, "AbortSubTransaction and not in in-progress state");
+		elog(WARNING, "AbortSubTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	HOLD_INTERRUPTS();
 
@@ -3010,7 +3040,16 @@ AbortSubTransaction(void)
 	AtEOSubXact_SPI(false, s->transactionIdData);
 	AtSubAbort_Portals(s->parent->transactionIdData,
 					   s->parent->curTransactionOwner);
-	AtSubEOXact_Inval(false);
+	AtEOSubXact_Inval(false);
+
+	AtEOSubXact_LargeObject(false, s->transactionIdData,
+							s->parent->transactionIdData);
+	AtEOSubXact_UpdatePasswordFile(false, s->transactionIdData,
+								   s->parent->transactionIdData);
+	AtEOSubXact_Files(false, s->transactionIdData,
+					  s->parent->transactionIdData);
+	AtEOSubXact_Namespace(false, s->transactionIdData,
+						  s->parent->transactionIdData);
 
 	ResourceOwnerRelease(s->curTransactionOwner,
 						 RESOURCE_RELEASE_BEFORE_LOCKS,
@@ -3041,6 +3080,13 @@ AbortSubTransaction(void)
 	 */
 	SetUserId(s->currentUser);
 
+	/*
+	 * Restore the upper transaction's read-only state, too.  This should
+	 * be redundant with GUC's cleanup but we may as well do it for
+	 * consistency with the commit case.
+	 */
+	XactReadOnly = s->prevXactReadOnly;
+
 	CommandCounterIncrement();
 
 	RESUME_INTERRUPTS();
@@ -3057,7 +3103,8 @@ CleanupSubTransaction(void)
 	ShowTransactionState("CleanupSubTransaction");
 
 	if (s->state != TRANS_ABORT)
-		elog(WARNING, "CleanupSubTransaction and not in aborted state");
+		elog(WARNING, "CleanupSubTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	AtSubCleanup_Portals();
 
@@ -3088,7 +3135,8 @@ StartAbortedSubTransaction(void)
 	TransactionState s = CurrentTransactionState;
 
 	if (s->state != TRANS_DEFAULT)
-		elog(WARNING, "StartAbortedSubTransaction and not in default state");
+		elog(WARNING, "StartAbortedSubTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	s->state = TRANS_START;
 
@@ -3168,7 +3216,8 @@ PopTransaction(void)
 	TransactionState s = CurrentTransactionState;
 
 	if (s->state != TRANS_DEFAULT)
-		elog(WARNING, "PopTransaction and not in default state");
+		elog(WARNING, "PopTransaction while in %s state",
+			 TransStateAsString(s->state));
 
 	if (s->parent == NULL)
 		elog(FATAL, "PopTransaction with no parent");
diff --git a/src/backend/catalog/namespace.c b/src/backend/catalog/namespace.c
index b412023fe2873ed62edd4eb10cc9976693f02358..7bc388cf14bc90fdd715605aefdbb9bcc2b9c1d5 100644
--- a/src/backend/catalog/namespace.c
+++ b/src/backend/catalog/namespace.c
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/catalog/namespace.c,v 1.67 2004/06/18 06:13:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/catalog/namespace.c,v 1.68 2004/07/28 14:23:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -107,12 +107,17 @@ static bool namespaceSearchPathValid = true;
  * myTempNamespace is InvalidOid until and unless a TEMP namespace is set up
  * in a particular backend session (this happens when a CREATE TEMP TABLE
  * command is first executed).	Thereafter it's the OID of the temp namespace.
- * firstTempTransaction flags whether we've committed creation of the TEMP
- * namespace or not.
+ *
+ * myTempNamespaceXID shows whether we've created the TEMP namespace in the
+ * current transaction.  The TransactionId propagates up the transaction tree,
+ * so the main transaction will correctly recognize the flag if all
+ * intermediate subtransactions commit.  When it is InvalidTransactionId,
+ * we either haven't made the TEMP namespace yet, or have successfully
+ * committed its creation, depending on whether myTempNamespace is valid.
  */
 static Oid	myTempNamespace = InvalidOid;
 
-static bool firstTempTransaction = false;
+static TransactionId myTempNamespaceXID = InvalidTransactionId;
 
 /*
  * "Special" namespace for CREATE SCHEMA.  If set, it's the first search
@@ -1688,7 +1693,9 @@ InitTempTableNamespace(void)
 	 */
 	myTempNamespace = namespaceId;
 
-	firstTempTransaction = true;
+	/* It should not be done already. */
+	AssertState(myTempNamespaceXID == InvalidTransactionId);
+	myTempNamespaceXID = GetCurrentTransactionId();
 
 	namespaceSearchPathValid = false;	/* need to rebuild list */
 }
@@ -1707,7 +1714,7 @@ AtEOXact_Namespace(bool isCommit)
 	 * temp tables at backend shutdown.  (We only want to register the
 	 * callback once per session, so this is a good place to do it.)
 	 */
-	if (firstTempTransaction)
+	if (myTempNamespaceXID == GetCurrentTransactionId())
 	{
 		if (isCommit)
 			on_shmem_exit(RemoveTempRelationsCallback, 0);
@@ -1716,7 +1723,7 @@ AtEOXact_Namespace(bool isCommit)
 			myTempNamespace = InvalidOid;
 			namespaceSearchPathValid = false;	/* need to rebuild list */
 		}
-		firstTempTransaction = false;
+		myTempNamespaceXID = InvalidTransactionId;
 	}
 
 	/*
@@ -1729,6 +1736,32 @@ AtEOXact_Namespace(bool isCommit)
 	}
 }
 
+/*
+ * AtEOSubXact_Namespace
+ *
+ * At subtransaction commit, propagate the temp-namespace-creation
+ * flag (myTempNamespaceXID) from myXid to parentXid.
+ *
+ * At subtransaction abort, forget the flag and the namespace if myXid set it.
+ */
+void
+AtEOSubXact_Namespace(bool isCommit, TransactionId myXid,
+					  TransactionId parentXid)
+{
+	if (myTempNamespaceXID == myXid)	/* flag is held by this subxact */
+	{
+		if (isCommit)
+			myTempNamespaceXID = parentXid;	/* parent now holds the flag */
+		else
+		{
+			myTempNamespaceXID = InvalidTransactionId;
+			/* TEMP namespace creation failed, so reset state */
+			myTempNamespace = InvalidOid;
+			namespaceSearchPathValid = false;	/* need to rebuild list */
+		}
+	}
+}
+
 /*
  * Remove all relations in the specified temp namespace.
  *
diff --git a/src/backend/commands/user.c b/src/backend/commands/user.c
index 255428fadc7d62491e51c7371d9adb07d18503f2..da8f92aee7ca4e551c46020e3498ec927cb0179e 100644
--- a/src/backend/commands/user.c
+++ b/src/backend/commands/user.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/commands/user.c,v 1.141 2004/05/26 04:41:12 neilc Exp $
+ * $PostgreSQL: pgsql/src/backend/commands/user.c,v 1.142 2004/07/28 14:23:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -44,8 +44,30 @@
 
 extern bool Password_encryption;
 
-static bool user_file_update_needed = false;
-static bool group_file_update_needed = false;
+/*
+ * The need-to-update-files flags are a pair of TransactionIds that show what
+ * level of the transaction tree requested the update.  The macros below
+ * register an update by saving the current TransactionId in the flag, unless
+ * it already holds a valid TransactionId.  If that (sub)transaction aborts,
+ * the flag is reset to InvalidTransactionId; if it commits, the flag is
+ * changed to its parent's TransactionId.  This way the value is propagated
+ * up to the topmost transaction, which will update the files at commit if
+ * a valid TransactionId is detected.
+ */
+static TransactionId user_file_update_xid = InvalidTransactionId;
+static TransactionId group_file_update_xid = InvalidTransactionId;
+
+#define user_file_update_needed() \
+	do { \
+		if (user_file_update_xid == InvalidTransactionId) \
+			user_file_update_xid = GetCurrentTransactionId(); \
+	} while (0)
+
+#define group_file_update_needed() \
+	do { \
+		if (group_file_update_xid == InvalidTransactionId) \
+			group_file_update_xid = GetCurrentTransactionId(); \
+	} while (0)
 
 
 static void CheckPgUserAclNotNull(void);
@@ -402,8 +424,8 @@ write_user_file(Relation urel)
 Datum
 update_pg_pwd_and_pg_group(PG_FUNCTION_ARGS)
 {
-	user_file_update_needed = true;
-	group_file_update_needed = true;
+	user_file_update_needed();
+	group_file_update_needed();
 
 	return PointerGetDatum(NULL);
 }
@@ -429,13 +451,14 @@ AtEOXact_UpdatePasswordFile(bool isCommit)
 	Relation	urel = NULL;
 	Relation	grel = NULL;
 
-	if (!(user_file_update_needed || group_file_update_needed))
+	if (user_file_update_xid == InvalidTransactionId &&
+		group_file_update_xid == InvalidTransactionId)
 		return;
 
 	if (!isCommit)
 	{
-		user_file_update_needed = false;
-		group_file_update_needed = false;
+		user_file_update_xid = InvalidTransactionId;
+		group_file_update_xid = InvalidTransactionId;
 		return;
 	}
 
@@ -447,22 +470,22 @@ AtEOXact_UpdatePasswordFile(bool isCommit)
 	 * pg_shadow or pg_group, which likely won't have gotten a strong
 	 * enough lock), so get the locks we need before writing anything.
 	 */
-	if (user_file_update_needed)
+	if (user_file_update_xid != InvalidTransactionId)
 		urel = heap_openr(ShadowRelationName, ExclusiveLock);
-	if (group_file_update_needed)
+	if (group_file_update_xid != InvalidTransactionId)
 		grel = heap_openr(GroupRelationName, ExclusiveLock);
 
 	/* Okay to write the files */
-	if (user_file_update_needed)
+	if (user_file_update_xid != InvalidTransactionId)
 	{
-		user_file_update_needed = false;
+		user_file_update_xid = InvalidTransactionId;
 		write_user_file(urel);
 		heap_close(urel, NoLock);
 	}
 
-	if (group_file_update_needed)
+	if (group_file_update_xid != InvalidTransactionId)
 	{
-		group_file_update_needed = false;
+		group_file_update_xid = InvalidTransactionId;
 		write_group_file(grel);
 		heap_close(grel, NoLock);
 	}
@@ -473,7 +496,33 @@ AtEOXact_UpdatePasswordFile(bool isCommit)
 	SendPostmasterSignal(PMSIGNAL_PASSWORD_CHANGE);
 }
 
+/*
+ * AtEOSubXact_UpdatePasswordFile
+ *
+ * At subtransaction commit, hand any update request registered by myXid up
+ * to parentXid; at abort, discard the requests that myXid itself registered.
+ */
+void
+AtEOSubXact_UpdatePasswordFile(bool isCommit, TransactionId myXid,
+							   TransactionId parentXid)
+{
+	if (isCommit)
+	{
+		if (user_file_update_xid == myXid)
+			user_file_update_xid = parentXid;	/* parent inherits request */
+
+		if (group_file_update_xid == myXid)
+			group_file_update_xid = parentXid;	/* parent inherits request */
+	}
+	else
+	{
+		if (user_file_update_xid == myXid)
+			user_file_update_xid = InvalidTransactionId;	/* forget it */
 
+		if (group_file_update_xid == myXid)
+			group_file_update_xid = InvalidTransactionId;	/* forget it */
+	}
+}
 
 /*
  * CREATE USER
@@ -728,7 +777,7 @@ CreateUser(CreateUserStmt *stmt)
 	/*
 	 * Set flag to update flat password file at commit.
 	 */
-	user_file_update_needed = true;
+	user_file_update_needed();
 }
 
 
@@ -925,7 +974,7 @@ AlterUser(AlterUserStmt *stmt)
 	/*
 	 * Set flag to update flat password file at commit.
 	 */
-	user_file_update_needed = true;
+	user_file_update_needed();
 }
 
 
@@ -1147,7 +1196,7 @@ DropUser(DropUserStmt *stmt)
 	/*
 	 * Set flag to update flat password file at commit.
 	 */
-	user_file_update_needed = true;
+	user_file_update_needed();
 }
 
 
@@ -1233,7 +1282,7 @@ RenameUser(const char *oldname, const char *newname)
 	ReleaseSysCache(oldtuple);
 	heap_close(rel, NoLock);
 
-	user_file_update_needed = true;
+	user_file_update_needed();
 }
 
 
@@ -1438,7 +1487,7 @@ CreateGroup(CreateGroupStmt *stmt)
 	/*
 	 * Set flag to update flat group file at commit.
 	 */
-	group_file_update_needed = true;
+	group_file_update_needed();
 }
 
 
@@ -1590,7 +1639,7 @@ AlterGroup(AlterGroupStmt *stmt, const char *tag)
 	/*
 	 * Set flag to update flat group file at commit.
 	 */
-	group_file_update_needed = true;
+	group_file_update_needed();
 }
 
 /*
@@ -1730,7 +1779,7 @@ DropGroup(DropGroupStmt *stmt)
 	/*
 	 * Set flag to update flat group file at commit.
 	 */
-	group_file_update_needed = true;
+	group_file_update_needed();
 }
 
 
@@ -1776,5 +1825,5 @@ RenameGroup(const char *oldname, const char *newname)
 	heap_close(rel, NoLock);
 	heap_freetuple(tup);
 
-	group_file_update_needed = true;
+	group_file_update_needed();
 }
diff --git a/src/backend/libpq/be-fsstubs.c b/src/backend/libpq/be-fsstubs.c
index ed19e76db2ccd3d3ea1ce4b4e4461ea41aaf7bc6..21d1f3ddcfe04f3b4fb5228897d8ccaf35ed91fe 100644
--- a/src/backend/libpq/be-fsstubs.c
+++ b/src/backend/libpq/be-fsstubs.c
@@ -1,24 +1,22 @@
 /*-------------------------------------------------------------------------
  *
  * be-fsstubs.c
- *	  support for filesystem operations on large objects
+ *	  Builtin functions for open/close/read/write operations on large objects
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/libpq/be-fsstubs.c,v 1.70 2004/02/10 01:55:25 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/libpq/be-fsstubs.c,v 1.71 2004/07/28 14:23:28 tgl Exp $
  *
  * NOTES
  *	  This should be moved to a more appropriate place.  It is here
  *	  for lack of a better place.
  *
- *	  Builtin functions for open/close/read/write operations on large objects.
- *
  *	  These functions operate in a private MemoryContext, which means
- *	  that large object descriptors hang around until we destroy the context.
- *	  That happens in lo_commit().	It'd be possible to prolong the lifetime
+ *	  that large object descriptors hang around until we destroy the context
+ *	  at transaction end.  It'd be possible to prolong the lifetime
  *	  of the context so that LO FDs are good across transactions (for example,
  *	  we could release the context only if we see that no FDs remain open).
  *	  But we'd need additional state in order to do the right thing at the
@@ -29,6 +27,11 @@
  *	  existing documented semantics of LO FDs: they're only good within a
  *	  transaction.
  *
+ *	  As of PostgreSQL 7.5, much of the angst expressed above is no longer
+ *	  relevant, and in fact it'd be pretty easy to allow LO FDs to stay
+ *	  open across transactions.  However backwards compatibility suggests
+ *	  that we should stick to the status quo.
+ *
  *-------------------------------------------------------------------------
  */
 
@@ -46,8 +49,6 @@
 #include "utils/memutils.h"
 
 
-/* [PA] is Pascal André <andre@via.ecp.fr> */
-
 /*#define FSDB 1*/
 #define BUFSIZE			8192
 
@@ -68,6 +69,7 @@ static MemoryContext fscxt = NULL;
 static int	newLOfd(LargeObjectDesc *lobjCookie);
 static void deleteLOfd(int fd);
 
+
 /*****************************************************************************
  *	File Interfaces for Large Objects
  *****************************************************************************/
@@ -399,7 +401,7 @@ lo_import(PG_FUNCTION_ARGS)
 	lobjOid = lobj->id;
 
 	/*
-	 * read in from the Unix file and write to the inversion file
+	 * read in from the filesystem and write to the inversion file
 	 */
 	while ((nbytes = FileRead(fd, buf, BUFSIZE)) > 0)
 	{
@@ -471,7 +473,7 @@ lo_export(PG_FUNCTION_ARGS)
 						fnamebuf)));
 
 	/*
-	 * read in from the inversion file and write to the Unix file
+	 * read in from the inversion file and write to the filesystem
 	 */
 	while ((nbytes = inv_read(lobj, buf, BUFSIZE)) > 0)
 	{
@@ -490,11 +492,11 @@ lo_export(PG_FUNCTION_ARGS)
 }
 
 /*
- * lo_commit -
- *		 prepares large objects for transaction commit [PA, 7/17/98]
+ * AtEOXact_LargeObject -
+ *		 cleans up large objects at transaction commit or abort
  */
 void
-lo_commit(bool isCommit)
+AtEOXact_LargeObject(bool isCommit)
 {
 	int			i;
 	MemoryContext currentContext;
@@ -505,8 +507,8 @@ lo_commit(bool isCommit)
 	currentContext = MemoryContextSwitchTo(fscxt);
 
 	/*
-	 * Clean out still-open index scans (not necessary if aborting) and
-	 * clear cookies array so that LO fds are no longer good.
+	 * Close LO fds and clear cookies array so that LO fds are no longer good.
+	 * On abort we skip the close step.
 	 */
 	for (i = 0; i < cookies_size; i++)
 	{
@@ -514,7 +516,7 @@ lo_commit(bool isCommit)
 		{
 			if (isCommit)
 				inv_close(cookies[i]);
-			cookies[i] = NULL;
+			deleteLOfd(i);
 		}
 	}
 
@@ -527,8 +529,47 @@ lo_commit(bool isCommit)
 	/* Release the LO memory context to prevent permanent memory leaks. */
 	MemoryContextDelete(fscxt);
 	fscxt = NULL;
+
+	/* Give inv_api.c a chance to clean up, too */
+	close_lo_relation(isCommit);
 }
 
+/*
+ * AtEOSubXact_LargeObject
+ * 		Take care of large objects at subtransaction commit/abort
+ *
+ * Reassign LOs whose xid is myXid (created/opened during a committing
+ * subtransaction) to parentXid.  On abort, just close them.
+ */
+void
+AtEOSubXact_LargeObject(bool isCommit, TransactionId myXid,
+						TransactionId parentXid)
+{
+	int				i;
+
+	if (fscxt == NULL)			/* no LO operations in this xact */
+		return;
+
+	for (i = 0; i < cookies_size; i++)
+	{
+		LargeObjectDesc *lo = cookies[i];
+
+		if (lo != NULL && lo->xid == myXid)	/* LO tagged with this subxact */
+		{
+			if (isCommit)
+				lo->xid = parentXid;	/* parent takes over the descriptor */
+			else
+			{
+				/*
+				 * Make sure we do not call inv_close twice if it errors out
+				 * for some reason.  Better a leak than a crash.
+				 */
+				deleteLOfd(i);
+				inv_close(lo);
+			}
+		}
+	}
+}
 
 /*****************************************************************************
  *	Support routines for this file
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 96de54110cfaab2f21d42120c63f3c09fbb6d961..918d541e2a768df45420a6a3b2b40948624ba7f6 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.110 2004/07/28 14:23:28 tgl Exp $
  *
  * NOTES:
  *
@@ -47,6 +47,7 @@
 #include <fcntl.h>
 
 #include "miscadmin.h"
+#include "access/xact.h"
 #include "storage/fd.h"
 #include "storage/ipc.h"
 
@@ -122,6 +123,7 @@ typedef struct vfd
 {
 	signed short fd;			/* current FD, or VFD_CLOSED if none */
 	unsigned short fdstate;		/* bitflags for VFD's state */
+	TransactionId create_xid;	/* for XACT_TEMPORARY fds, creating Xid */
 	File		nextFree;		/* link to next free VFD, if in freelist */
 	File		lruMoreRecently;	/* doubly linked recency-of-use list */
 	File		lruLessRecently;
@@ -146,27 +148,31 @@ static Size SizeVfdCache = 0;
 static int	nfile = 0;
 
 /*
- * List of stdio FILEs opened with AllocateFile.
+ * List of stdio FILEs and <dirent.h> DIRs opened with AllocateFile
+ * and AllocateDir.
  *
- * Since we don't want to encourage heavy use of AllocateFile, it seems
- * OK to put a pretty small maximum limit on the number of simultaneously
- * allocated files.
+ * Since we don't want to encourage heavy use of AllocateFile or AllocateDir,
+ * it seems OK to put a pretty small maximum limit on the number of
+ * simultaneously allocated descs.
  */
-#define MAX_ALLOCATED_FILES  32
+#define MAX_ALLOCATED_DESCS  32
 
-static int	numAllocatedFiles = 0;
-static FILE *allocatedFiles[MAX_ALLOCATED_FILES];
+typedef enum {
+	AllocateDescFile,			/* entry holds a FILE from AllocateFile */
+	AllocateDescDir				/* entry holds a DIR from AllocateDir */
+} AllocateDescKind;
 
-/*
- * List of <dirent.h> DIRs opened with AllocateDir.
- *
- * Since we don't have heavy use of AllocateDir, it seems OK to put a pretty
- * small maximum limit on the number of simultaneously allocated dirs.
- */
-#define MAX_ALLOCATED_DIRS  10
+typedef struct {
+	AllocateDescKind	kind;	/* selects the valid member of desc */
+	union	{
+		FILE	*file;			/* valid when kind == AllocateDescFile */
+		DIR		*dir;			/* valid when kind == AllocateDescDir */
+	} desc;
+	TransactionId create_xid;	/* allocating xact, or parent after subcommit */
+} AllocateDesc;
 
-static int	numAllocatedDirs = 0;
-static DIR *allocatedDirs[MAX_ALLOCATED_DIRS];
+static int numAllocatedDescs = 0;
+static AllocateDesc allocatedDescs[MAX_ALLOCATED_DESCS];
 
 /*
  * Number of temporary files opened during the current session;
@@ -499,7 +505,7 @@ LruInsert(File file)
 
 	if (FileIsNotOpen(file))
 	{
-		while (nfile + numAllocatedFiles + numAllocatedDirs >= max_safe_fds)
+		while (nfile + numAllocatedDescs >= max_safe_fds)
 		{
 			if (!ReleaseLruFile())
 				break;
@@ -759,7 +765,7 @@ fileNameOpenFile(FileName fileName,
 	file = AllocateVfd();
 	vfdP = &VfdCache[file];
 
-	while (nfile + numAllocatedFiles + numAllocatedDirs >= max_safe_fds)
+	while (nfile + numAllocatedDescs >= max_safe_fds)
 	{
 		if (!ReleaseLruFile())
 			break;
@@ -876,7 +882,10 @@ OpenTemporaryFile(bool interXact)
 
 	/* Mark it for deletion at EOXact */
 	if (!interXact)
+	{
 		VfdCache[file].fdstate |= FD_XACT_TEMPORARY;
+		VfdCache[file].create_xid = GetCurrentTransactionId();
+	}
 
 	return file;
 }
@@ -1134,24 +1143,29 @@ AllocateFile(char *name, char *mode)
 {
 	FILE	   *file;
 
-	DO_DB(elog(LOG, "AllocateFile: Allocated %d", numAllocatedFiles));
+	DO_DB(elog(LOG, "AllocateFile: Allocated %d (%s)",
+			   numAllocatedDescs, name));
 
 	/*
-	 * The test against MAX_ALLOCATED_FILES prevents us from overflowing
+	 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
-	 * allocatedFiles[]; the test against max_safe_fds prevents AllocateFile
+	 * allocatedDescs[]; the test against max_safe_fds prevents AllocateFile
 	 * from hogging every one of the available FDs, which'd lead to infinite
 	 * looping.
 	 */
-	if (numAllocatedFiles >= MAX_ALLOCATED_FILES ||
-		numAllocatedFiles + numAllocatedDirs >= max_safe_fds - 1)
+	if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+		numAllocatedDescs >= max_safe_fds - 1)
 		elog(ERROR, "too many private files demanded");
 
 TryAgain:
 	if ((file = fopen(name, mode)) != NULL)
 	{
-		allocatedFiles[numAllocatedFiles] = file;
-		numAllocatedFiles++;
-		return file;
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescFile;
+		desc->desc.file = file;
+		desc->create_xid = GetCurrentTransactionId();
+		numAllocatedDescs++;
+		return desc->desc.file;
 	}
 
 	if (errno == EMFILE || errno == ENFILE)
@@ -1170,6 +1184,38 @@ TryAgain:
 	return NULL;
 }
 
+/*
+ * Free an AllocateDesc of either type, returning fclose/closedir's result.
+ *
+ * The argument *must* point into the allocatedDescs[] array.
+ */
+static int
+FreeDesc(AllocateDesc *desc)
+{
+	int		result;
+
+	/* Close the underlying object */
+	switch (desc->kind)
+	{
+		case AllocateDescFile:
+			result = fclose(desc->desc.file);
+			break;
+		case AllocateDescDir:
+			result = closedir(desc->desc.dir);
+			break;
+		default:
+			elog(ERROR, "AllocateDesc kind not recognized");
+			result = 0;			/* keep compiler quiet */
+			break;
+	}
+
+	/* Compact storage: overwrite the freed slot with the last entry */
+	numAllocatedDescs--;
+	*desc = allocatedDescs[numAllocatedDescs];	/* order is not preserved */
+
+	return result;
+}
+
 /*
  * Close a file returned by AllocateFile.
  *
@@ -1181,20 +1227,19 @@ FreeFile(FILE *file)
 {
 	int			i;
 
-	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedFiles));
+	DO_DB(elog(LOG, "FreeFile: Allocated %d", numAllocatedDescs));
 
 	/* Remove file from list of allocated files, if it's present */
-	for (i = numAllocatedFiles; --i >= 0;)
+	for (i = numAllocatedDescs; --i >= 0;)
 	{
-		if (allocatedFiles[i] == file)
-		{
-			numAllocatedFiles--;
-			allocatedFiles[i] = allocatedFiles[numAllocatedFiles];
-			break;
-		}
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescFile && desc->desc.file == file)
+			return FreeDesc(desc);
 	}
-	if (i < 0)
-		elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
+
+	/* Only get here if someone passes us a file not in allocatedDescs */
+	elog(WARNING, "file passed to FreeFile was not obtained from AllocateFile");
 
 	return fclose(file);
 }
@@ -1213,24 +1258,29 @@ AllocateDir(const char *dirname)
 {
 	DIR	   *dir;
 
-	DO_DB(elog(LOG, "AllocateDir: Allocated %d", numAllocatedDirs));
+	DO_DB(elog(LOG, "AllocateDir: Allocated %d (%s)",
+			   numAllocatedDescs, dirname));
 
 	/*
-	 * The test against MAX_ALLOCATED_DIRS prevents us from overflowing
-	 * allocatedDirs[]; the test against max_safe_fds prevents AllocateDir
+	 * The test against MAX_ALLOCATED_DESCS prevents us from overflowing
+	 * allocatedDescs[]; the test against max_safe_fds prevents AllocateDir
 	 * from hogging every one of the available FDs, which'd lead to infinite
 	 * looping.
 	 */
-	if (numAllocatedDirs >= MAX_ALLOCATED_DIRS ||
-		numAllocatedDirs + numAllocatedFiles >= max_safe_fds - 1)
+	if (numAllocatedDescs >= MAX_ALLOCATED_DESCS ||
+		numAllocatedDescs >= max_safe_fds - 1)
 		elog(ERROR, "too many private dirs demanded");
 
 TryAgain:
 	if ((dir = opendir(dirname)) != NULL)
 	{
-		allocatedDirs[numAllocatedDirs] = dir;
-		numAllocatedDirs++;
-		return dir;
+		AllocateDesc *desc = &allocatedDescs[numAllocatedDescs];
+
+		desc->kind = AllocateDescDir;
+		desc->desc.dir = dir;
+		desc->create_xid = GetCurrentTransactionId();
+		numAllocatedDescs++;
+		return desc->desc.dir;
 	}
 
 	if (errno == EMFILE || errno == ENFILE)
@@ -1260,20 +1310,19 @@ FreeDir(DIR *dir)
 {
 	int			i;
 
-	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDirs));
+	DO_DB(elog(LOG, "FreeDir: Allocated %d", numAllocatedDescs));
 
 	/* Remove dir from list of allocated dirs, if it's present */
-	for (i = numAllocatedDirs; --i >= 0;)
+	for (i = numAllocatedDescs; --i >= 0;)
 	{
-		if (allocatedDirs[i] == dir)
-		{
-			numAllocatedDirs--;
-			allocatedDirs[i] = allocatedDirs[numAllocatedDirs];
-			break;
-		}
+		AllocateDesc *desc = &allocatedDescs[i];
+
+		if (desc->kind == AllocateDescDir && desc->desc.dir == dir)
+			return FreeDesc(desc);
 	}
-	if (i < 0)
-		elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
+
+	/* Only get here if someone passes us a dir not in allocatedDescs */
+	elog(WARNING, "dir passed to FreeDir was not obtained from AllocateDir");
 
 	return closedir(dir);
 }
@@ -1302,6 +1351,51 @@ closeAllVfds(void)
 	}
 }
 
+/*
+ * AtEOSubXact_Files
+ *
+ * Take care of subtransaction commit/abort.  At abort, we close the temp
+ * files and AllocateFile/AllocateDir descs the subtransaction opened.  At
+ * commit, we reassign them to the parent transaction.
+ */
+void
+AtEOSubXact_Files(bool isCommit, TransactionId myXid, TransactionId parentXid)
+{
+	Index i;	/* NOTE(review): presumably unsigned; i-- at 0 below must wrap */
+
+	if (SizeVfdCache > 0)
+	{
+		Assert(FileIsNotOpen(0));		/* Make sure ring not corrupted */
+		for (i = 1; i < SizeVfdCache; i++)
+		{
+			unsigned short fdstate = VfdCache[i].fdstate;
+
+			if ((fdstate & FD_XACT_TEMPORARY) &&
+				VfdCache[i].create_xid == myXid)
+			{
+				if (isCommit)
+					VfdCache[i].create_xid = parentXid;	/* parent owns it */
+				else if (VfdCache[i].fileName != NULL)
+					FileClose(i);	/* abort: close this subxact's temp file */
+			}
+		}
+	}
+
+	for (i = 0; i < numAllocatedDescs; i++)
+	{
+		if (allocatedDescs[i].create_xid == myXid)
+		{
+			if (isCommit)
+				allocatedDescs[i].create_xid = parentXid;
+			else
+			{
+				/* FreeDesc moves the last entry into slot i; recheck it */
+				FreeDesc(&allocatedDescs[i--]);
+			}
+		}
+	}
+}
+
 /*
  * AtEOXact_Files
  *
@@ -1362,11 +1456,8 @@ CleanupTempFiles(bool isProcExit)
 		}
 	}
 
-	while (numAllocatedFiles > 0)
-		FreeFile(allocatedFiles[0]);
-
-	while (numAllocatedDirs > 0)
-		FreeDir(allocatedDirs[0]);
+	while (numAllocatedDescs > 0)
+		FreeDesc(&allocatedDescs[0]);
 }
 
 
diff --git a/src/backend/storage/large_object/inv_api.c b/src/backend/storage/large_object/inv_api.c
index 5f75e06e189af397891e38a6db89ccd9d1aa2bc3..470dcf11aa948905d120c48e3e56e2fb6354c351 100644
--- a/src/backend/storage/large_object/inv_api.c
+++ b/src/backend/storage/large_object/inv_api.c
@@ -9,36 +9,92 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/large_object/inv_api.c,v 1.102 2003/11/29 19:51:56 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/large_object/inv_api.c,v 1.103 2004/07/28 14:23:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <errno.h>
-#include <sys/file.h>
-#include <sys/stat.h>
-
 #include "access/genam.h"
 #include "access/heapam.h"
-#include "access/htup.h"
 #include "access/tuptoaster.h"
 #include "catalog/catalog.h"
 #include "catalog/catname.h"
-#include "catalog/heap.h"
-#include "catalog/index.h"
 #include "catalog/indexing.h"
-#include "catalog/pg_opclass.h"
 #include "catalog/pg_largeobject.h"
-#include "catalog/pg_type.h"
 #include "commands/comment.h"
 #include "libpq/libpq-fs.h"
-#include "miscadmin.h"
 #include "storage/large_object.h"
-#include "storage/smgr.h"
-#include "utils/builtins.h"
 #include "utils/fmgroids.h"
 #include "utils/lsyscache.h"
+#include "utils/resowner.h"
+
+
+/*
+ * All accesses to pg_largeobject and its index make use of a single Relation
+ * reference each, so that we only need to open them once per transaction.
+ * To avoid problems when the first such reference occurs inside a
+ * subtransaction, we execute a slightly klugy maneuver to assign ownership of
+ * the Relation references to TopTransactionResourceOwner.
+ */
+static Relation lo_heap_r = NULL;
+static Relation lo_index_r = NULL;
+
+
+/*
+ * Open pg_largeobject and its index, if not already done in current xact
+ */
+static void
+open_lo_relation(void)
+{
+	ResourceOwner currentOwner;
+
+	if (lo_heap_r && lo_index_r)
+		return;					/* already open in current xact */
+
+	/* Arrange for the top xact to own these relation references */
+	currentOwner = CurrentResourceOwner;
+	CurrentResourceOwner = TopTransactionResourceOwner;
+
+	/* Use RowExclusiveLock since we might either read or write */
+	if (lo_heap_r == NULL)
+		lo_heap_r = heap_openr(LargeObjectRelationName, RowExclusiveLock);
+	if (lo_index_r == NULL)
+		lo_index_r = index_openr(LargeObjectLOidPNIndex);
+
+	CurrentResourceOwner = currentOwner;
+}
+
+/*
+ * Clean up at main transaction end
+ */
+void
+close_lo_relation(bool isCommit)
+{
+	if (lo_heap_r || lo_index_r)
+	{
+		/*
+		 * Only bother to close if committing; else abort cleanup will
+		 * handle it
+		 */
+		if (isCommit)
+		{
+			ResourceOwner currentOwner;
+
+			currentOwner = CurrentResourceOwner;
+			CurrentResourceOwner = TopTransactionResourceOwner;
+
+			if (lo_index_r)
+				index_close(lo_index_r);
+			if (lo_heap_r)
+				heap_close(lo_heap_r, NoLock);
+
+			CurrentResourceOwner = currentOwner;
+		}
+		lo_heap_r = NULL;
+		lo_index_r = NULL;
+	}
+}
 
 
 static int32
@@ -50,6 +106,7 @@ getbytealen(bytea *data)
 	return (VARSIZE(data) - VARHDRSZ);
 }
 
+
 /*
  *	inv_create -- create a new large object.
  *
@@ -92,23 +149,20 @@ inv_create(int flags)
 	retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc));
 
 	retval->id = file_oid;
+	retval->xid = GetCurrentTransactionId();
 	retval->offset = 0;
 
 	if (flags & INV_WRITE)
 	{
 		retval->flags = IFS_WRLOCK | IFS_RDLOCK;
-		retval->heap_r = heap_openr(LargeObjectRelationName, RowExclusiveLock);
 	}
 	else if (flags & INV_READ)
 	{
 		retval->flags = IFS_RDLOCK;
-		retval->heap_r = heap_openr(LargeObjectRelationName, AccessShareLock);
 	}
 	else
 		elog(ERROR, "invalid flags: %d", flags);
 
-	retval->index_r = index_openr(LargeObjectLOidPNIndex);
-
 	return retval;
 }
 
@@ -131,23 +185,20 @@ inv_open(Oid lobjId, int flags)
 	retval = (LargeObjectDesc *) palloc(sizeof(LargeObjectDesc));
 
 	retval->id = lobjId;
+	retval->xid = GetCurrentTransactionId();
 	retval->offset = 0;
 
 	if (flags & INV_WRITE)
 	{
 		retval->flags = IFS_WRLOCK | IFS_RDLOCK;
-		retval->heap_r = heap_openr(LargeObjectRelationName, RowExclusiveLock);
 	}
 	else if (flags & INV_READ)
 	{
 		retval->flags = IFS_RDLOCK;
-		retval->heap_r = heap_openr(LargeObjectRelationName, AccessShareLock);
 	}
 	else
 		elog(ERROR, "invalid flags: %d", flags);
 
-	retval->index_r = index_openr(LargeObjectLOidPNIndex);
-
 	return retval;
 }
 
@@ -158,13 +209,6 @@ void
 inv_close(LargeObjectDesc *obj_desc)
 {
 	Assert(PointerIsValid(obj_desc));
-
-	if (obj_desc->flags & IFS_WRLOCK)
-		heap_close(obj_desc->heap_r, RowExclusiveLock);
-	else if (obj_desc->flags & IFS_RDLOCK)
-		heap_close(obj_desc->heap_r, AccessShareLock);
-	index_close(obj_desc->index_r);
-
 	pfree(obj_desc);
 }
 
@@ -212,12 +256,14 @@ inv_getsize(LargeObjectDesc *obj_desc)
 
 	Assert(PointerIsValid(obj_desc));
 
+	open_lo_relation();
+
 	ScanKeyInit(&skey[0],
 				Anum_pg_largeobject_loid,
 				BTEqualStrategyNumber, F_OIDEQ,
 				ObjectIdGetDatum(obj_desc->id));
 
-	sd = index_beginscan(obj_desc->heap_r, obj_desc->index_r,
+	sd = index_beginscan(lo_heap_r, lo_index_r,
 						 SnapshotNow, 1, skey);
 
 	/*
@@ -316,6 +362,8 @@ inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 	if (nbytes <= 0)
 		return 0;
 
+	open_lo_relation();
+
 	ScanKeyInit(&skey[0],
 				Anum_pg_largeobject_loid,
 				BTEqualStrategyNumber, F_OIDEQ,
@@ -326,7 +374,7 @@ inv_read(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 				BTGreaterEqualStrategyNumber, F_INT4GE,
 				Int32GetDatum(pageno));
 
-	sd = index_beginscan(obj_desc->heap_r, obj_desc->index_r,
+	sd = index_beginscan(lo_heap_r, lo_index_r,
 						 SnapshotNow, 2, skey);
 
 	while ((tuple = index_getnext(sd, ForwardScanDirection)) != NULL)
@@ -421,7 +469,9 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 	if (nbytes <= 0)
 		return 0;
 
-	indstate = CatalogOpenIndexes(obj_desc->heap_r);
+	open_lo_relation();
+
+	indstate = CatalogOpenIndexes(lo_heap_r);
 
 	ScanKeyInit(&skey[0],
 				Anum_pg_largeobject_loid,
@@ -433,7 +483,7 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 				BTGreaterEqualStrategyNumber, F_INT4GE,
 				Int32GetDatum(pageno));
 
-	sd = index_beginscan(obj_desc->heap_r, obj_desc->index_r,
+	sd = index_beginscan(lo_heap_r, lo_index_r,
 						 SnapshotNow, 2, skey);
 
 	oldtuple = NULL;
@@ -510,9 +560,9 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 			memset(replace, ' ', sizeof(replace));
 			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
 			replace[Anum_pg_largeobject_data - 1] = 'r';
-			newtup = heap_modifytuple(oldtuple, obj_desc->heap_r,
+			newtup = heap_modifytuple(oldtuple, lo_heap_r,
 									  values, nulls, replace);
-			simple_heap_update(obj_desc->heap_r, &newtup->t_self, newtup);
+			simple_heap_update(lo_heap_r, &newtup->t_self, newtup);
 			CatalogIndexInsert(indstate, newtup);
 			heap_freetuple(newtup);
 
@@ -554,8 +604,8 @@ inv_write(LargeObjectDesc *obj_desc, char *buf, int nbytes)
 			values[Anum_pg_largeobject_loid - 1] = ObjectIdGetDatum(obj_desc->id);
 			values[Anum_pg_largeobject_pageno - 1] = Int32GetDatum(pageno);
 			values[Anum_pg_largeobject_data - 1] = PointerGetDatum(&workbuf);
-			newtup = heap_formtuple(obj_desc->heap_r->rd_att, values, nulls);
-			simple_heap_insert(obj_desc->heap_r, newtup);
+			newtup = heap_formtuple(lo_heap_r->rd_att, values, nulls);
+			simple_heap_insert(lo_heap_r, newtup);
 			CatalogIndexInsert(indstate, newtup);
 			heap_freetuple(newtup);
 		}
diff --git a/src/backend/storage/lmgr/lmgr.c b/src/backend/storage/lmgr/lmgr.c
index 176767507c2d2c8c8a15c7d64564325a91b9dd46..11d73c583001b68aaace31a2fdc806aceafa5c4a 100644
--- a/src/backend/storage/lmgr/lmgr.c
+++ b/src/backend/storage/lmgr/lmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.65 2004/07/27 05:10:58 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lmgr.c,v 1.66 2004/07/28 14:23:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -137,7 +137,7 @@ LockRelation(Relation relation, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = InvalidBlockNumber;
 
-	if (!LockAcquire(LockTableId, &tag, GetCurrentTransactionId(),
+	if (!LockAcquire(LockTableId, &tag, GetTopTransactionId(),
 					 lockmode, false))
 		elog(ERROR, "LockAcquire failed");
 
@@ -171,7 +171,7 @@ ConditionalLockRelation(Relation relation, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = InvalidBlockNumber;
 
-	if (!LockAcquire(LockTableId, &tag, GetCurrentTransactionId(),
+	if (!LockAcquire(LockTableId, &tag, GetTopTransactionId(),
 					 lockmode, true))
 		return false;
 
@@ -201,7 +201,7 @@ UnlockRelation(Relation relation, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = InvalidBlockNumber;
 
-	LockRelease(LockTableId, &tag, GetCurrentTransactionId(), lockmode);
+	LockRelease(LockTableId, &tag, GetTopTransactionId(), lockmode);
 }
 
 /*
@@ -264,7 +264,7 @@ LockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = blkno;
 
-	if (!LockAcquire(LockTableId, &tag, GetCurrentTransactionId(),
+	if (!LockAcquire(LockTableId, &tag, GetTopTransactionId(),
 					 lockmode, false))
 		elog(ERROR, "LockAcquire failed");
 }
@@ -285,7 +285,7 @@ ConditionalLockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = blkno;
 
-	return LockAcquire(LockTableId, &tag, GetCurrentTransactionId(),
+	return LockAcquire(LockTableId, &tag, GetTopTransactionId(),
 					   lockmode, true);
 }
 
@@ -302,7 +302,7 @@ UnlockPage(Relation relation, BlockNumber blkno, LOCKMODE lockmode)
 	tag.dbId = relation->rd_lockInfo.lockRelId.dbId;
 	tag.objId.blkno = blkno;
 
-	LockRelease(LockTableId, &tag, GetCurrentTransactionId(), lockmode);
+	LockRelease(LockTableId, &tag, GetTopTransactionId(), lockmode);
 }
 
 /*
@@ -343,7 +343,7 @@ void
 XactLockTableWait(TransactionId xid)
 {
 	LOCKTAG		tag;
-	TransactionId myxid = GetCurrentTransactionId();
+	TransactionId myxid = GetTopTransactionId();
 
 	Assert(!SubTransXidsHaveCommonAncestor(xid, myxid));
 
diff --git a/src/backend/utils/cache/inval.c b/src/backend/utils/cache/inval.c
index e54a74fae4bea1028a90e94383b1868dcf838482..946bd0c9eb79ac40770d957c02e46f08f348bdb6 100644
--- a/src/backend/utils/cache/inval.c
+++ b/src/backend/utils/cache/inval.c
@@ -80,7 +80,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.63 2004/07/01 00:51:17 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/cache/inval.c,v 1.64 2004/07/28 14:23:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -678,7 +678,7 @@ AtEOXact_Inval(bool isCommit)
 }
 
 /*
- * AtSubEOXact_Inval
+ * AtEOSubXact_Inval
  *		Process queued-up invalidation messages at end of subtransaction.
  *
  * If isCommit, process CurrentCmdInvalidMsgs if any (there probably aren't),
@@ -695,7 +695,7 @@ AtEOXact_Inval(bool isCommit)
  * (if aborting).
  */
 void
-AtSubEOXact_Inval(bool isCommit)
+AtEOSubXact_Inval(bool isCommit)
 {
 	TransInvalidationInfo *myInfo = transInvalInfo;
 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index b9865462a4ec2636b69dd434b93ec3d1bc17d499..22df3effc329e1812c7d96603d39d8c0b485e0e2 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.224 2004/07/24 19:51:23 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.225 2004/07/28 14:23:29 tgl Exp $
  *
  *--------------------------------------------------------------------
  */
@@ -5436,10 +5436,15 @@ assign_log_stats(bool newval, bool doit, GucSource source)
 static bool
 assign_transaction_read_only(bool newval, bool doit, GucSource source)
 {
-	if (doit && source >= PGC_S_INTERACTIVE && IsSubTransaction())
-		ereport(ERROR,
-				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("cannot set transaction read only mode inside a subtransaction")));
+	/* Can't go to r/w mode while inside a subxact of a r/o transaction */
+	if (newval == false && XactReadOnly && IsSubTransaction())
+	{
+		if (source >= PGC_S_INTERACTIVE)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("cannot set transaction read-write mode inside a read-only transaction")));
+		return false;
+	}
 	return true;
 }
 
diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c
index 446ee4b72c54e7e37cdfcfc0d8de459ca0dfea2b..d1a7179484e3fd57f36d6acd60863e48f4ec29ce 100644
--- a/src/backend/utils/time/tqual.c
+++ b/src/backend/utils/time/tqual.c
@@ -16,7 +16,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.73 2004/07/01 00:51:33 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.74 2004/07/28 14:23:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -118,7 +118,10 @@ HeapTupleSatisfiesItself(HeapTupleHeader tuple)
 
 			/* deleting subtransaction aborted */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+			{
+				tuple->t_infomask |= HEAP_XMAX_INVALID;
 				return true;
+			}
 
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
@@ -268,7 +271,10 @@ HeapTupleSatisfiesNow(HeapTupleHeader tuple)
 
 			/* deleting subtransaction aborted */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+			{
+				tuple->t_infomask |= HEAP_XMAX_INVALID;
 				return true;
+			}
 
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
@@ -452,7 +458,10 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid)
 
 			/* deleting subtransaction aborted */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+			{
+				tuple->t_infomask |= HEAP_XMAX_INVALID;
 				return HeapTupleMayBeUpdated;
+			}
 
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
@@ -590,7 +599,10 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple)
 
 			/* deleting subtransaction aborted */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+			{
+				tuple->t_infomask |= HEAP_XMAX_INVALID;
 				return true;
+			}
 
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
@@ -732,7 +744,10 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
 			/* deleting subtransaction aborted */
 			/* FIXME -- is this correct w.r.t. the cmax of the tuple? */
 			if (TransactionIdDidAbort(HeapTupleHeaderGetXmax(tuple)))
+			{
+				tuple->t_infomask |= HEAP_XMAX_INVALID;
 				return true;
+			}
 
 			Assert(TransactionIdIsCurrentTransactionId(HeapTupleHeaderGetXmax(tuple)));
 
@@ -757,21 +772,36 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
 	/*
 	 * By here, the inserting transaction has committed - have to check
 	 * when...
+	 *
+	 * Note that the provided snapshot contains only top-level XIDs, so
+	 * we have to convert a subxact XID to its topmost parent for comparison.
+	 * However, we can make first-pass range checks with the given XID,
+	 * because a subxact with XID < xmin has surely also got a parent with
+	 * XID < xmin, while one with XID >= xmax must belong to a parent that
+	 * was not yet committed at the time of this snapshot.
 	 */
 	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmin(tuple),
 									 snapshot->xmin))
 	{
-		uint32		i;
+		TransactionId parentXid;
 
 		if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmin(tuple),
 										 snapshot->xmax))
 			return false;
 
-		for (i = 0; i < snapshot->xcnt; i++)
+		parentXid = SubTransGetTopmostTransaction(HeapTupleHeaderGetXmin(tuple));
+
+		if (TransactionIdFollowsOrEquals(parentXid, snapshot->xmin))
 		{
-			if (SubTransXidsHaveCommonAncestor(HeapTupleHeaderGetXmin(tuple),
-									snapshot->xip[i]))
-				return false;
+			uint32		i;
+
+			/* no point in checking parentXid against xmax here */
+
+			for (i = 0; i < snapshot->xcnt; i++)
+			{
+				if (TransactionIdEquals(parentXid, snapshot->xip[i]))
+					return false;
+			}
 		}
 	}
 
@@ -804,18 +834,31 @@ HeapTupleSatisfiesSnapshot(HeapTupleHeader tuple, Snapshot snapshot)
 
 	/*
 	 * OK, the deleting transaction committed too ... but when?
+	 *
+	 * See notes for the similar tests on tuple xmin, above.
 	 */
-	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple), snapshot->xmin))
+	if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple),
+									 snapshot->xmin))
 	{
-		uint32		i;
+		TransactionId parentXid;
 
 		if (TransactionIdFollowsOrEquals(HeapTupleHeaderGetXmax(tuple),
 										 snapshot->xmax))
 			return true;
-		for (i = 0; i < snapshot->xcnt; i++)
+
+		parentXid = SubTransGetTopmostTransaction(HeapTupleHeaderGetXmax(tuple));
+
+		if (TransactionIdFollowsOrEquals(parentXid, snapshot->xmin))
 		{
-			if (SubTransXidsHaveCommonAncestor(HeapTupleHeaderGetXmax(tuple), snapshot->xip[i]))
-				return true;
+			uint32		i;
+
+			/* no point in checking parentXid against xmax here */
+
+			for (i = 0; i < snapshot->xcnt; i++)
+			{
+				if (TransactionIdEquals(parentXid, snapshot->xip[i]))
+					return true;
+			}
 		}
 	}
 
diff --git a/src/bin/psql/tab-complete.c b/src/bin/psql/tab-complete.c
index 0dfaebe38b01cc37e2629106dffbc2a32a08fe40..130fcd33f4b3aab376e46b64dec10ad33146fe36 100644
--- a/src/bin/psql/tab-complete.c
+++ b/src/bin/psql/tab-complete.c
@@ -3,7 +3,7 @@
  *
  * Copyright (c) 2000-2003, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/bin/psql/tab-complete.c,v 1.108 2004/07/27 05:11:11 tgl Exp $
+ * $PostgreSQL: pgsql/src/bin/psql/tab-complete.c,v 1.109 2004/07/28 14:23:30 tgl Exp $
  */
 
 /*----------------------------------------------------------------------
@@ -722,7 +722,7 @@ psql_completion(char *text, int start, int end)
 	else if (pg_strcasecmp(prev2_wd, "ANALYZE") == 0)
 		COMPLETE_WITH_CONST(";");
 
-/* BEGIN, COMMIT, ABORT */
+/* BEGIN, END, COMMIT, ABORT */
 	else if (pg_strcasecmp(prev_wd, "BEGIN") == 0 ||
 	         pg_strcasecmp(prev_wd, "END") == 0 ||
 	         pg_strcasecmp(prev_wd, "COMMIT") == 0 ||
diff --git a/src/include/catalog/namespace.h b/src/include/catalog/namespace.h
index 7a0cfd70994ecade27356d1bc1732aebabb1507b..986a26b96f9a64ae59fdb9f27bbec24f3ae9f0dc 100644
--- a/src/include/catalog/namespace.h
+++ b/src/include/catalog/namespace.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/namespace.h,v 1.30 2004/01/19 19:04:40 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/namespace.h,v 1.31 2004/07/28 14:23:30 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -91,6 +91,8 @@ extern Oid	FindDefaultConversionProc(int4 for_encoding, int4 to_encoding);
 /* initialization & transaction cleanup code */
 extern void InitializeSearchPath(void);
 extern void AtEOXact_Namespace(bool isCommit);
+extern void AtEOSubXact_Namespace(bool isCommit, TransactionId myXid,
+								  TransactionId parentXid);
 
 /* stuff for search_path GUC variable */
 extern char *namespace_search_path;
diff --git a/src/include/commands/user.h b/src/include/commands/user.h
index 8dba146aa5bcb76a4beb85b12be548ea5e5606b1..7a723192779a2e971076d9cad90b675a39579083 100644
--- a/src/include/commands/user.h
+++ b/src/include/commands/user.h
@@ -4,7 +4,7 @@
  *	  Commands for manipulating users and groups.
  *
  *
- * $PostgreSQL: pgsql/src/include/commands/user.h,v 1.22 2003/11/29 22:40:59 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/commands/user.h,v 1.23 2004/07/28 14:23:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -32,5 +32,7 @@ extern void RenameGroup(const char *oldname, const char *newname);
 extern Datum update_pg_pwd_and_pg_group(PG_FUNCTION_ARGS);
 
 extern void AtEOXact_UpdatePasswordFile(bool isCommit);
+extern void AtEOSubXact_UpdatePasswordFile(bool isCommit, TransactionId myXid,
+										   TransactionId parentXid);
 
 #endif   /* USER_H */
diff --git a/src/include/libpq/be-fsstubs.h b/src/include/libpq/be-fsstubs.h
index 9c45876be5a4488c0633980198041e366e244bf5..b2d8b3d340445701e0bbdd13d50e28a452182c30 100644
--- a/src/include/libpq/be-fsstubs.h
+++ b/src/include/libpq/be-fsstubs.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/libpq/be-fsstubs.h,v 1.18 2003/11/29 22:41:03 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/libpq/be-fsstubs.h,v 1.19 2004/07/28 14:23:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -43,8 +43,10 @@ extern int	lo_read(int fd, char *buf, int len);
 extern int	lo_write(int fd, char *buf, int len);
 
 /*
- * Cleanup LOs at xact commit/abort [ Pascal André <andre@via.ecp.fr> ]
+ * Cleanup LOs at xact commit/abort
  */
-extern void lo_commit(bool isCommit);
+extern void AtEOXact_LargeObject(bool isCommit);
+extern void AtEOSubXact_LargeObject(bool isCommit, TransactionId myXid,
+									TransactionId parentXid);
 
 #endif   /* BE_FSSTUBS_H */
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 430ed5d8c74181f11b10a41428453f123159f06b..c62f6fff76f1661e86f689eacc31077ad2a2a663 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.46 2004/07/28 14:23:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -85,6 +85,8 @@ extern int	BasicOpenFile(FileName fileName, int fileFlags, int fileMode);
 extern void set_max_safe_fds(void);
 extern void closeAllVfds(void);
 extern void AtEOXact_Files(void);
+extern void AtEOSubXact_Files(bool isCommit, TransactionId myXid,
+							  TransactionId parentXid);
 extern void RemovePgTempFiles(void);
 extern int	pg_fsync(int fd);
 extern int	pg_fdatasync(int fd);
diff --git a/src/include/storage/large_object.h b/src/include/storage/large_object.h
index 23118aaf5acbd6db3b8d8345f0c9b8287193e79d..164d3abb5dca1ccadf9b264c48ceaecac43bd36e 100644
--- a/src/include/storage/large_object.h
+++ b/src/include/storage/large_object.h
@@ -1,47 +1,44 @@
 /*-------------------------------------------------------------------------
  *
  * large_object.h
- *	  file of info for Postgres large objects. POSTGRES 4.2 supports
+ *	  Declarations for PostgreSQL large objects.  POSTGRES 4.2 supported
  *	  zillions of large objects (internal, external, jaquith, inversion).
  *	  Now we only support inversion.
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/large_object.h,v 1.27 2003/11/29 22:41:13 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/storage/large_object.h,v 1.28 2004/07/28 14:23:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef LARGE_OBJECT_H
 #define LARGE_OBJECT_H
 
-#include "utils/rel.h"
-
 
 /*----------
  * Data about a currently-open large object.
  *
  * id is the logical OID of the large object
+ * xid is the transaction Id that opened the LO (or currently owns it)
  * offset is the current seek offset within the LO
- * heap_r holds an open-relation reference to pg_largeobject
- * index_r holds an open-relation reference to pg_largeobject_loid_pn_index
+ * flags contains some flag bits
  *
- * NOTE: before 7.1, heap_r and index_r held references to the separate
- * table and index of a specific large object.	Now they all live in one rel.
+ * NOTE: before 7.1, we also had to store references to the separate table
+ * and index of a specific large object.  Now they all live in pg_largeobject
+ * and are accessed via a common relation descriptor.
  *----------
  */
 typedef struct LargeObjectDesc
 {
-	Oid			id;
+	Oid			id;				/* LO's identifier */
+	TransactionId xid;			/* owning XID */
 	uint32		offset;			/* current seek pointer */
 	int			flags;			/* locking info, etc */
 
 /* flag bits: */
 #define IFS_RDLOCK		(1 << 0)
 #define IFS_WRLOCK		(1 << 1)
-
-	Relation	heap_r;
-	Relation	index_r;
 } LargeObjectDesc;
 
 
@@ -67,6 +64,7 @@ typedef struct LargeObjectDesc
  */
 
 /* inversion stuff in inv_api.c */
+extern void close_lo_relation(bool isCommit);
 extern LargeObjectDesc *inv_create(int flags);
 extern LargeObjectDesc *inv_open(Oid lobjId, int flags);
 extern void inv_close(LargeObjectDesc *obj_desc);
diff --git a/src/include/utils/inval.h b/src/include/utils/inval.h
index add5ca83c713842f10c2720d9c5170af772d8c6d..cf12122ea5cfc1af3c64026d9d33c558608b9228 100644
--- a/src/include/utils/inval.h
+++ b/src/include/utils/inval.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.32 2004/07/01 00:51:44 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/inval.h,v 1.33 2004/07/28 14:23:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -28,7 +28,7 @@ extern void AtSubStart_Inval(void);
 
 extern void AtEOXact_Inval(bool isCommit);
 
-extern void AtSubEOXact_Inval(bool isCommit);
+extern void AtEOSubXact_Inval(bool isCommit);
 
 extern void CommandEndInvalidationMessages(void);