diff --git a/src/backend/access/gist/gist.c b/src/backend/access/gist/gist.c
index 0535fd6278db7d52c4ab2b57e0a6248f7caef76f..bc3338b11564450c53bd0dcd85310f9e52caa9c1 100644
--- a/src/backend/access/gist/gist.c
+++ b/src/backend/access/gist/gist.c
@@ -6,7 +6,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.45 1999/09/18 19:05:46 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/gist/gist.c,v 1.46 1999/09/24 00:23:42 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -141,7 +141,7 @@ gistbuild(Relation heap,
 		tupleTable = ExecCreateTupleTable(1);
 		slot = ExecAllocTableSlot(tupleTable);
 		econtext = makeNode(ExprContext);
-		FillDummyExprContext(econtext, slot, hd, buffer);
+		FillDummyExprContext(econtext, slot, hd, InvalidBuffer);
 	}
 	else
 /* shut the compiler up */
diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c
index 6e729008e85b70abea3200becb47eb6672458076..ca7c14d30df7194ec71a24f5992250aaff1d8ce0 100644
--- a/src/backend/access/hash/hash.c
+++ b/src/backend/access/hash/hash.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.31 1999/09/18 19:05:52 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/hash/hash.c,v 1.32 1999/09/24 00:23:48 tgl Exp $
  *
  * NOTES
  *	  This file contains only the public interface routines.
@@ -58,7 +58,6 @@ hashbuild(Relation heap,
 				nitups;
 	int			i;
 	HashItem	hitem;
-	Buffer		buffer = InvalidBuffer;
 
 #ifndef OMIT_PARTIAL_INDEX
 	ExprContext *econtext;
@@ -101,7 +100,7 @@ hashbuild(Relation heap,
 		tupleTable = ExecCreateTupleTable(1);
 		slot = ExecAllocTableSlot(tupleTable);
 		econtext = makeNode(ExprContext);
-		FillDummyExprContext(econtext, slot, htupdesc, buffer);
+		FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer);
 	}
 	else
 /* quiet the compiler */
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 092a23cc0ec8dc48cffe013af73a0c840d7c5d3d..4c0549219494047e14446775275bc6e1fb93a899 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.54 1999/09/18 19:05:58 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/heap/heapam.c,v 1.55 1999/09/24 00:23:54 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -160,7 +160,7 @@ unpinscan(HeapScanDesc scan)
 		ReleaseBuffer(scan->rs_pbuf);
 
 	/* ------------------------------------
-	 *	Scan will pin buffer one for each non-NULL tuple pointer
+	 *	Scan will pin buffer once for each non-NULL tuple pointer
 	 *	(ptup, ctup, ntup), so they have to be unpinned multiple
 	 *	times.
 	 * ------------------------------------
@@ -170,6 +170,10 @@ unpinscan(HeapScanDesc scan)
 
 	if (BufferIsValid(scan->rs_nbuf))
 		ReleaseBuffer(scan->rs_nbuf);
+
+	/* we don't bother to clear rs_pbuf etc --- caller must
+	 * reinitialize them if scan descriptor is not being deleted.
+	 */
 }
 
 /* ------------------------------------------
@@ -826,6 +830,8 @@ heap_getnext(HeapScanDesc scandesc, int backw)
 		{
 			if (BufferIsValid(scan->rs_nbuf))
 				ReleaseBuffer(scan->rs_nbuf);
+			scan->rs_ntup.t_data = NULL;
+			scan->rs_nbuf = UnknownBuffer;
 			return NULL;
 		}
 
@@ -906,6 +912,8 @@ heap_getnext(HeapScanDesc scandesc, int backw)
 		{
 			if (BufferIsValid(scan->rs_pbuf))
 				ReleaseBuffer(scan->rs_pbuf);
+			scan->rs_ptup.t_data = NULL;
+			scan->rs_pbuf = UnknownBuffer;
 			HEAPDEBUG_3;		/* heap_getnext returns NULL at end */
 			return NULL;
 		}
@@ -1014,8 +1022,6 @@ heap_fetch(Relation relation,
 	ItemPointer tid = &(tuple->t_self);
 	OffsetNumber offnum;
 
-	AssertMacro(PointerIsValid(userbuf));		/* see comments above */
-
 	/* ----------------
 	 *	increment access statistics
 	 * ----------------
@@ -1067,21 +1073,17 @@ heap_fetch(Relation relation,
 
 	if (tuple->t_data == NULL)
 	{
+		/* Tuple failed time check, so we can release now. */
 		ReleaseBuffer(buffer);
-		return;
+		*userbuf = InvalidBuffer;
+	}
+	else
+	{
+		/* All checks passed, so return the tuple as valid.
+		 * Caller is now responsible for releasing the buffer.
+		 */
+		*userbuf = buffer;
 	}
-
-	/* ----------------
-	 *	all checks passed, now either return a copy of the tuple
-	 *	or pin the buffer page and return a pointer, depending on
-	 *	whether caller gave us a valid buf.
-	 * ----------------
-	 */
-
-	*userbuf = buffer;			/* user is required to ReleaseBuffer()
-								 * this */
-
-	return;
 }
 
 /* ----------------
diff --git a/src/backend/access/rtree/rtree.c b/src/backend/access/rtree/rtree.c
index 133bbdbc032c9a965d653cfbdd6c1279b65556a7..ee36b418893d10a0e3c5f1ea4976ca580d10e856 100644
--- a/src/backend/access/rtree/rtree.c
+++ b/src/backend/access/rtree/rtree.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.37 1999/09/18 19:06:16 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/rtree/Attic/rtree.c,v 1.38 1999/09/24 00:23:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -135,7 +135,7 @@ rtbuild(Relation heap,
 		tupleTable = ExecCreateTupleTable(1);
 		slot = ExecAllocTableSlot(tupleTable);
 		econtext = makeNode(ExprContext);
-		FillDummyExprContext(econtext, slot, hd, buffer);
+		FillDummyExprContext(econtext, slot, hd, InvalidBuffer);
 	}
 	else
 	{
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 1c5dae27be27da0835b999d5f6dc6792b47acde7..b3bf5cd973713d4d4df782bc62ab23c87e0f34a5 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.52 1999/09/16 09:08:56 ishii Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/transam/xact.c,v 1.53 1999/09/24 00:24:05 tgl Exp $
  *
  * NOTES
  *		Transaction aborts can now occur two ways:
@@ -151,6 +151,7 @@
 #include "commands/vacuum.h"
 #include "libpq/be-fsstubs.h"
 #include "storage/proc.h"
+#include "storage/sinval.h"
 #include "utils/temprel.h"
 #include "utils/inval.h"
 #include "utils/portal.h"
@@ -749,8 +750,8 @@ RecordTransactionAbort()
 static void
 AtAbort_Cache()
 {
-	RegisterInvalid(false);
 	RelationCacheAbort();
+	RegisterInvalid(false);		/* intentionally runs after RelationCacheAbort(); NOTE(review): confirm why */
 }
 
 /* --------------------------------
@@ -929,7 +930,7 @@ CommitTransaction()
 	/*
 	 * Let others know about no transaction in progress by me.
 	 * Note that this must be done _before_ releasing locks we hold 
-	 * and SpinAcquire(ShmemIndexLock) is required: UPDATE with xid 0 is 
+	 * and SpinAcquire(SInvalLock) is required: UPDATE with xid 0 is 
 	 * blocked by xid 1' UPDATE, xid 1 is doing commit while xid 2 
 	 * gets snapshot - if xid 2' GetSnapshotData sees xid 1 as running
 	 * then it must see xid 0 as running as well or it will see two
@@ -937,10 +938,11 @@ CommitTransaction()
 	 */
 	if (MyProc != (PROC *) NULL)
 	{
-		SpinAcquire(ShmemIndexLock);
+		/* Lock SInvalLock because that's what GetSnapshotData uses. */
+		SpinAcquire(SInvalLock);
 		MyProc->xid = InvalidTransactionId;
 		MyProc->xmin = InvalidTransactionId;
-		SpinRelease(ShmemIndexLock);
+		SpinRelease(SInvalLock);
 	}
 
 	RelationPurgeLocalRelation(true);
diff --git a/src/backend/catalog/heap.c b/src/backend/catalog/heap.c
index e225a5834dc55a0a5da9f694a80ea64f0102808a..f1051cb784b74916e827c25d0c897283de2af499 100644
--- a/src/backend/catalog/heap.c
+++ b/src/backend/catalog/heap.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.97 1999/09/23 17:02:34 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/heap.c,v 1.98 1999/09/24 00:24:11 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1065,224 +1065,198 @@ DeleteRelationTuple(Relation rel)
  * The routine will truncate and then reconstruct the indices on
  * the relation specified by the heapRelation parameter.
  * --------------------------------
-*/
+ */
+static void
+RelationTruncateIndexes(Relation heapRelation)
+{
+	Relation indexRelation, currentIndex;
+	ScanKeyData entry;
+	HeapScanDesc scan;
+	HeapTuple indexTuple, procTuple, classTuple;
+	Form_pg_index index;
+	Oid heapId, indexId, procId, accessMethodId;
+	Node *oldPred = NULL;
+	PredInfo *predInfo;
+	List *cnfPred = NULL;
+	AttrNumber *attributeNumberA;
+	FuncIndexInfo fInfo, *funcInfo = NULL;
+	int i, numberOfAttributes;
+	char *predString;
+
+	heapId = RelationGetRelid(heapRelation);
+
+	/* Scan pg_index to find indexes on heapRelation */
+
+	indexRelation = heap_openr(IndexRelationName, AccessShareLock);
+	ScanKeyEntryInitialize(&entry, 0, Anum_pg_index_indrelid, F_OIDEQ,
+						   ObjectIdGetDatum(heapId));
+	scan = heap_beginscan(indexRelation, false, SnapshotNow, 1, &entry);
+	while (HeapTupleIsValid(indexTuple = heap_getnext(scan, 0)))
+	{
+		/*
+		 * Reset per-index state.  oldPred and funcInfo are only assigned
+		 * conditionally below, so without this an index with a where
+		 * predicate or an index procedure would leak that info into the
+		 * rebuild of every index scanned after it.
+		 */
+		oldPred = NULL;
+		funcInfo = NULL;
+
+		/*
+		 * For each index, fetch index attributes so we can apply index_build
+		 */
+		index = (Form_pg_index) GETSTRUCT(indexTuple);
+		indexId = index->indexrelid;
+		procId = index->indproc;
+
+		for (i = 0; i < INDEX_MAX_KEYS; i++)
+		{
+			if (index->indkey[i] == InvalidAttrNumber)
+				break;
+		}
+		numberOfAttributes = i;
+
+		/* If a valid where predicate, compute predicate Node */
+		if (VARSIZE(&index->indpred) != 0)
+		{
+			predString = fmgr(F_TEXTOUT, &index->indpred);
+			oldPred = stringToNode(predString);
+			pfree(predString);
+		}
+		predInfo = (PredInfo *) palloc(sizeof(PredInfo));
+		predInfo->pred = (Node *) cnfPred;
+		predInfo->oldPred = oldPred;
+
+		/* Assign Index keys to attributes array */
+		attributeNumberA = (AttrNumber *) palloc(numberOfAttributes *
+												 sizeof(AttrNumber));
+		for (i = 0; i < numberOfAttributes; i++)
+			attributeNumberA[i] = index->indkey[i];
+
+		/* If this is a procedural index, initialize our FuncIndexInfo */
+		if (procId != InvalidOid)
+		{
+			funcInfo = &fInfo;
+			FIsetnArgs(funcInfo, numberOfAttributes);
+			procTuple = SearchSysCacheTuple(PROOID, ObjectIdGetDatum(procId),
+											0, 0, 0);
+			if (!HeapTupleIsValid(procTuple))
+				elog(ERROR, "RelationTruncateIndexes: index procedure not found");
+			namecpy(&(funcInfo->funcName),
+					&(((Form_pg_proc) GETSTRUCT(procTuple))->proname));
+			FIsetProcOid(funcInfo, procTuple->t_data->t_oid);
+		}
+
+		/* Fetch the classTuple associated with this index */
+		classTuple = SearchSysCacheTupleCopy(RELOID, ObjectIdGetDatum(indexId),
+											 0, 0, 0);
+		if (!HeapTupleIsValid(classTuple))
+			elog(ERROR, "RelationTruncateIndexes: index access method not found");
+		accessMethodId = ((Form_pg_class) GETSTRUCT(classTuple))->relam;
+
+		/* Open our index relation */
+		currentIndex = index_open(indexId);
+		if (currentIndex == NULL)
+			elog(ERROR, "RelationTruncateIndexes: can't open index relation");
+
+		/* Obtain exclusive lock on it, just to be sure */
+		LockRelation(currentIndex, AccessExclusiveLock);
+
+		/*
+		 * Release any buffers associated with this index.  If they're dirty,
+		 * they're just dropped without bothering to flush to disk.
+		 */
+		ReleaseRelationBuffers(currentIndex);
+		if (FlushRelationBuffers(currentIndex, (BlockNumber) 0, false) < 0)
+			elog(ERROR, "RelationTruncateIndexes: unable to flush index from buffer pool");
+
+		/* Now truncate the actual data and set blocks to zero */
+		smgrtruncate(DEFAULT_SMGR, currentIndex, 0);
+		currentIndex->rd_nblocks = 0;
+
+		/* Initialize the index and rebuild */
+		InitIndexStrategy(numberOfAttributes, currentIndex, accessMethodId);
+		index_build(heapRelation, currentIndex, numberOfAttributes,
+					attributeNumberA, 0, NULL, funcInfo, predInfo);
 
-static void 
-RelationTruncateIndexes(Relation heapRelation) {
-
-  Relation indexRelation, currentIndex;
-  ScanKeyData entry;
-  HeapScanDesc scan;  
-  HeapTuple indexTuple, procTuple, classTuple;
-  Form_pg_index index;
-  Oid heapId, indexId, procId, accessMethodId;
-  Node *oldPred = NULL;
-  PredInfo *predInfo;
-  List *cnfPred = NULL;
-  AttrNumber *attributeNumberA;
-  FuncIndexInfo fInfo, *funcInfo = NULL;
-  int i, numberOfAttributes;
-  char *predString;
-
-  /*** Save the id of the heap relation ***/
-
-  heapId = RelationGetRelid(heapRelation);
-  
-  /*** Open the System relation, pg_index ***/
-
-  indexRelation = heap_openr(IndexRelationName);
-  
-  /*** Scan pg_index For indexes related to heap relation ***/
-
-  ScanKeyEntryInitialize(&entry, 0x0, Anum_pg_index_indrelid, F_OIDEQ,
-			 ObjectIdGetDatum(heapId));
-
-  scan = heap_beginscan(indexRelation, false, SnapshotNow, 1, &entry);
-  while (HeapTupleIsValid(indexTuple = heap_getnext(scan, 0))) {
-      
-    /*** For each index, fetch index attributes ***/
-
-    index = (Form_pg_index) GETSTRUCT(indexTuple);
-    indexId = index->indexrelid;
-    procId = index->indproc;
-    
-    for (i = 0; i < INDEX_MAX_KEYS; i++) {
-      if (index->indkey[i] == InvalidAttrNumber) break;
-    }
-    numberOfAttributes = i;
-    
-    /*** If a valid where predicate, compute predicate Node ***/
-
-    if (VARSIZE(&index->indpred) != 0) {	
-      predString = fmgr(F_TEXTOUT, &index->indpred);
-      oldPred = stringToNode(predString);
-      pfree(predString);
-    }
-    
-    predInfo = (PredInfo *) palloc(sizeof(PredInfo));
-    predInfo->pred = (Node *) cnfPred;
-    /* predInfo->pred = (Node *) oldPred; */
-    predInfo->oldPred = oldPred;
-
-    /*** Assign Index keys to attributes array ***/
-
-    attributeNumberA = (AttrNumber *) palloc(numberOfAttributes * 
-					     sizeof(attributeNumberA[0]));    
-    for (i = 0; i < numberOfAttributes; i++) {
-      attributeNumberA[i] = index->indkey[i];
-    }
-    
-    /*** If this is a procedural index, initialize our FuncIndexInfo ***/
-
-    if (procId != InvalidOid) {
-      funcInfo = &fInfo;
-      FIsetnArgs(funcInfo, numberOfAttributes);      
-      procTuple = SearchSysCacheTuple(PROOID, ObjectIdGetDatum(procId),
-				      0, 0, 0);
-      if (!HeapTupleIsValid(procTuple)) {
-	elog(ERROR, "RelationTruncateIndexes: index procedure not found");
-      }
-      namecpy(&(funcInfo->funcName),
-	      &(((Form_pg_proc) GETSTRUCT(procTuple))->proname));
-      FIsetProcOid(funcInfo, procTuple->t_data->t_oid);
-    }
-
-    /*** Fetch the classTuple associated with this index ***/
-    
-    classTuple = SearchSysCacheTupleCopy(RELOID, ObjectIdGetDatum(indexId),
-					 0, 0, 0);
-    if (!HeapTupleIsValid(classTuple)) {
-      elog(ERROR, "RelationTruncateIndexes: index access method not found");
-    }
-    accessMethodId = ((Form_pg_class) GETSTRUCT(classTuple))->relam;
-
-    /*** Open our index relation ***/
-    
-    currentIndex = index_open(indexId);
-    if (currentIndex == NULL) {
-      elog(ERROR, "RelationTruncateIndexes: can't open index relation");
-    }
-
-    /*** Truncate the index before building ***/
-
-    smgrtruncate(DEFAULT_SMGR, currentIndex, 0);
-    currentIndex->rd_nblocks = 0;
-    
-    /*** Initialize the index and rebuild ***/
-
-    InitIndexStrategy(numberOfAttributes, currentIndex, accessMethodId);
-    index_build(heapRelation, currentIndex, numberOfAttributes,
-		attributeNumberA, 0, NULL, funcInfo, predInfo);
-
-    /*** Re-open our heap relation and re-lock, since index_build ***/
-    /*** will close and unlock the relation ***/
-
-    heapRelation = heap_open(heapId);
-    LockRelation(heapRelation, AccessExclusiveLock);
-
-    /*** RelationUnsetLockForWrite(currentIndex); ***/
-    
-  }
-
-  /*** Complete the scan and close the Catalogueindex Relation ***/
-  
-  heap_endscan(scan);
-  heap_close(indexRelation);
+		/*
+		 * index_build will close both the heap and index relations
+		 * (but not give up the locks we hold on them).  That's fine
+		 * for the index, but we need to open the heap again.  We need
+		 * no new lock, since this backend still has the exclusive lock
+		 * grabbed by heap_truncate.
+		 */
+		heapRelation = heap_open(heapId, NoLock);
+		Assert(heapRelation != NULL);
+	}
 
+	/* Complete the scan and close pg_index */
+	heap_endscan(scan);
+	heap_close(indexRelation, AccessShareLock);
 }
 
 /* ----------------------------
  *   heap_truncate
- *   
- *   This routine is used to truncate the data from the 
- *   storange manager of any data within the relation handed
- *   to this routine.  The routine assumes that the relation 
- *   handed to this routine is an open relation.  
  *
+ *   This routine is used to truncate the data from the
+ *   storage manager of any data within the relation handed
+ *   to this routine.
  * ----------------------------
  */
 
-void 
-heap_truncate(char *relname) {
-  
-  Relation rel;
-  Oid rid; 	
-  Portal portal;
-  char *pname;
-  MemoryContext old;
-  PortalVariableMemory pmem;
-  NameData truncRel;
-
-  /*
-   * Create a portal for safe memory across transctions. We need to
-   * palloc the name space for it because our hash function expects the
-   * name to be on a longword boundary.  CreatePortal copies the name to
-   * safe storage for us.
-   */
-  
-  pname = (char *) palloc(strlen(TRUNCPNAME) + 1);
-  strcpy(pname, TRUNCPNAME);
-  portal = CreatePortal(pname);
-  pfree(pname);
-
-  /* relname gets de-allocated on transaction commit */
-  
-  strcpy(truncRel.data, relname);
-  
-  pmem = PortalGetVariableMemory(portal);
-  old = MemoryContextSwitchTo((MemoryContext) pmem);
-  MemoryContextSwitchTo(old);
-  
-  /* Commit the current transaction */
-  
-  CommitTransactionCommand();
-  StartTransactionCommand();
-     
-  /* Open relation for processing */
-
-  rel = heap_openr(truncRel.data);
-  if (rel == NULL)
-    elog(ERROR, "Relation %s Does Not Exist!", truncRel.data);
-  rid = rel->rd_id;
-
-  LockRelation(rel, AccessExclusiveLock); 
-
-  /* Release any buffers associated with this relation */
-
-  ReleaseRelationBuffers(rel);  
-  BlowawayRelationBuffers(rel, 0);
-
-  /* Now truncate the actual data and set blocks to zero */
-  
-  smgrtruncate(DEFAULT_SMGR, rel, 0);
-  rel->rd_nblocks = 0;
-
-  /* If this relation has indexes, truncate the indexes, which */
-  /* will unlock the relation as a result.  Otherwise, unlock */
-  /* the relation ourselves. */
-  
-  if (rel->rd_rel->relhasindex) {
-    RelationTruncateIndexes(rel);
-  } else {
-    UnlockRelation(rel, AccessExclusiveLock);
-  }
-
-  /* Close our relation */
-  
-  heap_close(rel);
-  RelationForgetRelation(rid);
-  
-  /* Destoy cross-transaction memory */
-
-  PortalDestroy(&portal);
-
-  /* Start new transaction */
-
-  CommitTransactionCommand();
-  StartTransactionCommand();
-  
-  return;
+void
+heap_truncate(char *relname)
+{
+	Relation	targetRel;
+	Oid			targetOid;
+
+	/* Open the relation by name, taking exclusive access immediately. */
+	targetRel = heap_openr(relname, AccessExclusiveLock);
+	targetOid = targetRel->rd_id;
+
+	/*
+	 * Physically truncating a relation inside a transaction block is
+	 * hazardous: should the transaction roll back later, there is no way
+	 * to restore the truncated file.  For now this is permitted, with a
+	 * warning.  (Deferring the physical truncate until commit might be
+	 * worth considering someday, but that's a lot of work.)
+	 *
+	 * The exception is a relation created in the current transaction:
+	 * after an abort it would not exist anyway, so no warning is issued.
+	 */
+	if (IsTransactionBlock() && !targetRel->rd_myxactonly)
+		elog(NOTICE, "Caution: TRUNCATE TABLE cannot be rolled back, so don't abort now");
+
+	/*
+	 * Get the relation's pages out of the buffer pool before touching
+	 * the file.  Dirty pages are simply discarded, not written to disk.
+	 */
+	ReleaseRelationBuffers(targetRel);
+	if (FlushRelationBuffers(targetRel, (BlockNumber) 0, false) < 0)
+		elog(ERROR, "heap_truncate: unable to flush relation from buffer pool");
 
+	/* With the buffers gone, cut the physical data back to zero blocks. */
+	smgrtruncate(DEFAULT_SMGR, targetRel, 0);
+	targetRel->rd_nblocks = 0;
+
+	/*
+	 * Indexes, if the relation has any, are truncated and rebuilt too.
+	 */
+	if (targetRel->rd_rel->relhasindex)
+		RelationTruncateIndexes(targetRel);
+
+	/*
+	 * Done with the relation itself.  Closing with NoLock leaves our
+	 * exclusive lock in place until commit.
+	 */
+	heap_close(targetRel, NoLock);
+
+	/*
+	 * Finally, call RelationForgetRelation for the relation's OID.
+	 *
+	 * XXX is this really necessary?
+	 */
+	RelationForgetRelation(targetOid);
 }
 
 
@@ -1468,15 +1433,19 @@ heap_destroy_with_catalog(char *relname)
 			 &rel->rd_rel->relname);
 
 	/* ----------------
-	 *	We do not allow DROP TABLE within a transaction block, because
-	 *	if the transaction is later rolled back there would be no way to
-	 *	undo the unlink of the relation's physical file.  The sole exception
-	 *	is for relations created in the current transaction, since the post-
-	 *	abort state would be that they don't exist anyway.
+	 *	DROP TABLE within a transaction block is dangerous, because
+	 *	if the transaction is later rolled back there will be no way to
+	 *	undo the unlink of the relation's physical file.  For now, allow it
+	 *	but emit a warning message.
+	 *	Someday we might want to consider postponing the physical unlink
+	 *	until transaction commit, but that's a lot of work...
+	 *	The only case that actually works right is for relations created
+	 *	in the current transaction, since the post-abort state would be that
+	 *	they don't exist anyway.  So, no warning in that case.
 	 * ----------------
 	 */
 	if (IsTransactionBlock() && ! rel->rd_myxactonly)
-		elog(ERROR, "Cannot destroy relation within a transaction block");
+		elog(NOTICE, "Caution: DROP TABLE cannot be rolled back, so don't abort now");
 
 	/* ----------------
 	 *	remove inheritance information
diff --git a/src/backend/catalog/index.c b/src/backend/catalog/index.c
index 912996fb1fc7f8a92882ef64bd91505421a88712..f8c4dac95e77278290f56b13ecf300cc03dcf43b 100644
--- a/src/backend/catalog/index.c
+++ b/src/backend/catalog/index.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/catalog/index.c,v 1.90 1999/09/18 19:06:33 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/catalog/index.c,v 1.91 1999/09/24 00:24:11 tgl Exp $
  *
  *
  * INTERFACE ROUTINES
@@ -1113,15 +1113,19 @@ index_destroy(Oid indexId)
 	LockRelation(userindexRelation, AccessExclusiveLock);
 
 	/* ----------------
-	 *	We do not allow DROP INDEX within a transaction block, because
-	 *	if the transaction is later rolled back there would be no way to
-	 *	undo the unlink of the relation's physical file.  The sole exception
-	 *	is for relations created in the current transaction, since the post-
-	 *	abort state would be that they don't exist anyway.
+	 *	DROP INDEX within a transaction block is dangerous, because
+	 *	if the transaction is later rolled back there will be no way to
+	 *	undo the unlink of the relation's physical file.  For now, allow it
+	 *	but emit a warning message.
+	 *	Someday we might want to consider postponing the physical unlink
+	 *	until transaction commit, but that's a lot of work...
+	 *	The only case that actually works right is for relations created
+	 *	in the current transaction, since the post-abort state would be that
+	 *	they don't exist anyway.  So, no warning in that case.
 	 * ----------------
 	 */
 	if (IsTransactionBlock() && ! userindexRelation->rd_myxactonly)
-		elog(ERROR, "Cannot destroy index within a transaction block");
+		elog(NOTICE, "Caution: DROP INDEX cannot be rolled back, so don't abort now");
 
 	/* ----------------
 	 * fix RELATION relation
@@ -1370,7 +1374,7 @@ UpdateStats(Oid relid, long reltuples, bool hasindex)
 		rd_rel->relpages = relpages;
 		rd_rel->reltuples = reltuples;
 		rd_rel->relhasindex = hasindex;
-		WriteBuffer(pg_class_scan->rs_cbuf);
+		WriteNoReleaseBuffer(pg_class_scan->rs_cbuf);
 	}
 	else
 	{
@@ -1413,6 +1417,9 @@ UpdateStats(Oid relid, long reltuples, bool hasindex)
  *		FillDummyExprContext
  *			Sets up dummy ExprContext and TupleTableSlot objects for use
  *			with ExecQual.
+ *
+ *			NOTE: buffer is passed for historical reasons; it should
+ *			almost certainly always be InvalidBuffer.
  * -------------------------
  */
 void
@@ -1508,7 +1515,6 @@ DefaultBuild(Relation heapRelation,
 		tupleTable = ExecCreateTupleTable(1);
 		slot = ExecAllocTableSlot(tupleTable);
 		econtext = makeNode(ExprContext);
-		/* last parameter was junk being sent bjm 1998/08/17 */
 		FillDummyExprContext(econtext, slot, heapDescriptor, InvalidBuffer);
 	}
 	else
@@ -1605,7 +1611,8 @@ DefaultBuild(Relation heapRelation,
 #ifndef OMIT_PARTIAL_INDEX
 	if (predicate != NULL || oldPred != NULL)
 	{
-		ExecDestroyTupleTable(tupleTable, false);
+		/* parameter was 'false', almost certainly wrong --- tgl 9/21/99 */
+		ExecDestroyTupleTable(tupleTable, true);
 	}
 #endif	 /* OMIT_PARTIAL_INDEX */
 
diff --git a/src/backend/commands/dbcommands.c b/src/backend/commands/dbcommands.c
index 24eb5b531d392f4a014a965951aacf68f582b5a4..b243dd173b19fa2a64a9898abb30c0a7de163ab8 100644
--- a/src/backend/commands/dbcommands.c
+++ b/src/backend/commands/dbcommands.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/dbcommands.c,v 1.40 1999/09/18 19:06:40 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/dbcommands.c,v 1.41 1999/09/24 00:24:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,6 +22,7 @@
 #include "catalog/pg_shadow.h"
 #include "commands/dbcommands.h"
 #include "miscadmin.h"
+#include "storage/sinval.h"
 #include "tcop/tcopprot.h"
 #include "utils/syscache.h"
 
@@ -89,7 +90,11 @@ destroydb(char *dbname, CommandDest dest)
 	Oid			db_id;
 	char	   *path,
 				dbpath[MAXPGPATH + 1],
-				buf[512];
+				buf[MAXPGPATH + 50];
+	Relation	pgdbrel;
+	HeapScanDesc pgdbscan;
+	ScanKeyData	key;
+	HeapTuple	tup;
 
 	/*
 	 * If this call returns, the database exists and we're allowed to
@@ -97,36 +102,79 @@ destroydb(char *dbname, CommandDest dest)
 	 */
 	check_permissions("destroydb", dbpath, dbname, &db_id, &user_id);
 
+	/* do as much checking as we can... */
 	if (!OidIsValid(db_id))
 		elog(FATAL, "pg_database instance has an invalid OID");
 
-	/* stop the vacuum daemon */
-	stop_vacuum(dbpath, dbname);
-
-	/* XXX what about stopping backends connected to the target database? */
-
 	path = ExpandDatabasePath(dbpath);
 	if (path == NULL)
 		elog(ERROR, "Unable to locate path '%s'"
 			 "\n\tThis may be due to a missing environment variable"
 			 " in the server", dbpath);
 
+	/* stop the vacuum daemon (dead code...) */
+	stop_vacuum(dbpath, dbname);
+
 	/*
-	 * remove the pg_database tuple FIRST, this may fail due to
-	 * permissions problems
+	 * Obtain exclusive lock on pg_database.  We need this to ensure
+	 * that no new backend starts up in the target database while we
+	 * are deleting it.  (Actually, a new backend might still manage to
+	 * start up, because it will read pg_database without any locking
+	 * to discover the database's OID.  But it will detect its error
+	 * in ReverifyMyDatabase and shut down before any serious damage
+	 * is done.  See postinit.c.)
 	 */
-	snprintf(buf, 512,
-	"delete from pg_database where pg_database.oid = \'%u\'::oid", db_id);
-	pg_exec_query_dest(buf, dest, false);
+	pgdbrel = heap_openr(DatabaseRelationName, AccessExclusiveLock);
+
+	/*
+	 * Check for active backends in the target database.
+	 */
+	if (DatabaseHasActiveBackends(db_id))
+		elog(ERROR, "Database '%s' has running backends, can't destroy it",
+			 dbname);
+
+	/*
+	 * Find the database's tuple by OID (should be unique, we trust).
+	 */
+	ScanKeyEntryInitialize(&key, 0, ObjectIdAttributeNumber,
+						   F_OIDEQ, ObjectIdGetDatum(db_id));
+
+	pgdbscan = heap_beginscan(pgdbrel, 0, SnapshotNow, 1, &key);
 
-	/* drop pages for this database that are in the shared buffer cache */
+	tup = heap_getnext(pgdbscan, 0);
+	if (!HeapTupleIsValid(tup))
+	{
+		heap_close(pgdbrel, AccessExclusiveLock);
+		elog(ERROR, "Database '%s', OID %u, not found in pg_database",
+			 dbname, db_id);
+	}
+
+	/*
+	 * Houston, we have launch commit...
+	 *
+	 * Remove the database's tuple from pg_database.
+	 */
+	heap_delete(pgdbrel, &tup->t_self, NULL);
+
+	heap_endscan(pgdbscan);
+
+	/*
+	 * Close pg_database, but keep exclusive lock till commit to ensure
+	 * that any new backend scanning pg_database will see the tuple dead.
+	 */
+	heap_close(pgdbrel, NoLock);
+
+	/*
+	 * Drop pages for this database that are in the shared buffer cache.
+	 * This is important to ensure that no remaining backend tries to
+	 * write out a dirty buffer to the dead database later...
+	 */
 	DropBuffers(db_id);
 
 	/*
-	 * remove the data directory. If the DELETE above failed, this will
-	 * not be reached
+	 * Remove the database's subdirectory and everything in it.
 	 */
-	snprintf(buf, 512, "rm -r %s", path);
+	snprintf(buf, sizeof(buf), "rm -r '%s'", path);
 	system(buf);
 }
 
@@ -274,22 +322,28 @@ check_permissions(char *command,
 }	/* check_permissions() */
 
 /*
- *	stop_vacuum() -- stop the vacuum daemon on the database, if one is running.
+ *	stop_vacuum -- stop the vacuum daemon on the database, if one is running.
+ *
+ *	This is currently dead code, since we don't *have* vacuum daemons.
+ *	If you want to re-enable it, think about the interlock against deleting
+ *	a database out from under running backends, in destroydb() above.
  */
 static void
 stop_vacuum(char *dbpath, char *dbname)
 {
-	char		filename[256];
+#ifdef NOT_USED
+	char		filename[MAXPGPATH + 1];
 	FILE	   *fp;
 	int			pid;
 
 	if (strchr(dbpath, SEP_CHAR) != 0)
 	{
-		snprintf(filename, 256, "%s%cbase%c%s%c%s.vacuum",
+		snprintf(filename, sizeof(filename), "%s%cbase%c%s%c%s.vacuum",
 				 DataDir, SEP_CHAR, SEP_CHAR, dbname, SEP_CHAR, dbname);
 	}
 	else
-		snprintf(filename, 256, "%s%c%s.vacuum", dbpath, SEP_CHAR, dbname);
+		snprintf(filename, sizeof(filename), "%s%c%s.vacuum",
+				 dbpath, SEP_CHAR, dbname);
 
 #ifndef __CYGWIN32__
 	if ((fp = AllocateFile(filename, "r")) != NULL)
@@ -305,4 +359,5 @@ stop_vacuum(char *dbpath, char *dbname)
 				 pid, dbname);
 		}
 	}
+#endif
 }
diff --git a/src/backend/commands/rename.c b/src/backend/commands/rename.c
index 3a822bd4e49ce6c2782440124fa3657b8a201460..0a72ba497ec1db82c68e139baf639eea9e1b823c 100644
--- a/src/backend/commands/rename.c
+++ b/src/backend/commands/rename.c
@@ -7,12 +7,14 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/rename.c,v 1.33 1999/09/18 19:06:40 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/Attic/rename.c,v 1.34 1999/09/24 00:24:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
+#include <errno.h>
+
 #include "access/heapam.h"
 #include "catalog/catname.h"
 #include "utils/syscache.h"
@@ -21,6 +23,7 @@
 #include "catalog/catalog.h"
 #include "commands/rename.h"
 #include "miscadmin.h"
+#include "storage/smgr.h"
 #include "optimizer/prep.h"
 #include "utils/acl.h"
 
@@ -166,19 +169,6 @@ renameatt(char *relname,
 
 /*
  *		renamerel		- change the name of a relation
- *
- *		Relname attribute is changed in relation catalog.
- *		No record of the previous relname is kept (correct?).
- *
- *		scan relation catalog
- *				for name conflict
- *				for original relation (if not arg)
- *		modify relname in relation tuple
- *		insert modified relation in relation catalog
- *		delete original relation from relation catalog
- *
- *		XXX Will currently lose track of a relation if it is unable to
- *				properly replace the new relation tuple.
  */
 void
 renamerel(char *oldrelname, char *newrelname)
@@ -206,8 +196,55 @@ renamerel(char *oldrelname, char *newrelname)
 	 * until end of transaction.
 	 */
 	targetrelation = heap_openr(oldrelname, AccessExclusiveLock);
-	heap_close(targetrelation, NoLock);	/* close rel but keep lock! */
 
+	/* ----------------
+	 *	RENAME TABLE within a transaction block is dangerous, because
+	 *	if the transaction is later rolled back we have no way to
+	 *	undo the rename of the relation's physical file.  For now, allow it
+	 *	but emit a warning message.
+	 *	Someday we might want to consider postponing the physical rename
+	 *	until transaction commit, but that's a lot of work...
+	 *	The only case that actually works right is for relations created
+	 *	in the current transaction, since the post-abort state would be that
+	 *	they don't exist anyway.  So, no warning in that case.
+	 * ----------------
+	 */
+	if (IsTransactionBlock() && ! targetrelation->rd_myxactonly)
+		elog(NOTICE, "Caution: RENAME TABLE cannot be rolled back, so don't abort now");
+
+	/*
+	 * Flush all blocks of the relation out of the buffer pool.  We need this
+	 * because the blocks are marked with the relation's name as well as OID.
+	 * If some backend tries to write a dirty buffer with mdblindwrt after
+	 * we've renamed the physical file, we'll be in big trouble.
+	 *
+	 * Since we hold the exclusive lock on the relation, we don't have to
+	 * worry about more blocks being read in while we finish the rename.
+	 */
+	if (FlushRelationBuffers(targetrelation, (BlockNumber) 0, true) < 0)
+		elog(ERROR, "renamerel: unable to flush relation from buffer pool");
+
+	/*
+	 * Make sure smgr and lower levels close the relation's files.
+	 * (Next access to rel will reopen them.)
+	 *
+	 * Note: we rely on shared cache invalidation message to make other
+	 * backends close and re-open the files.
+	 */
+	smgrclose(DEFAULT_SMGR, targetrelation);
+
+	/*
+	 * Close rel, but keep exclusive lock!
+	 *
+	 * Note: we don't do anything about updating the relcache entry;
+	 * we assume it will be flushed by shared cache invalidate.
+	 * XXX is this good enough?  What if relation is myxactonly?
+	 */
+	heap_close(targetrelation, NoLock);
+
+	/*
+	 * Find relation's pg_class tuple, and make sure newrelname isn't in use.
+	 */
 	relrelation = heap_openr(RelationRelationName, RowExclusiveLock);
 
 	oldreltup = SearchSysCacheTupleCopy(RELNAME,
@@ -220,14 +257,17 @@ renamerel(char *oldrelname, char *newrelname)
 		elog(ERROR, "renamerel: relation \"%s\" exists", newrelname);
 
 	/*
-	 * XXX need to close relation and flush dirty buffers here!
+	 * Perform physical rename of files.  If this fails, we haven't yet
+	 * done anything irreversible.
+	 *
+	 * XXX smgr.c ought to provide an interface for this; doing it
+	 * directly is bletcherous.
 	 */
-
-	/* rename the path first, so if this fails the rename's not done */
 	strcpy(oldpath, relpath(oldrelname));
 	strcpy(newpath, relpath(newrelname));
 	if (rename(oldpath, newpath) < 0)
-		elog(ERROR, "renamerel: unable to rename file: %s", oldpath);
+		elog(ERROR, "renamerel: unable to rename %s to %s: %m",
+			 oldpath, newpath);
 
 	/* rename additional segments of relation, too */
 	for (i = 1;; i++)
@@ -235,13 +275,22 @@ renamerel(char *oldrelname, char *newrelname)
 		sprintf(toldpath, "%s.%d", oldpath, i);
 		sprintf(tnewpath, "%s.%d", newpath, i);
 		if (rename(toldpath, tnewpath) < 0)
-			break;
+		{
+			/* expected case is that there's not another segment file */
+			if (errno == ENOENT)
+				break;
+			/* otherwise we're up the creek: the base file is already renamed */
+			elog(ERROR, "renamerel: unable to rename %s to %s: %m",
+				 toldpath, tnewpath);
+		}
 	}
 
+	/*
+	 * Update pg_class tuple with new relname.
+	 */
 	StrNCpy((((Form_pg_class) GETSTRUCT(oldreltup))->relname.data),
 			newrelname, NAMEDATALEN);
 
-	/* insert fixed rel tuple */
 	heap_replace(relrelation, &oldreltup->t_self, oldreltup, NULL);
 
 	/* keep the system catalog indices current */
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index 3027763b46861368eaf2f585795ec2b9440f185c..e5cf7b0c88aebd0334845e7a01e7748779e3f07a 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.120 1999/09/18 19:06:41 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/commands/vacuum.c,v 1.121 1999/09/24 00:24:17 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -30,6 +30,7 @@
 #include "commands/vacuum.h"
 #include "miscadmin.h"
 #include "parser/parse_oper.h"
+#include "storage/sinval.h"
 #include "storage/smgr.h"
 #include "utils/builtins.h"
 #include "utils/inval.h"
@@ -46,8 +47,6 @@
 
  /* #include <port-protos.h> *//* Why? */
 
-extern int	BlowawayRelationBuffers(Relation rel, BlockNumber block);
-
 bool		VacuumRunning = false;
 
 static Portal vc_portal;
@@ -1838,9 +1837,9 @@ Elapsed %u/%u sec.",
 	/* truncate relation */
 	if (blkno < nblocks)
 	{
-		i = BlowawayRelationBuffers(onerel, blkno);
+		i = FlushRelationBuffers(onerel, blkno, false);
 		if (i < 0)
-			elog(FATAL, "VACUUM (vc_rpfheap): BlowawayRelationBuffers returned %d", i);
+			elog(FATAL, "VACUUM (vc_rpfheap): FlushRelationBuffers returned %d", i);
 		blkno = smgrtruncate(DEFAULT_SMGR, onerel, blkno);
 		Assert(blkno >= 0);
 		vacrelstats->num_pages = blkno; /* set new number of blocks */
@@ -1902,12 +1901,14 @@ vc_vacheap(VRelStats *vacrelstats, Relation onerel, VPageList vacuum_pages)
 		/*
 		 * we have to flush "empty" end-pages (if changed, but who knows
 		 * it) before truncation
+		 *
+		 * XXX wouldn't passing 'true' to FlushRelationBuffers do the job?
 		 */
 		FlushBufferPool(!TransactionFlushEnabled());
 
-		i = BlowawayRelationBuffers(onerel, nblocks);
+		i = FlushRelationBuffers(onerel, nblocks, false);
 		if (i < 0)
-			elog(FATAL, "VACUUM (vc_vacheap): BlowawayRelationBuffers returned %d", i);
+			elog(FATAL, "VACUUM (vc_vacheap): FlushRelationBuffers returned %d", i);
 
 		nblocks = smgrtruncate(DEFAULT_SMGR, onerel, nblocks);
 		Assert(nblocks >= 0);
diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c
index 97dffe548f74577027a585331782d70d9ba4b1da..f07f8777a2f94b226e353d3746eaf8f959b6e5df 100644
--- a/src/backend/executor/execMain.c
+++ b/src/backend/executor/execMain.c
@@ -26,7 +26,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/execMain.c,v 1.94 1999/09/18 19:06:47 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/execMain.c,v 1.95 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -130,16 +130,6 @@ ExecutorStart(QueryDesc *queryDesc, EState *estate)
 					  queryDesc->plantree,
 					  estate);
 
-	/*
-	 * reset buffer refcount.  the current refcounts are saved and will be
-	 * restored when ExecutorEnd is called
-	 *
-	 * this makes sure that when ExecutorRun's are called recursively as for
-	 * postquel functions, the buffers pinned by one ExecutorRun will not
-	 * be unpinned by another ExecutorRun.
-	 */
-	BufferRefCountReset(estate->es_refcount);
-
 	return result;
 }
 
@@ -385,10 +375,6 @@ ExecutorEnd(QueryDesc *queryDesc, EState *estate)
 		pfree(estate->es_param_exec_vals);
 		estate->es_param_exec_vals = NULL;
 	}
-
-	/* restore saved refcounts. */
-	BufferRefCountRestore(estate->es_refcount);
-
 }
 
 void
@@ -802,7 +788,7 @@ EndPlan(Plan *plan, EState *estate)
 	{
 		TupleTable	tupleTable = (TupleTable) estate->es_tupleTable;
 
-		ExecDestroyTupleTable(tupleTable, true);		/* was missing last arg */
+		ExecDestroyTupleTable(tupleTable, true);
 		estate->es_tupleTable = NULL;
 	}
 
@@ -1678,7 +1664,6 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid)
 						   sizeof(ParamExecData));
 			epqstate->es_tupleTable =
 				ExecCreateTupleTable(estate->es_tupleTable->size);
-			epqstate->es_refcount = estate->es_refcount;
 			/* ... rest */
 			newepq->plan = copyObject(estate->es_origPlan);
 			newepq->free = NULL;
diff --git a/src/backend/executor/execQual.c b/src/backend/executor/execQual.c
index a44030aa406e5a48320d9571addf6492397a2b1a..2886cab7253d0e60d9b752205aa3dea77b3ced45 100644
--- a/src/backend/executor/execQual.c
+++ b/src/backend/executor/execQual.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/execQual.c,v 1.59 1999/09/18 23:26:37 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/execQual.c,v 1.60 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -637,7 +637,8 @@ ExecEvalFuncArgs(FunctionCachePtr fcache,
 
 		if (!(*argIsDone))
 		{
-			Assert(i == 0);
+			if (i != 0)
+				elog(ERROR, "functions can only take sets in their first argument");
 			fcache->setArg = (char *) argV[0];
 			fcache->hasSetArg = true;
 		}
@@ -758,35 +759,48 @@ ExecMakeFunctionResult(Node *node,
 	if (fcache->language == SQLlanguageId)
 	{
 		Datum		result;
+		bool		argDone;
 
 		Assert(funcNode);
-		result = postquel_function(funcNode, (char **) argV, isNull, isDone);
 
-		/*
-		 * finagle the situation where we are iterating through all
-		 * results in a nested dot function (whose argument function
+		/*--------------------
+		 * This loop handles the situation where we are iterating through
+		 * all results in a nested dot function (whose argument function
 		 * returns a set of tuples) and the current function finally
-		 * finishes.  We need to get the next argument in the set and run
-		 * the function all over again.  This is getting unclean.
+		 * finishes.  We need to get the next argument in the set and start
+		 * the function all over again.  We might have to do it more than
+		 * once, if the function produces no results for a particular argument.
+		 * This is getting unclean.
+		 *--------------------
 		 */
-		if ((*isDone) && (fcache->hasSetArg))
+		for (;;)
 		{
-			bool		argDone;
+			result = postquel_function(funcNode, (char **) argV,
+									   isNull, isDone);
+
+			if (! *isDone)
+				break;			/* got a result from current argument */
+			if (! fcache->hasSetArg)
+				break;			/* input not a set, so done */
 
+			/* OK, get the next argument... */
 			ExecEvalFuncArgs(fcache, econtext, arguments, argV, &argDone);
 
 			if (argDone)
 			{
+				/* End of arguments, so reset the setArg flag and say "Done" */
 				fcache->setArg = (char *) NULL;
+				fcache->hasSetArg = false;
 				*isDone = true;
 				result = (Datum) NULL;
+				break;
 			}
-			else
-				result = postquel_function(funcNode,
-										   (char **) argV,
-										   isNull,
-										   isDone);
+
+			/* If we reach here, loop around to run the function on the
+			 * new argument.
+			 */
 		}
+
 		if (funcisset)
 		{
 
@@ -805,6 +819,7 @@ ExecMakeFunctionResult(Node *node,
 			if (*isDone)
 				((Func *) node)->func_fcache = NULL;
 		}
+
 		return result;
 	}
 	else
@@ -1424,8 +1439,10 @@ ExecTargetList(List *targetlist,
 {
 	char		nulls_array[64];
 	bool		fjNullArray[64];
-	bool	   *fjIsNull;
+	bool		itemIsDoneArray[64];
 	char	   *null_head;
+	bool	   *fjIsNull;
+	bool	   *itemIsDone;
 	List	   *tl;
 	TargetEntry *tle;
 	Node	   *expr;
@@ -1434,6 +1451,7 @@ ExecTargetList(List *targetlist,
 	Datum		constvalue;
 	HeapTuple	newTuple;
 	bool		isNull;
+	bool		haveDoneIters;
 	static struct tupleDesc NullTupleDesc; /* we assume this inits to zeroes */
 
 	/*
@@ -1457,24 +1475,30 @@ ExecTargetList(List *targetlist,
 	/*
 	 * allocate an array of char's to hold the "null" information only if
 	 * we have a really large targetlist.  otherwise we use the stack.
+	 *
+	 * We also allocate a bool array that is used to hold fjoin result state,
+	 * and another that holds the isDone status for each targetlist item.
 	 */
 	if (nodomains > 64)
 	{
 		null_head = (char *) palloc(nodomains + 1);
 		fjIsNull = (bool *) palloc(nodomains + 1);
+		itemIsDone = (bool *) palloc(nodomains + 1);
 	}
 	else
 	{
 		null_head = &nulls_array[0];
 		fjIsNull = &fjNullArray[0];
+		itemIsDone = &itemIsDoneArray[0];
 	}
 
 	/*
 	 * evaluate all the expressions in the target list
 	 */
-	EV_printf("ExecTargetList: setting target list values\n");
 
-	*isDone = true;
+	*isDone = true;				/* until proven otherwise */
+	haveDoneIters = false;		/* any isDone Iter exprs in tlist? */
+
 	foreach(tl, targetlist)
 	{
 
@@ -1493,13 +1517,11 @@ ExecTargetList(List *targetlist,
 			expr = tle->expr;
 			resdom = tle->resdom;
 			resind = resdom->resno - 1;
+
 			constvalue = (Datum) ExecEvalExpr(expr,
 											  econtext,
 											  &isNull,
-											  isDone);
-
-			if ((IsA(expr, Iter)) && (*isDone))
-				return (HeapTuple) NULL;
+											  &itemIsDone[resind]);
 
 			values[resind] = constvalue;
 
@@ -1507,6 +1529,14 @@ ExecTargetList(List *targetlist,
 				null_head[resind] = ' ';
 			else
 				null_head[resind] = 'n';
+
+			if (IsA(expr, Iter))
+			{
+				if (itemIsDone[resind])
+					haveDoneIters = true;
+				else
+					*isDone = false; /* we have undone Iters in the list */
+			}
 		}
 		else
 		{
@@ -1518,6 +1548,8 @@ ExecTargetList(List *targetlist,
 			DatumPtr	results = fjNode->fj_results;
 
 			ExecEvalFjoin(tle, econtext, fjIsNull, isDone);
+
+			/* this is probably wrong; it also bypasses the cleanup at exit: */
 			if (*isDone)
 				return (HeapTuple) NULL;
 
@@ -1558,18 +1590,86 @@ ExecTargetList(List *targetlist,
 		}
 	}
 
+	if (haveDoneIters)
+	{
+		if (*isDone)
+		{
+			/* all Iters are done, so return a null indicating tlist set
+			 * expansion is complete.
+			 */
+			newTuple = NULL;
+			goto exit;
+		}
+		else
+		{
+			/* We have some done and some undone Iters.  Restart the done ones
+			 * so that we can deliver a tuple (if possible).
+			 *
+			 * XXX this code is a crock, because it only works for Iters at
+			 * the top level of tlist expressions, and doesn't even work right
+			 * for them: you should get all possible combinations of Iter
+			 * results, but you won't unless the numbers of values returned by
+			 * each are relatively prime.  Should have a mechanism more like
+			 * aggregate functions, where we make a list of all Iters
+			 * contained in the tlist and cycle through their values in a
+			 * methodical fashion.  To do someday; can't get excited about
+			 * fixing a Berkeley feature that's not in SQL92.  (The only
+			 * reason we're doing this much is that we have to be sure all
+			 * the Iters are run to completion, or their subplan executors
+			 * will have unreleased resources, e.g. pinned buffers...)
+			 */
+			foreach(tl, targetlist)
+			{
+				tle = lfirst(tl);
+
+				if (tle->resdom != NULL)
+				{
+					expr = tle->expr;
+					resdom = tle->resdom;
+					resind = resdom->resno - 1;
+
+					if (IsA(expr, Iter) && itemIsDone[resind])
+					{
+						constvalue = (Datum) ExecEvalExpr(expr,
+														  econtext,
+														  &isNull,
+														  &itemIsDone[resind]);
+						if (itemIsDone[resind])
+						{
+							/* Oh dear, this Iter is returning an empty set.
+							 * Guess we can't make a tuple after all.
+							 */
+							*isDone = true;
+							newTuple = NULL;
+							goto exit;
+						}
+
+						values[resind] = constvalue;
+
+						if (!isNull)
+							null_head[resind] = ' ';
+						else
+							null_head[resind] = 'n';
+					}
+				}
+			}
+		}
+	}
+
 	/*
 	 * form the new result tuple (in the "normal" context)
 	 */
 	newTuple = (HeapTuple) heap_formtuple(targettype, values, null_head);
 
+exit:
 	/*
-	 * free the nulls array if we allocated one..
+	 * free the status arrays if we palloc'd them
 	 */
 	if (nodomains > 64)
 	{
 		pfree(null_head);
 		pfree(fjIsNull);
+		pfree(itemIsDone);
 	}
 
 	return newTuple;
diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index 6e2e249c9a843a287bffe7748f2687139a869827..835dba7c5cd27743ba8ebb7ae761ae4accd1c692 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -14,7 +14,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/execTuples.c,v 1.29 1999/07/17 20:16:57 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/execTuples.c,v 1.30 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -38,9 +38,6 @@
  *		ExecSetSlotDescriptor	- set a slot's tuple descriptor
  *		ExecSetSlotDescriptorIsNew - diddle the slot-desc-is-new flag
  *		ExecSetNewSlotDescriptor - set a desc and the is-new-flag all at once
- *		ExecSlotBuffer			- return buffer of tuple in slot
- *		ExecSetSlotBuffer		- set the buffer for tuple in slot
- *		ExecIncrSlotBufferRefcnt - bump the refcnt of the slot buffer(Macro)
  *
  *	 SLOT STATUS PREDICATES
  *		TupIsNull				- true when slot contains no tuple(Macro)
@@ -193,7 +190,7 @@ ExecDestroyTupleTable(TupleTable table, /* tuple table */
 					  bool shouldFree)	/* true if we should free slot
 										 * contents */
 {
-	int			next;			/* next avaliable slot */
+	int			next;			/* next available slot */
 	TupleTableSlot *array;		/* start of table array */
 	int			i;				/* counter */
 
@@ -212,38 +209,27 @@ ExecDestroyTupleTable(TupleTable table, /* tuple table */
 
 	/* ----------------
 	 *	first free all the valid pointers in the tuple array
-	 *	if that's what the caller wants..
+	 *	and drop refcounts of any referenced buffers,
+	 *	if that's what the caller wants.  (There is probably
+	 *	no good reason for the caller ever not to want it!)
 	 *
-	 *	Note: we do nothing about the Buffer and Tuple Descriptor's
+	 *	Note: we do nothing about the Tuple Descriptor's
 	 *	we store in the slots.	This may have to change (ex: we should
 	 *	probably worry about pfreeing tuple descs too) -cim 3/14/91
+	 *
+	 *	Right now, the handling of tuple pointers and buffer refcounts
+	 *	is clean, but the handling of tuple descriptors is NOT; they
+	 *	are copied around with wild abandon.  It would take some work
+	 *	to make tuple descs pfree'able.  Fortunately, since they're
+	 *	normally only made once per scan, it's probably not worth
+	 *	worrying about...  tgl 9/21/99
 	 * ----------------
 	 */
 	if (shouldFree)
+	{
 		for (i = 0; i < next; i++)
-		{
-			TupleTableSlot slot;
-			HeapTuple	tuple;
-
-			slot = array[i];
-			tuple = slot.val;
-
-			if (tuple != NULL)
-			{
-				slot.val = (HeapTuple) NULL;
-				if (slot.ttc_shouldFree)
-				{
-					/* ----------------
-					 *	since a tuple may contain a pointer to
-					 *	lock information allocated along with the
-					 *	tuple, we have to be careful to free any
-					 *	rule locks also -cim 1/17/90
-					 * ----------------
-					 */
-					pfree(tuple);
-				}
-			}
-		}
+			ExecClearTuple(&array[i]);
+	}
 
 	/* ----------------
 	 *	finally free the tuple array and the table itself.
@@ -274,6 +260,7 @@ TupleTableSlot *				/* return: the slot allocated in the tuple
 ExecAllocTableSlot(TupleTable table)
 {
 	int			slotnum;		/* new slot number */
+	TupleTableSlot*   slot;
 
 	/* ----------------
 	 *	sanity checks
@@ -319,9 +306,18 @@ ExecAllocTableSlot(TupleTable table)
 	slotnum = table->next;
 	table->next++;
 
-	table->array[slotnum].type = T_TupleTableSlot;
+	slot = &(table->array[slotnum]);
+
+	/* Make sure the allocated slot is valid (and empty) */
+	slot->type = T_TupleTableSlot;
+	slot->val = (HeapTuple) NULL;
+	slot->ttc_shouldFree = true;
+	slot->ttc_descIsNew = true;
+	slot->ttc_tupleDescriptor = (TupleDesc) NULL;
+	slot->ttc_buffer = InvalidBuffer;
+	slot->ttc_whichplan = -1;
 
-	return &(table->array[slotnum]);
+	return slot;
 }
 
 /* ----------------------------------------------------------------
@@ -333,26 +329,49 @@ ExecAllocTableSlot(TupleTable table)
  *		ExecStoreTuple
  *
  *		This function is used to store a tuple into a specified
- *		slot in the tuple table.  Note: the only slots which should
- *		be called with shouldFree == false are those slots used to
- *		store tuples not allocated with pfree().  Currently the
- *		seqscan and indexscan nodes use this for the tuples returned
- *		by amgetattr, which are actually pointers onto disk pages.
+ *		slot in the tuple table.
+ *
+ *		tuple:	tuple to store
+ *		slot:	slot to store it in
+ *		buffer:	disk buffer if tuple is in a disk page, else InvalidBuffer
+ *		shouldFree:	true if ExecClearTuple should pfree() the tuple
+ *					when done with it
+ *
+ * If 'buffer' is not InvalidBuffer, the tuple table code acquires a pin
+ * on the buffer which is held until the slot is cleared, so that the tuple
+ * won't go away on us.
+ *
+ * shouldFree is normally set 'true' for tuples constructed on-the-fly.
+ * It must always be 'false' for tuples that are stored in disk pages,
+ * since we don't want to try to pfree those.
+ *
+ * Another case where it is 'false' is when the referenced tuple is held
+ * in a tuple table slot belonging to a lower-level executor Proc node.
+ * In this case the lower-level slot retains ownership and responsibility
+ * for eventually releasing the tuple.  When this method is used, we must
+ * be certain that the upper-level Proc node will lose interest in the tuple
+ * sooner than the lower-level one does!  If you're not certain, copy the
+ * lower-level tuple with heap_copytuple and let the upper-level table
+ * slot assume ownership of the copy!
+ *
+ * Return value is just the passed-in slot pointer.
  * --------------------------------
  */
-TupleTableSlot *				/* return: slot passed */
-ExecStoreTuple(HeapTuple tuple, /* tuple to store */
-			   TupleTableSlot *slot,	/* slot in which to store tuple */
-			   Buffer buffer,	/* buffer associated with tuple */
-			   bool shouldFree) /* true if we call pfree() when we gc. */
+TupleTableSlot *
+ExecStoreTuple(HeapTuple tuple,
+			   TupleTableSlot *slot,
+			   Buffer buffer,
+			   bool shouldFree)
 {
 	/* ----------------
 	 *	sanity checks
 	 * ----------------
 	 */
 	Assert(slot != NULL);
+	/* passing shouldFree=true for a tuple on a disk page is not sane */
+	Assert(BufferIsValid(buffer) ? (!shouldFree) : true);
 
-	/* clear out the slot first */
+	/* clear out any old contents of the slot */
 	ExecClearTuple(slot);
 
 	/* ----------------
@@ -364,6 +383,12 @@ ExecStoreTuple(HeapTuple tuple, /* tuple to store */
 	slot->ttc_buffer = buffer;
 	slot->ttc_shouldFree = shouldFree;
 
+	/* If tuple is on a disk page, keep the page pinned as long as we hold
+	 * a pointer into it.
+	 */
+	if (BufferIsValid(buffer))
+		IncrBufferRefCount(buffer);
+
 	return slot;
 }
 
@@ -395,29 +420,20 @@ ExecClearTuple(TupleTableSlot *slot)	/* slot in which to store tuple */
 	 * ----------------
 	 */
 	if (slot->ttc_shouldFree && oldtuple != NULL)
-	{
-		/* ----------------
-		 *	since a tuple may contain a pointer to
-		 *	lock information allocated along with the
-		 *	tuple, we have to be careful to free any
-		 *	rule locks also -cim 1/17/90
-		 * ----------------
-		 */
 		pfree(oldtuple);
-	}
+
+	slot->val = (HeapTuple) NULL;
+
+	slot->ttc_shouldFree = true; /* probably useless code... */
 
 	/* ----------------
-	 *	store NULL into the specified slot and return the slot.
-	 *	- also set buffer to InvalidBuffer -cim 3/14/91
+	 *	Drop the pin on the referenced buffer, if there is one.
 	 * ----------------
 	 */
-	slot->val = (HeapTuple) NULL;
-
 	if (BufferIsValid(slot->ttc_buffer))
 		ReleaseBuffer(slot->ttc_buffer);
 
 	slot->ttc_buffer = InvalidBuffer;
-	slot->ttc_shouldFree = true;
 
 	return slot;
 }
@@ -525,41 +541,6 @@ ExecSetNewSlotDescriptor(TupleTableSlot *slot,	/* slot to change */
 
 #endif
 
-/* --------------------------------
- *		ExecSlotBuffer
- *
- *		This function is used to get the tuple descriptor associated
- *		with the slot's tuple.  Be very careful with this as it does not
- *		balance the reference counts.  If the buffer returned is stored
- *		someplace else, then also use ExecIncrSlotBufferRefcnt().
- *
- * Now a macro in tuptable.h
- * --------------------------------
- */
-
-/* --------------------------------
- *		ExecSetSlotBuffer
- *
- *		This function is used to set the tuple descriptor associated
- *		with the slot's tuple.   Be very careful with this as it does not
- *		balance the reference counts.  If we're using this then we should
- *		also use ExecIncrSlotBufferRefcnt().
- * --------------------------------
- */
-#ifdef NOT_USED
-Buffer							/* return: old slot buffer */
-ExecSetSlotBuffer(TupleTableSlot *slot, /* slot to change */
-				  Buffer b)		/* tuple descriptor */
-{
-	Buffer		oldb = slot->ttc_buffer;
-
-	slot->ttc_buffer = b;
-
-	return oldb;
-}
-
-#endif
-
 /* ----------------------------------------------------------------
  *				  tuple table slot status predicates
  * ----------------------------------------------------------------
@@ -601,12 +582,7 @@ ExecSlotDescriptorIsNew(TupleTableSlot *slot)	/* slot to inspect */
 
 #define INIT_SLOT_ALLOC \
 	tupleTable = (TupleTable) estate->es_tupleTable; \
-	slot =		 ExecAllocTableSlot(tupleTable); \
-	slot->val = (HeapTuple)NULL; \
-	slot->ttc_shouldFree = true; \
-	slot->ttc_tupleDescriptor = (TupleDesc)NULL; \
-	slot->ttc_whichplan = -1;\
-	slot->ttc_descIsNew = true;
+	slot =		 ExecAllocTableSlot(tupleTable);
 
 /* ----------------
  *		ExecInitResultTupleSlot
diff --git a/src/backend/executor/nodeAppend.c b/src/backend/executor/nodeAppend.c
index bd515d51f97af7525138f828934cc24abd28d7c8..f20d9c56bc654c8e66bccb5a97a20e62aa8d6cb3 100644
--- a/src/backend/executor/nodeAppend.c
+++ b/src/backend/executor/nodeAppend.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeAppend.c,v 1.25 1999/09/18 19:06:48 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeAppend.c,v 1.26 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -399,12 +399,13 @@ ExecProcAppend(Append *node)
 	{
 		/* ----------------
 		 *	if the subplan gave us something then place a copy of
-		 *	whatever we get into our result slot and return it, else..
+		 *	whatever we get into our result slot and return it.
+		 *
+		 *	Note that "copy" here means the tuple pointer only: we rely on
+		 *	the subplan to retain ownership of the tuple while we need it.
 		 * ----------------
 		 */
-		return ExecStoreTuple(result->val,
-							  result_slot, result->ttc_buffer, false);
-
+		return ExecStoreTuple(result->val, result_slot, InvalidBuffer, false);
 	}
 	else
 	{
diff --git a/src/backend/executor/nodeGroup.c b/src/backend/executor/nodeGroup.c
index f856d4780bc69898b87dec9c2bbf55c73b523010..38f7a0365db9e56e6d00c71f12cccf8321655872 100644
--- a/src/backend/executor/nodeGroup.c
+++ b/src/backend/executor/nodeGroup.c
@@ -13,7 +13,7 @@
  *	  columns. (ie. tuples from the same group are consecutive)
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.29 1999/07/17 20:16:58 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeGroup.c,v 1.30 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -91,10 +91,12 @@ ExecGroupEveryTuple(Group *node)
 	{
 		grpstate->grp_useFirstTuple = FALSE;
 
+		/* note we rely on subplan to hold ownership of the tuple
+		 * for as long as we need it; we don't copy it.
+		 */
 		ExecStoreTuple(grpstate->grp_firstTuple,
 					   grpstate->csstate.css_ScanTupleSlot,
-					   InvalidBuffer,
-					   false);
+					   InvalidBuffer, false);
 	}
 	else
 	{
@@ -129,10 +131,12 @@ ExecGroupEveryTuple(Group *node)
 			}
 		}
 
+		/* note we rely on subplan to hold ownership of the tuple
+		 * for as long as we need it; we don't copy it.
+		 */
 		ExecStoreTuple(outerTuple,
 					   grpstate->csstate.css_ScanTupleSlot,
-					   outerslot->ttc_buffer,
-					   false);
+					   InvalidBuffer, false);
 	}
 
 	/* ----------------
@@ -226,10 +230,12 @@ ExecGroupOneTuple(Group *node)
 	 */
 	projInfo = grpstate->csstate.cstate.cs_ProjInfo;
 
+	/* note we rely on subplan to hold ownership of the tuple
+	 * for as long as we need it; we don't copy it.
+	 */
 	ExecStoreTuple(firsttuple,
 				   grpstate->csstate.css_ScanTupleSlot,
-				   InvalidBuffer,
-				   false);
+				   InvalidBuffer, false);
 	econtext->ecxt_scantuple = grpstate->csstate.css_ScanTupleSlot;
 	resultSlot = ExecProject(projInfo, &isDone);
 
diff --git a/src/backend/executor/nodeIndexscan.c b/src/backend/executor/nodeIndexscan.c
index 362851a425a32280b39feee9ceba218a1c3b13ec..b9e3cf58636832608d01f386f0f8c31e3b6f6384 100644
--- a/src/backend/executor/nodeIndexscan.c
+++ b/src/backend/executor/nodeIndexscan.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.42 1999/08/12 00:42:43 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeIndexscan.c,v 1.43 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -125,14 +125,14 @@ IndexNext(IndexScan *node)
 	{
 		int			iptr;
 
-		slot->ttc_buffer = InvalidBuffer;
-		slot->ttc_shouldFree = false;
+		ExecClearTuple(slot);
 		if (estate->es_evTupleNull[node->scan.scanrelid - 1])
-		{
-			slot->val = NULL;	/* must not free tuple! */
-			return (slot);
-		}
+			return slot;		/* return empty slot */
+
+		/* probably ought to use ExecStoreTuple here... */
 		slot->val = estate->es_evTuple[node->scan.scanrelid - 1];
+		slot->ttc_shouldFree = false;
+
 		for (iptr = 0; iptr < numIndices; iptr++)
 		{
 			scanstate->cstate.cs_ExprContext->ecxt_scantuple = slot;
@@ -142,6 +142,7 @@ IndexNext(IndexScan *node)
 		}
 		if (iptr == numIndices) /* would not be returned by indices */
 			slot->val = NULL;
+
 		/* Flag for the next call that no more tuples */
 		estate->es_evTupleNull[node->scan.scanrelid - 1] = true;
 		return (slot);
@@ -192,7 +193,7 @@ IndexNext(IndexScan *node)
 				 *	the scan state.  Eventually we will only do this and not
 				 *	return a tuple.  Note: we pass 'false' because tuples
 				 *	returned by amgetnext are pointers onto disk pages and
-				 *	were not created with palloc() and so should not be pfree()'d.
+				 *	must not be pfree()'d.
 				 * ----------------
 				 */
 				ExecStoreTuple(tuple,	/* tuple to store */
@@ -200,6 +201,13 @@ IndexNext(IndexScan *node)
 							   buffer,	/* buffer associated with tuple  */
 							   false);	/* don't pfree */
 
+				/*
+				 * At this point we have an extra pin on the buffer,
+				 * because ExecStoreTuple incremented the pin count.
+				 * Drop our local pin.
+				 */
+				ReleaseBuffer(buffer);
+
 				/*
 				 * We must check to see if the current tuple would have
 				 * been matched by an earlier index, so we don't double
@@ -223,8 +231,6 @@ IndexNext(IndexScan *node)
 				else
 					ExecClearTuple(slot);
 			}
-			if (BufferIsValid(buffer))
-				ReleaseBuffer(buffer);
 		}
 		if (indexNumber < numIndices)
 		{
diff --git a/src/backend/executor/nodeMaterial.c b/src/backend/executor/nodeMaterial.c
index 783dbc7b328a111f7d7aceba5868497ab25807f0..24232617cf6e2ed5e12cbc7766e0ab16d9056575 100644
--- a/src/backend/executor/nodeMaterial.c
+++ b/src/backend/executor/nodeMaterial.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeMaterial.c,v 1.25 1999/07/16 04:58:50 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeMaterial.c,v 1.26 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -31,7 +31,7 @@
  *		ExecMaterial
  *
  *		The first time this is called, ExecMaterial retrieves tuples
- *		this node's outer subplan and inserts them into a temporary
+ *		from this node's outer subplan and inserts them into a temporary
  *		relation.  After this is done, a flag is set indicating that
  *		the subplan has been materialized.	Once the relation is
  *		materialized, the first tuple is then returned.  Successive
@@ -41,7 +41,7 @@
  *		Initial State:
  *
  *		ExecMaterial assumes the temporary relation has been
- *		created and openend by ExecInitMaterial during the prior
+ *		created and opened by ExecInitMaterial during the prior
  *		InitPlan() phase.
  *
  * ----------------------------------------------------------------
@@ -116,18 +116,7 @@ ExecMaterial(Material *node)
 			if (TupIsNull(slot))
 				break;
 
-			/*
-			 * heap_insert changes something...
-			 */
-			if (slot->ttc_buffer != InvalidBuffer)
-				heapTuple = heap_copytuple(slot->val);
-			else
-				heapTuple = slot->val;
-
-			heap_insert(tempRelation, heapTuple);
-
-			if (slot->ttc_buffer != InvalidBuffer)
-				pfree(heapTuple);
+			heap_insert(tempRelation, slot->val);
 
 			ExecClearTuple(slot);
 		}
@@ -164,7 +153,7 @@ ExecMaterial(Material *node)
 
 	/* ----------------
 	 *	at this point we know we have a sorted relation so
-	 *	we preform a simple scan on it with amgetnext()..
+	 *	we perform a simple scan on it with amgetnext()..
 	 * ----------------
 	 */
 	currentScanDesc = matstate->csstate.css_currentScanDesc;
diff --git a/src/backend/executor/nodeMergejoin.c b/src/backend/executor/nodeMergejoin.c
index 62b53af3c65d3e800421df7dac88c3ffad68c838..4b3f021fe0c88e52b3e7d7eb1f577cef3bc45aa7 100644
--- a/src/backend/executor/nodeMergejoin.c
+++ b/src/backend/executor/nodeMergejoin.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeMergejoin.c,v 1.28 1999/07/16 04:58:50 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeMergejoin.c,v 1.29 1999/09/24 00:24:23 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1153,15 +1153,18 @@ ExecInitMergeJoin(MergeJoin *node, EState *estate, Plan *parent)
 #define MERGEJOIN_NSLOTS 2
 	/* ----------------
 	 *	tuple table initialization
+	 *
+	 *	XXX why aren't we getting a tuple table slot in the normal way?
 	 * ----------------
 	 */
 	ExecInitResultTupleSlot(estate, &mergestate->jstate);
-	mjSlot = (TupleTableSlot *) palloc(sizeof(TupleTableSlot));
+	mjSlot = makeNode(TupleTableSlot);
 	mjSlot->val = NULL;
 	mjSlot->ttc_shouldFree = true;
+	mjSlot->ttc_descIsNew = true;
 	mjSlot->ttc_tupleDescriptor = NULL;
+	mjSlot->ttc_buffer = InvalidBuffer;
 	mjSlot->ttc_whichplan = -1;
-	mjSlot->ttc_descIsNew = true;
 	mergestate->mj_MarkedTupleSlot = mjSlot;
 
 	/* ----------------
@@ -1278,11 +1281,9 @@ ExecReScanMergeJoin(MergeJoin *node, ExprContext *exprCtxt, Plan *parent)
 	TupleTableSlot *mjSlot = mergestate->mj_MarkedTupleSlot;
 
 	ExecClearTuple(mjSlot);
-	mjSlot->val = NULL;
-	mjSlot->ttc_shouldFree = true;
 	mjSlot->ttc_tupleDescriptor = NULL;
-	mjSlot->ttc_whichplan = -1;
 	mjSlot->ttc_descIsNew = true;
+	mjSlot->ttc_whichplan = -1;
 
 	mergestate->mj_JoinState = EXEC_MJ_INITIALIZE;
 
diff --git a/src/backend/executor/nodeSeqscan.c b/src/backend/executor/nodeSeqscan.c
index c83aa725a70b8ad152f7d94b81882296ec503a12..eb73733b58fe63a0b22e720180cc26671fc7fe86 100644
--- a/src/backend/executor/nodeSeqscan.c
+++ b/src/backend/executor/nodeSeqscan.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeSeqscan.c,v 1.20 1999/07/16 04:58:52 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/executor/nodeSeqscan.c,v 1.21 1999/09/24 00:24:24 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -74,20 +74,20 @@ SeqNext(SeqScan *node)
 	if (estate->es_evTuple != NULL &&
 		estate->es_evTuple[node->scanrelid - 1] != NULL)
 	{
-		slot->ttc_buffer = InvalidBuffer;
-		slot->ttc_shouldFree = false;
+		ExecClearTuple(slot);
 		if (estate->es_evTupleNull[node->scanrelid - 1])
-		{
-			slot->val = NULL;	/* must not free tuple! */
-			return (slot);
-		}
+			return slot;		/* return empty slot */
+
+		/* probably ought to use ExecStoreTuple here... */
 		slot->val = estate->es_evTuple[node->scanrelid - 1];
+		slot->ttc_shouldFree = false;
 
 		/*
 		 * Note that unlike IndexScan, SeqScan never use keys in
-		 * heap_beginscan (and this is very bad) - so, here we have not
+		 * heap_beginscan (and this is very bad) - so, here we do not
 		 * check are keys ok or not.
 		 */
+
 		/* Flag for the next call that no more tuples */
 		estate->es_evTupleNull[node->scanrelid - 1] = true;
 		return (slot);
@@ -104,7 +104,9 @@ SeqNext(SeqScan *node)
 	 *	in our scan tuple slot and return the slot.  Note: we pass 'false'
 	 *	because tuples returned by heap_getnext() are pointers onto
 	 *	disk pages and were not created with palloc() and so should not
-	 *	be pfree()'d.
+	 *	be pfree()'d.  Note also that ExecStoreTuple will increment the
+	 *	refcount of the buffer; the refcount will not be dropped until
+	 *	the tuple table slot is cleared.
 	 * ----------------
 	 */
 
@@ -114,17 +116,6 @@ SeqNext(SeqScan *node)
 												 * this tuple */
 						  false);		/* don't pfree this pointer */
 
-	/* ----------------
-	 *	XXX -- mao says:  The sequential scan for heap relations will
-	 *	automatically unpin the buffer this tuple is on when we cross
-	 *	a page boundary.  The clearslot code also does this.  We bump
-	 *	the pin count on the page here, since we actually have two
-	 *	pointers to it -- one in the scan desc and one in the tuple
-	 *	table slot.  --mar 20 91
-	 * ----------------
-	 */
-	ExecIncrSlotBufferRefcnt(slot);
-
 	return slot;
 }
 
diff --git a/src/backend/executor/nodeSubplan.c b/src/backend/executor/nodeSubplan.c
index 4bd0eb2ff31c39bdb222b0bf87d4c8799204cf07..32a39ee18d96f43a78bc1a0572f3e52479785e0f 100644
--- a/src/backend/executor/nodeSubplan.c
+++ b/src/backend/executor/nodeSubplan.c
@@ -165,8 +165,6 @@ ExecInitSubPlan(SubPlan *node, EState *estate, Plan *parent)
 	sp_estate->es_param_exec_vals = estate->es_param_exec_vals;
 	sp_estate->es_tupleTable =
 		ExecCreateTupleTable(ExecCountSlotsNode(node->plan) + 10);
-	pfree(sp_estate->es_refcount);
-	sp_estate->es_refcount = estate->es_refcount;
 	sp_estate->es_snapshot = estate->es_snapshot;
 
 	if (!ExecInitNode(node->plan, sp_estate, NULL))
diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c
index 88f6416e84043d3d7986b5e6bf513694bb4b2235..bfd0561705374792ec28c2e700e970cb1ac043bf 100644
--- a/src/backend/storage/buffer/buf_init.c
+++ b/src/backend/storage/buffer/buf_init.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.29 1999/07/17 20:17:40 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/buf_init.c,v 1.30 1999/09/24 00:24:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -64,7 +64,6 @@ long	   *NWaitIOBackendP;
 extern IpcSemaphoreId WaitIOSemId;
 
 long	   *PrivateRefCount;	/* also used in freelist.c */
-long	   *LastRefCount;		/* refcounts of last ExecMain level */
 bits8	   *BufferLocks;		/* */
 long	   *CommitInfoNeedsSave;/* to write buffers where we have filled
 								 * in t_infomask */
@@ -244,7 +243,6 @@ InitBufferPool(IPCKey key)
 	}
 #endif
 	PrivateRefCount = (long *) calloc(NBuffers, sizeof(long));
-	LastRefCount = (long *) calloc(NBuffers, sizeof(long));
 	BufferLocks = (bits8 *) calloc(NBuffers, sizeof(bits8));
 	CommitInfoNeedsSave = (long *) calloc(NBuffers, sizeof(long));
 }
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index b435dd53cac10ed46b5c9b3b20620769ae41e296..e0327c678f2dab759b355998292763d2050b6890 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.62 1999/09/18 19:07:26 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/bufmgr.c,v 1.63 1999/09/24 00:24:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -89,9 +89,6 @@ static void BufferSync(void);
 static int	BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld);
 void		PrintBufferDescs(void);
 
-/* not static but used by vacuum only ... */
-int			BlowawayRelationBuffers(Relation rel, BlockNumber block);
-
 /* ---------------------------------------------------
  * RelationGetBufferWithBuffer
  *		see if the given buffer is what we want
@@ -146,9 +143,6 @@ RelationGetBufferWithBuffer(Relation relation,
  *		opened already.
  */
 
-extern int	ShowPinTrace;
-
-
 #undef ReadBuffer				/* conflicts with macro when BUFMGR_DEBUG
 								 * defined */
 
@@ -499,6 +493,7 @@ BufferAlloc(Relation reln,
 					SignalIO(buf);
 #endif	 /* !HAS_TEST_AND_SET */
 				PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
+				Assert(buf->refcount > 0);
 				buf->refcount--;
 				if (buf->refcount == 0)
 				{
@@ -575,10 +570,14 @@ BufferAlloc(Relation reln,
 						SignalIO(buf);
 #endif	 /* !HAS_TEST_AND_SET */
 					/* give up the buffer since we don't need it any more */
-					buf->refcount--;
 					PrivateRefCount[BufferDescriptorGetBuffer(buf) - 1] = 0;
-					AddBufferToFreelist(buf);
-					buf->flags |= BM_FREE;
+					Assert(buf->refcount > 0);
+					buf->refcount--;
+					if (buf->refcount == 0)
+					{
+						AddBufferToFreelist(buf);
+						buf->flags |= BM_FREE;
+					}
 					buf->flags &= ~BM_IO_IN_PROGRESS;
 				}
 
@@ -791,7 +790,7 @@ FlushBuffer(Buffer buffer, bool release)
 	int			status;
 
 	if (BufferIsLocal(buffer))
-		return FlushLocalBuffer(buffer, release);
+		return FlushLocalBuffer(buffer, release) ? STATUS_OK : STATUS_ERROR;
 
 	if (BAD_BUFFER_ID(buffer))
 		return STATUS_ERROR;
@@ -813,7 +812,7 @@ FlushBuffer(Buffer buffer, bool release)
 	status = smgrflush(DEFAULT_SMGR, bufrel, bufHdr->tag.blockNum,
 					   (char *) MAKE_PTR(bufHdr->data));
 
-	/* drop relcache refcount incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
 	RelationDecrementReferenceCount(bufrel);
 
 	if (status == SM_FAIL)
@@ -908,15 +907,10 @@ ReleaseAndReadBuffer(Buffer buffer,
 			bufHdr = &BufferDescriptors[buffer - 1];
 			Assert(PrivateRefCount[buffer - 1] > 0);
 			PrivateRefCount[buffer - 1]--;
-			if (PrivateRefCount[buffer - 1] == 0 &&
-				LastRefCount[buffer - 1] == 0)
+			if (PrivateRefCount[buffer - 1] == 0)
 			{
-
-				/*
-				 * only release buffer if it is not pinned in previous
-				 * ExecMain level
-				 */
 				SpinAcquire(BufMgrLock);
+				Assert(bufHdr->refcount > 0);
 				bufHdr->refcount--;
 				if (bufHdr->refcount == 0)
 				{
@@ -994,7 +988,7 @@ BufferSync()
 						elog(ERROR, "BufferSync: write error %u for %s",
 							 bufHdr->tag.blockNum, bufHdr->sb_relname);
 					}
-					/* drop refcount from RelationIdCacheGetRelation */
+					/* drop refcnt from RelationIdCacheGetRelation */
 					if (reln != (Relation) NULL)
 						RelationDecrementReferenceCount(reln);
 					continue;
@@ -1049,7 +1043,7 @@ BufferSync()
 				 */
 				if (!(bufHdr->flags & BM_JUST_DIRTIED))
 					bufHdr->flags &= ~BM_DIRTY;
-				/* drop refcount from RelationIdCacheGetRelation */
+				/* drop refcnt from RelationIdCacheGetRelation */
 				if (reln != (Relation) NULL)
 					RelationDecrementReferenceCount(reln);
 			}
@@ -1175,7 +1169,7 @@ ResetBufferUsage()
  *		ResetBufferPool
  *
  *		this routine is supposed to be called when a transaction aborts.
- *		it will release all the buffer pins held by the transaciton.
+ *		it will release all the buffer pins held by the transaction.
  *
  * ----------------------------------------------
  */
@@ -1184,15 +1178,24 @@ ResetBufferPool()
 {
 	int			i;
 
-	for (i = 1; i <= NBuffers; i++)
+	for (i = 0; i < NBuffers; i++)
 	{
-		CommitInfoNeedsSave[i - 1] = 0;
-		if (BufferIsValid(i))
+		if (PrivateRefCount[i] != 0)
 		{
-			while (PrivateRefCount[i - 1] > 0)
-				ReleaseBuffer(i);
+			BufferDesc *buf = &BufferDescriptors[i];
+
+			SpinAcquire(BufMgrLock);
+			Assert(buf->refcount > 0);
+			buf->refcount--;
+			if (buf->refcount == 0)
+			{
+				AddBufferToFreelist(buf);
+				buf->flags |= BM_FREE;
+			}
+			SpinRelease(BufMgrLock);
 		}
-		LastRefCount[i - 1] = 0;
+		PrivateRefCount[i] = 0;
+		CommitInfoNeedsSave[i] = 0;
 	}
 
 	ResetLocalBufferPool();
@@ -1213,7 +1216,7 @@ BufferPoolCheckLeak()
 
 	for (i = 1; i <= NBuffers; i++)
 	{
-		if (BufferIsValid(i))
+		if (PrivateRefCount[i - 1] != 0)
 		{
 			BufferDesc *buf = &(BufferDescriptors[i - 1]);
 
@@ -1226,7 +1229,7 @@ relname=%s, blockNum=%d, flags=0x%x, refcount=%d %d)",
 			result = 1;
 		}
 	}
-	return (result);
+	return result;
 }
 
 /* ------------------------------------------------
@@ -1287,7 +1290,7 @@ BufferGetRelation(Buffer buffer)
 	relation = RelationIdGetRelation(relid);
 	Assert(relation);
 
-	/* drop relcache refcount incremented by RelationIdGetRelation */
+	/* drop relcache refcnt incremented by RelationIdGetRelation */
 	RelationDecrementReferenceCount(relation);
 
 	if (RelationHasReferenceCountZero(relation))
@@ -1354,7 +1357,7 @@ BufferReplace(BufferDesc *bufHdr, bool bufferLockHeld)
 							  (char *) MAKE_PTR(bufHdr->data));
 	}
 
-	/* drop relcache refcount incremented by RelationIdCacheGetRelation */
+	/* drop relcache refcnt incremented by RelationIdCacheGetRelation */
 	if (reln != (Relation) NULL)
 		RelationDecrementReferenceCount(reln);
 
@@ -1549,10 +1552,27 @@ BufferPoolBlowaway()
 #endif
 
 /* ---------------------------------------------------------------------
- *		BlowawayRelationBuffers
+ *		FlushRelationBuffers
+ *
+ *		This function removes from the buffer pool all pages of a relation
+ *		that have blocknumber >= specified block.  If doFlush is true,
+ *		dirty buffers are written out --- otherwise it's an error for any
+ *		of the buffers to be dirty.
+ *
+ *		This is used by VACUUM before truncating the relation to the given
+ *		number of blocks.  For VACUUM, we pass doFlush = false since it would
+ *		mean a bug in VACUUM if any of the unwanted pages were still dirty.
+ *		(TRUNCATE TABLE also uses it in the same way.)
  *
- *		This function blowaway all the pages with blocknumber >= passed
- *		of a relation in the buffer pool. Used by vacuum before truncation...
+ *		This is also used by RENAME TABLE (with block = 0 and doFlush = true)
+ *		to clear out the buffer cache before renaming the physical files of
+ *		a relation.  Without that, some other backend might try to do a
+ *		blind write of a buffer page (relying on the sb_relname of the buffer)
+ *		and fail because it's not got the right filename anymore.
+ *
+ *		In both cases, the caller should be holding AccessExclusiveLock on
+ *		the target relation to ensure that no other backend is busy reading
+ *		more blocks of the relation...
  *
  *		Returns: 0 - Ok, -1 - DIRTY, -2 - PINNED
  *
@@ -1561,7 +1581,7 @@ BufferPoolBlowaway()
  * --------------------------------------------------------------------
  */
 int
-BlowawayRelationBuffers(Relation rel, BlockNumber block)
+FlushRelationBuffers(Relation rel, BlockNumber block, bool doFlush)
 {
 	int			i;
 	BufferDesc *buf;
@@ -1576,13 +1596,25 @@ BlowawayRelationBuffers(Relation rel, BlockNumber block)
 			{
 				if (buf->flags & BM_DIRTY)
 				{
-					elog(NOTICE, "BlowawayRelationBuffers(%s (local), %u): block %u is dirty",
-					rel->rd_rel->relname.data, block, buf->tag.blockNum);
-					return -1;
+					if (doFlush)
+					{
+						if (FlushBuffer(-i-1, false) != STATUS_OK)
+						{
+							elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty, could not flush it",
+								 rel->rd_rel->relname.data, block, buf->tag.blockNum);
+							return -1;
+						}
+					}
+					else
+					{
+						elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is dirty",
+							 rel->rd_rel->relname.data, block, buf->tag.blockNum);
+						return -1;
+					}
 				}
 				if (LocalRefCount[i] > 0)
 				{
-					elog(NOTICE, "BlowawayRelationBuffers(%s (local), %u): block %u is referenced (%d)",
+					elog(NOTICE, "FlushRelationBuffers(%s (local), %u): block %u is referenced (%d)",
 						 rel->rd_rel->relname.data, block,
 						 buf->tag.blockNum, LocalRefCount[i]);
 					return -2;
@@ -1603,18 +1635,33 @@ BlowawayRelationBuffers(Relation rel, BlockNumber block)
 		{
 			if (buf->flags & BM_DIRTY)
 			{
-				elog(NOTICE, "BlowawayRelationBuffers(%s, %u): block %u is dirty (private %d, last %d, global %d)",
-					 buf->sb_relname, block, buf->tag.blockNum,
-					 PrivateRefCount[i], LastRefCount[i], buf->refcount);
-				SpinRelease(BufMgrLock);
-				return -1;
+				if (doFlush)
+				{
+					SpinRelease(BufMgrLock);
+					if (FlushBuffer(i+1, false) != STATUS_OK)
+					{
+						elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %d, global %d), could not flush it",
+							 buf->sb_relname, block, buf->tag.blockNum,
+							 PrivateRefCount[i], buf->refcount);
+						return -1;
+					}
+					SpinAcquire(BufMgrLock);
+				}
+				else
+				{
+					SpinRelease(BufMgrLock);
+					elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is dirty (private %d, global %d)",
+						 buf->sb_relname, block, buf->tag.blockNum,
+						 PrivateRefCount[i], buf->refcount);
+					return -1;
+				}
 			}
 			if (!(buf->flags & BM_FREE))
 			{
-				elog(NOTICE, "BlowawayRelationBuffers(%s, %u): block %u is referenced (private %d, last %d, global %d)",
-					 buf->sb_relname, block, buf->tag.blockNum,
-					 PrivateRefCount[i], LastRefCount[i], buf->refcount);
 				SpinRelease(BufMgrLock);
+				elog(NOTICE, "FlushRelationBuffers(%s, %u): block %u is referenced (private %d, global %d)",
+					 buf->sb_relname, block, buf->tag.blockNum,
+					 PrivateRefCount[i], buf->refcount);
 				return -2;
 			}
 			BufTableDelete(buf);
@@ -1650,14 +1697,10 @@ ReleaseBuffer(Buffer buffer)
 
 	Assert(PrivateRefCount[buffer - 1] > 0);
 	PrivateRefCount[buffer - 1]--;
-	if (PrivateRefCount[buffer - 1] == 0 && LastRefCount[buffer - 1] == 0)
+	if (PrivateRefCount[buffer - 1] == 0)
 	{
-
-		/*
-		 * only release buffer if it is not pinned in previous ExecMain
-		 * levels
-		 */
 		SpinAcquire(BufMgrLock);
+		Assert(bufHdr->refcount > 0);
 		bufHdr->refcount--;
 		if (bufHdr->refcount == 0)
 		{
@@ -1892,32 +1935,6 @@ _bm_die(Oid dbId, Oid relId, int blkNo, int bufNo,
 
 #endif	 /* BMTRACE */
 
-void
-BufferRefCountReset(int *refcountsave)
-{
-	int			i;
-
-	for (i = 0; i < NBuffers; i++)
-	{
-		refcountsave[i] = PrivateRefCount[i];
-		LastRefCount[i] += PrivateRefCount[i];
-		PrivateRefCount[i] = 0;
-	}
-}
-
-void
-BufferRefCountRestore(int *refcountsave)
-{
-	int			i;
-
-	for (i = 0; i < NBuffers; i++)
-	{
-		PrivateRefCount[i] = refcountsave[i];
-		LastRefCount[i] -= refcountsave[i];
-		refcountsave[i] = 0;
-	}
-}
-
 int
 SetBufferWriteMode(int mode)
 {
diff --git a/src/backend/storage/buffer/freelist.c b/src/backend/storage/buffer/freelist.c
index d8f70a3287e1f856e66907c572f66093fb11792a..f59a2cc81e3d0d83df7ef95bdc2e7b59f05065c5 100644
--- a/src/backend/storage/buffer/freelist.c
+++ b/src/backend/storage/buffer/freelist.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.18 1999/07/17 20:17:41 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/freelist.c,v 1.19 1999/09/24 00:24:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -105,7 +105,7 @@ PinBuffer(BufferDesc *buf)
 
 	b = BufferDescriptorGetBuffer(buf) - 1;
 	Assert(PrivateRefCount[b] >= 0);
-	if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
+	if (PrivateRefCount[b] == 0)
 		buf->refcount++;
 	PrivateRefCount[b]++;
 }
@@ -138,10 +138,10 @@ UnpinBuffer(BufferDesc *buf)
 {
 	long		b = BufferDescriptorGetBuffer(buf) - 1;
 
-	Assert(buf->refcount);
+	Assert(buf->refcount > 0);
 	Assert(PrivateRefCount[b] > 0);
 	PrivateRefCount[b]--;
-	if (PrivateRefCount[b] == 0 && LastRefCount[b] == 0)
+	if (PrivateRefCount[b] == 0)
 		buf->refcount--;
 	NotInQueue(buf);
 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c
index e003595beda6d0395bccdfd9ac0665468b205e0c..6c0d1431e53bf49565c5f95d0b8907a4fefe79b7 100644
--- a/src/backend/storage/buffer/localbuf.c
+++ b/src/backend/storage/buffer/localbuf.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.27 1999/09/18 19:07:26 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/buffer/localbuf.c,v 1.28 1999/09/24 00:24:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -193,9 +193,11 @@ FlushLocalBuffer(Buffer buffer, bool release)
 	/* drop relcache refcount incremented by RelationIdCacheGetRelation */
 	RelationDecrementReferenceCount(bufrel);
 
-	Assert(LocalRefCount[bufid] > 0);
 	if (release)
+	{
+		Assert(LocalRefCount[bufid] > 0);
 		LocalRefCount[bufid]--;
+	}
 
 	return true;
 }
diff --git a/src/backend/storage/ipc/shmem.c b/src/backend/storage/ipc/shmem.c
index b057aaa578a8d7fd9f6a8f0624fde84ce007e457..c23952c19105efe0368d5123124362d3197cbda0 100644
--- a/src/backend/storage/ipc/shmem.c
+++ b/src/backend/storage/ipc/shmem.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.45 1999/07/17 20:17:44 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/shmem.c,v 1.46 1999/09/24 00:24:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -579,174 +579,3 @@ ShmemInitStruct(char *name, unsigned long size, bool *foundPtr)
 	SpinRelease(ShmemIndexLock);
 	return structPtr;
 }
-
-/*
- * TransactionIdIsInProgress -- is given transaction running by some backend
- *
- * Strange place for this func, but we have to lookup process data structures
- * for all running backends. - vadim 11/26/96
- *
- * We should keep all PROC structs not in ShmemIndex - this is too
- * general hash table...
- *
- */
-bool
-TransactionIdIsInProgress(TransactionId xid)
-{
-	ShmemIndexEnt *result;
-	PROC	   *proc;
-
-	Assert(ShmemIndex);
-
-	SpinAcquire(ShmemIndexLock);
-
-	hash_seq((HTAB *) NULL);
-	while ((result = (ShmemIndexEnt *) hash_seq(ShmemIndex)) != NULL)
-	{
-		if (result == (ShmemIndexEnt *) TRUE)
-		{
-			SpinRelease(ShmemIndexLock);
-			return false;
-		}
-		if (result->location == INVALID_OFFSET ||
-			strncmp(result->key, "PID ", 4) != 0)
-			continue;
-		proc = (PROC *) MAKE_PTR(result->location);
-		if (proc->xid == xid)
-		{
-			SpinRelease(ShmemIndexLock);
-			return true;
-		}
-	}
-
-	SpinRelease(ShmemIndexLock);
-	elog(ERROR, "TransactionIdIsInProgress: ShmemIndex corrupted");
-	return false;
-}
-
-/*
- * GetSnapshotData -- returns information about running transactions.
- *
- * Yet another strange func for this place...	- vadim 07/21/98
- */
-Snapshot
-GetSnapshotData(bool serializable)
-{
-	Snapshot	snapshot = (Snapshot) malloc(sizeof(SnapshotData));
-	ShmemIndexEnt *result;
-	PROC	   *proc;
-	TransactionId cid = GetCurrentTransactionId();
-	TransactionId xid;
-	uint32		count = 0;
-	uint32		have = 32;
-
-	Assert(ShmemIndex);
-
-	snapshot->xip = (TransactionId *) malloc(have * sizeof(TransactionId));
-	snapshot->xmin = cid;
-
-	SpinAcquire(ShmemIndexLock);
-	/*
-	 * Unfortunately, we have to call ReadNewTransactionId()
-	 * after acquiring ShmemIndexLock above. It's not good because of
-	 * ReadNewTransactionId() does SpinAcquire(OidGenLockId) but
-	 * _necessary_.
-	 */
-	ReadNewTransactionId(&(snapshot->xmax));
-
-	hash_seq((HTAB *) NULL);
-	while ((result = (ShmemIndexEnt *) hash_seq(ShmemIndex)) != NULL)
-	{
-		if (result == (ShmemIndexEnt *) TRUE)
-		{
-			if (serializable)
-				MyProc->xmin = snapshot->xmin;
-			/* Serializable snapshot must be computed before any other... */
-			Assert(MyProc->xmin != InvalidTransactionId);
-			SpinRelease(ShmemIndexLock);
-			snapshot->xcnt = count;
-			return snapshot;
-		}
-		if (result->location == INVALID_OFFSET ||
-			strncmp(result->key, "PID ", 4) != 0)
-			continue;
-		proc = (PROC *) MAKE_PTR(result->location);
-		/* 
-		 * We don't use spin-locking when changing proc->xid 
-		 * in GetNewTransactionId() and in AbortTransaction() !..
-		 */
-		xid = proc->xid;
-		if (proc == MyProc || 
-			xid < FirstTransactionId || xid >= snapshot->xmax)
-		{
-			/*
-			 * Seems that there is no sense to store xid >= snapshot->xmax
-			 * (what we got from ReadNewTransactionId above) in snapshot->xip 
-			 * - we just assume that all xacts with such xid-s are running 
-			 * and may be ignored.
-			 */
-			continue;
-		}
-		if (xid < snapshot->xmin)
-			snapshot->xmin = xid;
-		if (have == 0)
-		{
-			snapshot->xip = (TransactionId *) realloc(snapshot->xip,
-								   (count + 32) * sizeof(TransactionId));
-			have = 32;
-		}
-		snapshot->xip[count] = xid;
-		have--;
-		count++;
-	}
-
-	SpinRelease(ShmemIndexLock);
-	free(snapshot->xip);
-	free(snapshot);
-	elog(ERROR, "GetSnapshotData: ShmemIndex corrupted");
-	return NULL;
-}
-
-/*
- * GetXmaxRecent -- returns oldest transaction that was running
- *					when all current transaction was started.
- *					It's used by vacuum to decide what deleted
- *					tuples must be preserved in a table.
- *
- * And yet another strange func for this place...	- vadim 03/18/99
- */
-void
-GetXmaxRecent(TransactionId *XmaxRecent)
-{
-	ShmemIndexEnt *result;
-	PROC	   *proc;
-	TransactionId xmin;
-
-	Assert(ShmemIndex);
-
-	*XmaxRecent = GetCurrentTransactionId();
-
-	SpinAcquire(ShmemIndexLock);
-
-	hash_seq((HTAB *) NULL);
-	while ((result = (ShmemIndexEnt *) hash_seq(ShmemIndex)) != NULL)
-	{
-		if (result == (ShmemIndexEnt *) TRUE)
-		{
-			SpinRelease(ShmemIndexLock);
-			return;
-		}
-		if (result->location == INVALID_OFFSET ||
-			strncmp(result->key, "PID ", 4) != 0)
-			continue;
-		proc = (PROC *) MAKE_PTR(result->location);
-		xmin = proc->xmin;	/* we don't use spin-locking in AbortTransaction() ! */
-		if (proc == MyProc || xmin < FirstTransactionId)
-			continue;
-		if (xmin < *XmaxRecent)
-			*XmaxRecent = xmin;
-	}
-
-	SpinRelease(ShmemIndexLock);
-	elog(ERROR, "GetXmaxRecent: ShmemIndex corrupted");
-}
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
index c1a557033b6b8d6177d73ce0bc15f14db8b13c87..42c22faa2f5474892b2889dda598dd3f4f6cbc04 100644
--- a/src/backend/storage/ipc/sinval.c
+++ b/src/backend/storage/ipc/sinval.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.18 1999/09/06 19:37:38 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinval.c,v 1.19 1999/09/24 00:24:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -18,8 +18,10 @@
 #include "postgres.h"
 
 #include "storage/backendid.h"
+#include "storage/proc.h"
 #include "storage/sinval.h"
 #include "storage/sinvaladt.h"
+#include "utils/tqual.h"
 
 SPINLOCK	SInvalLock = (SPINLOCK) NULL;
 
@@ -165,3 +167,201 @@ InvalidateSharedInvalid(void (*invalFunction) (),
 		SpinRelease(SInvalLock);
 	}
 }
+
+
+/****************************************************************************/
+/* Functions that need to scan the PROC structures of all running backends.	*/
+/* It's a bit strange to keep these in sinval.c, since they don't have any	*/
+/* direct relationship to shared-cache invalidation.  But the procState		*/
+/* array in the SI segment is the only place in the system where we have	*/
+/* an array of per-backend data, so it is the most convenient place to keep	*/
+/* pointers to the backends' PROC structures.  We used to implement these	*/
+/* functions with a slow, ugly search through the ShmemIndex hash table ---	*/
+/* now they are simple loops over the SI ProcState array.					*/
+/****************************************************************************/
+
+
+/*
+ * DatabaseHasActiveBackends -- are there any backends running in the given DB
+ *
+ * This function is used to interlock DROP DATABASE against there being
+ * any active backends in the target DB --- dropping the DB while active
+ * backends remain would be a Bad Thing.  Note that we cannot detect here
+ * the possibility of a newly-started backend that is trying to connect
+ * to the doomed database, so additional interlocking is needed during
+ * backend startup.
+ */
+
+bool
+DatabaseHasActiveBackends(Oid databaseId)
+{
+	bool		result = false;
+	SISeg	   *segP = shmInvalBuffer;
+	ProcState  *stateP = segP->procState;
+	int			index;
+
+	SpinAcquire(SInvalLock);
+
+	for (index = 0; index < segP->maxBackends; index++)
+	{
+		SHMEM_OFFSET	pOffset = stateP[index].procStruct;
+
+		if (pOffset != INVALID_OFFSET)
+		{
+			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
+
+			if (proc->databaseId == databaseId)
+			{
+				result = true;
+				break;
+			}
+		}
+	}
+
+	SpinRelease(SInvalLock);
+
+	return result;
+}
+
+/*
+ * TransactionIdIsInProgress -- is given transaction running by some backend
+ */
+bool
+TransactionIdIsInProgress(TransactionId xid)
+{
+	bool		result = false;
+	SISeg	   *segP = shmInvalBuffer;
+	ProcState  *stateP = segP->procState;
+	int			index;
+
+	SpinAcquire(SInvalLock);
+
+	for (index = 0; index < segP->maxBackends; index++)
+	{
+		SHMEM_OFFSET	pOffset = stateP[index].procStruct;
+
+		if (pOffset != INVALID_OFFSET)
+		{
+			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
+
+			if (proc->xid == xid)
+			{
+				result = true;
+				break;
+			}
+		}
+	}
+
+	SpinRelease(SInvalLock);
+
+	return result;
+}
+
+/*
+ * GetXmaxRecent -- returns oldest transaction that was running
+ *					when all current transactions were started.
+ *					It's used by vacuum to decide what deleted
+ *					tuples must be preserved in a table.
+ */
+void
+GetXmaxRecent(TransactionId *XmaxRecent)
+{
+	SISeg	   *segP = shmInvalBuffer;
+	ProcState  *stateP = segP->procState;
+	int			index;
+
+	*XmaxRecent = GetCurrentTransactionId();
+
+	SpinAcquire(SInvalLock);
+
+	for (index = 0; index < segP->maxBackends; index++)
+	{
+		SHMEM_OFFSET	pOffset = stateP[index].procStruct;
+
+		if (pOffset != INVALID_OFFSET)
+		{
+			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
+			TransactionId xmin;
+
+			xmin = proc->xmin;	/* we don't use spin-locking in AbortTransaction() ! */
+			if (proc == MyProc || xmin < FirstTransactionId)
+				continue;
+			if (xmin < *XmaxRecent)
+				*XmaxRecent = xmin;
+		}
+	}
+
+	SpinRelease(SInvalLock);
+}
+
+/*
+ * GetSnapshotData -- returns information about running transactions.
+ */
+Snapshot
+GetSnapshotData(bool serializable)
+{
+	Snapshot	snapshot = (Snapshot) malloc(sizeof(SnapshotData));
+	SISeg	   *segP = shmInvalBuffer;
+	ProcState  *stateP = segP->procState;
+	int			index;
+	int			count = 0;
+
+	/* There can be no more than maxBackends active transactions,
+	 * so this is enough space:
+	 */
+	snapshot->xip = (TransactionId *)
+		malloc(segP->maxBackends * sizeof(TransactionId));
+	snapshot->xmin = GetCurrentTransactionId();
+
+	SpinAcquire(SInvalLock);
+
+	/*
+	 * Unfortunately, we have to call ReadNewTransactionId()
+	 * after acquiring SInvalLock above.  That's not good, because
+	 * ReadNewTransactionId() does SpinAcquire(OidGenLockId), but
+	 * it is _necessary_.
+	 */
+	ReadNewTransactionId(&(snapshot->xmax));
+
+	for (index = 0; index < segP->maxBackends; index++)
+	{
+		SHMEM_OFFSET	pOffset = stateP[index].procStruct;
+
+		if (pOffset != INVALID_OFFSET)
+		{
+			PROC	   *proc = (PROC *) MAKE_PTR(pOffset);
+			TransactionId xid;
+
+			/* 
+			 * We don't use spin-locking when changing proc->xid 
+			 * in GetNewTransactionId() and in AbortTransaction() !..
+			 */
+			xid = proc->xid;
+			if (proc == MyProc || 
+				xid < FirstTransactionId || xid >= snapshot->xmax)
+			{
+				/*
+				 * There is no point in storing any xid >= snapshot->xmax
+				 * (the value we got from ReadNewTransactionId above) in
+				 * snapshot->xip --- we just assume that all xacts with such
+				 * xids are running, so they may be ignored here.
+				 */
+				continue;
+			}
+			if (xid < snapshot->xmin)
+				snapshot->xmin = xid;
+			snapshot->xip[count] = xid;
+			count++;
+		}
+	}
+
+	if (serializable)
+		MyProc->xmin = snapshot->xmin;
+	/* Serializable snapshot must be computed before any other... */
+	Assert(MyProc->xmin != InvalidTransactionId);
+
+	SpinRelease(SInvalLock);
+
+	snapshot->xcnt = count;
+	return snapshot;
+}
diff --git a/src/backend/storage/ipc/sinvaladt.c b/src/backend/storage/ipc/sinvaladt.c
index 1a91dde9b1cf907c0f8ad9167a6bd16a8cd6dfb2..f2e531be444a9b4401c4074cfd36a5ef6e06427b 100644
--- a/src/backend/storage/ipc/sinvaladt.c
+++ b/src/backend/storage/ipc/sinvaladt.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.26 1999/09/09 14:56:06 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/ipc/sinvaladt.c,v 1.27 1999/09/24 00:24:35 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -19,6 +19,8 @@
 #include "miscadmin.h"
 #include "storage/backendid.h"
 #include "storage/lmgr.h"
+#include "storage/proc.h"
+#include "storage/sinval.h"
 #include "storage/sinvaladt.h"
 #include "utils/trace.h"
 
@@ -125,6 +127,7 @@ SISegInit(SISeg *segP, int maxBackends)
 		segP->procState[i].nextMsgNum = -1;	/* inactive */
 		segP->procState[i].resetState = false;
 		segP->procState[i].tag = InvalidBackendTag;
+		segP->procState[i].procStruct = INVALID_OFFSET;
 	}
 }
 
@@ -161,8 +164,8 @@ SIBackendInit(SISeg *segP)
 		}
 	}
 
-	/* elog() with spinlock held is probably not too cool, but these
-	 * conditions should never happen anyway.
+	/* elog() with spinlock held is probably not too cool, but this
+	 * condition should never happen anyway.
 	 */
 	if (stateP == NULL)
 	{
@@ -179,9 +182,10 @@ SIBackendInit(SISeg *segP)
 #endif	 /* INVALIDDEBUG */
 
 	/* mark myself active, with all extant messages already read */
-	stateP->tag = MyBackendTag;
-	stateP->resetState = false;
 	stateP->nextMsgNum = segP->maxMsgNum;
+	stateP->resetState = false;
+	stateP->tag = MyBackendTag;
+	stateP->procStruct = MAKE_OFFSET(MyProc);
 
 	/* register exit routine to mark my entry inactive at exit */
 	on_shmem_exit(CleanupInvalidationState, (caddr_t) segP);
@@ -193,7 +197,8 @@ SIBackendInit(SISeg *segP)
  * CleanupInvalidationState
  *		Mark the current backend as no longer active.
  *
- * This function is called via on_shmem_exit() during backend shutdown.
+ * This function is called via on_shmem_exit() during backend shutdown,
+ * so the caller has NOT acquired the lock for us.
  */
 static void
 CleanupInvalidationState(int status,
@@ -201,13 +206,14 @@ CleanupInvalidationState(int status,
 {
 	Assert(PointerIsValid(segP));
 
-	/* XXX we probably oughta grab the SInval spinlock for this...
-	 * but I think it is safe not to.
-	 */
+	SpinAcquire(SInvalLock);
 
 	segP->procState[MyBackendId - 1].nextMsgNum = -1;
 	segP->procState[MyBackendId - 1].resetState = false;
 	segP->procState[MyBackendId - 1].tag = InvalidBackendTag;
+	segP->procState[MyBackendId - 1].procStruct = INVALID_OFFSET;
+
+	SpinRelease(SInvalLock);
 }
 
 /*
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index 6186904ad455640c9476a8d2c3693afd1190ca7f..159edf0549b6c519f7eea4f2f3863c39776ab0aa 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.60 1999/07/17 20:17:47 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.61 1999/09/24 00:24:41 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -46,7 +46,7 @@
  *		This is so that we can support more backends. (system-wide semaphore
  *		sets run out pretty fast.)				  -ay 4/95
  *
- * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.60 1999/07/17 20:17:47 momjian Exp $
+ * $Header: /cvsroot/pgsql/src/backend/storage/lmgr/proc.c,v 1.61 1999/09/24 00:24:41 tgl Exp $
  */
 #include <sys/time.h>
 #include <unistd.h>
@@ -296,6 +296,7 @@ InitProcess(IPCKey key)
 	SpinRelease(ProcStructLock);
 
 	MyProc->pid = MyProcPid;
+	MyProc->databaseId = MyDatabaseId;
 	MyProc->xid = InvalidTransactionId;
 	MyProc->xmin = InvalidTransactionId;
 
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 304dc786f2955ccfdc791f77d07aaf0093b31cf6..444181a938a78ba63f793c5959bf6b0e146789f3 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.53 1999/09/05 23:24:53 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/storage/smgr/md.c,v 1.54 1999/09/24 00:24:47 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -576,12 +576,10 @@ mdblindwrt(char *dbstr,
 /* this is work arround only !!! */
 	{
 		char		dbpath[MAXPGPATH + 1];
-		int4		owner;
 		Oid			id;
 		char	   *tmpPath;
-		int			tmpEncoding;
 
-		GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);
+		GetRawDatabaseInfo(dbstr, &id, dbpath);
 
 		if (id != dbid)
 			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
@@ -615,12 +613,10 @@ mdblindwrt(char *dbstr,
 /* this is work arround only !!! */
 	{
 		char		dbpath[MAXPGPATH + 1];
-		int4		owner;
 		Oid			id;
 		char	   *tmpPath;
-		int			tmpEncoding;
 
-		GetRawDatabaseInfo(dbstr, &owner, &id, dbpath, &tmpEncoding);
+		GetRawDatabaseInfo(dbstr, &id, dbpath);
 
 		if (id != dbid)
 			elog(FATAL, "mdblindwrt: oid of db %s is not %u", dbstr, dbid);
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index 3986bee47bed7caecca17de16dac16c52e487514..4947b2913752b53a6fb23825e2316b5098dcc00d 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.128 1999/08/31 04:26:40 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/tcop/postgres.c,v 1.129 1999/09/24 00:24:52 tgl Exp $
  *
  * NOTES
  *	  this is the "main" module of the postgres backend and
@@ -113,7 +113,22 @@ char		relname[80];		/* current relation name */
 
 /* note: these declarations had better match tcopprot.h */
 DLLIMPORT sigjmp_buf Warn_restart;
-bool		InError;
+
+bool		InError = true;
+
+/*
+ * Note: InError is a flag to elog() telling whether it is safe to longjmp
+ * back to PostgresMain.  It is "false", allowing an error longjmp, during
+ * normal processing.  It is "true" during startup, when we have not yet
+ * set the Warn_restart jmp_buf, and also "true" in the interval when we
+ * have executed a longjmp back to PostgresMain and not yet finished cleaning
+ * up after the error.  In either case, elog(ERROR) should be treated as a
+ * fatal exit condition rather than attempting to recover --- since there is
+ * no place to recover to in the first case, and we don't want to risk an
+ * infinite loop of "error recoveries" in the second case.
+ *
+ * Therefore, InError starts out "true" at program load time, as shown above.
+ */
 
 extern int	NBuffers;
 
@@ -1469,7 +1484,7 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[])
 	if (!IsUnderPostmaster)
 	{
 		puts("\nPOSTGRES backend interactive interface ");
-		puts("$Revision: 1.128 $ $Date: 1999/08/31 04:26:40 $\n");
+		puts("$Revision: 1.129 $ $Date: 1999/09/24 00:24:52 $\n");
 	}
 
 	/* ----------------
@@ -1479,6 +1494,7 @@ PostgresMain(int argc, char *argv[], int real_argc, char *real_argv[])
 	 *	so we abort the current transaction and start a new one.
 	 *
 	 *	Note:  elog(ERROR) does a siglongjmp() to transfer control here.
+	 *	See comments with the declaration of InError, above.
 	 * ----------------
 	 */
 
diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c
index 6307f0cd6a30ba09a7bd96afe81c4ebe555481e4..cbbe82eb3b5c2ae689477d2920ebc446a6e17ae3 100644
--- a/src/backend/tcop/pquery.c
+++ b/src/backend/tcop/pquery.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/tcop/pquery.c,v 1.27 1999/07/17 20:17:51 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/tcop/pquery.c,v 1.28 1999/09/24 00:24:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -55,7 +55,6 @@ CreateExecutorState(void)
 {
 	EState	   *state;
 	extern int	NBuffers;
-	long	   *refcount;
 
 	/* ----------------
 	 *	create a new executor state
@@ -81,10 +80,6 @@ CreateExecutorState(void)
 
 	state->es_junkFilter = NULL;
 
-	refcount = (long *) palloc(NBuffers * sizeof(long));
-	MemSet((char *) refcount, 0, NBuffers * sizeof(long));
-	state->es_refcount = (int *) refcount;
-
 	/* ----------------
 	 *	return the executor state structure
 	 * ----------------
diff --git a/src/backend/utils/init/postinit.c b/src/backend/utils/init/postinit.c
index 4118fe16a181971a8cc39aab194dbb19f9342b50..51d4727fe35a726d4bae9faea13eca53456ec21e 100644
--- a/src/backend/utils/init/postinit.c
+++ b/src/backend/utils/init/postinit.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/init/postinit.c,v 1.48 1999/07/17 20:18:08 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/init/postinit.c,v 1.49 1999/09/24 00:24:58 tgl Exp $
  *
  * NOTES
  *		InitPostgres() is the function called from PostgresMain
@@ -36,6 +36,7 @@
 
 #include "access/heapam.h"
 #include "catalog/catname.h"
+#include "catalog/pg_database.h"
 #include "libpq/libpq.h"
 #include "miscadmin.h"
 #include "storage/backendid.h"
@@ -54,13 +55,12 @@
 
 static void VerifySystemDatabase(void);
 static void VerifyMyDatabase(void);
+static void ReverifyMyDatabase(char *name);
 static void InitCommunication(void);
 static void InitMyDatabaseInfo(char *name);
 static void InitStdio(void);
 static void InitUserid(void);
 
-extern char *ExpandDatabasePath(char *name);
-extern void GetRawDatabaseInfo(char *name, int4 *owner, Oid *db_id, char *path, int *encoding);
 
 static IPCKey PostgresIpcKey;
 
@@ -98,13 +98,11 @@ static IPCKey PostgresIpcKey;
 static void
 InitMyDatabaseInfo(char *name)
 {
-	int4		owner;
 	char	   *path,
 				myPath[MAXPGPATH + 1];
-	int			encoding;
 
 	SetDatabaseName(name);
-	GetRawDatabaseInfo(name, &owner, &MyDatabaseId, myPath, &encoding);
+	GetRawDatabaseInfo(name, &MyDatabaseId, myPath);
 
 	if (!OidIsValid(MyDatabaseId))
 		elog(FATAL,
@@ -114,11 +112,6 @@ InitMyDatabaseInfo(char *name)
 
 	path = ExpandDatabasePath(myPath);
 	SetDatabasePath(path);
-#ifdef MULTIBYTE
-	SetDatabaseEncoding(encoding);
-#endif
-
-	return;
 }	/* InitMyDatabaseInfo() */
 
 
@@ -249,6 +242,86 @@ VerifyMyDatabase()
 	/* Above does not return */
 }	/* VerifyMyDatabase() */
 
+/* --------------------------------
+ *		ReverifyMyDatabase
+ *
+ * Cross-check, once locking and the other infrastructure are running, that
+ * the pg_database row for database "name" has OID MyDatabaseId.
+ * Returns normally if so; otherwise reports the problem with elog(FATAL).
+ *
+ * Since we are forced to fetch the database OID out of pg_database without
+ * benefit of locking or transaction ID checking (see utils/misc/database.c),
+ * we might have gotten a wrong answer.  Or, we might have attached to a
+ * database that's in the process of being destroyed by destroydb().
+ *
+ * In reality, if destroydb() is running in parallel with our startup,
+ * it's pretty likely that we will have failed before now, due to being
+ * unable to read some of the system tables within the doomed database.
+ * This routine just exists to make *sure* we have not started up in an
+ * invalid database.  If we quit now, we should have managed to avoid
+ * creating any serious problems.
+ *
+ * In MULTIBYTE builds, this also saves the encoding stored in that row.
+ * --------------------------------
+ */
+static void
+ReverifyMyDatabase(char *name)
+{
+	Relation	pgdbrel;
+	HeapScanDesc pgdbscan;
+	ScanKeyData	key;
+	HeapTuple	tup;
+
+	/*
+	 * Because we grab AccessShareLock here, we can be sure that
+	 * destroydb is not running in parallel with us (any more).
+	 */
+	pgdbrel = heap_openr(DatabaseRelationName, AccessShareLock);
+
+	ScanKeyEntryInitialize(&key, 0, Anum_pg_database_datname,
+						   F_NAMEEQ, NameGetDatum(name));
+
+	pgdbscan = heap_beginscan(pgdbrel, 0, SnapshotNow, 1, &key);
+
+	tup = heap_getnext(pgdbscan, 0);
+	if (!HeapTupleIsValid(tup) ||
+		tup->t_data->t_oid != MyDatabaseId)
+	{
+		/* OOPS: no matching row, or its OID is not MyDatabaseId */
+		heap_close(pgdbrel, AccessShareLock);
+		/*
+		 * The only real problem I could have created is to load dirty
+		 * buffers for the dead database into shared buffer cache;
+		 * if I did, some other backend will eventually try to write
+		 * them and die in mdblindwrt.  Flush any such pages to forestall
+		 * trouble.
+		 */
+		DropBuffers(MyDatabaseId);
+		/* Now I can commit hara-kiri with a clear conscience... */
+		elog(FATAL, "Database '%s', OID %u, has disappeared from pg_database",
+			 name, MyDatabaseId);
+	}
+
+	/*
+	 * OK, we're golden.  Only other to-do item is to save the MULTIBYTE
+	 * encoding info out of the pg_database tuple.  Note we also set the
+	 * "template encoding", which is the default encoding for any
+	 * CREATE DATABASE commands executed in this backend; essentially,
+	 * you get the same encoding as the database you connected to, as
+	 * the default.  (This replaces code that unreliably grabbed
+	 * template1's encoding out of pg_database.  We could do an extra
+	 * scan to find template1's tuple, but for 99.99% of all backend
+	 * startups it'd be wasted cycles --- and the 'createdb' script
+	 * connects to template1 anyway, so there's no difference.)
+	 */
+#ifdef MULTIBYTE
+	SetDatabaseEncoding(((Form_pg_database) GETSTRUCT(tup))->encoding);
+	SetTemplateEncoding(((Form_pg_database) GETSTRUCT(tup))->encoding);
+#endif
+
+	heap_endscan(pgdbscan);		/* tup is not referenced past this point */
+	heap_close(pgdbrel, AccessShareLock);
+}
 
 /* --------------------------------
  *		InitUserid
@@ -402,17 +475,11 @@ InitStdio()
  *		Be very careful with the order of calls in the InitPostgres function.
  * --------------------------------
  */
-bool		PostgresIsInitialized = false;
 extern int	NBuffers;
 
-/*
- *	this global is used by wei for testing his code, but must be declared
- *	here rather than in postgres.c so that it's defined for cinterface.a
- *	applications.
- */
+bool		PostgresIsInitialized = false;
 
-/*int	testFlag = 0;*/
-int			lockingOff = 0;
+int			lockingOff = 0;		/* backend -L switch */
 
 /*
  */
@@ -530,21 +597,21 @@ InitPostgres(char *name)		/* database name */
 	LockDisable(false);
 
 	/* ----------------
-	 *	anyone knows what this does?  something having to do with
-	 *	system catalog cache invalidation in the case of multiple
-	 *	backends, I think -cim 10/3/90
-	 *	Sets up MyBackendId a unique backend identifier.
+	 * Set up my per-backend PROC struct in shared memory.
 	 * ----------------
 	 */
-	InitSharedInvalidationState();
+	InitProcess(PostgresIpcKey);
 
 	/* ----------------
-	 * Set up a per backend process in shared memory.  Must be done after
-	 * InitSharedInvalidationState() as it relies on MyBackendId being
-	 * initialized already.  XXX -mer 11 Aug 1991
+	 *	Initialize my entry in the shared-invalidation manager's
+	 *	array of per-backend data.  (Formerly this came before
+	 *	InitProcess, but now it must happen after, because it uses
+	 *	MyProc.)  Once I have done this, I am visible to other backends!
+	 *
+	 *	Sets up MyBackendId, a unique backend identifier.
 	 * ----------------
 	 */
-	InitProcess(PostgresIpcKey);
+	InitSharedInvalidationState();
 
 	if (MyBackendId > MAXBACKENDS || MyBackendId <= 0)
 	{
@@ -592,7 +659,6 @@ InitPostgres(char *name)		/* database name */
 	 * ----------------
 	 */
 	PostgresIsInitialized = true;
-/*	  on_shmem_exit(DestroyLocalRelList, (caddr_t) NULL); */
 
 	/* ----------------
 	 *	Done with "InitPostgres", now change to NormalProcessing unless
@@ -601,7 +667,14 @@ InitPostgres(char *name)		/* database name */
 	 */
 	if (!bootstrap)
 		SetProcessingMode(NormalProcessing);
-/*	  if (testFlag || lockingOff) */
 	if (lockingOff)
 		LockDisable(true);
+
+	/*
+	 * Unless we are bootstrapping, double-check that InitMyDatabaseInfo()
+	 * got a correct result.  We can't do this until essentially all the
+	 * infrastructure is up, so just do it at the end.
+	 */
+	if (!bootstrap)
+		ReverifyMyDatabase(name);
 }
diff --git a/src/backend/utils/misc/database.c b/src/backend/utils/misc/database.c
index 321ab943aeac394811e8d136d75ba550c696dece..f5ff732b8f2f9dfa967978502febb8364d68003e 100644
--- a/src/backend/utils/misc/database.c
+++ b/src/backend/utils/misc/database.c
@@ -1,13 +1,13 @@
 /*-------------------------------------------------------------------------
  *
  * database.c
- *	  miscellanious initialization support stuff
+ *	  miscellaneous initialization support stuff
  *
  * Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/misc/Attic/database.c,v 1.29 1999/09/18 19:08:07 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/misc/Attic/database.c,v 1.30 1999/09/24 00:25:04 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,10 +21,6 @@
 #include "miscadmin.h"
 #include "utils/syscache.h"
 
-#ifdef MULTIBYTE
-#include "mb/pg_wchar.h"
-#endif
-
 #ifdef NOT_USED
 /* GetDatabaseInfo()
  * Pull database information from pg_database.
@@ -153,24 +149,13 @@ ExpandDatabasePath(char *dbpath)
  *		cache.	To get around this problem, this code opens and scans the
  *		pg_database relation by hand.
  *
- *		This algorithm relies on the fact that first attribute in the
- *		pg_database relation schema is the database name.  It also knows
- *		about the internal format of tuples on disk and the length of
- *		the datname attribute.	It knows the location of the pg_database
- *		file.
- *		Actually, the code looks as though it is using the pg_database
- *		tuple definition to locate the database name, so the above statement
- *		seems to be no longer correct. - thomas 1997-11-01
- *
- *		This code is called from InitPostgres(), before we chdir() to the
- *		local database directory and before we open any relations.
- *		Used to be called after the chdir(), but we now want to confirm
- *		the location of the target database using pg_database info.
- *		- thomas 1997-11-01
+ *		This code knows way more than it should about the layout of
+ *		tuples on disk, but there seems to be no help for that.
+ *		We're pulling ourselves up by the bootstraps here...
  * --------------------------------
  */
 void
-GetRawDatabaseInfo(char *name, int4 *owner, Oid *db_id, char *path, int *encoding)
+GetRawDatabaseInfo(char *name, Oid *db_id, char *path)
 {
 	int			dbfd;
 	int			fileflags;
@@ -238,48 +223,38 @@ GetRawDatabaseInfo(char *name, int4 *owner, Oid *db_id, char *path, int *encodin
 			 * skip this tuple.  XXX warning, will robinson:  violation of
 			 * transaction semantics happens right here.  we should check
 			 * to be sure that the xact that deleted this tuple actually
-			 * committed.  only way to do this at init time is to paw over
-			 * the log relation by hand, too.  let's be optimistic.
+			 * committed.  Only way to do that at init time is to paw over
+			 * the log relation by hand, too.  Instead we take the
+			 * conservative assumption that if someone tried to delete it,
+			 * it's gone.  The other side of the coin is that we might
+			 * accept a tuple that was stored and never committed.  All in
+			 * all, this code is pretty shaky.  We will cross-check our
+			 * result in ReverifyMyDatabase() in postinit.c.
 			 *
-			 * XXX This is an evil type cast.  tup->t_xmax is char[5] while
-			 * TransactionId is struct * { char data[5] }.	It works but
-			 * if data is ever moved and no longer the first field this
-			 * will be broken!! -mer 11 Nov 1991.
+			 * NOTE: if a bogus tuple in pg_database prevents connection
+			 * to a valid database, a fix is to connect to another database
+			 * and do "select * from pg_database".  That should cause
+			 * committed and dead tuples to be marked with correct states.
+			 *
+			 * XXX wouldn't it be better to let new backends read the
+			 * database OID from a flat file, handled the same way
+			 * we handle the password relation?
 			 */
 			if (TransactionIdIsValid((TransactionId) tup.t_data->t_xmax))
 				continue;
 
 			/*
-			 * Okay, see if this is the one we want. XXX 1 july 91:  mao
-			 * and mer discover that tuples now squash t_bits.	Why is
-			 * this?
-			 *
-			 * 24 july 92:	mer realizes that the t_bits field is only used
-			 * in the event of null values.  If no fields are null we
-			 * reduce the header size by doing the squash.	t_hoff tells
-			 * you exactly how big the header actually is. use the PC
-			 * means of getting at sys cat attrs.
+			 * Okay, see if this is the one we want.
 			 */
 			tup_db = (Form_pg_database) GETSTRUCT(&tup);
-#ifdef MULTIBYTE
 
-			/*
-			 * get encoding from template database. This is the "default
-			 * for default" for create database command.
-			 */
-			if (strcmp("template1", tup_db->datname.data) == 0)
-				SetTemplateEncoding(tup_db->encoding);
-#endif
 			if (strcmp(name, tup_db->datname.data) == 0)
 			{
+				/* Found it; extract the OID and the database path. */
 				*db_id = tup.t_data->t_oid;
 				strncpy(path, VARDATA(&(tup_db->datpath)),
 						(VARSIZE(&(tup_db->datpath)) - VARHDRSZ));
 				*(path + VARSIZE(&(tup_db->datpath)) - VARHDRSZ) = '\0';
-#ifdef MULTIBYTE
-				*encoding = tup_db->encoding;
-#endif
-
 				goto done;
 			}
 		}
diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h
index a23c81227e645aa1829454c111ed6633ff444300..662358f4599cf20fa85a2e790e04d743391f06ff 100644
--- a/src/include/executor/executor.h
+++ b/src/include/executor/executor.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: executor.h,v 1.37 1999/07/17 20:18:26 momjian Exp $
+ * $Id: executor.h,v 1.38 1999/09/24 00:25:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -36,22 +36,6 @@
 	) \
 )
 
-/* --------------------------------
- *		ExecIncrSlotBufferRefcnt
- *
- *		When we pass around buffers in the tuple table, we have to
- *		be careful to increment reference counts appropriately.
- *		This is used mainly in the mergejoin code.
- * --------------------------------
- */
-#define ExecIncrSlotBufferRefcnt(slot) \
-( \
-	BufferIsValid((slot)->ttc_buffer) ? \
-		IncrBufferRefCount((slot)->ttc_buffer) \
-	: (void)NULL \
-)
-
-
 /*
  * prototypes from functions in execAmi.c
  */
diff --git a/src/include/miscadmin.h b/src/include/miscadmin.h
index a9e603053f8532ef927e683868066baf5725da61..ecd567c364f12a7c6d7dc6161453de82a806dac3 100644
--- a/src/include/miscadmin.h
+++ b/src/include/miscadmin.h
@@ -11,7 +11,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: miscadmin.h,v 1.40 1999/06/12 22:17:23 tgl Exp $
+ * $Id: miscadmin.h,v 1.41 1999/09/24 00:25:16 tgl Exp $
  *
  * NOTES
  *	  some of the information in this file will be moved to
@@ -111,7 +111,7 @@ extern char *DatabaseName;
 extern char *DatabasePath;
 
 /* in utils/misc/database.c */
-extern void GetRawDatabaseInfo(char *name, int4 *owner, Oid *db_id, char *path, int *encoding);
+extern void GetRawDatabaseInfo(char *name, Oid *db_id, char *path);
 extern int	GetDatabaseInfo(char *name, int4 *owner, char *path);
 extern char *ExpandDatabasePath(char *path);
 
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 132f05266762c791a48398e23f3afefa6d3a1d30..3263e500f2c6ee2aa7240ae5e9e254534dce41b2 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: execnodes.h,v 1.34 1999/08/21 03:49:08 tgl Exp $
+ * $Id: execnodes.h,v 1.35 1999/09/24 00:25:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -205,7 +205,6 @@ typedef struct EState
 	int			es_BaseId;
 	TupleTable	es_tupleTable;
 	JunkFilter *es_junkFilter;
-	int		   *es_refcount;
 	uint32		es_processed;	/* # of tuples processed */
 	Oid			es_lastoid;		/* last oid processed (by INSERT) */
 	List	   *es_rowMark;		/* not good place, but there is no other */
diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h
index b98f0fb820f057834821273d52b50fe93579761c..b38d3ff5c8864f6a661354b88befb333e8958cdc 100644
--- a/src/include/storage/buf_internals.h
+++ b/src/include/storage/buf_internals.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: buf_internals.h,v 1.32 1999/09/18 19:08:18 tgl Exp $
+ * $Id: buf_internals.h,v 1.33 1999/09/24 00:25:27 tgl Exp $
  *
  * NOTE
  *		If BUFFERPAGE0 is defined, then 0 will be used as a
@@ -68,7 +68,7 @@ struct buftag
 	(a)->relId = (xx_reln)->rd_lockInfo.lockRelId \
 )
 
-#define BAD_BUFFER_ID(bid) ((bid<1) || (bid>(NBuffers)))
+#define BAD_BUFFER_ID(bid) ((bid) < 1 || (bid) > NBuffers)
 #define INVALID_DESCRIPTOR (-3)
 
 /*
@@ -168,7 +168,6 @@ extern bool BufTableInsert(BufferDesc *buf);
 extern BufferDesc *BufferDescriptors;
 extern BufferBlock BufferBlocks;
 extern long *PrivateRefCount;
-extern long *LastRefCount;
 extern bits8 *BufferLocks;
 extern long *CommitInfoNeedsSave;
 extern SPINLOCK BufMgrLock;
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index b55230e13672c1a348a4886125a7f232f550c14a..fb901b8f442108e51ae943aae540d30ba5e47de4 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: bufmgr.h,v 1.30 1999/09/23 17:03:27 momjian Exp $
+ * $Id: bufmgr.h,v 1.31 1999/09/24 00:25:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -71,12 +71,35 @@ extern int	ShowPinTrace;
 
 /*
  * BufferIsValid
- *		True iff the refcnt of the local buffer is > 0
+ *		True iff the given buffer number is valid (either as a shared
+ *		or local buffer).
+ *
  * Note:
  *		BufferIsValid(InvalidBuffer) is False.
  *		BufferIsValid(UnknownBuffer) is False.
+ *
+ * Note: For a long time this was defined the same as BufferIsPinned,
+ * that is, it would say False if you didn't hold a pin on the buffer.
+ * I believe this was bogus and served only to mask logic errors.
+ * Code should always know whether it has a buffer reference,
+ * independently of the pin state.
  */
 #define BufferIsValid(bufnum) \
+( \
+	BufferIsLocal(bufnum) ? \
+		((bufnum) >= -NLocBuffer) \
+	: \
+		(! BAD_BUFFER_ID(bufnum)) \
+)
+
+/*
+ * BufferIsPinned
+ *		True iff the buffer is pinned (also checks for valid buffer number).
+ *
+ *		NOTE: what we check here is that *this* backend holds a pin on
+ *		the buffer.  We do not care whether some other backend does.
+ */
+#define BufferIsPinned(bufnum) \
 ( \
 	BufferIsLocal(bufnum) ? \
 		((bufnum) >= -NLocBuffer && LocalRefCount[-(bufnum) - 1] > 0) \
@@ -90,28 +113,27 @@ extern int	ShowPinTrace;
 )
 
 /*
- * BufferIsPinned
- *		True iff the buffer is pinned (and therefore valid)
+ * IncrBufferRefCount
+ *		Increment the pin count on a buffer that we have *already* pinned
+ *		at least once.
  *
- * Note:
- *		Smenatics are identical to BufferIsValid
- *		XXX - need to remove either one eventually.
+ *		This macro cannot be used on a buffer we do not have pinned,
+ *		because it doesn't change the shared buffer state.  Therefore the
+ *		Assert checks are for refcount > 0.  Someone got this wrong once...
  */
-#define BufferIsPinned BufferIsValid
-
-
 #define IncrBufferRefCount(buffer) \
 ( \
 	BufferIsLocal(buffer) ? \
 	( \
-		(void)AssertMacro(LocalRefCount[-(buffer) - 1] >= 0), \
-		(void)LocalRefCount[-(buffer) - 1]++ \
+		(void) AssertMacro((buffer) >= -NLocBuffer), \
+		(void) AssertMacro(LocalRefCount[-(buffer) - 1] > 0), \
+		(void) LocalRefCount[-(buffer) - 1]++ \
 	) \
 	: \
 	( \
-		(void)AssertMacro(!BAD_BUFFER_ID(buffer)), \
-		(void)AssertMacro(PrivateRefCount[(buffer) - 1] >= 0), \
-		(void)PrivateRefCount[(buffer) - 1]++ \
+		(void) AssertMacro(!BAD_BUFFER_ID(buffer)), \
+		(void) AssertMacro(PrivateRefCount[(buffer) - 1] > 0), \
+		(void) PrivateRefCount[(buffer) - 1]++ \
 	) \
 )
 
@@ -151,19 +173,18 @@ extern int	BufferPoolCheckLeak(void);
 extern void FlushBufferPool(int StableMainMemoryFlag);
 extern BlockNumber BufferGetBlockNumber(Buffer buffer);
 extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
+extern int	FlushRelationBuffers(Relation rel, BlockNumber block,
+								 bool doFlush);
 extern void ReleaseRelationBuffers(Relation rel);
 extern void DropBuffers(Oid dbid);
 extern void PrintPinnedBufs(void);
 extern int	BufferShmemSize(void);
 extern int	ReleaseBuffer(Buffer buffer);
 
-extern void BufferRefCountReset(int *refcountsave);
-extern void BufferRefCountRestore(int *refcountsave);
 extern int	SetBufferWriteMode(int mode);
 extern void SetBufferCommitInfoNeedsSave(Buffer buffer);
-extern int BlowawayRelationBuffers(Relation rel, BlockNumber block);
 
 extern void UnlockBuffers(void);
 extern void LockBuffer(Buffer buffer, int mode);
 
-#endif	 /* !defined(BufMgrIncluded) */
+#endif
diff --git a/src/include/storage/proc.h b/src/include/storage/proc.h
index 4a5eb5533a22f71f6f172c500066839dd11ee531..d28e936b33fa7984f85f9ee43c2599cd7893a1f2 100644
--- a/src/include/storage/proc.h
+++ b/src/include/storage/proc.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: proc.h,v 1.25 1999/07/15 23:04:13 momjian Exp $
+ * $Id: proc.h,v 1.26 1999/09/24 00:25:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -51,7 +51,8 @@ typedef struct proc
 	LOCK	   *waitLock;		/* Lock we're sleeping on ... */
 	int			token;			/* type of lock we sleeping for */
 	int			holdLock;		/* while holding these locks */
-	int			pid;			/* This procs process id */
+	int			pid;			/* This backend's process id */
+	Oid			databaseId;		/* OID of database this backend is using */
 	short		sLocks[MAX_SPINS];		/* Spin lock stats */
 	SHM_QUEUE	lockQueue;		/* locks associated with current
 								 * transaction */
@@ -64,6 +65,7 @@ typedef struct proc
  * on your machine), or our free-semaphores bitmap won't work.  You also must
  * not set it higher than your kernel's SEMMSL (max semaphores per set)
  * parameter, which is often around 25.
+ *
  * MAX_PROC_SEMS is the maximum number of per-process semaphores (those used
  * by the lock mgr) we can keep track of.  It must be a multiple of
  * PROC_NSEMS_PER_SET.
@@ -78,9 +80,9 @@ typedef struct procglobal
 	int32		freeSemMap[MAX_PROC_SEMS / PROC_NSEMS_PER_SET];
 
 	/*
-	 * In each freeSemMap entry, the PROC_NSEMS_PER_SET lsbs flag whether
-	 * individual semaphores are in use, and the next higher bit is set to
-	 * show that the entire set is allocated.
+	 * In each freeSemMap entry, the PROC_NSEMS_PER_SET least-significant bits
+	 * flag whether individual semaphores are in use, and the next higher bit
+	 * is set to show that the entire set is allocated.
 	 */
 } PROC_HDR;
 
diff --git a/src/include/storage/shmem.h b/src/include/storage/shmem.h
index d73404d1543e8ebfe5a66dc55e9531b0445dd474..c10aec4c91287e31bd823a0ce38c29612b690c02 100644
--- a/src/include/storage/shmem.h
+++ b/src/include/storage/shmem.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: shmem.h,v 1.20 1999/07/16 17:07:38 momjian Exp $
+ * $Id: shmem.h,v 1.21 1999/09/24 00:25:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -70,8 +70,6 @@ extern bool ShmemPIDLookup(int pid, SHMEM_OFFSET *locationPtr);
 extern SHMEM_OFFSET ShmemPIDDestroy(int pid);
 extern long *ShmemInitStruct(char *name, unsigned long size,
 				bool *foundPtr);
-extern bool TransactionIdIsInProgress(TransactionId xid);
-extern void GetXmaxRecent(TransactionId *XmaxRecent);
 
 
 typedef int TableID;
diff --git a/src/include/storage/sinval.h b/src/include/storage/sinval.h
index 8d0f35a43c369da28d44fc73f671e99d3c75194d..19ce6375f032224cca7d1a57927843e0f20a0d7d 100644
--- a/src/include/storage/sinval.h
+++ b/src/include/storage/sinval.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: sinval.h,v 1.12 1999/07/15 23:04:14 momjian Exp $
+ * $Id: sinval.h,v 1.13 1999/09/24 00:25:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,5 +26,9 @@ extern void RegisterSharedInvalid(int cacheId, Index hashIndex,
 extern void InvalidateSharedInvalid(void (*invalFunction) (),
 												void (*resetFunction) ());
 
+extern bool DatabaseHasActiveBackends(Oid databaseId);
+extern bool TransactionIdIsInProgress(TransactionId xid);
+extern void GetXmaxRecent(TransactionId *XmaxRecent);
+
 
 #endif	 /* SINVAL_H */
diff --git a/src/include/storage/sinvaladt.h b/src/include/storage/sinvaladt.h
index b9d349a4c5755181ea80b3e1bef5cfa1a61434e3..7944f21a64e8a628985d23c868f4a26b0669337d 100644
--- a/src/include/storage/sinvaladt.h
+++ b/src/include/storage/sinvaladt.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 1994, Regents of the University of California
  *
- * $Id: sinvaladt.h,v 1.18 1999/09/06 19:37:37 tgl Exp $
+ * $Id: sinvaladt.h,v 1.19 1999/09/24 00:25:27 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -15,6 +15,7 @@
 
 #include "storage/ipc.h"
 #include "storage/itemptr.h"
+#include "storage/shmem.h"
 
 /*
  * The shared cache invalidation manager is responsible for transmitting
@@ -77,6 +78,7 @@ typedef struct ProcState
 	int			nextMsgNum;		/* next message number to read, or -1 */
 	bool		resetState;		/* true, if backend has to reset its state */
 	int			tag;			/* backend tag received from postmaster */
+	SHMEM_OFFSET procStruct;	/* location of backend's PROC struct */
 } ProcState;
 
 /* Shared cache invalidation memory segment */
diff --git a/src/test/regress/output/misc.source b/src/test/regress/output/misc.source
index ff6b77291d8309c831079b104212866fc7c7c211..4eb0dafb1ef4dd876d1d77b862a428f899ec48be 100644
--- a/src/test/regress/output/misc.source
+++ b/src/test/regress/output/misc.source
@@ -404,52 +404,61 @@ QUERY: SELECT p.name, p.hobbies.name, p.hobbies.equipment.name FROM person p;
 name |name       |name         
 -----+-----------+-------------
 mike |posthacking|advil        
-joe  |basketball |peet's coffee
+mike |posthacking|peet's coffee
+joe  |basketball |hightops     
 sally|basketball |hightops     
-(3 rows)
+(4 rows)
 
 QUERY: SELECT p.name, p.hobbies.name, p.hobbies.equipment.name FROM person* p;
 name |name       |name         
 -----+-----------+-------------
 mike |posthacking|advil        
-joe  |basketball |peet's coffee
+mike |posthacking|peet's coffee
+joe  |basketball |hightops     
 sally|basketball |hightops     
 jeff |posthacking|advil        
-(4 rows)
+jeff |posthacking|peet's coffee
+(6 rows)
 
 QUERY: SELECT p.hobbies.equipment.name, p.name, p.hobbies.name FROM person p;
-name    |name |name       
---------+-----+-----------
-advil   |mike |posthacking
-hightops|joe  |basketball 
-hightops|sally|basketball 
-(3 rows)
+name         |name |name       
+-------------+-----+-----------
+advil        |mike |posthacking
+peet's coffee|mike |posthacking
+hightops     |joe  |basketball 
+hightops     |sally|basketball 
+(4 rows)
 
 QUERY: SELECT p.hobbies.equipment.name, p.name, p.hobbies.name FROM person* p;
-name    |name |name       
---------+-----+-----------
-advil   |mike |posthacking
-hightops|joe  |basketball 
-hightops|sally|basketball 
-advil   |jeff |posthacking
-(4 rows)
+name         |name |name       
+-------------+-----+-----------
+advil        |mike |posthacking
+peet's coffee|mike |posthacking
+hightops     |joe  |basketball 
+hightops     |sally|basketball 
+advil        |jeff |posthacking
+peet's coffee|jeff |posthacking
+(6 rows)
 
 QUERY: SELECT p.hobbies.equipment.name, p.hobbies.name, p.name FROM person p;
 name         |name       |name 
 -------------+-----------+-----
 advil        |posthacking|mike 
-peet's coffee|basketball |joe  
+peet's coffee|posthacking|mike 
+hightops     |basketball |joe  
 hightops     |basketball |sally
-(3 rows)
+(4 rows)
 
 QUERY: SELECT p.hobbies.equipment.name, p.hobbies.name, p.name FROM person* p;
 name         |name       |name 
 -------------+-----------+-----
 advil        |posthacking|mike 
-peet's coffee|basketball |joe  
+peet's coffee|posthacking|mike 
+hightops     |basketball |joe  
 hightops     |basketball |sally
 advil        |posthacking|jeff 
-(4 rows)
+peet's coffee|posthacking|jeff 
+(6 rows)
 
 QUERY: SELECT user_relns() AS user_relns
    ORDER BY user_relns;