diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index d8ec739b2a8cb4da030a9ae0d942665fb267b305..964b8b4e11a92d67d3b9d79599932e826ef641be 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -1,4 +1,4 @@
-$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.5 2001/07/15 22:48:16 tgl Exp $
+$Header: /cvsroot/pgsql/src/backend/access/nbtree/README,v 1.6 2002/10/20 20:47:31 tgl Exp $
 
 This directory contains a correct implementation of Lehman and Yao's
 high-concurrency B-tree management algorithm (P. Lehman and S. Yao,
@@ -60,14 +60,19 @@ into Postgres:
    move right until we find a page whose right-link matches the page we
    came from.
 
-+  Read locks on a page are held for as long as a scan has a pointer
-   to the page.  However, locks are always surrendered before the
-   sibling page lock is acquired (for readers), so we remain deadlock-
-   free.  I will do a formal proof if I get bored anytime soon.
-   NOTE: nbtree.c arranges to drop the read lock, but not the buffer pin,
++  Read locks on a page are held for as long as a scan is examining the page.
+   However, nbtree.c arranges to drop the read lock (but not the buffer pin)
    on the current page of a scan before control leaves nbtree.  When we
    come back to resume the scan, we have to re-grab the read lock and
-   then move right if the current item moved (see _bt_restscan()).
+   then move right if the current item moved (see _bt_restscan()).  Keeping
+   the pin ensures that the current item cannot move left or be deleted
+   (see btbulkdelete).
+
++  In most cases we release our lock and pin on a page before attempting
+   to acquire pin and lock on the page we are moving to.  In a few places
+   it is necessary to lock the next page before releasing the current one.
+   This is safe when moving right or up, but not when moving left or down
+   (else we'd create the possibility of deadlocks).
 
 +  Lehman and Yao fail to discuss what must happen when the root page
    becomes full and must be split.  Our implementation is to split the
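As a rough illustration of the pin-versus-lock discipline and the "lock the next
page before releasing the current one" rule described in the paragraphs added
above, here is a minimal C sketch (not part of the patch) written against the
backend buffer-manager and nbtree APIs; the helper names suspend_and_resume_scan
and step_right_coupled are invented for illustration.

#include "postgres.h"
#include "access/nbtree.h"
#include "storage/bufmgr.h"
#include "utils/rel.h"

/* Suspend a scan: give up the read lock but keep the buffer pin, then
 * re-grab the lock when the scan resumes.  The pin is what keeps the
 * current item from being deleted or moved left while we are away. */
static void
suspend_and_resume_scan(Buffer buf)
{
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);	/* drop read lock, keep pin */

	/* ... control is outside nbtree; other backends may insert ... */

	LockBuffer(buf, BT_READ);				/* reacquire read lock on resume */
}

/* Move right with lock coupling: pin and read-lock the right sibling
 * before letting go of the current page.  Doing this only when moving
 * right (or up) is what keeps the protocol deadlock-free. */
static Buffer
step_right_coupled(Relation rel, Buffer buf)
{
	Page		page = BufferGetPage(buf);
	BTPageOpaque opaque = (BTPageOpaque) PageGetSpecialPointer(page);
	Buffer		nextbuf;

	Assert(!P_RIGHTMOST(opaque));
	nextbuf = _bt_getbuf(rel, opaque->btpo_next, BT_READ);
	_bt_relbuf(rel, buf);
	return nextbuf;
}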
diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index ac46681c61fd42d148ac854c60710a55b71e66ea..8ad9cc8b357b310ff72d3740fc53c72fdf2fdb98 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.92 2002/09/04 20:31:10 momjian Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.93 2002/10/20 20:47:31 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -603,12 +603,21 @@ btbulkdelete(PG_FUNCTION_ARGS)
 	 * loop, we skip most of the wrapper layers of index_getnext and
 	 * instead call _bt_step directly.	This implies holding buffer lock
 	 * on a target page throughout the loop over the page's tuples.
-	 * Initially, we have a read lock acquired by _bt_step when we stepped
-	 * onto the page.  If we find a tuple we need to delete, we trade in
-	 * the read lock for an exclusive write lock; after that, we hold the
-	 * write lock until we step off the page (fortunately, _bt_relbuf
-	 * doesn't care which kind of lock it's releasing).  This should
-	 * minimize the amount of work needed per page.
+	 *
+	 * Whenever we step onto a new page, we have to trade in the read
+	 * lock acquired by _bt_first or _bt_step for an exclusive write lock
+	 * (fortunately, _bt_relbuf doesn't care which kind of lock it's
+	 * releasing when it comes time for _bt_step to release our lock).
+	 *
+	 * Note that we exclusive-lock every leaf page, or at least every one
+	 * containing data items.  It sounds attractive to only exclusive-lock
+	 * those containing items we need to delete, but unfortunately that
+	 * is not safe: we could then pass a stopped indexscan, which in rare
+	 * cases could lead to deleting the item that scan needs to find when it
+	 * resumes.  (See _bt_restscan --- this could only happen if an indexscan
+	 * stops on a deletable item and then a page split moves that item
+	 * into a page further to its right, which the indexscan will have no
+	 * pin on.)
 	 */
 	scan = index_beginscan(NULL, rel, SnapshotAny, 0, (ScanKey) NULL);
 	so = (BTScanOpaque) scan->opaque;
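The "super-exclusive" lock the comment above depends on is the buffer manager's
cleanup lock.  The following sketch (not part of the patch) shows how the loop
upgrades its lock when it reaches a new page and why that upgrade cannot
overtake a stopped indexscan; the helper name lock_page_super_exclusive is
invented for illustration.

#include "postgres.h"
#include "storage/bufmgr.h"

static void
lock_page_super_exclusive(Buffer buf)
{
	/* Give up the read lock acquired by _bt_first/_bt_step; the pin stays. */
	LockBuffer(buf, BUFFER_LOCK_UNLOCK);

	/*
	 * LockBufferForCleanup returns only once we hold exclusive lock AND no
	 * other backend has the page pinned.  A suspended indexscan keeps its
	 * pin on the page it stopped on, so btbulkdelete blocks here rather
	 * than passing that scan and deleting an item it still needs.
	 */
	LockBufferForCleanup(buf);
}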
@@ -620,7 +629,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
 		Buffer		buf;
 		BlockNumber lockedBlock = InvalidBlockNumber;
 
-		/* we have the buffer pinned and locked */
+		/* we have the buffer pinned and read-locked */
 		buf = so->btso_curbuf;
 		Assert(BufferIsValid(buf));
 
@@ -637,65 +646,59 @@ btbulkdelete(PG_FUNCTION_ARGS)
 			CHECK_FOR_INTERRUPTS();
 
 			/* current is the next index tuple */
+			page = BufferGetPage(buf);
 			blkno = ItemPointerGetBlockNumber(current);
+
+			/*
+			 * Make sure we have a super-exclusive write lock on this page.
+			 *
+			 * We assume that only concurrent insertions, not deletions,
+			 * can occur while we're not holding the page lock (the
+			 * caller should hold a suitable relation lock to ensure
+			 * this). Therefore, no items can escape being scanned because
+			 * of this temporary lock release.
+			 */
+			if (blkno != lockedBlock)
+			{
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBufferForCleanup(buf);
+				lockedBlock = blkno;
+				/*
+				 * If the page was formerly rightmost but was split while we
+				 * didn't hold the lock, and ip_posid is pointing to item
+				 * 1, then ip_posid now points at the high key not a valid
+				 * data item. In this case we need to step forward.
+				 */
+				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+				if (current->ip_posid < P_FIRSTDATAKEY(opaque))
+					current->ip_posid = P_FIRSTDATAKEY(opaque);
+			}
+
 			offnum = ItemPointerGetOffsetNumber(current);
-			page = BufferGetPage(buf);
 			btitem = (BTItem) PageGetItem(page, PageGetItemId(page, offnum));
 			itup = &btitem->bti_itup;
 			htup = &(itup->t_tid);
 
 			if (callback(htup, callback_state))
 			{
-				/*
-				 * If this is first deletion on this page, trade in read
-				 * lock for a really-exclusive write lock.	Then, step
-				 * back one and re-examine the item, because other
-				 * backends might have inserted item(s) while we weren't
-				 * holding the lock!
-				 *
-				 * We assume that only concurrent insertions, not deletions,
-				 * can occur while we're not holding the page lock (the
-				 * caller should hold a suitable relation lock to ensure
-				 * this). Therefore, the item we want to delete is either
-				 * in the same slot as before, or some slot to its right.
-				 * Rechecking the same slot is necessary and sufficient to
-				 * get back in sync after any insertions.
-				 */
-				if (blkno != lockedBlock)
-				{
-					LockBuffer(buf, BUFFER_LOCK_UNLOCK);
-					LockBufferForCleanup(buf);
-					lockedBlock = blkno;
-				}
-				else
-				{
-					/* Okay to delete the item from the page */
-					_bt_itemdel(rel, buf, current);
-
-					/* Mark buffer dirty, but keep the lock and pin */
-					WriteNoReleaseBuffer(buf);
-
-					tuples_removed += 1;
-				}
+				/* Okay to delete the item from the page */
+				_bt_itemdel(rel, buf, current);
+
+				/* Mark buffer dirty, but keep the lock and pin */
+				WriteNoReleaseBuffer(buf);
+
+				tuples_removed += 1;
 
 				/*
-				 * In either case, we now need to back up the scan one
-				 * item, so that the next cycle will re-examine the same
-				 * offnum on this page.
+				 * We now need to back up the scan one item, so that the next
+				 * cycle will re-examine the same offnum on this page (which
+				 * now holds the next item).
 				 *
 				 * For now, just hack the current-item index.  Will need to
 				 * be smarter when deletion includes removal of empty
 				 * index pages.
-				 *
-				 * We must decrement ip_posid in all cases but one: if the
-				 * page was formerly rightmost but was split while we
-				 * didn't hold the lock, and ip_posid is pointing to item
-				 * 1, then ip_posid now points at the high key not a valid
-				 * data item. In this case we do want to step forward.
 				 */
-				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-				if (current->ip_posid >= P_FIRSTDATAKEY(opaque))
-					current->ip_posid--;
+				current->ip_posid--;
 			}
 			else
 				num_index_tuples += 1;
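For context, the callback consulted near the top of this hunk follows the
IndexBulkDeleteCallback signature from access/genam.h; btbulkdelete itself has
no idea which heap tuples are dead and simply asks the caller.  Below is a
hypothetical example (not part of the patch) of what a caller such as VACUUM
might supply; the DeadTidListState structure and dead_tid_callback name are
invented.

#include "postgres.h"
#include "access/genam.h"
#include "storage/itemptr.h"

typedef struct DeadTidListState
{
	ItemPointer dead_tids;		/* array of heap TIDs being removed */
	int			num_dead;
} DeadTidListState;

/* Return true if the index item pointing at htid should be deleted. */
static bool
dead_tid_callback(ItemPointer htid, void *state)
{
	DeadTidListState *dt = (DeadTidListState *) state;
	int			i;

	for (i = 0; i < dt->num_dead; i++)
	{
		if (ItemPointerEquals(htid, &dt->dead_tids[i]))
			return true;
	}
	return false;
}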
@@ -717,6 +720,16 @@ btbulkdelete(PG_FUNCTION_ARGS)
 
 /*
  * Restore scan position when btgettuple is called to continue a scan.
+ *
+ * This is nontrivial because concurrent insertions might have moved the
+ * index tuple we stopped on.  We assume the tuple can only have moved to
+ * the right from our stop point, because we kept a pin on the buffer,
+ * and so no deletion can have occurred on that page.
+ *
+ * On entry, we have a pin but no read lock on the buffer that contained
+ * the index tuple we stopped the scan on.  On exit, we have pin and read
+ * lock on the buffer that now contains that index tuple, and the scandesc's
+ * current position is updated to point at it.
  */
 static void
 _bt_restscan(IndexScanDesc scan)
@@ -729,13 +742,14 @@ _bt_restscan(IndexScanDesc scan)
 	OffsetNumber offnum = ItemPointerGetOffsetNumber(current),
 				maxoff;
 	BTPageOpaque opaque;
+	Buffer		nextbuf;
 	ItemPointerData target = so->curHeapIptr;
 	BTItem		item;
 	BlockNumber blkno;
 
 	/*
-	 * Get back the read lock we were holding on the buffer. (We still
-	 * have a reference-count pin on it, so need not get that.)
+	 * Reacquire read lock on the buffer.  (We should still have
+	 * a reference-count pin on it, so we need not get that.)
 	 */
 	LockBuffer(buf, BT_READ);
 
@@ -747,7 +761,7 @@ _bt_restscan(IndexScanDesc scan)
 	 * We use this as flag when first index tuple on page is deleted but
 	 * we do not move left (this would slowdown vacuum) - so we set
 	 * current->ip_posid before first index tuple on the current page
-	 * (_bt_step will move it right)...
+	 * (_bt_step will move it right)...  XXX still needed?
 	 */
 	if (!ItemPointerIsValid(&target))
 	{
@@ -758,7 +772,7 @@ _bt_restscan(IndexScanDesc scan)
 
 	/*
 	 * The item we were on may have moved right due to insertions. Find it
-	 * again.
+	 * again.  We use the heap TID to identify the item uniquely.
 	 */
 	for (;;)
 	{
@@ -774,28 +788,33 @@ _bt_restscan(IndexScanDesc scan)
 				target.ip_blkid.bi_lo &&
 				item->bti_itup.t_tid.ip_posid == target.ip_posid)
 			{
+				/* Found it */
 				current->ip_posid = offnum;
 				return;
 			}
 		}
 
 		/*
-		 * By here, the item we're looking for moved right at least one
-		 * page
+		 * The item we're looking for moved right at least one page, so
+		 * move right.  We are careful here to pin and read-lock the next
+		 * page before releasing the current one.  This ensures that a
+		 * concurrent btbulkdelete scan cannot pass our position --- if it
+		 * did, it might be able to reach and delete our target item before
+		 * we can find it again.
 		 */
 		if (P_RIGHTMOST(opaque))
 			elog(FATAL, "_bt_restscan: my bits moved right off the end of the world!"
 				 "\n\tRecreate index %s.", RelationGetRelationName(rel));
 
 		blkno = opaque->btpo_next;
+		nextbuf = _bt_getbuf(rel, blkno, BT_READ);
 		_bt_relbuf(rel, buf);
-		buf = _bt_getbuf(rel, blkno, BT_READ);
+		so->btso_curbuf = buf = nextbuf;
 		page = BufferGetPage(buf);
 		maxoff = PageGetMaxOffsetNumber(page);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 		offnum = P_FIRSTDATAKEY(opaque);
 		ItemPointerSet(current, blkno, offnum);
-		so->btso_curbuf = buf;
 	}
 }
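For completeness, here is a sketch (not part of the patch) of the saving side of
this protocol: before control leaves nbtree, btgettuple remembers the heap TID
of the item it stopped on and drops the read lock while keeping the pin, which
is exactly the state _bt_restscan expects on entry.  This only paraphrases the
behavior; the helper name suspend_scan_position and the way the current item is
passed in are hypothetical.

#include "postgres.h"
#include "access/nbtree.h"
#include "access/relscan.h"
#include "storage/bufmgr.h"

static void
suspend_scan_position(IndexScanDesc scan, BTItem curitem)
{
	BTScanOpaque so = (BTScanOpaque) scan->opaque;

	/* Remember the heap TID; _bt_restscan uses it to re-find the item
	 * even if concurrent insertions have pushed it to the right. */
	so->curHeapIptr = curitem->bti_itup.t_tid;

	/* Drop the read lock but keep the pin on the current page, so the
	 * item cannot be deleted or moved left while the scan is suspended. */
	LockBuffer(so->btso_curbuf, BUFFER_LOCK_UNLOCK);
}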