From cd255bb07052a8db8fd3c435516e77d7c61f662e Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 23 Nov 2001 23:41:54 +0000
Subject: [PATCH] Fix boundary condition in btbulkdelete: don't examine high
 key in case where rightmost index page splits while we are waiting to obtain
 exclusive lock on it.  Not clear this would actually hurt (probably the
 callback would always fail), but better safe than sorry. Also, improve
 comments describing concurrency considerations in this code.

---
 src/backend/access/nbtree/nbtree.c | 32 +++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index b8eef7e06b0..6c7c4350aaa 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -12,7 +12,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.85 2001/11/10 23:51:13 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.86 2001/11/23 23:41:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -592,6 +592,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
 			BlockNumber blkno;
 			OffsetNumber offnum;
 			BTItem		btitem;
+			BTPageOpaque opaque;
 			IndexTuple	itup;
 			ItemPointer htup;
 
@@ -608,9 +609,17 @@ btbulkdelete(PG_FUNCTION_ARGS)
 				/*
 				 * If this is first deletion on this page, trade in read
 				 * lock for a really-exclusive write lock.	Then, step
-				 * back one and re-examine the item, because someone else
-				 * might have inserted an item while we weren't holding
+				 * back one and re-examine the item, because other backends
+				 * might have inserted item(s) while we weren't holding
 				 * the lock!
+				 *
+				 * We assume that only concurrent insertions, not deletions,
+				 * can occur while we're not holding the page lock (the caller
+				 * should hold a suitable relation lock to ensure this).
+				 * Therefore, the item we want to delete is either in the
+				 * same slot as before, or some slot to its right.
+				 * Rechecking the same slot is necessary and sufficient to
+				 * get back in sync after any insertions.
 				 */
 				if (blkno != lockedBlock)
 				{
@@ -620,7 +629,7 @@ btbulkdelete(PG_FUNCTION_ARGS)
 				}
 				else
 				{
-					/* Delete the item from the page */
+					/* Okay to delete the item from the page */
 					_bt_itemdel(rel, buf, current);
 
 					/* Mark buffer dirty, but keep the lock and pin */
@@ -630,14 +639,23 @@ btbulkdelete(PG_FUNCTION_ARGS)
 				}
 
 				/*
-				 * We need to back up the scan one item so that the next
-				 * cycle will re-examine the same offnum on this page.
+				 * In either case, we now need to back up the scan one item,
+				 * so that the next cycle will re-examine the same offnum on
+				 * this page.
 				 *
 				 * For now, just hack the current-item index.  Will need to
 				 * be smarter when deletion includes removal of empty
 				 * index pages.
+				 *
+				 * We must decrement ip_posid in all cases but one: if the
+				 * page was formerly rightmost but was split while we didn't
+				 * hold the lock, and ip_posid is pointing to item 1, then
+				 * ip_posid now points at the high key not a valid data item.
+				 * In this case we do want to step forward.
 				 */
-				current->ip_posid--;
+				opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+				if (current->ip_posid >= P_FIRSTDATAKEY(opaque))
+					current->ip_posid--;
 			}
 			else
 				num_index_tuples += 1;
-- 
GitLab