From cf23b75b4dce75151df7164ed72263e66b758ae9 Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Tue, 22 Apr 2008 17:52:43 +0000
Subject: [PATCH] Fix using too many LWLocks bug, reported by Craig Ringer
 <craig@postnewspapers.com.au>. It was my mistake, I missed limitation of
 number of held locks, now GIN doesn't use continiuous locks, but still hold
 buffers pinned to prevent interference with vacuum's deletion algorithm.

Backpatch is needed.
---
 src/backend/access/gin/ginget.c | 362 ++++++++++++++------------------
 src/include/access/gin.h        |  14 +-
 2 files changed, 165 insertions(+), 211 deletions(-)

diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 26abaa76afa..b95fd7eed91 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *			$PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.13 2008/04/14 17:05:33 tgl Exp $
+ *			$PostgreSQL: pgsql/src/backend/access/gin/ginget.c,v 1.14 2008/04/22 17:52:43 teodor Exp $
  *-------------------------------------------------------------------------
  */
 
@@ -29,58 +29,16 @@ findItemInPage(Page page, ItemPointer item, OffsetNumber *off)
 		/* page was deleted by concurrent  vacuum */
 		return false;
 
-	if (*off > maxoff || *off == InvalidOffsetNumber)
-		res = -1;
-	else
-		res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
-
-	if (res == 0)
-	{
-		/* page isn't changed */
-		return true;
-	}
-	else if (res > 0)
-	{
-		/*
-		 * some items was added before our position, look further to find it
-		 * or first greater
-		 */
-
-		(*off)++;
-		for (; *off <= maxoff; (*off)++)
-		{
-			res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
-
-			if (res == 0)
-				return true;
+	/*
+	 * scan page to find equal or first greater value
+	 */
 
-			if (res < 0)
-			{
-				(*off)--;
-				return true;
-			}
-		}
-	}
-	else
+	for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
 	{
-		/*
-		 * some items was deleted before our position, look from begining to
-		 * find it or first greater
-		 */
-
-		for (*off = FirstOffsetNumber; *off <= maxoff; (*off)++)
-		{
-			res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
-
-			if (res == 0)
-				return true;
+		res = compareItemPointers(item, (ItemPointer) GinDataPageGetItem(page, *off));
 
-			if (res < 0)
-			{
-				(*off)--;
-				return true;
-			}
-		}
+		if (res <= 0)
+			return true;
 	}
 
 	return false;
@@ -91,119 +49,87 @@ findItemInPage(Page page, ItemPointer item, OffsetNumber *off)
  * Stop* functions unlock buffer (but don't release!)
  */
 static void
-startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry, bool firstCall)
+startScanEntry(Relation index, GinState *ginstate, GinScanEntry entry)
 {
+	GinBtreeData 	btreeEntry;
+	GinBtreeStack  *stackEntry;
+	Page			page;
+	bool			needUnlock = TRUE;
+
 	if (entry->master != NULL)
 	{
 		entry->isFinished = entry->master->isFinished;
 		return;
 	}
 
-	if (firstCall)
-	{
-		/*
-		 * at first call we should find entry, and begin scan of posting tree
-		 * or just store posting list in memory
-		 */
-		GinBtreeData btreeEntry;
-		GinBtreeStack *stackEntry;
-		Page		page;
-		bool		needUnlock = TRUE;
-
-		prepareEntryScan(&btreeEntry, index, entry->entry, ginstate);
-		btreeEntry.searchMode = TRUE;
-		stackEntry = ginFindLeafPage(&btreeEntry, NULL);
-		page = BufferGetPage(stackEntry->buffer);
-
-		entry->isFinished = TRUE;
-		entry->buffer = InvalidBuffer;
-		entry->offset = InvalidOffsetNumber;
-		entry->list = NULL;
-		entry->nlist = 0;
-		entry->reduceResult = FALSE;
-		entry->predictNumberResult = 0;
-
-		if (btreeEntry.findItem(&btreeEntry, stackEntry))
-		{
-			IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
-
-			if (GinIsPostingTree(itup))
-			{
-				BlockNumber rootPostingTree = GinGetPostingTree(itup);
-				GinPostingTreeScan *gdi;
-				Page		page;
+	/*
+	 * we should find entry, and begin scan of posting tree
+	 * or just store posting list in memory
+	 */
 
-				LockBuffer(stackEntry->buffer, GIN_UNLOCK);
-				needUnlock = FALSE;
-				gdi = prepareScanPostingTree(index, rootPostingTree, TRUE);
+	prepareEntryScan(&btreeEntry, index, entry->entry, ginstate);
+	btreeEntry.searchMode = TRUE;
+	stackEntry = ginFindLeafPage(&btreeEntry, NULL);
+	page = BufferGetPage(stackEntry->buffer);
 
-				entry->buffer = scanBeginPostingTree(gdi);
-				IncrBufferRefCount(entry->buffer);
+	entry->isFinished = TRUE;
+	entry->buffer = InvalidBuffer;
+	entry->offset = InvalidOffsetNumber;
+	entry->list = NULL;
+	entry->nlist = 0;
+	entry->reduceResult = FALSE;
+	entry->predictNumberResult = 0;
 
-				page = BufferGetPage(entry->buffer);
-				entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff;
+	if (btreeEntry.findItem(&btreeEntry, stackEntry))
+	{
+		IndexTuple	itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, stackEntry->off));
 
-				freeGinBtreeStack(gdi->stack);
-				pfree(gdi);
-				entry->isFinished = FALSE;
-			}
-			else if (GinGetNPosting(itup) > 0)
-			{
-				entry->nlist = GinGetNPosting(itup);
-				entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist);
-				memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist);
-				entry->isFinished = FALSE;
-			}
-		}
+		if (GinIsPostingTree(itup))
+		{
+			BlockNumber rootPostingTree = GinGetPostingTree(itup);
+			GinPostingTreeScan *gdi;
+			Page		page;
 
-		if (needUnlock)
 			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
-		freeGinBtreeStack(stackEntry);
-	}
-	else if (entry->buffer != InvalidBuffer)
-	{
-		/* we should find place where we was stopped */
-		BlockNumber blkno;
-		Page		page;
+			needUnlock = FALSE;
+			gdi = prepareScanPostingTree(index, rootPostingTree, TRUE);
 
-		LockBuffer(entry->buffer, GIN_SHARE);
-
-		if (!ItemPointerIsValid(&entry->curItem))
-			/* start position */
-			return;
-		Assert(entry->offset != InvalidOffsetNumber);
+			entry->buffer = scanBeginPostingTree(gdi);
+			/*
+			 * We keep buffer pinned because we need to prevent deletition
+			 * page during scan. See GIN's vacuum implementation. RefCount
+			 * is increased to keep buffer pinned after freeGinBtreeStack() call.
+			 */
+			IncrBufferRefCount(entry->buffer);
 
-		page = BufferGetPage(entry->buffer);
+			page = BufferGetPage(entry->buffer);
+			entry->predictNumberResult = gdi->stack->predictNumber * GinPageGetOpaque(page)->maxoff;
 
-		/* try to find curItem in current buffer */
-		if (findItemInPage(page, &entry->curItem, &entry->offset))
-			return;
+			/*
+			 * Keep page content in memory to prevent durable page locking
+			 */
+			entry->list = (ItemPointerData *) palloc( BLCKSZ );
+			entry->nlist = GinPageGetOpaque(page)->maxoff;
+			memcpy( entry->list, GinDataPageGetItem(page, FirstOffsetNumber), 
+						GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData) );
 
-		/* walk to right */
-		while ((blkno = GinPageGetOpaque(page)->rightlink) != InvalidBlockNumber)
-		{
 			LockBuffer(entry->buffer, GIN_UNLOCK);
-			entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno);
-			LockBuffer(entry->buffer, GIN_SHARE);
-			page = BufferGetPage(entry->buffer);
-
-			entry->offset = InvalidOffsetNumber;
-			if (findItemInPage(page, &entry->curItem, &entry->offset))
-				return;
+			freeGinBtreeStack(gdi->stack);
+			pfree(gdi);
+			entry->isFinished = FALSE;
+		}
+		else if (GinGetNPosting(itup) > 0)
+		{
+			entry->nlist = GinGetNPosting(itup);
+			entry->list = (ItemPointerData *) palloc(sizeof(ItemPointerData) * entry->nlist);
+			memcpy(entry->list, GinGetPosting(itup), sizeof(ItemPointerData) * entry->nlist);
+			entry->isFinished = FALSE;
 		}
-
-		/*
-		 * curItem and any greated items was deleted by concurrent vacuum, so
-		 * we finished scan with currrent entry
-		 */
 	}
-}
 
-static void
-stopScanEntry(GinScanEntry entry)
-{
-	if (entry->buffer != InvalidBuffer)
-		LockBuffer(entry->buffer, GIN_UNLOCK);
+	if (needUnlock)
+			LockBuffer(stackEntry->buffer, GIN_UNLOCK);
+	freeGinBtreeStack(stackEntry);
 }
 
 static void
@@ -211,47 +137,38 @@ startScanKey(Relation index, GinState *ginstate, GinScanKey key)
 {
 	uint32		i;
 
+	if (!key->firstCall)
+		return;
+
 	for (i = 0; i < key->nentries; i++)
-		startScanEntry(index, ginstate, key->scanEntry + i, key->firstCall);
+		startScanEntry(index, ginstate, key->scanEntry + i);
 
-	if (key->firstCall)
-	{
-		memset(key->entryRes, TRUE, sizeof(bool) * key->nentries);
-		key->isFinished = FALSE;
-		key->firstCall = FALSE;
+	memset(key->entryRes, TRUE, sizeof(bool) * key->nentries);
+	key->isFinished = FALSE;
+	key->firstCall = FALSE;
 
-		if (GinFuzzySearchLimit > 0)
-		{
-			/*
-			 * If all of keys more than threshold we will try to reduce
-			 * result, we hope (and only hope, for intersection operation of
-			 * array our supposition isn't true), that total result will not
-			 * more than minimal predictNumberResult.
-			 */
+	if (GinFuzzySearchLimit > 0)
+	{
+		/*
+		 * If all of keys more than threshold we will try to reduce
+		 * result, we hope (and only hope, for intersection operation of
+		 * array our supposition isn't true), that total result will not
+		 * more than minimal predictNumberResult.
+		 */
 
-			for (i = 0; i < key->nentries; i++)
-				if (key->scanEntry[i].predictNumberResult <= key->nentries * GinFuzzySearchLimit)
-					return;
+		for (i = 0; i < key->nentries; i++)
+			if (key->scanEntry[i].predictNumberResult <= key->nentries * GinFuzzySearchLimit)
+				return;
 
-			for (i = 0; i < key->nentries; i++)
-				if (key->scanEntry[i].predictNumberResult > key->nentries * GinFuzzySearchLimit)
-				{
-					key->scanEntry[i].predictNumberResult /= key->nentries;
-					key->scanEntry[i].reduceResult = TRUE;
-				}
-		}
+		for (i = 0; i < key->nentries; i++)
+			if (key->scanEntry[i].predictNumberResult > key->nentries * GinFuzzySearchLimit)
+			{
+				key->scanEntry[i].predictNumberResult /= key->nentries;
+				key->scanEntry[i].reduceResult = TRUE;
+			}
 	}
 }
 
-static void
-stopScanKey(GinScanKey key)
-{
-	uint32		i;
-
-	for (i = 0; i < key->nentries; i++)
-		stopScanEntry(key->scanEntry + i);
-}
-
 static void
 startScan(IndexScanDesc scan)
 {
@@ -262,44 +179,82 @@ startScan(IndexScanDesc scan)
 		startScanKey(scan->indexRelation, &so->ginstate, so->keys + i);
 }
 
-static void
-stopScan(IndexScanDesc scan)
-{
-	uint32		i;
-	GinScanOpaque so = (GinScanOpaque) scan->opaque;
-
-	for (i = 0; i < so->nkeys; i++)
-		stopScanKey(so->keys + i);
-}
-
-
+/*
+ * Gets next ItemPointer from PostingTree. Note, that we copy
+ * page into GinScanEntry->list array and unlock page, but keep it pinned
+ * to prevent interference with vacuum
+ */
 static void
 entryGetNextItem(Relation index, GinScanEntry entry)
 {
-	Page		page = BufferGetPage(entry->buffer);
+	Page		page;
+	BlockNumber blkno;
 
-	entry->offset++;
-	if (entry->offset <= GinPageGetOpaque(page)->maxoff && GinPageGetOpaque(page)->maxoff >= FirstOffsetNumber)
+	for(;;)
 	{
-		entry->curItem = *(ItemPointerData *) GinDataPageGetItem(page, entry->offset);
-	}
-	else
-	{
-		BlockNumber blkno = GinPageGetOpaque(page)->rightlink;
+		entry->offset++;
 
-		LockBuffer(entry->buffer, GIN_UNLOCK);
-		if (blkno == InvalidBlockNumber)
+		if (entry->offset <= entry->nlist)
 		{
-			ReleaseBuffer(entry->buffer);
-			entry->buffer = InvalidBuffer;
-			entry->isFinished = TRUE;
+			entry->curItem = entry->list[entry->offset - 1];
+			return;
 		}
-		else
+
+		LockBuffer(entry->buffer, GIN_SHARE);
+		page = BufferGetPage(entry->buffer);
+		for(;;)
 		{
+			/*
+			 * It's needed to go by right link. During that we should refind
+			 * first ItemPointer greater that stored
+			 */
+
+			blkno = GinPageGetOpaque(page)->rightlink;
+
+			LockBuffer(entry->buffer, GIN_UNLOCK);
+			if (blkno == InvalidBlockNumber)
+			{
+				ReleaseBuffer(entry->buffer);
+				ItemPointerSet(&entry->curItem, InvalidBlockNumber, InvalidOffsetNumber);
+				entry->buffer = InvalidBuffer;
+				entry->isFinished = TRUE;
+				return;
+			}
+
 			entry->buffer = ReleaseAndReadBuffer(entry->buffer, index, blkno);
 			LockBuffer(entry->buffer, GIN_SHARE);
+			page = BufferGetPage(entry->buffer);
+
 			entry->offset = InvalidOffsetNumber;
-			entryGetNextItem(index, entry);
+			if (!ItemPointerIsValid(&entry->curItem) || findItemInPage(page, &entry->curItem, &entry->offset))
+			{
+				/*
+				 * Found position equal to or greater than stored
+				 */
+				entry->nlist = GinPageGetOpaque(page)->maxoff;
+				memcpy( entry->list, GinDataPageGetItem(page, FirstOffsetNumber), 
+							GinPageGetOpaque(page)->maxoff * sizeof(ItemPointerData) );
+
+				LockBuffer(entry->buffer, GIN_UNLOCK);
+
+				if ( !ItemPointerIsValid(&entry->curItem) || 
+					 compareItemPointers( &entry->curItem, entry->list + entry->offset - 1 ) == 0 )
+				{
+					/*
+					 * First pages are deleted or empty, or we found exact position,
+					 * so break inner loop and continue outer one.
+					 */
+
+					 break;
+				}
+			
+				/*
+				 * Find greater than entry->curItem position, store it.
+				 */
+				entry->curItem = entry->list[entry->offset - 1];
+
+				return;
+			}
 		}
 	}
 }
@@ -319,7 +274,7 @@ entryGetItem(Relation index, GinScanEntry entry)
 		entry->isFinished = entry->master->isFinished;
 		entry->curItem = entry->master->curItem;
 	}
-	else if (entry->list)
+	else if (!BufferIsValid(entry->buffer))
 	{
 		entry->offset++;
 		if (entry->offset <= entry->nlist)
@@ -527,8 +482,6 @@ gingetbitmap(PG_FUNCTION_ARGS)
 		ntids++;
 	}
 
-	stopScan(scan);
-
 	PG_RETURN_INT64(ntids);
 }
 
@@ -550,7 +503,6 @@ gingettuple(PG_FUNCTION_ARGS)
 
 	startScan(scan);
 	res = scanGetItem(scan, &scan->xs_ctup.t_self, &scan->xs_recheck);
-	stopScan(scan);
 
 	PG_RETURN_BOOL(res);
 }
diff --git a/src/include/access/gin.h b/src/include/access/gin.h
index 7a343c62c49..614e8566b90 100644
--- a/src/include/access/gin.h
+++ b/src/include/access/gin.h
@@ -4,7 +4,7 @@
  *
  *	Copyright (c) 2006-2008, PostgreSQL Global Development Group
  *
- *	$PostgreSQL: pgsql/src/include/access/gin.h,v 1.17 2008/04/10 22:25:25 tgl Exp $
+ *	$PostgreSQL: pgsql/src/include/access/gin.h,v 1.18 2008/04/22 17:52:43 teodor Exp $
  *--------------------------------------------------------------------------
  */
 
@@ -356,14 +356,16 @@ typedef struct GinScanEntryData
 	/* entry, got from extractQueryFn */
 	Datum		entry;
 
-	/* current ItemPointer to heap, its offset in buffer and buffer */
-	ItemPointerData curItem;
-	OffsetNumber offset;
+	/* Current page in posting tree */
 	Buffer		buffer;
 
-	/* in case of Posing list */
+	/* current ItemPointer to heap */
+	ItemPointerData curItem;
+
+	/* used for Posting list and one page in Posting tree */
 	ItemPointerData *list;
-	uint32		nlist;
+	uint32			 nlist;
+	OffsetNumber     offset;
 
 	bool		isFinished;
 	bool		reduceResult;
-- 
GitLab