From e20c70cb0fa74d5bffa080e21a99b44bf0768667 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 29 Jan 2014 17:53:39 +0200
Subject: [PATCH] Allow skipping some items in a multi-key GIN search.

In a multi-key search, ie. something like "col @> 'foo' AND col @> 'bar'",
as soon as we find the next item that matches the first criteria, we don't
need to check the second criteria for TIDs smaller the first match. That
saves a lot of effort, especially if one of the terms is rare, while the
second occurs very frequently.

Based on ideas from Alexander Korotkov's fast scan patch.
---
 src/backend/access/gin/ginget.c | 460 +++++++++++++++++---------------
 1 file changed, 250 insertions(+), 210 deletions(-)

diff --git a/src/backend/access/gin/ginget.c b/src/backend/access/gin/ginget.c
index 4bdbd4525d5..40abb3971e2 100644
--- a/src/backend/access/gin/ginget.c
+++ b/src/backend/access/gin/ginget.c
@@ -67,29 +67,6 @@ callConsistentFn(GinState *ginstate, GinScanKey key)
 									 PointerGetDatum(key->queryCategories)));
 }
 
-/*
- * Tries to refind previously taken ItemPointer on a posting page.
- */
-static bool
-needToStepRight(Page page, ItemPointer item)
-{
-	if (GinPageGetOpaque(page)->flags & GIN_DELETED)
-		/* page was deleted by concurrent vacuum */
-		return true;
-
-	if (ginCompareItemPointers(item, GinDataPageGetRightBound(page)) > 0
-			&& !GinPageRightMost(page))
-	{
-		/*
-		 * the item we're looking is > the right bound of the page, so it
-		 * can't be on this page.
-		 */
-		return true;
-	}
-
-	return false;
-}
-
 /*
  * Goes to the next page if current offset is outside of bounds
  */
@@ -447,8 +424,7 @@ restartScanEntry:
 			page = BufferGetPage(entry->buffer);
 
 			/*
-			 * Copy page content to memory to avoid keeping it locked for
-			 * a long time.
+			 * Load the first page into memory.
 			 */
 			entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
 
@@ -518,88 +494,78 @@ startScan(IndexScanDesc scan)
 }
 
 /*
- * Gets next ItemPointer from PostingTree. Note, that we copy
- * page into GinScanEntry->list array and unlock page, but keep it pinned
- * to prevent interference with vacuum
+ * Load the next batch of item pointers from a posting tree.
+ *
+ * Note that we copy the page into GinScanEntry->list array and unlock it, but
+ * keep it pinned to prevent interference with vacuum.
  */
 static void
-entryGetNextItem(GinState *ginstate, GinScanEntry entry)
+entryLoadMoreItems(GinState *ginstate, GinScanEntry entry, ItemPointerData advancePast)
 {
 	Page		page;
 	int			i;
 
+	LockBuffer(entry->buffer, GIN_SHARE);
+	page = BufferGetPage(entry->buffer);
 	for (;;)
 	{
-		if (entry->offset < entry->nlist)
+		entry->offset = InvalidOffsetNumber;
+		if (entry->list)
 		{
-			entry->curItem = entry->list[entry->offset++];
-			return;
+			pfree(entry->list);
+			entry->list = NULL;
+			entry->nlist = 0;
 		}
 
-		LockBuffer(entry->buffer, GIN_SHARE);
-		page = BufferGetPage(entry->buffer);
-		for (;;)
+		/*
+		 * We've processed all the entries on this page. If it was the last
+		 * page in the tree, we're done.
+		 */
+		if (GinPageRightMost(page))
 		{
-			/*
-			 * It's needed to go by right link. During that we should refind
-			 * first ItemPointer greater that stored
-			 */
-			if (GinPageRightMost(page))
-			{
-				UnlockReleaseBuffer(entry->buffer);
-				ItemPointerSetInvalid(&entry->curItem);
-				entry->buffer = InvalidBuffer;
-				entry->isFinished = TRUE;
-				return;
-			}
+			UnlockReleaseBuffer(entry->buffer);
+			entry->buffer = InvalidBuffer;
+			entry->isFinished = TRUE;
+			return;
+		}
 
-			entry->buffer = ginStepRight(entry->buffer,
-										 ginstate->index,
-										 GIN_SHARE);
-			page = BufferGetPage(entry->buffer);
+		if (GinPageGetOpaque(page)->flags & GIN_DELETED)
+			continue;		/* page was deleted by concurrent vacuum */
 
-			entry->offset = InvalidOffsetNumber;
-			if (entry->list)
-			{
-				pfree(entry->list);
-				entry->list = NULL;
-			}
+		/*
+		 * Step to next page, following the right link. then find the first
+		 * ItemPointer greater than advancePast.
+		 */
+		entry->buffer = ginStepRight(entry->buffer,
+									 ginstate->index,
+									 GIN_SHARE);
+		page = BufferGetPage(entry->buffer);
 
+		/*
+		 * The first item > advancePast might not be on this page, but
+		 * somewhere to the right, if the page was split, or a non-match from
+		 * another key in the query allowed us to skip some items from this
+		 * entry. Keep following the right-links until we re-find the correct
+		 * page.
+		 */
+		if (!GinPageRightMost(page) &&
+			ginCompareItemPointers(&advancePast, GinDataPageGetRightBound(page)) >= 0)
+		{
 			/*
-			 * If the page was concurrently split, we have to re-find the
-			 * item we were stopped on. If the page was split more than once,
-			 * the item might not be on this page, but somewhere to the right.
-			 * Keep following the right-links until we re-find the correct
-			 * page.
+			 * the item we're looking is > the right bound of the page, so it
+			 * can't be on this page.
 			 */
-			if (ItemPointerIsValid(&entry->curItem) &&
-				needToStepRight(page, &entry->curItem))
-			{
-				continue;
-			}
+			continue;
+		}
 
-			entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
+		entry->list = GinDataLeafPageGetItems(page, &entry->nlist);
 
-			/* re-find the item we were stopped on. */
-			if (ItemPointerIsValid(&entry->curItem))
-			{
-				for (i = 0; i < entry->nlist; i++)
-				{
-					if (ginCompareItemPointers(&entry->curItem,
-											   &entry->list[i]) < 0)
-					{
-						LockBuffer(entry->buffer, GIN_UNLOCK);
-						entry->offset = i + 1;
-						entry->curItem = entry->list[entry->offset - 1];
-						return;
-					}
-				}
-			}
-			else
+		for (i = 0; i < entry->nlist; i++)
+		{
+			if (ginCompareItemPointers(&advancePast, &entry->list[i]) < 0)
 			{
 				LockBuffer(entry->buffer, GIN_UNLOCK);
-				entry->offset = 1; /* scan all items on the page. */
-				entry->curItem = entry->list[entry->offset - 1];
+				entry->offset = i;
 				return;
 			}
 		}
@@ -610,10 +576,10 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
 #define dropItem(e) ( gin_rand() > ((double)GinFuzzySearchLimit)/((double)((e)->predictNumberResult)) )
 
 /*
- * Sets entry->curItem to next heap item pointer for one entry of one scan key,
- * or sets entry->isFinished to TRUE if there are no more.
+ * Sets entry->curItem to next heap item pointer > advancePast, for one entry
+ * of one scan key, or sets entry->isFinished to TRUE if there are no more.
  *
- * Item pointers must be returned in ascending order.
+ * Item pointers are returned in ascending order.
  *
  * Note: this can return a "lossy page" item pointer, indicating that the
  * entry potentially matches all items on that heap page.  However, it is
@@ -623,12 +589,20 @@ entryGetNextItem(GinState *ginstate, GinScanEntry entry)
  * current implementation this is guaranteed by the behavior of tidbitmaps.
  */
 static void
-entryGetItem(GinState *ginstate, GinScanEntry entry)
+entryGetItem(GinState *ginstate, GinScanEntry entry,
+			 ItemPointerData advancePast)
 {
 	Assert(!entry->isFinished);
 
+	Assert(!ItemPointerIsValid(&entry->curItem) ||
+		   ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+
 	if (entry->matchBitmap)
 	{
+		/* A bitmap result */
+		BlockNumber advancePastBlk = GinItemPointerGetBlockNumber(&advancePast);
+		OffsetNumber advancePastOff = GinItemPointerGetOffsetNumber(&advancePast);
+
 		do
 		{
 			if (entry->matchResult == NULL ||
@@ -645,6 +619,18 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
 					break;
 				}
 
+				/*
+				 * If all the matches on this page are <= advancePast, skip
+				 * to next page.
+				 */
+				if (entry->matchResult->blockno < advancePastBlk ||
+					(entry->matchResult->blockno == advancePastBlk &&
+					 entry->matchResult->offsets[entry->offset] <= advancePastOff))
+				{
+					entry->offset = entry->matchResult->ntuples;
+					continue;
+				}
+
 				/*
 				 * Reset counter to the beginning of entry->matchResult. Note:
 				 * entry->offset is still greater than matchResult->ntuples if
@@ -670,6 +656,17 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
 				break;
 			}
 
+			if (entry->matchResult->blockno == advancePastBlk)
+			{
+				/*
+				 * Skip to the right offset on this page. We already checked
+				 * in above loop that there is at least one item > advancePast
+				 * on the page.
+				 */
+				while (entry->matchResult->offsets[entry->offset] <= advancePastOff)
+					entry->offset++;
+			}
+
 			ItemPointerSet(&entry->curItem,
 						   entry->matchResult->blockno,
 						   entry->matchResult->offsets[entry->offset]);
@@ -678,29 +675,48 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
 	}
 	else if (!BufferIsValid(entry->buffer))
 	{
-		entry->offset++;
-		if (entry->offset <= entry->nlist)
-			entry->curItem = entry->list[entry->offset - 1];
-		else
+		/* A posting list from an entry tuple  */
+		do
 		{
-			ItemPointerSetInvalid(&entry->curItem);
-			entry->isFinished = TRUE;
-		}
+			if (entry->offset >= entry->nlist)
+			{
+				ItemPointerSetInvalid(&entry->curItem);
+				entry->isFinished = TRUE;
+				break;
+			}
+
+			entry->curItem = entry->list[entry->offset++];
+		} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0);
+		/* XXX: shouldn't we apply the fuzzy search limit here? */
 	}
 	else
 	{
+		/* A posting tree */
 		do
 		{
-			entryGetNextItem(ginstate, entry);
-		} while (entry->isFinished == FALSE &&
-				 entry->reduceResult == TRUE &&
-				 dropItem(entry));
+			/* If we've processed the current batch, load more items */
+			while (entry->offset >= entry->nlist)
+			{
+				entryLoadMoreItems(ginstate, entry, advancePast);
+
+				if (entry->isFinished)
+				{
+					ItemPointerSetInvalid(&entry->curItem);
+					return;
+				}
+			}
+
+			entry->curItem = entry->list[entry->offset++];
+
+		} while (ginCompareItemPointers(&entry->curItem, &advancePast) <= 0 ||
+				 (entry->reduceResult == TRUE && dropItem(entry)));
 	}
 }
 
 /*
- * Identify the "current" item among the input entry streams for this scan key,
- * and test whether it passes the scan key qual condition.
+ * Identify the "current" item among the input entry streams for this scan key
+ * that is greater than advancePast, and test whether it passes the scan key
+ * qual condition.
  *
  * The current item is the smallest curItem among the inputs.  key->curItem
  * is set to that value.  key->curItemMatches is set to indicate whether that
@@ -719,7 +735,8 @@ entryGetItem(GinState *ginstate, GinScanEntry entry)
  * logic in scanGetItem.)
  */
 static void
-keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
+keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key,
+		   ItemPointerData advancePast)
 {
 	ItemPointerData minItem;
 	ItemPointerData curPageLossy;
@@ -729,11 +746,20 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 	GinScanEntry entry;
 	bool		res;
 	MemoryContext oldCtx;
+	bool		allFinished;
 
 	Assert(!key->isFinished);
 
 	/*
-	 * Find the minimum of the active entry curItems.
+	 * We might have already tested this item; if so, no need to repeat work.
+	 * (Note: the ">" case can happen, if minItem is exact but we previously
+	 * had to set curItem to a lossy-page pointer.)
+	 */
+	if (ginCompareItemPointers(&key->curItem, &advancePast) > 0)
+		return;
+
+	/*
+	 * Find the minimum item > advancePast among the active entry streams.
 	 *
 	 * Note: a lossy-page entry is encoded by a ItemPointer with max value for
 	 * offset (0xffff), so that it will sort after any exact entries for the
@@ -741,16 +767,33 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 	 * pointers, which is good.
 	 */
 	ItemPointerSetMax(&minItem);
-
+	allFinished = true;
 	for (i = 0; i < key->nentries; i++)
 	{
 		entry = key->scanEntry[i];
-		if (entry->isFinished == FALSE &&
-			ginCompareItemPointers(&entry->curItem, &minItem) < 0)
-			minItem = entry->curItem;
+
+		/*
+		 * Advance this stream if necessary.
+		 *
+		 * In particular, since entry->curItem was initialized with
+		 * ItemPointerSetMin, this ensures we fetch the first item for each
+		 * entry on the first call.
+		 */
+		while (entry->isFinished == FALSE &&
+			   ginCompareItemPointers(&entry->curItem, &advancePast) <= 0)
+		{
+			entryGetItem(ginstate, entry, advancePast);
+		}
+
+		if (!entry->isFinished)
+		{
+			allFinished = FALSE;
+			if (ginCompareItemPointers(&entry->curItem, &minItem) < 0)
+				minItem = entry->curItem;
+		}
 	}
 
-	if (ItemPointerIsMax(&minItem))
+	if (allFinished)
 	{
 		/* all entries are finished */
 		key->isFinished = TRUE;
@@ -758,15 +801,7 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
 	}
 
 	/*
-	 * We might have already tested this item; if so, no need to repeat work.
-	 * (Note: the ">" case can happen, if minItem is exact but we previously
-	 * had to set curItem to a lossy-page pointer.)
-	 */
-	if (ginCompareItemPointers(&key->curItem, &minItem) >= 0)
-		return;
-
-	/*
-	 * OK, advance key->curItem and perform consistentFn test.
+	 * OK, set key->curItem and perform consistentFn test.
 	 */
 	key->curItem = minItem;
 
@@ -895,117 +930,122 @@ keyGetItem(GinState *ginstate, MemoryContext tempCtx, GinScanKey key)
  * keyGetItem() the combination logic is known only to the consistentFn.
  */
 static bool
-scanGetItem(IndexScanDesc scan, ItemPointer advancePast,
+scanGetItem(IndexScanDesc scan, ItemPointerData advancePast,
 			ItemPointerData *item, bool *recheck)
 {
 	GinScanOpaque so = (GinScanOpaque) scan->opaque;
-	GinState   *ginstate = &so->ginstate;
-	ItemPointerData myAdvancePast = *advancePast;
 	uint32		i;
-	bool		allFinished;
 	bool		match;
 
-	for (;;)
+	/*----------
+	 * Advance the scan keys in lock-step, until we find an item that matches
+	 * all the keys. If any key reports isFinished, meaning its subset of the
+	 * entries is exhausted, we can stop.  Otherwise, set *item to the next
+	 * matching item.
+	 *
+	 * This logic works only if a keyGetItem stream can never contain both
+	 * exact and lossy pointers for the same page.	Else we could have a
+	 * case like
+	 *
+	 *		stream 1		stream 2
+	 *		...				...
+	 *		42/6			42/7
+	 *		50/1			42/0xffff
+	 *		...				...
+	 *
+	 * We would conclude that 42/6 is not a match and advance stream 1,
+	 * thus never detecting the match to the lossy pointer in stream 2.
+	 * (keyGetItem has a similar problem versus entryGetItem.)
+	 *----------
+	 */
+	do
 	{
-		/*
-		 * Advance any entries that are <= myAdvancePast.  In particular,
-		 * since entry->curItem was initialized with ItemPointerSetMin, this
-		 * ensures we fetch the first item for each entry on the first call.
-		 */
-		allFinished = TRUE;
-
-		for (i = 0; i < so->totalentries; i++)
-		{
-			GinScanEntry entry = so->entries[i];
-
-			while (entry->isFinished == FALSE &&
-				   ginCompareItemPointers(&entry->curItem,
-										  &myAdvancePast) <= 0)
-				entryGetItem(ginstate, entry);
-
-			if (entry->isFinished == FALSE)
-				allFinished = FALSE;
-		}
-
-		if (allFinished)
-		{
-			/* all entries exhausted, so we're done */
-			return false;
-		}
-
-		/*
-		 * Perform the consistentFn test for each scan key.  If any key
-		 * reports isFinished, meaning its subset of the entries is exhausted,
-		 * we can stop.  Otherwise, set *item to the minimum of the key
-		 * curItems.
-		 */
-		ItemPointerSetMax(item);
-
-		for (i = 0; i < so->nkeys; i++)
+		ItemPointerSetMin(item);
+		match = true;
+		for (i = 0; i < so->nkeys && match; i++)
 		{
 			GinScanKey	key = so->keys + i;
 
-			keyGetItem(&so->ginstate, so->tempCtx, key);
+			/* Fetch the next item for this key that is > advancePast. */
+			keyGetItem(&so->ginstate, so->tempCtx, key, advancePast);
 
 			if (key->isFinished)
-				return false;	/* finished one of keys */
-
-			if (ginCompareItemPointers(&key->curItem, item) < 0)
-				*item = key->curItem;
-		}
+				return false;
 
-		Assert(!ItemPointerIsMax(item));
+			/*
+			 * If it's not a match, we can immediately conclude that nothing
+			 * <= this item matches, without checking the rest of the keys.
+			 */
+			if (!key->curItemMatches)
+			{
+				advancePast = key->curItem;
+				match = false;
+				break;
+			}
 
-		/*----------
-		 * Now *item contains first ItemPointer after previous result.
-		 *
-		 * The item is a valid hit only if all the keys succeeded for either
-		 * that exact TID, or a lossy reference to the same page.
-		 *
-		 * This logic works only if a keyGetItem stream can never contain both
-		 * exact and lossy pointers for the same page.	Else we could have a
-		 * case like
-		 *
-		 *		stream 1		stream 2
-		 *		...				...
-		 *		42/6			42/7
-		 *		50/1			42/0xffff
-		 *		...				...
-		 *
-		 * We would conclude that 42/6 is not a match and advance stream 1,
-		 * thus never detecting the match to the lossy pointer in stream 2.
-		 * (keyGetItem has a similar problem versus entryGetItem.)
-		 *----------
-		 */
-		match = true;
-		for (i = 0; i < so->nkeys; i++)
-		{
-			GinScanKey	key = so->keys + i;
+			/*
+			 * It's a match. We can conclude that nothing < matches, so
+			 * the other key streams can skip to this item.
+			 *
+			 * Beware of lossy pointers, though; from a lossy pointer, we
+			 * can only conclude that nothing smaller than this *block*
+			 * matches.
+			 */
+			if (ItemPointerIsLossyPage(&key->curItem))
+			{
+				if (GinItemPointerGetBlockNumber(&advancePast) <
+					GinItemPointerGetBlockNumber(&key->curItem))
+				{
+					advancePast.ip_blkid = key->curItem.ip_blkid;
+					advancePast.ip_posid = 0;
+				}
+			}
+			else
+			{
+				Assert(key->curItem.ip_posid > 0);
+				advancePast = key->curItem;
+				advancePast.ip_posid--;
+			}
 
-			if (key->curItemMatches)
+			/*
+			 * If this is the first key, remember this location as a
+			 * potential match.
+			 *
+			 * Otherwise, check if this is the same item that we checked the
+			 * previous keys for (or a lossy pointer for the same page). If
+			 * not, loop back to check the previous keys for this item (we
+			 * will check this key again too, but keyGetItem returns quickly
+			 * for that)
+			 */
+			if (i == 0)
 			{
-				if (ginCompareItemPointers(item, &key->curItem) == 0)
-					continue;
-				if (ItemPointerIsLossyPage(&key->curItem) &&
-					GinItemPointerGetBlockNumber(&key->curItem) ==
-					GinItemPointerGetBlockNumber(item))
-					continue;
+				*item = key->curItem;
+			}
+			else
+			{
+				if (ItemPointerIsLossyPage(&key->curItem) ||
+					ItemPointerIsLossyPage(item))
+				{
+					Assert (GinItemPointerGetBlockNumber(&key->curItem) >= GinItemPointerGetBlockNumber(item));
+					match = (GinItemPointerGetBlockNumber(&key->curItem) ==
+							 GinItemPointerGetBlockNumber(item));
+				}
+				else
+				{
+					Assert(ginCompareItemPointers(&key->curItem, item) >= 0);
+					match = (ginCompareItemPointers(&key->curItem, item) == 0);
+				}
 			}
-			match = false;
-			break;
 		}
+	} while (!match);
 
-		if (match)
-			break;
-
-		/*
-		 * No hit.	Update myAdvancePast to this TID, so that on the next pass
-		 * we'll move to the next possible entry.
-		 */
-		myAdvancePast = *item;
-	}
+	Assert(!ItemPointerIsMin(item));
 
 	/*
+	 * Now *item contains the first ItemPointer after previous result that
+	 * satisfied all the keys for that exact TID, or a lossy reference
+	 * to the same page.
+	 *
 	 * We must return recheck = true if any of the keys are marked recheck.
 	 */
 	*recheck = false;
@@ -1536,7 +1576,7 @@ gingetbitmap(PG_FUNCTION_ARGS)
 	{
 		CHECK_FOR_INTERRUPTS();
 
-		if (!scanGetItem(scan, &iptr, &iptr, &recheck))
+		if (!scanGetItem(scan, iptr, &iptr, &recheck))
 			break;
 
 		if (ItemPointerIsLossyPage(&iptr))
-- 
GitLab