diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README index d20ad17a4b66980d0cd59afff8c1523e6ddb1366..1b86e275914d4216262b099b06dfdf1bfc21ab92 100644 --- a/src/backend/access/spgist/README +++ b/src/backend/access/spgist/README @@ -314,6 +314,27 @@ the reverse map of the nextOffset links (ie, when we see tuple x links to tuple y, we set predecessor[y] = x). Then head tuples are the ones with no predecessor. +Because insertions can occur while VACUUM runs, a pure sequential scan +could miss deleting some target leaf tuples, because they could get moved +from a not-yet-visited leaf page to an already-visited leaf page as a +consequence of a PickSplit or MoveLeafs operation. Failing to delete any +target TID is not acceptable, so we have to extend the algorithm to cope +with such cases. We recognize that such a move might have occurred when +we see a leaf-page REDIRECT tuple whose XID indicates it might have been +created after the VACUUM scan started. We add the redirection target TID +to a "pending list" of places we need to recheck. Between pages of the +main sequential scan, we empty the pending list by visiting each listed +TID. If it points to an inner tuple (from a PickSplit), add each downlink +TID to the pending list. If it points to a leaf page, vacuum that page. +(We could just vacuum the single pointed-to chain, but vacuuming the +whole page simplifies the code and reduces the odds of VACUUM having to +modify the same page multiple times.) To ensure that pending-list +processing can never get into an endless loop, even in the face of +concurrent index changes, we don't remove list entries immediately but +only after we've completed all pending-list processing; instead we just +mark items as done after processing them. Adding a TID that's already in +the list is a no-op, whether or not that item is marked done yet. + spgbulkdelete also updates the index's free space map. Currently, spgvacuumcleanup has nothing to do if spgbulkdelete was diff --git a/src/backend/access/spgist/spgvacuum.c b/src/backend/access/spgist/spgvacuum.c index a09da84a2aac18b9846919f6c81c6ba0d47f3dd2..856790ee2aa41cac1728021a079338a10b21e429 100644 --- a/src/backend/access/spgist/spgvacuum.c +++ b/src/backend/access/spgist/spgvacuum.c @@ -25,9 +25,18 @@ #include "storage/indexfsm.h" #include "storage/lmgr.h" #include "storage/procarray.h" +#include "utils/snapmgr.h" -/* local state for vacuum operations */ +/* Entry in pending-list of TIDs we need to revisit */ +typedef struct spgVacPendingItem +{ + ItemPointerData tid; /* redirection target to visit */ + bool done; /* have we dealt with this? */ + struct spgVacPendingItem *next; /* list link */ +} spgVacPendingItem; + +/* Local state for vacuum operations */ typedef struct spgBulkDeleteState { /* Parameters passed in to spgvacuumscan */ @@ -35,22 +44,87 @@ typedef struct spgBulkDeleteState IndexBulkDeleteResult *stats; IndexBulkDeleteCallback callback; void *callback_state; + /* Additional working state */ - SpGistState spgstate; - TransactionId OldestXmin; - BlockNumber lastFilledBlock; + SpGistState spgstate; /* for SPGiST operations that need one */ + spgVacPendingItem *pendingList; /* TIDs we need to (re)visit */ + TransactionId myXmin; /* for detecting newly-added redirects */ + TransactionId OldestXmin; /* for deciding a redirect is obsolete */ + BlockNumber lastFilledBlock; /* last non-deletable block */ } spgBulkDeleteState; +/* + * Add TID to pendingList, but only if not already present. + * + * Note that new items are always appended at the end of the list; this + * ensures that scans of the list don't miss items added during the scan. + */ +static void +spgAddPendingTID(spgBulkDeleteState *bds, ItemPointer tid) +{ + spgVacPendingItem *pitem; + spgVacPendingItem **listLink; + + /* search the list for pre-existing entry */ + listLink = &bds->pendingList; + while (*listLink != NULL) + { + pitem = *listLink; + if (ItemPointerEquals(tid, &pitem->tid)) + return; /* already in list, do nothing */ + listLink = &pitem->next; + } + /* not there, so append new entry */ + pitem = (spgVacPendingItem *) palloc(sizeof(spgVacPendingItem)); + pitem->tid = *tid; + pitem->done = false; + pitem->next = NULL; + *listLink = pitem; +} + +/* + * Clear pendingList + */ +static void +spgClearPendingList(spgBulkDeleteState *bds) +{ + spgVacPendingItem *pitem; + spgVacPendingItem *nitem; + + for (pitem = bds->pendingList; pitem != NULL; pitem = nitem) + { + nitem = pitem->next; + /* All items in list should have been dealt with */ + Assert(pitem->done); + pfree(pitem); + } + bds->pendingList = NULL; +} + /* * Vacuum a regular (non-root) leaf page * * We must delete tuples that are targeted for deletion by the VACUUM, * but not move any tuples that are referenced by outside links; we assume * those are the ones that are heads of chains. + * + * If we find a REDIRECT that was made by a concurrently-running transaction, + * we must add its target TID to pendingList. (We don't try to visit the + * target immediately, first because we don't want VACUUM locking more than + * one buffer at a time, and second because the duplicate-filtering logic + * in spgAddPendingTID is useful to ensure we can't get caught in an infinite + * loop in the face of continuous concurrent insertions.) + * + * If forPending is true, we are examining the page as a consequence of + * chasing a redirect link, not as part of the normal sequential scan. + * We still vacuum the page normally, but we don't increment the stats + * about live tuples; else we'd double-count those tuples, since the page + * has been or will be visited in the sequential scan as well. */ static void -vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) +vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer, + bool forPending) { Page page = BufferGetPage(buffer); spgxlogVacuumLeaf xlrec; @@ -90,7 +164,8 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) } else { - bds->stats->num_index_tuples += 1; + if (!forPending) + bds->stats->num_index_tuples += 1; } /* Form predecessor map, too */ @@ -106,6 +181,25 @@ vacuumLeafPage(spgBulkDeleteState *bds, Relation index, Buffer buffer) predecessor[lt->nextOffset] = i; } } + else if (lt->tupstate == SPGIST_REDIRECT) + { + SpGistDeadTuple dt = (SpGistDeadTuple) lt; + + Assert(dt->nextOffset == InvalidOffsetNumber); + Assert(ItemPointerIsValid(&dt->pointer)); + + /* + * Add target TID to pending list if the redirection could have + * happened since VACUUM started. + * + * Note: we could make a tighter test by seeing if the xid is + * "running" according to the active snapshot; but tqual.c doesn't + * currently export a suitable API, and it's not entirely clear + * that a tighter test is worth the cycles anyway. + */ + if (TransactionIdFollowsOrEquals(dt->xid, bds->myXmin)) + spgAddPendingTID(bds, &dt->pointer); + } else { Assert(lt->nextOffset == InvalidOffsetNumber); @@ -545,7 +639,7 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) } else { - vacuumLeafPage(bds, index, buffer); + vacuumLeafPage(bds, index, buffer, false); vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin); } } @@ -556,8 +650,8 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) } /* - * The root page must never be deleted, nor marked as available in FSM, - * because we don't want it ever returned by a search for a place to + * The root pages must never be deleted, nor marked as available in FSM, + * because we don't want them ever returned by a search for a place to * put a new tuple. Otherwise, check for empty/deletable page, and * make sure FSM knows about it. */ @@ -585,6 +679,118 @@ spgvacuumpage(spgBulkDeleteState *bds, BlockNumber blkno) UnlockReleaseBuffer(buffer); } +/* + * Process the pending-TID list between pages of the main scan + */ +static void +spgprocesspending(spgBulkDeleteState *bds) +{ + Relation index = bds->info->index; + spgVacPendingItem *pitem; + spgVacPendingItem *nitem; + BlockNumber blkno; + Buffer buffer; + Page page; + + for (pitem = bds->pendingList; pitem != NULL; pitem = pitem->next) + { + if (pitem->done) + continue; /* ignore already-done items */ + + /* call vacuum_delay_point while not holding any buffer lock */ + vacuum_delay_point(); + + /* examine the referenced page */ + blkno = ItemPointerGetBlockNumber(&pitem->tid); + buffer = ReadBufferExtended(index, MAIN_FORKNUM, blkno, + RBM_NORMAL, bds->info->strategy); + LockBuffer(buffer, BUFFER_LOCK_EXCLUSIVE); + page = (Page) BufferGetPage(buffer); + + if (PageIsNew(page) || SpGistPageIsDeleted(page)) + { + /* Probably shouldn't happen, but ignore it */ + } + else if (SpGistPageIsLeaf(page)) + { + if (SpGistBlockIsRoot(blkno)) + { + /* this should definitely not happen */ + elog(ERROR, "redirection leads to root page of index \"%s\"", + RelationGetRelationName(index)); + } + + /* deal with any deletable tuples */ + vacuumLeafPage(bds, index, buffer, true); + /* might as well do this while we are here */ + vacuumRedirectAndPlaceholder(index, buffer, bds->OldestXmin); + + SpGistSetLastUsedPage(index, buffer); + + /* + * We can mark as done not only this item, but any later ones + * pointing at the same page, since we vacuumed the whole page. + */ + pitem->done = true; + for (nitem = pitem->next; nitem != NULL; nitem = nitem->next) + { + if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) + nitem->done = true; + } + } + else + { + /* + * On an inner page, visit the referenced inner tuple and add + * all its downlinks to the pending list. We might have pending + * items for more than one inner tuple on the same page (in fact + * this is pretty likely given the way space allocation works), + * so get them all while we are here. + */ + for (nitem = pitem; nitem != NULL; nitem = nitem->next) + { + if (nitem->done) + continue; + if (ItemPointerGetBlockNumber(&nitem->tid) == blkno) + { + OffsetNumber offset; + SpGistInnerTuple innerTuple; + + offset = ItemPointerGetOffsetNumber(&nitem->tid); + innerTuple = (SpGistInnerTuple) PageGetItem(page, + PageGetItemId(page, offset)); + if (innerTuple->tupstate == SPGIST_LIVE) + { + SpGistNodeTuple node; + int i; + + SGITITERATE(innerTuple, i, node) + { + if (ItemPointerIsValid(&node->t_tid)) + spgAddPendingTID(bds, &node->t_tid); + } + } + else if (innerTuple->tupstate == SPGIST_REDIRECT) + { + /* transfer attention to redirect point */ + spgAddPendingTID(bds, + &((SpGistDeadTuple) innerTuple)->pointer); + } + else + elog(ERROR, "unexpected SPGiST tuple state: %d", + innerTuple->tupstate); + + nitem->done = true; + } + } + } + + UnlockReleaseBuffer(buffer); + } + + spgClearPendingList(bds); +} + /* * Perform a bulkdelete scan */ @@ -598,6 +804,8 @@ spgvacuumscan(spgBulkDeleteState *bds) /* Finish setting up spgBulkDeleteState */ initSpGistState(&bds->spgstate, index); + bds->pendingList = NULL; + bds->myXmin = GetActiveSnapshot()->xmin; bds->OldestXmin = GetOldestXmin(true, false); bds->lastFilledBlock = SPGIST_LAST_FIXED_BLKNO; @@ -637,6 +845,9 @@ spgvacuumscan(spgBulkDeleteState *bds) for (; blkno < num_pages; blkno++) { spgvacuumpage(bds, blkno); + /* empty the pending-list after each page */ + if (bds->pendingList != NULL) + spgprocesspending(bds); } } @@ -747,7 +958,7 @@ spgvacuumcleanup(PG_FUNCTION_ARGS) IndexFreeSpaceMapVacuum(index); /* - * It's quite possible for us to be fooled by concurrent page splits into + * It's quite possible for us to be fooled by concurrent tuple moves into * double-counting some index tuples, so disbelieve any total that exceeds * the underlying heap's count ... if we know that accurately. Otherwise * this might just make matters worse.