diff --git a/src/backend/access/spgist/README b/src/backend/access/spgist/README index 1b86e275914d4216262b099b06dfdf1bfc21ab92..0939ef16a46218eb42c4d98edc85cc2365e3f597 100644 --- a/src/backend/access/spgist/README +++ b/src/backend/access/spgist/README @@ -201,17 +201,31 @@ space utilization, but doesn't change the basis of the algorithm. CONCURRENCY While descending the tree, the insertion algorithm holds exclusive lock on -two tree levels at a time, ie both parent and child pages (parent and child -pages can be the same, see notes above). There is a possibility of deadlock -between two insertions if there are cross-referenced pages in different -branches. That is, if inner tuple on page M has a child on page N while -an inner tuple from another branch is on page N and has a child on page M, -then two insertions descending the two branches could deadlock. To prevent -deadlocks we introduce a concept of "triple parity" of pages: if inner tuple -is on page with BlockNumber N, then its child tuples should be placed on the -same page, or else on a page with BlockNumber M where (N+1) mod 3 == M mod 3. -This rule guarantees that tuples on page M will have no children on page N, -since (M+1) mod 3 != N mod 3. +two tree levels at a time, ie both parent and child pages (but parent and +child pages can be the same, see notes above). There is a possibility of +deadlock between two insertions if there are cross-referenced pages in +different branches. That is, if inner tuple on page M has a child on page N +while an inner tuple from another branch is on page N and has a child on +page M, then two insertions descending the two branches could deadlock, +since they will each hold their parent page's lock while trying to get the +child page's lock. + +Currently, we deal with this by conditionally locking buffers as we descend +the tree. If we fail to get lock on a buffer, we release both buffers and +restart the insertion process. This is potentially inefficient, but the +locking costs of a more deterministic approach seem very high. + +To reduce the number of cases where that happens, we introduce a concept of +"triple parity" of pages: if inner tuple is on page with BlockNumber N, then +its child tuples should be placed on the same page, or else on a page with +BlockNumber M where (N+1) mod 3 == M mod 3. This rule ensures that tuples +on page M will have no children on page N, since (M+1) mod 3 != N mod 3. +That makes it unlikely that two insertion processes will conflict against +each other while descending the tree. It's not perfect though: in the first +place, we could still get a deadlock among three or more insertion processes, +and in the second place, it's impractical to preserve this invariant in every +case when we expand or split an inner tuple. So we still have to allow for +deadlocks. Insertion may also need to take locks on an additional inner and/or leaf page to add tuples of the right type(s), when there's not enough room on the pages diff --git a/src/backend/access/spgist/spgdoinsert.c b/src/backend/access/spgist/spgdoinsert.c index b3f8f6a231372fb1f4b6e2c1d42f124b2ca215cb..1df9fbc5b320c275374dee8463cbb3b2d7e0ceff 100644 --- a/src/backend/access/spgist/spgdoinsert.c +++ b/src/backend/access/spgist/spgdoinsert.c @@ -1850,9 +1850,13 @@ spgSplitNodeAction(Relation index, SpGistState *state, } /* - * Insert one item into the index + * Insert one item into the index. + * + * Returns true on success, false if we failed to complete the insertion + * because of conflict with a concurrent insert. In the latter case, + * caller should re-call spgdoinsert() with the same args. */ -void +bool spgdoinsert(Relation index, SpGistState *state, ItemPointer heapPtr, Datum datum, bool isnull) { @@ -1933,12 +1937,32 @@ spgdoinsert(Relation index, SpGistState *state, &isNew); current.blkno = BufferGetBlockNumber(current.buffer); } - else if (parent.buffer == InvalidBuffer || - current.blkno != parent.blkno) + else if (parent.buffer == InvalidBuffer) { + /* we hold no parent-page lock, so no deadlock is possible */ current.buffer = ReadBuffer(index, current.blkno); LockBuffer(current.buffer, BUFFER_LOCK_EXCLUSIVE); } + else if (current.blkno != parent.blkno) + { + /* descend to a new child page */ + current.buffer = ReadBuffer(index, current.blkno); + + /* + * Attempt to acquire lock on child page. We must beware of + * deadlock against another insertion process descending from that + * page to our parent page (see README). If we fail to get lock, + * abandon the insertion and tell our caller to start over. XXX + * this could be improved; perhaps it'd be worth sleeping a bit + * before giving up? + */ + if (!ConditionalLockBuffer(current.buffer)) + { + ReleaseBuffer(current.buffer); + UnlockReleaseBuffer(parent.buffer); + return false; + } + } else { /* inner tuple can be stored on the same page as parent one */ @@ -2139,4 +2163,6 @@ spgdoinsert(Relation index, SpGistState *state, SpGistSetLastUsedPage(index, parent.buffer); UnlockReleaseBuffer(parent.buffer); } + + return true; } diff --git a/src/backend/access/spgist/spginsert.c b/src/backend/access/spgist/spginsert.c index 456a71fbba5dabb557510167baaccccea4fa6c9b..b20ddcc79a3bdb35b436549b1059dd0a0ecce232 100644 --- a/src/backend/access/spgist/spginsert.c +++ b/src/backend/access/spgist/spginsert.c @@ -43,7 +43,10 @@ spgistBuildCallback(Relation index, HeapTuple htup, Datum *values, /* Work in temp context, and reset it after each tuple */ oldCtx = MemoryContextSwitchTo(buildstate->tmpCtx); - spgdoinsert(index, &buildstate->spgstate, &htup->t_self, *values, *isnull); + /* No concurrent insertions can be happening, so failure is unexpected */ + if (!spgdoinsert(index, &buildstate->spgstate, &htup->t_self, + *values, *isnull)) + elog(ERROR, "unexpected spgdoinsert() failure"); MemoryContextSwitchTo(oldCtx); MemoryContextReset(buildstate->tmpCtx); @@ -217,7 +220,17 @@ spginsert(PG_FUNCTION_ARGS) initSpGistState(&spgstate, index); - spgdoinsert(index, &spgstate, ht_ctid, *values, *isnull); + /* + * We might have to repeat spgdoinsert() multiple times, if conflicts + * occur with concurrent insertions. If so, reset the insertCtx each time + * to avoid cumulative memory consumption. That means we also have to + * redo initSpGistState(), but it's cheap enough not to matter. + */ + while (!spgdoinsert(index, &spgstate, ht_ctid, *values, *isnull)) + { + MemoryContextReset(insertCtx); + initSpGistState(&spgstate, index); + } SpGistUpdateMetaPage(index); diff --git a/src/include/access/spgist_private.h b/src/include/access/spgist_private.h index b5bc45121aa8a84cf30609aa9302897b447f5625..5ab982cdedd4a8b4cd6ebb70144936031e517e8d 100644 --- a/src/include/access/spgist_private.h +++ b/src/include/access/spgist_private.h @@ -651,7 +651,7 @@ extern void spgPageIndexMultiDelete(SpGistState *state, Page page, OffsetNumber *itemnos, int nitems, int firststate, int reststate, BlockNumber blkno, OffsetNumber offnum); -extern void spgdoinsert(Relation index, SpGistState *state, +extern bool spgdoinsert(Relation index, SpGistState *state, ItemPointer heapPtr, Datum datum, bool isnull); #endif /* SPGIST_PRIVATE_H */