diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 6e70ab25c106445374102ccd2188461abe667953..46c7c4da73f7aaf2107e1ddb9be2ef2dd99d288a 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.197 2005/08/12 01:35:54 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/heap/heapam.c,v 1.198 2005/08/20 00:39:51 tgl Exp $ * * * INTERFACE ROUTINES @@ -22,7 +22,7 @@ * heap_rescan - restart a relation scan * heap_endscan - end relation scan * heap_getnext - retrieve next tuple in scan - * heap_fetch - retrieve tuple with tid + * heap_fetch - retrieve tuple with given tid * heap_insert - insert tuple into a relation * heap_delete - delete a tuple from a relation * heap_update - replace a tuple in a relation with another tuple @@ -152,7 +152,7 @@ heapgettup(Relation relation, tid = NULL; } - tuple->t_tableOid = relation->rd_id; + tuple->t_tableOid = RelationGetRelid(relation); /* * return null immediately if relation is empty @@ -800,10 +800,13 @@ heap_getnext(HeapScanDesc scan, ScanDirection direction) * keep_buf = false, the pin is released and *userbuf is set to InvalidBuffer. * * It is somewhat inconsistent that we ereport() on invalid block number but - * return false on invalid item number. This is historical. The only - * justification I can see is that the caller can relatively easily check the - * block number for validity, but cannot check the item number without reading - * the page himself. + * return false on invalid item number. There are a couple of reasons though. + * One is that the caller can relatively easily check the block number for + * validity, but cannot check the item number without reading the page + * himself. Another is that when we are following a t_ctid link, we can be + * reasonably confident that the page number is valid (since VACUUM shouldn't + * truncate off the destination page without having killed the referencing + * tuple first), but the item number might well not be good. */ bool heap_fetch(Relation relation, @@ -906,7 +909,7 @@ heap_release_fetch(Relation relation, tuple->t_datamcxt = NULL; tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); tuple->t_len = ItemIdGetLength(lp); - tuple->t_tableOid = relation->rd_id; + tuple->t_tableOid = RelationGetRelid(relation); /* * check time qualification of tuple, then release lock @@ -950,83 +953,129 @@ heap_release_fetch(Relation relation, /* * heap_get_latest_tid - get the latest tid of a specified tuple + * + * Actually, this gets the latest version that is visible according to + * the passed snapshot. You can pass SnapshotDirty to get the very latest, + * possibly uncommitted version. + * + * *tid is both an input and an output parameter: it is updated to + * show the latest version of the row. Note that it will not be changed + * if no version of the row passes the snapshot test. */ -ItemPointer +void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid) { - ItemId lp = NULL; - Buffer buffer; - PageHeader dp; - OffsetNumber offnum; - HeapTupleData tp; - HeapTupleHeader t_data; + BlockNumber blk; ItemPointerData ctid; - bool invalidBlock, - linkend, - valid; + TransactionId priorXmax; - /* - * get the buffer from the relation descriptor Note that this does a - * buffer pin. 
- */ - buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(tid)); - LockBuffer(buffer, BUFFER_LOCK_SHARE); + /* this is to avoid Assert failures on bad input */ + if (!ItemPointerIsValid(tid)) + return; /* - * get the item line pointer corresponding to the requested tid + * Since this can be called with user-supplied TID, don't trust the + * input too much. (RelationGetNumberOfBlocks is an expensive check, + * so we don't check t_ctid links again this way. Note that it would + * not do to call it just once and save the result, either.) */ - dp = (PageHeader) BufferGetPage(buffer); - offnum = ItemPointerGetOffsetNumber(tid); - invalidBlock = true; - if (!PageIsNew(dp)) - { - lp = PageGetItemId(dp, offnum); - if (ItemIdIsUsed(lp)) - invalidBlock = false; - } - if (invalidBlock) - { - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); - return NULL; - } + blk = ItemPointerGetBlockNumber(tid); + if (blk >= RelationGetNumberOfBlocks(relation)) + elog(ERROR, "block number %u is out of range for relation \"%s\"", + blk, RelationGetRelationName(relation)); /* - * more sanity checks + * Loop to chase down t_ctid links. At top of loop, ctid is the + * tuple we need to examine, and *tid is the TID we will return if + * ctid turns out to be bogus. + * + * Note that we will loop until we reach the end of the t_ctid chain. + * Depending on the snapshot passed, there might be at most one visible + * version of the row, but we don't try to optimize for that. */ + ctid = *tid; + priorXmax = InvalidTransactionId; /* cannot check first XMIN */ + for (;;) + { + Buffer buffer; + PageHeader dp; + OffsetNumber offnum; + ItemId lp; + HeapTupleData tp; + bool valid; - tp.t_datamcxt = NULL; - t_data = tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); - tp.t_len = ItemIdGetLength(lp); - tp.t_self = *tid; - ctid = tp.t_data->t_ctid; + /* + * Read, pin, and lock the page. + */ + buffer = ReadBuffer(relation, ItemPointerGetBlockNumber(&ctid)); + LockBuffer(buffer, BUFFER_LOCK_SHARE); + dp = (PageHeader) BufferGetPage(buffer); - /* - * check time qualification of tid - */ + /* + * Check for bogus item number. This is not treated as an error + * condition because it can happen while following a t_ctid link. + * We just assume that the prior tid is OK and return it unchanged. + */ + offnum = ItemPointerGetOffsetNumber(&ctid); + if (offnum < FirstOffsetNumber || offnum > PageGetMaxOffsetNumber(dp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } + lp = PageGetItemId(dp, offnum); + if (!ItemIdIsUsed(lp)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } - HeapTupleSatisfies(&tp, relation, buffer, dp, - snapshot, 0, NULL, valid); + /* OK to access the tuple */ + tp.t_self = ctid; + tp.t_datamcxt = NULL; + tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp); + tp.t_len = ItemIdGetLength(lp); - linkend = true; - if ((t_data->t_infomask & HEAP_XMIN_COMMITTED) != 0 && - !ItemPointerEquals(tid, &ctid)) - linkend = false; + /* + * After following a t_ctid link, we might arrive at an unrelated + * tuple. Check for XMIN match. + */ + if (TransactionIdIsValid(priorXmax) && + !TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(tp.t_data))) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } - LockBuffer(buffer, BUFFER_LOCK_UNLOCK); - ReleaseBuffer(buffer); + /* + * Check time qualification of tuple; if visible, set it as the new + * result candidate. 
+ */ + HeapTupleSatisfies(&tp, relation, buffer, dp, + snapshot, 0, NULL, valid); + if (valid) + *tid = ctid; - if (!valid) - { - if (linkend) - return NULL; - heap_get_latest_tid(relation, snapshot, &ctid); - *tid = ctid; - } + /* + * If there's a valid t_ctid link, follow it, else we're done. + */ + if ((tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) || + ItemPointerEquals(&tp.t_self, &tp.t_data->t_ctid)) + { + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + break; + } - return tid; + ctid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetXmax(tp.t_data); + LockBuffer(buffer, BUFFER_LOCK_UNLOCK); + ReleaseBuffer(buffer); + } /* end of loop */ } /* @@ -1083,7 +1132,7 @@ heap_insert(Relation relation, HeapTuple tup, CommandId cid, HeapTupleHeaderSetCmin(tup->t_data, cid); HeapTupleHeaderSetXmax(tup->t_data, 0); /* zero out Datum fields */ HeapTupleHeaderSetCmax(tup->t_data, 0); /* for cleanliness */ - tup->t_tableOid = relation->rd_id; + tup->t_tableOid = RelationGetRelid(relation); /* * If the new tuple is too big for storage or contains already toasted @@ -1197,29 +1246,34 @@ simple_heap_insert(Relation relation, HeapTuple tup) } /* - * heap_delete - delete a tuple + * heap_delete - delete a tuple * * NB: do not call this directly unless you are prepared to deal with * concurrent-update conditions. Use simple_heap_delete instead. * - * relation - table to be modified + * relation - table to be modified (caller must hold suitable lock) * tid - TID of tuple to be deleted * ctid - output parameter, used only for failure case (see below) - * cid - delete command ID to use in verifying tuple visibility + * update_xmax - output parameter, used only for failure case (see below) + * cid - delete command ID (used for visibility test, and stored into + * cmax if successful) * crosscheck - if not InvalidSnapshot, also check tuple against this * wait - true if should wait for any conflicting update to commit/abort * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we did delete it. Failure return codes are * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated - * (the last only possible if wait == false). On a failure return, - * *ctid is set to the ctid link of the target tuple (possibly a later - * version of the row). + * (the last only possible if wait == false). + * + * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. + * If t_ctid is the same as tid, the tuple was deleted; if different, the + * tuple was updated, and t_ctid is the location of the replacement tuple. + * (t_xmax is needed to verify that the replacement tuple matches.) 
*/ HTSU_Result heap_delete(Relation relation, ItemPointer tid, - ItemPointer ctid, CommandId cid, - Snapshot crosscheck, bool wait) + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid, Snapshot crosscheck, bool wait) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -1236,11 +1290,11 @@ heap_delete(Relation relation, ItemPointer tid, dp = (PageHeader) BufferGetPage(buffer); lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid)); + tp.t_datamcxt = NULL; - tp.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); + tp.t_data = (HeapTupleHeader) PageGetItem(dp, lp); tp.t_len = ItemIdGetLength(lp); tp.t_self = *tid; - tp.t_tableOid = relation->rd_id; l1: result = HeapTupleSatisfiesUpdate(tp.t_data, cid, buffer); @@ -1360,7 +1414,9 @@ l1: Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated || result == HeapTupleBeingUpdated); + Assert(!(tp.t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = tp.t_data->t_ctid; + *update_xmax = HeapTupleHeaderGetXmax(tp.t_data); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); if (have_tuple_lock) @@ -1457,11 +1513,12 @@ l1: void simple_heap_delete(Relation relation, ItemPointer tid) { - ItemPointerData ctid; HTSU_Result result; + ItemPointerData update_ctid; + TransactionId update_xmax; result = heap_delete(relation, tid, - &ctid, + &update_ctid, &update_xmax, GetCurrentCommandId(), InvalidSnapshot, true /* wait for commit */ ); switch (result) @@ -1491,27 +1548,33 @@ simple_heap_delete(Relation relation, ItemPointer tid) * NB: do not call this directly unless you are prepared to deal with * concurrent-update conditions. Use simple_heap_update instead. * - * relation - table to be modified + * relation - table to be modified (caller must hold suitable lock) * otid - TID of old tuple to be replaced * newtup - newly constructed tuple data to store * ctid - output parameter, used only for failure case (see below) - * cid - update command ID to use in verifying old tuple visibility + * update_xmax - output parameter, used only for failure case (see below) + * cid - update command ID (used for visibility test, and stored into + * cmax/cmin if successful) * crosscheck - if not InvalidSnapshot, also check old tuple against this * wait - true if should wait for any conflicting update to commit/abort * * Normal, successful return value is HeapTupleMayBeUpdated, which * actually means we *did* update it. Failure return codes are * HeapTupleSelfUpdated, HeapTupleUpdated, or HeapTupleBeingUpdated - * (the last only possible if wait == false). On a failure return, - * *ctid is set to the ctid link of the old tuple (possibly a later - * version of the row). + * (the last only possible if wait == false). + * * On success, newtup->t_self is set to the TID where the new tuple * was inserted. + * + * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. + * If t_ctid is the same as otid, the tuple was deleted; if different, the + * tuple was updated, and t_ctid is the location of the replacement tuple. + * (t_xmax is needed to verify that the replacement tuple matches.) 
*/ HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple newtup, - ItemPointer ctid, CommandId cid, - Snapshot crosscheck, bool wait) + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid, Snapshot crosscheck, bool wait) { HTSU_Result result; TransactionId xid = GetCurrentTransactionId(); @@ -1664,7 +1727,9 @@ l2: Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated || result == HeapTupleBeingUpdated); + Assert(!(oldtup.t_data->t_infomask & HEAP_XMAX_INVALID)); *ctid = oldtup.t_data->t_ctid; + *update_xmax = HeapTupleHeaderGetXmax(oldtup.t_data); LockBuffer(buffer, BUFFER_LOCK_UNLOCK); ReleaseBuffer(buffer); if (have_tuple_lock) @@ -1878,11 +1943,12 @@ l2: void simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) { - ItemPointerData ctid; HTSU_Result result; + ItemPointerData update_ctid; + TransactionId update_xmax; result = heap_update(relation, otid, tup, - &ctid, + &update_ctid, &update_xmax, GetCurrentCommandId(), InvalidSnapshot, true /* wait for commit */ ); switch (result) @@ -1907,7 +1973,34 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) } /* - * heap_lock_tuple - lock a tuple in shared or exclusive mode + * heap_lock_tuple - lock a tuple in shared or exclusive mode + * + * Note that this acquires a buffer pin, which the caller must release. + * + * Input parameters: + * relation: relation containing tuple (caller must hold suitable lock) + * tuple->t_self: TID of tuple to lock (rest of struct need not be valid) + * cid: current command ID (used for visibility test, and stored into + * tuple's cmax if lock is successful) + * mode: indicates if shared or exclusive tuple lock is desired + * nowait: if true, ereport rather than blocking if lock not available + * + * Output parameters: + * *tuple: all fields filled in + * *buffer: set to buffer holding tuple (pinned but not locked at exit) + * *ctid: set to tuple's t_ctid, but only in failure cases + * *update_xmax: set to tuple's xmax, but only in failure cases + * + * Function result may be: + * HeapTupleMayBeUpdated: lock was successfully acquired + * HeapTupleSelfUpdated: lock failed because tuple updated by self + * HeapTupleUpdated: lock failed because tuple updated by other xact + * + * In the failure cases, the routine returns the tuple's t_ctid and t_xmax. + * If t_ctid is the same as t_self, the tuple was deleted; if different, the + * tuple was updated, and t_ctid is the location of the replacement tuple. + * (t_xmax is needed to verify that the replacement tuple matches.) 
+ * * * NOTES: because the shared-memory lock table is of finite size, but users * could reasonably want to lock large numbers of tuples, we do not rely on @@ -1943,7 +2036,8 @@ simple_heap_update(Relation relation, ItemPointer otid, HeapTuple tup) */ HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, - CommandId cid, LockTupleMode mode, bool nowait) + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid, LockTupleMode mode, bool nowait) { HTSU_Result result; ItemPointer tid = &(tuple->t_self); @@ -1961,9 +2055,12 @@ heap_lock_tuple(Relation relation, HeapTuple tuple, Buffer *buffer, dp = (PageHeader) BufferGetPage(*buffer); lp = PageGetItemId(dp, ItemPointerGetOffsetNumber(tid)); + Assert(ItemIdIsUsed(lp)); + tuple->t_datamcxt = NULL; tuple->t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); tuple->t_len = ItemIdGetLength(lp); + tuple->t_tableOid = RelationGetRelid(relation); l3: result = HeapTupleSatisfiesUpdate(tuple->t_data, cid, *buffer); @@ -2112,14 +2209,13 @@ l3: if (result != HeapTupleMayBeUpdated) { - ItemPointerData newctid = tuple->t_data->t_ctid; - Assert(result == HeapTupleSelfUpdated || result == HeapTupleUpdated); + Assert(!(tuple->t_data->t_infomask & HEAP_XMAX_INVALID)); + *ctid = tuple->t_data->t_ctid; + *update_xmax = HeapTupleHeaderGetXmax(tuple->t_data); LockBuffer(*buffer, BUFFER_LOCK_UNLOCK); if (have_tuple_lock) UnlockTuple(relation, tid, tuple_lock_type); - /* can't overwrite t_self (== *tid) until after above Unlock */ - tuple->t_self = newctid; return result; } diff --git a/src/backend/commands/async.c b/src/backend/commands/async.c index 142b02dfaf88d7cf9eabe16e022c59379ab0b5f5..e2c6203891d48fcb9951bc9142a286affa1fe2a7 100644 --- a/src/backend/commands/async.c +++ b/src/backend/commands/async.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/async.c,v 1.123 2005/06/17 22:32:43 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/async.c,v 1.124 2005/08/20 00:39:53 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -550,8 +550,9 @@ AtCommit_Notify(void) } else if (listener->notification == 0) { - ItemPointerData ctid; HTSU_Result result; + ItemPointerData update_ctid; + TransactionId update_xmax; rTuple = heap_modifytuple(lTuple, tdesc, value, nulls, repl); @@ -573,7 +574,7 @@ AtCommit_Notify(void) * heap_update calls. 
*/ result = heap_update(lRel, &lTuple->t_self, rTuple, - &ctid, + &update_ctid, &update_xmax, GetCurrentCommandId(), InvalidSnapshot, false /* no wait for commit */ ); switch (result) @@ -585,7 +586,6 @@ AtCommit_Notify(void) case HeapTupleMayBeUpdated: /* done successfully */ - #ifdef NOT_USED /* currently there are no indexes */ CatalogUpdateIndexes(lRel, rTuple); #endif diff --git a/src/backend/commands/trigger.c b/src/backend/commands/trigger.c index 4ea973ae7fa4422e7dbb1eb55757b3ab6ce39717..562f676f4b8b4f91919f794681544cf5a0a68bd1 100644 --- a/src/backend/commands/trigger.c +++ b/src/backend/commands/trigger.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.191 2005/08/12 01:35:57 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/trigger.c,v 1.192 2005/08/20 00:39:54 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1592,14 +1592,18 @@ GetTupleForTrigger(EState *estate, ResultRelInfo *relinfo, if (newSlot != NULL) { HTSU_Result test; + ItemPointerData update_ctid; + TransactionId update_xmax; + + *newSlot = NULL; /* * lock tuple for update */ - *newSlot = NULL; - tuple.t_self = *tid; ltrmark:; - test = heap_lock_tuple(relation, &tuple, &buffer, cid, + tuple.t_self = *tid; + test = heap_lock_tuple(relation, &tuple, &buffer, + &update_ctid, &update_xmax, cid, LockTupleExclusive, false); switch (test) { @@ -1617,15 +1621,18 @@ ltrmark:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - else if (!(ItemPointerEquals(&(tuple.t_self), tid))) + else if (!ItemPointerEquals(&update_ctid, &tuple.t_self)) { - TupleTableSlot *epqslot = EvalPlanQual(estate, - relinfo->ri_RangeTableIndex, - &(tuple.t_self)); - - if (!(TupIsNull(epqslot))) + /* it was updated, so look at the updated version */ + TupleTableSlot *epqslot; + + epqslot = EvalPlanQual(estate, + relinfo->ri_RangeTableIndex, + &update_ctid, + update_xmax); + if (!TupIsNull(epqslot)) { - *tid = tuple.t_self; + *tid = update_ctid; *newSlot = epqslot; goto ltrmark; } @@ -1639,7 +1646,7 @@ ltrmark:; default: ReleaseBuffer(buffer); - elog(ERROR, "invalid heap_lock_tuple status: %d", test); + elog(ERROR, "unrecognized heap_lock_tuple status: %u", test); return NULL; /* keep compiler quiet */ } } @@ -1659,6 +1666,7 @@ ltrmark:; tuple.t_data = (HeapTupleHeader) PageGetItem((Page) dp, lp); tuple.t_len = ItemIdGetLength(lp); tuple.t_self = *tid; + tuple.t_tableOid = RelationGetRelid(relation); } result = heap_copytuple(&tuple); diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 9db91209448624a17730b2e79f3be5d25ac0ac0c..ef199c5f0734c56b3376d147b6315e008321c638 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -13,7 +13,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.312 2005/07/29 19:30:03 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.313 2005/08/20 00:39:54 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -51,6 +51,10 @@ #include "pgstat.h" +/* + * VacPage structures keep track of each page on which we find useful + * amounts of free space. 
+ */ typedef struct VacPageData { BlockNumber blkno; /* BlockNumber of this Page */ @@ -73,30 +77,54 @@ typedef struct VacPageListData typedef VacPageListData *VacPageList; +/* + * The "vtlinks" array keeps information about each recently-updated tuple + * ("recent" meaning its XMAX is too new to let us recycle the tuple). + * We store the tuple's own TID as well as its t_ctid (its link to the next + * newer tuple version). Searching in this array allows us to follow update + * chains backwards from newer to older tuples. When we move a member of an + * update chain, we must move *all* the live members of the chain, so that we + * can maintain their t_ctid link relationships (we must not just overwrite + * t_ctid in an existing tuple). + * + * Note: because t_ctid links can be stale (this would only occur if a prior + * VACUUM crashed partway through), it is possible that new_tid points to an + * empty slot or unrelated tuple. We have to check the linkage as we follow + * it, just as is done in EvalPlanQual. + */ typedef struct VTupleLinkData { - ItemPointerData new_tid; - ItemPointerData this_tid; + ItemPointerData new_tid; /* t_ctid of an updated tuple */ + ItemPointerData this_tid; /* t_self of the tuple */ } VTupleLinkData; typedef VTupleLinkData *VTupleLink; +/* + * We use an array of VTupleMoveData to plan a chain tuple move fully + * before we do it. + */ typedef struct VTupleMoveData { ItemPointerData tid; /* tuple ID */ - VacPage vacpage; /* where to move */ - bool cleanVpd; /* clean vacpage before using */ + VacPage vacpage; /* where to move it to */ + bool cleanVpd; /* clean vacpage before using? */ } VTupleMoveData; typedef VTupleMoveData *VTupleMove; +/* + * VRelStats contains the data acquired by scan_heap for use later + */ typedef struct VRelStats { + /* miscellaneous statistics */ BlockNumber rel_pages; double rel_tuples; Size min_tlen; Size max_tlen; bool hasindex; + /* vtlinks array for tuple chain following - sorted by new_tid */ int num_vtlinks; VTupleLink vtlinks; } VRelStats; @@ -117,6 +145,7 @@ typedef struct ExecContextData EState *estate; TupleTableSlot *slot; } ExecContextData; + typedef ExecContextData *ExecContext; static void @@ -1802,18 +1831,25 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, } /* - * If this tuple is in the chain of tuples created in updates - * by "recent" transactions then we have to move all chain of - * tuples to another places. + * If this tuple is in a chain of tuples created in updates + * by "recent" transactions then we have to move the whole chain + * of tuples to other places, so that we can write new t_ctid + * links that preserve the chain relationship. + * + * This test is complicated. Read it as "if tuple is a recently + * created updated version, OR if it is an obsoleted version". + * (In the second half of the test, we needn't make any check + * on XMAX --- it must be recently obsoleted, else scan_heap + * would have deemed it removable.) * * NOTE: this test is not 100% accurate: it is possible for a * tuple to be an updated one with recent xmin, and yet not - * have a corresponding tuple in the vtlinks list. Presumably + * match any new_tid entry in the vtlinks list. Presumably * there was once a parent tuple with xmax matching the xmin, * but it's possible that that tuple has been removed --- for - * example, if it had xmin = xmax then - * HeapTupleSatisfiesVacuum would deem it removable as soon as - * the xmin xact completes. 
+ * example, if it had xmin = xmax and wasn't itself an updated + * version, then HeapTupleSatisfiesVacuum would deem it removable + * as soon as the xmin xact completes. * * To be on the safe side, we abandon the repair_frag process if * we cannot find the parent tuple in vtlinks. This may be @@ -1854,72 +1890,85 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, break; /* out of walk-along-page loop */ } - vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData)); - num_vtmove = 0; - free_vtmove = 100; - /* * If this tuple is in the begin/middle of the chain then - * we have to move to the end of chain. + * we have to move to the end of chain. As with any + * t_ctid chase, we have to verify that each new tuple + * is really the descendant of the tuple we came from. */ while (!(tp.t_data->t_infomask & (HEAP_XMAX_INVALID | HEAP_IS_LOCKED)) && !(ItemPointerEquals(&(tp.t_self), &(tp.t_data->t_ctid)))) { - Page Cpage; - ItemId Citemid; - ItemPointerData Ctid; - - Ctid = tp.t_data->t_ctid; - if (freeCbuf) - ReleaseBuffer(Cbuf); - freeCbuf = true; - Cbuf = ReadBuffer(onerel, - ItemPointerGetBlockNumber(&Ctid)); - Cpage = BufferGetPage(Cbuf); - Citemid = PageGetItemId(Cpage, - ItemPointerGetOffsetNumber(&Ctid)); - if (!ItemIdIsUsed(Citemid)) + ItemPointerData nextTid; + TransactionId priorXmax; + Buffer nextBuf; + Page nextPage; + OffsetNumber nextOffnum; + ItemId nextItemid; + HeapTupleHeader nextTdata; + + nextTid = tp.t_data->t_ctid; + priorXmax = HeapTupleHeaderGetXmax(tp.t_data); + /* assume block# is OK (see heap_fetch comments) */ + nextBuf = ReadBuffer(onerel, + ItemPointerGetBlockNumber(&nextTid)); + nextPage = BufferGetPage(nextBuf); + /* If bogus or unused slot, assume tp is end of chain */ + nextOffnum = ItemPointerGetOffsetNumber(&nextTid); + if (nextOffnum < FirstOffsetNumber || + nextOffnum > PageGetMaxOffsetNumber(nextPage)) { - /* - * This means that in the middle of chain there - * was tuple updated by older (than OldestXmin) - * xaction and this tuple is already deleted by - * me. Actually, upper part of chain should be - * removed and seems that this should be handled - * in scan_heap(), but it's not implemented at the - * moment and so we just stop shrinking here. 
- */ - elog(DEBUG2, "child itemid in update-chain marked as unused --- can't continue repair_frag"); - chain_move_failed = true; - break; /* out of loop to move to chain end */ + ReleaseBuffer(nextBuf); + break; + } + nextItemid = PageGetItemId(nextPage, nextOffnum); + if (!ItemIdIsUsed(nextItemid)) + { + ReleaseBuffer(nextBuf); + break; } + /* if not matching XMIN, assume tp is end of chain */ + nextTdata = (HeapTupleHeader) PageGetItem(nextPage, + nextItemid); + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(nextTdata), + priorXmax)) + { + ReleaseBuffer(nextBuf); + break; + } + /* OK, switch our attention to the next tuple in chain */ tp.t_datamcxt = NULL; - tp.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid); - tp.t_self = Ctid; - tlen = tp.t_len = ItemIdGetLength(Citemid); - } - if (chain_move_failed) - { + tp.t_data = nextTdata; + tp.t_self = nextTid; + tlen = tp.t_len = ItemIdGetLength(nextItemid); if (freeCbuf) ReleaseBuffer(Cbuf); - pfree(vtmove); - break; /* out of walk-along-page loop */ + Cbuf = nextBuf; + freeCbuf = true; } + /* Set up workspace for planning the chain move */ + vtmove = (VTupleMove) palloc(100 * sizeof(VTupleMoveData)); + num_vtmove = 0; + free_vtmove = 100; + /* - * Check if all items in chain can be moved + * Now, walk backwards up the chain (towards older tuples) + * and check if all items in chain can be moved. We record + * all the moves that need to be made in the vtmove array. */ for (;;) { Buffer Pbuf; Page Ppage; ItemId Pitemid; - HeapTupleData Ptp; + HeapTupleHeader PTdata; VTupleLinkData vtld, *vtlp; + /* Identify a target page to move this tuple to */ if (to_vacpage == NULL || !enough_space(to_vacpage, tlen)) { @@ -1942,6 +1991,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, if (to_vacpage->offsets_used >= to_vacpage->offsets_free) to_vacpage->free -= sizeof(ItemIdData); (to_vacpage->offsets_used)++; + + /* Add an entry to vtmove list */ if (free_vtmove == 0) { free_vtmove = 1000; @@ -1959,13 +2010,13 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, free_vtmove--; num_vtmove++; - /* At beginning of chain? */ + /* Done if at beginning of chain */ if (!(tp.t_data->t_infomask & HEAP_UPDATED) || TransactionIdPrecedes(HeapTupleHeaderGetXmin(tp.t_data), OldestXmin)) - break; + break; /* out of check-all-items loop */ - /* No, move to tuple with prior row version */ + /* Move to tuple with prior row version */ vtld.new_tid = tp.t_self; vtlp = (VTupleLink) vac_bsearch((void *) &vtld, @@ -1989,18 +2040,17 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, /* this can't happen since we saw tuple earlier: */ if (!ItemIdIsUsed(Pitemid)) elog(ERROR, "parent itemid marked as unused"); - Ptp.t_datamcxt = NULL; - Ptp.t_data = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); + PTdata = (HeapTupleHeader) PageGetItem(Ppage, Pitemid); /* ctid should not have changed since we saved it */ Assert(ItemPointerEquals(&(vtld.new_tid), - &(Ptp.t_data->t_ctid))); + &(PTdata->t_ctid))); /* - * Read above about cases when !ItemIdIsUsed(Citemid) + * Read above about cases when !ItemIdIsUsed(nextItemid) * (child item is removed)... Due to the fact that at * the moment we don't remove unuseful part of - * update-chain, it's possible to get too old parent + * update-chain, it's possible to get non-matching parent * row here. Like as in the case which caused this * problem, we stop shrinking here. 
I could try to * find real parent row but want not to do it because @@ -2008,8 +2058,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, * and we are too close to 6.5 release. - vadim * 06/11/99 */ - if (Ptp.t_data->t_infomask & HEAP_XMAX_IS_MULTI || - !(TransactionIdEquals(HeapTupleHeaderGetXmax(Ptp.t_data), + if ((PTdata->t_infomask & HEAP_XMAX_IS_MULTI) || + !(TransactionIdEquals(HeapTupleHeaderGetXmax(PTdata), HeapTupleHeaderGetXmin(tp.t_data)))) { ReleaseBuffer(Pbuf); @@ -2017,8 +2067,8 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, chain_move_failed = true; break; /* out of check-all-items loop */ } - tp.t_datamcxt = Ptp.t_datamcxt; - tp.t_data = Ptp.t_data; + tp.t_datamcxt = NULL; + tp.t_data = PTdata; tlen = tp.t_len = ItemIdGetLength(Pitemid); if (freeCbuf) ReleaseBuffer(Cbuf); @@ -2047,7 +2097,9 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, } /* - * Okay, move the whole tuple chain + * Okay, move the whole tuple chain in reverse order. + * + * Ctid tracks the new location of the previously-moved tuple. */ ItemPointerSetInvalid(&Ctid); for (ti = 0; ti < num_vtmove; ti++) @@ -2077,10 +2129,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, tuple.t_data = (HeapTupleHeader) PageGetItem(Cpage, Citemid); tuple_len = tuple.t_len = ItemIdGetLength(Citemid); - /* - * make a copy of the source tuple, and then mark the - * source tuple MOVED_OFF. - */ move_chain_tuple(onerel, Cbuf, Cpage, &tuple, dst_buffer, dst_page, destvacpage, &ec, &Ctid, vtmove[ti].cleanVpd); @@ -2143,7 +2191,6 @@ repair_frag(VRelStats *vacrelstats, Relation onerel, move_plain_tuple(onerel, buf, page, &tuple, dst_buffer, dst_page, dst_vacpage, &ec); - num_moved++; if (dst_vacpage->blkno > last_move_dest_block) last_move_dest_block = dst_vacpage->blkno; @@ -2474,6 +2521,9 @@ move_chain_tuple(Relation rel, ItemId newitemid; Size tuple_len = old_tup->t_len; + /* + * make a modifiable copy of the source tuple. + */ heap_copytuple_with_tuple(old_tup, &newtup); /* @@ -2484,6 +2534,9 @@ move_chain_tuple(Relation rel, /* NO EREPORT(ERROR) TILL CHANGES ARE LOGGED */ START_CRIT_SECTION(); + /* + * mark the source tuple MOVED_OFF. + */ old_tup->t_data->t_infomask &= ~(HEAP_XMIN_COMMITTED | HEAP_XMIN_INVALID | HEAP_MOVED_IN); @@ -2529,16 +2582,27 @@ move_chain_tuple(Relation rel, newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, InvalidOffsetNumber, LP_USED); if (newoff == InvalidOffsetNumber) - { elog(PANIC, "failed to add item with len = %lu to page %u while moving tuple chain", (unsigned long) tuple_len, dst_vacpage->blkno); - } newitemid = PageGetItemId(dst_page, newoff); + /* drop temporary copy, and point to the version on the dest page */ pfree(newtup.t_data); newtup.t_datamcxt = NULL; newtup.t_data = (HeapTupleHeader) PageGetItem(dst_page, newitemid); + ItemPointerSet(&(newtup.t_self), dst_vacpage->blkno, newoff); + /* + * Set new tuple's t_ctid pointing to itself if last tuple in chain, + * and to next tuple in chain otherwise. (Since we move the chain + * in reverse order, this is actually the previously processed tuple.) + */ + if (!ItemPointerIsValid(ctid)) + newtup.t_data->t_ctid = newtup.t_self; + else + newtup.t_data->t_ctid = *ctid; + *ctid = newtup.t_self; + /* XLOG stuff */ if (!rel->rd_istemp) { @@ -2563,17 +2627,6 @@ move_chain_tuple(Relation rel, END_CRIT_SECTION(); - /* - * Set new tuple's t_ctid pointing to itself for last tuple in chain, - * and to next tuple in chain otherwise. - */ - /* Is this ok after log_heap_move() and END_CRIT_SECTION()? 
*/ - if (!ItemPointerIsValid(ctid)) - newtup.t_data->t_ctid = newtup.t_self; - else - newtup.t_data->t_ctid = *ctid; - *ctid = newtup.t_self; - LockBuffer(dst_buf, BUFFER_LOCK_UNLOCK); if (dst_buf != old_buf) LockBuffer(old_buf, BUFFER_LOCK_UNLOCK); @@ -2638,12 +2691,10 @@ move_plain_tuple(Relation rel, newoff = PageAddItem(dst_page, (Item) newtup.t_data, tuple_len, InvalidOffsetNumber, LP_USED); if (newoff == InvalidOffsetNumber) - { elog(PANIC, "failed to add item with len = %lu to page %u (free space %lu, nusd %u, noff %u)", (unsigned long) tuple_len, dst_vacpage->blkno, (unsigned long) dst_vacpage->free, dst_vacpage->offsets_used, dst_vacpage->offsets_free); - } newitemid = PageGetItemId(dst_page, newoff); pfree(newtup.t_data); newtup.t_datamcxt = NULL; diff --git a/src/backend/executor/execMain.c b/src/backend/executor/execMain.c index eb485ad5476a7ba35579bfbaf033ef52b75b7cec..9f5c008fa9f72e3d92fee9750e9b4c437dee736e 100644 --- a/src/backend/executor/execMain.c +++ b/src/backend/executor/execMain.c @@ -26,7 +26,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.253 2005/08/18 21:34:20 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/execMain.c,v 1.254 2005/08/20 00:39:55 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1165,8 +1165,10 @@ lnext: ; foreach(l, estate->es_rowMarks) { execRowMark *erm = lfirst(l); - Buffer buffer; HeapTupleData tuple; + Buffer buffer; + ItemPointerData update_ctid; + TransactionId update_xmax; TupleTableSlot *newSlot; LockTupleMode lockmode; HTSU_Result test; @@ -1183,15 +1185,17 @@ lnext: ; if (isNull) elog(ERROR, "\"%s\" is NULL", erm->resname); + tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); + if (estate->es_forUpdate) lockmode = LockTupleExclusive; else lockmode = LockTupleShared; - tuple.t_self = *((ItemPointer) DatumGetPointer(datum)); test = heap_lock_tuple(erm->relation, &tuple, &buffer, - estate->es_snapshot->curcid, - lockmode, estate->es_rowNoWait); + &update_ctid, &update_xmax, + estate->es_snapshot->curcid, + lockmode, estate->es_rowNoWait); ReleaseBuffer(buffer); switch (test) { @@ -1207,11 +1211,15 @@ lnext: ; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - if (!(ItemPointerEquals(&(tuple.t_self), - (ItemPointer) DatumGetPointer(datum)))) + if (!ItemPointerEquals(&update_ctid, + &tuple.t_self)) { - newSlot = EvalPlanQual(estate, erm->rti, &(tuple.t_self)); - if (!(TupIsNull(newSlot))) + /* updated, so look at updated version */ + newSlot = EvalPlanQual(estate, + erm->rti, + &update_ctid, + update_xmax); + if (!TupIsNull(newSlot)) { slot = newSlot; estate->es_useEvalPlan = true; @@ -1454,8 +1462,9 @@ ExecDelete(TupleTableSlot *slot, { ResultRelInfo *resultRelInfo; Relation resultRelationDesc; - ItemPointerData ctid; HTSU_Result result; + ItemPointerData update_ctid; + TransactionId update_xmax; /* * get information on the (current) result relation @@ -1486,7 +1495,7 @@ ExecDelete(TupleTableSlot *slot, */ ldelete:; result = heap_delete(resultRelationDesc, tupleid, - &ctid, + &update_ctid, &update_xmax, estate->es_snapshot->curcid, estate->es_crosscheck_snapshot, true /* wait for commit */ ); @@ -1504,14 +1513,17 @@ ldelete:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - else if (!(ItemPointerEquals(tupleid, &ctid))) + else if (!ItemPointerEquals(tupleid, &update_ctid)) { - TupleTableSlot 
*epqslot = EvalPlanQual(estate, - resultRelInfo->ri_RangeTableIndex, &ctid); + TupleTableSlot *epqslot; + epqslot = EvalPlanQual(estate, + resultRelInfo->ri_RangeTableIndex, + &update_ctid, + update_xmax); if (!TupIsNull(epqslot)) { - *tupleid = ctid; + *tupleid = update_ctid; goto ldelete; } } @@ -1558,8 +1570,9 @@ ExecUpdate(TupleTableSlot *slot, HeapTuple tuple; ResultRelInfo *resultRelInfo; Relation resultRelationDesc; - ItemPointerData ctid; HTSU_Result result; + ItemPointerData update_ctid; + TransactionId update_xmax; /* * abort the operation if not running transactions @@ -1627,7 +1640,7 @@ lreplace:; * referential integrity updates in serializable transactions. */ result = heap_update(resultRelationDesc, tupleid, tuple, - &ctid, + &update_ctid, &update_xmax, estate->es_snapshot->curcid, estate->es_crosscheck_snapshot, true /* wait for commit */ ); @@ -1645,14 +1658,17 @@ lreplace:; ereport(ERROR, (errcode(ERRCODE_T_R_SERIALIZATION_FAILURE), errmsg("could not serialize access due to concurrent update"))); - else if (!(ItemPointerEquals(tupleid, &ctid))) + else if (!ItemPointerEquals(tupleid, &update_ctid)) { - TupleTableSlot *epqslot = EvalPlanQual(estate, - resultRelInfo->ri_RangeTableIndex, &ctid); + TupleTableSlot *epqslot; + epqslot = EvalPlanQual(estate, + resultRelInfo->ri_RangeTableIndex, + &update_ctid, + update_xmax); if (!TupIsNull(epqslot)) { - *tupleid = ctid; + *tupleid = update_ctid; slot = ExecFilterJunk(estate->es_junkFilter, epqslot); tuple = ExecMaterializeSlot(slot); goto lreplace; @@ -1791,9 +1807,21 @@ ExecConstraints(ResultRelInfo *resultRelInfo, * under READ COMMITTED rules. * * See backend/executor/README for some info about how this works. + * + * estate - executor state data + * rti - rangetable index of table containing tuple + * *tid - t_ctid from the outdated tuple (ie, next updated version) + * priorXmax - t_xmax from the outdated tuple + * + * *tid is also an output parameter: it's modified to hold the TID of the + * latest version of the tuple (note this may be changed even on failure) + * + * Returns a slot containing the new candidate update/delete tuple, or + * NULL if we determine we shouldn't process the row. */ TupleTableSlot * -EvalPlanQual(EState *estate, Index rti, ItemPointer tid) +EvalPlanQual(EState *estate, Index rti, + ItemPointer tid, TransactionId priorXmax) { evalPlanQual *epq; EState *epqstate; @@ -1837,11 +1865,24 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid) { Buffer buffer; - if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, false, NULL)) + if (heap_fetch(relation, SnapshotDirty, &tuple, &buffer, true, NULL)) { - TransactionId xwait = SnapshotDirty->xmax; + /* + * If xmin isn't what we're expecting, the slot must have been + * recycled and reused for an unrelated tuple. This implies + * that the latest version of the row was deleted, so we need + * do nothing. (Should be safe to examine xmin without getting + * buffer's content lock, since xmin never changes in an existing + * tuple.) + */ + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return NULL; + } - /* xmin should not be dirty... */ + /* otherwise xmin should not be dirty... */ if (TransactionIdIsValid(SnapshotDirty->xmin)) elog(ERROR, "t_xmin is uncommitted in tuple to be updated"); @@ -1849,11 +1890,11 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid) * If tuple is being updated by other transaction then we have * to wait for its commit/abort. 
*/ - if (TransactionIdIsValid(xwait)) + if (TransactionIdIsValid(SnapshotDirty->xmax)) { ReleaseBuffer(buffer); - XactLockTableWait(xwait); - continue; + XactLockTableWait(SnapshotDirty->xmax); + continue; /* loop back to repeat heap_fetch */ } /* @@ -1865,22 +1906,50 @@ EvalPlanQual(EState *estate, Index rti, ItemPointer tid) } /* - * Oops! Invalid tuple. Have to check is it updated or deleted. - * Note that it's possible to get invalid SnapshotDirty->tid if - * tuple updated by this transaction. Have we to check this ? + * If the referenced slot was actually empty, the latest version + * of the row must have been deleted, so we need do nothing. */ - if (ItemPointerIsValid(&(SnapshotDirty->tid)) && - !(ItemPointerEquals(&(tuple.t_self), &(SnapshotDirty->tid)))) + if (tuple.t_data == NULL) { - /* updated, so look at the updated copy */ - tuple.t_self = SnapshotDirty->tid; - continue; + ReleaseBuffer(buffer); + return NULL; } /* - * Deleted or updated by this transaction; forget it. + * As above, if xmin isn't what we're expecting, do nothing. */ - return NULL; + if (!TransactionIdEquals(HeapTupleHeaderGetXmin(tuple.t_data), + priorXmax)) + { + ReleaseBuffer(buffer); + return NULL; + } + + /* + * If we get here, the tuple was found but failed SnapshotDirty. + * Assuming the xmin is either a committed xact or our own xact + * (as it certainly should be if we're trying to modify the tuple), + * this must mean that the row was updated or deleted by either + * a committed xact or our own xact. If it was deleted, we can + * ignore it; if it was updated then chain up to the next version + * and repeat the whole test. + * + * As above, it should be safe to examine xmax and t_ctid without + * the buffer content lock, because they can't be changing. + */ + if (ItemPointerEquals(&tuple.t_self, &tuple.t_data->t_ctid)) + { + /* deleted, so forget about it */ + ReleaseBuffer(buffer); + return NULL; + } + + /* updated, so look at the updated row */ + tuple.t_self = tuple.t_data->t_ctid; + /* updated row should have xmin matching this xmax */ + priorXmax = HeapTupleHeaderGetXmax(tuple.t_data); + ReleaseBuffer(buffer); + /* loop back to fetch next in chain */ } /* diff --git a/src/backend/utils/time/tqual.c b/src/backend/utils/time/tqual.c index 94633249641298025fb43fa5f14388e0d099bbab..f8dcf43b64d7ff28b6077919fb4b10533d9b826a 100644 --- a/src/backend/utils/time/tqual.c +++ b/src/backend/utils/time/tqual.c @@ -32,7 +32,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.89 2005/05/19 21:35:47 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/time/tqual.c,v 1.90 2005/08/20 00:39:57 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -677,14 +677,15 @@ HeapTupleSatisfiesUpdate(HeapTupleHeader tuple, CommandId curcid, * However, we also include the effects of other xacts still in progress. * * Returns extra information in the global variable SnapshotDirty, namely - * xids of concurrent xacts that affected the tuple. Also, the tuple's - * t_ctid (forward link) is returned if it's being updated. + * xids of concurrent xacts that affected the tuple. SnapshotDirty->xmin + * is set to InvalidTransactionId if xmin is either committed good or + * committed dead; or to xmin if that transaction is still in progress. + * Similarly for SnapshotDirty->xmax. 
*/ bool HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Buffer buffer) { SnapshotDirty->xmin = SnapshotDirty->xmax = InvalidTransactionId; - ItemPointerSetInvalid(&(SnapshotDirty->tid)); if (!(tuple->t_infomask & HEAP_XMIN_COMMITTED)) { @@ -781,7 +782,6 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Buffer buffer) { if (tuple->t_infomask & HEAP_IS_LOCKED) return true; - SnapshotDirty->tid = tuple->t_ctid; return false; /* updated by other */ } @@ -824,7 +824,6 @@ HeapTupleSatisfiesDirty(HeapTupleHeader tuple, Buffer buffer) tuple->t_infomask |= HEAP_XMAX_COMMITTED; SetBufferCommitInfoNeedsSave(buffer); - SnapshotDirty->tid = tuple->t_ctid; return false; /* updated by other */ } @@ -1224,10 +1223,13 @@ HeapTupleSatisfiesVacuum(HeapTupleHeader tuple, TransactionId OldestXmin, HeapTupleHeaderGetXmax(tuple))) { /* - * inserter also deleted it, so it was never visible to anyone - * else + * Inserter also deleted it, so it was never visible to anyone + * else. However, we can only remove it early if it's not an + * updated tuple; else its parent tuple is linking to it via t_ctid, + * and this tuple mustn't go away before the parent does. */ - return HEAPTUPLE_DEAD; + if (!(tuple->t_infomask & HEAP_UPDATED)) + return HEAPTUPLE_DEAD; } if (!TransactionIdPrecedes(HeapTupleHeaderGetXmax(tuple), OldestXmin)) diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h index 9c040b0dfd4f11db6cabf3a974cce30dfd548261..3221734a6f6f08c3a21b0894aa6e592c2c1a0689 100644 --- a/src/include/access/heapam.h +++ b/src/include/access/heapam.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.103 2005/08/01 20:31:13 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/heapam.h,v 1.104 2005/08/20 00:39:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -152,19 +152,23 @@ extern bool heap_release_fetch(Relation relation, Snapshot snapshot, HeapTuple tuple, Buffer *userbuf, bool keep_buf, PgStat_Info *pgstat_info); -extern ItemPointer heap_get_latest_tid(Relation relation, Snapshot snapshot, +extern void heap_get_latest_tid(Relation relation, Snapshot snapshot, ItemPointer tid); extern void setLastTid(const ItemPointer tid); extern Oid heap_insert(Relation relation, HeapTuple tup, CommandId cid, bool use_wal, bool use_fsm); -extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, ItemPointer ctid, - CommandId cid, Snapshot crosscheck, bool wait); -extern HTSU_Result heap_update(Relation relation, ItemPointer otid, HeapTuple tup, - ItemPointer ctid, CommandId cid, Snapshot crosscheck, bool wait); -extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tup, - Buffer *userbuf, CommandId cid, - LockTupleMode mode, bool nowait); +extern HTSU_Result heap_delete(Relation relation, ItemPointer tid, + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid, Snapshot crosscheck, bool wait); +extern HTSU_Result heap_update(Relation relation, ItemPointer otid, + HeapTuple newtup, + ItemPointer ctid, TransactionId *update_xmax, + CommandId cid, Snapshot crosscheck, bool wait); +extern HTSU_Result heap_lock_tuple(Relation relation, HeapTuple tuple, + Buffer *buffer, ItemPointer ctid, + TransactionId *update_xmax, CommandId cid, + LockTupleMode mode, bool nowait); extern Oid simple_heap_insert(Relation relation, HeapTuple tup); extern void simple_heap_delete(Relation relation, ItemPointer 
tid); diff --git a/src/include/access/htup.h b/src/include/access/htup.h index e394afd31392a1e96fab99e8f333545b57e0cf1e..abc4dce9b95dfa8756284c0a4d8cd25846d3c0f6 100644 --- a/src/include/access/htup.h +++ b/src/include/access/htup.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.75 2005/06/08 15:50:27 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/htup.h,v 1.76 2005/08/20 00:39:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -80,6 +80,21 @@ * However, with the advent of subtransactions, a tuple may need both Xmax * and Cmin simultaneously, so this is no longer possible. * + * A word about t_ctid: whenever a new tuple is stored on disk, its t_ctid + * is initialized with its own TID (location). If the tuple is ever updated, + * its t_ctid is changed to point to the replacement version of the tuple. + * Thus, a tuple is the latest version of its row iff XMAX is invalid or + * t_ctid points to itself (in which case, if XMAX is valid, the tuple is + * either locked or deleted). One can follow the chain of t_ctid links + * to find the newest version of the row. Beware however that VACUUM might + * erase the pointed-to (newer) tuple before erasing the pointing (older) + * tuple. Hence, when following a t_ctid link, it is necessary to check + * to see if the referenced slot is empty or contains an unrelated tuple. + * Check that the referenced tuple has XMIN equal to the referencing tuple's + * XMAX to verify that it is actually the descendant version and not an + * unrelated tuple stored into a slot recently freed by VACUUM. If either + * check fails, one may assume that there is no live descendant version. + * * Following the fixed header fields, the nulls bitmap is stored (beginning * at t_bits). The bitmap is *not* stored if t_infomask shows that there * are no nulls in the tuple. If an OID field is present (as indicated by @@ -334,18 +349,29 @@ do { \ /* * HeapTupleData is an in-memory data structure that points to a tuple. * - * This new HeapTuple for version >= 6.5 and this is why it was changed: + * There are several ways in which this data structure is used: + * + * * Pointer to a tuple in a disk buffer: t_data points directly into the + * buffer (which the code had better be holding a pin on, but this is not + * reflected in HeapTupleData itself). t_datamcxt must be NULL. + * + * * Pointer to nothing: t_data and t_datamcxt are NULL. This is used as + * a failure indication in some functions. + * + * * Part of a palloc'd tuple: the HeapTupleData itself and the tuple + * form a single palloc'd chunk. t_data points to the memory location + * immediately following the HeapTupleData struct (at offset HEAPTUPLESIZE), + * and t_datamcxt is the containing context. This is used as the output + * format of heap_form_tuple and related routines. * - * 1. t_len moved off on-disk tuple data - ItemIdData is used to get len; - * 2. t_ctid above is not self tuple TID now - it may point to - * updated version of tuple (required by MVCC); - * 3. someday someone let tuple to cross block boundaries - - * he have to add something below... + * * Separately allocated tuple: t_data points to a palloc'd chunk that + * is not adjacent to the HeapTupleData, and t_datamcxt is the context + * containing that chunk. 
* - * Change for 7.0: - * Up to now t_data could be NULL, the memory location directly following - * HeapTupleData, or pointing into a buffer. Now, it could also point to - * a separate allocation that was done in the t_datamcxt memory context. + * t_len should always be valid, except in the pointer-to-nothing case. + * t_self and t_tableOid should be valid if the HeapTupleData points to + * a disk buffer, or if it represents a copy of a tuple on disk. They + * should be explicitly set invalid in manufactured tuples. */ typedef struct HeapTupleData { diff --git a/src/include/executor/executor.h b/src/include/executor/executor.h index 2e42894788eebdeb2f6bb05311d0659cf9904229..6064ff2f4f23eb7e3b686569418146253114c462 100644 --- a/src/include/executor/executor.h +++ b/src/include/executor/executor.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/executor/executor.h,v 1.118 2005/04/16 20:07:35 tgl Exp $ + * $PostgreSQL: pgsql/src/include/executor/executor.h,v 1.119 2005/08/20 00:40:13 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -98,7 +98,7 @@ extern bool ExecContextForcesOids(PlanState *planstate, bool *hasoids); extern void ExecConstraints(ResultRelInfo *resultRelInfo, TupleTableSlot *slot, EState *estate); extern TupleTableSlot *EvalPlanQual(EState *estate, Index rti, - ItemPointer tid); + ItemPointer tid, TransactionId priorXmax); /* * prototypes from functions in execProcnode.c diff --git a/src/include/utils/tqual.h b/src/include/utils/tqual.h index f12ae2233fd29c8b93ad9086270919b25a73fbfd..fa530ed977cc1843485f8af5b3ceaaaa796bbc7d 100644 --- a/src/include/utils/tqual.h +++ b/src/include/utils/tqual.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/tqual.h,v 1.57 2005/05/19 21:35:48 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/tqual.h,v 1.58 2005/08/20 00:40:32 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -42,7 +42,6 @@ typedef struct SnapshotData TransactionId *xip; /* array of xact IDs in progress */ /* note: all ids in xip[] satisfy xmin <= xip[i] < xmax */ CommandId curcid; /* in my xact, CID < curcid are visible */ - ItemPointerData tid; /* required for Dirty snapshot -:( */ } SnapshotData; typedef SnapshotData *Snapshot;
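
Reviewer note (illustrative, not part of the patch): the t_ctid-chasing rule that this commit documents in htup.h and applies in heap_get_latest_tid, EvalPlanQual, and the repair_frag chain walk reduces to one test — after following a t_ctid link, the tuple found there is the true descendant only if its XMIN equals the XMAX remembered from the referencing tuple; otherwise VACUUM has reused the slot for an unrelated tuple and the chain must be treated as ended. A minimal sketch of that test, using only macros that already appear in the patch; the helper name chain_link_is_valid is invented for illustration and does not exist in the tree:

static bool
chain_link_is_valid(TransactionId priorXmax, HeapTupleHeader nextTup)
{
	/*
	 * priorXmax is the XMAX of the tuple whose t_ctid we just followed.
	 * It is InvalidTransactionId when starting from a user-supplied TID,
	 * in which case there is nothing to verify yet.
	 */
	if (!TransactionIdIsValid(priorXmax))
		return true;

	/* Real descendant iff its XMIN matches the referencing tuple's XMAX */
	return TransactionIdEquals(priorXmax, HeapTupleHeaderGetXmin(nextTup));
}

Each of heap_get_latest_tid, EvalPlanQual, and repair_frag open-codes this same comparison in the patch above.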
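
Also illustrative only: heap_get_latest_tid now returns void and treats *tid as an in/out parameter. A short caller sketch under the assumption that rel, snapshot, blkno, and offnum are already in scope:

ItemPointerData tid;

/* start from some known TID, e.g. one supplied by the user */
ItemPointerSet(&tid, blkno, offnum);

heap_get_latest_tid(rel, snapshot, &tid);

/*
 * tid now names the newest version of the row visible to snapshot (pass
 * SnapshotDirty to get the very latest, possibly uncommitted version).
 * It is left unchanged if no version passes the snapshot test, so a
 * caller that cares should remember the starting value.
 */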
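
Finally, a condensed sketch of how the new update_ctid/update_xmax failure outputs are meant to be consumed together, mirroring what ExecUpdate and ExecDelete do in the patch. This assumes rel, otid, newtup, estate, and rti are in scope, and omits the surrounding error reporting and the serializable-isolation check:

HTSU_Result result;
ItemPointerData update_ctid;
TransactionId update_xmax;

result = heap_update(rel, otid, newtup,
					 &update_ctid, &update_xmax,
					 GetCurrentCommandId(), InvalidSnapshot,
					 true /* wait for commit */ );
if (result == HeapTupleUpdated)
{
	if (ItemPointerEquals(otid, &update_ctid))
	{
		/* t_ctid points at itself: the row was deleted, so give up */
	}
	else
	{
		/*
		 * The row was updated: update_ctid is the replacement version,
		 * and update_xmax lets EvalPlanQual verify the chain link.
		 */
		TupleTableSlot *epqslot = EvalPlanQual(estate, rti,
											   &update_ctid, update_xmax);

		if (!TupIsNull(epqslot))
		{
			/* re-run the operation against the re-checked row version */
		}
	}
}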