diff --git a/src/backend/access/transam/multixact.c b/src/backend/access/transam/multixact.c index 2cdfed4945eb1e2eef0d0ec74eb28fd439c3b6f4..9f259bb54ebb22ec6d941a4105d3c7fceb855aee 100644 --- a/src/backend/access/transam/multixact.c +++ b/src/backend/access/transam/multixact.c @@ -45,14 +45,17 @@ * anything we saw during replay. * * We are able to remove segments no longer necessary by carefully tracking - * each table's used values: during vacuum, any multixact older than a - * certain value is removed; the cutoff value is stored in pg_class. - * The minimum value in each database is stored in pg_database, and the - * global minimum is part of pg_control. Any vacuum that is able to - * advance its database's minimum value also computes a new global minimum, - * and uses this value to truncate older segments. When new multixactid - * values are to be created, care is taken that the counter does not - * fall within the wraparound horizon considering the global minimum value. + * each table's used values: during vacuum, any multixact older than a certain + * value is removed; the cutoff value is stored in pg_class. The minimum value + * across all tables in each database is stored in pg_database, and the global + * minimum across all databases is part of pg_control and is kept in shared + * memory. At checkpoint time, once the value is known to be flushed to WAL, + * any files that correspond to multixacts older than that value are removed. + * (These files are also removed when a restartpoint is executed.) + * + * When new multixactid values are to be created, care is taken that the + * counter does not fall within the wraparound horizon considering the global + * minimum value. 
* * Portions Copyright (c) 1996-2014, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California @@ -91,7 +94,7 @@ * Note: because MultiXactOffsets are 32 bits and wrap around at 0xFFFFFFFF, * MultiXact page numbering also wraps around at * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE, and segment numbering at - * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE. We need + * 0xFFFFFFFF/MULTIXACT_OFFSETS_PER_PAGE/SLRU_PAGES_PER_SEGMENT. We need * take no explicit notice of that fact in this module, except when comparing * segment and page numbers in TruncateMultiXact (see * MultiXactOffsetPagePrecedes). @@ -188,16 +191,20 @@ typedef struct MultiXactStateData /* next-to-be-assigned offset */ MultiXactOffset nextOffset; - /* the Offset SLRU area was last truncated at this MultiXactId */ - MultiXactId lastTruncationPoint; - /* - * oldest multixact that is still on disk. Anything older than this - * should not be consulted. + * Oldest multixact that is still on disk. Anything older than this + * should not be consulted. These values are updated by vacuum. */ MultiXactId oldestMultiXactId; Oid oldestMultiXactDB; + /* + * This is what the previous checkpoint stored as the truncate position. + * This value is the oldestMultiXactId that was valid when a checkpoint + * was last executed. + */ + MultiXactId lastCheckpointedOldest; + /* support for anti-wraparound measures */ MultiXactId multiVacLimit; MultiXactId multiWarnLimit; @@ -234,12 +241,20 @@ typedef struct MultiXactStateData * than its own OldestVisibleMXactId[] setting; this is necessary because * the checkpointer could truncate away such data at any instant. * - * The checkpointer can compute the safe truncation point as the oldest - * valid value among all the OldestMemberMXactId[] and - * OldestVisibleMXactId[] entries, or nextMXact if none are valid. 
- * Clearly, it is not possible for any later-computed OldestVisibleMXactId - * value to be older than this, and so there is no risk of truncating data - * that is still needed. + * The oldest valid value among all of the OldestMemberMXactId[] and + * OldestVisibleMXactId[] entries is considered by vacuum as the earliest + * possible value still having any live member transaction. Subtracting + * vacuum_multixact_freeze_min_age from that value we obtain the freezing + * point for multixacts for that table. Any value older than that is + * removed from tuple headers (or "frozen"; see FreezeMultiXactId. Note + * that multis that have member xids that are older than the cutoff point + * for xids must also be frozen, even if the multis themselves are newer + * than the multixid cutoff point). Whenever a full table vacuum happens, + * the freezing point so computed is used as the new pg_class.relminmxid + * value. The minimum of all those values in a database is stored as + * pg_database.datminmxid. In turn, the minimum of all of those values is + * stored in pg_control and used as truncation point for pg_multixact. At + * checkpoint or restartpoint, unneeded segments are removed. */ MultiXactId perBackendXactIds[1]; /* VARIABLE LENGTH ARRAY */ } MultiXactStateData; @@ -1121,8 +1136,8 @@ GetMultiXactIdMembers(MultiXactId multi, MultiXactMember **members, * We check known limits on MultiXact before resorting to the SLRU area. * * An ID older than MultiXactState->oldestMultiXactId cannot possibly be - * useful; it should have already been removed by vacuum. We've truncated - * the on-disk structures anyway. Returning the wrong values could lead + * useful; it has already been removed, or will be removed shortly, by + * truncation. Returning the wrong values could lead * to an incorrect visibility result. 
However, to support pg_upgrade we * need to allow an empty set to be returned regardless, if the caller is * willing to accept it; the caller is expected to check that it's an @@ -1932,14 +1947,14 @@ TrimMultiXact(void) LWLockAcquire(MultiXactOffsetControlLock, LW_EXCLUSIVE); /* - * (Re-)Initialize our idea of the latest page number. + * (Re-)Initialize our idea of the latest page number for offsets. */ pageno = MultiXactIdToOffsetPage(multi); MultiXactOffsetCtl->shared->latest_page_number = pageno; /* * Zero out the remainder of the current offsets page. See notes in - * StartupCLOG() for motivation. + * TrimCLOG() for motivation. */ entryno = MultiXactIdToOffsetEntry(multi); if (entryno != 0) @@ -1962,7 +1977,7 @@ TrimMultiXact(void) LWLockAcquire(MultiXactMemberControlLock, LW_EXCLUSIVE); /* - * (Re-)Initialize our idea of the latest page number. + * (Re-)Initialize our idea of the latest page number for members. */ pageno = MXOffsetToMemberPage(offset); MultiXactMemberCtl->shared->latest_page_number = pageno; @@ -2240,6 +2255,18 @@ MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB) SetMultiXactIdLimit(oldestMulti, oldestMultiDB); } +/* + * Update the "safe truncation point" (lastCheckpointedOldest): the newest + * oldestMulti value known to be flushed as part of a checkpoint record. + */ +void +MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti) +{ + LWLockAcquire(MultiXactGenLock, LW_EXCLUSIVE); + MultiXactState->lastCheckpointedOldest = safeTruncateMulti; + LWLockRelease(MultiXactGenLock); +} + /* * Make sure that MultiXactOffset has room for a newly-allocated MultiXactId. * @@ -2478,25 +2505,31 @@ SlruScanDirCbFindEarliest(SlruCtl ctl, char *filename, int segpage, void *data) * Remove all MultiXactOffset and MultiXactMember segments before the oldest * ones still of interest. 
* - * On a primary, this is called by vacuum after it has successfully advanced a - * database's datminmxid value; the cutoff value we're passed is the minimum of - * all databases' datminmxid values. - * - * During crash recovery, it's called from CreateRestartPoint() instead. We - * rely on the fact that xlog_redo() will already have called - * MultiXactAdvanceOldest(). Our latest_page_number will already have been - * initialized by StartupMultiXact() and kept up to date as new pages are - * zeroed. + * On a primary, this is called by the checkpointer process after a checkpoint + * has been flushed; during crash recovery, it's called from + * CreateRestartPoint(). In the latter case, we rely on the fact that + * xlog_redo() will already have called MultiXactAdvanceOldest(). Our + * latest_page_number will already have been initialized by StartupMultiXact() + * and kept up to date as new pages are zeroed. */ void -TruncateMultiXact(MultiXactId oldestMXact) +TruncateMultiXact(void) { + MultiXactId oldestMXact; MultiXactOffset oldestOffset; MultiXactOffset nextOffset; mxtruncinfo trunc; MultiXactId earliest; MembersLiveRange range; + Assert(AmCheckpointerProcess() || AmStartupProcess() || + !IsPostmasterEnvironment); + + LWLockAcquire(MultiXactGenLock, LW_SHARED); + oldestMXact = MultiXactState->lastCheckpointedOldest; + LWLockRelease(MultiXactGenLock); + Assert(MultiXactIdIsValid(oldestMXact)); + /* * Note we can't just plow ahead with the truncation; it's possible that * there are no segments to truncate, which is a problem because we are @@ -2507,6 +2540,8 @@ TruncateMultiXact(MultiXactId oldestMXact) trunc.earliestExistingPage = -1; SlruScanDirectory(MultiXactOffsetCtl, SlruScanDirCbFindEarliest, &trunc); earliest = trunc.earliestExistingPage * MULTIXACT_OFFSETS_PER_PAGE; + if (earliest < FirstMultiXactId) + earliest = FirstMultiXactId; /* nothing to do */ if (MultiXactIdPrecedes(oldestMXact, earliest)) @@ -2514,8 +2549,7 @@ TruncateMultiXact(MultiXactId 
oldestMXact) /* * First, compute the safe truncation point for MultiXactMember. This is - * the starting offset of the multixact we were passed as MultiXactOffset - * cutoff. + * the starting offset of the oldest multixact. */ { int pageno; @@ -2538,10 +2572,6 @@ TruncateMultiXact(MultiXactId oldestMXact) LWLockRelease(MultiXactOffsetControlLock); } - /* truncate MultiXactOffset */ - SimpleLruTruncate(MultiXactOffsetCtl, - MultiXactIdToOffsetPage(oldestMXact)); - /* * To truncate MultiXactMembers, we need to figure out the active page * range and delete all files outside that range. The start point is the @@ -2559,6 +2589,11 @@ TruncateMultiXact(MultiXactId oldestMXact) range.rangeEnd = MXOffsetToMemberPage(nextOffset); SlruScanDirectory(MultiXactMemberCtl, SlruScanDirCbRemoveMembers, &range); + + /* Now we can truncate MultiXactOffset */ + SimpleLruTruncate(MultiXactOffsetCtl, + MultiXactIdToOffsetPage(oldestMXact)); + } /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index abc5682e7f938afe4d2a94bd19f79111ea070346..e5640793eb8e09355b09d13cbdb68d3a9773ae6b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6264,6 +6264,7 @@ StartupXLOG(void) MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + MultiXactSetSafeTruncate(checkPoint.oldestMulti); XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch; XLogCtl->ckptXid = checkPoint.nextXid; @@ -8272,6 +8273,12 @@ CreateCheckPoint(int flags) */ END_CRIT_SECTION(); + /* + * Now that the checkpoint is safely on disk, we can update the point to + * which multixact can be truncated. + */ + MultiXactSetSafeTruncate(checkPoint.oldestMulti); + /* * Let smgr do post-checkpoint cleanup (eg, deleting old files). 
*/ @@ -8305,6 +8312,11 @@ CreateCheckPoint(int flags) if (!RecoveryInProgress()) TruncateSUBTRANS(GetOldestXmin(NULL, false)); + /* + * Truncate pg_multixact too. + */ + TruncateMultiXact(); + /* Real work is done, but log and update stats before releasing lock. */ LogCheckpointEnd(false); @@ -8578,21 +8590,6 @@ CreateRestartPoint(int flags) } LWLockRelease(ControlFileLock); - /* - * Due to an historical accident multixact truncations are not WAL-logged, - * but just performed everytime the mxact horizon is increased. So, unless - * we explicitly execute truncations on a standby it will never clean out - * /pg_multixact which obviously is bad, both because it uses space and - * because we can wrap around into pre-existing data... - * - * We can only do the truncation here, after the UpdateControlFile() - * above, because we've now safely established a restart point, that - * guarantees we will not need need to access those multis. - * - * It's probably worth improving this. - */ - TruncateMultiXact(lastCheckPoint.oldestMulti); - /* * Delete old log files (those no longer needed even for previous * checkpoint/restartpoint) to prevent the disk holding the xlog from @@ -8651,6 +8648,21 @@ CreateRestartPoint(int flags) ThisTimeLineID = 0; } + /* + * Due to an historical accident multixact truncations are not WAL-logged, + * but just performed every time the mxact horizon is increased. So, unless + * we explicitly execute truncations on a standby it will never clean out + * /pg_multixact which obviously is bad, both because it uses space and + * because we can wrap around into pre-existing data... + * + * We can only do the truncation here, after the UpdateControlFile() + * above, because we've now safely established a restart point. That + * guarantees we will not need to access those multis. + * + * It's probably worth improving this. + */ + TruncateMultiXact(); + /* * Truncate pg_subtrans if possible. 
We can throw away all data before * the oldest XMIN of any running transaction. No future transaction will @@ -9117,6 +9129,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) checkPoint.nextMultiOffset); SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB); SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + MultiXactSetSafeTruncate(checkPoint.oldestMulti); /* * If we see a shutdown checkpoint while waiting for an end-of-backup @@ -9217,6 +9230,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record) checkPoint.oldestXidDB); MultiXactAdvanceOldest(checkPoint.oldestMulti, checkPoint.oldestMultiDB); + MultiXactSetSafeTruncate(checkPoint.oldestMulti); /* ControlFile->checkPointCopy always tracks the latest ckpt XID */ ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch; diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c index 3d2c73902c6ef73460a98e64b30a85e962280b44..8822a154dccee7552403b1b6bb847c741219b9a4 100644 --- a/src/backend/commands/vacuum.c +++ b/src/backend/commands/vacuum.c @@ -969,9 +969,11 @@ vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti) return; } - /* Truncate CLOG and Multi to the oldest computed value */ + /* + * Truncate CLOG to the oldest computed value. Note we don't truncate + * multixacts; that will be done by the next checkpoint. + */ TruncateCLOG(frozenXID); - TruncateMultiXact(minMulti); /* * Update the wrap limit for GetNewTransactionId and creation of new @@ -980,7 +982,7 @@ vac_truncate_clog(TransactionId frozenXID, MultiXactId minMulti) * signalling twice? 
*/ SetTransactionIdLimit(frozenXID, oldestxid_datoid); - MultiXactAdvanceOldest(minMulti, minmulti_datoid); + SetMultiXactIdLimit(minMulti, minmulti_datoid); } diff --git a/src/include/access/multixact.h b/src/include/access/multixact.h index 448ec100d393399e1357322fe7085491fa8d749d..f6d2e0418b191fad2e0d098062b1e5af3827223f 100644 --- a/src/include/access/multixact.h +++ b/src/include/access/multixact.h @@ -119,12 +119,13 @@ extern void MultiXactGetCheckptMulti(bool is_shutdown, Oid *oldestMultiDB); extern void CheckPointMultiXact(void); extern MultiXactId GetOldestMultiXactId(void); -extern void TruncateMultiXact(MultiXactId cutoff_multi); +extern void TruncateMultiXact(void); extern void MultiXactSetNextMXact(MultiXactId nextMulti, MultiXactOffset nextMultiOffset); extern void MultiXactAdvanceNextMXact(MultiXactId minMulti, MultiXactOffset minMultiOffset); extern void MultiXactAdvanceOldest(MultiXactId oldestMulti, Oid oldestMultiDB); +extern void MultiXactSetSafeTruncate(MultiXactId safeTruncateMulti); extern void multixact_twophase_recover(TransactionId xid, uint16 info, void *recdata, uint32 len);