diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index fb5d6473efee9cb8f916cae32b917fb7872be7ae..dc63d7d5e40d1775340868b069f1a789321d876c 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -8184,6 +8184,38 @@ LOG: CleanUpLock: deleting: lock(0xb7acd844) id(24688,24696,0,0,0,1) </listitem> </varlistentry> + <varlistentry id="guc-wal-consistency-checking" xreflabel="wal_consistency_checking"> + <term><varname>wal_consistency_checking</varname> (<type>string</type>) + <indexterm> + <primary><varname>wal_consistency_checking</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + This parameter is intended to be used to check for bugs in the WAL + redo routines. When enabled, full-page images of any buffers modified + in conjunction with the WAL record are added to the record. + If the record is subsequently replayed, the system will first apply + each record and then test whether the buffers modified by the record + match the stored images. In certain cases (such as hint bits), minor + variations are acceptable, and will be ignored. Any unexpected + differences will result in a fatal error, terminating recovery. + </para> + + <para> + The default value of this setting is the empty string, which disables + the feature. It can be set to <literal>all</literal> to check all + records, or to a comma-separated list of resource managers to check + only records originating from those resource managers. Currently, + the supported resource managers are <literal>heap</>, + <literal>heap2</>, <literal>btree</>, <literal>gin</>, + <literal>gist</>, <literal>sequence</>, <literal>spgist</>, + <literal>brin</>, and <literal>generic</>. Only + superusers can change this setting. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-wal-debug" xreflabel="wal_debug"> <term><varname>wal_debug</varname> (<type>boolean</type>) <indexterm> diff --git a/src/backend/access/brin/brin_xlog.c b/src/backend/access/brin/brin_xlog.c index b698c9b58c597c5acdea7511ff333393a507d3fa..f416bacc3f7e3816cb2cd22db8f533e4f1aa3dcd 100644 --- a/src/backend/access/brin/brin_xlog.c +++ b/src/backend/access/brin/brin_xlog.c @@ -13,6 +13,7 @@ #include "access/brin_page.h" #include "access/brin_pageops.h" #include "access/brin_xlog.h" +#include "access/bufmask.h" #include "access/xlogutils.h" @@ -279,3 +280,22 @@ brin_redo(XLogReaderState *record) elog(PANIC, "brin_redo: unknown op code %u", info); } } + +/* + * Mask a BRIN page before doing consistency checks. + */ +void +brin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + + if (BRIN_IS_REGULAR_PAGE(page)) + { + /* Regular brin pages contain unused space which needs to be masked. */ + mask_unused_space(page); + } +} diff --git a/src/backend/access/common/Makefile b/src/backend/access/common/Makefile index d4b8132a9739f430f1acb18934383cd64a68fb1d..fb27944b891662a88c0724119fa8c900c678dfa1 100644 --- a/src/backend/access/common/Makefile +++ b/src/backend/access/common/Makefile @@ -12,7 +12,7 @@ subdir = src/backend/access/common top_builddir = ../../../.. include $(top_builddir)/src/Makefile.global -OBJS = heaptuple.o indextuple.o printsimple.o printtup.o reloptions.o \ - scankey.o tupconvert.o tupdesc.o +OBJS = bufmask.o heaptuple.o indextuple.o printsimple.o printtup.o \ + reloptions.o scankey.o tupconvert.o tupdesc.o include $(top_srcdir)/src/backend/common.mk diff --git a/src/backend/access/common/bufmask.c b/src/backend/access/common/bufmask.c new file mode 100644 index 0000000000000000000000000000000000000000..3b06115e03585898a59be95c917cf337721e3bee --- /dev/null +++ b/src/backend/access/common/bufmask.c @@ -0,0 +1,128 @@ +/*------------------------------------------------------------------------- + * + * bufmask.c + * Routines for buffer masking. Used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. + * + * Portions Copyright (c) 2016, PostgreSQL Global Development Group + * + * Contains common routines required for masking a page. + * + * IDENTIFICATION + * src/backend/storage/buffer/bufmask.c + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/bufmask.h" + +/* + * mask_page_lsn + * + * In consistency checks, the LSN of the two pages compared will likely be + * different because of concurrent operations when the WAL is generated + * and the state of the page when WAL is applied. + */ +void +mask_page_lsn(Page page) +{ + PageHeader phdr = (PageHeader) page; + + PageXLogRecPtrSet(phdr->pd_lsn, (uint64) MASK_MARKER); +} + +/* + * mask_page_hint_bits + * + * Mask hint bits in PageHeader. We want to ignore differences in hint bits, + * since they can be set without emitting any WAL. + */ +void +mask_page_hint_bits(Page page) +{ + PageHeader phdr = (PageHeader) page; + + /* Ignore prune_xid (it's like a hint-bit) */ + phdr->pd_prune_xid = MASK_MARKER; + + /* Ignore PD_PAGE_FULL and PD_HAS_FREE_LINES flags, they are just hints. */ + PageClearFull(page); + PageClearHasFreeLinePointers(page); + + /* + * During replay, if the page LSN has advanced past our XLOG record's LSN, + * we don't mark the page all-visible. See heap_xlog_visible() for + * details. + */ + PageClearAllVisible(page); +} + +/* + * mask_unused_space + * + * Mask the unused space of a page between pd_lower and pd_upper. + */ +void +mask_unused_space(Page page) +{ + int pd_lower = ((PageHeader) page)->pd_lower; + int pd_upper = ((PageHeader) page)->pd_upper; + int pd_special = ((PageHeader) page)->pd_special; + + /* Sanity check */ + if (pd_lower > pd_upper || pd_special < pd_upper || + pd_lower < SizeOfPageHeaderData || pd_special > BLCKSZ) + { + elog(ERROR, "invalid page pd_lower %u pd_upper %u pd_special %u\n", + pd_lower, pd_upper, pd_special); + } + + memset(page + pd_lower, MASK_MARKER, pd_upper - pd_lower); +} + +/* + * mask_lp_flags + * + * In some index AMs, line pointer flags can be modified in master without + * emitting any WAL record. + */ +void +mask_lp_flags(Page page) +{ + OffsetNumber offnum, + maxoff; + + maxoff = PageGetMaxOffsetNumber(page); + for (offnum = FirstOffsetNumber; + offnum <= maxoff; + offnum = OffsetNumberNext(offnum)) + { + ItemId itemId = PageGetItemId(page, offnum); + + if (ItemIdIsUsed(itemId)) + itemId->lp_flags = LP_UNUSED; + } +} + +/* + * mask_page_content + * + * In some index AMs, the contents of deleted pages need to be almost + * completely ignored. + */ +void +mask_page_content(Page page) +{ + /* Mask Page Content */ + memset(page + SizeOfPageHeaderData, MASK_MARKER, + BLCKSZ - SizeOfPageHeaderData); + + /* Mask pd_lower and pd_upper */ + memset(&((PageHeader) page)->pd_lower, MASK_MARKER, + sizeof(uint16)); + memset(&((PageHeader) page)->pd_upper, MASK_MARKER, + sizeof(uint16)); +} diff --git a/src/backend/access/gin/ginxlog.c b/src/backend/access/gin/ginxlog.c index 8468fe825cf906ceabaeecf597c64142bd6cf86b..2995e7b06a7d17024f6a86738359035b2982980d 100644 --- a/src/backend/access/gin/ginxlog.c +++ b/src/backend/access/gin/ginxlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/gin_private.h" #include "access/xlogutils.h" #include "utils/memutils.h" @@ -758,3 +759,34 @@ gin_xlog_cleanup(void) MemoryContextDelete(opCtx); opCtx = NULL; } + +/* + * Mask a GIN page before running consistency checks on it. + */ +void +gin_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + GinPageOpaque opaque; + + mask_page_lsn(page); + opaque = GinPageGetOpaque(page); + + mask_page_hint_bits(page); + + /* + * GIN metapage doesn't use pd_lower/pd_upper. Other page types do. Hence, + * we need to apply masking for those pages. + */ + if (opaque->flags != GIN_META) + { + /* + * For GIN_DELETED page, the page is initialized to empty. Hence, mask + * the page content. + */ + if (opaque->flags & GIN_DELETED) + mask_page_content(page); + else + mask_unused_space(page); + } +} diff --git a/src/backend/access/gist/gistxlog.c b/src/backend/access/gist/gistxlog.c index 88b97a4e48740263121e467da2872187de5d041c..cbda9e705cc5ae80c2767aaf200acc93ab6126c1 100644 --- a/src/backend/access/gist/gistxlog.c +++ b/src/backend/access/gist/gistxlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/gist_private.h" #include "access/xloginsert.h" #include "access/xlogutils.h" @@ -342,6 +343,48 @@ gist_xlog_cleanup(void) MemoryContextDelete(opCtx); } +/* + * Mask a Gist page before running consistency checks on it. + */ +void +gist_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + /* + * NSN is nothing but a special purpose LSN. Hence, mask it for the same + * reason as mask_page_lsn. + */ + GistPageSetNSN(page, (uint64) MASK_MARKER); + + /* + * We update F_FOLLOW_RIGHT flag on the left child after writing WAL + * record. Hence, mask this flag. See gistplacetopage() for details. + */ + GistMarkFollowRight(page); + + if (GistPageIsLeaf(page)) + { + /* + * In gist leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * gistkillitems() for details. + */ + mask_lp_flags(page); + } + + /* + * During gist redo, we never mark a page as garbage. Hence, mask it to + * ignore any differences. + */ + GistClearPageHasGarbage(page); +} + /* * Write WAL record of a page split. */ diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 5fd7f1e1a200da7a4646e4de7c971fad80bd5b41..0be48fb3ee112cb97888851053f946a671137b02 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -38,6 +38,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/heapam.h" #include "access/heapam_xlog.h" #include "access/hio.h" @@ -9142,3 +9143,81 @@ heap_sync(Relation rel) heap_close(toastrel, AccessShareLock); } } + +/* + * Mask a heap page before performing consistency checks on it. + */ +void +heap_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + OffsetNumber off; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + for (off = 1; off <= PageGetMaxOffsetNumber(page); off++) + { + ItemId iid = PageGetItemId(page, off); + char *page_item; + + page_item = (char *) (page + ItemIdGetOffset(iid)); + + if (ItemIdIsNormal(iid)) + { + + HeapTupleHeader page_htup = (HeapTupleHeader) page_item; + + /* + * If xmin of a tuple is not yet frozen, we should ignore + * differences in hint bits, since they can be set without + * emitting WAL. + */ + if (!HeapTupleHeaderXminFrozen(page_htup)) + page_htup->t_infomask &= ~HEAP_XACT_MASK; + else + { + /* Still we need to mask xmax hint bits. */ + page_htup->t_infomask &= ~HEAP_XMAX_INVALID; + page_htup->t_infomask &= ~HEAP_XMAX_COMMITTED; + } + + /* + * During replay, we set Command Id to FirstCommandId. Hence, mask + * it. See heap_xlog_insert() for details. + */ + page_htup->t_choice.t_heap.t_field3.t_cid = MASK_MARKER; + + /* + * For a speculative tuple, heap_insert() does not set ctid in the + * caller-passed heap tuple itself, leaving the ctid field to + * contain a speculative token value - a per-backend monotonically + * increasing identifier. Besides, it does not WAL-log ctid under + * any circumstances. + * + * During redo, heap_xlog_insert() sets t_ctid to current block + * number and self offset number. It doesn't care about any + * speculative insertions in master. Hence, we set t_ctid to + * current block number and self offset number to ignore any + * inconsistency. + */ + if (HeapTupleHeaderIsSpeculative(page_htup)) + ItemPointerSet(&page_htup->t_ctid, blkno, off); + } + + /* + * Ignore any padding bytes after the tuple, when the length of the + * item is not MAXALIGNed. + */ + if (ItemIdHasStorage(iid)) + { + int len = ItemIdGetLength(iid); + int padlen = MAXALIGN(len) - len; + + if (padlen > 0) + memset(page_item + len, MASK_MARKER, padlen); + } + } +} diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index efad745c57e429c09ff23de48030aa9324e70fba..a9ca279d8138dd1d32a80796e260b8dd35055d96 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/heapam_xlog.h" #include "access/nbtree.h" #include "access/transam.h" @@ -1028,3 +1029,52 @@ btree_redo(XLogReaderState *record) elog(PANIC, "btree_redo: unknown op code %u", info); } } + +/* + * Mask a btree page before performing consistency checks on it. + */ +void +btree_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + BTPageOpaque maskopaq; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + mask_unused_space(page); + + maskopaq = (BTPageOpaque) PageGetSpecialPointer(page); + + if (P_ISDELETED(maskopaq)) + { + /* + * Mask page content on a DELETED page since it will be re-initialized + * during replay. See btree_xlog_unlink_page() for details. + */ + mask_page_content(page); + } + else if (P_ISLEAF(maskopaq)) + { + /* + * In btree leaf pages, it is possible to modify the LP_FLAGS without + * emitting any WAL record. Hence, mask the line pointer flags. See + * _bt_killitems(), _bt_check_unique() for details. + */ + mask_lp_flags(page); + } + + /* + * BTP_HAS_GARBAGE is just an un-logged hint bit. So, mask it. See + * _bt_killitems(), _bt_check_unique() for details. + */ + maskopaq->btpo_flags &= ~BTP_HAS_GARBAGE; + + /* + * During replay of a btree page split, we don't set the BTP_SPLIT_END + * flag of the right sibling and initialize the cycle_id to 0 for the same + * page. See btree_xlog_split() for details. + */ + maskopaq->btpo_flags &= ~BTP_SPLIT_END; + maskopaq->btpo_cycleid = 0; +} diff --git a/src/backend/access/rmgrdesc/gindesc.c b/src/backend/access/rmgrdesc/gindesc.c index 9e488b359af017c4cd84721412fe9649735ad61a..d4ed7f9c0ab845e5de489b08e5eacf26c30c9cd9 100644 --- a/src/backend/access/rmgrdesc/gindesc.c +++ b/src/backend/access/rmgrdesc/gindesc.c @@ -105,7 +105,12 @@ gin_desc(StringInfo buf, XLogReaderState *record) leftChildBlkno, rightChildBlkno); } if (XLogRecHasBlockImage(record, 0)) - appendStringInfoString(buf, " (full page image)"); + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } else { char *payload = XLogRecGetBlockData(record, 0, NULL); @@ -145,7 +150,12 @@ gin_desc(StringInfo buf, XLogReaderState *record) case XLOG_GIN_VACUUM_DATA_LEAF_PAGE: { if (XLogRecHasBlockImage(record, 0)) - appendStringInfoString(buf, " (full page image)"); + { + if (XLogRecBlockImageApply(record, 0)) + appendStringInfoString(buf, " (full page image)"); + else + appendStringInfoString(buf, " (full page image, for WAL verification)"); + } else { ginxlogVacuumDataLeafPage *xlrec = diff --git a/src/backend/access/spgist/spgxlog.c b/src/backend/access/spgist/spgxlog.c index 3dc6a5ab88131da8c1486c8f40cd9064d685da92..596b266ba64b8cd587ba5763734e6cde58c7a5d2 100644 --- a/src/backend/access/spgist/spgxlog.c +++ b/src/backend/access/spgist/spgxlog.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/spgist_private.h" #include "access/transam.h" #include "access/xlog.h" @@ -1023,3 +1024,23 @@ spg_xlog_cleanup(void) MemoryContextDelete(opCtx); opCtx = NULL; } + +/* + * Mask a SpGist page before performing consistency checks on it. + */ +void +spg_mask(char *pagedata, BlockNumber blkno) +{ + Page page = (Page) pagedata; + + mask_page_lsn(page); + + mask_page_hint_bits(page); + + /* + * Any SpGist page other than meta contains unused space which needs to be + * masked. + */ + if (!SpGistPageIsMeta(page)) + mask_unused_space(page); +} diff --git a/src/backend/access/transam/generic_xlog.c b/src/backend/access/transam/generic_xlog.c index eddec9bc548176edb29d1bf7be9eb4d488b589ba..fbc6810c2f28c285cebf6e171831a8fc4b92f710 100644 --- a/src/backend/access/transam/generic_xlog.c +++ b/src/backend/access/transam/generic_xlog.c @@ -13,6 +13,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/generic_xlog.h" #include "access/xlogutils.h" #include "miscadmin.h" @@ -533,3 +534,14 @@ generic_redo(XLogReaderState *record) UnlockReleaseBuffer(buffers[block_id]); } } + +/* + * Mask a generic page before performing consistency checks on it. + */ +void +generic_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn(page); + + mask_unused_space(page); +} diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c index 9bb136218d579f01739dd562b356eb0e135bf8a2..eae75242fea82c5a5c438aa0e48ad5e3f63b67db 100644 --- a/src/backend/access/transam/rmgr.c +++ b/src/backend/access/transam/rmgr.c @@ -30,8 +30,8 @@ #include "utils/relmapper.h" /* must be kept in sync with RmgrData definition in xlog_internal.h */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ - { name, redo, desc, identify, startup, cleanup }, +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ + { name, redo, desc, identify, startup, cleanup, mask }, const RmgrData RmgrTable[RM_MAX_ID + 1] = { #include "access/rmgrlist.h" diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 2f5d6030660e8fc17bbda82ce5e9eb1e984800b1..cc8b83fa8d636cc64cf2fd462024dc5f182cc4af 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -95,6 +95,8 @@ bool EnableHotStandby = false; bool fullPageWrites = true; bool wal_log_hints = false; bool wal_compression = false; +char *wal_consistency_checking_string = NULL; +bool *wal_consistency_checking = NULL; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; @@ -245,6 +247,10 @@ bool InArchiveRecovery = false; /* Was the last xlog file restored from archive, or local? */ static bool restoredFromArchive = false; +/* Buffers dedicated to consistency checks of size BLCKSZ */ +static char *replay_image_masked = NULL; +static char *master_image_masked = NULL; + /* options taken from recovery.conf for archive recovery */ char *recoveryRestoreCommand = NULL; static char *recoveryEndCommand = NULL; @@ -903,6 +909,7 @@ static char *GetXLogBuffer(XLogRecPtr ptr); static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos); static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos); static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr); +static void checkXLogConsistency(XLogReaderState *record); static void WALInsertLockAcquire(void); static void WALInsertLockAcquireExclusive(void); @@ -1314,6 +1321,103 @@ ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr) return true; } +/* + * Checks whether the current buffer page and backup page stored in the + * WAL record are consistent or not. Before comparing the two pages, a + * masking can be applied to the pages to ignore certain areas like hint bits, + * unused space between pd_lower and pd_upper among other things. This + * function should be called once WAL replay has been completed for a + * given record. + */ +static void +checkXLogConsistency(XLogReaderState *record) +{ + RmgrId rmid = XLogRecGetRmid(record); + RelFileNode rnode; + ForkNumber forknum; + BlockNumber blkno; + int block_id; + + /* Records with no backup blocks have no need for consistency checks. */ + if (!XLogRecHasAnyBlockRefs(record)) + return; + + Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0); + + for (block_id = 0; block_id <= record->max_block_id; block_id++) + { + Buffer buf; + Page page; + + if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno)) + { + /* + * WAL record doesn't contain a block reference with the given id. + * Do nothing. + */ + continue; + } + + Assert(XLogRecHasBlockImage(record, block_id)); + + /* + * Read the contents from the current buffer and store it in a + * temporary page. + */ + buf = XLogReadBufferExtended(rnode, forknum, blkno, + RBM_NORMAL_NO_LOG); + if (!BufferIsValid(buf)) + continue; + + LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE); + page = BufferGetPage(buf); + + /* + * Take a copy of the local page where WAL has been applied to have a + * comparison base before masking it... + */ + memcpy(replay_image_masked, page, BLCKSZ); + + /* No need for this page anymore now that a copy is in. */ + UnlockReleaseBuffer(buf); + + /* + * If the block LSN is already ahead of this WAL record, we can't + * expect contents to match. This can happen if recovery is restarted. + */ + if (PageGetLSN(replay_image_masked) > record->EndRecPtr) + continue; + + /* + * Read the contents from the backup copy, stored in WAL record and + * store it in a temporary page. There is not need to allocate a new + * page here, a local buffer is fine to hold its contents and a mask + * can be directly applied on it. + */ + if (!RestoreBlockImage(record, block_id, master_image_masked)) + elog(ERROR, "failed to restore block image"); + + /* + * If masking function is defined, mask both the master and replay + * images + */ + if (RmgrTable[rmid].rm_mask != NULL) + { + RmgrTable[rmid].rm_mask(replay_image_masked, blkno); + RmgrTable[rmid].rm_mask(master_image_masked, blkno); + } + + /* Time to compare the master and replay images. */ + if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0) + { + elog(FATAL, + "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u", + rnode.spcNode, rnode.dbNode, rnode.relNode, + forknum, blkno); + } + } +} + /* * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved * area in the WAL. @@ -6200,6 +6304,13 @@ StartupXLOG(void) errdetail("Failed while allocating an XLog reading processor."))); xlogreader->system_identifier = ControlFile->system_identifier; + /* + * Allocate pages dedicated to WAL consistency checks, those had better + * be aligned. + */ + replay_image_masked = (char *) palloc(BLCKSZ); + master_image_masked = (char *) palloc(BLCKSZ); + if (read_backup_label(&checkPointLoc, &backupEndRequired, &backupFromStandby)) { @@ -7000,6 +7111,15 @@ StartupXLOG(void) /* Now apply the WAL record itself */ RmgrTable[record->xl_rmid].rm_redo(xlogreader); + /* + * After redo, check whether the backup pages associated with + * the WAL record are consistent with the existing pages. This + * check is done only if consistency check is enabled for this + * record. + */ + if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0) + checkXLogConsistency(xlogreader); + /* Pop the error context stack */ error_context_stack = errcallback.previous; diff --git a/src/backend/access/transam/xloginsert.c b/src/backend/access/transam/xloginsert.c index a5aa58d845d587f15be2b03cdfc998bd6c9d9388..797e68cd901ea020eb6fe0c3df261cdbafcb86d5 100644 --- a/src/backend/access/transam/xloginsert.c +++ b/src/backend/access/transam/xloginsert.c @@ -421,10 +421,12 @@ XLogInsert(RmgrId rmid, uint8 info) elog(ERROR, "XLogBeginInsert was not called"); /* - * The caller can set rmgr bits and XLR_SPECIAL_REL_UPDATE; the rest are - * reserved for use by me. + * The caller can set rmgr bits, XLR_SPECIAL_REL_UPDATE and + * XLR_CHECK_CONSISTENCY; the rest are reserved for use by me. */ - if ((info & ~(XLR_RMGR_INFO_MASK | XLR_SPECIAL_REL_UPDATE)) != 0) + if ((info & ~(XLR_RMGR_INFO_MASK | + XLR_SPECIAL_REL_UPDATE | + XLR_CHECK_CONSISTENCY)) != 0) elog(PANIC, "invalid xlog info mask %02X", info); TRACE_POSTGRESQL_XLOG_INSERT(rmid, info); @@ -504,6 +506,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, rdt_datas_last = &hdr_rdt; hdr_rdt.data = hdr_scratch; + /* + * Enforce consistency checks for this record if user is looking for + * it. Do this before at the beginning of this routine to give the + * possibility for callers of XLogInsert() to pass XLR_CHECK_CONSISTENCY + * directly for a record. + */ + if (wal_consistency_checking[rmid]) + info |= XLR_CHECK_CONSISTENCY; + /* * Make an rdata chain containing all the data portions of all block * references. This includes the data for full-page images. Also append @@ -520,6 +531,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, XLogRecordBlockCompressHeader cbimg = {0}; bool samerel; bool is_compressed = false; + bool include_image; if (!regbuf->in_use) continue; @@ -563,7 +575,14 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, if ((regbuf->flags & REGBUF_WILL_INIT) == REGBUF_WILL_INIT) bkpb.fork_flags |= BKPBLOCK_WILL_INIT; - if (needs_backup) + /* + * If needs_backup is true or WAL checking is enabled for + * current resource manager, log a full-page write for the current + * block. + */ + include_image = needs_backup || (info & XLR_CHECK_CONSISTENCY) != 0; + + if (include_image) { Page page = regbuf->page; uint16 compressed_len; @@ -625,6 +644,15 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, bimg.bimg_info = (cbimg.hole_length == 0) ? 0 : BKPIMAGE_HAS_HOLE; + /* + * If WAL consistency checking is enabled for the resource manager of + * this WAL record, a full-page image is included in the record + * for the block modified. During redo, the full-page is replayed + * only if BKPIMAGE_APPLY is set. + */ + if (needs_backup) + bimg.bimg_info |= BKPIMAGE_APPLY; + if (is_compressed) { bimg.length = compressed_len; @@ -687,7 +715,7 @@ XLogRecordAssemble(RmgrId rmid, uint8 info, /* Ok, copy the header to the scratch buffer */ memcpy(scratch, &bkpb, SizeOfXLogRecordBlockHeader); scratch += SizeOfXLogRecordBlockHeader; - if (needs_backup) + if (include_image) { memcpy(scratch, &bimg, SizeOfXLogRecordBlockImageHeader); scratch += SizeOfXLogRecordBlockImageHeader; diff --git a/src/backend/access/transam/xlogreader.c b/src/backend/access/transam/xlogreader.c index b528745fe85c8096adfabf34a46b18a267a56361..f077662946f4f7a0e6ebf44dfc9a747d65c95706 100644 --- a/src/backend/access/transam/xlogreader.c +++ b/src/backend/access/transam/xlogreader.c @@ -997,6 +997,7 @@ ResetDecoder(XLogReaderState *state) state->blocks[block_id].in_use = false; state->blocks[block_id].has_image = false; state->blocks[block_id].has_data = false; + state->blocks[block_id].apply_image = false; } state->max_block_id = -1; } @@ -1089,6 +1090,7 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) blk = &state->blocks[block_id]; blk->in_use = true; + blk->apply_image = false; COPY_HEADER_FIELD(&fork_flags, sizeof(uint8)); blk->forknum = fork_flags & BKPBLOCK_FORK_MASK; @@ -1120,6 +1122,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) COPY_HEADER_FIELD(&blk->bimg_len, sizeof(uint16)); COPY_HEADER_FIELD(&blk->hole_offset, sizeof(uint16)); COPY_HEADER_FIELD(&blk->bimg_info, sizeof(uint8)); + + blk->apply_image = ((blk->bimg_info & BKPIMAGE_APPLY) != 0); + if (blk->bimg_info & BKPIMAGE_IS_COMPRESSED) { if (blk->bimg_info & BKPIMAGE_HAS_HOLE) @@ -1243,6 +1248,9 @@ DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, char **errormsg) if (!blk->in_use) continue; + + Assert(blk->has_image || !blk->apply_image); + if (blk->has_image) { blk->bkp_image = ptr; diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c index 0de2419e54ba451a239c037516afd1acc59025ba..6627f5498b9b5e3b05a741fb9b0976c4f406c593 100644 --- a/src/backend/access/transam/xlogutils.c +++ b/src/backend/access/transam/xlogutils.c @@ -275,9 +275,9 @@ XLogCheckInvalidPages(void) * will complain if we don't have the lock. In hot standby mode it's * definitely necessary.) * - * Note: when a backup block is available in XLOG, we restore it - * unconditionally, even if the page in the database appears newer. This is - * to protect ourselves against database pages that were partially or + * Note: when a backup block is available in XLOG with the BKPIMAGE_APPLY flag + * set, we restore it, even if the page in the database appears newer. This + * is to protect ourselves against database pages that were partially or * incorrectly written during a crash. We assume that the XLOG data must be * good because it has passed a CRC check, while the database page might not * be. This will force us to replay all subsequent modifications of the page @@ -352,9 +352,10 @@ XLogReadBufferForRedoExtended(XLogReaderState *record, if (!willinit && zeromode) elog(PANIC, "block to be initialized in redo routine must be marked with WILL_INIT flag in the WAL record"); - /* If it's a full-page image, restore it. */ - if (XLogRecHasBlockImage(record, block_id)) + /* If it has a full-page image and it should be restored, do it. */ + if (XLogRecBlockImageApply(record, block_id)) { + Assert(XLogRecHasBlockImage(record, block_id)); *buf = XLogReadBufferExtended(rnode, forknum, blkno, get_cleanup_lock ? RBM_ZERO_AND_CLEANUP_LOCK : RBM_ZERO_AND_LOCK); page = BufferGetPage(*buf); diff --git a/src/backend/commands/sequence.c b/src/backend/commands/sequence.c index c148b09cd725ba56987e767663a7e6575b164ba0..e6f87543df85d65f58a1753177892e871cb0b1f4 100644 --- a/src/backend/commands/sequence.c +++ b/src/backend/commands/sequence.c @@ -14,6 +14,7 @@ */ #include "postgres.h" +#include "access/bufmask.h" #include "access/htup_details.h" #include "access/multixact.h" #include "access/transam.h" @@ -1740,3 +1741,14 @@ ResetSequenceCaches(void) last_used_seq = NULL; } + +/* + * Mask a Sequence page before performing consistency checks on it. + */ +void +seq_mask(char *page, BlockNumber blkno) +{ + mask_page_lsn(page); + + mask_unused_space(page); +} diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index c53aededcb4ef47d378b47df96a86df3a6cc4848..de85eca6a8f25bc06bdd6ddd744796f7c1c77f28 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -28,9 +28,11 @@ #include "access/commit_ts.h" #include "access/gin.h" +#include "access/rmgr.h" #include "access/transam.h" #include "access/twophase.h" #include "access/xact.h" +#include "access/xlog_internal.h" #include "catalog/namespace.h" #include "commands/async.h" #include "commands/prepare.h" @@ -147,6 +149,10 @@ static bool call_enum_check_hook(struct config_enum * conf, int *newval, static bool check_log_destination(char **newval, void **extra, GucSource source); static void assign_log_destination(const char *newval, void *extra); +static bool check_wal_consistency_checking(char **newval, void **extra, + GucSource source); +static void assign_wal_consistency_checking(const char *newval, void *extra); + #ifdef HAVE_SYSLOG static int syslog_facility = LOG_LOCAL0; #else @@ -3572,6 +3578,17 @@ static struct config_string ConfigureNamesString[] = check_cluster_name, NULL, NULL }, + { + {"wal_consistency_checking", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Sets the WAL resource managers for which WAL consistency checks are done."), + gettext_noop("Full-page images will be logged for all data blocks and cross-checked against the results of WAL replay."), + GUC_LIST_INPUT | GUC_NOT_IN_SAMPLE + }, + &wal_consistency_checking_string, + "", + check_wal_consistency_checking, assign_wal_consistency_checking, NULL + }, + /* End-of-list marker */ { {NULL, 0, 0, NULL, NULL}, NULL, NULL, NULL, NULL, NULL @@ -9888,6 +9905,86 @@ call_enum_check_hook(struct config_enum * conf, int *newval, void **extra, * check_hook, assign_hook and show_hook subroutines */ +static bool +check_wal_consistency_checking(char **newval, void **extra, GucSource source) +{ + char *rawstring; + List *elemlist; + ListCell *l; + bool newwalconsistency[RM_MAX_ID + 1]; + + /* Initialize the array */ + MemSet(newwalconsistency, 0, (RM_MAX_ID + 1) * sizeof(bool)); + + /* Need a modifiable copy of string */ + rawstring = pstrdup(*newval); + + /* Parse string into list of identifiers */ + if (!SplitIdentifierString(rawstring, ',', &elemlist)) + { + /* syntax error in list */ + GUC_check_errdetail("List syntax is invalid."); + pfree(rawstring); + list_free(elemlist); + return false; + } + + foreach(l, elemlist) + { + char *tok = (char *) lfirst(l); + bool found = false; + RmgrId rmid; + + /* Check for 'all'. */ + if (pg_strcasecmp(tok, "all") == 0) + { + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + if (RmgrTable[rmid].rm_mask != NULL) + newwalconsistency[rmid] = true; + found = true; + } + else + { + /* + * Check if the token matches with any individual resource + * manager. + */ + for (rmid = 0; rmid <= RM_MAX_ID; rmid++) + { + if (pg_strcasecmp(tok, RmgrTable[rmid].rm_name) == 0 && + RmgrTable[rmid].rm_mask != NULL) + { + newwalconsistency[rmid] = true; + found = true; + } + } + } + + /* If a valid resource manager is found, check for the next one. */ + if (!found) + { + GUC_check_errdetail("Unrecognized key word: \"%s\".", tok); + pfree(rawstring); + list_free(elemlist); + return false; + } + } + + pfree(rawstring); + list_free(elemlist); + + /* assign new value */ + *extra = guc_malloc(ERROR, (RM_MAX_ID + 1) * sizeof(bool)); + memcpy(*extra, newwalconsistency, (RM_MAX_ID + 1) * sizeof(bool)); + return true; +} + +static void +assign_wal_consistency_checking(const char *newval, void *extra) +{ + wal_consistency_checking = (bool *) extra; +} + static bool check_log_destination(char **newval, void **extra, GucSource source) { diff --git a/src/bin/pg_rewind/parsexlog.c b/src/bin/pg_rewind/parsexlog.c index cb433819e45632d82ee02247f5d8aab0fbe84ba1..a7f6fe2df327e3e4f2a2b269be5dc1007169d8af 100644 --- a/src/bin/pg_rewind/parsexlog.c +++ b/src/bin/pg_rewind/parsexlog.c @@ -29,7 +29,7 @@ * RmgrNames is an array of resource manager names, to make error messages * a bit nicer. */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ name, static const char *RmgrNames[RM_MAX_ID + 1] = { diff --git a/src/bin/pg_xlogdump/pg_xlogdump.c b/src/bin/pg_xlogdump/pg_xlogdump.c index 590d2ad587e3c17eba6d61ad03fdadb2280aa869..679aead8951d2c59884897c58b0e8d32d36bc82f 100644 --- a/src/bin/pg_xlogdump/pg_xlogdump.c +++ b/src/bin/pg_xlogdump/pg_xlogdump.c @@ -465,7 +465,12 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) rnode.spcNode, rnode.dbNode, rnode.relNode, blk); if (XLogRecHasBlockImage(record, block_id)) - printf(" FPW"); + { + if (XLogRecBlockImageApply(record, block_id)) + printf(" FPW"); + else + printf(" FPW for WAL verification"); + } } putchar('\n'); } @@ -489,7 +494,10 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) if (record->blocks[block_id].bimg_info & BKPIMAGE_IS_COMPRESSED) { - printf(" (FPW); hole: offset: %u, length: %u, compression saved: %u\n", + printf(" (FPW%s); hole: offset: %u, length: %u, " + "compression saved: %u\n", + XLogRecBlockImageApply(record, block_id) ? + "" : " for WAL verification", record->blocks[block_id].hole_offset, record->blocks[block_id].hole_length, BLCKSZ - @@ -498,7 +506,9 @@ XLogDumpDisplayRecord(XLogDumpConfig *config, XLogReaderState *record) } else { - printf(" (FPW); hole: offset: %u, length: %u\n", + printf(" (FPW%s); hole: offset: %u, length: %u\n", + XLogRecBlockImageApply(record, block_id) ? + "" : " for WAL verification", record->blocks[block_id].hole_offset, record->blocks[block_id].hole_length); } diff --git a/src/bin/pg_xlogdump/rmgrdesc.c b/src/bin/pg_xlogdump/rmgrdesc.c index 8fe20ce97e0c71a9f53c0b0a5f1f62737e6459fd..5d19a4af7257f2bd5fa9233552b5bd1df1d679a4 100644 --- a/src/bin/pg_xlogdump/rmgrdesc.c +++ b/src/bin/pg_xlogdump/rmgrdesc.c @@ -32,7 +32,7 @@ #include "storage/standbydefs.h" #include "utils/relmapper.h" -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ { name, desc, identify}, const RmgrDescData RmgrDescTable[RM_MAX_ID + 1] = { diff --git a/src/include/access/brin_xlog.h b/src/include/access/brin_xlog.h index 527b2f1a225d55850ed66fe71c550d641b3e3e00..33ceb34ea53873a49cba9f079889fcc02f8581fe 100644 --- a/src/include/access/brin_xlog.h +++ b/src/include/access/brin_xlog.h @@ -128,5 +128,6 @@ typedef struct xl_brin_revmap_extend extern void brin_redo(XLogReaderState *record); extern void brin_desc(StringInfo buf, XLogReaderState *record); extern const char *brin_identify(uint8 info); +extern void brin_mask(char *pagedata, BlockNumber blkno); #endif /* BRIN_XLOG_H */ diff --git a/src/include/access/bufmask.h b/src/include/access/bufmask.h new file mode 100644 index 0000000000000000000000000000000000000000..add2dc0cd145b0e0717605dffd841ad74a751943 --- /dev/null +++ b/src/include/access/bufmask.h @@ -0,0 +1,33 @@ +/*------------------------------------------------------------------------- + * + * bufmask.h + * Definitions for buffer masking routines, used to mask certain bits + * in a page which can be different when the WAL is generated + * and when the WAL is applied. This is really the job of each + * individual rmgr, but we make things easier by providing some + * common routines to handle cases which occur in multiple rmgrs. + * + * Portions Copyright (c) 2016, PostgreSQL Global Development Group + * + * src/include/access/bufmask.h + * + *------------------------------------------------------------------------- + */ + +#ifndef BUFMASK_H +#define BUFMASK_H + +#include "postgres.h" +#include "storage/block.h" +#include "storage/bufmgr.h" + +/* Marker used to mask pages consistently */ +#define MASK_MARKER 0 + +extern void mask_page_lsn(Page page); +extern void mask_page_hint_bits(Page page); +extern void mask_unused_space(Page page); +extern void mask_lp_flags(Page page); +extern void mask_page_content(Page page); + +#endif diff --git a/src/include/access/generic_xlog.h b/src/include/access/generic_xlog.h index 187d68b3e183366ec78f7fcbbfd195edbd187a85..0dc17f55f2c7f19bdc25d937f295856e60448c58 100644 --- a/src/include/access/generic_xlog.h +++ b/src/include/access/generic_xlog.h @@ -40,5 +40,6 @@ extern void GenericXLogAbort(GenericXLogState *state); extern void generic_redo(XLogReaderState *record); extern const char *generic_identify(uint8 info); extern void generic_desc(StringInfo buf, XLogReaderState *record); +extern void generic_mask(char *pagedata, BlockNumber blkno); #endif /* GENERIC_XLOG_H */ diff --git a/src/include/access/gin.h b/src/include/access/gin.h index 5629c8add7a729951d33351977f3259a0563e811..e5d67305d9c17f7c6430cfd56e7f44228d7f7c5d 100644 --- a/src/include/access/gin.h +++ b/src/include/access/gin.h @@ -79,5 +79,6 @@ extern void gin_desc(StringInfo buf, XLogReaderState *record); extern const char *gin_identify(uint8 info); extern void gin_xlog_startup(void); extern void gin_xlog_cleanup(void); +extern void gin_mask(char *pagedata, BlockNumber blkno); #endif /* GIN_H */ diff --git a/src/include/access/gist_private.h b/src/include/access/gist_private.h index 60a770aa305b252c0cad804d1a02c21b6d23eafd..f4beeb920949cbbd2d970e7aaa905e8dd0c7009b 100644 --- a/src/include/access/gist_private.h +++ b/src/include/access/gist_private.h @@ -459,6 +459,7 @@ extern void gist_desc(StringInfo buf, XLogReaderState *record); extern const char *gist_identify(uint8 info); extern void gist_xlog_startup(void); extern void gist_xlog_cleanup(void); +extern void gist_mask(char *pagedata, BlockNumber blkno); extern XLogRecPtr gistXLogUpdate(Buffer buffer, OffsetNumber *todelete, int ntodelete, diff --git a/src/include/access/heapam_xlog.h b/src/include/access/heapam_xlog.h index 52f28b86cbc678368a5883b29914553869e569be..b285f172aa86e5f714188af497dfb29cf16c1a3f 100644 --- a/src/include/access/heapam_xlog.h +++ b/src/include/access/heapam_xlog.h @@ -373,6 +373,7 @@ extern void HeapTupleHeaderAdvanceLatestRemovedXid(HeapTupleHeader tuple, extern void heap_redo(XLogReaderState *record); extern void heap_desc(StringInfo buf, XLogReaderState *record); extern const char *heap_identify(uint8 info); +extern void heap_mask(char *pagedata, BlockNumber blkno); extern void heap2_redo(XLogReaderState *record); extern void heap2_desc(StringInfo buf, XLogReaderState *record); extern const char *heap2_identify(uint8 info); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 011a72ecf7886968f72694b9f5eeb868e9e605fe..b2517623aa863a98972238a1ac944bba61f0d215 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -774,5 +774,6 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2); extern void btree_redo(XLogReaderState *record); extern void btree_desc(StringInfo buf, XLogReaderState *record); extern const char *btree_identify(uint8 info); +extern void btree_mask(char *pagedata, BlockNumber blkno); #endif /* NBTREE_H */ diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h index ff7fe62c05fcbc6191a13de1ba12c0b34186dc44..64b92ff33a676286cfafe4f4e03764c0aa4ad8c5 100644 --- a/src/include/access/rmgr.h +++ b/src/include/access/rmgr.h @@ -19,7 +19,7 @@ typedef uint8 RmgrId; * Note: RM_MAX_ID must fit in RmgrId; widening that type will affect the XLOG * file format. */ -#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup) \ +#define PG_RMGR(symname,name,redo,desc,identify,startup,cleanup,mask) \ symname, typedef enum RmgrIds diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h index 5f76749dbd835189c34c6e83feaba66bb6e457e5..b892aea3701f282fb7484ac071b265928cea8b3c 100644 --- a/src/include/access/rmgrlist.h +++ b/src/include/access/rmgrlist.h @@ -25,25 +25,25 @@ */ /* symbol name, textual name, redo, desc, identify, startup, cleanup */ -PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL) -PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL) -PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL) -PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL) -PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL) -PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL) -PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL) -PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL) -PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL) -PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL) -PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL) -PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL) -PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL) -PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup) -PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup) -PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL) -PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup) -PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL) -PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL) -PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL) -PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL) -PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL) +PG_RMGR(RM_XLOG_ID, "XLOG", xlog_redo, xlog_desc, xlog_identify, NULL, NULL, NULL) +PG_RMGR(RM_XACT_ID, "Transaction", xact_redo, xact_desc, xact_identify, NULL, NULL, NULL) +PG_RMGR(RM_SMGR_ID, "Storage", smgr_redo, smgr_desc, smgr_identify, NULL, NULL, NULL) +PG_RMGR(RM_CLOG_ID, "CLOG", clog_redo, clog_desc, clog_identify, NULL, NULL, NULL) +PG_RMGR(RM_DBASE_ID, "Database", dbase_redo, dbase_desc, dbase_identify, NULL, NULL, NULL) +PG_RMGR(RM_TBLSPC_ID, "Tablespace", tblspc_redo, tblspc_desc, tblspc_identify, NULL, NULL, NULL) +PG_RMGR(RM_MULTIXACT_ID, "MultiXact", multixact_redo, multixact_desc, multixact_identify, NULL, NULL, NULL) +PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, relmap_identify, NULL, NULL, NULL) +PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, standby_identify, NULL, NULL, NULL) +PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, heap2_identify, NULL, NULL, heap_mask) +PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, heap_identify, NULL, NULL, heap_mask) +PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_identify, NULL, NULL, btree_mask) +PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, hash_identify, NULL, NULL, NULL) +PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_identify, gin_xlog_startup, gin_xlog_cleanup, gin_mask) +PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_identify, gist_xlog_startup, gist_xlog_cleanup, gist_mask) +PG_RMGR(RM_SEQ_ID, "Sequence", seq_redo, seq_desc, seq_identify, NULL, NULL, seq_mask) +PG_RMGR(RM_SPGIST_ID, "SPGist", spg_redo, spg_desc, spg_identify, spg_xlog_startup, spg_xlog_cleanup, spg_mask) +PG_RMGR(RM_BRIN_ID, "BRIN", brin_redo, brin_desc, brin_identify, NULL, NULL, brin_mask) +PG_RMGR(RM_COMMIT_TS_ID, "CommitTs", commit_ts_redo, commit_ts_desc, commit_ts_identify, NULL, NULL, NULL) +PG_RMGR(RM_REPLORIGIN_ID, "ReplicationOrigin", replorigin_redo, replorigin_desc, replorigin_identify, NULL, NULL, NULL) +PG_RMGR(RM_GENERIC_ID, "Generic", generic_redo, generic_desc, generic_identify, NULL, NULL, generic_mask) +PG_RMGR(RM_LOGICALMSG_ID, "LogicalMessage", logicalmsg_redo, logicalmsg_desc, logicalmsg_identify, NULL, NULL, NULL) diff --git a/src/include/access/spgist.h b/src/include/access/spgist.h index aaf78bca97d42090d86565ecbb2049dd86c6edc7..6f59c0bbc52246897a00c02575f3f66c16bfab81 100644 --- a/src/include/access/spgist.h +++ b/src/include/access/spgist.h @@ -219,5 +219,6 @@ extern void spg_desc(StringInfo buf, XLogReaderState *record); extern const char *spg_identify(uint8 info); extern void spg_xlog_startup(void); extern void spg_xlog_cleanup(void); +extern void spg_mask(char *pagedata, BlockNumber blkno); #endif /* SPGIST_H */ diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index a4255723b7dcdc48f8587e930911175034670c97..9f036c72d89d748e29f60602e092287680eea970 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -105,6 +105,8 @@ extern bool EnableHotStandby; extern bool fullPageWrites; extern bool wal_log_hints; extern bool wal_compression; +extern bool *wal_consistency_checking; +extern char *wal_consistency_checking_string; extern bool log_checkpoints; extern int CheckPointSegments; diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h index 8ad4d47d127628af358bf2868292e61e5da87553..3005b98aaa8e6e6bb121aa6bc64dd19fc0e476f8 100644 --- a/src/include/access/xlog_internal.h +++ b/src/include/access/xlog_internal.h @@ -31,7 +31,7 @@ /* * Each page of XLOG file has a header like this: */ -#define XLOG_PAGE_MAGIC 0xD094 /* can be used as WAL version indicator */ +#define XLOG_PAGE_MAGIC 0xD095 /* can be used as WAL version indicator */ typedef struct XLogPageHeaderData { @@ -266,6 +266,9 @@ typedef enum * "VACUUM". rm_desc can then be called to obtain additional detail for the * record, if available (e.g. the last block). * + * rm_mask takes as input a page modified by the resource manager and masks + * out bits that shouldn't be flagged by wal_consistency_checking. + * * RmgrTable[] is indexed by RmgrId values (see rmgrlist.h). */ typedef struct RmgrData @@ -276,6 +279,7 @@ typedef struct RmgrData const char *(*rm_identify) (uint8 info); void (*rm_startup) (void); void (*rm_cleanup) (void); + void (*rm_mask) (char *pagedata, BlockNumber blkno); } RmgrData; extern const RmgrData RmgrTable[]; diff --git a/src/include/access/xlogreader.h b/src/include/access/xlogreader.h index 00102e8e0b3da2f9c4f6fafc515fdf49028bf5c6..663d3e7890b7c2ad14492f1245d06ab20a614781 100644 --- a/src/include/access/xlogreader.h +++ b/src/include/access/xlogreader.h @@ -51,7 +51,8 @@ typedef struct uint8 flags; /* Information on full-page image, if any */ - bool has_image; + bool has_image; /* has image, even for consistency checking */ + bool apply_image; /* has image that should be restored */ char *bkp_image; uint16 hole_offset; uint16 hole_length; @@ -205,6 +206,8 @@ extern bool DecodeXLogRecord(XLogReaderState *state, XLogRecord *record, ((decoder)->blocks[block_id].in_use) #define XLogRecHasBlockImage(decoder, block_id) \ ((decoder)->blocks[block_id].has_image) +#define XLogRecBlockImageApply(decoder, block_id) \ + ((decoder)->blocks[block_id].apply_image) extern bool RestoreBlockImage(XLogReaderState *recoder, uint8 block_id, char *dst); extern char *XLogRecGetBlockData(XLogReaderState *record, uint8 block_id, Size *len); diff --git a/src/include/access/xlogrecord.h b/src/include/access/xlogrecord.h index 0162f93e8230007d9aa2f20827aa872589dfca7f..eeb6a30c1c3acd029d1022b2b505c2483bdcd748 100644 --- a/src/include/access/xlogrecord.h +++ b/src/include/access/xlogrecord.h @@ -56,8 +56,8 @@ typedef struct XLogRecord /* * The high 4 bits in xl_info may be used freely by rmgr. The - * XLR_SPECIAL_REL_UPDATE bit can be passed by XLogInsert caller. The rest - * are set internally by XLogInsert. + * XLR_SPECIAL_REL_UPDATE and XLR_CHECK_CONSISTENCY bits can be passed by + * XLogInsert caller. The rest are set internally by XLogInsert. */ #define XLR_INFO_MASK 0x0F #define XLR_RMGR_INFO_MASK 0xF0 @@ -70,6 +70,15 @@ typedef struct XLogRecord */ #define XLR_SPECIAL_REL_UPDATE 0x01 +/* + * Enforces consistency checks of replayed WAL at recovery. If enabled, + * each record will log a full-page write for each block modified by the + * record and will reuse it afterwards for consistency checks. The caller + * of XLogInsert can use this value if necessary, but if + * wal_consistency_checking is enabled for a rmgr this is set unconditionally. + */ +#define XLR_CHECK_CONSISTENCY 0x02 + /* * Header info for block data appended to an XLOG record. * @@ -137,6 +146,7 @@ typedef struct XLogRecordBlockImageHeader /* Information stored in bimg_info */ #define BKPIMAGE_HAS_HOLE 0x01 /* page image has "hole" */ #define BKPIMAGE_IS_COMPRESSED 0x02 /* page image is compressed */ +#define BKPIMAGE_APPLY 0x04 /* page image should be restored during replay */ /* * Extra header information used when page image has "hole" and diff --git a/src/include/commands/sequence.h b/src/include/commands/sequence.h index 144c3c2e6fe8cd666097563ae3ab373c9ba870a8..49a77c42fc0c53837e95fafa30288ffe2eea1e3f 100644 --- a/src/include/commands/sequence.h +++ b/src/include/commands/sequence.h @@ -62,5 +62,6 @@ extern void ResetSequenceCaches(void); extern void seq_redo(XLogReaderState *rptr); extern void seq_desc(StringInfo buf, XLogReaderState *rptr); extern const char *seq_identify(uint8 info); +extern void seq_mask(char *pagedata, BlockNumber blkno); #endif /* SEQUENCE_H */