From 21fda22ec46deb7734f793ef4d7fa6c226b4c78e Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 2 Jun 2005 05:55:29 +0000
Subject: [PATCH] Change CRCs in WAL records from 64bit to 32bit for
 performance reasons. Instead of a separate CRC on each backup block, include
 backup blocks in their parent WAL record's CRC; this is important to ensure
 that the backup block really goes with the WAL record, i.e. there was not a
 page tear right at the start of the backup block.  Implement a simple form of
 compression of backup blocks: drop any run of zeroes starting at pd_lower, so
 as not to store the unused 'hole' that commonly exists in PG heap and index
 pages.  Tweak PageRepairFragmentation and related routines to ensure they
 keep the unused space zeroed, so that the above compression method remains
 effective.  All per recent discussions.

---
 src/backend/access/nbtree/nbtpage.c     |   9 +-
 src/backend/access/nbtree/nbtxlog.c     |   9 +-
 src/backend/access/transam/xlog.c       | 332 ++++++++++++++++--------
 src/backend/storage/page/bufpage.c      |  29 ++-
 src/backend/utils/hash/pg_crc.c         | 110 +++++++-
 src/bin/pg_controldata/pg_controldata.c |  16 +-
 src/bin/pg_resetxlog/pg_resetxlog.c     |  39 +--
 src/include/access/xlog.h               |  30 ++-
 src/include/access/xlog_internal.h      |  33 ++-
 src/include/catalog/pg_control.h        |  24 +-
 src/include/utils/pg_crc.h              |  87 +++++--
 11 files changed, 511 insertions(+), 207 deletions(-)

diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index ea023253189..b9d42bad6d2 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.84 2005/05/07 21:32:23 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.85 2005/06/02 05:55:28 tgl Exp $
  *
  *	NOTES
  *	   Postgres btree pages look like ordinary relation pages.	The opaque
@@ -113,6 +113,13 @@ _bt_initmetapage(Page page, BlockNumber rootbknum, uint32 level)
 
 	metaopaque = (BTPageOpaque) PageGetSpecialPointer(page);
 	metaopaque->btpo_flags = BTP_META;
+
+	/*
+	 * Set pd_lower just past the end of the metadata.  This is not
+	 * essential but it makes the page look compressible to xlog.c.
+	 */
+	((PageHeader) page)->pd_lower =
+		((char *) metad + sizeof(BTMetaPageData)) - (char *) page;
 }
 
 /*
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index ade60619a3d..536bc177180 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.20 2005/03/22 06:17:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.21 2005/06/02 05:55:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -135,6 +135,13 @@ _bt_restore_meta(Relation reln, XLogRecPtr lsn,
 	pageop = (BTPageOpaque) PageGetSpecialPointer(metapg);
 	pageop->btpo_flags = BTP_META;
 
+	/*
+	 * Set pd_lower just past the end of the metadata.  This is not
+	 * essential but it makes the page look compressible to xlog.c.
+	 */
+	((PageHeader) metapg)->pd_lower =
+		((char *) md + sizeof(BTMetaPageData)) - (char *) metapg;
+
 	PageSetLSN(metapg, lsn);
 	PageSetTLI(metapg, ThisTimeLineID);
 	LockBuffer(metabuf, BUFFER_LOCK_UNLOCK);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 2352313b051..27f6354987d 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.194 2005/05/31 19:10:28 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.195 2005/06/02 05:55:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -434,6 +434,7 @@ static void exitArchiveRecovery(TimeLineID endTLI,
 					uint32 endLogId, uint32 endLogSeg);
 static bool recoveryStopsHere(XLogRecord *record, bool *includeThis);
 
+static void SetBkpBlock(BkpBlock *bkpb, Buffer buffer);
 static bool AdvanceXLInsertBuffer(void);
 static void XLogWrite(XLogwrtRqst WriteRqst);
 static int XLogFileInit(uint32 log, uint32 seg,
@@ -499,8 +500,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	bool		dtbuf_bkp[XLR_MAX_BKP_BLOCKS];
 	BkpBlock	dtbuf_xlg[XLR_MAX_BKP_BLOCKS];
 	XLogRecPtr	dtbuf_lsn[XLR_MAX_BKP_BLOCKS];
-	XLogRecData dtbuf_rdt[2 * XLR_MAX_BKP_BLOCKS];
-	crc64		rdata_crc;
+	XLogRecData dtbuf_rdt1[XLR_MAX_BKP_BLOCKS];
+	XLogRecData dtbuf_rdt2[XLR_MAX_BKP_BLOCKS];
+	XLogRecData dtbuf_rdt3[XLR_MAX_BKP_BLOCKS];
+	pg_crc32	rdata_crc;
 	uint32		len,
 				write_len;
 	unsigned	i;
@@ -531,8 +534,10 @@ XLogInsert(RmgrId rmid, uint8 info, XLogRecData *rdata)
 	/*
 	 * Here we scan the rdata list, determine which buffers must be backed
 	 * up, and compute the CRC values for the data.  Note that the record
-	 * header isn't added into the CRC yet since we don't know the final
-	 * length or info bits quite yet.
+	 * header isn't added into the CRC initially since we don't know the
+	 * final length or info bits quite yet.  Thus, the CRC will represent
+	 * the CRC of the whole record in the order "rdata, then backup blocks,
+	 * then record header".
 	 *
 	 * We may have to loop back to here if a race condition is detected
 	 * below. We could prevent the race by doing all this work while
@@ -553,7 +558,7 @@ begin:;
 		dtbuf_bkp[i] = false;
 	}
 
-	INIT_CRC64(rdata_crc);
+	INIT_CRC32(rdata_crc);
 	len = 0;
 	for (rdt = rdata;;)
 	{
@@ -561,7 +566,7 @@ begin:;
 		{
 			/* Simple data, just include it */
 			len += rdt->len;
-			COMP_CRC64(rdata_crc, rdt->data, rdt->len);
+			COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 		}
 		else
 		{
@@ -576,7 +581,7 @@ begin:;
 					else if (rdt->data)
 					{
 						len += rdt->len;
-						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
+						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 					}
 					break;
 				}
@@ -591,26 +596,14 @@ begin:;
 					dtbuf_lsn[i] = *((XLogRecPtr *) BufferGetBlock(rdt->buffer));
 					if (XLByteLE(dtbuf_lsn[i], RedoRecPtr))
 					{
-						crc64		dtcrc;
-
 						dtbuf_bkp[i] = true;
+						SetBkpBlock(&(dtbuf_xlg[i]), rdt->buffer);
 						rdt->data = NULL;
-						INIT_CRC64(dtcrc);
-						COMP_CRC64(dtcrc,
-								   BufferGetBlock(dtbuf[i]),
-								   BLCKSZ);
-						dtbuf_xlg[i].node = BufferGetFileNode(dtbuf[i]);
-						dtbuf_xlg[i].block = BufferGetBlockNumber(dtbuf[i]);
-						COMP_CRC64(dtcrc,
-								(char *) &(dtbuf_xlg[i]) + sizeof(crc64),
-								   sizeof(BkpBlock) - sizeof(crc64));
-						FIN_CRC64(dtcrc);
-						dtbuf_xlg[i].crc = dtcrc;
 					}
 					else if (rdt->data)
 					{
 						len += rdt->len;
-						COMP_CRC64(rdata_crc, rdt->data, rdt->len);
+						COMP_CRC32(rdata_crc, rdt->data, rdt->len);
 					}
 					break;
 				}
@@ -625,6 +618,39 @@ begin:;
 		rdt = rdt->next;
 	}
 
+	/*
+	 * Now add the backup block headers and data into the CRC
+	 */
+	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
+	{
+		if (dtbuf_bkp[i])
+		{
+			BkpBlock   *bkpb = &(dtbuf_xlg[i]);
+			char	   *page;
+
+			COMP_CRC32(rdata_crc,
+					   (char *) bkpb,
+					   sizeof(BkpBlock));
+			page = (char *) BufferGetBlock(dtbuf[i]);
+			if (bkpb->hole_length == 0)
+			{
+				COMP_CRC32(rdata_crc,
+						   page,
+						   BLCKSZ);
+			}
+			else
+			{
+				/* must skip the hole */
+				COMP_CRC32(rdata_crc,
+						   page,
+						   bkpb->hole_offset);
+				COMP_CRC32(rdata_crc,
+						   page + (bkpb->hole_offset + bkpb->hole_length),
+						   BLCKSZ - (bkpb->hole_offset + bkpb->hole_length));
+			}
+		}
+	}
+
 	/*
 	 * NOTE: the test for len == 0 here is somewhat fishy, since in theory
 	 * all of the rmgr data might have been suppressed in favor of backup
@@ -713,23 +739,49 @@ begin:;
 	write_len = len;
 	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 	{
+		BkpBlock   *bkpb;
+		char	   *page;
+
 		if (dtbuf[i] == InvalidBuffer || !(dtbuf_bkp[i]))
 			continue;
 
 		info |= XLR_SET_BKP_BLOCK(i);
 
-		rdt->next = &(dtbuf_rdt[2 * i]);
+		bkpb = &(dtbuf_xlg[i]);
+		page = (char *) BufferGetBlock(dtbuf[i]);
+
+		rdt->next = &(dtbuf_rdt1[i]);
+		rdt = rdt->next;
 
-		dtbuf_rdt[2 * i].data = (char *) &(dtbuf_xlg[i]);
-		dtbuf_rdt[2 * i].len = sizeof(BkpBlock);
+		rdt->data = (char *) bkpb;
+		rdt->len = sizeof(BkpBlock);
 		write_len += sizeof(BkpBlock);
 
-		rdt = dtbuf_rdt[2 * i].next = &(dtbuf_rdt[2 * i + 1]);
+		rdt->next = &(dtbuf_rdt2[i]);
+		rdt = rdt->next;
 
-		dtbuf_rdt[2 * i + 1].data = (char *) BufferGetBlock(dtbuf[i]);
-		dtbuf_rdt[2 * i + 1].len = BLCKSZ;
-		write_len += BLCKSZ;
-		dtbuf_rdt[2 * i + 1].next = NULL;
+		if (bkpb->hole_length == 0)
+		{
+			rdt->data = page;
+			rdt->len = BLCKSZ;
+			write_len += BLCKSZ;
+			rdt->next = NULL;
+		}
+		else
+		{
+			/* must skip the hole */
+			rdt->data = page;
+			rdt->len = bkpb->hole_offset;
+			write_len += bkpb->hole_offset;
+
+			rdt->next = &(dtbuf_rdt3[i]);
+			rdt = rdt->next;
+
+			rdt->data = page + (bkpb->hole_offset + bkpb->hole_length);
+			rdt->len = BLCKSZ - (bkpb->hole_offset + bkpb->hole_length);
+			write_len += rdt->len;
+			rdt->next = NULL;
+		}
 	}
 
 	/*
@@ -752,14 +804,15 @@ begin:;
 
 	record->xl_prev = Insert->PrevRecord;
 	record->xl_xid = GetCurrentTransactionIdIfAny();
+	record->xl_tot_len = SizeOfXLogRecord + write_len;
 	record->xl_len = len;		/* doesn't include backup blocks */
 	record->xl_info = info;
 	record->xl_rmid = rmid;
 
-	/* Now we can finish computing the main CRC */
-	COMP_CRC64(rdata_crc, (char *) record + sizeof(crc64),
-			   SizeOfXLogRecord - sizeof(crc64));
-	FIN_CRC64(rdata_crc);
+	/* Now we can finish computing the record's CRC */
+	COMP_CRC32(rdata_crc, (char *) record + sizeof(pg_crc32),
+			   SizeOfXLogRecord - sizeof(pg_crc32));
+	FIN_CRC32(rdata_crc);
 	record->xl_crc = rdata_crc;
 
 	/* Compute record's XLOG location */
@@ -884,6 +937,46 @@ begin:;
 	return (RecPtr);
 }
 
+/*
+ * Fill a BkpBlock struct given a buffer containing the page to be saved:
+ * the page's file node and block number, plus hole_offset/hole_length.
+ * This is nontrivial only because it has to decide whether to apply "hole
+ * compression", i.e. find the run of zero bytes starting at pd_lower.
+ */
+static void
+SetBkpBlock(BkpBlock *bkpb, Buffer buffer)
+{
+	PageHeader	page;
+	uint16		offset;			/* hole start (pd_lower), or 0 if no hole */
+	uint16		length;			/* count of zero bytes in the hole, or 0 */
+
+	/* Save page identity info */
+	bkpb->node = BufferGetFileNode(buffer);
+	bkpb->block = BufferGetBlockNumber(buffer);
+
+	/* Test whether there is a "hole" containing zeroes in the page */
+	page = (PageHeader) BufferGetBlock(buffer);
+	offset = page->pd_lower;
+	/* Check if pd_lower appears sane at all; if not, report no hole */
+	if (offset >= SizeOfPageHeaderData && offset < BLCKSZ)
+	{
+		char   *spd = (char *) page + offset;	/* candidate hole start */
+		char   *epd = (char *) page + BLCKSZ;	/* one past end of page */
+		char   *pd = spd;
+
+		while (pd < epd && *pd == '\0')	/* stop at first nonzero byte */
+			pd++;
+
+		length = pd - spd;
+		if (length == 0)			/* byte at pd_lower is nonzero */
+			offset = 0;				/* keep offset 0 whenever length is 0 */
+	}
+	else
+		offset = length = 0;
+	bkpb->hole_offset = offset;
+	bkpb->hole_length = length;
+}
+
 /*
  * XLogArchiveNotify
  *
@@ -2276,7 +2369,7 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
 		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
 			continue;
 
-		memcpy((char *) &bkpb, blk, sizeof(BkpBlock));
+		memcpy(&bkpb, blk, sizeof(BkpBlock));
 		blk += sizeof(BkpBlock);
 
 		reln = XLogOpenRelation(true, record->xl_rmid, bkpb.node);
@@ -2287,7 +2380,21 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
 			if (BufferIsValid(buffer))
 			{
 				page = (Page) BufferGetPage(buffer);
-				memcpy((char *) page, blk, BLCKSZ);
+
+				if (bkpb.hole_length == 0)
+				{
+					memcpy((char *) page, blk, BLCKSZ);
+				}
+				else
+				{
+					/* must zero-fill the hole */
+					MemSet((char *) page, 0, BLCKSZ);
+					memcpy((char *) page, blk, bkpb.hole_offset);
+					memcpy((char *) page + (bkpb.hole_offset + bkpb.hole_length),
+						   blk + bkpb.hole_offset,
+						   BLCKSZ - (bkpb.hole_offset + bkpb.hole_length));
+				}
+
 				PageSetLSN(page, lsn);
 				PageSetTLI(page, ThisTimeLineID);
 				LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
@@ -2295,7 +2402,7 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
 			}
 		}
 
-		blk += BLCKSZ;
+		blk += BLCKSZ - bkpb.hole_length;
 	}
 }
 
@@ -2309,53 +2416,61 @@ RestoreBkpBlocks(XLogRecord *record, XLogRecPtr lsn)
 static bool
 RecordIsValid(XLogRecord *record, XLogRecPtr recptr, int emode)
 {
-	crc64		crc;
-	crc64		cbuf;
+	pg_crc32	crc;
 	int			i;
 	uint32		len = record->xl_len;
+	BkpBlock	bkpb;
 	char	   *blk;
 
-	/* Check CRC of rmgr data and record header */
-	INIT_CRC64(crc);
-	COMP_CRC64(crc, XLogRecGetData(record), len);
-	COMP_CRC64(crc, (char *) record + sizeof(crc64),
-			   SizeOfXLogRecord - sizeof(crc64));
-	FIN_CRC64(crc);
+	/* First the rmgr data */
+	INIT_CRC32(crc);
+	COMP_CRC32(crc, XLogRecGetData(record), len);
 
-	if (!EQ_CRC64(record->xl_crc, crc))
-	{
-		ereport(emode,
-				(errmsg("incorrect resource manager data checksum in record at %X/%X",
-						recptr.xlogid, recptr.xrecoff)));
-		return (false);
-	}
-
-	/* Check CRCs of backup blocks, if any */
+	/* Add in the backup blocks, if any */
 	blk = (char *) XLogRecGetData(record) + len;
 	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
 	{
+		uint32	blen;
+
 		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
 			continue;
 
-		INIT_CRC64(crc);
-		COMP_CRC64(crc, blk + sizeof(BkpBlock), BLCKSZ);
-		COMP_CRC64(crc, blk + sizeof(crc64),
-				   sizeof(BkpBlock) - sizeof(crc64));
-		FIN_CRC64(crc);
-		memcpy((char *) &cbuf, blk, sizeof(crc64));		/* don't assume
-														 * alignment */
-
-		if (!EQ_CRC64(cbuf, crc))
+		memcpy(&bkpb, blk, sizeof(BkpBlock));
+		if (bkpb.hole_offset + bkpb.hole_length > BLCKSZ)
 		{
 			ereport(emode,
-					(errmsg("incorrect checksum of backup block %d in record at %X/%X",
-							i + 1, recptr.xlogid, recptr.xrecoff)));
-			return (false);
+					(errmsg("incorrect hole size in record at %X/%X",
+							recptr.xlogid, recptr.xrecoff)));
+			return false;
 		}
-		blk += sizeof(BkpBlock) + BLCKSZ;
+		blen = sizeof(BkpBlock) + BLCKSZ - bkpb.hole_length;
+		COMP_CRC32(crc, blk, blen);
+		blk += blen;
+	}
+
+	/* Check that xl_tot_len agrees with our calculation */
+	if (blk != (char *) record + record->xl_tot_len)
+	{
+		ereport(emode,
+				(errmsg("incorrect total length in record at %X/%X",
+						recptr.xlogid, recptr.xrecoff)));
+		return false;
 	}
 
-	return (true);
+	/* Finally include the record header */
+	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
+			   SizeOfXLogRecord - sizeof(pg_crc32));
+	FIN_CRC32(crc);
+
+	if (!EQ_CRC32(record->xl_crc, crc))
+	{
+		ereport(emode,
+				(errmsg("incorrect resource manager data checksum in record at %X/%X",
+						recptr.xlogid, recptr.xrecoff)));
+		return false;
+	}
+
+	return true;
 }
 
 /*
@@ -2382,7 +2497,6 @@ ReadRecord(XLogRecPtr *RecPtr, int emode)
 	uint32		targetPageOff;
 	uint32		targetRecOff;
 	uint32		pageHeaderSize;
-	unsigned	i;
 
 	if (readBuf == NULL)
 	{
@@ -2518,6 +2632,15 @@ got_record:;
 						RecPtr->xlogid, RecPtr->xrecoff)));
 		goto next_record_is_invalid;
 	}
+	if (record->xl_tot_len < SizeOfXLogRecord + record->xl_len ||
+		record->xl_tot_len > SizeOfXLogRecord + record->xl_len +
+		XLR_MAX_BKP_BLOCKS * (sizeof(BkpBlock) + BLCKSZ))
+	{
+		ereport(emode,
+				(errmsg("invalid record length at %X/%X",
+						RecPtr->xlogid, RecPtr->xrecoff)));
+		goto next_record_is_invalid;
+	}
 	if (record->xl_rmid > RM_MAX_ID)
 	{
 		ereport(emode,
@@ -2557,18 +2680,6 @@ got_record:;
 		}
 	}
 
-	/*
-	 * Compute total length of record including any appended backup
-	 * blocks.
-	 */
-	total_len = SizeOfXLogRecord + record->xl_len;
-	for (i = 0; i < XLR_MAX_BKP_BLOCKS; i++)
-	{
-		if (!(record->xl_info & XLR_SET_BKP_BLOCK(i)))
-			continue;
-		total_len += sizeof(BkpBlock) + BLCKSZ;
-	}
-
 	/*
 	 * Allocate or enlarge readRecordBuf as needed.  To avoid useless
 	 * small increases, round its size to a multiple of BLCKSZ, and make
@@ -2576,6 +2687,7 @@ got_record:;
 	 * "normal" records, but very large commit or abort records might need
 	 * more space.)
 	 */
+	total_len = record->xl_tot_len;
 	if (total_len > readRecordBufSize)
 	{
 		uint32		newSize = total_len;
@@ -2666,15 +2778,15 @@ got_record:;
 			goto next_record_is_invalid;
 		pageHeaderSize = XLogPageHeaderSize((XLogPageHeader) readBuf);
 		if (BLCKSZ - SizeOfXLogRecord >= pageHeaderSize +
-			SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len))
+			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len))
 		{
 			nextRecord = (XLogRecord *) ((char *) contrecord +
-				SizeOfXLogContRecord + MAXALIGN(contrecord->xl_rem_len));
+				MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len));
 		}
 		EndRecPtr.xlogid = readId;
 		EndRecPtr.xrecoff = readSeg * XLogSegSize + readOff +
-			pageHeaderSize + SizeOfXLogContRecord +
-			MAXALIGN(contrecord->xl_rem_len);
+			pageHeaderSize +
+			MAXALIGN(SizeOfXLogContRecord + contrecord->xl_rem_len);
 		ReadRecPtr = *RecPtr;
 		return record;
 	}
@@ -3194,11 +3306,11 @@ WriteControlFile(void)
 	StrNCpy(ControlFile->lc_ctype, localeptr, LOCALE_NAME_BUFLEN);
 
 	/* Contents are protected with a CRC */
-	INIT_CRC64(ControlFile->crc);
-	COMP_CRC64(ControlFile->crc,
-			   (char *) ControlFile + sizeof(crc64),
-			   sizeof(ControlFileData) - sizeof(crc64));
-	FIN_CRC64(ControlFile->crc);
+	INIT_CRC32(ControlFile->crc);
+	COMP_CRC32(ControlFile->crc,
+			   (char *) ControlFile,
+			   offsetof(ControlFileData, crc));
+	FIN_CRC32(ControlFile->crc);
 
 	/*
 	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
@@ -3247,7 +3359,7 @@ WriteControlFile(void)
 static void
 ReadControlFile(void)
 {
-	crc64		crc;
+	pg_crc32	crc;
 	int			fd;
 
 	/*
@@ -3281,13 +3393,13 @@ ReadControlFile(void)
 					ControlFile->pg_control_version, PG_CONTROL_VERSION),
 				 errhint("It looks like you need to initdb.")));
 	/* Now check the CRC. */
-	INIT_CRC64(crc);
-	COMP_CRC64(crc,
-			   (char *) ControlFile + sizeof(crc64),
-			   sizeof(ControlFileData) - sizeof(crc64));
-	FIN_CRC64(crc);
+	INIT_CRC32(crc);
+	COMP_CRC32(crc,
+			   (char *) ControlFile,
+			   offsetof(ControlFileData, crc));
+	FIN_CRC32(crc);
 
-	if (!EQ_CRC64(crc, ControlFile->crc))
+	if (!EQ_CRC32(crc, ControlFile->crc))
 		ereport(FATAL,
 				(errmsg("incorrect checksum in control file")));
 
@@ -3396,11 +3508,11 @@ UpdateControlFile(void)
 {
 	int			fd;
 
-	INIT_CRC64(ControlFile->crc);
-	COMP_CRC64(ControlFile->crc,
-			   (char *) ControlFile + sizeof(crc64),
-			   sizeof(ControlFileData) - sizeof(crc64));
-	FIN_CRC64(ControlFile->crc);
+	INIT_CRC32(ControlFile->crc);
+	COMP_CRC32(ControlFile->crc,
+			   (char *) ControlFile,
+			   offsetof(ControlFileData, crc));
+	FIN_CRC32(ControlFile->crc);
 
 	fd = BasicOpenFile(ControlFilePath, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
 	if (fd < 0)
@@ -3525,7 +3637,7 @@ BootStrapXLOG(void)
 	bool		use_existent;
 	uint64		sysidentifier;
 	struct timeval tv;
-	crc64		crc;
+	pg_crc32	crc;
 
 	/*
 	 * Select a hopefully-unique system identifier code for this
@@ -3582,16 +3694,17 @@ BootStrapXLOG(void)
 	record->xl_prev.xlogid = 0;
 	record->xl_prev.xrecoff = 0;
 	record->xl_xid = InvalidTransactionId;
+	record->xl_tot_len = SizeOfXLogRecord + sizeof(checkPoint);
 	record->xl_len = sizeof(checkPoint);
 	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
 	record->xl_rmid = RM_XLOG_ID;
 	memcpy(XLogRecGetData(record), &checkPoint, sizeof(checkPoint));
 
-	INIT_CRC64(crc);
-	COMP_CRC64(crc, &checkPoint, sizeof(checkPoint));
-	COMP_CRC64(crc, (char *) record + sizeof(crc64),
-			   SizeOfXLogRecord - sizeof(crc64));
-	FIN_CRC64(crc);
+	INIT_CRC32(crc);
+	COMP_CRC32(crc, &checkPoint, sizeof(checkPoint));
+	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
+			   SizeOfXLogRecord - sizeof(pg_crc32));
+	FIN_CRC32(crc);
 	record->xl_crc = crc;
 
 	/* Create first XLOG segment file */
@@ -4694,7 +4807,8 @@ ReadCheckpointRecord(XLogRecPtr RecPtr, int whichChkpt)
 		}
 		return NULL;
 	}
-	if (record->xl_len != sizeof(CheckPoint))
+	if (record->xl_len != sizeof(CheckPoint) ||
+		record->xl_tot_len != SizeOfXLogRecord + sizeof(CheckPoint))
 	{
 		switch (whichChkpt)
 		{
diff --git a/src/backend/storage/page/bufpage.c b/src/backend/storage/page/bufpage.c
index c33a0011e60..8f8ba9e0d2b 100644
--- a/src/backend/storage/page/bufpage.c
+++ b/src/backend/storage/page/bufpage.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.63 2005/03/22 06:17:03 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/page/bufpage.c,v 1.64 2005/06/02 05:55:28 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -357,7 +357,7 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
 			lp = PageGetItemId(page, i + 1);
 			lp->lp_len = 0;		/* indicate unused & deallocated */
 		}
-		((PageHeader) page)->pd_upper = pd_special;
+		((PageHeader) page)->pd_upper = pd_upper = pd_special;
 	}
 	else
 	{							/* nused != 0 */
@@ -411,11 +411,17 @@ PageRepairFragmentation(Page page, OffsetNumber *unused)
 			lp->lp_off = upper;
 		}
 
-		((PageHeader) page)->pd_upper = upper;
+		((PageHeader) page)->pd_upper = pd_upper = upper;
 
 		pfree(itemidbase);
 	}
 
+	/*
+	 * Zero out the now-free space.  This is not essential, but it allows
+	 * xlog.c to compress WAL data better.
+	 */
+	MemSet((char *) page + pd_lower, 0, pd_upper - pd_lower);
+
 	return (nline - nused);
 }
 
@@ -525,6 +531,13 @@ PageIndexTupleDelete(Page page, OffsetNumber offnum)
 	phdr->pd_upper += size;
 	phdr->pd_lower -= sizeof(ItemIdData);
 
+	/*
+	 * Zero out the just-freed space.  This is not essential, but it allows
+	 * xlog.c to compress WAL data better.
+	 */
+	MemSet((char *) page + phdr->pd_lower, 0, sizeof(ItemIdData));
+	MemSet(addr, 0, size);
+
 	/*
 	 * Finally, we need to adjust the linp entries that remain.
 	 *
@@ -672,8 +685,14 @@ PageIndexMultiDelete(Page page, OffsetNumber *itemnos, int nitems)
 		lp->lp_off = upper;
 	}
 
-	phdr->pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);
-	phdr->pd_upper = upper;
+	phdr->pd_lower = pd_lower = SizeOfPageHeaderData + nused * sizeof(ItemIdData);
+	phdr->pd_upper = pd_upper = upper;
+
+	/*
+	 * Zero out the now-free space.  This is not essential, but it allows
+	 * xlog.c to compress WAL data better.
+	 */
+	MemSet((char *) page + pd_lower, 0, pd_upper - pd_lower);
 
 	pfree(itemidbase);
 }
diff --git a/src/backend/utils/hash/pg_crc.c b/src/backend/utils/hash/pg_crc.c
index bf23242a5ad..211da1aa729 100644
--- a/src/backend/utils/hash/pg_crc.c
+++ b/src/backend/utils/hash/pg_crc.c
@@ -1,14 +1,25 @@
 /*-------------------------------------------------------------------------
  *
  * pg_crc.c
- *	  PostgreSQL 64-bit CRC support
+ *	  PostgreSQL CRC support
+ *
+ * See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.
+ *
+ * We use a normal (not "reflected", in Williams' terms) CRC, using initial
+ * all-ones register contents and a final bit inversion.
+ *
+ * The 64-bit variant is not used as of PostgreSQL 8.1, but we retain the
+ * code for possible future use.
+ *
  *
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/hash/pg_crc.c,v 1.12 2004/12/31 22:01:37 pgsql Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/hash/pg_crc.c,v 1.13 2005/06/02 05:55:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -17,9 +28,96 @@
 #include "utils/pg_crc.h"
 
 
+/*
+ * Table for x^32+x^26+x^23+x^22+x^16+x^12+x^11+x^10+x^8+x^7+x^5+x^4+x^2+x+1
+ * (the Ethernet checksum polynomial).  NOTE(review): [1] = 0x77073096 and
+ * [128] = 0xEDB88320 look bit-reflected; confirm COMP_CRC32 shifts to match.
+ */
+const uint32 pg_crc32_table[256] = {
+	0x00000000, 0x77073096, 0xEE0E612C, 0x990951BA,
+	0x076DC419, 0x706AF48F, 0xE963A535, 0x9E6495A3,
+	0x0EDB8832, 0x79DCB8A4, 0xE0D5E91E, 0x97D2D988,
+	0x09B64C2B, 0x7EB17CBD, 0xE7B82D07, 0x90BF1D91,
+	0x1DB71064, 0x6AB020F2, 0xF3B97148, 0x84BE41DE,
+	0x1ADAD47D, 0x6DDDE4EB, 0xF4D4B551, 0x83D385C7,
+	0x136C9856, 0x646BA8C0, 0xFD62F97A, 0x8A65C9EC,
+	0x14015C4F, 0x63066CD9, 0xFA0F3D63, 0x8D080DF5,
+	0x3B6E20C8, 0x4C69105E, 0xD56041E4, 0xA2677172,
+	0x3C03E4D1, 0x4B04D447, 0xD20D85FD, 0xA50AB56B,
+	0x35B5A8FA, 0x42B2986C, 0xDBBBC9D6, 0xACBCF940,
+	0x32D86CE3, 0x45DF5C75, 0xDCD60DCF, 0xABD13D59,
+	0x26D930AC, 0x51DE003A, 0xC8D75180, 0xBFD06116,
+	0x21B4F4B5, 0x56B3C423, 0xCFBA9599, 0xB8BDA50F,
+	0x2802B89E, 0x5F058808, 0xC60CD9B2, 0xB10BE924,
+	0x2F6F7C87, 0x58684C11, 0xC1611DAB, 0xB6662D3D,
+	0x76DC4190, 0x01DB7106, 0x98D220BC, 0xEFD5102A,
+	0x71B18589, 0x06B6B51F, 0x9FBFE4A5, 0xE8B8D433,
+	0x7807C9A2, 0x0F00F934, 0x9609A88E, 0xE10E9818,
+	0x7F6A0DBB, 0x086D3D2D, 0x91646C97, 0xE6635C01,
+	0x6B6B51F4, 0x1C6C6162, 0x856530D8, 0xF262004E,
+	0x6C0695ED, 0x1B01A57B, 0x8208F4C1, 0xF50FC457,
+	0x65B0D9C6, 0x12B7E950, 0x8BBEB8EA, 0xFCB9887C,
+	0x62DD1DDF, 0x15DA2D49, 0x8CD37CF3, 0xFBD44C65,
+	0x4DB26158, 0x3AB551CE, 0xA3BC0074, 0xD4BB30E2,
+	0x4ADFA541, 0x3DD895D7, 0xA4D1C46D, 0xD3D6F4FB,
+	0x4369E96A, 0x346ED9FC, 0xAD678846, 0xDA60B8D0,
+	0x44042D73, 0x33031DE5, 0xAA0A4C5F, 0xDD0D7CC9,
+	0x5005713C, 0x270241AA, 0xBE0B1010, 0xC90C2086,
+	0x5768B525, 0x206F85B3, 0xB966D409, 0xCE61E49F,
+	0x5EDEF90E, 0x29D9C998, 0xB0D09822, 0xC7D7A8B4,
+	0x59B33D17, 0x2EB40D81, 0xB7BD5C3B, 0xC0BA6CAD,
+	0xEDB88320, 0x9ABFB3B6, 0x03B6E20C, 0x74B1D29A,
+	0xEAD54739, 0x9DD277AF, 0x04DB2615, 0x73DC1683,
+	0xE3630B12, 0x94643B84, 0x0D6D6A3E, 0x7A6A5AA8,
+	0xE40ECF0B, 0x9309FF9D, 0x0A00AE27, 0x7D079EB1,
+	0xF00F9344, 0x8708A3D2, 0x1E01F268, 0x6906C2FE,
+	0xF762575D, 0x806567CB, 0x196C3671, 0x6E6B06E7,
+	0xFED41B76, 0x89D32BE0, 0x10DA7A5A, 0x67DD4ACC,
+	0xF9B9DF6F, 0x8EBEEFF9, 0x17B7BE43, 0x60B08ED5,
+	0xD6D6A3E8, 0xA1D1937E, 0x38D8C2C4, 0x4FDFF252,
+	0xD1BB67F1, 0xA6BC5767, 0x3FB506DD, 0x48B2364B,
+	0xD80D2BDA, 0xAF0A1B4C, 0x36034AF6, 0x41047A60,
+	0xDF60EFC3, 0xA867DF55, 0x316E8EEF, 0x4669BE79,
+	0xCB61B38C, 0xBC66831A, 0x256FD2A0, 0x5268E236,
+	0xCC0C7795, 0xBB0B4703, 0x220216B9, 0x5505262F,
+	0xC5BA3BBE, 0xB2BD0B28, 0x2BB45A92, 0x5CB36A04,
+	0xC2D7FFA7, 0xB5D0CF31, 0x2CD99E8B, 0x5BDEAE1D,
+	0x9B64C2B0, 0xEC63F226, 0x756AA39C, 0x026D930A,
+	0x9C0906A9, 0xEB0E363F, 0x72076785, 0x05005713,
+	0x95BF4A82, 0xE2B87A14, 0x7BB12BAE, 0x0CB61B38,
+	0x92D28E9B, 0xE5D5BE0D, 0x7CDCEFB7, 0x0BDBDF21,
+	0x86D3D2D4, 0xF1D4E242, 0x68DDB3F8, 0x1FDA836E,
+	0x81BE16CD, 0xF6B9265B, 0x6FB077E1, 0x18B74777,
+	0x88085AE6, 0xFF0F6A70, 0x66063BCA, 0x11010B5C,
+	0x8F659EFF, 0xF862AE69, 0x616BFFD3, 0x166CCF45,
+	0xA00AE278, 0xD70DD2EE, 0x4E048354, 0x3903B3C2,
+	0xA7672661, 0xD06016F7, 0x4969474D, 0x3E6E77DB,
+	0xAED16A4A, 0xD9D65ADC, 0x40DF0B66, 0x37D83BF0,
+	0xA9BCAE53, 0xDEBB9EC5, 0x47B2CF7F, 0x30B5FFE9,
+	0xBDBDF21C, 0xCABAC28A, 0x53B39330, 0x24B4A3A6,
+	0xBAD03605, 0xCDD70693, 0x54DE5729, 0x23D967BF,
+	0xB3667A2E, 0xC4614AB8, 0x5D681B02, 0x2A6F2B94,
+	0xB40BBE37, 0xC30C8EA1, 0x5A05DF1B, 0x2D02EF8D
+};
+
+
+#ifdef PROVIDE_64BIT_CRC
+
+/*
+ * This table is based on the polynomial
+ *
+ * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
+ * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
+ * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
+ * x^7 + x^4 + x + 1
+ *
+ * which is borrowed from the DLT1 spec
+ * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM)
+ */
+
 #ifdef INT64_IS_BUSTED
 
-const uint32 crc_table0[256] = {
+const uint32 pg_crc64_table0[256] = {
 	0x00000000, 0xA9EA3693,
 	0x53D46D26, 0xFA3E5BB5,
 	0x0E42ECDF, 0xA7A8DA4C,
@@ -150,7 +248,7 @@ const uint32 crc_table0[256] = {
 	0x676F8394, 0xCE85B507
 };
 
-const uint32 crc_table1[256] = {
+const uint32 pg_crc64_table1[256] = {
 	0x00000000, 0x42F0E1EB,
 	0x85E1C3D7, 0xC711223C,
 	0x49336645, 0x0BC387AE,
@@ -283,7 +381,7 @@ const uint32 crc_table1[256] = {
 
 #else							/* int64 works */
 
-const uint64 crc_table[256] = {
+const uint64 pg_crc64_table[256] = {
 	UINT64CONST(0x0000000000000000), UINT64CONST(0x42F0E1EBA9EA3693),
 	UINT64CONST(0x85E1C3D753D46D26), UINT64CONST(0xC711223CFA3E5BB5),
 	UINT64CONST(0x493366450E42ECDF), UINT64CONST(0x0BC387AEA7A8DA4C),
@@ -415,3 +513,5 @@ const uint64 crc_table[256] = {
 };
 
 #endif   /* INT64_IS_BUSTED */
+
+#endif	/* PROVIDE_64BIT_CRC */
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index d89a934dfc2..77f61af06f4 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -6,7 +6,7 @@
  * copyright (c) Oliver Elphick <olly@lfix.co.uk>, 2001;
  * licence: BSD
  *
- * $PostgreSQL: pgsql/src/bin/pg_controldata/pg_controldata.c,v 1.23 2005/04/28 21:47:16 tgl Exp $
+ * $PostgreSQL: pgsql/src/bin/pg_controldata/pg_controldata.c,v 1.24 2005/06/02 05:55:29 tgl Exp $
  */
 #include "postgres.h"
 
@@ -66,7 +66,7 @@ main(int argc, char *argv[])
 	int			fd;
 	char		ControlFilePath[MAXPGPATH];
 	char	   *DataDir;
-	crc64		crc;
+	pg_crc32	crc;
 	char		pgctime_str[128];
 	char		ckpttime_str[128];
 	char		sysident_str[32];
@@ -120,13 +120,13 @@ main(int argc, char *argv[])
 	close(fd);
 
 	/* Check the CRC. */
-	INIT_CRC64(crc);
-	COMP_CRC64(crc,
-			   (char *) &ControlFile + sizeof(crc64),
-			   sizeof(ControlFileData) - sizeof(crc64));
-	FIN_CRC64(crc);
+	INIT_CRC32(crc);
+	COMP_CRC32(crc,
+			   (char *) &ControlFile,
+			   offsetof(ControlFileData, crc));
+	FIN_CRC32(crc);
 
-	if (!EQ_CRC64(crc, ControlFile.crc))
+	if (!EQ_CRC32(crc, ControlFile.crc))
 		printf(_("WARNING: Calculated CRC checksum does not match value stored in file.\n"
 				 "Either the file is corrupt, or it has a different layout than this program\n"
 			 "is expecting.  The results below are untrustworthy.\n\n"));
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index cabc5c00124..6eceb0a3543 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -23,7 +23,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/bin/pg_resetxlog/pg_resetxlog.c,v 1.32 2005/04/28 21:47:16 tgl Exp $
+ * $PostgreSQL: pgsql/src/bin/pg_resetxlog/pg_resetxlog.c,v 1.33 2005/06/02 05:55:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -327,7 +327,7 @@ ReadControlFile(void)
 	int			fd;
 	int			len;
 	char	   *buffer;
-	crc64		crc;
+	pg_crc32	crc;
 
 	if ((fd = open(ControlFilePath, O_RDONLY)) < 0)
 	{
@@ -362,13 +362,13 @@ ReadControlFile(void)
 		((ControlFileData *) buffer)->pg_control_version == PG_CONTROL_VERSION)
 	{
 		/* Check the CRC. */
-		INIT_CRC64(crc);
-		COMP_CRC64(crc,
-				   buffer + sizeof(crc64),
-				   sizeof(ControlFileData) - sizeof(crc64));
-		FIN_CRC64(crc);
+		INIT_CRC32(crc);
+		COMP_CRC32(crc,
+				   buffer,
+				   offsetof(ControlFileData, crc));
+		FIN_CRC32(crc);
 
-		if (EQ_CRC64(crc, ((ControlFileData *) buffer)->crc))
+		if (EQ_CRC32(crc, ((ControlFileData *) buffer)->crc))
 		{
 			/* Valid data... */
 			memcpy(&ControlFile, buffer, sizeof(ControlFile));
@@ -553,11 +553,11 @@ RewriteControlFile(void)
 	ControlFile.prevCheckPoint.xrecoff = 0;
 
 	/* Contents are protected with a CRC */
-	INIT_CRC64(ControlFile.crc);
-	COMP_CRC64(ControlFile.crc,
-			   (char *) &ControlFile + sizeof(crc64),
-			   sizeof(ControlFileData) - sizeof(crc64));
-	FIN_CRC64(ControlFile.crc);
+	INIT_CRC32(ControlFile.crc);
+	COMP_CRC32(ControlFile.crc,
+			   (char *) &ControlFile,
+			   offsetof(ControlFileData, crc));
+	FIN_CRC32(ControlFile.crc);
 
 	/*
 	 * We write out BLCKSZ bytes into pg_control, zero-padding the excess
@@ -673,7 +673,7 @@ WriteEmptyXLOG(void)
 	XLogPageHeader page;
 	XLogLongPageHeader longpage;
 	XLogRecord *record;
-	crc64		crc;
+	pg_crc32	crc;
 	char		path[MAXPGPATH];
 	int			fd;
 	int			nbytes;
@@ -700,17 +700,18 @@ WriteEmptyXLOG(void)
 	record->xl_prev.xlogid = 0;
 	record->xl_prev.xrecoff = 0;
 	record->xl_xid = InvalidTransactionId;
+	record->xl_tot_len = SizeOfXLogRecord + sizeof(CheckPoint);
 	record->xl_len = sizeof(CheckPoint);
 	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
 	record->xl_rmid = RM_XLOG_ID;
 	memcpy(XLogRecGetData(record), &ControlFile.checkPointCopy,
 		   sizeof(CheckPoint));
 
-	INIT_CRC64(crc);
-	COMP_CRC64(crc, &ControlFile.checkPointCopy, sizeof(CheckPoint));
-	COMP_CRC64(crc, (char *) record + sizeof(crc64),
-			   SizeOfXLogRecord - sizeof(crc64));
-	FIN_CRC64(crc);
+	INIT_CRC32(crc);
+	COMP_CRC32(crc, &ControlFile.checkPointCopy, sizeof(CheckPoint));
+	COMP_CRC32(crc, (char *) record + sizeof(pg_crc32),
+			   SizeOfXLogRecord - sizeof(pg_crc32));
+	FIN_CRC32(crc);
 	record->xl_crc = crc;
 
 	/* Write the first page */
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index ab471738970..1d1aa9c1526 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.61 2005/05/20 14:53:26 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog.h,v 1.62 2005/06/02 05:55:29 tgl Exp $
  */
 #ifndef XLOG_H
 #define XLOG_H
@@ -19,23 +19,31 @@
 
 
 /*
- * Header for each record in XLOG
+ * The overall layout of an XLOG record is:
+ *		Fixed-size header (XLogRecord struct)
+ *		rmgr-specific data
+ *		BkpBlock
+ *		backup block data
+ *		BkpBlock
+ *		backup block data
+ *		...
  *
- * NOTE: xl_len counts only the rmgr data, not the XLogRecord header,
- * and also not any backup blocks appended to the record (which are signaled
- * by xl_info flag bits).  The total space needed for an XLOG record is
- * really:
- *
- * SizeOfXLogRecord + xl_len + n_backup_blocks * (sizeof(BkpBlock) + BLCKSZ)
+ * where there can be zero to three backup blocks (as signaled by xl_info flag
+ * bits).  XLogRecord structs always start on MAXALIGN boundaries in the WAL
+ * files, and we round up SizeOfXLogRecord so that the rmgr data is also
+ * guaranteed to begin on a MAXALIGN boundary.  However, no padding is added
+ * to align BkpBlock structs or backup block data.
  *
- * rounded up to a MAXALIGN boundary (so that all xlog records start on
- * MAXALIGN boundaries).
+ * NOTE: xl_len counts only the rmgr data, not the XLogRecord header,
+ * and also not any backup blocks.  xl_tot_len counts everything.  Neither
+ * length field is rounded up to an alignment boundary.
  */
 typedef struct XLogRecord
 {
-	crc64		xl_crc;			/* CRC for this record */
+	pg_crc32	xl_crc;			/* CRC for this record */
 	XLogRecPtr	xl_prev;		/* ptr to previous record in log */
 	TransactionId xl_xid;		/* xact id */
+	uint32		xl_tot_len;		/* total len of entire record */
 	uint32		xl_len;			/* total len of rmgr data */
 	uint8		xl_info;		/* flag bits, see below */
 	RmgrId		xl_rmid;		/* resource manager for this record */
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 75842328db4..a0b0b761ccb 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -11,7 +11,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/xlog_internal.h,v 1.6 2004/12/31 22:03:21 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/access/xlog_internal.h,v 1.7 2005/06/02 05:55:29 tgl Exp $
  */
 #ifndef XLOG_INTERNAL_H
 #define XLOG_INTERNAL_H
@@ -25,15 +25,25 @@
 /*
  * Header info for a backup block appended to an XLOG record.
  *
- * Note that the backup block has its own CRC, and is not covered by
- * the CRC of the XLOG record proper.  Also note that we don't attempt
- * to align either the BkpBlock struct or the block's data.
+ * As a trivial form of data compression, the XLOG code is aware that
+ * PG data pages usually contain an unused "hole" in the middle, which
+ * contains only zero bytes.  If hole_length > 0 then we have removed
+ * such a "hole" from the stored data (and it's not counted in the
+ * XLOG record's CRC, either).  Hence, the amount of block data actually
+ * present following the BkpBlock struct is BLCKSZ - hole_length bytes.
+ *
+ * Note that we don't attempt to align either the BkpBlock struct or the
+ * block's data.  So, the struct must be copied to aligned local storage
+ * before use.
  */
 typedef struct BkpBlock
 {
-	crc64		crc;
-	RelFileNode node;
-	BlockNumber block;
+	RelFileNode node;			/* relation containing block */
+	BlockNumber block;			/* block number */
+	uint16		hole_offset;	/* number of bytes before "hole" */
+	uint16		hole_length;	/* number of bytes in "hole"; 0 if no hole was removed */
+
+	/* ACTUAL BLOCK DATA (BLCKSZ - hole_length BYTES) FOLLOWS AT END OF STRUCT */
 } BkpBlock;
 
 /*
@@ -42,8 +52,9 @@ typedef struct BkpBlock
  * XLogRecord header will never be split across pages; if there's less than
  * SizeOfXLogRecord space left at the end of a page, we just waste it.)
  *
- * Note that xl_rem_len includes backup-block data, unlike xl_len in the
- * initial header.
+ * Note that xl_rem_len includes backup-block data; that is, it tracks
+ * xl_tot_len not xl_len in the initial header.  Also note that the
+ * continuation data isn't necessarily aligned.
  */
 typedef struct XLogContRecord
 {
@@ -53,12 +64,12 @@ typedef struct XLogContRecord
 
 } XLogContRecord;
 
-#define SizeOfXLogContRecord	MAXALIGN(sizeof(XLogContRecord))
+#define SizeOfXLogContRecord	sizeof(XLogContRecord)
 
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0xD05C	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD05D	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index e60a879424a..3f96b6bf261 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/pg_control.h,v 1.21 2005/04/28 21:47:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_control.h,v 1.22 2005/06/02 05:55:29 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,7 +22,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	81
+#define PG_CONTROL_VERSION	810
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -73,12 +73,17 @@ typedef enum DBState
 
 typedef struct ControlFileData
 {
-	crc64		crc;			/* CRC for remainder of struct */
+	/*
+	 * Unique system identifier --- to ensure we match up xlog files with
+	 * the installation that produced them.
+	 */
+	uint64		system_identifier;
 
 	/*
-	 * Version identifier information.	Keep these fields at the front,
+	 * Version identifier information.	Keep these fields at the same offset,
 	 * especially pg_control_version; they won't be real useful if they
-	 * move around.
+	 * move around.  (For historical reasons they must be 8 bytes into
+	 * the file rather than immediately at the front.)
 	 *
 	 * pg_control_version identifies the format of pg_control itself.
 	 * catalog_version_no identifies the format of the system catalogs.
@@ -90,12 +95,6 @@ typedef struct ControlFileData
 	uint32		pg_control_version;		/* PG_CONTROL_VERSION */
 	uint32		catalog_version_no;		/* see catversion.h */
 
-	/*
-	 * Unique system identifier --- to ensure we match up xlog files with
-	 * the installation that produced them.
-	 */
-	uint64		system_identifier;
-
 	/*
 	 * System status data
 	 */
@@ -127,6 +126,9 @@ typedef struct ControlFileData
 	uint32		localeBuflen;
 	char		lc_collate[LOCALE_NAME_BUFLEN];
 	char		lc_ctype[LOCALE_NAME_BUFLEN];
+
+	/* CRC of all above ... MUST BE LAST! */
+	pg_crc32	crc;
 } ControlFileData;
 
 #endif   /* PG_CONTROL_H */
diff --git a/src/include/utils/pg_crc.h b/src/include/utils/pg_crc.h
index 6638f75d74e..5bf9ed76335 100644
--- a/src/include/utils/pg_crc.h
+++ b/src/include/utils/pg_crc.h
@@ -1,32 +1,65 @@
 /*
  * pg_crc.h
  *
- * PostgreSQL 64-bit CRC support
+ * PostgreSQL CRC support
+ *
+ * See Ross Williams' excellent introduction
+ * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
+ * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.
+ *
+ * We use a normal (not "reflected", in Williams' terms) CRC, using initial
+ * all-ones register contents and a final bit inversion.
+ *
+ * The 64-bit variant is not used as of PostgreSQL 8.1, but we retain the
+ * code for possible future use.
+ *
  *
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/pg_crc.h,v 1.12 2004/12/31 22:03:46 pgsql Exp $
+ * $PostgreSQL: pgsql/src/include/utils/pg_crc.h,v 1.13 2005/06/02 05:55:29 tgl Exp $
  */
 #ifndef PG_CRC_H
 #define PG_CRC_H
 
+
+typedef uint32 pg_crc32;
+
+/* Initialize a CRC accumulator (all-ones starting value) */
+#define INIT_CRC32(crc) ((crc) = 0xFFFFFFFF)
+
+/* Finish a CRC calculation (invert all bits) */
+#define FIN_CRC32(crc)	((crc) ^= 0xFFFFFFFF)
+
+/* Accumulate some (more) bytes into a CRC; crc must be an lvalue and is evaluated repeatedly */
+#define COMP_CRC32(crc, data, len)	\
+do { \
+	unsigned char *__data = (unsigned char *) (data); \
+	uint32		__len = (len); \
+\
+	while (__len-- > 0) \
+	{ \
+		int		__tab_index = ((int) ((crc) >> 24) ^ *__data++) & 0xFF; \
+		(crc) = pg_crc32_table[__tab_index] ^ ((crc) << 8); \
+	} \
+} while (0)
+
+/* Check for equality of two CRCs */
+#define EQ_CRC32(c1,c2)  ((c1) == (c2))
+
+/* Constant table for CRC calculation, indexed by a value in 0..255 */
+extern const uint32 pg_crc32_table[];
+
+
+#ifdef PROVIDE_64BIT_CRC
+
 /*
  * If we have a 64-bit integer type, then a 64-bit CRC looks just like the
- * usual sort of implementation.  (See Ross Williams' excellent introduction
- * A PAINLESS GUIDE TO CRC ERROR DETECTION ALGORITHMS, available from
- * ftp://ftp.rocksoft.com/papers/crc_v3.txt or several other net sites.)
- * If we have no working 64-bit type, then fake it with two 32-bit registers.
- *
- * The present implementation is a normal (not "reflected", in Williams'
- * terms) 64-bit CRC, using initial all-ones register contents and a final
- * bit inversion.  The chosen polynomial is borrowed from the DLT1 spec
- * (ECMA-182, available from http://www.ecma.ch/ecma1/STAND/ECMA-182.HTM):
- *
- * x^64 + x^62 + x^57 + x^55 + x^54 + x^53 + x^52 + x^47 + x^46 + x^45 +
- * x^40 + x^39 + x^38 + x^37 + x^35 + x^33 + x^32 + x^31 + x^29 + x^27 +
- * x^24 + x^23 + x^22 + x^21 + x^19 + x^17 + x^13 + x^12 + x^10 + x^9 +
- * x^7 + x^4 + x + 1
+ * usual sort of implementation.  If we have no working 64-bit type, then
+ * fake it with two 32-bit registers.  (Note: experience has shown that the
+ * two-32-bit-registers code is as fast as, or even much faster than, the
+ * 64-bit code on all but true 64-bit machines.  INT64_IS_BUSTED is therefore
+ * probably the wrong control symbol to use to select the implementation.)
  */
 
 #ifdef INT64_IS_BUSTED
@@ -39,11 +72,11 @@
  * all machines, we could do a configure test to decide how to order the
  * two fields, but it seems not worth the trouble.
  */
-typedef struct crc64
+typedef struct pg_crc64
 {
 	uint32		crc0;
 	uint32		crc1;
-} crc64;
+} pg_crc64;
 
 /* Initialize a CRC accumulator */
 #define INIT_CRC64(crc) ((crc).crc0 = 0xffffffff, (crc).crc1 = 0xffffffff)
@@ -62,8 +95,8 @@ do { \
 	while (__len-- > 0) \
 	{ \
 		int		__tab_index = ((int) (__crc1 >> 24) ^ *__data++) & 0xFF; \
-		__crc1 = crc_table1[__tab_index] ^ ((__crc1 << 8) | (__crc0 >> 24)); \
-		__crc0 = crc_table0[__tab_index] ^ (__crc0 << 8); \
+		__crc1 = pg_crc64_table1[__tab_index] ^ ((__crc1 << 8) | (__crc0 >> 24)); \
+		__crc0 = pg_crc64_table0[__tab_index] ^ (__crc0 << 8); \
 	} \
 	(crc).crc0 = __crc0; \
 	(crc).crc1 = __crc1; \
@@ -73,15 +106,15 @@ do { \
 #define EQ_CRC64(c1,c2)  ((c1).crc0 == (c2).crc0 && (c1).crc1 == (c2).crc1)
 
 /* Constant table for CRC calculation */
-extern const uint32 crc_table0[];
-extern const uint32 crc_table1[];
+extern const uint32 pg_crc64_table0[];
+extern const uint32 pg_crc64_table1[];
 
 #else							/* int64 works */
 
-typedef struct crc64
+typedef struct pg_crc64
 {
 	uint64		crc0;
-} crc64;
+} pg_crc64;
 
 /* Initialize a CRC accumulator */
 #define INIT_CRC64(crc) ((crc).crc0 = UINT64CONST(0xffffffffffffffff))
@@ -99,7 +132,7 @@ do { \
 	while (__len-- > 0) \
 	{ \
 		int		__tab_index = ((int) (__crc0 >> 56) ^ *__data++) & 0xFF; \
-		__crc0 = crc_table[__tab_index] ^ (__crc0 << 8); \
+		__crc0 = pg_crc64_table[__tab_index] ^ (__crc0 << 8); \
 	} \
 	(crc).crc0 = __crc0; \
 } while (0)
@@ -108,7 +141,9 @@ do { \
 #define EQ_CRC64(c1,c2)  ((c1).crc0 == (c2).crc0)
 
 /* Constant table for CRC calculation */
-extern const uint64 crc_table[];
+extern const uint64 pg_crc64_table[];
 #endif   /* INT64_IS_BUSTED */
 
+#endif	/* PROVIDE_64BIT_CRC */
+
 #endif   /* PG_CRC_H */
-- 
GitLab