From 0cb91ccba93038c57a7dda6388c9f6bcd5cc52c0 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 8 Dec 2006 19:50:53 +0000
Subject: [PATCH] Remove the logId/logSeg fields from pg_control, because they
 are not needed in normal operation, and we can avoid rewriting pg_control at
 every log segment switch if we don't insist that these values be valid. 
 Reducing the number of pg_control updates is a good idea for both performance
 and reliability.  It does make pg_resetxlog's life a bit harder, but that
 seems a good tradeoff; and anyway the change to pg_resetxlog amounts to
 automating something people formerly needed to do by hand, namely look at the
 existing pg_xlog files to make sure the new WAL start point was past them.

In passing, change the wording of xlog.c's "database system was interrupted"
messages: describe the pg_control timestamp as "last known up at" rather than
implying it is the exact time of service interruption.  With this change the
timestamp will generally be the time of the last checkpoint, which could be
many minutes before the failure; and we've already seen indications that
people tend to misinterpret the old wording.

initdb forced due to change in pg_control layout.  Simon Riggs and Tom Lane
---
 doc/src/sgml/ref/pg_resetxlog.sgml      |  18 ++-
 src/backend/access/transam/xlog.c       | 100 +++++++----------
 src/bin/pg_controldata/pg_controldata.c |   6 +-
 src/bin/pg_resetxlog/pg_resetxlog.c     | 141 +++++++++++++++++++-----
 src/include/catalog/pg_control.h        |   6 +-
 5 files changed, 172 insertions(+), 99 deletions(-)

diff --git a/doc/src/sgml/ref/pg_resetxlog.sgml b/doc/src/sgml/ref/pg_resetxlog.sgml
index 11d5d1eef63..c1c9754368f 100644
--- a/doc/src/sgml/ref/pg_resetxlog.sgml
+++ b/doc/src/sgml/ref/pg_resetxlog.sgml
@@ -1,5 +1,5 @@
 <!--
-$PostgreSQL: pgsql/doc/src/sgml/ref/pg_resetxlog.sgml,v 1.18 2006/09/16 00:30:19 momjian Exp $
+$PostgreSQL: pgsql/doc/src/sgml/ref/pg_resetxlog.sgml,v 1.19 2006/12/08 19:50:52 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -72,7 +72,7 @@ PostgreSQL documentation
    <literal>-f</> can still be used, but
    the recovered database must be treated with even more suspicion than
    usual: an immediate dump and reload is imperative.  <emphasis>Do not</>
-   execute any data-modifying operations in the database before you dump;
+   execute any data-modifying operations in the database before you dump,
    as any such action is likely to make the corruption worse.
   </para>
 
@@ -127,7 +127,7 @@ PostgreSQL documentation
     <listitem>
      <para>
       The WAL starting address (<literal>-l</>) should be
-      larger than any file name currently existing in
+      larger than any WAL segment file name currently existing in
       the directory <filename>pg_xlog</> under the data directory.
       These names are also in hexadecimal and have three parts.  The first
       part is the <quote>timeline ID</> and should usually be kept the same.
@@ -139,6 +139,18 @@ PostgreSQL documentation
       <filename>000000010000003A000000FF</>, choose <literal>-l 0x1,0x3B,0x0</>
       or more.
      </para>
+
+     <note>
+      <para>
+       <command>pg_resetxlog</command> itself looks at the files in
+       <filename>pg_xlog</> and chooses a default <literal>-l</> setting
+       beyond the last existing file name.  Therefore, manual adjustment of
+       <literal>-l</> should only be needed if you are aware of WAL segment
+       files that are not currently present in <filename>pg_xlog</>, such as
+       entries in an offline archive; or if the contents of
+       <filename>pg_xlog</> have been lost entirely.
+      </para>
+     </note>
     </listitem>
 
     <listitem>
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 98c610dcadd..4d4a17a3aa5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.258 2006/11/30 18:29:11 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.259 2006/12/08 19:50:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1538,54 +1538,6 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
 			openLogFile = XLogFileInit(openLogId, openLogSeg,
 									   &use_existent, true);
 			openLogOff = 0;
-
-			/* update pg_control, unless someone else already did */
-			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
-			if (ControlFile->logId < openLogId ||
-				(ControlFile->logId == openLogId &&
-				 ControlFile->logSeg < openLogSeg + 1))
-			{
-				ControlFile->logId = openLogId;
-				ControlFile->logSeg = openLogSeg + 1;
-				ControlFile->time = time(NULL);
-				UpdateControlFile();
-
-				/*
-				 * Signal bgwriter to start a checkpoint if it's been too long
-				 * since the last one.	(We look at local copy of RedoRecPtr
-				 * which might be a little out of date, but should be close
-				 * enough for this purpose.)
-				 *
-				 * A straight computation of segment number could overflow 32
-				 * bits.  Rather than assuming we have working 64-bit
-				 * arithmetic, we compare the highest-order bits separately,
-				 * and force a checkpoint immediately when they change.
-				 */
-				if (IsUnderPostmaster)
-				{
-					uint32		old_segno,
-								new_segno;
-					uint32		old_highbits,
-								new_highbits;
-
-					old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
-						(RedoRecPtr.xrecoff / XLogSegSize);
-					old_highbits = RedoRecPtr.xlogid / XLogSegSize;
-					new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile +
-						openLogSeg;
-					new_highbits = openLogId / XLogSegSize;
-					if (new_highbits != old_highbits ||
-						new_segno >= old_segno + (uint32) CheckPointSegments)
-					{
-#ifdef WAL_DEBUG
-						if (XLOG_DEBUG)
-							elog(LOG, "time for a checkpoint, signaling bgwriter");
-#endif
-						RequestCheckpoint(false, true);
-					}
-				}
-			}
-			LWLockRelease(ControlFileLock);
 		}
 
 		/* Make sure we have the current logfile open */
@@ -1669,7 +1621,9 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
 			 *
 			 * This is also the right place to notify the Archiver that the
 			 * segment is ready to copy to archival storage, and to update the
-			 * timer for archive_timeout.
+			 * timer for archive_timeout, and to signal for a checkpoint if
+			 * too many logfile segments have been used since the last
+			 * checkpoint.
 			 */
 			if (finishing_seg || (xlog_switch && last_iteration))
 			{
@@ -1680,6 +1634,41 @@ XLogWrite(XLogwrtRqst WriteRqst, bool flexible, bool xlog_switch)
 					XLogArchiveNotifySeg(openLogId, openLogSeg);
 
 				Write->lastSegSwitchTime = time(NULL);
+
+				/*
+				 * Signal bgwriter to start a checkpoint if it's been too long
+				 * since the last one.	(We look at local copy of RedoRecPtr
+				 * which might be a little out of date, but should be close
+				 * enough for this purpose.)
+				 *
+				 * A straight computation of segment number could overflow 32
+				 * bits.  Rather than assuming we have working 64-bit
+				 * arithmetic, we compare the highest-order bits separately,
+				 * and force a checkpoint immediately when they change.
+				 */
+				if (IsUnderPostmaster)
+				{
+					uint32		old_segno,
+								new_segno;
+					uint32		old_highbits,
+								new_highbits;
+
+					old_segno = (RedoRecPtr.xlogid % XLogSegSize) * XLogSegsPerFile +
+						(RedoRecPtr.xrecoff / XLogSegSize);
+					old_highbits = RedoRecPtr.xlogid / XLogSegSize;
+					new_segno = (openLogId % XLogSegSize) * XLogSegsPerFile +
+						openLogSeg;
+					new_highbits = openLogId / XLogSegSize;
+					if (new_highbits != old_highbits ||
+						new_segno >= old_segno + (uint32) (CheckPointSegments-1))
+					{
+#ifdef WAL_DEBUG
+						if (XLOG_DEBUG)
+							elog(LOG, "time for a checkpoint, signaling bgwriter");
+#endif
+						RequestCheckpoint(false, true);
+					}
+				}
 			}
 		}
 
@@ -4199,8 +4188,6 @@ BootStrapXLOG(void)
 	ControlFile->system_identifier = sysidentifier;
 	ControlFile->state = DB_SHUTDOWNED;
 	ControlFile->time = checkPoint.time;
-	ControlFile->logId = 0;
-	ControlFile->logSeg = 1;
 	ControlFile->checkPoint = checkPoint.redo;
 	ControlFile->checkPointCopy = checkPoint;
 	/* some additional ControlFile fields are set in WriteControlFile() */
@@ -4659,8 +4646,7 @@ StartupXLOG(void)
 	 */
 	ReadControlFile();
 
-	if (ControlFile->logSeg == 0 ||
-		ControlFile->state < DB_SHUTDOWNED ||
+	if (ControlFile->state < DB_SHUTDOWNED ||
 		ControlFile->state > DB_IN_PRODUCTION ||
 		!XRecOffIsValid(ControlFile->checkPoint.xrecoff))
 		ereport(FATAL,
@@ -4672,7 +4658,7 @@ StartupXLOG(void)
 						str_time(ControlFile->time))));
 	else if (ControlFile->state == DB_SHUTDOWNING)
 		ereport(LOG,
-				(errmsg("database system shutdown was interrupted at %s",
+				(errmsg("database system shutdown was interrupted; last known up at %s",
 						str_time(ControlFile->time))));
 	else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
 		ereport(LOG,
@@ -4688,7 +4674,7 @@ StartupXLOG(void)
 				" and you may need to choose an earlier recovery target.")));
 	else if (ControlFile->state == DB_IN_PRODUCTION)
 		ereport(LOG,
-				(errmsg("database system was interrupted at %s",
+				(errmsg("database system was interrupted; last known up at %s",
 						str_time(ControlFile->time))));
 
 	/* This is just to allow attaching to startup process with a debugger */
@@ -5064,8 +5050,6 @@ StartupXLOG(void)
 	openLogSeg = endLogSeg;
 	openLogFile = XLogFileOpen(openLogId, openLogSeg);
 	openLogOff = 0;
-	ControlFile->logId = openLogId;
-	ControlFile->logSeg = openLogSeg + 1;
 	Insert = &XLogCtl->Insert;
 	Insert->PrevRecord = LastRec;
 	XLogCtl->xlblocks[0].xlogid = openLogId;
diff --git a/src/bin/pg_controldata/pg_controldata.c b/src/bin/pg_controldata/pg_controldata.c
index cbde5357edf..6805af61536 100644
--- a/src/bin/pg_controldata/pg_controldata.c
+++ b/src/bin/pg_controldata/pg_controldata.c
@@ -6,7 +6,7 @@
  * copyright (c) Oliver Elphick <olly@lfix.co.uk>, 2001;
  * licence: BSD
  *
- * $PostgreSQL: pgsql/src/bin/pg_controldata/pg_controldata.c,v 1.31 2006/08/21 16:16:31 tgl Exp $
+ * $PostgreSQL: pgsql/src/bin/pg_controldata/pg_controldata.c,v 1.32 2006/12/08 19:50:53 tgl Exp $
  */
 #include "postgres.h"
 
@@ -159,10 +159,6 @@ main(int argc, char *argv[])
 		   dbState(ControlFile.state));
 	printf(_("pg_control last modified:             %s\n"),
 		   pgctime_str);
-	printf(_("Current log file ID:                  %u\n"),
-		   ControlFile.logId);
-	printf(_("Next log file segment:                %u\n"),
-		   ControlFile.logSeg);
 	printf(_("Latest checkpoint location:           %X/%X\n"),
 		   ControlFile.checkPoint.xlogid,
 		   ControlFile.checkPoint.xrecoff);
diff --git a/src/bin/pg_resetxlog/pg_resetxlog.c b/src/bin/pg_resetxlog/pg_resetxlog.c
index 15613832792..9212ccc5b76 100644
--- a/src/bin/pg_resetxlog/pg_resetxlog.c
+++ b/src/bin/pg_resetxlog/pg_resetxlog.c
@@ -23,7 +23,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/bin/pg_resetxlog/pg_resetxlog.c,v 1.53 2006/10/04 00:30:05 momjian Exp $
+ * $PostgreSQL: pgsql/src/bin/pg_resetxlog/pg_resetxlog.c,v 1.54 2006/12/08 19:50:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -60,6 +60,7 @@ static bool ReadControlFile(void);
 static void GuessControlValues(void);
 static void PrintControlValues(bool guessed);
 static void RewriteControlFile(void);
+static void FindEndOfXLOG(void);
 static void KillExistingXLOG(void);
 static void WriteEmptyXLOG(void);
 static void usage(void);
@@ -283,6 +284,11 @@ main(int argc, char *argv[])
 	if (!ReadControlFile())
 		GuessControlValues();
 
+	/*
+	 * Also look at existing segment files to set up newXlogId/newXlogSeg
+	 */
+	FindEndOfXLOG();
+
 	/*
 	 * Adjust fields if required by switches.  (Do this now so that printout,
 	 * if any, includes these values.)
@@ -305,12 +311,12 @@ main(int argc, char *argv[])
 	if (minXlogTli > ControlFile.checkPointCopy.ThisTimeLineID)
 		ControlFile.checkPointCopy.ThisTimeLineID = minXlogTli;
 
-	if (minXlogId > ControlFile.logId ||
-		(minXlogId == ControlFile.logId &&
-		 minXlogSeg > ControlFile.logSeg))
+	if (minXlogId > newXlogId ||
+		(minXlogId == newXlogId &&
+		 minXlogSeg > newXlogSeg))
 	{
-		ControlFile.logId = minXlogId;
-		ControlFile.logSeg = minXlogSeg;
+		newXlogId = minXlogId;
+		newXlogSeg = minXlogSeg;
 	}
 
 	/*
@@ -469,8 +475,6 @@ GuessControlValues(void)
 
 	ControlFile.state = DB_SHUTDOWNED;
 	ControlFile.time = time(NULL);
-	ControlFile.logId = 0;
-	ControlFile.logSeg = 1;
 	ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
 
 	ControlFile.maxAlign = MAXIMUM_ALIGNOF;
@@ -533,16 +537,16 @@ PrintControlValues(bool guessed)
 	snprintf(sysident_str, sizeof(sysident_str), UINT64_FORMAT,
 			 ControlFile.system_identifier);
 
+	printf(_("First log file ID for new XLOG:       %u\n"),
+		   newXlogId);
+	printf(_("First log file segment for new XLOG:  %u\n"),
+		   newXlogSeg);
 	printf(_("pg_control version number:            %u\n"),
 		   ControlFile.pg_control_version);
 	printf(_("Catalog version number:               %u\n"),
 		   ControlFile.catalog_version_no);
 	printf(_("Database system identifier:           %s\n"),
 		   sysident_str);
-	printf(_("Current log file ID:                  %u\n"),
-		   ControlFile.logId);
-	printf(_("Next log file segment:                %u\n"),
-		   ControlFile.logSeg);
 	printf(_("Latest checkpoint's TimeLineID:       %u\n"),
 		   ControlFile.checkPointCopy.ThisTimeLineID);
 	printf(_("Latest checkpoint's NextXID:          %u/%u\n"),
@@ -590,22 +594,9 @@ RewriteControlFile(void)
 	char		buffer[PG_CONTROL_SIZE];		/* need not be aligned */
 
 	/*
-	 * Adjust fields as needed to force an empty XLOG starting at the next
-	 * available segment.
+	 * Adjust fields as needed to force an empty XLOG starting at
+	 * newXlogId/newXlogSeg.
 	 */
-	newXlogId = ControlFile.logId;
-	newXlogSeg = ControlFile.logSeg;
-
-	/* adjust in case we are changing segment size */
-	newXlogSeg *= ControlFile.xlog_seg_size;
-	newXlogSeg = (newXlogSeg + XLogSegSize - 1) / XLogSegSize;
-
-	/* be sure we wrap around correctly at end of a logfile */
-	NextLogSeg(newXlogId, newXlogSeg);
-
-	/* Now we can force the recorded xlog seg size to the right thing. */
-	ControlFile.xlog_seg_size = XLogSegSize;
-
 	ControlFile.checkPointCopy.redo.xlogid = newXlogId;
 	ControlFile.checkPointCopy.redo.xrecoff =
 		newXlogSeg * XLogSegSize + SizeOfXLogLongPHD;
@@ -614,14 +605,15 @@ RewriteControlFile(void)
 
 	ControlFile.state = DB_SHUTDOWNED;
 	ControlFile.time = time(NULL);
-	ControlFile.logId = newXlogId;
-	ControlFile.logSeg = newXlogSeg + 1;
 	ControlFile.checkPoint = ControlFile.checkPointCopy.redo;
 	ControlFile.prevCheckPoint.xlogid = 0;
 	ControlFile.prevCheckPoint.xrecoff = 0;
 	ControlFile.minRecoveryPoint.xlogid = 0;
 	ControlFile.minRecoveryPoint.xrecoff = 0;
 
+	/* Now we can force the recorded xlog seg size to the right thing. */
+	ControlFile.xlog_seg_size = XLogSegSize;
+
 	/* Contents are protected with a CRC */
 	INIT_CRC32(ControlFile.crc);
 	COMP_CRC32(ControlFile.crc,
@@ -680,6 +672,97 @@ RewriteControlFile(void)
 }
 
 
+/*
+ * Scan existing XLOG files and determine the highest existing WAL address
+ *
+ * On entry, ControlFile.checkPointCopy.redo and ControlFile.xlog_seg_size
+ * are assumed valid (note that we allow the old xlog seg size to differ
+ * from what we're using).  On exit, newXlogId and newXlogSeg are set to
+ * suitable values for the beginning of replacement WAL (in our seg size).
+ */
+static void
+FindEndOfXLOG(void)
+{
+	DIR		   *xldir;
+	struct dirent *xlde;
+
+	/*
+	 * Initialize the max() computation using the last checkpoint address
+	 * from old pg_control.  Note that for the moment we are working with
+	 * segment numbering according to the old xlog seg size.
+	 */
+	newXlogId = ControlFile.checkPointCopy.redo.xlogid;
+	newXlogSeg = ControlFile.checkPointCopy.redo.xrecoff / ControlFile.xlog_seg_size;
+
+	/*
+	 * Scan the pg_xlog directory to find existing WAL segment files.
+	 * We assume any present have been used; in most scenarios this should
+	 * be conservative, because of xlog.c's attempts to pre-create files.
+	 */
+	xldir = opendir(XLOGDIR);
+	if (xldir == NULL)
+	{
+		fprintf(stderr, _("%s: could not open directory \"%s\": %s\n"),
+				progname, XLOGDIR, strerror(errno));
+		exit(1);
+	}
+
+	errno = 0;
+	while ((xlde = readdir(xldir)) != NULL)
+	{
+		if (strlen(xlde->d_name) == 24 &&
+			strspn(xlde->d_name, "0123456789ABCDEF") == 24)
+		{
+			unsigned int	tli,
+							log,
+							seg;
+
+			sscanf(xlde->d_name, "%08X%08X%08X", &tli, &log, &seg);
+			/*
+			 * Note: we take the max of all files found, regardless of their
+			 * timelines.  Another possibility would be to ignore files of
+			 * timelines other than the target TLI, but this seems safer.
+			 * Better too large a result than too small...
+			 */
+			if (log > newXlogId ||
+				(log == newXlogId && seg > newXlogSeg))
+			{
+				newXlogId = log;
+				newXlogSeg = seg;
+			}
+		}
+		errno = 0;
+	}
+#ifdef WIN32
+
+	/*
+	 * This fix is in mingw cvs (runtime/mingwex/dirent.c rev 1.4), but not in
+	 * released version
+	 */
+	if (GetLastError() == ERROR_NO_MORE_FILES)
+		errno = 0;
+#endif
+
+	if (errno)
+	{
+		fprintf(stderr, _("%s: could not read from directory \"%s\": %s\n"),
+				progname, XLOGDIR, strerror(errno));
+		exit(1);
+	}
+	closedir(xldir);
+
+	/*
+	 * Finally, convert to new xlog seg size, and advance by one to ensure
+	 * we are in virgin territory.
+	 */
+	newXlogSeg *= ControlFile.xlog_seg_size;
+	newXlogSeg = (newXlogSeg + XLogSegSize - 1) / XLogSegSize;
+
+	/* be sure we wrap around correctly at end of a logfile */
+	NextLogSeg(newXlogId, newXlogSeg);
+}
+
+
 /*
  * Remove existing XLOG files
  */
diff --git a/src/include/catalog/pg_control.h b/src/include/catalog/pg_control.h
index 0e1132052e7..84c148c2abd 100644
--- a/src/include/catalog/pg_control.h
+++ b/src/include/catalog/pg_control.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/catalog/pg_control.h,v 1.33 2006/10/04 00:30:07 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/catalog/pg_control.h,v 1.34 2006/12/08 19:50:53 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -22,7 +22,7 @@
 
 
 /* Version identifier for this pg_control format */
-#define PG_CONTROL_VERSION	822
+#define PG_CONTROL_VERSION	831
 
 /*
  * Body of CheckPoint XLOG records.  This is declared here because we keep
@@ -102,8 +102,6 @@ typedef struct ControlFileData
 	 */
 	DBState		state;			/* see enum above */
 	time_t		time;			/* time stamp of last pg_control update */
-	uint32		logId;			/* current log file id */
-	uint32		logSeg;			/* current log file segment, + 1 */
 	XLogRecPtr	checkPoint;		/* last check point record ptr */
 	XLogRecPtr	prevCheckPoint; /* previous check point record ptr */
 
-- 
GitLab