From 32f4de0adfb2037f1402e40b54a5c4043227363f Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Tue, 4 Dec 2012 15:28:58 +0200
Subject: [PATCH] Write exact xlog position of timeline switch in the timeline
 history file.

This allows us to do some more rigorous sanity checking for various
incorrect point-in-time recovery scenarios, and provides more information
for debugging purposes. It will also come in handy in the upcoming patch to
allow timeline switches to be replicated by streaming replication.
---
 src/backend/access/transam/timeline.c | 144 +++++++++++++++++----
 src/backend/access/transam/xlog.c     | 173 ++++++++++++++++----------
 src/include/access/timeline.h         |  20 ++-
 3 files changed, 247 insertions(+), 90 deletions(-)

diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index 225ce465f7f..324b6c18601 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -12,10 +12,10 @@
  *
  * Each line in the file represents a timeline switch:
  *
- * <parentTLI> <xlogfname> <reason>
+ * <parentTLI> <switchpoint> <reason>
  *
  *	parentTLI	ID of the parent timeline
- *	xlogfname	filename of the WAL segment where the switch happened
+ *	switchpoint	XLogRecPtr where the switch happened, written as %X/%X
  *	reason		human-readable explanation of why the timeline was changed
  *
  * The fields are separated by tabs. Lines beginning with # are comments, and
@@ -56,10 +56,18 @@ readTimeLineHistory(TimeLineID targetTLI)
 	char		histfname[MAXFNAMELEN];
 	char		fline[MAXPGPATH];
 	FILE	   *fd;
+	TimeLineHistoryEntry *entry;
+	TimeLineID	lasttli = 0;
+	XLogRecPtr	prevend;
 
 	/* Timeline 1 does not have a history file, so no need to check */
 	if (targetTLI == 1)
-		return list_make1_int((int) targetTLI);
+	{
+		entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+		entry->tli = targetTLI;
+		entry->begin = entry->end = InvalidXLogRecPtr;
+		return list_make1(entry);
+	}
 
 	if (InArchiveRecovery)
 	{
@@ -77,7 +85,10 @@ readTimeLineHistory(TimeLineID targetTLI)
 					(errcode_for_file_access(),
 					 errmsg("could not open file \"%s\": %m", path)));
 		/* Not there, so assume no parents */
-		return list_make1_int((int) targetTLI);
+		entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+		entry->tli = targetTLI;
+		entry->begin = entry->end = InvalidXLogRecPtr;
+		return list_make1(entry);
 	}
 
 	result = NIL;
@@ -85,12 +96,15 @@ readTimeLineHistory(TimeLineID targetTLI)
 	/*
 	 * Parse the file...
 	 */
+	prevend = InvalidXLogRecPtr;
 	while (fgets(fline, sizeof(fline), fd) != NULL)
 	{
 		/* skip leading whitespace and check for # comment */
 		char	   *ptr;
-		char	   *endptr;
 		TimeLineID	tli;
+		uint32		switchpoint_hi;
+		uint32		switchpoint_lo;
+		int			nfields;
 
 		for (ptr = fline; *ptr; ptr++)
 		{
@@ -100,38 +114,56 @@ readTimeLineHistory(TimeLineID targetTLI)
 		if (*ptr == '\0' || *ptr == '#')
 			continue;
 
-		/* expect a numeric timeline ID as first field of line */
-		tli = (TimeLineID) strtoul(ptr, &endptr, 0);
-		if (endptr == ptr)
+		nfields = sscanf(fline, "%u\t%X/%X", &tli, &switchpoint_hi, &switchpoint_lo);
+
+		if (nfields < 1)
+		{
+			/* expect a numeric timeline ID as first field of line */
 			ereport(FATAL,
 					(errmsg("syntax error in history file: %s", fline),
 					 errhint("Expected a numeric timeline ID.")));
+		}
+		if (nfields != 3)
+			ereport(FATAL,
+					(errmsg("syntax error in history file: %s", fline),
+					 errhint("Expected an XLOG switchpoint location.")));
 
-		if (result &&
-			tli <= (TimeLineID) linitial_int(result))
+		if (result && tli <= lasttli)
 			ereport(FATAL,
 					(errmsg("invalid data in history file: %s", fline),
 				   errhint("Timeline IDs must be in increasing sequence.")));
 
+		lasttli = tli;
+
+		entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+		entry->tli = tli;
+		entry->begin = prevend;
+		entry->end = ((uint64) (switchpoint_hi)) << 32 | (uint64) switchpoint_lo;
+		prevend = entry->end;
+
 		/* Build list with newest item first */
-		result = lcons_int((int) tli, result);
+		result = lcons(entry, result);
 
 		/* we ignore the remainder of each line */
 	}
 
 	FreeFile(fd);
 
-	if (result &&
-		targetTLI <= (TimeLineID) linitial_int(result))
+	if (result && targetTLI <= lasttli)
 		ereport(FATAL,
 				(errmsg("invalid data in history file \"%s\"", path),
 			errhint("Timeline IDs must be less than child timeline's ID.")));
 
-	result = lcons_int((int) targetTLI, result);
+	/*
+	 * Create one more entry for the "tip" of the timeline, which has no
+	 * entry in the history file.
+	 */
+	entry = (TimeLineHistoryEntry *) palloc(sizeof(TimeLineHistoryEntry));
+	entry->tli = targetTLI;
+	entry->begin = prevend;
+	entry->end = InvalidXLogRecPtr;
 
-	ereport(DEBUG3,
-			(errmsg_internal("history of timeline %u is %s",
-							 targetTLI, nodeToString(result))));
+	result = lcons(entry, result);
 
 	return result;
 }
@@ -214,7 +246,7 @@ findNewestTimeLine(TimeLineID startTLI)
  *
  *	newTLI: ID of the new timeline
  *	parentTLI: ID of its immediate parent
- *	endTLI et al: ID of the last used WAL file, for annotation purposes
+ *	switchpoint: XLOG position where the system switched to the new timeline
  *	reason: human-readable explanation of why the timeline was switched
  *
  * Currently this is only used at the end recovery, and so there are no locking
@@ -223,12 +255,11 @@ findNewestTimeLine(TimeLineID startTLI)
  */
 void
 writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
-					 TimeLineID endTLI, XLogSegNo endLogSegNo, char *reason)
+					 XLogRecPtr switchpoint, char *reason)
 {
 	char		path[MAXPGPATH];
 	char		tmppath[MAXPGPATH];
 	char		histfname[MAXFNAMELEN];
-	char		xlogfname[MAXFNAMELEN];
 	char		buffer[BLCKSZ];
 	int			srcfd;
 	int			fd;
@@ -313,13 +344,11 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	 * If we did have a parent file, insert an extra newline just in case the
 	 * parent file failed to end with one.
 	 */
-	XLogFileName(xlogfname, endTLI, endLogSegNo);
-
 	snprintf(buffer, sizeof(buffer),
-			 "%s%u\t%s\t%s\n",
+			 "%s%u\t%X/%X\t%s\n",
 			 (srcfd < 0) ? "" : "\n",
 			 parentTLI,
-			 xlogfname,
+			 (uint32) (switchpoint >> 32), (uint32) (switchpoint),
 			 reason);
 
 	nbytes = strlen(buffer);
@@ -380,3 +409,70 @@ writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 	TLHistoryFileName(histfname, newTLI);
 	XLogArchiveNotify(histfname);
 }
+
+/*
+ * Returns true if any TimeLineHistoryEntry in list 'expectedTLEs' has ID 'tli'
+ */
+bool
+tliInHistory(TimeLineID tli, List *expectedTLEs)
+{
+	ListCell *cell;
+
+	foreach(cell, expectedTLEs)
+	{
+		if (((TimeLineHistoryEntry *) lfirst(cell))->tli == tli)
+			return true;
+	}
+
+	return false;	/* scanned the whole list without a match */
+}
+
+/*
+ * Returns the ID of the timeline in use at WAL position 'ptr', according to
+ * the TimeLineHistoryEntry list 'history'. Reports ERROR if no entry matches.
+ */
+TimeLineID
+tliOfPointInHistory(XLogRecPtr ptr, List *history)
+{
+	ListCell *cell;
+
+	foreach(cell, history)
+	{
+		TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell);
+		if ((XLogRecPtrIsInvalid(tle->begin) || XLByteLE(tle->begin, ptr)) &&
+			(XLogRecPtrIsInvalid(tle->end) || XLByteLT(ptr, tle->end)))
+		{
+			/* begin <= ptr < end, an invalid bound being open-ended: found it */
+			return tle->tli;
+		}
+	}
+
+	/* no entry covers ptr; shouldn't happen with a contiguous history. */
+	elog(ERROR, "timeline history was not contiguous");
+	return 0;	/* keep compiler quiet */
+}
+
+/*
+ * Returns the point in history where we branched off the given timeline,
+ * i.e. the 'end' of the entry for 'tli' in 'history'. Returns
+ * InvalidXLogRecPtr if the timeline is current (= we have not branched off
+ * from it), and throws an error if 'tli' is not part of this history.
+ */
+XLogRecPtr
+tliSwitchPoint(TimeLineID tli, List *history)
+{
+	ListCell   *cell;
+
+	foreach (cell, history)
+	{
+		TimeLineHistoryEntry *tle = (TimeLineHistoryEntry *) lfirst(cell);
+
+		if (tle->tli == tli)
+			return tle->end;
+	}
+
+	ereport(ERROR,
+			(errmsg("requested timeline %u is not in this server's history",
+					tli)));
+	return InvalidXLogRecPtr; /* keep compiler quiet */
+}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index b3356fd4349..d60c2a3bfc5 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -226,7 +226,7 @@ static bool recoveryStopAfter;
  *
  * recoveryTargetIsLatest: was the requested target timeline 'latest'?
  *
- * expectedTLIs: an integer list of recoveryTargetTLI and the TLIs of
+ * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
  * its known parents, newest first (so recoveryTargetTLI is always the
  * first list member).	Only these TLIs are expected to be seen in the WAL
  * segments we read, and indeed only these TLIs will be considered as
@@ -240,7 +240,7 @@ static bool recoveryStopAfter;
  */
 static TimeLineID recoveryTargetTLI;
 static bool recoveryTargetIsLatest = false;
-static List *expectedTLIs;
+static List *expectedTLEs;
 static TimeLineID curFileTLI;
 
 /*
@@ -2515,7 +2515,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
 
 	/*
 	 * Prefer link() to rename() here just to be really sure that we don't
-	 * overwrite an existing logfile.  However, there shouldn't be one, so
+	 * overwrite an existing file.  However, there shouldn't be one, so
 	 * rename() is an acceptable substitute except for the truly paranoid.
 	 */
 #if HAVE_WORKING_LINK
@@ -2716,7 +2716,7 @@ XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
 /*
  * Open a logfile segment for reading (during recovery).
  *
- * This version searches for the segment with any TLI listed in expectedTLIs.
+ * This version searches for the segment with any TLI listed in expectedTLEs.
  */
 static int
 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
@@ -2727,7 +2727,7 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
 
 	/*
 	 * Loop looking for a suitable timeline ID: we might need to read any of
-	 * the timelines listed in expectedTLIs.
+	 * the timelines listed in expectedTLEs.
 	 *
 	 * We expect curFileTLI on entry to be the TLI of the preceding file in
 	 * sequence, or 0 if there was no predecessor.	We do not allow curFileTLI
@@ -2735,9 +2735,9 @@ XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
 	 * parent timeline extends to higher segment numbers than the child we
 	 * want to read.
 	 */
-	foreach(cell, expectedTLIs)
+	foreach(cell, expectedTLEs)
 	{
-		TimeLineID	tli = (TimeLineID) lfirst_int(cell);
+		TimeLineID	tli = ((TimeLineHistoryEntry *) lfirst(cell))->tli;
 
 		if (tli < curFileTLI)
 			break;				/* don't bother looking at too-old TLIs */
@@ -3344,7 +3344,7 @@ ReadRecord(XLogRecPtr *RecPtr, int emode, bool fetching_ckpt)
 		/*
 		 * Since we are going to a random position in WAL, forget any prior
 		 * state about what timeline we were in, and allow it to be any
-		 * timeline in expectedTLIs.  We also set a flag to allow curFileTLI
+		 * timeline in expectedTLEs.  We also set a flag to allow curFileTLI
 		 * to go backwards (but we can't reset that variable right here, since
 		 * we might not change files at all).
 		 */
@@ -3675,7 +3675,7 @@ ValidXLogPageHeader(XLogPageHeader hdr, int emode, bool segmentonly)
 	/*
 	 * Check page TLI is one of the expected values.
 	 */
-	if (!list_member_int(expectedTLIs, (int) hdr->xlp_tli))
+	if (!tliInHistory(hdr->xlp_tli, expectedTLEs))
 	{
 		ereport(emode_for_corrupt_record(emode, recaddr),
 				(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
@@ -3812,57 +3812,86 @@ ValidXLogRecordHeader(XLogRecPtr *RecPtr, XLogRecord *record, int emode,
 static bool
 rescanLatestTimeLine(void)
 {
+	List	   *newExpectedTLEs;
+	bool		found;
+	ListCell   *cell;
 	TimeLineID	newtarget;
+	TimeLineHistoryEntry *currentTle = NULL;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile XLogCtlData *xlogctl = XLogCtl;
 
 	newtarget = findNewestTimeLine(recoveryTargetTLI);
-	if (newtarget != recoveryTargetTLI)
+	if (newtarget == recoveryTargetTLI)
 	{
-		/*
-		 * Determine the list of expected TLIs for the new TLI
-		 */
-		List	   *newExpectedTLIs;
-
-		newExpectedTLIs = readTimeLineHistory(newtarget);
+		/* No new timelines found */
+		return false;
+	}
 
-		/*
-		 * If the current timeline is not part of the history of the new
-		 * timeline, we cannot proceed to it.
-		 *
-		 * XXX This isn't foolproof: The new timeline might have forked from
-		 * the current one, but before the current recovery location. In that
-		 * case we will still switch to the new timeline and proceed replaying
-		 * from it even though the history doesn't match what we already
-		 * replayed. That's not good. We will likely notice at the next online
-		 * checkpoint, as the TLI won't match what we expected, but it's not
-		 * guaranteed. The admin needs to make sure that doesn't happen.
-		 */
-		if (!list_member_int(newExpectedTLIs,
-							 (int) recoveryTargetTLI))
-			ereport(LOG,
-					(errmsg("new timeline %u is not a child of database system timeline %u",
-							newtarget,
-							ThisTimeLineID)));
-		else
-		{
-			/* use volatile pointer to prevent code rearrangement */
-			volatile XLogCtlData *xlogctl = XLogCtl;
+	/*
+	 * Determine the list of expected TLEs (history entries) for the new TLI
+	 */
 
-			/* Switch target */
-			recoveryTargetTLI = newtarget;
-			list_free(expectedTLIs);
-			expectedTLIs = newExpectedTLIs;
+	newExpectedTLEs = readTimeLineHistory(newtarget);
 
-			SpinLockAcquire(&xlogctl->info_lck);
-			xlogctl->RecoveryTargetTLI = recoveryTargetTLI;
-			SpinLockRelease(&xlogctl->info_lck);
+	/*
+	 * If the current timeline is not part of the history of the new timeline,
+	 * we cannot proceed to it. On a match, currentTle is its entry there.
+	 */
+	found = false;
+	foreach (cell, newExpectedTLEs)
+	{
+		currentTle = (TimeLineHistoryEntry *) lfirst(cell);
 
-			ereport(LOG,
-					(errmsg("new target timeline is %u",
-							recoveryTargetTLI)));
-			return true;
+		if (currentTle->tli == recoveryTargetTLI)
+		{
+			found = true;
+			break;
 		}
 	}
-	return false;
+	if (!found)
+	{
+		ereport(LOG,
+				(errmsg("new timeline %u is not a child of database system timeline %u",
+						newtarget,
+						ThisTimeLineID)));
+		return false;
+	}
+
+	/*
+	 * The current timeline was found in the new history; currentTle->end is
+	 * where that history leaves it. Check that the fork happened *after* the
+	 * current recovery location (EndRecPtr), and refuse to switch otherwise.
+	 */
+	if (XLByteLT(currentTle->end, EndRecPtr))
+	{
+		ereport(LOG,
+				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
+						newtarget,
+						ThisTimeLineID,
+						(uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
+		return false;
+	}
+
+	/* The new timeline history seems valid. Switch target */
+	recoveryTargetTLI = newtarget;
+	list_free_deep(expectedTLEs);
+	expectedTLEs = newExpectedTLEs;
+
+	SpinLockAcquire(&xlogctl->info_lck);
+	xlogctl->RecoveryTargetTLI = recoveryTargetTLI;
+	SpinLockRelease(&xlogctl->info_lck);
+
+	ereport(LOG,
+			(errmsg("new target timeline is %u",
+					recoveryTargetTLI)));
+
+	/*
+	 * Wake up any walsenders to notice that we have a new target timeline.
+	 */
+	if (AllowCascadeReplication())
+		WalSndWakeup();
+
+	return true;
 }
 
 /*
@@ -5300,26 +5329,41 @@ StartupXLOG(void)
 	readRecoveryCommandFile();
 
 	/* Now we can determine the list of expected TLIs */
-	expectedTLIs = readTimeLineHistory(recoveryTargetTLI);
+	expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
 
 	/*
-	 * If pg_control's timeline is not in expectedTLIs, then we cannot
-	 * proceed: the backup is not part of the history of the requested
-	 * timeline.
+	 * If the location of the checkpoint record is not on the expected
+	 * timeline in the history of the requested timeline, we cannot proceed:
+	 * the backup is not part of the history of the requested timeline.
 	 */
-	if (!list_member_int(expectedTLIs,
-						 (int) ControlFile->checkPointCopy.ThisTimeLineID))
+	if (tliOfPointInHistory(ControlFile->checkPoint, expectedTLEs) !=
+			ControlFile->checkPointCopy.ThisTimeLineID)
+	{
+		XLogRecPtr switchpoint;
+
+		/*
+		 * tliSwitchPoint will throw an error if the checkpoint's timeline
+		 * is not in expectedTLEs at all.
+		 */
+		switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs);
 		ereport(FATAL,
-				(errmsg("requested timeline %u is not a child of database system timeline %u",
-						recoveryTargetTLI,
-						ControlFile->checkPointCopy.ThisTimeLineID)));
+				(errmsg("requested timeline %u is not a child of this server's history",
+						recoveryTargetTLI),
+				 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X",
+						   (uint32) (ControlFile->checkPoint >> 32),
+						   (uint32) ControlFile->checkPoint,
+						   ControlFile->checkPointCopy.ThisTimeLineID,
+						   (uint32) (switchpoint >> 32),
+						   (uint32) switchpoint)));
+	}
 
 	/*
 	 * The min recovery point should be part of the requested timeline's
 	 * history, too.
 	 */
 	if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
-		!list_member_int(expectedTLIs, ControlFile->minRecoveryPointTLI))
+		tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
+			ControlFile->minRecoveryPointTLI)
 		ereport(FATAL,
 				(errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
 						recoveryTargetTLI,
@@ -6026,8 +6070,8 @@ StartupXLOG(void)
 				(errmsg("selected new timeline ID: %u", ThisTimeLineID)));
 
 		/*
-		 * Write comment to history file to explain why and where timeline
-		 * changed. Comment varies according to the recovery target used.
+		 * Create a comment for the history file to explain why and where
+		 * timeline changed.
 		 */
 		if (recoveryTarget == RECOVERY_TARGET_XID)
 			snprintf(reason, sizeof(reason),
@@ -6047,7 +6091,7 @@ StartupXLOG(void)
 			snprintf(reason, sizeof(reason), "no recovery target specified");
 
 		writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
-							 curFileTLI, endLogSegNo, reason);
+							 EndRecPtr, reason);
 	}
 
 	/* Save the selected TimeLineID in shared memory, too */
@@ -7916,8 +7960,7 @@ xlog_redo(XLogRecPtr lsn, XLogRecord *record)
 			 * decrease.
 			 */
 			if (checkPoint.ThisTimeLineID < ThisTimeLineID ||
-				!list_member_int(expectedTLIs,
-								 (int) checkPoint.ThisTimeLineID))
+				!tliInHistory(checkPoint.ThisTimeLineID, expectedTLEs))
 				ereport(PANIC,
 						(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
 								checkPoint.ThisTimeLineID, ThisTimeLineID)));
diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h
index f2a7658bc45..785195bd36a 100644
--- a/src/include/access/timeline.h
+++ b/src/include/access/timeline.h
@@ -14,10 +14,28 @@
 #include "access/xlogdefs.h"
 #include "nodes/pg_list.h"
 
+/*
+ * A list of these structs describes the timeline history of the server. Each
+ * TimeLineHistoryEntry represents a piece of WAL belonging to the history;
+ * the list is ordered from newest to oldest. All WAL positions from 'begin'
+ * (inclusive) up to 'end' (exclusive) belong to the timeline represented by
+ * the entry. Together the 'begin' and 'end' pointers of all the entries form
+ * a contiguous line from beginning of time to infinity.
+ */
+typedef struct
+{
+	TimeLineID	tli;	/* timeline that owns the range [begin, end) */
+	XLogRecPtr	begin;	/* inclusive, invalid means beginning of time */
+	XLogRecPtr	end;	/* exclusive, 0 means infinity */
+} TimeLineHistoryEntry;
+
 extern List *readTimeLineHistory(TimeLineID targetTLI);
 extern bool existsTimeLineHistory(TimeLineID probeTLI);
 extern TimeLineID findNewestTimeLine(TimeLineID startTLI);
 extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
-					 TimeLineID endTLI, XLogSegNo endLogSegNo, char *reason);
+					 XLogRecPtr switchpoint, char *reason);
+extern bool tliInHistory(TimeLineID tli, List *expectedTLEs);
+extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history);
+extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history);
 
 #endif   /* TIMELINE_H */
-- 
GitLab