From 990fe3c4edfdabf4f56aa9a403a11f53006d0dd7 Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Wed, 23 Jan 2013 10:01:04 +0200
Subject: [PATCH] Fix more issues with cascading replication and timeline
 switches.

When a standby server follows the master using WAL archive, and it chooses
a new timeline (recovery_target_timeline='latest'), it only fetches the
timeline history file for the chosen target timeline, not any other history
files that might be missing from pg_xlog. For example, if the current
timeline is 2, and we choose 4 as the new recovery target timeline, the
history file for timeline 3 is not fetched, even if it's part of this
server's history. That's enough for the standby itself - the history file
for timeline 4 includes timeline 3 as well - but if a cascading standby
server wants to recover to timeline 3, it needs the history file. To fix,
when a new recovery target timeline is chosen, try to copy any missing
history files from the archive to pg_xlog between the old and new target
timeline.

A second similar issue was with the WAL files. When a standby recovers from
archive, and it reaches a segment that contains a switch to a new timeline,
recovery fetches only the WAL file labelled with the new timeline's ID. The
file from the new timeline contains a copy of the WAL from the old timeline
up to the point where the switch happened, and recovery recovers it from the
new file. But in streaming replication, walsender only tries to read it
from the old timeline's file. To fix, change walsender to read it from the
new file, so that it behaves the same as recovery in that sense, and doesn't
try to open the possibly nonexistent file with the old timeline's ID.
---
 src/backend/access/transam/timeline.c       | 22 ++++++++
 src/backend/access/transam/xlog.c           | 22 +++++++-
 src/backend/replication/walsender.c         | 57 +++++++++++++++++----
 src/include/access/timeline.h               |  1 +
 src/include/replication/walsender_private.h |  1 -
 5 files changed, 92 insertions(+), 11 deletions(-)

diff --git a/src/backend/access/transam/timeline.c b/src/backend/access/transam/timeline.c
index ad4f3162c53..51b37ca8f8c 100644
--- a/src/backend/access/transam/timeline.c
+++ b/src/backend/access/transam/timeline.c
@@ -40,6 +40,28 @@
 #include "access/xlogdefs.h"
 #include "storage/fd.h"
 
+/*
+ * Copies all timeline history files with id's between 'begin' and 'end'
+ * from archive to pg_xlog.
+ */
+void
+restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end)
+{
+	char		path[MAXPGPATH];
+	char		histfname[MAXFNAMELEN];
+	TimeLineID tli;
+
+	for (tli = begin; tli < end; tli++)
+	{
+		if (tli == 1)
+			continue;
+
+		TLHistoryFileName(histfname, tli);
+		if (RestoreArchivedFile(path, histfname, "RECOVERYHISTORY", 0, false))
+			KeepFileRestoredFromArchive(path, histfname);
+	}
+}
+
 /*
  * Try to read a timeline's history file.
  *
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 9ad92271795..d316c979265 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -3276,8 +3276,8 @@ rescanLatestTimeLine(void)
 	bool		found;
 	ListCell   *cell;
 	TimeLineID	newtarget;
+	TimeLineID	oldtarget = recoveryTargetTLI;
 	TimeLineHistoryEntry *currentTle = NULL;
-	/* use volatile pointer to prevent code rearrangement */
 
 	newtarget = findNewestTimeLine(recoveryTargetTLI);
 	if (newtarget == recoveryTargetTLI)
@@ -3336,6 +3336,12 @@ rescanLatestTimeLine(void)
 	list_free_deep(expectedTLEs);
 	expectedTLEs = newExpectedTLEs;
 
+	/*
+	 * As in StartupXLOG(), try to ensure we have all the history files
+	 * between the old target and new target in pg_xlog.
+	 */
+	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
+
 	ereport(LOG,
 			(errmsg("new target timeline is %u",
 					recoveryTargetTLI)));
@@ -4993,6 +4999,20 @@ StartupXLOG(void)
 	 */
 	ThisTimeLineID = checkPoint.ThisTimeLineID;
 
+	/*
+	 * Copy any missing timeline history files between 'now' and the
+	 * recovery target timeline from archive to pg_xlog. While we don't need
+	 * those files ourselves - the history file of the recovery target
+	 * timeline covers all the previous timelines in the history too - a
+	 * cascading standby server might be interested in them. Or, if you
+	 * archive the WAL from this server to a different archive than the
+	 * master, it'd be good for all the history files to get archived there
+	 * after failover, so that you can use one of the old timelines as a
+	 * PITR target. Timeline history files are small, so it's better to copy
+	 * them unnecessarily than not copy them and regret later.
+	 */
+	restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
+
 	lastFullPageWrites = checkPoint.fullPageWrites;
 
 	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
diff --git a/src/backend/replication/walsender.c b/src/backend/replication/walsender.c
index ba138e73da3..10e40506965 100644
--- a/src/backend/replication/walsender.c
+++ b/src/backend/replication/walsender.c
@@ -110,6 +110,9 @@ static int	sendFile = -1;
 static XLogSegNo sendSegNo = 0;
 static uint32 sendOff = 0;
 
+/* Timeline ID of the currently open file */
+static TimeLineID	curFileTimeLine = 0;
+
 /*
  * These variables keep track of the state of the timeline we're currently
  * sending. sendTimeLine identifies the timeline. If sendTimeLineIsHistoric,
@@ -1201,8 +1204,8 @@ WalSndKill(int code, Datum arg)
  * always be one descriptor left open until the process ends, but never
  * more than one.
  */
-void
-XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count)
+static void
+XLogRead(char *buf, XLogRecPtr startptr, Size count)
 {
 	char	   *p;
 	XLogRecPtr	recptr;
@@ -1222,7 +1225,7 @@ retry:
 
 		startoff = recptr % XLogSegSize;
 
-		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo) || sendTimeLine != tli)
+		if (sendFile < 0 || !XLByteInSeg(recptr, sendSegNo))
 		{
 			char		path[MAXPGPATH];
 
@@ -1230,9 +1233,45 @@ retry:
 			if (sendFile >= 0)
 				close(sendFile);
 
-			sendTimeLine = tli;
 			XLByteToSeg(recptr, sendSegNo);
-			XLogFilePath(path, sendTimeLine, sendSegNo);
+
+			/*-------
+			 * When reading from a historic timeline, and there is a timeline
+			 * switch within this segment, read from the WAL segment belonging
+			 * to the new timeline.
+			 *
+			 * For example, imagine that this server is currently on timeline
+			 * 5, and we're streaming timeline 4. The switch from timeline 4
+			 * to 5 happened at 0/13002088. In pg_xlog, we have these files:
+			 *
+			 * ...
+			 * 000000040000000000000012
+			 * 000000040000000000000013
+			 * 000000050000000000000013
+			 * 000000050000000000000014
+			 * ...
+			 *
+			 * In this situation, when requested to send the WAL from
+			 * segment 0x13, on timeline 4, we read the WAL from file
+			 * 000000050000000000000013. Archive recovery prefers files from
+			 * newer timelines, so if the segment was restored from the
+			 * archive on this server, the file belonging to the old timeline,
+			 * 000000040000000000000013, might not exist. Their contents are
+			 * equal up to the switchpoint, because at a timeline switch, the
+			 * used portion of the old segment is copied to the new file.
+			 *-------
+			 */
+			curFileTimeLine = sendTimeLine;
+			if (sendTimeLineIsHistoric)
+			{
+				XLogSegNo endSegNo;
+
+				XLByteToSeg(sendTimeLineValidUpto, endSegNo);
+				if (sendSegNo == endSegNo)
+					curFileTimeLine = sendTimeLineNextTLI;
+			}
+
+			XLogFilePath(path, curFileTimeLine, sendSegNo);
 
 			sendFile = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
 			if (sendFile < 0)
@@ -1246,7 +1285,7 @@ retry:
 					ereport(ERROR,
 							(errcode_for_file_access(),
 							 errmsg("requested WAL segment %s has already been removed",
-									XLogFileNameP(sendTimeLine, sendSegNo))));
+									XLogFileNameP(curFileTimeLine, sendSegNo))));
 				else
 					ereport(ERROR,
 							(errcode_for_file_access(),
@@ -1263,7 +1302,7 @@ retry:
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not seek in log segment %s to offset %u: %m",
-								XLogFileNameP(sendTimeLine, sendSegNo),
+								XLogFileNameP(curFileTimeLine, sendSegNo),
 								startoff)));
 			sendOff = startoff;
 		}
@@ -1280,7 +1319,7 @@ retry:
 			ereport(ERROR,
 					(errcode_for_file_access(),
 			errmsg("could not read from log segment %s, offset %u, length %lu: %m",
-				   XLogFileNameP(sendTimeLine, sendSegNo),
+				   XLogFileNameP(curFileTimeLine, sendSegNo),
 				   sendOff, (unsigned long) segbytes)));
 		}
 
@@ -1524,7 +1563,7 @@ XLogSend(bool *caughtup)
 	 * calls.
 	 */
 	enlargeStringInfo(&output_message, nbytes);
-	XLogRead(&output_message.data[output_message.len], sendTimeLine, startptr, nbytes);
+	XLogRead(&output_message.data[output_message.len], startptr, nbytes);
 	output_message.len += nbytes;
 	output_message.data[output_message.len] = '\0';
 
diff --git a/src/include/access/timeline.h b/src/include/access/timeline.h
index 7d45fcad8a4..2e5e9a42a38 100644
--- a/src/include/access/timeline.h
+++ b/src/include/access/timeline.h
@@ -35,6 +35,7 @@ extern TimeLineID findNewestTimeLine(TimeLineID startTLI);
 extern void writeTimeLineHistory(TimeLineID newTLI, TimeLineID parentTLI,
 					 XLogRecPtr switchpoint, char *reason);
 extern void writeTimeLineHistoryFile(TimeLineID tli, char *content, int size);
+extern void restoreTimeLineHistoryFiles(TimeLineID begin, TimeLineID end);
 extern bool tliInHistory(TimeLineID tli, List *expectedTLIs);
 extern TimeLineID tliOfPointInHistory(XLogRecPtr ptr, List *history);
 extern XLogRecPtr tliSwitchPoint(TimeLineID tli, List *history,
diff --git a/src/include/replication/walsender_private.h b/src/include/replication/walsender_private.h
index 8f479fda7e5..7eaa21b9f7e 100644
--- a/src/include/replication/walsender_private.h
+++ b/src/include/replication/walsender_private.h
@@ -95,7 +95,6 @@ extern WalSndCtlData *WalSndCtl;
 
 
 extern void WalSndSetState(WalSndState state);
-extern void XLogRead(char *buf, TimeLineID tli, XLogRecPtr startptr, Size count);
 
 /*
  * Internal functions for parsing the replication grammar, in repl_gram.y and
-- 
GitLab