From a15ad67089feed2e3f27613f0ead5243ca7ebbed Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Sat, 12 Dec 2015 14:19:51 +0100
Subject: [PATCH] Fix ALTER TABLE ... SET TABLESPACE for unlogged relations.

Changing the tablespace of an unlogged relation did not WAL log the
creation and content of the init fork. Thus, after a standby is
promoted, unlogged relation cannot be accessed anymore, with errors
like:
ERROR:  58P01: could not open file "pg_tblspc/...": No such file or directory
Additionally the init fork was not synced to disk, independent of the
configured wal_level, a relatively small durability risk.

Investigation of that problem also brought to light that, even for
permanent relations, the creation of !main forks was not WAL logged,
i.e. no XLOG_SMGR_CREATE record were emitted. That mostly turns out not
to be a problem, because these files were created when the actual
relation data is copied; nonexistent files are not treated as an error
condition during replay. But that doesn't work for empty files, and
generally feels a bit haphazard. Luckily, outside init and main forks,
empty forks don't occur often or are not a problem.

Add the required WAL logging and syncing to disk.

Reported-By: Michael Paquier
Author: Michael Paquier and Andres Freund
Discussion: 20151210163230.GA11331@alap3.anarazel.de
Backpatch: 9.1, where unlogged relations were introduced
---
 src/backend/commands/tablecmds.c | 23 +++++++++++++++++++++--
 1 file changed, 21 insertions(+), 2 deletions(-)

diff --git a/src/backend/commands/tablecmds.c b/src/backend/commands/tablecmds.c
index 2a13f9c53e9..36b252340a6 100644
--- a/src/backend/commands/tablecmds.c
+++ b/src/backend/commands/tablecmds.c
@@ -8824,6 +8824,15 @@ ATExecSetTableSpace(Oid tableOid, Oid newTableSpace, LOCKMODE lockmode)
 		if (smgrexists(rel->rd_smgr, forkNum))
 		{
 			smgrcreate(dstrel, forkNum, false);
+
+			/*
+			 * WAL log creation if the relation is persistent, or this is the
+			 * init fork of an unlogged relation.
+			 */
+			if (rel->rd_rel->relpersistence == RELPERSISTENCE_PERMANENT ||
+				(rel->rd_rel->relpersistence == RELPERSISTENCE_UNLOGGED &&
+				 forkNum == INIT_FORKNUM))
+				log_smgrcreate(&newrnode, forkNum);
 			copy_relation_data(rel->rd_smgr, dstrel, forkNum,
 							   rel->rd_rel->relpersistence);
 		}
@@ -8865,6 +8874,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	char	   *buf;
 	Page		page;
 	bool		use_wal;
+	bool		copying_initfork;
 	BlockNumber nblocks;
 	BlockNumber blkno;
 
@@ -8877,11 +8887,20 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	buf = (char *) palloc(BLCKSZ);
 	page = (Page) buf;
 
+	/*
+	 * The init fork for an unlogged relation in many respects has to be
+	 * treated the same as normal relation, changes need to be WAL logged and
+	 * it needs to be synced to disk.
+	 */
+	copying_initfork = relpersistence == RELPERSISTENCE_UNLOGGED &&
+		forkNum == INIT_FORKNUM;
+
 	/*
 	 * We need to log the copied data in WAL iff WAL archiving/streaming is
 	 * enabled AND it's a permanent relation.
 	 */
-	use_wal = XLogIsNeeded() && relpersistence == RELPERSISTENCE_PERMANENT;
+	use_wal = XLogIsNeeded() &&
+		(relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork);
 
 	nblocks = smgrnblocks(src, forkNum);
 
@@ -8920,7 +8939,7 @@ copy_relation_data(SMgrRelation src, SMgrRelation dst,
 	 * wouldn't replay our earlier WAL entries. If we do not fsync those pages
 	 * here, they might still not be on disk when the crash occurs.
 	 */
-	if (relpersistence == RELPERSISTENCE_PERMANENT)
+	if (relpersistence == RELPERSISTENCE_PERMANENT || copying_initfork)
 		smgrimmedsync(dst, forkNum);
 }
 
-- 
GitLab