From 33755e8edf149dabfc0ed9b697a84f70b0cca0de Mon Sep 17 00:00:00 2001
From: Heikki Linnakangas <heikki.linnakangas@iki.fi>
Date: Fri, 10 Oct 2014 09:59:44 +0300
Subject: [PATCH] Change the way encoding and locale checks are done in
 pg_upgrade.

Lc_collate and lc_ctype have been per-database settings since server version
8.4, but pg_upgrade was still treating them as cluster-wide options. It
fetched the values for the template0 databases in old and new cluster, and
compared them. That's backwards; the encoding and locale of the template0
database doesn't matter, as template0 is guaranteed to contain only ASCII
characters. But if there are any other databases that exist on both clusters
(in particular template1 and postgres databases), their encodings and
locales must be compatible.

Also, make the locale comparison more lenient. If the locale names are not
equal, try to canonicalize both of them by passing them to setlocale(). We
used to do that only when upgrading from 9.1 or below, but it seems like a
good idea even with newer versions. If we change the canonical form of a
locale, this allows pg_upgrade to still work. I'm about to do just that to
fix bug #11431, by mapping a locale name that contains non-ASCII characters
to a pure-ASCII alias of the same locale.

No backpatching, because earlier versions of pg_upgrade still support
upgrading from 8.3 servers. That would be more complicated, so it doesn't
seem worth it, given that we haven't received any complaints about this
from users.
---
 contrib/pg_upgrade/check.c       | 206 +++++++++++--------------------
 contrib/pg_upgrade/controldata.c |  34 -----
 contrib/pg_upgrade/info.c        |  14 ++-
 contrib/pg_upgrade/pg_upgrade.h  |   6 +-
 4 files changed, 87 insertions(+), 173 deletions(-)

diff --git a/contrib/pg_upgrade/check.c b/contrib/pg_upgrade/check.c
index bbfcab71ce1..3629b3961f6 100644
--- a/contrib/pg_upgrade/check.c
+++ b/contrib/pg_upgrade/check.c
@@ -14,12 +14,10 @@
 #include "pg_upgrade.h"
 
 
-static void set_locale_and_encoding(ClusterInfo *cluster);
 static void check_new_cluster_is_empty(void);
-static void check_locale_and_encoding(ControlData *oldctrl,
-						  ControlData *newctrl);
-static bool equivalent_locale(const char *loca, const char *locb);
-static bool equivalent_encoding(const char *chara, const char *charb);
+static void check_databases_are_compatible(void);
+static void check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb);
+static bool equivalent_locale(int category, const char *loca, const char *locb);
 static void check_is_install_user(ClusterInfo *cluster);
 static void check_for_prepared_transactions(ClusterInfo *cluster);
 static void check_for_isn_and_int8_passing_mismatch(ClusterInfo *cluster);
@@ -81,8 +79,6 @@ check_and_dump_old_cluster(bool live_check)
 	if (!live_check)
 		start_postmaster(&old_cluster, true);
 
-	set_locale_and_encoding(&old_cluster);
-
 	get_pg_database_relfilenode(&old_cluster);
 
 	/* Extract a list of databases and tables from the old cluster */
@@ -127,13 +123,10 @@ check_and_dump_old_cluster(bool live_check)
 void
 check_new_cluster(void)
 {
-	set_locale_and_encoding(&new_cluster);
-
-	check_locale_and_encoding(&old_cluster.controldata, &new_cluster.controldata);
-
 	get_db_and_rel_infos(&new_cluster);
 
 	check_new_cluster_is_empty();
+	check_databases_are_compatible();
 
 	check_loadable_libraries();
 
@@ -278,94 +271,26 @@ check_cluster_compatibility(bool live_check)
 }
 
 
-/*
- * set_locale_and_encoding()
- *
- * query the database to get the template0 locale
- */
-static void
-set_locale_and_encoding(ClusterInfo *cluster)
-{
-	ControlData *ctrl = &cluster->controldata;
-	PGconn	   *conn;
-	PGresult   *res;
-	int			i_encoding;
-	int			cluster_version = cluster->major_version;
-
-	conn = connectToServer(cluster, "template1");
-
-	/* for pg < 80400, we got the values from pg_controldata */
-	if (cluster_version >= 80400)
-	{
-		int			i_datcollate;
-		int			i_datctype;
-
-		res = executeQueryOrDie(conn,
-								"SELECT datcollate, datctype "
-								"FROM	pg_catalog.pg_database "
-								"WHERE	datname = 'template0' ");
-		assert(PQntuples(res) == 1);
-
-		i_datcollate = PQfnumber(res, "datcollate");
-		i_datctype = PQfnumber(res, "datctype");
-
-		if (GET_MAJOR_VERSION(cluster->major_version) < 902)
-		{
-			/*
-			 * Pre-9.2 did not canonicalize the supplied locale names to match
-			 * what the system returns, while 9.2+ does, so convert pre-9.2 to
-			 * match.
-			 */
-			ctrl->lc_collate = get_canonical_locale_name(LC_COLLATE,
-								pg_strdup(PQgetvalue(res, 0, i_datcollate)));
-			ctrl->lc_ctype = get_canonical_locale_name(LC_CTYPE,
-								  pg_strdup(PQgetvalue(res, 0, i_datctype)));
-		}
-		else
-		{
-			ctrl->lc_collate = pg_strdup(PQgetvalue(res, 0, i_datcollate));
-			ctrl->lc_ctype = pg_strdup(PQgetvalue(res, 0, i_datctype));
-		}
-
-		PQclear(res);
-	}
-
-	res = executeQueryOrDie(conn,
-							"SELECT pg_catalog.pg_encoding_to_char(encoding) "
-							"FROM	pg_catalog.pg_database "
-							"WHERE	datname = 'template0' ");
-	assert(PQntuples(res) == 1);
-
-	i_encoding = PQfnumber(res, "pg_encoding_to_char");
-	ctrl->encoding = pg_strdup(PQgetvalue(res, 0, i_encoding));
-
-	PQclear(res);
-
-	PQfinish(conn);
-}
-
-
 /*
  * check_locale_and_encoding()
  *
- * Check that old and new locale and encoding match.  Even though the backend
- * tries to canonicalize stored locale names, the platform often doesn't
- * cooperate, so it's entirely possible that one DB thinks its locale is
- * "en_US.UTF-8" while the other says "en_US.utf8".  Try to be forgiving.
+ * Check that locale and encoding of a database in the old and new clusters
+ * are compatible.
  */
 static void
-check_locale_and_encoding(ControlData *oldctrl,
-						  ControlData *newctrl)
+check_locale_and_encoding(DbInfo *olddb, DbInfo *newdb)
 {
-	if (!equivalent_locale(oldctrl->lc_collate, newctrl->lc_collate))
-		pg_fatal("lc_collate cluster values do not match:  old \"%s\", new \"%s\"\n",
-				 oldctrl->lc_collate, newctrl->lc_collate);
-	if (!equivalent_locale(oldctrl->lc_ctype, newctrl->lc_ctype))
-		pg_fatal("lc_ctype cluster values do not match:  old \"%s\", new \"%s\"\n",
-				 oldctrl->lc_ctype, newctrl->lc_ctype);
-	if (!equivalent_encoding(oldctrl->encoding, newctrl->encoding))
-		pg_fatal("encoding cluster values do not match:  old \"%s\", new \"%s\"\n",
-				 oldctrl->encoding, newctrl->encoding);
+	if (olddb->db_encoding != newdb->db_encoding)
+		pg_fatal("encodings for database \"%s\" do not match:  old \"%s\", new \"%s\"\n",
+				 olddb->db_name,
+				 pg_encoding_to_char(olddb->db_encoding),
+				 pg_encoding_to_char(newdb->db_encoding));
+	if (!equivalent_locale(LC_COLLATE, olddb->db_collate, newdb->db_collate))
+		pg_fatal("lc_collate values for database \"%s\" do not match:  old \"%s\", new \"%s\"\n",
+				 olddb->db_name, olddb->db_collate, newdb->db_collate);
+	if (!equivalent_locale(LC_CTYPE, olddb->db_ctype, newdb->db_ctype))
+		pg_fatal("lc_ctype values for database \"%s\" do not match:  old \"%s\", new \"%s\"\n",
+				 olddb->db_name, olddb->db_ctype, newdb->db_ctype);
 }
 
 /*
@@ -373,61 +298,46 @@ check_locale_and_encoding(ControlData *oldctrl,
  *
  * Best effort locale-name comparison.  Return false if we are not 100% sure
  * the locales are equivalent.
+ *
+ * Note: The encoding parts of the names are ignored. This function is
+ * currently used to compare locale names stored in pg_database, and
+ * pg_database contains a separate encoding field. That's compared directly
+ * in check_locale_and_encoding().
  */
 static bool
-equivalent_locale(const char *loca, const char *locb)
+equivalent_locale(int category, const char *loca, const char *locb)
 {
 	const char *chara = strrchr(loca, '.');
 	const char *charb = strrchr(locb, '.');
-	int			lencmp;
-
-	/* If they don't both contain an encoding part, just do strcasecmp(). */
-	if (!chara || !charb)
-		return (pg_strcasecmp(loca, locb) == 0);
+	char	   *canona;
+	char	   *canonb;
+	int			lena;
+	int			lenb;
 
 	/*
-	 * Compare the encoding parts.  Windows tends to use code page numbers for
-	 * the encoding part, which equivalent_encoding() won't like, so accept if
-	 * the strings are case-insensitive equal; otherwise use
-	 * equivalent_encoding() to compare.
+	 * If the names are equal, the locales are equivalent. Checking this
+	 * first avoids calling setlocale() in the common case that the names
+	 * are equal. That's a good thing, if setlocale() is buggy, for example.
 	 */
-	if (pg_strcasecmp(chara + 1, charb + 1) != 0 &&
-		!equivalent_encoding(chara + 1, charb + 1))
-		return false;
+	if (pg_strcasecmp(loca, locb) == 0)
+		return true;
 
 	/*
-	 * OK, compare the locale identifiers (e.g. en_US part of en_US.utf8).
-	 *
-	 * It's tempting to ignore non-alphanumeric chars here, but for now it's
-	 * not clear that that's necessary; just do case-insensitive comparison.
+	 * Not identical. Canonicalize both names, remove the encoding parts,
+	 * and try again.
 	 */
-	lencmp = chara - loca;
-	if (lencmp != charb - locb)
-		return false;
+	canona = get_canonical_locale_name(category, loca);
+	chara = strrchr(canona, '.');
+	lena = chara ? (chara - canona) : strlen(canona);
 
-	return (pg_strncasecmp(loca, locb, lencmp) == 0);
-}
+	canonb = get_canonical_locale_name(category, locb);
+	charb = strrchr(canonb, '.');
+	lenb = charb ? (charb - canonb) : strlen(canonb);
 
-/*
- * equivalent_encoding()
- *
- * Best effort encoding-name comparison.  Return true only if the encodings
- * are valid server-side encodings and known equivalent.
- *
- * Because the lookup in pg_valid_server_encoding() does case folding and
- * ignores non-alphanumeric characters, this will recognize many popular
- * variant spellings as equivalent, eg "utf8" and "UTF-8" will match.
- */
-static bool
-equivalent_encoding(const char *chara, const char *charb)
-{
-	int			enca = pg_valid_server_encoding(chara);
-	int			encb = pg_valid_server_encoding(charb);
+	if (lena == lenb && pg_strncasecmp(canona, canonb, lena) == 0)
+		return true;
 
-	if (enca < 0 || encb < 0)
-		return false;
-
-	return (enca == encb);
+	return false;
 }
 
 
@@ -450,7 +360,35 @@ check_new_cluster_is_empty(void)
 						 new_cluster.dbarr.dbs[dbnum].db_name);
 		}
 	}
+}
+
+/*
+ * Check that every database that already exists in the new cluster is
+ * compatible with the corresponding database in the old one.
+ */
+static void
+check_databases_are_compatible(void)
+{
+	int			newdbnum;
+	int			olddbnum;
+	DbInfo	   *newdbinfo;
+	DbInfo	   *olddbinfo;
 
+	for (newdbnum = 0; newdbnum < new_cluster.dbarr.ndbs; newdbnum++)
+	{
+		newdbinfo = &new_cluster.dbarr.dbs[newdbnum];
+
+		/* Find the corresponding database in the old cluster */
+		for (olddbnum = 0; olddbnum < old_cluster.dbarr.ndbs; olddbnum++)
+		{
+			olddbinfo = &old_cluster.dbarr.dbs[olddbnum];
+			if (strcmp(newdbinfo->db_name, olddbinfo->db_name) == 0)
+			{
+				check_locale_and_encoding(olddbinfo, newdbinfo);
+				break;
+			}
+		}
+	}
 }
 
 
diff --git a/contrib/pg_upgrade/controldata.c b/contrib/pg_upgrade/controldata.c
index 8379ebd71be..4e9d5948fae 100644
--- a/contrib/pg_upgrade/controldata.c
+++ b/contrib/pg_upgrade/controldata.c
@@ -122,10 +122,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
 		pg_fatal("Could not get control data using %s: %s\n",
 				 cmd, getErrorText(errno));
 
-	/* Only pre-8.4 has these so if they are not set below we will check later */
-	cluster->controldata.lc_collate = NULL;
-	cluster->controldata.lc_ctype = NULL;
-
 	/* Only in <= 9.2 */
 	if (GET_MAJOR_VERSION(cluster->major_version) <= 902)
 	{
@@ -404,36 +400,6 @@ get_control_data(ClusterInfo *cluster, bool live_check)
 			cluster->controldata.data_checksum_version = str2uint(p);
 			got_data_checksum_version = true;
 		}
-		/* In pre-8.4 only */
-		else if ((p = strstr(bufin, "LC_COLLATE:")) != NULL)
-		{
-			p = strchr(p, ':');
-
-			if (p == NULL || strlen(p) <= 1)
-				pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
-			p++;				/* remove ':' char */
-			/* skip leading spaces and remove trailing newline */
-			p += strspn(p, " ");
-			if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
-				*(p + strlen(p) - 1) = '\0';
-			cluster->controldata.lc_collate = pg_strdup(p);
-		}
-		/* In pre-8.4 only */
-		else if ((p = strstr(bufin, "LC_CTYPE:")) != NULL)
-		{
-			p = strchr(p, ':');
-
-			if (p == NULL || strlen(p) <= 1)
-				pg_fatal("%d: controldata retrieval problem\n", __LINE__);
-
-			p++;				/* remove ':' char */
-			/* skip leading spaces and remove trailing newline */
-			p += strspn(p, " ");
-			if (strlen(p) > 0 && *(p + strlen(p) - 1) == '\n')
-				*(p + strlen(p) - 1) = '\0';
-			cluster->controldata.lc_ctype = pg_strdup(p);
-		}
 	}
 
 	if (output)
diff --git a/contrib/pg_upgrade/info.c b/contrib/pg_upgrade/info.c
index a1773aa8e53..c347dfc4932 100644
--- a/contrib/pg_upgrade/info.c
+++ b/contrib/pg_upgrade/info.c
@@ -239,11 +239,15 @@ get_db_infos(ClusterInfo *cluster)
 	DbInfo	   *dbinfos;
 	int			i_datname,
 				i_oid,
+				i_encoding,
+				i_datcollate,
+				i_datctype,
 				i_spclocation;
 	char		query[QUERY_ALLOC];
 
 	snprintf(query, sizeof(query),
-			 "SELECT d.oid, d.datname, %s "
+			 "SELECT d.oid, d.datname, d.encoding, d.datcollate, d.datctype, "
+			 "%s AS spclocation "
 			 "FROM pg_catalog.pg_database d "
 			 " LEFT OUTER JOIN pg_catalog.pg_tablespace t "
 			 " ON d.dattablespace = t.oid "
@@ -252,12 +256,15 @@ get_db_infos(ClusterInfo *cluster)
 			 "ORDER BY 2",
 	/* 9.2 removed the spclocation column */
 			 (GET_MAJOR_VERSION(cluster->major_version) <= 901) ?
-			 "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid) AS spclocation");
+			 "t.spclocation" : "pg_catalog.pg_tablespace_location(t.oid)");
 
 	res = executeQueryOrDie(conn, "%s", query);
 
 	i_oid = PQfnumber(res, "oid");
 	i_datname = PQfnumber(res, "datname");
+	i_encoding = PQfnumber(res, "encoding");
+	i_datcollate = PQfnumber(res, "datcollate");
+	i_datctype = PQfnumber(res, "datctype");
 	i_spclocation = PQfnumber(res, "spclocation");
 
 	ntups = PQntuples(res);
@@ -267,6 +274,9 @@ get_db_infos(ClusterInfo *cluster)
 	{
 		dbinfos[tupnum].db_oid = atooid(PQgetvalue(res, tupnum, i_oid));
 		dbinfos[tupnum].db_name = pg_strdup(PQgetvalue(res, tupnum, i_datname));
+		dbinfos[tupnum].db_encoding = atoi(PQgetvalue(res, tupnum, i_encoding));
+		dbinfos[tupnum].db_collate = pg_strdup(PQgetvalue(res, tupnum, i_datcollate));
+		dbinfos[tupnum].db_ctype = pg_strdup(PQgetvalue(res, tupnum, i_datctype));
 		snprintf(dbinfos[tupnum].db_tablespace, sizeof(dbinfos[tupnum].db_tablespace), "%s",
 				 PQgetvalue(res, tupnum, i_spclocation));
 	}
diff --git a/contrib/pg_upgrade/pg_upgrade.h b/contrib/pg_upgrade/pg_upgrade.h
index 56a7505a961..65dda9780ab 100644
--- a/contrib/pg_upgrade/pg_upgrade.h
+++ b/contrib/pg_upgrade/pg_upgrade.h
@@ -180,6 +180,9 @@ typedef struct
 	char	   *db_name;		/* database name */
 	char		db_tablespace[MAXPGPATH];		/* database default tablespace
 												 * path */
+	char	   *db_collate;
+	char	   *db_ctype;
+	int			db_encoding;
 	RelInfoArr	rel_arr;		/* array of all user relinfos */
 } DbInfo;
 
@@ -218,9 +221,6 @@ typedef struct
 	bool		date_is_int;
 	bool		float8_pass_by_value;
 	bool		data_checksum_version;
-	char	   *lc_collate;
-	char	   *lc_ctype;
-	char	   *encoding;
 } ControlData;
 
 /*
-- 
GitLab