From 656beff59033ccc5261a615802e1a85da68e8fad Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 22 Dec 2005 22:50:00 +0000
Subject: [PATCH] Adjust string comparison so that only bitwise-equal strings
 are considered equal: if strcoll claims two strings are equal, check it with
 strcmp, and sort according to strcmp if not identical.  This fixes
 inconsistent behavior under glibc's hu_HU locale, and probably under some
 other locales as well.  Also, take advantage of the now-well-defined behavior
 to speed up texteq, textne, bpchareq, bpcharne: they may as well just do a
 bitwise comparison and not bother with strcoll at all.

NOTE: affected databases may need to REINDEX indexes on text columns to be
sure they are self-consistent.
---
 src/backend/access/hash/hashfunc.c |  8 ++++----
 src/backend/utils/adt/varchar.c    | 20 ++++++++++++++------
 src/backend/utils/adt/varlena.c    | 27 ++++++++++++++++++++++-----
 3 files changed, 40 insertions(+), 15 deletions(-)

diff --git a/src/backend/access/hash/hashfunc.c b/src/backend/access/hash/hashfunc.c
index 2ffca5efe6a..6cf9eb6eee2 100644
--- a/src/backend/access/hash/hashfunc.c
+++ b/src/backend/access/hash/hashfunc.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.45 2005/10/15 02:49:08 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/access/hash/hashfunc.c,v 1.46 2005/12/22 22:50:00 tgl Exp $
  *
  * NOTES
  *	  These functions are stored in pg_amproc.	For each operator class
@@ -138,9 +138,9 @@ hashtext(PG_FUNCTION_ARGS)
 	Datum		result;
 
 	/*
-	 * Note: this is currently identical in behavior to hashvarlena, but it
-	 * seems likely that we may need to do something different in non-C
-	 * locales.  (See also hashbpchar, if so.)
+	 * Note: this is currently identical in behavior to hashvarlena, but
+	 * keep it as a separate function in case we someday want to do something
+	 * different in non-C locales.  (See also hashbpchar, if so.)
 	 */
 	result = hash_any((unsigned char *) VARDATA(key),
 					  VARSIZE(key) - VARHDRSZ);
diff --git a/src/backend/utils/adt/varchar.c b/src/backend/utils/adt/varchar.c
index 1377e7cc6d2..8fd46531b84 100644
--- a/src/backend/utils/adt/varchar.c
+++ b/src/backend/utils/adt/varchar.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.113 2005/10/15 02:49:30 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varchar.c,v 1.114 2005/12/22 22:50:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -614,11 +614,14 @@ bpchareq(PG_FUNCTION_ARGS)
 	len1 = bcTruelen(arg1);
 	len2 = bcTruelen(arg2);
 
-	/* fast path for different-length inputs */
+	/*
+	 * Since we only care about equality or inequality, we can skip the
+	 * expense of strcoll() here and just do a bitwise comparison.
+	 */
 	if (len1 != len2)
 		result = false;
 	else
-		result = (varstr_cmp(VARDATA(arg1), len1, VARDATA(arg2), len2) == 0);
+		result = (strncmp(VARDATA(arg1), VARDATA(arg2), len1) == 0);
 
 	PG_FREE_IF_COPY(arg1, 0);
 	PG_FREE_IF_COPY(arg2, 1);
@@ -638,11 +641,14 @@ bpcharne(PG_FUNCTION_ARGS)
 	len1 = bcTruelen(arg1);
 	len2 = bcTruelen(arg2);
 
-	/* fast path for different-length inputs */
+	/*
+	 * Since we only care about equality or inequality, we can skip the
+	 * expense of strcoll() here and just do a bitwise comparison.
+	 */
 	if (len1 != len2)
 		result = true;
 	else
-		result = (varstr_cmp(VARDATA(arg1), len1, VARDATA(arg2), len2) != 0);
+		result = (strncmp(VARDATA(arg1), VARDATA(arg2), len1) != 0);
 
 	PG_FREE_IF_COPY(arg1, 0);
 	PG_FREE_IF_COPY(arg2, 1);
@@ -789,7 +795,9 @@ bpchar_smaller(PG_FUNCTION_ARGS)
  * bpchar needs a specialized hash function because we want to ignore
  * trailing blanks in comparisons.
  *
- * XXX is there any need for locale-specific behavior here?
+ * Note: currently there is no need for locale-specific behavior here,
+ * but if we ever change the semantics of bpchar comparison to trust
+ * strcoll() completely, we'd need to do something different in non-C locales.
  */
 Datum
 hashbpchar(PG_FUNCTION_ARGS)
diff --git a/src/backend/utils/adt/varlena.c b/src/backend/utils/adt/varlena.c
index 69544ea90f6..20b40fe2e5d 100644
--- a/src/backend/utils/adt/varlena.c
+++ b/src/backend/utils/adt/varlena.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.141 2005/11/22 18:17:23 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/varlena.c,v 1.142 2005/12/22 22:50:00 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -938,6 +938,15 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2)
 
 		result = strcoll(a1p, a2p);
 
+		/*
+		 * In some locales strcoll() can claim that nonidentical strings are
+		 * equal.  Trusting that would be inconsistent with hashing and with
+		 * the bitwise equality operators, so we follow Perl's lead and order
+		 * such "equal" strings by strcmp().
+		 */
+		if (result == 0)
+			result = strcmp(a1p, a2p);
+
 		if (a1p != a1buf)
 			pfree(a1p);
 		if (a2p != a2buf)
@@ -984,11 +993,15 @@ texteq(PG_FUNCTION_ARGS)
 	text	   *arg2 = PG_GETARG_TEXT_P(1);
 	bool		result;
 
-	/* fast path for different-length inputs */
+	/*
+	 * Since we only care about equality or inequality, we can skip the
+	 * expense of strcoll() here and just do a bitwise comparison.
+	 */
 	if (VARSIZE(arg1) != VARSIZE(arg2))
 		result = false;
 	else
-		result = (text_cmp(arg1, arg2) == 0);
+		result = (strncmp(VARDATA(arg1), VARDATA(arg2),
+						  VARSIZE(arg1) - VARHDRSZ) == 0);
 
 	PG_FREE_IF_COPY(arg1, 0);
 	PG_FREE_IF_COPY(arg2, 1);
@@ -1003,11 +1016,15 @@ textne(PG_FUNCTION_ARGS)
 	text	   *arg2 = PG_GETARG_TEXT_P(1);
 	bool		result;
 
-	/* fast path for different-length inputs */
+	/*
+	 * Since we only care about equality or inequality, we can skip the
+	 * expense of strcoll() here and just do a bitwise comparison.
+	 */
 	if (VARSIZE(arg1) != VARSIZE(arg2))
 		result = true;
 	else
-		result = (text_cmp(arg1, arg2) != 0);
+		result = (strncmp(VARDATA(arg1), VARDATA(arg2),
+						  VARSIZE(arg1) - VARHDRSZ) != 0);
 
 	PG_FREE_IF_COPY(arg1, 0);
 	PG_FREE_IF_COPY(arg2, 1);
-- 
GitLab