From a396144ac03b0cf337f80201df7e4663cc5a8131 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 22 Jan 2016 11:53:06 -0500
Subject: [PATCH] Remove new coupling between NAMEDATALEN and
 MAX_LEVENSHTEIN_STRLEN.

Commit e529cd4ffa605c6f introduced a StaticAssert requiring NAMEDATALEN to
be no greater than MAX_LEVENSHTEIN_STRLEN, which has been 255 for a long
time.  Since up to that instant we had always allowed NAMEDATALEN to be
substantially more than that, this was ill-advised.

It's debatable whether we need MAX_LEVENSHTEIN_STRLEN at all (versus
putting a CHECK_FOR_INTERRUPTS into the loop), or whether it has to be
so tight; but this patch takes the narrower approach of just not applying
the MAX_LEVENSHTEIN_STRLEN limit to calls from the parser.

Trusting the parser for this seems reasonable, first because the strings
are limited to NAMEDATALEN which is unlikely to be hugely more than 256,
and second because the maximum distance is tightly constrained by
MAX_FUZZY_DISTANCE (though we'd forgotten to make use of that limit in one
place).  That means the cost is not really O(mn) but more like O(max(m,n)).

Relaxing the limit for user-supplied calls is left for future research;
given the lack of complaints to date, it doesn't seem very high priority.

In passing, fix confusion between lengths-in-bytes and lengths-in-chars
in comments and error messages.

Per gripe from Kevin Day; solution suggested by Robert Haas.  Back-patch
to 9.5 where the unwanted restriction was introduced.
---
 contrib/fuzzystrmatch/fuzzystrmatch.c | 29 ++++++++-------
 src/backend/parser/parse_relation.c   | 13 ++++---
 src/backend/utils/adt/levenshtein.c   | 52 +++++++++++++++------------
 src/include/utils/builtins.h          | 11 +++---
 4 files changed, 60 insertions(+), 45 deletions(-)

diff --git a/contrib/fuzzystrmatch/fuzzystrmatch.c b/contrib/fuzzystrmatch/fuzzystrmatch.c
index 92a2f1b92a6..cbac1f2381f 100644
--- a/contrib/fuzzystrmatch/fuzzystrmatch.c
+++ b/contrib/fuzzystrmatch/fuzzystrmatch.c
@@ -171,12 +171,12 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
 	/* Extract a pointer to the actual character data */
 	s_data = VARDATA_ANY(src);
 	t_data = VARDATA_ANY(dst);
-	/* Determine length of each string in bytes and characters */
+	/* Determine length of each string in bytes */
 	s_bytes = VARSIZE_ANY_EXHDR(src);
 	t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-	PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, ins_c,
-									   del_c, sub_c));
+	PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
+									   ins_c, del_c, sub_c, false));
 }
 
 
@@ -194,12 +194,12 @@ levenshtein(PG_FUNCTION_ARGS)
 	/* Extract a pointer to the actual character data */
 	s_data = VARDATA_ANY(src);
 	t_data = VARDATA_ANY(dst);
-	/* Determine length of each string in bytes and characters */
+	/* Determine length of each string in bytes */
 	s_bytes = VARSIZE_ANY_EXHDR(src);
 	t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-	PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes, 1, 1,
-									   1));
+	PG_RETURN_INT32(varstr_levenshtein(s_data, s_bytes, t_data, t_bytes,
+									   1, 1, 1, false));
 }
 
 
@@ -221,13 +221,14 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
 	/* Extract a pointer to the actual character data */
 	s_data = VARDATA_ANY(src);
 	t_data = VARDATA_ANY(dst);
-	/* Determine length of each string in bytes and characters */
+	/* Determine length of each string in bytes */
 	s_bytes = VARSIZE_ANY_EXHDR(src);
 	t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-	PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
-												  t_bytes, ins_c, del_c,
-												  sub_c, max_d));
+	PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
+												  t_data, t_bytes,
+												  ins_c, del_c, sub_c,
+												  max_d, false));
 }
 
 
@@ -246,12 +247,14 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
 	/* Extract a pointer to the actual character data */
 	s_data = VARDATA_ANY(src);
 	t_data = VARDATA_ANY(dst);
-	/* Determine length of each string in bytes and characters */
+	/* Determine length of each string in bytes */
 	s_bytes = VARSIZE_ANY_EXHDR(src);
 	t_bytes = VARSIZE_ANY_EXHDR(dst);
 
-	PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes, t_data,
-												  t_bytes, 1, 1, 1, max_d));
+	PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data, s_bytes,
+												  t_data, t_bytes,
+												  1, 1, 1,
+												  max_d, false));
 }
 
 
diff --git a/src/backend/parser/parse_relation.c b/src/backend/parser/parse_relation.c
index 632eb29312e..81332b57d93 100644
--- a/src/backend/parser/parse_relation.c
+++ b/src/backend/parser/parse_relation.c
@@ -550,7 +550,8 @@ updateFuzzyAttrMatchState(int fuzzy_rte_penalty,
 		varstr_levenshtein_less_equal(actual, strlen(actual), match, matchlen,
 									  1, 1, 1,
 									  fuzzystate->distance + 1
-									  - fuzzy_rte_penalty);
+									  - fuzzy_rte_penalty,
+									  true);
 
 	/*
 	 * If more than half the characters are different, don't treat it as a
@@ -843,10 +844,12 @@ searchRangeTableForCol(ParseState *pstate, const char *alias, char *colname,
 			 */
 			if (alias != NULL)
 				fuzzy_rte_penalty =
-					varstr_levenshtein(alias, strlen(alias),
-									   rte->eref->aliasname,
-									   strlen(rte->eref->aliasname),
-									   1, 1, 1);
+					varstr_levenshtein_less_equal(alias, strlen(alias),
+												  rte->eref->aliasname,
+												strlen(rte->eref->aliasname),
+												  1, 1, 1,
+												  MAX_FUZZY_DISTANCE + 1,
+												  true);
 
 			/*
 			 * Scan for a matching column; if we find an exact match, we're
diff --git a/src/backend/utils/adt/levenshtein.c b/src/backend/utils/adt/levenshtein.c
index a499a20df9d..f40557b847e 100644
--- a/src/backend/utils/adt/levenshtein.c
+++ b/src/backend/utils/adt/levenshtein.c
@@ -26,9 +26,16 @@
 #define MAX_LEVENSHTEIN_STRLEN		255
 
 /*
- * Calculates Levenshtein distance metric between supplied csrings, which are
- * not necessarily null-terminated.  Generally (1, 1, 1) penalty costs suffices
- * for common cases, but your mileage may vary.
+ * Calculates Levenshtein distance metric between supplied strings, which are
+ * not necessarily null-terminated.
+ *
+ * source: source string, of length slen bytes.
+ * target: target string, of length tlen bytes.
+ * ins_c, del_c, sub_c: costs to charge for character insertion, deletion,
+ *		and substitution respectively; (1, 1, 1) costs suffice for common
+ *		cases, but your mileage may vary.
+ * max_d: if provided and >= 0, maximum distance we care about; see below.
+ * trusted: caller is trusted and need not obey MAX_LEVENSHTEIN_STRLEN.
  *
  * One way to compute Levenshtein distance is to incrementally construct
  * an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@@ -43,7 +50,7 @@
  * array.
  *
  * If max_d >= 0, we only need to provide an accurate answer when that answer
- * is less than or equal to the bound.  From any cell in the matrix, there is
+ * is less than or equal to max_d.  From any cell in the matrix, there is
  * theoretical "minimum residual distance" from that cell to the last column
  * of the final row.  This minimum residual distance is zero when the
  * untransformed portions of the strings are of equal length (because we might
@@ -58,12 +65,15 @@
  */
 int
 #ifdef LEVENSHTEIN_LESS_EQUAL
-varstr_levenshtein_less_equal(const char *source, int slen, const char *target,
-							  int tlen, int ins_c, int del_c, int sub_c,
-							  int max_d)
+varstr_levenshtein_less_equal(const char *source, int slen,
+							  const char *target, int tlen,
+							  int ins_c, int del_c, int sub_c,
+							  int max_d, bool trusted)
 #else
-varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
-				   int ins_c, int del_c, int sub_c)
+varstr_levenshtein(const char *source, int slen,
+				   const char *target, int tlen,
+				   int ins_c, int del_c, int sub_c,
+				   bool trusted)
 #endif
 {
 	int			m,
@@ -95,15 +105,7 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
 #define STOP_COLUMN m
 #endif
 
-	/*
-	 * A common use for Levenshtein distance is to match attributes when
-	 * building diagnostic, user-visible messages.  Restrict the size of
-	 * MAX_LEVENSHTEIN_STRLEN at compile time so that this is guaranteed to
-	 * work.
-	 */
-	StaticAssertStmt(NAMEDATALEN <= MAX_LEVENSHTEIN_STRLEN,
-					 "Levenshtein hinting mechanism restricts NAMEDATALEN");
-
+	/* Convert string lengths (in bytes) to lengths in characters */
 	m = pg_mbstrlen_with_len(source, slen);
 	n = pg_mbstrlen_with_len(target, tlen);
 
@@ -118,14 +120,18 @@ varstr_levenshtein(const char *source, int slen, const char *target, int tlen,
 
 	/*
 	 * For security concerns, restrict excessive CPU+RAM usage. (This
-	 * implementation uses O(m) memory and has O(mn) complexity.)
+	 * implementation uses O(m) memory and has O(mn) complexity.)  If
+	 * "trusted" is true, caller is responsible for not making excessive
+	 * requests, typically by using a small max_d along with strings that are
+	 * bounded, though not necessarily to MAX_LEVENSHTEIN_STRLEN exactly.
 	 */
-	if (m > MAX_LEVENSHTEIN_STRLEN ||
-		n > MAX_LEVENSHTEIN_STRLEN)
+	if (!trusted &&
+		(m > MAX_LEVENSHTEIN_STRLEN ||
+		 n > MAX_LEVENSHTEIN_STRLEN))
 		ereport(ERROR,
 				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
-				 errmsg("argument exceeds the maximum length of %d bytes",
-						MAX_LEVENSHTEIN_STRLEN)));
+		errmsg("levenshtein argument exceeds maximum length of %d characters",
+			   MAX_LEVENSHTEIN_STRLEN)));
 
 #ifdef LEVENSHTEIN_LESS_EQUAL
 	/* Initialize start and stop columns. */
diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h
index 477fde1f81d..3c134a3aa96 100644
--- a/src/include/utils/builtins.h
+++ b/src/include/utils/builtins.h
@@ -810,11 +810,14 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
 extern Datum name_text(PG_FUNCTION_ARGS);
 extern Datum text_name(PG_FUNCTION_ARGS);
 extern int	varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid);
-extern int varstr_levenshtein(const char *source, int slen, const char *target,
-				   int tlen, int ins_c, int del_c, int sub_c);
+extern int varstr_levenshtein(const char *source, int slen,
+				   const char *target, int tlen,
+				   int ins_c, int del_c, int sub_c,
+				   bool trusted);
 extern int varstr_levenshtein_less_equal(const char *source, int slen,
-							  const char *target, int tlen, int ins_c,
-							  int del_c, int sub_c, int max_d);
+							  const char *target, int tlen,
+							  int ins_c, int del_c, int sub_c,
+							  int max_d, bool trusted);
 extern List *textToQualifiedNameList(text *textval);
 extern bool SplitIdentifierString(char *rawstring, char separator,
 					  List **namelist);
-- 
GitLab