From cc39aca7d4a317a66db51a2180f7fee6e76084bd Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 13 Apr 2006 18:01:31 +0000
Subject: [PATCH] Fix similar_escape() so that SIMILAR TO works properly for
 patterns involving alternatives ("|" symbol).  The original coding allowed
 the added ^ and $ constraints to be absorbed into the first and last
 alternatives, producing a pattern that would match more than it should.  Per
 report from Eric Noriega.

I also changed the pattern to add an ARE director ("***:"), ensuring that
SIMILAR TO patterns do not change behavior if regex_flavor is changed.  This
is necessary to make the non-capturing parentheses work, and seems like a
good idea on general principles.

Back-patched as far as 7.4.  7.3 also has the bug, but a fix seems impractical
because that version's regex engine doesn't have non-capturing parens.
---
 src/backend/utils/adt/regexp.c | 32 +++++++++++++++++++++++++++++---
 1 file changed, 29 insertions(+), 3 deletions(-)

diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index 3db5ca9b6cf..c9d61de4180 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.62 2006/03/05 15:58:43 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.63 2006/04/13 18:01:31 tgl Exp $
  *
  *		Alistair Crooks added the code for the regex caching
  *		agc - cached the regular expressions used - there's a good chance
@@ -549,11 +549,36 @@ similar_escape(PG_FUNCTION_ARGS)
 				  errhint("Escape string must be empty or one character.")));
 	}
 
-	/* We need room for ^, $, and up to 2 output bytes per input byte */
-	result = (text *) palloc(VARHDRSZ + 2 + 2 * plen);
+	/*----------
+	 * We surround the transformed input string with
+	 *			***:^(?: ... )$
+	 * which is bizarre enough to require some explanation.  "***:" is a
+	 * director prefix to force the regex to be treated as an ARE regardless
+	 * of the current regex_flavor setting.  We need "^" and "$" to force
+	 * the pattern to match the entire input string as per SQL99 spec.  The
+	 * "(?:" and ")" are a non-capturing set of parens; we have to have
+	 * parens in case the string contains "|", else the "^" and "$" will
+	 * be bound into the first and last alternatives, which is not what we
+	 * want, and the parens must be non-capturing because we don't want them
+	 * to count when selecting output for SUBSTRING.
+	 *----------
+	 */
+
+	/*
+	 * We need room for the prefix/postfix (8 + 2 = 10 bytes) plus as many as
+	 * 2 output bytes per input byte
+	 */
+	result = (text *) palloc(VARHDRSZ + 10 + 2 * plen);
 	r = VARDATA(result);
 
+	*r++ = '*';
+	*r++ = '*';
+	*r++ = '*';
+	*r++ = ':';
 	*r++ = '^';
+	*r++ = '(';
+	*r++ = '?';
+	*r++ = ':';
 
 	while (plen > 0)
 	{
@@ -593,6 +618,7 @@ similar_escape(PG_FUNCTION_ARGS)
 		p++, plen--;
 	}
 
+	*r++ = ')';
 	*r++ = '$';
 
 	VARATT_SIZEP(result) = r - ((char *) result);
-- 
GitLab