From cc39aca7d4a317a66db51a2180f7fee6e76084bd Mon Sep 17 00:00:00 2001 From: Tom Lane <tgl@sss.pgh.pa.us> Date: Thu, 13 Apr 2006 18:01:31 +0000 Subject: [PATCH] Fix similar_escape() so that SIMILAR TO works properly for patterns involving alternatives ("|" symbol). The original coding allowed the added ^ and $ constraints to be absorbed into the first and last alternatives, producing a pattern that would match more than it should. Per report from Eric Noriega. I also changed the pattern to add an ARE director ("***:"), ensuring that SIMILAR TO patterns do not change behavior if regex_flavor is changed. This is necessary to make the non-capturing parentheses work, and seems like a good idea on general principles. Back-patched as far as 7.4. 7.3 also has the bug, but a fix seems impractical because that version's regex engine doesn't have non-capturing parens. --- src/backend/utils/adt/regexp.c | 32 +++++++++++++++++++++++++++++--- 1 file changed, 29 insertions(+), 3 deletions(-) diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 3db5ca9b6cf..c9d61de4180 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.62 2006/03/05 15:58:43 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.63 2006/04/13 18:01:31 tgl Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -549,11 +549,36 @@ similar_escape(PG_FUNCTION_ARGS) errhint("Escape string must be empty or one character."))); } - /* We need room for ^, $, and up to 2 output bytes per input byte */ - result = (text *) palloc(VARHDRSZ + 2 + 2 * plen); + /*---------- + * We surround the transformed input string with + * ***:^(?: ... )$ + * which is bizarre enough to require some explanation. "***:" is a + * director prefix to force the regex to be treated as an ARE regardless + * of the current regex_flavor setting. We need "^" and "$" to force + * the pattern to match the entire input string as per SQL99 spec. The + * "(?:" and ")" are a non-capturing set of parens; we have to have + * parens in case the string contains "|", else the "^" and "$" will + * be bound into the first and last alternatives which is not what we + * want, and the parens must be non capturing because we don't want them + * to count when selecting output for SUBSTRING. + *---------- + */ + + /* + * We need room for the prefix/postfix plus as many as 2 output bytes per + * input byte + */ + result = (text *) palloc(VARHDRSZ + 10 + 2 * plen); r = VARDATA(result); + *r++ = '*'; + *r++ = '*'; + *r++ = '*'; + *r++ = ':'; *r++ = '^'; + *r++ = '('; + *r++ = '?'; + *r++ = ':'; while (plen > 0) { @@ -593,6 +618,7 @@ similar_escape(PG_FUNCTION_ARGS) p++, plen--; } + *r++ = ')'; *r++ = '$'; VARATT_SIZEP(result) = r - ((char *) result); -- GitLab