From 15f8202c204386b7a3b95b00976b4c1e7ba062c1 Mon Sep 17 00:00:00 2001 From: Andrew Dunstan <andrew@dunslane.net> Date: Sat, 2 Jun 2007 02:03:42 +0000 Subject: [PATCH] Improve efficiency of LIKE/ILIKE code, especially for multi-byte charsets, and most especially for UTF8. Remove unnecessary special cases for bytea processing and single-byte charset ILIKE. a ILIKE b is now processed as lower(a) LIKE lower(b) in all cases. The code is now considerably simpler. All comparisons are now performed byte-wise, and the text and pattern are also advanced byte-wise where it is safe to do so - essentially where a wildcard is not being matched. Andrew Dunstan, from an original patch by ITAGAKI Takahiro, with ideas from Tom Lane and Mark Mielke. --- src/backend/utils/adt/like.c | 402 ++++++----------------------- src/backend/utils/adt/like_match.c | 220 ++++++++-------- 2 files changed, 183 insertions(+), 439 deletions(-) diff --git a/src/backend/utils/adt/like.c b/src/backend/utils/adt/like.c index 46f223b38ff..de5d7e7c859 100644 --- a/src/backend/utils/adt/like.c +++ b/src/backend/utils/adt/like.c @@ -11,7 +11,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/like.c,v 1.68 2007/02/27 23:48:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/like.c,v 1.69 2007/06/02 02:03:42 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -28,21 +28,23 @@ #define LIKE_ABORT (-1) -static int MatchText(char *t, int tlen, char *p, int plen); -static int MatchTextIC(char *t, int tlen, char *p, int plen); -static int MatchBytea(char *t, int tlen, char *p, int plen); -static text *do_like_escape(text *, text *); +static int SB_MatchText(char *t, int tlen, char *p, int plen); +static text *SB_do_like_escape(text *, text *); -static int MBMatchText(char *t, int tlen, char *p, int plen); -static int MBMatchTextIC(char *t, int tlen, char *p, int plen); +static 
int MB_MatchText(char *t, int tlen, char *p, int plen); static text *MB_do_like_escape(text *, text *); +static int UTF8_MatchText(char *t, int tlen, char *p, int plen); + +static int GenericMatchText(char *s, int slen, char* p, int plen); +static int Generic_Text_IC_like(text *str, text *pat); + /*-------------------- * Support routine for MatchText. Compares given multibyte streams * as wide characters. If they match, returns 1 otherwise returns 0. *-------------------- */ -static int +static inline int wchareq(char *p1, char *p2) { int p1_len; @@ -72,15 +74,12 @@ wchareq(char *p1, char *p2) * of getting a single character transformed to the system's wchar_t format. * So now, we just downcase the strings using lower() and apply regular LIKE * comparison. This should be revisited when we install better locale support. - * - * Note that MBMatchText and MBMatchTextIC do exactly the same thing now. - * Is it worth refactoring to avoid duplicated code? They might become - * different again in the future. 
*/ +#define NextByte(p, plen) ((p)++, (plen)--) + /* Set up to compile like_match.c for multibyte characters */ -#define CHAREQ(p1, p2) wchareq(p1, p2) -#define ICHAREQ(p1, p2) wchareq(p1, p2) +#define CHAREQ(p1, p2) wchareq((p1), (p2)) #define NextChar(p, plen) \ do { int __l = pg_mblen(p); (p) +=__l; (plen) -=__l; } while (0) #define CopyAdvChar(dst, src, srclen) \ @@ -90,33 +89,59 @@ wchareq(char *p1, char *p2) *(dst)++ = *(src)++; \ } while (0) -#define MatchText MBMatchText -#define MatchTextIC MBMatchTextIC +#define MatchText MB_MatchText #define do_like_escape MB_do_like_escape #include "like_match.c" -#undef CHAREQ -#undef ICHAREQ -#undef NextChar -#undef CopyAdvChar -#undef MatchText -#undef MatchTextIC -#undef do_like_escape - /* Set up to compile like_match.c for single-byte characters */ #define CHAREQ(p1, p2) (*(p1) == *(p2)) -#define ICHAREQ(p1, p2) (tolower((unsigned char) *(p1)) == tolower((unsigned char) *(p2))) -#define NextChar(p, plen) ((p)++, (plen)--) +#define NextChar(p, plen) NextByte((p), (plen)) #define CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) +#define MatchText SB_MatchText +#define do_like_escape SB_do_like_escape + +#include "like_match.c" + + +/* setup to compile like_match.c for UTF8 encoding, using fast NextChar */ + +#define NextChar(p, plen) \ + do { (p)++; (plen)--; } while ((plen) > 0 && (*(p) & 0xC0) == 0x80 ) +#define MatchText UTF8_MatchText + #include "like_match.c" -/* And some support for BYTEA */ -#define BYTEA_CHAREQ(p1, p2) (*(p1) == *(p2)) -#define BYTEA_NextChar(p, plen) ((p)++, (plen)--) -#define BYTEA_CopyAdvChar(dst, src, srclen) (*(dst)++ = *(src)++, (srclen)--) +static inline int +GenericMatchText(char *s, int slen, char* p, int plen) +{ + if (pg_database_encoding_max_length() == 1) + return SB_MatchText(s, slen, p, plen); + else if (GetDatabaseEncoding() == PG_UTF8) + return UTF8_MatchText(s, slen, p, plen); + else + return MB_MatchText(s, slen, p, plen); +} + +static inline int 
+Generic_Text_IC_like(text *str, text *pat) +{ + char *s, + *p; + int slen, + plen; + + /* Force inputs to lower case to achieve case insensitivity */ + str = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(str))); + pat = DatumGetTextP(DirectFunctionCall1(lower, PointerGetDatum(pat))); + s = VARDATA(str); + slen = (VARSIZE(str) - VARHDRSZ); + p = VARDATA(pat); + plen = (VARSIZE(pat) - VARHDRSZ); + return GenericMatchText(s, slen, p, plen); +} /* * interface routines called by the function manager @@ -138,10 +163,7 @@ namelike(PG_FUNCTION_ARGS) p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); - if (pg_database_encoding_max_length() == 1) - result = (MatchText(s, slen, p, plen) == LIKE_TRUE); - else - result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE); + result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -162,10 +184,7 @@ namenlike(PG_FUNCTION_ARGS) p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); - if (pg_database_encoding_max_length() == 1) - result = (MatchText(s, slen, p, plen) != LIKE_TRUE); - else - result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE); + result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -186,10 +205,7 @@ textlike(PG_FUNCTION_ARGS) p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); - if (pg_database_encoding_max_length() == 1) - result = (MatchText(s, slen, p, plen) == LIKE_TRUE); - else - result = (MBMatchText(s, slen, p, plen) == LIKE_TRUE); + result = (GenericMatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -210,10 +226,7 @@ textnlike(PG_FUNCTION_ARGS) p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); - if (pg_database_encoding_max_length() == 1) - result = (MatchText(s, slen, p, plen) != LIKE_TRUE); - else - result = (MBMatchText(s, slen, p, plen) != LIKE_TRUE); + result = (GenericMatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -234,7 +247,7 @@ bytealike(PG_FUNCTION_ARGS) p = VARDATA(pat); plen = 
(VARSIZE(pat) - VARHDRSZ); - result = (MatchBytea(s, slen, p, plen) == LIKE_TRUE); + result = (SB_MatchText(s, slen, p, plen) == LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -255,7 +268,7 @@ byteanlike(PG_FUNCTION_ARGS) p = VARDATA(pat); plen = (VARSIZE(pat) - VARHDRSZ); - result = (MatchBytea(s, slen, p, plen) != LIKE_TRUE); + result = (SB_MatchText(s, slen, p, plen) != LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -270,37 +283,11 @@ nameiclike(PG_FUNCTION_ARGS) Name str = PG_GETARG_NAME(0); text *pat = PG_GETARG_TEXT_P(1); bool result; - char *s, - *p; - int slen, - plen; - - if (pg_database_encoding_max_length() == 1) - { - s = NameStr(*str); - slen = strlen(s); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE); - } - else - { - /* Force inputs to lower case to achieve case insensitivity */ - text *strtext; + text *strtext; - strtext = DatumGetTextP(DirectFunctionCall1(name_text, + strtext = DatumGetTextP(DirectFunctionCall1(name_text, NameGetDatum(str))); - strtext = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(strtext))); - pat = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(pat))); - - s = VARDATA(strtext); - slen = (VARSIZE(strtext) - VARHDRSZ); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE); - } + result = (Generic_Text_IC_like(strtext, pat) == LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -311,37 +298,11 @@ nameicnlike(PG_FUNCTION_ARGS) Name str = PG_GETARG_NAME(0); text *pat = PG_GETARG_TEXT_P(1); bool result; - char *s, - *p; - int slen, - plen; - - if (pg_database_encoding_max_length() == 1) - { - s = NameStr(*str); - slen = strlen(s); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE); - } - else - { - /* Force inputs to lower case to achieve case insensitivity */ - text *strtext; + text *strtext; - strtext = DatumGetTextP(DirectFunctionCall1(name_text, 
+ strtext = DatumGetTextP(DirectFunctionCall1(name_text, NameGetDatum(str))); - strtext = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(strtext))); - pat = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(pat))); - - s = VARDATA(strtext); - slen = (VARSIZE(strtext) - VARHDRSZ); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE); - } + result = (Generic_Text_IC_like(strtext, pat) != LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -352,32 +313,8 @@ texticlike(PG_FUNCTION_ARGS) text *str = PG_GETARG_TEXT_P(0); text *pat = PG_GETARG_TEXT_P(1); bool result; - char *s, - *p; - int slen, - plen; - if (pg_database_encoding_max_length() == 1) - { - s = VARDATA(str); - slen = (VARSIZE(str) - VARHDRSZ); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MatchTextIC(s, slen, p, plen) == LIKE_TRUE); - } - else - { - /* Force inputs to lower case to achieve case insensitivity */ - str = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(str))); - pat = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(pat))); - s = VARDATA(str); - slen = (VARSIZE(str) - VARHDRSZ); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MBMatchTextIC(s, slen, p, plen) == LIKE_TRUE); - } + result = (Generic_Text_IC_like(str, pat) == LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -388,32 +325,8 @@ texticnlike(PG_FUNCTION_ARGS) text *str = PG_GETARG_TEXT_P(0); text *pat = PG_GETARG_TEXT_P(1); bool result; - char *s, - *p; - int slen, - plen; - if (pg_database_encoding_max_length() == 1) - { - s = VARDATA(str); - slen = (VARSIZE(str) - VARHDRSZ); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MatchTextIC(s, slen, p, plen) != LIKE_TRUE); - } - else - { - /* Force inputs to lower case to achieve case insensitivity */ - str = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(str))); - pat = DatumGetTextP(DirectFunctionCall1(lower, - PointerGetDatum(pat))); 
- s = VARDATA(str); - slen = (VARSIZE(str) - VARHDRSZ); - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - result = (MBMatchTextIC(s, slen, p, plen) != LIKE_TRUE); - } + result = (Generic_Text_IC_like(str, pat) != LIKE_TRUE); PG_RETURN_BOOL(result); } @@ -430,7 +343,7 @@ like_escape(PG_FUNCTION_ARGS) text *result; if (pg_database_encoding_max_length() == 1) - result = do_like_escape(pat, esc); + result = SB_do_like_escape(pat, esc); else result = MB_do_like_escape(pat, esc); @@ -446,179 +359,8 @@ like_escape_bytea(PG_FUNCTION_ARGS) { bytea *pat = PG_GETARG_BYTEA_P(0); bytea *esc = PG_GETARG_BYTEA_P(1); - bytea *result; - char *p, - *e, - *r; - int plen, - elen; - bool afterescape; - - p = VARDATA(pat); - plen = (VARSIZE(pat) - VARHDRSZ); - e = VARDATA(esc); - elen = (VARSIZE(esc) - VARHDRSZ); - - /* - * Worst-case pattern growth is 2x --- unlikely, but it's hardly worth - * trying to calculate the size more accurately than that. - */ - result = (text *) palloc(plen * 2 + VARHDRSZ); - r = VARDATA(result); - - if (elen == 0) - { - /* - * No escape character is wanted. Double any backslashes in the - * pattern to make them act like ordinary characters. - */ - while (plen > 0) - { - if (*p == '\\') - *r++ = '\\'; - BYTEA_CopyAdvChar(r, p, plen); - } - } - else - { - /* - * The specified escape must be only a single character. - */ - BYTEA_NextChar(e, elen); - if (elen != 0) - ereport(ERROR, - (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE), - errmsg("invalid escape string"), - errhint("Escape string must be empty or one character."))); - - e = VARDATA(esc); - - /* - * If specified escape is '\', just copy the pattern as-is. - */ - if (*e == '\\') - { - memcpy(result, pat, VARSIZE(pat)); - PG_RETURN_BYTEA_P(result); - } - - /* - * Otherwise, convert occurrences of the specified escape character to - * '\', and double occurrences of '\' --- unless they immediately - * follow an escape character! 
- */ - afterescape = false; - while (plen > 0) - { - if (BYTEA_CHAREQ(p, e) && !afterescape) - { - *r++ = '\\'; - BYTEA_NextChar(p, plen); - afterescape = true; - } - else if (*p == '\\') - { - *r++ = '\\'; - if (!afterescape) - *r++ = '\\'; - BYTEA_NextChar(p, plen); - afterescape = false; - } - else - { - BYTEA_CopyAdvChar(r, p, plen); - afterescape = false; - } - } - } - - SET_VARSIZE(result, r - ((char *) result)); + bytea *result = SB_do_like_escape((text *)pat, (text *)esc); - PG_RETURN_BYTEA_P(result); + PG_RETURN_BYTEA_P((bytea *)result); } -/* - * Same as above, but specifically for bytea (binary) datatype - */ -static int -MatchBytea(char *t, int tlen, char *p, int plen) -{ - /* Fast path for match-everything pattern */ - if ((plen == 1) && (*p == '%')) - return LIKE_TRUE; - - while ((tlen > 0) && (plen > 0)) - { - if (*p == '\\') - { - /* Next pattern char must match literally, whatever it is */ - BYTEA_NextChar(p, plen); - if ((plen <= 0) || !BYTEA_CHAREQ(t, p)) - return LIKE_FALSE; - } - else if (*p == '%') - { - /* %% is the same as % according to the SQL standard */ - /* Advance past all %'s */ - while ((plen > 0) && (*p == '%')) - BYTEA_NextChar(p, plen); - /* Trailing percent matches everything. */ - if (plen <= 0) - return LIKE_TRUE; - - /* - * Otherwise, scan for a text position at which we can match the - * rest of the pattern. - */ - while (tlen > 0) - { - /* - * Optimization to prevent most recursion: don't recurse - * unless first pattern char might match this text char. - */ - if (BYTEA_CHAREQ(t, p) || (*p == '\\') || (*p == '_')) - { - int matched = MatchBytea(t, tlen, p, plen); - - if (matched != LIKE_FALSE) - return matched; /* TRUE or ABORT */ - } - - BYTEA_NextChar(t, tlen); - } - - /* - * End of text with no match, so no point in trying later places - * to start matching this pattern. - */ - return LIKE_ABORT; - } - else if ((*p != '_') && !BYTEA_CHAREQ(t, p)) - { - /* - * Not the single-character wildcard and no explicit match? 
Then - * time to quit... - */ - return LIKE_FALSE; - } - - BYTEA_NextChar(t, tlen); - BYTEA_NextChar(p, plen); - } - - if (tlen > 0) - return LIKE_FALSE; /* end of pattern, but not of text */ - - /* End of input string. Do we have matching pattern remaining? */ - while ((plen > 0) && (*p == '%')) /* allow multiple %'s at end of - * pattern */ - BYTEA_NextChar(p, plen); - if (plen <= 0) - return LIKE_TRUE; - - /* - * End of text with no match, so no point in trying later places to start - * matching this pattern. - */ - return LIKE_ABORT; -} /* MatchBytea() */ diff --git a/src/backend/utils/adt/like_match.c b/src/backend/utils/adt/like_match.c index 22e2705fb36..62f8bc40a15 100644 --- a/src/backend/utils/adt/like_match.c +++ b/src/backend/utils/adt/like_match.c @@ -3,23 +3,21 @@ * like_match.c * like expression handling internal code. * - * This file is included by like.c *twice*, to provide an optimization - * for single-byte encodings. + * This file is included by like.c three times, to provide matching code for + * single-byte encodings, UTF8, and for other multi-byte encodings. + * UTF8 is a special case because we can use a much more efficient version + * of NextChar than can be used for other multi-byte encodings. 
* * Before the inclusion, we need to define following macros: * - * CHAREQ - * ICHAREQ - * NextChar - * CopyAdvChar - * MatchText (MBMatchText) - * MatchTextIC (MBMatchTextIC) - * do_like_escape (MB_do_like_escape) + * NextChar + * MatchText - to name of function wanted + * do_like_escape - name of function if wanted - needs CHAREQ and CopyAdvChar * * Copyright (c) 1996-2007, PostgreSQL Global Development Group * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.15 2007/02/27 23:48:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/like_match.c,v 1.16 2007/06/02 02:03:42 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -77,21 +75,36 @@ MatchText(char *t, int tlen, char *p, int plen) if ((plen == 1) && (*p == '%')) return LIKE_TRUE; + /* + * In this loop, we advance by char when matching wildcards (and thus + * on recursive entry to this function we are properly char-synced). On + * other occasions it is safe to advance by byte, as the text and pattern + * will be in lockstep. This allows us to perform all comparisons between + * the text and pattern on a byte by byte basis, even for multi-byte + * encodings. + */ + while ((tlen > 0) && (plen > 0)) { if (*p == '\\') { - /* Next pattern char must match literally, whatever it is */ - NextChar(p, plen); - if ((plen <= 0) || !CHAREQ(t, p)) + /* Next byte must match literally, whatever it is */ + NextByte(p, plen); + if ((plen <= 0) || *p != *t ) return LIKE_FALSE; } else if (*p == '%') { + /* + * % processing is essentially a search for a match for what + * follows the %, plus a recursive match of the remainder. + * We succeed if and only if both conditions are met. + */ + /* %% is the same as % according to the SQL standard */ /* Advance past all %'s */ while ((plen > 0) && (*p == '%')) - NextChar(p, plen); + NextByte(p, plen); /* Trailing percent matches everything. 
*/ if (plen <= 0) return LIKE_TRUE; @@ -100,107 +113,62 @@ MatchText(char *t, int tlen, char *p, int plen) * Otherwise, scan for a text position at which we can match the * rest of the pattern. */ - while (tlen > 0) - { - /* - * Optimization to prevent most recursion: don't recurse - * unless first pattern char might match this text char. - */ - if (CHAREQ(t, p) || (*p == '\\') || (*p == '_')) - { - int matched = MatchText(t, tlen, p, plen); + if (*p == '_') - if (matched != LIKE_FALSE) - return matched; /* TRUE or ABORT */ - } + { + /* %_ is the same as _% - avoid matching _ repeatedly */ NextChar(t, tlen); - } + NextByte(p, plen); - /* - * End of text with no match, so no point in trying later places - * to start matching this pattern. - */ - return LIKE_ABORT; - } - else if ((*p != '_') && !CHAREQ(t, p)) - { - /* - * Not the single-character wildcard and no explicit match? Then - * time to quit... - */ - return LIKE_FALSE; - } - - NextChar(t, tlen); - NextChar(p, plen); - } - - if (tlen > 0) - return LIKE_FALSE; /* end of pattern, but not of text */ + if (tlen <= 0) + { + return (plen <= 0) ? LIKE_TRUE : LIKE_ABORT; + } + else if (plen <= 0) + { + return LIKE_FALSE; + } - /* End of input string. Do we have matching pattern remaining? */ - while ((plen > 0) && (*p == '%')) /* allow multiple %'s at end of - * pattern */ - NextChar(p, plen); - if (plen <= 0) - return LIKE_TRUE; + while (tlen > 0) + { + int matched = MatchText(t, tlen, p, plen); + + if (matched != LIKE_FALSE) + return matched; /* TRUE or ABORT */ - /* - * End of text with no match, so no point in trying later places to start - * matching this pattern. 
- */ - return LIKE_ABORT; -} /* MatchText() */ + NextChar(t, tlen); + } + } + else + { -/* - * Same as above, but ignore case - */ -static int -MatchTextIC(char *t, int tlen, char *p, int plen) -{ - /* Fast path for match-everything pattern */ - if ((plen == 1) && (*p == '%')) - return LIKE_TRUE; + char firstpat = *p ; - while ((tlen > 0) && (plen > 0)) - { - if (*p == '\\') - { - /* Next pattern char must match literally, whatever it is */ - NextChar(p, plen); - if ((plen <= 0) || !ICHAREQ(t, p)) - return LIKE_FALSE; - } - else if (*p == '%') - { - /* %% is the same as % according to the SQL standard */ - /* Advance past all %'s */ - while ((plen > 0) && (*p == '%')) - NextChar(p, plen); - /* Trailing percent matches everything. */ - if (plen <= 0) - return LIKE_TRUE; + if (*p == '\\') + { + if (plen < 2) + return LIKE_FALSE; + firstpat = p[1]; + } - /* - * Otherwise, scan for a text position at which we can match the - * rest of the pattern. - */ - while (tlen > 0) - { - /* - * Optimization to prevent most recursion: don't recurse - * unless first pattern char might match this text char. - */ - if (ICHAREQ(t, p) || (*p == '\\') || (*p == '_')) + while (tlen > 0) { - int matched = MatchTextIC(t, tlen, p, plen); + /* + * Optimization to prevent most recursion: don't recurse + * unless first pattern byte matches first text byte. + */ + if (*t == firstpat) + { + int matched = MatchText(t, tlen, p, plen); + + if (matched != LIKE_FALSE) + return matched; /* TRUE or ABORT */ + } + + NextChar(t, tlen); - if (matched != LIKE_FALSE) - return matched; /* TRUE or ABORT */ } - - NextChar(t, tlen); } /* @@ -209,7 +177,13 @@ MatchTextIC(char *t, int tlen, char *p, int plen) */ return LIKE_ABORT; } - else if ((*p != '_') && !ICHAREQ(t, p)) + else if (*p == '_') + { + NextChar(t, tlen); + NextByte(p, plen); + continue; + } + else if (*t != *p) { /* * Not the single-character wildcard and no explicit match? 
Then @@ -217,9 +191,20 @@ MatchTextIC(char *t, int tlen, char *p, int plen) */ return LIKE_FALSE; } - - NextChar(t, tlen); - NextChar(p, plen); + /* + * It is safe to use NextByte instead of NextChar here, even for + * multi-byte character sets, because we are not following + * immediately after a wildcard character. + * If we are in the middle of a multibyte character, we must + * already have matched at least one byte of the character from + * both text and pattern; so we cannot get out-of-sync + * on character boundaries. And we know that no backend-legal + * encoding allows ASCII characters such as '%' to appear as + * non-first bytes of characters, so we won't mistakenly detect + * a new wildcard. + */ + NextByte(t, tlen); + NextByte(p, plen); } if (tlen > 0) @@ -228,7 +213,8 @@ MatchTextIC(char *t, int tlen, char *p, int plen) /* End of input string. Do we have matching pattern remaining? */ while ((plen > 0) && (*p == '%')) /* allow multiple %'s at end of * pattern */ - NextChar(p, plen); + NextByte(p, plen); + if (plen <= 0) return LIKE_TRUE; @@ -237,12 +223,14 @@ MatchTextIC(char *t, int tlen, char *p, int plen) * matching this pattern. */ return LIKE_ABORT; -} /* MatchTextIC() */ +} /* MatchText() */ /* * like_escape() --- given a pattern and an ESCAPE string, * convert the pattern to use Postgres' standard backslash escape convention. */ +#ifdef do_like_escape + static text * do_like_escape(text *pat, text *esc) { @@ -336,3 +324,17 @@ do_like_escape(text *pat, text *esc) return result; } +#endif /* do_like_escape */ + +#ifdef CHAREQ +#undef CHAREQ +#endif + +#undef NextChar +#undef CopyAdvChar +#undef MatchText + +#ifdef do_like_escape +#undef do_like_escape +#endif + -- GitLab