diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index 2946122350b25b5a774bfa89cf35eea578d949b5..4d482ec91f02949ec542aca51801698e60416009 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -4477,13 +4477,27 @@ SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo; where no substring matching <replaceable>re</> begins (AREs only) </entry> </row> + + <row> + <entry> <literal>(?<=</><replaceable>re</><literal>)</> </entry> + <entry> <firstterm>positive lookbehind</> matches at any point + where a substring matching <replaceable>re</> ends + (AREs only) </entry> + </row> + + <row> + <entry> <literal>(?<!</><replaceable>re</><literal>)</> </entry> + <entry> <firstterm>negative lookbehind</> matches at any point + where no substring matching <replaceable>re</> ends + (AREs only) </entry> + </row> </tbody> </tgroup> </table> <para> - Lookahead constraints cannot contain <firstterm>back references</> - (see <xref linkend="posix-escape-sequences">), + Lookahead and lookbehind constraints cannot contain <firstterm>back + references</> (see <xref linkend="posix-escape-sequences">), and all parentheses within them are considered non-capturing. </para> </sect3> @@ -5355,7 +5369,7 @@ SELECT regexp_matches('abc01234xyz', '(?:(.*?)(\d+)(.*)){1,1}'); the lack of special treatment for a trailing newline, the addition of complemented bracket expressions to the things affected by newline-sensitive matching, - the restrictions on parentheses and back references in lookahead + the restrictions on parentheses and back references in lookahead/lookbehind constraints, and the longest/shortest-match (rather than first-match) matching semantics. </para> diff --git a/src/backend/regex/README b/src/backend/regex/README index 5c24d3dfe9de08641344aa14a97e0d1d1e0ca11a..6c9f48315e3a2be030288d98db53427b6016db78 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -332,10 +332,10 @@ The possible arc types are: as "$0->to_state" or "$1->to_state" for end-of-string and end-of-line constraints respectively. - LACON constraints, which represent "(?=re)" and "(?!re)" constraints, - i.e. the input starting at this point must match (or not match) a - given sub-RE, but the matching input is not consumed. These are - dumped as ":subtree_number:->to_state". + LACON constraints, which represent "(?=re)", "(?!re)", "(?<=re)", and + "(?<!re)" constraints, i.e. the input starting/ending at this point must + match (or not match) a given sub-RE, but the matching input is not + consumed. These are dumped as ":subtree_number:->to_state". If you see anything else (especially any question marks) in the display of an arc, it's dumpnfa() trying to tell you that there's something fishy diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n index f37bb85abdb8a2450caf643a8214c39643b435f8..4621bfc25f46c847675d25015fe89e807ddca957 100644 --- a/src/backend/regex/re_syntax.n +++ b/src/backend/regex/re_syntax.n @@ -196,10 +196,18 @@ where a substring matching \fIre\fR begins \fB(?!\fIre\fB)\fR \fInegative lookahead\fR (AREs only), matches at any point where no substring matching \fIre\fR begins +.TP +\fB(?<=\fIre\fB)\fR +\fIpositive lookbehind\fR (AREs only), matches at any point +where a substring matching \fIre\fR ends +.TP +\fB(?<!\fIre\fB)\fR +\fInegative lookbehind\fR (AREs only), matches at any point +where no substring matching \fIre\fR ends .RE .PP -The lookahead constraints may not contain back references (see later), -and all parentheses within them are considered non-capturing. +Lookahead and lookbehind constraints may not contain back references +(see later), and all parentheses within them are considered non-capturing. .PP An RE may not end with `\fB\e\fR'. @@ -856,7 +864,8 @@ Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR', the lack of special treatment for a trailing newline, the addition of complemented bracket expressions to the things affected by newline-sensitive matching, -the restrictions on parentheses and back references in lookahead constraints, +the restrictions on parentheses and back references in lookahead/lookbehind +constraints, and the longest/shortest-match (rather than first-match) matching semantics. .PP The matching rules for REs containing both normal and non-greedy quantifiers diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c index f6ed9f09ea4d2f4cc7f3066bc4562c6c3b80df5a..bfd9dcd2a492b25142d9f932bf74c00b6324dfc1 100644 --- a/src/backend/regex/regc_lex.c +++ b/src/backend/regex/regc_lex.c @@ -582,6 +582,8 @@ next(struct vars * v) { NOTE(REG_UNONPOSIX); v->now++; + if (ATEOS()) + FAILW(REG_BADRPT); switch (*v->now++) { case CHR(':'): /* non-capturing paren */ @@ -596,12 +598,31 @@ next(struct vars * v) return next(v); break; case CHR('='): /* positive lookahead */ - NOTE(REG_ULOOKAHEAD); - RETV(LACON, 1); + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_AHEAD_POS); break; case CHR('!'): /* negative lookahead */ - NOTE(REG_ULOOKAHEAD); - RETV(LACON, 0); + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_AHEAD_NEG); + break; + case CHR('<'): + if (ATEOS()) + FAILW(REG_BADRPT); + switch (*v->now++) + { + case CHR('='): /* positive lookbehind */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_BEHIND_POS); + break; + case CHR('!'): /* negative lookbehind */ + NOTE(REG_ULOOKAROUND); + RETV(LACON, LATYPE_BEHIND_NEG); + break; + default: + FAILW(REG_BADRPT); + break; + } + assert(NOTREACHED); break; default: FAILW(REG_BADRPT); diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c index 6f04321cd357822f76bbb58b8dd75f883d220be3..cd9a3239bd3267c2da7ea42d00e0baadc60d0d41 100644 --- a/src/backend/regex/regc_nfa.c +++ b/src/backend/regex/regc_nfa.c @@ -1348,6 +1348,49 @@ cleartraverse(struct nfa * nfa, cleartraverse(nfa, a->to); } +/* + * single_color_transition - does getting from s1 to s2 cross one PLAIN arc? + * + * If traversing from s1 to s2 requires a single PLAIN match (possibly of any + * of a set of colors), return a state whose outarc list contains only PLAIN + * arcs of those color(s). Otherwise return NULL. + * + * This is used before optimizing the NFA, so there may be EMPTY arcs, which + * we should ignore; the possibility of an EMPTY is why the result state could + * be different from s1. + * + * It's worth troubling to handle multiple parallel PLAIN arcs here because a + * bracket construct such as [abc] might yield either one or several parallel + * PLAIN arcs depending on earlier atoms in the expression. We'd rather that + * that implementation detail not create user-visible performance differences. + */ +static struct state * +single_color_transition(struct state * s1, struct state * s2) +{ + struct arc *a; + + /* Ignore leading EMPTY arc, if any */ + if (s1->nouts == 1 && s1->outs->type == EMPTY) + s1 = s1->outs->to; + /* Likewise for any trailing EMPTY arc */ + if (s2->nins == 1 && s2->ins->type == EMPTY) + s2 = s2->ins->from; + /* Perhaps we could have a single-state loop in between, if so reject */ + if (s1 == s2) + return NULL; + /* s1 must have at least one outarc... */ + if (s1->outs == NULL) + return NULL; + /* ... and they must all be PLAIN arcs to s2 */ + for (a = s1->outs; a != NULL; a = a->outchain) + { + if (a->type != PLAIN || a->to != s2) + return NULL; + } + /* OK, return s1 as the possessor of the relevant outarcs */ + return s1; +} + /* * specialcolors - fill in special colors for an NFA */ diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index b733bc7824e80500e3d278adf63e9e9f066040f4..aa759c264861b5e357c47910bf893fafe8ba1224 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -57,6 +57,8 @@ static const chr *scanplain(struct vars *); static void onechr(struct vars *, chr, struct state *, struct state *); static void dovec(struct vars *, struct cvec *, struct state *, struct state *); static void wordchrs(struct vars *); +static void processlacon(struct vars *, struct state *, struct state *, int, + struct state *, struct state *); static struct subre *subre(struct vars *, int, int, struct state *, struct state *); static void freesubre(struct vars *, struct subre *); static void freesrnode(struct vars *, struct subre *); @@ -65,7 +67,7 @@ static int numst(struct subre *, int); static void markst(struct subre *); static void cleanst(struct vars *); static long nfatree(struct vars *, struct subre *, FILE *); -static long nfanode(struct vars *, struct subre *, FILE *); +static long nfanode(struct vars *, struct subre *, int, FILE *); static int newlacon(struct vars *, struct state *, struct state *, int); static void freelacons(struct subre *, int); static void rfree(regex_t *); @@ -146,6 +148,7 @@ static void deltraverse(struct nfa *, struct state *, struct state *); static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *); static void duptraverse(struct nfa *, struct state *, struct state *); static void cleartraverse(struct nfa *, struct state *); +static struct state *single_color_transition(struct state *, struct state *); static void specialcolors(struct nfa *); static long optimize(struct nfa *, FILE *); static void pullback(struct nfa *, FILE *); @@ -245,8 +248,9 @@ struct vars int ntree; /* number of tree nodes, plus one */ struct cvec *cv; /* interface cvec */ struct cvec *cv2; /* utility cvec */ - struct subre *lacons; /* lookahead-constraint vector */ - int nlacons; /* size of lacons */ + struct subre *lacons; /* lookaround-constraint vector */ + int nlacons; /* size of lacons[]; note that only slots + * numbered 1 .. nlacons-1 are used */ size_t spaceused; /* approx. space used for compilation */ }; @@ -277,7 +281,7 @@ struct vars #define CCLASS 'C' /* start of [: */ #define END 'X' /* end of [. [= [: */ #define RANGE 'R' /* - within [] which might be range delim. */ -#define LACON 'L' /* lookahead constraint subRE */ +#define LACON 'L' /* lookaround constraint subRE */ #define AHEAD 'a' /* color-lookahead arc */ #define BEHIND 'r' /* color-lookbehind arc */ #define WBDRY 'w' /* word boundary constraint */ @@ -432,11 +436,15 @@ pg_regcomp(regex_t *re, assert(v->nlacons == 0 || v->lacons != NULL); for (i = 1; i < v->nlacons; i++) { + struct subre *lasub = &v->lacons[i]; + #ifdef REG_DEBUG if (debug != NULL) fprintf(debug, "\n\n\n========= LA%d ==========\n", i); #endif - nfanode(v, &v->lacons[i], debug); + + /* Prepend .* to pattern if it's a lookbehind LACON */ + nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug); } CNOERR(); if (v->tree->flags & SHORTER) @@ -640,7 +648,7 @@ makesearch(struct vars * v, static struct subre * parse(struct vars * v, int stopper, /* EOS or ')' */ - int type, /* LACON (lookahead subRE) or PLAIN */ + int type, /* LACON (lookaround subRE) or PLAIN */ struct state * init, /* initial state */ struct state * final) /* final state */ { @@ -719,7 +727,7 @@ parse(struct vars * v, static struct subre * parsebranch(struct vars * v, int stopper, /* EOS or ')' */ - int type, /* LACON (lookahead subRE) or PLAIN */ + int type, /* LACON (lookaround subRE) or PLAIN */ struct state * left, /* leftmost state */ struct state * right, /* rightmost state */ int partial) /* is this only part of a branch? */ @@ -768,7 +776,7 @@ parsebranch(struct vars * v, static void parseqatom(struct vars * v, int stopper, /* EOS or ')' */ - int type, /* LACON (lookahead subRE) or PLAIN */ + int type, /* LACON (lookaround subRE) or PLAIN */ struct state * lp, /* left state to hang it on */ struct state * rp, /* right state to hang it on */ struct subre * top) /* subtree top */ @@ -782,7 +790,7 @@ parseqatom(struct vars * v, struct subre *atom; /* atom's subtree */ struct subre *t; int cap; /* capturing parens? */ - int pos; /* positive lookahead? */ + int latype; /* lookaround constraint type */ int subno; /* capturing-parens or backref number */ int atomtype; int qprefer; /* quantifier short/long preference */ @@ -866,19 +874,18 @@ parseqatom(struct vars * v, nonword(v, AHEAD, s, rp); return; break; - case LACON: /* lookahead constraint */ - pos = v->nextvalue; + case LACON: /* lookaround constraint */ + latype = v->nextvalue; NEXT(); s = newstate(v->nfa); s2 = newstate(v->nfa); NOERR(); t = parse(v, ')', LACON, s, s2); freesubre(v, t); /* internal structure irrelevant */ - assert(SEE(')') || ISERR()); - NEXT(); - n = newlacon(v, s, s2, pos); NOERR(); - ARCV(LACON, n); + assert(SEE(')')); + NEXT(); + processlacon(v, s, s2, latype, lp, rp); return; break; /* then errors, to get them out of the way */ @@ -1633,6 +1640,75 @@ wordchrs(struct vars * v) v->wordchrs = left; } +/* + * processlacon - generate the NFA representation of a LACON + * + * In the general case this is just newlacon() + newarc(), but some cases + * can be optimized. + */ +static void +processlacon(struct vars * v, + struct state * begin, /* start of parsed LACON sub-re */ + struct state * end, /* end of parsed LACON sub-re */ + int latype, + struct state * lp, /* left state to hang it on */ + struct state * rp) /* right state to hang it on */ +{ + struct state *s1; + int n; + + /* + * Check for lookaround RE consisting of a single plain color arc (or set + * of arcs); this would typically be a simple chr or a bracket expression. + */ + s1 = single_color_transition(begin, end); + switch (latype) + { + case LATYPE_AHEAD_POS: + /* If lookahead RE is just colorset C, convert to AHEAD(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, AHEAD); + return; + } + break; + case LATYPE_AHEAD_NEG: + /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp); + newarc(v->nfa, '$', 1, lp, rp); + newarc(v->nfa, '$', 0, lp, rp); + return; + } + break; + case LATYPE_BEHIND_POS: + /* If lookbehind RE is just colorset C, convert to BEHIND(C) */ + if (s1 != NULL) + { + cloneouts(v->nfa, s1, lp, rp, BEHIND); + return; + } + break; + case LATYPE_BEHIND_NEG: + /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */ + if (s1 != NULL) + { + colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp); + newarc(v->nfa, '^', 1, lp, rp); + newarc(v->nfa, '^', 0, lp, rp); + return; + } + break; + default: + assert(NOTREACHED); + } + + /* General case: we need a LACON subre and arc */ + n = newlacon(v, begin, end, latype); + newarc(v->nfa, LACON, n, lp, rp); +} + /* * subre - allocate a subre */ @@ -1826,15 +1902,18 @@ nfatree(struct vars * v, if (t->right != NULL) (DISCARD) nfatree(v, t->right, f); - return nfanode(v, t, f); + return nfanode(v, t, 0, f); } /* - * nfanode - do one NFA for nfatree + * nfanode - do one NFA for nfatree or lacons + * + * If converttosearch is true, apply makesearch() to the NFA. */ static long /* optimize results */ nfanode(struct vars * v, struct subre * t, + int converttosearch, FILE *f) /* for debug output */ { struct nfa *nfa; @@ -1855,10 +1934,11 @@ nfanode(struct vars * v, NOERRZ(); dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final); if (!ISERR()) - { specialcolors(nfa); + if (!ISERR()) ret = optimize(nfa, f); - } + if (converttosearch && !ISERR()) + makesearch(v, nfa); if (!ISERR()) compact(nfa, &t->cnfa); @@ -1867,13 +1947,13 @@ nfanode(struct vars * v, } /* - * newlacon - allocate a lookahead-constraint subRE + * newlacon - allocate a lookaround-constraint subRE */ static int /* lacon number */ newlacon(struct vars * v, struct state * begin, struct state * end, - int pos) + int latype) { int n; struct subre *newlacons; @@ -1900,13 +1980,13 @@ newlacon(struct vars * v, sub = &v->lacons[n]; sub->begin = begin; sub->end = end; - sub->subno = pos; + sub->subno = latype; ZAPCNFA(sub->cnfa); return n; } /* - * freelacons - free lookahead-constraint subRE vector + * freelacons - free lookaround-constraint subRE vector */ static void freelacons(struct subre * subs, @@ -2020,9 +2100,29 @@ dump(regex_t *re, } for (i = 1; i < g->nlacons; i++) { - fprintf(f, "\nla%d (%s):\n", i, - (g->lacons[i].subno) ? "positive" : "negative"); - dumpcnfa(&g->lacons[i].cnfa, f); + struct subre *lasub = &g->lacons[i]; + const char *latype; + + switch (lasub->subno) + { + case LATYPE_AHEAD_POS: + latype = "positive lookahead"; + break; + case LATYPE_AHEAD_NEG: + latype = "negative lookahead"; + break; + case LATYPE_BEHIND_POS: + latype = "positive lookbehind"; + break; + case LATYPE_BEHIND_NEG: + latype = "negative lookbehind"; + break; + default: + latype = "???"; + break; + } + fprintf(f, "\nla%d (%s):\n", i, latype); + dumpcnfa(&lasub->cnfa, f); } fprintf(f, "\n"); dumpst(g->tree, f, 0); diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c index a37e4b0ef96660c47b01c9fe32e10a0b39d511b9..7d90242acefd50b33fc7c2551e1cf830b5752fee 100644 --- a/src/backend/regex/rege_dfa.c +++ b/src/backend/regex/rege_dfa.c @@ -286,6 +286,130 @@ shortest(struct vars * v, return cp; } +/* + * matchuntil - incremental matching engine + * + * This is meant for use with a search-style NFA (that is, the pattern is + * known to act as though it had a leading .*). We determine whether a + * match exists starting at v->start and ending at probe. Multiple calls + * require only O(N) time not O(N^2) so long as the probe values are + * nondecreasing. *lastcss and *lastcp must be initialized to NULL before + * starting a series of calls. + * + * Returns 1 if a match exists, 0 if not. + * Internal errors also return 0, with v->err set. + */ +static int +matchuntil(struct vars * v, + struct dfa * d, + chr *probe, /* we want to know if a match ends here */ + struct sset ** lastcss, /* state storage across calls */ + chr **lastcp) /* state storage across calls */ +{ + chr *cp = *lastcp; + color co; + struct sset *css = *lastcss; + struct sset *ss; + struct colormap *cm = d->cm; + + /* initialize and startup, or restart, if necessary */ + if (cp == NULL || cp > probe) + { + cp = v->start; + css = initialize(v, d, cp); + if (css == NULL) + return 0; + + FDEBUG((">>> startup >>>\n")); + co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + + css = miss(v, d, css, co, cp, v->start); + if (css == NULL) + return 0; + css->lastseen = cp; + } + else if (css == NULL) + { + /* we previously found that no match is possible beyond *lastcp */ + return 0; + } + ss = css; + + /* + * This is the main text-scanning loop. It seems worth having two copies + * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG + * builds, when you're not actively tracing. + */ +#ifdef REG_DEBUG + if (v->eflags & REG_FTRACE) + { + while (cp < probe) + { + FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, v->start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + else +#endif + { + while (cp < probe) + { + co = GETCOLOR(cm, *cp); + ss = css->outs[co]; + if (ss == NULL) + { + ss = miss(v, d, css, co, cp + 1, v->start); + if (ss == NULL) + break; /* NOTE BREAK OUT */ + } + cp++; + ss->lastseen = cp; + css = ss; + } + } + + *lastcss = ss; + *lastcp = cp; + + if (ss == NULL) + return 0; /* impossible match, or internal error */ + + /* We need to process one more chr, or the EOS symbol, to check match */ + if (cp < v->stop) + { + FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets))); + co = GETCOLOR(cm, *cp); + FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co)); + ss = css->outs[co]; + if (ss == NULL) + ss = miss(v, d, css, co, cp + 1, v->start); + } + else + { + assert(cp == v->stop); + co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1]; + FDEBUG(("color %ld\n", (long) co)); + ss = miss(v, d, css, co, cp, v->start); + } + + if (ss == NULL || !(ss->flags & POSTSTATE)) + return 0; + + return 1; +} + /* * lastcold - determine last point at which no progress had been made */ @@ -613,19 +737,19 @@ miss(struct vars * v, } /* - * lacon - lookahead-constraint checker for miss() + * lacon - lookaround-constraint checker for miss() */ static int /* predicate: constraint satisfied? */ lacon(struct vars * v, struct cnfa * pcnfa, /* parent cnfa */ chr *cp, - pcolor co) /* "color" of the lookahead constraint */ + pcolor co) /* "color" of the lookaround constraint */ { int n; struct subre *sub; struct dfa *d; - struct smalldfa sd; chr *end; + int satisfied; /* Since this is recursive, it could be driven to stack overflow */ if (STACK_TOO_DEEP(v->re)) @@ -635,19 +759,35 @@ lacon(struct vars * v, } n = co - pcnfa->ncolors; - assert(n < v->g->nlacons && v->g->lacons != NULL); + assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); FDEBUG(("=== testing lacon %d\n", n)); sub = &v->g->lacons[n]; - d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd); + d = getladfa(v, n); if (d == NULL) - { - ERR(REG_ESPACE); return 0; + if (LATYPE_IS_AHEAD(sub->subno)) + { + /* used to use longest() here, but shortest() could be much cheaper */ + end = shortest(v, d, cp, cp, v->stop, + (chr **) NULL, (int *) NULL); + satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL); + } + else + { + /* + * To avoid doing O(N^2) work when repeatedly testing a lookbehind + * constraint in an N-character string, we use matchuntil() which can + * cache the DFA state across calls. We only need to restart if the + * probe point decreases, which is not common. The NFA we're using is + * a search NFA, so it doesn't mind scanning over stuff before the + * nominal match. + */ + satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]); + if (!LATYPE_IS_POS(sub->subno)) + satisfied = !satisfied; } - end = longest(v, d, cp, v->stop, (int *) NULL); - freedfa(d); - FDEBUG(("=== lacon %d match %d\n", n, (end != NULL))); - return (sub->subno) ? (end != NULL) : (end == NULL); + FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied)); + return satisfied; } /* diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 8a21f2cb7870b544165d363f62a0424f067e23cb..82659a0f2f468532662ac29583c9f536e727a1be 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -112,7 +112,10 @@ struct vars chr *search_start; /* search start of string */ chr *stop; /* just past end of string */ int err; /* error code if any (0 none) */ - struct dfa **subdfas; /* per-subre DFAs */ + struct dfa **subdfas; /* per-tree-subre DFAs */ + struct dfa **ladfas; /* per-lacon-subre DFAs */ + struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */ + chr **lblastcp; /* per-lacon-subre lookbehind restart data */ struct smalldfa dfa1; struct smalldfa dfa2; }; @@ -132,6 +135,7 @@ struct vars */ /* === regexec.c === */ static struct dfa *getsubdfa(struct vars *, struct subre *); +static struct dfa *getladfa(struct vars *, int); static int find(struct vars *, struct cnfa *, struct colormap *); static int cfind(struct vars *, struct cnfa *, struct colormap *); static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **); @@ -149,6 +153,7 @@ static int creviterdissect(struct vars *, struct subre *, chr *, chr *); /* === rege_dfa.c === */ static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *); +static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **); static chr *lastcold(struct vars *, struct dfa *); static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *); static void freedfa(struct dfa *); @@ -226,21 +231,54 @@ pg_regexec(regex_t *re, v->search_start = (chr *) string + search_start; v->stop = (chr *) string + len; v->err = 0; + v->subdfas = NULL; + v->ladfas = NULL; + v->lblastcss = NULL; + v->lblastcp = NULL; + /* below this point, "goto cleanup" will behave sanely */ + assert(v->g->ntree >= 0); n = (size_t) v->g->ntree; if (n <= LOCALDFAS) v->subdfas = subdfas; else - v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); - if (v->subdfas == NULL) { - if (v->pmatch != pmatch && v->pmatch != mat) - FREE(v->pmatch); - return REG_ESPACE; + v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->subdfas == NULL) + { + st = REG_ESPACE; + goto cleanup; + } } for (i = 0; i < n; i++) v->subdfas[i] = NULL; + assert(v->g->nlacons >= 0); + n = (size_t) v->g->nlacons; + if (n > 0) + { + v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *)); + if (v->ladfas == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + for (i = 0; i < n; i++) + v->ladfas[i] = NULL; + v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *)); + v->lblastcp = (chr **) MALLOC(n * sizeof(chr *)); + if (v->lblastcss == NULL || v->lblastcp == NULL) + { + st = REG_ESPACE; + goto cleanup; + } + for (i = 0; i < n; i++) + { + v->lblastcss[i] = NULL; + v->lblastcp[i] = NULL; + } + } + /* do it */ assert(v->g->tree != NULL); if (backref) @@ -257,22 +295,40 @@ pg_regexec(regex_t *re, } /* clean up */ +cleanup: if (v->pmatch != pmatch && v->pmatch != mat) FREE(v->pmatch); - n = (size_t) v->g->ntree; - for (i = 0; i < n; i++) + if (v->subdfas != NULL) + { + n = (size_t) v->g->ntree; + for (i = 0; i < n; i++) + { + if (v->subdfas[i] != NULL) + freedfa(v->subdfas[i]); + } + if (v->subdfas != subdfas) + FREE(v->subdfas); + } + if (v->ladfas != NULL) { - if (v->subdfas[i] != NULL) - freedfa(v->subdfas[i]); + n = (size_t) v->g->nlacons; + for (i = 0; i < n; i++) + { + if (v->ladfas[i] != NULL) + freedfa(v->ladfas[i]); + } + FREE(v->ladfas); } - if (v->subdfas != subdfas) - FREE(v->subdfas); + if (v->lblastcss != NULL) + FREE(v->lblastcss); + if (v->lblastcp != NULL) + FREE(v->lblastcp); return st; } /* - * getsubdfa - create or re-fetch the DFA for a subre node + * getsubdfa - create or re-fetch the DFA for a tree subre node * * We only need to create the DFA once per overall regex execution. * The DFA will be freed by the cleanup step in pg_regexec(). @@ -290,6 +346,28 @@ getsubdfa(struct vars * v, return v->subdfas[t->id]; } +/* + * getladfa - create or re-fetch the DFA for a LACON subre node + * + * Same as above, but for LACONs. + */ +static struct dfa * +getladfa(struct vars * v, + int n) +{ + assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL); + + if (v->ladfas[n] == NULL) + { + struct subre *sub = &v->g->lacons[n]; + + v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + return NULL; + } + return v->ladfas[n]; +} + /* * find - find a match for the main NFA (no-complications case) */ diff --git a/src/backend/regex/regexport.c b/src/backend/regex/regexport.c index c5524ae492756e7aaf33149b1de9d741616618db..91340719e88f87f9ba86bea276539881872c51bc 100644 --- a/src/backend/regex/regexport.c +++ b/src/backend/regex/regexport.c @@ -6,7 +6,7 @@ * In this implementation, the NFA defines a necessary but not sufficient * condition for a string to match the regex: that is, there can be strings * that match the NFA but don't match the full regex, but not vice versa. - * Thus, for example, it is okay for the functions below to ignore lookahead + * Thus, for example, it is okay for the functions below to ignore lookaround * constraints, which merely constrain the string some more. * * Notice that these functions return info into caller-provided arrays diff --git a/src/backend/regex/regprefix.c b/src/backend/regex/regprefix.c index ce41620a0b4b1b740955da99aacdd5f502de9b36..86928453260b20d9df647d4cc284cd308e130106 100644 --- a/src/backend/regex/regprefix.c +++ b/src/backend/regex/regprefix.c @@ -36,7 +36,7 @@ static int findprefix(struct cnfa * cnfa, struct colormap * cm, * the common prefix or exact value, of length *slength (measured in chrs * not bytes!). * - * This function does not analyze all complex cases (such as lookahead + * This function does not analyze all complex cases (such as lookaround * constraints) exactly. Therefore it is possible that some strings matching * the reported prefix or exact-match string do not satisfy the regex. But * it should never be the case that a string satisfying the regex does not diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h index 5e1b692d26c130d09cbe4f7e3150ba6cbba6f66a..2f89dc9326bfab16a10bd6ba070a6b1230c3ff0c 100644 --- a/src/include/regex/regex.h +++ b/src/include/regex/regex.h @@ -58,7 +58,7 @@ typedef struct size_t re_nsub; /* number of subexpressions */ long re_info; /* information about RE */ #define REG_UBACKREF 000001 -#define REG_ULOOKAHEAD 000002 +#define REG_ULOOKAROUND 000002 #define REG_UBOUNDS 000004 #define REG_UBRACES 000010 #define REG_UBSALNUM 000020 diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 19fe991c74faa5213261ce3b060071a8c7f2eec2..2ceffa6563b04b7b9024f7dbebf2c37a8c392c31 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -89,13 +89,19 @@ */ #define NOTREACHED 0 -#define xxx 1 #define DUPMAX _POSIX2_RE_DUP_MAX #define DUPINF (DUPMAX+1) #define REMAGIC 0xfed7 /* magic number for main struct */ +/* Type codes for lookaround constraints */ +#define LATYPE_AHEAD_POS 03 /* positive lookahead */ +#define LATYPE_AHEAD_NEG 02 /* negative lookahead */ +#define LATYPE_BEHIND_POS 01 /* positive lookbehind */ +#define LATYPE_BEHIND_NEG 00 /* negative lookbehind */ +#define LATYPE_IS_POS(la) ((la) & 01) +#define LATYPE_IS_AHEAD(la) ((la) & 02) /* @@ -351,7 +357,7 @@ struct nfa * * The non-dummy carc structs are of two types: plain arcs and LACON arcs. * Plain arcs just store the transition color number as "co". LACON arcs - * store the lookahead constraint number plus cnfa.ncolors as "co". LACON + * store the lookaround constraint number plus cnfa.ncolors as "co". LACON * arcs can be distinguished from plain by testing for co >= cnfa.ncolors. */ struct carc @@ -365,7 +371,7 @@ struct cnfa int nstates; /* number of states */ int ncolors; /* number of colors (max color in use + 1) */ int flags; -#define HASLACONS 01 /* uses lookahead constraints */ +#define HASLACONS 01 /* uses lookaround constraints */ int pre; /* setup state number */ int post; /* teardown state number */ color bos[2]; /* colors, if any, assigned to BOS and BOL */ @@ -433,7 +439,8 @@ struct subre #define PREF2(f1, f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2)) #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) short id; /* ID of subre (1..ntree-1) */ - int subno; /* subexpression number (for 'b' and '(') */ + int subno; /* subexpression number for 'b' and '(', or + * LATYPE code for lookaround constraint */ short min; /* min repetitions for iteration or backref */ short max; /* max repetitions for iteration or backref */ struct subre *left; /* left child, if any (also freelist chain) */ @@ -479,6 +486,7 @@ struct guts int ntree; /* number of subre's, plus one */ struct colormap cmap; int FUNCPTR(compare, (const chr *, const chr *, size_t)); - struct subre *lacons; /* lookahead-constraint vector */ - int nlacons; /* size of lacons */ + struct subre *lacons; /* lookaround-constraint vector */ + int nlacons; /* size of lacons[]; note that only slots + * numbered 1 .. nlacons-1 are used */ }; diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out index be151858a3884a9c9f32d64608d80e959e8f912c..f0e2fc9eb896786c3206ad88e68868867be317fe 100644 --- a/src/test/regress/expected/regex.out +++ b/src/test/regress/expected/regex.out @@ -90,6 +90,175 @@ select substring('a' from '((a)+)'); a (1 row) +-- Test lookahead constraints +select regexp_matches('ab', 'a(?=b)b*'); + regexp_matches +---------------- + {ab} +(1 row) + +select regexp_matches('a', 'a(?=b)b*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('abc', 'a(?=b)b*(?=c)c*'); + regexp_matches +---------------- + {abc} +(1 row) + +select regexp_matches('ab', 'a(?=b)b*(?=c)c*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('ab', 'a(?!b)b*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('a', 'a(?!b)b*'); + regexp_matches +---------------- + {a} +(1 row) + +select regexp_matches('b', '(?=b)b'); + regexp_matches +---------------- + {b} +(1 row) + +select regexp_matches('a', '(?=b)b'); + regexp_matches +---------------- +(0 rows) + +-- Test lookbehind constraints +select regexp_matches('abb', '(?<=a)b*'); + regexp_matches +---------------- + {bb} +(1 row) + +select regexp_matches('a', 'a(?<=a)b*'); + regexp_matches +---------------- + {a} +(1 row) + +select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*'); + regexp_matches +---------------- + {abc} +(1 row) + +select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*'); + regexp_matches +---------------- + {ab} +(1 row) + +select regexp_matches('ab', 'a*(?<!a)b*'); + regexp_matches +---------------- + {""} +(1 row) + +select regexp_matches('ab', 'a*(?<!a)b+'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('b', 'a*(?<!a)b+'); + regexp_matches +---------------- + {b} +(1 row) + +select regexp_matches('a', 'a(?<!a)b*'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('b', '(?<=b)b'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('foobar', '(?<=f)b+'); + regexp_matches +---------------- +(0 rows) + +select regexp_matches('foobar', '(?<=foo)b+'); + regexp_matches +---------------- + {b} +(1 row) + +select regexp_matches('foobar', '(?<=oo)b+'); + regexp_matches +---------------- + {b} +(1 row) + +-- Test optimization of single-chr-or-bracket-expression lookaround constraints +select 'xz' ~ 'x(?=[xy])'; + ?column? +---------- + f +(1 row) + +select 'xy' ~ 'x(?=[xy])'; + ?column? +---------- + t +(1 row) + +select 'xz' ~ 'x(?![xy])'; + ?column? +---------- + t +(1 row) + +select 'xy' ~ 'x(?![xy])'; + ?column? +---------- + f +(1 row) + +select 'x' ~ 'x(?![xy])'; + ?column? +---------- + t +(1 row) + +select 'xyy' ~ '(?<=[xy])yy+'; + ?column? +---------- + t +(1 row) + +select 'zyy' ~ '(?<=[xy])yy+'; + ?column? +---------- + f +(1 row) + +select 'xyy' ~ '(?<![xy])yy+'; + ?column? +---------- + f +(1 row) + +select 'zyy' ~ '(?<![xy])yy+'; + ?column? +---------- + t +(1 row) + -- Test conversion of regex patterns to indexable conditions explain (costs off) select * from pg_proc where proname ~ 'abc'; QUERY PLAN diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql index c59fa35f24d81f5e2c4aceae0ada1ca3832f0ac4..d3030af295d3852714c2dfa13ea2ca9597ef686d 100644 --- a/src/test/regress/sql/regex.sql +++ b/src/test/regress/sql/regex.sql @@ -25,6 +25,41 @@ select substring('asd TO foo' from ' TO (([a-z0-9._]+|"([^"]+|"")+")+)'); select substring('a' from '((a))+'); select substring('a' from '((a)+)'); +-- Test lookahead constraints +select regexp_matches('ab', 'a(?=b)b*'); +select regexp_matches('a', 'a(?=b)b*'); +select regexp_matches('abc', 'a(?=b)b*(?=c)c*'); +select regexp_matches('ab', 'a(?=b)b*(?=c)c*'); +select regexp_matches('ab', 'a(?!b)b*'); +select regexp_matches('a', 'a(?!b)b*'); +select regexp_matches('b', '(?=b)b'); +select regexp_matches('a', '(?=b)b'); + +-- Test lookbehind constraints +select regexp_matches('abb', '(?<=a)b*'); +select regexp_matches('a', 'a(?<=a)b*'); +select regexp_matches('abc', 'a(?<=a)b*(?<=b)c*'); +select regexp_matches('ab', 'a(?<=a)b*(?<=b)c*'); +select regexp_matches('ab', 'a*(?<!a)b*'); +select regexp_matches('ab', 'a*(?<!a)b+'); +select regexp_matches('b', 'a*(?<!a)b+'); +select regexp_matches('a', 'a(?<!a)b*'); +select regexp_matches('b', '(?<=b)b'); +select regexp_matches('foobar', '(?<=f)b+'); +select regexp_matches('foobar', '(?<=foo)b+'); +select regexp_matches('foobar', '(?<=oo)b+'); + +-- Test optimization of single-chr-or-bracket-expression lookaround constraints +select 'xz' ~ 'x(?=[xy])'; +select 'xy' ~ 'x(?=[xy])'; +select 'xz' ~ 'x(?![xy])'; +select 'xy' ~ 'x(?![xy])'; +select 'x' ~ 'x(?![xy])'; +select 'xyy' ~ '(?<=[xy])yy+'; +select 'zyy' ~ '(?<=[xy])yy+'; +select 'xyy' ~ '(?<![xy])yy+'; +select 'zyy' ~ '(?<![xy])yy+'; + -- Test conversion of regex patterns to indexable conditions explain (costs off) select * from pg_proc where proname ~ 'abc'; explain (costs off) select * from pg_proc where proname ~ '^abc';