diff --git a/src/backend/regex/README b/src/backend/regex/README index 3fd58c000119a24d61dff594baec28a27a4437f0..89ba6a62ea2f70bfda729f03efedba4b9b6fce9b 100644 --- a/src/backend/regex/README +++ b/src/backend/regex/README @@ -102,15 +102,15 @@ consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are either plain regular expressions (which are executed as DFAs in the manner described above) or back-references (which try to match the input to some previous substring). Non-leaf nodes are capture nodes (which save the -location of the substring currently matching their child node) or -concatenation or alternation nodes. At execution time, the executor -recursively scans the tree. At concatenation or alternation nodes, -it considers each possible alternative way of matching the input string, -ie each place where the string could be split for a concatenation, or each -child node for an alternation. It tries the next alternative if the match -fails according to the child nodes. This is exactly the sort of -backtracking search done by a traditional NFA regex engine. If there are -many tree levels it can get very slow. +location of the substring currently matching their child node), +concatenation, alternation, or iteration nodes. At execution time, the +executor recursively scans the tree. At concatenation, alternation, or +iteration nodes, it considers each possible alternative way of matching the +input string, that is, each place where the string could be split for a +concatenation or iteration, or each child node for an alternation. It +tries the next alternative if the match fails according to the child nodes. +This is exactly the sort of backtracking search done by a traditional NFA +regex engine. If there are many tree levels it can get very slow. But all is not lost: we can still be smarter than the average pure NFA engine. 
To do this, each subre node has an associated DFA, which diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index 6b80140e90940b4a348c342090e26fdd0bc82c8f..b84d0c3af55f04873bc61c8b843af3b99c865b4a 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -1036,11 +1036,17 @@ parseqatom(struct vars * v, /*---------- * Prepare a general-purpose state skeleton. * - * ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp] - * / / - * [lp] ----> [s2] ----bypass--------------------- + * In the no-backrefs case, we want this: * - * where bypass is an empty, and prefix is some repetitions of atom + * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp] + * + * where prefix is some repetitions of atom. In the general case we need + * + * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp] + * + * where the iterator wraps around [begin] ---atom---> [end] + * + * We make the s state here for both cases; s2 is made below if needed *---------- */ s = newstate(v->nfa); /* first, new endpoints for the atom */ @@ -1051,11 +1057,9 @@ parseqatom(struct vars * v, NOERR(); atom->begin = s; atom->end = s2; - s = newstate(v->nfa); /* and spots for prefix and bypass */ - s2 = newstate(v->nfa); + s = newstate(v->nfa); /* set up starting state */ NOERR(); EMPTYARC(lp, s); - EMPTYARC(lp, s2); NOERR(); /* break remaining subRE into x{...} and what follows */ @@ -1089,28 +1093,9 @@ parseqatom(struct vars * v, } /* - * It's quantifier time. If the atom is just a BACKREF, we'll let it deal - * with quantifiers internally. Otherwise, the first step is to turn - * x{0,...} into x{1,...}|empty + * It's quantifier time. If the atom is just a backref, we'll let it deal + * with quantifiers internally. 
*/ - if (m == 0 && atomtype != BACKREF) - { - EMPTYARC(s2, atom->end); /* the bypass */ - assert(PREF(qprefer) != 0); - f = COMBINE(qprefer, atom->flags); - t = subre(v, '|', f, lp, atom->end); - NOERR(); - t->left = atom; - t->right = subre(v, '|', PREF(f), s2, atom->end); - NOERR(); - t->right->left = subre(v, '=', 0, s2, atom->end); - NOERR(); - *atomp = t; - atomp = &t->left; - m = 1; - } - - /* deal with the rest of the quantifier */ if (atomtype == BACKREF) { /* special case: backrefs have internal quantifiers */ @@ -1120,17 +1105,25 @@ parseqatom(struct vars * v, atom->min = (short) m; atom->max = (short) n; atom->flags |= COMBINE(qprefer, atom->flags); + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } else if (m == 1 && n == 1) { /* no/vacuous quantifier: done */ EMPTYARC(s, atom->begin); /* empty prefix */ + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; } - else + else if (m > 0 && !(atom->flags & BACKR)) { /* - * Turn x{m,n} into x{m-1,n-1}x, with capturing parens in only the - * second x + * If there are no backrefs involved, we can turn x{m,n} into + * x{m-1,n-1}x, with capturing parens in only the second x. This + * is valid because we only care about capturing matches from the + * final iteration of the quantifier. It's a win because we can + * implement the backref-free left side as a plain DFA node, since + * we don't really care where its submatches are. 
*/ dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin); assert(m >= 1 && m != INFINITY && n >= 1); @@ -1142,16 +1135,36 @@ parseqatom(struct vars * v, NOERR(); t->right = atom; *atomp = t; + /* rest of branch can be strung starting from atom->end */ + s2 = atom->end; + } + else + { + /* general case: need an iteration node */ + s2 = newstate(v->nfa); + NOERR(); + moveouts(v->nfa, atom->end, s2); + NOERR(); + dupnfa(v->nfa, atom->begin, atom->end, s, s2); + repeat(v, s, s2, m, n); + f = COMBINE(qprefer, atom->flags); + t = subre(v, '*', f, s, s2); + NOERR(); + t->min = (short) m; + t->max = (short) n; + t->left = atom; + *atomp = t; + /* rest of branch is to be strung from iteration's end state */ } /* and finally, look after that postponed recursion */ t = top->right; if (!(SEE('|') || SEE(stopper) || SEE(EOS))) - t->right = parsebranch(v, stopper, type, atom->end, rp, 1); + t->right = parsebranch(v, stopper, type, s2, rp, 1); else { - EMPTYARC(atom->end, rp); - t->right = subre(v, '=', 0, atom->end, rp); + EMPTYARC(s2, rp); + t->right = subre(v, '=', 0, s2, rp); } assert(SEE('|') || SEE(stopper) || SEE(EOS)); t->flags |= COMBINE(t->flags, t->right->flags); @@ -1214,6 +1227,9 @@ scannum(struct vars * v) /* * repeat - replicate subNFA for quantifiers * + * The sub-NFA strung from lp to rp is modified to represent m to n + * repetitions of its initial contents. + * * The duplication sequences used here are chosen carefully so that any * pointers starting out pointing into the subexpression end up pointing into * the last occurrence. (Note that it may not be strung between the same @@ -1229,7 +1245,7 @@ repeat(struct vars * v, int n) { #define SOME 2 -#define INF 3 +#define INF 3 #define PAIR(x, y) ((x)*4 + (y)) #define REDUCE(x) ( ((x) == INFINITY) ? INF : (((x) > 1) ? 
SOME : (x)) ) const int rm = REDUCE(m); @@ -1603,7 +1619,7 @@ subre(struct vars * v, v->treechain = ret; } - assert(strchr("|.b(=", op) != NULL); + assert(strchr("=b|.*(", op) != NULL); ret->op = op; ret->flags = flags; diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c index 224da5064b69b9577856b21793bd9a92afb03377..ea16e39a6eddfc6267e7c8fc2977afab7cb17e49 100644 --- a/src/backend/regex/regexec.c +++ b/src/backend/regex/regexec.c @@ -140,11 +140,15 @@ static void subset(struct vars *, struct subre *, chr *, chr *); static int dissect(struct vars *, struct subre *, chr *, chr *); static int condissect(struct vars *, struct subre *, chr *, chr *); static int altdissect(struct vars *, struct subre *, chr *, chr *); +static int iterdissect(struct vars *, struct subre *, chr *, chr *); +static int reviterdissect(struct vars *, struct subre *, chr *, chr *); static int cdissect(struct vars *, struct subre *, chr *, chr *); static int ccondissect(struct vars *, struct subre *, chr *, chr *); static int crevdissect(struct vars *, struct subre *, chr *, chr *); static int cbrdissect(struct vars *, struct subre *, chr *, chr *); static int caltdissect(struct vars *, struct subre *, chr *, chr *); +static int citerdissect(struct vars *, struct subre *, chr *, chr *); +static int creviterdissect(struct vars *, struct subre *, chr *, chr *); /* === rege_dfa.c === */ static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *); @@ -563,14 +567,17 @@ dissect(struct vars * v, case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); return REG_OKAY; /* no action, parent did the work */ - case '|': /* alternation */ - assert(t->left != NULL); - return altdissect(v, t, begin, end); case 'b': /* back ref -- shouldn't be calling us! 
*/ return REG_ASSERT; case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); return condissect(v, t, begin, end); + case '|': /* alternation */ + assert(t->left != NULL); + return altdissect(v, t, begin, end); + case '*': /* iteration */ + assert(t->left != NULL); + return iterdissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); @@ -696,6 +703,375 @@ altdissect(struct vars * v, return REG_ASSERT; /* none of them matched?!? */ } +/* + * iterdissect - iteration subexpression matches (uncomplicated) + */ +static int /* regexec return code */ +iterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(begin <= end); + + if (t->left->flags & SHORTER) /* reverse scan */ + return reviterdissect(v, t, begin, end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. 
+ */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("iter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the string 
into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + er = dissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_ASSERT; +} + +/* + * reviterdissect - shortest-first iteration subexpression matches + */ +static int /* regexec return code */ +reviterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); 
+ + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("reviter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. 
+ */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + er = dissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider longer versions of the current sub-match. 
+ */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted - shouldn't happen in uncomplicated mode */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_ASSERT; +} + /* * cdissect - determine subexpression matches (with complications) * The retry memory stores the offset of the trial midpoint from begin, @@ -717,15 +1093,18 @@ cdissect(struct vars * v, case '=': /* terminal node */ assert(t->left == NULL && t->right == NULL); return REG_OKAY; /* no action, parent did the work */ - case '|': /* alternation */ - assert(t->left != NULL); - return caltdissect(v, t, begin, end); case 'b': /* back reference */ assert(t->left == NULL && t->right == NULL); return cbrdissect(v, t, begin, end); case '.': /* concatenation */ assert(t->left != NULL && t->right != NULL); return ccondissect(v, t, begin, end); + case '|': /* alternation */ + assert(t->left != NULL); + return caltdissect(v, t, begin, end); + case '*': /* iteration */ + assert(t->left != NULL); + return citerdissect(v, t, begin, end); case '(': /* capturing */ assert(t->left != NULL && t->right == NULL); assert(t->subno > 0); @@ -847,7 +1226,7 @@ ccondissect(struct vars * v, } /* - * crevdissect - determine backref shortest-first subexpression matches + * crevdissect - shortest-first concatenation subexpression matches * The retry memory stores the offset of the trial midpoint from begin, * plus 1 so that 0 uniquely means "clean slate". 
*/ @@ -1088,6 +1467,377 @@ caltdissect(struct vars * v, return caltdissect(v, t->right, begin, end); } +/* + * citerdissect - iteration subexpression matches (with complications) + */ +static int /* regexec return code */ +citerdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(begin <= end); + + if (t->left->flags & SHORTER) /* reverse scan */ + return creviterdissect(v, t, begin, end); + + /* + * If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. 
+ */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("citer %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. + */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = end; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can shorten previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to shorten some previous match */ + k--; + goto backtrack; + } + + /* reject zero-length match unless necessary to achieve min */ + if (endpts[k] == endpts[k - 1] && + (k >= min_matches || min_matches - k < end - endpts[k])) + goto backtrack; + + k++; + limit = end; + continue; + } + + /* + * We've identified a way to divide the 
string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zapmem(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider shorter versions of the current sub-match. However, + * we'll only ask for a zero-length match if necessary. + */ + while (k > 0) + { + chr *prev_end = endpts[k - 1]; + + if (endpts[k] > prev_end) + { + limit = endpts[k] - 1; + if (limit > prev_end || + (k < min_matches && min_matches - k >= end - prev_end)) + { + /* break out of backtrack loop, continue the outer one */ + break; + } + } + /* can't shorten k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_NOMATCH; +} + +/* + * creviterdissect - shortest-first iteration subexpression matches + */ +static int /* regexec return code */ +creviterdissect(struct vars * v, + struct subre * t, + chr *begin, /* beginning of relevant substring */ + chr *end) /* end of same */ +{ + struct dfa *d; + chr **endpts; + chr *limit; + int min_matches; + size_t max_matches; + int nverified; + int k; + int i; + int er; + + assert(t->op == '*'); + assert(t->left != NULL && t->left->cnfa.nstates > 0); + assert(t->left->flags & SHORTER); + assert(begin <= end); + + /* + 
* If zero matches are allowed, and target string is empty, just declare + * victory. OTOH, if target string isn't empty, zero matches can't work + * so we pretend the min is 1. + */ + min_matches = t->min; + if (min_matches <= 0) + { + if (begin == end) + return REG_OKAY; + min_matches = 1; + } + + /* + * We need workspace to track the endpoints of each sub-match. Normally + * we consider only nonzero-length sub-matches, so there can be at most + * end-begin of them. However, if min is larger than that, we will also + * consider zero-length sub-matches in order to find enough matches. + * + * For convenience, endpts[0] contains the "begin" pointer and we store + * sub-match endpoints in endpts[1..max_matches]. + */ + max_matches = end - begin; + if (max_matches > t->max && t->max != INFINITY) + max_matches = t->max; + if (max_matches < min_matches) + max_matches = min_matches; + endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *)); + if (endpts == NULL) + return REG_ESPACE; + endpts[0] = begin; + + d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC); + if (ISERR()) + { + FREE(endpts); + return v->err; + } + MDEBUG(("creviter %d\n", t->retry)); + + /* + * Our strategy is to first find a set of sub-match endpoints that are + * valid according to the child node's DFA, and then recursively dissect + * each sub-match to confirm validity. If any validity check fails, + * backtrack the last sub-match and try again. And, when we next try for + * a validity check, we need not recheck any successfully verified + * sub-matches that we didn't move the endpoints of. nverified remembers + * how many sub-matches are currently known okay. 
+ */ + + /* initialize to consider first sub-match */ + nverified = 0; + k = 1; + limit = begin; + + /* iterate until satisfaction or failure */ + while (k > 0) + { + /* disallow zero-length match unless necessary to achieve min */ + if (limit == endpts[k - 1] && + limit != end && + (k >= min_matches || min_matches - k < end - limit)) + limit++; + + /* try to find an endpoint for the k'th sub-match */ + endpts[k] = shortest(v, d, endpts[k - 1], limit, end, + (chr **) NULL, (int *) NULL); + if (endpts[k] == NULL) + { + /* no match possible, so see if we can lengthen previous one */ + k--; + goto backtrack; + } + MDEBUG(("%d: working endpoint %d: %ld\n", + t->retry, k, LOFF(endpts[k]))); + + /* k'th sub-match can no longer be considered verified */ + if (nverified >= k) + nverified = k - 1; + + if (endpts[k] != end) + { + /* haven't reached end yet, try another iteration if allowed */ + if (k >= max_matches) + { + /* must try to lengthen some previous match */ + k--; + goto backtrack; + } + + k++; + limit = endpts[k - 1]; + continue; + } + + /* + * We've identified a way to divide the string into k sub-matches + * that works so far as the child DFA can tell. If k is an allowed + * number of matches, start the slow part: recurse to verify each + * sub-match. We always have k <= max_matches, needn't check that. + */ + if (k < min_matches) + goto backtrack; + + MDEBUG(("%d: verifying %d..%d\n", t->retry, nverified + 1, k)); + + for (i = nverified + 1; i <= k; i++) + { + zapmem(v, t->left); + er = cdissect(v, t->left, endpts[i - 1], endpts[i]); + if (er == REG_OKAY) + { + nverified = i; + continue; + } + if (er == REG_NOMATCH) + break; + /* oops, something failed */ + freedfa(d); + FREE(endpts); + return er; + } + + if (i > k) + { + /* satisfaction */ + MDEBUG(("%d successful\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_OKAY; + } + + /* match failed to verify, so backtrack */ + +backtrack: + /* + * Must consider longer versions of the current sub-match. 
+ */ + while (k > 0) + { + if (endpts[k] < end) + { + limit = endpts[k] + 1; + /* break out of backtrack loop, continue the outer one */ + break; + } + /* can't lengthen k'th sub-match any more, consider previous one */ + k--; + } + } + + /* all possibilities exhausted */ + MDEBUG(("%d failed\n", t->retry)); + freedfa(d); + FREE(endpts); + return REG_NOMATCH; +} + #include "rege_dfa.c" diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index fb6789b560f3899b7c77feef4e0e91ac1f2d9c5d..d420ea8316e18f2ff009af5c618027cf3fae8256 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -372,10 +372,28 @@ struct cnfa /* * subexpression tree + * + * "op" is one of: + * '=' plain regex without interesting substructure (implemented as DFA) + * 'b' back-reference (has no substructure either) + * '(' capture node: captures the match of its single child + * '.' concatenation: matches a match for left, then a match for right + * '|' alternation: matches a match for left or a match for right + * '*' iteration: matches some number of matches of its single child + * + * Note: the right child of an alternation must be another alternation or + * NULL; hence, an N-way branch requires N alternation nodes, not N-1 as you + * might expect. This could stand to be changed. Actually I'd rather see + * a single alternation node with N children, but that will take revising + * the representation of struct subre. + * + * Note: when a backref is directly quantified, we stick the min/max counts + * into the backref rather than plastering an iteration node on top. This is + * for efficiency: there is no need to search for possible division points. */ struct subre { - char op; /* '|', '.' 
(concat), 'b' (backref), '(', '=' */ + char op; /* see type codes above */ char flags; #define LONGER 01 /* prefers longer match */ #define SHORTER 02 /* prefers shorter match */ @@ -393,8 +411,8 @@ struct subre #define COMBINE(f1, f2) (UP((f1)|(f2)) | PREF2(f1, f2)) short retry; /* index into retry memory */ int subno; /* subexpression number (for 'b' and '(') */ - short min; /* min repetitions, for backref only */ - short max; /* max repetitions, for backref only */ + short min; /* min repetitions for iteration or backref */ + short max; /* max repetitions for iteration or backref */ struct subre *left; /* left child, if any (also freelist chain) */ struct subre *right; /* right child, if any */ struct state *begin; /* outarcs from here... */ diff --git a/src/test/regress/expected/regex.out b/src/test/regress/expected/regex.out index 5694908163af856f78940a9e31d75a370b4bfe09..4acc4a47a03bca1ad6303765c519f414fdbf5c42 100644 --- a/src/test/regress/expected/regex.out +++ b/src/test/regress/expected/regex.out @@ -34,3 +34,40 @@ select 'b' ~ '^([bc])\1*$' as t; t (1 row) +-- Test quantified backref within a larger expression +select 'abc abc abc' ~ '^(\w+)( \1)+$' as t; + t +--- + t +(1 row) + +select 'abc abd abc' ~ '^(\w+)( \1)+$' as f; + f +--- + f +(1 row) + +select 'abc abc abd' ~ '^(\w+)( \1)+$' as f; + f +--- + f +(1 row) + +select 'abc abc abc' ~ '^(.+)( \1)+$' as t; + t +--- + t +(1 row) + +select 'abc abd abc' ~ '^(.+)( \1)+$' as f; + f +--- + f +(1 row) + +select 'abc abc abd' ~ '^(.+)( \1)+$' as f; + f +--- + f +(1 row) + diff --git a/src/test/regress/sql/regex.sql b/src/test/regress/sql/regex.sql index 242a81ef3298a68d79cf4af971e17935f3630964..b5315a3df6ddda340d2bf6f27137aba1210267f1 100644 --- a/src/test/regress/sql/regex.sql +++ b/src/test/regress/sql/regex.sql @@ -11,3 +11,11 @@ select 'ccc' ~ '^([bc])\1*$' as t; select 'xxx' ~ '^([bc])\1*$' as f; select 'bbc' ~ '^([bc])\1*$' as f; select 'b' ~ '^([bc])\1*$' as t; + +-- Test quantified backref within a 
larger expression +select 'abc abc abc' ~ '^(\w+)( \1)+$' as t; +select 'abc abd abc' ~ '^(\w+)( \1)+$' as f; +select 'abc abc abd' ~ '^(\w+)( \1)+$' as f; +select 'abc abc abc' ~ '^(.+)( \1)+$' as t; +select 'abc abd abc' ~ '^(.+)( \1)+$' as f; +select 'abc abc abd' ~ '^(.+)( \1)+$' as f;