diff --git a/src/backend/regex/README b/src/backend/regex/README new file mode 100644 index 0000000000000000000000000000000000000000..3fd58c000119a24d61dff594baec28a27a4437f0 --- /dev/null +++ b/src/backend/regex/README @@ -0,0 +1,291 @@ +Implementation notes about Henry Spencer's regex library +======================================================== + +If Henry ever had any internals documentation, he didn't publish it. +So this file is an attempt to reverse-engineer some docs. + +General source-file layout +-------------------------- + +There are four separately-compilable source files, each exposing exactly +one exported function: + regcomp.c: pg_regcomp + regexec.c: pg_regexec + regerror.c: pg_regerror + regfree.c: pg_regfree +(The pg_ prefixes were added by the Postgres project to distinguish this +library version from any similar one that might be present on a particular +system. They'd need to be removed or replaced in any standalone version +of the library.) + +There are additional source files regc_*.c that are #include'd in regcomp, +and similarly additional source files rege_*.c that are #include'd in +regexec. This was done to avoid exposing internal symbols globally; +all functions not meant to be part of the library API are static. + +(Actually the above is a lie in one respect: there is one more global +symbol, pg_set_regex_collation in regcomp. It is not meant to be part of +the API, but it has to be global because both regcomp and regexec call it. +It'd be better to get rid of that, as well as the static variables it +sets, in favor of keeping the needed locale state in the regex structs. +We have not done this yet for lack of a design for how to add +application-specific state to the structs.) 
+ +What's where in src/backend/regex/: + +regcomp.c Top-level regex compilation code +regc_color.c Color map management +regc_cvec.c Character vector (cvec) management +regc_lex.c Lexer +regc_nfa.c NFA handling +regc_locale.c Application-specific locale code from Tcl project +regc_pg_locale.c Postgres-added application-specific locale code +regexec.c Top-level regex execution code +rege_dfa.c DFA creation and execution +regerror.c pg_regerror: generate text for a regex error code +regfree.c pg_regfree: API to free a no-longer-needed regex_t + +The locale-specific code is concerned primarily with case-folding and with +expanding locale-specific character classes, such as [[:alnum:]]. It +really needs refactoring if this is ever to become a standalone library. + +The header files for the library are in src/include/regex/: + +regcustom.h Customizes library for particular application +regerrs.h Error message list +regex.h Exported API +regguts.h Internals declarations + + +DFAs, NFAs, and all that +------------------------ + +This library is a hybrid DFA/NFA regex implementation. (If you've never +heard either of those terms, get thee to a first-year comp sci textbook.) +It might not be clear at first glance what that really means and how it +relates to what you'll see in the code. Here's what really happens: + +* Initial parsing of a regex generates an NFA representation, with number +of states approximately proportional to the length of the regexp. + +* The NFA is then optimized into a "compact NFA" representation, which is +basically the same data but without fields that are not going to be needed +at runtime. We do a little bit of cleanup too, such as removing +unreachable states that might be created as a result of the rather naive +transformation done by initial parsing. The cNFA representation is what +is passed from regcomp to regexec. 
+ +* Unlike traditional NFA-based regex engines, we do not execute directly +from the NFA representation, as that would require backtracking and so be +very slow in some cases. Rather, we execute a DFA, which ideally can +process an input string in linear time (O(M) for M characters of input) +without backtracking. Each state of the DFA corresponds to a set of +states of the NFA, that is, all the states that the NFA might have been in +upon reaching the current point in the input string. Therefore, an NFA +with N states might require as many as 2^N states in the corresponding +DFA, which could easily require unreasonable amounts of memory. We deal +with this by materializing states of the DFA lazily (only when needed) and +keeping them in a limited-size cache. The possible need to build the same +state of the DFA repeatedly makes this approach not truly O(M) time, but +in the worst case as much as O(M*N). That's still far better than the +worst case for a backtracking NFA engine. + +If that were the end of it, we'd just say this is a DFA engine, with the +use of NFAs being merely an implementation detail. However, a DFA engine +cannot handle some important regex features such as capturing parens and +back-references. If the parser finds that a regex uses these features +(collectively called "messy cases" in the code), then we have to use +NFA-style backtracking search after all. + +When using the NFA mode, the representation constructed by the parser +consists of a tree of sub-expressions ("subre"s). Leaf tree nodes are +either plain regular expressions (which are executed as DFAs in the manner +described above) or back-references (which try to match the input to some +previous substring). Non-leaf nodes are capture nodes (which save the +location of the substring currently matching their child node) or +concatenation or alternation nodes. At execution time, the executor +recursively scans the tree.
At concatenation or alternation nodes, +it considers each possible alternative way of matching the input string, +i.e., each place where the string could be split for a concatenation, or each +child node for an alternation. It tries the next alternative if the match +fails according to the child nodes. This is exactly the sort of +backtracking search done by a traditional NFA regex engine. If there are +many tree levels it can get very slow. + +But all is not lost: we can still be smarter than the average pure NFA +engine. To do this, each subre node has an associated DFA, which +represents what the node could possibly match insofar as a mathematically +pure regex can describe that, which basically means "no backrefs". +Before we perform any search of possible alternative sub-matches, we run +the DFA to see if it thinks the proposed substring could possibly match. +If not, we can reject the match immediately without iterating through many +possibilities. + +As an example, consider the regex "(a[bc]+)\1". The compiled +representation will have a top-level concatenation subre node. Its left +child is a capture node, and the child of that is a plain DFA node for +"a[bc]+". The concatenation's right child is a backref node for \1. +The DFA associated with the concatenation node will be "a[bc]+a[bc]+", +where the backref has been replaced by a copy of the DFA for its referent +expression. When executed, the concatenation node will have to search for +a possible division of the input string that allows its two child nodes to +each match their part of the string (and although this specific case can +only succeed when the division is at the middle, the code does not know +that, nor would it be true in general). However, we can first run the DFA +and quickly reject any input that doesn't contain two a's and some number +of b's and c's. If the DFA doesn't match, there is no need to recurse to +the two child nodes for each possible string division point.
In many +cases, this prefiltering makes the search run much faster than a pure NFA +engine could do. It is this behavior that justifies using the phrase +"hybrid DFA/NFA engine" to describe Spencer's library. + + +Colors and colormapping +----------------------- + +In many common regex patterns, there are large numbers of characters that +can be treated alike by the execution engine. A simple example is the +pattern "[[:alpha:]][[:alnum:]]*" for an identifier. Basically the engine +only needs to care whether an input symbol is a letter, a digit, or other. +We could build the NFA or DFA with a separate arc for each possible letter +and digit, but that's very wasteful of space and not so cheap to execute +either, especially when dealing with Unicode which can have thousands of +letters. Instead, the parser builds a "color map" that maps each possible +input symbol to a "color", or equivalence class. The NFA or DFA +representation then has arcs labeled with colors, not specific input +symbols. At execution, the first thing the executor does with each input +symbol is to look up its color in the color map, and then everything else +works from the color only. + +To build the colormap, we start by assigning every possible input symbol +the color WHITE, which means "other" (that is, at the end of parsing, the +symbols that are still WHITE are those not explicitly referenced anywhere +in the regex). When we see a simple literal character or a bracket +expression in the regex, we want to assign that character, or all the +characters represented by the bracket expression, a unique new color that +can be used to label the NFA arc corresponding to the state transition for +matching this character or bracket expression. 
The basic idea is: +first, change the color assigned to a character to some new value; +second, run through all the existing arcs in the partially-built NFA, +and for each one referencing the character's old color, add a parallel +arc referencing its new color (this keeps the reassignment from changing +the semantics of what we already built); and third, add a new arc with +the character's new color to the current pair of NFA states, denoting +that seeing this character allows the state transition to be made. + +This is complicated a bit by not wanting to create more colors +(equivalence classes) than absolutely necessary. In particular, if a +bracket expression mentions two characters that had the same color before, +they should still share the same color after we process the bracket, since +there is still not a need to distinguish them. But we do need to +distinguish them from other characters that previously had the same color +yet are not listed in the bracket expression. To mechanize this, the code +has a concept of "parent colors" and "subcolors", where a color's subcolor +is the new color that we are giving to any characters of that color while +parsing the current atom. (The word "parent" is a bit unfortunate here, +because it suggests a long-lived relationship, but a subcolor link really +only lasts for the duration of parsing a single atom.) In other words, +a subcolor link means that we are in process of splitting the parent color +into two colors (equivalence classes), depending on whether or not each +member character should be included by the current regex atom. + +As an example, suppose we have the regex "a\d\wx". Initially all possible +character codes are labeled WHITE (color 0). To parse the atom "a", we +create a new color (1), update "a"'s color map entry to 1, and create an +arc labeled 1 between the first two states of the NFA. Now we see \d, +which is really a bracket expression containing the digits "0"-"9". 
+First we process "0", which is currently WHITE, so we create a new color +(2), update "0"'s color map entry to 2, and create an arc labeled 2 +between the second and third states of the NFA. We also mark color WHITE +as having the subcolor 2, which means that future relabelings of WHITE +characters should also select 2 as the new color. Thus, when we process +"1", we won't create a new color but re-use 2. We update "1"'s color map +entry to 2, and then find that we don't need a new arc because there is +already one labeled 2 between the second and third states of the NFA. +Similarly for the other 8 digits, so there will be only one arc labeled 2 +between NFA states 2 and 3 for all members of this bracket expression. +At completion of processing of the bracket expression, we call okcolors() +which breaks all the existing parent/subcolor links; there is no longer a +marker saying that WHITE characters should be relabeled 2. (Note: +actually, we did the same creation and clearing of a subcolor link for the +primitive atom "a", but it didn't do anything very interesting.) Now we +come to the "\w" bracket expression, which for simplicity assume expands +to just "[a-z0-9]". We process "a", but observe that it is already the +sole member of its color 1. This means there is no need to subdivide that +equivalence class more finely, so we do not create any new color. We just +make an arc labeled 1 between the third and fourth NFA states. Next we +process "b", which is WHITE and far from the only WHITE character, so we +create a new color (3), link that as WHITE's subcolor, relabel "b" as +color 3, and make an arc labeled 3. As we process "c" through "z", each +is relabeled from WHITE to 3, but no new arc is needed. Now we come to +"0", which is not the only member of its color 2, so we suppose that a new +color is needed and create color 4. We link 4 as subcolor of 2, relabel +"0" as color 4 in the map, and add an arc for color 4. 
Next "1" through +"9" are similarly relabeled as color 4, with no additional arcs needed. +Having finished the bracket expression, we call okcolors(), which breaks +the subcolor links. okcolors() further observes that we have removed +every member of color 2 (the previous color of the digit characters). +Therefore, it runs through the partial NFA built so far and relabels arcs +labeled 2 to color 4; in particular the arc from NFA state 2 to state 3 is +relabeled color 4. Then it frees up color 2, since we have no more use +for that color. We now have an NFA in which transitions for digits are +consistently labeled with color 4. Last, we come to the atom "x". +"x" is currently labeled with color 3, and it's not the only member of +that color, so we realize that we now need to distinguish "x" from other +letters when we did not before. We create a new color, which might have +been 5 but instead we recycle the unused color 2. "x" is relabeled 2 in +the color map and 2 is linked as the subcolor of 3, and we add an arc for +2 between states 4 and 5 of the NFA. Now we call okcolors(), which breaks +the subcolor link between colors 3 and 2 and notices that both colors are +nonempty. Therefore, it also runs through the existing NFA arcs and adds +an additional arc labeled 2 wherever there is an arc labeled 3; this +action ensures that characters of color 2 (i.e., "x") will still be +considered as allowing any transitions they did before. We are now done +parsing the regex, and we have these final color assignments: + color 1: "a" + color 2: "x" + color 3: other letters + color 4: digits +and the NFA has these arcs: + states 1 -> 2 on color 1 (hence, "a" only) + states 2 -> 3 on color 4 (digits) + states 3 -> 4 on colors 1, 3, 4, and 2 (covering all \w characters) + states 4 -> 5 on color 2 ("x" only) +which can be seen to be a correct representation of the regex. 
+ +Given this summary, we can see we need the following operations for +colors: + +* A fast way to look up the current color assignment for any character + code. (This is needed during both parsing and execution, while the + remaining operations are needed only during parsing.) +* A way to alter the color assignment for any given character code. +* We must track the number of characters currently assigned to each + color, so that we can detect empty and singleton colors. +* We must track all existing NFA arcs of a given color, so that we + can relabel them at need, or add parallel arcs of a new color when + an existing color has to be subdivided. + +The last two of these are handled with the "struct colordesc" array and +the "colorchain" links in NFA arc structs. The color map proper (that +is, the per-character lookup array) is handled as a multi-level tree, +with each tree level indexed by one byte of a character's value. The +code arranges to not have more than one copy of bottom-level tree pages +that are all-the-same-color. + +Unfortunately, this design does not seem terribly efficient for common +cases such as a tree in which all Unicode letters are colored the same, +because there aren't that many places where we get a whole page all the +same color, except at the end of the map. (It also strikes me that given +PG's current restrictions on the range of Unicode values, we could use a +3-level rather than 4-level tree; but there's no provision for that in +regguts.h at the moment.) + +A bigger problem is that it just doesn't seem very reasonable to have to +consider each Unicode letter separately at regex parse time for a regex +such as "\w"; more than likely, a huge percentage of those codes will +never be seen at runtime. We need to fix things so that locale-based +character classes are somehow processed "symbolically" without making a +full expansion of their contents at parse time.
This would mean that we'd +have to be ready to call iswalpha() at runtime, but if that only happens +for high-code-value characters, it shouldn't be a big performance hit. diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c index fb6f06b5243f50bfad2cefa5c016d4e842791a3d..580a693161e89a3f6e84fc8c5149218469c85065 100644 --- a/src/backend/regex/regc_cvec.c +++ b/src/backend/regex/regc_cvec.c @@ -77,6 +77,7 @@ static void addchr(struct cvec * cv, /* character vector */ chr c) /* character to add */ { + assert(cv->nchrs < cv->chrspace); cv->chrs[cv->nchrs++] = (chr) c; } @@ -95,17 +96,27 @@ addrange(struct cvec * cv, /* character vector */ } /* - * getcvec - get a cvec, remembering it as v->cv + * getcvec - get a transient cvec, initialized to empty + * + * The returned cvec is valid only until the next call of getcvec, which + * typically will recycle the space. Callers should *not* free the cvec + * explicitly; it will be cleaned up when the struct vars is destroyed. + * + * This is typically used while interpreting bracket expressions. In that + * usage the cvec is only needed momentarily until we build arcs from it, + * so transientness is a convenient behavior. */ static struct cvec * getcvec(struct vars * v, /* context */ int nchrs, /* to hold this many chrs... */ int nranges) /* ... 
and this many ranges */ { + /* recycle existing transient cvec if large enough */ if (v->cv != NULL && nchrs <= v->cv->chrspace && nranges <= v->cv->rangespace) return clearcvec(v->cv); + /* nope, make a new one */ if (v->cv != NULL) freecvec(v->cv); v->cv = newcvec(nchrs, nranges); diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c index bd4d4c3761928b098c1e5aab030968ea8046b4ab..4f9da5b0468d53b997ddb52eb0b5a66a17696318 100644 --- a/src/backend/regex/regcomp.c +++ b/src/backend/regex/regcomp.c @@ -356,6 +356,7 @@ pg_regcomp(regex_t *re, ZAPCNFA(g->search); v->nfa = newnfa(v, v->cm, (struct nfa *) NULL); CNOERR(); + /* set up a reasonably-sized transient cvec for getcvec usage */ v->cv = newcvec(100, 20); if (v->cv == NULL) return freev(v, REG_ESPACE); diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h index 0cced701dbdc84578ff3d8df2dcbd91a65107064..fb6789b560f3899b7c77feef4e0e91ac1f2d9c5d 100644 --- a/src/include/regex/regguts.h +++ b/src/include/regex/regguts.h @@ -181,34 +181,52 @@ union tree #define tcolor colors.ccolor #define tptr ptrs.pptr -/* internal per-color descriptor structure for the color machinery */ +/* + * Per-color data structure for the compile-time color machinery + * + * If "sub" is not NOSUB then it is the number of the color's current + * subcolor, i.e. we are in process of dividing this color (character + * equivalence class) into two colors. See src/backend/regex/README for + * discussion of subcolors. + * + * Currently-unused colors have the FREECOL bit set and are linked into a + * freelist using their "sub" fields, but only if their color numbers are + * less than colormap.max. Any array entries beyond "max" are just garbage. 
+ */ struct colordesc { uchr nchrs; /* number of chars of this color */ - color sub; /* open subcolor (if any); free chain ptr */ -#define NOSUB COLORLESS - struct arc *arcs; /* color chain */ - int flags; + color sub; /* open subcolor, if any; or free-chain ptr */ +#define NOSUB COLORLESS /* value of "sub" when no open subcolor */ + struct arc *arcs; /* chain of all arcs of this color */ + int flags; /* bit values defined next */ #define FREECOL 01 /* currently free */ #define PSEUDO 02 /* pseudocolor, no real chars */ #define UNUSEDCOLOR(cd) ((cd)->flags&FREECOL) union tree *block; /* block of solid color, if any */ }; -/* the color map itself */ +/* + * The color map itself + * + * Only the "tree" part is used at execution time, and that only via the + * GETCOLOR() macro. Possibly that should be separated from the compile-time + * data. + */ struct colormap { int magic; #define CMMAGIC 0x876 struct vars *v; /* for compile error reporting */ - size_t ncds; /* number of colordescs */ - size_t max; /* highest in use */ + size_t ncds; /* allocated length of colordescs array */ + size_t max; /* highest color number currently in use */ color free; /* beginning of free chain (if non-0) */ - struct colordesc *cd; + struct colordesc *cd; /* pointer to array of colordescs */ #define CDEND(cm) (&(cm)->cd[(cm)->max + 1]) + /* If we need up to NINLINECDS, we store them here to save a malloc */ #define NINLINECDS ((size_t)10) struct colordesc cdspace[NINLINECDS]; - union tree tree[NBYTS]; /* tree top, plus fill blocks */ + union tree tree[NBYTS]; /* tree top, plus lower-level fill blocks */ }; /* optimization magic to do fast chr->color mapping */ @@ -229,19 +247,25 @@ struct colormap /* - * Interface definitions for locale-interface functions in locale.c. + * Interface definitions for locale-interface functions in regc_locale.c. */ -/* Representation of a set of characters. */ +/* + * Representation of a set of characters. 
chrs[] represents individual + * code points, ranges[] represents ranges in the form min..max inclusive. + * + * Note that in cvecs gotten from newcvec() and intended to be freed by + * freecvec(), both arrays of chrs are after the end of the struct, not + * separately malloc'd; so chrspace and rangespace are effectively immutable. + */ struct cvec { int nchrs; /* number of chrs */ - int chrspace; /* number of chrs possible */ + int chrspace; /* number of chrs allocated in chrs[] */ chr *chrs; /* pointer to vector of chrs */ int nranges; /* number of ranges (chr pairs) */ - int rangespace; /* number of chrs possible */ + int rangespace; /* number of ranges allocated in ranges[] */ chr *ranges; /* pointer to vector of chr pairs */ - /* both batches of chrs are on the end */ };