From 7bcc6d98fb5c3bda2787ae085ef3ff3dbb65ae42 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 5 Feb 2003 17:41:33 +0000
Subject: [PATCH] Replace regular expression package with Henry Spencer's
 latest version (extracted from Tcl 8.4.1 release, as Henry still hasn't got
 round to making it a separate library).  This solves a performance problem
 for multibyte, as well as upgrading our regexp support to match recent Tcl
 and nearly match recent Perl.

---
 doc/src/sgml/func.sgml          | 1124 +++++++++--
 doc/src/sgml/release.sgml       |    3 +-
 src/backend/regex/COPYRIGHT     |  140 +-
 src/backend/regex/Makefile      |   18 +-
 src/backend/regex/WHATSNEW      |   94 -
 src/backend/regex/engine.c      | 1093 ----------
 src/backend/regex/re_format.7   |  269 ---
 src/backend/regex/re_syntax.n   |  970 +++++++++
 src/backend/regex/regc_color.c  |  728 +++++++
 src/backend/regex/regc_cvec.c   |  194 ++
 src/backend/regex/regc_lex.c    | 1028 ++++++++++
 src/backend/regex/regc_locale.c |  615 ++++++
 src/backend/regex/regc_nfa.c    | 1481 ++++++++++++++
 src/backend/regex/regcomp.c     | 3340 +++++++++++++++++--------------
 src/backend/regex/rege_dfa.c    |  655 ++++++
 src/backend/regex/regerror.c    |  241 +--
 src/backend/regex/regex.3       |  538 -----
 src/backend/regex/regexec.c     | 1163 +++++++++--
 src/backend/regex/regfree.c     |  101 +-
 src/backend/regex/retest.c      |   44 -
 src/backend/utils/adt/regexp.c  |  381 ++--
 src/include/regex/cclass.h      |   99 -
 src/include/regex/cname.h       |  336 ----
 src/include/regex/regcustom.h   |   64 +
 src/include/regex/regerrs.h     |   22 +
 src/include/regex/regex.h       |  251 ++-
 src/include/regex/regex2.h      |  174 --
 src/include/regex/regguts.h     |  393 ++++
 src/include/regex/utils.h       |   60 -
 29 files changed, 10554 insertions(+), 5065 deletions(-)
 delete mode 100644 src/backend/regex/WHATSNEW
 delete mode 100644 src/backend/regex/engine.c
 delete mode 100644 src/backend/regex/re_format.7
 create mode 100644 src/backend/regex/re_syntax.n
 create mode 100644 src/backend/regex/regc_color.c
 create mode 100644 src/backend/regex/regc_cvec.c
 create mode 100644 src/backend/regex/regc_lex.c
 create mode 100644 src/backend/regex/regc_locale.c
 create mode 100644 src/backend/regex/regc_nfa.c
 create mode 100644 src/backend/regex/rege_dfa.c
 delete mode 100644 src/backend/regex/regex.3
 delete mode 100644 src/backend/regex/retest.c
 delete mode 100644 src/include/regex/cclass.h
 delete mode 100644 src/include/regex/cname.h
 create mode 100644 src/include/regex/regcustom.h
 create mode 100644 src/include/regex/regerrs.h
 delete mode 100644 src/include/regex/regex2.h
 create mode 100644 src/include/regex/regguts.h
 delete mode 100644 src/include/regex/utils.h

diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml
index b3de02ef067..baeef816181 100644
--- a/doc/src/sgml/func.sgml
+++ b/doc/src/sgml/func.sgml
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.136 2003/01/23 23:38:51 petere Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/func.sgml,v 1.137 2003/02/05 17:41:32 tgl Exp $
 PostgreSQL documentation
 -->
 
@@ -424,7 +424,7 @@ PostgreSQL documentation
       <row>
        <entry> <literal>&amp;</literal> </entry>
        <entry>binary AND</entry>
-       <entry>91 & 15</entry>
+       <entry>91 &amp; 15</entry>
        <entry>11</entry>
       </row>
 
@@ -471,7 +471,7 @@ PostgreSQL documentation
     The <quote>binary</quote> operators are also available for the bit
     string types <type>BIT</type> and <type>BIT VARYING</type>, as
     shown in <xref linkend="functions-math-bit-table">.
-    Bit string arguments to <literal>&</literal>, <literal>|</literal>,
+    Bit string arguments to <literal>&amp;</literal>, <literal>|</literal>,
     and <literal>#</literal> must be of equal length.  When bit
     shifting, the original length of the string is preserved, as shown
     in the table.
@@ -490,7 +490,7 @@ PostgreSQL documentation
 
       <tbody>
        <row>
-        <entry>B'10001' & B'01101'</entry>
+        <entry>B'10001' &amp; B'01101'</entry>
         <entry>00001</entry>
        </row>
        <row>
@@ -2629,7 +2629,7 @@ SUBSTRING('foobar' FROM '#"o_b#"%' FOR '#')    <lineannotation>NULL</lineannotat
      one whose left parenthesis comes first) is
      returned.  You can always put parentheses around the whole expression
      if you want to use parentheses within it without triggering this
-     exception.
+     exception.  Also see the non-capturing parentheses described below.
     </para>
 
    <para>
@@ -2640,110 +2640,319 @@ SUBSTRING('foobar' FROM 'o(.)b')   <lineannotation>o</lineannotation>
 </programlisting>
    </para>
 
-<!-- derived from the re_format.7 man page -->
+   <para>
+    <productname>PostgreSQL</productname>'s regular expressions are implemented
+    using a package written by Henry Spencer.  Much of
+    the description of regular expressions below is copied verbatim from his
+    manual entry.
+   </para>
+
+<!-- derived from the re_syntax.n man page -->
+
+   <sect3 id="posix-syntax-details">
+    <title>Regular Expression Details</title>
+
    <para>
     Regular expressions (<acronym>RE</acronym>s), as defined in
-     <acronym>POSIX</acronym> 
-    1003.2, come in two forms: modern <acronym>RE</acronym>s (roughly those of
-    <command>egrep</command>; 1003.2 calls these
-    <quote>extended</quote> <acronym>RE</acronym>s) and obsolete <acronym>RE</acronym>s (roughly those of
-    <command>ed</command>; 1003.2 <quote>basic</quote> <acronym>RE</acronym>s).
-    <productname>PostgreSQL</productname> implements the modern form.
+    <acronym>POSIX</acronym> 1003.2, come in two forms:
+    <firstterm>extended</> <acronym>RE</acronym>s or <acronym>ERE</>s
+    (roughly those of <command>egrep</command>), and
+    <firstterm>basic</> <acronym>RE</acronym>s or <acronym>BRE</>s
+    (roughly those of <command>ed</command>).
+    <productname>PostgreSQL</productname> supports both forms, and
+    also implements some extensions
+    that are not in the POSIX standard, but have become widely used anyway
+    due to their availability in programming languages such as Perl and Tcl.
+    <acronym>RE</acronym>s using these non-POSIX extensions are called
+    <firstterm>advanced</> <acronym>RE</acronym>s or <acronym>ARE</>s
+    in this documentation.  We first describe the ERE/ARE flavor and then
+    mention the restrictions of the BRE form.
    </para>
 
    <para>
-    A (modern) RE is one or more non-empty
+    A regular expression is defined as one or more
     <firstterm>branches</firstterm>, separated by
     <literal>|</literal>.  It matches anything that matches one of the
     branches.
    </para>
 
    <para>
-    A branch is one or more <firstterm>pieces</firstterm>,
-    concatenated.  It matches a match for the first, followed by a
-    match for the second, etc.
+    A branch is zero or more <firstterm>quantified atoms</> or
+    <firstterm>constraints</>, concatenated.
+    It matches a match for the first, followed by a match for the second, etc.;
+    an empty branch matches the empty string.
    </para>
 
    <para>
-    A piece is an <firstterm>atom</firstterm> possibly followed by a
-    single <literal>*</literal>, <literal>+</literal>,
-    <literal>?</literal>, or <firstterm>bound</firstterm>.  An atom
-    followed by <literal>*</literal> matches a sequence of 0 or more
-    matches of the atom.  An atom followed by <literal>+</literal>
-    matches a sequence of 1 or more matches of the atom.  An atom
-    followed by <literal>?</literal> matches a sequence of 0 or 1
-    matches of the atom.
+    A quantified atom is an <firstterm>atom</> possibly followed
+    by a single <firstterm>quantifier</>.
+    Without a quantifier, it matches a match for the atom.
+    With a quantifier, it can match some number of matches of the atom.
+    An <firstterm>atom</firstterm> can be any of the possibilities
+    shown in <xref linkend="posix-atoms-table">.
+    The possible quantifiers and their meanings are shown in
+    <xref linkend="posix-quantifiers-table">.
    </para>
 
    <para>
-    A <firstterm>bound</firstterm> is <literal>{</literal> followed by
-    an unsigned decimal integer, possibly followed by
-    <literal>,</literal> possibly followed by another unsigned decimal
-    integer, always followed by <literal>}</literal>.  The integers
-    must lie between 0 and <symbol>RE_DUP_MAX</symbol> (255)
-    inclusive, and if there are two of them, the first may not exceed
-    the second.  An atom followed by a bound containing one integer
-    <replaceable>i</replaceable> and no comma matches a sequence of
-    exactly <replaceable>i</replaceable> matches of the atom.  An atom
-    followed by a bound containing one integer
-    <replaceable>i</replaceable> and a comma matches a sequence of
-    <replaceable>i</replaceable> or more matches of the atom.  An atom
-    followed by a bound containing two integers
-    <replaceable>i</replaceable> and <replaceable>j</replaceable>
-    matches a sequence of <replaceable>i</replaceable> through
-    <replaceable>j</replaceable> (inclusive) matches of the atom.
+    A <firstterm>constraint</> matches an empty string, but matches only when
+    specific conditions are met.  A constraint can be used where an atom
+    could be used, except it may not be followed by a quantifier.
+    The simple constraints are shown in
+    <xref linkend="posix-constraints-table">;
+    some more constraints are described later.
+   </para>
+
+
+   <table id="posix-atoms-table">
+    <title>Regular Expression Atoms</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Atom</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>(</><replaceable>re</><literal>)</> </entry>
+       <entry> (where <replaceable>re</> is any regular expression)
+       matches a match for
+       <replaceable>re</>, with the match noted for possible reporting </entry>
+       </row>
+
+       <row>
+       <entry> <literal>(?:</><replaceable>re</><literal>)</> </entry>
+       <entry> as above, but the match is not noted for reporting
+       (a <quote>non-capturing</> set of parentheses)
+       (AREs only) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>.</> </entry>
+       <entry> matches any single character </entry>
+       </row>
+
+       <row>
+       <entry> <literal>[</><replaceable>chars</><literal>]</> </entry>
+       <entry> a <firstterm>bracket expression</>,
+       matching any one of the <replaceable>chars</> (see
+       <xref linkend="posix-bracket-expressions"> for more detail) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\</><replaceable>k</> </entry>
+       <entry> (where <replaceable>k</> is a non-alphanumeric character)
+       matches that character taken as an ordinary character,
+       e.g. <literal>\\</> matches a backslash character </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\</><replaceable>c</> </entry>
+       <entry> where <replaceable>c</> is alphanumeric
+       (possibly followed by other characters)
+       is an <firstterm>escape</>, see <xref linkend="posix-escape-sequences">
+       (AREs only; in EREs and BREs, this matches <replaceable>c</>) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>{</> </entry>
+       <entry> when followed by a character other than a digit,
+       matches the left-brace character <literal>{</>;
+       when followed by a digit, it is the beginning of a
+       <replaceable>bound</> (see below) </entry>
+       </row>
+
+       <row>
+       <entry> <replaceable>x</> </entry>
+       <entry> where <replaceable>x</> is a single character with no other
+       significance, matches that character </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
+   <para>
+    An RE may not end with <literal>\</>.
    </para>
 
    <note>
     <para>
-     A repetition operator (<literal>?</literal>,
-     <literal>*</literal>, <literal>+</literal>, or bounds) cannot
-     follow another repetition operator.  A repetition operator cannot
+     Remember that the backslash (<literal>\</literal>) already has a special
+     meaning in <productname>PostgreSQL</> string literals.
+     To write a pattern constant that contains a backslash,
+     you must write two backslashes in the query.
+   </para>
+   </note>
+
+   <table id="posix-quantifiers-table">
+    <title>Regular Expression Quantifiers</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Quantifier</entry>
+       <entry>Matches</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>*</> </entry>
+       <entry> a sequence of 0 or more matches of the atom </entry>
+       </row>
+
+       <row>
+       <entry> <literal>+</> </entry>
+       <entry> a sequence of 1 or more matches of the atom </entry>
+       </row>
+
+       <row>
+       <entry> <literal>?</> </entry>
+       <entry> a sequence of 0 or 1 matches of the atom </entry>
+       </row>
+
+       <row>
+       <entry> <literal>{</><replaceable>m</><literal>}</> </entry>
+       <entry> a sequence of exactly <replaceable>m</> matches of the atom </entry>
+       </row>
+
+       <row>
+       <entry> <literal>{</><replaceable>m</><literal>,}</> </entry>
+       <entry> a sequence of <replaceable>m</> or more matches of the atom </entry>
+       </row>
+
+       <row>
+       <entry>
+       <literal>{</><replaceable>m</><literal>,</><replaceable>n</><literal>}</> </entry>
+       <entry> a sequence of <replaceable>m</> through <replaceable>n</>
+       (inclusive) matches of the atom; <replaceable>m</> may not exceed
+       <replaceable>n</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>*?</> </entry>
+       <entry> non-greedy version of <literal>*</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>+?</> </entry>
+       <entry> non-greedy version of <literal>+</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>??</> </entry>
+       <entry> non-greedy version of <literal>?</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>{</><replaceable>m</><literal>}?</> </entry>
+       <entry> non-greedy version of <literal>{</><replaceable>m</><literal>}</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>{</><replaceable>m</><literal>,}?</> </entry>
+       <entry> non-greedy version of <literal>{</><replaceable>m</><literal>,}</> </entry>
+       </row>
+
+       <row>
+       <entry>
+       <literal>{</><replaceable>m</><literal>,</><replaceable>n</><literal>}?</> </entry>
+       <entry> non-greedy version of <literal>{</><replaceable>m</><literal>,</><replaceable>n</><literal>}</> </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
+   <para>
+    The forms using <literal>{</><replaceable>...</><literal>}</>
+    are known as <firstterm>bound</>s.
+    The numbers <replaceable>m</> and <replaceable>n</> within a bound are
+    unsigned decimal integers with permissible values from 0 to 255 inclusive.
+   </para>
+
+    <para>
+     <firstterm>Non-greedy</> quantifiers (available in AREs only) match the
+     same possibilities as their corresponding normal (<firstterm>greedy</>)
+     counterparts, but prefer the smallest number rather than the largest
+     number of matches.
+     See <xref linkend="posix-matching-rules"> for more detail.
+   </para>
+
+   <note>
+    <para>
+     A quantifier cannot immediately follow another quantifier.
+     A quantifier cannot
      begin an expression or subexpression or follow
      <literal>^</literal> or <literal>|</literal>.
     </para>
    </note>
 
-   <para>
-    An <firstterm>atom</firstterm> is a regular expression enclosed in
-    <literal>()</literal> (matching a match for the regular
-    expression), an empty set of <literal>()</literal> (matching the
-    null string), a <firstterm>bracket expression</firstterm> (see
-    below), <literal>.</literal> (matching any single character),
-    <literal>^</literal> (matching the null string at the beginning of the
-    input string), <literal>$</literal> (matching the null string at the end
-    of the input string), a <literal>\</literal> followed by one of the
-    characters <literal>^.[$()|*+?{\</literal> (matching that
-    character taken as an ordinary character), a <literal>\</literal>
-    followed by any other character (matching that character taken as
-    an ordinary character, as if the <literal>\</literal> had not been
-    present), or a single character with no other significance
-    (matching that character).  A <literal>{</literal> followed by a
-    character other than a digit is an ordinary character, not the
-    beginning of a bound.  It is illegal to end an RE with
-    <literal>\</literal>.
-   </para>
+   <table id="posix-constraints-table">
+    <title>Regular Expression Constraints</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Constraint</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>^</> </entry>
+       <entry> matches at the beginning of the string </entry>
+       </row>
+
+       <row>
+       <entry> <literal>$</> </entry>
+       <entry> matches at the end of the string </entry>
+       </row>
+
+       <row>
+       <entry> <literal>(?=</><replaceable>re</><literal>)</> </entry>
+       <entry> <firstterm>positive lookahead</> matches at any point
+       where a substring matching <replaceable>re</> begins
+       (AREs only) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>(?!</><replaceable>re</><literal>)</> </entry>
+       <entry> <firstterm>negative lookahead</> matches at any point
+       where no substring matching <replaceable>re</> begins
+       (AREs only) </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
 
    <para>
-    Note that the backslash (<literal>\</literal>) already has a special
-    meaning in string
-    literals, so to write a pattern constant that contains a backslash
-    you must write two backslashes in the query.
+    Lookahead constraints may not contain <firstterm>back references</>
+    (see <xref linkend="posix-escape-sequences">),
+    and all parentheses within them are considered non-capturing.
    </para>
+   </sect3>
+
+   <sect3 id="posix-bracket-expressions">
+    <title>Bracket Expressions</title>
 
    <para>
     A <firstterm>bracket expression</firstterm> is a list of
     characters enclosed in <literal>[]</literal>.  It normally matches
     any single character from the list (but see below).  If the list
     begins with <literal>^</literal>, it matches any single character
-    (but see below) not from the rest of the list.  If two characters
+    <emphasis>not</> from the rest of the list.
+    If two characters
     in the list are separated by <literal>-</literal>, this is
     shorthand for the full range of characters between those two
     (inclusive) in the collating sequence,
     e.g. <literal>[0-9]</literal> in <acronym>ASCII</acronym> matches
     any decimal digit.  It is illegal for two ranges to share an
     endpoint, e.g.  <literal>a-c-e</literal>.  Ranges are very
-    collating-sequence-dependent, and portable programs should avoid
+    collating-sequence-dependent, so portable programs should avoid
     relying on them.
    </para>
 
@@ -2754,11 +2963,13 @@ SUBSTRING('foobar' FROM 'o(.)b')   <lineannotation>o</lineannotation>
     character, or the second endpoint of a range.  To use a literal
     <literal>-</literal> as the first endpoint of a range, enclose it
     in <literal>[.</literal> and <literal>.]</literal> to make it a
-    collating element (see below).  With the exception of these and
-    some combinations using <literal>[</literal> (see next
-    paragraphs), all other special characters, including
-    <literal>\</literal>, lose their special significance within a
-    bracket expression.
+    collating element (see below).  With the exception of these characters,
+    some combinations using <literal>[</literal>
+    (see next paragraphs), and escapes (AREs only), all other special
+    characters lose their special significance within a bracket expression.
+    In particular, <literal>\</literal> is not special when following
+    ERE or BRE rules, though it is special (as introducing an escape)
+    in AREs.
    </para>
 
    <para>
@@ -2775,6 +2986,13 @@ SUBSTRING('foobar' FROM 'o(.)b')   <lineannotation>o</lineannotation>
     <literal>chchcc</literal>.
    </para>
 
+   <note>
+    <para>
+     <productname>PostgreSQL</> currently has no multi-character collating
+     elements. This information describes possible future behavior.
+    </para>
+   </note>
+
    <para>
     Within a bracket expression, a collating element enclosed in
     <literal>[=</literal> and <literal>=]</literal> is an equivalence
@@ -2809,76 +3027,732 @@ SUBSTRING('foobar' FROM 'o(.)b')   <lineannotation>o</lineannotation>
    <para>
     There are two special cases of bracket expressions:  the bracket
     expressions <literal>[[:&lt;:]]</literal> and
-    <literal>[[:>:]]</literal> match the null string at the beginning
+    <literal>[[:&gt;:]]</literal> are constraints,
+    matching empty strings at the beginning
     and end of a word respectively.  A word is defined as a sequence
-    of word characters which is neither preceded nor followed by word
-    characters.  A word character is an alnum character (as defined by
+    of word characters that is neither preceded nor followed by word
+    characters.  A word character is an <literal>alnum</> character (as
+    defined by
     <citerefentry><refentrytitle>ctype</refentrytitle><manvolnum>3</manvolnum></citerefentry>)
     or an underscore.  This is an extension, compatible with but not
-    specified by <acronym>POSIX</acronym> 1003.2, and should be used with caution in
-    software intended to be portable to other systems.
+    specified by <acronym>POSIX</acronym> 1003.2, and should be used with
+    caution in software intended to be portable to other systems.
+    The constraint escapes described below are usually preferable (they
+    are no more standard, but are certainly easier to type).
+   </para>
+   </sect3>
+
+   <sect3 id="posix-escape-sequences">
+    <title>Regular Expression Escapes</title>
+
+   <para>
+    <firstterm>Escapes</> are special sequences beginning with <literal>\</>
+    followed by an alphanumeric character. Escapes come in several varieties:
+    character entry, class shorthands, constraint escapes, and back references.
+    A <literal>\</> followed by an alphanumeric character but not constituting
+    a valid escape is illegal in AREs.
+    In EREs, there are no escapes: outside a bracket expression,
+    a <literal>\</> followed by an alphanumeric character merely stands for
+    that character as an ordinary character, and inside a bracket expression,
+    <literal>\</> is an ordinary character.
+    (The latter is the one actual incompatibility between EREs and AREs.)
+   </para>
+
+   <para>
+    <firstterm>Character-entry escapes</> exist to make it easier to specify
+    non-printing and otherwise inconvenient characters in REs.  They are
+    shown in <xref linkend="posix-character-entry-escapes-table">.
+   </para>
+
+   <para>
+    <firstterm>Class-shorthand escapes</> provide shorthands for certain
+    commonly-used character classes.  They are
+    shown in <xref linkend="posix-class-shorthand-escapes-table">.
+   </para>
+
+   <para>
+    A <firstterm>constraint escape</> is a constraint,
+    matching the empty string if specific conditions are met,
+    written as an escape.  Constraint escapes are
+    shown in <xref linkend="posix-constraint-escapes-table">.
+   </para>
+
+   <para>
+    A <firstterm>back reference</> (<literal>\</><replaceable>n</>) matches the
+    same string matched by the previous parenthesized subexpression specified
+    by the number <replaceable>n</>
+    (see <xref linkend="posix-constraint-backref-table">).  For example,
+    <literal>([bc])\1</> matches <literal>bb</> or <literal>cc</>
+    but not <literal>bc</> or <literal>cb</>.
+    The subexpression must entirely precede the back reference in the RE.
+    Subexpressions are numbered in the order of their leading parentheses.
+    Non-capturing parentheses do not define subexpressions.
+   </para>
+
+   <note>
+    <para>
+     Keep in mind that an escape's leading <literal>\</> will need to be
+     doubled when entering the pattern as an SQL string constant.
+    </para>
+   </note>
+
+   <table id="posix-character-entry-escapes-table">
+    <title>Regular Expression Character-Entry Escapes</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Escape</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>\a</> </entry>
+       <entry> alert (bell) character, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\b</> </entry>
+       <entry> backspace, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\B</> </entry>
+       <entry> synonym for <literal>\</> to help reduce the need for backslash
+       doubling </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\c</><replaceable>X</> </entry>
+       <entry> (where <replaceable>X</> is any character) the character whose
+       low-order 5 bits are the same as those of
+       <replaceable>X</>, and whose other bits are all zero </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\e</> </entry>
+       <entry> the character whose collating-sequence name
+       is <literal>ESC</>,
+       or failing that, the character with octal value 033 </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\f</> </entry>
+       <entry> formfeed, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\n</> </entry>
+       <entry> newline, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\r</> </entry>
+       <entry> carriage return, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\t</> </entry>
+       <entry> horizontal tab, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\u</><replaceable>wxyz</> </entry>
+       <entry> (where <replaceable>wxyz</> is exactly four hexadecimal digits)
+       the Unicode character <literal>U+</><replaceable>wxyz</>
+       in the local byte ordering </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\U</><replaceable>stuvwxyz</> </entry>
+       <entry> (where <replaceable>stuvwxyz</> is exactly eight hexadecimal
+       digits)
+       reserved for a somewhat-hypothetical Unicode extension to 32 bits
+       </entry> 
+       </row>
+
+       <row>
+       <entry> <literal>\v</> </entry>
+       <entry> vertical tab, as in C </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\x</><replaceable>hhh</> </entry>
+       <entry> (where <replaceable>hhh</> is any sequence of hexadecimal
+       digits)
+       the character whose hexadecimal value is
+       <literal>0x</><replaceable>hhh</>
+       (a single character no matter how many hexadecimal digits are used)
+       </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\0</> </entry>
+       <entry> the character whose value is <literal>0</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\</><replaceable>xy</> </entry>
+       <entry> (where <replaceable>xy</> is exactly two octal digits,
+       and is not a <firstterm>back reference</>)
+       the character whose octal value is
+       <literal>0</><replaceable>xy</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\</><replaceable>xyz</> </entry>
+       <entry> (where <replaceable>xyz</> is exactly three octal digits,
+       and is not a <firstterm>back reference</>)
+       the character whose octal value is
+       <literal>0</><replaceable>xyz</> </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
+   <para>
+    Hexadecimal digits are <literal>0</>-<literal>9</>,
+    <literal>a</>-<literal>f</>, and <literal>A</>-<literal>F</>.
+    Octal digits are <literal>0</>-<literal>7</>.
+   </para>
+
+   <para>
+    The character-entry escapes are always taken as ordinary characters.
+    For example, <literal>\135</> is <literal>]</> in ASCII, but
+    <literal>\135</> does not terminate a bracket expression.
    </para>
 
+   <table id="posix-class-shorthand-escapes-table">
+    <title>Regular Expression Class-Shorthand Escapes</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Escape</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>\d</> </entry>
+       <entry> <literal>[[:digit:]]</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\s</> </entry>
+       <entry> <literal>[[:space:]]</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\w</> </entry>
+       <entry> <literal>[[:alnum:]_]</>
+       (note underscore is included) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\D</> </entry>
+       <entry> <literal>[^[:digit:]]</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\S</> </entry>
+       <entry> <literal>[^[:space:]]</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\W</> </entry>
+       <entry> <literal>[^[:alnum:]_]</>
+       (note underscore is among the excluded characters) </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
    <para>
-    In the event that an RE could match more than one substring of a
-    given string, the RE matches the one starting earliest in the
-    string.  If the RE could match more than one substring starting at
-    that point, it matches the longest.  Subexpressions also match the
-    longest possible substrings, subject to the constraint that the
-    whole match be as long as possible, with subexpressions starting
-    earlier in the RE taking priority over ones starting later.  Note
-    that higher-level subexpressions thus take priority over their
-    lower-level component subexpressions.
+    Within bracket expressions, <literal>\d</>, <literal>\s</>,
+    and <literal>\w</> lose their outer brackets,
+    and <literal>\D</>, <literal>\S</>, and <literal>\W</> are illegal.
+    (So, for example, <literal>[a-c\d]</> is equivalent to
+    <literal>[a-c[:digit:]]</>.
+    Also, <literal>[a-c\D]</>, which is equivalent to
+    <literal>[a-c^[:digit:]]</>, is illegal.)
    </para>
 
+   <table id="posix-constraint-escapes-table">
+    <title>Regular Expression Constraint Escapes</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Escape</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>\A</> </entry>
+       <entry> matches only at the beginning of the string
+       (see <xref linkend="posix-matching-rules"> for how this differs from
+       <literal>^</>) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\m</> </entry>
+       <entry> matches only at the beginning of a word </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\M</> </entry>
+       <entry> matches only at the end of a word </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\y</> </entry>
+       <entry> matches only at the beginning or end of a word </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\Y</> </entry>
+       <entry> matches only at a point that is not the beginning or end of a
+       word </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\Z</> </entry>
+       <entry> matches only at the end of the string
+       (see <xref linkend="posix-matching-rules"> for how this differs from
+       <literal>$</>) </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
+   <para>
+    A word is defined as in the specification of
+    <literal>[[:&lt;:]]</> and <literal>[[:&gt;:]]</> above.
+    Constraint escapes are illegal within bracket expressions.
+   </para>
+
+   <table id="posix-constraint-backref-table">
+    <title>Regular Expression Back References</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Escape</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>\</><replaceable>m</> </entry>
+       <entry> (where <replaceable>m</> is a nonzero digit)
+       a back reference to the <replaceable>m</>'th subexpression </entry>
+       </row>
+
+       <row>
+       <entry> <literal>\</><replaceable>mnn</> </entry>
+       <entry> (where <replaceable>m</> is a nonzero digit, and
+       <replaceable>nn</> is some more digits, and the decimal value
+       <replaceable>mnn</> is not greater than the number of closing capturing
+       parentheses seen so far) 
+       a back reference to the <replaceable>mnn</>'th subexpression </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
+   <note>
+    <para>
+     There is an inherent historical ambiguity between octal character-entry 
+     escapes and back references, which is resolved by heuristics,
+     as hinted at above.
+     A leading zero always indicates an octal escape.
+     A single non-zero digit, not followed by another digit,
+     is always taken as a back reference.
+     A multi-digit sequence not starting with a zero is taken as a back 
+     reference if it comes after a suitable subexpression
+     (i.e. the number is in the legal range for a back reference),
+     and otherwise is taken as octal.
+    </para>
+   </note>
+   </sect3>
+
+   <sect3 id="posix-metasyntax">
+    <title>Regular Expression Metasyntax</title>
+
    <para>
-    Match lengths are measured in characters, not collating
-    elements.  A null string is considered longer than no match at
-    all.  For example, <literal>bb*</literal> matches the three middle
-    characters of <literal>abbbc</literal>,
-    <literal>(wee|week)(knights|nights)</literal> matches all ten
-    characters of <literal>weeknights</literal>, when
-    <literal>(.*).*</literal> is matched against
-    <literal>abc</literal> the parenthesized subexpression matches all
-    three characters, and when <literal>(a*)*</literal> is matched
-    against <literal>bc</literal> both the whole RE and the
-    parenthesized subexpression match the null string.
+    In addition to the main syntax described above, there are some special
+    forms and miscellaneous syntactic facilities available.
    </para>
 
    <para>
-    If case-independent matching is specified, the effect is much as
-    if all case distinctions had vanished from the alphabet.  When an
-    alphabetic that exists in multiple cases appears as an ordinary
-    character outside a bracket expression, it is effectively
+    Normally the flavor of RE being used is specified by
+    application-dependent means.
+    However, this can be overridden by a <firstterm>director</>.
+    If an RE of any flavor begins with <literal>***:</>,
+    the rest of the RE is an ARE.
+    If an RE of any flavor begins with <literal>***=</>,
+    the rest of the RE is taken to be a literal string,
+    with all characters considered ordinary characters.
+   </para>
+
+   <para>
+    An ARE may begin with <firstterm>embedded options</>:
+    a sequence <literal>(?</><replaceable>xyz</><literal>)</>
+    (where <replaceable>xyz</> is one or more alphabetic characters)
+    specifies options affecting the rest of the RE.
+    These supplement, and can override,
+    any options specified externally.
+    The available option letters are
+    shown in <xref linkend="posix-embedded-options-table">.
+   </para>
+
+   <table id="posix-embedded-options-table">
+    <title>ARE Embedded-Option Letters</title>
+
+    <tgroup cols="2">
+     <thead>
+      <row>
+       <entry>Option</entry>
+       <entry>Description</entry>
+      </row>
+     </thead>
+
+      <tbody>
+       <row>
+       <entry> <literal>b</> </entry>
+       <entry> rest of RE is a BRE </entry>
+       </row>
+
+       <row>
+       <entry> <literal>c</> </entry>
+       <entry> case-sensitive matching (usual default) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>e</> </entry>
+       <entry> rest of RE is an ERE </entry>
+       </row>
+
+       <row>
+       <entry> <literal>i</> </entry>
+       <entry> case-insensitive matching (see
+       <xref linkend="posix-matching-rules">) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>m</> </entry>
+       <entry> historical synonym for <literal>n</> </entry>
+       </row>
+
+       <row>
+       <entry> <literal>n</> </entry>
+       <entry> newline-sensitive matching (see
+       <xref linkend="posix-matching-rules">) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>p</> </entry>
+       <entry> partial newline-sensitive matching (see
+       <xref linkend="posix-matching-rules">) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>q</> </entry>
+       <entry> rest of RE is a literal (<quote>quoted</>) string, all ordinary
+       characters </entry>
+       </row>
+
+       <row>
+       <entry> <literal>s</> </entry>
+       <entry> non-newline-sensitive matching (usual default) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>t</> </entry>
+       <entry> tight syntax (usual default; see below) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>w</> </entry>
+       <entry> inverse partial newline-sensitive (<quote>weird</>) matching
+       (see <xref linkend="posix-matching-rules">) </entry>
+       </row>
+
+       <row>
+       <entry> <literal>x</> </entry>
+       <entry> expanded syntax (see below) </entry>
+       </row>
+      </tbody>
+     </tgroup>
+    </table>
+
+   <para>
+    Embedded options take effect at the <literal>)</> terminating the sequence.
+    They are available only at the start of an ARE,
+    and may not be used later within it.
+   </para>
+
+   <para>
+    In addition to the usual (<firstterm>tight</>) RE syntax, in which all
+    characters are significant, there is an <firstterm>expanded</> syntax,
+    available by specifying the embedded <literal>x</> option.
+    In the expanded syntax,
+    white-space characters in the RE are ignored, as are
+    all characters between a <literal>#</>
+    and the following newline (or the end of the RE).  This
+    permits paragraphing and commenting a complex RE.
+    There are three exceptions to that basic rule:
+
+    <itemizedlist>
+     <listitem>
+      <para>
+       a white-space character or <literal>#</> preceded by <literal>\</> is
+       retained
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       white space or <literal>#</> within a bracket expression is retained
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       white space and comments are illegal within multi-character symbols,
+       like the ARE <literal>(?:</> or the BRE <literal>\(</>
+      </para>
+     </listitem>
+    </itemizedlist>
+
+    Expanded-syntax white-space characters are blank, tab, newline, and
+    any character that belongs to the <replaceable>space</> character class.
+   </para>
+
+   <para>
+    Finally, in an ARE, outside bracket expressions, the sequence
+    <literal>(?#</><replaceable>ttt</><literal>)</>
+    (where <replaceable>ttt</> is any text not containing a <literal>)</>)
+    is a comment, completely ignored.
+    Again, this is not allowed between the characters of
+    multi-character symbols, like <literal>(?:</>.
+    Such comments are more a historical artifact than a useful facility,
+    and their use is deprecated; use the expanded syntax instead.
+   </para>
+
+   <para>
+    <emphasis>None</> of these metasyntax extensions is available if
+    an initial <literal>***=</> director
+    has specified that the user's input be treated as a literal string
+    rather than as an RE.
+   </para>
+   </sect3>
+
+   <sect3 id="posix-matching-rules">
+    <title>Regular Expression Matching Rules</title>
+
+   <para>
+    In the event that an RE could match more than one substring of a given
+    string, the RE matches the one starting earliest in the string.
+    If the RE could match more than one substring starting at that point,
+    its choice is determined by its <firstterm>preference</>:
+    either the longest substring, or the shortest.
+   </para>
+
+   <para>
+    Most atoms, and all constraints, have no preference.
+    A parenthesized RE has the same preference (possibly none) as the RE.
+    A quantified atom with quantifier
+    <literal>{</><replaceable>m</><literal>}</>
+    or
+    <literal>{</><replaceable>m</><literal>}?</>
+    has the same preference (possibly none) as the atom itself.
+    A quantified atom with other normal quantifiers (including
+    <literal>{</><replaceable>m</><literal>,</><replaceable>n</><literal>}</>
+    with <replaceable>m</> equal to <replaceable>n</>)
+    prefers longest match.
+    A quantified atom with other non-greedy quantifiers (including
+    <literal>{</><replaceable>m</><literal>,</><replaceable>n</><literal>}?</>
+    with <replaceable>m</> equal to <replaceable>n</>)
+    prefers shortest match.
+    A branch has the same preference as the first quantified atom in it
+    which has a preference.
+    An RE consisting of two or more branches connected by the
+    <literal>|</> operator prefers longest match.
+   </para>
+
+   <para>
+    Subject to the constraints imposed by the rules for matching the whole RE,
+    subexpressions also match the longest or shortest possible substrings,
+    based on their preferences,
+    with subexpressions starting earlier in the RE taking priority over
+    ones starting later.
+    Note that outer subexpressions thus take priority over
+    their component subexpressions.
+   </para>
+
+   <para>
+    The quantifiers <literal>{1,1}</> and <literal>{1,1}?</>
+    can be used to force longest and shortest preference, respectively,
+    on a subexpression or a whole RE.
+   </para>
+
+   <para>
+    Match lengths are measured in characters, not collating elements.
+    An empty string is considered longer than no match at all.
+    For example:
+    <literal>bb*</>
+    matches the three middle characters of <literal>abbbc</>;
+    <literal>(week|wee)(night|knights)</>
+    matches all ten characters of <literal>weeknights</>;
+    when <literal>(.*).*</>
+    is matched against <literal>abc</> the parenthesized subexpression
+    matches all three characters; and when
+    <literal>(a*)*</> is matched against <literal>bc</>
+    both the whole RE and the parenthesized
+    subexpression match an empty string.
+   </para>
+
+   <para>
+    If case-independent matching is specified,
+    the effect is much as if all case distinctions had vanished from the
+    alphabet.
+    When an alphabetic character that exists in multiple cases appears as an
+    ordinary character outside a bracket expression, it is effectively
     transformed into a bracket expression containing both cases,
-    e.g. <literal>x</literal> becomes <literal>[xX]</literal>.  When
-    it appears inside a bracket expression, all case counterparts of
-    it are added to the bracket expression, so that (e.g.)
-    <literal>[x]</literal> becomes <literal>[xX]</literal> and
-    <literal>[^x]</literal> becomes <literal>[^xX]</literal>.
+    e.g. <literal>x</> becomes <literal>[xX]</>.
+    When it appears inside a bracket expression, all case counterparts
+    of it are added to the bracket expression, e.g.
+    <literal>[x]</> becomes <literal>[xX]</>
+    and <literal>[^x]</> becomes <literal>[^xX]</>.
    </para>
 
    <para>
-    There is no particular limit on the length of <acronym>RE</acronym>s, except insofar
-    as memory is limited.  Memory usage is approximately linear in RE
-    size, and largely insensitive to RE complexity, except for bounded
-    repetitions.  Bounded repetitions are implemented by macro
-    expansion, which is costly in time and space if counts are large
-    or bounded repetitions are nested.  An RE like, say,
-    <literal>((((a{1,100}){1,100}){1,100}){1,100}){1,100}</literal>
-    will (eventually) run almost any existing machine out of swap
-    space.
-    <footnote>
-     <para>
-      This was written in 1994, mind you.  The
-      numbers have probably changed, but the problem
-      persists.
-     </para>
-    </footnote>
+    If newline-sensitive matching is specified, <literal>.</>
+    and bracket expressions using <literal>^</>
+    will never match the newline character
+    (so that matches will never cross newlines unless the RE
+    explicitly arranges it)
+    and <literal>^</> and <literal>$</>
+    will match the empty string after and before a newline
+    respectively, in addition to matching at beginning and end of string
+    respectively.
+    But the ARE escapes <literal>\A</> and <literal>\Z</>
+    continue to match beginning or end of string <emphasis>only</>.
    </para>
-<!-- end re_format.7 man page -->
-  </sect2>
 
+   <para>
+    If partial newline-sensitive matching is specified,
+    this affects <literal>.</> and bracket expressions
+    as with newline-sensitive matching, but not <literal>^</>
+    and <literal>$</>.
+   </para>
+
+   <para>
+    If inverse partial newline-sensitive matching is specified,
+    this affects <literal>^</> and <literal>$</>
+    as with newline-sensitive matching, but not <literal>.</>
+    and bracket expressions.
+    This isn't very useful but is provided for symmetry.
+   </para>
+   </sect3>
+
+   <sect3 id="posix-limits-compatibility">
+    <title>Limits and Compatibility</title>
+
+   <para>
+    No particular limit is imposed on the length of REs in this
+    implementation.  However,
+    programs intended to be highly portable should not employ REs longer
+    than 256 bytes,
+    as a POSIX-compliant implementation can refuse to accept such REs.
+   </para>
+
+   <para>
+    The only feature of AREs that is actually incompatible with
+    POSIX EREs is that <literal>\</> does not lose its special
+    significance inside bracket expressions.
+    All other ARE features use syntax which is illegal or has
+    undefined or unspecified effects in POSIX EREs;
+    the <literal>***</> syntax of directors likewise is outside the POSIX
+    syntax for both BREs and EREs.
+   </para>
+
+   <para>
+    Many of the ARE extensions are borrowed from Perl, but some have
+    been changed to clean them up, and a few Perl extensions are not present.
+    Incompatibilities of note include <literal>\b</>, <literal>\B</>,
+    the lack of special treatment for a trailing newline,
+    the addition of complemented bracket expressions to the things
+    affected by newline-sensitive matching,
+    the restrictions on parentheses and back references in lookahead
+    constraints, and the longest/shortest-match (rather than first-match)
+    matching semantics.
+   </para>
+
+   <para>
+    Two significant incompatibilities exist between AREs and the ERE syntax
+    recognized by pre-7.4 releases of <productname>PostgreSQL</>:
+
+    <itemizedlist>
+     <listitem>
+      <para>
+       In AREs, <literal>\</> followed by an alphanumeric character is either
+       an escape or an error, while in previous releases, it was just another
+       way of writing the alphanumeric.
+       This should not be much of a problem because there was no reason to
+       write such a sequence in earlier releases.
+      </para>
+     </listitem>
+     <listitem>
+      <para>
+       In AREs, <literal>\</> remains a special character within
+       <literal>[]</>, so a literal <literal>\</> within a bracket
+       expression must be written <literal>\\</>.
+      </para>
+     </listitem>
+    </itemizedlist>
+   </para>
+   </sect3>
+
+   <sect3 id="posix-basic-regexes">
+    <title>Basic Regular Expressions</title>
+
+   <para>
+    BREs differ from EREs in several respects.
+    <literal>|</>, <literal>+</>, and <literal>?</>
+    are ordinary characters and there is no equivalent
+    for their functionality.
+    The delimiters for bounds are
+    <literal>\{</> and <literal>\}</>,
+    with <literal>{</> and <literal>}</>
+    by themselves ordinary characters.
+    The parentheses for nested subexpressions are
+    <literal>\(</> and <literal>\)</>,
+    with <literal>(</> and <literal>)</> by themselves ordinary characters.
+    <literal>^</> is an ordinary character except at the beginning of the
+    RE or the beginning of a parenthesized subexpression,
+    <literal>$</> is an ordinary character except at the end of the
+    RE or the end of a parenthesized subexpression,
+    and <literal>*</> is an ordinary character if it appears at the beginning
+    of the RE or the beginning of a parenthesized subexpression
+    (after a possible leading <literal>^</>).
+    Finally, single-digit back references are available, and
+    <literal>\&lt;</> and <literal>\&gt;</>
+    are synonyms for
+    <literal>[[:&lt;:]]</> and <literal>[[:&gt;:]]</>
+    respectively; no other escapes are available.
+   </para>
+   </sect3>
+
+<!-- end re_syntax.n man page -->
+
+  </sect2>
  </sect1>
 
 
diff --git a/doc/src/sgml/release.sgml b/doc/src/sgml/release.sgml
index 354b70cc073..b4eabbcb777 100644
--- a/doc/src/sgml/release.sgml
+++ b/doc/src/sgml/release.sgml
@@ -1,5 +1,5 @@
 <!--
-$Header: /cvsroot/pgsql/doc/src/sgml/release.sgml,v 1.184 2003/02/02 23:46:38 tgl Exp $
+$Header: /cvsroot/pgsql/doc/src/sgml/release.sgml,v 1.185 2003/02/05 17:41:32 tgl Exp $
 -->
 
 <appendix id="release">
@@ -24,6 +24,7 @@ CDATA means the content is "SGML-free", so you can write without
 worries about funny characters.
 -->
 <literallayout><![CDATA[
+New regular expression package, many more regexp features (most of Perl5)
 Can now do EXPLAIN ... EXECUTE to see plan used for a prepared query
 Explicit JOINs no longer constrain query plan, unless JOIN_COLLAPSE_LIMIT = 1
 Performance of "foo IN (SELECT ...)" queries has been considerably improved
diff --git a/src/backend/regex/COPYRIGHT b/src/backend/regex/COPYRIGHT
index 574f6bcec6c..e50cfb1ff19 100644
--- a/src/backend/regex/COPYRIGHT
+++ b/src/backend/regex/COPYRIGHT
@@ -1,56 +1,84 @@
-Copyright 1992, 1993, 1994 Henry Spencer.  All rights reserved.
-This software is not subject to any license of the American Telephone
-and Telegraph Company or of the Regents of the University of California.
-
-Permission is granted to anyone to use this software for any purpose on
-any computer system, and to alter it and redistribute it, subject
-to the following restrictions:
-
-1. The author is not responsible for the consequences of use of this
-   software, no matter how awful, even if they arise from flaws in it.
-
-2. The origin of this software must not be misrepresented, either by
-   explicit claim or by omission.  Since few users ever read sources,
-   credits must appear in the documentation.
-
-3. Altered versions must be plainly marked as such, and must not be
-   misrepresented as being the original software.  Since few users
-   ever read sources, credits must appear in the documentation.
-
-4. This notice may not be removed or altered.
-
-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=-=
-/*-
- * Copyright (c) 1994
- *	The Regents of the University of California.  All rights reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *    notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *    notice, this list of conditions and the following disclaimer in the
- *    documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *    must display the following acknowledgement:
- *	This product includes software developed by the University of
- *	California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *    may be used to endorse or promote products derived from this software
- *    without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *	@(#)COPYRIGHT	8.1 (Berkeley) 3/16/94
- */
+This regular expression package was originally developed by Henry Spencer.
+It bears the following copyright notice:
+
+**********************************************************************
+
+Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+
+Development of this software was funded, in part, by Cray Research Inc.,
+UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+Corporation, none of whom are responsible for the results.  The author
+thanks all of them. 
+
+Redistribution and use in source and binary forms -- with or without
+modification -- are permitted for any purpose, provided that
+redistributions in source form retain this entire copyright notice and
+indicate the origin and nature of any modifications.
+
+I'd appreciate being given credit for this package in the documentation
+of software which uses it, but that is not a requirement.
+
+THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+**********************************************************************
+
+PostgreSQL adopted the code out of Tcl 8.4.1.  Portions of regc_locale.c
+and re_syntax.n were developed by Tcl developers other than Henry; these
+files bear the Tcl copyright and license notice:
+
+**********************************************************************
+
+This software is copyrighted by the Regents of the University of
+California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+Corporation and other parties.  The following terms apply to all files
+associated with the software unless explicitly disclaimed in
+individual files.
+
+The authors hereby grant permission to use, copy, modify, distribute,
+and license this software and its documentation for any purpose, provided
+that existing copyright notices are retained in all copies and that this
+notice is included verbatim in any distributions. No written agreement,
+license, or royalty fee is required for any of the authorized uses.
+Modifications to this software may be copyrighted by their authors
+and need not follow the licensing terms described here, provided that
+the new terms are clearly indicated on the first page of each file where
+they apply.
+
+IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+
+THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+MODIFICATIONS.
+
+GOVERNMENT USE: If you are acquiring this software on behalf of the
+U.S. government, the Government shall have only "Restricted Rights"
+in the software and related documentation as defined in the Federal 
+Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+are acquiring the software on behalf of the Department of Defense, the
+software shall be classified as "Commercial Computer Software" and the
+Government shall have only "Restricted Rights" as defined in Clause
+252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+authors grant the U.S. Government and others acting in its behalf
+permission to use and distribute the software in accordance with the
+terms specified in this license. 
+
+**********************************************************************
+
+Subsequent modifications to the code by the PostgreSQL project follow
+the same license terms as the rest of PostgreSQL.
diff --git a/src/backend/regex/Makefile b/src/backend/regex/Makefile
index a5c1fc4337a..6635ec28f49 100644
--- a/src/backend/regex/Makefile
+++ b/src/backend/regex/Makefile
@@ -1,10 +1,10 @@
 #-------------------------------------------------------------------------
 #
 # Makefile--
-#    Makefile for regex
+#    Makefile for backend/regex
 #
 # IDENTIFICATION
-#    $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.19 2002/09/16 16:02:43 momjian Exp $
+#    $Header: /cvsroot/pgsql/src/backend/regex/Makefile,v 1.20 2003/02/05 17:41:32 tgl Exp $
 #
 #-------------------------------------------------------------------------
 
@@ -12,23 +12,17 @@ subdir = src/backend/regex
 top_builddir = ../../..
 include $(top_builddir)/src/Makefile.global
 
-override CPPFLAGS += -DPOSIX_MISTAKE 
-
 OBJS = regcomp.o regerror.o regexec.o regfree.o
-DEBUGOBJ += ../utils/mb/SUBSYS.o
 
 all: SUBSYS.o
 
 SUBSYS.o: $(OBJS)
 	$(LD) $(LDREL) $(LDOUT) SUBSYS.o $(OBJS)
 
-regexec.o: regexec.c engine.c
+# mark inclusion dependencies between .c files explicitly
+regcomp.o: regcomp.c regc_lex.c regc_color.c regc_nfa.c regc_cvec.c regc_locale.c
 
-# retest will not compile because multibyte is now enabled by default
-# and the multibyte calls require /mmgr, /adt, and other calls that
-# are complex for linkage,  bjm 2002-09-16
-#retest: retest.o SUBSYS.o $(DEBUGOBJ)
-#	$(CC) $(CFLAGS) $(LDFLAGS) $^ $(LIBS) -o $@
+regexec.o: regexec.c rege_dfa.c
 
 clean: 
-	rm -f SUBSYS.o $(OBJS) retest retest.o
+	rm -f SUBSYS.o $(OBJS)
diff --git a/src/backend/regex/WHATSNEW b/src/backend/regex/WHATSNEW
deleted file mode 100644
index f4301d300dd..00000000000
--- a/src/backend/regex/WHATSNEW
+++ /dev/null
@@ -1,94 +0,0 @@
-# @(#)WHATSNEW	8.3 (Berkeley) 3/18/94
-
-New in alpha3.4:  The complex bug alluded to below has been fixed (in a
-slightly kludgey temporary way that may hurt efficiency a bit; this is
-another "get it out the door for 4.4" release).  The tests at the end of
-the tests file have accordingly been uncommented.  The primary sign of
-the bug was that something like a?b matching ab matched b rather than ab.
-(The bug was essentially specific to this exact situation, else it would
-have shown up earlier.)
-
-New in alpha3.3:  The definition of word boundaries has been altered
-slightly, to more closely match the usual programming notion that "_"
-is an alphabetic.  Stuff used for pre-ANSI systems is now in a subdir,
-and the makefile no longer alludes to it in mysterious ways.  The
-makefile has generally been cleaned up some.  Fixes have been made
-(again!) so that the regression test will run without -DREDEBUG, at
-the cost of weaker checking.  A workaround for a bug in some folks'
-<assert.h> has been added.  And some more things have been added to
-tests, including a couple right at the end which are commented out
-because the code currently flunks them (complex bug; fix coming).
-Plus the usual minor cleanup.
-
-New in alpha3.2:  Assorted bits of cleanup and portability improvement
-(the development base is now a BSDI system using GCC instead of an ancient
-Sun system, and the newer compiler exposed some glitches).  Fix for a
-serious bug that affected REs using many [] (including REG_ICASE REs
-because of the way they are implemented), *sometimes*, depending on
-memory-allocation patterns.  The header-file prototypes no longer name
-the parameters, avoiding possible name conflicts.  The possibility that
-some clot has defined CHAR_MIN as (say) `-128' instead of `(-128)' is
-now handled gracefully.  "uchar" is no longer used as an internal type
-name (too many people have the same idea).  Still the same old lousy
-performance, alas.
-
-New in alpha3.1:  Basically nothing, this release is just a bookkeeping
-convenience.  Stay tuned.
-
-New in alpha3.0:  Performance is no better, alas, but some fixes have been
-made and some functionality has been added.  (This is basically the "get
-it out the door in time for 4.4" release.)  One bug fix:  regfree() didn't
-free the main internal structure (how embarrassing).  It is now possible
-to put NULs in either the RE or the target string, using (resp.) a new
-REG_PEND flag and the old REG_STARTEND flag.  The REG_NOSPEC flag to
-regcomp() makes all characters ordinary, so you can match a literal
-string easily (this will become more useful when performance improves!).
-There are now primitives to match beginnings and ends of words, although
-the syntax is disgusting and so is the implementation.  The REG_ATOI
-debugging interface has changed a bit.  And there has been considerable
-internal cleanup of various kinds.
-
-New in alpha2.3:  Split change list out of README, and moved flags notes
-into Makefile.  Macro-ized the name of regex(7) in regex(3), since it has
-to change for 4.4BSD.  Cleanup work in engine.c, and some new regression
-tests to catch tricky cases thereof.
-
-New in alpha2.2:  Out-of-date manpages updated.  Regerror() acquires two
-small extensions -- REG_ITOA and REG_ATOI -- which avoid debugging kludges
-in my own test program and might be useful to others for similar purposes.
-The regression test will now compile (and run) without REDEBUG.  The
-BRE \$ bug is fixed.  Most uses of "uchar" are gone; it's all chars now.
-Char/uchar parameters are now written int/unsigned, to avoid possible
-portability problems with unpromoted parameters.  Some unsigned casts have
-been introduced to minimize portability problems with shifting into sign
-bits.
-
-New in alpha2.1:  Lots of little stuff, cleanup and fixes.  The one big
-thing is that regex.h is now generated, using mkh, rather than being
-supplied in the distribution; due to circularities in dependencies,
-you have to build regex.h explicitly by "make h".  The two known bugs
-have been fixed (and the regression test now checks for them), as has a
-problem with assertions not being suppressed in the absence of REDEBUG.
-No performance work yet.
-
-New in alpha2:  Backslash-anything is an ordinary character, not an
-error (except, of course, for the handful of backslashed metacharacters
-in BREs), which should reduce script breakage.  The regression test
-checks *where* null strings are supposed to match, and has generally
-been tightened up somewhat.  Small bug fixes in parameter passing (not
-harmful, but technically errors) and some other areas.  Debugging
-invoked by defining REDEBUG rather than not defining NDEBUG.
-
-New in alpha+3:  full prototyping for internal routines, using a little
-helper program, mkh, which extracts prototypes given in stylized comments.
-More minor cleanup.  Buglet fix:  it's CHAR_BIT, not CHAR_BITS.  Simple
-pre-screening of input when a literal string is known to be part of the
-RE; this does wonders for performance.
-
-New in alpha+2:  minor bits of cleanup.  Notably, the number "32" for the
-word width isn't hardwired into regexec.c any more, the public header
-file prototypes the functions if __STDC__ is defined, and some small typos
-in the manpages have been fixed.
-
-New in alpha+1:  improvements to the manual pages, and an important
-extension, the REG_STARTEND option to regexec().
diff --git a/src/backend/regex/engine.c b/src/backend/regex/engine.c
deleted file mode 100644
index 4a64ec8c992..00000000000
--- a/src/backend/regex/engine.c
+++ /dev/null
@@ -1,1093 +0,0 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *		@(#)engine.c	8.5 (Berkeley) 3/20/94
- */
-
-#include "postgres.h"
-
-/*
- * The matching engine and friends.  This file is #included by regexec.c
- * after suitable #defines of a variety of macros used herein, so that
- * different state representations can be used without duplicating masses
- * of code.
- */
-
-#ifdef SNAMES
-#define matcher smatcher
-#define fast	sfast
-#define slow	sslow
-#define dissect sdissect
-#define backref sbackref
-#define step	sstep
-#define print	sprint
-#define at		sat
-#define match	smat
-#endif
-#ifdef LNAMES
-#define matcher lmatcher
-#define fast	lfast
-#define slow	lslow
-#define dissect ldissect
-#define backref lbackref
-#define step	lstep
-#define print	lprint
-#define at		lat
-#define match	lmat
-#endif
-
-/* another structure passed up and down to avoid zillions of parameters */
-struct match
-{
-	struct re_guts *g;
-	int			eflags;
-	regmatch_t *pmatch;			/* [nsub+1] (0 element unused) */
-	pg_wchar   *offp;			/* offsets work from here */
-	pg_wchar   *beginp;			/* start of string -- virtual NUL precedes */
-	pg_wchar   *endp;			/* end of string -- virtual NUL here */
-	pg_wchar   *coldp;			/* can be no match starting before here */
-	pg_wchar  **lastpos;		/* [nplus+1] */
-				STATEVARS;
-	states		st;				/* current states */
-	states		fresh;			/* states for a fresh start */
-	states		tmp;			/* temporary */
-	states		empty;			/* empty set of states */
-};
-
-static int matcher(struct re_guts * g, pg_wchar *string, size_t nmatch,
-		regmatch_t *pmatch, int eflags);
-static pg_wchar *dissect(struct match * m, pg_wchar *start, pg_wchar *stop,
-		sopno startst, sopno stopst);
-static pg_wchar *backref(struct match * m, pg_wchar *start, pg_wchar *stop,
-		sopno startst, sopno stopst, sopno lev);
-static pg_wchar *fast(struct match * m, pg_wchar *start, pg_wchar *stop,
-	 sopno startst, sopno stopst);
-static pg_wchar *slow(struct match * m, pg_wchar *start, pg_wchar *stop,
-	 sopno startst, sopno stopst);
-static states step(struct re_guts * g, sopno start,
-	 sopno stop, states bef, int ch, states aft);
-
-#define BOL		(OUT+1)
-#define EOL		(BOL+1)
-#define BOLEOL	(BOL+2)
-#define NOTHING (BOL+3)
-#define BOW		(BOL+4)
-#define EOW		(BOL+5)
-#define CODEMAX (BOL+5)			/* highest code used */
-
-#define NONCHAR(c)	  ((c) > 16777216)	/* 16777216 == 2^24 == 3 bytes */
-#define NNONCHAR  (CODEMAX-16777216)
-
-#ifdef REDEBUG
-static void print(struct match * m, pg_wchar *caption, states st, int ch,
-	  FILE *d);
-static void at(struct match * m, pg_wchar *title, pg_wchar *start,
-   pg_wchar *stop, sopno startst, sopno stopst);
-static pg_wchar *pchar(int ch);
-static int	pg_isprint(int c);
-#endif
-
-#ifdef REDEBUG
-#define SP(t, s, c)		print(m, t, s, c, stdout)
-#define AT(t, p1, p2, s1, s2)	at(m, t, p1, p2, s1, s2)
-#define NOTE(str) \
-do { \
-	if (m->eflags&REG_TRACE) \
-		printf("=%s\n", (str)); \
-} while (0)
-
-#else
-#define SP(t, s, c)				/* nothing */
-#define AT(t, p1, p2, s1, s2)	/* nothing */
-#define NOTE(s)					/* nothing */
-#endif
-
-/*
- * matcher - the actual matching engine
- */
-static int						/* 0 success, REG_NOMATCH failure */
-matcher(struct re_guts * g, pg_wchar *string, size_t nmatch,
-		regmatch_t *pmatch, int eflags)
-{
-	pg_wchar   *endp;
-	int			i;
-	struct match mv;
-	struct match *m = &mv;
-	pg_wchar   *dp;
-	const sopno gf = g->firststate + 1; /* +1 for OEND */
-	const sopno gl = g->laststate;
-	pg_wchar   *start;
-	pg_wchar   *stop;
-
-	/* simplify the situation where possible */
-	if (g->cflags & REG_NOSUB)
-		nmatch = 0;
-	if (eflags & REG_STARTEND)
-	{
-		start = string + pmatch[0].rm_so;
-		stop = string + pmatch[0].rm_eo;
-	}
-	else
-	{
-		start = string;
-		stop = start + pg_wchar_strlen(start);
-	}
-	if (stop < start)
-		return REG_INVARG;
-
-	/* prescreening; this does wonders for this rather slow code */
-	if (g->must != NULL)
-	{
-		for (dp = start; dp < stop; dp++)
-			if (*dp == g->must[0] && stop - dp >= g->mlen &&
-				memcmp(dp, g->must, (size_t) (g->mlen * sizeof(pg_wchar))) == 0
-				)
-				break;
-		if (dp == stop)			/* we didn't find g->must */
-			return REG_NOMATCH;
-	}
-
-	/* match struct setup */
-	m->g = g;
-	m->eflags = eflags;
-	m->pmatch = NULL;
-	m->lastpos = NULL;
-	m->offp = string;
-	m->beginp = start;
-	m->endp = stop;
-	STATESETUP(m, 4);
-	SETUP(m->st);
-	SETUP(m->fresh);
-	SETUP(m->tmp);
-	SETUP(m->empty);
-	CLEAR(m->empty);
-
-	/* this loop does only one repetition except for backrefs */
-	for (;;)
-	{
-		endp = fast(m, start, stop, gf, gl);
-		if (endp == NULL)
-		{						/* a miss */
-			STATETEARDOWN(m);
-			return REG_NOMATCH;
-		}
-		if (nmatch == 0 && !g->backrefs)
-			break;				/* no further info needed */
-
-		/* where? */
-		assert(m->coldp != NULL);
-		for (;;)
-		{
-			NOTE("finding start");
-			endp = slow(m, m->coldp, stop, gf, gl);
-			if (endp != NULL)
-				break;
-			assert(m->coldp < m->endp);
-			m->coldp++;
-		}
-		if (nmatch == 1 && !g->backrefs)
-			break;				/* no further info needed */
-
-		/* oh my, he wants the subexpressions... */
-		if (m->pmatch == NULL)
-			m->pmatch = (regmatch_t *) malloc((m->g->nsub + 1) *
-											  sizeof(regmatch_t));
-		if (m->pmatch == NULL)
-		{
-			STATETEARDOWN(m);
-			return REG_ESPACE;
-		}
-		for (i = 1; i <= m->g->nsub; i++)
-			m->pmatch[i].rm_so = m->pmatch[i].rm_eo = -1;
-		if (!g->backrefs && !(m->eflags & REG_BACKR))
-		{
-			NOTE("dissecting");
-			dp = dissect(m, m->coldp, endp, gf, gl);
-		}
-		else
-		{
-			if (g->nplus > 0 && m->lastpos == NULL)
-				m->lastpos = (pg_wchar **) malloc((g->nplus + 1) *
-												  sizeof(pg_wchar *));
-			if (g->nplus > 0 && m->lastpos == NULL)
-			{
-				free(m->pmatch);
-				STATETEARDOWN(m);
-				return REG_ESPACE;
-			}
-			NOTE("backref dissect");
-			dp = backref(m, m->coldp, endp, gf, gl, (sopno) 0);
-		}
-		if (dp != NULL)
-			break;
-
-		/* uh-oh... we couldn't find a subexpression-level match */
-		assert(g->backrefs);	/* must be back references doing it */
-		assert(g->nplus == 0 || m->lastpos != NULL);
-		for (;;)
-		{
-			if (dp != NULL || endp <= m->coldp)
-				break;			/* defeat */
-			NOTE("backoff");
-			endp = slow(m, m->coldp, endp - 1, gf, gl);
-			if (endp == NULL)
-				break;			/* defeat */
-			/* try it on a shorter possibility */
-#ifndef NDEBUG
-			for (i = 1; i <= m->g->nsub; i++)
-			{
-				assert(m->pmatch[i].rm_so == -1);
-				assert(m->pmatch[i].rm_eo == -1);
-			}
-#endif
-			NOTE("backoff dissect");
-			dp = backref(m, m->coldp, endp, gf, gl, (sopno) 0);
-		}
-		assert(dp == NULL || dp == endp);
-		if (dp != NULL)			/* found a shorter one */
-			break;
-
-		/* despite initial appearances, there is no match here */
-		NOTE("false alarm");
-		start = m->coldp + 1;	/* recycle starting later */
-		assert(start <= stop);
-	}
-
-	/* fill in the details if requested */
-	if (nmatch > 0)
-	{
-		pmatch[0].rm_so = m->coldp - m->offp;
-		pmatch[0].rm_eo = endp - m->offp;
-	}
-	if (nmatch > 1)
-	{
-		assert(m->pmatch != NULL);
-		for (i = 1; i < nmatch; i++)
-			if (i <= m->g->nsub)
-				pmatch[i] = m->pmatch[i];
-			else
-			{
-				pmatch[i].rm_so = -1;
-				pmatch[i].rm_eo = -1;
-			}
-	}
-
-	if (m->pmatch != NULL)
-		free((pg_wchar *) m->pmatch);
-	if (m->lastpos != NULL)
-		free((pg_wchar *) m->lastpos);
-	STATETEARDOWN(m);
-	return 0;
-}
-
-/*
- * dissect - figure out what matched what, no back references
- */
-static pg_wchar *				/* == stop (success) always */
-dissect(struct match * m, pg_wchar *start, pg_wchar *stop,
-		sopno startst, sopno stopst)
-{
-	int			i;
-	sopno		ss;				/* start sop of current subRE */
-	sopno		es;				/* end sop of current subRE */
-	pg_wchar   *sp;				/* start of string matched by it */
-	pg_wchar   *stp;			/* string matched by it cannot pass here */
-	pg_wchar   *rest;			/* start of rest of string */
-	pg_wchar   *tail;			/* string unmatched by rest of RE */
-	sopno		ssub;			/* start sop of subsubRE */
-	sopno		esub;			/* end sop of subsubRE */
-	pg_wchar   *ssp;			/* start of string matched by subsubRE */
-	pg_wchar   *sep;			/* end of string matched by subsubRE */
-	pg_wchar   *oldssp;			/* previous ssp */
-	pg_wchar   *dp;
-
-	AT("diss", start, stop, startst, stopst);
-	sp = start;
-	for (ss = startst; ss < stopst; ss = es)
-	{
-		/* identify end of subRE */
-		es = ss;
-		switch (OP(m->g->strip[es]))
-		{
-			case OPLUS_:
-			case OQUEST_:
-				es += OPND(m->g->strip[es]);
-				break;
-			case OCH_:
-				while (OP(m->g->strip[es]) != O_CH)
-					es += OPND(m->g->strip[es]);
-				break;
-		}
-		es++;
-
-		/* figure out what it matched */
-		switch (OP(m->g->strip[ss]))
-		{
-			case OEND:
-				assert(nope);
-				break;
-			case OCHAR:
-				sp++;
-				break;
-			case OBOL:
-			case OEOL:
-			case OBOW:
-			case OEOW:
-				break;
-			case OANY:
-			case OANYOF:
-				sp++;
-				break;
-			case OBACK_:
-			case O_BACK:
-				assert(nope);
-				break;
-				/* cases where length of match is hard to find */
-			case OQUEST_:
-				stp = stop;
-				for (;;)
-				{
-					/* how long could this one be? */
-					rest = slow(m, sp, stp, ss, es);
-					assert(rest != NULL);		/* it did match */
-					/* could the rest match the rest? */
-					tail = slow(m, rest, stop, es, stopst);
-					if (tail == stop)
-						break;	/* yes! */
-					/* no -- try a shorter match for this one */
-					stp = rest - 1;
-					assert(stp >= sp);	/* it did work */
-				}
-				ssub = ss + 1;
-				esub = es - 1;
-				/* did innards match? */
-				if (slow(m, sp, rest, ssub, esub) != NULL)
-				{
-					dp = dissect(m, sp, rest, ssub, esub);
-					assert(dp == rest);
-				}
-				else
-/* no */
-					assert(sp == rest);
-				sp = rest;
-				break;
-			case OPLUS_:
-				stp = stop;
-				for (;;)
-				{
-					/* how long could this one be? */
-					rest = slow(m, sp, stp, ss, es);
-					assert(rest != NULL);		/* it did match */
-					/* could the rest match the rest? */
-					tail = slow(m, rest, stop, es, stopst);
-					if (tail == stop)
-						break;	/* yes! */
-					/* no -- try a shorter match for this one */
-					stp = rest - 1;
-					assert(stp >= sp);	/* it did work */
-				}
-				ssub = ss + 1;
-				esub = es - 1;
-				ssp = sp;
-				oldssp = ssp;
-				for (;;)
-				{				/* find last match of innards */
-					sep = slow(m, ssp, rest, ssub, esub);
-					if (sep == NULL || sep == ssp)
-						break;	/* failed or matched null */
-					oldssp = ssp;		/* on to next try */
-					ssp = sep;
-				}
-				if (sep == NULL)
-				{
-					/* last successful match */
-					sep = ssp;
-					ssp = oldssp;
-				}
-				assert(sep == rest);	/* must exhaust substring */
-				assert(slow(m, ssp, sep, ssub, esub) == rest);
-				dp = dissect(m, ssp, sep, ssub, esub);
-				assert(dp == sep);
-				sp = rest;
-				break;
-			case OCH_:
-				stp = stop;
-				for (;;)
-				{
-					/* how long could this one be? */
-					rest = slow(m, sp, stp, ss, es);
-					assert(rest != NULL);		/* it did match */
-					/* could the rest match the rest? */
-					tail = slow(m, rest, stop, es, stopst);
-					if (tail == stop)
-						break;	/* yes! */
-					/* no -- try a shorter match for this one */
-					stp = rest - 1;
-					assert(stp >= sp);	/* it did work */
-				}
-				ssub = ss + 1;
-				esub = ss + OPND(m->g->strip[ss]) - 1;
-				assert(OP(m->g->strip[esub]) == OOR1);
-				for (;;)
-				{				/* find first matching branch */
-					if (slow(m, sp, rest, ssub, esub) == rest)
-						break;	/* it matched all of it */
-					/* that one missed, try next one */
-					assert(OP(m->g->strip[esub]) == OOR1);
-					esub++;
-					assert(OP(m->g->strip[esub]) == OOR2);
-					ssub = esub + 1;
-					esub += OPND(m->g->strip[esub]);
-					if (OP(m->g->strip[esub]) == OOR2)
-						esub--;
-					else
-						assert(OP(m->g->strip[esub]) == O_CH);
-				}
-				dp = dissect(m, sp, rest, ssub, esub);
-				assert(dp == rest);
-				sp = rest;
-				break;
-			case O_PLUS:
-			case O_QUEST:
-			case OOR1:
-			case OOR2:
-			case O_CH:
-				assert(nope);
-				break;
-			case OLPAREN:
-				i = OPND(m->g->strip[ss]);
-				assert(0 < i && i <= m->g->nsub);
-				m->pmatch[i].rm_so = sp - m->offp;
-				break;
-			case ORPAREN:
-				i = OPND(m->g->strip[ss]);
-				assert(0 < i && i <= m->g->nsub);
-				m->pmatch[i].rm_eo = sp - m->offp;
-				break;
-			default:			/* uh oh */
-				assert(nope);
-				break;
-		}
-	}
-
-	assert(sp == stop);
-	return sp;
-}
-
-/*
- * backref - figure out what matched what, figuring in back references
- *
- * lev is PLUS nesting level
- */
-static pg_wchar *				/* == stop (success) or NULL (failure) */
-backref(struct match * m, pg_wchar *start, pg_wchar *stop,
-		sopno startst, sopno stopst, sopno lev)
-{
-	int			i;
-	sopno		ss;				/* start sop of current subRE */
-	pg_wchar   *sp;				/* start of string matched by it */
-	sopno		ssub;			/* start sop of subsubRE */
-	sopno		esub;			/* end sop of subsubRE */
-	pg_wchar   *ssp;			/* start of string matched by subsubRE */
-	pg_wchar   *dp;
-	size_t		len;
-	int			hard;
-	sop			s;
-	regoff_t	offsave;
-	cset	   *cs;
-
-	AT("back", start, stop, startst, stopst);
-	sp = start;
-
-	/* get as far as we can with easy stuff */
-	hard = 0;
-	for (ss = startst; !hard && ss < stopst; ss++)
-		switch (OP(s = m->g->strip[ss]))
-		{
-			case OCHAR:
-				if (sp == stop || *sp++ != (pg_wchar) OPND(s))
-					return NULL;
-				break;
-			case OANY:
-				if (sp == stop)
-					return NULL;
-				sp++;
-				break;
-			case OANYOF:
-				cs = &m->g->sets[OPND(s)];
-				if (sp == stop || !CHIN(cs, *sp++))
-					return NULL;
-				break;
-			case OBOL:
-				if ((sp == m->beginp && !(m->eflags & REG_NOTBOL)) ||
-					(sp < m->endp && *(sp - 1) == '\n' &&
-					 (m->g->cflags & REG_NEWLINE)))
-				{				/* yes */
-				}
-				else
-					return NULL;
-				break;
-			case OEOL:
-				if ((sp == m->endp && !(m->eflags & REG_NOTEOL)) ||
-					(sp < m->endp && *sp == '\n' &&
-					 (m->g->cflags & REG_NEWLINE)))
-				{				/* yes */
-				}
-				else
-					return NULL;
-				break;
-			case OBOW:
-				if (((sp == m->beginp && !(m->eflags & REG_NOTBOL)) ||
-					 (sp < m->endp && *(sp - 1) == '\n' &&
-					  (m->g->cflags & REG_NEWLINE)) ||
-					 (sp > m->beginp &&
-					  !ISWORD(*(sp - 1)))) &&
-					(sp < m->endp && ISWORD(*sp)))
-				{				/* yes */
-				}
-				else
-					return NULL;
-				break;
-			case OEOW:
-				if (((sp == m->endp && !(m->eflags & REG_NOTEOL)) ||
-					 (sp < m->endp && *sp == '\n' &&
-					  (m->g->cflags & REG_NEWLINE)) ||
-					 (sp < m->endp && !ISWORD(*sp))) &&
-					(sp > m->beginp && ISWORD(*(sp - 1))))
-				{				/* yes */
-				}
-				else
-					return NULL;
-				break;
-			case O_QUEST:
-				break;
-			case OOR1:			/* matches null but needs to skip */
-				ss++;
-				s = m->g->strip[ss];
-				do
-				{
-					assert(OP(s) == OOR2);
-					ss += OPND(s);
-				} while (OP(s = m->g->strip[ss]) != O_CH);
-				/* note that the ss++ gets us past the O_CH */
-				break;
-			default:			/* have to make a choice */
-				hard = 1;
-				break;
-		}
-	if (!hard)
-	{							/* that was it! */
-		if (sp != stop)
-			return NULL;
-		return sp;
-	}
-	ss--;						/* adjust for the for's final increment */
-
-	/* the hard stuff */
-	AT("hard", sp, stop, ss, stopst);
-	s = m->g->strip[ss];
-	switch (OP(s))
-	{
-		case OBACK_:			/* the vilest depths */
-			i = OPND(s);
-			assert(0 < i && i <= m->g->nsub);
-			if (m->pmatch[i].rm_eo == -1)
-				return NULL;
-			assert(m->pmatch[i].rm_so != -1);
-			len = m->pmatch[i].rm_eo - m->pmatch[i].rm_so;
-			assert(stop - m->beginp >= len);
-			if (sp > stop - len)
-				return NULL;	/* not enough left to match */
-			ssp = m->offp + m->pmatch[i].rm_so;
-			if (memcmp(sp, ssp, len) != 0)
-				return NULL;
-			while (m->g->strip[ss] != SOP(O_BACK, i))
-				ss++;
-			return backref(m, sp + len, stop, ss + 1, stopst, lev);
-			break;
-		case OQUEST_:			/* to null or not */
-			dp = backref(m, sp, stop, ss + 1, stopst, lev);
-			if (dp != NULL)
-				return dp;		/* not */
-			return backref(m, sp, stop, ss + OPND(s) + 1, stopst, lev);
-			break;
-		case OPLUS_:
-			assert(m->lastpos != NULL);
-			assert(lev + 1 <= m->g->nplus);
-			m->lastpos[lev + 1] = sp;
-			return backref(m, sp, stop, ss + 1, stopst, lev + 1);
-			break;
-		case O_PLUS:
-			if (sp == m->lastpos[lev])	/* last pass matched null */
-				return backref(m, sp, stop, ss + 1, stopst, lev - 1);
-			/* try another pass */
-			m->lastpos[lev] = sp;
-			dp = backref(m, sp, stop, ss - OPND(s) + 1, stopst, lev);
-			if (dp == NULL)
-				return backref(m, sp, stop, ss + 1, stopst, lev - 1);
-			else
-				return dp;
-			break;
-		case OCH_:				/* find the right one, if any */
-			ssub = ss + 1;
-			esub = ss + OPND(s) - 1;
-			assert(OP(m->g->strip[esub]) == OOR1);
-			for (;;)
-			{					/* find first matching branch */
-				dp = backref(m, sp, stop, ssub, esub, lev);
-				if (dp != NULL)
-					return dp;
-				/* that one missed, try next one */
-				if (OP(m->g->strip[esub]) == O_CH)
-					return NULL;	/* there is none */
-				esub++;
-				assert(OP(m->g->strip[esub]) == OOR2);
-				ssub = esub + 1;
-				esub += OPND(m->g->strip[esub]);
-				if (OP(m->g->strip[esub]) == OOR2)
-					esub--;
-				else
-					assert(OP(m->g->strip[esub]) == O_CH);
-			}
-			break;
-		case OLPAREN:			/* must undo assignment if rest fails */
-			i = OPND(s);
-			assert(0 < i && i <= m->g->nsub);
-			offsave = m->pmatch[i].rm_so;
-			m->pmatch[i].rm_so = sp - m->offp;
-			dp = backref(m, sp, stop, ss + 1, stopst, lev);
-			if (dp != NULL)
-				return dp;
-			m->pmatch[i].rm_so = offsave;
-			return NULL;
-			break;
-		case ORPAREN:			/* must undo assignment if rest fails */
-			i = OPND(s);
-			assert(0 < i && i <= m->g->nsub);
-			offsave = m->pmatch[i].rm_eo;
-			m->pmatch[i].rm_eo = sp - m->offp;
-			dp = backref(m, sp, stop, ss + 1, stopst, lev);
-			if (dp != NULL)
-				return dp;
-			m->pmatch[i].rm_eo = offsave;
-			return NULL;
-			break;
-		default:				/* uh oh */
-			assert(nope);
-			break;
-	}
-
-	/* "can't happen" */
-	assert(nope);
-	/* NOTREACHED */
-	return 0;
-}
-
-/*
- * fast - step through the string at top speed
- */
-static pg_wchar *				/* where tentative match ended, or NULL */
-fast(struct match * m, pg_wchar *start, pg_wchar *stop,
-	 sopno startst, sopno stopst)
-{
-	states		st = m->st;
-	states		fresh = m->fresh;
-	states		tmp = m->tmp;
-	pg_wchar   *p = start;
-	int			c = (start == m->beginp) ? OUT : *(start - 1);
-	int			lastc;			/* previous c */
-	int			flagch;
-	int			i;
-	pg_wchar   *coldp;			/* last p after which no match was
-								 * underway */
-
-	CLEAR(st);
-	SET1(st, startst);
-	st = step(m->g, startst, stopst, st, NOTHING, st);
-	ASSIGN(fresh, st);
-	SP("start", st, *p);
-	coldp = NULL;
-	for (;;)
-	{
-		/* next character */
-		lastc = c;
-		c = (p == m->endp) ? OUT : *p;
-		if (EQ(st, fresh))
-			coldp = p;
-
-		/* is there an EOL and/or BOL between lastc and c? */
-		flagch = '\0';
-		i = 0;
-		if ((lastc == '\n' && m->g->cflags & REG_NEWLINE) ||
-			(lastc == OUT && !(m->eflags & REG_NOTBOL)))
-		{
-			flagch = BOL;
-			i = m->g->nbol;
-		}
-		if ((c == '\n' && m->g->cflags & REG_NEWLINE) ||
-			(c == OUT && !(m->eflags & REG_NOTEOL)))
-		{
-			flagch = (flagch == BOL) ? BOLEOL : EOL;
-			i += m->g->neol;
-		}
-		if (i != 0)
-		{
-			for (; i > 0; i--)
-				st = step(m->g, startst, stopst, st, flagch, st);
-			SP("boleol", st, c);
-		}
-
-		/* how about a word boundary? */
-		if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
-			(c != OUT && ISWORD(c)))
-			flagch = BOW;
-		if ((lastc != OUT && ISWORD(lastc)) &&
-			(flagch == EOL || (c != OUT && !ISWORD(c))))
-			flagch = EOW;
-		if (flagch == BOW || flagch == EOW)
-		{
-			st = step(m->g, startst, stopst, st, flagch, st);
-			SP("boweow", st, c);
-		}
-
-		/* are we done? */
-		if (ISSET(st, stopst) || p == stop)
-			break;				/* NOTE BREAK OUT */
-
-		/* no, we must deal with this character */
-		ASSIGN(tmp, st);
-		ASSIGN(st, fresh);
-		assert(c != OUT);
-		st = step(m->g, startst, stopst, tmp, c, st);
-		SP("aft", st, c);
-		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
-		p++;
-	}
-
-	assert(coldp != NULL);
-	m->coldp = coldp;
-	if (ISSET(st, stopst))
-		return p + 1;
-	else
-		return NULL;
-}
-
-/*
- * slow - step through the string more deliberately
- */
-static pg_wchar *				/* where it ended */
-slow(struct match * m, pg_wchar *start, pg_wchar *stop,
-	 sopno startst, sopno stopst)
-{
-	states		st = m->st;
-	states		empty = m->empty;
-	states		tmp = m->tmp;
-	pg_wchar   *p = start;
-	int			c = (start == m->beginp) ? OUT : *(start - 1);
-	int			lastc;			/* previous c */
-	int			flagch;
-	int			i;
-	pg_wchar   *matchp;			/* last p at which a match ended */
-
-	AT("slow", start, stop, startst, stopst);
-	CLEAR(st);
-	SET1(st, startst);
-	SP("sstart", st, *p);
-	st = step(m->g, startst, stopst, st, NOTHING, st);
-	matchp = NULL;
-	for (;;)
-	{
-		/* next character */
-		lastc = c;
-		c = (p == m->endp) ? OUT : *p;
-
-		/* is there an EOL and/or BOL between lastc and c? */
-		flagch = '\0';
-		i = 0;
-		if ((lastc == '\n' && m->g->cflags & REG_NEWLINE) ||
-			(lastc == OUT && !(m->eflags & REG_NOTBOL)))
-		{
-			flagch = BOL;
-			i = m->g->nbol;
-		}
-		if ((c == '\n' && m->g->cflags & REG_NEWLINE) ||
-			(c == OUT && !(m->eflags & REG_NOTEOL)))
-		{
-			flagch = (flagch == BOL) ? BOLEOL : EOL;
-			i += m->g->neol;
-		}
-		if (i != 0)
-		{
-			for (; i > 0; i--)
-				st = step(m->g, startst, stopst, st, flagch, st);
-			SP("sboleol", st, c);
-		}
-
-		/* how about a word boundary? */
-		if ((flagch == BOL || (lastc != OUT && !ISWORD(lastc))) &&
-			(c != OUT && ISWORD(c)))
-			flagch = BOW;
-		if ((lastc != OUT && ISWORD(lastc)) &&
-			(flagch == EOL || (c != OUT && !ISWORD(c))))
-			flagch = EOW;
-		if (flagch == BOW || flagch == EOW)
-		{
-			st = step(m->g, startst, stopst, st, flagch, st);
-			SP("sboweow", st, c);
-		}
-
-		/* are we done? */
-		if (ISSET(st, stopst))
-			matchp = p;
-		if (EQ(st, empty) || p == stop)
-			break;				/* NOTE BREAK OUT */
-
-		/* no, we must deal with this character */
-		ASSIGN(tmp, st);
-		ASSIGN(st, empty);
-		assert(c != OUT);
-		st = step(m->g, startst, stopst, tmp, c, st);
-		SP("saft", st, c);
-		assert(EQ(step(m->g, startst, stopst, st, NOTHING, st), st));
-		p++;
-	}
-
-	return matchp;
-}
-
-
-/*
- * step - map set of states reachable before char to set reachable after
- */
-static states
-step(struct re_guts * g,
-	 sopno start,				/* start state within strip */
-	 sopno stop,				/* state after stop state within strip */
-	 states bef,				/* states reachable before */
-	 int ch,					/* character or NONCHAR code */
-	 states aft)				/* states already known reachable after */
-{
-	cset	   *cs;
-	sop			s;
-	sopno		pc;
-	onestate	here;			/* note, macros know this name */
-	sopno		look;
-	int			i;
-
-	for (pc = start, INIT(here, pc); pc != stop; pc++, INC(here))
-	{
-		s = g->strip[pc];
-		switch (OP(s))
-		{
-			case OEND:
-				assert(pc == stop - 1);
-				break;
-			case OCHAR:
-				/* only characters can match */
-				assert(!NONCHAR(ch) || ch != (pg_wchar) OPND(s));
-				if (ch == (pg_wchar) OPND(s))
-					FWD(aft, bef, 1);
-				break;
-			case OBOL:
-				if (ch == BOL || ch == BOLEOL)
-					FWD(aft, bef, 1);
-				break;
-			case OEOL:
-				if (ch == EOL || ch == BOLEOL)
-					FWD(aft, bef, 1);
-				break;
-			case OBOW:
-				if (ch == BOW)
-					FWD(aft, bef, 1);
-				break;
-			case OEOW:
-				if (ch == EOW)
-					FWD(aft, bef, 1);
-				break;
-			case OANY:
-				if (!NONCHAR(ch))
-					FWD(aft, bef, 1);
-				break;
-			case OANYOF:
-				cs = &g->sets[OPND(s)];
-				if (!NONCHAR(ch) && CHIN(cs, ch))
-					FWD(aft, bef, 1);
-				break;
-			case OBACK_:		/* ignored here */
-			case O_BACK:
-				FWD(aft, aft, 1);
-				break;
-			case OPLUS_:		/* forward, this is just an empty */
-				FWD(aft, aft, 1);
-				break;
-			case O_PLUS:		/* both forward and back */
-				FWD(aft, aft, 1);
-				i = ISSETBACK(aft, OPND(s));
-				BACK(aft, aft, OPND(s));
-				if (!i && ISSETBACK(aft, OPND(s)))
-				{
-					/* oho, must reconsider loop body */
-					pc -= OPND(s) + 1;
-					INIT(here, pc);
-				}
-				break;
-			case OQUEST_:		/* two branches, both forward */
-				FWD(aft, aft, 1);
-				FWD(aft, aft, OPND(s));
-				break;
-			case O_QUEST:		/* just an empty */
-				FWD(aft, aft, 1);
-				break;
-			case OLPAREN:		/* not significant here */
-			case ORPAREN:
-				FWD(aft, aft, 1);
-				break;
-			case OCH_:			/* mark the first two branches */
-				FWD(aft, aft, 1);
-				assert(OP(g->strip[pc + OPND(s)]) == OOR2);
-				FWD(aft, aft, OPND(s));
-				break;
-			case OOR1:			/* done a branch, find the O_CH */
-				if (ISSTATEIN(aft, here))
-				{
-					for (look = 1;
-						 OP(s = g->strip[pc + look]) != O_CH;
-						 look += OPND(s))
-						assert(OP(s) == OOR2);
-					FWD(aft, aft, look);
-				}
-				break;
-			case OOR2:			/* propagate OCH_'s marking */
-				FWD(aft, aft, 1);
-				if (OP(g->strip[pc + OPND(s)]) != O_CH)
-				{
-					assert(OP(g->strip[pc + OPND(s)]) == OOR2);
-					FWD(aft, aft, OPND(s));
-				}
-				break;
-			case O_CH:			/* just empty */
-				FWD(aft, aft, 1);
-				break;
-			default:			/* ooooops... */
-				assert(nope);
-				break;
-		}
-	}
-
-	return aft;
-}
-
-#ifdef REDEBUG
-/*
- * print - print a set of states
- */
-static void
-print(struct match * m, pg_wchar *caption, states st,
-	  int ch, FILE *d)
-{
-	struct re_guts *g = m->g;
-	int			i;
-	int			first = 1;
-
-	if (!(m->eflags & REG_TRACE))
-		return;
-
-	fprintf(d, "%s", caption);
-	if (ch != '\0')
-		fprintf(d, " %s", pchar(ch));
-	for (i = 0; i < g->nstates; i++)
-		if (ISSET(st, i))
-		{
-			fprintf(d, "%s%d", (first) ? "\t" : ", ", i);
-			first = 0;
-		}
-	fprintf(d, "\n");
-}
-
-/*
- * at - print current situation
- */
-static void
-at(struct match * m, pg_wchar *title, pg_wchar *start, pg_wchar *stop,
-   sopno startst, sopno stopst)
-{
-	if (!(m->eflags & REG_TRACE))
-		return;
-
-	printf("%s %s-", title, pchar(*start));
-	printf("%s ", pchar(*stop));
-	printf("%ld-%ld\n", (long) startst, (long) stopst);
-}
-
-#ifndef PCHARDONE
-#define PCHARDONE				/* only do this once */
-/*
- * pchar - make a character printable
- *
- * Is this identical to regchar() over in debug.c?	Well, yes.	But a
- * duplicate here avoids having a debugging-capable regexec.o tied to
- * a matching debug.o, and this is convenient.	It all disappears in
- * the non-debug compilation anyway, so it doesn't matter much.
- */
-static pg_wchar *				/* -> representation */
-pchar(int ch)
-{
-	static pg_wchar pbuf[10];
-
-	if (pg_isprint(ch) || ch == ' ')
-		sprintf(pbuf, "%c", ch);
-	else
-		sprintf(pbuf, "\\%o", ch);
-	return pbuf;
-}
-
-static int
-pg_isprint(int c)
-{
-	return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
-}
-#endif
-#endif
-
-#undef	matcher
-#undef	fast
-#undef	slow
-#undef	dissect
-#undef	backref
-#undef	step
-#undef	print
-#undef	at
-#undef	match
diff --git a/src/backend/regex/re_format.7 b/src/backend/regex/re_format.7
deleted file mode 100644
index db2f6349c45..00000000000
--- a/src/backend/regex/re_format.7
+++ /dev/null
@@ -1,269 +0,0 @@
-.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
-.\" Copyright (c) 1992, 1993, 1994
-.\"	The Regents of the University of California.  All rights reserved.
-.\"
-.\" This code is derived from software contributed to Berkeley by
-.\" Henry Spencer.
-.\"
-.\" Redistribution and use in source and binary forms, with or without
-.\" modification, are permitted provided that the following conditions
-.\" are met:
-.\" 1. Redistributions of source code must retain the above copyright
-.\"    notice, this list of conditions and the following disclaimer.
-.\" 2. Redistributions in binary form must reproduce the above copyright
-.\"    notice, this list of conditions and the following disclaimer in the
-.\"    documentation and/or other materials provided with the distribution.
-.\" 3. All advertising materials mentioning features or use of this software
-.\"    must display the following acknowledgement:
-.\"	This product includes software developed by the University of
-.\"	California, Berkeley and its contributors.
-.\" 4. Neither the name of the University nor the names of its contributors
-.\"    may be used to endorse or promote products derived from this software
-.\"    without specific prior written permission.
-.\"
-.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
-.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-.\" SUCH DAMAGE.
-.\"
-.\"	@(#)re_format.7	8.3 (Berkeley) 3/20/94
-.\"
-.TH RE_FORMAT 7 "March 20, 1994"
-.SH NAME
-re_format \- POSIX 1003.2 regular expressions
-.SH DESCRIPTION
-Regular expressions (``RE''s),
-as defined in POSIX 1003.2, come in two forms:
-modern REs (roughly those of
-.IR egrep ;
-1003.2 calls these ``extended'' REs)
-and obsolete REs (roughly those of
-.IR ed ;
-1003.2 ``basic'' REs).
-Obsolete REs mostly exist for backward compatibility in some old programs;
-they will be discussed at the end.
-1003.2 leaves some aspects of RE syntax and semantics open;
-`\(dg' marks decisions on these aspects that
-may not be fully portable to other 1003.2 implementations.
-.PP
-A (modern) RE is one\(dg or more non-empty\(dg \fIbranches\fR,
-separated by `|'.
-It matches anything that matches one of the branches.
-.PP
-A branch is one\(dg or more \fIpieces\fR, concatenated.
-It matches a match for the first, followed by a match for the second, etc.
-.PP
-A piece is an \fIatom\fR possibly followed
-by a single\(dg `*', `+', `?', or \fIbound\fR.
-An atom followed by `*' matches a sequence of 0 or more matches of the atom.
-An atom followed by `+' matches a sequence of 1 or more matches of the atom.
-An atom followed by `?' matches a sequence of 0 or 1 matches of the atom.
-.PP
-A \fIbound\fR is `{' followed by an unsigned decimal integer,
-possibly followed by `,'
-possibly followed by another unsigned decimal integer,
-always followed by `}'.
-The integers must lie between 0 and RE_DUP_MAX (255\(dg) inclusive,
-and if there are two of them, the first may not exceed the second.
-An atom followed by a bound containing one integer \fIi\fR
-and no comma matches
-a sequence of exactly \fIi\fR matches of the atom.
-An atom followed by a bound
-containing one integer \fIi\fR and a comma matches
-a sequence of \fIi\fR or more matches of the atom.
-An atom followed by a bound
-containing two integers \fIi\fR and \fIj\fR matches
-a sequence of \fIi\fR through \fIj\fR (inclusive) matches of the atom.
-.PP
-An atom is a regular expression enclosed in `()' (matching a match for the
-regular expression),
-an empty set of `()' (matching the null string)\(dg,
-a \fIbracket expression\fR (see below), `.'
-(matching any single character), `^' (matching the null string at the
-beginning of a line), `$' (matching the null string at the
-end of a line), a `\e' followed by one of the characters
-`^.[$()|*+?{\e'
-(matching that character taken as an ordinary character),
-a `\e' followed by any other character\(dg
-(matching that character taken as an ordinary character,
-as if the `\e' had not been present\(dg),
-or a single character with no other significance (matching that character).
-A `{' followed by a character other than a digit is an ordinary
-character, not the beginning of a bound\(dg.
-It is illegal to end an RE with `\e'.
-.PP
-A \fIbracket expression\fR is a list of characters enclosed in `[]'.
-It normally matches any single character from the list (but see below).
-If the list begins with `^',
-it matches any single character
-(but see below) \fInot\fR from the rest of the list.
-If two characters in the list are separated by `\-', this is shorthand
-for the full \fIrange\fR of characters between those two (inclusive) in the
-collating sequence,
-e.g. `[0-9]' in ASCII matches any decimal digit.
-It is illegal\(dg for two ranges to share an
-endpoint, e.g. `a-c-e'.
-Ranges are very collating-sequence-dependent,
-and portable programs should avoid relying on them.
-.PP
-To include a literal `]' in the list, make it the first character
-(following a possible `^').
-To include a literal `\-', make it the first or last character,
-or the second endpoint of a range.
-To use a literal `\-' as the first endpoint of a range,
-enclose it in `[.' and `.]' to make it a collating element (see below).
-With the exception of these and some combinations using `[' (see next
-paragraphs), all other special characters, including `\e', lose their
-special significance within a bracket expression.
-.PP
-Within a bracket expression, a collating element (a character,
-a multi-character sequence that collates as if it were a single character,
-or a collating-sequence name for either)
-enclosed in `[.' and `.]' stands for the
-sequence of characters of that collating element.
-The sequence is a single element of the bracket expression's list.
-A bracket expression containing a multi-character collating element 
-can thus match more than one character,
-e.g. if the collating sequence includes a `ch' collating element,
-then the RE `[[.ch.]]*c' matches the first five characters
-of `chchcc'.
-.PP
-Within a bracket expression, a collating element enclosed in `[=' and
-`=]' is an equivalence class, standing for the sequences of characters
-of all collating elements equivalent to that one, including itself.
-(If there are no other equivalent collating elements,
-the treatment is as if the enclosing delimiters were `[.' and `.]'.)
-For example, if o and \o'o^' are the members of an equivalence class,
-then `[[=o=]]', `[[=\o'o^'=]]', and `[o\o'o^']' are all synonymous.
-An equivalence class may not\(dg be an endpoint
-of a range.
-.PP
-Within a bracket expression, the name of a \fIcharacter class\fR enclosed
-in `[:' and `:]' stands for the list of all characters belonging to that
-class.
-Standard character class names are:
-.PP
-.RS
-.nf
-.ta 3c 6c 9c
-alnum	digit	punct
-alpha	graph	space
-blank	lower	upper
-cntrl	print	xdigit
-.fi
-.RE
-.PP
-These stand for the character classes defined in
-.IR ctype (3).
-A locale may provide others.
-A character class may not be used as an endpoint of a range.
-.PP
-There are two special cases\(dg of bracket expressions:
-the bracket expressions `[[:<:]]' and `[[:>:]]' match the null string at
-the beginning and end of a word respectively.
-A word is defined as a sequence of
-word characters
-which is neither preceded nor followed by
-word characters.
-A word character is an
-.I alnum
-character (as defined by
-.IR ctype (3))
-or an underscore.
-This is an extension,
-compatible with but not specified by POSIX 1003.2,
-and should be used with
-caution in software intended to be portable to other systems.
-.PP
-In the event that an RE could match more than one substring of a given
-string,
-the RE matches the one starting earliest in the string.
-If the RE could match more than one substring starting at that point,
-it matches the longest.
-Subexpressions also match the longest possible substrings, subject to
-the constraint that the whole match be as long as possible,
-with subexpressions starting earlier in the RE taking priority over
-ones starting later.
-Note that higher-level subexpressions thus take priority over
-their lower-level component subexpressions.
-.PP
-Match lengths are measured in characters, not collating elements.
-A null string is considered longer than no match at all.
-For example,
-`bb*' matches the three middle characters of `abbbc',
-`(wee|week)(knights|nights)' matches all ten characters of `weeknights',
-when `(.*).*' is matched against `abc' the parenthesized subexpression
-matches all three characters, and
-when `(a*)*' is matched against `bc' both the whole RE and the parenthesized
-subexpression match the null string.
-.PP
-If case-independent matching is specified,
-the effect is much as if all case distinctions had vanished from the
-alphabet.
-When an alphabetic that exists in multiple cases appears as an
-ordinary character outside a bracket expression, it is effectively
-transformed into a bracket expression containing both cases,
-e.g. `x' becomes `[xX]'.
-When it appears inside a bracket expression, all case counterparts
-of it are added to the bracket expression, so that (e.g.) `[x]'
-becomes `[xX]' and `[^x]' becomes `[^xX]'.
-.PP
-No particular limit is imposed on the length of REs\(dg.
-Programs intended to be portable should not employ REs longer
-than 256 bytes,
-as an implementation can refuse to accept such REs and remain
-POSIX-compliant.
-.PP
-Obsolete (``basic'') regular expressions differ in several respects.
-`|', `+', and `?' are ordinary characters and there is no equivalent
-for their functionality.
-The delimiters for bounds are `\e{' and `\e}',
-with `{' and `}' by themselves ordinary characters.
-The parentheses for nested subexpressions are `\e(' and `\e)',
-with `(' and `)' by themselves ordinary characters.
-`^' is an ordinary character except at the beginning of the
-RE or\(dg the beginning of a parenthesized subexpression,
-`$' is an ordinary character except at the end of the
-RE or\(dg the end of a parenthesized subexpression,
-and `*' is an ordinary character if it appears at the beginning of the
-RE or the beginning of a parenthesized subexpression
-(after a possible leading `^').
-Finally, there is one new type of atom, a \fIback reference\fR:
-`\e' followed by a non-zero decimal digit \fId\fR
-matches the same sequence of characters
-matched by the \fId\fRth parenthesized subexpression
-(numbering subexpressions by the positions of their opening parentheses,
-left to right),
-so that (e.g.) `\e([bc]\e)\e1' matches `bb' or `cc' but not `bc'.
-.SH SEE ALSO
-regex(3)
-.PP
-POSIX 1003.2, section 2.8 (Regular Expression Notation).
-.SH BUGS
-Having two kinds of REs is a botch.
-.PP
-The current 1003.2 spec says that `)' is an ordinary character in
-the absence of an unmatched `(';
-this was an unintentional result of a wording error,
-and change is likely.
-Avoid relying on it.
-.PP
-Back references are a dreadful botch,
-posing major problems for efficient implementations.
-They are also somewhat vaguely defined
-(does
-`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?).
-Avoid using them.
-.PP
-1003.2's specification of case-independent matching is vague.
-The ``one case implies all cases'' definition given above
-is current consensus among implementors as to the right interpretation.
-.PP
-The syntax for word boundaries is incredibly ugly.
diff --git a/src/backend/regex/re_syntax.n b/src/backend/regex/re_syntax.n
new file mode 100644
index 00000000000..f37bb85abdb
--- /dev/null
+++ b/src/backend/regex/re_syntax.n
@@ -0,0 +1,970 @@
+'\"
+'\" Copyright (c) 1998 Sun Microsystems, Inc.
+'\" Copyright (c) 1999 Scriptics Corporation
+'\"
+'\" This software is copyrighted by the Regents of the University of
+'\" California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+'\" Corporation and other parties.  The following terms apply to all files
+'\" associated with the software unless explicitly disclaimed in
+'\" individual files.
+'\" 
+'\" The authors hereby grant permission to use, copy, modify, distribute,
+'\" and license this software and its documentation for any purpose, provided
+'\" that existing copyright notices are retained in all copies and that this
+'\" notice is included verbatim in any distributions. No written agreement,
+'\" license, or royalty fee is required for any of the authorized uses.
+'\" Modifications to this software may be copyrighted by their authors
+'\" and need not follow the licensing terms described here, provided that
+'\" the new terms are clearly indicated on the first page of each file where
+'\" they apply.
+'\" 
+'\" IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+'\" FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+'\" ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+'\" DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+'\" POSSIBILITY OF SUCH DAMAGE.
+'\" 
+'\" THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+'\" INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+'\" FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+'\" IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+'\" NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+'\" MODIFICATIONS.
+'\" 
+'\" GOVERNMENT USE: If you are acquiring this software on behalf of the
+'\" U.S. government, the Government shall have only "Restricted Rights"
+'\" in the software and related documentation as defined in the Federal 
+'\" Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+'\" are acquiring the software on behalf of the Department of Defense, the
+'\" software shall be classified as "Commercial Computer Software" and the
+'\" Government shall have only "Restricted Rights" as defined in Clause
+'\" 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+'\" authors grant the U.S. Government and others acting in its behalf
+'\" permission to use and distribute the software in accordance with the
+'\" terms specified in this license. 
+'\" 
+'\" RCS: @(#) Id: re_syntax.n,v 1.3 1999/07/14 19:09:36 jpeek Exp 
+'\"
+.so man.macros
+.TH re_syntax n "8.1" Tcl "Tcl Built-In Commands"
+.BS
+.SH NAME
+re_syntax \- Syntax of Tcl regular expressions.
+.BE
+
+.SH DESCRIPTION
+.PP
+A \fIregular expression\fR describes strings of characters.
+It's a pattern that matches certain strings and doesn't match others.
+
+.SH "DIFFERENT FLAVORS OF REs"
+Regular expressions (``RE''s), as defined by POSIX, come in two
+flavors: \fIextended\fR REs (``EREs'') and \fIbasic\fR REs (``BREs'').
+EREs are roughly those of the traditional \fIegrep\fR, while BREs are
+roughly those of the traditional \fIed\fR.  This implementation adds
+a third flavor, \fIadvanced\fR REs (``AREs''), basically EREs with
+some significant extensions.
+.PP
+This manual page primarily describes AREs.  BREs mostly exist for
+backward compatibility in some old programs; they will be discussed at
+the end.  POSIX EREs are almost an exact subset of AREs.  Features of
+AREs that are not present in EREs will be indicated.
+
+.SH "REGULAR EXPRESSION SYNTAX"
+.PP
+Tcl regular expressions are implemented using the package written by
+Henry Spencer, based on the 1003.2 spec and some (not quite all) of
+the Perl5 extensions (thanks, Henry!).  Much of the description of
+regular expressions below is copied verbatim from his manual entry.
+.PP
+An ARE is one or more \fIbranches\fR,
+separated by `\fB|\fR',
+matching anything that matches any of the branches.
+.PP
+A branch is zero or more \fIconstraints\fR or \fIquantified atoms\fR,
+concatenated.
+It matches a match for the first, followed by a match for the second, etc.;
+an empty branch matches the empty string.
+.PP
+A quantified atom is an \fIatom\fR possibly followed
+by a single \fIquantifier\fR.
+Without a quantifier, it matches a match for the atom.
+The quantifiers,
+and what a so-quantified atom matches, are:
+.RS 2
+.TP 6
+\fB*\fR
+a sequence of 0 or more matches of the atom
+.TP
+\fB+\fR
+a sequence of 1 or more matches of the atom
+.TP
+\fB?\fR
+a sequence of 0 or 1 matches of the atom
+.TP
+\fB{\fIm\fB}\fR
+a sequence of exactly \fIm\fR matches of the atom
+.TP
+\fB{\fIm\fB,}\fR
+a sequence of \fIm\fR or more matches of the atom
+.TP
+\fB{\fIm\fB,\fIn\fB}\fR
+a sequence of \fIm\fR through \fIn\fR (inclusive) matches of the atom;
+\fIm\fR may not exceed \fIn\fR
+.TP
+\fB*?  +?  ??  {\fIm\fB}?  {\fIm\fB,}?  {\fIm\fB,\fIn\fB}?\fR
+\fInon-greedy\fR quantifiers,
+which match the same possibilities,
+but prefer the smallest number rather than the largest number
+of matches (see MATCHING)
+.RE
+.PP
+The forms using
+\fB{\fR and \fB}\fR
+are known as \fIbound\fRs.
+The numbers
+\fIm\fR and \fIn\fR are unsigned decimal integers
+with permissible values from 0 to 255 inclusive.
+.PP
+An atom is one of:
+.RS 2
+.TP 6
+\fB(\fIre\fB)\fR
+(where \fIre\fR is any regular expression)
+matches a match for
+\fIre\fR, with the match noted for possible reporting
+.TP
+\fB(?:\fIre\fB)\fR
+as previous,
+but does no reporting
+(a ``non-capturing'' set of parentheses)
+.TP
+\fB()\fR
+matches an empty string,
+noted for possible reporting
+.TP
+\fB(?:)\fR
+matches an empty string,
+without reporting
+.TP
+\fB[\fIchars\fB]\fR
+a \fIbracket expression\fR,
+matching any one of the \fIchars\fR (see BRACKET EXPRESSIONS for more detail)
+.TP
+ \fB.\fR
+matches any single character
+.TP
+\fB\e\fIk\fR
+(where \fIk\fR is a non-alphanumeric character)
+matches that character taken as an ordinary character,
+e.g. \e\e matches a backslash character
+.TP
+\fB\e\fIc\fR
+where \fIc\fR is alphanumeric
+(possibly followed by other characters),
+an \fIescape\fR (AREs only),
+see ESCAPES below
+.TP
+\fB{\fR
+when followed by a character other than a digit,
+matches the left-brace character `\fB{\fR';
+when followed by a digit, it is the beginning of a
+\fIbound\fR (see above)
+.TP
+\fIx\fR
+where \fIx\fR is
+a single character with no other significance, matches that character.
+.RE
+.PP
+A \fIconstraint\fR matches an empty string when specific conditions
+are met.
+A constraint may not be followed by a quantifier.
+The simple constraints are as follows; some more constraints are
+described later, under ESCAPES.
+.RS 2
+.TP 8
+\fB^\fR
+matches at the beginning of a line
+.TP
+\fB$\fR
+matches at the end of a line
+.TP
+\fB(?=\fIre\fB)\fR
+\fIpositive lookahead\fR (AREs only), matches at any point
+where a substring matching \fIre\fR begins
+.TP
+\fB(?!\fIre\fB)\fR
+\fInegative lookahead\fR (AREs only), matches at any point
+where no substring matching \fIre\fR begins
+.RE
+.PP
+The lookahead constraints may not contain back references (see later),
+and all parentheses within them are considered non-capturing.
+.PP
+An RE may not end with `\fB\e\fR'.
+
+.SH "BRACKET EXPRESSIONS"
+A \fIbracket expression\fR is a list of characters enclosed in `\fB[\|]\fR'.
+It normally matches any single character from the list (but see below).
+If the list begins with `\fB^\fR',
+it matches any single character
+(but see below) \fInot\fR from the rest of the list.
+.PP
+If two characters in the list are separated by `\fB\-\fR',
+this is shorthand
+for the full \fIrange\fR of characters between those two (inclusive) in the
+collating sequence,
+e.g.
+\fB[0\-9]\fR
+in ASCII matches any decimal digit.
+Two ranges may not share an
+endpoint, so e.g.
+\fBa\-c\-e\fR
+is illegal.
+Ranges are very collating-sequence-dependent,
+and portable programs should avoid relying on them.
+.PP
+To include a literal
+\fB]\fR
+or
+\fB\-\fR
+in the list,
+the simplest method is to
+enclose it in
+\fB[.\fR and \fB.]\fR
+to make it a collating element (see below).
+Alternatively,
+make it the first character
+(following a possible `\fB^\fR'),
+or (AREs only) precede it with `\fB\e\fR'.
+Alternatively, for `\fB\-\fR',
+make it the last character,
+or the second endpoint of a range.
+To use a literal
+\fB\-\fR
+as the first endpoint of a range,
+make it a collating element
+or (AREs only) precede it with `\fB\e\fR'.
+With the exception of these, some combinations using
+\fB[\fR
+(see next
+paragraphs), and escapes,
+all other special characters lose their
+special significance within a bracket expression.
+.PP
+Within a bracket expression, a collating element (a character,
+a multi-character sequence that collates as if it were a single character,
+or a collating-sequence name for either)
+enclosed in
+\fB[.\fR and \fB.]\fR
+stands for the
+sequence of characters of that collating element.
+The sequence is a single element of the bracket expression's list.
+A bracket expression in a locale that has
+multi-character collating elements
+can thus match more than one character.
+.VS 8.2
+So (insidiously), a bracket expression that starts with \fB^\fR
+can match multi-character collating elements even if none of them
+appear in the bracket expression!
+(\fINote:\fR Tcl currently has no multi-character collating elements.
+This information is only for illustration.)
+.PP
+For example, assume the collating sequence includes a \fBch\fR
+multi-character collating element.
+Then the RE \fB[[.ch.]]*c\fR (zero or more \fBch\fP's followed by \fBc\fP)
+matches the first five characters of `\fBchchcc\fR'.
+Also, the RE \fB[^c]b\fR matches all of `\fBchb\fR'
+(because \fB[^c]\fR matches the multi-character \fBch\fR).
+.VE 8.2
+.PP
+Within a bracket expression, a collating element enclosed in
+\fB[=\fR
+and
+\fB=]\fR
+is an equivalence class, standing for the sequences of characters
+of all collating elements equivalent to that one, including itself.
+(If there are no other equivalent collating elements,
+the treatment is as if the enclosing delimiters were `\fB[.\fR'\&
+and `\fB.]\fR'.)
+For example, if
+\fBo\fR
+and
+\fB\o'o^'\fR
+are the members of an equivalence class,
+then `\fB[[=o=]]\fR', `\fB[[=\o'o^'=]]\fR',
+and `\fB[o\o'o^']\fR'\&
+are all synonymous.
+An equivalence class may not be an endpoint
+of a range.
+.VS 8.2
+(\fINote:\fR 
+Tcl currently implements only the Unicode locale.
+It doesn't define any equivalence classes.
+The examples above are just illustrations.)
+.VE 8.2
+.PP
+Within a bracket expression, the name of a \fIcharacter class\fR enclosed
+in
+\fB[:\fR
+and
+\fB:]\fR
+stands for the list of all characters
+(not all collating elements!)
+belonging to that
+class.
+Standard character classes are:
+.PP
+.RS
+.ne 5
+.nf
+.ta 3c
+\fBalpha\fR	A letter. 
+\fBupper\fR	An upper-case letter. 
+\fBlower\fR	A lower-case letter. 
+\fBdigit\fR	A decimal digit. 
+\fBxdigit\fR	A hexadecimal digit. 
+\fBalnum\fR	An alphanumeric (letter or digit). 
+\fBprint\fR	An alphanumeric (same as alnum).
+\fBblank\fR	A space or tab character.
+\fBspace\fR	A character producing white space in displayed text. 
+\fBpunct\fR	A punctuation character. 
+\fBgraph\fR	A character with a visible representation. 
+\fBcntrl\fR	A control character. 
+.fi
+.RE
+.PP
+A locale may provide others.
+.VS 8.2
+(Note that the current Tcl implementation has only one locale:
+the Unicode locale.)
+.VE 8.2
+A character class may not be used as an endpoint of a range.
+.PP
+There are two special cases of bracket expressions:
+the bracket expressions
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+are constraints, matching empty strings at
+the beginning and end of a word respectively.
+'\" note, discussion of escapes below references this definition of word
+A word is defined as a sequence of
+word characters
+that is neither preceded nor followed by
+word characters.
+A word character is an
+\fIalnum\fR
+character
+or an underscore
+(\fB_\fR).
+These special bracket expressions are deprecated;
+users of AREs should use constraint escapes instead (see below).
+.SH ESCAPES
+Escapes (AREs only), which begin with a
+\fB\e\fR
+followed by an alphanumeric character,
+come in several varieties:
+character entry, class shorthands, constraint escapes, and back references.
+A
+\fB\e\fR
+followed by an alphanumeric character but not constituting
+a valid escape is illegal in AREs.
+In EREs, there are no escapes:
+outside a bracket expression,
+a
+\fB\e\fR
+followed by an alphanumeric character merely stands for that
+character as an ordinary character,
+and inside a bracket expression,
+\fB\e\fR
+is an ordinary character.
+(The latter is the one actual incompatibility between EREs and AREs.)
+.PP
+Character-entry escapes (AREs only) exist to make it easier to specify
+non-printing and otherwise inconvenient characters in REs:
+.RS 2
+.TP 5
+\fB\ea\fR
+alert (bell) character, as in C
+.TP
+\fB\eb\fR
+backspace, as in C
+.TP
+\fB\eB\fR
+synonym for
+\fB\e\fR
+to help reduce backslash doubling in some
+applications where there are multiple levels of backslash processing
+.TP
+\fB\ec\fIX\fR
+(where \fIX\fR is any character) the character whose
+low-order 5 bits are the same as those of
+\fIX\fR,
+and whose other bits are all zero
+.TP
+\fB\ee\fR
+the character whose collating-sequence name
+is `\fBESC\fR',
+or failing that, the character with octal value 033
+.TP
+\fB\ef\fR
+formfeed, as in C
+.TP
+\fB\en\fR
+newline, as in C
+.TP
+\fB\er\fR
+carriage return, as in C
+.TP
+\fB\et\fR
+horizontal tab, as in C
+.TP
+\fB\eu\fIwxyz\fR
+(where
+\fIwxyz\fR
+is exactly four hexadecimal digits)
+the Unicode character
+\fBU+\fIwxyz\fR
+in the local byte ordering
+.TP
+\fB\eU\fIstuvwxyz\fR
+(where
+\fIstuvwxyz\fR
+is exactly eight hexadecimal digits)
+reserved for a somewhat-hypothetical Unicode extension to 32 bits
+.TP
+\fB\ev\fR
+vertical tab,
+as in C
+.TP
+\fB\ex\fIhhh\fR
+(where
+\fIhhh\fR
+is any sequence of hexadecimal digits)
+the character whose hexadecimal value is
+\fB0x\fIhhh\fR
+(a single character no matter how many hexadecimal digits are used).
+.TP
+\fB\e0\fR
+the character whose value is
+\fB0\fR
+.TP
+\fB\e\fIxy\fR
+(where
+\fIxy\fR
+is exactly two octal digits,
+and is not a
+\fIback reference\fR (see below))
+the character whose octal value is
+\fB0\fIxy\fR
+.TP
+\fB\e\fIxyz\fR
+(where
+\fIxyz\fR
+is exactly three octal digits,
+and is not a
+back reference (see below))
+the character whose octal value is
+\fB0\fIxyz\fR
+.RE
+.PP
+Hexadecimal digits are `\fB0\fR'-`\fB9\fR', `\fBa\fR'-`\fBf\fR',
+and `\fBA\fR'-`\fBF\fR'.
+Octal digits are `\fB0\fR'-`\fB7\fR'.
+.PP
+The character-entry escapes are always taken as ordinary characters.
+For example,
+\fB\e135\fR
+is
+\fB]\fR
+in ASCII,
+but
+\fB\e135\fR
+does not terminate a bracket expression.
+Beware, however, that some applications (e.g., C compilers) interpret 
+such sequences themselves before the regular-expression package
+gets to see them, which may require doubling (quadrupling, etc.) the `\fB\e\fR'.
+.PP
+Class-shorthand escapes (AREs only) provide shorthands for certain commonly-used
+character classes:
+.RS 2
+.TP 10
+\fB\ed\fR
+\fB[[:digit:]]\fR
+.TP
+\fB\es\fR
+\fB[[:space:]]\fR
+.TP
+\fB\ew\fR
+\fB[[:alnum:]_]\fR
+(note underscore)
+.TP
+\fB\eD\fR
+\fB[^[:digit:]]\fR
+.TP
+\fB\eS\fR
+\fB[^[:space:]]\fR
+.TP
+\fB\eW\fR
+\fB[^[:alnum:]_]\fR
+(note underscore)
+.RE
+.PP
+Within bracket expressions, `\fB\ed\fR', `\fB\es\fR',
+and `\fB\ew\fR'\&
+lose their outer brackets,
+and `\fB\eD\fR', `\fB\eS\fR',
+and `\fB\eW\fR'\&
+are illegal.
+.VS 8.2
+(So, for example, \fB[a-c\ed]\fR is equivalent to \fB[a-c[:digit:]]\fR.
+Also, \fB[a-c\eD]\fR, which is equivalent to \fB[a-c^[:digit:]]\fR, is illegal.)
+.VE 8.2
+.PP
+A constraint escape (AREs only) is a constraint,
+matching the empty string if specific conditions are met,
+written as an escape:
+.RS 2
+.TP 6
+\fB\eA\fR
+matches only at the beginning of the string
+(see MATCHING, below, for how this differs from `\fB^\fR')
+.TP
+\fB\em\fR
+matches only at the beginning of a word
+.TP
+\fB\eM\fR
+matches only at the end of a word
+.TP
+\fB\ey\fR
+matches only at the beginning or end of a word
+.TP
+\fB\eY\fR
+matches only at a point that is not the beginning or end of a word
+.TP
+\fB\eZ\fR
+matches only at the end of the string
+(see MATCHING, below, for how this differs from `\fB$\fR')
+.TP
+\fB\e\fIm\fR
+(where
+\fIm\fR
+is a nonzero digit) a \fIback reference\fR, see below
+.TP
+\fB\e\fImnn\fR
+(where
+\fIm\fR
+is a nonzero digit, and
+\fInn\fR
+is some more digits,
+and the decimal value
+\fImnn\fR
+is not greater than the number of closing capturing parentheses seen so far)
+a \fIback reference\fR, see below
+.RE
+.PP
+A word is defined as in the specification of
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+above.
+Constraint escapes are illegal within bracket expressions.
+.PP
+A back reference (AREs only) matches the same string matched by the parenthesized
+subexpression specified by the number,
+so that (e.g.)
+\fB([bc])\e1\fR
+matches
+\fBbb\fR
+or
+\fBcc\fR
+but not `\fBbc\fR'.
+The subexpression must entirely precede the back reference in the RE.
+Subexpressions are numbered in the order of their leading parentheses.
+Non-capturing parentheses do not define subexpressions.
+.PP
+There is an inherent historical ambiguity between octal character-entry 
+escapes and back references, which is resolved by heuristics,
+as hinted at above.
+A leading zero always indicates an octal escape.
+A single non-zero digit, not followed by another digit,
+is always taken as a back reference.
+A multi-digit sequence not starting with a zero is taken as a back 
+reference if it comes after a suitable subexpression
+(i.e. the number is in the legal range for a back reference),
+and otherwise is taken as octal.
+.SH "METASYNTAX"
+In addition to the main syntax described above, there are some special
+forms and miscellaneous syntactic facilities available.
+.PP
+Normally the flavor of RE being used is specified by
+application-dependent means.
+However, this can be overridden by a \fIdirector\fR.
+If an RE of any flavor begins with `\fB***:\fR',
+the rest of the RE is an ARE.
+If an RE of any flavor begins with `\fB***=\fR',
+the rest of the RE is taken to be a literal string,
+with all characters considered ordinary characters.
+.PP
+An ARE may begin with \fIembedded options\fR:
+a sequence
+\fB(?\fIxyz\fB)\fR
+(where
+\fIxyz\fR
+is one or more alphabetic characters)
+specifies options affecting the rest of the RE.
+These supplement, and can override,
+any options specified by the application.
+The available option letters are:
+.RS 2
+.TP 3
+\fBb\fR
+rest of RE is a BRE
+.TP 3
+\fBc\fR
+case-sensitive matching (usual default)
+.TP 3
+\fBe\fR
+rest of RE is an ERE
+.TP 3
+\fBi\fR
+case-insensitive matching (see MATCHING, below)
+.TP 3
+\fBm\fR
+historical synonym for
+\fBn\fR
+.TP 3
+\fBn\fR
+newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBp\fR
+partial newline-sensitive matching (see MATCHING, below)
+.TP 3
+\fBq\fR
+rest of RE is a literal (``quoted'') string, all ordinary characters
+.TP 3
+\fBs\fR
+non-newline-sensitive matching (usual default)
+.TP 3
+\fBt\fR
+tight syntax (usual default; see below)
+.TP 3
+\fBw\fR
+inverse partial newline-sensitive (``weird'') matching (see MATCHING, below)
+.TP 3
+\fBx\fR
+expanded syntax (see below)
+.RE
+.PP
+Embedded options take effect at the
+\fB)\fR
+terminating the sequence.
+They are available only at the start of an ARE,
+and may not be used later within it.
+.PP
+In addition to the usual (\fItight\fR) RE syntax, in which all characters are
+significant, there is an \fIexpanded\fR syntax,
+available in all flavors of RE
+with the \fB-expanded\fR switch, or in AREs with the embedded \fBx\fR option.
+In the expanded syntax,
+white-space characters are ignored
+and all characters between a
+\fB#\fR
+and the following newline (or the end of the RE) are ignored,
+permitting paragraphing and commenting a complex RE.
+There are three exceptions to that basic rule:
+.RS 2
+.PP
+a white-space character or `\fB#\fR' preceded by `\fB\e\fR' is retained
+.PP
+white space or `\fB#\fR' within a bracket expression is retained
+.PP
+white space and comments are illegal within multi-character symbols
+like the ARE `\fB(?:\fR' or the BRE `\fB\e(\fR'
+.RE
+.PP
+Expanded-syntax white-space characters are blank, tab, newline, and
+.VS 8.2
+any character that belongs to the \fIspace\fR character class.
+.VE 8.2
+.PP
+Finally, in an ARE,
+outside bracket expressions, the sequence `\fB(?#\fIttt\fB)\fR'
+(where
+\fIttt\fR
+is any text not containing a `\fB)\fR')
+is a comment,
+completely ignored.
+Again, this is not allowed between the characters of
+multi-character symbols like `\fB(?:\fR'.
+Such comments are more a historical artifact than a useful facility,
+and their use is deprecated;
+use the expanded syntax instead.
+.PP
+\fINone\fR of these metasyntax extensions is available if the application
+(or an initial
+\fB***=\fR
+director)
+has specified that the user's input be treated as a literal string
+rather than as an RE.
+.SH MATCHING
+In the event that an RE could match more than one substring of a given
+string,
+the RE matches the one starting earliest in the string.
+If the RE could match more than one substring starting at that point,
+its choice is determined by its \fIpreference\fR:
+either the longest substring, or the shortest.
+.PP
+Most atoms, and all constraints, have no preference.
+A parenthesized RE has the same preference (possibly none) as the RE.
+A quantified atom with quantifier
+\fB{\fIm\fB}\fR
+or
+\fB{\fIm\fB}?\fR
+has the same preference (possibly none) as the atom itself.
+A quantified atom with other normal quantifiers (including
+\fB{\fIm\fB,\fIn\fB}\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers longest match.
+A quantified atom with other non-greedy quantifiers (including
+\fB{\fIm\fB,\fIn\fB}?\fR
+with
+\fIm\fR
+equal to
+\fIn\fR)
+prefers shortest match.
+A branch has the same preference as the first quantified atom in it
+which has a preference.
+An RE consisting of two or more branches connected by the
+\fB|\fR
+operator prefers longest match.
+.PP
+Subject to the constraints imposed by the rules for matching the whole RE,
+subexpressions also match the longest or shortest possible substrings,
+based on their preferences,
+with subexpressions starting earlier in the RE taking priority over
+ones starting later.
+Note that outer subexpressions thus take priority over
+their component subexpressions.
+.PP
+Note that the quantifiers
+\fB{1,1}\fR
+and
+\fB{1,1}?\fR
+can be used to force longest and shortest preference, respectively,
+on a subexpression or a whole RE.
+.PP
+Match lengths are measured in characters, not collating elements.
+An empty string is considered longer than no match at all.
+For example,
+\fBbb*\fR
+matches the three middle characters of `\fBabbbc\fR',
+\fB(week|wee)(night|knights)\fR
+matches all ten characters of `\fBweeknights\fR',
+when
+\fB(.*).*\fR
+is matched against
+\fBabc\fR
+the parenthesized subexpression
+matches all three characters, and
+when
+\fB(a*)*\fR
+is matched against
+\fBbc\fR
+both the whole RE and the parenthesized
+subexpression match an empty string.
+.PP
+If case-independent matching is specified,
+the effect is much as if all case distinctions had vanished from the
+alphabet.
+When an alphabetic that exists in multiple cases appears as an
+ordinary character outside a bracket expression, it is effectively
+transformed into a bracket expression containing both cases,
+so that
+\fBx\fR
+becomes `\fB[xX]\fR'.
+When it appears inside a bracket expression, all case counterparts
+of it are added to the bracket expression, so that
+\fB[x]\fR
+becomes
+\fB[xX]\fR
+and
+\fB[^x]\fR
+becomes `\fB[^xX]\fR'.
+.PP
+If newline-sensitive matching is specified, \fB.\fR
+and bracket expressions using
+\fB^\fR
+will never match the newline character
+(so that matches will never cross newlines unless the RE
+explicitly arranges it)
+and
+\fB^\fR
+and
+\fB$\fR
+will match the empty string after and before a newline
+respectively, in addition to matching at beginning and end of string
+respectively.
+ARE
+\fB\eA\fR
+and
+\fB\eZ\fR
+continue to match beginning or end of string \fIonly\fR.
+.PP
+If partial newline-sensitive matching is specified,
+this affects \fB.\fR
+and bracket expressions
+as with newline-sensitive matching, but not
+\fB^\fR
+and `\fB$\fR'.
+.PP
+If inverse partial newline-sensitive matching is specified,
+this affects
+\fB^\fR
+and
+\fB$\fR
+as with
+newline-sensitive matching,
+but not \fB.\fR
+and bracket expressions.
+This isn't very useful but is provided for symmetry.
+.SH "LIMITS AND COMPATIBILITY"
+No particular limit is imposed on the length of REs.
+Programs intended to be highly portable should not employ REs longer
+than 256 bytes,
+as a POSIX-compliant implementation can refuse to accept such REs.
+.PP
+The only feature of AREs that is actually incompatible with
+POSIX EREs is that
+\fB\e\fR
+does not lose its special
+significance inside bracket expressions.
+All other ARE features use syntax which is illegal or has
+undefined or unspecified effects in POSIX EREs;
+the
+\fB***\fR
+syntax of directors likewise is outside the POSIX
+syntax for both BREs and EREs.
+.PP
+Many of the ARE extensions are borrowed from Perl, but some have
+been changed to clean them up, and a few Perl extensions are not present.
+Incompatibilities of note include `\fB\eb\fR', `\fB\eB\fR',
+the lack of special treatment for a trailing newline,
+the addition of complemented bracket expressions to the things
+affected by newline-sensitive matching,
+the restrictions on parentheses and back references in lookahead constraints,
+and the longest/shortest-match (rather than first-match) matching semantics.
+.PP
+The matching rules for REs containing both normal and non-greedy quantifiers
+have changed since early beta-test versions of this package.
+(The new rules are much simpler and cleaner,
+but don't work as hard at guessing the user's real intentions.)
+.PP
+Henry Spencer's original 1986 \fIregexp\fR package,
+still in widespread use (e.g., in pre-8.1 releases of Tcl),
+implemented an early version of today's EREs.
+There are four incompatibilities between \fIregexp\fR's near-EREs
+(`RREs' for short) and AREs.
+In roughly increasing order of significance:
+.PP
+.RS
+In AREs,
+\fB\e\fR
+followed by an alphanumeric character is either an
+escape or an error,
+while in RREs, it was just another way of writing the 
+alphanumeric.
+This should not be a problem because there was no reason to write
+such a sequence in RREs.
+.PP
+\fB{\fR
+followed by a digit in an ARE is the beginning of a bound,
+while in RREs,
+\fB{\fR
+was always an ordinary character.
+Such sequences should be rare,
+and will often result in an error because following characters
+will not look like a valid bound.
+.PP
+In AREs,
+\fB\e\fR
+remains a special character within `\fB[\|]\fR',
+so a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+must be written `\fB\e\e\fR'.
+\fB\e\e\fR
+also gives a literal
+\fB\e\fR
+within
+\fB[\|]\fR
+in RREs,
+but only truly paranoid programmers routinely doubled the backslash.
+.PP
+AREs report the longest/shortest match for the RE,
+rather than the first found in a specified search order.
+This may affect some RREs which were written in the expectation that
+the first match would be reported.
+(The careful crafting of RREs to optimize the search order for fast
+matching is obsolete, since AREs examine all possible matches
+in parallel and their performance is largely insensitive to their
+complexity; but cases where the search order was exploited to deliberately
+find a match which was \fInot\fR the longest/shortest will need rewriting.)
+.RE
+
+.SH "BASIC REGULAR EXPRESSIONS"
+BREs differ from EREs in several respects.  `\fB|\fR', `\fB+\fR',
+and
+\fB?\fR
+are ordinary characters and there is no equivalent
+for their functionality.
+The delimiters for bounds are
+\fB\e{\fR
+and `\fB\e}\fR',
+with
+\fB{\fR
+and
+\fB}\fR
+by themselves ordinary characters.
+The parentheses for nested subexpressions are
+\fB\e(\fR
+and `\fB\e)\fR',
+with
+\fB(\fR
+and
+\fB)\fR
+by themselves ordinary characters.
+\fB^\fR
+is an ordinary character except at the beginning of the
+RE or the beginning of a parenthesized subexpression,
+\fB$\fR
+is an ordinary character except at the end of the
+RE or the end of a parenthesized subexpression,
+and
+\fB*\fR
+is an ordinary character if it appears at the beginning of the
+RE or the beginning of a parenthesized subexpression
+(after a possible leading `\fB^\fR').
+Finally,
+single-digit back references are available,
+and
+\fB\e<\fR
+and
+\fB\e>\fR
+are synonyms for
+\fB[[:<:]]\fR
+and
+\fB[[:>:]]\fR
+respectively;
+no other escapes are available.
+
+.SH "SEE ALSO"
+RegExp(3), regexp(n), regsub(n), lsearch(n), switch(n), text(n)
+
+.SH KEYWORDS
+match, regular expression, string
diff --git a/src/backend/regex/regc_color.c b/src/backend/regex/regc_color.c
new file mode 100644
index 00000000000..eb250556822
--- /dev/null
+++ b/src/backend/regex/regc_color.c
@@ -0,0 +1,728 @@
+/*
+ * colorings of characters
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/regc_color.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ *
+ *
+ * Note that there are some incestuous relationships between this code and
+ * NFA arc maintenance, which perhaps ought to be cleaned up sometime.
+ */
+
+
+
+#define	CISERR()	VISERR(cm->v)		/* VISERR applied to the colormap's vars; needs "cm" in scope */
+#define	CERR(e)		VERR(cm->v, (e))	/* VERR applied to the colormap's vars; needs "cm" in scope */
+
+
+
+/*
+ * initcm - set up new colormap (inline descriptors only, every chr WHITE)
+ */
+static void
+initcm(struct vars *v,
+	   struct colormap *cm)
+{
+	int i;
+	int j;
+	union tree *t;
+	union tree *nextt;
+	struct colordesc *cd;
+
+	cm->magic = CMMAGIC;
+	cm->v = v;
+
+	cm->ncds = NINLINECDS;		/* start with the inline descriptor array */
+	cm->cd = cm->cdspace;
+	cm->max = 0;			/* only color 0 is in use so far */
+	cm->free = 0;			/* 0 means "free list empty" (see newcolor) */
+
+	cd = cm->cd;			/* cm->cd[WHITE] */
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->flags = 0;
+	cd->nchrs = CHR_MAX - CHR_MIN + 1;	/* WHITE initially owns every chr */
+
+	/* upper levels of tree: every slot points at the next level's block */
+	for (t = &cm->tree[0], j = NBYTS-1; j > 0; t = nextt, j--) {
+		nextt = t + 1;
+		for (i = BYTTAB-1; i >= 0; i--)
+			t->tptr[i] = nextt;
+	}
+	/* bottom level is solid white */
+	t = &cm->tree[NBYTS-1];
+	for (i = BYTTAB-1; i >= 0; i--)
+		t->tcolor[i] = WHITE;
+	cd->block = t;			/* bottom block doubles as WHITE's solid block */
+}
+
+/*
+ * freecm - free dynamically-allocated things in a colormap (not cm itself)
+ */
+static void
+freecm(struct colormap *cm)
+{
+	size_t i;
+	union tree *cb;
+
+	cm->magic = 0;			/* no longer passes setcolor's CMMAGIC assert */
+	if (NBYTS > 1)			/* cmtreefree requires level < NBYTS-1 */
+		cmtreefree(cm, cm->tree, 0);
+	for (i = 1; i <= cm->max; i++)		/* skip WHITE; initcm points its block into cm->tree */
+		if (!UNUSEDCOLOR(&cm->cd[i])) {
+			cb = cm->cd[i].block;	/* solid block, if one was ever made */
+			if (cb != NULL)
+				FREE(cb);
+		}
+	if (cm->cd != cm->cdspace)	/* newcolor moved the descriptors to the heap */
+		FREE(cm->cd);
+}
+
+/*
+ * cmtreefree - free a non-terminal part of a colormap tree (recursive)
+ */
+static void
+cmtreefree(struct colormap *cm,
+		   union tree *tree,
+		   int level)			/* level number (top == 0) of this block */
+{
+	int i;
+	union tree *t;
+	union tree *fillt = &cm->tree[level+1];	/* fill block of next level; lives inside cm */
+	union tree *cb;
+
+	assert(level < NBYTS-1);	/* this level has pointers */
+	for (i = BYTTAB-1; i >= 0; i--) {
+		t = tree->tptr[i];
+		assert(t != NULL);
+		if (t != fillt) {	/* fill blocks are never freed here */
+			if (level < NBYTS-2) {	/* more pointer blocks below */
+				cmtreefree(cm, t, level+1);
+				FREE(t);
+			} else {		/* color block below */
+				cb = cm->cd[t->tcolor[0]].block;	/* solid block of its first color */
+				if (t != cb)	/* not a solid block; those are freed via cd[].block */
+					FREE(t);
+			}
+		}
+	}
+}
+
+/*
+ * setcolor - set the color of a character in a colormap (nchrs not updated)
+ */
+static color			/* previous color, or COLORLESS on error */
+setcolor(struct colormap *cm,
+		 chr c,
+		 pcolor co)
+{
+	uchr uc = c;
+	int shift;
+	int level;
+	int b;
+	int bottom;
+	union tree *t;
+	union tree *newt;
+	union tree *fillt;
+	union tree *lastt;
+	union tree *cb;
+	color prev;
+
+	assert(cm->magic == CMMAGIC);
+	if (CISERR() || co == COLORLESS)
+		return COLORLESS;
+
+	t = cm->tree;
+	for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0;
+						level++, shift -= BYTBITS) {
+		b = (uc >> shift) & BYTMASK;	/* slot index at this level */
+		lastt = t;
+		t = lastt->tptr[b];
+		assert(t != NULL);
+		fillt = &cm->tree[level+1];	/* shared fill block for t's level */
+		bottom = (shift <= BYTBITS) ? 1 : 0;	/* is t a bottom-level color block? */
+		cb = (bottom) ? cm->cd[t->tcolor[0]].block : fillt;
+		if (t == fillt || t == cb) {	/* shared block: must allocate a private copy */
+			newt = (union tree *)MALLOC((bottom) ?
+				sizeof(struct colors) : sizeof(struct ptrs));
+			if (newt == NULL) {
+				CERR(REG_ESPACE);
+				return COLORLESS;
+			}
+			if (bottom)
+				memcpy(VS(newt->tcolor), VS(t->tcolor),
+							BYTTAB*sizeof(color));
+			else
+				memcpy(VS(newt->tptr), VS(t->tptr),
+						BYTTAB*sizeof(union tree *));
+			t = newt;
+			lastt->tptr[b] = t;	/* splice the copy into the parent */
+		}
+	}
+
+	b = uc & BYTMASK;		/* slot within the bottom-level block */
+	prev = t->tcolor[b];
+	t->tcolor[b] = (color)co;
+	return prev;
+}
+
+/*
+ * maxcolor - report largest color number in use (COLORLESS if in error state)
+ */
+static color
+maxcolor(struct colormap *cm)
+{
+	if (CISERR())
+		return COLORLESS;
+
+	return (color)cm->max;
+}
+
+/*
+ * newcolor - find a new color (caller must setcolor some chr to it at once)
+ * Beware:  may relocate the colordescs, invalidating pointers into cm->cd.
+ */
+static color			/* COLORLESS for error */
+newcolor(struct colormap *cm)
+{
+	struct colordesc *cd;
+	struct colordesc *new;
+	size_t n;
+
+	if (CISERR())
+		return COLORLESS;
+
+	if (cm->free != 0) {		/* reuse the head of the free list */
+		assert(cm->free > 0);
+		assert((size_t)cm->free < cm->ncds);
+		cd = &cm->cd[cm->free];
+		assert(UNUSEDCOLOR(cd));
+		assert(cd->arcs == NULL);
+		cm->free = cd->sub;	/* free list is chained through sub */
+	} else if (cm->max < cm->ncds - 1) {	/* an unused slot remains past max */
+		cm->max++;
+		cd = &cm->cd[cm->max];
+	} else {
+		/* oops, must allocate more */
+		n = cm->ncds * 2;
+		if (cm->cd == cm->cdspace) {	/* still inline: first move to the heap */
+			new = (struct colordesc *)MALLOC(n *
+						sizeof(struct colordesc));
+			if (new != NULL)
+				memcpy(VS(new), VS(cm->cdspace), cm->ncds *
+						sizeof(struct colordesc));
+		} else
+			new = (struct colordesc *)REALLOC(cm->cd,
+						n * sizeof(struct colordesc));
+		if (new == NULL) {	/* cm->cd is left untouched on failure */
+			CERR(REG_ESPACE);
+			return COLORLESS;
+		}
+		cm->cd = new;
+		cm->ncds = n;
+		assert(cm->max < cm->ncds - 1);
+		cm->max++;
+		cd = &cm->cd[cm->max];
+	}
+
+	cd->nchrs = 0;
+	cd->sub = NOSUB;
+	cd->arcs = NULL;
+	cd->flags = 0;
+	cd->block = NULL;
+
+	return (color)(cd - cm->cd);	/* index of the descriptor is the color number */
+}
+
+/*
+ * freecolor - free a color (must have no arcs, subcolor, or chrs); WHITE is ignored
+ */
+static void
+freecolor(struct colormap *cm,
+		  pcolor co)
+{
+	struct colordesc *cd = &cm->cd[co];
+	color pco, nco;			/* for freelist scan */
+
+	assert(co >= 0);
+	if (co == WHITE)
+		return;
+
+	assert(cd->arcs == NULL);
+	assert(cd->sub == NOSUB);
+	assert(cd->nchrs == 0);
+	cd->flags = FREECOL;
+	if (cd->block != NULL) {	/* release its solid block, if any */
+		FREE(cd->block);
+		cd->block = NULL;	/* just paranoia */
+	}
+
+	if ((size_t)co == cm->max) {	/* freed the top color: shrink max instead of listing it */
+		while (cm->max > WHITE && UNUSEDCOLOR(&cm->cd[cm->max]))
+			cm->max--;
+		assert(cm->free >= 0);
+		while ((size_t)cm->free > cm->max)	/* drop list heads now above max */
+			cm->free = cm->cd[cm->free].sub;
+		if (cm->free > 0) {
+			assert(cm->free < cm->max);
+			pco = cm->free;
+			nco = cm->cd[pco].sub;
+			while (nco > 0)		/* walk the rest of the list; 0 ends it */
+				if ((size_t)nco > cm->max) {
+					/* take this one out of freelist */
+					nco = cm->cd[nco].sub;
+					cm->cd[pco].sub = nco;
+				} else {
+					assert(nco < cm->max);
+					pco = nco;
+					nco = cm->cd[pco].sub;
+				}
+		}
+	} else {			/* interior color: push onto the free list */
+		cd->sub = cm->free;
+		cm->free = (color)(cd - cm->cd);
+	}
+}
+
+/*
+ * pseudocolor - allocate a false color, to be managed by other means (COLORLESS on error)
+ */
+static color
+pseudocolor(struct colormap *cm)
+{
+	color co;
+
+	co = newcolor(cm);
+	if (CISERR())
+		return COLORLESS;
+	cm->cd[co].nchrs = 1;		/* counted as holding one chr, though none is mapped to it here */
+	cm->cd[co].flags = PSEUDO;	/* rainbow() and colorcomplement() skip PSEUDO colors */
+	return co;
+}
+
+/*
+ * subcolor - allocate a new subcolor (if necessary) to this chr; returns chr's new color
+ */
+static color
+subcolor(struct colormap *cm, chr c)
+{
+	color co;			/* current color of c */
+	color sco;			/* new subcolor */
+
+	co = GETCOLOR(cm, c);
+	sco = newsub(cm, co);
+	if (CISERR())
+		return COLORLESS;
+	assert(sco != COLORLESS);
+
+	if (co == sco)		/* already in an open subcolor */
+		return co;	/* rest is redundant */
+	cm->cd[co].nchrs--;	/* move one chr's worth of count from parent... */
+	cm->cd[sco].nchrs++;	/* ...to the subcolor */
+	setcolor(cm, c, sco);	/* NOTE(review): result unchecked; counts are already moved if this fails */
+	return sco;
+}
+
+/*
+ * newsub - allocate a new subcolor (if necessary) for a color; COLORLESS on error
+ */
+static color
+newsub(struct colormap *cm,
+	   pcolor co)
+{
+	color sco;			/* new subcolor */
+
+	sco = cm->cd[co].sub;
+	if (sco == NOSUB) {		/* color has no open subcolor */
+		if (cm->cd[co].nchrs == 1)	/* optimization: single-chr color serves as its own subcolor */
+			return co;
+		sco = newcolor(cm);	/* must create subcolor */
+		if (sco == COLORLESS) {
+			assert(CISERR());
+			return COLORLESS;
+		}
+		cm->cd[co].sub = sco;	/* cm->cd re-read here: newcolor may have relocated it */
+		cm->cd[sco].sub = sco;	/* open subcolor points to self */
+	}
+	assert(sco != NOSUB);
+
+	return sco;
+}
+
+/*
+ * subrange - allocate new subcolors to this range of chrs (inclusive), fill in arcs
+ */
+static void
+subrange(struct vars *v,
+		 chr from,
+		 chr to,
+		 struct state *lp,
+		 struct state *rp)
+{
+	uchr uf;
+	int i;
+
+	assert(from <= to);
+
+	/* first, align "from" on a tree-block boundary */
+	uf = (uchr)from;
+	i = (int)( ((uf + BYTTAB-1) & (uchr)~BYTMASK) - uf );	/* chrs to next boundary; assumes BYTMASK == BYTTAB-1 */
+	for (; from <= to && i > 0; i--, from++)
+		newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
+	if (from > to)			/* didn't reach a boundary */
+		return;
+
+	/* deal with whole blocks */
+	for (; to - from >= BYTTAB; from += BYTTAB)	/* one subblock() call per BYTTAB chrs */
+		subblock(v, from, lp, rp);
+
+	/* clean up any remaining partial table */
+	for (; from <= to; from++)	/* NOTE(review): from++ past "to" presumably must not wrap; confirm for to == CHR_MAX */
+		newarc(v->nfa, PLAIN, subcolor(v->cm, from), lp, rp);
+}
+
+/*
+ * subblock - allocate new subcolors for one tree block of chrs, fill in arcs
+ */
+static void
+subblock(struct vars *v,
+		 chr start,				/* first of BYTTAB chrs */
+		 struct state *lp,
+		 struct state *rp)
+{
+	uchr uc = start;
+	struct colormap *cm = v->cm;
+	int shift;
+	int level;
+	int i;
+	int b;
+	union tree *t;
+	union tree *cb;
+	union tree *fillt;
+	union tree *lastt;
+	int previ;
+	int ndone;
+	color co;
+	color sco;
+
+	assert((uc % BYTTAB) == 0);	/* start must sit on a block boundary */
+
+	/* find its color block, making new pointer blocks as needed */
+	t = cm->tree;
+	fillt = NULL;
+	for (level = 0, shift = BYTBITS * (NBYTS - 1); shift > 0;
+						level++, shift -= BYTBITS) {
+		b = (uc >> shift) & BYTMASK;
+		lastt = t;
+		t = lastt->tptr[b];
+		assert(t != NULL);
+		fillt = &cm->tree[level+1];
+		if (t == fillt && shift > BYTBITS) {	/* need new ptr block */
+			t = (union tree *)MALLOC(sizeof(struct ptrs));
+			if (t == NULL) {
+				CERR(REG_ESPACE);
+				return;
+			}
+			memcpy(VS(t->tptr), VS(fillt->tptr),
+						BYTTAB*sizeof(union tree *));
+			lastt->tptr[b] = t;
+		}
+	}
+
+	/* special cases:  fill block or solid block */
+	co = t->tcolor[0];
+	cb = cm->cd[co].block;
+	if (t == fillt || t == cb) {	/* whole block is the single color co */
+		/* either way, we want a subcolor solid block */
+		sco = newsub(cm, co);	/* NOTE(review): COLORLESS on failure is not checked before indexing cm->cd[] */
+		t = cm->cd[sco].block;
+		if (t == NULL) {	/* must set it up */
+			t = (union tree *)MALLOC(sizeof(struct colors));
+			if (t == NULL) {
+				CERR(REG_ESPACE);
+				return;
+			}
+			for (i = 0; i < BYTTAB; i++)
+				t->tcolor[i] = sco;
+			cm->cd[sco].block = t;
+		}
+		/* find loop must have run at least once, so lastt and b are set */
+		lastt->tptr[b] = t;	/* repoint the parent slot at sco's solid block */
+		newarc(v->nfa, PLAIN, sco, lp, rp);
+		cm->cd[co].nchrs -= BYTTAB;
+		cm->cd[sco].nchrs += BYTTAB;
+		return;
+	}
+
+	/* general case, a mixed block to be altered */
+	i = 0;
+	while (i < BYTTAB) {		/* one pass per run of consecutive same-colored chrs */
+		co = t->tcolor[i];
+		sco = newsub(cm, co);	/* NOTE(review): failure result likewise unchecked here */
+		newarc(v->nfa, PLAIN, sco, lp, rp);
+		previ = i;
+		do {
+			t->tcolor[i++] = sco;
+		} while (i < BYTTAB && t->tcolor[i] == co);
+		ndone = i - previ;	/* length of the run just recolored */
+		cm->cd[co].nchrs -= ndone;
+		cm->cd[sco].nchrs += ndone;
+	}
+}
+
+/*
+ * okcolors - promote subcolors to full colors, closing every open subcolor
+ */
+static void
+okcolors(struct nfa *nfa,
+		 struct colormap *cm)
+{
+	struct colordesc *cd;
+	struct colordesc *end = CDEND(cm);
+	struct colordesc *scd;
+	struct arc *a;
+	color co;
+	color sco;
+
+	for (cd = cm->cd, co = 0; cd < end; cd++, co++) {
+		sco = cd->sub;
+		if (UNUSEDCOLOR(cd) || sco == NOSUB) {
+			/* has no subcolor, no further action */
+		} else if (sco == co) {
+			/* is subcolor, let parent deal with it */
+		} else if (cd->nchrs == 0) {
+			/* parent empty, its arcs change color to subcolor */
+			cd->sub = NOSUB;
+			scd = &cm->cd[sco];
+			assert(scd->nchrs > 0);
+			assert(scd->sub == sco);
+			scd->sub = NOSUB;	/* subcolor is now closed */
+			while ((a = cd->arcs) != NULL) {	/* move each arc from co's chain to sco's */
+				assert(a->co == co);
+				/* uncolorchain(cm, a); */
+				cd->arcs = a->colorchain;
+				a->co = sco;
+				/* colorchain(cm, a); */
+				a->colorchain = scd->arcs;
+				scd->arcs = a;
+			}
+			freecolor(cm, co);	/* parent now has no arcs, subcolor, or chrs */
+		} else {
+			/* parent's arcs must gain parallel subcolor arcs */
+			cd->sub = NOSUB;
+			scd = &cm->cd[sco];
+			assert(scd->nchrs > 0);
+			assert(scd->sub == sco);
+			scd->sub = NOSUB;	/* subcolor is now closed */
+			for (a = cd->arcs; a != NULL; a = a->colorchain) {
+				assert(a->co == co);
+				newarc(nfa, a->type, sco, a->from, a->to);	/* same type and endpoints, color sco */
+			}
+		}
+	}
+}
+
+/*
+ * colorchain - add this arc to the color chain of its color (a->co must be set)
+ */
+static void
+colorchain(struct colormap *cm,
+		   struct arc *a)
+{
+	struct colordesc *cd = &cm->cd[a->co];
+
+	a->colorchain = cd->arcs;	/* push onto the front of the chain */
+	cd->arcs = a;
+}
+
+/*
+ * uncolorchain - delete this arc from the color chain of its color (must be on it)
+ */
+static void
+uncolorchain(struct colormap *cm,
+			 struct arc *a)
+{
+	struct colordesc *cd = &cm->cd[a->co];
+	struct arc *aa;
+
+	aa = cd->arcs;
+	if (aa == a)		/* easy case: a is the chain head */
+		cd->arcs = a->colorchain;
+	else {
+		for (; aa != NULL && aa->colorchain != a; aa = aa->colorchain)	/* find a's predecessor */
+			continue;
+		assert(aa != NULL);
+		aa->colorchain = a->colorchain;
+	}
+	a->colorchain = NULL;	/* paranoia */
+}
+
+/*
+ * singleton - is this character in its own color (sole member, no subcolor)?
+ */
+static int			/* predicate */
+singleton(struct colormap *cm,
+		  chr c)
+{
+	color co;			/* color of c */
+
+	co = GETCOLOR(cm, c);
+	if (cm->cd[co].nchrs == 1 && cm->cd[co].sub == NOSUB)
+		return 1;
+	return 0;
+}
+
+/*
+ * rainbow - add arcs of all full colors (but one) between specified states
+ */
+static void
+rainbow(struct nfa *nfa,
+		struct colormap *cm,
+		int type,
+		pcolor but,				/* COLORLESS if no exceptions */
+		struct state *from,
+		struct state *to)
+{
+	struct colordesc *cd;
+	struct colordesc *end = CDEND(cm);
+	color co;
+
+	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)	/* stops early once an error is recorded */
+		if (!UNUSEDCOLOR(cd) && cd->sub != co && co != but &&	/* sub == co marks an open subcolor */
+							!(cd->flags&PSEUDO))
+			newarc(nfa, type, co, from, to);
+}
+
+/*
+ * colorcomplement - add arcs of complementary colors
+ *
+ * The calling sequence ought to be reconciled with cloneouts().
+ */
+static void
+colorcomplement(struct nfa *nfa,
+				struct colormap *cm,
+				int type,
+				struct state *of, /* complements of this guy's PLAIN outarcs */
+				struct state *from,
+				struct state *to)
+{
+	struct colordesc *cd;
+	struct colordesc *end = CDEND(cm);
+	color co;
+
+	assert(of != from);
+	for (cd = cm->cd, co = 0; cd < end && !CISERR(); cd++, co++)	/* stops early once an error is recorded */
+		if (!UNUSEDCOLOR(cd) && !(cd->flags&PSEUDO))
+			if (findarc(of, PLAIN, co) == NULL)	/* "of" has no PLAIN arc of this color */
+				newarc(nfa, type, co, from, to);
+}
+
+
+#ifdef REG_DEBUG
+
+/*
+ * dumpcolors - debugging output: max color, tree check, then chrs of each color
+ */
+static void
+dumpcolors(struct colormap *cm,
+		   FILE *f)
+{
+	struct colordesc *cd;
+	struct colordesc *end;
+	color co;
+	chr c;
+	char *has;
+
+	fprintf(f, "max %ld\n", (long)cm->max);
+	if (NBYTS > 1)			/* fillcheck requires level < NBYTS-1 */
+		fillcheck(cm, cm->tree, 0, f);
+	end = CDEND(cm);
+	for (cd = cm->cd + 1, co = 1; cd < end; cd++, co++)	/* skip 0 */
+		if (!UNUSEDCOLOR(cd)) {
+			assert(cd->nchrs > 0);
+			has = (cd->block != NULL) ? "#" : "";	/* "#" flags a color that has a solid block */
+			if (cd->flags&PSEUDO)
+				fprintf(f, "#%2ld%s(ps): ", (long)co, has);
+			else
+				fprintf(f, "#%2ld%s(%2d): ", (long)co,
+							has, cd->nchrs);
+			/* it's hard to do this more efficiently */
+			for (c = CHR_MIN; c < CHR_MAX; c++)	/* CHR_MAX itself is tested after the loop */
+				if (GETCOLOR(cm, c) == co)
+					dumpchr(c, f);
+			assert(c == CHR_MAX);
+			if (GETCOLOR(cm, c) == co)
+				dumpchr(c, f);
+			fprintf(f, "\n");
+		}
+}
+
+/*
+ * fillcheck - check proper filling of a tree (reports NULL pointer slots to f)
+ */
+static void
+fillcheck(struct colormap *cm,
+		  union tree *tree,
+		  int level,			/* level number (top == 0) of this block */
+		  FILE *f)
+{
+	int i;
+	union tree *t;
+	union tree *fillt = &cm->tree[level+1];	/* fill block of the next level down */
+
+	assert(level < NBYTS-1);	/* this level has pointers */
+	for (i = BYTTAB-1; i >= 0; i--) {
+		t = tree->tptr[i];
+		if (t == NULL)
+			fprintf(f, "NULL found in filled tree!\n");
+		else if (t == fillt)	/* fill block: nothing below to check */
+			{}
+		else if (level < NBYTS-2)	/* more pointer blocks below */
+			fillcheck(cm, t, level+1, f);
+	}
+}
+
+/*
+ * dumpchr - print a chr (as itself if printable ASCII, else as a \u hex escape)
+ *
+ * Kind of char-centric but works well enough for debug use.
+ */
+static void
+dumpchr(chr c,
+		FILE *f)
+{
+	if (c == '\\')			/* backslash is doubled */
+		fprintf(f, "\\\\");
+	else if (c > ' ' && c <= '~')	/* visible ASCII; space itself is escaped */
+		putc((char)c, f);
+	else
+		fprintf(f, "\\u%04lx", (long)c);
+}
+
+#endif /* REG_DEBUG */
diff --git a/src/backend/regex/regc_cvec.c b/src/backend/regex/regc_cvec.c
new file mode 100644
index 00000000000..3b4e6ddb61b
--- /dev/null
+++ b/src/backend/regex/regc_cvec.c
@@ -0,0 +1,194 @@
+/*
+ * Utility functions for handling cvecs
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/regc_cvec.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ *
+ */
+
+/*
+ * newcvec - allocate a new cvec (one MALLOC block; NULL if that fails)
+ */
+static struct cvec *
+newcvec(int nchrs,				/* to hold this many chrs... */
+		int nranges,			/* ... and this many ranges... */
+		int nmcces)				/* ... and this many MCCEs */
+{
+    size_t n;
+    size_t nc;
+    struct cvec *cv;
+
+    nc = (size_t)nchrs + (size_t)nmcces*(MAXMCCE+1) + (size_t)nranges*2;	/* total chr slots */
+    n = sizeof(struct cvec) + (size_t)(nmcces-1)*sizeof(chr *)	/* presumably struct cvec already holds one mcces slot -- confirm */
+	    + nc*sizeof(chr);
+    cv = (struct cvec *)MALLOC(n);
+    if (cv == NULL) {
+	return NULL;
+    }
+    cv->chrspace = nchrs;
+    cv->chrs = (chr *)&cv->mcces[nmcces];	/* chrs just after MCCE ptrs */
+    cv->mccespace = nmcces;
+    cv->ranges = cv->chrs + nchrs + nmcces*(MAXMCCE+1);	/* ranges follow the chr and MCCE text space */
+    cv->rangespace = nranges;
+    return clearcvec(cv);
+}
+
+/*
+ * clearcvec - clear a possibly-new cvec (counts zeroed, capacities kept)
+ * Returns pointer as convenience.
+ */
+static struct cvec *
+clearcvec(struct cvec *cv)
+{
+    int i;
+
+    assert(cv != NULL);
+    cv->nchrs = 0;
+    assert(cv->chrs == (chr *)&cv->mcces[cv->mccespace]);	/* layout as set up by newcvec */
+    cv->nmcces = 0;
+    cv->nmccechrs = 0;
+    cv->nranges = 0;
+    for (i = 0; i < cv->mccespace; i++) {
+	cv->mcces[i] = NULL;
+    }
+
+    return cv;
+}
+
+/*
+ * addchr - add a chr to a cvec (caller must have reserved the space)
+ */
+static void
+addchr(struct cvec *cv,			/* character vector */
+	   chr c)				/* character to add */
+{
+    assert(cv->nchrs < cv->chrspace - cv->nmccechrs);	/* addmcce text uses the top of chrs[] */
+    cv->chrs[cv->nchrs++] = (chr)c;
+}
+
+/*
+ * addrange - add a range to a cvec (caller must have reserved the space)
+ */
+static void
+addrange(struct cvec *cv,			/* character vector */
+		 chr from,				/* first character of range */
+		 chr to)				/* last character of range */
+{
+    assert(cv->nranges < cv->rangespace);
+    cv->ranges[cv->nranges*2] = (chr)from;	/* ranges[] holds (from, to) pairs */
+    cv->ranges[cv->nranges*2 + 1] = (chr)to;
+    cv->nranges++;
+}
+
+/*
+ * addmcce - add an MCCE to a cvec (no-op if both text pointers are NULL)
+ */
+static void
+addmcce(struct cvec *cv,			/* character vector */
+		chr *startp,			/* beginning of text */
+		chr *endp)				/* just past end of text */
+{
+    int len;
+    int i;
+    chr *s;
+    chr *d;
+
+    if (startp == NULL && endp == NULL) {
+	return;
+    }
+    len = endp - startp;
+    assert(len > 0);
+    assert(cv->nchrs + len < cv->chrspace - cv->nmccechrs);	/* room for text plus endmarker */
+    assert(cv->nmcces < cv->mccespace);
+    d = &cv->chrs[cv->chrspace - cv->nmccechrs - len - 1];	/* text is packed downward from chrs[chrspace] */
+    cv->mcces[cv->nmcces++] = d;
+    for (s = startp, i = len; i > 0; s++, i--) {
+	*d++ = *s;
+    }
+    *d++ = 0;				/* endmarker */
+    assert(d == &cv->chrs[cv->chrspace - cv->nmccechrs]);	/* ended exactly at the previous low-water mark */
+    cv->nmccechrs += len + 1;
+}
+
+/*
+ * haschr - does a cvec contain this chr? (checks chrs and ranges, not MCCEs)
+ */
+static int				/* predicate */
+haschr(struct cvec *cv,			/* character vector */
+	   chr c)				/* character to test for */
+{
+    int i;
+    chr *p;
+
+    for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
+	if (*p == c) {
+	    return 1;
+	}
+    }
+    for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {	/* step over (from, to) pairs */
+	if ((*p <= c) && (c <= *(p+1))) {	/* both endpoints inclusive */
+	    return 1;
+	}
+    }
+    return 0;
+}
+
+/*
+ * getcvec - get a cvec, remembering it as v->cv (NULL, with REG_ESPACE, on failure)
+ */
+static struct cvec *
+getcvec(struct vars *v,			/* context */
+		int nchrs,				/* to hold this many chrs... */
+		int nranges,			/* ... and this many ranges... */
+		int nmcces)				/* ... and this many MCCEs */
+{
+    if (v->cv != NULL && nchrs <= v->cv->chrspace &&	/* remembered cvec is big enough: reuse it */
+	    nranges <= v->cv->rangespace && nmcces <= v->cv->mccespace) {
+	return clearcvec(v->cv);
+    }
+
+    if (v->cv != NULL) {		/* too small: discard and allocate afresh */
+	freecvec(v->cv);
+    }
+    v->cv = newcvec(nchrs, nranges, nmcces);
+    if (v->cv == NULL) {
+	ERR(REG_ESPACE);
+    }
+
+    return v->cv;
+}
+
+/*
+ * freecvec - free a cvec (a single FREE of the whole structure)
+ */
+static void
+freecvec(struct cvec *cv)
+{
+    FREE(cv);
+}
diff --git a/src/backend/regex/regc_lex.c b/src/backend/regex/regc_lex.c
new file mode 100644
index 00000000000..2f1a5840ff2
--- /dev/null
+++ b/src/backend/regex/regc_lex.c
@@ -0,0 +1,1028 @@
+/*
+ * lexical analyzer
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/regc_lex.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ *
+ */
+
+/*
+ * scanning macros (know about v)
+ *
+ * All of these expect a "struct vars *v" in scope.  v->now is the scan
+ * position and v->stop is one past the last chr of the text being scanned.
+ */
+/* ATEOS: no chrs remain; HAVE(n): at least n chrs remain */
+#define	ATEOS()		(v->now >= v->stop)
+#define	HAVE(n)		(v->stop - v->now >= (n))
+/* NEXTn: do the next n chrs exist and match?  (nothing is consumed) */
+#define	NEXT1(c)	(!ATEOS() && *v->now == CHR(c))
+#define	NEXT2(a,b)	(HAVE(2) && *v->now == CHR(a) && *(v->now+1) == CHR(b))
+#define	NEXT3(a,b,c)	(HAVE(3) && *v->now == CHR(a) && \
+						*(v->now+1) == CHR(b) && \
+						*(v->now+2) == CHR(c))
+/* SET/SETV: record the token type (and value) just recognized */
+#define	SET(c)		(v->nexttype = (c))
+#define	SETV(c, n)	(v->nexttype = (c), v->nextvalue = (n))
+/* RET/RETV/FAILW: as above, but also return from the calling function */
+#define	RET(c)		return (SET(c), 1)
+#define	RETV(c, n)	return (SETV(c, n), 1)
+#define	FAILW(e)	return (ERR(e), 0)	/* ERR does SET(EOS) */
+/* LASTTYPE: was the previous token of type t? */
+#define	LASTTYPE(t)	(v->lasttype == (t))
+
+/* lexical contexts */
+#define	L_ERE	1	/* mainline ERE/ARE */
+#define	L_BRE	2	/* mainline BRE */
+#define	L_Q	3	/* REG_QUOTE */
+#define	L_EBND	4	/* ERE/ARE bound */
+#define	L_BBND	5	/* BRE bound */
+#define	L_BRACK	6	/* brackets */
+#define	L_CEL	7	/* collating element */
+#define	L_ECL	8	/* equivalence class */
+#define	L_CCL	9	/* character class */
+/* INTOCON switches lexical context; INCON tests the current one */
+#define	INTOCON(c)	(v->lexcon = (c))
+#define	INCON(con)	(v->lexcon == (con))
+
+/* construct pointer past end of chr array */
+/* (only valid on a true array of chr, since it relies on sizeof) */
+#define	ENDOF(array)	((array) + sizeof(array)/sizeof(chr))
+
+/*
+ * lexstart - initialize the lexer and scan any leading options
+ *
+ * Runs prefixes() (which may alter v->cflags), picks the starting lexical
+ * context from the resulting flags, and primes the token stream by fetching
+ * the first token.  Returns early if prefixes() reported an error.
+ */
+static void
+lexstart(struct vars *v)
+{
+	int con;			/* lexical context to start in */
+
+	prefixes(v);			/* may turn on new type bits etc. */
+	NOERR();
+
+	if (v->cflags&REG_QUOTE) {
+		/* literal string */
+		assert(!(v->cflags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)));
+		con = L_Q;
+	} else if (v->cflags&REG_EXTENDED) {
+		/* ERE or ARE */
+		assert(!(v->cflags&REG_QUOTE));
+		con = L_ERE;
+	} else {
+		/* BRE */
+		assert(!(v->cflags&(REG_QUOTE|REG_ADVF)));
+		con = L_BRE;
+	}
+	INTOCON(con);
+
+	/* EMPTY as the "previous" token marks that we are at the very start */
+	v->nexttype = EMPTY;
+	next(v);			/* set up the first token */
+}
+
+/*
+ * prefixes - implement various special prefixes
+ *
+ * Handles the "***" director at the very start of the RE and, for AREs,
+ * a leading "(?letters)" embedded-options group.  Adjusts v->cflags and
+ * advances v->now past whatever was consumed.  Errors are reported via ERR.
+ */
+static void
+prefixes(struct vars *v)
+{
+	/* literal string doesn't get any of this stuff */
+	if (v->cflags&REG_QUOTE)
+		return;
+
+	/* initial "***" gets special things, selected by the fourth chr */
+	if (HAVE(4) && NEXT3('*', '*', '*')) {
+		chr director = *(v->now + 3);
+
+		if (director == CHR('?')) {
+			/* "***?" error, msg shows version */
+			ERR(REG_BADPAT);
+			return;		/* proceed no further */
+		} else if (director == CHR('=')) {
+			/* "***=" shifts to literal string */
+			NOTE(REG_UNONPOSIX);
+			v->cflags |= REG_QUOTE;
+			v->cflags &= ~(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE);
+			v->now += 4;
+			return;		/* and there can be no more prefixes */
+		} else if (director == CHR(':')) {
+			/* "***:" shifts to AREs */
+			NOTE(REG_UNONPOSIX);
+			v->cflags |= REG_ADVANCED;
+			v->now += 4;
+		} else {
+			/* otherwise *** is just an error */
+			ERR(REG_BADRPT);
+			return;
+		}
+	}
+
+	/* BREs and EREs don't get embedded options */
+	if ((v->cflags&REG_ADVANCED) != REG_ADVANCED)
+		return;
+
+	/* embedded options (AREs only): need "(?" followed by a letter */
+	if (!(HAVE(3) && NEXT2('(', '?') && iscalpha(*(v->now + 2))))
+		return;
+
+	NOTE(REG_UNONPOSIX);
+	v->now += 2;
+	while (!ATEOS() && iscalpha(*v->now)) {
+		switch (*v->now) {
+		case CHR('b'):		/* BREs (but why???) */
+			v->cflags &= ~(REG_ADVANCED|REG_QUOTE);
+			break;
+		case CHR('c'):		/* case sensitive */
+			v->cflags &= ~REG_ICASE;
+			break;
+		case CHR('e'):		/* plain EREs */
+			v->cflags |= REG_EXTENDED;
+			v->cflags &= ~(REG_ADVF|REG_QUOTE);
+			break;
+		case CHR('i'):		/* case insensitive */
+			v->cflags |= REG_ICASE;
+			break;
+		case CHR('m'):		/* Perloid synonym for n */
+		case CHR('n'):		/* \n affects ^ $ . [^ */
+			v->cflags |= REG_NEWLINE;
+			break;
+		case CHR('p'):		/* ~Perl, \n affects . [^ */
+			v->cflags |= REG_NLSTOP;
+			v->cflags &= ~REG_NLANCH;
+			break;
+		case CHR('q'):		/* literal string */
+			v->cflags |= REG_QUOTE;
+			v->cflags &= ~REG_ADVANCED;
+			break;
+		case CHR('s'):		/* single line, \n ordinary */
+			v->cflags &= ~REG_NEWLINE;
+			break;
+		case CHR('t'):		/* tight syntax */
+			v->cflags &= ~REG_EXPANDED;
+			break;
+		case CHR('w'):		/* weird, \n affects ^ $ only */
+			v->cflags &= ~REG_NLSTOP;
+			v->cflags |= REG_NLANCH;
+			break;
+		case CHR('x'):		/* expanded syntax */
+			v->cflags |= REG_EXPANDED;
+			break;
+		default:		/* unknown option letter */
+			ERR(REG_BADOPT);
+			return;
+		}
+		v->now++;
+	}
+
+	/* the option letters must be closed by a right paren */
+	if (!NEXT1(')')) {
+		ERR(REG_BADOPT);
+		return;
+	}
+	v->now++;
+
+	/* a literal string cannot also be expanded or newline-sensitive */
+	if (v->cflags&REG_QUOTE)
+		v->cflags &= ~(REG_EXPANDED|REG_NEWLINE);
+}
+
+/*
+ * lexnest - "call a subroutine", interpolating string at the lexical level
+ *
+ * The current scan position and limit are stashed in v->savenow and
+ * v->savestop, and scanning is redirected to [beginp, endp).  Only a
+ * single level of nesting is supported (asserted below).
+ *
+ * Note, this is not a very general facility.  There are a number of
+ * implicit assumptions about what sorts of strings can be subroutines.
+ */
+static void
+lexnest(struct vars *v,
+		chr *beginp,				/* start of interpolation */
+		chr *endp)				/* one past end of interpolation */
+{
+	assert(v->savenow == NULL);	/* only one level of nesting */
+
+	/* remember where the outer text left off */
+	v->savestop = v->stop;
+	v->savenow = v->now;
+
+	/* and switch over to the interpolated text */
+	v->stop = endp;
+	v->now = beginp;
+}
+
+/*
+ * string constants to interpolate as expansions of things like \d
+ *
+ * These are plain chr arrays with no terminator; users pass them to
+ * lexnest() as (array, ENDOF(array)).  The "back" forms are complete
+ * bracket expressions, the "br" forms are the pieces substituted when the
+ * escape appears inside an existing bracket expression.
+ */
+static chr backd[] = {		/* \d */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr backD[] = {		/* \D */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr brbackd[] = {	/* \d within brackets */
+	CHR('['), CHR(':'),
+	CHR('d'), CHR('i'), CHR('g'), CHR('i'), CHR('t'),
+	CHR(':'), CHR(']')
+};
+static chr backs[] = {		/* \s */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr backS[] = {		/* \S */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']'), CHR(']')
+};
+static chr brbacks[] = {	/* \s within brackets */
+	CHR('['), CHR(':'),
+	CHR('s'), CHR('p'), CHR('a'), CHR('c'), CHR('e'),
+	CHR(':'), CHR(']')
+};
+/* word characters are [:alnum:] plus underscore */
+static chr backw[] = {		/* \w */
+	CHR('['), CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static chr backW[] = {		/* \W */
+	CHR('['), CHR('^'), CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_'), CHR(']')
+};
+static chr brbackw[] = {	/* \w within brackets */
+	CHR('['), CHR(':'),
+	CHR('a'), CHR('l'), CHR('n'), CHR('u'), CHR('m'),
+	CHR(':'), CHR(']'), CHR('_')
+};
+
+/*
+ * lexword - interpolate a bracket expression for word characters
+ *
+ * Redirects the lexer to the same text used for \w, i.e. "[[:alnum:]_]".
+ * Possibly ought to inquire whether there is a "word" character class.
+ */
+static void
+lexword(struct vars *v)
+{
+	lexnest(v, backw, ENDOF(backw));
+}
+
+/*
+ * next - get next token
+ *
+ * Scans one token starting at v->now.  The token's type is left in
+ * v->nexttype and its value, if any, in v->nextvalue; the type of the
+ * previous token is preserved in v->lasttype.  What a given chr means
+ * depends on the current lexical context (v->lexcon), which this function
+ * also updates as it enters and leaves bounds and bracket expressions.
+ *
+ * Returns 1 normally and 0 on failure.  Once an error has been recorded,
+ * every subsequent call fails immediately.
+ */
+static int			/* 1 normal, 0 failure */
+next(struct vars *v)
+{
+	chr c;
+
+	/* errors yield an infinite sequence of failures */
+	if (ISERR())
+		return 0;	/* the error has set nexttype to EOS */
+
+	/* remember flavor of last token */
+	v->lasttype = v->nexttype;
+
+	/* REG_BOSONLY */
+	if (v->nexttype == EMPTY && (v->cflags&REG_BOSONLY)) {
+		/* at start of a REG_BOSONLY RE */
+		RETV(SBEGIN, 0);		/* same as \A */
+	}
+
+	/* if we're nested and we've hit end, return to outer level */
+	if (v->savenow != NULL && ATEOS()) {
+		v->now = v->savenow;
+		v->stop = v->savestop;
+		v->savenow = v->savestop = NULL;
+	}
+
+	/* skip white space etc. if appropriate (not in literal or []) */
+	if (v->cflags&REG_EXPANDED)
+		switch (v->lexcon) {
+		case L_ERE:
+		case L_BRE:
+		case L_EBND:
+		case L_BBND:
+			skip(v);
+			break;
+		}
+
+	/* handle EOS, depending on context */
+	if (ATEOS()) {
+		switch (v->lexcon) {
+		case L_ERE:
+		case L_BRE:
+		case L_Q:
+			RET(EOS);
+			break;
+		case L_EBND:
+		case L_BBND:
+			FAILW(REG_EBRACE);
+			break;
+		case L_BRACK:
+		case L_CEL:
+		case L_ECL:
+		case L_CCL:
+			FAILW(REG_EBRACK);
+			break;
+		}
+		assert(NOTREACHED);
+	}
+
+	/* okay, time to actually get a character */
+	c = *v->now++;
+
+	/* deal with the easy contexts, punt EREs to code below */
+	switch (v->lexcon) {
+	case L_BRE:			/* punt BREs to separate function */
+		return brenext(v, c);
+		break;
+	case L_ERE:			/* see below */
+		break;
+	case L_Q:			/* literal strings are easy */
+		RETV(PLAIN, c);
+		break;
+	case L_BBND:			/* bounds are fairly simple */
+	case L_EBND:
+		switch (c) {
+		case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
+		case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
+		case CHR('8'): case CHR('9'):
+			RETV(DIGIT, (chr)DIGITVAL(c));
+			break;
+		case CHR(','):
+			RET(',');
+			break;
+		case CHR('}'):		/* ERE bound ends with } */
+			if (INCON(L_EBND)) {
+				INTOCON(L_ERE);
+				/* value 0 marks the non-greedy "}?" form */
+				if ((v->cflags&REG_ADVF) && NEXT1('?')) {
+					v->now++;
+					NOTE(REG_UNONPOSIX);
+					RETV('}', 0);
+				}
+				RETV('}', 1);
+			} else
+				FAILW(REG_BADBR);
+			break;
+		case CHR('\\'):		/* BRE bound ends with \} */
+			if (INCON(L_BBND) && NEXT1('}')) {
+				v->now++;
+				INTOCON(L_BRE);
+				RET('}');
+			} else
+				FAILW(REG_BADBR);
+			break;
+		default:
+			FAILW(REG_BADBR);
+			break;
+		}
+		assert(NOTREACHED);
+		break;
+	case L_BRACK:			/* brackets are not too hard */
+		switch (c) {
+		case CHR(']'):
+			/* a ] right after the opening [ is an ordinary chr */
+			if (LASTTYPE('['))
+				RETV(PLAIN, c);
+			else {
+				INTOCON((v->cflags&REG_EXTENDED) ?
+							L_ERE : L_BRE);
+				RET(']');
+			}
+			break;
+		case CHR('\\'):
+			NOTE(REG_UBBS);
+			if (!(v->cflags&REG_ADVF))
+				RETV(PLAIN, c);
+			NOTE(REG_UNONPOSIX);
+			if (ATEOS())
+				FAILW(REG_EESCAPE);
+			(DISCARD)lexescape(v);
+			switch (v->nexttype) {	/* not all escapes okay here */
+			case PLAIN:
+				return 1;
+				break;
+			case CCLASS:
+				switch (v->nextvalue) {
+				case 'd':
+					lexnest(v, brbackd, ENDOF(brbackd));
+					break;
+				case 's':
+					lexnest(v, brbacks, ENDOF(brbacks));
+					break;
+				case 'w':
+					lexnest(v, brbackw, ENDOF(brbackw));
+					break;
+				default:
+					FAILW(REG_EESCAPE);
+					break;
+				}
+				/* lexnest done, back up and try again */
+				v->nexttype = v->lasttype;
+				return next(v);
+				break;
+			}
+			/* not one of the acceptable escapes */
+			FAILW(REG_EESCAPE);
+			break;
+		case CHR('-'):
+			/* a - first or last in the brackets is an ordinary chr */
+			if (LASTTYPE('[') || NEXT1(']'))
+				RETV(PLAIN, c);
+			else
+				RETV(RANGE, c);
+			break;
+		case CHR('['):
+			if (ATEOS())
+				FAILW(REG_EBRACK);
+			switch (*v->now++) {
+			case CHR('.'):
+				INTOCON(L_CEL);
+				/* might or might not be locale-specific */
+				RET(COLLEL);
+				break;
+			case CHR('='):
+				INTOCON(L_ECL);
+				NOTE(REG_ULOCALE);
+				RET(ECLASS);
+				break;
+			case CHR(':'):
+				INTOCON(L_CCL);
+				NOTE(REG_ULOCALE);
+				RET(CCLASS);
+				break;
+			default:			/* oops */
+				v->now--;
+				RETV(PLAIN, c);
+				break;
+			}
+			assert(NOTREACHED);
+			break;
+		default:
+			RETV(PLAIN, c);
+			break;
+		}
+		assert(NOTREACHED);
+		break;
+	case L_CEL:			/* collating elements are easy */
+		if (c == CHR('.') && NEXT1(']')) {
+			v->now++;
+			INTOCON(L_BRACK);
+			RETV(END, '.');
+		} else
+			RETV(PLAIN, c);
+		break;
+	case L_ECL:			/* ditto equivalence classes */
+		if (c == CHR('=') && NEXT1(']')) {
+			v->now++;
+			INTOCON(L_BRACK);
+			RETV(END, '=');
+		} else
+			RETV(PLAIN, c);
+		break;
+	case L_CCL:			/* ditto character classes */
+		if (c == CHR(':') && NEXT1(']')) {
+			v->now++;
+			INTOCON(L_BRACK);
+			RETV(END, ':');
+		} else
+			RETV(PLAIN, c);
+		break;
+	default:
+		assert(NOTREACHED);
+		break;
+	}
+
+	/* that got rid of everything except EREs and AREs */
+	assert(INCON(L_ERE));
+
+	/* deal with EREs and AREs, except for backslashes */
+	switch (c) {
+	case CHR('|'):
+		RET('|');
+		break;
+	case CHR('*'):
+		/* for quantifiers, value 1 is greedy and 0 is the "?" form */
+		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
+			v->now++;
+			NOTE(REG_UNONPOSIX);
+			RETV('*', 0);
+		}
+		RETV('*', 1);
+		break;
+	case CHR('+'):
+		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
+			v->now++;
+			NOTE(REG_UNONPOSIX);
+			RETV('+', 0);
+		}
+		RETV('+', 1);
+		break;
+	case CHR('?'):
+		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
+			v->now++;
+			NOTE(REG_UNONPOSIX);
+			RETV('?', 0);
+		}
+		RETV('?', 1);
+		break;
+	case CHR('{'):		/* bounds start or plain character */
+		if (v->cflags&REG_EXPANDED)
+			skip(v);
+		if (ATEOS() || !iscdigit(*v->now)) {
+			NOTE(REG_UBRACES);
+			NOTE(REG_UUNSPEC);
+			RETV(PLAIN, c);
+		} else {
+			NOTE(REG_UBOUNDS);
+			INTOCON(L_EBND);
+			RET('{');
+		}
+		assert(NOTREACHED);
+		break;
+	case CHR('('):		/* parenthesis, or advanced extension */
+		if ((v->cflags&REG_ADVF) && NEXT1('?')) {
+			NOTE(REG_UNONPOSIX);
+			v->now++;
+			/*
+			 * The RE may end right after "(?"; check before fetching
+			 * the selector chr so we never read past v->stop.
+			 */
+			if (ATEOS())
+				FAILW(REG_BADRPT);
+			switch (*v->now++) {
+			case CHR(':'):		/* non-capturing paren */
+				RETV('(', 0);
+				break;
+			case CHR('#'):		/* comment */
+				while (!ATEOS() && *v->now != CHR(')'))
+					v->now++;
+				if (!ATEOS())
+					v->now++;
+				assert(v->nexttype == v->lasttype);
+				return next(v);
+				break;
+			case CHR('='):		/* positive lookahead */
+				NOTE(REG_ULOOKAHEAD);
+				RETV(LACON, 1);
+				break;
+			case CHR('!'):		/* negative lookahead */
+				NOTE(REG_ULOOKAHEAD);
+				RETV(LACON, 0);
+				break;
+			default:
+				FAILW(REG_BADRPT);
+				break;
+			}
+			assert(NOTREACHED);
+		}
+		if (v->cflags&REG_NOSUB)
+			RETV('(', 0);		/* all parens non-capturing */
+		else
+			RETV('(', 1);
+		break;
+	case CHR(')'):
+		if (LASTTYPE('(')) {
+			NOTE(REG_UUNSPEC);
+		}
+		RETV(')', c);
+		break;
+	case CHR('['):		/* easy except for [[:<:]] and [[:>:]] */
+		if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
+				*(v->now+1) == CHR(':') &&
+				(*(v->now+2) == CHR('<') ||
+						*(v->now+2) == CHR('>')) &&
+				*(v->now+3) == CHR(':') &&
+				*(v->now+4) == CHR(']') &&
+				*(v->now+5) == CHR(']')) {
+			c = *(v->now+2);
+			v->now += 6;
+			NOTE(REG_UNONPOSIX);
+			RET((c == CHR('<')) ? '<' : '>');
+		}
+		INTOCON(L_BRACK);
+		/* value 0 marks a complemented bracket expression, [^...] */
+		if (NEXT1('^')) {
+			v->now++;
+			RETV('[', 0);
+		}
+		RETV('[', 1);
+		break;
+	case CHR('.'):
+		RET('.');
+		break;
+	case CHR('^'):
+		RET('^');
+		break;
+	case CHR('$'):
+		RET('$');
+		break;
+	case CHR('\\'):		/* mostly punt backslashes to code below */
+		if (ATEOS())
+			FAILW(REG_EESCAPE);
+		break;
+	default:		/* ordinary character */
+		RETV(PLAIN, c);
+		break;
+	}
+
+	/* ERE/ARE backslash handling; backslash already eaten */
+	assert(!ATEOS());
+	if (!(v->cflags&REG_ADVF)) {	/* only AREs have non-trivial escapes */
+		if (iscalnum(*v->now)) {
+			NOTE(REG_UBSALNUM);
+			NOTE(REG_UUNSPEC);
+		}
+		RETV(PLAIN, *v->now++);
+	}
+	(DISCARD)lexescape(v);
+	if (ISERR())
+		FAILW(REG_EESCAPE);
+	if (v->nexttype == CCLASS) {	/* fudge at lexical level */
+		switch (v->nextvalue) {
+		case 'd':	lexnest(v, backd, ENDOF(backd)); break;
+		case 'D':	lexnest(v, backD, ENDOF(backD)); break;
+		case 's':	lexnest(v, backs, ENDOF(backs)); break;
+		case 'S':	lexnest(v, backS, ENDOF(backS)); break;
+		case 'w':	lexnest(v, backw, ENDOF(backw)); break;
+		case 'W':	lexnest(v, backW, ENDOF(backW)); break;
+		default:
+			assert(NOTREACHED);
+			FAILW(REG_ASSERT);
+			break;
+		}
+		/* lexnest done, back up and try again */
+		v->nexttype = v->lasttype;
+		return next(v);
+	}
+	/* otherwise, lexescape has already done the work */
+	return !ISERR();
+}
+
+/*
+ * lexescape - parse an ARE backslash escape (backslash already eaten)
+ *
+ * Consumes the chr(s) following the backslash and sets v->nexttype and
+ * v->nextvalue accordingly.  A non-alphanumeric chr after the backslash
+ * simply stands for itself.  Failures are reported via ERR; callers check
+ * ISERR() or v->nexttype rather than the return value.
+ *
+ * Note slightly nonstandard use of the CCLASS type code: for \d \s \w and
+ * their complements, the value is the escape letter and the caller is
+ * expected to interpolate the corresponding bracket expression.
+ */
+static int			/* not actually used, but convenient for RETV */
+lexescape(struct vars *v)
+{
+	chr c;
+	static chr alert[] = {
+		CHR('a'), CHR('l'), CHR('e'), CHR('r'), CHR('t')
+	};
+	static chr esc[] = {
+		CHR('E'), CHR('S'), CHR('C')
+	};
+	chr *save;
+
+	assert(v->cflags&REG_ADVF);
+
+	assert(!ATEOS());
+	c = *v->now++;
+	if (!iscalnum(c))
+		RETV(PLAIN, c);
+
+	NOTE(REG_UNONPOSIX);
+	switch (c) {
+	case CHR('a'):
+		RETV(PLAIN, chrnamed(v, alert, ENDOF(alert), CHR('\007')));
+		break;
+	case CHR('A'):
+		RETV(SBEGIN, 0);
+		break;
+	case CHR('b'):
+		RETV(PLAIN, CHR('\b'));
+		break;
+	case CHR('B'):
+		RETV(PLAIN, CHR('\\'));
+		break;
+	case CHR('c'):
+		NOTE(REG_UUNPORT);
+		if (ATEOS())
+			FAILW(REG_EESCAPE);
+		/* keep only the low-order five bits of the following chr */
+		RETV(PLAIN, (chr)(*v->now++ & 037));
+		break;
+	case CHR('d'):
+		NOTE(REG_ULOCALE);
+		RETV(CCLASS, 'd');
+		break;
+	case CHR('D'):
+		NOTE(REG_ULOCALE);
+		RETV(CCLASS, 'D');
+		break;
+	case CHR('e'):
+		NOTE(REG_UUNPORT);
+		RETV(PLAIN, chrnamed(v, esc, ENDOF(esc), CHR('\033')));
+		break;
+	case CHR('f'):
+		RETV(PLAIN, CHR('\f'));
+		break;
+	case CHR('m'):
+		RET('<');
+		break;
+	case CHR('M'):
+		RET('>');
+		break;
+	case CHR('n'):
+		RETV(PLAIN, CHR('\n'));
+		break;
+	case CHR('r'):
+		RETV(PLAIN, CHR('\r'));
+		break;
+	case CHR('s'):
+		NOTE(REG_ULOCALE);
+		RETV(CCLASS, 's');
+		break;
+	case CHR('S'):
+		NOTE(REG_ULOCALE);
+		RETV(CCLASS, 'S');
+		break;
+	case CHR('t'):
+		RETV(PLAIN, CHR('\t'));
+		break;
+	case CHR('u'):		/* exactly four hex digits */
+		c = lexdigits(v, 16, 4, 4);
+		if (ISERR())
+			FAILW(REG_EESCAPE);
+		RETV(PLAIN, c);
+		break;
+	case CHR('U'):		/* exactly eight hex digits */
+		c = lexdigits(v, 16, 8, 8);
+		if (ISERR())
+			FAILW(REG_EESCAPE);
+		RETV(PLAIN, c);
+		break;
+	case CHR('v'):
+		RETV(PLAIN, CHR('\v'));
+		break;
+	case CHR('w'):
+		NOTE(REG_ULOCALE);
+		RETV(CCLASS, 'w');
+		break;
+	case CHR('W'):
+		NOTE(REG_ULOCALE);
+		RETV(CCLASS, 'W');
+		break;
+	case CHR('x'):
+		NOTE(REG_UUNPORT);
+		c = lexdigits(v, 16, 1, 255);	/* REs >255 long outside spec */
+		if (ISERR())
+			FAILW(REG_EESCAPE);
+		RETV(PLAIN, c);
+		break;
+	case CHR('y'):
+		NOTE(REG_ULOCALE);
+		RETV(WBDRY, 0);
+		break;
+	case CHR('Y'):
+		NOTE(REG_ULOCALE);
+		RETV(NWBDRY, 0);
+		break;
+	case CHR('Z'):
+		RETV(SEND, 0);
+		break;
+	case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
+	case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
+	case CHR('9'):
+		save = v->now;
+		v->now--;	/* put first digit back */
+		c = lexdigits(v, 10, 1, 255);	/* REs >255 long outside spec */
+		if (ISERR())
+			FAILW(REG_EESCAPE);
+		/*
+		 * ugly heuristic (first test is "exactly 1 digit?")
+		 *
+		 * lexdigits accumulates in an unsigned value that can wrap
+		 * around on a long digit string, leaving zero or something
+		 * that is negative as an int.  Neither is a plausible
+		 * subexpression number, so insist on a positive value before
+		 * comparing with nsubexp.
+		 */
+		if (v->now - save == 0 ||
+				((int)c > 0 && (int)c <= v->nsubexp)) {
+			NOTE(REG_UBACKREF);
+			RETV(BACKREF, (chr)c);
+		}
+		/* oops, doesn't look like it's a backref after all... */
+		v->now = save;
+		/* and fall through into octal number */
+		/* FALLTHROUGH */
+	case CHR('0'):
+		NOTE(REG_UUNPORT);
+		v->now--;	/* put first digit back */
+		c = lexdigits(v, 8, 1, 3);
+		if (ISERR())
+			FAILW(REG_EESCAPE);
+		RETV(PLAIN, c);
+		break;
+	default:
+		assert(iscalpha(c));
+		FAILW(REG_EESCAPE);	/* unknown alphabetic escape */
+		break;
+	}
+	assert(NOTREACHED);
+}
+
+/*
+ * lexdigits - slurp up digits and return chr value
+ *
+ * Consumes up to maxlen digits that are valid in the given base (letters
+ * a-f/A-F count as 10-15) and returns the accumulated value.  Scanning
+ * stops, without consuming it, at the first chr that is not a valid digit.
+ * If fewer than minlen digits were found, REG_EESCAPE is reported via ERR.
+ */
+static chr			/* chr value; errors signalled via ERR */
+lexdigits(struct vars *v,
+		  int base,
+		  int minlen,
+		  int maxlen)
+{
+	const uchr radix = (uchr) base;
+	uchr value = 0;		/* unsigned to avoid overflow misbehavior */
+	int ndigits = 0;
+
+	while (ndigits < maxlen && !ATEOS()) {
+		chr ch = *v->now;	/* peek; consumed only if acceptable */
+		int digit;
+
+		switch (ch) {
+		case CHR('0'): case CHR('1'): case CHR('2'): case CHR('3'):
+		case CHR('4'): case CHR('5'): case CHR('6'): case CHR('7'):
+		case CHR('8'): case CHR('9'):
+			digit = DIGITVAL(ch);
+			break;
+		case CHR('a'): case CHR('A'): digit = 10; break;
+		case CHR('b'): case CHR('B'): digit = 11; break;
+		case CHR('c'): case CHR('C'): digit = 12; break;
+		case CHR('d'): case CHR('D'): digit = 13; break;
+		case CHR('e'): case CHR('E'): digit = 14; break;
+		case CHR('f'): case CHR('F'): digit = 15; break;
+		default:
+			digit = -1;	/* not a digit at all */
+			break;
+		}
+
+		/* stop at anything that is not a plausible digit for this base */
+		if (digit < 0 || digit >= base)
+			break;
+
+		v->now++;
+		value = value*radix + (uchr)digit;
+		ndigits++;
+	}
+
+	if (ndigits < minlen)
+		ERR(REG_EESCAPE);
+
+	return (chr)value;
+}
+
+/*
+ * brenext - get next BRE token
+ *
+ * Called from next() with the current chr already consumed and passed in
+ * as pc.  Sets v->nexttype/v->nextvalue via the RET/RETV macros, which
+ * also return from this function.
+ *
+ * This is much like EREs except for all the stupid backslashes and the
+ * context-dependency of some things.
+ */
+static int			/* 1 normal, 0 failure */
+brenext(struct vars *v,
+		chr pc)
+{
+	chr c = (chr)pc;
+
+	switch (c) {
+	case CHR('*'):
+		/* at RE start, or after \( or ^, a star is an ordinary chr */
+		if (LASTTYPE(EMPTY) || LASTTYPE('(') || LASTTYPE('^'))
+			RETV(PLAIN, c);
+		RET('*');
+		break;
+	case CHR('['):
+		/* [[:<:]] and [[:>:]] are recognized as single tokens */
+		if (HAVE(6) &&	*(v->now+0) == CHR('[') &&
+				*(v->now+1) == CHR(':') &&
+				(*(v->now+2) == CHR('<') ||
+						*(v->now+2) == CHR('>')) &&
+				*(v->now+3) == CHR(':') &&
+				*(v->now+4) == CHR(']') &&
+				*(v->now+5) == CHR(']')) {
+			c = *(v->now+2);
+			v->now += 6;
+			NOTE(REG_UNONPOSIX);
+			RET((c == CHR('<')) ? '<' : '>');
+		}
+		INTOCON(L_BRACK);
+		/* value 0 marks a bracket expression that starts with ^ */
+		if (NEXT1('^')) {
+			v->now++;
+			RETV('[', 0);
+		}
+		RETV('[', 1);
+		break;
+	case CHR('.'):
+		RET('.');
+		break;
+	case CHR('^'):
+		/* ^ is an anchor only at RE start or right after \( */
+		if (LASTTYPE(EMPTY))
+			RET('^');
+		if (LASTTYPE('(')) {
+			NOTE(REG_UUNSPEC);
+			RET('^');
+		}
+		RETV(PLAIN, c);
+		break;
+	case CHR('$'):
+		/* $ is an anchor only at RE end or right before \) */
+		if (v->cflags&REG_EXPANDED)
+			skip(v);
+		if (ATEOS())
+			RET('$');
+		if (NEXT2('\\', ')')) {
+			NOTE(REG_UUNSPEC);
+			RET('$');
+		}
+		RETV(PLAIN, c);
+		break;
+	case CHR('\\'):
+		break;		/* see below */
+	default:
+		RETV(PLAIN, c);
+		break;
+	}
+
+	/* only a backslash gets here; everything else returned above */
+	assert(c == CHR('\\'));
+
+	if (ATEOS())
+		FAILW(REG_EESCAPE);
+
+	/* interpret the chr following the backslash */
+	c = *v->now++;
+	switch (c) {
+	case CHR('{'):
+		INTOCON(L_BBND);
+		NOTE(REG_UBOUNDS);
+		RET('{');
+		break;
+	case CHR('('):
+		RETV('(', 1);
+		break;
+	case CHR(')'):
+		RETV(')', c);
+		break;
+	case CHR('<'):
+		NOTE(REG_UNONPOSIX);
+		RET('<');
+		break;
+	case CHR('>'):
+		NOTE(REG_UNONPOSIX);
+		RET('>');
+		break;
+	case CHR('1'): case CHR('2'): case CHR('3'): case CHR('4'):
+	case CHR('5'): case CHR('6'): case CHR('7'): case CHR('8'):
+	case CHR('9'):
+		/* BRE back references are always a single digit */
+		NOTE(REG_UBACKREF);
+		RETV(BACKREF, (chr)DIGITVAL(c));
+		break;
+	default:
+		/* any other escaped chr stands for itself */
+		if (iscalnum(c)) {
+			NOTE(REG_UBSALNUM);
+			NOTE(REG_UUNSPEC);
+		}
+		RETV(PLAIN, c);
+		break;
+	}
+
+	assert(NOTREACHED);
+}
+
+/*
+ * skip - skip white space and comments in expanded form
+ *
+ * Advances v->now over any mixture of white space and '#' comments, where
+ * a comment extends to (but does not include) the next newline or the end
+ * of the text.  If anything was skipped, REG_UNONPOSIX is noted.
+ */
+static void
+skip(struct vars *v)
+{
+	chr *const origin = v->now;
+	int more = 1;
+
+	assert(v->cflags&REG_EXPANDED);
+
+	while (more) {
+		/* run of white space */
+		while (!ATEOS() && iscspace(*v->now))
+			v->now++;
+
+		if (NEXT1('#')) {
+			/* comment: discard up to the newline, which the */
+			/* white-space loop will pick up on the next pass */
+			while (!ATEOS() && *v->now != CHR('\n'))
+				v->now++;
+		} else
+			more = 0;
+	}
+
+	if (v->now != origin)
+		NOTE(REG_UNONPOSIX);
+}
+
+/*
+ * newline - return the chr for a newline
+ *
+ * Takes no arguments and always yields CHR('\n').
+ * This helps confine use of CHR to this source file.
+ */
+static chr
+newline(void)
+{
+	return CHR('\n');
+}
+
+/*
+ * chrnamed - return the chr known by a given (chr string) name
+ *
+ * Looks the name up with element(), then asks range() for a cvec holding
+ * just that element and returns its first chr.  The lookup is done with
+ * v->err temporarily cleared, so that a failed lookup does not register as
+ * a compile error; in that case, or if no chr can be obtained, lastresort
+ * is returned instead.
+ *
+ * The code is a bit clumsy, but this routine gets only such specialized
+ * use that it hardly matters.
+ */
+static chr
+chrnamed(struct vars *v,
+		 chr *startp,			/* start of name */
+		 chr *endp,			/* just past end of name */
+		 chr lastresort)		/* what to return if name lookup fails */
+{
+	celt c;
+	int errsave;
+	int e;
+	struct cvec *cv;
+
+	/* do the lookup with a clean error slate, then restore it */
+	errsave = v->err;
+	v->err = 0;
+	c = element(v, startp, endp);
+	e = v->err;
+	v->err = errsave;
+
+	if (e != 0)
+		return (chr)lastresort;
+
+	/*
+	 * range() hands back NULL when it cannot supply a cvec (for example
+	 * when allocation fails), so check before looking inside.
+	 */
+	cv = range(v, c, c, 0);
+	if (cv == NULL || cv->nchrs == 0)
+		return (chr)lastresort;
+	return cv->chrs[0];
+}
diff --git a/src/backend/regex/regc_locale.c b/src/backend/regex/regc_locale.c
new file mode 100644
index 00000000000..41ea9fe1f29
--- /dev/null
+++ b/src/backend/regex/regc_locale.c
@@ -0,0 +1,615 @@
+/* 
+ * regc_locale.c --
+ *
+ *	This file contains locale-specific regexp routines.
+ *	This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998 by Scriptics Corporation.
+ *
+ * This software is copyrighted by the Regents of the University of
+ * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
+ * Corporation and other parties.  The following terms apply to all files
+ * associated with the software unless explicitly disclaimed in
+ * individual files.
+ * 
+ * The authors hereby grant permission to use, copy, modify, distribute,
+ * and license this software and its documentation for any purpose, provided
+ * that existing copyright notices are retained in all copies and that this
+ * notice is included verbatim in any distributions. No written agreement,
+ * license, or royalty fee is required for any of the authorized uses.
+ * Modifications to this software may be copyrighted by their authors
+ * and need not follow the licensing terms described here, provided that
+ * the new terms are clearly indicated on the first page of each file where
+ * they apply.
+ * 
+ * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
+ * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
+ * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
+ * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
+ * POSSIBILITY OF SUCH DAMAGE.
+ * 
+ * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT.  THIS SOFTWARE
+ * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
+ * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
+ * MODIFICATIONS.
+ * 
+ * GOVERNMENT USE: If you are acquiring this software on behalf of the
+ * U.S. government, the Government shall have only "Restricted Rights"
+ * in the software and related documentation as defined in the Federal 
+ * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2).  If you
+ * are acquiring the software on behalf of the Department of Defense, the
+ * software shall be classified as "Commercial Computer Software" and the
+ * Government shall have only "Restricted Rights" as defined in Clause
+ * 252.227-7013 (c) (1) of DFARs.  Notwithstanding the foregoing, the
+ * authors grant the U.S. Government and others acting in its behalf
+ * permission to use and distribute the software in accordance with the
+ * terms specified in this license. 
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/regc_locale.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ */
+
+/*
+ * ASCII character-name table
+ *
+ * Maps collating-element names to the single character each denotes.
+ * Several characters have more than one name.  The table is terminated by
+ * an entry whose name is NULL.
+ */
+
+static struct cname {
+    char *name;				/* element name; NULL marks end of table */
+    char code;				/* the character the name stands for */
+} cnames[] = {
+    {"NUL",		'\0'},
+    {"SOH",		'\001'},
+    {"STX",		'\002'},
+    {"ETX",		'\003'},
+    {"EOT",		'\004'},
+    {"ENQ",		'\005'},
+    {"ACK",		'\006'},
+    {"BEL",		'\007'},
+    {"alert",		'\007'},
+    {"BS",		'\010'},
+    {"backspace",	'\b'},
+    {"HT",		'\011'},
+    {"tab",		'\t'},
+    {"LF",		'\012'},
+    {"newline",		'\n'},
+    {"VT",		'\013'},
+    {"vertical-tab",	'\v'},
+    {"FF",		'\014'},
+    {"form-feed",	'\f'},
+    {"CR",		'\015'},
+    {"carriage-return",	'\r'},
+    {"SO",		'\016'},
+    {"SI",		'\017'},
+    {"DLE",		'\020'},
+    {"DC1",		'\021'},
+    {"DC2",		'\022'},
+    {"DC3",		'\023'},
+    {"DC4",		'\024'},
+    {"NAK",		'\025'},
+    {"SYN",		'\026'},
+    {"ETB",		'\027'},
+    {"CAN",		'\030'},
+    {"EM",		'\031'},
+    {"SUB",		'\032'},
+    {"ESC",		'\033'},
+    {"IS4",		'\034'},
+    {"FS",		'\034'},
+    {"IS3",		'\035'},
+    {"GS",		'\035'},
+    {"IS2",		'\036'},
+    {"RS",		'\036'},
+    {"IS1",		'\037'},
+    {"US",		'\037'},
+    {"space",		' '},
+    {"exclamation-mark",'!'},
+    {"quotation-mark",	'"'},
+    {"number-sign",	'#'},
+    {"dollar-sign",	'$'},
+    {"percent-sign",	'%'},
+    {"ampersand",	'&'},
+    {"apostrophe",	'\''},
+    {"left-parenthesis",'('},
+    {"right-parenthesis", ')'},
+    {"asterisk",	'*'},
+    {"plus-sign",	'+'},
+    {"comma",		','},
+    {"hyphen",		'-'},
+    {"hyphen-minus",	'-'},
+    {"period",		'.'},
+    {"full-stop",	'.'},
+    {"slash",		'/'},
+    {"solidus",		'/'},
+    {"zero",		'0'},
+    {"one",		'1'},
+    {"two",		'2'},
+    {"three",		'3'},
+    {"four",		'4'},
+    {"five",		'5'},
+    {"six",		'6'},
+    {"seven",		'7'},
+    {"eight",		'8'},
+    {"nine",		'9'},
+    {"colon",		':'},
+    {"semicolon",	';'},
+    {"less-than-sign",	'<'},
+    {"equals-sign",	'='},
+    {"greater-than-sign", '>'},
+    {"question-mark",	'?'},
+    {"commercial-at",	'@'},
+    {"left-square-bracket", '['},
+    {"backslash",	'\\'},
+    {"reverse-solidus",	'\\'},
+    {"right-square-bracket", ']'},
+    {"circumflex",	'^'},
+    {"circumflex-accent", '^'},
+    {"underscore",	'_'},
+    {"low-line",	'_'},
+    {"grave-accent",	'`'},
+    {"left-brace",	'{'},
+    {"left-curly-bracket", '{'},
+    {"vertical-line",	'|'},
+    {"right-brace",	'}'},
+    {"right-curly-bracket", '}'},
+    {"tilde",		'~'},
+    {"DEL",		'\177'},
+    {NULL,		0}
+};
+
+/*
+ * some ctype functions with non-ascii-char guard
+ *
+ * Each wrapper consults the C library only for values in the range
+ * 0..UCHAR_MAX; anything outside that range is reported as not belonging
+ * to the class (or, for the case-conversion wrappers, returned unchanged).
+ */
+static int
+pg_isdigit(pg_wchar c)
+{
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return isdigit((unsigned char) c) != 0;
+}
+
+static int
+pg_isalpha(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never alphabetic here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return isalpha((unsigned char) c) != 0;
+}
+
+static int
+pg_isalnum(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never alphanumeric here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return isalnum((unsigned char) c) != 0;
+}
+
+static int
+pg_isupper(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never upper case here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return isupper((unsigned char) c) != 0;
+}
+
+static int
+pg_islower(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never lower case here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return islower((unsigned char) c) != 0;
+}
+
+static int
+pg_isgraph(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never graphic here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return isgraph((unsigned char) c) != 0;
+}
+
+static int
+pg_ispunct(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never punctuation here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return ispunct((unsigned char) c) != 0;
+}
+
+static int
+pg_isspace(pg_wchar c)
+{
+	/* values outside 0..UCHAR_MAX are never white space here */
+	if (c < 0 || c > UCHAR_MAX)
+		return 0;
+	return isspace((unsigned char) c) != 0;
+}
+
+static pg_wchar
+pg_toupper(pg_wchar c)
+{
+	/* only values in 0..UCHAR_MAX are converted; others pass through */
+	return (c >= 0 && c <= UCHAR_MAX) ? toupper((unsigned char) c) : c;
+}
+
+static pg_wchar
+pg_tolower(pg_wchar c)
+{
+	/* only values in 0..UCHAR_MAX are converted; others pass through */
+	return (c >= 0 && c <= UCHAR_MAX) ? tolower((unsigned char) c) : c;
+}
+
+
+/*
+ * nmcces - how many distinct MCCEs are there?
+ *
+ * The context argument is not examined; the answer is always zero.
+ */
+static int
+nmcces(struct vars *v)
+{
+    /*
+     * No multi-character collating elements defined at the moment.
+     */
+    return 0;
+}
+
+/*
+ * nleaders - how many chrs can be first chrs of MCCEs?
+ *
+ * The context argument is not examined; the answer is always zero.
+ */
+static int
+nleaders(struct vars *v)
+{
+    return 0;
+}
+
+/*
+ * allmcces - return a cvec with all the MCCEs of the locale
+ *
+ * This implementation adds nothing: it just passes the supplied cvec
+ * through clearcvec() and returns the result.
+ */
+static struct cvec *
+allmcces(struct vars *v,			/* context */
+		 struct cvec *cv)			/* this is supposed to have enough room */
+{
+    return clearcvec(cv);
+}
+
+/*
+ * element - map collating-element name to celt
+ *
+ * A one-chr name stands for that chr itself.  Longer names are looked up
+ * in the cnames table, requiring an exact match on both length and
+ * content.  An unknown name is reported as REG_ECOLLATE via ERR and
+ * yields 0.
+ */
+static celt
+element(struct vars *v,			/* context */
+		chr *startp,			/* points to start of name */
+		chr *endp)				/* points just past end of name */
+{
+    struct cname *entry;
+    size_t namelen;
+
+    assert(startp < endp);
+    namelen = endp - startp;
+
+    /* generic:  one-chr names stand for themselves */
+    if (namelen == 1) {
+	return *startp;
+    }
+
+    NOTE(REG_ULOCALE);
+
+    /* walk the table until a name matches or the NULL sentinel is hit */
+    for (entry = cnames; entry->name != NULL; entry++) {
+	if (strlen(entry->name) != namelen) {
+	    continue;
+	}
+	if (pg_char_and_wchar_strncmp(entry->name, startp, namelen) == 0) {
+	    return CHR(entry->code);
+	}
+    }
+
+    /* couldn't find it */
+    ERR(REG_ECOLLATE);
+    return 0;
+}
+
+/*
+ * range - supply cvec for a range, including legality check
+ *
+ * Returns a cvec covering every chr from a through b inclusive, or NULL
+ * with an error recorded in v.  A reversed range raises REG_ERANGE.  When
+ * cases is nonzero the members are listed individually together with their
+ * lower- and upper-case counterparts; a span whose member count cannot be
+ * held in an int raises REG_ESPACE rather than overflowing.
+ */
+static struct cvec *
+range(struct vars *v,			/* context */
+	  celt a,				/* range start */
+	  celt b,				/* range end, might equal a */
+	  int cases)				/* case-independent? */
+{
+	int nchrs;
+	unsigned long span;
+	struct cvec *cv;
+	celt c, lc, uc;
+
+	if (a != b && !before(a, b)) {
+		ERR(REG_ERANGE);
+		return NULL;
+	}
+
+	if (!cases) {			/* easy version */
+		cv = getcvec(v, 0, 1, 0);
+		NOERRN();
+		addrange(cv, a, b);
+		return cv;
+	}
+
+	/*
+	 * When case-independent, it's hard to decide when cvec ranges are
+	 * usable, so for now at least, we won't try.  We allocate enough
+	 * space for two case variants plus a little extra for the two
+	 * title case variants.
+	 *
+	 * a <= b holds here, so the unsigned difference is the member count
+	 * minus one.  Refuse spans for which (span + 1) * 2 + 4 would not fit
+	 * in an int; otherwise the cvec would be undersized.
+	 */
+	span = (unsigned long) b - (unsigned long) a;
+	if (span > (unsigned long) ((INT_MAX - 4) / 2 - 1)) {
+		ERR(REG_ESPACE);
+		return NULL;
+	}
+	nchrs = (int) (span + 1) * 2 + 4;
+
+	cv = getcvec(v, nchrs, 0, 0);
+	NOERRN();
+
+	for (c = a; ; c++) {
+		addchr(cv, c);
+		lc = pg_tolower((chr)c);
+		if (c != lc) {
+			addchr(cv, lc);
+		}
+		uc = pg_toupper((chr)c);
+		if (c != uc) {
+			addchr(cv, uc);
+		}
+		if (c == b)
+			break;		/* test here so c is never stepped past b */
+	}
+
+	return cv;
+}
+
+/*
+ * before - does celt x precede celt y, for purposes of range legality?
+ *
+ * With no MCCEs to worry about this is a plain numeric comparison.
+ */
+static int				/* predicate */
+before(celt x, celt y)
+{
+	return (x < y) ? 1 : 0;
+}
+
+/*
+ * eclass - supply cvec for an equivalence class
+ *
+ * Normally the class holds just c itself, or c and its case counterparts
+ * when cases is nonzero.  Under REG_FAKE, 'x' gets a crude made-up class
+ * for testing.
+ */
+static struct cvec *
+eclass(struct vars *v,			/* context */
+	   celt c,					/* Collating element representing
+								 * the equivalence class. */
+	   int cases)				/* all cases? */
+{
+	struct cvec *cv;
+
+	/* the ordinary situation:  no real equivalence classes exist */
+	if (!((v->cflags&REG_FAKE) && c == 'x')) {
+		if (cases)
+			return allcases(v, c);
+		cv = getcvec(v, 1, 0, 0);
+		assert(cv != NULL);
+		addchr(cv, (chr)c);
+		return cv;
+	}
+
+	/* crude fake equivalence class for testing:  x and y, in all cases */
+	cv = getcvec(v, 4, 0, 0);
+	addchr(cv, (chr)'x');
+	addchr(cv, (chr)'y');
+	if (cases) {
+		addchr(cv, (chr)'X');
+		addchr(cv, (chr)'Y');
+	}
+	return cv;
+}
+
+/*
+ * cclass - supply cvec for a character class
+ *
+ * The name lying between startp and endp is matched against the table of
+ * class names below; an unknown name raises REG_ECTYPE and returns NULL.
+ * When cases is nonzero, "lower" and "upper" are both treated as "alpha"
+ * so that the result includes case counterparts.  If the cvec cannot be
+ * obtained, REG_ESPACE is raised and NULL is returned.
+ */
+static struct cvec *
+cclass(struct vars *v,			/* context */
+	   chr *startp,			/* where the name starts */
+	   chr *endp,				/* just past the end of the name */
+	   int cases)				/* case-independent? */
+{
+	size_t len;
+	struct cvec *cv = NULL;
+	char **namePtr;
+	int i, index;
+
+	/*
+	 * The following arrays define the valid character class names.
+	 * The enum must list the classes in the same order as the names.
+	 */
+
+	static char *classNames[] = {
+		"alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
+		"lower", "print", "punct", "space", "upper", "xdigit", NULL
+	};
+
+	enum classes {
+		CC_ALNUM, CC_ALPHA, CC_ASCII, CC_BLANK, CC_CNTRL, CC_DIGIT, CC_GRAPH,
+		CC_LOWER, CC_PRINT, CC_PUNCT, CC_SPACE, CC_UPPER, CC_XDIGIT
+	};
+
+	/*
+	 * Map the name to the corresponding enumerated value.
+	 */
+	len = endp - startp;
+	index = -1;
+	for (namePtr=classNames,i=0 ; *namePtr!=NULL ; namePtr++,i++) {
+		if (strlen(*namePtr) == len &&
+			pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0) {
+			index = i;
+			break;
+		}
+	}
+	if (index == -1) {
+		ERR(REG_ECTYPE);
+		return NULL;
+	}
+
+	/*
+	 * Remap lower and upper to alpha if the match is case insensitive.
+	 */
+
+	if (cases &&
+		((enum classes) index == CC_LOWER ||
+		 (enum classes) index == CC_UPPER))
+		index = (int) CC_ALPHA;
+
+	/*
+	 * Now compute the character class contents.
+	 *
+	 * For the moment, assume that only char codes < 256 can be in these
+	 * classes.
+	 */
+
+	switch((enum classes) index) {
+	case CC_PRINT:
+		/*
+		 * Printable is not the same set as alphanumeric:  it also takes
+		 * in punctuation and the space character, so ask <ctype.h>
+		 * directly.  Room is reserved for every single-byte code.
+		 */
+		cv = getcvec(v, UCHAR_MAX + 1, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (isprint((unsigned char) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	case CC_ALNUM:
+		cv = getcvec(v, UCHAR_MAX, 1, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_isalpha((chr) i))
+					addchr(cv, (chr) i);
+			}
+			addrange(cv, (chr) '0', (chr) '9');
+		}
+		break;
+	case CC_ALPHA:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_isalpha((chr) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	case CC_ASCII:
+		cv = getcvec(v, 0, 1, 0);
+		if (cv) {
+			addrange(cv, 0, 0x7f);
+		}
+		break;
+	case CC_BLANK:
+		cv = getcvec(v, 2, 0, 0);
+		if (cv) {			/* guard like every other case */
+			addchr(cv, '\t');
+			addchr(cv, ' ');
+		}
+		break;
+	case CC_CNTRL:
+		cv = getcvec(v, 0, 2, 0);
+		if (cv) {			/* guard like every other case */
+			addrange(cv, 0x0, 0x1f);
+			addrange(cv, 0x7f, 0x9f);
+		}
+		break;
+	case CC_DIGIT:
+		cv = getcvec(v, 0, 1, 0);
+		if (cv) {
+			addrange(cv, (chr) '0', (chr) '9');
+		}
+		break;
+	case CC_PUNCT:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_ispunct((chr) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	case CC_XDIGIT:
+		cv = getcvec(v, 0, 3, 0);
+		if (cv) {
+			addrange(cv, '0', '9');
+			addrange(cv, 'a', 'f');
+			addrange(cv, 'A', 'F');
+		}
+		break;
+	case CC_SPACE:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_isspace((chr) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	case CC_LOWER:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_islower((chr) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	case CC_UPPER:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_isupper((chr) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	case CC_GRAPH:
+		cv = getcvec(v, UCHAR_MAX, 0, 0);
+		if (cv) {
+			for (i=0 ; i<= UCHAR_MAX ; i++) {
+				if (pg_isgraph((chr) i))
+					addchr(cv, (chr) i);
+			}
+		}
+		break;
+	}
+	if (cv == NULL) {
+		ERR(REG_ESPACE);
+	}
+	return cv;
+}
+
+/*
+ * allcases - supply cvec for all case counterparts of a chr (including itself)
+ *
+ * The result holds the lower-case form of pc and, when it differs, the
+ * upper-case form as well.  Messy cases are done via range().
+ */
+static struct cvec *
+allcases(struct vars *v,			/* context */
+		 chr pc)				/* character to get case equivs of */
+{
+	chr lower = pg_tolower((chr)pc);
+	chr upper = pg_toupper((chr)pc);
+	struct cvec *result = getcvec(v, 2, 0, 0);
+
+	addchr(result, lower);
+	if (upper != lower)
+		addchr(result, upper);
+	return result;
+}
+
+/*
+ * cmp - chr-substring compare
+ *
+ * Backrefs need this.  Only equal/unequal is reported.  The length is
+ * exact and counted in chrs, so the comparison runs straight through any
+ * embedded NULs.
+ */
+static int				/* 0 for equal, nonzero for unequal */
+cmp(const chr *x, const chr *y,			/* strings to compare */
+    size_t len)				/* exact length of comparison */
+{
+	size_t nbytes = len*sizeof(chr);
+
+	return memcmp(VS(x), VS(y), nbytes);
+}
+
+/*
+ * casecmp - case-independent chr-substring compare
+ *
+ * REG_ICASE backrefs need this.  Only equal/unequal is reported.  The
+ * length is exact, so the comparison runs straight through any embedded
+ * NULs.
+ */
+static int				/* 0 for equal, nonzero for unequal */
+casecmp(const chr *x, const chr *y,			/* strings to compare */
+		size_t len)				/* exact length of comparison */
+{
+	size_t i;
+
+	for (i = 0; i < len; i++) {
+		if (x[i] == y[i])
+			continue;		/* identical chrs need no case folding */
+		if (pg_tolower(x[i]) != pg_tolower(y[i]))
+			return 1;
+	}
+	return 0;
+}
diff --git a/src/backend/regex/regc_nfa.c b/src/backend/regex/regc_nfa.c
new file mode 100644
index 00000000000..43e01ebe92b
--- /dev/null
+++ b/src/backend/regex/regc_nfa.c
@@ -0,0 +1,1481 @@
+/*
+ * NFA utilities.
+ * This file is #included by regcomp.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/regc_nfa.c,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ *
+ *
+ * One or two things that technically ought to be in here
+ * are actually in regc_color.c, thanks to some incestuous relationships
+ * in the color chains.
+ */
+
+#define	NISERR()	VISERR(nfa->v)
+#define	NERR(e)		VERR(nfa->v, (e))
+
+
+/*
+ * newnfa - set up an NFA
+ *
+ * Allocates the NFA, creates its post, pre, init and final states (in that
+ * order), and wires pre->init and final->post with a rainbow plus the two
+ * '^' (respectively '$') arcs.  Returns NULL if the allocation fails or an
+ * error has been recorded; in the latter case the partial NFA is freed.
+ */
+static struct nfa *		/* the NFA, or NULL */
+newnfa(struct vars *v,
+	   struct colormap *cm,
+	   struct nfa *parent)		/* NULL if primary NFA */
+{
+	struct nfa *result = (struct nfa *)MALLOC(sizeof(struct nfa));
+
+	if (result == NULL)
+		return NULL;
+
+	/* start from an empty state list */
+	result->states = NULL;
+	result->slast = NULL;
+	result->free = NULL;
+	result->nstates = 0;
+	result->cm = cm;
+	result->v = v;
+	result->bos[0] = result->bos[1] = COLORLESS;
+	result->eos[0] = result->eos[1] = COLORLESS;
+
+	/* the creation order below fixes the state numbers */
+	result->post = newfstate(result, '@');	/* number 0 */
+	result->pre = newfstate(result, '>');		/* number 1 */
+	result->parent = parent;
+	result->init = newstate(result);		/* may become invalid later */
+	result->final = newstate(result);
+	if (ISERR()) {
+		freenfa(result);
+		return NULL;
+	}
+
+	/* pre -> init */
+	rainbow(result, result->cm, PLAIN, COLORLESS, result->pre, result->init);
+	newarc(result, '^', 1, result->pre, result->init);
+	newarc(result, '^', 0, result->pre, result->init);
+
+	/* final -> post */
+	rainbow(result, result->cm, PLAIN, COLORLESS, result->final, result->post);
+	newarc(result, '$', 1, result->final, result->post);
+	newarc(result, '$', 0, result->final, result->post);
+
+	if (ISERR()) {
+		freenfa(result);
+		return NULL;
+	}
+	return result;
+}
+
+/*
+ * freenfa - free an entire NFA
+ *
+ * Every live state is first moved to the free list, then every state on
+ * the free list is destroyed, and finally the NFA itself is released.
+ */
+static void
+freenfa(struct nfa *nfa)
+{
+	struct state *s;
+
+	for (s = nfa->states; s != NULL; s = nfa->states) {
+		/* don't worry about arcs */
+		s->nins = 0;
+		s->nouts = 0;
+		freestate(nfa, s);
+	}
+	for (s = nfa->free; s != NULL; s = nfa->free) {
+		nfa->free = s->next;
+		destroystate(nfa, s);
+	}
+
+	/* scrub the header before letting go of it */
+	nfa->slast = NULL;
+	nfa->nstates = -1;
+	nfa->pre = NULL;
+	nfa->post = NULL;
+	FREE(nfa);
+}
+
+/*
+ * newstate - allocate an NFA state, with zero flag value
+ *
+ * A state is taken from the NFA's free list when one is available and
+ * otherwise allocated afresh (REG_ESPACE on failure).  The state gets the
+ * next state number and is appended to the tail of the state list.
+ */
+static struct state *		/* NULL on error */
+newstate(struct nfa *nfa)
+{
+	struct state *s = nfa->free;
+
+	if (s != NULL) {
+		/* recycle; its arc batches are kept as they are */
+		nfa->free = s->next;
+	} else {
+		s = (struct state *)MALLOC(sizeof(struct state));
+		if (s == NULL) {
+			NERR(REG_ESPACE);
+			return NULL;
+		}
+		s->oas.next = NULL;
+		s->free = NULL;
+		s->noas = 0;
+	}
+
+	assert(nfa->nstates >= 0);
+	s->no = nfa->nstates++;
+	s->flag = 0;
+	s->nins = 0;
+	s->ins = NULL;
+	s->nouts = 0;
+	s->outs = NULL;
+	s->tmp = NULL;
+
+	/* link in at the tail of the state list */
+	s->next = NULL;
+	s->prev = nfa->slast;
+	if (nfa->slast != NULL) {
+		assert(nfa->slast->next == NULL);
+		nfa->slast->next = s;
+	}
+	if (nfa->states == NULL)
+		nfa->states = s;
+	nfa->slast = s;
+	return s;
+}
+
+/*
+ * newfstate - allocate an NFA state with a specified flag value
+ *
+ * Same as newstate(), except that a successfully created state has its
+ * flag set to the given value (narrowed to char).
+ */
+static struct state *		/* NULL on error */
+newfstate(struct nfa *nfa, int flag)
+{
+	struct state *s = newstate(nfa);
+
+	if (s == NULL)
+		return NULL;
+	s->flag = (char)flag;
+	return s;
+}
+
+/*
+ * dropstate - delete a state's inarcs and outarcs and free it
+ */
+static void
+dropstate(struct nfa *nfa,
+		  struct state *s)
+{
+	struct arc *a;
+
+	/* each freearc() unlinks the chain head, so keep re-reading it */
+	for (a = s->ins; a != NULL; a = s->ins)
+		freearc(nfa, a);
+	for (a = s->outs; a != NULL; a = s->outs)
+		freearc(nfa, a);
+	freestate(nfa, s);
+}
+
+/*
+ * freestate - free a state, which has no in-arcs or out-arcs
+ *
+ * The state is unlinked from the NFA's state list and pushed onto the
+ * NFA's free list for reuse; its memory is not released here.
+ */
+static void
+freestate(struct nfa *nfa,
+		  struct state *s)
+{
+	assert(s != NULL);
+	assert(s->nins == 0 && s->nouts == 0);
+
+	s->no = FREESTATE;
+	s->flag = 0;
+
+	/* repair the predecessor side (or the list head) */
+	if (s->prev == NULL) {
+		assert(s == nfa->states);
+		nfa->states = s->next;
+	} else
+		s->prev->next = s->next;
+
+	/* repair the successor side (or the list tail) */
+	if (s->next == NULL) {
+		assert(s == nfa->slast);
+		nfa->slast = s->prev;
+	} else
+		s->next->prev = s->prev;
+
+	/* don't delete it, put it on the free list */
+	s->prev = NULL;
+	s->next = nfa->free;
+	nfa->free = s;
+}
+
+/*
+ * destroystate - really get rid of an already-freed state
+ *
+ * Releases the chained arc batches hanging off the state and then the
+ * state itself.
+ */
+static void
+destroystate(struct nfa *nfa,
+			 struct state *s)
+{
+	struct arcbatch *batch;
+	struct arcbatch *following;
+
+	assert(s->no == FREESTATE);
+
+	batch = s->oas.next;
+	while (batch != NULL) {
+		following = batch->next;	/* read before the batch goes away */
+		FREE(batch);
+		batch = following;
+	}
+
+	s->ins = NULL;
+	s->outs = NULL;
+	s->next = NULL;
+	FREE(s);
+}
+
+/*
+ * newarc - set up a new arc within an NFA
+ *
+ * Does nothing if from already has an outarc with the same type, color
+ * and target.  Otherwise an arc is allocated from the source state, put
+ * at the head of both chains, and -- for a colored arc in an NFA with no
+ * parent -- added to its color chain.
+ */
+static void
+newarc(struct nfa *nfa,
+	   int t,
+	   pcolor co,
+	   struct state *from,
+	   struct state *to)
+{
+	struct arc *a;
+
+	assert(from != NULL && to != NULL);
+
+	/* check for duplicates */
+	a = from->outs;
+	while (a != NULL) {
+		if (a->type == t && a->co == co && a->to == to)
+			return;
+		a = a->outchain;
+	}
+
+	a = allocarc(nfa, from);
+	if (NISERR())
+		return;
+	assert(a != NULL);
+
+	a->type = t;
+	a->co = (color)co;
+	a->from = from;
+	a->to = to;
+
+	/*
+	 * Link at the beginning, not the end, of the chains.  Besides being
+	 * easier, this makes deleting the most-recently-added arc the
+	 * cheapest case rather than the most expensive one.
+	 */
+	a->outchain = from->outs;
+	from->outs = a;
+	from->nouts++;
+
+	a->inchain = to->ins;
+	to->ins = a;
+	to->nins++;
+
+	if (COLORED(a) && nfa->parent == NULL)
+		colorchain(nfa->cm, a);
+}
+
+/*
+ * allocarc - allocate a new out-arc within a state
+ *
+ * Arcs come, in order of preference, from the state's free chain, from
+ * the unused tail of the arc batch embedded in the state, or from a newly
+ * allocated batch (REG_ESPACE if that allocation fails).
+ */
+static struct arc *		/* NULL for failure */
+allocarc(struct nfa *nfa,
+		 struct state *s)
+{
+	struct arc *a;
+	struct arcbatch *batch;
+	int i;
+
+	if (s->free == NULL) {
+		/* shortcut:  the embedded batch still has an unused slot */
+		if (s->noas < ABSIZE) {
+			a = &s->oas.a[s->noas];
+			s->noas++;
+			return a;
+		}
+
+		/* none at hand, get more */
+		batch = (struct arcbatch *)MALLOC(sizeof(struct arcbatch));
+		if (batch == NULL) {
+			NERR(REG_ESPACE);
+			return NULL;
+		}
+		batch->next = s->oas.next;
+		s->oas.next = batch;
+
+		/* thread the new arcs into a free chain, last one terminating it */
+		for (i = 0; i < ABSIZE; i++) {
+			batch->a[i].type = 0;
+			batch->a[i].freechain = (i + 1 < ABSIZE) ? &batch->a[i+1] : NULL;
+		}
+		s->free = &batch->a[0];
+	}
+	assert(s->free != NULL);
+
+	/* pop the head of the free chain */
+	a = s->free;
+	s->free = a->freechain;
+	return a;
+}
+
+/*
+ * freearc - free an arc
+ *
+ * The arc is removed from its color chain (if it is on one), from its
+ * source state's out-chain and from its target state's in-chain, and is
+ * then placed on the source state's free chain.
+ */
+static void
+freearc(struct nfa *nfa,
+		struct arc *victim)
+{
+	struct state *src = victim->from;
+	struct state *dst = victim->to;
+	struct arc *prev;
+
+	assert(victim->type != 0);
+
+	/* take it off color chain if necessary */
+	if (COLORED(victim) && nfa->parent == NULL)
+		uncolorchain(nfa->cm, victim);
+
+	/* take it off source's out-chain */
+	assert(src != NULL);
+	assert(src->outs != NULL);
+	if (src->outs == victim) {
+		/* simple case:  first in chain */
+		src->outs = victim->outchain;
+	} else {
+		prev = src->outs;
+		while (prev != NULL && prev->outchain != victim)
+			prev = prev->outchain;
+		assert(prev != NULL);
+		prev->outchain = victim->outchain;
+	}
+	src->nouts--;
+
+	/* take it off target's in-chain */
+	assert(dst != NULL);
+	assert(dst->ins != NULL);
+	if (dst->ins == victim) {
+		/* simple case:  first in chain */
+		dst->ins = victim->inchain;
+	} else {
+		prev = dst->ins;
+		while (prev != NULL && prev->inchain != victim)
+			prev = prev->inchain;
+		assert(prev != NULL);
+		prev->inchain = victim->inchain;
+	}
+	dst->nins--;
+
+	/* clean up and place on free list */
+	victim->type = 0;
+	victim->from = NULL;		/* precautions... */
+	victim->to = NULL;
+	victim->inchain = NULL;
+	victim->outchain = NULL;
+	victim->freechain = src->free;
+	src->free = victim;
+}
+
+/*
+ * findarc - find arc, if any, from given source with given type and color
+ *
+ * The out-chain is searched from its head and the first match is
+ * returned, so when several arcs qualify the choice is arbitrary.
+ */
+static struct arc *
+findarc(struct state *s,
+		int type,
+		pcolor co)
+{
+	struct arc *a = s->outs;
+
+	while (a != NULL) {
+		if (a->type == type && a->co == co)
+			return a;
+		a = a->outchain;
+	}
+	return NULL;
+}
+
+/*
+ * cparc - allocate a new arc within an NFA, copying details from old one
+ *
+ * The type and color are taken from oa; the endpoints are the ones given.
+ */
+static void
+cparc(struct nfa *nfa,
+	  struct arc *oa,
+	  struct state *from,
+	  struct state *to)
+{
+	int type = oa->type;
+	pcolor co = oa->co;
+
+	newarc(nfa, type, co, from, to);
+}
+
+/*
+ * moveins - move all in arcs of a state to another state
+ *
+ * Each arc is re-created through newarc (via cparc) rather than patched
+ * in place, so that newarc's duplicate suppression applies; the original
+ * is then freed.
+ */
+static void
+moveins(struct nfa *nfa,
+		struct state *old,
+		struct state *new)
+{
+	struct arc *a;
+
+	assert(old != new);
+
+	/* freearc() unlinks the chain head, so keep re-reading it */
+	for (a = old->ins; a != NULL; a = old->ins) {
+		cparc(nfa, a, a->from, new);
+		freearc(nfa, a);
+	}
+	assert(old->nins == 0);
+	assert(old->ins == NULL);
+}
+
+/*
+ * copyins - copy all in arcs of a state to another state
+ *
+ * The arcs of old are left in place.
+ */
+static void
+copyins(struct nfa *nfa,
+		struct state *old,
+		struct state *new)
+{
+	struct arc *a;
+
+	assert(old != new);
+
+	a = old->ins;
+	while (a != NULL) {
+		cparc(nfa, a, a->from, new);
+		a = a->inchain;
+	}
+}
+
+/*
+ * moveouts - move all out arcs of a state to another state
+ *
+ * Each arc is re-created with the new source and the original is freed.
+ */
+static void
+moveouts(struct nfa *nfa,
+		 struct state *old,
+		 struct state *new)
+{
+	struct arc *a;
+
+	assert(old != new);
+
+	/* freearc() unlinks the chain head, so keep re-reading it */
+	for (a = old->outs; a != NULL; a = old->outs) {
+		cparc(nfa, a, new, a->to);
+		freearc(nfa, a);
+	}
+}
+
+/*
+ * copyouts - copy all out arcs of a state to another state
+ *
+ * The arcs of old are left in place.
+ */
+static void
+copyouts(struct nfa *nfa,
+		 struct state *old,
+		 struct state *new)
+{
+	struct arc *a;
+
+	assert(old != new);
+
+	a = old->outs;
+	while (a != NULL) {
+		cparc(nfa, a, new, a->to);
+		a = a->outchain;
+	}
+}
+
+/*
+ * cloneouts - copy out arcs of a state to another state pair, modifying type
+ *
+ * For every outarc of old, an arc of the given type and the same color is
+ * created between from and to.
+ */
+static void
+cloneouts(struct nfa *nfa,
+		  struct state *old,
+		  struct state *from,
+		  struct state *to,
+		  int type)
+{
+	struct arc *a;
+
+	assert(old != from);
+
+	a = old->outs;
+	while (a != NULL) {
+		newarc(nfa, type, a->co, from, to);
+		a = a->outchain;
+	}
+}
+
+/*
+ * delsub - delete a sub-NFA, updating subre pointers if necessary
+ *
+ * The work is done by deltraverse(), a recursive walk that marks states
+ * it has already seen through their tmp pointers.  The right end is
+ * marked beforehand so that the walk stops there.
+ */
+static void
+delsub(struct nfa *nfa,
+	   struct state *lp,	/* the sub-NFA goes from here... */
+	   struct state *rp)	/* ...to here, *not* inclusive */
+{
+	assert(lp != rp);
+
+	/* mark end */
+	rp->tmp = rp;
+
+	deltraverse(nfa, lp, lp);
+
+	/* did the job */
+	assert(lp->nouts == 0 && rp->nins == 0);
+	/* no more */
+	assert(lp->no != FREESTATE && rp->no != FREESTATE);
+
+	/* unmark end, and begin (which deltraverse marked) */
+	rp->tmp = NULL;
+	lp->tmp = NULL;
+}
+
+/*
+ * deltraverse - the recursive heart of delsub
+ *
+ * This routine's basic job is to destroy all out-arcs of the state.  A
+ * target state that ends up with no inarcs, and is not itself marked as
+ * in progress, is freed along the way.
+ */
+static void
+deltraverse(struct nfa *nfa,
+			struct state *leftend,
+			struct state *s)
+{
+	struct arc *a;
+	struct state *dest;
+
+	/* nothing to do, or already in progress */
+	if (s->nouts == 0 || s->tmp != NULL)
+		return;
+
+	/* mark as in progress */
+	s->tmp = s;
+
+	for (a = s->outs; a != NULL; a = s->outs) {
+		dest = a->to;
+		deltraverse(nfa, leftend, dest);
+		assert(dest->nouts == 0 || dest->tmp != NULL);
+		freearc(nfa, a);
+		if (dest->tmp == NULL && dest->nins == 0) {
+			assert(dest->nouts == 0);
+			freestate(nfa, dest);
+		}
+	}
+
+	/* we're still here, still reachable, but have no outarcs */
+	assert(s->no != FREESTATE);
+	assert(s == leftend || s->nins != 0);
+	assert(s->nouts == 0);
+
+	/* we're done here */
+	s->tmp = NULL;
+}
+
+/*
+ * dupnfa - duplicate sub-NFA
+ *
+ * Another recursive traversal, this time using tmp both to mark states
+ * already seen and to point at each state's duplicate.  An empty sub-NFA
+ * (start == stop) is represented by a single EMPTY arc.
+ */
+static void
+dupnfa(struct nfa *nfa,
+	   struct state *start,		/* duplicate of subNFA starting here */
+	   struct state *stop,		/* and stopping here */
+	   struct state *from,		/* stringing duplicate from here */
+	   struct state *to)		/* to here */
+{
+	if (start != stop) {
+		/* pre-seed the far end so the walk lands on "to" */
+		stop->tmp = to;
+		duptraverse(nfa, start, from);
+
+		/* done, except for clearing out the tmp pointers */
+		stop->tmp = NULL;
+		cleartraverse(nfa, start);
+		return;
+	}
+
+	newarc(nfa, EMPTY, 0, from, to);
+}
+
+/*
+ * duptraverse - recursive heart of dupnfa
+ *
+ * Gives s a duplicate (stmp if supplied, otherwise a new state), recorded
+ * in s->tmp, then recurses into the target of each outarc and copies the
+ * arc between the corresponding duplicates.  A state whose tmp is already
+ * set is treated as done.  The walk stops as soon as an error has been
+ * recorded, since the target's duplicate may then be missing.
+ */
+static void
+duptraverse(struct nfa *nfa,
+			struct state *s,
+			struct state *stmp)		/* s's duplicate, or NULL */
+{
+	struct arc *a;
+
+	if (s->tmp != NULL)
+		return;		/* already done */
+
+	s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
+	if (s->tmp == NULL) {
+		assert(NISERR());
+		return;
+	}
+
+	for (a = s->outs; a != NULL && !NISERR(); a = a->outchain) {
+		duptraverse(nfa, a->to, (struct state *)NULL);
+		if (NISERR())
+			break;		/* a->to->tmp may still be NULL */
+		assert(a->to->tmp != NULL);
+		cparc(nfa, a, s->tmp, a->to->tmp);
+	}
+}
+
+/*
+ * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
+ *
+ * Clears s->tmp and recurses along the outarcs; a state whose tmp is
+ * already NULL ends that branch of the walk.
+ */
+static void
+cleartraverse(struct nfa *nfa,
+			  struct state *s)
+{
+	struct arc *a;
+
+	if (s->tmp != NULL) {
+		s->tmp = NULL;
+
+		a = s->outs;
+		while (a != NULL) {
+			cleartraverse(nfa, a->to);
+			a = a->outchain;
+		}
+	}
+}
+
+/*
+ * specialcolors - fill in special colors for an NFA
+ *
+ * These are the false colors for BOS, BOL, EOS, EOL.  An NFA without a
+ * parent gets four new pseudocolors from its colormap; any other NFA
+ * copies the values its parent already holds.
+ */
+static void
+specialcolors(struct nfa *nfa)
+{
+	struct nfa *parent = nfa->parent;
+
+	if (parent != NULL) {
+		/* the parent must already have had its colors filled in */
+		assert(parent->bos[0] != COLORLESS);
+		assert(parent->bos[1] != COLORLESS);
+		assert(parent->eos[0] != COLORLESS);
+		assert(parent->eos[1] != COLORLESS);
+		nfa->bos[0] = parent->bos[0];
+		nfa->bos[1] = parent->bos[1];
+		nfa->eos[0] = parent->eos[0];
+		nfa->eos[1] = parent->eos[1];
+		return;
+	}
+
+	/* the order of these calls is kept as it always was */
+	nfa->bos[0] = pseudocolor(nfa->cm);
+	nfa->bos[1] = pseudocolor(nfa->cm);
+	nfa->eos[0] = pseudocolor(nfa->cm);
+	nfa->eos[1] = pseudocolor(nfa->cm);
+}
+
+/*
+ * optimize - optimize an NFA
+ *
+ * Runs, in order:  cleanup(), fixempties(), pullback(), pushfwd() and a
+ * final cleanup(), then returns whatever analyze() reports for the result.
+ * When built with REG_DEBUG and f is not NULL, a heading is written to f
+ * before each phase and the NFA is dumped after the initial cleanup.
+ */
+static long			/* re_info bits */
+optimize(struct nfa *nfa,
+		 FILE *f)				/* for debug output; NULL none */
+{
+#ifdef REG_DEBUG
+	int verbose = (f != NULL) ? 1 : 0;
+
+	if (verbose)
+		fprintf(f, "\ninitial cleanup:\n");
+#endif
+	cleanup(nfa);		/* may simplify situation */
+#ifdef REG_DEBUG
+	if (verbose)
+		dumpnfa(nfa, f);
+	if (verbose)
+		fprintf(f, "\nempties:\n");
+#endif
+	fixempties(nfa, f);	/* get rid of EMPTY arcs */
+#ifdef REG_DEBUG
+	if (verbose)
+		fprintf(f, "\nconstraints:\n");
+#endif
+	pullback(nfa, f);	/* pull back constraints backward */
+	pushfwd(nfa, f);	/* push fwd constraints forward */
+#ifdef REG_DEBUG
+	if (verbose)
+		fprintf(f, "\nfinal cleanup:\n");
+#endif
+	cleanup(nfa);		/* final tidying */
+	return analyze(nfa);	/* and analysis */
+}
+
+/*
+ * pullback - pull back constraints backward to (with luck) eliminate them
+ *
+ * Every state's outarcs are scanned and pull() is applied to each '^' or
+ * BEHIND arc.  The scan is repeated until a full pass makes no progress
+ * or an error has been recorded; on error the function returns at once.
+ * Afterwards, each '^' arc still leaving nfa->pre is replaced by a PLAIN
+ * arc whose color is nfa->bos[] indexed by the old arc's color (0 or 1).
+ */
+static void
+pullback(struct nfa *nfa,
+		 FILE *f)			/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	int progress;
+
+	/* find and pull until there are no more */
+	do {
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
+			nexts = s->next;	/* fetched first:  pull() may free s */
+			for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
+				nexta = a->outchain;	/* fetched first:  pull() may free a */
+				if (a->type == '^' || a->type == BEHIND)
+					if (pull(nfa, a))
+						progress = 1;
+				assert(nexta == NULL || s->no != FREESTATE);
+			}
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+	if (NISERR())
+		return;
+
+	for (a = nfa->pre->outs; a != NULL; a = nexta) {
+		nexta = a->outchain;	/* fetched first:  a may be freed below */
+		if (a->type == '^') {
+			assert(a->co == 0 || a->co == 1);
+			newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
+			freearc(nfa, a);
+		}
+	}
+}
+
+/*
+ * pull - pull a back constraint backward past its source state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's from state -- and only if the constraint
+ * was that state's last outarc.
+ *
+ * Returns 1 if the constraint arc was dealt with (pulled back or simply
+ * discarded), 0 if it could not be moved or an error was recorded.  On an
+ * error return the NFA may be left partly rewritten.
+ */
+static int			/* 0 couldn't, 1 could */
+pull(struct nfa *nfa,
+	 struct arc *con)
+{
+	struct state *from = con->from;
+	struct state *to = con->to;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *s;
+
+	if (from == to) {	/* circular constraint is pointless */
+		freearc(nfa, con);
+		return 1;
+	}
+	if (from->flag)		/* can't pull back beyond start */
+		return 0;
+	if (from->nins == 0) {	/* unreachable */
+		freearc(nfa, con);
+		return 1;
+	}
+
+	/* first, clone from state if necessary to avoid other outarcs */
+	if (from->nouts > 1) {
+		s = newstate(nfa);
+		if (NISERR())
+			return 0;
+		assert(to != from);		/* con is not an inarc */
+		copyins(nfa, from, s);		/* duplicate inarcs */
+		cparc(nfa, con, s, to);		/* move constraint arc */
+		/*
+		 * If either copy failed, s may have no outarc at all, and
+		 * s->outs must then not be taken for the constraint.
+		 */
+		if (NISERR())
+			return 0;
+		freearc(nfa, con);
+		from = s;
+		con = from->outs;
+	}
+	assert(from->nouts == 1);
+
+	/* propagate the constraint into the from state's inarcs */
+	for (a = from->ins; a != NULL; a = nexta) {
+		nexta = a->inchain;
+		switch (combine(con, a)) {
+		case INCOMPATIBLE:	/* destroy the arc */
+			freearc(nfa, a);
+			break;
+		case SATISFIED:		/* no action needed */
+			break;
+		case COMPATIBLE:	/* swap the two arcs, more or less */
+			s = newstate(nfa);
+			if (NISERR())
+				return 0;
+			cparc(nfa, a, s, to);		/* anticipate move */
+			cparc(nfa, con, a->from, s);
+			if (NISERR())
+				return 0;
+			freearc(nfa, a);
+			break;
+		default:
+			assert(NOTREACHED);
+			break;
+		}
+	}
+
+	/* remaining inarcs, if any, incorporate the constraint */
+	moveins(nfa, from, to);
+	dropstate(nfa, from);		/* will free the constraint */
+	return 1;
+}
+
+/*
+ * pushfwd - push forward constraints forward to (with luck) eliminate them
+ *
+ * Every state's inarcs are scanned and push() is applied to each '$' or
+ * AHEAD arc.  The scan is repeated until a full pass makes no progress or
+ * an error has been recorded; on error the function returns at once.
+ * Afterwards, each '$' arc still entering nfa->post is replaced by a PLAIN
+ * arc whose color is nfa->eos[] indexed by the old arc's color (0 or 1).
+ */
+static void
+pushfwd(struct nfa *nfa,
+		FILE *f)			/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	int progress;
+
+	/* find and push until there are no more */
+	do {
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
+			nexts = s->next;	/* fetched first:  push() may free s */
+			for (a = s->ins; a != NULL && !NISERR(); a = nexta) {
+				nexta = a->inchain;	/* fetched first:  push() may free a */
+				if (a->type == '$' || a->type == AHEAD)
+					if (push(nfa, a))
+						progress = 1;
+				assert(nexta == NULL || s->no != FREESTATE);
+			}
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+	if (NISERR())
+		return;
+
+	for (a = nfa->post->ins; a != NULL; a = nexta) {
+		nexta = a->inchain;	/* fetched first:  a may be freed below */
+		if (a->type == '$') {
+			assert(a->co == 0 || a->co == 1);
+			newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
+			freearc(nfa, a);
+		}
+	}
+}
+
+/*
+ * push - push a forward constraint forward past its destination state
+ * A significant property of this function is that it deletes at most
+ * one state -- the constraint's to state -- and only if the constraint
+ * was that state's last inarc.
+ *
+ * Returns 1 if the constraint arc was dealt with (pushed forward or simply
+ * discarded), 0 if it could not be moved or an error was recorded.  On an
+ * error return the NFA may be left partly rewritten.
+ */
+static int			/* 0 couldn't, 1 could */
+push(struct nfa *nfa,
+	 struct arc *con)
+{
+	struct state *from = con->from;
+	struct state *to = con->to;
+	struct arc *a;
+	struct arc *nexta;
+	struct state *s;
+
+	if (to == from) {	/* circular constraint is pointless */
+		freearc(nfa, con);
+		return 1;
+	}
+	if (to->flag)		/* can't push forward beyond end */
+		return 0;
+	if (to->nouts == 0) {	/* dead end */
+		freearc(nfa, con);
+		return 1;
+	}
+
+	/* first, clone to state if necessary to avoid other inarcs */
+	if (to->nins > 1) {
+		s = newstate(nfa);
+		if (NISERR())
+			return 0;
+		copyouts(nfa, to, s);		/* duplicate outarcs */
+		cparc(nfa, con, from, s);	/* move constraint */
+		/*
+		 * If either copy failed, s may have no inarc at all, and
+		 * s->ins must then not be taken for the constraint.
+		 */
+		if (NISERR())
+			return 0;
+		freearc(nfa, con);
+		to = s;
+		con = to->ins;
+	}
+	assert(to->nins == 1);
+
+	/* propagate the constraint into the to state's outarcs */
+	for (a = to->outs; a != NULL; a = nexta) {
+		nexta = a->outchain;
+		switch (combine(con, a)) {
+		case INCOMPATIBLE:	/* destroy the arc */
+			freearc(nfa, a);
+			break;
+		case SATISFIED:		/* no action needed */
+			break;
+		case COMPATIBLE:	/* swap the two arcs, more or less */
+			s = newstate(nfa);
+			if (NISERR())
+				return 0;
+			cparc(nfa, con, s, a->to);	/* anticipate move */
+			cparc(nfa, a, from, s);
+			if (NISERR())
+				return 0;
+			freearc(nfa, a);
+			break;
+		default:
+			assert(NOTREACHED);
+			break;
+		}
+	}
+
+	/* remaining outarcs, if any, incorporate the constraint */
+	moveouts(nfa, to, from);
+	dropstate(nfa, to);		/* will free the constraint */
+	return 1;
+}
+
+/*
+ * combine - constraint lands on an arc, what happens?
+ *
+ * The verdict depends on the pair (constraint type, arc type), which the
+ * local CA() macro packs into a single switch value, and for some pairs
+ * on whether the two arcs carry the same color.  A pair that is not
+ * listed trips the NOTREACHED assertion and is reported as INCOMPATIBLE.
+ *
+ * #def	INCOMPATIBLE	1	// destroys arc
+ * #def	SATISFIED		2	// constraint satisfied
+ * #def	COMPATIBLE		3	// compatible but not satisfied yet
+ */
+static int
+combine(struct arc *con,
+		struct arc *a)
+{
+#	define	CA(ct,at)	(((ct)<<CHAR_BIT) | (at))
+
+	switch (CA(con->type, a->type)) {
+	case CA('^', PLAIN):		/* newlines are handled separately */
+	case CA('$', PLAIN):
+		return INCOMPATIBLE;
+		break;
+	case CA(AHEAD, PLAIN):		/* color constraints meet colors */
+	case CA(BEHIND, PLAIN):
+		if (con->co == a->co)
+			return SATISFIED;
+		return INCOMPATIBLE;
+		break;
+	case CA('^', '^'):		/* collision, similar constraints */
+	case CA('$', '$'):
+	case CA(AHEAD, AHEAD):
+	case CA(BEHIND, BEHIND):
+		if (con->co == a->co)		/* true duplication */
+			return SATISFIED;
+		return INCOMPATIBLE;
+		break;
+	case CA('^', BEHIND):		/* collision, dissimilar constraints */
+	case CA(BEHIND, '^'):
+	case CA('$', AHEAD):
+	case CA(AHEAD, '$'):
+		return INCOMPATIBLE;
+		break;
+	case CA('^', '$'):		/* constraints passing each other */
+	case CA('^', AHEAD):
+	case CA(BEHIND, '$'):
+	case CA(BEHIND, AHEAD):
+	case CA('$', '^'):
+	case CA('$', BEHIND):
+	case CA(AHEAD, '^'):
+	case CA(AHEAD, BEHIND):
+	case CA('^', LACON):
+	case CA(BEHIND, LACON):
+	case CA('$', LACON):
+	case CA(AHEAD, LACON):
+		return COMPATIBLE;
+		break;
+	}
+	assert(NOTREACHED);
+	return INCOMPATIBLE;		/* for benefit of blind compilers */
+}
+
+/*
+ * fixempties - get rid of EMPTY arcs
+ *
+ * Every state's outarcs are scanned and unempty() is applied to each
+ * EMPTY arc.  The scan is repeated until a full pass makes no progress or
+ * an error has been recorded.  If f is not NULL the NFA is dumped to it
+ * after each pass that made progress.
+ */
+static void
+fixempties(struct nfa *nfa,
+		   FILE *f)			/* for debug output; NULL none */
+{
+	struct state *s;
+	struct state *nexts;
+	struct arc *a;
+	struct arc *nexta;
+	int progress;
+
+	/* find and eliminate empties until there are no more */
+	do {
+		progress = 0;
+		for (s = nfa->states; s != NULL && !NISERR(); s = nexts) {
+			nexts = s->next;	/* fetched before unempty() can free s */
+			for (a = s->outs; a != NULL && !NISERR(); a = nexta) {
+				nexta = a->outchain;	/* fetched before unempty() frees a */
+				if (a->type == EMPTY && unempty(nfa, a))
+					progress = 1;
+				assert(nexta == NULL || s->no != FREESTATE);
+			}
+		}
+		if (progress && f != NULL)
+			dumpnfa(nfa, f);
+	} while (progress && !NISERR());
+}
+
+/*
+ * unempty - optimize out an EMPTY arc, if possible
+ *
+ * Actually, as it stands this function always succeeds, but the return
+ * value is kept with an eye on possible future changes.
+ */
+static int			/* 0 couldn't, 1 could */
+unempty(struct nfa *nfa,
+		struct arc *a)		/* the EMPTY arc to remove; freed here */
+{
+	struct state *from = a->from;
+	struct state *to = a->to;
+	int usefrom;		/* work on from, as opposed to to? */
+
+	assert(a->type == EMPTY);
+	assert(from != nfa->pre && to != nfa->post);
+
+	if (from == to) {		/* vacuous loop */
+		freearc(nfa, a);
+		return 1;
+	}
+
+	/* decide which end to work on */
+	usefrom = 1;			/* default:  attack from */
+	if (from->nouts > to->nins)	/* from has more outarcs than to has inarcs */
+		usefrom = 0;
+	else if (from->nouts == to->nins) {
+		/* decide on secondary issue:  move/copy fewest arcs */
+		if (from->nins > to->nouts)
+			usefrom = 0;
+	}
+		
+	freearc(nfa, a);		/* the EMPTY arc itself goes in every case */
+	if (usefrom) {
+		if (from->nouts == 0) {
+			/* was the state's only outarc */
+			moveins(nfa, from, to);
+			freestate(nfa, from);	/* from is now disconnected */
+		} else
+			copyins(nfa, from, to);
+	} else {
+		if (to->nins == 0) {
+			/* was the state's only inarc */
+			moveouts(nfa, to, from);
+			freestate(nfa, to);	/* to is now disconnected */
+		} else
+			copyouts(nfa, to, from);
+	}
+
+	return 1;
+}
+
+/*
+ * cleanup - clean up NFA after optimizations: drop useless states, renumber
+ */
+static void
+cleanup(struct nfa *nfa)
+{
+	struct state *s;
+	struct state *nexts;		/* s->next, saved because s may be dropped */
+	int n;				/* next state number to assign */
+
+	/* clear out unreachable or dead-end states */
+	/* use pre to mark reachable, then post to mark can-reach-post */
+	markreachable(nfa, nfa->pre, (struct state *)NULL, nfa->pre);
+	markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
+	for (s = nfa->states; s != NULL; s = nexts) {
+		nexts = s->next;
+		if (s->tmp != nfa->post && !s->flag)	/* not fully marked; flagged states stay */
+			dropstate(nfa, s);
+	}
+	assert(nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
+	cleartraverse(nfa, nfa->pre);
+	assert(nfa->post->nins == 0 || nfa->post->tmp == NULL);
+	/* the nins==0 (final unreachable) case will be caught later */
+
+	/* renumber surviving states */
+	n = 0;
+	for (s = nfa->states; s != NULL; s = s->next)
+		s->no = n++;
+	nfa->nstates = n;
+}
+
+/*
+ * markreachable - recursive marking of reachable states (follows outarcs)
+ */
+static void
+markreachable(struct nfa *nfa,	/* only handed on to the recursive calls */
+			  struct state *s,
+			  struct state *okay, /* consider only states with this mark */
+			  struct state *mark) /* the value to mark with */
+{
+	struct arc *a;
+
+	if (s->tmp != okay)		/* not eligible, or already marked */
+		return;
+	s->tmp = mark;			/* mark before recursing, so loops terminate */
+
+	for (a = s->outs; a != NULL; a = a->outchain)
+		markreachable(nfa, a->to, okay, mark);
+}
+
+/*
+ * markcanreach - recursive marking of states which can reach here (via inarcs)
+ */
+static void
+markcanreach(struct nfa *nfa,	/* only handed on to the recursive calls */
+			 struct state *s,
+			 struct state *okay, /* consider only states with this mark */
+			 struct state *mark) /* the value to mark with */
+{
+	struct arc *a;
+
+	if (s->tmp != okay)		/* not eligible, or already marked */
+		return;
+	s->tmp = mark;			/* mark before recursing, so loops terminate */
+
+	for (a = s->ins; a != NULL; a = a->inchain)
+		markcanreach(nfa, a->from, okay, mark);
+}
+
+/*
+ * analyze - ascertain potentially-useful facts about an optimized NFA
+ */
+static long			/* re_info bits to be ORed in */
+analyze(struct nfa *nfa)
+{
+	struct arc *a;			/* walks the outarcs of pre */
+	struct arc *aa;			/* walks the outarcs of a's target state */
+
+	if (nfa->pre->outs == NULL)	/* pre has no outarcs at all */
+		return REG_UIMPOSSIBLE;
+	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+		for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
+			if (aa->to == nfa->post)	/* two-arc path pre -> x -> post */
+				return REG_UEMPTYMATCH;
+	return 0;			/* nothing noteworthy found */
+}
+
+/*
+ * compact - compact an NFA into the array-based cnfa representation
+ */
+static void
+compact(struct nfa *nfa,
+		struct cnfa *cnfa)	/* filled in here; arrays are MALLOCed */
+{
+	struct state *s;
+	struct arc *a;
+	size_t nstates;
+	size_t narcs;			/* total carc slots needed, see loop below */
+	struct carc *ca;		/* next unused slot in cnfa->arcs */
+	struct carc *first;		/* first real arc of the current state */
+
+	assert (!NISERR());
+
+	nstates = 0;
+	narcs = 0;
+	for (s = nfa->states; s != NULL; s = s->next) {
+		nstates++;
+		narcs += 1 + s->nouts + 1;
+		/* 1 as a fake for flags, nouts for arcs, 1 as endmarker */
+	}
+
+	cnfa->states = (struct carc **)MALLOC(nstates * sizeof(struct carc *));
+	cnfa->arcs = (struct carc *)MALLOC(narcs * sizeof(struct carc));
+	if (cnfa->states == NULL || cnfa->arcs == NULL) {
+		if (cnfa->states != NULL)	/* release whichever one succeeded */
+			FREE(cnfa->states);
+		if (cnfa->arcs != NULL)
+			FREE(cnfa->arcs);
+		NERR(REG_ESPACE);
+		return;			/* note: cnfa->nstates is not set on this path */
+	}
+	cnfa->nstates = nstates;
+	cnfa->pre = nfa->pre->no;
+	cnfa->post = nfa->post->no;
+	cnfa->bos[0] = nfa->bos[0];
+	cnfa->bos[1] = nfa->bos[1];
+	cnfa->eos[0] = nfa->eos[0];
+	cnfa->eos[1] = nfa->eos[1];
+	cnfa->ncolors = maxcolor(nfa->cm) + 1;
+	cnfa->flags = 0;
+
+	ca = cnfa->arcs;
+	for (s = nfa->states; s != NULL; s = s->next) {
+		assert((size_t)s->no < nstates);
+		cnfa->states[s->no] = ca;	/* state's entry starts at its flags slot */
+		ca->co = 0;		/* clear and skip flags "arc" */
+		ca++;
+		first = ca;
+		for (a = s->outs; a != NULL; a = a->outchain)
+			switch (a->type) {
+			case PLAIN:
+				ca->co = a->co;
+				ca->to = a->to->no;
+				ca++;
+				break;
+			case LACON:
+				assert(s->no != cnfa->pre);
+				ca->co = (color)(cnfa->ncolors + a->co);	/* numbered past real colors */
+				ca->to = a->to->no;
+				ca++;
+				cnfa->flags |= HASLACONS;
+				break;
+			default:
+				assert(NOTREACHED);	/* only PLAIN and LACON arcs expected */
+				break;
+			}
+		carcsort(first, ca-1);	/* second arg points AT the last arc written */
+		ca->co = COLORLESS;	/* endmarker slot */
+		ca->to = 0;
+		ca++;
+	}
+	assert(ca == &cnfa->arcs[narcs]);	/* used exactly what was counted */
+	assert(cnfa->nstates != 0);
+
+	/* mark no-progress states */
+	for (a = nfa->pre->outs; a != NULL; a = a->outchain)
+		cnfa->states[a->to->no]->co = 1;	/* flags slot of each successor of pre */
+	cnfa->states[nfa->pre->no]->co = 1;	/* and of pre itself */
+}
+
+/*
+ * carcsort - sort compacted-NFA arcs in place by color, ties broken by target
+ *
+ * first and last both point AT arcs (inclusive); last < first means no arcs.
+ * Really dumb algorithm, but if the list is long enough to matter, trouble.
+ */
+static void
+carcsort(struct carc *first,
+		 struct carc *last)
+{
+	struct carc *p;
+	struct carc *q;
+	struct carc tmp;
+
+	if (last - first < 1)		/* zero or one arc: nothing to sort */
+		return;
+
+	for (p = first; p <= last; p++)
+		for (q = p; q <= last; q++)
+			if (p->co > q->co ||
+					(p->co == q->co && p->to > q->to)) {
+				assert(p != q);	/* an arc never outranks itself */
+				tmp = *p;
+				*p = *q;
+				*q = tmp;
+			}
+}
+
+/*
+ * freecnfa - free a compacted NFA's arrays and mark it empty (nstates = 0)
+ */
+static void
+freecnfa(struct cnfa *cnfa)
+{
+	assert(cnfa->nstates != 0);	/* not empty already */
+	cnfa->nstates = 0;		/* zero nstates is the "empty" marker */
+	FREE(cnfa->states);
+	FREE(cnfa->arcs);
+}
+
+/*
+ * dumpnfa - dump an NFA in human-readable form (no-op unless REG_DEBUG)
+ */
+static void
+dumpnfa(struct nfa *nfa,
+		FILE *f)		/* stream to write the dump to */
+{
+#ifdef REG_DEBUG
+	struct state *s;
+
+	fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
+	if (nfa->bos[0] != COLORLESS)	/* each special color shown only if set */
+		fprintf(f, ", bos [%ld]", (long)nfa->bos[0]);
+	if (nfa->bos[1] != COLORLESS)
+		fprintf(f, ", bol [%ld]", (long)nfa->bos[1]);
+	if (nfa->eos[0] != COLORLESS)
+		fprintf(f, ", eos [%ld]", (long)nfa->eos[0]);
+	if (nfa->eos[1] != COLORLESS)
+		fprintf(f, ", eol [%ld]", (long)nfa->eos[1]);
+	fprintf(f, "\n");
+	for (s = nfa->states; s != NULL; s = s->next)
+		dumpstate(s, f);
+	if (nfa->parent == NULL)	/* colormap dumped only for a parentless NFA */
+		dumpcolors(nfa->cm, f);
+	fflush(f);
+#endif
+}
+
+#ifdef REG_DEBUG		/* subordinates of dumpnfa */
+
+/*
+ * dumpstate - dump an NFA state in human-readable form, with sanity checks
+ */
+static void
+dumpstate(struct state *s,
+		  FILE *f)
+{
+	struct arc *a;
+
+	fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
+					(s->flag) ? s->flag : '.');	/* number, tmp marker, flag */
+	if (s->prev != NULL && s->prev->next != s)	/* back/forward links disagree */
+		fprintf(f, "\tstate chain bad\n");
+	if (s->nouts == 0)
+		fprintf(f, "\tno out arcs\n");
+	else
+		dumparcs(s, f);
+	fflush(f);
+	for (a = s->ins; a != NULL; a = a->inchain) {
+		if (a->to != s)		/* arc on our in-chain that does not end here */
+			fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
+					a->from->no, a->to->no, s->no);
+	}
+}
+
+/*
+ * dumparcs - dump out-arcs in human-readable form
+ */
+static void
+dumparcs(struct state *s,
+		 FILE *f)
+{
+	int pos;			/* print position returned by dumprarcs */
+
+	assert(s->nouts > 0);
+	/* printing arcs in reverse order is usually clearer */
+	pos = dumprarcs(s->outs, s, f, 1);
+	if (pos != 1)			/* last output line was left unfinished */
+		fprintf(f, "\n");
+}
+
+/*
+ * dumprarcs - dump remaining outarcs, recursively, in reverse order
+ */
+static int			/* resulting print position */
+dumprarcs(struct arc *a,
+		  struct state *s,
+		  FILE *f,
+		  int pos)			/* initial print position */
+{
+	if (a->outchain != NULL)	/* print the rest of the chain first */
+		pos = dumprarcs(a->outchain, s, f, pos);
+	dumparc(a, s, f);
+	if (pos == 5) {			/* five arcs per output line */
+		fprintf(f, "\n");
+		pos = 1;
+	} else
+		pos++;
+	return pos;
+}
+
+/*
+ * dumparc - dump one outarc in readable form, including prefixing tab
+ */
+static void
+dumparc(struct arc *a,
+		struct state *s,	/* state whose out-chain a was found on */
+		FILE *f)
+{
+	struct arc *aa;
+	struct arcbatch *ab;
+
+	fprintf(f, "\t");
+	switch (a->type) {		/* each arc type gets its own bracketing */
+	case PLAIN:
+		fprintf(f, "[%ld]", (long)a->co);
+		break;
+	case AHEAD:
+		fprintf(f, ">%ld>", (long)a->co);
+		break;
+	case BEHIND:
+		fprintf(f, "<%ld<", (long)a->co);
+		break;
+	case LACON:
+		fprintf(f, ":%ld:", (long)a->co);
+		break;
+	case '^':
+	case '$':
+		fprintf(f, "%c%d", a->type, (int)a->co);
+		break;
+	case EMPTY:			/* EMPTY arcs print no label */
+		break;
+	default:			/* unknown type: raw type and color */
+		fprintf(f, "0x%x/0%lo", a->type, (long)a->co);
+		break;
+	}
+	if (a->from != s)		/* arc claims a different source state */
+		fprintf(f, "?%d?", a->from->no);
+	for (ab = &a->from->oas; ab != NULL; ab = ab->next) {	/* search from's arc batches */
+		for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
+			if (aa == a)
+				break;		/* NOTE BREAK OUT */
+		if (aa < &ab->a[ABSIZE])	/* propagate break */
+				break;		/* NOTE BREAK OUT */
+	}
+	if (ab == NULL)
+		fprintf(f, "?!?");	/* not in allocated space */
+	fprintf(f, "->");
+	if (a->to == NULL) {
+		fprintf(f, "NULL");
+		return;
+	}
+	fprintf(f, "%d", a->to->no);
+	for (aa = a->to->ins; aa != NULL; aa = aa->inchain)	/* confirm a is on to's in-chain */
+		if (aa == a)
+			break;		/* NOTE BREAK OUT */
+	if (aa == NULL)
+		fprintf(f, "?!?");	/* missing from in-chain */
+}
+
+#endif /* REG_DEBUG */
+
+/*
+ * dumpcnfa - dump a compacted NFA in human-readable form (REG_DEBUG only)
+ */
+#ifdef REG_DEBUG
+static void
+dumpcnfa(struct cnfa *cnfa,
+		 FILE *f)
+{
+	int st;				/* state number being dumped */
+
+	fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
+	if (cnfa->bos[0] != COLORLESS)	/* each special color shown only if set */
+		fprintf(f, ", bos [%ld]", (long)cnfa->bos[0]);
+	if (cnfa->bos[1] != COLORLESS)
+		fprintf(f, ", bol [%ld]", (long)cnfa->bos[1]);
+	if (cnfa->eos[0] != COLORLESS)
+		fprintf(f, ", eos [%ld]", (long)cnfa->eos[0]);
+	if (cnfa->eos[1] != COLORLESS)
+		fprintf(f, ", eol [%ld]", (long)cnfa->eos[1]);
+	if (cnfa->flags&HASLACONS)
+		fprintf(f, ", haslacons");
+	fprintf(f, "\n");
+	for (st = 0; st < cnfa->nstates; st++)
+		dumpcstate(st, cnfa->states[st], cnfa, f);
+	fflush(f);
+}
+#endif
+
+#ifdef REG_DEBUG		/* subordinates of dumpcnfa */
+
+/*
+ * dumpcstate - dump a compacted-NFA state in human-readable form
+ */
+static void
+dumpcstate(int st,
+		   struct carc *ca,	/* the state's carc entries; ca[0] is read as a flag */
+		   struct cnfa *cnfa,
+		   FILE *f)
+{
+	int i;
+	int pos;			/* arcs printed so far on this line, plus 1 */
+
+	fprintf(f, "%d%s", st, (ca[0].co) ? ":" : ".");
+	pos = 1;
+	for (i = 1; ca[i].co != COLORLESS; i++) {	/* COLORLESS entry ends the list */
+		if (ca[i].co < cnfa->ncolors)
+			fprintf(f, "\t[%ld]->%d", (long)ca[i].co, ca[i].to);
+		else			/* values at or past ncolors print in :n: form */
+			fprintf(f, "\t:%ld:->%d", (long)ca[i].co-cnfa->ncolors,
+								ca[i].to);
+		if (pos == 5) {		/* five arcs per output line */
+			fprintf(f, "\n");
+			pos = 1;
+		} else
+			pos++;
+	}
+	if (i == 1 || pos != 1)		/* no arcs at all, or an unfinished line */
+		fprintf(f, "\n");
+	fflush(f);
+}
+
+#endif /* REG_DEBUG */
diff --git a/src/backend/regex/regcomp.c b/src/backend/regex/regcomp.c
index fdc299bf797..099a1872a8d 100644
--- a/src/backend/regex/regcomp.c
+++ b/src/backend/regex/regcomp.c
@@ -1,1787 +1,2131 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
+/*
+ * re_*comp and friends - compile REs
+ * This file #includes several others (see the bottom).
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * $Header: /cvsroot/pgsql/src/backend/regex/regcomp.c,v 1.36 2003/02/05 17:41:33 tgl Exp $
  *
- *		@(#)regcomp.c	8.5 (Berkeley) 3/20/94
  */
 
-#include "postgres.h"
-
-#include <ctype.h>
-#include <limits.h>
-#include <assert.h>
-
-#include "regex/regex.h"
-#include "regex/utils.h"
-#include "regex/regex2.h"
-#include "regex/cname.h"
-
-struct cclass
-{
-	char	   *name;
-	char	   *chars;
-	char	   *multis;
-};
-static struct cclass *cclasses = NULL;
-static struct cclass *cclass_init(void);
+#include "regex/regguts.h"
 
 /*
- * parse structure, passed up and down to avoid global variables and
- * other clumsinesses
+ * forward declarations, up here so forward datatypes etc. are defined early
  */
-struct parse
-{
-	pg_wchar   *next;			/* next character in RE */
-	pg_wchar   *end;			/* end of string (-> NUL normally) */
-	int			error;			/* has an error been seen? */
-	sop		   *strip;			/* malloced strip */
-	sopno		ssize;			/* malloced strip size (allocated) */
-	sopno		slen;			/* malloced strip length (used) */
-	int			ncsalloc;		/* number of csets allocated */
-	struct re_guts *g;
-#define  NPAREN  10				/* we need to remember () 1-9 for back
-								 * refs */
-	sopno		pbegin[NPAREN]; /* -> ( ([0] unused) */
-	sopno		pend[NPAREN];	/* -> ) ([0] unused) */
+/* === regcomp.c === */
+static void moresubs (struct vars *, int);
+static int freev (struct vars *, int);
+static void makesearch (struct vars *, struct nfa *);
+static struct subre *parse (struct vars *, int, int, struct state *, struct state *);
+static struct subre *parsebranch (struct vars *, int, int, struct state *, struct state *, int);
+static void parseqatom (struct vars *, int, int, struct state *, struct state *, struct subre *);
+static void nonword (struct vars *, int, struct state *, struct state *);
+static void word (struct vars *, int, struct state *, struct state *);
+static int scannum (struct vars *);
+static void repeat (struct vars *, struct state *, struct state *, int, int);
+static void bracket (struct vars *, struct state *, struct state *);
+static void cbracket (struct vars *, struct state *, struct state *);
+static void brackpart (struct vars *, struct state *, struct state *);
+static chr *scanplain (struct vars *);
+static void leaders (struct vars *, struct cvec *);
+static void onechr (struct vars *, chr, struct state *, struct state *);
+static void dovec (struct vars *, struct cvec *, struct state *, struct state *);
+static celt nextleader (struct vars *, chr, chr);
+static void wordchrs (struct vars *);
+static struct subre *subre (struct vars *, int, int, struct state *, struct state *);
+static void freesubre (struct vars *, struct subre *);
+static void freesrnode (struct vars *, struct subre *);
+static void optst (struct vars *, struct subre *);
+static int numst (struct subre *, int);
+static void markst (struct subre *);
+static void cleanst (struct vars *);
+static long nfatree (struct vars *, struct subre *, FILE *);
+static long nfanode (struct vars *, struct subre *, FILE *);
+static int newlacon (struct vars *, struct state *, struct state *, int);
+static void freelacons (struct subre *, int);
+static void rfree (regex_t *);
+#ifdef REG_DEBUG
+static void dump (regex_t *, FILE *);
+static void dumpst (struct subre *, FILE *, int);
+static void stdump (struct subre *, FILE *, int);
+static char *stid (struct subre *, char *, size_t);
+#endif
+/* === regc_lex.c === */
+static void lexstart (struct vars *);
+static void prefixes (struct vars *);
+static void lexnest (struct vars *, chr *, chr *);
+static void lexword (struct vars *);
+static int next (struct vars *);
+static int lexescape (struct vars *);
+static chr lexdigits (struct vars *, int, int, int);
+static int brenext (struct vars *, chr);
+static void skip (struct vars *);
+static chr newline (void);
+static chr chrnamed (struct vars *, chr *, chr *, chr);
+/* === regc_color.c === */
+static void initcm (struct vars *, struct colormap *);
+static void freecm (struct colormap *);
+static void cmtreefree (struct colormap *, union tree *, int);
+static color setcolor (struct colormap *, chr, pcolor);
+static color maxcolor (struct colormap *);
+static color newcolor (struct colormap *);
+static void freecolor (struct colormap *, pcolor);
+static color pseudocolor (struct colormap *);
+static color subcolor (struct colormap *, chr c);
+static color newsub (struct colormap *, pcolor);
+static void subrange (struct vars *, chr, chr, struct state *, struct state *);
+static void subblock (struct vars *, chr, struct state *, struct state *);
+static void okcolors (struct nfa *, struct colormap *);
+static void colorchain (struct colormap *, struct arc *);
+static void uncolorchain (struct colormap *, struct arc *);
+static int singleton (struct colormap *, chr c);
+static void rainbow (struct nfa *, struct colormap *, int, pcolor, struct state *, struct state *);
+static void colorcomplement (struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
+#ifdef REG_DEBUG
+static void dumpcolors (struct colormap *, FILE *);
+static void fillcheck (struct colormap *, union tree *, int, FILE *);
+static void dumpchr (chr, FILE *);
+#endif
+/* === regc_nfa.c === */
+static struct nfa *newnfa (struct vars *, struct colormap *, struct nfa *);
+static void freenfa (struct nfa *);
+static struct state *newstate (struct nfa *);
+static struct state *newfstate (struct nfa *, int flag);
+static void dropstate (struct nfa *, struct state *);
+static void freestate (struct nfa *, struct state *);
+static void destroystate (struct nfa *, struct state *);
+static void newarc (struct nfa *, int, pcolor, struct state *, struct state *);
+static struct arc *allocarc (struct nfa *, struct state *);
+static void freearc (struct nfa *, struct arc *);
+static struct arc *findarc (struct state *, int, pcolor);
+static void cparc (struct nfa *, struct arc *, struct state *, struct state *);
+static void moveins (struct nfa *, struct state *, struct state *);
+static void copyins (struct nfa *, struct state *, struct state *);
+static void moveouts (struct nfa *, struct state *, struct state *);
+static void copyouts (struct nfa *, struct state *, struct state *);
+static void cloneouts (struct nfa *, struct state *, struct state *, struct state *, int);
+static void delsub (struct nfa *, struct state *, struct state *);
+static void deltraverse (struct nfa *, struct state *, struct state *);
+static void dupnfa (struct nfa *, struct state *, struct state *, struct state *, struct state *);
+static void duptraverse (struct nfa *, struct state *, struct state *);
+static void cleartraverse (struct nfa *, struct state *);
+static void specialcolors (struct nfa *);
+static long optimize (struct nfa *, FILE *);
+static void pullback (struct nfa *, FILE *);
+static int pull (struct nfa *, struct arc *);
+static void pushfwd (struct nfa *, FILE *);
+static int push (struct nfa *, struct arc *);
+#define	INCOMPATIBLE	1	/* destroys arc */
+#define	SATISFIED	2	/* constraint satisfied */
+#define	COMPATIBLE	3	/* compatible but not satisfied yet */
+static int combine (struct arc *, struct arc *);
+static void fixempties (struct nfa *, FILE *);
+static int unempty (struct nfa *, struct arc *);
+static void cleanup (struct nfa *);
+static void markreachable (struct nfa *, struct state *, struct state *, struct state *);
+static void markcanreach (struct nfa *, struct state *, struct state *, struct state *);
+static long analyze (struct nfa *);
+static void compact (struct nfa *, struct cnfa *);
+static void carcsort (struct carc *, struct carc *);
+static void freecnfa (struct cnfa *);
+static void dumpnfa (struct nfa *, FILE *);
+#ifdef REG_DEBUG
+static void dumpstate (struct state *, FILE *);
+static void dumparcs (struct state *, FILE *);
+static int dumprarcs (struct arc *, struct state *, FILE *, int);
+static void dumparc (struct arc *, struct state *, FILE *);
+static void dumpcnfa (struct cnfa *, FILE *);
+static void dumpcstate (int, struct carc *, struct cnfa *, FILE *);
+#endif
+/* === regc_cvec.c === */
+static struct cvec *newcvec (int, int, int);
+static struct cvec *clearcvec (struct cvec *);
+static void addchr (struct cvec *, chr);
+static void addrange (struct cvec *, chr, chr);
+static void addmcce (struct cvec *, chr *, chr *);
+static int haschr (struct cvec *, chr);
+static struct cvec *getcvec (struct vars *, int, int, int);
+static void freecvec (struct cvec *);
+/* === regc_locale.c === */
+static int pg_isdigit(pg_wchar c);
+static int pg_isalpha(pg_wchar c);
+static int pg_isalnum(pg_wchar c);
+static int pg_isupper(pg_wchar c);
+static int pg_islower(pg_wchar c);
+static int pg_isgraph(pg_wchar c);
+static int pg_ispunct(pg_wchar c);
+static int pg_isspace(pg_wchar c);
+static pg_wchar pg_toupper(pg_wchar c);
+static pg_wchar pg_tolower(pg_wchar c);
+static int nmcces (struct vars *);
+static int nleaders (struct vars *);
+static struct cvec *allmcces (struct vars *, struct cvec *);
+static celt element (struct vars *, chr *, chr *);
+static struct cvec *range (struct vars *, celt, celt, int);
+static int before (celt, celt);
+static struct cvec *eclass (struct vars *, celt, int);
+static struct cvec *cclass (struct vars *, chr *, chr *, int);
+static struct cvec *allcases (struct vars *, chr);
+static int cmp (const chr *, const chr *, size_t);
+static int casecmp (const chr *, const chr *, size_t);
+
+
+/* internal variables, bundled for easy passing around */
+struct vars {
+	regex_t *re;
+	chr *now;		/* scan pointer into string */
+	chr *stop;		/* end of string */
+	chr *savenow;		/* saved now and stop for "subroutine call" */
+	chr *savestop;
+	int err;		/* error code (0 if none) */
+	int cflags;		/* copy of compile flags */
+	int lasttype;		/* type of previous token */
+	int nexttype;		/* type of next token */
+	chr nextvalue;		/* value (if any) of next token */
+	int lexcon;		/* lexical context type (see regc_lex.c) */
+	int nsubexp;		/* subexpression count */
+	struct subre **subs;	/* subRE pointer vector */
+	size_t nsubs;		/* length of subs vector */
+	struct subre *sub10[10];	/* initial vector, enough for most */
+	struct nfa *nfa;	/* the NFA */
+	struct colormap *cm;	/* character color map */
+	color nlcolor;		/* color of newline */
+	struct state *wordchrs;	/* state in nfa holding word-char outarcs */
+	struct subre *tree;	/* subexpression tree */
+	struct subre *treechain;	/* all tree nodes allocated */
+	struct subre *treefree;		/* any free tree nodes */
+	int ntree;		/* number of tree nodes */
+	struct cvec *cv;	/* interface cvec */
+	struct cvec *cv2;	/* utility cvec */
+	struct cvec *mcces;	/* collating-element information */
+#		define	ISCELEADER(v,c)	(v->mcces != NULL && haschr(v->mcces, (c)))
+	struct state *mccepbegin;	/* in nfa, start of MCCE prototypes */
+	struct state *mccepend;	/* in nfa, end of MCCE prototypes */
+	struct subre *lacons;	/* lookahead-constraint vector */
+	int nlacons;		/* size of lacons */
 };
 
-static void p_ere(struct parse * p, int stop);
-static void p_ere_exp(struct parse * p);
-static void p_str(struct parse * p);
-static void p_bre(struct parse * p, int end1, int end2);
-static int	p_simp_re(struct parse * p, int starordinary);
-static int	p_count(struct parse * p);
-static void p_bracket(struct parse * p);
-static void p_b_term(struct parse * p, cset *cs);
-static void p_b_cclass(struct parse * p, cset *cs);
-static void p_b_eclass(struct parse * p, cset *cs);
-static pg_wchar p_b_symbol(struct parse * p);
-static char p_b_coll_elem(struct parse * p, int endc);
-static unsigned char othercase(int ch);
-static void bothcases(struct parse * p, int ch);
-static void ordinary(struct parse * p, int ch);
-static void nonnewline(struct parse * p);
-static void repeat(struct parse * p, sopno start, int from, int to);
-static int	seterr(struct parse * p, int e);
-static cset *allocset(struct parse * p);
-static void freeset(struct parse * p, cset *cs);
-static int	freezeset(struct parse * p, cset *cs);
-static int	firstch(struct parse * p, cset *cs);
-static int	nch(struct parse * p, cset *cs);
-static void mcadd(struct parse * p, cset *cs, char *cp);
-static void mcinvert(struct parse * p, cset *cs);
-static void mccase(struct parse * p, cset *cs);
-static int	isinsets(struct re_guts * g, int c);
-static int	samesets(struct re_guts * g, int c1, int c2);
-static void categorize(struct parse * p, struct re_guts * g);
-static sopno dupl(struct parse * p, sopno start, sopno finish);
-static void doemit(struct parse * p, sop op, size_t opnd);
-static void doinsert(struct parse * p, sop op, size_t opnd, sopno pos);
-static void dofwd(struct parse * p, sopno pos, sop value);
-static void enlarge(struct parse * p, sopno size);
-static void stripsnug(struct parse * p, struct re_guts * g);
-static void findmust(struct parse * p, struct re_guts * g);
-static sopno pluscount(struct parse * p, struct re_guts * g);
-static int	pg_isdigit(int c);
-static int	pg_isalpha(int c);
-static int	pg_isalnum(int c);
-static int	pg_isupper(int c);
-static int	pg_islower(int c);
-static int	pg_iscntrl(int c);
-static int	pg_isgraph(int c);
-static int	pg_isprint(int c);
-static int	pg_ispunct(int c);
-
-static pg_wchar nuls[10];		/* place to point scanner in event of
-								 * error */
+/* parsing macros; most know that `v' is the struct vars pointer */
+#define	NEXT()	(next(v))		/* advance by one token */
+#define	SEE(t)	(v->nexttype == (t))	/* is next token this? */
+#define	EAT(t)	(SEE(t) && next(v))	/* if next is this, swallow it */
+#define	VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
+#define	ISERR()	VISERR(v)		/* VISERR applied to the ambient v */
+#define	VERR(vv,e)	((vv)->nexttype = EOS, ((vv)->err) ? (vv)->err :\
+							((vv)->err = (e)))
+#define	ERR(e)	VERR(v, e)		/* record an error; first one sticks, next token forced to EOS */
+#define	NOERR()	{if (ISERR()) return;}	/* if error seen, return */
+#define	NOERRN()	{if (ISERR()) return NULL;}	/* NOERR, returning NULL */
+#define	NOERRZ()	{if (ISERR()) return 0;}	/* NOERR, returning 0 */
+#define	INSIST(c, e)	((c) ? 0 : ERR(e))	/* if condition false, error */
+#define	NOTE(b)	(v->re->re_info |= (b))		/* note visible condition */
+#define	EMPTYARC(x, y)	newarc(v->nfa, EMPTY, 0, x, y)	/* EMPTY arc between states x and y */
+
+/* token type codes, some also used as NFA arc types */
+#define	EMPTY	'n'		/* no token present */
+#define	EOS	'e'		/* end of string */
+#define	PLAIN	'p'		/* ordinary character */
+#define	DIGIT	'd'		/* digit (in bound) */
+#define	BACKREF	'b'		/* back reference */
+#define	COLLEL	'I'		/* start of [. */
+#define	ECLASS	'E'		/* start of [= */
+#define	CCLASS	'C'		/* start of [: */
+#define	END	'X'		/* end of [. [= [: */
+#define	RANGE	'R'		/* - within [] which might be range delim. */
+#define	LACON	'L'		/* lookahead constraint subRE */
+#define	AHEAD	'a'		/* color-lookahead arc */
+#define	BEHIND	'r'		/* color-lookbehind arc */
+#define	WBDRY	'w'		/* word boundary constraint */
+#define	NWBDRY	'W'		/* non-word-boundary constraint */
+#define	SBEGIN	'A'		/* beginning of string (even if not BOL) */
+#define	SEND	'Z'		/* end of string (even if not EOL) */
+#define	PREFER	'P'		/* length preference */
+
+/* is an arc colored, and hence on a color chain? */
+#define	COLORED(a)	((a)->type == PLAIN || (a)->type == AHEAD || \
+							(a)->type == BEHIND)
+
+
+
+/* static function list */
+static struct fns functions = {
+	rfree,			/* regfree insides */
+};
 
-/*
- * macros for use with parse structure
- * BEWARE:	these know that the parse structure is named `p' !!!
- */
-#define PEEK()	(*p->next)
-#define PEEK2() (*(p->next+1))
-#define MORE()	(p->next < p->end)
-#define MORE2() (p->next+1 < p->end)
-#define SEE(c)	(MORE() && PEEK() == (c))
-#define SEETWO(a, b)	(MORE() && MORE2() && PEEK() == (a) && PEEK2() == (b))
-#define EAT(c)	((SEE(c)) ? (NEXT(), 1) : 0)
-#define EATTWO(a, b)	((SEETWO(a, b)) ? (NEXT2(), 1) : 0)
-#define NEXT()	(p->next++)
-#define NEXT2() (p->next += 2)
-#define NEXTn(n)		(p->next += (n))
-#define GETNEXT()		(*p->next++)
-#define SETERROR(e)		seterr(p, (e))
-#define REQUIRE(co, e)	if (!(co)) SETERROR(e)
-#define MUSTSEE(c, e)	REQUIRE(MORE() && PEEK() == (c), e)
-#define MUSTEAT(c, e)	REQUIRE(MORE() && GETNEXT() == (c), e)
-#define MUSTNOTSEE(c, e)		REQUIRE(!MORE() || PEEK() != (c), e)
-#define EMIT(op, sopnd) doemit(p, (sop)(op), (size_t)(sopnd))
-#define INSERT(op, pos) doinsert(p, (sop)(op), HERE()-(pos)+1, pos)
-#define AHEAD(pos)				dofwd(p, pos, HERE()-(pos))
-#define ASTERN(sop, pos)		EMIT(sop, HERE()-pos)
-#define HERE()			(p->slen)
-#define THERE()			(p->slen - 1)
-#define THERETHERE()	(p->slen - 2)
-#define DROP(n) (p->slen -= (n))
-
-#ifndef NDEBUG
-static int	never = 0;			/* for use in asserts; shuts lint up */
 
-#else
-#define never	0				/* some <assert.h>s have bugs too */
-#endif
 
 /*
- * regcomp - interface for parser and compilation
- * returns 0 success, otherwise REG_something
+ * pg_regcomp - compile regular expression; returns 0 or a REG_xxx error code
  */
 int
-pg_regcomp(regex_t *preg, const char *pattern, int cflags)
+pg_regcomp(regex_t *re,			/* receives the compiled RE */
+		   const chr *string,	/* RE source text */
+		   size_t len,			/* length of string, counted in chrs */
+		   int flags)			/* REG_xxx compile flags */
 {
-	struct parse pa;
-	struct re_guts *g;
-	struct parse *p = &pa;
-	int			i;
-	size_t		len;
-	pg_wchar   *wcp;
-
-	if (cclasses == NULL)
-		cclasses = cclass_init();
-
-#ifdef REDEBUG
-#define  GOODFLAGS(f)	 (f)
+	struct vars var;
+	struct vars *v = &var;
+	struct guts *g;
+	int i;
+	size_t j;
+#ifdef REG_DEBUG
+	FILE *debug = (flags&REG_PROGRESS) ? stdout : (FILE *)NULL;
 #else
-#define  GOODFLAGS(f)	 ((f)&~REG_DUMP)
+	FILE *debug = (FILE *) NULL;	/* no progress output without REG_DEBUG */
 #endif
 
-	cflags = GOODFLAGS(cflags);
-	if ((cflags & REG_EXTENDED) && (cflags & REG_NOSPEC))
-		return REG_INVARG;
+#	define	CNOERR()	{ if (ISERR()) return freev(v, v->err); }	/* on error: clean up and return it */
 
-	if (cflags & REG_PEND)
-	{
-		wcp = preg->patsave;
-		if (preg->re_endp < wcp)
-			return REG_INVARG;
-		len = preg->re_endp - wcp;
-	}
-	else
-	{
-		wcp = (pg_wchar *) malloc((strlen(pattern) + 1) * sizeof(pg_wchar));
-		if (wcp == NULL)
-			return REG_ESPACE;
-		preg->patsave = wcp;
-		(void) pg_mb2wchar((unsigned char *) pattern, wcp);
-		len = pg_wchar_strlen(wcp);
-	}
+	/* sanity checks (nothing allocated yet, so plain returns are fine) */
 
-	/* do the mallocs early so failure handling is easy */
-	g = (struct re_guts *) malloc(sizeof(struct re_guts) +
-								  (NC - 1) * sizeof(cat_t));
-	if (g == NULL)
-		return REG_ESPACE;
-	p->ssize = len / (size_t) 2 *(size_t) 3 + (size_t) 1;		/* ugh */
+	if (re == NULL || string == NULL)
+		return REG_INVARG;
+	if ((flags&REG_QUOTE) &&
+			(flags&(REG_ADVANCED|REG_EXPANDED|REG_NEWLINE)))
+		return REG_INVARG;
+	if (!(flags&REG_EXTENDED) && (flags&REG_ADVF))
+		return REG_INVARG;
 
-	p->strip = (sop *) malloc(p->ssize * sizeof(sop));
-	p->slen = 0;
-	if (p->strip == NULL)
-	{
-		free((char *) g);
-		return REG_ESPACE;
+	/* initial setup (after which freev() is callable) */
+	v->re = re;
+	v->now = (chr *)string;
+	v->stop = v->now + len;
+	v->savenow = v->savestop = NULL;
+	v->err = 0;
+	v->cflags = flags;
+	v->nsubexp = 0;
+	v->subs = v->sub10;		/* initial storage; moresubs() replaces it if needed */
+	v->nsubs = 10;
+	for (j = 0; j < v->nsubs; j++)
+		v->subs[j] = NULL;
+	v->nfa = NULL;
+	v->cm = NULL;
+	v->nlcolor = COLORLESS;
+	v->wordchrs = NULL;
+	v->tree = NULL;
+	v->treechain = NULL;
+	v->treefree = NULL;
+	v->cv = NULL;
+	v->cv2 = NULL;
+	v->mcces = NULL;
+	v->lacons = NULL;
+	v->nlacons = 0;
+	re->re_magic = REMAGIC;
+	re->re_info = 0;		/* bits get set during parse */
+	re->re_csize = sizeof(chr);
+	re->re_guts = NULL;
+	re->re_fns = VS(&functions);
+
+	/* more complex setup, malloced things */
+	re->re_guts = VS(MALLOC(sizeof(struct guts)));
+	if (re->re_guts == NULL)
+		return freev(v, REG_ESPACE);
+	g = (struct guts *)re->re_guts;
+	g->tree = NULL;
+	initcm(v, &g->cmap);
+	v->cm = &g->cmap;
+	g->lacons = NULL;
+	g->nlacons = 0;
+	ZAPCNFA(g->search);
+	v->nfa = newnfa(v, v->cm, (struct nfa *)NULL);
+	CNOERR();
+	v->cv = newcvec(100, 20, 10);
+	if (v->cv == NULL)
+		return freev(v, REG_ESPACE);
+	i = nmcces(v);
+	if (i > 0) {
+		v->mcces = newcvec(nleaders(v), 0, i);
+		CNOERR();
+		v->mcces = allmcces(v, v->mcces);
+		leaders(v, v->mcces);
+		addmcce(v->mcces, (chr *)NULL, (chr *)NULL);	/* dummy */
 	}
-
-	/* set things up */
-	p->g = g;
-	p->next = wcp;
-	p->end = p->next + len;
-	p->error = 0;
-	p->ncsalloc = 0;
-	for (i = 0; i < NPAREN; i++)
-	{
-		p->pbegin[i] = 0;
-		p->pend[i] = 0;
+	CNOERR();
+
+	/* parsing */
+	lexstart(v);			/* also handles prefixes */
+	if ((v->cflags&REG_NLSTOP) || (v->cflags&REG_NLANCH)) {
+		/* assign newline a unique color */
+		v->nlcolor = subcolor(v->cm, newline());
+		okcolors(v->nfa, v->cm);
+	}
+	CNOERR();
+	v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
+	assert(SEE(EOS));		/* even if error; ISERR() => SEE(EOS) */
+	CNOERR();
+	assert(v->tree != NULL);
+
+	/* finish setup of nfa and its subre tree */
+	specialcolors(v->nfa);
+	CNOERR();
+#ifdef REG_DEBUG
+	if (debug != NULL) {
+		fprintf(debug, "\n\n\n========= RAW ==========\n");
+		dumpnfa(v->nfa, debug);
+		dumpst(v->tree, debug, 1);
 	}
-	g->csetsize = NC;
-	g->sets = NULL;
-	g->setbits = NULL;
-	g->ncsets = 0;
-	g->cflags = cflags;
-	g->iflags = 0;
-	g->nbol = 0;
-	g->neol = 0;
-	g->must = NULL;
-	g->mlen = 0;
-	g->nsub = 0;
-	g->ncategories = 1;			/* category 0 is "everything else" */
-	g->categories = &g->catspace[-(CHAR_MIN)];
-	memset((char *) g->catspace, 0, NC * sizeof(cat_t));
-	g->backrefs = 0;
-
-	/* do it */
-	EMIT(OEND, 0);
-	g->firststate = THERE();
-	if (cflags & REG_EXTENDED)
-		p_ere(p, OUT);
-	else if (cflags & REG_NOSPEC)
-		p_str(p);
-	else
-		p_bre(p, OUT, OUT);
-	EMIT(OEND, 0);
-	g->laststate = THERE();
-
-	/* tidy up loose ends and fill things in */
-	categorize(p, g);
-	stripsnug(p, g);
-	findmust(p, g);
-	g->nplus = pluscount(p, g);
-	g->magic = MAGIC2;
-	preg->re_nsub = g->nsub;
-	preg->re_g = g;
-	preg->re_magic = MAGIC1;
-#ifndef REDEBUG
-	/* not debugging, so can't rely on the assert() in regexec() */
-	if (g->iflags & BAD)
-		SETERROR(REG_ASSERT);
 #endif
-
-	/* win or lose, we're done */
-	if (p->error != 0)			/* lose */
-		pg_regfree(preg);
-	return p->error;
-}
-
-/*
- * p_ere - ERE parser top level, concatenation and alternation
- */
-static void
-p_ere(struct parse * p,
-	  int stop)					/* character this ERE should end at */
-{
-	char		c;
-	sopno		prevback = 0;
-	sopno		prevfwd = 0;
-	sopno		conc;
-	int			first = 1;		/* is this the first alternative? */
-
-	for (;;)
-	{
-		/* do a bunch of concatenated expressions */
-		conc = HERE();
-		while (MORE() && (c = PEEK()) != '|' && c != stop)
-			p_ere_exp(p);
-		REQUIRE(HERE() != conc, REG_EMPTY);		/* require nonempty */
-
-		if (!EAT('|'))
-			break;				/* NOTE BREAK OUT */
-
-		if (first)
-		{
-			INSERT(OCH_, conc); /* offset is wrong */
-			prevfwd = conc;
-			prevback = conc;
-			first = 0;
-		}
-		ASTERN(OOR1, prevback);
-		prevback = THERE();
-		AHEAD(prevfwd);			/* fix previous offset */
-		prevfwd = HERE();
-		EMIT(OOR2, 0);			/* offset is very wrong */
+	optst(v, v->tree);
+	v->ntree = numst(v->tree, 1);
+	markst(v->tree);
+	cleanst(v);
+#ifdef REG_DEBUG
+	if (debug != NULL) {
+		fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
+		dumpst(v->tree, debug, 1);
 	}
+#endif
 
-	if (!first)
-	{							/* tail-end fixups */
-		AHEAD(prevfwd);
-		ASTERN(O_CH, prevback);
+	/* build compacted NFAs for tree and lacons */
+	re->re_info |= nfatree(v, v->tree, debug);
+	CNOERR();
+	assert(v->nlacons == 0 || v->lacons != NULL);
+	for (i = 1; i < v->nlacons; i++) {	/* note: index 0 is not processed */
+#ifdef REG_DEBUG
+		if (debug != NULL)
+			fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
+#endif
+		nfanode(v, &v->lacons[i], debug);
 	}
+	CNOERR();
+	if (v->tree->flags&SHORTER)
+		NOTE(REG_USHORTEST);
+
+	/* tree and lacons are done; now build the compacted fast-search NFA */
+#ifdef REG_DEBUG
+	if (debug != NULL)
+		fprintf(debug, "\n\n\n========= SEARCH ==========\n");
+#endif
+	/* can sacrifice main NFA now, so use it as work area */
+	(DISCARD)optimize(v->nfa, debug);
+	CNOERR();
+	makesearch(v, v->nfa);
+	CNOERR();
+	compact(v->nfa, &g->search);
+	CNOERR();
+
+	/* looks okay, package it up */
+	re->re_nsub = v->nsubexp;
+	v->re = NULL;			/* freev no longer frees re */
+	g->magic = GUTSMAGIC;
+	g->cflags = v->cflags;
+	g->info = re->re_info;
+	g->nsub = re->re_nsub;
+	g->tree = v->tree;
+	v->tree = NULL;			/* guts own the tree now; keeps freev() from freeing it */
+	g->ntree = v->ntree;
+	g->compare = (v->cflags&REG_ICASE) ? casecmp : cmp;
+	g->lacons = v->lacons;
+	v->lacons = NULL;		/* likewise for the lacons array */
+	g->nlacons = v->nlacons;
+
+#ifdef REG_DEBUG
+	if (flags&REG_DUMP)
+		dump(re, stdout);
+#endif
 
-	assert(!MORE() || SEE(stop));
+	assert(v->err == 0);
+	return freev(v, 0);		/* frees remaining work areas, returns v->err */
 }
 
 /*
- * p_ere_exp - parse one subERE, an atom possibly followed by a repetition op
+ * moresubs - enlarge subRE vector so index "wanted" fits; ERR(REG_ESPACE) on failure
  */
 static void
-p_ere_exp(struct parse * p)
+moresubs(struct vars *v,
+		 int wanted)			/* want enough room for this one */
 {
-	pg_wchar	c;
-	sopno		pos;
-	int			count;
-	int			count2;
-	sopno		subno;
-	int			wascaret = 0;
-
-	assert(MORE());				/* caller should have ensured this */
-	c = GETNEXT();
-
-	pos = HERE();
-	switch (c)
-	{
-		case '(':
-			REQUIRE(MORE(), REG_EPAREN);
-			p->g->nsub++;
-			subno = p->g->nsub;
-			if (subno < NPAREN)
-				p->pbegin[subno] = HERE();
-			EMIT(OLPAREN, subno);
-			if (!SEE(')'))
-				p_ere(p, ')');
-			if (subno < NPAREN)
-			{
-				p->pend[subno] = HERE();
-				assert(p->pend[subno] != 0);
-			}
-			EMIT(ORPAREN, subno);
-			MUSTEAT(')', REG_EPAREN);
-			break;
-#ifndef POSIX_MISTAKE
-		case ')':				/* happens only if no current unmatched ( */
-
-			/*
-			 * You may ask, why the ifndef?  Because I didn't notice this
-			 * until slightly too late for 1003.2, and none of the other
-			 * 1003.2 regular-expression reviewers noticed it at all.  So
-			 * an unmatched ) is legal POSIX, at least until we can get it
-			 * fixed.
-			 */
-			SETERROR(REG_EPAREN);
-			break;
-#endif
-		case '^':
-			EMIT(OBOL, 0);
-			p->g->iflags |= USEBOL;
-			p->g->nbol++;
-			wascaret = 1;
-			break;
-		case '$':
-			EMIT(OEOL, 0);
-			p->g->iflags |= USEEOL;
-			p->g->neol++;
-			break;
-		case '|':
-			SETERROR(REG_EMPTY);
-			break;
-		case '*':
-		case '+':
-		case '?':
-			SETERROR(REG_BADRPT);
-			break;
-		case '.':
-			if (p->g->cflags & REG_NEWLINE)
-				nonnewline(p);
-			else
-				EMIT(OANY, 0);
-			break;
-		case '[':
-			p_bracket(p);
-			break;
-		case '\\':
-			REQUIRE(MORE(), REG_EESCAPE);
-			c = GETNEXT();
-			ordinary(p, c);
-			break;
-		case '{':				/* okay as ordinary except if digit
-								 * follows */
-			REQUIRE(!MORE() || !pg_isdigit(PEEK()), REG_BADRPT);
-			/* FALLTHROUGH */
-		default:
-			ordinary(p, c);
-			break;
-	}
-
-	if (!MORE())
+	struct subre **p;
+	size_t n;
+
+	assert(wanted > 0 && (size_t)wanted >= v->nsubs);
+	n = (size_t)wanted * 3 / 2 + 1;	/* new size: about 1.5 times the wanted index */
+	if (v->subs == v->sub10) {	/* still on the initial sub10 storage: copy out of it */
+		p = (struct subre **)MALLOC(n * sizeof(struct subre *));
+		if (p != NULL)
+			memcpy(VS(p), VS(v->subs),
+					v->nsubs * sizeof(struct subre *));
+	} else
+		p = (struct subre **)REALLOC(v->subs, n*sizeof(struct subre *));
+	if (p == NULL) {
+		ERR(REG_ESPACE);	/* v->subs and v->nsubs are left as they were */
 		return;
-	c = PEEK();
-	/* we call { a repetition if followed by a digit */
-	if (!(c == '*' || c == '+' || c == '?' ||
-		  (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
-		return;					/* no repetition, we're done */
-	NEXT();
-
-	REQUIRE(!wascaret, REG_BADRPT);
-	switch (c)
-	{
-		case '*':				/* implemented as +? */
-			/* this case does not require the (y|) trick, noKLUDGE */
-			INSERT(OPLUS_, pos);
-			ASTERN(O_PLUS, pos);
-			INSERT(OQUEST_, pos);
-			ASTERN(O_QUEST, pos);
-			break;
-		case '+':
-			INSERT(OPLUS_, pos);
-			ASTERN(O_PLUS, pos);
-			break;
-		case '?':
-			/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
-			INSERT(OCH_, pos);	/* offset slightly wrong */
-			ASTERN(OOR1, pos);	/* this one's right */
-			AHEAD(pos);			/* fix the OCH_ */
-			EMIT(OOR2, 0);		/* offset very wrong... */
-			AHEAD(THERE());		/* ...so fix it */
-			ASTERN(O_CH, THERETHERE());
-			break;
-		case '{':
-			count = p_count(p);
-			if (EAT(','))
-			{
-				if (pg_isdigit(PEEK()))
-				{
-					count2 = p_count(p);
-					REQUIRE(count <= count2, REG_BADBR);
-				}
-				else
-/* single number with comma */
-					count2 = INFINITY;
-			}
-			else
-/* just a single number */
-				count2 = count;
-			repeat(p, pos, count, count2);
-			if (!EAT('}'))
-			{					/* error heuristics */
-				while (MORE() && PEEK() != '}')
-					NEXT();
-				REQUIRE(MORE(), REG_EBRACE);
-				SETERROR(REG_BADBR);
-			}
-			break;
 	}
-
-	if (!MORE())
-		return;
-	c = PEEK();
-	if (!(c == '*' || c == '+' || c == '?' ||
-		  (c == '{' && MORE2() && pg_isdigit(PEEK2()))))
-		return;
-	SETERROR(REG_BADRPT);
+	v->subs = p;
+	for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)	/* NULL out the new slots */
+		*p = NULL;
+	assert(v->nsubs == n);
+	assert((size_t)wanted < v->nsubs);
 }
 
 /*
- * p_str - string (no metacharacters) "parser"
+ * freev - free vars struct's substructures where necessary
+ *
+ * Optionally does error-number setting, and always returns error code
+ * (if any), to make error-handling code terser.
  */
-static void
-p_str(struct parse * p)
+static int
+freev(struct vars *v,
+	  int err)			/* error code to record, or 0 for none */
 {
-	REQUIRE(MORE(), REG_EMPTY);
-	while (MORE())
-		ordinary(p, GETNEXT());
+	if (v->re != NULL)		/* regex_t is still ours to tear down */
+		rfree(v->re);
+	if (v->subs != v->sub10)	/* free only if moresubs() replaced the initial storage */
+		FREE(v->subs);
+	if (v->nfa != NULL)
+		freenfa(v->nfa);
+	if (v->tree != NULL)
+		freesubre(v, v->tree);
+	if (v->treechain != NULL)
+		cleanst(v);
+	if (v->cv != NULL)
+		freecvec(v->cv);
+	if (v->cv2 != NULL)
+		freecvec(v->cv2);
+	if (v->mcces != NULL)
+		freecvec(v->mcces);
+	if (v->lacons != NULL)
+		freelacons(v->lacons, v->nlacons);
+	ERR(err);			/* nop if err==0 */
+
+	return v->err;		/* may be an error recorded before this call */
 }
 
 /*
- * p_bre - BRE parser top level, anchoring and concatenation
- *
- * Giving end1 as OUT essentially eliminates the end1/end2 check.
- *
- * This implementation is a bit of a kludge, in that a trailing $ is first
- * taken as an ordinary character and then revised to be an anchor.  The
- * only undesirable side effect is that '$' gets included as a character
- * category in such cases.	This is fairly harmless; not worth fixing.
- * The amount of lookahead needed to avoid this kludge is excessive.
+ * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
+ * NFA must have been optimize()d already.  The NFA is modified in place.
  */
 static void
-p_bre(struct parse * p,
-	  int end1,					/* first terminating character */
-	  int end2)					/* second terminating character */
+makesearch(struct vars *v,
+		   struct nfa *nfa)
 {
-	sopno		start = HERE();
-	int			first = 1;		/* first subexpression? */
-	int			wasdollar = 0;
-
-	if (EAT('^'))
-	{
-		EMIT(OBOL, 0);
-		p->g->iflags |= USEBOL;
-		p->g->nbol++;
+	struct arc *a;
+	struct arc *b;
+	struct state *pre = nfa->pre;
+	struct state *s;
+	struct state *s2;
+	struct state *slist;
+
+	/* no loops are needed if it's anchored */
+	for (a = pre->outs; a != NULL; a = a->outchain) {
+		assert(a->type == PLAIN);
+		if (a->co != nfa->bos[0] && a->co != nfa->bos[1])	/* some other color: not anchored */
+			break;
 	}
-	while (MORE() && !SEETWO(end1, end2))
-	{
-		wasdollar = p_simp_re(p, first);
-		first = 0;
+	if (a != NULL) {		/* loop above stopped early, i.e. not anchored */
+		/* add implicit .* in front */
+		rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
+
+		/* and ^* and \A* too -- not always necessary, but harmless */
+		newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
+		newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
 	}
-	if (wasdollar)
-	{							/* oops, that was a trailing anchor */
-		DROP(1);
-		EMIT(OEOL, 0);
-		p->g->iflags |= USEEOL;
-		p->g->neol++;
+
+	/*
+	 * Now here's the subtle part.  Because many REs have no lookback
+	 * constraints, often knowing when you were in the pre state tells
+	 * you little; it's the next state(s) that are informative.  But
+	 * some of them may have other inarcs, i.e. it may be possible to
+	 * make actual progress and then return to one of them.  We must
+	 * de-optimize such cases, splitting each such state into progress
+	 * and no-progress states.
+	 */
+
+	/* first, make a list of the states */
+	slist = NULL;
+	for (a = pre->outs; a != NULL; a = a->outchain) {
+		s = a->to;
+		for (b = s->ins; b != NULL; b = b->inchain)	/* any inarc not coming from pre? */
+			if (b->from != pre)
+				break;
+		if (b != NULL) {		/* must be split */
+			s->tmp = slist;		/* push s; the list is threaded through tmp */
+			slist = s;
+		}
 	}
 
-	REQUIRE(HERE() != start, REG_EMPTY);		/* require nonempty */
+	/* do the splits */
+	for (s = slist; s != NULL; s = s2) {
+		s2 = newstate(nfa);	/* NOTE(review): used unchecked below; confirm newstate() cannot fail here */
+		copyouts(nfa, s, s2);
+		for (a = s->ins; a != NULL; a = b) {
+			b = a->inchain;		/* fetch next first, since a may be freed */
+			if (a->from != pre) {
+				cparc(nfa, a, a->from, s2);	/* inarcs not from pre go to s2 instead */
+				freearc(nfa, a);
+			}
+		}
+		s2 = s->tmp;		/* s2 now doubles as the next list entry */
+		s->tmp = NULL;		/* clean up while we're at it */
+	}
 }
 
 /*
- * p_simp_re - parse a simple RE, an atom possibly followed by a repetition
+ * parse - parse an RE
+ *
+ * This is actually just the top level, which parses a bunch of branches
+ * tied together with '|'.  They appear in the tree as the left children
+ * of a chain of '|' subres, which is linked through the right pointers.
  */
-static int						/* was the simple RE an unbackslashed $? */
-p_simp_re(struct parse * p,
-		  int starordinary)		/* is a leading * an ordinary character? */
+static struct subre *
+parse(struct vars *v,
+	  int stopper,			/* EOS or ')' */
+	  int type,			/* LACON (lookahead subRE) or PLAIN */
+	  struct state *init,		/* initial state */
+	  struct state *final)		/* final state */
 {
-	int			c;
-	int			count;
-	int			count2;
-	sopno		pos;
-	int			i;
-	sopno		subno;
-
-#define  BACKSL  (1<<24)
-
-	pos = HERE();				/* repetion op, if any, covers from here */
-
-	assert(MORE());				/* caller should have ensured this */
-	c = GETNEXT();
-	if (c == '\\')
-	{
-		REQUIRE(MORE(), REG_EESCAPE);
-		c = BACKSL | (pg_wchar) GETNEXT();
-	}
-	switch (c)
-	{
-		case '.':
-			if (p->g->cflags & REG_NEWLINE)
-				nonnewline(p);
-			else
-				EMIT(OANY, 0);
-			break;
-		case '[':
-			p_bracket(p);
-			break;
-		case BACKSL | '{':
-			SETERROR(REG_BADRPT);
-			break;
-		case BACKSL | '(':
-			p->g->nsub++;
-			subno = p->g->nsub;
-			if (subno < NPAREN)
-				p->pbegin[subno] = HERE();
-			EMIT(OLPAREN, subno);
-			/* the MORE here is an error heuristic */
-			if (MORE() && !SEETWO('\\', ')'))
-				p_bre(p, '\\', ')');
-			if (subno < NPAREN)
-			{
-				p->pend[subno] = HERE();
-				assert(p->pend[subno] != 0);
-			}
-			EMIT(ORPAREN, subno);
-			REQUIRE(EATTWO('\\', ')'), REG_EPAREN);
-			break;
-		case BACKSL | ')':		/* should not get here -- must be user */
-		case BACKSL | '}':
-			SETERROR(REG_EPAREN);
-			break;
-		case BACKSL | '1':
-		case BACKSL | '2':
-		case BACKSL | '3':
-		case BACKSL | '4':
-		case BACKSL | '5':
-		case BACKSL | '6':
-		case BACKSL | '7':
-		case BACKSL | '8':
-		case BACKSL | '9':
-			i = (c & ~BACKSL) - '0';
-			assert(i < NPAREN);
-			if (p->pend[i] != 0)
-			{
-				assert(i <= p->g->nsub);
-				EMIT(OBACK_, i);
-				assert(p->pbegin[i] != 0);
-				assert(OP(p->strip[p->pbegin[i]]) == OLPAREN);
-				assert(OP(p->strip[p->pend[i]]) == ORPAREN);
-				dupl(p, p->pbegin[i] + 1, p->pend[i]);
-				EMIT(O_BACK, i);
-			}
-			else
-				SETERROR(REG_ESUBREG);
-			p->g->backrefs = 1;
-			break;
-		case '*':
-			REQUIRE(starordinary, REG_BADRPT);
-			/* FALLTHROUGH */
-		default:
-			ordinary(p, c & ~BACKSL);
-			break;
+	struct state *left;	/* scaffolding for branch */
+	struct state *right;
+	struct subre *branches;	/* top level */
+	struct subre *branch;	/* current branch */
+	struct subre *t;	/* temporary */
+	int firstbranch;	/* is this the first branch? */
+
+	assert(stopper == ')' || stopper == EOS);
+
+	branches = subre(v, '|', LONGER, init, final);
+	NOERRN();
+	branch = branches;
+	firstbranch = 1;
+	do {	/* a branch */
+		if (!firstbranch) {
+			/* need a place to hang it */
+			branch->right = subre(v, '|', LONGER, init, final);
+			NOERRN();
+			branch = branch->right;
+		}
+		firstbranch = 0;
+		left = newstate(v->nfa);	/* each branch gets its own pair of end states */
+		right = newstate(v->nfa);
+		NOERRN();
+		EMPTYARC(init, left);
+		EMPTYARC(right, final);
+		NOERRN();
+		branch->left = parsebranch(v, stopper, type, left, right, 0);
+		NOERRN();
+		branch->flags |= UP(branch->flags | branch->left->flags);
+		if ((branch->flags &~ branches->flags) != 0)	/* new flags */
+			for (t = branches; t != branch; t = t->right)	/* copy them to all earlier '|' nodes */
+				t->flags |= branch->flags;
+	} while (EAT('|'));
+	assert(SEE(stopper) || SEE(EOS));
+
+	if (!SEE(stopper)) {		/* input ended while a ')' was still expected */
+		assert(stopper == ')' && SEE(EOS));
+		ERR(REG_EPAREN);
 	}
 
-	if (EAT('*'))
-	{							/* implemented as +? */
-		/* this case does not require the (y|) trick, noKLUDGE */
-		INSERT(OPLUS_, pos);
-		ASTERN(O_PLUS, pos);
-		INSERT(OQUEST_, pos);
-		ASTERN(O_QUEST, pos);
-	}
-	else if (EATTWO('\\', '{'))
-	{
-		count = p_count(p);
-		if (EAT(','))
-		{
-			if (MORE() && pg_isdigit(PEEK()))
-			{
-				count2 = p_count(p);
-				REQUIRE(count <= count2, REG_BADBR);
-			}
-			else
-/* single number with comma */
-				count2 = INFINITY;
-		}
-		else
-/* just a single number */
-			count2 = count;
-		repeat(p, pos, count, count2);
-		if (!EATTWO('\\', '}'))
-		{						/* error heuristics */
-			while (MORE() && !SEETWO('\\', '}'))
-				NEXT();
-			REQUIRE(MORE(), REG_EBRACE);
-			SETERROR(REG_BADBR);
-		}
+	/* optimize out simple cases */
+	if (branch == branches) {	/* only one branch */
+		assert(branch->right == NULL);
+		t = branch->left;	/* keep the branch's subtree ... */
+		branch->left = NULL;	/* ... detached, so the free below spares it */
+		freesubre(v, branches);
+		branches = t;
+	} else if (!MESSY(branches->flags)) {	/* no interesting innards */
+		freesubre(v, branches->left);
+		branches->left = NULL;
+		freesubre(v, branches->right);
+		branches->right = NULL;
+		branches->op = '=';	/* childless node now stands for the whole alternation */
 	}
-	else if (c == (unsigned char) '$')	/* $ (but not \$) ends it */
-		return 1;
 
-	return 0;
+	return branches;	/* root of the tree for this (sub)expression */
 }
 
 /*
- * p_count - parse a repetition count
+ * parsebranch - parse one branch of an RE
+ *
+ * This mostly manages concatenation, working closely with parseqatom().
+ * Concatenated things are bundled up as much as possible, with separate
+ * ',' nodes introduced only when necessary due to substructure.
  */
-static int						/* the value */
-p_count(struct parse * p)
+static struct subre *
+parsebranch(struct vars *v,
+			int stopper,			/* EOS or ')' */
+			int type,			/* LACON (lookahead subRE) or PLAIN */
+			struct state *left,		/* leftmost state */
+			struct state *right,		/* rightmost state */
+			int partial)			/* is this only part of a branch? */
 {
-	int			count = 0;
-	int			ndigits = 0;
+	struct state *lp;	/* left end of current construct */
+	int seencontent;	/* is there anything in this branch yet? */
+	struct subre *t;
+
+	lp = left;
+	seencontent = 0;
+	t = subre(v, '=', 0, left, right);	/* op '=' is tentative */
+	NOERRN();
+	while (!SEE('|') && !SEE(stopper) && !SEE(EOS)) {
+		if (seencontent) {	/* implicit concat operator */
+			lp = newstate(v->nfa);	/* junction between what precedes and the next atom */
+			NOERRN();
+			moveins(v->nfa, right, lp);	/* presumably re-aims right's inarcs at lp -- see moveins() */
+		}
+		seencontent = 1;
 
-	while (MORE() && pg_isdigit(PEEK()) && count <= DUPMAX)
-	{
-		count = count * 10 + (GETNEXT() - '0');
-		ndigits++;
+		/* NB, recursion in parseqatom() may swallow rest of branch */
+		parseqatom(v, stopper, type, lp, right, t);
 	}
 
-	REQUIRE(ndigits > 0 && count <= DUPMAX, REG_BADBR);
-	return count;
+	if (!seencontent) {		/* empty branch */
+		if (!partial)		/* noted only when this is a whole branch */
+			NOTE(REG_UUNSPEC);
+		assert(lp == left);
+		EMPTYARC(left, right);	/* nothing was parsed, so join the two ends directly */
+	}
+
+	return t;
 }
 
 /*
- * p_bracket - parse a bracketed character list
+ * parseqatom - parse one quantified atom or constraint of an RE
  *
- * Note a significant property of this code:  if the allocset() did SETERROR,
- * no set operations are done.
+ * The bookkeeping near the end cooperates very closely with parsebranch();
+ * in particular, it contains a recursion that can involve parsing the rest
+ * of the branch, making this function's name somewhat inaccurate.
  */
 static void
-p_bracket(struct parse * p)
+parseqatom(struct vars *v,
+		   int stopper,			/* EOS or ')' */
+		   int type,			/* LACON (lookahead subRE) or PLAIN */
+		   struct state *lp,		/* left state to hang it on */
+		   struct state *rp,		/* right state to hang it on */
+		   struct subre *top)		/* subtree top */
 {
-	cset	   *cs = allocset(p);
-	int			invert = 0;
-
-	pg_wchar	sp1[] = {'[', ':', '<', ':', ']', ']'};
-	pg_wchar	sp2[] = {'[', ':', '>', ':', ']', ']'};
-
-	/* Dept of Truly Sickening Special-Case Kludges */
-	if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp1, 6) == 0)
-	{
-		EMIT(OBOW, 0);
-		NEXTn(6);
+	struct state *s;	/* temporaries for new states */
+	struct state *s2;
+#	define	ARCV(t, val)	newarc(v->nfa, t, val, lp, rp)
+	int m, n;
+	struct subre *atom;	/* atom's subtree */
+	struct subre *t;
+	int cap;		/* capturing parens? */
+	int pos;		/* positive lookahead? */
+	int subno;		/* capturing-parens or backref number */
+	int atomtype;
+	int qprefer;		/* quantifier short/long preference */
+	int f;
+	struct subre **atomp;	/* where the pointer to atom is */
+
+	/* initial bookkeeping */
+	atom = NULL;
+	assert(lp->nouts == 0);	/* must string new code */
+	assert(rp->nins == 0);	/*  between lp and rp */
+	subno = 0;		/* just to shut lint up */
+
+	/* an atom or constraint... */
+	atomtype = v->nexttype;
+	switch (atomtype) {
+	/* first, constraints, which end by returning */
+	case '^':
+		ARCV('^', 1);
+		if (v->cflags&REG_NLANCH)
+			ARCV(BEHIND, v->nlcolor);
+		NEXT();
 		return;
-	}
-	if (p->next + 5 < p->end && pg_wchar_strncmp(p->next, sp2, 6) == 0)
-	{
-		EMIT(OEOW, 0);
-		NEXTn(6);
+		break;
+	case '$':
+		ARCV('$', 1);
+		if (v->cflags&REG_NLANCH)
+			ARCV(AHEAD, v->nlcolor);
+		NEXT();
 		return;
-	}
-
-	if (EAT('^'))
-		invert++;				/* make note to invert set at end */
-	if (EAT(']'))
-		CHadd(cs, ']');
-	else if (EAT('-'))
-		CHadd(cs, '-');
-	while (MORE() && PEEK() != ']' && !SEETWO('-', ']'))
-		p_b_term(p, cs);
-	if (EAT('-'))
-		CHadd(cs, '-');
-	MUSTEAT(']', REG_EBRACK);
-
-	if (p->error != 0)			/* don't mess things up further */
+		break;
+	case SBEGIN:
+		ARCV('^', 1);	/* BOL */
+		ARCV('^', 0);	/* or BOS */
+		NEXT();
 		return;
-
-	if (p->g->cflags & REG_ICASE)
-	{
-		int			i;
-		int			ci;
-
-		for (i = p->g->csetsize - 1; i >= 0; i--)
-			if (CHIN(cs, i) && pg_isalpha(i))
-			{
-				ci = othercase(i);
-				if (ci != i)
-					CHadd(cs, ci);
-			}
-		if (cs->multis != NULL)
-			mccase(p, cs);
+		break;
+	case SEND:
+		ARCV('$', 1);	/* EOL */
+		ARCV('$', 0);	/* or EOS */
+		NEXT();
+		return;
+		break;
+	case '<':
+		wordchrs(v);	/* does NEXT() */
+		s = newstate(v->nfa);
+		NOERR();
+		nonword(v, BEHIND, lp, s);
+		word(v, AHEAD, s, rp);
+		return;
+		break;
+	case '>':
+		wordchrs(v);	/* does NEXT() */
+		s = newstate(v->nfa);
+		NOERR();
+		word(v, BEHIND, lp, s);
+		nonword(v, AHEAD, s, rp);
+		return;
+		break;
+	case WBDRY:
+		wordchrs(v);	/* does NEXT() */
+		s = newstate(v->nfa);
+		NOERR();
+		nonword(v, BEHIND, lp, s);
+		word(v, AHEAD, s, rp);
+		s = newstate(v->nfa);
+		NOERR();
+		word(v, BEHIND, lp, s);
+		nonword(v, AHEAD, s, rp);
+		return;
+		break;
+	case NWBDRY:
+		wordchrs(v);	/* does NEXT() */
+		s = newstate(v->nfa);
+		NOERR();
+		word(v, BEHIND, lp, s);
+		word(v, AHEAD, s, rp);
+		s = newstate(v->nfa);
+		NOERR();
+		nonword(v, BEHIND, lp, s);
+		nonword(v, AHEAD, s, rp);
+		return;
+		break;
+	case LACON:	/* lookahead constraint */
+		pos = v->nextvalue;
+		NEXT();
+		s = newstate(v->nfa);
+		s2 = newstate(v->nfa);
+		NOERR();
+		t = parse(v, ')', LACON, s, s2);
+		freesubre(v, t);	/* internal structure irrelevant */
+		assert(SEE(')') || ISERR());
+		NEXT();
+		n = newlacon(v, s, s2, pos);
+		NOERR();
+		ARCV(LACON, n);
+		return;
+		break;
+	/* then errors, to get them out of the way */
+	case '*':
+	case '+':
+	case '?':
+	case '{':
+		ERR(REG_BADRPT);
+		return;
+		break;
+	default:
+		ERR(REG_ASSERT);
+		return;
+		break;
+	/* then plain characters, and minor variants on that theme */
+	case ')':		/* unbalanced paren */
+		if ((v->cflags&REG_ADVANCED) != REG_EXTENDED) {
+			ERR(REG_EPAREN);
+			return;
+		}
+		/* legal in EREs due to specification botch */
+		NOTE(REG_UPBOTCH);
+		/* fallthrough into case PLAIN */
+	case PLAIN:
+		onechr(v, v->nextvalue, lp, rp);
+		okcolors(v->nfa, v->cm);
+		NOERR();
+		NEXT();
+		break;
+	case '[':
+		if (v->nextvalue == 1)
+			bracket(v, lp, rp);
+		else
+			cbracket(v, lp, rp);
+		assert(SEE(']') || ISERR());
+		NEXT();
+		break;
+	case '.':
+		rainbow(v->nfa, v->cm, PLAIN,
+				(v->cflags&REG_NLSTOP) ? v->nlcolor : COLORLESS,
+				lp, rp);
+		NEXT();
+		break;
+	/* and finally the ugly stuff */
+	case '(':	/* value flags as capturing or non */
+		cap = (type == LACON) ? 0 : v->nextvalue;
+		if (cap) {
+			v->nsubexp++;
+			subno = v->nsubexp;
+			if ((size_t)subno >= v->nsubs)
+				moresubs(v, subno);
+			assert((size_t)subno < v->nsubs);
+		} else
+			atomtype = PLAIN;	/* something that's not '(' */
+		NEXT();
+		/* need new endpoints because tree will contain pointers */
+		s = newstate(v->nfa);
+		s2 = newstate(v->nfa);
+		NOERR();
+		EMPTYARC(lp, s);
+		EMPTYARC(s2, rp);
+		NOERR();
+		atom = parse(v, ')', PLAIN, s, s2);
+		assert(SEE(')') || ISERR());
+		NEXT();
+		NOERR();
+		if (cap) {
+			v->subs[subno] = atom;
+			t = subre(v, '(', atom->flags|CAP, lp, rp);
+			NOERR();
+			t->subno = subno;
+			t->left = atom;
+			atom = t;
+		}
+		/* postpone everything else pending possible {0} */
+		break;
+	case BACKREF:	/* the Feature From The Black Lagoon */
+		INSIST(type != LACON, REG_ESUBREG);
+		INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
+		INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
+		NOERR();
+		assert(v->nextvalue > 0);
+		atom = subre(v, 'b', BACKR, lp, rp);
+		subno = v->nextvalue;
+		atom->subno = subno;
+		EMPTYARC(lp, rp);	/* temporarily, so there's something */
+		NEXT();
+		break;
 	}
-	if (invert)
-	{
-		int			i;
 
-		for (i = p->g->csetsize - 1; i >= 0; i--)
-			if (CHIN(cs, i))
-				CHsub(cs, i);
+	/* ...and an atom may be followed by a quantifier */
+	switch (v->nexttype) {
+	case '*':
+		m = 0;
+		n = INFINITY;
+		qprefer = (v->nextvalue) ? LONGER : SHORTER;
+		NEXT();
+		break;
+	case '+':
+		m = 1;
+		n = INFINITY;
+		qprefer = (v->nextvalue) ? LONGER : SHORTER;
+		NEXT();
+		break;
+	case '?':
+		m = 0;
+		n = 1;
+		qprefer = (v->nextvalue) ? LONGER : SHORTER;
+		NEXT();
+		break;
+	case '{':
+		NEXT();
+		m = scannum(v);
+		if (EAT(',')) {
+			if (SEE(DIGIT))
+				n = scannum(v);
 			else
-				CHadd(cs, i);
-		if (p->g->cflags & REG_NEWLINE)
-			CHsub(cs, '\n');
-		if (cs->multis != NULL)
-			mcinvert(p, cs);
+				n = INFINITY;
+			if (m > n) {
+				ERR(REG_BADBR);
+				return;
+			}
+			/* {m,n} exercises preference, even if it's {m,m} */
+			qprefer = (v->nextvalue) ? LONGER : SHORTER;
+		} else {
+			n = m;
+			/* {m} passes operand's preference through */
+			qprefer = 0;
+		}
+		if (!SEE('}')) {	/* catches errors too */
+			ERR(REG_BADBR);
+			return;
+		}
+		NEXT();
+		break;
+	default:		/* no quantifier */
+		m = n = 1;
+		qprefer = 0;
+		break;
 	}
 
-	assert(cs->multis == NULL); /* xxx */
-
-	if (nch(p, cs) == 1)
-	{							/* optimize singleton sets */
-		ordinary(p, firstch(p, cs));
-		freeset(p, cs);
+	/* annoying special case:  {0} or {0,0} cancels everything */
+	if (m == 0 && n == 0) {
+		if (atom != NULL)
+			freesubre(v, atom);
+		if (atomtype == '(')
+			v->subs[subno] = NULL;
+		delsub(v->nfa, lp, rp);
+		EMPTYARC(lp, rp);
+		return;
 	}
-	else
-		EMIT(OANYOF, freezeset(p, cs));
-}
 
-/*
- * p_b_term - parse one term of a bracketed character list
- */
-static void
-p_b_term(struct parse * p, cset *cs)
-{
-	pg_wchar	c;
-	pg_wchar	start,
-				finish;
-	int			i;
-
-	/* classify what we've got */
-	switch ((MORE()) ? PEEK() : '\0')
-	{
-		case '[':
-			c = (MORE2()) ? PEEK2() : '\0';
-			break;
-		case '-':
-			SETERROR(REG_ERANGE);
-			return;				/* NOTE RETURN */
-			break;
-		default:
-			c = '\0';
-			break;
+	/* if not a messy case, avoid hard part */
+	assert(!MESSY(top->flags));
+	f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
+	if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f))) {
+		if (!(m == 1 && n == 1))
+			repeat(v, lp, rp, m, n);
+		if (atom != NULL)
+			freesubre(v, atom);
+		top->flags = f;
+		return;
 	}
 
-	switch (c)
-	{
-		case ':':				/* character class */
-			NEXT2();
-			REQUIRE(MORE(), REG_EBRACK);
-			c = PEEK();
-			REQUIRE(c != '-' && c != ']', REG_ECTYPE);
-			p_b_cclass(p, cs);
-			REQUIRE(MORE(), REG_EBRACK);
-			REQUIRE(EATTWO(':', ']'), REG_ECTYPE);
-			break;
-		case '=':				/* equivalence class */
-			NEXT2();
-			REQUIRE(MORE(), REG_EBRACK);
-			c = PEEK();
-			REQUIRE(c != '-' && c != ']', REG_ECOLLATE);
-			p_b_eclass(p, cs);
-			REQUIRE(MORE(), REG_EBRACK);
-			REQUIRE(EATTWO('=', ']'), REG_ECOLLATE);
-			break;
-		default:				/* symbol, ordinary character, or range */
-/* xxx revision needed for multichar stuff */
-			start = p_b_symbol(p);
-			if (SEE('-') && MORE2() && PEEK2() != ']')
-			{
-				/* range */
-				NEXT();
-				if (EAT('-'))
-					finish = '-';
-				else
-					finish = p_b_symbol(p);
-			}
-			else
-				finish = start;
-/* xxx what about signed chars here... */
-			REQUIRE(start <= finish, REG_ERANGE);
-
-			if (CHlc(start) != CHlc(finish))
-				SETERROR(REG_ERANGE);
+	/*
+	 * hard part:  something messy
+	 * That is, capturing parens, back reference, short/long clash, or
+	 * an atom with substructure containing one of those.
+	 */
 
-			for (i = start; i <= finish; i++)
-				CHadd(cs, i);
-			break;
+	/* now we'll need a subre for the contents even if they're boring */
+	if (atom == NULL) {
+		atom = subre(v, '=', 0, lp, rp);
+		NOERR();
 	}
-}
 
-/*
- * p_b_cclass - parse a character-class name and deal with it
- */
-static void
-p_b_cclass(struct parse * p, cset *cs)
-{
-	pg_wchar   *sp = p->next;
-	struct cclass *cp;
-	size_t		len;
-	char	   *u;
-	unsigned char c;
-
-	while (MORE() && pg_isalpha(PEEK()))
-		NEXT();
-	len = p->next - sp;
+	/*
+	 * prepare a general-purpose state skeleton
+	 *
+	 *    ---> [s] ---prefix---> [begin] ---atom---> [end] ----rest---> [rp]
+	 *   /                                            /
+	 * [lp] ----> [s2] ----bypass---------------------
+	 *
+	 * where bypass is an empty, and prefix is some repetitions of atom
+	 */
+	s = newstate(v->nfa);		/* first, new endpoints for the atom */
+	s2 = newstate(v->nfa);
+	NOERR();
+	moveouts(v->nfa, lp, s);
+	moveins(v->nfa, rp, s2);
+	NOERR();
+	atom->begin = s;
+	atom->end = s2;
+	s = newstate(v->nfa);		/* and spots for prefix and bypass */
+	s2 = newstate(v->nfa);
+	NOERR();
+	EMPTYARC(lp, s);
+	EMPTYARC(lp, s2);
+	NOERR();
+
+	/* break remaining subRE into x{...} and what follows */
+	t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
+	t->left = atom;
+	atomp = &t->left;
+	/* here we should recurse... but we must postpone that to the end */
+
+	/* split top into prefix and remaining */
+	assert(top->op == '=' && top->left == NULL && top->right == NULL);
+	top->left = subre(v, '=', top->flags, top->begin, lp);
+	top->op = '.';
+	top->right = t;
+
+	/* if it's a backref, now is the time to replicate the subNFA */
+	if (atomtype == BACKREF) {
+		assert(atom->begin->nouts == 1);	/* just the EMPTY */
+		delsub(v->nfa, atom->begin, atom->end);
+		assert(v->subs[subno] != NULL);
+		/* and here's why the recursion got postponed:  it must */
+		/* wait until the skeleton is filled in, because it may */
+		/* hit a backref that wants to copy the filled-in skeleton */
+		dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
+						atom->begin, atom->end);
+		NOERR();
+	}
 
-	for (cp = cclasses; cp->name != NULL; cp++)
-		if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
-			break;
+	/* it's quantifier time; first, turn x{0,...} into x{1,...}|empty */
+	if (m == 0) {
+		EMPTYARC(s2, atom->end);		/* the bypass */
+		assert(PREF(qprefer) != 0);
+		f = COMBINE(qprefer, atom->flags);
+		t = subre(v, '|', f, lp, atom->end);
+		NOERR();
+		t->left = atom;
+		t->right = subre(v, '|', PREF(f), s2, atom->end);
+		NOERR();
+		t->right->left = subre(v, '=', 0, s2, atom->end);
+		NOERR();
+		*atomp = t;
+		atomp = &t->left;
+		m = 1;
+	}
 
-	if (cp->name == NULL)
-	{
-		/* oops, didn't find it */
-		SETERROR(REG_ECTYPE);
-		return;
+	/* deal with the rest of the quantifier */
+	if (atomtype == BACKREF) {
+		/* special case:  backrefs have internal quantifiers */
+		EMPTYARC(s, atom->begin);	/* empty prefix */
+		/* just stuff everything into atom */
+		repeat(v, atom->begin, atom->end, m, n);
+		atom->min = (short)m;
+		atom->max = (short)n;
+		atom->flags |= COMBINE(qprefer, atom->flags);
+	} else if (m == 1 && n == 1) {
+		/* no/vacuous quantifier:  done */
+		EMPTYARC(s, atom->begin);	/* empty prefix */
+	} else {
+		/* turn x{m,n} into x{m-1,n-1}x, with capturing */
+		/*  parens in only second x */
+		dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
+		assert(m >= 1 && m != INFINITY && n >= 1);
+		repeat(v, s, atom->begin, m-1, (n == INFINITY) ? n : n-1);
+		f = COMBINE(qprefer, atom->flags);
+		t = subre(v, '.', f, s, atom->end);	/* prefix and atom */
+		NOERR();
+		t->left = subre(v, '=', PREF(f), s, atom->begin);
+		NOERR();
+		t->right = atom;
+		*atomp = t;
 	}
 
-	u = cp->chars;
-	while ((c = *u++) != '\0')
-		CHadd(cs, c);
-	for (u = cp->multis; *u != '\0'; u += strlen(u) + 1)
-		MCadd(p, cs, u);
+	/* and finally, look after that postponed recursion */
+	t = top->right;
+	if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
+		t->right = parsebranch(v, stopper, type, atom->end, rp, 1);
+	else {
+		EMPTYARC(atom->end, rp);
+		t->right = subre(v, '=', 0, atom->end, rp);
+	}
+	assert(SEE('|') || SEE(stopper) || SEE(EOS));
+	t->flags |= COMBINE(t->flags, t->right->flags);
+	top->flags |= COMBINE(top->flags, t->flags);
 }
 
 /*
- * p_b_eclass - parse an equivalence-class name and deal with it
- *
- * This implementation is incomplete. xxx
+ * nonword - generate lp->rp arcs for non-word-character ahead or behind
  */
 static void
-p_b_eclass(struct parse * p, cset *cs)
+nonword(struct vars *v,
+		int dir,			/* AHEAD or BEHIND */
+		struct state *lp,
+		struct state *rp)
 {
-	char		c;
+	int anchor = (dir == AHEAD) ? '$' : '^';	/* arc type matching dir */
 
-	c = p_b_coll_elem(p, '=');
-	CHadd(cs, c);
+	assert(dir == AHEAD || dir == BEHIND);
+	newarc(v->nfa, anchor, 1, lp, rp);	/* anchor arcs, both flavors (1 and 0) */
+	newarc(v->nfa, anchor, 0, lp, rp);
+	colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
+	/* (no need for special attention to \n) */
 }
 
 /*
- * p_b_symbol - parse a character or [..]ed multicharacter collating symbol
+ * word - generate lp->rp arcs for word character ahead or behind
  */
-static pg_wchar					/* value of symbol */
-p_b_symbol(struct parse * p)
+static void
+word(struct vars *v,
+	 int dir,			/* AHEAD or BEHIND */
+	 struct state *lp,
+	 struct state *rp)
 {
-	pg_wchar	value;
-
-	REQUIRE(MORE(), REG_EBRACK);
-	if (!EATTWO('[', '.'))
-		return GETNEXT();
-
-	/* collating symbol */
-	value = p_b_coll_elem(p, '.');
-	REQUIRE(EATTWO('.', ']'), REG_ECOLLATE);
-	return value;
+	assert(dir == AHEAD || dir == BEHIND);
+	cloneouts(v->nfa, v->wordchrs, lp, rp, dir);	/* arcs come from the word-chr list */
+	/* (no need for special attention to \n) */
 }
 
 /*
- * p_b_coll_elem - parse a collating-element name and look it up
+ * scannum - scan a decimal number (REG_BADBR and 0 if it exceeds DUPMAX)
  */
-static char						/* value of collating element */
-p_b_coll_elem(struct parse * p, int endc)
+static int			/* value, <= DUPMAX */
+scannum(struct vars *v)
 {
-	pg_wchar   *sp = p->next;
-	struct cname *cp;
-	int			len;
+	int n = 0;
 
-	while (MORE() && !SEETWO(endc, ']'))
+	while (SEE(DIGIT) && n < DUPMAX) {	/* stop accumulating once at/over the limit */
+		n = n*10 + v->nextvalue;
 		NEXT();
-	if (!MORE())
-	{
-		SETERROR(REG_EBRACK);
+	}
+	if (SEE(DIGIT) || n > DUPMAX) {	/* digits left over, or value too big */
+		ERR(REG_BADBR);
 		return 0;
 	}
-	len = p->next - sp;
-
-	for (cp = cnames; cp->name != NULL; cp++)
-		if (pg_char_and_wchar_strncmp(cp->name, sp, len) == 0 && cp->name[len] == '\0')
-			return cp->code;	/* known name */
-
-	if (len == 1)
-		return *sp;				/* single character */
-	SETERROR(REG_ECOLLATE);		/* neither */
-	return 0;
+	return n;
 }
 
 /*
- * othercase - return the case counterpart of an alphabetic
+ * repeat - replicate subNFA for quantifiers
+ *
+ * The duplication sequences used here are chosen carefully so that any
+ * pointers starting out pointing into the subexpression end up pointing into
+ * the last occurrence.  (Note that it may not be strung between the same
+ * left and right end states, however!)  This used to be important for the
+ * subRE tree, although the important bits are now handled by the in-line
+ * code in parse(), and when this is called, it doesn't matter any more.
  */
-static unsigned char			/* if no counterpart, return ch */
-othercase(int ch)
+static void
+repeat(struct vars *v,
+	   struct state *lp,
+	   struct state *rp,
+	   int m,
+	   int n)
 {
-	assert(pg_isalpha(ch));
-	if (pg_isupper(ch))
-		return (unsigned char) tolower((unsigned char) ch);
-	else if (pg_islower(ch))
-		return (unsigned char) toupper((unsigned char) ch);
-	else
-/* peculiar, but could happen */
-		return (unsigned char) ch;
+#	define	SOME	2		/* reduced count: finite and > 1 */
+#	define	INF	3		/* reduced count: INFINITY */
+#	define	PAIR(x, y)	((x)*4 + (y))	/* one switch key from two reduced counts */
+#	define	REDUCE(x)	( ((x) == INFINITY) ? INF : (((x) > 1) ? SOME : (x)) )
+	const int rm = REDUCE(m);	/* m and n collapsed to 0, 1, SOME or INF */
+	const int rn = REDUCE(n);
+	struct state *s;
+	struct state *s2;
+
+	switch (PAIR(rm, rn)) {
+	case PAIR(0, 0):		/* empty string */
+		delsub(v->nfa, lp, rp);
+		EMPTYARC(lp, rp);
+		break;
+	case PAIR(0, 1):		/* do as x| */
+		EMPTYARC(lp, rp);
+		break;
+	case PAIR(0, SOME):		/* do as x{1,n}| */
+		repeat(v, lp, rp, 1, n);
+		NOERR();
+		EMPTYARC(lp, rp);
+		break;
+	case PAIR(0, INF):		/* loop x around */
+		s = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, lp, s);
+		moveins(v->nfa, rp, s);
+		EMPTYARC(lp, s);
+		EMPTYARC(s, rp);
+		break;
+	case PAIR(1, 1):		/* no action required */
+		break;
+	case PAIR(1, SOME):		/* do as x{0,n-1}x = (x{1,n-1}|)x */
+		s = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, lp, s);
+		dupnfa(v->nfa, s, rp, lp, s);
+		NOERR();
+		repeat(v, lp, s, 1, n-1);
+		NOERR();
+		EMPTYARC(lp, s);
+		break;
+	case PAIR(1, INF):		/* add loopback arc */
+		s = newstate(v->nfa);
+		s2 = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, lp, s);
+		moveins(v->nfa, rp, s2);
+		EMPTYARC(lp, s);
+		EMPTYARC(s2, rp);
+		EMPTYARC(s2, s);
+		break;
+	case PAIR(SOME, SOME):		/* do as x{m-1,n-1}x */
+		s = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, lp, s);
+		dupnfa(v->nfa, s, rp, lp, s);
+		NOERR();
+		repeat(v, lp, s, m-1, n-1);
+		break;
+	case PAIR(SOME, INF):		/* do as x{m-1,}x */
+		s = newstate(v->nfa);
+		NOERR();
+		moveouts(v->nfa, lp, s);
+		dupnfa(v->nfa, s, rp, lp, s);
+		NOERR();
+		repeat(v, lp, s, m-1, n);
+		break;
+	default:			/* remaining pairs (e.g. m > n) are not expected */
+		ERR(REG_ASSERT);
+		break;
+	}
 }
 
 /*
- * bothcases - emit a dualcase version of a two-case character
- *
- * Boy, is this implementation ever a kludge...
+ * bracket - handle non-complemented bracket expression
+ * Also called from cbracket for complemented bracket expressions.
  */
 static void
-bothcases(struct parse * p, int ch)
+bracket(struct vars *v,
+		struct state *lp,
+		struct state *rp)
 {
-	pg_wchar   *oldnext = p->next;
-	pg_wchar   *oldend = p->end;
-	pg_wchar	bracket[3];
-
-	assert(othercase(ch) != ch);	/* p_bracket() would recurse */
-	p->next = bracket;
-	p->end = bracket + 2;
-	bracket[0] = ch;
-	bracket[1] = ']';
-	bracket[2] = '\0';
-	p_bracket(p);
-	assert(p->next == bracket + 2);
-	p->next = oldnext;
-	p->end = oldend;
+	assert(SEE('['));
+	NEXT();
+	while (!SEE(']') && !SEE(EOS))	/* one item or range per brackpart() call */
+		brackpart(v, lp, rp);
+	assert(SEE(']') || ISERR());	/* the ']' is left for the caller */
+	okcolors(v->nfa, v->cm);
 }
 
 /*
- * ordinary - emit an ordinary character
+ * cbracket - handle complemented bracket expression
+ * We do it by calling bracket() with dummy endpoints, and then complementing
+ * the result.  The alternative would be to invoke rainbow(), and then delete
+ * arcs as the b.e. is seen... but that gets messy.
  */
 static void
-ordinary(struct parse * p, int ch)
+cbracket(struct vars *v,
+		 struct state *lp,
+		 struct state *rp)
 {
-	cat_t	   *cap = p->g->categories;
+	struct state *left = newstate(v->nfa);
+	struct state *right = newstate(v->nfa);
+	struct state *s;
+	struct arc *a;			/* arc from lp */
+	struct arc *ba;			/* arc from left, from bracket() */
+	struct arc *pa;			/* MCCE-prototype arc */
+	color co;
+	chr *p;
+	int i;
+
+	NOERR();
+	bracket(v, left, right);	/* the uncomplemented b.e., on the dummy endpoints */
+	if (v->cflags&REG_NLSTOP)	/* newline joins the set, so the complement omits it */
+		newarc(v->nfa, PLAIN, v->nlcolor, left, right);
+	NOERR();
+
+	assert(lp->nouts == 0);		/* all outarcs will be ours */
+
+	/* easy part of complementing */
+	colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
+	NOERR();
+	if (v->mcces == NULL) {		/* no MCCEs -- we're done */
+		dropstate(v->nfa, left);
+		assert(right->nins == 0);
+		freestate(v->nfa, right);
+		return;
+	}
 
-	if ((p->g->cflags & REG_ICASE) && pg_isalpha(ch) && othercase(ch) != ch)
-		bothcases(p, ch);
-	else
-	{
-		EMIT(OCHAR, (pg_wchar) ch);
-		if (ch >= CHAR_MIN && ch <= CHAR_MAX && cap[ch] == 0)
-			cap[ch] = p->g->ncategories++;
+	/* but complementing gets messy in the presence of MCCEs... */
+	NOTE(REG_ULOCALE);
+	for (p = v->mcces->chrs, i = v->mcces->nchrs; i > 0; p++, i--) {
+		co = GETCOLOR(v->cm, *p);
+		a = findarc(lp, PLAIN, co);
+		ba = findarc(left, PLAIN, co);
+		if (ba == NULL) {
+			assert(a != NULL);
+			freearc(v->nfa, a);
+		} else {
+			assert(a == NULL);
+		}
+		s = newstate(v->nfa);
+		NOERR();
+		newarc(v->nfa, PLAIN, co, lp, s);
+		NOERR();
+		pa = findarc(v->mccepbegin, PLAIN, co);
+		assert(pa != NULL);
+		if (ba == NULL) {	/* easy case, need all of them */
+			cloneouts(v->nfa, pa->to, s, rp, PLAIN);
+			newarc(v->nfa, '$', 1, s, rp);
+			newarc(v->nfa, '$', 0, s, rp);
+			colorcomplement(v->nfa, v->cm, AHEAD, pa->to, s, rp);
+		} else {		/* must be selective */
+			if (findarc(ba->to, '$', 1) == NULL) {
+				newarc(v->nfa, '$', 1, s, rp);
+				newarc(v->nfa, '$', 0, s, rp);
+				colorcomplement(v->nfa, v->cm, AHEAD, pa->to,
+									 s, rp);
+			}
+			for (pa = pa->to->outs; pa != NULL; pa = pa->outchain)
+				if (findarc(ba->to, PLAIN, pa->co) == NULL)
+					newarc(v->nfa, PLAIN, pa->co, s, rp);
+			if (s->nouts == 0)	/* limit of selectivity: none */
+				dropstate(v->nfa, s);	/* frees arc too */
+		}
+		NOERR();
 	}
-}
 
-/*
- * nonnewline - emit REG_NEWLINE version of OANY
- *
- * Boy, is this implementation ever a kludge...
- */
-static void
-nonnewline(struct parse * p)
-{
-	pg_wchar   *oldnext = p->next;
-	pg_wchar   *oldend = p->end;
-	pg_wchar	bracket[4];
-
-	p->next = bracket;
-	p->end = bracket + 3;
-	bracket[0] = '^';
-	bracket[1] = '\n';
-	bracket[2] = ']';
-	bracket[3] = '\0';
-	p_bracket(p);
-	assert(p->next == bracket + 3);
-	p->next = oldnext;
-	p->end = oldend;
+	delsub(v->nfa, left, right);	/* dummy subNFA has served its purpose */
+	assert(left->nouts == 0);
+	freestate(v->nfa, left);
+	assert(right->nins == 0);
+	freestate(v->nfa, right);
 }
-
+			
 /*
- * repeat - generate code for a bounded repetition, recursively if needed
+ * brackpart - handle one item (or range) within a bracket expression
  */
 static void
-repeat(struct parse * p,
-	   sopno start,				/* operand from here to end of strip */
-	   int from,				/* repeated from this number */
-	   int to)					/* to this number of times (maybe
-								 * INFINITY) */
+brackpart(struct vars *v,
+		  struct state *lp,
+		  struct state *rp)
 {
-	sopno		finish = HERE();
-
-#define  N		 2
-#define  INF	 3
-#define  REP(f, t)		 ((f)*8 + (t))
-#define  MAP(n)  (((n) <= 1) ? (n) : ((n) == INFINITY) ? INF : N)
-	sopno		copy;
-
-	if (p->error != 0)			/* head off possible runaway recursion */
+	celt startc;
+	celt endc;
+	struct cvec *cv;
+	chr *startp;
+	chr *endp;
+	chr c[1];			/* holds a single PLAIN chr for element() */
+
+	/* parse something, get rid of special cases, take shortcuts */
+	switch (v->nexttype) {
+	case RANGE:			/* a-b-c or other botch */
+		ERR(REG_ERANGE);
 		return;
+		break;
+	case PLAIN:
+		c[0] = v->nextvalue;
+		NEXT();
+		/* shortcut for ordinary chr (not range, not MCCE leader) */
+		if (!SEE(RANGE) && !ISCELEADER(v, c[0])) {
+			onechr(v, c[0], lp, rp);
+			return;
+		}
+		startc = element(v, c, c+1);
+		NOERR();
+		break;
+	case COLLEL:
+		startp = v->now;
+		endp = scanplain(v);
+		INSIST(startp < endp, REG_ECOLLATE);
+		NOERR();
+		startc = element(v, startp, endp);
+		NOERR();
+		break;
+	case ECLASS:
+		startp = v->now;
+		endp = scanplain(v);
+		INSIST(startp < endp, REG_ECOLLATE);
+		NOERR();
+		startc = element(v, startp, endp);
+		NOERR();
+		cv = eclass(v, startc, (v->cflags&REG_ICASE));
+		NOERR();
+		dovec(v, cv, lp, rp);
+		return;
+		break;
+	case CCLASS:
+		startp = v->now;
+		endp = scanplain(v);
+		INSIST(startp < endp, REG_ECTYPE);
+		NOERR();
+		cv = cclass(v, startp, endp, (v->cflags&REG_ICASE));
+		NOERR();
+		dovec(v, cv, lp, rp);
+		return;
+		break;
+	default:
+		ERR(REG_ASSERT);
+		return;
+		break;
+	}
 
-	assert(from <= to);
-
-	switch (REP(MAP(from), MAP(to)))
-	{
-		case REP(0, 0): /* must be user doing this */
-			DROP(finish - start);		/* drop the operand */
-			break;
-		case REP(0, 1): /* as x{1,1}? */
-		case REP(0, N): /* as x{1,n}? */
-		case REP(0, INF):		/* as x{1,}? */
-			/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
-			INSERT(OCH_, start);	/* offset is wrong... */
-			repeat(p, start + 1, 1, to);
-			ASTERN(OOR1, start);
-			AHEAD(start);		/* ... fix it */
-			EMIT(OOR2, 0);
-			AHEAD(THERE());
-			ASTERN(O_CH, THERETHERE());
-			break;
-		case REP(1, 1): /* trivial case */
-			/* done */
-			break;
-		case REP(1, N): /* as x?x{1,n-1} */
-			/* KLUDGE: emit y? as (y|) until subtle bug gets fixed */
-			INSERT(OCH_, start);
-			ASTERN(OOR1, start);
-			AHEAD(start);
-			EMIT(OOR2, 0);		/* offset very wrong... */
-			AHEAD(THERE());		/* ...so fix it */
-			ASTERN(O_CH, THERETHERE());
-			copy = dupl(p, start + 1, finish + 1);
-			assert(copy == finish + 4);
-			repeat(p, copy, 1, to - 1);
-			break;
-		case REP(1, INF):		/* as x+ */
-			INSERT(OPLUS_, start);
-			ASTERN(O_PLUS, start);
-			break;
-		case REP(N, N): /* as xx{m-1,n-1} */
-			copy = dupl(p, start, finish);
-			repeat(p, copy, from - 1, to - 1);
+	if (SEE(RANGE)) {
+		NEXT();
+		switch (v->nexttype) {
+		case PLAIN:
+		case RANGE:
+			c[0] = v->nextvalue;
+			NEXT();
+			endc = element(v, c, c+1);
+			NOERR();
 			break;
-		case REP(N, INF):		/* as xx{n-1,INF} */
-			copy = dupl(p, start, finish);
-			repeat(p, copy, from - 1, to);
+		case COLLEL:
+			startp = v->now;
+			endp = scanplain(v);
+			INSIST(startp < endp, REG_ECOLLATE);
+			NOERR();
+			endc = element(v, startp, endp);
+			NOERR();
 			break;
-		default:				/* "can't happen" */
-			SETERROR(REG_ASSERT);		/* just in case */
+		default:
+			ERR(REG_ERANGE);
+			return;
 			break;
-	}
+		}
+	} else
+		endc = startc;		/* lone element: treated as a one-element range */
+
+	/*
+	 * Ranges are unportable.  Actually, standard C does
+	 * guarantee that digits are contiguous, but making
+	 * that an exception is just too complicated.
+	 */
+	if (startc != endc)
+		NOTE(REG_UUNPORT);
+	cv = range(v, startc, endc, (v->cflags&REG_ICASE));
+	NOERR();
+	dovec(v, cv, lp, rp);
 }
 
 /*
- * seterr - set an error condition
+ * scanplain - scan PLAIN contents of [. etc.
+ *
+ * Certain bits of trickery in regc_lex.c know that this code does not try
+ * to look past the final bracket of the [. etc.
  */
-static int						/* useless but makes type checking happy */
-seterr(struct parse * p, int e)
+static chr *			/* just after end of sequence */
+scanplain(struct vars *v)
 {
-	if (p->error == 0)			/* keep earliest error condition */
-		p->error = e;
-	p->next = nuls;				/* try to bring things to a halt */
-	p->end = nuls;
-	return 0;					/* make the return value well-defined */
+	chr *endp;
+
+	assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
+	NEXT();
+
+	endp = v->now;
+	while (SEE(PLAIN)) {
+		endp = v->now;
+		NEXT();
+	}
+
+	assert(SEE(END) || ISERR());
+	NEXT();
+
+	return endp;
 }
 
 /*
- * allocset - allocate a set of characters for []
+ * leaders - process a cvec of collating elements to also include leaders
+ * Also gives all characters involved their own colors, which is almost
+ * certainly necessary, and sets up a little disconnected subNFA.
  */
-static cset *
-allocset(struct parse * p)
+static void
+leaders(struct vars *v,
+		struct cvec *cv)
 {
-	int			no = p->g->ncsets++;
-	size_t		nc;
-	size_t		nbytes;
-	cset	   *cs;
-	size_t		css = (size_t) p->g->csetsize;
-	int			i;
-
-	if (no >= p->ncsalloc)
-	{							/* need another column of space */
-		p->ncsalloc += CHAR_BIT;
-		nc = p->ncsalloc;
-		assert(nc % CHAR_BIT == 0);
-		nbytes = nc / CHAR_BIT * css;
-		if (p->g->sets == NULL)
-			p->g->sets = (cset *) malloc(nc * sizeof(cset));
-		else
-			p->g->sets = (cset *) realloc((char *) p->g->sets,
-										  nc * sizeof(cset));
-		if (p->g->setbits == NULL)
-			p->g->setbits = (uch *) malloc(nbytes);
-		else
-		{
-			p->g->setbits = (uch *) realloc((char *) p->g->setbits,
-											nbytes);
-			/* xxx this isn't right if setbits is now NULL */
-			for (i = 0; i < no; i++)
-				p->g->sets[i].ptr = p->g->setbits + css * (i / CHAR_BIT);
-		}
-		if (p->g->sets != NULL && p->g->setbits != NULL)
-			memset((char *) p->g->setbits + (nbytes - css),
-				   0, css);
-		else
-		{
-			no = 0;
-			SETERROR(REG_ESPACE);
-			/* caller's responsibility not to do set ops */
+	int mcce;
+	chr *p;
+	chr leader;
+	struct state *s;
+	struct arc *a;
+
+	v->mccepbegin = newstate(v->nfa);	/* endpoints of the disconnected subNFA */
+	v->mccepend = newstate(v->nfa);
+	NOERR();
+
+	for (mcce = 0; mcce < cv->nmcces; mcce++) {
+		p = cv->mcces[mcce];
+		leader = *p;
+		if (!haschr(cv, leader)) {
+			addchr(cv, leader);
+			s = newstate(v->nfa);
+			newarc(v->nfa, PLAIN, subcolor(v->cm, leader),
+							v->mccepbegin, s);
+			okcolors(v->nfa, v->cm);
+		} else {
+			a = findarc(v->mccepbegin, PLAIN,
+						GETCOLOR(v->cm, leader));
+			assert(a != NULL);
+			s = a->to;
+			assert(s != v->mccepend);
 		}
+		p++;
+		assert(*p != 0 && *(p+1) == 0);	/* only 2-char MCCEs for now */
+		newarc(v->nfa, PLAIN, subcolor(v->cm, *p), s, v->mccepend);
+		okcolors(v->nfa, v->cm);
 	}
-
-	assert(p->g->sets != NULL); /* xxx */
-	cs = &p->g->sets[no];
-	cs->ptr = p->g->setbits + css * ((no) / CHAR_BIT);
-	cs->mask = 1 << ((no) % CHAR_BIT);
-	cs->hash = 0;
-	cs->smultis = 0;
-	cs->multis = NULL;
-
-	return cs;
 }
 
 /*
- * freeset - free a now-unused set
+ * onechr - fill in arcs for a plain character, and possible case complements
+ * This is mostly a shortcut for efficient handling of the common case.
  */
 static void
-freeset(struct parse * p, cset *cs)
+onechr(struct vars *v,
+	   chr c,
+	   struct state *lp,
+	   struct state *rp)
 {
-	int			i;
-	cset	   *top = &p->g->sets[p->g->ncsets];
-	size_t		css = (size_t) p->g->csetsize;
-
-	for (i = 0; i < css; i++)
-		CHsub(cs, i);
-	if (cs == top - 1)			/* recover only the easy case */
-		p->g->ncsets--;
+	if (!(v->cflags&REG_ICASE)) {	/* case-sensitive: a single arc suffices */
+		newarc(v->nfa, PLAIN, subcolor(v->cm, c), lp, rp);
+		return;
+	}
+
+	/* rats, need general case anyway... */
+	dovec(v, allcases(v, c), lp, rp);
 }
 
 /*
- * freezeset - final processing on a set of characters
- *
- * The main task here is merging identical sets.  This is usually a waste
- * of time (although the hash code minimizes the overhead), but can win
- * big if REG_ICASE is being used.	REG_ICASE, by the way, is why the hash
- * is done using addition rather than xor -- all ASCII [aA] sets xor to
- * the same value!
+ * dovec - fill in arcs for each element of a cvec
+ * This one has to handle the messy cases, like MCCEs and MCCE leaders.
  */
-static int						/* set number */
-freezeset(struct parse * p, cset *cs)
+static void
+dovec(struct vars *v,
+	  struct cvec *cv,
+	  struct state *lp,
+	  struct state *rp)
 {
-	uch			h = cs->hash;
-	int			i;
-	cset	   *top = &p->g->sets[p->g->ncsets];
-	cset	   *cs2;
-	size_t		css = (size_t) p->g->csetsize;
-
-	/* look for an earlier one which is the same */
-	for (cs2 = &p->g->sets[0]; cs2 < top; cs2++)
-		if (cs2->hash == h && cs2 != cs)
-		{
-			/* maybe */
-			for (i = 0; i < css; i++)
-				if (!!CHIN(cs2, i) != !!CHIN(cs, i))
-					break;		/* no */
-			if (i == css)
-				break;			/* yes */
+	chr ch, from, to;
+	celt ce;
+	chr *p;
+	int i;
+	color co;
+	struct cvec *leads;
+	struct arc *a;
+	struct arc *pa;		/* arc in prototype */
+	struct state *s;
+	struct state *ps;	/* state in prototype */
+
+	/* need a place to store leaders, if any */
+	if (nmcces(v) > 0) {
+		assert(v->mcces != NULL);
+		if (v->cv2 == NULL || v->cv2->nchrs < v->mcces->nchrs) {	/* NOTE(review): tests nchrs, not capacity -- confirm */
+			if (v->cv2 != NULL)
+				free(v->cv2);	/* NOTE(review): plain free(), not FREE() -- confirm it matches newcvec() */
+			v->cv2 = newcvec(v->mcces->nchrs, 0, v->mcces->nmcces);
+			NOERR();
+			leads = v->cv2;
+		} else
+			leads = clearcvec(v->cv2);
+	} else
+		leads = NULL;
+
+	/* first, get the ordinary characters out of the way */
+	for (p = cv->chrs, i = cv->nchrs; i > 0; p++, i--) {
+		ch = *p;
+		if (!ISCELEADER(v, ch))
+			newarc(v->nfa, PLAIN, subcolor(v->cm, ch), lp, rp);
+		else {
+			assert(singleton(v->cm, ch));
+			assert(leads != NULL);
+			if (!haschr(leads, ch))
+				addchr(leads, ch);
 		}
+	}
 
-	if (cs2 < top)
-	{							/* found one */
-		freeset(p, cs);
-		cs = cs2;
+	/* and the ranges */
+	for (p = cv->ranges, i = cv->nranges; i > 0; p += 2, i--) {
+		from = *p;
+		to = *(p+1);
+		while (from <= to && (ce = nextleader(v, from, to)) != NOCELT) {
+			if (from < ce)
+				subrange(v, from, ce - 1, lp, rp);
+			assert(singleton(v->cm, ce));
+			assert(leads != NULL);
+			if (!haschr(leads, ce))
+				addchr(leads, ce);
+			from = ce + 1;
+		}
+		if (from <= to)
+			subrange(v, from, to, lp, rp);
 	}
 
-	return (int) (cs - p->g->sets);
-}
+	if ((leads == NULL || leads->nchrs == 0) && cv->nmcces == 0)
+		return;
 
-/*
- * firstch - return first character in a set (which must have at least one)
- */
-static int						/* character; there is no "none" value */
-firstch(struct parse * p, cset *cs)
-{
-	int			i;
-	size_t		css = (size_t) p->g->csetsize;
-
-	for (i = 0; i < css; i++)
-		if (CHIN(cs, i))
-			return i;
-	assert(never);
-	return 0;					/* arbitrary */
+	/* deal with the MCCE leaders */
+	NOTE(REG_ULOCALE);
+	for (p = leads->chrs, i = leads->nchrs; i > 0; p++, i--) {
+		co = GETCOLOR(v->cm, *p);
+		a = findarc(lp, PLAIN, co);
+		if (a != NULL)
+			s = a->to;
+		else {
+			s = newstate(v->nfa);
+			NOERR();
+			newarc(v->nfa, PLAIN, co, lp, s);
+			NOERR();
+		}
+		pa = findarc(v->mccepbegin, PLAIN, co);
+		assert(pa != NULL);
+		ps = pa->to;
+		newarc(v->nfa, '$', 1, s, rp);
+		newarc(v->nfa, '$', 0, s, rp);
+		colorcomplement(v->nfa, v->cm, AHEAD, ps, s, rp);
+		NOERR();
+	}
+
+	/* and the MCCEs */
+	for (i = 0; i < cv->nmcces; i++) {
+		p = cv->mcces[i];
+		assert(singleton(v->cm, *p));
+		if (!singleton(v->cm, *p)) {	/* same check again, for builds without assert() */
+			ERR(REG_ASSERT);
+			return;
+		}
+		ch = *p++;
+		co = GETCOLOR(v->cm, ch);
+		a = findarc(lp, PLAIN, co);
+		if (a != NULL)
+			s = a->to;
+		else {
+			s = newstate(v->nfa);
+			NOERR();
+			newarc(v->nfa, PLAIN, co, lp, s);
+			NOERR();
+		}
+		assert(*p != 0);	/* at least two chars */
+		assert(singleton(v->cm, *p));
+		ch = *p++;
+		co = GETCOLOR(v->cm, ch);
+		assert(*p == 0);	/* and only two, for now */
+		newarc(v->nfa, PLAIN, co, s, rp);
+		NOERR();
+	}
 }
 
 /*
- * nch - number of characters in a set
+ * nextleader - find lowest MCCE leader within range [from, to]
  */
-static int
-nch(struct parse * p, cset *cs)
+static celt			/* NOCELT means none */
+nextleader(struct vars *v,
+		   chr from,
+		   chr to)
 {
-	int			i;
-	size_t		css = (size_t) p->g->csetsize;
-	int			n = 0;
-
-	for (i = 0; i < css; i++)
-		if (CHIN(cs, i))
-			n++;
-	return n;
+	int i;
+	chr *p;
+	chr ch;
+	celt it = NOCELT;
+
+	if (v->mcces == NULL)
+		return it;
+
+	for (i = v->mcces->nchrs, p = v->mcces->chrs; i > 0; i--, p++) {
+		ch = *p;
+		if (from <= ch && ch <= to)
+			if (it == NOCELT || ch < it)	/* keep the smallest one seen */
+				it = ch;
+	}
+	return it;
 }
 
 /*
- * mcadd - add a collating element to a cset
+ * wordchrs - set up word-chr list for word-boundary stuff, if needed
+ *
+ * The list is kept as a bunch of arcs between two dummy states; it's
+ * disposed of by the unreachable-states sweep in NFA optimization.
+ * Does NEXT().  Must not be called from any unusual lexical context.
+ * This should be reconciled with the \w etc. handling in regc_lex.c, and
+ * should be cleaned up to reduce dependencies on input scanning.
  */
 static void
-mcadd(struct parse * p, cset *cs, char *cp)
+wordchrs(struct vars *v)
 {
-	size_t		oldend = cs->smultis;
+	struct state *left;
+	struct state *right;
 
-	cs->smultis += strlen(cp) + 1;
-	if (cs->multis == NULL)
-		cs->multis = malloc(cs->smultis);
-	else
-		cs->multis = realloc(cs->multis, cs->smultis);
-	if (cs->multis == NULL)
-	{
-		SETERROR(REG_ESPACE);
+	if (v->wordchrs != NULL) {
+		NEXT();		/* for consistency */
 		return;
 	}
 
-	strcpy(cs->multis + oldend - 1, cp);
-	cs->multis[cs->smultis - 1] = '\0';
+	left = newstate(v->nfa);
+	right = newstate(v->nfa);
+	NOERR();
+	/* fine point:  implemented with [::], and lexer will set REG_ULOCALE */
+	lexword(v);
+	NEXT();
+	assert(v->savenow != NULL && SEE('['));
+	bracket(v, left, right);
+	assert((v->savenow != NULL && SEE(']')) || ISERR());
+	NEXT();
+	NOERR();
+	v->wordchrs = left;	/* remembered; later calls just do the NEXT() */
 }
 
 /*
- * mcinvert - invert the list of collating elements in a cset
- *
- * This would have to know the set of possibilities.  Implementation
- * is deferred.
+ * subre - allocate a subre (reusing one from v->treefree when available)
  */
-static void
-mcinvert(struct parse * p, cset *cs)
+static struct subre *
+subre(struct vars *v,
+	  int op,
+	  int flags,
+	  struct state *begin,
+	  struct state *end)
 {
-	assert(cs->multis == NULL); /* xxx */
+	struct subre *ret;
+
+	ret = v->treefree;
+	if (ret != NULL)
+		v->treefree = ret->left;	/* free list is linked through ->left */
+	else {
+		ret = (struct subre *)MALLOC(sizeof(struct subre));
+		if (ret == NULL) {
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		ret->chain = v->treechain;	/* fresh nodes also go on v->treechain */
+		v->treechain = ret;
+	}
+
+	assert(strchr("|.b(=", op) != NULL);
+
+	ret->op = op;
+	ret->flags = flags;
+	ret->retry = 0;
+	ret->subno = 0;
+	ret->min = ret->max = 1;
+	ret->left = NULL;
+	ret->right = NULL;
+	ret->begin = begin;
+	ret->end = end;
+	ZAPCNFA(ret->cnfa);
+
+	return ret;
 }
 
 /*
- * mccase - add case counterparts of the list of collating elements in a cset
- *
- * This would have to know the set of possibilities.  Implementation
- * is deferred.
+ * freesubre - free a subRE subtree (children first, then the node itself)
  */
 static void
-mccase(struct parse * p, cset *cs)
+freesubre(struct vars *v,			/* might be NULL */
+		  struct subre *sr)
 {
-	assert(cs->multis == NULL); /* xxx */
-}
+	if (sr == NULL)
+		return;
 
-/*
- * isinsets - is this character in any sets?
- */
-static int						/* predicate */
-isinsets(struct re_guts * g, int c)
-{
-	uch		   *col;
-	int			i;
-	int			ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
-	unsigned	uc = (unsigned char) c;
-
-	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
-		if (col[uc] != 0)
-			return 1;
-	return 0;
-}
+	if (sr->left != NULL)
+		freesubre(v, sr->left);
+	if (sr->right != NULL)
+		freesubre(v, sr->right);
 
-/*
- * samesets - are these two characters in exactly the same sets?
- */
-static int						/* predicate */
-samesets(struct re_guts * g, int c1, int c2)
-{
-	uch		   *col;
-	int			i;
-	int			ncols = (g->ncsets + (CHAR_BIT - 1)) / CHAR_BIT;
-	unsigned	uc1 = (unsigned char) c1;
-	unsigned	uc2 = (unsigned char) c2;
-
-	for (i = 0, col = g->setbits; i < ncols; i++, col += g->csetsize)
-		if (col[uc1] != col[uc2])
-			return 0;
-	return 1;
+	freesrnode(v, sr);
 }
 
 /*
- * categorize - sort out character categories
+ * freesrnode - free one node in a subRE subtree (recycled if v != NULL)
  */
 static void
-categorize(struct parse * p, struct re_guts * g)
+freesrnode(struct vars *v,			/* might be NULL */
+		   struct subre *sr)
 {
-	cat_t	   *cats = g->categories;
-	int			c;
-	int			c2;
-	cat_t		cat;
-
-	/* avoid making error situations worse */
-	if (p->error != 0)
+	if (sr == NULL)
 		return;
 
-	for (c = CHAR_MIN; c <= CHAR_MAX; c++)
-		if (cats[c] == 0 && isinsets(g, c))
-		{
-			cat = g->ncategories++;
-			cats[c] = cat;
-			for (c2 = c + 1; c2 <= CHAR_MAX; c2++)
-				if (cats[c2] == 0 && samesets(g, c, c2))
-					cats[c2] = cat;
-		}
-}
+	if (!NULLCNFA(sr->cnfa))
+		freecnfa(&sr->cnfa);
+	sr->flags = 0;
 
-/*
- * dupl - emit a duplicate of a bunch of sops
- */
-static sopno					/* start of duplicate */
-dupl(struct parse * p,
-	 sopno start,				/* from here */
-	 sopno finish)				/* to this less one */
-{
-	sopno		ret = HERE();
-	sopno		len = finish - start;
-
-	assert(finish >= start);
-	if (len == 0)
-		return ret;
-	enlarge(p, p->ssize + len); /* this many unexpected additions */
-	assert(p->ssize >= p->slen + len);
-	memcpy((char *) (p->strip + p->slen),
-		   (char *) (p->strip + start), (size_t) len * sizeof(sop));
-	p->slen += len;
-	return ret;
+	if (v != NULL) {
+		sr->left = v->treefree;	/* push onto v's free list, linked via ->left */
+		v->treefree = sr;
+	} else
+		FREE(sr);
 }
 
 /*
- * doemit - emit a strip operator
- *
- * It might seem better to implement this as a macro with a function as
- * hard-case backup, but it's just too big and messy unless there are
- * some changes to the data structures.  Maybe later.
+ * optst - optimize a subRE subtree (as written, it only walks the tree)
  */
 static void
-doemit(struct parse * p, sop op, size_t opnd)
+optst(struct vars *v,
+	  struct subre *t)
 {
-	/* avoid making error situations worse */
-	if (p->error != 0)
+	if (t == NULL)
 		return;
 
-	/* deal with oversize operands ("can't happen", more or less) */
-	assert(opnd < 1 << OPSHIFT);
-
-	/* deal with undersized strip */
-	if (p->slen >= p->ssize)
-		enlarge(p, (p->ssize + 1) / 2 * 3);		/* +50% */
-	assert(p->slen < p->ssize);
-
-	/* finally, it's all reduced to the easy case */
-	p->strip[p->slen++] = SOP(op, opnd);
+	/* recurse through children */
+	if (t->left != NULL)
+		optst(v, t->left);
+	if (t->right != NULL)
+		optst(v, t->right);
 }
 
 /*
- * doinsert - insert a sop into the strip
+ * numst - number tree nodes (assigning retry indexes)
  */
-static void
-doinsert(struct parse * p, sop op, size_t opnd, sopno pos)
+static int			/* next number */
+numst(struct subre *t,
+	  int start)			/* starting point for subtree numbers */
 {
-	sopno		sn;
-	sop			s;
-	int			i;
-
-	/* avoid making error situations worse */
-	if (p->error != 0)
-		return;
-
-	sn = HERE();
-	EMIT(op, opnd);				/* do checks, ensure space */
-	assert(HERE() == sn + 1);
-	s = p->strip[sn];
+	int i;
 
-	/* adjust paren pointers */
-	assert(pos > 0);
-	for (i = 1; i < NPAREN; i++)
-	{
-		if (p->pbegin[i] >= pos)
-			p->pbegin[i]++;
-		if (p->pend[i] >= pos)
-			p->pend[i]++;
-	}
+	assert(t != NULL);
 
-	memmove((char *) &p->strip[pos + 1], (char *) &p->strip[pos],
-			(HERE() - pos - 1) * sizeof(sop));
-	p->strip[pos] = s;
+	i = start;
+	t->retry = (short)i++;
+	if (t->left != NULL)
+		i = numst(t->left, i);
+	if (t->right != NULL)
+		i = numst(t->right, i);
+	return i;
 }
 
 /*
- * dofwd - complete a forward reference
+ * markst - mark tree nodes as INUSE
  */
 static void
-dofwd(struct parse * p, sopno pos, sop value)
+markst(struct subre *t)
 {
-	/* avoid making error situations worse */
-	if (p->error != 0)
-		return;
+	assert(t != NULL);
 
-	assert(value < 1 << OPSHIFT);
-	p->strip[pos] = OP(p->strip[pos]) | value;
+	t->flags |= INUSE;
+	if (t->left != NULL)
+		markst(t->left);
+	if (t->right != NULL)
+		markst(t->right);
 }
 
 /*
- * enlarge - enlarge the strip
+ * cleanst - free any tree nodes not marked INUSE
  */
 static void
-enlarge(struct parse * p, sopno size)
+cleanst(struct vars *v)
 {
-	sop		   *sp;
+	struct subre *t;
+	struct subre *next;
 
-	if (p->ssize >= size)
-		return;
-
-	sp = (sop *) realloc(p->strip, size * sizeof(sop));
-	if (sp == NULL)
-	{
-		SETERROR(REG_ESPACE);
-		return;
+	for (t = v->treechain; t != NULL; t = next) {
+		next = t->chain;
+		if (!(t->flags&INUSE))
+			FREE(t);
 	}
-	p->strip = sp;
-	p->ssize = size;
+	v->treechain = NULL;
+	v->treefree = NULL;		/* just on general principles */
 }
 
 /*
- * stripsnug - compact the strip
+ * nfatree - turn a subRE subtree into a tree of compacted NFAs
  */
-static void
-stripsnug(struct parse * p, struct re_guts * g)
+static long			/* optimize results from top node */
+nfatree(struct vars *v,
+		struct subre *t,
+		FILE *f)				/* for debug output */
 {
-	g->nstates = p->slen;
-	g->strip = (sop *) realloc((char *) p->strip, p->slen * sizeof(sop));
-	if (g->strip == NULL)
-	{
-		SETERROR(REG_ESPACE);
-		g->strip = p->strip;
-	}
+	assert(t != NULL && t->begin != NULL);
+
+	if (t->left != NULL)
+		(DISCARD)nfatree(v, t->left, f);
+	if (t->right != NULL)
+		(DISCARD)nfatree(v, t->right, f);
+
+	return nfanode(v, t, f);
 }
 
 /*
- * findmust - fill in must and mlen with longest mandatory literal string
- *
- * This algorithm could do fancy things like analyzing the operands of |
- * for common subsequences.  Someday.  This code is simple and finds most
- * of the interesting cases.
- *
- * Note that must and mlen got initialized during setup.
+ * nfanode - do one NFA for nfatree
  */
-static void
-findmust(struct parse * p, struct re_guts * g)
+static long			/* optimize results */
+nfanode(struct vars *v,
+		struct subre *t,
+		FILE *f)				/* for debug output */
 {
-	sop		   *scan;
-	sop		   *start = 0;
-	sop		   *newstart = 0;
-	sopno		newlen;
-	sop			s;
-	pg_wchar   *cp;
-	sopno		i;
-
-	/* avoid making error situations worse */
-	if (p->error != 0)
-		return;
+	struct nfa *nfa;
+	long ret = 0;
 
-	/* find the longest OCHAR sequence in strip */
-	newlen = 0;
-	scan = g->strip + 1;
-	do
-	{
-		s = *scan++;
-		switch (OP(s))
-		{
-			case OCHAR: /* sequence member */
-				if (newlen == 0)	/* new sequence */
-					newstart = scan - 1;
-				newlen++;
-				break;
-			case OPLUS_:		/* things that don't break one */
-			case OLPAREN:
-			case ORPAREN:
-				break;
-			case OQUEST_:		/* things that must be skipped */
-			case OCH_:
-				scan--;
-				do
-				{
-					scan += OPND(s);
-					s = *scan;
-					/* assert() interferes w debug printouts */
-					if (OP(s) != O_QUEST && OP(s) != O_CH &&
-						OP(s) != OOR2)
-					{
-						g->iflags |= BAD;
-						return;
-					}
-				} while (OP(s) != O_QUEST && OP(s) != O_CH);
-				/* fallthrough */
-			default:			/* things that break a sequence */
-				if (newlen > g->mlen)
-				{				/* ends one */
-					start = newstart;
-					g->mlen = newlen;
-				}
-				newlen = 0;
-				break;
-		}
-	} while (OP(s) != OEND);
+	assert(t->begin != NULL);
 
-	if (g->mlen == 0)			/* there isn't one */
-		return;
+#ifdef REG_DEBUG
+	if (f != NULL)
+	{
+		char idbuf[50];
 
-	/* turn it into a character string */
-	g->must = (pg_wchar *) malloc((size_t) (g->mlen + 1) * sizeof(pg_wchar));
-	if (g->must == NULL)
-	{							/* argh; just forget it */
-		g->mlen = 0;
-		return;
+		fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
+						stid(t, idbuf, sizeof(idbuf)));
 	}
-	cp = g->must;
-	scan = start;
-	for (i = g->mlen; i > 0; i--)
-	{
-		while (OP(s = *scan++) != OCHAR)
-			continue;
-		assert(cp < g->must + g->mlen);
-		*cp++ = (pg_wchar) OPND(s);
+#endif
+	nfa = newnfa(v, v->cm, v->nfa);
+	NOERRZ();
+	dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
+	if (!ISERR()) {
+		specialcolors(nfa);
+		ret = optimize(nfa, f);
 	}
-	assert(cp == g->must + g->mlen);
-	*cp++ = '\0';				/* just on general principles */
+	if (!ISERR())
+		compact(nfa, &t->cnfa);
+
+	freenfa(nfa);
+	return ret;
 }
 
 /*
- * pluscount - count + nesting
+ * newlacon - allocate a lookahead-constraint subRE
  */
-static sopno					/* nesting depth */
-pluscount(struct parse * p, struct re_guts * g)
+static int			/* lacon number, or 0 after ERR on failure */
+newlacon(struct vars *v,
+		 struct state *begin,
+		 struct state *end,
+		 int pos)			/* stored in subno; nonzero is reported as "positive" */
 {
-	sop		   *scan;
-	sop			s;
-	sopno		plusnest = 0;
-	sopno		maxnest = 0;
-
-	if (p->error != 0)
-		return 0;				/* there may not be an OEND */
-
-	scan = g->strip + 1;
-	do
-	{
-		s = *scan++;
-		switch (OP(s))
-		{
-			case OPLUS_:
-				plusnest++;
-				break;
-			case O_PLUS:
-				if (plusnest > maxnest)
-					maxnest = plusnest;
-				plusnest--;
-				break;
-		}
-	} while (OP(s) != OEND);
-	if (plusnest != 0)
-		g->iflags |= BAD;
-	return maxnest;
+	int n = (v->nlacons == 0) ? 1 : v->nlacons;	/* slot 0 is skipped */
+	struct subre *sub;
+	struct subre *newsubs;	/* keeps the old vector reachable if REALLOC fails */
+
+	if (v->nlacons == 0)
+		newsubs = (struct subre *)MALLOC(2 * sizeof(struct subre));
+	else
+		newsubs = (struct subre *)REALLOC(v->lacons,
+					(n+1)*sizeof(struct subre));
+	if (newsubs == NULL) {
+		/* v->lacons and v->nlacons are left as they were; nothing leaks */
+		ERR(REG_ESPACE);
+		return 0;
+	}
+	v->lacons = newsubs;		/* commit only once the allocation succeeded */
+	v->nlacons = n + 1;
+	sub = &v->lacons[n];
+	sub->begin = begin;
+	sub->end = end;
+	sub->subno = pos;
+	ZAPCNFA(sub->cnfa);
+	return n;
 }
 
 /*
- * some ctype functions with non-ascii-char guard
+ * freelacons - free lookahead-constraint subRE vector
  */
-static int
-pg_isdigit(int c)
+static void
+freelacons(struct subre *subs,
+		   int n)			/* slots in subs, counting the unused 0th */
 {
-	return (c >= 0 && c <= UCHAR_MAX && isdigit((unsigned char) c));
+	struct subre *sub;
+	int i;
+
+	assert(n > 0);
+	for (sub = subs + 1, i = n - 1; i > 0; sub++, i--)	/* no 0th */
+		if (!NULLCNFA(sub->cnfa))
+			freecnfa(&sub->cnfa);
+	FREE(subs);			/* finally the vector itself */
 }
 
-static int
-pg_isalpha(int c)
+/*
+ * rfree - free a whole RE (insides of regfree)
+ */
+static void
+rfree(regex_t *re)
 {
-	return (c >= 0 && c <= UCHAR_MAX && isalpha((unsigned char) c));
-}
+	struct guts *g;
 
-static int
-pg_isalnum(int c)
-{
-	return (c >= 0 && c <= UCHAR_MAX && isalnum((unsigned char) c));
-}
+	if (re == NULL || re->re_magic != REMAGIC)
+		return;
 
-static int
-pg_isupper(int c)
-{
-	return (c >= 0 && c <= UCHAR_MAX && isupper((unsigned char) c));
+	re->re_magic = 0;	/* invalidate RE */
+	g = (struct guts *)re->re_guts;
+	re->re_guts = NULL;
+	re->re_fns = NULL;
+	g->magic = 0;
+	freecm(&g->cmap);
+	if (g->tree != NULL)
+		freesubre((struct vars *)NULL, g->tree);
+	if (g->lacons != NULL)
+		freelacons(g->lacons, g->nlacons);
+	if (!NULLCNFA(g->search))
+		freecnfa(&g->search);
+	FREE(g);
 }
 
-static int
-pg_islower(int c)
-{
-	return (c >= 0 && c <= UCHAR_MAX && islower((unsigned char) c));
-}
+#ifdef REG_DEBUG
 
-static int
-pg_iscntrl(int c)
+/*
+ * dump - dump an RE in human-readable form (compiled only under REG_DEBUG)
+ */
+static void
+dump(regex_t *re,
+	 FILE *f)			/* stream that receives all of the output */
 {
-	return (c >= 0 && c <= UCHAR_MAX && iscntrl((unsigned char) c));
+	struct guts *g;
+	int i;
+
+	if (re->re_magic != REMAGIC)	/* complain, but keep going */
+		fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
+								REMAGIC);
+	if (re->re_guts == NULL) {
+		fprintf(f, "NULL guts!!!\n");
+		return;
+	}
+	g = (struct guts *)re->re_guts;
+	if (g->magic != GUTSMAGIC)	/* likewise only a warning */
+		fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
+								GUTSMAGIC);
+
+	fprintf(f, "\n\n\n========= DUMP ==========\n");
+	fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
+		re->re_nsub, re->re_info, re->re_csize, g->ntree);
+
+	dumpcolors(&g->cmap, f);
+	if (!NULLCNFA(g->search)) {
+		fprintf(f, "\nsearch:\n");	/* to f, not stdout, like the rest */
+		dumpcnfa(&g->search, f);
+	}
+	for (i = 1; i < g->nlacons; i++) {	/* starts at 1: no 0th lacon */
+		fprintf(f, "\nla%d (%s):\n", i,
+				(g->lacons[i].subno) ? "positive" : "negative");
+		dumpcnfa(&g->lacons[i].cnfa, f);
+	}
+	fprintf(f, "\n");
+	dumpst(g->tree, f, 0);
 }
 
-static int
-pg_isgraph(int c)
+/*
+ * dumpst - dump a subRE tree
+ */
+static void
+dumpst(struct subre *t,
+	   FILE *f,
+	   int nfapresent)			/* is the original NFA still around? */
 {
-	return (c >= 0 && c <= UCHAR_MAX && isgraph((unsigned char) c));
+	if (t == NULL)
+		fprintf(f, "null tree\n");
+	else
+		stdump(t, f, nfapresent);
+	fflush(f);
 }
 
-static int
-pg_isprint(int c)
+/*
+ * stdump - recursive guts of dumpst
+ */
+static void
+stdump(struct subre *t,
+	   FILE *f,
+	   int nfapresent)			/* is the original NFA still around? */
 {
-	return (c >= 0 && c <= UCHAR_MAX && isprint((unsigned char) c));
+	char idbuf[50];
+
+	fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
+	if (t->flags&LONGER)
+		fprintf(f, " longest");
+	if (t->flags&SHORTER)
+		fprintf(f, " shortest");
+	if (t->flags&MIXED)
+		fprintf(f, " hasmixed");
+	if (t->flags&CAP)
+		fprintf(f, " hascapture");
+	if (t->flags&BACKR)
+		fprintf(f, " hasbackref");
+	if (!(t->flags&INUSE))
+		fprintf(f, " UNUSED");
+	if (t->subno != 0)
+		fprintf(f, " (#%d)", t->subno);
+	if (t->min != 1 || t->max != 1) {
+		fprintf(f, " {%d,", t->min);
+		if (t->max != INFINITY)
+			fprintf(f, "%d", t->max);
+		fprintf(f, "}");
+	}
+	if (nfapresent)
+		fprintf(f, " %ld-%ld", (long)t->begin->no, (long)t->end->no);
+	if (t->left != NULL)
+		fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
+	if (t->right != NULL)
+		fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
+	if (!NULLCNFA(t->cnfa)) {
+		fprintf(f, "\n");
+		dumpcnfa(&t->cnfa, f);
+		fprintf(f, "\n");
+	}
+	if (t->left != NULL)
+		stdump(t->left, f, nfapresent);
+	if (t->right != NULL)
+		stdump(t->right, f, nfapresent);
 }
 
-static int
-pg_ispunct(int c)
+/*
+ * stid - identify a subtree node for dumping (retry number, else address)
+ */
+static char *			/* points to buf or constant string */
+stid(struct subre *t,
+	 char *buf,
+	 size_t bufsize)
 {
-	return (c >= 0 && c <= UCHAR_MAX && ispunct((unsigned char) c));
+	/* big enough for hex int or decimal t->retry? */
+	if (bufsize < sizeof(int)*2 + 3 || bufsize < sizeof(t->retry)*3 + 1)
+		return "unable";	/* buf is left untouched in this case */
+	if (t->retry != 0)		/* node has a nonzero retry number: use it */
+		sprintf(buf, "%d", t->retry);
+	else
+		sprintf(buf, "0x%x", (int)t);	/* may lose bits, that's okay */
+	return buf;
 }
 
-static struct cclass *
-cclass_init(void)
-{
-	static struct cclass cclasses_C[] = {
-		{"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789", ""},
-		{"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz", ""},
-		{"blank", " \t", ""},
-		{"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\25\26\27\30\31\32\33\34\35\36\37\177", ""},
-		{"digit", "0123456789", ""},
-		{"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""},
-		{"lower", "abcdefghijklmnopqrstuvwxyz", ""},
-		{"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ", ""},
-		{"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~", ""},
-		{"space", "\t\n\v\f\r ", ""},
-		{"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ", ""},
-		{"xdigit", "0123456789ABCDEFabcdef", ""},
-		{NULL, NULL, ""}
-	};
-	struct cclass *cp = NULL;
-	struct cclass *classes = NULL;
-	struct cclass_factory
-	{
-		char	   *name;
-		int			(*func) (int);
-		char	   *chars;
-	}			cclass_factories[] =
-	{
-		{
-			"alnum", pg_isalnum, NULL
-		},
-		{
-			"alpha", pg_isalpha, NULL
-		},
-		{
-			"blank", NULL, " \t"
-		},
-		{
-			"cntrl", pg_iscntrl, NULL
-		},
-		{
-			"digit", NULL, "0123456789"
-		},
-		{
-			"graph", pg_isgraph, NULL
-		},
-		{
-			"lower", pg_islower, NULL
-		},
-		{
-			"print", pg_isprint, NULL
-		},
-		{
-			"punct", pg_ispunct, NULL
-		},
-		{
-			"space", NULL, "\t\n\v\f\r "
-		},
-		{
-			"upper", pg_isupper, NULL
-		},
-		{
-			"xdigit", NULL, "0123456789ABCDEFabcdef"
-		},
-		{
-			NULL, NULL, NULL
-		}
-	};
-	struct cclass_factory *cf = NULL;
-
-	if (strcmp(setlocale(LC_CTYPE, NULL), "C") == 0)
-		return cclasses_C;
-
-	classes = malloc(sizeof(struct cclass) * (sizeof(cclass_factories) / sizeof(struct cclass_factory)));
-	if (classes == NULL)
-		elog(ERROR, "cclass_init: out of memory");
+#endif /* REG_DEBUG */
 
-	cp = classes;
-	for (cf = cclass_factories; cf->name != NULL; cf++)
-	{
-		cp->name = strdup(cf->name);
-		if (cf->chars)
-			cp->chars = strdup(cf->chars);
-		else
-		{
-			int			x = 0,
-						y = 0;
-
-			cp->chars = malloc(sizeof(char) * 256);
-			if (cp->chars == NULL)
-				elog(ERROR, "cclass_init: out of memory");
-			for (x = 0; x < 256; x++)
-			{
-				if ((cf->func) (x))
-					*(cp->chars + y++) = x;
-			}
-			*(cp->chars + y) = '\0';
-		}
-		cp->multis = "";
-		cp++;
-	}
-	cp->name = cp->chars = NULL;
-	cp->multis = "";
 
-	return classes;
-}
+#include "regc_lex.c"
+#include "regc_color.c"
+#include "regc_nfa.c"
+#include "regc_cvec.c"
+#include "regc_locale.c"
diff --git a/src/backend/regex/rege_dfa.c b/src/backend/regex/rege_dfa.c
new file mode 100644
index 00000000000..3bdfc2ab182
--- /dev/null
+++ b/src/backend/regex/rege_dfa.c
@@ -0,0 +1,655 @@
+/*
+ * DFA routines
+ * This file is #included by regexec.c.
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Header: /cvsroot/pgsql/src/backend/regex/rege_dfa.c,v 1.1 2003/02/05 17:41:33 tgl Exp $
+ *
+ */
+
+/*
+ * longest - longest-preferred matching engine
+ */
+static chr *			/* endpoint, or NULL */
+longest(struct vars *v,			/* supplies exec flags and string bounds */
+		struct dfa *d,
+		chr *start,			/* where the match should start */
+		chr *stop,			/* match must end at or before here */
+		int *hitstopp)			/* record whether hit v->stop, if non-NULL */
+{
+	chr *cp;
+	chr *realstop = (stop == v->stop) ? stop : stop + 1;	/* one chr past stop, unless at end of data */
+	color co;
+	struct sset *css;
+	struct sset *ss;
+	chr *post;
+	int i;
+	struct colormap *cm = d->cm;
+
+	/* initialize */
+	css = initialize(v, d, start);
+	cp = start;
+	if (hitstopp != NULL)
+		*hitstopp = 0;
+
+	/* startup */
+	FDEBUG(("+++ startup +++\n"));
+	if (cp == v->start) {
+		co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long)co));
+	} else {
+		co = GETCOLOR(cm, *(cp - 1));
+		FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+	}
+	css = miss(v, d, css, co, cp, start);	/* step on the bos color or the preceding chr's color */
+	if (css == NULL)
+		return NULL;
+	css->lastseen = cp;
+
+	/* main loop */
+	if (v->eflags&REG_FTRACE)	/* traced copy; the else arm is the same loop minus FDEBUG */
+		while (cp < realstop) {
+			FDEBUG(("+++ at c%d +++\n", css - d->ssets));
+			co = GETCOLOR(cm, *cp);
+			FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+			ss = css->outs[co];
+			if (ss == NULL) {
+				ss = miss(v, d, css, co, cp+1, start);
+				if (ss == NULL)
+					break;	/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+		}
+	else
+		while (cp < realstop) {
+			co = GETCOLOR(cm, *cp);
+			ss = css->outs[co];	/* cached transition, NULL if none recorded */
+			if (ss == NULL) {
+				ss = miss(v, d, css, co, cp+1, start);
+				if (ss == NULL)
+					break;	/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;	/* i.e. just past the chr that led here */
+			css = ss;
+		}
+
+	/* shutdown */
+	FDEBUG(("+++ shutdown at c%d +++\n", css - d->ssets));
+	if (cp == v->stop && stop == v->stop) {	/* ran to end of data: step on the eos color too */
+		if (hitstopp != NULL)
+			*hitstopp = 1;
+		co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long)co));
+		ss = miss(v, d, css, co, cp, start);
+		/* special case:  match ended at eol? */
+		if (ss != NULL && (ss->flags&POSTSTATE))
+			return cp;
+		else if (ss != NULL)
+			ss->lastseen = cp;	/* to be tidy */
+	}
+
+	/* find last match, if any */
+	post = d->lastpost;		/* saved by getvacant from recycled success sets */
+	for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
+		if ((ss->flags&POSTSTATE) && post != ss->lastseen &&
+					(post == NULL || post < ss->lastseen))
+			post = ss->lastseen;
+	if (post != NULL)		/* found one */
+		return post - 1;	/* lastseen is one past the chr that reached the set */
+
+	return NULL;
+}
+
+/*
+ * shortest - shortest-preferred matching engine
+ */
+static chr *			/* endpoint, or NULL */
+shortest(struct vars *v,
+		 struct dfa *d,
+		 chr *start,			/* where the match should start */
+		 chr *min,			/* match must end at or after here */
+		 chr *max,			/* match must end at or before here */
+		 chr **coldp,			/* store coldstart pointer here, if nonNULL */
+		 int *hitstopp)			/* record whether hit v->stop, if non-NULL */
+{
+	chr *cp;
+	chr *realmin = (min == v->stop) ? min : min + 1;	/* one chr past min, unless at end of data */
+	chr *realmax = (max == v->stop) ? max : max + 1;	/* likewise for max */
+	color co;
+	struct sset *css;
+	struct sset *ss;
+	struct colormap *cm = d->cm;
+
+	/* initialize */
+	css = initialize(v, d, start);
+	cp = start;
+	if (hitstopp != NULL)
+		*hitstopp = 0;
+
+	/* startup */
+	FDEBUG(("--- startup ---\n"));
+	if (cp == v->start) {
+		co = d->cnfa->bos[(v->eflags&REG_NOTBOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long)co));
+	} else {
+		co = GETCOLOR(cm, *(cp - 1));
+		FDEBUG(("char %c, color %ld\n", (char)*(cp-1), (long)co));
+	}
+	css = miss(v, d, css, co, cp, start);	/* step on the bos color or the preceding chr's color */
+	if (css == NULL)
+		return NULL;
+	css->lastseen = cp;
+	ss = css;			/* keeps ss meaningful if the loop body never runs */
+
+	/* main loop */
+	if (v->eflags&REG_FTRACE)	/* traced copy; the else arm is the same loop minus FDEBUG */
+		while (cp < realmax) {
+			FDEBUG(("--- at c%d ---\n", css - d->ssets));
+			co = GETCOLOR(cm, *cp);
+			FDEBUG(("char %c, color %ld\n", (char)*cp, (long)co));
+			ss = css->outs[co];
+			if (ss == NULL) {
+				ss = miss(v, d, css, co, cp+1, start);
+				if (ss == NULL)
+					break;	/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+			if ((ss->flags&POSTSTATE) && cp >= realmin)
+				break;		/* NOTE BREAK OUT */
+		}
+	else
+		while (cp < realmax) {
+			co = GETCOLOR(cm, *cp);
+			ss = css->outs[co];	/* cached transition, NULL if none recorded */
+			if (ss == NULL) {
+				ss = miss(v, d, css, co, cp+1, start);
+				if (ss == NULL)
+					break;	/* NOTE BREAK OUT */
+			}
+			cp++;
+			ss->lastseen = cp;
+			css = ss;
+			if ((ss->flags&POSTSTATE) && cp >= realmin)	/* first success far enough along ends the scan */
+				break;		/* NOTE BREAK OUT */
+		}
+
+	if (ss == NULL)			/* miss() reported the empty state set */
+		return NULL;
+
+	if (coldp != NULL)	/* report last no-progress state set, if any */
+		*coldp = lastcold(v, d);
+
+	if ((ss->flags&POSTSTATE) && cp > min) {
+		assert(cp >= realmin);
+		cp--;			/* undo the advance past the chr that reached the set */
+	} else if (cp == v->stop && max == v->stop) {
+		co = d->cnfa->eos[(v->eflags&REG_NOTEOL) ? 0 : 1];
+		FDEBUG(("color %ld\n", (long)co));
+		ss = miss(v, d, css, co, cp, start);
+		/* match might have ended at eol */
+		if ((ss == NULL || !(ss->flags&POSTSTATE)) && hitstopp != NULL)
+			*hitstopp = 1;	/* set only when even the eos step gave no match */
+	}
+
+	if (ss == NULL || !(ss->flags&POSTSTATE))
+		return NULL;
+
+	return cp;
+}
+
+/*
+ * lastcold - find the latest point at which no progress had yet been made
+ */
+static chr *			/* endpoint, or NULL */
+lastcold(struct vars *v,
+		 struct dfa *d)
+{
+	struct sset *const limit = d->ssets + d->nssused;
+	struct sset *cur;
+	chr *latest = (d->lastnopr != NULL) ? d->lastnopr : v->start;
+
+	/* raise latest to the newest lastseen of any NOPROGRESS state set */
+	for (cur = d->ssets; cur < limit; cur++) {
+		if (!(cur->flags&NOPROGRESS) || !(latest < cur->lastseen))
+			continue;
+		latest = cur->lastseen;
+	}
+	return latest;
+}
+
+/*
+ * newdfa - set up a fresh DFA for cnfa; returns NULL (after ERR) on failure
+ */
+static struct dfa *
+newdfa(struct vars *v,
+	   struct cnfa *cnfa,
+	   struct colormap *cm,
+	   struct smalldfa *small)		/* preallocated space, may be NULL */
+{
+	struct dfa *d;
+	size_t nss;			/* room is provided for this many state sets */
+	int wordsper;			/* unsigned words per state bitvector */
+	struct smalldfa *smallwas = small;
+
+	assert(cnfa != NULL && cnfa->nstates != 0);	/* before any use of cnfa */
+	nss = cnfa->nstates * 2;
+	wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
+	if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS) {
+		assert(wordsper == 1);
+		if (small == NULL) {	/* caller supplied no space, so get our own */
+			small = (struct smalldfa *)MALLOC(sizeof(struct smalldfa));
+			if (small == NULL) {
+				ERR(REG_ESPACE);
+				return NULL;
+			}
+		}
+		d = &small->dfa;
+		d->ssets = small->ssets;
+		d->statesarea = small->statesarea;
+		d->work = &d->statesarea[nss];
+		d->outsarea = small->outsarea;
+		d->incarea = small->incarea;
+		d->cptsmalloced = 0;
+		d->mallocarea = (smallwas == NULL) ? (char *)small : NULL;
+	} else {
+		d = (struct dfa *)MALLOC(sizeof(struct dfa));
+		if (d == NULL) {
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		d->ssets = (struct sset *)MALLOC(nss * sizeof(struct sset));
+		d->statesarea = (unsigned *)MALLOC((nss+WORK) * wordsper *
+							sizeof(unsigned));
+		d->outsarea = (struct sset **)MALLOC(nss * cnfa->ncolors *
+							sizeof(struct sset *));
+		d->incarea = (struct arcp *)MALLOC(nss * cnfa->ncolors *
+							sizeof(struct arcp));
+		d->cptsmalloced = 1;
+		d->mallocarea = (char *)d;
+		if (d->ssets == NULL || d->statesarea == NULL ||
+				d->outsarea == NULL || d->incarea == NULL) {
+			freedfa(d);	/* frees whichever pieces were obtained */
+			ERR(REG_ESPACE);
+			return NULL;
+		}
+		d->work = &d->statesarea[nss * wordsper];	/* statesarea is known non-NULL here */
+	}
+
+	d->nssets = (v->eflags&REG_SMALL) ? 7 : nss;
+	d->nssused = 0;
+	d->nstates = cnfa->nstates;
+	d->ncolors = cnfa->ncolors;
+	d->wordsper = wordsper;
+	d->cnfa = cnfa;
+	d->cm = cm;
+	d->lastpost = NULL;
+	d->lastnopr = NULL;
+	d->search = d->ssets;
+
+	/* initialization of sset fields is done as needed */
+
+	return d;
+}
+
+/*
+ * freedfa - free a DFA (whatever parts of it newdfa obtained with MALLOC)
+ */
+static void
+freedfa(struct dfa *d)
+{
+	if (d->cptsmalloced) {		/* the four areas were MALLOCed one by one */
+		if (d->ssets != NULL)
+			FREE(d->ssets);
+		if (d->statesarea != NULL)
+			FREE(d->statesarea);
+		if (d->outsarea != NULL)
+			FREE(d->outsarea);
+		if (d->incarea != NULL)
+			FREE(d->incarea);
+	}
+
+	if (d->mallocarea != NULL)	/* NULL when the space came from the caller */
+		FREE(d->mallocarea);
+}
+
+/*
+ * hash - fold a bitvector into a single-word hash code
+ *
+ * The code is simply the XOR of the n words of uv (0 when n <= 0).
+ */
+static unsigned
+hash(unsigned *uv,
+	 int n)
+{
+	unsigned code = 0;
+	unsigned *word = uv;
+	int left;
+
+	for (left = n; left > 0; left--)
+		code ^= *word++;
+	return code;
+}
+
+/*
+ * initialize - hand-craft a cache entry for startup, otherwise get ready
+ */
+static struct sset *
+initialize(struct vars *v,			/* used only for debug flags */
+		   struct dfa *d,
+		   chr *start)
+{
+	struct sset *ss;
+	int i;
+
+	/* is previous one still there? */
+	if (d->nssused > 0 && (d->ssets[0].flags&STARTER))
+		ss = &d->ssets[0];
+	else {				/* no, must (re)build it */
+		ss = getvacant(v, d, start, start);
+		for (i = 0; i < d->wordsper; i++)
+			ss->states[i] = 0;
+		BSET(ss->states, d->cnfa->pre);	/* the set holds just the pre state */
+		ss->hash = HASH(ss->states, d->wordsper);
+		assert(d->cnfa->pre != d->cnfa->post);
+		ss->flags = STARTER|LOCKED|NOPROGRESS;	/* pickss never recycles a LOCKED set */
+		/* lastseen dealt with below */
+	}
+
+	for (i = 0; i < d->nssused; i++)	/* forget all earlier sightings */
+		d->ssets[i].lastseen = NULL;
+	ss->lastseen = start;		/* maybe untrue, but harmless */
+	d->lastpost = NULL;
+	d->lastnopr = NULL;
+	return ss;
+}
+
+/*
+ * miss - handle a cache miss
+ */
+static struct sset *		/* NULL if goes to empty set */
+miss(struct vars *v,			/* used only for debug flags */
+	 struct dfa *d,
+	 struct sset *css,
+	 pcolor co,
+	 chr *cp,			/* next chr */
+	 chr *start)			/* where the attempt got started */
+{
+	struct cnfa *cnfa = d->cnfa;
+	int i;
+	unsigned h;
+	struct carc *ca;
+	struct sset *p;
+	int ispost;			/* new set contains the post state */
+	int noprogress;			/* see where it is cleared, below */
+	int gotstate;			/* some arc of color co was found */
+	int dolacons;			/* another closure pass is needed */
+	int sawlacons;			/* a lookahead arc was met, so don't cache */
+
+	/* for convenience, we can be called even if it might not be a miss */
+	if (css->outs[co] != NULL) {
+		FDEBUG(("hit\n"));
+		return css->outs[co];
+	}
+	FDEBUG(("miss\n"));
+
+	/* first, what set of states would we end up in? */
+	for (i = 0; i < d->wordsper; i++)
+		d->work[i] = 0;
+	ispost = 0;
+	noprogress = 1;
+	gotstate = 0;
+	for (i = 0; i < d->nstates; i++)
+		if (ISBSET(css->states, i))
+			for (ca = cnfa->states[i]+1; ca->co != COLORLESS; ca++)	/* arcs begin at element 1 */
+				if (ca->co == co) {
+					BSET(d->work, ca->to);
+					gotstate = 1;
+					if (ca->to == cnfa->post)
+						ispost = 1;
+					if (!cnfa->states[ca->to]->co)	/* element 0's co serves as a per-state flag */
+						noprogress = 0;
+					FDEBUG(("%d -> %d\n", i, ca->to));
+				}
+	dolacons = (gotstate) ? (cnfa->flags&HASLACONS) : 0;
+	sawlacons = 0;
+	while (dolacons) {		/* transitive closure */
+		dolacons = 0;
+		for (i = 0; i < d->nstates; i++)
+			if (ISBSET(d->work, i))
+				for (ca = cnfa->states[i]+1; ca->co != COLORLESS;
+									ca++) {
+					if (ca->co <= cnfa->ncolors)	/* not a lookahead-constraint arc */
+						continue; /* NOTE CONTINUE */
+					sawlacons = 1;
+					if (ISBSET(d->work, ca->to))	/* target already in the set */
+						continue; /* NOTE CONTINUE */
+					if (!lacon(v, cnfa, cp, ca->co))	/* constraint not satisfied at cp */
+						continue; /* NOTE CONTINUE */
+					BSET(d->work, ca->to);
+					dolacons = 1;
+					if (ca->to == cnfa->post)
+						ispost = 1;
+					if (!cnfa->states[ca->to]->co)
+						noprogress = 0;
+					FDEBUG(("%d :> %d\n", i, ca->to));
+				}
+	}
+	if (!gotstate)
+		return NULL;
+	h = HASH(d->work, d->wordsper);
+
+	/* next, is that in the cache? */
+	for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
+		if (HIT(h, d->work, p, d->wordsper)) {
+			FDEBUG(("cached c%d\n", p - d->ssets));
+			break;			/* NOTE BREAK OUT */
+		}
+	if (i == 0) {		/* nope, need a new cache entry */
+		p = getvacant(v, d, cp, start);
+		assert(p != css);
+		for (i = 0; i < d->wordsper; i++)
+			p->states[i] = d->work[i];
+		p->hash = h;
+		p->flags = (ispost) ? POSTSTATE : 0;
+		if (noprogress)
+			p->flags |= NOPROGRESS;
+		/* lastseen to be dealt with by caller */
+	}
+
+	if (!sawlacons) {		/* lookahead conds. always cache miss */
+		FDEBUG(("c%d[%d]->c%d\n", css - d->ssets, co, p - d->ssets));
+		css->outs[co] = p;
+		css->inchain[co] = p->ins;	/* old head of p's inarc chain becomes our successor */
+		p->ins.ss = css;		/* and this arc becomes the new head */
+		p->ins.co = (color)co;
+	}
+	return p;
+}
+
+/*
+ * lacon - lookahead-constraint checker for miss()
+ */
+static int			/* predicate:  constraint satisfied? */
+lacon(struct vars *v,
+	  struct cnfa *pcnfa,		/* parent cnfa */
+	  chr *cp,
+	  pcolor co)			/* "color" of the lookahead constraint */
+{
+	int n;
+	struct subre *sub;
+	struct dfa *d;
+	struct smalldfa sd;		/* local space offered to newdfa */
+	chr *end;
+
+	n = co - pcnfa->ncolors;	/* index into the lacons vector */
+	assert(n < v->g->nlacons && v->g->lacons != NULL);
+	FDEBUG(("=== testing lacon %d\n", n));
+	sub = &v->g->lacons[n];
+	d = newdfa(v, &sub->cnfa, &v->g->cmap, &sd);
+	if (d == NULL) {
+		ERR(REG_ESPACE);
+		return 0;
+	}
+	end = longest(v, d, cp, v->stop, (int *)NULL);	/* any match starting at cp will do */
+	freedfa(d);
+	FDEBUG(("=== lacon %d match %d\n", n, (end != NULL)));
+	return (sub->subno) ? (end != NULL) : (end == NULL);	/* nonzero subno: must match; zero: must not */
+}
+
+/*
+ * getvacant - get a vacant state set
+ * This routine clears out the inarcs and outarcs, but does not otherwise
+ * clear the innards of the state set -- that's up to the caller.
+ */
+static struct sset *
+getvacant(struct vars *v,			/* used only for debug flags */
+		  struct dfa *d,
+		  chr *cp,
+		  chr *start)
+{
+	int i;
+	struct sset *ss;
+	struct sset *p;
+	struct arcp ap;
+	struct arcp lastap;		/* trails ap by one link in the chain walk below */
+	color co;
+
+	ss = pickss(v, d, cp, start);	/* either a never-used slot or a recycled one */
+	assert(!(ss->flags&LOCKED));
+
+	/* clear out its inarcs, including self-referential ones */
+	ap = ss->ins;
+	while ((p = ap.ss) != NULL) {
+		co = ap.co;
+		FDEBUG(("zapping c%d's %ld outarc\n", p - d->ssets, (long)co));
+		p->outs[co] = NULL;
+		ap = p->inchain[co];	/* fetch the next link before wiping this one */
+		p->inchain[co].ss = NULL;	/* paranoia */
+	}
+	ss->ins.ss = NULL;
+
+	/* take it off the inarc chains of the ssets reached by its outarcs */
+	for (i = 0; i < d->ncolors; i++) {
+		p = ss->outs[i];
+		assert(p != ss);		/* not self-referential */
+		if (p == NULL)
+			continue;		/* NOTE CONTINUE */
+		FDEBUG(("del outarc %d from c%d's in chn\n", i, p - d->ssets));
+		if (p->ins.ss == ss && p->ins.co == i)
+			p->ins = ss->inchain[i];	/* our arc heads the chain: unlink at the head */
+		else {
+			assert(p->ins.ss != NULL);
+			for (ap = p->ins; ap.ss != NULL &&
+						!(ap.ss == ss && ap.co == i);
+						ap = ap.ss->inchain[ap.co])
+				lastap = ap;
+			assert(ap.ss != NULL);
+			lastap.ss->inchain[lastap.co] = ss->inchain[i];	/* splice our arc out of the middle */
+		}
+		ss->outs[i] = NULL;
+		ss->inchain[i].ss = NULL;
+	}
+
+	/* if ss was a success state, may need to remember location */
+	if ((ss->flags&POSTSTATE) && ss->lastseen != d->lastpost &&
+			(d->lastpost == NULL || d->lastpost < ss->lastseen))
+		d->lastpost = ss->lastseen;
+
+	/* likewise for a no-progress state */
+	if ((ss->flags&NOPROGRESS) && ss->lastseen != d->lastnopr &&
+			(d->lastnopr == NULL || d->lastnopr < ss->lastseen))
+		d->lastnopr = ss->lastseen;
+
+	return ss;
+}
+
+/*
+ * pickss - pick the next stateset to be used
+ */
+static struct sset *
+pickss(struct vars *v,			/* used only for debug flags */
+	   struct dfa *d,
+	   chr *cp,
+	   chr *start)
+{
+	int i;
+	struct sset *ss;
+	struct sset *end;
+	chr *ancient;			/* sets last seen before this may be reused */
+
+	/* shortcut for cases where cache isn't full */
+	if (d->nssused < d->nssets) {
+		i = d->nssused;
+		d->nssused++;
+		ss = &d->ssets[i];
+		FDEBUG(("new c%d\n", i));
+		/* set up innards */
+		ss->states = &d->statesarea[i * d->wordsper];	/* slot i's slice of each shared area */
+		ss->flags = 0;
+		ss->ins.ss = NULL;
+		ss->ins.co = WHITE;		/* give it some value */
+		ss->outs = &d->outsarea[i * d->ncolors];
+		ss->inchain = &d->incarea[i * d->ncolors];
+		for (i = 0; i < d->ncolors; i++) {	/* i is reused as a color index from here */
+			ss->outs[i] = NULL;
+			ss->inchain[i].ss = NULL;
+		}
+		return ss;
+	}
+
+	/* look for oldest, or old enough anyway */
+	if (cp - start > d->nssets*2/3)		/* oldest 33% are expendable */
+		ancient = cp - d->nssets*2/3;
+	else
+		ancient = start;
+	for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)	/* resume after the previous victim */
+		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+							!(ss->flags&LOCKED)) {
+			d->search = ss + 1;
+			FDEBUG(("replacing c%d\n", ss - d->ssets));
+			return ss;
+		}
+	for (ss = d->ssets, end = d->search; ss < end; ss++)	/* wrap around to the part skipped above */
+		if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
+							!(ss->flags&LOCKED)) {
+			d->search = ss + 1;
+			FDEBUG(("replacing c%d\n", ss - d->ssets));
+			return ss;
+		}
+
+	/* nobody's old enough?!? -- something's really wrong */
+	FDEBUG(("can't find victim to replace!\n"));
+	assert(NOTREACHED);
+	ERR(REG_ASSERT);
+	return d->ssets;		/* error is recorded; still hand back a set */
+}
diff --git a/src/backend/regex/regerror.c b/src/backend/regex/regerror.c
index 27ad9e2bd3e..94693eba211 100644
--- a/src/backend/regex/regerror.c
+++ b/src/backend/regex/regerror.c
@@ -1,181 +1,110 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
+/*
+ * regerror - error-code expansion
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * $Header: /cvsroot/pgsql/src/backend/regex/regerror.c,v 1.25 2003/02/05 17:41:33 tgl Exp $
  *
- *		@(#)regerror.c	8.4 (Berkeley) 3/20/94
  */
 
-#include "postgres.h"
-
-#include <ctype.h>
-#include <limits.h>
-#include <assert.h>
+#include "regex/regguts.h"
 
-#include "regex/regex.h"
-#include "regex/utils.h"
-#include "regex/regex2.h"
-
-static char *regatoi(const regex_t *preg, char *localbuf);
-
-static struct rerr
-{
-	int			code;
-	char	   *name;
-	char	   *explain;
-}	rerrs[] =
+/* unknown-error explanation; a printf format whose %x receives the bad code */
+static char unk[] = "*** unknown regex error code 0x%x ***";
 
-{
-	{
-		/* NOMATCH is not really an error condition, it just says no match */
-		REG_NOMATCH, "REG_NOMATCH", "no pattern match found"
-	},
-	{
-		REG_BADPAT, "REG_BADPAT", "invalid regex struct"
-	},
-	{
-		REG_ECOLLATE, "REG_ECOLLATE", "invalid collating element"
-	},
-	{
-		REG_ECTYPE, "REG_ECTYPE", "invalid character class"
-	},
-	{
-		REG_EESCAPE, "REG_EESCAPE", "trailing backslash (\\)"
-	},
-	{
-		REG_ESUBREG, "REG_ESUBREG", "invalid backreference number"
-	},
-	{
-		REG_EBRACK, "REG_EBRACK", "brackets [ ] not balanced"
-	},
-	{
-		REG_EPAREN, "REG_EPAREN", "parentheses ( ) not balanced"
-	},
-	{
-		REG_EBRACE, "REG_EBRACE", "braces { } not balanced"
-	},
-	{
-		REG_BADBR, "REG_BADBR", "invalid repetition count(s) in { }"
-	},
-	{
-		REG_ERANGE, "REG_ERANGE", "invalid character range in [ ]"
-	},
-	{
-		REG_ESPACE, "REG_ESPACE", "ran out of memory"
-	},
-	{
-		REG_BADRPT, "REG_BADRPT", "?, *, or + operand invalid"
-	},
-	{
-		REG_EMPTY, "REG_EMPTY", "empty expression or subexpression"
-	},
-	{
-		REG_ASSERT, "REG_ASSERT", "\"can't happen\" -- you found a bug"
-	},
-	{
-		REG_INVARG, "REG_INVARG", "invalid argument to regex routine"
-	},
-	{
-		0, "", "*** unknown regexp error code ***"
-	}
+/* struct to map among codes, code names, and explanations */
+static struct rerr {
+	int code;		/* numeric error code; a negative value ends the table */
+	char *name;		/* code name, the string REG_ATOI compares against */
+	char *explain;		/* explanation text returned for a normal error code */
+} rerrs[] = {
+	/* the actual table is built from regex.h */
+#include "regex/regerrs.h"
+	{ -1,	"",	"oops" },	/* end marker; explanation special-cased in code */
 };
 
 /*
- * regerror - the interface to error numbers
+ * pg_regerror - the interface to error numbers (explain a code, or map name<->number)
  */
 /* ARGSUSED */
-size_t
-pg_regerror(int errcode, const regex_t *preg,
-			char *errbuf, size_t errbuf_size)
+size_t				/* actual space needed (including NUL) */
+pg_regerror(int errcode,		/* error code, or REG_ATOI or REG_ITOA */
+			const regex_t *preg, /* associated regex_t (unused at present) */
+			char *errbuf,		/* result buffer (unless errbuf_size==0); also the input string for REG_ATOI/REG_ITOA */
+			size_t errbuf_size)	/* available space in errbuf, can be 0 */
 {
 	struct rerr *r;
-	size_t		len;
-	int			target = errcode & ~REG_ITOA;
-	char	   *s;
-	char		convbuf[50];
+	char *msg;			/* text to deliver: a table string or convbuf */
+	char convbuf[sizeof(unk)+50];	/* 50 = plenty for int */
+	size_t len;
+	int icode;			/* REG_ITOA only: number parsed from errbuf */
 
-	if (errcode == REG_ATOI)
-		s = regatoi(preg, convbuf);
-	else
-	{
-		for (r = rerrs; r->code != 0; r++)
-			if (r->code == target)
+	switch (errcode) {
+	case REG_ATOI:		/* convert name to number */
+		for (r = rerrs; r->code >= 0; r++)	/* stops at the -1 end marker */
+			if (strcmp(r->name, errbuf) == 0)	/* errbuf holds the name on entry */
 				break;
-
-		if (errcode & REG_ITOA)
-		{
-			if (r->code != 0)
-				strcpy(convbuf, r->name);
-			else
-				sprintf(convbuf, "REG_0x%x", target);
-			assert(strlen(convbuf) < sizeof(convbuf));
-			s = convbuf;
+		sprintf(convbuf, "%d", r->code);	/* -1 for unknown */
+		msg = convbuf;
+		break;
+	case REG_ITOA:		/* convert number to name */
+		icode = atoi(errbuf);	/* not our problem if this fails */
+		for (r = rerrs; r->code >= 0; r++)
+			if (r->code == icode)
+				break;
+		if (r->code >= 0)
+			msg = r->name;
+		else {			/* unknown; tell him the number */
+			sprintf(convbuf, "REG_%u", (unsigned)icode);
+			msg = convbuf;
 		}
-		else
-			s = r->explain;
+		break;
+	default:		/* a real, normal error code */
+		for (r = rerrs; r->code >= 0; r++)
+			if (r->code == errcode)
+				break;
+		if (r->code >= 0)
+			msg = r->explain;
+		else {			/* unknown; say so */
+			sprintf(convbuf, unk, errcode);
+			msg = convbuf;
+		}
+		break;
 	}
 
-	len = strlen(s) + 1;
-	if (errbuf_size > 0)
-	{
+	len = strlen(msg) + 1;		/* space needed, including NUL */
+	if (errbuf_size > 0) {		/* size 0: nothing is copied out, only len is returned */
 		if (errbuf_size > len)
-			strcpy(errbuf, s);
-		else
-		{
-			strncpy(errbuf, s, errbuf_size - 1);
-			errbuf[errbuf_size - 1] = '\0';
+			strcpy(errbuf, msg);
+		else {			/* truncate to fit (an exact fit also lands here, losing nothing) */
+			strncpy(errbuf, msg, errbuf_size-1);
+			errbuf[errbuf_size-1] = '\0';
 		}
 	}
 
 	return len;
 }
-
-/*
- * regatoi - internal routine to implement REG_ATOI
- */
-static char *
-regatoi(const regex_t *preg, char *localbuf)
-{
-	struct rerr *r;
-
-	for (r = rerrs; r->code != 0; r++)
-		if (pg_char_and_wchar_strcmp(r->name, preg->re_endp) == 0)
-			break;
-
-	if (r->code == 0)
-		return "0";
-
-	sprintf(localbuf, "%d", r->code);
-	return localbuf;
-}
diff --git a/src/backend/regex/regex.3 b/src/backend/regex/regex.3
deleted file mode 100644
index 01a6109278a..00000000000
--- a/src/backend/regex/regex.3
+++ /dev/null
@@ -1,538 +0,0 @@
-.\" Copyright (c) 1992, 1993, 1994 Henry Spencer.
-.\" Copyright (c) 1992, 1993, 1994
-.\"	The Regents of the University of California.  All rights reserved.
-.\"
-.\" This code is derived from software contributed to Berkeley by
-.\" Henry Spencer.
-.\"
-.\" Redistribution and use in source and binary forms, with or without
-.\" modification, are permitted provided that the following conditions
-.\" are met:
-.\" 1. Redistributions of source code must retain the above copyright
-.\"    notice, this list of conditions and the following disclaimer.
-.\" 2. Redistributions in binary form must reproduce the above copyright
-.\"    notice, this list of conditions and the following disclaimer in the
-.\"    documentation and/or other materials provided with the distribution.
-.\" 3. All advertising materials mentioning features or use of this software
-.\"    must display the following acknowledgement:
-.\"	This product includes software developed by the University of
-.\"	California, Berkeley and its contributors.
-.\" 4. Neither the name of the University nor the names of its contributors
-.\"    may be used to endorse or promote products derived from this software
-.\"    without specific prior written permission.
-.\"
-.\" THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
-.\" ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
-.\" IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
-.\" ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
-.\" FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
-.\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
-.\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
-.\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
-.\" LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
-.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
-.\" SUCH DAMAGE.
-.\"
-.\"	@(#)regex.3	8.4 (Berkeley) 3/20/94
-.\"
-.TH REGEX 3 "March 20, 1994"
-.de ZR
-.\" one other place knows this name:  the SEE ALSO section
-.IR re_format (7) \\$1
-..
-.SH NAME
-regcomp, regexec, regerror, regfree \- regular-expression library
-.SH SYNOPSIS
-.ft B
-.\".na
-#include <sys/types.h>
-.br
-#include <regex.h>
-.HP 10
-int regcomp(regex_t\ *preg, const\ char\ *pattern, int\ cflags);
-.HP
-int\ regexec(const\ regex_t\ *preg, const\ char\ *string,
-size_t\ nmatch, regmatch_t\ pmatch[], int\ eflags);
-.HP
-size_t\ regerror(int\ errcode, const\ regex_t\ *preg,
-char\ *errbuf, size_t\ errbuf_size);
-.HP
-void\ regfree(regex_t\ *preg);
-.\".ad
-.ft
-.SH DESCRIPTION
-These routines implement POSIX 1003.2 regular expressions (``RE''s);
-see
-.ZR .
-.I Regcomp
-compiles an RE written as a string into an internal form,
-.I regexec
-matches that internal form against a string and reports results,
-.I regerror
-transforms error codes from either into human-readable messages,
-and
-.I regfree
-frees any dynamically-allocated storage used by the internal form
-of an RE.
-.PP
-The header
-.I <regex.h>
-declares two structure types,
-.I regex_t
-and
-.IR regmatch_t ,
-the former for compiled internal forms and the latter for match reporting.
-It also declares the four functions,
-a type
-.IR regoff_t ,
-and a number of constants with names starting with ``REG_''.
-.PP
-.I Regcomp
-compiles the regular expression contained in the
-.I pattern
-string,
-subject to the flags in
-.IR cflags ,
-and places the results in the
-.I regex_t
-structure pointed to by
-.IR preg .
-.I Cflags
-is the bitwise OR of zero or more of the following flags:
-.IP REG_EXTENDED \w'REG_EXTENDED'u+2n
-Compile modern (``extended'') REs,
-rather than the obsolete (``basic'') REs that
-are the default.
-.IP REG_BASIC
-This is a synonym for 0,
-provided as a counterpart to REG_EXTENDED to improve readability.
-.IP REG_NOSPEC
-Compile with recognition of all special characters turned off.
-All characters are thus considered ordinary,
-so the ``RE'' is a literal string.
-This is an extension,
-compatible with but not specified by POSIX 1003.2,
-and should be used with
-caution in software intended to be portable to other systems.
-REG_EXTENDED and REG_NOSPEC may not be used
-in the same call to
-.IR regcomp .
-.IP REG_ICASE
-Compile for matching that ignores upper/lower case distinctions.
-See
-.ZR .
-.IP REG_NOSUB
-Compile for matching that need only report success or failure,
-not what was matched.
-.IP REG_NEWLINE
-Compile for newline-sensitive matching.
-By default, newline is a completely ordinary character with no special
-meaning in either REs or strings.
-With this flag,
-`[^' bracket expressions and `.' never match newline,
-a `^' anchor matches the null string after any newline in the string
-in addition to its normal function,
-and the `$' anchor matches the null string before any newline in the
-string in addition to its normal function.
-.IP REG_PEND
-The regular expression ends,
-not at the first NUL,
-but just before the character pointed to by the
-.I re_endp
-member of the structure pointed to by
-.IR preg .
-The
-.I re_endp
-member is of type
-.IR const\ char\ * .
-This flag permits inclusion of NULs in the RE;
-they are considered ordinary characters.
-This is an extension,
-compatible with but not specified by POSIX 1003.2,
-and should be used with
-caution in software intended to be portable to other systems.
-.PP
-When successful,
-.I regcomp
-returns 0 and fills in the structure pointed to by
-.IR preg .
-One member of that structure
-(other than
-.IR re_endp )
-is publicized:
-.IR re_nsub ,
-of type
-.IR size_t ,
-contains the number of parenthesized subexpressions within the RE
-(except that the value of this member is undefined if the
-REG_NOSUB flag was used).
-If
-.I regcomp
-fails, it returns a non-zero error code;
-see DIAGNOSTICS.
-.PP
-.I Regexec
-matches the compiled RE pointed to by
-.I preg
-against the
-.IR string ,
-subject to the flags in
-.IR eflags ,
-and reports results using
-.IR nmatch ,
-.IR pmatch ,
-and the returned value.
-The RE must have been compiled by a previous invocation of
-.IR regcomp .
-The compiled form is not altered during execution of
-.IR regexec ,
-so a single compiled RE can be used simultaneously by multiple threads.
-.PP
-By default,
-the NUL-terminated string pointed to by
-.I string
-is considered to be the text of an entire line, minus any terminating
-newline.
-The
-.I eflags
-argument is the bitwise OR of zero or more of the following flags:
-.IP REG_NOTBOL \w'REG_STARTEND'u+2n
-The first character of
-the string
-is not the beginning of a line, so the `^' anchor should not match before it.
-This does not affect the behavior of newlines under REG_NEWLINE.
-.IP REG_NOTEOL
-The NUL terminating
-the string
-does not end a line, so the `$' anchor should not match before it.
-This does not affect the behavior of newlines under REG_NEWLINE.
-.IP REG_STARTEND
-The string is considered to start at
-\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_so\fR
-and to have a terminating NUL located at
-\fIstring\fR\ + \fIpmatch\fR[0].\fIrm_eo\fR
-(there need not actually be a NUL at that location),
-regardless of the value of
-.IR nmatch .
-See below for the definition of
-.IR pmatch
-and
-.IR nmatch .
-This is an extension,
-compatible with but not specified by POSIX 1003.2,
-and should be used with
-caution in software intended to be portable to other systems.
-Note that a non-zero \fIrm_so\fR does not imply REG_NOTBOL;
-REG_STARTEND affects only the location of the string,
-not how it is matched.
-.PP
-See
-.ZR
-for a discussion of what is matched in situations where an RE or a
-portion thereof could match any of several substrings of
-.IR string .
-.PP
-Normally,
-.I regexec
-returns 0 for success and the non-zero code REG_NOMATCH for failure.
-Other non-zero error codes may be returned in exceptional situations;
-see DIAGNOSTICS.
-.PP
-If REG_NOSUB was specified in the compilation of the RE,
-or if
-.I nmatch
-is 0,
-.I regexec
-ignores the
-.I pmatch
-argument (but see below for the case where REG_STARTEND is specified).
-Otherwise,
-.I pmatch
-points to an array of
-.I nmatch
-structures of type
-.IR regmatch_t .
-Such a structure has at least the members
-.I rm_so
-and
-.IR rm_eo ,
-both of type
-.I regoff_t
-(a signed arithmetic type at least as large as an
-.I off_t
-and a
-.IR ssize_t ),
-containing respectively the offset of the first character of a substring
-and the offset of the first character after the end of the substring.
-Offsets are measured from the beginning of the
-.I string
-argument given to
-.IR regexec .
-An empty substring is denoted by equal offsets,
-both indicating the character following the empty substring.
-.PP
-The 0th member of the
-.I pmatch
-array is filled in to indicate what substring of
-.I string
-was matched by the entire RE.
-Remaining members report what substring was matched by parenthesized
-subexpressions within the RE;
-member
-.I i
-reports subexpression
-.IR i ,
-with subexpressions counted (starting at 1) by the order of their opening
-parentheses in the RE, left to right.
-Unused entries in the array\(emcorresponding either to subexpressions that
-did not participate in the match at all, or to subexpressions that do not
-exist in the RE (that is, \fIi\fR\ > \fIpreg\fR\->\fIre_nsub\fR)\(emhave both
-.I rm_so
-and
-.I rm_eo
-set to \-1.
-If a subexpression participated in the match several times,
-the reported substring is the last one it matched.
-(Note, as an example in particular, that when the RE `(b*)+' matches `bbb',
-the parenthesized subexpression matches each of the three `b's and then
-an infinite number of empty strings following the last `b',
-so the reported substring is one of the empties.)
-.PP
-If REG_STARTEND is specified,
-.I pmatch
-must point to at least one
-.I regmatch_t
-(even if
-.I nmatch
-is 0 or REG_NOSUB was specified),
-to hold the input offsets for REG_STARTEND.
-Use for output is still entirely controlled by
-.IR nmatch ;
-if
-.I nmatch
-is 0 or REG_NOSUB was specified,
-the value of
-.IR pmatch [0]
-will not be changed by a successful
-.IR regexec .
-.PP
-.I Regerror
-maps a non-zero
-.I errcode
-from either
-.I regcomp
-or
-.I regexec
-to a human-readable, printable message.
-If
-.I preg
-is non-NULL,
-the error code should have arisen from use of
-the
-.I regex_t
-pointed to by
-.IR preg ,
-and if the error code came from
-.IR regcomp ,
-it should have been the result from the most recent
-.I regcomp
-using that
-.IR regex_t .
-.RI ( Regerror
-may be able to supply a more detailed message using information
-from the
-.IR regex_t .)
-.I Regerror
-places the NUL-terminated message into the buffer pointed to by
-.IR errbuf ,
-limiting the length (including the NUL) to at most
-.I errbuf_size
-bytes.
-If the whole message won't fit,
-as much of it as will fit before the terminating NUL is supplied.
-In any case,
-the returned value is the size of buffer needed to hold the whole
-message (including terminating NUL).
-If
-.I errbuf_size
-is 0,
-.I errbuf
-is ignored but the return value is still correct.
-.PP
-If the
-.I errcode
-given to
-.I regerror
-is first ORed with REG_ITOA,
-the ``message'' that results is the printable name of the error code,
-e.g. ``REG_NOMATCH'',
-rather than an explanation thereof.
-If
-.I errcode
-is REG_ATOI,
-then
-.I preg
-shall be non-NULL and the
-.I re_endp
-member of the structure it points to
-must point to the printable name of an error code;
-in this case, the result in
-.I errbuf
-is the decimal digits of
-the numeric value of the error code
-(0 if the name is not recognized).
-REG_ITOA and REG_ATOI are intended primarily as debugging facilities;
-they are extensions,
-compatible with but not specified by POSIX 1003.2,
-and should be used with
-caution in software intended to be portable to other systems.
-Be warned also that they are considered experimental and changes are possible.
-.PP
-.I Regfree
-frees any dynamically-allocated storage associated with the compiled RE
-pointed to by
-.IR preg .
-The remaining
-.I regex_t
-is no longer a valid compiled RE
-and the effect of supplying it to
-.I regexec
-or
-.I regerror
-is undefined.
-.PP
-None of these functions references global variables except for tables
-of constants;
-all are safe for use from multiple threads if the arguments are safe.
-.SH IMPLEMENTATION CHOICES
-There are a number of decisions that 1003.2 leaves up to the implementor,
-either by explicitly saying ``undefined'' or by virtue of them being
-forbidden by the RE grammar.
-This implementation treats them as follows.
-.PP
-See
-.ZR
-for a discussion of the definition of case-independent matching.
-.PP
-There is no particular limit on the length of REs,
-except insofar as memory is limited.
-Memory usage is approximately linear in RE size, and largely insensitive
-to RE complexity, except for bounded repetitions.
-See BUGS for one short RE using them
-that will run almost any system out of memory.
-.PP
-A backslashed character other than one specifically given a magic meaning
-by 1003.2 (such magic meanings occur only in obsolete [``basic''] REs)
-is taken as an ordinary character.
-.PP
-Any unmatched [ is a REG_EBRACK error.
-.PP
-Equivalence classes cannot begin or end bracket-expression ranges.
-The endpoint of one range cannot begin another.
-.PP
-RE_DUP_MAX, the limit on repetition counts in bounded repetitions, is 255.
-.PP
-A repetition operator (?, *, +, or bounds) cannot follow another
-repetition operator.
-A repetition operator cannot begin an expression or subexpression
-or follow `^' or `|'.
-.PP
-`|' cannot appear first or last in a (sub)expression or after another `|',
-i.e. an operand of `|' cannot be an empty subexpression.
-An empty parenthesized subexpression, `()', is legal and matches an
-empty (sub)string.
-An empty string is not a legal RE.
-.PP
-A `{' followed by a digit is considered the beginning of bounds for a
-bounded repetition, which must then follow the syntax for bounds.
-A `{' \fInot\fR followed by a digit is considered an ordinary character.
-.PP
-`^' and `$' beginning and ending subexpressions in obsolete (``basic'')
-REs are anchors, not ordinary characters.
-.SH SEE ALSO
-grep(1), re_format(7)
-.PP
-POSIX 1003.2, sections 2.8 (Regular Expression Notation)
-and
-B.5 (C Binding for Regular Expression Matching).
-.SH DIAGNOSTICS
-Non-zero error codes from
-.I regcomp
-and
-.I regexec
-include the following:
-.PP
-.nf
-.ta \w'REG_ECOLLATE'u+3n
-REG_NOMATCH	no pattern match found
-REG_BADPAT	invalid regex struct
-REG_ECOLLATE	invalid collating element
-REG_ECTYPE	invalid character class
-REG_EESCAPE	trailing backslash (\e)
-REG_ESUBREG	invalid backreference number
-REG_EBRACK	brackets [ ] not balanced
-REG_EPAREN	parentheses ( ) not balanced
-REG_EBRACE	braces { } not balanced
-REG_BADBR	invalid repetition count(s) in { }
-REG_ERANGE	invalid character range in [ ]
-REG_ESPACE	ran out of memory
-REG_BADRPT	?, *, or + operand invalid
-REG_EMPTY	empty expression or subexpression
-REG_ASSERT	``can't happen''\(emyou found a bug
-REG_INVARG	invalid argument, e.g. negative-length string
-.fi
-.SH HISTORY
-Originally written by Henry Spencer.
-Altered for inclusion in the 4.4BSD distribution.
-.SH BUGS
-This is an alpha release with known defects.
-Please report problems.
-.PP
-There is one known functionality bug.
-The implementation of internationalization is incomplete:
-the locale is always assumed to be the default one of 1003.2,
-and only the collating elements etc. of that locale are available.
-.PP
-The back-reference code is subtle and doubts linger about its correctness
-in complex cases.
-.PP
-.I Regexec
-performance is poor.
-This will improve with later releases.
-.I Nmatch
-exceeding 0 is expensive;
-.I nmatch
-exceeding 1 is worse.
-.I Regexec
-is largely insensitive to RE complexity \fIexcept\fR that back
-references are massively expensive.
-RE length does matter; in particular, there is a strong speed bonus
-for keeping RE length under about 30 characters,
-with most special characters counting roughly double.
-.PP
-.I Regcomp
-implements bounded repetitions by macro expansion,
-which is costly in time and space if counts are large
-or bounded repetitions are nested.
-An RE like, say,
-`((((a{1,100}){1,100}){1,100}){1,100}){1,100}'
-will (eventually) run almost any existing machine out of swap space.
-.PP
-There are suspected problems with response to obscure error conditions.
-Notably,
-certain kinds of internal overflow,
-produced only by truly enormous REs or by multiply nested bounded repetitions,
-are probably not handled well.
-.PP
-Due to a mistake in 1003.2, things like `a)b' are legal REs because `)' is
-a special character only in the presence of a previous unmatched `('.
-This can't be fixed until the spec is fixed.
-.PP
-The standard's definition of back references is vague.
-For example, does
-`a\e(\e(b\e)*\e2\e)*d' match `abbbd'?
-Until the standard is clarified,
-behavior in such cases should not be relied on.
-.PP
-The implementation of word-boundary matching is a bit of a kludge,
-and bugs may lurk in combinations of word-boundary matching and anchoring.
diff --git a/src/backend/regex/regexec.c b/src/backend/regex/regexec.c
index 41ff00fa699..eef01b0bd58 100644
--- a/src/backend/regex/regexec.c
+++ b/src/backend/regex/regexec.c
@@ -1,182 +1,1003 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
+/*
+ * re_*exec and friends - match REs
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * $Header: /cvsroot/pgsql/src/backend/regex/regexec.c,v 1.21 2003/02/05 17:41:33 tgl Exp $
  *
- *		@(#)regexec.c	8.3 (Berkeley) 3/20/94
  */
 
-#include "postgres.h"
+#include "regex/regguts.h"
+
+
+
+/* lazy-DFA representation */
+struct arcp {			/* "pointer" to an outarc */
+	struct sset *ss;
+	color co;
+};
+
+struct sset {			/* state set */
+	unsigned *states;	/* pointer to bitvector */
+	unsigned hash;		/* hash of bitvector */
+#		define	HASH(bv, nw)	(((nw) == 1) ? *(bv) : hash(bv, nw))
+#	define	HIT(h,bv,ss,nw)	((ss)->hash == (h) && ((nw) == 1 || \
+		memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
+	int flags;
+#		define	STARTER		01	/* the initial state set */
+#		define	POSTSTATE	02	/* includes the goal state */
+#		define	LOCKED		04	/* locked in cache */
+#		define	NOPROGRESS	010	/* zero-progress state set */
+	struct arcp ins;	/* chain of inarcs pointing here */
+	chr *lastseen;		/* last entered on arrival here */
+	struct sset **outs;	/* outarc vector indexed by color */
+	struct arcp *inchain;	/* chain-pointer vector for outarcs */
+};
+
+struct dfa {
+	int nssets;		/* size of cache */
+	int nssused;		/* how many entries occupied yet */
+	int nstates;		/* number of states */
+	int ncolors;		/* length of outarc and inchain vectors */
+	int wordsper;		/* length of state-set bitvectors */
+	struct sset *ssets;	/* state-set cache */
+	unsigned *statesarea;	/* bitvector storage */
+	unsigned *work;		/* pointer to work area within statesarea */
+	struct sset **outsarea;	/* outarc-vector storage */
+	struct arcp *incarea;	/* inchain storage */
+	struct cnfa *cnfa;
+	struct colormap *cm;
+	chr *lastpost;		/* location of last cache-flushed success */
+	chr *lastnopr;		/* location of last cache-flushed NOPROGRESS */
+	struct sset *search;	/* replacement-search-pointer memory */
+	int cptsmalloced;	/* were the areas individually malloced? */
+	char *mallocarea;	/* self, or master malloced area, or NULL */
+};
+
+#define	WORK	1		/* number of work bitvectors needed */
+
+/* setup for non-malloc allocation for small cases */
+#define	FEWSTATES	20	/* must be less than UBITS */
+#define	FEWCOLORS	15
+struct smalldfa {
+	struct dfa dfa;
+	struct sset ssets[FEWSTATES*2];
+	unsigned statesarea[FEWSTATES*2 + WORK];
+	struct sset *outsarea[FEWSTATES*2 * FEWCOLORS];
+	struct arcp incarea[FEWSTATES*2 * FEWCOLORS];
+};
+#define	DOMALLOC	((struct smalldfa *)NULL)	/* force malloc */
+
+
+
+/* internal variables, bundled for easy passing around */
+struct vars {
+	regex_t *re;
+	struct guts *g;
+	int eflags;		/* copies of arguments */
+	size_t nmatch;
+	regmatch_t *pmatch;
+	rm_detail_t *details;
+	chr *start;		/* start of string */
+	chr *stop;		/* just past end of string */
+	int err;		/* error code if any (0 none) */
+	regoff_t *mem;		/* memory vector for backtracking */
+	struct smalldfa dfa1;
+	struct smalldfa dfa2;
+};
+
+#define	VISERR(vv)	((vv)->err != 0)	/* have we seen an error yet? */
+#define	ISERR()	VISERR(v)
+#define	VERR(vv,e)	(((vv)->err) ? (vv)->err : ((vv)->err = (e)))
+#define	ERR(e)	VERR(v, e)		/* record an error */
+#define	NOERR()	{if (ISERR()) return v->err;}	/* if error seen, return it */
+#define	OFF(p)	((p) - v->start)
+#define	LOFF(p)	((long)OFF(p))
+
+
 
 /*
- * the outer shell of regexec()
- *
- * This file includes engine.c *twice*, after muchos fiddling with the
- * macros that code uses.  This lets the same code operate on two different
- * representations for state sets.
- */
-#include <limits.h>
-#include <ctype.h>
-#include <assert.h>
-
-#include "regex/regex.h"
-#include "regex/utils.h"
-#include "regex/regex2.h"
-
-static int	nope = 0;			/* for use in asserts; shuts lint up */
-
-/* macros for manipulating states, small version */
-#define states	long
-#define states1 states			/* for later use in regexec() decision */
-#define CLEAR(v)		((v) = 0)
-#define SET0(v, n)		((v) &= ~(1L << (n)))
-#define SET1(v, n)		((v) |= (1L << (n)))
-#define ISSET(v, n)		((v) & (1L << (n)))
-#define ASSIGN(d, s)	((d) = (s))
-#define EQ(a, b)		((a) == (b))
-#define STATEVARS		int dummy		/* dummy version */
-#define STATESETUP(m, n)		/* nothing */
-#define STATETEARDOWN(m)		/* nothing */
-#define SETUP(v)		((v) = 0)
-#define onestate		long
-#define INIT(o, n)		((o) = (1L << (n)))
-#define INC(o)			((o) <<= 1)
-#define ISSTATEIN(v, o) ((v) & (o))
-/* some abbreviations; note that some of these know variable names! */
-/* do "if I'm here, I can also be there" etc without branches */
-#define FWD(dst, src, n)		((dst) |= ((src) & (here)) << (n))
-#define BACK(dst, src, n)		((dst) |= ((src) & (here)) >> (n))
-#define ISSETBACK(v, n) ((v) & (here >> (n)))
-/* function names */
-#define SNAMES					/* engine.c looks after details */
-
-#include "engine.c"
-
-/* now undo things */
-#undef	states
-#undef	CLEAR
-#undef	SET0
-#undef	SET1
-#undef	ISSET
-#undef	ASSIGN
-#undef	EQ
-#undef	STATEVARS
-#undef	STATESETUP
-#undef	STATETEARDOWN
-#undef	SETUP
-#undef	onestate
-#undef	INIT
-#undef	INC
-#undef	ISSTATEIN
-#undef	FWD
-#undef	BACK
-#undef	ISSETBACK
-#undef	SNAMES
-
-/* macros for manipulating states, large version */
-#define states	char *
-#define CLEAR(v)		memset(v, 0, m->g->nstates)
-#define SET0(v, n)		((v)[n] = 0)
-#define SET1(v, n)		((v)[n] = 1)
-#define ISSET(v, n)		((v)[n])
-#define ASSIGN(d, s)	memcpy(d, s, m->g->nstates)
-#define EQ(a, b)		(memcmp(a, b, m->g->nstates) == 0)
-#define STATEVARS		int vn; char *space
-#define STATESETUP(m, nv) \
-do { \
-	(m)->space = malloc((nv)*(m)->g->nstates); \
-	if ((m)->space == NULL) \
-		return(REG_ESPACE); \
-	(m)->vn = 0; \
-} while (0)
-
-#define STATETEARDOWN(m) \
-do { \
-	free((m)->space); \
-} while (0)
-
-#define SETUP(v)		((v) = &m->space[m->vn++ * m->g->nstates])
-#define onestate		int
-#define INIT(o, n)		((o) = (n))
-#define INC(o)			((o)++)
-#define ISSTATEIN(v, o) ((v)[o])
-/* some abbreviations; note that some of these know variable names! */
-/* do "if I'm here, I can also be there" etc without branches */
-#define FWD(dst, src, n)		((dst)[here+(n)] |= (src)[here])
-#define BACK(dst, src, n)		((dst)[here-(n)] |= (src)[here])
-#define ISSETBACK(v, n) ((v)[here - (n)])
-/* function names */
-#define LNAMES					/* flag */
-
-#include "engine.c"
-
-/*
- * regexec - interface for matching
- *
- * We put this here so we can exploit knowledge of the state representation
- * when choosing which matcher to call.
+ * forward declarations
+ */
+/* === regexec.c === */
+static int find (struct vars *, struct cnfa *, struct colormap *);
+static int cfind (struct vars *, struct cnfa *, struct colormap *);
+static int cfindloop (struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
+static void zapsubs (regmatch_t *, size_t);
+static void zapmem (struct vars *, struct subre *);
+static void subset (struct vars *, struct subre *, chr *, chr *);
+static int dissect (struct vars *, struct subre *, chr *, chr *);
+static int condissect (struct vars *, struct subre *, chr *, chr *);
+static int altdissect (struct vars *, struct subre *, chr *, chr *);
+static int cdissect (struct vars *, struct subre *, chr *, chr *);
+static int ccondissect (struct vars *, struct subre *, chr *, chr *);
+static int crevdissect (struct vars *, struct subre *, chr *, chr *);
+static int cbrdissect (struct vars *, struct subre *, chr *, chr *);
+static int caltdissect (struct vars *, struct subre *, chr *, chr *);
+/* === rege_dfa.c === */
+static chr *longest (struct vars *, struct dfa *, chr *, chr *, int *);
+static chr *shortest (struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
+static chr *lastcold (struct vars *, struct dfa *);
+static struct dfa *newdfa (struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
+static void freedfa (struct dfa *);
+static unsigned hash (unsigned *, int);
+static struct sset *initialize (struct vars *, struct dfa *, chr *);
+static struct sset *miss (struct vars *, struct dfa *, struct sset *, pcolor, chr *, chr *);
+static int lacon (struct vars *, struct cnfa *, chr *, pcolor);
+static struct sset *getvacant (struct vars *, struct dfa *, chr *, chr *);
+static struct sset *pickss (struct vars *, struct dfa *, chr *, chr *);
+
+
+/*
+ * pg_regexec - match the compiled RE "re" against the len chrs at "string"
+ */
+int			/* REG_OKAY, REG_NOMATCH, or another REG_* code */
+pg_regexec(regex_t *re,
+		   const chr *string,
+		   size_t len,
+		   rm_detail_t *details,	/* must be non-NULL if RE has REG_EXPECT */
+		   size_t nmatch,
+		   regmatch_t pmatch[],
+		   int flags)
+{
+	struct vars var;
+	register struct vars *v = &var;
+	int st;
+	size_t n;
+	int backref;
+#	define	LOCALMAT	20
+	regmatch_t mat[LOCALMAT];
+#	define	LOCALMEM	40
+	regoff_t mem[LOCALMEM];
+
+	/* sanity checks: NULL arguments, missing REMAGIC, chr-size mismatch */
+	if (re == NULL || string == NULL || re->re_magic != REMAGIC)
+		return REG_INVARG;
+	if (re->re_csize != sizeof(chr))
+		return REG_MIXED;
+
+	/* setup: fill in the vars bundle that the helpers receive as "v" */
+	v->re = re;
+	v->g = (struct guts *)re->re_guts;
+	if ((v->g->cflags&REG_EXPECT) && details == NULL)
+		return REG_INVARG;
+	if (v->g->info&REG_UIMPOSSIBLE)
+		return REG_NOMATCH;
+	backref = (v->g->info&REG_UBACKREF) ? 1 : 0;
+	v->eflags = flags;
+	if (v->g->cflags&REG_NOSUB)
+		nmatch = 0;		/* override client */
+	v->nmatch = nmatch;
+	if (backref) {
+		/* need work area: all nsub+1 slots, whatever the caller's nmatch */
+		if (v->g->nsub + 1 <= LOCALMAT)
+			v->pmatch = mat;
+		else
+			v->pmatch = (regmatch_t *)MALLOC((v->g->nsub + 1) *
+							sizeof(regmatch_t));
+		if (v->pmatch == NULL)
+			return REG_ESPACE;
+		v->nmatch = v->g->nsub + 1;
+	} else
+		v->pmatch = pmatch;
+	v->details = details;
+	v->start = (chr *)string;
+	v->stop = (chr *)string + len;
+	v->err = 0;
+	if (backref) {
+		/* need retry memory: ntree slots, indexed by subre->retry */
+		assert(v->g->ntree >= 0);
+		n = (size_t)v->g->ntree;
+		if (n <= LOCALMEM)
+			v->mem = mem;
+		else
+			v->mem = (regoff_t *)MALLOC(n*sizeof(regoff_t));
+		if (v->mem == NULL) {
+			if (v->pmatch != pmatch && v->pmatch != mat)
+				FREE(v->pmatch);
+			return REG_ESPACE;
+		}
+	} else
+		v->mem = NULL;
+
+	/* do it: cfind() when the RE uses backrefs, else the simpler find() */
+	assert(v->g->tree != NULL);
+	if (backref)
+		st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
+	else
+		st = find(v, &v->g->tree->cnfa, &v->g->cmap);
+
+	/* copy (portion of) match vector over if necessary */
+	if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0) {
+		zapsubs(pmatch, nmatch);
+		n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
+		memcpy(VS(pmatch), VS(v->pmatch), n*sizeof(regmatch_t));
+	}
+
+	/* clean up: free only what was MALLOCed above, never mat[] or mem[] */
+	if (v->pmatch != pmatch && v->pmatch != mat)
+		FREE(v->pmatch);
+	if (v->mem != NULL && v->mem != mem)
+		FREE(v->mem);
+	return st;
+}
+
+/*
+ * find - find a match for the main NFA (no-complications case: no backrefs)
+ */
+static int			/* regexec return code */
+find(struct vars *v,
+	 struct cnfa *cnfa,
+	 struct colormap *cm)
+{
+	struct dfa *s;		/* DFA built from the search RE, v->g->search */
+	struct dfa *d;		/* DFA built from cnfa, the main RE */
+	chr *begin;
+	chr *end = NULL;
+	chr *cold;
+	chr *open;		/* open and close of range of possible starts */
+	chr *close;
+	int hitend;
+	int shorter = (v->g->tree->flags&SHORTER) ? 1 : 0;
+
+	/* first, a shot with the search RE */
+	s = newdfa(v, &v->g->search, cm, &v->dfa1);
+	assert(!(ISERR() && s != NULL));
+	NOERR();
+	MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
+	cold = NULL;
+	close = shortest(v, s, v->start, v->start, v->stop, &cold, (int *)NULL);
+	freedfa(s);		/* released before the error check, so no leak */
+	NOERR();
+	if (v->g->cflags&REG_EXPECT) {
+		assert(v->details != NULL);
+		if (cold != NULL)
+			v->details->rm_extend.rm_so = OFF(cold);
+		else
+			v->details->rm_extend.rm_so = OFF(v->stop);
+		v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
+	}
+	if (close == NULL)		/* not found */
+		return REG_NOMATCH;
+	if (v->nmatch == 0)		/* found, don't need exact location */
+		return REG_OKAY;
+
+	/* find starting point and match */
+	assert(cold != NULL);
+	open = cold;
+	cold = NULL;
+	MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
+	d = newdfa(v, cnfa, cm, &v->dfa1);
+	assert(!(ISERR() && d != NULL));
+	NOERR();
+	for (begin = open; begin <= close; begin++) {
+		MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
+		if (shorter)
+			end = shortest(v, d, begin, begin, v->stop,
+							(chr **)NULL, &hitend);
+		else
+			end = longest(v, d, begin, v->stop, &hitend);
+		if (ISERR()) { freedfa(d); return v->err; }	/* don't leak d */
+		if (hitend && cold == NULL)
+			cold = begin;
+		if (end != NULL)
+			break;		/* NOTE BREAK OUT */
+	}
+	assert(end != NULL);		/* search RE succeeded so loop should */
+	freedfa(d);
+
+	/* and pin down details */
+	assert(v->nmatch > 0);
+	v->pmatch[0].rm_so = OFF(begin);
+	v->pmatch[0].rm_eo = OFF(end);
+	if (v->g->cflags&REG_EXPECT) {
+		if (cold != NULL)
+			v->details->rm_extend.rm_so = OFF(cold);
+		else
+			v->details->rm_extend.rm_so = OFF(v->stop);
+		v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
+	}
+	if (v->nmatch == 1)		/* no need for submatches */
+		return REG_OKAY;
+
+	/* submatches */
+	zapsubs(v->pmatch, v->nmatch);
+	return dissect(v, v->g->tree, begin, end);
+}
+
+/*
+ * cfind - find a match for the main NFA (with complications, i.e. backrefs)
+ */
+static int			/* regexec return code */
+cfind(struct vars *v,
+	  struct cnfa *cnfa,
+	  struct colormap *cm)
+{
+	struct dfa *s;		/* DFA built from the search RE, v->g->search */
+	struct dfa *d;		/* DFA built from cnfa, the main RE */
+	chr *cold;		/* set by cfindloop() on its non-error returns */
+	int ret;
+
+	s = newdfa(v, &v->g->search, cm, &v->dfa1);
+	NOERR();
+	d = newdfa(v, cnfa, cm, &v->dfa2);
+	if (ISERR()) {
+		assert(d == NULL);
+		freedfa(s);
+		return v->err;
+	}
+
+	ret = cfindloop(v, cnfa, cm, d, s, &cold);
+
+	freedfa(d);
+	freedfa(s);
+	NOERR();		/* an error recorded in v->err overrides ret */
+	if (v->g->cflags&REG_EXPECT) {
+		assert(v->details != NULL);
+		if (cold != NULL)
+			v->details->rm_extend.rm_so = OFF(cold);
+		else
+			v->details->rm_extend.rm_so = OFF(v->stop);
+		v->details->rm_extend.rm_eo = OFF(v->stop);	/* unknown */
+	}
+	return ret;
+}
+
+/*
+ * cfindloop - the heart of cfind: search, try each start, dissect each end
+ */
+static int			/* regexec return code */
+cfindloop(struct vars *v,
+		  struct cnfa *cnfa,
+		  struct colormap *cm,
+		  struct dfa *d,			/* DFA for the main RE */
+		  struct dfa *s,			/* DFA for the search RE */
+		  chr **coldp)			/* where to put coldstart pointer */
+{
+	chr *begin;
+	chr *end;
+	chr *cold;
+	chr *open;		/* open and close of range of possible starts */
+	chr *close;
+	chr *estart;		/* narrowed upward after each failed shortest() end */
+	chr *estop;		/* narrowed downward after each failed longest() end */
+	int er;
+	int shorter = v->g->tree->flags&SHORTER;
+	int hitend;
+
+	assert(d != NULL && s != NULL);
+	cold = NULL;
+	close = v->start;
+	do {
+		MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
+		close = shortest(v, s, close, close, v->stop, &cold, (int *)NULL);
+		if (close == NULL)
+			break;				/* NOTE BREAK */
+		assert(cold != NULL);
+		open = cold;
+		cold = NULL;
+		MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
+		for (begin = open; begin <= close; begin++) {
+			MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
+			estart = begin;
+			estop = v->stop;
+			for (;;) {
+				if (shorter)
+					end = shortest(v, d, begin, estart,
+						estop, (chr **)NULL, &hitend);
+				else
+					end = longest(v, d, begin, estop,
+								&hitend);
+				if (hitend && cold == NULL)
+					cold = begin;
+				if (end == NULL)
+					break;		/* NOTE BREAK OUT */
+				MDEBUG(("tentative end %ld\n", LOFF(end)));
+				zapsubs(v->pmatch, v->nmatch);
+				zapmem(v, v->g->tree);
+				er = cdissect(v, v->g->tree, begin, end);
+				if (er == REG_OKAY) {
+					if (v->nmatch > 0) {
+						v->pmatch[0].rm_so = OFF(begin);
+						v->pmatch[0].rm_eo = OFF(end);
+					}
+					*coldp = cold;
+					return REG_OKAY;
+				}
+				if (er != REG_NOMATCH) {
+					ERR(er);	/* *coldp stays unset; cfind() returns via NOERR() first */
+					return er;
+				}
+				if ((shorter) ? end == estop : end == begin) {
+					/* no point in trying again -- NOTE(review): this also skips all later begin values; confirm intended */
+					*coldp = cold;
+					return REG_NOMATCH;
+				}
+				/* go around and try again with the next longer/shorter end */
+				if (shorter)
+					estart = end + 1;
+				else
+					estop = end - 1;
+			}
+		}
+	} while (close < v->stop);
+
+	*coldp = cold;
+	return REG_NOMATCH;
+}
+
+/*
+ * zapsubs - set entries 1..n-1 of p[] to "no match" (-1); p[0] is left alone
+ */
+static void
+zapsubs(regmatch_t *p,
+		size_t n)		/* number of entries in p[]; 0 and 1 are no-ops */
+{
+	size_t i;
+
+	for (i = 1; i < n; i++) {	/* counting up: n-1 would wrap if n == 0 */
+		p[i].rm_so = -1;
+		p[i].rm_eo = -1;
+	}
+}
+
+/*
+ * zapmem - zero a subtree's retry memory and mark its captures "no match"
+ */
+static void
+zapmem(struct vars *v,
+	   struct subre *t)		/* root of the subtree; NULL is a no-op */
+{
+	if (t == NULL)
+		return;
+
+	assert(v->mem != NULL);
+	v->mem[t->retry] = 0;		/* 0 is the "clean slate" value */
+	if (t->op == '(') {
+		assert(t->subno > 0);
+		v->pmatch[t->subno].rm_so = -1;	/* forget this capture's span */
+		v->pmatch[t->subno].rm_eo = -1;
+	}
+
+	if (t->left != NULL)
+		zapmem(v, t->left);
+	if (t->right != NULL)
+		zapmem(v, t->right);
+}
+
+/*
+ * subset - record [begin,end) in v->pmatch[] as the span of capture sub->subno
+ */
+static void
+subset(struct vars *v,
+	   struct subre *sub,
+	   chr *begin,
+	   chr *end)
+{
+	int n = sub->subno;
+
+	assert(n > 0);
+	if ((size_t)n >= v->nmatch)	/* no slot for this one in pmatch[] */
+		return;
+
+	MDEBUG(("setting %d\n", n));
+	v->pmatch[n].rm_so = OFF(begin);	/* offsets are relative to v->start */
+	v->pmatch[n].rm_eo = OFF(end);
+}
+
+/*
+ * dissect - determine subexpression matches (uncomplicated case), by t->op
+ */
+static int			/* regexec return code */
+dissect(struct vars *v,
+		struct subre *t,
+		chr *begin,			/* beginning of relevant substring */
+		chr *end)			/* end of same */
+{
+	assert(t != NULL);
+	MDEBUG(("dissect %ld-%ld\n", LOFF(begin), LOFF(end)));
+
+	switch (t->op) {
+	case '=':		/* terminal node */
+		assert(t->left == NULL && t->right == NULL);
+		return REG_OKAY;	/* no action, parent did the work */
+		break;
+	case '|':		/* alternation */
+		assert(t->left != NULL);
+		return altdissect(v, t, begin, end);
+		break;
+	case 'b':		/* back ref -- shouldn't be calling us! */
+		return REG_ASSERT;
+		break;
+	case '.':		/* concatenation */
+		assert(t->left != NULL && t->right != NULL);
+		return condissect(v, t, begin, end);
+		break;
+	case '(':		/* capturing */
+		assert(t->left != NULL && t->right == NULL);
+		assert(t->subno > 0);
+		subset(v, t, begin, end);	/* record the span, then descend */
+		return dissect(v, t->left, begin, end);
+		break;
+	default:		/* unrecognized node type */
+		return REG_ASSERT;
+		break;
+	}
+}
+
+/*
+ * condissect - determine concatenation subexpression matches (uncomplicated)
  */
-int								/* 0 success, REG_NOMATCH failure */
-pg_regexec(const regex_t *preg, const char *string, size_t nmatch,
-		   regmatch_t *pmatch, int eflags)
+static int			/* regexec return code */
+condissect(struct vars *v,
+		   struct subre *t,
+		   chr *begin,			/* beginning of relevant substring */
+		   chr *end)			/* end of same */
 {
-	struct re_guts *g = preg->re_g;
-
-	pg_wchar   *str;
-	int			sts;
-
-#ifdef REDEBUG
-#define  GOODFLAGS(f)	 (f)
-#else
-#define  GOODFLAGS(f)	 ((f)&(REG_NOTBOL|REG_NOTEOL|REG_STARTEND))
-#endif
-
-	if (preg->re_magic != MAGIC1 || g->magic != MAGIC2)
-		return REG_BADPAT;
-	assert(!(g->iflags & BAD));
-	if (g->iflags & BAD)		/* backstop for no-debug case */
-		return REG_BADPAT;
-	eflags = GOODFLAGS(eflags);
-
-	str = (pg_wchar *) malloc((strlen(string) + 1) * sizeof(pg_wchar));
-	if (!str)
-		return (REG_ESPACE);
-	(void) pg_mb2wchar((unsigned char *) string, str);
-	if (g->nstates <= CHAR_BIT * sizeof(states1) && !(eflags & REG_LARGE))
-		sts = smatcher(g, str, nmatch, pmatch, eflags);
+	struct dfa *d;		/* DFA for the left operand */
+	struct dfa *d2;		/* DFA for the right operand */
+	chr *mid;		/* trial split point between left and right */
+	int i;
+	int shorter = (t->left->flags&SHORTER) ? 1 : 0;
+	chr *stop = (shorter) ? end : begin;	/* last midpoint worth trying */
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+
+	d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
+	NOERR();
+	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, &v->dfa2);
+	if (ISERR()) {
+		assert(d2 == NULL);
+		freedfa(d);
+		return v->err;
+	}
+
+	/* pick a tentative midpoint */
+	if (shorter)
+		mid = shortest(v, d, begin, begin, end, (chr **)NULL,
+								(int *)NULL);
 	else
-		sts = lmatcher(g, str, nmatch, pmatch, eflags);
-	free((char *) str);
-	return (sts);
+		mid = longest(v, d, begin, end, (int *)NULL);
+	if (mid == NULL) {
+		freedfa(d);
+		freedfa(d2);
+		return REG_ASSERT;
+	}
+	MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+
+	/* iterate until satisfaction or failure */
+	while (longest(v, d2, mid, end, (int *)NULL) != end) {
+		/* that midpoint didn't work, find a new one */
+		if (mid == stop) {
+			/* all possibilities exhausted! */
+			MDEBUG(("no midpoint!\n"));
+			freedfa(d);
+			freedfa(d2);
+			return REG_ASSERT;
+		}
+		if (shorter)
+			mid = shortest(v, d, begin, mid+1, end, (chr **)NULL,
+								(int *)NULL);
+		else
+			mid = longest(v, d, begin, mid-1, (int *)NULL);
+		if (mid == NULL) {
+			/* failed to find a new one! */
+			MDEBUG(("failed midpoint!\n"));
+			freedfa(d);
+			freedfa(d2);
+			return REG_ASSERT;
+		}
+		MDEBUG(("new midpoint %ld\n", LOFF(mid)));
+	}
+
+	/* satisfaction: DFAs are freed before recursing into the operands */
+	MDEBUG(("successful\n"));
+	freedfa(d);
+	freedfa(d2);
+	i = dissect(v, t->left, begin, mid);
+	if (i != REG_OKAY)
+		return i;
+	return dissect(v, t->right, mid, end);
+}
+
+/*
+ * altdissect - determine alternative subexpression matches (uncomplicated)
+ */
+static int			/* regexec return code */
+altdissect(struct vars *v,
+		   struct subre *t,
+		   chr *begin,			/* beginning of relevant substring */
+		   chr *end)			/* end of same */
+{
+	struct dfa *d;
+	int i;
+
+	assert(t != NULL);
+	assert(t->op == '|');
+
+	for (i = 0; t != NULL; t = t->right, i++) {
+		MDEBUG(("trying %dth\n", i));
+		assert(t->left != NULL && t->left->cnfa.nstates > 0);
+		d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
+		if (ISERR())
+			return v->err;
+		if (longest(v, d, begin, end, (int *)NULL) == end) {
+			MDEBUG(("success\n"));
+			freedfa(d);
+			return dissect(v, t->left, begin, end);
+		}
+		freedfa(d);
+	}
+	return REG_ASSERT;	/* none of them matched?!? */
+}
+
+/*
+ * cdissect - determine subexpression matches (with complications)
+ * The retry memory stores the offset of the trial midpoint from begin, 
+ * plus 1 so that 0 uniquely means "clean slate".
+ */
+static int			/* regexec return code */
+cdissect(struct vars *v,
+		 struct subre *t,
+		 chr *begin,			/* beginning of relevant substring */
+		 chr *end)			/* end of same */
+{
+	int er;
+
+	assert(t != NULL);
+	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
+
+	switch (t->op) {
+	case '=':		/* terminal node */
+		assert(t->left == NULL && t->right == NULL);
+		return REG_OKAY;	/* no action, parent did the work */
+		break;
+	case '|':		/* alternation */
+		assert(t->left != NULL);
+		return caltdissect(v, t, begin, end);
+		break;
+	case 'b':		/* back ref -- shouldn't be calling us! */
+		assert(t->left == NULL && t->right == NULL);
+		return cbrdissect(v, t, begin, end);
+		break;
+	case '.':		/* concatenation */
+		assert(t->left != NULL && t->right != NULL);
+		return ccondissect(v, t, begin, end);
+		break;
+	case '(':		/* capturing */
+		assert(t->left != NULL && t->right == NULL);
+		assert(t->subno > 0);
+		er = cdissect(v, t->left, begin, end);
+		if (er == REG_OKAY)
+			subset(v, t, begin, end);
+		return er;
+		break;
+	default:
+		return REG_ASSERT;
+		break;
+	}
+}
+
+/*
+ * ccondissect - concatenation subexpression matches (with complications)
+ * The retry memory stores the offset of the trial midpoint from begin, 
+ * plus 1 so that 0 uniquely means "clean slate".
+ */
+static int			/* regexec return code */
+ccondissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;
+	struct dfa *d2;
+	chr *mid;
+	int er;
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+
+	if (t->left->flags&SHORTER)		/* reverse scan */
+		return crevdissect(v, t, begin, end);
+
+	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR())
+		return v->err;
+	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR()) {
+		freedfa(d);
+		return v->err;
+	}
+	MDEBUG(("cconcat %d\n", t->retry));
+
+	/* pick a tentative midpoint */
+	if (v->mem[t->retry] == 0) {
+		mid = longest(v, d, begin, end, (int *)NULL);
+		if (mid == NULL) {
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+	} else {
+		mid = begin + (v->mem[t->retry] - 1);
+		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
+	}
+
+	/* iterate until satisfaction or failure */
+	for (;;) {
+		/* try this midpoint on for size */
+		er = cdissect(v, t->left, begin, mid);
+		if (er == REG_OKAY &&
+				longest(v, d2, mid, end, (int *)NULL) == end &&
+				(er = cdissect(v, t->right, mid, end)) == 
+								REG_OKAY)
+			break;			/* NOTE BREAK OUT */
+		if (er != REG_OKAY && er != REG_NOMATCH) {
+			freedfa(d);
+			freedfa(d2);
+			return er;
+		}
+
+		/* that midpoint didn't work, find a new one */
+		if (mid == begin) {
+			/* all possibilities exhausted */
+			MDEBUG(("%d no midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		mid = longest(v, d, begin, mid-1, (int *)NULL);
+		if (mid == NULL) {
+			/* failed to find a new one */
+			MDEBUG(("%d failed midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+		zapmem(v, t->left);
+		zapmem(v, t->right);
+	}
+
+	/* satisfaction */
+	MDEBUG(("successful\n"));
+	freedfa(d);
+	freedfa(d2);
+	return REG_OKAY;
 }
+
+/*
+ * crevdissect - determine backref shortest-first subexpression matches
+ * The retry memory stores the offset of the trial midpoint from begin, 
+ * plus 1 so that 0 uniquely means "clean slate".
+ */
+static int			/* regexec return code */
+crevdissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;
+	struct dfa *d2;
+	chr *mid;
+	int er;
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+	assert(t->left->flags&SHORTER);
+
+	/* concatenation -- need to split the substring between parts */
+	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR())
+		return v->err;
+	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR()) {
+		freedfa(d);
+		return v->err;
+	}
+	MDEBUG(("crev %d\n", t->retry));
+
+	/* pick a tentative midpoint */
+	if (v->mem[t->retry] == 0) {
+		mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL);
+		if (mid == NULL) {
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+	} else {
+		mid = begin + (v->mem[t->retry] - 1);
+		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
+	}
+
+	/* iterate until satisfaction or failure */
+	for (;;) {
+		/* try this midpoint on for size */
+		er = cdissect(v, t->left, begin, mid);
+		if (er == REG_OKAY &&
+				longest(v, d2, mid, end, (int *)NULL) == end &&
+				(er = cdissect(v, t->right, mid, end)) == 
+								REG_OKAY)
+			break;			/* NOTE BREAK OUT */
+		if (er != REG_OKAY && er != REG_NOMATCH) {
+			freedfa(d);
+			freedfa(d2);
+			return er;
+		}
+
+		/* that midpoint didn't work, find a new one */
+		if (mid == end) {
+			/* all possibilities exhausted */
+			MDEBUG(("%d no midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL);
+		if (mid == NULL) {
+			/* failed to find a new one */
+			MDEBUG(("%d failed midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+		zapmem(v, t->left);
+		zapmem(v, t->right);
+	}
+
+	/* satisfaction */
+	MDEBUG(("successful\n"));
+	freedfa(d);
+	freedfa(d2);
+	return REG_OKAY;
+}
+
+/*
+ * cbrdissect - determine backref subexpression matches
+ */
+static int			/* regexec return code */
+cbrdissect(struct vars *v,
+		   struct subre *t,
+		   chr *begin,			/* beginning of relevant substring */
+		   chr *end)			/* end of same */
+{
+	int i;
+	int n = t->subno;
+	size_t len;
+	chr *paren;
+	chr *p;
+	chr *stop;
+	int min = t->min;
+	int max = t->max;
+
+	assert(t != NULL);
+	assert(t->op == 'b');
+	assert(n >= 0);
+	assert((size_t)n < v->nmatch);
+
+	MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
+
+	if (v->pmatch[n].rm_so == -1)
+		return REG_NOMATCH;
+	paren = v->start + v->pmatch[n].rm_so;
+	len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
+
+	/* no room to maneuver -- retries are pointless */
+	if (v->mem[t->retry])
+		return REG_NOMATCH;
+	v->mem[t->retry] = 1;
+
+	/* special-case zero-length string */
+	if (len == 0) {
+		if (begin == end)
+			return REG_OKAY;
+		return REG_NOMATCH;
+	}
+
+	/* and too-short string */
+	assert(end >= begin);
+	if ((size_t)(end - begin) < len)
+		return REG_NOMATCH;
+	stop = end - len;
+
+	/* count occurrences */
+	i = 0;
+	for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
+		if ((*v->g->compare)(paren, p, len) != 0)
+				break;
+		i++;
+	}
+	MDEBUG(("cbackref found %d\n", i));
+
+	/* and sort it out */
+	if (p != end)			/* didn't consume all of it */
+		return REG_NOMATCH;
+	if (min <= i && (i <= max || max == INFINITY))
+		return REG_OKAY;
+	return REG_NOMATCH;		/* out of range */
+}
+
+/*
+ * caltdissect - determine alternative subexpression matches (w. complications)
+ */
+static int			/* regexec return code */
+caltdissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;
+	int er;
+#	define	UNTRIED	0	/* not yet tried at all */
+#	define	TRYING	1	/* top matched, trying submatches */
+#	define	TRIED	2	/* top didn't match or submatches exhausted */
+
+	if (t == NULL)
+		return REG_NOMATCH;
+	assert(t->op == '|');
+	if (v->mem[t->retry] == TRIED)
+		return caltdissect(v, t->right, begin, end);
+
+	MDEBUG(("calt n%d\n", t->retry));
+	assert(t->left != NULL);
+
+	if (v->mem[t->retry] == UNTRIED) {
+		d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+		if (ISERR())
+			return v->err;
+		if (longest(v, d, begin, end, (int *)NULL) != end) {
+			freedfa(d);
+			v->mem[t->retry] = TRIED;
+			return caltdissect(v, t->right, begin, end);
+		}
+		freedfa(d);
+		MDEBUG(("calt matched\n"));
+		v->mem[t->retry] = TRYING;
+	}
+
+	er = cdissect(v, t->left, begin, end);
+	if (er != REG_NOMATCH)
+		return er;
+
+	v->mem[t->retry] = TRIED;
+	return caltdissect(v, t->right, begin, end);
+}
+
+
+
+#include "rege_dfa.c"
diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c
index e46ea98e29f..88f3da32287 100644
--- a/src/backend/regex/regfree.c
+++ b/src/backend/regex/regfree.c
@@ -1,75 +1,54 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
+/*
+ * regfree - free an RE
  *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
+
+/*
+ * altdissect - determine alternative subexpression matches (uncomplicated)
+ */
+static int			/* regexec return code */
+altdissect(struct vars *v,
+		   struct subre *t,
+		   chr *begin,			/* beginning of relevant substring */
+		   chr *end)			/* end of same */
+{
+	struct dfa *d;
+	int i;
+
+	assert(t != NULL);
+	assert(t->op == '|');
+
+	for (i = 0; t != NULL; t = t->right, i++) {
+		MDEBUG(("trying %dth\n", i));
+		assert(t->left != NULL && t->left->cnfa.nstates > 0);
+		d = newdfa(v, &t->left->cnfa, &v->g->cmap, &v->dfa1);
+		if (ISERR())
+			return v->err;
+		if (longest(v, d, begin, end, (int *)NULL) == end) {
+			MDEBUG(("success\n"));
+			freedfa(d);
+			return dissect(v, t->left, begin, end);
+		}
+		freedfa(d);
+	}
+	return REG_ASSERT;	/* none of them matched?!? */
+}
+
+/*
+ * cdissect - determine subexpression matches (with complications)
+ * The retry memory stores the offset of the trial midpoint from begin, 
+ * plus 1 so that 0 uniquely means "clean slate".
+ */
+static int			/* regexec return code */
+cdissect(struct vars *v,
+		 struct subre *t,
+		 chr *begin,			/* beginning of relevant substring */
+		 chr *end)			/* end of same */
+{
+	int er;
+
+	assert(t != NULL);
+	MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
+
+	switch (t->op) {
+	case '=':		/* terminal node */
+		assert(t->left == NULL && t->right == NULL);
+		return REG_OKAY;	/* no action, parent did the work */
+		break;
+	case '|':		/* alternation */
+		assert(t->left != NULL);
+		return caltdissect(v, t, begin, end);
+		break;
+	case 'b':		/* back ref -- shouldn't be calling us! */
+		assert(t->left == NULL && t->right == NULL);
+		return cbrdissect(v, t, begin, end);
+		break;
+	case '.':		/* concatenation */
+		assert(t->left != NULL && t->right != NULL);
+		return ccondissect(v, t, begin, end);
+		break;
+	case '(':		/* capturing */
+		assert(t->left != NULL && t->right == NULL);
+		assert(t->subno > 0);
+		er = cdissect(v, t->left, begin, end);
+		if (er == REG_OKAY)
+			subset(v, t, begin, end);
+		return er;
+		break;
+	default:
+		return REG_ASSERT;
+		break;
+	}
+}
+
+/*
+ * ccondissect - concatenation subexpression matches (with complications)
+ * The retry memory stores the offset of the trial midpoint from begin, 
+ * plus 1 so that 0 uniquely means "clean slate".
+ */
+static int			/* regexec return code */
+ccondissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;
+	struct dfa *d2;
+	chr *mid;
+	int er;
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+
+	if (t->left->flags&SHORTER)		/* reverse scan */
+		return crevdissect(v, t, begin, end);
+
+	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR())
+		return v->err;
+	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR()) {
+		freedfa(d);
+		return v->err;
+	}
+	MDEBUG(("cconcat %d\n", t->retry));
+
+	/* pick a tentative midpoint */
+	if (v->mem[t->retry] == 0) {
+		mid = longest(v, d, begin, end, (int *)NULL);
+		if (mid == NULL) {
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+	} else {
+		mid = begin + (v->mem[t->retry] - 1);
+		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
+	}
+
+	/* iterate until satisfaction or failure */
+	for (;;) {
+		/* try this midpoint on for size */
+		er = cdissect(v, t->left, begin, mid);
+		if (er == REG_OKAY &&
+				longest(v, d2, mid, end, (int *)NULL) == end &&
+				(er = cdissect(v, t->right, mid, end)) == 
+								REG_OKAY)
+			break;			/* NOTE BREAK OUT */
+		if (er != REG_OKAY && er != REG_NOMATCH) {
+			freedfa(d);
+			freedfa(d2);
+			return er;
+		}
+
+		/* that midpoint didn't work, find a new one */
+		if (mid == begin) {
+			/* all possibilities exhausted */
+			MDEBUG(("%d no midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		mid = longest(v, d, begin, mid-1, (int *)NULL);
+		if (mid == NULL) {
+			/* failed to find a new one */
+			MDEBUG(("%d failed midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+		zapmem(v, t->left);
+		zapmem(v, t->right);
+	}
+
+	/* satisfaction */
+	MDEBUG(("successful\n"));
+	freedfa(d);
+	freedfa(d2);
+	return REG_OKAY;
 }
+
+/*
+ * crevdissect - determine backref shortest-first subexpression matches
+ * The retry memory stores the offset of the trial midpoint from begin, 
+ * plus 1 so that 0 uniquely means "clean slate".
+ */
+static int			/* regexec return code */
+crevdissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;
+	struct dfa *d2;
+	chr *mid;
+	int er;
+
+	assert(t->op == '.');
+	assert(t->left != NULL && t->left->cnfa.nstates > 0);
+	assert(t->right != NULL && t->right->cnfa.nstates > 0);
+	assert(t->left->flags&SHORTER);
+
+	/* concatenation -- need to split the substring between parts */
+	d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR())
+		return v->err;
+	d2 = newdfa(v, &t->right->cnfa, &v->g->cmap, DOMALLOC);
+	if (ISERR()) {
+		freedfa(d);
+		return v->err;
+	}
+	MDEBUG(("crev %d\n", t->retry));
+
+	/* pick a tentative midpoint */
+	if (v->mem[t->retry] == 0) {
+		mid = shortest(v, d, begin, begin, end, (chr **)NULL, (int *)NULL);
+		if (mid == NULL) {
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+	} else {
+		mid = begin + (v->mem[t->retry] - 1);
+		MDEBUG(("working midpoint %ld\n", LOFF(mid)));
+	}
+
+	/* iterate until satisfaction or failure */
+	for (;;) {
+		/* try this midpoint on for size */
+		er = cdissect(v, t->left, begin, mid);
+		if (er == REG_OKAY &&
+				longest(v, d2, mid, end, (int *)NULL) == end &&
+				(er = cdissect(v, t->right, mid, end)) == 
+								REG_OKAY)
+			break;			/* NOTE BREAK OUT */
+		if (er != REG_OKAY && er != REG_NOMATCH) {
+			freedfa(d);
+			freedfa(d2);
+			return er;
+		}
+
+		/* that midpoint didn't work, find a new one */
+		if (mid == end) {
+			/* all possibilities exhausted */
+			MDEBUG(("%d no midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		mid = shortest(v, d, begin, mid+1, end, (chr **)NULL, (int *)NULL);
+		if (mid == NULL) {
+			/* failed to find a new one */
+			MDEBUG(("%d failed midpoint\n", t->retry));
+			freedfa(d);
+			freedfa(d2);
+			return REG_NOMATCH;
+		}
+		MDEBUG(("%d: new midpoint %ld\n", t->retry, LOFF(mid)));
+		v->mem[t->retry] = (mid - begin) + 1;
+		zapmem(v, t->left);
+		zapmem(v, t->right);
+	}
+
+	/* satisfaction */
+	MDEBUG(("successful\n"));
+	freedfa(d);
+	freedfa(d2);
+	return REG_OKAY;
+}
+
+/*
+ * cbrdissect - determine backref subexpression matches
+ */
+static int			/* regexec return code */
+cbrdissect(struct vars *v,
+		   struct subre *t,
+		   chr *begin,			/* beginning of relevant substring */
+		   chr *end)			/* end of same */
+{
+	int i;
+	int n = t->subno;
+	size_t len;
+	chr *paren;
+	chr *p;
+	chr *stop;
+	int min = t->min;
+	int max = t->max;
+
+	assert(t != NULL);
+	assert(t->op == 'b');
+	assert(n >= 0);
+	assert((size_t)n < v->nmatch);
+
+	MDEBUG(("cbackref n%d %d{%d-%d}\n", t->retry, n, min, max));
+
+	if (v->pmatch[n].rm_so == -1)
+		return REG_NOMATCH;
+	paren = v->start + v->pmatch[n].rm_so;
+	len = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
+
+	/* no room to maneuver -- retries are pointless */
+	if (v->mem[t->retry])
+		return REG_NOMATCH;
+	v->mem[t->retry] = 1;
+
+	/* special-case zero-length string */
+	if (len == 0) {
+		if (begin == end)
+			return REG_OKAY;
+		return REG_NOMATCH;
+	}
+
+	/* and too-short string */
+	assert(end >= begin);
+	if ((size_t)(end - begin) < len)
+		return REG_NOMATCH;
+	stop = end - len;
+
+	/* count occurrences */
+	i = 0;
+	for (p = begin; p <= stop && (i < max || max == INFINITY); p += len) {
+		if ((*v->g->compare)(paren, p, len) != 0)
+				break;
+		i++;
+	}
+	MDEBUG(("cbackref found %d\n", i));
+
+	/* and sort it out */
+	if (p != end)			/* didn't consume all of it */
+		return REG_NOMATCH;
+	if (min <= i && (i <= max || max == INFINITY))
+		return REG_OKAY;
+	return REG_NOMATCH;		/* out of range */
+}
+
+/*
+ * caltdissect - determine alternative subexpression matches (w. complications)
+ */
+static int			/* regexec return code */
+caltdissect(struct vars *v,
+			struct subre *t,
+			chr *begin,			/* beginning of relevant substring */
+			chr *end)			/* end of same */
+{
+	struct dfa *d;
+	int er;
+#	define	UNTRIED	0	/* not yet tried at all */
+#	define	TRYING	1	/* top matched, trying submatches */
+#	define	TRIED	2	/* top didn't match or submatches exhausted */
+
+	if (t == NULL)
+		return REG_NOMATCH;
+	assert(t->op == '|');
+	if (v->mem[t->retry] == TRIED)
+		return caltdissect(v, t->right, begin, end);
+
+	MDEBUG(("calt n%d\n", t->retry));
+	assert(t->left != NULL);
+
+	if (v->mem[t->retry] == UNTRIED) {
+		d = newdfa(v, &t->left->cnfa, &v->g->cmap, DOMALLOC);
+		if (ISERR())
+			return v->err;
+		if (longest(v, d, begin, end, (int *)NULL) != end) {
+			freedfa(d);
+			v->mem[t->retry] = TRIED;
+			return caltdissect(v, t->right, begin, end);
+		}
+		freedfa(d);
+		MDEBUG(("calt matched\n"));
+		v->mem[t->retry] = TRYING;
+	}
+
+	er = cdissect(v, t->left, begin, end);
+	if (er != REG_NOMATCH)
+		return er;
+
+	v->mem[t->retry] = TRIED;
+	return caltdissect(v, t->right, begin, end);
+}
+
+
+
+#include "rege_dfa.c"
diff --git a/src/backend/regex/regfree.c b/src/backend/regex/regfree.c
index e46ea98e29f..88f3da32287 100644
--- a/src/backend/regex/regfree.c
+++ b/src/backend/regex/regfree.c
@@ -1,75 +1,54 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
+/*
+ * regfree - free an RE
  *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
+ * $Header: /cvsroot/pgsql/src/backend/regex/regfree.c,v 1.16 2003/02/05 17:41:33 tgl Exp $
  *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
  *
- *		@(#)regfree.c	8.3 (Berkeley) 3/20/94
+ * You might think that this could be incorporated into regcomp.c, and
+ * that would be a reasonable idea... except that this is a generic
+ * function (with a generic name), applicable to all compiled REs
+ * regardless of the size of their characters, whereas the stuff in
+ * regcomp.c gets compiled once per character size.
  */
 
-#include "postgres.h"
-
+#include "regex/regguts.h"
 
-#include "regex/regex.h"
-#include "regex/utils.h"
-#include "regex/regex2.h"
 
 /*
- * regfree - free everything
+ * pg_regfree - free an RE (generic function, punts to RE-specific function)
+ *
+ * Ignoring invocation with NULL is a convenience.
  */
 void
-pg_regfree(regex_t *preg)
+pg_regfree(regex_t *re)
 {
-	struct re_guts *g;
-
-	if (preg->re_magic != MAGIC1)		/* oops */
-		return;					/* nice to complain, but hard */
-
-	g = preg->re_g;
-	if (g == NULL || g->magic != MAGIC2)		/* oops again */
+	if (re == NULL)
 		return;
-	preg->re_magic = 0;			/* mark it invalid */
-	g->magic = 0;				/* mark it invalid */
-
-	if (preg->patsave != NULL)
-		free((char *) preg->patsave);
-	if (g->strip != NULL)
-		free((char *) g->strip);
-	if (g->sets != NULL)
-		free((char *) g->sets);
-	if (g->setbits != NULL)
-		free((char *) g->setbits);
-	if (g->must != NULL)
-		free(g->must);
-	free((char *) g);
+	(*((struct fns *)re->re_fns)->free)(re);
 }
diff --git a/src/backend/regex/retest.c b/src/backend/regex/retest.c
deleted file mode 100644
index ca5d6c5394a..00000000000
--- a/src/backend/regex/retest.c
+++ /dev/null
@@ -1,44 +0,0 @@
-/*
- * a simple regexp debug program
- *
- * $Header: /cvsroot/pgsql/src/backend/regex/Attic/retest.c,v 1.5 2002/06/11 15:41:37 thomas Exp $
- */
-
-#include "postgres.h"
-#include "regex/regex.h"
-
-int
-main()
-{
-	int			sts;
-	regex_t		re;
-	char		buf[1024];
-	char	   *p;
-
-	printf("type in regexp string: ");
-	if (!fgets(buf, sizeof(buf), stdin))
-		exit(0);
-	p = strchr(buf, '\n');
-	if (p)
-		*p = '\0';
-
-	sts = pg_regcomp(&re, buf, 1);
-	printf("regcomp: parses \"%s\" and returns %d\n", buf, sts);
-	for (;;)
-	{
-		printf("type in target string: ");
-		if (!fgets(buf, sizeof(buf), stdin))
-			exit(0);
-		p = strchr(buf, '\n');
-		if (p)
-			*p = '\0';
-
-		sts = pg_regexec(&re, buf, 0, 0, 0);
-		printf("regexec: returns %d\n", sts);
-	}
-}
-
-void
-elog(int lev, const char *fmt,...)
-{
-}
diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c
index ebbca8f0401..604e55d4145 100644
--- a/src/backend/utils/adt/regexp.c
+++ b/src/backend/utils/adt/regexp.c
@@ -1,14 +1,14 @@
 /*-------------------------------------------------------------------------
  *
  * regexp.c
- *	  regular expression handling code.
+ *	  Postgres' interface to the regular expression package.
  *
  * Portions Copyright (c) 1996-2002, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  *
  * IDENTIFICATION
- *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.43 2002/09/22 17:27:23 tgl Exp $
+ *	  $Header: /cvsroot/pgsql/src/backend/utils/adt/regexp.c,v 1.44 2003/02/05 17:41:32 tgl Exp $
  *
  *		Alistair Crooks added the code for the regex caching
  *		agc - cached the regular expressions used - there's a good chance
@@ -30,171 +30,189 @@
 #include "postgres.h"
 
 #include "regex/regex.h"
+#include "mb/pg_wchar.h"
 #include "utils/builtins.h"
 
-#if defined(DISABLE_XOPEN_NLS)
-#undef _XOPEN_SOURCE
-#endif   /* DISABLE_XOPEN_NLS */
 
-/* this is the number of cached regular expressions held. */
+/*
+ * We cache precompiled regular expressions using a "self organizing list"
+ * structure, in which recently-used items tend to be near the front.
+ * Whenever we use an entry, it's moved up to the front of the list.
+ * Over time, an item's average position corresponds to its frequency of use.
+ *
+ * When we first create an entry, it's inserted at the front of
+ * the array, dropping the entry at the end of the array if necessary to
+ * make room.  (This might seem to be weighting the new entry too heavily,
+ * but if we insert new entries further back, we'll be unable to adjust to
+ * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
+ * never-before-seen items used circularly.  We ought to be able to handle
+ * that case, so we have to insert at the front.)
+ *
+ * Knuth mentions a variant strategy in which a used item is moved up just
+ * one place in the list.  Although he says this uses fewer comparisons on
+ * average, it seems not to adapt very well to the situation where you have
+ * both some reusable patterns and a steady stream of non-reusable patterns.
+ * A reusable pattern that isn't used at least as often as non-reusable
+ * patterns are seen will "fail to keep up" and will drop off the end of the
+ * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
+ * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
+ */
+
+/* this is the maximum number of cached regular expressions */
 #ifndef MAX_CACHED_RES
 #define MAX_CACHED_RES	32
 #endif
 
-/* this structure describes a cached regular expression */
-struct cached_re_str
+/* this structure describes one cached regular expression */
+typedef struct cached_re_str
 {
-	char	   *cre_s;			/* pattern as null-terminated string */
-	int			cre_type;		/* compiled-type: extended,icase etc */
+	text	   *cre_pat;		/* original RE (untoasted TEXT form) */
+	int			cre_flags;		/* compile flags: extended,icase etc */
 	regex_t		cre_re;			/* the compiled regular expression */
-	unsigned long cre_lru;		/* lru tag */
-};
+} cached_re_str;
 
-static int	rec = 0;			/* # of cached re's */
-static struct cached_re_str rev[MAX_CACHED_RES];		/* cached re's */
-static unsigned long lru;		/* system lru tag */
-static int	pg_lastrec = 0;
+static int	num_res = 0;		/* # of cached re's */
+static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
 
-/* attempt to compile `re' as an re, then match it against text */
-/* cflags - flag to regcomp indicates case sensitivity */
+
+/*
+ * RE_compile_and_execute - compile and execute a RE, caching if possible
+ *
+ * Returns TRUE on match, FALSE on no match
+ *
+ *	text_re --- the pattern, expressed as an *untoasted* TEXT object
+ *	dat --- the data to match against (need not be null-terminated)
+ *	dat_len --- the length of the data string
+ *	cflags --- compile options for the pattern
+ *	nmatch, pmatch  --- optional return area for match details
+ *
+ * Both pattern and data are given in the database encoding.  We internally
+ * convert to array of pg_wchar which is what Spencer's regex package wants.
+ */
 static bool
-RE_compile_and_execute(text *text_re, char *text, int cflags,
-					   int nmatch, regmatch_t *pmatch)
+RE_compile_and_execute(text *text_re, unsigned char *dat, int dat_len,
+					   int cflags, int nmatch, regmatch_t *pmatch)
 {
-	char	   *re;
-	int			oldest;
+	int			text_re_len = VARSIZE(text_re);
+	pg_wchar   *data;
+	size_t		data_len;
+	pg_wchar   *pattern;
+	size_t		pattern_len;
 	int			i;
 	int			regcomp_result;
+	int			regexec_result;
+	cached_re_str	re_temp;
 
-	/* Convert 'text' pattern to null-terminated string */
-	re = DatumGetCString(DirectFunctionCall1(textout,
-											 PointerGetDatum(text_re)));
+	/* Convert data string to wide characters */
+	data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
+	data_len = pg_mb2wchar_with_len(dat, data, dat_len);
 
 	/*
-	 * Find a previously compiled regular expression. Run the cache as a
-	 * ring buffer, starting the search from the previous match if any.
+	 * Look for a match among previously compiled REs.  Since the data
+	 * structure is self-organizing with most-used entries at the front,
+	 * our search strategy can just be to scan from the front.
 	 */
-	i = pg_lastrec;
-	while (i < rec)
+	for (i = 0; i < num_res; i++)
 	{
-		if (rev[i].cre_s != NULL)
+		if (VARSIZE(re_array[i].cre_pat) == text_re_len &&	/* equal sizes keep memcmp inside the cached copy */
+			re_array[i].cre_flags == cflags && memcmp(re_array[i].cre_pat, text_re, text_re_len) == 0)
 		{
-			if (strcmp(rev[i].cre_s, re) == 0 &&
-				rev[i].cre_type == cflags)
+			/*
+			 * Found a match; move it to front if not there already.
+			 */
+			if (i > 0)
 			{
-				pg_lastrec = i;
-				rev[i].cre_lru = ++lru;
-				pfree(re);
-				return (pg_regexec(&rev[i].cre_re,
-								   text, nmatch,
-								   pmatch, 0) == 0);
+				re_temp = re_array[i];
+				memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
+				re_array[0] = re_temp;
 			}
-		}
-		i++;
 
-		/*
-		 * If we were not at the first slot to start, then think about
-		 * wrapping if necessary.
-		 */
-		if (pg_lastrec != 0)
-		{
-			if (i >= rec)
-				i = 0;
-			else if (i == pg_lastrec)
-				break;
-		}
-	}
+			/* Perform RE match and return result */
+			regexec_result = pg_regexec(&re_array[0].cre_re,
+										data,
+										data_len,
+										NULL, /* no details */
+										nmatch,
+										pmatch,
+										0);
 
-	/* we didn't find it - make room in the cache for it */
-	if (rec >= MAX_CACHED_RES)
-	{
-		/* cache is full - find the oldest entry */
-		for (oldest = 0, i = 1; i < rec; i++)
-		{
-			if (rev[i].cre_lru < rev[oldest].cre_lru)
-				oldest = i;
-		}
-	}
-	else
-		oldest = rec++;
+			pfree(data);
 
-	/* if there was an old re, then de-allocate the space it used */
-	if (rev[oldest].cre_s != (char *) NULL)
-	{
-		for (lru = i = 0; i < rec; i++)
-		{
-			/* downweight all of the other cached entries */
-			rev[i].cre_lru = (rev[i].cre_lru - rev[oldest].cre_lru) / 2;
-			if (rev[i].cre_lru > lru)
-				lru = rev[i].cre_lru;
+			return (regexec_result == 0);
 		}
-		pg_regfree(&rev[oldest].cre_re);
-
-		/*
-		 * use malloc/free for the cre_s field because the storage has to
-		 * persist across transactions
-		 */
-		free(rev[oldest].cre_s);
-		rev[oldest].cre_s = (char *) NULL;
 	}
 
-	/* compile the re */
-	regcomp_result = pg_regcomp(&rev[oldest].cre_re, re, cflags);
-	if (regcomp_result == 0)
-	{
-		pg_lastrec = oldest;
-
-		/*
-		 * use malloc/free for the cre_s field because the storage has to
-		 * persist across transactions
-		 */
-		rev[oldest].cre_s = strdup(re);
-		rev[oldest].cre_lru = ++lru;
-		rev[oldest].cre_type = cflags;
-		pfree(re);
-		/* agc - fixed an old typo here */
-		return (pg_regexec(&rev[oldest].cre_re, text,
-						   nmatch, pmatch, 0) == 0);
-	}
-	else
-	{
-		char		errMsg[1000];
+	/*
+	 * Couldn't find it, so try to compile the new RE.  To avoid leaking
+	 * resources on failure, we build into the re_temp local.
+	 */
+
+	/* Convert pattern string to wide characters */
+	pattern = (pg_wchar *) palloc((text_re_len - VARHDRSZ + 1) * sizeof(pg_wchar));
+	pattern_len = pg_mb2wchar_with_len((unsigned char *) VARDATA(text_re),
+									   pattern,
+									   text_re_len - VARHDRSZ);
 
+	regcomp_result = pg_regcomp(&re_temp.cre_re,
+								pattern,
+								pattern_len,
+								cflags);
+
+	pfree(pattern);
+
+	if (regcomp_result != 0)
+	{
 		/* re didn't compile */
-		pg_regerror(regcomp_result, &rev[oldest].cre_re, errMsg,
-					sizeof(errMsg));
+		char		errMsg[100];
+
+		pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
+		/* XXX should we pg_regfree here? */
 		elog(ERROR, "Invalid regular expression: %s", errMsg);
 	}
 
-	/* not reached */
-	return false;
-}
-
+	/*
+	 * use malloc/free for the cre_pat field because the storage has to
+	 * persist across transactions
+	 */
+	re_temp.cre_pat = malloc(text_re_len);
+	if (re_temp.cre_pat == NULL)
+	{
+		pg_regfree(&re_temp.cre_re);
+		elog(ERROR, "Out of memory");
+	}
+	memcpy(re_temp.cre_pat, text_re, text_re_len);
+	re_temp.cre_flags = cflags;
 
-/*
-   fixedlen_regexeq:
+	/*
+	 * Okay, we have a valid new item in re_temp; insert it into the
+	 * storage array.  Discard last entry if needed.
+	 */
+	if (num_res >= MAX_CACHED_RES)
+	{
+		--num_res;
+		Assert(num_res < MAX_CACHED_RES);
+		pg_regfree(&re_array[num_res].cre_re);
+		free(re_array[num_res].cre_pat);
+	}
 
-   a generic fixed length regexp routine
-		 s		- the string to match against (not necessarily null-terminated)
-		 p		- the pattern (as a text*)
-		 charlen   - the length of the string
-*/
-static bool
-fixedlen_regexeq(char *s, text *p, int charlen, int cflags)
-{
-	char	   *sterm;
-	bool		result;
+	if (num_res > 0)
+		memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
 
-	/* be sure sterm is null-terminated */
-	sterm = (char *) palloc(charlen + 1);
-	memcpy(sterm, s, charlen);
-	sterm[charlen] = '\0';
+	re_array[0] = re_temp;
+	num_res++;
 
-	result = RE_compile_and_execute(p, sterm, cflags, 0, NULL);
+	/* Perform RE match and return result */
+	regexec_result = pg_regexec(&re_array[0].cre_re,
+								data,
+								data_len,
+								NULL, /* no details */
+								nmatch,
+								pmatch,
+								0);
 
-	pfree(sterm);
+	pfree(data);
 
-	return result;
+	return (regexec_result == 0);
 }
 
 
@@ -208,10 +226,11 @@ nameregexeq(PG_FUNCTION_ARGS)
 	Name		n = PG_GETARG_NAME(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(fixedlen_regexeq(NameStr(*n),
-									p,
-									strlen(NameStr(*n)),
-									REG_EXTENDED));
+	PG_RETURN_BOOL(RE_compile_and_execute(p,
+										  (unsigned char *) NameStr(*n),
+										  strlen(NameStr(*n)),
+										  REG_ADVANCED,
+										  0, NULL));
 }
 
 Datum
@@ -220,10 +239,11 @@ nameregexne(PG_FUNCTION_ARGS)
 	Name		n = PG_GETARG_NAME(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(!fixedlen_regexeq(NameStr(*n),
-									 p,
-									 strlen(NameStr(*n)),
-									 REG_EXTENDED));
+	PG_RETURN_BOOL(!RE_compile_and_execute(p,
+										   (unsigned char *) NameStr(*n),
+										   strlen(NameStr(*n)),
+										   REG_ADVANCED,
+										   0, NULL));
 }
 
 Datum
@@ -232,10 +252,11 @@ textregexeq(PG_FUNCTION_ARGS)
 	text	   *s = PG_GETARG_TEXT_P(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(fixedlen_regexeq(VARDATA(s),
-									p,
-									VARSIZE(s) - VARHDRSZ,
-									REG_EXTENDED));
+	PG_RETURN_BOOL(RE_compile_and_execute(p,
+										  (unsigned char *) VARDATA(s),
+										  VARSIZE(s) - VARHDRSZ,
+										  REG_ADVANCED,
+										  0, NULL));
 }
 
 Datum
@@ -244,10 +265,11 @@ textregexne(PG_FUNCTION_ARGS)
 	text	   *s = PG_GETARG_TEXT_P(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(!fixedlen_regexeq(VARDATA(s),
-									 p,
-									 VARSIZE(s) - VARHDRSZ,
-									 REG_EXTENDED));
+	PG_RETURN_BOOL(!RE_compile_and_execute(p,
+										   (unsigned char *) VARDATA(s),
+										   VARSIZE(s) - VARHDRSZ,
+										   REG_ADVANCED,
+										   0, NULL));
 }
 
 
@@ -258,82 +280,81 @@ textregexne(PG_FUNCTION_ARGS)
 
 
 Datum
-texticregexeq(PG_FUNCTION_ARGS)
+nameicregexeq(PG_FUNCTION_ARGS)
 {
-	text	   *s = PG_GETARG_TEXT_P(0);
+	Name		n = PG_GETARG_NAME(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(fixedlen_regexeq(VARDATA(s),
-									p,
-									VARSIZE(s) - VARHDRSZ,
-									REG_ICASE | REG_EXTENDED));
+	PG_RETURN_BOOL(RE_compile_and_execute(p,
+										  (unsigned char *) NameStr(*n),
+										  strlen(NameStr(*n)),
+										  REG_ICASE | REG_ADVANCED,
+										  0, NULL));
 }
 
 Datum
-texticregexne(PG_FUNCTION_ARGS)
+nameicregexne(PG_FUNCTION_ARGS)
 {
-	text	   *s = PG_GETARG_TEXT_P(0);
+	Name		n = PG_GETARG_NAME(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(!fixedlen_regexeq(VARDATA(s),
-									 p,
-									 VARSIZE(s) - VARHDRSZ,
-									 REG_ICASE | REG_EXTENDED));
+	PG_RETURN_BOOL(!RE_compile_and_execute(p,
+										   (unsigned char *) NameStr(*n),
+										   strlen(NameStr(*n)),
+										   REG_ICASE | REG_ADVANCED,
+										   0, NULL));
 }
 
 Datum
-nameicregexeq(PG_FUNCTION_ARGS)
+texticregexeq(PG_FUNCTION_ARGS)
 {
-	Name		n = PG_GETARG_NAME(0);
+	text	   *s = PG_GETARG_TEXT_P(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(fixedlen_regexeq(NameStr(*n),
-									p,
-									strlen(NameStr(*n)),
-									REG_ICASE | REG_EXTENDED));
+	PG_RETURN_BOOL(RE_compile_and_execute(p,
+										  (unsigned char *) VARDATA(s),
+										  VARSIZE(s) - VARHDRSZ,
+										  REG_ICASE | REG_ADVANCED,
+										  0, NULL));
 }
 
 Datum
-nameicregexne(PG_FUNCTION_ARGS)
+texticregexne(PG_FUNCTION_ARGS)
 {
-	Name		n = PG_GETARG_NAME(0);
+	text	   *s = PG_GETARG_TEXT_P(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
 
-	PG_RETURN_BOOL(!fixedlen_regexeq(NameStr(*n),
-									 p,
-									 strlen(NameStr(*n)),
-									 REG_ICASE | REG_EXTENDED));
+	PG_RETURN_BOOL(!RE_compile_and_execute(p,
+										   (unsigned char *) VARDATA(s),
+										   VARSIZE(s) - VARHDRSZ,
+										   REG_ICASE | REG_ADVANCED,
+										   0, NULL));
 }
 
 
-/* textregexsubstr()
- * Return a substring matched by a regular expression.
+/*
+ * textregexsubstr()
+ *		Return a substring matched by a regular expression.
  */
 Datum
 textregexsubstr(PG_FUNCTION_ARGS)
 {
 	text	   *s = PG_GETARG_TEXT_P(0);
 	text	   *p = PG_GETARG_TEXT_P(1);
-	char	   *sterm;
-	int			len;
 	bool		match;
 	regmatch_t	pmatch[2];
 
-	/* be sure sterm is null-terminated */
-	len = VARSIZE(s) - VARHDRSZ;
-	sterm = (char *) palloc(len + 1);
-	memcpy(sterm, VARDATA(s), len);
-	sterm[len] = '\0';
-
 	/*
 	 * We pass two regmatch_t structs to get info about the overall match
 	 * and the match for the first parenthesized subexpression (if any).
 	 * If there is a parenthesized subexpression, we return what it matched;
 	 * else return what the whole regexp matched.
 	 */
-	match = RE_compile_and_execute(p, sterm, REG_EXTENDED, 2, pmatch);
-
-	pfree(sterm);
+	match = RE_compile_and_execute(p,
+								   (unsigned char *) VARDATA(s),
+								   VARSIZE(s) - VARHDRSZ,
+								   REG_ADVANCED,
+								   2, pmatch);
 
 	/* match? then return the substring matching the pattern */
 	if (match)
diff --git a/src/include/regex/cclass.h b/src/include/regex/cclass.h
deleted file mode 100644
index 8b13c125830..00000000000
--- a/src/include/regex/cclass.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *		@(#)cclass.h	8.3 (Berkeley) 3/20/94
- */
-
-/* character-class table */
-static struct cclass
-{
-	char	   *name;
-	char	   *chars;
-	char	   *multis;
-}	cclasses[] =
-
-{
-	{
-		"alnum", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
-0123456789", ""
-	},
-	{
-		"alpha", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz",
-		""
-	},
-	{
-		"blank", " \t", ""
-	},
-	{
-		"cntrl", "\007\b\t\n\v\f\r\1\2\3\4\5\6\16\17\20\21\22\23\24\
-\25\26\27\30\31\32\33\34\35\36\37\177", ""
-	},
-	{
-		"digit", "0123456789", ""
-	},
-	{
-		"graph", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
-0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
-		""
-	},
-	{
-		"lower", "abcdefghijklmnopqrstuvwxyz",
-		""
-	},
-	{
-		"print", "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz\
-0123456789!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~ ",
-		""
-	},
-	{
-		"punct", "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~",
-		""
-	},
-	{
-		"space", "\t\n\v\f\r ", ""
-	},
-	{
-		"upper", "ABCDEFGHIJKLMNOPQRSTUVWXYZ",
-		""
-	},
-	{
-		"xdigit", "0123456789ABCDEFabcdef",
-		""
-	},
-	{
-		NULL, NULL, ""
-	}
-};
diff --git a/src/include/regex/cname.h b/src/include/regex/cname.h
deleted file mode 100644
index bff408e4f0c..00000000000
--- a/src/include/regex/cname.h
+++ /dev/null
@@ -1,336 +0,0 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *		@(#)cname.h		8.3 (Berkeley) 3/20/94
- */
-
-/* character-name table */
-static struct cname
-{
-	char	   *name;
-	char		code;
-}	cnames[] =
-
-{
-	{
-		"NUL", '\0'
-	},
-	{
-		"SOH", '\001'
-	},
-	{
-		"STX", '\002'
-	},
-	{
-		"ETX", '\003'
-	},
-	{
-		"EOT", '\004'
-	},
-	{
-		"ENQ", '\005'
-	},
-	{
-		"ACK", '\006'
-	},
-	{
-		"BEL", '\007'
-	},
-	{
-		"alert", '\007'
-	},
-	{
-		"BS", '\010'
-	},
-	{
-		"backspace", '\b'
-	},
-	{
-		"HT", '\011'
-	},
-	{
-		"tab", '\t'
-	},
-	{
-		"LF", '\012'
-	},
-	{
-		"newline", '\n'
-	},
-	{
-		"VT", '\013'
-	},
-	{
-		"vertical-tab", '\v'
-	},
-	{
-		"FF", '\014'
-	},
-	{
-		"form-feed", '\f'
-	},
-	{
-		"CR", '\015'
-	},
-	{
-		"carriage-return", '\r'
-	},
-	{
-		"SO", '\016'
-	},
-	{
-		"SI", '\017'
-	},
-	{
-		"DLE", '\020'
-	},
-	{
-		"DC1", '\021'
-	},
-	{
-		"DC2", '\022'
-	},
-	{
-		"DC3", '\023'
-	},
-	{
-		"DC4", '\024'
-	},
-	{
-		"NAK", '\025'
-	},
-	{
-		"SYN", '\026'
-	},
-	{
-		"ETB", '\027'
-	},
-	{
-		"CAN", '\030'
-	},
-	{
-		"EM", '\031'
-	},
-	{
-		"SUB", '\032'
-	},
-	{
-		"ESC", '\033'
-	},
-	{
-		"IS4", '\034'
-	},
-	{
-		"FS", '\034'
-	},
-	{
-		"IS3", '\035'
-	},
-	{
-		"GS", '\035'
-	},
-	{
-		"IS2", '\036'
-	},
-	{
-		"RS", '\036'
-	},
-	{
-		"IS1", '\037'
-	},
-	{
-		"US", '\037'
-	},
-	{
-		"space", ' '
-	},
-	{
-		"exclamation-mark", '!'
-	},
-	{
-		"quotation-mark", '"'
-	},
-	{
-		"number-sign", '#'
-	},
-	{
-		"dollar-sign", '$'
-	},
-	{
-		"percent-sign", '%'
-	},
-	{
-		"ampersand", '&'
-	},
-	{
-		"apostrophe", '\''
-	},
-	{
-		"left-parenthesis", '('
-	},
-	{
-		"right-parenthesis", ')'
-	},
-	{
-		"asterisk", '*'
-	},
-	{
-		"plus-sign", '+'
-	},
-	{
-		"comma", ','
-	},
-	{
-		"hyphen", '-'
-	},
-	{
-		"hyphen-minus", '-'
-	},
-	{
-		"period", '.'
-	},
-	{
-		"full-stop", '.'
-	},
-	{
-		"slash", '/'
-	},
-	{
-		"solidus", '/'
-	},
-	{
-		"zero", '0'
-	},
-	{
-		"one", '1'
-	},
-	{
-		"two", '2'
-	},
-	{
-		"three", '3'
-	},
-	{
-		"four", '4'
-	},
-	{
-		"five", '5'
-	},
-	{
-		"six", '6'
-	},
-	{
-		"seven", '7'
-	},
-	{
-		"eight", '8'
-	},
-	{
-		"nine", '9'
-	},
-	{
-		"colon", ':'
-	},
-	{
-		"semicolon", ';'
-	},
-	{
-		"less-than-sign", '<'
-	},
-	{
-		"equals-sign", '='
-	},
-	{
-		"greater-than-sign", '>'
-	},
-	{
-		"question-mark", '?'
-	},
-	{
-		"commercial-at", '@'
-	},
-	{
-		"left-square-bracket", '['
-	},
-	{
-		"backslash", '\\'
-	},
-	{
-		"reverse-solidus", '\\'
-	},
-	{
-		"right-square-bracket", ']'
-	},
-	{
-		"circumflex", '^'
-	},
-	{
-		"circumflex-accent", '^'
-	},
-	{
-		"underscore", '_'
-	},
-	{
-		"low-line", '_'
-	},
-	{
-		"grave-accent", '`'
-	},
-	{
-		"left-brace", '{'
-	},
-	{
-		"left-curly-bracket", '{'
-	},
-	{
-		"vertical-line", '|'
-	},
-	{
-		"right-brace", '}'
-	},
-	{
-		"right-curly-bracket", '}'
-	},
-	{
-		"tilde", '~'
-	},
-	{
-		"DEL", '\177'
-	},
-	{
-		NULL, 0
-	}
-};
diff --git a/src/include/regex/regcustom.h b/src/include/regex/regcustom.h
new file mode 100644
index 00000000000..305243296ff
--- /dev/null
+++ b/src/include/regex/regcustom.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: regcustom.h,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ */
+
+/* headers if any */
+#include "postgres.h"
+
+#include <ctype.h>
+#include <limits.h>
+
+#include "mb/pg_wchar.h"
+
+
+/* overrides for regguts.h definitions, if any */
+#define	FUNCPTR(name, args)	(*name) args
+#define	MALLOC(n)		malloc(n)
+#define	FREE(p)			free(VS(p))
+#define	REALLOC(p,n)		realloc(VS(p),n)
+
+/* internal character type and related */
+typedef pg_wchar chr;	/* the type itself */
+typedef unsigned uchr;		/* unsigned type that will hold a chr */
+typedef int celt;		/* type to hold chr, MCCE number, or NOCELT */
+#define	NOCELT	(-1)		/* celt value which is not valid chr or MCCE */
+#define	CHR(c)	((unsigned char) (c)) /* turn char literal into chr literal */
+#define	DIGITVAL(c)	((c)-'0')	/* turn chr digit into its value */
+#define	CHRBITS	32		/* bits in a chr; must not use sizeof */
+#define	CHR_MIN	0x00000000		/* smallest and largest chr; the value */
+#define	CHR_MAX	0xfffffffe		/*  CHR_MAX-CHR_MIN+1 should fit in uchr */
+
+/* functions operating on chr */
+#define	iscalnum(x)	pg_isalnum(x)
+#define	iscalpha(x)	pg_isalpha(x)
+#define	iscdigit(x)	pg_isdigit(x)
+#define	iscspace(x)	pg_isspace(x)
+
+/* and pick up the standard header */
+#include "regex.h"
diff --git a/src/include/regex/regerrs.h b/src/include/regex/regerrs.h
new file mode 100644
index 00000000000..4b1a0541b51
--- /dev/null
+++ b/src/include/regex/regerrs.h
@@ -0,0 +1,22 @@
+/* { code, name, explanation } initializer rows, one per REG_* code in regex.h
+ * $Id: regerrs.h,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ */
+
+{ REG_OKAY,	"REG_OKAY",	"no errors detected" },
+{ REG_NOMATCH,	"REG_NOMATCH",	"failed to match" },
+{ REG_BADPAT,	"REG_BADPAT",	"invalid regexp (reg version 0.8)" },
+{ REG_ECOLLATE,	"REG_ECOLLATE",	"invalid collating element" },
+{ REG_ECTYPE,	"REG_ECTYPE",	"invalid character class" },
+{ REG_EESCAPE,	"REG_EESCAPE",	"invalid escape \\ sequence" },
+{ REG_ESUBREG,	"REG_ESUBREG",	"invalid backreference number" },
+{ REG_EBRACK,	"REG_EBRACK",	"brackets [] not balanced" },
+{ REG_EPAREN,	"REG_EPAREN",	"parentheses () not balanced" },
+{ REG_EBRACE,	"REG_EBRACE",	"braces {} not balanced" },
+{ REG_BADBR,	"REG_BADBR",	"invalid repetition count(s)" },
+{ REG_ERANGE,	"REG_ERANGE",	"invalid character range" },
+{ REG_ESPACE,	"REG_ESPACE",	"out of memory" },
+{ REG_BADRPT,	"REG_BADRPT",	"quantifier operand invalid" },
+{ REG_ASSERT,	"REG_ASSERT",	"\"can't happen\" -- you found a bug" },
+{ REG_INVARG,	"REG_INVARG",	"invalid argument to regex function" },
+{ REG_MIXED,	"REG_MIXED",	"character widths of regex and string differ" },
+{ REG_BADOPT,	"REG_BADOPT",	"invalid embedded option" },
diff --git a/src/include/regex/regex.h b/src/include/regex/regex.h
index e0889a11080..f29379126a5 100644
--- a/src/include/regex/regex.h
+++ b/src/include/regex/regex.h
@@ -1,110 +1,165 @@
-/*-
- * Copyright (c) 1992 Henry Spencer.
- * Copyright (c) 1992, 1993
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer of the University of Toronto.
+#ifndef _REGEX_H_
+#define	_REGEX_H_	/* never again */
+/*
+ * regular expressions
  *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
  *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
  *
- *		@(#)regex.h		8.2 (Berkeley) 1/3/94
+ * $Id: regex.h,v 1.23 2003/02/05 17:41:32 tgl Exp $
  */
 
-#ifndef _REGEX_H_
-#define _REGEX_H_
-
-#include <sys/types.h>
+/*
+ * Add your own defines, if needed, here.
+ */
 #include "mb/pg_wchar.h"
 
-/* types */
-typedef off_t regoff_t;
+/*
+ * interface types etc.
+ */
 
-typedef struct
-{
-	int			re_magic;
-	size_t		re_nsub;		/* number of parenthesized subexpressions */
-	const pg_wchar *re_endp;	/* end pointer for REG_PEND */
-	struct re_guts *re_g;		/* none of your business :-) */
-	pg_wchar   *patsave;		/* me too :-) */
+/*
+ * regoff_t has to be large enough to hold either off_t or ssize_t,
+ * and must be signed; it's only a guess that long is suitable.
+ */
+typedef long regoff_t;	/* NB: long can be narrower than off_t (e.g. 32-bit long, 64-bit off_t) */
+
+/*
+ * other interface types
+ */
+
+/* the biggie, a compiled RE (or rather, a front end to same) */
+typedef struct {
+	int re_magic;		/* magic number */
+	size_t re_nsub;		/* number of subexpressions */
+	long re_info;		/* information about RE (REG_U* bits below, in octal) */
+#		define	REG_UBACKREF		000001
+#		define	REG_ULOOKAHEAD		000002
+#		define	REG_UBOUNDS		000004
+#		define	REG_UBRACES		000010
+#		define	REG_UBSALNUM		000020
+#		define	REG_UPBOTCH		000040
+#		define	REG_UBBS		000100
+#		define	REG_UNONPOSIX		000200
+#		define	REG_UUNSPEC		000400
+#		define	REG_UUNPORT		001000
+#		define	REG_ULOCALE		002000
+#		define	REG_UEMPTYMATCH		004000
+#		define	REG_UIMPOSSIBLE		010000
+#		define	REG_USHORTEST		020000
+	int re_csize;		/* sizeof(character) */
+	char *re_endp;		/* backward compatibility kludge (cf. REG_PEND) */
+	/* the rest is opaque pointers to hidden innards */
+	char *re_guts;		/* really a struct guts (regguts.h); `char *' is more portable than `void *' */
+	char *re_fns;		/* really points to a struct fns (regguts.h) */
 } regex_t;
 
-typedef struct
-{
-	regoff_t	rm_so;			/* start of match */
-	regoff_t	rm_eo;			/* end of match */
+/* result reporting: element type of pg_regexec's regmatch_t [] argument (may acquire more fields later) */
+typedef struct {
+	regoff_t rm_so;		/* start of substring */
+	regoff_t rm_eo;		/* end of substring */
 } regmatch_t;
 
-/* regcomp() flags */
-#define REG_BASIC		0000
-#define REG_EXTENDED	0001
-#define REG_ICASE		0002
-#define REG_NOSUB		0004
-#define REG_NEWLINE		0010
-#define REG_NOSPEC		0020
-#define REG_PEND		0040
-#define REG_DUMP		0200
-
-/* regerror() flags */
-#define REG_NOMATCH		 1
-#define REG_BADPAT		 2
-#define REG_ECOLLATE	 3
-#define REG_ECTYPE		 4
-#define REG_EESCAPE		 5
-#define REG_ESUBREG		 6
-#define REG_EBRACK		 7
-#define REG_EPAREN		 8
-#define REG_EBRACE		 9
-#define REG_BADBR		10
-#define REG_ERANGE		11
-#define REG_ESPACE		12
-#define REG_BADRPT		13
-#define REG_EMPTY		14
-#define REG_ASSERT		15
-#define REG_INVARG		16
-#define REG_ATOI		255		/* convert name to number (!) */
-#define REG_ITOA		0400	/* convert number to name (!) */
-
-/* regexec() flags */
-#define REG_NOTBOL		00001
-#define REG_NOTEOL		00002
-#define REG_STARTEND	00004
-#define REG_TRACE		00400	/* tracing of execution */
-#define REG_LARGE		01000	/* force large representation */
-#define REG_BACKR		02000	/* force use of backref code */
-
-extern int	pg_regcomp(regex_t *preg, const char *pattern, int cflags);
-extern size_t pg_regerror(int errcode, const regex_t *preg,
-			char *errbuf, size_t errbuf_size);
-extern int pg_regexec(const regex_t *preg, const char *string,
-		   size_t nmatch,
-		   regmatch_t *pmatch, int eflags);
-extern void pg_regfree(regex_t *preg);
-
-#endif   /* !_REGEX_H_ */
+/* supplementary control and reporting (the rm_detail_t * argument of pg_regexec) */
+typedef struct {
+	regmatch_t rm_extend;	/* see REG_EXPECT */
+} rm_detail_t;
+
+
+
+/*
+ * regex compilation flags (octal bit masks; some are combinations of others)
+ */
+#define	REG_BASIC	000000	/* BREs (convenience) */
+#define	REG_EXTENDED	000001	/* EREs */
+#define	REG_ADVF	000002	/* advanced features in EREs */
+#define	REG_ADVANCED	000003	/* AREs (which are also EREs): REG_EXTENDED|REG_ADVF */
+#define	REG_QUOTE	000004	/* no special characters, none */
+#define	REG_NOSPEC	REG_QUOTE	/* historical synonym */
+#define	REG_ICASE	000010	/* ignore case */
+#define	REG_NOSUB	000020	/* don't care about subexpressions */
+#define	REG_EXPANDED	000040	/* expanded format, white space & comments */
+#define	REG_NLSTOP	000100	/* \n doesn't match . or [^ ] */
+#define	REG_NLANCH	000200	/* ^ matches after \n, $ before */
+#define	REG_NEWLINE	000300	/* newlines are line terminators: REG_NLSTOP|REG_NLANCH */
+#define	REG_PEND	000400	/* ugh -- backward-compatibility hack */
+#define	REG_EXPECT	001000	/* report details on partial/limited matches */
+#define	REG_BOSONLY	002000	/* temporary kludge for BOS-only matches */
+#define	REG_DUMP	004000	/* none of your business :-) */
+#define	REG_FAKE	010000	/* none of your business :-) */
+#define	REG_PROGRESS	020000	/* none of your business :-) */
+
+
+
+/*
+ * regex execution flags (octal bit masks)
+ */
+#define	REG_NOTBOL	0001	/* BOS is not BOL */
+#define	REG_NOTEOL	0002	/* EOS is not EOL */
+#define	REG_STARTEND	0004	/* backward compatibility kludge */
+#define	REG_FTRACE	0010	/* turns on FDEBUG() output in REG_DEBUG builds (regguts.h) */
+#define	REG_MTRACE	0020	/* turns on MDEBUG() output in REG_DEBUG builds (regguts.h) */
+#define	REG_SMALL	0040	/* none of your business */
+
+
+/*
+ * error reporting (note that the value 14 is not assigned to any code)
+ * Be careful if modifying the list of error codes -- the table used by
+ * regerror() is generated automatically from this file!
+ */
+#define	REG_OKAY	 0	/* no errors detected */
+#define	REG_NOMATCH	 1	/* failed to match */
+#define	REG_BADPAT	 2	/* invalid regexp */
+#define	REG_ECOLLATE	 3	/* invalid collating element */
+#define	REG_ECTYPE	 4	/* invalid character class */
+#define	REG_EESCAPE	 5	/* invalid escape \ sequence */
+#define	REG_ESUBREG	 6	/* invalid backreference number */
+#define	REG_EBRACK	 7	/* brackets [] not balanced */
+#define	REG_EPAREN	 8	/* parentheses () not balanced */
+#define	REG_EBRACE	 9	/* braces {} not balanced */
+#define	REG_BADBR	10	/* invalid repetition count(s) */
+#define	REG_ERANGE	11	/* invalid character range */
+#define	REG_ESPACE	12	/* out of memory */
+#define	REG_BADRPT	13	/* quantifier operand invalid */
+#define	REG_ASSERT	15	/* "can't happen" -- you found a bug */
+#define	REG_INVARG	16	/* invalid argument to regex function */
+#define	REG_MIXED	17	/* character widths of regex and string differ */
+#define	REG_BADOPT	18	/* invalid embedded option */
+/* two specials for debugging and testing */
+#define	REG_ATOI	101	/* convert error-code name to number */
+#define	REG_ITOA	102	/* convert error-code number to name */
+
+
+
+/* the prototypes for exported functions: string/len are pg_wchar text plus
+ * its length (presumably counted in pg_wchars -- confirm in regcomp.c and
+ * regexec.c); flags are the REG_* compilation or execution flags above */
+extern int pg_regcomp(regex_t *re, const pg_wchar *string, size_t len, int flags);
+extern int pg_regexec(regex_t *re, const pg_wchar *string, size_t len, rm_detail_t *details, size_t nmatch, regmatch_t pmatch[], int flags);
+extern void pg_regfree(regex_t *re);
+extern size_t pg_regerror(int errcode, const regex_t *preg, char *errbuf, size_t errbuf_size);
+
+#endif /* _REGEX_H_ */
diff --git a/src/include/regex/regex2.h b/src/include/regex/regex2.h
deleted file mode 100644
index 5ceed7fe9c0..00000000000
--- a/src/include/regex/regex2.h
+++ /dev/null
@@ -1,174 +0,0 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *		@(#)regex2.h	8.4 (Berkeley) 3/20/94
- */
-
-#include <limits.h>
-
-/*
- * internals of regex_t
- */
-#define MAGIC1	((('r'^0200)<<8) | 'e')
-
-/*
- * The internal representation is a *strip*, a sequence of
- * operators ending with an endmarker.	(Some terminology etc. is a
- * historical relic of earlier versions which used multiple strips.)
- * Certain oddities in the representation are there to permit running
- * the machinery backwards; in particular, any deviation from sequential
- * flow must be marked at both its source and its destination.	Some
- * fine points:
- *
- * - OPLUS_ and O_PLUS are *inside* the loop they create.
- * - OQUEST_ and O_QUEST are *outside* the bypass they create.
- * - OCH_ and O_CH are *outside* the multi-way branch they create, while
- *	 OOR1 and OOR2 are respectively the end and the beginning of one of
- *	 the branches.	Note that there is an implicit OOR2 following OCH_
- *	 and an implicit OOR1 preceding O_CH.
- *
- * In state representations, an operator's bit is on to signify a state
- * immediately *preceding* "execution" of that operator.
- */
-typedef unsigned long sop;		/* strip operator */
-typedef long sopno;
-
-#define OPRMASK ((sop) 0xf8000000)
-#define OPDMASK ((sop) 0x07ffffff)
-#define OPSHIFT ((unsigned)27)
-#define OP(n)	((n)&OPRMASK)
-#define OPND(n) ((n)&OPDMASK)
-#define SOP(op, opnd)	((op)|(opnd))
-/* operators					   meaning		operand					*/
-/*												(back, fwd are offsets) */
-#define OEND	((size_t)1<<OPSHIFT)	/* endmarker	-						*/
-#define OCHAR	((size_t)2<<OPSHIFT)	/* character	unsigned char			*/
-#define OBOL	((size_t)3<<OPSHIFT)	/* left anchor	-						*/
-#define OEOL	((size_t)4<<OPSHIFT)	/* right anchor -						*/
-#define OANY	((size_t)5<<OPSHIFT)	/* .			-						*/
-#define OANYOF	((size_t)6<<OPSHIFT)	/* [...]		set number				*/
-#define OBACK_	((size_t)7<<OPSHIFT)	/* begin \d		paren number			*/
-#define O_BACK	((size_t)8<<OPSHIFT)	/* end \d		paren number			*/
-#define OPLUS_	((size_t)9<<OPSHIFT)	/* + prefix		fwd to suffix			*/
-#define O_PLUS	((size_t)10<<OPSHIFT)	/* + suffix		back to prefix			*/
-#define OQUEST_ ((size_t)11<<OPSHIFT)	/* ? prefix		fwd to suffix			*/
-#define O_QUEST ((size_t)12<<OPSHIFT)	/* ? suffix		back to prefix			*/
-#define OLPAREN ((size_t)13<<OPSHIFT)	/* (			fwd to )				*/
-#define ORPAREN ((size_t)14<<OPSHIFT)	/* )			back to (				*/
-#define OCH_	((size_t)15<<OPSHIFT)	/* begin choice fwd to OOR2				*/
-#define OOR1	((size_t)16<<OPSHIFT)	/* | pt. 1		back to OOR1 or
-										 * OCH_    */
-#define OOR2	((size_t)17<<OPSHIFT)	/* | pt. 2		fwd to OOR2 or
-										 * O_CH		*/
-#define O_CH	((size_t)18<<OPSHIFT)	/* end choice	back to OOR1			*/
-#define OBOW	((size_t)19<<OPSHIFT)	/* begin word	-						*/
-#define OEOW	((size_t)20<<OPSHIFT)	/* end word		-						*/
-
-/*
- * Structure for [] character-set representation.  Character sets are
- * done as bit vectors, grouped 8 to a byte vector for compactness.
- * The individual set therefore has both a pointer to the byte vector
- * and a mask to pick out the relevant bit of each byte.  A hash code
- * simplifies testing whether two sets could be identical.
- *
- * This will get trickier for multicharacter collating elements.  As
- * preliminary hooks for dealing with such things, we also carry along
- * a string of multi-character elements, and decide the size of the
- * vectors at run time.
- */
-typedef struct
-{
-	uch		   *ptr;			/* -> uch [csetsize] */
-	uch			mask;			/* bit within array */
-	pg_wchar	hash;			/* hash code */
-	unsigned int lc;			/* leading character (character-set) */
-	size_t		smultis;
-	char	   *multis;			/* -> char[smulti]	ab\0cd\0ef\0\0 */
-} cset;
-
-/* note that CHadd and CHsub are unsafe, and CHIN doesn't yield 0/1 */
-#define CHlc(c) (((unsigned)(c)&0xff0000)>>16)
-#define CHadd(cs, c)	((cs)->ptr[(unsigned)(c)&0xffff] |= (cs)->mask, (cs)->hash += (unsigned)(c)&0xffff,\
-			 (cs)->lc = CHlc(c))
-#define CHsub(cs, c)	((cs)->ptr[(unsigned)(c)&0xffff] &= ~(cs)->mask, (cs)->hash -= (unsigned)(c)&0xffff)
-#define CHIN(cs, c) ((cs)->ptr[(unsigned)(c)&0xffff] & (cs)->mask && \
-			 ((cs)->lc == CHlc(c)))
-#define MCadd(p, cs, cp)		mcadd(p, cs, cp)		/* regcomp() internal
-														 * fns */
-#define MCsub(p, cs, cp)		mcsub(p, cs, cp)
-#define MCin(p, cs, cp) mcin(p, cs, cp)
-
-/* stuff for character categories */
-typedef unsigned char cat_t;
-
-/*
- * main compiled-expression structure
- */
-struct re_guts
-{
-	int			magic;
-#define  MAGIC2  ((('R'^0200)<<8)|'E')
-	sop		   *strip;			/* malloced area for strip */
-	int			csetsize;		/* number of bits in a cset vector */
-	int			ncsets;			/* number of csets in use */
-	cset	   *sets;			/* -> cset [ncsets] */
-	uch		   *setbits;		/* -> uch[csetsize][ncsets/CHAR_BIT] */
-	int			cflags;			/* copy of regcomp() cflags argument */
-	sopno		nstates;		/* = number of sops */
-	sopno		firststate;		/* the initial OEND (normally 0) */
-	sopno		laststate;		/* the final OEND */
-	int			iflags;			/* internal flags */
-#define  USEBOL  01				/* used ^ */
-#define  USEEOL  02				/* used $ */
-#define  BAD	 04				/* something wrong */
-	int			nbol;			/* number of ^ used */
-	int			neol;			/* number of $ used */
-	int			ncategories;	/* how many character categories */
-	cat_t	   *categories;		/* ->catspace[-CHAR_MIN] */
-	pg_wchar   *must;			/* match must contain this string */
-	int			mlen;			/* length of must */
-	size_t		nsub;			/* copy of re_nsub */
-	int			backrefs;		/* does it use back references? */
-	sopno		nplus;			/* how deep does it nest +s? */
-	/* catspace must be last */
-	cat_t		catspace[1];	/* actually [NC] */
-};
-
-/* misc utilities */
-#define OUT		  (16777216+1)	/* 16777216 == 2^24 == 3 bytes */
-
-#define ISWORD(c)	(((c) >= 0 && (c) <= UCHAR_MAX) && \
-			 (isalnum((unsigned char) (c)) || (c) == '_'))
diff --git a/src/include/regex/regguts.h b/src/include/regex/regguts.h
new file mode 100644
index 00000000000..a08abb8af18
--- /dev/null
+++ b/src/include/regex/regguts.h
@@ -0,0 +1,393 @@
+/*
+ * Internal interface definitions, etc., for the reg package
+ *
+ * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
+ * 
+ * Development of this software was funded, in part, by Cray Research Inc.,
+ * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
+ * Corporation, none of whom are responsible for the results.  The author
+ * thanks all of them. 
+ * 
+ * Redistribution and use in source and binary forms -- with or without
+ * modification -- are permitted for any purpose, provided that
+ * redistributions in source form retain this entire copyright notice and
+ * indicate the origin and nature of any modifications.
+ * 
+ * I'd appreciate being given credit for this package in the documentation
+ * of software which uses it, but that is not a requirement.
+ * 
+ * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
+ * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
+ * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
+ * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
+ * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
+ * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
+ * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+ * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+ * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+ * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ * $Id: regguts.h,v 1.1 2003/02/05 17:41:32 tgl Exp $
+ */
+
+
+
+/*
+ * Environmental customization.  It should not (I hope) be necessary to
+ * alter the file you are now reading -- regcustom.h should handle it all,
+ * given care here and elsewhere.
+ */
+#include "regcustom.h"
+
+
+
+/*
+ * Things that regcustom.h might override.
+ */
+
+/* assertions: compiled out via NDEBUG unless REG_DEBUG is defined */
+#ifndef assert
+#	ifndef REG_DEBUG
+#	define	NDEBUG		/* no assertions */
+#	endif
+#include <assert.h>
+#endif
+
+/* voids */
+#ifndef DISCARD
+#define	DISCARD	void			/* for throwing values away */
+#endif
+#ifndef VS
+#define	VS(x)	((void *)(x))		/* cast something to generic ptr */
+#endif
+
+/* function-pointer declarator: FUNCPTR(f, (int)) expands to (*f) (int) */
+#ifndef FUNCPTR
+#define	FUNCPTR(name, args)	(*name) args
+#endif
+
+/* memory allocation (regcustom.h defines all three, so these are fallbacks) */
+#ifndef MALLOC
+#define	MALLOC(n)	malloc(n)
+#endif
+#ifndef REALLOC
+#define	REALLOC(p, n)	realloc(VS(p), n)
+#endif
+#ifndef FREE
+#define	FREE(p)		free(VS(p))
+#endif
+
+/* want size of a char in bits, and max value in bounded quantifiers */
+#ifndef CHAR_BIT
+#include <limits.h>
+#endif
+#ifndef _POSIX2_RE_DUP_MAX
+#define	_POSIX2_RE_DUP_MAX	255	/* normally from <limits.h> */
+#endif
+
+
+
+/*
+ * misc
+ */
+
+#define	NOTREACHED	0
+#define	xxx		1
+
+#define	DUPMAX	_POSIX2_RE_DUP_MAX	/* max count in a bounded quantifier */
+#define	INFINITY	(DUPMAX+1)	/* one past DUPMAX; NB: C99 <math.h> also defines INFINITY */
+
+#define	REMAGIC	0xfed7		/* magic number for main struct */
+
+
+
+/*
+ * debugging facilities
+ */
+#ifdef REG_DEBUG
+/* FDEBUG does finite-state tracing; arglist is a parenthesized printf arg list, and v must be in scope */
+#define	FDEBUG(arglist)	{ if (v->eflags&REG_FTRACE) printf arglist; }
+/* MDEBUG does higher-level tracing; same calling convention as FDEBUG */
+#define	MDEBUG(arglist)	{ if (v->eflags&REG_MTRACE) printf arglist; }
+#else
+#define	FDEBUG(arglist)	{}
+#define	MDEBUG(arglist)	{}
+#endif
+
+
+
+/*
+ * bitmap manipulation
+ */
+#define	UBITS	(CHAR_BIT * sizeof(unsigned))	/* bits per unsigned */
+#define	BSET(uv, sn)	((uv)[(sn)/UBITS] |= (unsigned)1 << ((sn)%UBITS))	/* set bit sn in vector uv */
+#define	ISBSET(uv, sn)	((uv)[(sn)/UBITS] & ((unsigned)1 << ((sn)%UBITS)))	/* nonzero (not nec. 1) if set */
+
+
+
+/*
+ * We dissect a chr into byts for colormap table indexing.  Here we define
+ * a byt, which will be the same as a byte on most machines...  The exact
+ * size of a byt is not critical, but about 8 bits is good, and extraction
+ * of 8-bit chunks is sometimes especially fast.
+ */
+#ifndef BYTBITS
+#define	BYTBITS	8		/* bits in a byt */
+#endif
+#define	BYTTAB	(1<<BYTBITS)	/* size of table with one entry per byt value */
+#define	BYTMASK	(BYTTAB-1)	/* bit mask for byt */
+#define	NBYTS	((CHRBITS+BYTBITS-1)/BYTBITS)	/* byts per chr, rounded up */
+/* GETCOLOR(), below, is defined only for NBYTS of 1, 2, or 4 */
+
+
+
+/*
+ * As soon as possible, we map chrs into equivalence classes -- "colors" --
+ * which are of much more manageable number.
+ */
+typedef short color;		/* colors of characters */
+typedef int pcolor;		/* what color promotes to (integer promotion of short) */
+#define	COLORLESS	(-1)	/* impossible color */
+#define	WHITE		0	/* default color, parent of all others */
+
+
+
+/*
+ * A colormap is a tree -- more precisely, a DAG -- indexed at each level
+ * by a byt of the chr, to map the chr to a color efficiently.  Because
+ * lower sections of the tree can be shared, it can exploit the usual
+ * sparseness of such a mapping table.  The tree is always NBYTS levels
+ * deep (in the past it was shallower during construction but was "filled"
+ * to full depth at the end of that); areas that are unaltered as yet point
+ * to "fill blocks" which are entirely WHITE in color.
+ */
+
+/* the tree itself: a block is either all colors or all child pointers */
+struct colors {			/* block indexed by a byt, yielding colors */
+	color ccolor[BYTTAB];
+};
+struct ptrs {			/* block indexed by a byt, yielding lower blocks */
+	union tree *pptr[BYTTAB];
+};
+union tree {
+	struct colors colors;
+	struct ptrs ptrs;
+};
+#define	tcolor	colors.ccolor	/* shorthand: t->tcolor[b] */
+#define	tptr	ptrs.pptr	/* shorthand: t->tptr[b] */
+
+/* internal per-color structure for the color machinery */
+struct colordesc {
+	uchr nchrs;		/* number of chars of this color */
+	color sub;		/* open subcolor (if any); free chain ptr */
+#		define	NOSUB	COLORLESS
+	struct arc *arcs;	/* color chain */
+	int flags;		/* FREECOL and/or PSEUDO, below */
+#		define	FREECOL	01	/* currently free */
+#		define	PSEUDO	02	/* pseudocolor, no real chars */
+#	define	UNUSEDCOLOR(cd)	((cd)->flags&FREECOL)	/* nonzero if cd is free */
+	union tree *block;	/* block of solid color, if any */
+};
+
+/* the color map itself */
+struct colormap {
+	int magic;
+#		define	CMMAGIC	0x876
+	struct vars *v;			/* for compile error reporting */
+	size_t ncds;			/* number of colordescs */
+	size_t max;			/* highest in use */
+	color free;			/* beginning of free chain (if non-0) */
+	struct colordesc *cd;		/* the colordescs; presumably cdspace until outgrown -- confirm */
+#	define	CDEND(cm)	(&(cm)->cd[(cm)->max + 1])	/* one past highest in use */
+#		define	NINLINECDS	((size_t)10)	/* size of cdspace[] */
+	struct colordesc cdspace[NINLINECDS];
+	union tree tree[NBYTS];		/* tree top, plus fill blocks */
+};
+
+/* optimization magic to do fast chr->color mapping; B0 is the low-order byt */
+#define	B0(c)	((c) & BYTMASK)
+#define	B1(c)	(((c)>>BYTBITS) & BYTMASK)
+#define	B2(c)	(((c)>>(2*BYTBITS)) & BYTMASK)
+#define	B3(c)	(((c)>>(3*BYTBITS)) & BYTMASK)
+#if NBYTS == 1
+#define	GETCOLOR(cm, c)	((cm)->tree->tcolor[B0(c)])
+#endif
+/* beware, for NBYTS>1, GETCOLOR() is unsafe -- 2nd arg used repeatedly; no NBYTS==3 form exists */
+#if NBYTS == 2
+#define	GETCOLOR(cm, c)	((cm)->tree->tptr[B1(c)]->tcolor[B0(c)])
+#endif
+#if NBYTS == 4
+#define	GETCOLOR(cm, c)	((cm)->tree->tptr[B3(c)]->tptr[B2(c)]->tptr[B1(c)]->tcolor[B0(c)])
+#endif
+
+
+
+/*
+ * Interface definitions for locale-interface functions in regc_locale.c.
+ * Multi-character collating elements (MCCEs) cause most of the trouble.
+ */
+struct cvec {
+	int nchrs;		/* number of chrs */
+	int chrspace;		/* number of chrs possible */
+	chr *chrs;		/* pointer to vector of chrs */
+	int nranges;		/* number of ranges (chr pairs) */
+	int rangespace;		/* number of ranges (chr pairs) possible */
+	chr *ranges;		/* pointer to vector of chr pairs */
+	int nmcces;		/* number of MCCEs */
+	int mccespace;		/* number of MCCEs possible */
+	int nmccechrs;		/* number of chrs used for MCCEs */
+	chr *mcces[1];		/* pointers to 0-terminated MCCEs */
+				/* and both batches of chrs are on the end */
+};
+
+/* caution:  this value cannot be changed easily */
+#define	MAXMCCE	2		/* length of longest MCCE, in chrs */
+
+
+
+/*
+ * definitions for NFA internal representation
+ *
+ * Having a "from" pointer within each arc may seem redundant, but it
+ * saves a lot of hassle.
+ */
+struct state;
+
+struct arc {
+	int type;		/* kind of arc; ARCFREE presumably marks an unused one -- confirm */
+#		define	ARCFREE	'\0'
+	color co;		/* the arc's color */
+	struct state *from;	/* where it's from (and contained within) */
+	struct state *to;	/* where it's to */
+	struct arc *outchain;	/* *from's outs chain or free chain */
+#	define	freechain	outchain	/* same field, named for its free-chain use */
+	struct arc *inchain;	/* *to's ins chain */
+	struct arc *colorchain;	/* color's arc chain */
+};
+
+struct arcbatch {		/* for bulk allocation of arcs */
+	struct arcbatch *next;	/* next batch in a singly linked list */
+#	define	ABSIZE	10	/* arcs per batch */
+	struct arc a[ABSIZE];
+};
+
+struct state {
+	int no;			/* state number; presumably FREESTATE while free -- confirm */
+#		define	FREESTATE	(-1)
+	char flag;		/* marks special states */
+	int nins;		/* number of inarcs */
+	struct arc *ins;	/* chain of inarcs */
+	int nouts;		/* number of outarcs */
+	struct arc *outs;	/* chain of outarcs */
+	struct arc *free;	/* chain of free arcs */
+	struct state *tmp;	/* temporary for traversal algorithms */
+	struct state *next;	/* chain for traversing all */
+	struct state *prev;	/* back chain */
+	struct arcbatch oas;	/* first arcbatch, avoid malloc in easy case */
+	int noas;		/* number of arcs used in first arcbatch (oas) */
+};
+
+struct nfa {
+	struct state *pre;	/* pre-initial state */
+	struct state *init;	/* initial state */
+	struct state *final;	/* final state */
+	struct state *post;	/* post-final state */
+	int nstates;		/* for numbering states */
+	struct state *states;	/* state-chain header */
+	struct state *slast;	/* tail of the chain */
+	struct state *free;	/* free list of states */
+	struct colormap *cm;	/* the color map */
+	color bos[2];		/* colors, if any, assigned to BOS and BOL */
+	color eos[2];		/* colors, if any, assigned to EOS and EOL */
+	struct vars *v;		/* simplifies compile error reporting */
+	struct nfa *parent;	/* parent NFA, if any */
+};
+
+
+
+/*
+ * definitions for compacted NFA
+ */
+struct carc {
+	color co;		/* COLORLESS is list terminator */
+	int to;			/* number of the state the arc leads to */
+};
+
+struct cnfa {
+	int nstates;		/* number of states */
+	int ncolors;		/* number of colors */
+	int flags;
+#		define	HASLACONS	01	/* uses lookahead constraints */
+	int pre;		/* setup state number */
+	int post;		/* teardown state number */
+	color bos[2];		/* colors, if any, assigned to BOS and BOL */
+	color eos[2];		/* colors, if any, assigned to EOS and EOL */
+	struct carc **states;	/* vector of pointers to outarc lists */
+	struct carc *arcs;	/* the area for the lists */
+};
+#define	ZAPCNFA(cnfa)	((cnfa).nstates = 0)	/* mark a cnfa as empty */
+#define	NULLCNFA(cnfa)	((cnfa).nstates == 0)	/* is the cnfa empty? */
+
+
+
+/*
+ * subexpression tree
+ */
+struct subre {
+	char op;		/* '|', '.' (concat), 'b' (backref), '(', '=' */
+	char flags;		/* octal bits defined below */
+#		define	LONGER	01	/* prefers longer match */
+#		define	SHORTER	02	/* prefers shorter match */
+#		define	MIXED	04	/* mixed preference below */
+#		define	CAP	010	/* capturing parens below */
+#		define	BACKR	020	/* back reference below */
+#		define	INUSE	0100	/* in use in final tree */
+#		define	LOCAL	03	/* bits which may not propagate up (LONGER|SHORTER) */
+#		define	LMIX(f)	((f)<<2)	/* LONGER -> MIXED */
+#		define	SMIX(f)	((f)<<1)	/* SHORTER -> MIXED */
+#		define	UP(f)	(((f)&~LOCAL) | (LMIX(f) & SMIX(f) & MIXED))	/* drop LOCAL bits; add MIXED iff both were set */
+#		define	MESSY(f)	((f)&(MIXED|CAP|BACKR))	/* nonzero if any of these is set */
+#		define	PREF(f)	((f)&LOCAL)	/* just the LONGER/SHORTER bits */
+#		define	PREF2(f1, f2)	((PREF(f1) != 0) ? PREF(f1) : PREF(f2))	/* f1's preference if any, else f2's */
+#		define	COMBINE(f1, f2)	(UP((f1)|(f2)) | PREF2(f1, f2))	/* UP() of both, plus PREF2() */
+	short retry;		/* index into retry memory */
+	int subno;		/* subexpression number (for 'b' and '(') */
+	short min;		/* min repetitions, for backref only */
+	short max;		/* max repetitions, for backref only */
+	struct subre *left;	/* left child, if any (also freelist chain) */
+	struct subre *right;	/* right child, if any */
+	struct state *begin;	/* outarcs from here... */
+	struct state *end;	/* ...ending in inarcs here */
+	struct cnfa cnfa;	/* compacted NFA, if any */
+	struct subre *chain;	/* for bookkeeping and error cleanup */
+};
+
+
+
+/*
+ * table of function pointers for generic manipulation functions
+ * A regex_t's re_fns points to one of these (re_fns is declared char *).
+ */
+struct fns {
+	void FUNCPTR(free, (regex_t *));	/* i.e. void (*free) (regex_t *) */
+};
+
+
+
+/*
+ * the insides of a regex_t, hidden behind its re_guts pointer (a char *)
+ */
+struct guts {
+	int magic;
+#		define	GUTSMAGIC	0xfed9
+	int cflags;		/* copy of compile flags */
+	long info;		/* copy of re_info */
+	size_t nsub;		/* copy of re_nsub */
+	struct subre *tree;	/* subexpression tree */
+	struct cnfa search;	/* for fast preliminary search */
+	int ntree;		/* presumably a count of subre nodes in tree -- confirm */
+	struct colormap cmap;	/* the color map */
+	int FUNCPTR(compare, (const chr *, const chr *, size_t));	/* i.e. int (*compare) (...) */
+	struct subre *lacons;	/* lookahead-constraint vector */
+	int nlacons;		/* size of lacons */
+};
diff --git a/src/include/regex/utils.h b/src/include/regex/utils.h
deleted file mode 100644
index 5831122f7c9..00000000000
--- a/src/include/regex/utils.h
+++ /dev/null
@@ -1,60 +0,0 @@
-/*-
- * Copyright (c) 1992, 1993, 1994 Henry Spencer.
- * Copyright (c) 1992, 1993, 1994
- *		The Regents of the University of California.  All rights reserved.
- *
- * This code is derived from software contributed to Berkeley by
- * Henry Spencer.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- * 1. Redistributions of source code must retain the above copyright
- *	  notice, this list of conditions and the following disclaimer.
- * 2. Redistributions in binary form must reproduce the above copyright
- *	  notice, this list of conditions and the following disclaimer in the
- *	  documentation and/or other materials provided with the distribution.
- * 3. All advertising materials mentioning features or use of this software
- *	  must display the following acknowledgement:
- *		This product includes software developed by the University of
- *		California, Berkeley and its contributors.
- * 4. Neither the name of the University nor the names of its contributors
- *	  may be used to endorse or promote products derived from this software
- *	  without specific prior written permission.
- *
- * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
- * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
- * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
- * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
- * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
- * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
- * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
- * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
- * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
- * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- *
- *		@(#)utils.h		8.3 (Berkeley) 3/20/94
- */
-
-#ifndef _REGEX_UTILS_H
-#define _REGEX_UTILS_H
-
-#include <limits.h>
-
-/* utility definitions */
-#define DUPMAX			100000000		/* xxx is this right? */
-#define INFINITY		(DUPMAX + 1)
-
-#define NC				(SHRT_MAX - SHRT_MIN + 1)
-
-typedef unsigned char uch;
-
-/* switch off assertions (if not already off) if no REDEBUG */
-#ifndef REDEBUG
-#ifndef NDEBUG
-#define NDEBUG					/* no assertions please */
-#endif
-#endif
-
-#endif   /* _REGEX_UTILS_H */
-- 
GitLab