diff --git a/doc/src/sgml/func.sgml b/doc/src/sgml/func.sgml index b8be507f2df34457bbbc9fbd527b93432871f205..084db0d40e5fec669df0bba4179cb40f1e4238e0 100644 --- a/doc/src/sgml/func.sgml +++ b/doc/src/sgml/func.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.369 2007/02/20 19:59:04 momjian Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/func.sgml,v 1.370 2007/03/20 05:44:59 neilc Exp $ --> <chapter id="functions"> <title>Functions and Operators</title> @@ -1468,17 +1468,52 @@ </row> <row> - <entry><literal><function>regexp_replace</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type>, <parameter>replacement</parameter> <type>text</type> [,<parameter>flags</parameter> <type>text</type>])</literal></entry> + <entry><literal><function>regexp_matches</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry> + <entry><type>setof text[]</type></entry> + <entry> + Return all capture groups resulting from matching POSIX regular + expression against the <parameter>string</parameter>. See + <xref linkend="functions-posix-regexp"> for more information. + </entry> + <entry><literal>regexp_matches('foobarbequebaz', '(bar)(beque)')</literal></entry> + <entry><literal>{bar,beque}</literal></entry> + </row> + + <row> + <entry><literal><function>regexp_replace</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type>, <parameter>replacement</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry> <entry><type>text</type></entry> <entry> Replace substring matching POSIX regular expression. See - <xref linkend="functions-matching"> for more information on pattern - matching. + <xref linkend="functions-posix-regexp"> for more information. </entry> <entry><literal>regexp_replace('Thomas', '.[mN]a.', 'M')</literal></entry> <entry><literal>ThM</literal></entry> </row> + <row> + <entry><literal><function>regexp_split_to_array</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type> ])</literal></entry> + <entry><type>text[]</type></entry> + <entry> + Split <parameter>string</parameter> using POSIX regular expression as + the delimiter. See <xref linkend="functions-posix-regexp"> for more + information. + </entry> + <entry><literal>regexp_split_to_array('hello world', E'\\s+')</literal></entry> + <entry><literal>{hello,world}</literal></entry> + </row> + + <row> + <entry><literal><function>regexp_split_to_table</function>(<parameter>string</parameter> <type>text</type>, <parameter>pattern</parameter> <type>text</type> [, <parameter>flags</parameter> <type>text</type>])</literal></entry> + <entry><type>setof text</type></entry> + <entry> + Split <parameter>string</parameter> using POSIX regular expression as + the delimiter. See <xref linkend="functions-posix-regexp"> for more + information. + </entry> + <entry><literal>regexp_split_to_table('hello world', E'\\s+')</literal></entry> + <entry><literal>hello</literal><para><literal>world</literal></para> (2 rows)</entry> + </row> + <row> <entry><literal><function>repeat</function>(<parameter>string</parameter> <type>text</type>, <parameter>number</parameter> <type>int</type>)</literal></entry> <entry><type>text</type></entry> @@ -2883,9 +2918,6 @@ cast(-44 as bit(12)) <lineannotation>111111010100</lineannotation> <indexterm> <primary>substring</primary> </indexterm> - <indexterm> - <primary>regexp_replace</primary> - </indexterm> <synopsis> <replaceable>string</replaceable> SIMILAR TO <replaceable>pattern</replaceable> <optional>ESCAPE <replaceable>escape-character</replaceable></optional> @@ -3004,6 +3036,21 @@ substring('foobar' from '#"o_b#"%' for '#') <lineannotation>NULL</lineannotat <primary>regular expression</primary> <seealso>pattern matching</seealso> </indexterm> + <indexterm> + <primary>substring</primary> + </indexterm> + <indexterm> + <primary>regexp_replace</primary> + </indexterm> + <indexterm> + <primary>regexp_matches</primary> + </indexterm> + <indexterm> + <primary>regexp_split_to_table</primary> + </indexterm> + <indexterm> + <primary>regexp_split_to_array</primary> + </indexterm> <para> <xref linkend="functions-posix-table"> lists the available @@ -3134,7 +3181,10 @@ substring('foobar' from 'o(.)b') <lineannotation>o</lineannotation> string containing zero or more single-letter flags that change the function's behavior. Flag <literal>i</> specifies case-insensitive matching, while flag <literal>g</> specifies replacement of each matching - substring rather than only the first one. + substring rather than only the first one. Other supported flags are + <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and + <literal>x</>, whose meanings correspond to those shown in + <xref linkend="posix-embedded-options-table">. </para> <para> @@ -3149,6 +3199,124 @@ regexp_replace('foobarbaz', 'b(..)', E'X\\1Y', 'g') </programlisting> </para> + <para> + The <function>regexp_matches</> function returns all of the capture + groups resulting from matching a POSIX regular expression pattern. + It has the syntax + <function>regexp_matches</function>(<replaceable>string</>, <replaceable>pattern</> + <optional>, <replaceable>flags</> </optional>). + If there is no match to the <replaceable>pattern</>, the function returns no rows. + If there is a match, the function returns the contents of all of the capture groups + in a text array, or if there were no capture groups in the pattern, it returns the + contents of the entire match as a single-element text array. + The <replaceable>flags</> parameter is an optional text + string containing zero or more single-letter flags that change the + function's behavior. Flag <literal>i</> specifies case-insensitive + matching, while flag <literal>g</> causes the return of each matching + substring rather than only the first one. Other supported + flags are <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and + <literal>x</>, whose meanings are described in + <xref linkend="posix-embedded-options-table">. + </para> + + <para> + Some examples: +<programlisting> +SELECT regexp_matches('foobarbequebaz', '(bar)(beque)'); + regexp_matches +---------------- + {bar,beque} +(1 row) + +SELECT regexp_matches('foobarbequebazilbarfbonk', '(b[^b]+)(b[^b]+)', 'g'); + regexp_matches +---------------- + {bar,beque} + {bazil,barf} +(2 rows) + +SELECT regexp_matches('foobarbequebaz', 'barbeque'); + regexp_matches +---------------- + {barbeque} +(1 row) +</programlisting> + </para> + + <para> + The <function>regexp_split_to_table</> function splits a string using a POSIX + regular expression pattern as a delimiter. It has the syntax + <function>regexp_split_to_table</function>(<replaceable>string</>, <replaceable>pattern</> + <optional>, <replaceable>flags</> </optional>). + If there is no match to the <replaceable>pattern</>, the function returns the + <replaceable>string</>. If there is at least one match, for each match it returns + the text from the end of the last match (or the beginning of the string) + to the beginning of the match. When there are no more matches, it + returns the text from the end of the last match to the end of the string. + The <replaceable>flags</> parameter is an optional text string containing + zero or more single-letter flags that change the function's behavior. + <function>regexp_split_to_table</function> supports the flags <literal>i</>, + <literal>m</>, <literal>n</>, <literal>p</>, <literal>w</> and + <literal>x</>, whose meanings are described in + <xref linkend="posix-embedded-options-table">. + </para> + + <para> + The <function>regexp_split_to_array</> function behaves the same as + <function>regexp_split_to_table</>, except that <function>regexp_split_to_array</> + returns its results as a <type>text[]</>. It has the syntax + <function>regexp_split_to_array</function>(<replaceable>string</>, <replaceable>pattern</> + <optional>, <replaceable>flags</> </optional>). + The parameters are the same as for <function>regexp_split_to_table</>. + </para> + + <para> + Some examples: +<programlisting> + +SELECT foo FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', E'\\\s+') AS foo; + foo +-------- + the + quick + brown + fox + jumped + over + the + lazy + dog +(9 rows) + +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', E'\\s+'); + regexp_split_to_array +------------------------------------------------ + {the,quick,brown,fox,jumped,over,the,lazy,dog} +(1 row) + +SELECT foo FROM regexp_split_to_table('the quick brown fox', E'\\s*') AS foo; + foo +----- + t + h + e + q + u + i + c + k + b + r + o + w + n + f + o + x +(16 rows) +</programlisting> + </para> + <para> <productname>PostgreSQL</productname>'s regular expressions are implemented using a package written by Henry Spencer. Much of diff --git a/src/backend/utils/adt/regexp.c b/src/backend/utils/adt/regexp.c index 6a7a5fcc13be2d29721e7720e10ded16a4eeaa26..9f8dd0323a9995bcd9d555f8c5569891b5a912e0 100644 --- a/src/backend/utils/adt/regexp.c +++ b/src/backend/utils/adt/regexp.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.69 2007/02/27 23:48:08 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.70 2007/03/20 05:44:59 neilc Exp $ * * Alistair Crooks added the code for the regex caching * agc - cached the regular expressions used - there's a good chance @@ -29,9 +29,12 @@ */ #include "postgres.h" +#include "access/heapam.h" +#include "funcapi.h" #include "regex/regex.h" #include "utils/builtins.h" #include "utils/guc.h" +#include "utils/lsyscache.h" /* GUC-settable flavor parameter */ @@ -75,9 +78,56 @@ typedef struct cached_re_str regex_t cre_re; /* the compiled regular expression */ } cached_re_str; +typedef struct re_comp_flags +{ + int cflags; + bool glob; +} re_comp_flags; + +typedef struct regexp_matches_ctx +{ + text *orig_str; + size_t orig_len; + pg_wchar *wide_str; + size_t wide_len; + regex_t *cpattern; + regmatch_t *pmatch; + size_t offset; + + re_comp_flags flags; + + /* text type info */ + Oid param_type; + int16 typlen; + bool typbyval; + char typalign; +} regexp_matches_ctx; + +typedef struct regexp_split_ctx +{ + text *orig_str; + size_t orig_len; + pg_wchar *wide_str; + size_t wide_len; + regex_t *cpattern; + regmatch_t match; + size_t offset; + re_comp_flags flags; +} regexp_split_ctx; + + static int num_res = 0; /* # of cached re's */ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ +static regexp_matches_ctx *setup_regexp_matches(FunctionCallInfo fcinfo, + text *orig_str, text *pattern, + text *flags); +static ArrayType *perform_regexp_matches(regexp_matches_ctx *matchctx); + +static regexp_split_ctx *setup_regexp_split(text *str, text *pattern, + text *flags); +static Datum get_next_split(regexp_split_ctx *splitctx); + /* * RE_compile_and_cache - compile a RE, caching if possible @@ -88,7 +138,7 @@ static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */ * cflags --- compile options for the pattern * * Pattern is given in the database encoding. We internally convert to - * array of pg_wchar which is what Spencer's regex package wants. + * an array of pg_wchar, which is what Spencer's regex package wants. */ static regex_t * RE_compile_and_cache(text *text_re, int cflags) @@ -191,48 +241,36 @@ RE_compile_and_cache(text *text_re, int cflags) } /* - * RE_compile_and_execute - compile and execute a RE + * RE_wchar_execute - execute a RE * * Returns TRUE on match, FALSE on no match * - * text_re --- the pattern, expressed as an *untoasted* TEXT object - * dat --- the data to match against (need not be null-terminated) - * dat_len --- the length of the data string - * cflags --- compile options for the pattern + * re --- the compiled pattern as returned by RE_compile_and_cache + * data --- the data to match against (need not be null-terminated) + * data_len --- the length of the data string + * start_search -- the offset in the data to start searching * nmatch, pmatch --- optional return area for match details * - * Both pattern and data are given in the database encoding. We internally - * convert to array of pg_wchar which is what Spencer's regex package wants. + * Data is given as array of pg_wchar which is what Spencer's regex package + * wants. */ static bool -RE_compile_and_execute(text *text_re, char *dat, int dat_len, - int cflags, int nmatch, regmatch_t *pmatch) +RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len, + size_t start_search, int nmatch, regmatch_t *pmatch) { - pg_wchar *data; - size_t data_len; int regexec_result; - regex_t *re; char errMsg[100]; - /* Convert data string to wide characters */ - data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); - data_len = pg_mb2wchar_with_len(dat, data, dat_len); - - /* Compile RE */ - re = RE_compile_and_cache(text_re, cflags); - /* Perform RE match and return result */ regexec_result = pg_regexec(re, data, data_len, - 0, + start_search, NULL, /* no details */ nmatch, pmatch, 0); - pfree(data); - if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH) { /* re failed??? */ @@ -245,13 +283,116 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len, return (regexec_result == REG_OKAY); } +/* + * RE_execute - execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * re --- the compiled pattern as returned by RE_compile_and_cache + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * nmatch, pmatch --- optional return area for match details + * + * Data is given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. + */ +static bool +RE_execute(regex_t *re, char *dat, int dat_len, + int nmatch, regmatch_t *pmatch) +{ + pg_wchar *data; + size_t data_len; + bool match; + + /* Convert data string to wide characters */ + data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar)); + data_len = pg_mb2wchar_with_len(dat, data, dat_len); + + /* Perform RE match and return result */ + match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch); + pfree(data); + return match; +} + +/* + * RE_compile_and_execute - compile and execute a RE + * + * Returns TRUE on match, FALSE on no match + * + * text_re --- the pattern, expressed as an *untoasted* TEXT object + * dat --- the data to match against (need not be null-terminated) + * dat_len --- the length of the data string + * cflags --- compile options for the pattern + * nmatch, pmatch --- optional return area for match details + * + * Both pattern and data are given in the database encoding. We internally + * convert to array of pg_wchar which is what Spencer's regex package wants. + */ +static bool +RE_compile_and_execute(text *text_re, char *dat, int dat_len, + int cflags, int nmatch, regmatch_t *pmatch) +{ + regex_t *re; + + /* Compile RE */ + re = RE_compile_and_cache(text_re, cflags); + + return RE_execute(re, dat, dat_len, nmatch, pmatch); +} + +static void +parse_re_comp_flags(re_comp_flags *flags, text *opts) +{ + MemSet(flags, 0, sizeof(re_comp_flags)); + flags->cflags = regex_flavor; + + if (opts) + { + char *opt_p = VARDATA(opts); + size_t opt_len = VARSIZE(opts) - VARHDRSZ; + int i; + + for (i = 0; i < opt_len; i++) + { + switch (opt_p[i]) + { + case 'g': + flags->glob = true; + break; + case 'i': + flags->cflags |= REG_ICASE; + break; + case 'm': + case 'n': + flags->cflags |= REG_NEWLINE; + break; + case 'p': + flags->cflags |= REG_NLSTOP; + flags->cflags &= ~REG_NLANCH; + break; + case 'w': + flags->cflags &= ~REG_NLSTOP; + flags->cflags |= REG_NLANCH; + break; + case 'x': + flags->cflags |= REG_EXPANDED; + break; + default: + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid regexp option: %c", opt_p[i]))); + break; + } + } + } +} + /* * assign_regex_flavor - GUC hook to validate and set REGEX_FLAVOR */ const char * -assign_regex_flavor(const char *value, - bool doit, GucSource source) +assign_regex_flavor(const char *value, bool doit, GucSource source) { if (pg_strcasecmp(value, "advanced") == 0) { @@ -469,39 +610,14 @@ textregexreplace(PG_FUNCTION_ARGS) text *p = PG_GETARG_TEXT_P(1); text *r = PG_GETARG_TEXT_P(2); text *opt = PG_GETARG_TEXT_P(3); - char *opt_p = VARDATA(opt); - int opt_len = (VARSIZE(opt) - VARHDRSZ); - int i; - bool glob = false; - bool ignorecase = false; regex_t *re; + re_comp_flags flags; - /* parse options */ - for (i = 0; i < opt_len; i++) - { - switch (opt_p[i]) - { - case 'i': - ignorecase = true; - break; - case 'g': - glob = true; - break; - default: - ereport(ERROR, - (errcode(ERRCODE_INVALID_PARAMETER_VALUE), - errmsg("invalid option of regexp_replace: %c", - opt_p[i]))); - break; - } - } + parse_re_comp_flags(&flags, opt); - if (ignorecase) - re = RE_compile_and_cache(p, regex_flavor | REG_ICASE); - else - re = RE_compile_and_cache(p, regex_flavor); + re = RE_compile_and_cache(p, flags.cflags); - PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, glob)); + PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob)); } /* similar_escape() @@ -625,6 +741,361 @@ similar_escape(PG_FUNCTION_ARGS) PG_RETURN_TEXT_P(result); } +#define PG_GETARG_TEXT_P_IF_EXISTS(_n) \ + (PG_NARGS() > _n ? PG_GETARG_TEXT_P(_n) : NULL) + +Datum +regexp_matches(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + MemoryContext oldcontext; + regexp_matches_ctx *matchctx; + + if (SRF_IS_FIRSTCALL()) + { + text *pattern = PG_GETARG_TEXT_P(1); + text *flags = PG_GETARG_TEXT_P_IF_EXISTS(2); + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + /* be sure to copy the input string into the multi-call ctx */ + matchctx = setup_regexp_matches(fcinfo, PG_GETARG_TEXT_P_COPY(0), + pattern, flags); + + MemoryContextSwitchTo(oldcontext); + funcctx->user_fctx = (void *) matchctx; + + /* + * Avoid run-away function by making sure we never iterate + * more than the length of the text + 1 (the number of matches + * an empty pattern will make is length + 1) + */ + if (matchctx->flags.glob) + funcctx->max_calls = matchctx->wide_len + 1; + else + funcctx->max_calls = 0; + } + + funcctx = SRF_PERCALL_SETUP(); + matchctx = (regexp_matches_ctx *) funcctx->user_fctx; + + if (funcctx->call_cntr > funcctx->max_calls) + { + /* + * If max_calls == 0, then we are doing a non-global match, we + * should stop now, no problem. Otherwise, if we exceed + * max_calls something really wonky is going on, since it is + * returning more matches than there are characters in the + * string, which should not happen + */ + if (funcctx->max_calls != 0) + elog(ERROR, "set returning match function terminated after iterating %d times", + funcctx->call_cntr); + + SRF_RETURN_DONE(funcctx); + } + + if (matchctx->offset < matchctx->wide_len) + { + ArrayType *result_ary; + + if (matchctx->pmatch[0].rm_so == matchctx->pmatch[0].rm_eo) + matchctx->offset++; + + result_ary = perform_regexp_matches(matchctx); + if (result_ary != NULL) + { + matchctx->offset = matchctx->pmatch[0].rm_eo; + SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary)); + } + /* else fall through and return done */ + } + + SRF_RETURN_DONE(funcctx); +} + +Datum +regexp_matches_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_matches(fcinfo); +} + +static regexp_matches_ctx * +setup_regexp_matches(FunctionCallInfo fcinfo, text *orig_str, text *pattern, text *flags) +{ + regexp_matches_ctx *matchctx = palloc(sizeof(regexp_matches_ctx)); + + matchctx->orig_str = orig_str; + matchctx->orig_len = VARSIZE(matchctx->orig_str) - VARHDRSZ; + + parse_re_comp_flags(&matchctx->flags, flags); + + matchctx->cpattern = RE_compile_and_cache(pattern, matchctx->flags.cflags); + matchctx->pmatch = palloc(sizeof(regmatch_t) * (matchctx->cpattern->re_nsub + 1)); + matchctx->offset = 0; + + /* get text type oid, too lazy to do it some other way */ + matchctx->param_type = get_fn_expr_argtype(fcinfo->flinfo, 0); + get_typlenbyvalalign(matchctx->param_type, &matchctx->typlen, + &matchctx->typbyval, &matchctx->typalign); + + matchctx->wide_str = palloc(sizeof(pg_wchar) * (matchctx->orig_len + 1)); + matchctx->wide_len = pg_mb2wchar_with_len(VARDATA(matchctx->orig_str), + matchctx->wide_str, matchctx->orig_len); + + matchctx->pmatch[0].rm_so = -1; + /* both < 0 but not equal */ + matchctx->pmatch[0].rm_eo = -2; + + return matchctx; +} + +static ArrayType * +perform_regexp_matches(regexp_matches_ctx *matchctx) +{ + Datum *elems; + bool *nulls; + Datum fullmatch; /* used to avoid a palloc if no matches */ + int ndims = 1; + int dims[1]; + int lbs[1] = {1}; + + if (RE_wchar_execute(matchctx->cpattern, + matchctx->wide_str, + matchctx->wide_len, + matchctx->offset, + matchctx->cpattern->re_nsub + 1, + matchctx->pmatch) == false) + return NULL; + + if (matchctx->cpattern->re_nsub > 0) + { + int i; + + elems = palloc(sizeof(Datum) * matchctx->cpattern->re_nsub); + nulls = palloc(sizeof(bool) * matchctx->cpattern->re_nsub); + dims[0] = matchctx->cpattern->re_nsub; + + for (i = 0; i < matchctx->cpattern->re_nsub; i++) + { + int so = matchctx->pmatch[i + 1].rm_so; + int eo = matchctx->pmatch[i + 1].rm_eo; + + if (so < 0 || eo < 0) + { + elems[i] = 0; + nulls[i] = true; + } + else + { + elems[i] = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so)); + nulls[i] = false; + } + } + } + else + { + int so = matchctx->pmatch[0].rm_so; + int eo = matchctx->pmatch[0].rm_eo; + + if (so < 0 || eo < 0) + elog(ERROR, "regexp code said it had a match, but did not return it"); + + fullmatch = DirectFunctionCall3(text_substr, + PointerGetDatum(matchctx->orig_str), + Int32GetDatum(so + 1), + Int32GetDatum(eo - so)); + + elems = &fullmatch; + nulls = NULL; + dims[0] = 1; + } + + return construct_md_array(elems, nulls, ndims, dims, lbs, + matchctx->param_type, matchctx->typlen, + matchctx->typbyval, matchctx->typalign); +} + +Datum +regexp_split_to_table(PG_FUNCTION_ARGS) +{ + FuncCallContext *funcctx; + regexp_split_ctx *splitctx; + + if (SRF_IS_FIRSTCALL()) + { + text *pattern = PG_GETARG_TEXT_P(1); + text *flags = PG_GETARG_TEXT_P_IF_EXISTS(2); + MemoryContext oldcontext; + + funcctx = SRF_FIRSTCALL_INIT(); + oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx); + + splitctx = setup_regexp_split(PG_GETARG_TEXT_P_COPY(0), pattern, flags); + + MemoryContextSwitchTo(oldcontext); + funcctx->user_fctx = (void *) splitctx; + + /* + * Avoid run-away function by making sure we never iterate + * more than the length of the text + */ + funcctx->max_calls = splitctx->wide_len; + } + + funcctx = SRF_PERCALL_SETUP(); + splitctx = (regexp_split_ctx *) funcctx->user_fctx; + + if (funcctx->call_cntr > funcctx->max_calls) + { + /* + * If we exceed wide_len something really wonky is going on, + * since it is returning more matches than there are + * characters in the string, which should not happen + */ + elog(ERROR, "set returning split function terminated after iterating %d times", + funcctx->call_cntr); + } + + if (splitctx->offset < splitctx->wide_len) + SRF_RETURN_NEXT(funcctx, get_next_split(splitctx)); + else + SRF_RETURN_DONE(funcctx); +} + +Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_split_to_table(fcinfo); +} + +Datum regexp_split_to_array(PG_FUNCTION_ARGS) +{ + ArrayBuildState *astate = NULL; + regexp_split_ctx *splitctx; + Oid param_type; + int nitems; + + splitctx = setup_regexp_split(PG_GETARG_TEXT_P(0), + PG_GETARG_TEXT_P(1), + PG_GETARG_TEXT_P_IF_EXISTS(2)); + + /* get text type oid, too lazy to do it some other way */ + param_type = get_fn_expr_argtype(fcinfo->flinfo, 0); + + for (nitems = 0; splitctx->offset < splitctx->wide_len; nitems++) + { + if (nitems > splitctx->wide_len) + elog(ERROR, "split function terminated after iterating %d times", + nitems); + + astate = accumArrayResult(astate, + get_next_split(splitctx), + false, + param_type, + CurrentMemoryContext); + } + + PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext)); +} + +Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS) +{ + return regexp_split_to_array(fcinfo); +} + +static regexp_split_ctx * +setup_regexp_split(text *str, text *pattern, text *flags) +{ + regexp_split_ctx *splitctx = palloc(sizeof(regexp_split_ctx)); + + splitctx->orig_str = str; + splitctx->orig_len = VARSIZE(splitctx->orig_str) - VARHDRSZ; + + parse_re_comp_flags(&splitctx->flags, flags); + if (splitctx->flags.glob) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("regexp_split does not support the global option"))); + + splitctx->cpattern = RE_compile_and_cache(pattern, splitctx->flags.cflags); + + splitctx->wide_str = palloc(sizeof(pg_wchar) * (splitctx->orig_len + 1)); + splitctx->wide_len = pg_mb2wchar_with_len(VARDATA(splitctx->orig_str), + splitctx->wide_str, + splitctx->orig_len); + + splitctx->offset = 0; + + splitctx->match.rm_so = -1; + /* both < 0 but not equal */ + splitctx->match.rm_eo = -2; + + return splitctx; +} + +static Datum +get_next_split(regexp_split_ctx *splitctx) +{ + regmatch_t *pmatch = &(splitctx->match); + + for (;;) + { + Datum result; + int startpos = splitctx->offset + 1; + + /* + * If the last match was zero-length, we need to push the + * offset forward to avoid matching the same place forever + */ + if (pmatch->rm_so == pmatch->rm_eo) + splitctx->offset++; + + if (RE_wchar_execute(splitctx->cpattern, + splitctx->wide_str, + splitctx->wide_len, + splitctx->offset, + 1, + pmatch)) + { + int length = splitctx->match.rm_so - startpos + 1; + + /* + * If we are trying to match at the beginning of the string and + * we got a zero-length match, or if we just matched where we + * left off last time, go around the loop again and increment + * the offset. If we have incremented the offset already and + * it matched at the new offset, that's ok + */ + if (length == 0) + continue; + + result = DirectFunctionCall3(text_substr, + PointerGetDatum(splitctx->orig_str), + Int32GetDatum(startpos), + Int32GetDatum(length)); + + /* set the offset to the end of this match for next time */ + splitctx->offset = pmatch->rm_eo; + + return result; + } + + /* no more matches, return rest of string */ + result = DirectFunctionCall2(text_substr_no_len, + PointerGetDatum(splitctx->orig_str), + Int32GetDatum(startpos)); + + /* so we know we're done next time through */ + splitctx->offset = splitctx->wide_len; + + return result; + } +} + /* * report whether regex_flavor is currently BASIC */ diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 2327e1cab5ef6265c7552f57e4ad97bd21ee1137..2bd1eda199d0f06f1e667eaf4bbee5a068a432a1 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -37,7 +37,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.392 2007/03/20 03:53:26 wieck Exp $ + * $PostgreSQL: pgsql/src/include/catalog/catversion.h,v 1.393 2007/03/20 05:45:00 neilc Exp $ * *------------------------------------------------------------------------- */ @@ -53,6 +53,6 @@ */ /* yyyymmddN */ -#define CATALOG_VERSION_NO 200703190 +#define CATALOG_VERSION_NO 200703201 #endif diff --git a/src/include/catalog/pg_proc.h b/src/include/catalog/pg_proc.h index 6148702f5ee85e7d3cdc50d461a778a46f4a5f49..8007129a1bbec02fb401a0ef0b98fe1d967ab661 100644 --- a/src/include/catalog/pg_proc.h +++ b/src/include/catalog/pg_proc.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.448 2007/03/16 17:57:36 mha Exp $ + * $PostgreSQL: pgsql/src/include/catalog/pg_proc.h,v 1.449 2007/03/20 05:45:00 neilc Exp $ * * NOTES * The script catalog/genbki.sh reads this file and generates .bki @@ -2261,8 +2261,20 @@ DATA(insert OID = 2284 ( regexp_replace PGNSP PGUID 12 1 0 f f t f i 3 25 "2 DESCR("replace text using regexp"); DATA(insert OID = 2285 ( regexp_replace PGNSP PGUID 12 1 0 f f t f i 4 25 "25 25 25 25" _null_ _null_ _null_ textregexreplace - _null_ )); DESCR("replace text using regexp"); +DATA(insert OID = 2763 ( regexp_matches PGNSP PGUID 12 1 1 f f t t i 2 1009 "25 25" _null_ _null_ _null_ regexp_matches_no_flags - _null_ )); +DESCR("return all match groups for regexp"); +DATA(insert OID = 2764 ( regexp_matches PGNSP PGUID 12 1 10 f f t t i 3 1009 "25 25 25" _null_ _null_ _null_ regexp_matches - _null_ )); +DESCR("return all match groups for regexp"); DATA(insert OID = 2088 ( split_part PGNSP PGUID 12 1 0 f f t f i 3 25 "25 25 23" _null_ _null_ _null_ split_text - _null_ )); DESCR("split string by field_sep and return field_num"); +DATA(insert OID = 2765 ( regexp_split_to_table PGNSP PGUID 12 1 1000 f f t t i 2 25 "25 25" _null_ _null_ _null_ regexp_split_to_table_no_flags - _null_ )); +DESCR("split string by pattern"); +DATA(insert OID = 2766 ( regexp_split_to_table PGNSP PGUID 12 1 1000 f f t t i 3 25 "25 25 25" _null_ _null_ _null_ regexp_split_to_table - _null_ )); +DESCR("split string by pattern"); +DATA(insert OID = 2767 ( regexp_split_to_array PGNSP PGUID 12 1 0 f f t f i 2 1009 "25 25" _null_ _null_ _null_ regexp_split_to_array_no_flags - _null_ )); +DESCR("split string by pattern"); +DATA(insert OID = 2768 ( regexp_split_to_array PGNSP PGUID 12 1 0 f f t f i 3 1009 "25 25 25" _null_ _null_ _null_ regexp_split_to_array - _null_ )); +DESCR("split string by pattern"); DATA(insert OID = 2089 ( to_hex PGNSP PGUID 12 1 0 f f t f i 1 25 "23" _null_ _null_ _null_ to_hex32 - _null_ )); DESCR("convert int4 number to hex"); DATA(insert OID = 2090 ( to_hex PGNSP PGUID 12 1 0 f f t f i 1 25 "20" _null_ _null_ _null_ to_hex64 - _null_ )); diff --git a/src/include/utils/builtins.h b/src/include/utils/builtins.h index 70c2142d8d422b9ace7f08ba1891ba69cb01bf4f..b2a55bbbb9839cbafc3e3d9c05a56d6d1123fb60 100644 --- a/src/include/utils/builtins.h +++ b/src/include/utils/builtins.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.289 2007/02/23 21:59:45 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.290 2007/03/20 05:45:00 neilc Exp $ * *------------------------------------------------------------------------- */ @@ -478,6 +478,12 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS); extern Datum textregexreplace_noopt(PG_FUNCTION_ARGS); extern Datum textregexreplace(PG_FUNCTION_ARGS); extern Datum similar_escape(PG_FUNCTION_ARGS); +extern Datum regexp_matches(PG_FUNCTION_ARGS); +extern Datum regexp_matches_no_flags(PG_FUNCTION_ARGS); +extern Datum regexp_split_to_table(PG_FUNCTION_ARGS); +extern Datum regexp_split_to_table_no_flags(PG_FUNCTION_ARGS); +extern Datum regexp_split_to_array(PG_FUNCTION_ARGS); +extern Datum regexp_split_to_array_no_flags(PG_FUNCTION_ARGS); extern bool regex_flavor_is_basic(void); /* regproc.c */ diff --git a/src/test/regress/expected/strings.out b/src/test/regress/expected/strings.out index 20081f699b68a73857ea84331bc8b48028996a84..e11dfc35d786231352f905a75ab7cb817d4f6eec 100644 --- a/src/test/regress/expected/strings.out +++ b/src/test/regress/expected/strings.out @@ -217,9 +217,232 @@ SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); Z Z (1 row) --- invalid option of REGEXP_REPLACE +-- invalid regexp option SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); -ERROR: invalid option of regexp_replace: z +ERROR: invalid regexp option: z +-- set so we can tell NULL from empty string +\pset null '\\N' +-- return all matches from regexp +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); + regexp_matches +---------------- + {bar,beque} +(1 row) + +-- test case insensitive +SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); + regexp_matches +---------------- + {bAR,bEqUE} +(1 row) + +-- global option - more than one match +SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); + regexp_matches +---------------- + {bar,beque} + {bazil,barf} +(2 rows) + +-- empty capture group (matched empty string) +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); + regexp_matches +---------------- + {bar,"",beque} +(1 row) + +-- no match +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$); + regexp_matches +---------------- +(0 rows) + +-- optional capture group did not match, null entry in array +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); + regexp_matches +------------------ + {bar,NULL,beque} +(1 row) + +-- no capture groups +SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$); + regexp_matches +---------------- + {barbeque} +(1 row) + +-- give me errors +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper'); +ERROR: invalid regexp option: z +SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$); +ERROR: invalid regular expression: parentheses () not balanced +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$); +ERROR: invalid regular expression: invalid repetition count(s) +-- split string on regexp +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s+$re$) AS foo; + foo | length +--------+-------- + the | 3 + quick | 5 + brown | 5 + fox | 3 + jumped | 6 + over | 4 + the | 3 + lazy | 4 + dog | 3 +(9 rows) + +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s+$re$); + regexp_split_to_array +------------------------------------------------ + {the,quick,brown,fox,jumped,over,the,lazy,dog} +(1 row) + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s*$re$) AS foo; + foo | length +-----+-------- + t | 1 + h | 1 + e | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + f | 1 + o | 1 + x | 1 + j | 1 + u | 1 + m | 1 + p | 1 + e | 1 + d | 1 + o | 1 + v | 1 + e | 1 + r | 1 + t | 1 + h | 1 + e | 1 + l | 1 + a | 1 + z | 1 + y | 1 + d | 1 + o | 1 + g | 1 +(36 rows) + +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s*$re$); + regexp_split_to_array +--------------------------------------------------------------------------- + {t,h,e,q,u,i,c,k,b,r,o,w,n,f,o,x,j,u,m,p,e,d,o,v,e,r,t,h,e,l,a,z,y,d,o,g} +(1 row) + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', '') AS foo; + foo | length +-----+-------- + t | 1 + h | 1 + e | 1 + | 1 + q | 1 + u | 1 + i | 1 + c | 1 + k | 1 + | 1 + b | 1 + r | 1 + o | 1 + w | 1 + n | 1 + | 1 + f | 1 + o | 1 + x | 1 + | 1 + j | 1 + u | 1 + m | 1 + p | 1 + e | 1 + d | 1 + | 1 + o | 1 + v | 1 + e | 1 + r | 1 + | 1 + t | 1 + h | 1 + e | 1 + | 1 + l | 1 + a | 1 + z | 1 + y | 1 + | 1 + d | 1 + o | 1 + g | 1 +(44 rows) + +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', ''); + regexp_split_to_array +----------------------------------------------------------------------------------------------------------- + {t,h,e," ",q,u,i,c,k," ",b,r,o,w,n," ",f,o,x," ",j,u,m,p,e,d," ",o,v,e,r," ",t,h,e," ",l,a,z,y," ",d,o,g} +(1 row) + +-- case insensitive +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i') AS foo; + foo | length +-----------------------+-------- + th | 2 + QUick bROWn FOx jUMP | 21 + d ov | 4 + r TH | 4 + lazy dOG | 9 +(5 rows) + +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i'); + regexp_split_to_array +-------------------------------------------------------- + {th," QUick bROWn FOx jUMP","d ov","r TH"," lazy dOG"} +(1 row) + +-- no match of pattern +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo; + foo | length +----------------------------------------------+-------- + the quick brown fox jumped over the lazy dog | 44 +(1 row) + +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'nomatch'); + regexp_split_to_array +-------------------------------------------------- + {"the quick brown fox jumped over the lazy dog"} +(1 row) + +-- errors +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo; +ERROR: invalid regexp option: z +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy'); +ERROR: invalid regexp option: z +-- global option meaningless for regexp_split +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo; +ERROR: regexp_split does not support the global option +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g'); +ERROR: regexp_split does not support the global option +-- change NULL-display back +\pset null '' -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4"; 4 diff --git a/src/test/regress/sql/strings.sql b/src/test/regress/sql/strings.sql index 570d9a27f46060b8334019fde77b92fa2f0dec2f..389ff63517f75bb1ad94c943abc782063ae2f506 100644 --- a/src/test/regress/sql/strings.sql +++ b/src/test/regress/sql/strings.sql @@ -85,9 +85,60 @@ SELECT regexp_replace('1112223333', E'(\\d{3})(\\d{3})(\\d{4})', E'(\\1) \\2-\\3 SELECT regexp_replace('AAA BBB CCC ', E'\\s+', ' ', 'g'); SELECT regexp_replace('AAA', '^|$', 'Z', 'g'); SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'gi'); --- invalid option of REGEXP_REPLACE +-- invalid regexp option SELECT regexp_replace('AAA aaa', 'A+', 'Z', 'z'); +-- set so we can tell NULL from empty string +\pset null '\\N' + +-- return all matches from regexp +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$); + +-- test case insensitive +SELECT regexp_matches('foObARbEqUEbAz', $re$(bar)(beque)$re$, 'i'); + +-- global option - more than one match +SELECT regexp_matches('foobarbequebazilbarfbonk', $re$(b[^b]+)(b[^b]+)$re$, 'g'); + +-- empty capture group (matched empty string) +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.*)(beque)$re$); +-- no match +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)(beque)$re$); +-- optional capture group did not match, null entry in array +SELECT regexp_matches('foobarbequebaz', $re$(bar)(.+)?(beque)$re$); + +-- no capture groups +SELECT regexp_matches('foobarbequebaz', $re$barbeque$re$); + +-- give me errors +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque)$re$, 'zipper'); +SELECT regexp_matches('foobarbequebaz', $re$(barbeque$re$); +SELECT regexp_matches('foobarbequebaz', $re$(bar)(beque){2,1}$re$); + +-- split string on regexp +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s+$re$) AS foo; +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s+$re$); + +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', $re$\s*$re$) AS foo; +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', $re$\s*$re$); +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', '') AS foo; +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', ''); +-- case insensitive +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i') AS foo; +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'i'); +-- no match of pattern +SELECT foo, length(foo) FROM regexp_split_to_table('the quick brown fox jumped over the lazy dog', 'nomatch') AS foo; +SELECT regexp_split_to_array('the quick brown fox jumped over the lazy dog', 'nomatch'); +-- errors +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy') AS foo; +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'zippy'); +-- global option meaningless for regexp_split +SELECT foo, length(foo) FROM regexp_split_to_table('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g') AS foo; +SELECT regexp_split_to_array('thE QUick bROWn FOx jUMPed ovEr THE lazy dOG', 'e', 'g'); + +-- change NULL-display back +\pset null '' + -- E021-11 position expression SELECT POSITION('4' IN '1234567890') = '4' AS "4";