diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml index b43872cca5c3898f2dc790a3e8ddc0b1eb923953..61583df3a2105add6aa329763bb0eed9c9197e81 100644 --- a/doc/src/sgml/textsearch.sgml +++ b/doc/src/sgml/textsearch.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.36 2007/11/16 03:23:07 tgl Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.37 2007/11/20 02:25:22 adunstan Exp $ --> <chapter id="textsearch"> <title id="textsearch-title">Full Text Search</title> @@ -1862,12 +1862,12 @@ LIMIT 10; </row> <row> <entry><literal>tag</></entry> - <entry>HTML tag</entry> - <entry><literal><A HREF="dictionaries.html"></literal></entry> + <entry>XML tag</entry> + <entry><literal><a href="dictionaries.html"></literal></entry> </row> <row> <entry><literal>entity</></entry> - <entry>HTML entity</entry> + <entry>XML entity</entry> <entry><literal>&amp;</literal></entry> </row> <row> diff --git a/src/backend/tsearch/wparser_def.c b/src/backend/tsearch/wparser_def.c index 3f95f60579ea9b1cab5ef276449ae0ddd79c164b..b80175456d2ee3ef0699eef2f3eeeac58c24bff0 100644 --- a/src/backend/tsearch/wparser_def.c +++ b/src/backend/tsearch/wparser_def.c @@ -7,7 +7,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.10 2007/11/15 22:25:16 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/tsearch/wparser_def.c,v 1.11 2007/11/20 02:25:22 adunstan Exp $ * *------------------------------------------------------------------------- */ @@ -50,7 +50,7 @@ #define DECIMAL 20 #define SIGNEDINT 21 #define UNSIGNEDINT 22 -#define HTMLENTITY 23 +#define XMLENTITY 23 #define LASTNUM 23 @@ -95,7 +95,7 @@ static const char *const lex_descr[] = { "Hyphenated word part, all letters", "Hyphenated word part, all ASCII", "Space symbols", - "HTML tag", + "XML tag", "Protocol head", "Hyphenated word, letters and digits", "Hyphenated word, all ASCII", @@ -105,7 +105,7 @@ static const char *const lex_descr[] = { "Decimal notation", "Signed integer", "Unsigned integer", - "HTML entity" + "XML entity" }; @@ -132,11 +132,13 @@ typedef enum TPS_InMantissaFirst, TPS_InMantissaSign, TPS_InMantissa, - TPS_InHTMLEntityFirst, - TPS_InHTMLEntity, - TPS_InHTMLEntityNumFirst, - TPS_InHTMLEntityNum, - TPS_InHTMLEntityEnd, + TPS_InXMLEntityFirst, + TPS_InXMLEntity, + TPS_InXMLEntityNumFirst, + TPS_InXMLEntityNum, + TPS_InXMLEntityHexNumFirst, + TPS_InXMLEntityHexNum, + TPS_InXMLEntityEnd, TPS_InTagFirst, TPS_InXMLBegin, TPS_InTagCloseFirst, @@ -653,7 +655,7 @@ static const TParserStateActionItem actionTPS_Base[] = { {p_isdigit, 0, A_NEXT, TPS_InUnsignedInt, 0, NULL}, {p_iseqC, '-', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, {p_iseqC, '+', A_PUSH, TPS_InSignedIntFirst, 0, NULL}, - {p_iseqC, '&', A_PUSH, TPS_InHTMLEntityFirst, 0, NULL}, + {p_iseqC, '&', A_PUSH, TPS_InXMLEntityFirst, 0, NULL}, {p_iseqC, '~', A_PUSH, TPS_InFileTwiddle, 0, NULL}, {p_iseqC, '/', A_PUSH, TPS_InFileFirst, 0, NULL}, {p_iseqC, '.', A_PUSH, TPS_InPathFirstFirst, 0, NULL}, @@ -811,35 +813,56 @@ static const TParserStateActionItem actionTPS_InMantissa[] = { {NULL, 0, A_BINGO, TPS_Base, SCIENTIFIC, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityFirst[] = { +static const TParserStateActionItem actionTPS_InXMLEntityFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_iseqC, '#', A_NEXT, TPS_InHTMLEntityNumFirst, 0, NULL}, - {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, + {p_iseqC, '#', A_NEXT, TPS_InXMLEntityNumFirst, 0, NULL}, + {p_isasclet, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntity[] = { +static const TParserStateActionItem actionTPS_InXMLEntity[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isasclet, 0, A_NEXT, TPS_InHTMLEntity, 0, NULL}, - {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, + {p_isalnum, 0, A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '_', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ':', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '.', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, '-', A_NEXT, TPS_InXMLEntity, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityNumFirst[] = { +static const TParserStateActionItem actionTPS_InXMLEntityNumFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, + {p_iseqC, 'x', A_NEXT, TPS_InXMLEntityHexNumFirst, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityNum[] = { +static const TParserStateActionItem actionTPS_InXMLEntityHexNumFirst[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, - {p_isdigit, 0, A_NEXT, TPS_InHTMLEntityNum, 0, NULL}, - {p_iseqC, ';', A_NEXT, TPS_InHTMLEntityEnd, 0, NULL}, + {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; -static const TParserStateActionItem actionTPS_InHTMLEntityEnd[] = { - {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, HTMLENTITY, NULL} +static const TParserStateActionItem actionTPS_InXMLEntityNum[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_isdigit, 0, A_NEXT, TPS_InXMLEntityNum, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static const TParserStateActionItem actionTPS_InXMLEntityHexNum[] = { + {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, + {p_isxdigit, 0, A_NEXT, TPS_InXMLEntityHexNum, 0, NULL}, + {p_iseqC, ';', A_NEXT, TPS_InXMLEntityEnd, 0, NULL}, + {NULL, 0, A_POP, TPS_Null, 0, NULL} +}; + +static const TParserStateActionItem actionTPS_InXMLEntityEnd[] = { + {NULL, 0, A_BINGO | A_CLEAR, TPS_Base, XMLENTITY, NULL} }; static const TParserStateActionItem actionTPS_InTagFirst[] = { @@ -854,8 +877,8 @@ static const TParserStateActionItem actionTPS_InTagFirst[] = { static const TParserStateActionItem actionTPS_InXMLBegin[] = { {p_isEOF, 0, A_POP, TPS_Null, 0, NULL}, /* <?xml ... */ + /* XXX do we wants states for the m and l ? Right now this accepts <?xZ */ {p_iseqC, 'x', A_NEXT, TPS_InTag, 0, NULL}, - {p_iseqC, 'X', A_NEXT, TPS_InTag, 0, NULL}, {NULL, 0, A_POP, TPS_Null, 0, NULL} }; @@ -1278,11 +1301,13 @@ static const TParserStateAction Actions[] = { TPARSERSTATEACTION(TPS_InMantissaFirst), TPARSERSTATEACTION(TPS_InMantissaSign), TPARSERSTATEACTION(TPS_InMantissa), - TPARSERSTATEACTION(TPS_InHTMLEntityFirst), - TPARSERSTATEACTION(TPS_InHTMLEntity), - TPARSERSTATEACTION(TPS_InHTMLEntityNumFirst), - TPARSERSTATEACTION(TPS_InHTMLEntityNum), - TPARSERSTATEACTION(TPS_InHTMLEntityEnd), + TPARSERSTATEACTION(TPS_InXMLEntityFirst), + TPARSERSTATEACTION(TPS_InXMLEntity), + TPARSERSTATEACTION(TPS_InXMLEntityNumFirst), + TPARSERSTATEACTION(TPS_InXMLEntityNum), + TPARSERSTATEACTION(TPS_InXMLEntityHexNumFirst), + TPARSERSTATEACTION(TPS_InXMLEntityHexNum), + TPARSERSTATEACTION(TPS_InXMLEntityEnd), TPARSERSTATEACTION(TPS_InTagFirst), TPARSERSTATEACTION(TPS_InXMLBegin), TPARSERSTATEACTION(TPS_InTagCloseFirst), @@ -1556,9 +1581,9 @@ prsd_end(PG_FUNCTION_ARGS) #define COMPLEXTOKEN(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) #define ENDPUNCTOKEN(x) ( (x)==SPACE ) -#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==HTMLENTITY ) +#define TS_IDIGNORE(x) ( (x)==TAG_T || (x)==PROTOCOL || (x)==SPACE || (x)==XMLENTITY ) #define HLIDIGNORE(x) ( (x)==URL_T || (x)==TAG_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) -#define HTMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) +#define XMLHLIDIGNORE(x) ( (x)==URL_T || (x)==NUMHWORD || (x)==ASCIIHWORD || (x)==HWORD ) #define NONWORDTOKEN(x) ( (x)==SPACE || HLIDIGNORE(x) ) #define NOENDTOKEN(x) ( NONWORDTOKEN(x) || (x)==SCIENTIFIC || (x)==VERSIONNUMBER || (x)==DECIMAL || (x)==SIGNEDINT || (x)==UNSIGNEDINT || TS_IDIGNORE(x) ) @@ -1839,7 +1864,7 @@ prsd_headline(PG_FUNCTION_ARGS) } else { - if (HTMLHLIDIGNORE(prs->words[i].type)) + if (XMLHLIDIGNORE(prs->words[i].type)) prs->words[i].replace = 1; } diff --git a/src/test/regress/expected/tsearch.out b/src/test/regress/expected/tsearch.out index b6f8f05d228506add6dbe6fcaa27b12b59fa84bb..eb004020758e8b826acbc598078b5b6ca4f42254 100644 --- a/src/test/regress/expected/tsearch.out +++ b/src/test/regress/expected/tsearch.out @@ -222,7 +222,7 @@ SELECT * FROM ts_token_type('default'); 10 | hword_part | Hyphenated word part, all letters 11 | hword_asciipart | Hyphenated word part, all ASCII 12 | blank | Space symbols - 13 | tag | HTML tag + 13 | tag | XML tag 14 | protocol | Protocol head 15 | numhword | Hyphenated word, letters and digits 16 | asciihword | Hyphenated word, all ASCII @@ -232,7 +232,7 @@ SELECT * FROM ts_token_type('default'); 20 | float | Decimal notation 21 | int | Signed integer 22 | uint | Unsigned integer - 23 | entity | HTML entity + 23 | entity | XML entity (23 rows) SELECT * FROM ts_parse('default', '345 qwe@efd.r '' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>">