diff --git a/contrib/tsearch/README.tsearch b/contrib/tsearch/README.tsearch index c63ae91edd096cfbf4646e565f4cd1c41b08e85e..a57df55eea79f39d9f4a7bcb9a39185680266152 100644 --- a/contrib/tsearch/README.tsearch +++ b/contrib/tsearch/README.tsearch @@ -4,6 +4,11 @@ a searchable data type (textual) with indexed access. All work was done by Teodor Sigaev (teodor@stack.net) and Oleg Bartunov (oleg@sai.msu.su). +CHANGES: + +August 13, 2002 + Use parser of OpenFTS v0.33. + IMPORTANT NOTICE: This is a first step of our work on integration of OpenFTS diff --git a/contrib/tsearch/deflex.h b/contrib/tsearch/deflex.h index f9d6847167988e8a11aae2226b134757ecf65b02..17c4fdf1ec3e765bfb71bb0f98c85b6c3805512e 100644 --- a/contrib/tsearch/deflex.h +++ b/contrib/tsearch/deflex.h @@ -2,28 +2,33 @@ #define __DEFLEX_H__ /* rememder !!!! */ -#define LASTNUM 19 +#define LASTNUM 23 #define LATWORD 1 -#define NONLATINWORD 2 +#define CYRWORD 2 #define UWORD 3 #define EMAIL 4 #define FURL 5 #define HOST 6 -#define FLOAT 7 -#define FINT 8 -#define PARTWORD 9 -#define NONLATINPARTWORD 10 -#define LATPARTWORD 11 -#define SPACE 12 -#define SYMTAG 13 -#define HTTP 14 -#define DEFISWORD 15 -#define DEFISLATWORD 16 -#define DEFISNONLATINWORD 17 +#define SCIENTIFIC 7 +#define VERSIONNUMBER 8 +#define PARTHYPHENWORD 9 +#define CYRPARTHYPHENWORD 10 +#define LATPARTHYPHENWORD 11 +#define SPACE 12 +#define TAG 13 +#define HTTP 14 +#define HYPHENWORD 15 +#define LATHYPHENWORD 16 +#define CYRHYPHENWORD 17 #define URI 18 #define FILEPATH 19 +#define DECIMAL 20 +#define SIGNEDINT 21 +#define UNSIGNEDINT 22 +#define HTMLENTITY 23 extern const char *descr[]; #endif + diff --git a/contrib/tsearch/expected/tsearch.out b/contrib/tsearch/expected/tsearch.out index f75b429bcbb436f671a14fe6017753cd5fb0931d..0b12765d8f6cf0335703f54fe0c2007685867ad2 100644 --- a/contrib/tsearch/expected/tsearch.out +++ b/contrib/tsearch/expected/tsearch.out @@ -689,9 +689,9 @@ SELECT count(*) FROM test_txtidx WHERE a ## '(eq|yt)&(wR|qh)'; select txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.ewr/?ad=qwe&dw 1aew.werc.ewr/?ad=qwe&dw 2aew.werc.ewr http://3aew.werc.ewr/?ad=qwe&dw http://4aew.werc.ewr http://5aew.werc.ewr:8100/? ad=qwe&dw 6aew.werc.ewr:8100/?ad=qwe&dw 7aew.werc.ewr:8100/?ad=qwe&dw=%20%32 +4.0e-10 qwe qwe qwqwe 234.435 455 5.005 teodor@stack.net qwe-wer asdf <fr>qwer jf sdjk<we hjwer <werrwe> ewr1> ewri2 <a href="qwe<qwe>"> /usr/local/fff /awdf/dwqe/4325 rewt/ewr wefjn /wqe-324/ewr gist.h gist.h.c gist.c. readline 4.2 4.2. 4.2, readline-4.2 readline-4.2. 234 <i <b> wow < jqw <> qwerty'); - txt2txtidx ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- - 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32' + txt2txtidx +---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- + 'ad' 'dw' 'jf' '234' '345' '4.2' '455' 'jqw' 'qwe' 'wer' 'wow' 'asdf' 'ewr1' 'qwer' 'sdjk' '5.005' 'ewri2' 'qwqwe' 'wefjn' 'gist.c' 'gist.h' 'qwerti' '234.435' ':8100/?' 'qwe-wer' 'readlin' 'www.com' '+4.0e-10' 'gist.h.c' 'rewt/ewr' 'qwe@efd.r' 'readline-4' '/?ad=qwe&dw' '/wqe-324/ewr' 'aew.werc.ewr' '1aew.werc.ewr' '2aew.werc.ewr' '3aew.werc.ewr' '4aew.werc.ewr' '5aew.werc.ewr' '6aew.werc.ewr' '7aew.werc.ewr' '/usr/local/fff' '/awdf/dwqe/4325' ':8100/?ad=qwe&dw' 'teodor@stack.net' '5aew.werc.ewr:8100/?' ':8100/?ad=qwe&dw=%20%32' 'aew.werc.ewr/?ad=qwe&dw' '1aew.werc.ewr/?ad=qwe&dw' '3aew.werc.ewr/?ad=qwe&dw' '6aew.werc.ewr:8100/?ad=qwe&dw' '7aew.werc.ewr:8100/?ad=qwe&dw=%20%32' (1 row) select txtidxsize(txt2txtidx('345 qw')); @@ -705,7 +705,7 @@ select txtidxsize(txt2txtidx('345 qwe@efd.r \' http://www.com/ http://aew.werc.e <i <b> wow < jqw <> qwerty')); txtidxsize ------------ - 52 + 53 (1 row) insert into test_txtidx (a) values ('345 qwerty'); diff --git a/contrib/tsearch/morph.c b/contrib/tsearch/morph.c index 60797b07e92441c7f218c16d775e22e658a249ed..b29a3f6779dbe57e786f04312a7fb72d7fcb928d 100644 --- a/contrib/tsearch/morph.c +++ b/contrib/tsearch/morph.c @@ -75,19 +75,23 @@ static MAPDICT mapdict[] = { {NODICT, NODICT}, /* EMAIL */ {NODICT, NODICT}, /* FURL */ {NODICT, NODICT}, /* HOST */ - {NODICT, NODICT}, /* FLOAT */ - {NODICT, NODICT}, /* FINT */ - {BYLOCALE, DEFAULTDICT}, /* PARTWORD */ - {BYLOCALE, NODICT}, /* NONLATINPARTWORD */ - {DEFAULTDICT, NODICT}, /* LATPARTWORD */ + {NODICT, NODICT}, /* SCIENTIFIC */ + {NODICT, NODICT}, /* VERSIONNUMBER */ + {BYLOCALE, DEFAULTDICT}, /* PARTHYPHENWORD */ + {BYLOCALE, NODICT}, /* CYRPARTHYPHENWORD */ + {DEFAULTDICT, NODICT}, /* LATPARTHYPHENWORD */ {STOPLEXEM, NODICT}, /* SPACE */ - {STOPLEXEM, NODICT}, /* SYMTAG */ + {STOPLEXEM, NODICT}, /* TAG */ {STOPLEXEM, NODICT}, /* HTTP */ - {BYLOCALE, DEFAULTDICT}, /* DEFISWORD */ - {DEFAULTDICT, NODICT}, /* DEFISLATWORD */ - {BYLOCALE, NODICT}, /* DEFISNONLATINWORD */ + {BYLOCALE, DEFAULTDICT}, /* HYPHENWORD */ + {DEFAULTDICT, NODICT}, /* LATHYPHENWORD */ + {BYLOCALE, NODICT}, /* CYRHYPHENWORD */ {NODICT, NODICT}, /* URI */ - {NODICT, NODICT} /* FILEPATH */ + {NODICT, NODICT}, /* FILEPATH */ + {NODICT, NODICT}, /* DECIMAL */ + {NODICT, NODICT}, /* SIGNEDINT */ + {NODICT, NODICT}, /* UNSIGNEDINT */ + {STOPLEXEM, NODICT} /* HTMLENTITY */ }; static bool inited = false; diff --git a/contrib/tsearch/parser.l b/contrib/tsearch/parser.l index 6081fd4c7bec02bcbada539f43a6d65a3eb7cb5f..f30fbcd4f4608a8b104c1b1ce678e1c7a31af5bf 100644 --- a/contrib/tsearch/parser.l +++ b/contrib/tsearch/parser.l @@ -5,18 +5,17 @@ /* postgres allocation function */ #include "postgres.h" -#define free pfree -#define malloc palloc +#define free pfree +#define malloc palloc #define realloc repalloc #ifdef strdup #undef strdup #endif -#define strdup pstrdup - +#define strdup pstrdup char *token = NULL; /* pointer to token */ -char *s = NULL; /* for returning full defis-word */ +char *s = NULL; /* to return WHOLE hyphenated-word */ YY_BUFFER_STATE buf = NULL; /* buffer to parse; it need for parse from string */ @@ -57,21 +56,21 @@ int bytestoread = 0; /* for limiting read from filehandle */ %option nounput %option noyywrap - -/* parser's state for parsing defis-word */ +/* parser's state for parsing hyphenated-word */ %x DELIM /* parser's state for parsing URL*/ %x URL %x SERVER -/* parser's state for parsing filepath */ - +/* parser's state for parsing TAGS */ %x INTAG %x QINTAG +%x INCOMMENT +%x INSCRIPT -/* NONLATIN char */ -NONLATINALNUM [0-9\200-\377] -NONLATINALPHA [\200-\377] +/* cyrillic koi8 char */ +CYRALNUM [0-9\200-\377] +CYRALPHA [\200-\377] ALPHA [a-zA-Z\200-\377] ALNUM [0-9a-zA-Z\200-\377] @@ -81,66 +80,59 @@ URI [-_[:alnum:]/%,\.;=&?#]+ %% -"<"[[:alpha:]] { BEGIN INTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } - -"</"[[:alpha:]] { BEGIN INTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } +"<"[Ss][Cc][Rr][Ii][Pp][Tt] { BEGIN INSCRIPT; } -"<>" { +<INSCRIPT>"</"[Ss][Cc][Rr][Ii][Pp][Tt]">" { + BEGIN INITIAL; + *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; token = tsearch_yytext; tokenlen = tsearch_yyleng; - return SYMTAG; + return SPACE; } -"<"[^>[:alpha:]] { +"<!--" { BEGIN INCOMMENT; } + +<INCOMMENT>"-->" { + BEGIN INITIAL; + *tsearch_yytext=' '; *(tsearch_yytext+1) = '\0'; token = tsearch_yytext; tokenlen = tsearch_yyleng; return SPACE; } -<INTAG>"\"" { BEGIN QINTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } -<QINTAG>"\\\"" { - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; -} +"<"[\![:alpha:]] { BEGIN INTAG; } -<QINTAG>"\"" { BEGIN INTAG; - token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; - } +"</"[[:alpha:]] { BEGIN INTAG; } -<QINTAG>.|\n { +<INTAG>"\"" { BEGIN QINTAG; } + +<QINTAG>"\\\"" ; + +<QINTAG>"\"" { BEGIN INTAG; } + +<INTAG>">" { + BEGIN INITIAL; token = tsearch_yytext; - tokenlen = tsearch_yyleng; - return SYMTAG; + *tsearch_yytext=' '; + token = tsearch_yytext; + tokenlen = 1; + return TAG; } -<INTAG>">" { BEGIN INITIAL; +<QINTAG,INTAG,INCOMMENT,INSCRIPT>.|\n ; + +\&(quot|amp|nbsp|lt|gt)\; { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return SYMTAG; - } + return HTMLENTITY; +} -<INTAG>.|\n { +\&\#[0-9][0-9]?[0-9]?\; { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return SYMTAG; + return HTMLENTITY; } - [-_\.[:alnum:]]+@{HOSTNAME} /* Emails */ { token = tsearch_yytext; @@ -148,22 +140,34 @@ URI [-_[:alnum:]/%,\.;=&?#]+ return EMAIL; } -<DELIM,INITIAL>[0-9] /* digit's and point (might be a version) */ { +[+-]?[0-9]+(\.[0-9]+)?[eEdD][+-]?[0-9]+ /* float */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return FINT; + return SCIENTIFIC; +} + +[0-9]+\.[0-9]+\.[0-9\.]*[0-9] { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return VERSIONNUMBER; +} + +[+-]?[0-9]+\.[0-9]+ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return DECIMAL; } -<DELIM,INITIAL>[0-9]+[0-9\.]*[0-9] /* digit's and point (might be a version) */ { +[+-][0-9]+ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return FINT; + return SIGNEDINT; } -[+-]?[0-9\.]+[eE][+-]?[0-9]+ /* float */ { +<DELIM,INITIAL>[0-9]+ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return FLOAT; + return UNSIGNEDINT; } http"://" { @@ -208,52 +212,58 @@ ftp"://" { return FILEPATH; } -({NONLATINALNUM}+-)+{NONLATINALPHA}+ /* composite-word */ { +({CYRALPHA}+-)+{CYRALPHA}+ /* composite-word */ { BEGIN DELIM; if (s) { free(s); s=NULL; } s = strdup( tsearch_yytext ); tokenlen = tsearch_yyleng; yyless( 0 ); token = s; - return DEFISNONLATINWORD; + return CYRHYPHENWORD; } -([[:alnum:]]+-)+[[:alpha:]]+ /* composite-word */ { +([[:alpha:]]+-)+[[:alpha:]]+ /* composite-word */ { BEGIN DELIM; if (s) { free(s); s=NULL; } - tokenlen = tsearch_yyleng; s = strdup( tsearch_yytext ); + tokenlen = tsearch_yyleng; yyless( 0 ); token = s; - return DEFISLATWORD; + return LATHYPHENWORD; } -({ALNUM}+-)+{ALPHA}+ /* composite-word */ { +({ALNUM}+-)+{ALNUM}+ /* composite-word */ { BEGIN DELIM; if (s) { free(s); s=NULL; } s = strdup( tsearch_yytext ); tokenlen = tsearch_yyleng; yyless( 0 ); token = s; - return DEFISWORD; + return HYPHENWORD; +} + +<DELIM>\+?[0-9]+\.[0-9]+ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return DECIMAL; } -<DELIM>{NONLATINALNUM}+ /* one word in composite-word */ { +<DELIM>{CYRALPHA}+ /* one word in composite-word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return NONLATINPARTWORD; + return CYRPARTHYPHENWORD; } -<DELIM>[[:alnum:]]+ /* one word in composite-word */ { +<DELIM>[[:alpha:]]+ /* one word in composite-word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return LATPARTWORD; + return LATPARTHYPHENWORD; } <DELIM>{ALNUM}+ /* one word in composite-word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return PARTWORD; + return PARTHYPHENWORD; } <DELIM>- { @@ -264,17 +274,16 @@ ftp"://" { <DELIM,SERVER,URL>.|\n /* return in basic state */ { BEGIN INITIAL; - tokenlen = tsearch_yyleng; yyless( 0 ); } -{NONLATINALNUM}+ /* normal word */ { +{CYRALPHA}+ /* normal word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; - return NONLATINWORD; + return CYRWORD; } -[[:alnum:]]+ /* normal word */ { +[[:alpha:]]+ /* normal word */ { token = tsearch_yytext; tokenlen = tsearch_yyleng; return LATWORD; @@ -286,7 +295,13 @@ ftp"://" { return UWORD; } -.|\n { +[ \r\n\t]+ { + token = tsearch_yytext; + tokenlen = tsearch_yyleng; + return SPACE; +} + +. { token = tsearch_yytext; tokenlen = tsearch_yyleng; return SPACE;