Skip to content
Snippets Groups Projects
Commit abd8c94f authored by Teodor Sigaev's avatar Teodor Sigaev
Browse files

Add prefix support for synonym dictionary

parent 0c738084
No related branches found
No related tags found
No related merge requests found
<!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.52 2009/06/17 21:58:49 tgl Exp $ --> <!-- $PostgreSQL: pgsql/doc/src/sgml/textsearch.sgml,v 1.53 2009/08/14 14:53:20 teodor Exp $ -->
<chapter id="textsearch"> <chapter id="textsearch">
<title id="textsearch-title">Full Text Search</title> <title id="textsearch-title">Full Text Search</title>
...@@ -2289,6 +2289,63 @@ SELECT * FROM ts_debug('english', 'Paris'); ...@@ -2289,6 +2289,63 @@ SELECT * FROM ts_debug('english', 'Paris');
</programlisting> </programlisting>
</para> </para>
<para>
An asterisk (<literal>*</literal>) at the end of a definition word indicates
that the definition word is a prefix, and the <function>to_tsquery()</function>
function will transform that definition into the prefix search format (see
<xref linkend="textsearch-parsing-queries">).
Note that the asterisk is ignored by <function>to_tsvector()</function>.
</para>
<para>
Contents of <filename>$SHAREDIR/tsearch_data/synonym_sample.syn</>:
</para>
<programlisting>
postgres pgsql
postgresql pgsql
postgre pgsql
gogle googl
indices index*
</programlisting>
<para>
Results:
</para>
<programlisting>
=# create text search dictionary syn( template=synonym,synonyms='synonym_sample');
=# select ts_lexize('syn','indices');
ts_lexize
-----------
{index}
(1 row)
=# create text search configuration tst ( copy=simple);
=# alter text search configuration tst alter mapping for asciiword with syn;
=# select to_tsquery('tst','indices');
to_tsquery
------------
'index':*
(1 row)
=# select 'indexes are very useful'::tsvector;
tsvector
---------------------------------
'are' 'indexes' 'useful' 'very'
(1 row)
=# select 'indexes are very useful'::tsvector @@ to_tsquery('tst','indices');
?column?
----------
t
(1 row)
=# select to_tsvector('tst','indices');
to_tsvector
-------------
'index':1
(1 row)
</programlisting>
<para> <para>
The only parameter required by the <literal>synonym</> template is The only parameter required by the <literal>synonym</> template is
<literal>SYNONYMS</>, which is the base name of its configuration file <literal>SYNONYMS</>, which is the base name of its configuration file
......
...@@ -7,7 +7,7 @@ ...@@ -7,7 +7,7 @@
* *
* *
* IDENTIFICATION * IDENTIFICATION
* $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.10 2009/01/01 17:23:48 momjian Exp $ * $PostgreSQL: pgsql/src/backend/tsearch/dict_synonym.c,v 1.11 2009/08/14 14:53:20 teodor Exp $
* *
*------------------------------------------------------------------------- *-------------------------------------------------------------------------
*/ */
...@@ -23,6 +23,8 @@ typedef struct ...@@ -23,6 +23,8 @@ typedef struct
{ {
char *in; char *in;
char *out; char *out;
int outlen;
uint16 flags;
} Syn; } Syn;
typedef struct typedef struct
...@@ -36,11 +38,14 @@ typedef struct ...@@ -36,11 +38,14 @@ typedef struct
* Finds the next whitespace-delimited word within the 'in' string. * Finds the next whitespace-delimited word within the 'in' string.
* Returns a pointer to the first character of the word, and a pointer * Returns a pointer to the first character of the word, and a pointer
* to the next byte after the last character in the word (in *end). * to the next byte after the last character in the word (in *end).
* A '*' character at the end of the word is not treated as a word
* character if flags is not NULL.
*/ */
static char * static char *
findwrd(char *in, char **end) findwrd(char *in, char **end, uint16 *flags)
{ {
char *start; char *start;
char *lastchar;
/* Skip leading spaces */ /* Skip leading spaces */
while (*in && t_isspace(in)) while (*in && t_isspace(in))
...@@ -53,13 +58,27 @@ findwrd(char *in, char **end) ...@@ -53,13 +58,27 @@ findwrd(char *in, char **end)
return NULL; return NULL;
} }
start = in; lastchar = start = in;
/* Find end of word */ /* Find end of word */
while (*in && !t_isspace(in)) while (*in && !t_isspace(in))
{
lastchar = in;
in += pg_mblen(in); in += pg_mblen(in);
}
if ( in - lastchar == 1 && t_iseq(lastchar, '*') && flags )
{
*flags = TSL_PREFIX;
*end = lastchar;
}
else
{
if (flags)
*flags = 0;
*end = in; *end = in;
}
return start; return start;
} }
...@@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -84,6 +103,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
*end = NULL; *end = NULL;
int cur = 0; int cur = 0;
char *line = NULL; char *line = NULL;
uint16 flags = 0;
foreach(l, dictoptions) foreach(l, dictoptions)
{ {
...@@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -117,7 +137,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
while ((line = tsearch_readline(&trst)) != NULL) while ((line = tsearch_readline(&trst)) != NULL)
{ {
starti = findwrd(line, &end); starti = findwrd(line, &end, NULL);
if (!starti) if (!starti)
{ {
/* Empty line */ /* Empty line */
...@@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -130,7 +150,7 @@ dsynonym_init(PG_FUNCTION_ARGS)
} }
*end = '\0'; *end = '\0';
starto = findwrd(end + 1, &end); starto = findwrd(end + 1, &end, &flags);
if (!starto) if (!starto)
{ {
/* A line with only one word (+whitespace). Ignore silently. */ /* A line with only one word (+whitespace). Ignore silently. */
...@@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS) ...@@ -168,6 +188,9 @@ dsynonym_init(PG_FUNCTION_ARGS)
d->syn[cur].out = lowerstr(starto); d->syn[cur].out = lowerstr(starto);
} }
d->syn[cur].outlen = strlen(starto);
d->syn[cur].flags = flags;
cur++; cur++;
skipline: skipline:
...@@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS) ...@@ -212,7 +235,8 @@ dsynonym_lexize(PG_FUNCTION_ARGS)
PG_RETURN_POINTER(NULL); PG_RETURN_POINTER(NULL);
res = palloc0(sizeof(TSLexeme) * 2); res = palloc0(sizeof(TSLexeme) * 2);
res[0].lexeme = pstrdup(found->out); res[0].lexeme = pnstrdup(found->out, found->outlen);
res[0].flags = found->flags;
PG_RETURN_POINTER(res); PG_RETURN_POINTER(res);
} }
...@@ -2,3 +2,4 @@ postgres pgsql ...@@ -2,3 +2,4 @@ postgres pgsql
postgresql pgsql postgresql pgsql
postgre pgsql postgre pgsql
gogle googl gogle googl
indices index*
...@@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle'); ...@@ -208,6 +208,12 @@ SELECT ts_lexize('synonym', 'Gogle');
{googl} {googl}
(1 row) (1 row)
SELECT ts_lexize('synonym', 'indices');
ts_lexize
-----------
{index}
(1 row)
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More tests in configuration checks because ts_lexize() -- More tests in configuration checks because ts_lexize()
-- cannot pass more than one word to thesaurus. -- cannot pass more than one word to thesaurus.
...@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead ...@@ -290,6 +296,18 @@ SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead
'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6 'common':2 'googl':7,10 'instead':8 'mistak':3 'write':6
(1 row) (1 row)
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
to_tsvector
----------------------------------------------
'form':8 'index':1,3,10 'plural':7 'right':6
(1 row)
SELECT to_tsquery('synonym_tst', 'Index & indices');
to_tsquery
---------------------
'index' & 'index':*
(1 row)
-- test thesaurus in configuration -- test thesaurus in configuration
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
CREATE TEXT SEARCH CONFIGURATION thesaurus_tst ( CREATE TEXT SEARCH CONFIGURATION thesaurus_tst (
......
...@@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym ( ...@@ -56,6 +56,7 @@ CREATE TEXT SEARCH DICTIONARY synonym (
SELECT ts_lexize('synonym', 'PoStGrEs'); SELECT ts_lexize('synonym', 'PoStGrEs');
SELECT ts_lexize('synonym', 'Gogle'); SELECT ts_lexize('synonym', 'Gogle');
SELECT ts_lexize('synonym', 'indices');
-- Create and simple test thesaurus dictionary -- Create and simple test thesaurus dictionary
-- More tests in configuration checks because ts_lexize() -- More tests in configuration checks because ts_lexize()
...@@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR ...@@ -104,6 +105,8 @@ ALTER TEXT SEARCH CONFIGURATION synonym_tst ALTER MAPPING FOR
SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre'); SELECT to_tsvector('synonym_tst', 'Postgresql is often called as postgres or pgsql and pronounced as postgre');
SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google'); SELECT to_tsvector('synonym_tst', 'Most common mistake is to write Gogle instead of Google');
SELECT to_tsvector('synonym_tst', 'Indexes or indices - Which is right plural form of index?');
SELECT to_tsquery('synonym_tst', 'Index & indices');
-- test thesaurus in configuration -- test thesaurus in configuration
-- see thesaurus_sample.ths to understand 'odd' resulting tsvector -- see thesaurus_sample.ths to understand 'odd' resulting tsvector
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment