From 3e3bb36ee9122fdb211cb08575d6837f8ab522cc Mon Sep 17 00:00:00 2001 From: Tom Lane <tgl@sss.pgh.pa.us> Date: Tue, 21 Aug 2007 21:08:47 +0000 Subject: [PATCH] First rough cut at text search documentation: bare bones reference pages for the new SQL commands. I also committed Bruce's text search introductory chapter, as-is except for fixing some markup errors, so that there would be a place for the reference pages to link to. --- doc/src/sgml/filelist.sgml | 5 +- doc/src/sgml/postgres.sgml | 3 +- doc/src/sgml/ref/allfiles.sgml | 14 +- doc/src/sgml/ref/alter_tsconfig.sgml | 202 ++ doc/src/sgml/ref/alter_tsdictionary.sgml | 118 + doc/src/sgml/ref/alter_tsparser.sgml | 82 + doc/src/sgml/ref/alter_tstemplate.sgml | 82 + doc/src/sgml/ref/comment.sgml | 18 +- doc/src/sgml/ref/create_tsconfig.sgml | 126 + doc/src/sgml/ref/create_tsdictionary.sgml | 111 + doc/src/sgml/ref/create_tsparser.sgml | 152 + doc/src/sgml/ref/create_tstemplate.sgml | 125 + doc/src/sgml/ref/drop_tsconfig.sgml | 118 + doc/src/sgml/ref/drop_tsdictionary.sgml | 117 + doc/src/sgml/ref/drop_tsparser.sgml | 115 + doc/src/sgml/ref/drop_tstemplate.sgml | 116 + doc/src/sgml/reference.sgml | 14 +- doc/src/sgml/textsearch.sgml | 3716 +++++++++++++++++++++ 18 files changed, 5224 insertions(+), 10 deletions(-) create mode 100644 doc/src/sgml/ref/alter_tsconfig.sgml create mode 100644 doc/src/sgml/ref/alter_tsdictionary.sgml create mode 100644 doc/src/sgml/ref/alter_tsparser.sgml create mode 100644 doc/src/sgml/ref/alter_tstemplate.sgml create mode 100644 doc/src/sgml/ref/create_tsconfig.sgml create mode 100644 doc/src/sgml/ref/create_tsdictionary.sgml create mode 100644 doc/src/sgml/ref/create_tsparser.sgml create mode 100644 doc/src/sgml/ref/create_tstemplate.sgml create mode 100644 doc/src/sgml/ref/drop_tsconfig.sgml create mode 100644 doc/src/sgml/ref/drop_tsdictionary.sgml create mode 100644 doc/src/sgml/ref/drop_tsparser.sgml create mode 100644 doc/src/sgml/ref/drop_tstemplate.sgml create mode 100644 
doc/src/sgml/textsearch.sgml diff --git a/doc/src/sgml/filelist.sgml b/doc/src/sgml/filelist.sgml index fe1fdfd5794..8de816db6be 100644 --- a/doc/src/sgml/filelist.sgml +++ b/doc/src/sgml/filelist.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.49 2006/11/17 16:38:44 momjian Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/filelist.sgml,v 1.50 2007/08/21 21:08:47 tgl Exp $ --> <!entity history SYSTEM "history.sgml"> <!entity info SYSTEM "info.sgml"> @@ -17,7 +17,6 @@ <!-- user's guide --> <!entity array SYSTEM "array.sgml"> -<!entity rowtypes SYSTEM "rowtypes.sgml"> <!entity datatype SYSTEM "datatype.sgml"> <!entity ddl SYSTEM "ddl.sgml"> <!entity dml SYSTEM "dml.sgml"> @@ -26,7 +25,9 @@ <!entity mvcc SYSTEM "mvcc.sgml"> <!entity perform SYSTEM "perform.sgml"> <!entity queries SYSTEM "queries.sgml"> +<!entity rowtypes SYSTEM "rowtypes.sgml"> <!entity syntax SYSTEM "syntax.sgml"> +<!entity textsearch SYSTEM "textsearch.sgml"> <!entity typeconv SYSTEM "typeconv.sgml"> <!-- administrator's guide --> diff --git a/doc/src/sgml/postgres.sgml b/doc/src/sgml/postgres.sgml index 12a90dcec7d..ecacb75bdff 100644 --- a/doc/src/sgml/postgres.sgml +++ b/doc/src/sgml/postgres.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.81 2007/01/31 20:56:18 momjian Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/postgres.sgml,v 1.82 2007/08/21 21:08:47 tgl Exp $ --> <!DOCTYPE book PUBLIC "-//OASIS//DTD DocBook V4.2//EN" [ @@ -101,6 +101,7 @@ &func; &typeconv; &indices; + &textsearch; &mvcc; &perform; diff --git a/doc/src/sgml/ref/allfiles.sgml b/doc/src/sgml/ref/allfiles.sgml index 3e08b55bab1..898c2c67910 100644 --- a/doc/src/sgml/ref/allfiles.sgml +++ b/doc/src/sgml/ref/allfiles.sgml @@ -1,5 +1,5 @@ <!-- -$PostgreSQL: pgsql/doc/src/sgml/ref/allfiles.sgml,v 1.71 2007/07/03 01:30:35 neilc Exp $ +$PostgreSQL: pgsql/doc/src/sgml/ref/allfiles.sgml,v 1.72 2007/08/21 21:08:47 tgl Exp $ PostgreSQL documentation Complete list of usable sgml 
source files in this directory. --> @@ -22,6 +22,10 @@ Complete list of usable sgml source files in this directory. <!entity alterSequence system "alter_sequence.sgml"> <!entity alterTable system "alter_table.sgml"> <!entity alterTableSpace system "alter_tablespace.sgml"> +<!entity alterTSConfig system "alter_tsconfig.sgml"> +<!entity alterTSDictionary system "alter_tsdictionary.sgml"> +<!entity alterTSParser system "alter_tsparser.sgml"> +<!entity alterTSTemplate system "alter_tstemplate.sgml"> <!entity alterTrigger system "alter_trigger.sgml"> <!entity alterType system "alter_type.sgml"> <!entity alterUser system "alter_user.sgml"> @@ -56,6 +60,10 @@ Complete list of usable sgml source files in this directory. <!entity createTableAs system "create_table_as.sgml"> <!entity createTableSpace system "create_tablespace.sgml"> <!entity createTrigger system "create_trigger.sgml"> +<!entity createTSConfig system "create_tsconfig.sgml"> +<!entity createTSDictionary system "create_tsdictionary.sgml"> +<!entity createTSParser system "create_tsparser.sgml"> +<!entity createTSTemplate system "create_tstemplate.sgml"> <!entity createType system "create_type.sgml"> <!entity createUser system "create_user.sgml"> <!entity createView system "create_view.sgml"> @@ -83,6 +91,10 @@ Complete list of usable sgml source files in this directory. 
<!entity dropTable system "drop_table.sgml"> <!entity dropTableSpace system "drop_tablespace.sgml"> <!entity dropTrigger system "drop_trigger.sgml"> +<!entity dropTSConfig system "drop_tsconfig.sgml"> +<!entity dropTSDictionary system "drop_tsdictionary.sgml"> +<!entity dropTSParser system "drop_tsparser.sgml"> +<!entity dropTSTemplate system "drop_tstemplate.sgml"> <!entity dropType system "drop_type.sgml"> <!entity dropUser system "drop_user.sgml"> <!entity dropView system "drop_view.sgml"> diff --git a/doc/src/sgml/ref/alter_tsconfig.sgml b/doc/src/sgml/ref/alter_tsconfig.sgml new file mode 100644 index 00000000000..295ba1df647 --- /dev/null +++ b/doc/src/sgml/ref/alter_tsconfig.sgml @@ -0,0 +1,202 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/alter_tsconfig.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-ALTERTSCONFIG"> + <refmeta> + <refentrytitle id="SQL-ALTERTSCONFIG-TITLE">ALTER TEXT SEARCH CONFIGURATION</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>ALTER TEXT SEARCH CONFIGURATION</refname> + <refpurpose>change the definition of a text search configuration</refpurpose> + </refnamediv> + + <indexterm zone="sql-altertsconfig"> + <primary>ALTER TEXT SEARCH CONFIGURATION</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> ( + PARSER = <replaceable class="parameter">parser_name</replaceable> +) +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> + ADD MAPPING FOR <replaceable class="parameter">token_type</replaceable> [, ... ] WITH <replaceable class="parameter">dictionary_name</replaceable> [, ... ] +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> + ALTER MAPPING FOR <replaceable class="parameter">token_type</replaceable> [, ... ] WITH <replaceable class="parameter">dictionary_name</replaceable> [, ... 
] +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> + ALTER MAPPING REPLACE <replaceable class="parameter">old_dictionary</replaceable> WITH <replaceable class="parameter">new_dictionary</replaceable> +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> + ALTER MAPPING FOR <replaceable class="parameter">token_type</replaceable> [, ... ] REPLACE <replaceable class="parameter">old_dictionary</replaceable> WITH <replaceable class="parameter">new_dictionary</replaceable> +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> + DROP MAPPING [ IF EXISTS ] FOR <replaceable class="parameter">token_type</replaceable> [, ... ] +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> RENAME TO <replaceable>newname</replaceable> +ALTER TEXT SEARCH CONFIGURATION <replaceable>name</replaceable> OWNER TO <replaceable>newowner</replaceable> +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>ALTER TEXT SEARCH CONFIGURATION</command> changes the definition of + a text search configuration. You can change which parser it uses, modify + its mapping from token types to dictionaries, + or change the configuration's name or owner. + </para> + + <para> + You must be the owner of the configuration to use + <command>ALTER TEXT SEARCH CONFIGURATION</>. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search + configuration. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">parser_name</replaceable></term> + <listitem> + <para> + The name of a new text search parser to use for this configuration. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">token_type</replaceable></term> + <listitem> + <para> + The name of a token type that is emitted by the configuration's + parser. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">dictionary_name</replaceable></term> + <listitem> + <para> + The name of a text search dictionary to be consulted for the + specified token type(s). If multiple dictionaries are listed, + they are consulted in the specified order. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">old_dictionary</replaceable></term> + <listitem> + <para> + The name of a text search dictionary to be replaced in the mapping. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">new_dictionary</replaceable></term> + <listitem> + <para> + The name of a text search dictionary to be substituted for + <replaceable class="parameter">old_dictionary</replaceable>. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">newname</replaceable></term> + <listitem> + <para> + The new name of the text search configuration. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">newowner</replaceable></term> + <listitem> + <para> + The new owner of the text search configuration. + </para> + </listitem> + </varlistentry> + </variablelist> + + <para> + The <literal>ADD MAPPING FOR</> form installs a list of dictionaries to be + consulted for the specified token type(s); it is an error if there is + already a mapping for any of the token types. + The <literal>ALTER MAPPING FOR</> form does the same, but first removes + any existing mapping for those token types. 
+ The <literal>ALTER MAPPING REPLACE</> forms substitute <replaceable + class="parameter">new_dictionary</replaceable> for <replaceable + class="parameter">old_dictionary</replaceable> anywhere the latter appears. + This is done for only the specified token types when <literal>FOR</> + appears, or for all mappings of the configuration when it doesn't. + The <literal>DROP MAPPING</> form removes all dictionaries for the + specified token type(s), causing tokens of those types to be ignored + by the text search configuration. It is an error if there is no mapping + for the token types, unless <literal>IF EXISTS</> appears. + </para> + + </refsect1> + + <refsect1> + <title>Notes</title> + + <para> + While changing the text search parser used by a configuration is allowed, + this will only work nicely if old and new parsers use the same set of + token types. It is advisable to drop the mappings for any incompatible + token types before changing parsers. + </para> + + </refsect1> + + <refsect1> + <title>Examples</title> + + <para> + The following example replaces the <literal>english</> dictionary + with the <literal>swedish</> dictionary anywhere that <literal>english</> + is used within <literal>my_config</>. + </para> + +<programlisting> +ALTER TEXT SEARCH CONFIGURATION my_config + ALTER MAPPING REPLACE english WITH swedish; +</programlisting> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>ALTER TEXT SEARCH CONFIGURATION</command> statement in + the SQL standard. 
+ </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-createtsconfig" endterm="sql-createtsconfig-title"></member> + <member><xref linkend="sql-droptsconfig" endterm="sql-droptsconfig-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/alter_tsdictionary.sgml b/doc/src/sgml/ref/alter_tsdictionary.sgml new file mode 100644 index 00000000000..59c33666557 --- /dev/null +++ b/doc/src/sgml/ref/alter_tsdictionary.sgml @@ -0,0 +1,118 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/alter_tsdictionary.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-ALTERTSDICTIONARY"> + <refmeta> + <refentrytitle id="SQL-ALTERTSDICTIONARY-TITLE">ALTER TEXT SEARCH DICTIONARY</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>ALTER TEXT SEARCH DICTIONARY</refname> + <refpurpose>change the definition of a text search dictionary</refpurpose> + </refnamediv> + + <indexterm zone="sql-altertsdictionary"> + <primary>ALTER TEXT SEARCH DICTIONARY</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +ALTER TEXT SEARCH DICTIONARY <replaceable>name</replaceable> ( OPTION = <replaceable class="parameter">init_options</replaceable> ) +ALTER TEXT SEARCH DICTIONARY <replaceable>name</replaceable> RENAME TO <replaceable>newname</replaceable> +ALTER TEXT SEARCH DICTIONARY <replaceable>name</replaceable> OWNER TO <replaceable>newowner</replaceable> +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>ALTER TEXT SEARCH DICTIONARY</command> changes the definition of + a text search dictionary. You can change the dictionary's initialization + options, or change the dictionary's name or owner. + </para> + + <para> + You must be the owner of the dictionary to use + <command>ALTER TEXT SEARCH DICTIONARY</>. 
+ </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search + dictionary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">init_options</replaceable></term> + <listitem> + <para> + A new list of initialization options, or <literal>NULL</> to + remove all options. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">newname</replaceable></term> + <listitem> + <para> + The new name of the text search dictionary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">newowner</replaceable></term> + <listitem> + <para> + The new owner of the text search dictionary. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Examples</title> + + <para> + The following example command sets the language and stopword list + for a Snowball-based dictionary. + </para> + +<programlisting> +ALTER TEXT SEARCH DICTIONARY my_russian ( option = 'Language=russian, StopWords=my_russian' ); +</programlisting> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>ALTER TEXT SEARCH DICTIONARY</command> statement in + the SQL standard. 
+ </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-createtsdictionary" endterm="sql-createtsdictionary-title"></member> + <member><xref linkend="sql-droptsdictionary" endterm="sql-droptsdictionary-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/alter_tsparser.sgml b/doc/src/sgml/ref/alter_tsparser.sgml new file mode 100644 index 00000000000..a94d3939b89 --- /dev/null +++ b/doc/src/sgml/ref/alter_tsparser.sgml @@ -0,0 +1,82 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/alter_tsparser.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-ALTERTSPARSER"> + <refmeta> + <refentrytitle id="SQL-ALTERTSPARSER-TITLE">ALTER TEXT SEARCH PARSER</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>ALTER TEXT SEARCH PARSER</refname> + <refpurpose>change the definition of a text search parser</refpurpose> + </refnamediv> + + <indexterm zone="sql-altertsparser"> + <primary>ALTER TEXT SEARCH PARSER</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +ALTER TEXT SEARCH PARSER <replaceable>name</replaceable> RENAME TO <replaceable>newname</replaceable> +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>ALTER TEXT SEARCH PARSER</command> changes the definition of + a text search parser. Currently, the only supported functionality + is to change the parser's name. + </para> + + <para> + You must be a superuser to use <command>ALTER TEXT SEARCH PARSER</>. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search parser. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">newname</replaceable></term> + <listitem> + <para> + The new name of the text search parser. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>ALTER TEXT SEARCH PARSER</command> statement in + the SQL standard. + </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-createtsparser" endterm="sql-createtsparser-title"></member> + <member><xref linkend="sql-droptsparser" endterm="sql-droptsparser-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/alter_tstemplate.sgml b/doc/src/sgml/ref/alter_tstemplate.sgml new file mode 100644 index 00000000000..4ee9e82bfb8 --- /dev/null +++ b/doc/src/sgml/ref/alter_tstemplate.sgml @@ -0,0 +1,82 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/alter_tstemplate.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-ALTERTSTEMPLATE"> + <refmeta> + <refentrytitle id="SQL-ALTERTSTEMPLATE-TITLE">ALTER TEXT SEARCH TEMPLATE</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>ALTER TEXT SEARCH TEMPLATE</refname> + <refpurpose>change the definition of a text search template</refpurpose> + </refnamediv> + + <indexterm zone="sql-altertstemplate"> + <primary>ALTER TEXT SEARCH TEMPLATE</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +ALTER TEXT SEARCH TEMPLATE <replaceable>name</replaceable> RENAME TO <replaceable>newname</replaceable> +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>ALTER TEXT SEARCH TEMPLATE</command> changes the definition of + a text search template. Currently, the only supported functionality + is to change the template's name. 
+ </para> + + <para> + You must be a superuser to use <command>ALTER TEXT SEARCH TEMPLATE</>. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search template. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">newname</replaceable></term> + <listitem> + <para> + The new name of the text search template. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>ALTER TEXT SEARCH TEMPLATE</command> statement in + the SQL standard. + </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-createtstemplate" endterm="sql-createtstemplate-title"></member> + <member><xref linkend="sql-droptstemplate" endterm="sql-droptstemplate-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/comment.sgml b/doc/src/sgml/ref/comment.sgml index 733f75ab3b7..c8993e915b8 100644 --- a/doc/src/sgml/ref/comment.sgml +++ b/doc/src/sgml/ref/comment.sgml @@ -1,5 +1,5 @@ <!-- -$PostgreSQL: pgsql/doc/src/sgml/ref/comment.sgml,v 1.35 2007/01/31 23:26:03 momjian Exp $ +$PostgreSQL: pgsql/doc/src/sgml/ref/comment.sgml,v 1.36 2007/08/21 21:08:47 tgl Exp $ PostgreSQL documentation --> @@ -42,6 +42,10 @@ COMMENT ON SCHEMA <replaceable class="PARAMETER">object_name</replaceable> | SEQUENCE <replaceable class="PARAMETER">object_name</replaceable> | TABLESPACE <replaceable class="PARAMETER">object_name</replaceable> | + TEXT SEARCH CONFIGURATION <replaceable class="PARAMETER">object_name</replaceable> | + TEXT SEARCH DICTIONARY <replaceable class="PARAMETER">object_name</replaceable> | + TEXT SEARCH PARSER <replaceable class="PARAMETER">object_name</replaceable> 
| + TEXT SEARCH TEMPLATE <replaceable class="PARAMETER">object_name</replaceable> | TRIGGER <replaceable class="PARAMETER">trigger_name</replaceable> ON <replaceable class="PARAMETER">table_name</replaceable> | TYPE <replaceable class="PARAMETER">object_name</replaceable> | VIEW <replaceable class="PARAMETER">object_name</replaceable> @@ -65,9 +69,8 @@ COMMENT ON </para> <para> - Comments can be - easily retrieved with the <application>psql</application> commands - <command>\dd</command>, <command>\d+</command>, and <command>\l+</command>. + Comments can be viewed using <application>psql</application>'s + <command>\d</command> family of commands. Other user interfaces to retrieve comments can be built atop the same built-in functions that <application>psql</application> uses, namely <function>obj_description</>, <function>col_description</>, @@ -93,7 +96,8 @@ COMMENT ON <para> The name of the object to be commented. Names of tables, aggregates, domains, functions, indexes, operators, operator classes, - operator families, sequences, types, and views can be schema-qualified. + operator families, sequences, text search objects, types, and views can + be schema-qualified. 
</para> </listitem> </varlistentry> @@ -255,6 +259,10 @@ COMMENT ON SCHEMA my_schema IS 'Departmental data'; COMMENT ON SEQUENCE my_sequence IS 'Used to generate primary keys'; COMMENT ON TABLE my_schema.my_table IS 'Employee Information'; COMMENT ON TABLESPACE my_tablespace IS 'Tablespace for indexes'; +COMMENT ON TEXT SEARCH CONFIGURATION my_config IS 'Special word filtering'; +COMMENT ON TEXT SEARCH DICTIONARY swedish IS 'Snowball stemmer for swedish language'; +COMMENT ON TEXT SEARCH PARSER my_parser IS 'Splits text into words'; +COMMENT ON TEXT SEARCH TEMPLATE snowball IS 'Snowball stemmer'; COMMENT ON TRIGGER my_trigger ON my_table IS 'Used for RI'; COMMENT ON TYPE complex IS 'Complex number data type'; COMMENT ON VIEW my_view IS 'View of departmental costs'; diff --git a/doc/src/sgml/ref/create_tsconfig.sgml b/doc/src/sgml/ref/create_tsconfig.sgml new file mode 100644 index 00000000000..49be9411d56 --- /dev/null +++ b/doc/src/sgml/ref/create_tsconfig.sgml @@ -0,0 +1,126 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/create_tsconfig.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-CREATETSCONFIG"> + <refmeta> + <refentrytitle id="sql-createtsconfig-title">CREATE TEXT SEARCH CONFIGURATION</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>CREATE TEXT SEARCH CONFIGURATION</refname> + <refpurpose>define a new text search configuration</refpurpose> + </refnamediv> + + <indexterm zone="sql-createtsconfig"> + <primary>CREATE TEXT SEARCH CONFIGURATION</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +CREATE TEXT SEARCH CONFIGURATION <replaceable class="parameter">name</replaceable> ( + PARSER = <replaceable class="parameter">parser_name</replaceable> | + COPY = <replaceable class="parameter">source_config</replaceable> +) +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>CREATE TEXT SEARCH 
CONFIGURATION</command> creates a new text + search configuration. A text search configuration specifies a text + search parser that can divide a string into tokens, plus dictionaries + that can be used to determine which tokens are of interest for searching. + </para> + + <para> + If only the parser is specified, then the new text search configuration + initially has no mappings from token types to dictionaries, and therefore + will ignore all words. Subsequent <command>ALTER TEXT SEARCH + CONFIGURATION</command> commands must be used to create mappings to + make the configuration useful. Alternatively, an existing text search + configuration can be copied. + </para> + + <para> + If a schema name is given then the text search configuration is created in + the specified schema. Otherwise it is created in the current schema. + </para> + + <para> + The user who defines a text search configuration becomes its owner. + </para> + + <para> + Refer to <xref linkend="textsearch"> for further information. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name of the text search configuration to be created. The name can be + schema-qualified. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">parser_name</replaceable></term> + <listitem> + <para> + The name of the text search parser to use for this configuration. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">source_config</replaceable></term> + <listitem> + <para> + The name of an existing text search configuration to copy. 
+ </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Notes</title> + + <para> + It is allowed to specify both <literal>PARSER</> and <literal>COPY</>, + resulting in the specified parser being used with whatever mappings + are in the source configuration. This is generally inadvisable, + unless you know that both parsers involved use the same token type set. + </para> + + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>CREATE TEXT SEARCH CONFIGURATION</command> statement + in the SQL standard. + </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertsconfig" endterm="sql-altertsconfig-title"></member> + <member><xref linkend="sql-droptsconfig" endterm="sql-droptsconfig-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/create_tsdictionary.sgml b/doc/src/sgml/ref/create_tsdictionary.sgml new file mode 100644 index 00000000000..81c6a0c6edb --- /dev/null +++ b/doc/src/sgml/ref/create_tsdictionary.sgml @@ -0,0 +1,111 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/create_tsdictionary.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-CREATETSDICTIONARY"> + <refmeta> + <refentrytitle id="sql-createtsdictionary-title">CREATE TEXT SEARCH DICTIONARY</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>CREATE TEXT SEARCH DICTIONARY</refname> + <refpurpose>define a new text search dictionary</refpurpose> + </refnamediv> + + <indexterm zone="sql-createtsdictionary"> + <primary>CREATE TEXT SEARCH DICTIONARY</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +CREATE TEXT SEARCH DICTIONARY <replaceable class="parameter">name</replaceable> ( + TEMPLATE = <replaceable class="parameter">template</replaceable> + [, OPTION = <replaceable class="parameter">init_options</replaceable> 
] +) +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>CREATE TEXT SEARCH DICTIONARY</command> creates a new text search + dictionary. A text search dictionary specifies a way of recognizing + interesting or uninteresting words for searching. A dictionary depends + on a text search template, which specifies the functions that actually + perform the work. Typically the dictionary provides some options that + control the detailed behavior of the template's functions. + </para> + + <para> + If a schema name is given then the text search dictionary is created in the + specified schema. Otherwise it is created in the current schema. + </para> + + <para> + The user who defines a text search dictionary becomes its owner. + </para> + + <para> + Refer to <xref linkend="textsearch"> for further information. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name of the text search dictionary to be created. The name can be + schema-qualified. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">template</replaceable></term> + <listitem> + <para> + The name of the text search template that will define the basic + behavior of this dictionary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">init_options</replaceable></term> + <listitem> + <para> + A list of initialization options for the template functions. + This is a string containing <replaceable>keyword</> <literal>=</> + <replaceable>value</> pairs. The specific keywords allowed + vary depending on the text search template. 
+ </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>CREATE TEXT SEARCH DICTIONARY</command> statement in + the SQL standard. + </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertsdictionary" endterm="sql-altertsdictionary-title"></member> + <member><xref linkend="sql-droptsdictionary" endterm="sql-droptsdictionary-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/create_tsparser.sgml b/doc/src/sgml/ref/create_tsparser.sgml new file mode 100644 index 00000000000..5f612cf0d96 --- /dev/null +++ b/doc/src/sgml/ref/create_tsparser.sgml @@ -0,0 +1,152 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/create_tsparser.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-CREATETSPARSER"> + <refmeta> + <refentrytitle id="sql-createtsparser-title">CREATE TEXT SEARCH PARSER</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>CREATE TEXT SEARCH PARSER</refname> + <refpurpose>define a new text search parser</refpurpose> + </refnamediv> + + <indexterm zone="sql-createtsparser"> + <primary>CREATE TEXT SEARCH PARSER</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +CREATE TEXT SEARCH PARSER <replaceable class="parameter">name</replaceable> ( + START = <replaceable class="parameter">start_function</replaceable> , + GETTOKEN = <replaceable class="parameter">gettoken_function</replaceable> , + END = <replaceable class="parameter">end_function</replaceable> , + LEXTYPES = <replaceable class="parameter">lextypes_function</replaceable> + [, HEADLINE = <replaceable class="parameter">headline_function</replaceable> ] +) +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>CREATE TEXT SEARCH PARSER</command> creates a new text 
search + parser. A text search parser defines a method for splitting a text + string into tokens and assigning types (categories) to the tokens. + A parser is not particularly useful by itself, but must be bound into a + text search configuration along with some text search dictionaries + to be used for searching. + </para> + + <para> + If a schema name is given then the text search parser is created in the + specified schema. Otherwise it is created in the current schema. + </para> + + <para> + You must be a superuser to use <command>CREATE TEXT SEARCH PARSER</command>. + (This restriction is made because an erroneous text search parser + definition could confuse or even crash the server.) + </para> + + <para> + Refer to <xref linkend="textsearch"> for further information. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name of the text search parser to be created. The name can be + schema-qualified. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">start_function</replaceable></term> + <listitem> + <para> + The name of the start function for the parser. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">gettoken_function</replaceable></term> + <listitem> + <para> + The name of the get-next-token function for the parser. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">end_function</replaceable></term> + <listitem> + <para> + The name of the end function for the parser. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">lextypes_function</replaceable></term> + <listitem> + <para> + The name of the lextypes function for the parser (a function that + returns information about the set of token types it produces). 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">headline_function</replaceable></term> + <listitem> + <para> + The name of the headline function for the parser (a function that + summarizes a set of tokens). + </para> + </listitem> + </varlistentry> + </variablelist> + + <para> + The function names can be schema-qualified if necessary. Argument types + are not given, since the argument list for each type of function is + predetermined. All except the headline function are required. + </para> + + <para> + The arguments can appear in any order, not only the one shown above. + </para> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no + <command>CREATE TEXT SEARCH PARSER</command> statement in the SQL + standard. + </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertsparser" endterm="sql-altertsparser-title"></member> + <member><xref linkend="sql-droptsparser" endterm="sql-droptsparser-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/create_tstemplate.sgml b/doc/src/sgml/ref/create_tstemplate.sgml new file mode 100644 index 00000000000..5abadb76be6 --- /dev/null +++ b/doc/src/sgml/ref/create_tstemplate.sgml @@ -0,0 +1,125 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/create_tstemplate.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-CREATETSTEMPLATE"> + <refmeta> + <refentrytitle id="sql-createtstemplate-title">CREATE TEXT SEARCH TEMPLATE</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>CREATE TEXT SEARCH TEMPLATE</refname> + <refpurpose>define a new text search template</refpurpose> + </refnamediv> + + <indexterm zone="sql-createtstemplate"> + <primary>CREATE TEXT SEARCH TEMPLATE</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +CREATE TEXT SEARCH 
TEMPLATE <replaceable class="parameter">name</replaceable> ( + [ INIT = <replaceable class="parameter">init_function</replaceable> , ] + LEXIZE = <replaceable class="parameter">lexize_function</replaceable> +) +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>CREATE TEXT SEARCH TEMPLATE</command> creates a new text search + template. Text search templates define the functions that implement + text search dictionaries. A template is not useful by itself, but must + be instantiated as a dictionary to be used. The dictionary typically + specifies parameters to be given to the template functions. + </para> + + <para> + If a schema name is given then the text search template is created in the + specified schema. Otherwise it is created in the current schema. + </para> + + <para> + You must be a superuser to use <command>CREATE TEXT SEARCH + TEMPLATE</command>. This restriction is made because an erroneous text + search template definition could confuse or even crash the server. + The reason for separating templates from dictionaries is that a template + encapsulates the <quote>unsafe</> aspects of defining a dictionary. + The parameters that can be set when defining a dictionary are safe for + unprivileged users to set, and so creating a dictionary need not be a + privileged operation. + </para> + + <para> + Refer to <xref linkend="textsearch"> for further information. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name of the text search template to be created. The name can be + schema-qualified. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">init_function</replaceable></term> + <listitem> + <para> + The name of the init function for the template. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">lexize_function</replaceable></term> + <listitem> + <para> + The name of the lexize function for the template. + </para> + </listitem> + </varlistentry> + </variablelist> + + <para> + The function names can be schema-qualified if necessary. Argument types + are not given, since the argument list for each type of function is + predetermined. The lexize function is required, but the init function + is optional. + </para> + + <para> + The arguments can appear in any order, not only the one shown above. + </para> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no + <command>CREATE TEXT SEARCH TEMPLATE</command> statement in the SQL + standard. + </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertstemplate" endterm="sql-altertstemplate-title"></member> + <member><xref linkend="sql-droptstemplate" endterm="sql-droptstemplate-title"></member> + </simplelist> + </refsect1> +</refentry> diff --git a/doc/src/sgml/ref/drop_tsconfig.sgml b/doc/src/sgml/ref/drop_tsconfig.sgml new file mode 100644 index 00000000000..5cc8e3caf33 --- /dev/null +++ b/doc/src/sgml/ref/drop_tsconfig.sgml @@ -0,0 +1,118 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/drop_tsconfig.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-DROPTSCONFIG"> + <refmeta> + <refentrytitle id="SQL-DROPTSCONFIG-TITLE">DROP TEXT SEARCH CONFIGURATION</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>DROP TEXT SEARCH CONFIGURATION</refname> + <refpurpose>remove a text search configuration</refpurpose> + </refnamediv> + + <indexterm zone="sql-droptsconfig"> + <primary>DROP TEXT SEARCH CONFIGURATION</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +DROP TEXT SEARCH CONFIGURATION [ IF EXISTS ] <replaceable 
class="PARAMETER">name</replaceable> [ CASCADE | RESTRICT ] +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>DROP TEXT SEARCH CONFIGURATION</command> drops an existing text + search configuration. To execute this command you must be the owner of the + configuration. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + + <varlistentry> + <term><literal>IF EXISTS</literal></term> + <listitem> + <para> + Do not throw an error if the text search configuration does not exist. + A notice is issued in this case. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search + configuration. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CASCADE</literal></term> + <listitem> + <para> + Automatically drop objects that depend on the text search configuration. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>RESTRICT</literal></term> + <listitem> + <para> + Refuse to drop the text search configuration if any objects depend on it. + This is the default. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Examples</title> + + <para> + Remove the text search configuration <literal>my_english</literal>: + +<programlisting> +DROP TEXT SEARCH CONFIGURATION my_english; +</programlisting> + + This command will not succeed if there are any existing indexes + that reference the configuration in <function>to_tsvector</> calls. + Add <literal>CASCADE</> to + drop such indexes along with the text search configuration. + </para> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>DROP TEXT SEARCH CONFIGURATION</command> statement in + the SQL standard. 
+ </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertsconfig" endterm="sql-altertsconfig-title"></member> + <member><xref linkend="sql-createtsconfig" endterm="sql-createtsconfig-title"></member> + </simplelist> + </refsect1> + +</refentry> diff --git a/doc/src/sgml/ref/drop_tsdictionary.sgml b/doc/src/sgml/ref/drop_tsdictionary.sgml new file mode 100644 index 00000000000..683d4dc9b2f --- /dev/null +++ b/doc/src/sgml/ref/drop_tsdictionary.sgml @@ -0,0 +1,117 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/drop_tsdictionary.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-DROPTSDICTIONARY"> + <refmeta> + <refentrytitle id="SQL-DROPTSDICTIONARY-TITLE">DROP TEXT SEARCH DICTIONARY</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>DROP TEXT SEARCH DICTIONARY</refname> + <refpurpose>remove a text search dictionary</refpurpose> + </refnamediv> + + <indexterm zone="sql-droptsdictionary"> + <primary>DROP TEXT SEARCH DICTIONARY</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +DROP TEXT SEARCH DICTIONARY [ IF EXISTS ] <replaceable class="PARAMETER">name</replaceable> [ CASCADE | RESTRICT ] +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>DROP TEXT SEARCH DICTIONARY</command> drops an existing text + search dictionary. To execute this command you must be the owner of the + dictionary. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + + <varlistentry> + <term><literal>IF EXISTS</literal></term> + <listitem> + <para> + Do not throw an error if the text search dictionary does not exist. + A notice is issued in this case. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search + dictionary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CASCADE</literal></term> + <listitem> + <para> + Automatically drop objects that depend on the text search dictionary. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>RESTRICT</literal></term> + <listitem> + <para> + Refuse to drop the text search dictionary if any objects depend on it. + This is the default. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Examples</title> + + <para> + Remove the text search dictionary <literal>english</literal>: + +<programlisting> +DROP TEXT SEARCH DICTIONARY english; +</programlisting> + + This command will not succeed if there are any existing text search + configurations that use the dictionary. Add <literal>CASCADE</> to + drop such configurations along with the dictionary. + </para> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>DROP TEXT SEARCH DICTIONARY</command> statement in the + SQL standard. 
+ </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertsdictionary" endterm="sql-altertsdictionary-title"></member> + <member><xref linkend="sql-createtsdictionary" endterm="sql-createtsdictionary-title"></member> + </simplelist> + </refsect1> + +</refentry> diff --git a/doc/src/sgml/ref/drop_tsparser.sgml b/doc/src/sgml/ref/drop_tsparser.sgml new file mode 100644 index 00000000000..7a482283ada --- /dev/null +++ b/doc/src/sgml/ref/drop_tsparser.sgml @@ -0,0 +1,115 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/drop_tsparser.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-DROPTSPARSER"> + <refmeta> + <refentrytitle id="SQL-DROPTSPARSER-TITLE">DROP TEXT SEARCH PARSER</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>DROP TEXT SEARCH PARSER</refname> + <refpurpose>remove a text search parser</refpurpose> + </refnamediv> + + <indexterm zone="sql-droptsparser"> + <primary>DROP TEXT SEARCH PARSER</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +DROP TEXT SEARCH PARSER [ IF EXISTS ] <replaceable class="PARAMETER">name</replaceable> [ CASCADE | RESTRICT ] +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>DROP TEXT SEARCH PARSER</command> drops an existing text search + parser. You must be a superuser to use this command. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + + <varlistentry> + <term><literal>IF EXISTS</literal></term> + <listitem> + <para> + Do not throw an error if the text search parser does not exist. + A notice is issued in this case. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search parser. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CASCADE</literal></term> + <listitem> + <para> + Automatically drop objects that depend on the text search parser. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>RESTRICT</literal></term> + <listitem> + <para> + Refuse to drop the text search parser if any objects depend on it. + This is the default. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Examples</title> + + <para> + Remove the text search parser <literal>my_parser</literal>: + +<programlisting> +DROP TEXT SEARCH PARSER my_parser; +</programlisting> + + This command will not succeed if there are any existing text search + configurations that use the parser. Add <literal>CASCADE</> to + drop such configurations along with the parser. + </para> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>DROP TEXT SEARCH PARSER</command> statement in the + SQL standard. 
+ </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertsparser" endterm="sql-altertsparser-title"></member> + <member><xref linkend="sql-createtsparser" endterm="sql-createtsparser-title"></member> + </simplelist> + </refsect1> + +</refentry> diff --git a/doc/src/sgml/ref/drop_tstemplate.sgml b/doc/src/sgml/ref/drop_tstemplate.sgml new file mode 100644 index 00000000000..6a776d1da97 --- /dev/null +++ b/doc/src/sgml/ref/drop_tstemplate.sgml @@ -0,0 +1,116 @@ +<!-- +$PostgreSQL: pgsql/doc/src/sgml/ref/drop_tstemplate.sgml,v 1.1 2007/08/21 21:08:47 tgl Exp $ +PostgreSQL documentation +--> + +<refentry id="SQL-DROPTSTEMPLATE"> + <refmeta> + <refentrytitle id="SQL-DROPTSTEMPLATE-TITLE">DROP TEXT SEARCH TEMPLATE</refentrytitle> + <refmiscinfo>SQL - Language Statements</refmiscinfo> + </refmeta> + + <refnamediv> + <refname>DROP TEXT SEARCH TEMPLATE</refname> + <refpurpose>remove a text search template</refpurpose> + </refnamediv> + + <indexterm zone="sql-droptstemplate"> + <primary>DROP TEXT SEARCH TEMPLATE</primary> + </indexterm> + + <refsynopsisdiv> +<synopsis> +DROP TEXT SEARCH TEMPLATE [ IF EXISTS ] <replaceable class="PARAMETER">name</replaceable> [ CASCADE | RESTRICT ] +</synopsis> + </refsynopsisdiv> + + <refsect1> + <title>Description</title> + + <para> + <command>DROP TEXT SEARCH TEMPLATE</command> drops an existing text search + template. You must be a superuser to use this command. + </para> + </refsect1> + + <refsect1> + <title>Parameters</title> + + <variablelist> + + <varlistentry> + <term><literal>IF EXISTS</literal></term> + <listitem> + <para> + Do not throw an error if the text search template does not exist. + A notice is issued in this case. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><replaceable class="parameter">name</replaceable></term> + <listitem> + <para> + The name (optionally schema-qualified) of an existing text search + template. 
+ </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>CASCADE</literal></term> + <listitem> + <para> + Automatically drop objects that depend on the text search template. + </para> + </listitem> + </varlistentry> + + <varlistentry> + <term><literal>RESTRICT</literal></term> + <listitem> + <para> + Refuse to drop the text search template if any objects depend on it. + This is the default. + </para> + </listitem> + </varlistentry> + </variablelist> + </refsect1> + + <refsect1> + <title>Examples</title> + + <para> + Remove the text search template <literal>thesaurus</literal>: + +<programlisting> +DROP TEXT SEARCH TEMPLATE thesaurus; +</programlisting> + + This command will not succeed if there are any existing text search + dictionaries that use the template. Add <literal>CASCADE</> to + drop such dictionaries along with the template. + </para> + </refsect1> + + <refsect1> + <title>Compatibility</title> + + <para> + There is no <command>DROP TEXT SEARCH TEMPLATE</command> statement in the + SQL standard. 
+ </para> + </refsect1> + + <refsect1> + <title>See Also</title> + + <simplelist type="inline"> + <member><xref linkend="sql-altertstemplate" endterm="sql-altertstemplate-title"></member> + <member><xref linkend="sql-createtstemplate" endterm="sql-createtstemplate-title"></member> + </simplelist> + </refsect1> + +</refentry> diff --git a/doc/src/sgml/reference.sgml b/doc/src/sgml/reference.sgml index a52fe7a9644..1e8e5465314 100644 --- a/doc/src/sgml/reference.sgml +++ b/doc/src/sgml/reference.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/reference.sgml,v 1.64 2007/07/03 01:30:35 neilc Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/reference.sgml,v 1.65 2007/08/21 21:08:47 tgl Exp $ --> <part id="reference"> <title>Reference</title> @@ -50,6 +50,10 @@ &alterSequence; &alterTable; &alterTableSpace; + &alterTSConfig; + &alterTSDictionary; + &alterTSParser; + &alterTSTemplate; &alterTrigger; &alterType; &alterUser; @@ -83,6 +87,10 @@ &createTable; &createTableAs; &createTableSpace; + &createTSConfig; + &createTSDictionary; + &createTSParser; + &createTSTemplate; &createTrigger; &createType; &createUser; @@ -110,6 +118,10 @@ &dropSequence; &dropTable; &dropTableSpace; + &dropTSConfig; + &dropTSDictionary; + &dropTSParser; + &dropTSTemplate; &dropTrigger; &dropType; &dropUser; diff --git a/doc/src/sgml/textsearch.sgml b/doc/src/sgml/textsearch.sgml new file mode 100644 index 00000000000..a6601d6edba --- /dev/null +++ b/doc/src/sgml/textsearch.sgml @@ -0,0 +1,3716 @@ +<chapter id="textsearch"> +<title>Full Text Search</title> + + +<sect1 id="textsearch-intro"> +<title>Introduction</title> + +<para> +Full Text Searching (<firstterm>text search</firstterm>) allows the +searching of documents that satisfy a <varname>query</varname>, and +optionally returns them in some order. 
The most common search is to find +all documents containing <varname>query terms</varname> and return them +in order of their <varname>similarity</varname> to the +<varname>query</varname>. Notions of <varname>query</varname> and +<varname>similarity</varname> are very flexible and depend on the specific +application. The simplest search considers <varname>query</varname> as a +set of words and <varname>similarity</varname> as the frequency of query +words in the document. Full text indexing can be done inside the +database or outside. Doing indexing inside the database allows easy access +to document metadata to assist in indexing and display. +</para> + +<para> +Textual search operators have existed in databases for years. +<productname>PostgreSQL</productname> has +<literal>~</literal>,<literal>~*</literal>, <literal>LIKE</literal>, +<literal>ILIKE</literal> operators for textual datatypes, but they lack +many essential properties required by modern information systems: + +<itemizedlist spacing="compact" mark="bullet"> +<listitem> +<para> +There is no linguistic support, even for English. Regular expressions are +not sufficient because they cannot easily handle derived words, +e.g., <literal>satisfies</literal> and <literal>satisfy</literal>. You might +miss documents which contain <literal>satisfies</literal>, although you +probably would like to find them when searching for +<literal>satisfy</literal>. It is possible to use <literal>OR</literal> +to search <emphasis>any</emphasis> of them, but it is tedious and error-prone +(some words can have several thousand derivatives). +</para> +</listitem> +<listitem><para> +They provide no ordering (ranking) of search results, which makes them +ineffective when thousands of matching documents are found. +</para></listitem> +<listitem> +<para> +They tend to be slow because they process all documents for every search and +there is no index support. 
+</para></listitem> +</itemizedlist> + +</para> + +<para> +Full text indexing allows documents to be <emphasis>preprocessed</emphasis> +and an index saved for later rapid searching. Preprocessing includes: + +<itemizedlist mark="none"> +<listitem><para> +<emphasis>Parsing documents into <firstterm>lexemes</></emphasis>. It is +useful to identify various lexemes, e.g. digits, words, complex words, +email addresses, so they can be processed differently. In principle +lexemes depend on the specific application but for an ordinary search it +is useful to have a predefined list of lexemes. <!-- add list of lexemes. +--> +</para></listitem> + +<listitem><para> +<emphasis>Dictionaries</emphasis> allow the conversion of lexemes into +a <emphasis>normalized form</emphasis> so it is not necessary to enter +search words in a specific form. +</para></listitem> + +<listitem><para> +<emphasis>Store</emphasis> preprocessed documents +optimized for searching. For example, represent each document as a sorted array +of lexemes. Along with lexemes it is desirable to store positional +information to use for <varname>proximity ranking</varname>, so that a +document which contains a more "dense" region of query words is assigned +a higher rank than one with scattered query words. +</para></listitem> +</itemizedlist> +</para> + +<para> +Dictionaries allow fine-grained control over how lexemes are created. With +dictionaries you can: +<itemizedlist spacing="compact" mark="bullet"> +<listitem><para> +Define "stop words" that should not be indexed. +</para> +</listitem> +<listitem><para> +Map synonyms to a single word using <application>ispell</>. +</para></listitem> +<listitem><para> +Map phrases to a single word using a thesaurus. +</para></listitem> +<listitem><para> +Map different variations of a word to a canonical form using +an <application>ispell</> dictionary. 
+</para></listitem> +<listitem><para> +Map different variations of a word to a canonical form using +<application>snowball</> stemmer rules. +</para></listitem> +</itemizedlist> + +</para> + +<para> +A data type (<xref linkend="textsearch-datatypes">), <type>tsvector</type> +is provided, for storing preprocessed documents, +along with a type <type>tsquery</type> for representing textual +queries. Also, a full text search operator <literal>@@</literal> is defined +for these data types (<xref linkend="textsearch-searches">). Full text +searches can be accelerated using indexes (<xref +linkend="textsearch-indexes">). +</para> + + +<sect2 id="textsearch-document"> +<title>What Is a <firstterm>Document</firstterm>?</title> + +<indexterm zone="textsearch-document"> +<primary>document</primary> +</indexterm> + +<para> +A document can be a simple text file stored in the file system. The full +text indexing engine can parse text files and store associations of lexemes +(words) with their parent document. Later, these associations are used to +search for documents which contain query words. In this case, the database +can be used to store the full text index and for executing searches, and +some unique identifier can be used to retrieve the document from the file +system. +</para> + +<para> +A document can also be any textual database attribute or a combination +(concatenation), which in turn can be stored in various tables or obtained +dynamically. In other words, a document can be constructed from different +parts for indexing and it might not exist as a whole. 
For example: +<programlisting> +SELECT title || ' ' || author || ' ' || abstract || ' ' || body AS document +FROM messages +WHERE mid = 12; + +SELECT m.title || ' ' || m.author || ' ' || m.abstract || ' ' || d.body AS document +FROM messages m, docs d +WHERE mid = did AND mid = 12; +</programlisting> +</para> + +<note> +<para> +Actually, in the previous example queries, <literal>COALESCE</literal> +<!-- TODO make this a link? --> +should be used to prevent a <literal>NULL</literal> attribute from causing +a <literal>NULL</literal> result. +</para> +</note> +</sect2> + +<sect2 id="textsearch-datatypes"> +<title>Data Types</title> + +<variablelist> + + +<indexterm zone="textsearch-datatypes"> +<primary>tsvector</primary> +</indexterm> + + +<varlistentry> +<term><firstterm>tsvector</firstterm></term> +<listitem> + +<para> +<type>tsvector</type> is a data type that represents a document and is +optimized for full text searching. In the simplest case, +<type>tsvector</type> is a sorted list of lexemes, so even without indexes +full text searches perform better than standard <literal>~</literal> and +<literal>LIKE</literal> operations: +<programlisting> +SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector; + tsvector +---------------------------------------------------- + 'a' 'on' 'and' 'ate' 'cat' 'fat' 'mat' 'rat' 'sat' +</programlisting> + +Notice, that <literal>space</literal> is also a lexeme: + +<programlisting> +SELECT 'space '' '' is a lexeme'::tsvector; + tsvector +---------------------------------- + 'a' 'is' ' ' 'space' 'lexeme' +</programlisting> + +Each lexeme, optionally, can have positional information which is used for +<varname>proximity ranking</varname>: +<programlisting> +SELECT 'a:1 fat:2 cat:3 sat:4 on:5 a:6 mat:7 and:8 ate:9 a:10 fat:11 rat:12'::tsvector; + tsvector +------------------------------------------------------------------------------- + 'a':1,6,10 'on':5 'and':8 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 
+</programlisting> + +Each lexeme position also can be labeled as <literal>'A'</literal>, +<literal>'B'</literal>, <literal>'C'</literal>, <literal>'D'</literal>, +where <literal>'D'</literal> is the default. These labels can be used to group +lexemes into different <emphasis>importance</emphasis> or +<emphasis>rankings</emphasis>, for example to reflect document structure. +Actual values can be assigned at search time and used during the calculation +of the document rank. This is very useful for controlling search results. +</para> +<para> +The concatenation operator, e.g. <literal>tsvector || tsvector</literal>, +can "construct" a document from several parts. The order is important if +<type>tsvector</type> contains positional information. Of course, +it is also possible to build a document using different tables: + +<programlisting> +SELECT 'fat:1 cat:2'::tsvector || 'fat:1 rat:2'::tsvector; + ?column? +--------------------------- + 'cat':2 'fat':1,3 'rat':4 +SELECT 'fat:1 rat:2'::tsvector || 'fat:1 cat:2'::tsvector; + ?column? +--------------------------- + 'cat':4 'fat':1,3 'rat':2 +</programlisting> + +</para> + +</listitem> + +</varlistentry> + +<indexterm zone="textsearch-datatypes"> +<primary>tsquery</primary> +</indexterm> + +<varlistentry> +<term><firstterm>tsquery</firstterm></term> +<listitem> + +<para> +<type>Tsquery</type> is a data type for textual queries which supports +the boolean operators <literal>&</literal> (AND), <literal>|</literal> (OR), +and parentheses. A <type>Tsquery</type> consists of lexemes +(optionally labeled by letters) with boolean operators in between: + +<programlisting> +SELECT 'fat & cat'::tsquery; + tsquery +--------------- + 'fat' & 'cat' +SELECT 'fat:ab & cat'::tsquery; + tsquery +------------------ + 'fat':AB & 'cat' +</programlisting> +Labels can be used to restrict the search region, which allows the +development of different search engines using the same full text index. 
+</para> + +<para> +<type>tsqueries</type> can be concatenated using <literal>&&</literal> (AND) +and <literal>||</literal> (OR) operators: +<programlisting> +SELECT 'a & b'::tsquery && 'c|d'::tsquery; + ?column? +--------------------------- + 'a' & 'b' & ( 'c' | 'd' ) + +SELECT 'a & b'::tsquery || 'c|d'::tsquery; + ?column? +--------------------------- + 'a' & 'b' | ( 'c' | 'd' ) +</programlisting> +</para> +</listitem> +</varlistentry> +</variablelist> + +</sect2> + +<sect2 id="textsearch-searches"> +<title>Performing Searches</title> + +<para> +Full text searching in <productname>PostgreSQL</productname> provides the +operator <type>@@</type> for two data types: <type>tsvector</type> +(document) and <type>tsquery</type> (query). Also, this operator +supports <type>TEXT</type>, <type>VARCHAR</type>, and <type>CHAR</type> +data types so simple full text searches can be done, but without ranking +support: +<programlisting> +tsvector @@ tsquery +tsquery @@ tsvector +TEXT | VARCHAR | CHAR @@ TEXT | tsquery +</programlisting> +</para> + +<para> +The full text operator <type>@@</type> returns <literal>true</literal> if +<type>tsvector</type> contains <type>tsquery</type>: +<programlisting> +SELECT 'cat & rat'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector; + ?column? +---------- + t +SELECT 'fat & cow'::tsquery @@ 'a fat cat sat on a mat and ate a fat rat'::tsvector; + ?column? +---------- + f +</programlisting> + +</para> + + +</sect2> + +</sect1> + +<sect1 id="textsearch-tables"> +<title>Tables and Indexes</title> + +<para> +The previous section described how to perform full text searches using +constant strings. This section shows how to search table data, optionally +using indexes. +</para> + +<sect2 id="textsearch-tables-search"> +<title>Searching a Table</title> + +<para> +It is possible to do full text table search with no index. 
A simple query
+to find all <literal>title</> entries that contain the word
+<literal>friend</> is:
+<programlisting>
+SELECT title
+FROM pgweb
+WHERE to_tsvector('english', body) @@ to_tsquery('friend')
+</programlisting>
+</para>
+
+<para>
+A more complex query is to select the ten most recent documents which
+contain <literal>create</> and <literal>table</> in the <literal>title</>
+or <literal>body</>:
+<programlisting>
+SELECT title
+FROM pgweb
+WHERE to_tsvector('english', textcat(title, body)) @@ to_tsquery('create & table')
+ORDER BY dlm DESC LIMIT 10;
+</programlisting>
+<literal>dlm</> is the last-modified date in seconds since 1970 so we
+used <command>ORDER BY dlm DESC LIMIT 10</> to get the most recent
+matches. For clarity we omitted the <function>coalesce</function> function
+which prevents the unwanted effect of <literal>NULL</literal>
+concatenation.
+</para>
+
+</sect2>
+
+<sect2 id="textsearch-tables-index">
+<title>Creating Indexes</title>
+
+<para>
+We can create a <acronym>GIN</acronym> (<xref
+linkend="textsearch-indexes">) index to speed up the search:
+<programlisting>
+CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', body));
+</programlisting>
+Notice that the 2-argument version of <function>to_tsvector</function> is
+used. Only text search functions which specify a configuration name can
+be used in expression indexes (<xref linkend="indexes-expressional">).
+Casting to a text search data type (<literal>::</>) is also unsupported.
+This is because the index contents should be unaffected by
+<varname>default_text_search_config</>. If they were affected, the index
+contents might be inconsistent because they could contain
+<type>tsvector</>s that were created with different default text search
+configurations. Recovering a table from a <application>pg_dump</> would
+also not recreate index <type>tsvector</>s properly.
+</para>
+
+<para>
+Because the two-argument version of <function>to_tsvector</function> was
+used in the index above, only a query reference that uses the 2-argument
+version of <function>to_tsvector</function> with the same configuration
+name will use that index, i.e. <literal>WHERE 'a & b' @@
+to_tsvector('english', body)</> will use the index, but <literal>WHERE
+'a & b' @@ to_tsvector(body)</> and <literal>WHERE 'a & b' @@
+body::tsvector</> will not. This guarantees that an index will be used
+only with the same configuration used to create the index rows.
+</para>
+
+<para>
+It is possible to set up more complex expression indexes where the
+configuration name is specified by another column, e.g.:
+<programlisting>
+CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector(conf_name, body));
+</programlisting>
+where <literal>conf_name</> is a column in the <literal>pgweb</>
+table. This allows mixed configurations in the same index while
+recording which configuration was used for each index row.
+</para>
+
+<para>
+Indexes can even concatenate columns:
+<programlisting>
+CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('english', textcat(title, body)));
+</programlisting>
+</para>
+
+<para>
+A more complex case is to create a separate <type>tsvector</> column
+to hold the output of <function>to_tsvector()</>. This example is a
+concatenation of <literal>title</literal> and <literal>body</literal>,
+with ranking information.
We assign different labels to them to encode +information about the origin of each word: +<programlisting> +ALTER TABLE pgweb ADD COLUMN textsearch_index tsvector; +UPDATE pgweb SET textsearch_index = + setweight(to_tsvector('english', coalesce(title,'')), 'A') || ' ' || + setweight(to_tsvector('english', coalesce(body,'')),'D'); +</programlisting> +Then we create a <acronym>GIN</acronym> index to speed up the search: +<programlisting> +CREATE INDEX textsearch_idx ON pgweb USING gin(textsearch_index); +</programlisting> +After vacuuming, we are ready to perform a fast full text search: +<programlisting> +SELECT rank_cd(textsearch_index, q) AS rank, title +FROM pgweb, to_tsquery('create & table') q +WHERE q @@ textsearch_index +ORDER BY rank DESC LIMIT 10; +</programlisting> +It is necessary to create a trigger to keep the new <type>tsvector</> +column current anytime <literal>title</> or <literal>body</> changes. +Keep in mind that, just like with expression indexes, it is important to +specify the configuration name when creating text search data types +inside triggers so the column's contents are not affected by changes to +<varname>default_text_search_config</>. +</para> + +</sect2> + +</sect1> + +<sect1 id="textsearch-opfunc"> +<title>Operators and Functions</title> + +<para> +This section outlines all the functions and operators that are available +for full text searching. +</para> + +<para> +Full text search vectors and queries both use lexemes, but for different +purposes. A <type>tsvector</type> represents the lexemes (tokens) parsed +out of a document, with an optional position. A <type>tsquery</type> +specifies a boolean condition using lexemes. +</para> + +<para> +All of the following functions that accept a configuration argument can +use a textual configuration name to select a configuration. If the option +is omitted the configuration specified by +<varname>default_text_search_config</> is used. 
For more information on +configuration, see <xref linkend="textsearch-tables-configuration">. +</para> + +<sect2 id="textsearch-search-operator"> +<title>Search</title> + +<para>The operator <literal>@@</> is used to perform full text +searches:</para> + +<variablelist> + +<varlistentry> + +<indexterm zone="textsearch-search-operator"> +<primary>TSVECTOR @@ TSQUERY</primary> +</indexterm> + +<term> +<synopsis> +<!-- why allow such combinations? --> +TSVECTOR @@ TSQUERY +TSQUERY @@ TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>true</literal> if <literal>TSQUERY</literal> is contained +in <literal>TSVECTOR</literal>, and <literal>false</literal> if not: +<programlisting> +SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'cat & rat'::tsquery; + ?column? + ---------- + t +SELECT 'a fat cat sat on a mat and ate a fat rat'::tsvector @@ 'fat & cow'::tsquery; + ?column? + ---------- + f +</programlisting> +</para> + +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-search-operator"> +<primary>TEXT @@ TSQUERY</primary> +</indexterm> + +<term> +<synopsis> +TEXT @@ TSQUERY +VARCHAR @@ TSQUERY +CHAR @@ TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>true</literal> if <literal>TSQUERY</literal> is contained +in <literal>TEXT/VARCHAR</literal>, and <literal>false</literal> if not: +<programlisting> +SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & rat'::tsquery; + ?column? +---------- + t +SELECT 'a fat cat sat on a mat and ate a fat rat'::text @@ 'cat & cow'::tsquery; + ?column? +---------- + f +</programlisting> +</para> + +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-search-operator"> +<primary>TEXT @@ TEXT</primary> +</indexterm> + +<term> +<synopsis> +<!-- this is very confusing because there is no rule suggesting which is +first. 
--> +TEXT @@ TEXT +VARCHAR @@ TEXT +CHAR @@ TEXT +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>true</literal> if the right +argument (the query) is contained in the left argument, and +<literal>false</literal> otherwise: +<programlisting> +SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat rat'; + ?column? +---------- + t +SELECT 'a fat cat sat on a mat and ate a fat rat' @@ 'cat cow'; + ?column? +---------- + f +</programlisting> +</para> + +</listitem> +</varlistentry> + + +</variablelist> + +<para> +For index support of full text operators consult <xref linkend="textsearch-indexes">. +</para> + +</sect2> + + + +<sect2 id="textsearch-tsvector"> +<title>tsvector</title> + +<variablelist> + +<varlistentry> + +<indexterm zone="textsearch-tsvector"> +<primary>to_tsvector</primary> +</indexterm> + +<term> +<synopsis> +to_tsvector(<optional><replaceable class="PARAMETER">conf_name</replaceable></optional>, <replaceable class="PARAMETER">document</replaceable> TEXT) returns TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +Parses a document into tokens, reduces the tokens to lexemes, and returns a +<type>tsvector</type> which lists the lexemes together with their positions in the document +in lexicographic order. +</para> + +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsvector"> +<primary>strip</primary> +</indexterm> + +<term> +<synopsis> +strip(<replaceable class="PARAMETER">vector</replaceable> TSVECTOR) returns TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +Returns a vector which lists the same lexemes as the given vector, but +which lacks any information about where in the document each lexeme +appeared. While the returned vector is useless for relevance ranking it +will usually be much smaller. 
+</para> +</listitem> + +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsvector"> +<primary>setweight</primary> +</indexterm> + +<term> +<synopsis> +setweight(<replaceable class="PARAMETER">vector</replaceable> TSVECTOR, <replaceable class="PARAMETER">letter</replaceable>) returns TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +This function returns a copy of the input vector in which every location +has been labeled with either the letter <literal>'A'</literal>, +<literal>'B'</literal>, or <literal>'C'</literal>, or the default label +<literal>'D'</literal> (which is the default for new vectors +and as such is usually not displayed). These labels are retained +when vectors are concatenated, allowing words from different parts of a +document to be weighted differently by ranking functions. +</para> +</listitem> +</varlistentry> + + + +<varlistentry> + +<indexterm zone="textsearch-tsvector"> +<primary>tsvector concatenation</primary> +</indexterm> + +<term> +<synopsis> +<replaceable class="PARAMETER">vector1</replaceable> || <replaceable class="PARAMETER">vector2</replaceable> +concat(<replaceable class="PARAMETER">vector1</replaceable> TSVECTOR, <replaceable class="PARAMETER">vector2</replaceable> TSVECTOR) returns TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +Returns a vector which combines the lexemes and positional information of +the two vectors given as arguments. Positional weight labels (described +in the previous paragraph) are retained during the concatenation. This +has at least two uses. First, if some sections of your document need to be +parsed with different configurations than others, you can parse them +separately and then concatenate the resulting vectors. Second, you can +weigh words from one section of your document differently than the others +by parsing the sections into separate vectors and assigning each vector +a different position label with the <function>setweight()</function> +function. 
You can then concatenate them into a single vector and provide +a weights argument to the <function>rank()</function> function that assigns +different weights to positions with different labels. +</para> +</listitem> +</varlistentry> + + +<varlistentry> +<indexterm zone="textsearch-tsvector"> +<primary>length(tsvector)</primary> +</indexterm> + +<term> +<synopsis> +length(<replaceable class="PARAMETER">vector</replaceable> TSVECTOR) returns INT4 +</synopsis> +</term> + +<listitem> +<para> +Returns the number of lexemes stored in the vector. +</para> +</listitem> +</varlistentry> + + +<varlistentry> +<indexterm zone="textsearch-tsvector"> +<primary>text::tsvector</primary> +</indexterm> + +<term> +<synopsis> +<replaceable>text</replaceable>::TSVECTOR returns TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +Directly casting <type>text</type> to a <type>tsvector</type> allows you +to directly inject lexemes into a vector with whatever positions and +positional weights you choose to specify. The text should be formatted to +match the way a vector is displayed by <literal>SELECT</literal>. +<!-- TODO what a strange definition, I think something like +"input format" or so should be used (and defined somewhere, didn't see +it yet) --> +</para> +</listitem> +</varlistentry> + + +<varlistentry> +<indexterm zone="textsearch-tsvector"> +<primary>trigger</primary> +</indexterm> + +<term> +<synopsis> +tsvector_update_trigger(<optional><replaceable class="PARAMETER">vector_column_name</replaceable></optional>, <optional><replaceable class="PARAMETER">filter_name</replaceable></optional>, <replaceable class="PARAMETER">text_column_name</replaceable> <optional>, ... </optional>) +</synopsis> +</term> + +<listitem> +<para> +The <function>tsvector_update_trigger()</function> trigger is used to +automatically update vector_column_name. +<replaceable>filter_name</replaceable> is the function name to preprocess +<replaceable>text_column_name</replaceable>. 
There can be many functions +and text columns specified in a +<function>tsvector_update_trigger()</function> trigger. If multiple +functions are specified, they apply to the following columns until the +next function appears. As an example of using a filter, function +<function>dropatsymbol</function> replaces all entries of the +<literal>@</literal> sign with a space: + +<programlisting> +CREATE FUNCTION dropatsymbol(text) +RETURNS text +AS 'SELECT replace($1, ''@'', '' '');' +LANGUAGE SQL; + +CREATE TRIGGER tsvectorupdate BEFORE UPDATE OR INSERT +ON tblMessages FOR EACH ROW EXECUTE PROCEDURE +tsvector_update_trigger(tsvector_column, dropatsymbol, strMessage); +</programlisting> + +</para> +</listitem> +</varlistentry> + + +<varlistentry> +<indexterm zone="textsearch-tsvector"> +<primary>stat</primary> +</indexterm> + +<term> +<synopsis> +stat(<optional><replaceable class="PARAMETER">sqlquery</replaceable> text </optional>, <optional>weight text </optional>) returns SETOF statinfo +<!-- TODO I guess that not both of the arguments are optional? --> +</synopsis> +</term> + +<listitem> +<para> +Here <type>statinfo</type> is a type, defined as: +<programlisting> +CREATE TYPE statinfo AS (word text, ndoc int4, nentry int4); +</programlisting> +and <replaceable>sqlquery</replaceable> is a query which returns a +<type>tsvector</type> column's contents. <function>stat</> returns +statistics about a <type>tsvector</type> column, i.e., the number of +documents, <literal>ndoc</>, and the total number of words in the +collection, <literal>nentry</>. It is useful for checking your +configuration and to find stop word candidates. 
For example, to find +the ten most frequent words: + +<programlisting> +SELECT * FROM stat('SELECT vector from apod') +ORDER BY ndoc DESC, nentry DESC, word +LIMIT 10; +</programlisting> + +Optionally, one can specify <replaceable>weight</replaceable> to obtain +statistics about words with a specific <replaceable>weight</replaceable>: + +<programlisting> +SELECT * FROM stat('SELECT vector FROM apod','a') +ORDER BY ndoc DESC, nentry DESC, word +LIMIT 10; +</programlisting> + +</para> +</listitem> +</varlistentry> + + +<varlistentry> +<indexterm zone="textsearch-tsvector"> +<primary>Btree operations for tsvector</primary> +</indexterm> + +<term> +<synopsis> +TSVECTOR < TSVECTOR +TSVECTOR <= TSVECTOR +TSVECTOR = TSVECTOR +TSVECTOR >= TSVECTOR +TSVECTOR > TSVECTOR +</synopsis> +</term> + +<listitem> +<para> +All btree operations are defined for the <type>tsvector</type> type. +<type>tsvector</>s are compared with each other using +<emphasis>lexicographical</emphasis> ordering. +<!-- TODO of the output representation or something else? --> +</para> +</listitem> +</varlistentry> + +</variablelist> + + +</sect2> + +<sect2 id="textsearch-tsquery"> +<title>tsquery</title> + + +<variablelist> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>to_tsquery</primary> +</indexterm> + +<term> +<synopsis> +to_tsquery(<optional><replaceable class="PARAMETER">conf_name</replaceable></optional>, <replaceable class="PARAMETER">querytext</replaceable> text) returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Accepts <replaceable>querytext</replaceable>, which should consist of single tokens +separated by the boolean operators <literal>&</literal> (and), <literal>|</literal> +(or) and <literal>!</literal> (not), which can be grouped using parentheses. +In other words, <function>to_tsquery</function> expects already parsed text. +Each token is reduced to a lexeme using the specified or current configuration. 
+A weight class can be assigned to each lexeme entry to restrict the search region +(see <function>setweight</function> for an explanation). For example: +<programlisting> +'fat:a & rats' +</programlisting> +The <function>to_tsquery</function> function can also accept a <literal>text +string</literal>. In this case <replaceable>querytext</replaceable> should +be quoted. This may be useful, for example, to use with a thesaurus +dictionary. In the example below, a thesaurus contains rule <literal>supernovae +stars : sn</literal>: +<programlisting> +SELECT to_tsquery('''supernovae stars'' & !crab'); + to_tsquery +---------------- + 'sn' & !'crab' +</programlisting> +Without quotes <function>to_tsquery</function> will generate a syntax error. +</para> + +</listitem> +</varlistentry> + + + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>plainto_tsquery</primary> +</indexterm> + +<term> +<synopsis> +plainto_tsquery(<optional><replaceable class="PARAMETER">conf_name</replaceable></optional>, <replaceable class="PARAMETER">querytext</replaceable> text) returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Transforms unformatted text <replaceable>querytext</replaceable> to <type>tsquery</type>. +It is the same as <function>to_tsquery</function> but accepts <literal>text</literal> +without quotes and will call the parser to break it into tokens. +<function>plainto_tsquery</function> assumes the <literal>&</literal> boolean +operator between words and does not recognize weight classes. +</para> +</listitem> +</varlistentry> + + + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>querytree</primary> +</indexterm> + +<term> +<synopsis> +querytree(<replaceable class="PARAMETER">query</replaceable> TSQUERY) returns TEXT +</synopsis> +</term> + +<listitem> +<para> +This returns the query used for searching an index. It can be used to test +for an empty query. 
The <command>SELECT</> below returns <literal>NULL</>, +which corresponds to an empty query since GIN indexes do not support queries with negation +<!-- TODO or "negated queries" (depending on what the correct rule is) --> +(a full index scan is inefficient): +<programlisting> +SELECT querytree(to_tsquery('!defined')); + querytree +----------- + +</programlisting> +</para> +</listitem> +</varlistentry> + + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>text::tsquery casting</primary> +</indexterm> + +<term> +<synopsis> +<replaceable class="PARAMETER">text</replaceable>::TSQUERY returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Directly casting <replaceable>text</replaceable> to a <type>tsquery</type> +allows you to directly inject lexemes into a query using whatever positions +and positional weight flags you choose to specify. The text should be +formatted to match the way a vector is displayed by +<literal>SELECT</literal>. +<!-- TODO what a strange definition, I think something like +"input format" or so should be used (and defined somewhere, didn't see +it yet) --> +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>numnode</primary> +</indexterm> + +<term> +<synopsis> +numnode(<replaceable class="PARAMETER">query</replaceable> TSQUERY) returns INTEGER +</synopsis> +</term> + +<listitem> +<para> +This returns the number of nodes in a query tree. 
This function can be +used to determine if <replaceable>query</replaceable> is meaningful +(returns > 0), or contains only stop words (returns 0): +<programlisting> +SELECT numnode(plainto_tsquery('the any')); +NOTICE: query contains only stopword(s) or does not contain lexeme(s), +ignored + numnode +--------- + 0 +SELECT numnode(plainto_tsquery('the table')); + numnode +--------- + 1 +SELECT numnode(plainto_tsquery('long table')); + numnode +--------- + 3 +</programlisting> +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>TSQUERY && TSQUERY</primary> +</indexterm> + +<term> +<synopsis> +TSQUERY && TSQUERY returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>AND</literal>-ed TSQUERY +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>TSQUERY || TSQUERY</primary> +</indexterm> + +<term> +<synopsis> +TSQUERY || TSQUERY returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>OR</literal>-ed TSQUERY +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>!! TSQUERY</primary> +</indexterm> + +<term> +<synopsis> +!! TSQUERY returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +negation of TSQUERY +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>Btree operations for tsquery</primary> +</indexterm> + +<term> +<synopsis> +TSQUERY < TSQUERY +TSQUERY <= TSQUERY +TSQUERY = TSQUERY +TSQUERY >= TSQUERY +TSQUERY > TSQUERY +</synopsis> +</term> + +<listitem> +<para> +All btree operations are defined for the <type>tsquery</type> type. +tsqueries are compared to each other using <emphasis>lexicographical</emphasis> +ordering. 
+</para> +</listitem> +</varlistentry> + +</variablelist> + +<sect3 id="textsearch-queryrewriting"> +<title>Query Rewriting</title> + +<para> +Query rewriting is a set of functions and operators for the +<type>tsquery</type> data type. It allows control at search +<emphasis>query time</emphasis> without reindexing (the opposite of the +thesaurus). For example, you can expand the search using synonyms +(<literal>new york</>, <literal>big apple</>, <literal>nyc</>, +<literal>gotham</>) or narrow the search to direct the user to some hot +topic. +</para> + +<para> +The <function>rewrite()</function> function changes the original query by +replacing part of the query with some other string of type <type>tsquery</type>, +as defined by the rewrite rule. Arguments to <function>rewrite()</function> +can be names of columns of type <type>tsquery</type>. +</para> + +<programlisting> +CREATE TABLE aliases (t TSQUERY PRIMARY KEY, s TSQUERY); +INSERT INTO aliases VALUES('a', 'c'); +</programlisting> + +<variablelist> +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>rewrite - 1</primary> +</indexterm> + +<term> +<synopsis> +rewrite (<replaceable class="PARAMETER">query</replaceable> TSQUERY, <replaceable class="PARAMETER">target</replaceable> TSQUERY, <replaceable class="PARAMETER">sample</replaceable> TSQUERY) returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +<programlisting> +SELECT rewrite('a & b'::tsquery, 'a'::tsquery, 'c'::tsquery); + rewrite + ----------- + 'b' & 'c' +</programlisting> +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>rewrite - 2</primary> +</indexterm> + +<term> +<synopsis> +rewrite(ARRAY[<replaceable class="PARAMETER">query</replaceable> TSQUERY, <replaceable class="PARAMETER">target</replaceable> TSQUERY, <replaceable class="PARAMETER">sample</replaceable> TSQUERY]) returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +<programlisting> +SELECT rewrite(ARRAY['a 
& b'::tsquery, t,s]) FROM aliases; + rewrite + ----------- + 'b' & 'c' +</programlisting> +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>rewrite - 3</primary> +</indexterm> + +<term> +<synopsis> +rewrite (<replaceable class="PARAMETER">query</> TSQUERY,<literal>'SELECT target ,sample FROM test'</literal>::text) returns TSQUERY +</synopsis> +</term> + +<listitem> +<para> +<programlisting> +SELECT rewrite('a & b'::tsquery, 'SELECT t,s FROM aliases'); + rewrite + ----------- + 'b' & 'c' +</programlisting> +</para> +</listitem> +</varlistentry> +</variablelist> + +<para> +What if there are several instances of rewriting? For example, query +<literal>'a & b'</literal> can be rewritten as +<literal>'b & c'</literal> and <literal>'cc'</literal>. + +<programlisting> +SELECT * FROM aliases; + t | s +-----------+------ + 'a' | 'c' + 'x' | 'z' + 'a' & 'b' | 'cc' +</programlisting> +This ambiguity can be resolved by specifying a sort order: +<programlisting> +SELECT rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t DESC'); + rewrite +--------- + 'cc' +SELECT rewrite('a & b', 'SELECT t, s FROM aliases ORDER BY t ASC'); + rewrite +----------- + 'b' & 'c' +</programlisting> +</para> + +<para> +Let's consider a real-life astronomical example. We'll expand query +<literal>supernovae</literal> using table-driven rewriting rules: +<programlisting> +CREATE TABLE aliases (t tsquery primary key, s tsquery); +INSERT INTO aliases VALUES(to_tsquery('supernovae'), to_tsquery('supernovae|sn')); +SELECT rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); + ?column? +--------------------------------- + ( 'supernova' | 'sn' ) & 'crab' +</programlisting> +Notice, that we can change the rewriting rule online<!-- TODO maybe use another word for "online"? 
-->: +<programlisting> +UPDATE aliases SET s=to_tsquery('supernovae|sn & !nebulae') WHERE t=to_tsquery('supernovae'); +SELECT rewrite(to_tsquery('supernovae'), 'SELECT * FROM aliases') && to_tsquery('crab'); + ?column? +--------------------------------------------- + ( 'supernova' | 'sn' & !'nebula' ) & 'crab' +</programlisting> +</para> +</sect3> + +<sect3 id="textsearch-tsquery-ops"> +<title>Operators For tsquery</title> + +<para> +Rewriting can be slow for many rewriting rules since it checks every rule +for a possible hit. To filter out obvious non-candidate rules there are containment +operators for the <type>tsquery</type> type. In the example below, we select only those +rules which might contain the original query: +<programlisting> +SELECT rewrite(ARRAY['a & b'::tsquery, t,s]) +FROM aliases +WHERE 'a & b' @> t; + rewrite +----------- + 'b' & 'c' +</programlisting> + +</para> + +<para> +Two operators are defined for <type>tsquery</type>: +</para> + +<variablelist> +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>TSQUERY @> TSQUERY</primary> +</indexterm> + +<term> +<synopsis> +TSQUERY @> TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>true</literal> if the right argument might be contained in left argument. +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-tsquery"> +<primary>tsquery <@ tsquery</primary> +</indexterm> + +<term> +<synopsis> +TSQUERY <@ TSQUERY +</synopsis> +</term> + +<listitem> +<para> +Returns <literal>true</literal> if the left argument might be contained in right argument. 
+</para> +</listitem> +</varlistentry> +</variablelist> + + +</sect3> + +<sect3 id="textsearch-tsqueryindex"> +<title>Index For tsquery</title> + +<para> +To speed up operators <literal><@</> and <literal>@></literal> for +<type>tsquery</type> one can use a <acronym>GiST</acronym> index with +a <literal>tsquery_ops</literal> opclass: + +<programlisting> +CREATE INDEX t_idx ON aliases USING gist (t tsquery_ops); +</programlisting> +</para> + +</sect3> + +</sect2> + +</sect1> + +<sect1 id="textsearch-controls"> +<title>Additional Controls</title> + +<para> +To implement full text searching there must be a function to create a +<type>tsvector</type> from a document and a <type>tsquery</type> from a +user query. Also, we need to return results in some order, i.e., we need +a function which compares documents with respect to their relevance to +the <type>tsquery</type>. Full text searching in +<productname>PostgreSQL</productname> provides support for all of these +functions. +</para> + +<sect2 id="textsearch-parser"> +<title>Parsing</title> + +<para> +Full text searching in <productname>PostgreSQL</productname> provides +function <function>to_tsvector</function>, which converts a document to +the <type>tsvector</type> data type. More details are available in <xref +linkend="textsearch-tsvector">, but for now consider a simple example: +<programlisting> +SELECT to_tsvector('english', 'a fat cat sat on a mat - it ate a fat rats'); + to_tsvector +----------------------------------------------------- + 'ate':9 'cat':3 'fat':2,11 'mat':7 'rat':12 'sat':4 +</programlisting> +</para> + +<para> +In the example above we see that the resulting <type>tsvector</type> does not +contain the words <literal>a</literal>, <literal>on</literal>, or +<literal>it</literal>, the word <literal>rats</literal> became +<literal>rat</literal>, and the punctuation sign <literal>-</literal> was +ignored. 
+</para> + +<para> +The <function>to_tsvector</function> function internally calls a parser +which breaks the document (<literal>a fat cat sat on a mat - it ate a +fat rats</literal>) into words and corresponding types. The default parser +recognizes 23 types. Each word, depending on its type, passes through a +group of dictionaries (<xref linkend="textsearch-dictionaries">). At the +end of this step we obtain <emphasis>lexemes</emphasis>. For example, +<literal>rats</literal> became <literal>rat</literal> because one of the +dictionaries recognized that the word <literal>rats</literal> is a plural +form of <literal>rat</literal>. Some words are treated as "stop words" +(<xref linkend="textsearch-stopwords">) and ignored since they occur too +frequently and have little informational value. In our example these are +<literal>a</literal>, <literal>on</literal>, and <literal>it</literal>. +The punctuation sign <literal>-</literal> was also ignored because its +type (<literal>Space symbols</literal>) is not indexed. All information +about the parser, dictionaries and what types of lexemes to index is +documented in the full text configuration section (<xref +linkend="textsearch-tables-configuration">). It is possible to have +several different configurations in the same database, and many predefined +system configurations are available for different languages. In our example +we used the default configuration <literal>english</literal> for the +English language. 
+</para> + +<para> +As another example, below is the output from the <function>ts_debug</function> +function ( <xref linkend="textsearch-debugging"> ), which shows all details +of the full text machinery: +<programlisting> +SELECT * FROM ts_debug('english','a fat cat sat on a mat - it ate a fat rats'); + Alias | Description | Token | Dicts list | Lexized token +-------+---------------+-------+----------------------+--------------------------- + lword | Latin word | a | {pg_catalog.en_stem} | pg_catalog.en_stem: {} + blank | Space symbols | | | + lword | Latin word | fat | {pg_catalog.en_stem} | pg_catalog.en_stem: {fat} + blank | Space symbols | | | + lword | Latin word | cat | {pg_catalog.en_stem} | pg_catalog.en_stem: {cat} + blank | Space symbols | | | + lword | Latin word | sat | {pg_catalog.en_stem} | pg_catalog.en_stem: {sat} + blank | Space symbols | | | + lword | Latin word | on | {pg_catalog.en_stem} | pg_catalog.en_stem: {} + blank | Space symbols | | | + lword | Latin word | a | {pg_catalog.en_stem} | pg_catalog.en_stem: {} + blank | Space symbols | | | + lword | Latin word | mat | {pg_catalog.en_stem} | pg_catalog.en_stem: {mat} + blank | Space symbols | | | + blank | Space symbols | - | | + lword | Latin word | it | {pg_catalog.en_stem} | pg_catalog.en_stem: {} + blank | Space symbols | | | + lword | Latin word | ate | {pg_catalog.en_stem} | pg_catalog.en_stem: {ate} + blank | Space symbols | | | + lword | Latin word | a | {pg_catalog.en_stem} | pg_catalog.en_stem: {} + blank | Space symbols | | | + lword | Latin word | fat | {pg_catalog.en_stem} | pg_catalog.en_stem: {fat} + blank | Space symbols | | | + lword | Latin word | rats | {pg_catalog.en_stem} | pg_catalog.en_stem: {rat} +(24 rows) +</programlisting> +</para> + +<para> +Function <function>setweight()</function> is used to label +<type>tsvector</type>. The typical usage of this is to mark out the +different parts of a document, perhaps by importance. 
Later, this can be +used for ranking of search results in addition to positional information +(distance between query terms). If no ranking is required, positional +information can be removed from <type>tsvector</type> using the +<function>strip()</function> function to save space. +</para> + +<para> +Because <function>to_tsvector</function>(<LITERAL>NULL</LITERAL>) can +return <LITERAL>NULL</LITERAL>, it is recommended to use +<function>coalesce</function>. Here is the safe method for creating a +<type>tsvector</type> from a structured document: +<programlisting> +UPDATE tt SET ti= + setweight(to_tsvector(coalesce(title,'')), 'A') || ' ' || + setweight(to_tsvector(coalesce(keyword,'')), 'B') || ' ' || + setweight(to_tsvector(coalesce(abstract,'')), 'C') || ' ' || + setweight(to_tsvector(coalesce(body,'')), 'D'); +</programlisting> +</para> + +<para> +The following functions allow manual parsing control: + +<variablelist> + +<varlistentry> + +<indexterm zone="textsearch-parser"> +<primary>parse</primary> +</indexterm> + +<term> +<synopsis> +parse(<replaceable class="PARAMETER">parser</replaceable>, <replaceable class="PARAMETER">document</replaceable> TEXT) returns SETOF <type>tokenout</type> +</synopsis> +</term> + +<listitem> +<para> +Parses the given <replaceable>document</replaceable> and returns a series +of records, one for each token produced by parsing. 
Each record includes +a <varname>tokid</varname> giving its type and a <varname>token</varname> +which gives its content: +<programlisting> +SELECT * FROM parse('default','123 - a number'); + tokid | token +-------+-------- + 22 | 123 + 12 | + 12 | - + 1 | a + 12 | + 1 | number +</programlisting> +</para> +</listitem> +</varlistentry> + +<varlistentry> +<indexterm zone="textsearch-parser"> +<primary>token_type</primary> +</indexterm> + +<term> +<synopsis> +token_type(<replaceable class="PARAMETER">parser</replaceable> ) returns SETOF <type>tokentype</type> +</synopsis> +</term> + +<listitem> +<para> +Returns a table which describes each kind of token the +<replaceable>parser</replaceable> might produce as output. For each token +type the table gives the <varname>tokid</varname> which the +<replaceable>parser</replaceable> uses to label each +<varname>token</varname> of that type, the <varname>alias</varname> which +names the token type, and a short <varname>description</varname>: +<programlisting> +SELECT * FROM token_type('default'); + tokid | alias | description +-------+--------------+----------------------------------- + 1 | lword | Latin word + 2 | nlword | Non-latin word + 3 | word | Word + 4 | email | Email + 5 | url | URL + 6 | host | Host + 7 | sfloat | Scientific notation + 8 | version | VERSION + 9 | part_hword | Part of hyphenated word + 10 | nlpart_hword | Non-latin part of hyphenated word + 11 | lpart_hword | Latin part of hyphenated word + 12 | blank | Space symbols + 13 | tag | HTML Tag + 14 | protocol | Protocol head + 15 | hword | Hyphenated word + 16 | lhword | Latin hyphenated word + 17 | nlhword | Non-latin hyphenated word + 18 | uri | URI + 19 | file | File or path name + 20 | float | Decimal notation + 21 | int | Signed integer + 22 | uint | Unsigned integer + 23 | entity | HTML Entity +</programlisting> + +</para> +</listitem> +</varlistentry> + +</variablelist> +</para> + +</sect2> + +<sect2 id="textsearch-ranking"> +<title>Ranking Search 
Results</title> + +<para> +Ranking attempts to measure how relevant documents are to a particular +query by inspecting the number of times each search word appears in the +document, and whether different search terms occur near each other. Full +text searching provides two predefined ranking functions which attempt to +produce a measure of how a document is relevant to the query. In spite +of that, the concept of relevancy is vague and very application-specific. +These functions try to take into account lexical, proximity, and structural +information. Different applications might require additional information +for ranking, e.g. document modification time. +</para> + +<para> +The lexical part of ranking reflects how often the query terms appear in +the document, how close the document query terms are, and in what part of +the document they occur. Note that ranking functions that use positional +information will only work on unstripped tsvectors because stripped +tsvectors lack positional information. +</para> + +<para> +The two ranking functions currently available are: + +<variablelist> + +<varlistentry> + +<indexterm zone="textsearch-ranking"> +<primary>rank</primary> +</indexterm> + +<term> +<synopsis> +rank(<optional> <replaceable class="PARAMETER">weights</replaceable> float4[]</optional>, <replaceable class="PARAMETER">vector</replaceable> TSVECTOR, <replaceable class="PARAMETER">query</replaceable> TSQUERY, <optional> <replaceable class="PARAMETER">normalization</replaceable> int4 </optional>) returns float4 +</synopsis> +</term> + +<listitem> +<para> +This ranking function offers the ability to weigh word instances more +heavily depending on how you have classified them. 
The weights specify +how heavily to weigh each category of word: +<programlisting> +{D-weight, C-weight, B-weight, A-weight} +</programlisting> +If no weights are provided, +then these defaults are used: +<programlisting> +{0.1, 0.2, 0.4, 1.0} +</programlisting> +Often weights are used to mark words from special areas of the document, +like the title or an initial abstract, and make them more or less important +than words in the document body. +</para> +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-ranking"> +<primary>rank_cd</primary> +</indexterm> + +<term> +<synopsis> +rank_cd(<optional> <replaceable class="PARAMETER">weights</replaceable> float4[], </optional> <replaceable class="PARAMETER">vector</replaceable> TSVECTOR, <replaceable class="PARAMETER">query</replaceable> TSQUERY, <optional> <replaceable class="PARAMETER">normalization</replaceable> int4 </optional>) returns float4 +</synopsis> +</term> + +<listitem> +<para> +This function computes the <emphasis>cover density</emphasis> ranking for +the given document vector and query, as described in Clarke, Cormack, and +Tudhope's "Relevance Ranking for One to Three Term Queries" in the +"Information Processing and Management", 1999. +</para> +</listitem> +</varlistentry> + +</variablelist> + +</para> + +<para> +Since a longer document has a greater chance of containing a query term +it is reasonable to take into account document size, i.e. a hundred-word +document with five instances of a search word is probably more relevant +than a thousand-word document with five instances. Both ranking functions +take an integer <replaceable>normalization</replaceable> option that +specifies whether a document's length should impact its rank. 
+The integer
+option controls several behaviors, which are specified using bit-wise fields and
+<literal>|</literal> (for example, <literal>2|4</literal>):
+
+<itemizedlist spacing="compact" mark="bullet">
+<listitem><para>
+0 (the default) ignores the document length
+</para></listitem>
+<listitem><para>
+1 divides the rank by 1 + the logarithm of the document length
+</para></listitem>
+<listitem><para>
+2 divides the rank by the length itself
+</para></listitem>
+<listitem><para>
+<!-- what is mean harmonic distance -->
+4 divides the rank by the mean harmonic distance between extents
+</para></listitem>
+<listitem><para>
+8 divides the rank by the number of unique words in document
+</para></listitem>
+<listitem><para>
+16 divides the rank by 1 + the logarithm of the number of unique words in document
+</para></listitem>
+</itemizedlist>
+
+</para>
+
+<para>
+It is important to note that ranking functions do not use any global
+information so it is impossible to produce a fair normalization to 1% or
+100%, as sometimes required. However, a simple technique like
+<literal>rank/(rank+1)</literal> can be applied. Of course, this is just
+a cosmetic change, i.e., the ordering of the search results will not change.
+</para> + +<para> +Several examples are shown below; note that the second example uses +normalized ranking: +<programlisting> +SELECT title, rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) AS rnk +FROM apod, to_tsquery('neutrino|(dark & matter)') query +WHERE query @@ textsearch +ORDER BY rnk DESC LIMIT 10; + title | rnk +-----------------------------------------------+---------- + Neutrinos in the Sun | 3.1 + The Sudbury Neutrino Detector | 2.4 + A MACHO View of Galactic Dark Matter | 2.01317 + Hot Gas and Dark Matter | 1.91171 + The Virgo Cluster: Hot Plasma and Dark Matter | 1.90953 + Rafting for Solar Neutrinos | 1.9 + NGC 4650A: Strange Galaxy and Dark Matter | 1.85774 + Hot Gas and Dark Matter | 1.6123 + Ice Fishing for Cosmic Neutrinos | 1.6 + Weak Lensing Distorts the Universe | 0.818218 + +SELECT title, rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query)/ +(rank_cd('{0.1, 0.2, 0.4, 1.0}',textsearch, query) + 1) AS rnk +FROM apod, to_tsquery('neutrino|(dark & matter)') query +WHERE query @@ textsearch +ORDER BY rnk DESC LIMIT 10; + title | rnk +-----------------------------------------------+------------------- + Neutrinos in the Sun | 0.756097569485493 + The Sudbury Neutrino Detector | 0.705882361190954 + A MACHO View of Galactic Dark Matter | 0.668123210574724 + Hot Gas and Dark Matter | 0.65655958650282 + The Virgo Cluster: Hot Plasma and Dark Matter | 0.656301290640973 + Rafting for Solar Neutrinos | 0.655172410958162 + NGC 4650A: Strange Galaxy and Dark Matter | 0.650072921219637 + Hot Gas and Dark Matter | 0.617195790024749 + Ice Fishing for Cosmic Neutrinos | 0.615384618911517 + Weak Lensing Distorts the Universe | 0.450010798361481 +</programlisting> +</para> + +<para> +The first argument in <function>rank_cd</function> (<literal>'{0.1, 0.2, +0.4, 1.0}'</literal>) is an optional parameter which specifies the +weights for labels <literal>D</literal>, <literal>C</literal>, +<literal>B</literal>, and <literal>A</literal> used in function 
+<function>setweight</function>. These default values show that lexemes +labeled as <literal>A</literal> are ten times more important than ones +that are labeled with <literal>D</literal>. +</para> + +<para> +Ranking can be expensive since it requires consulting the +<type>tsvector</type> of all documents, which can be I/O bound and +therefore slow. Unfortunately, it is almost impossible to avoid since full +text searching in a database should work without indexes <!-- TODO I don't +get this -->. Moreover an index can be lossy (a <acronym>GiST</acronym> +index, for example) so it must check documents to avoid false hits. +</para> + +<para> +Note that the ranking functions above are only examples. You can write +your own ranking functions and/or combine additional factors to fit your +specific needs. +</para> + +</sect2> + + +<sect2 id="textsearch-headline"> +<title>Highlighting Results</title> + +<indexterm zone="textsearch-headline"> +<primary>headline</primary> +</indexterm> + +<para> +To present search results it is ideal to show a part of each document and +how it is related to the query. Usually, search engines show fragments of +the document with marked search terms. <productname>PostgreSQL</> full +text searching provides the function <function>headline</function> that +implements such functionality. +</para> + +<variablelist> + +<varlistentry> + +<term> +<synopsis> +headline(<optional> <replaceable class="PARAMETER">conf_name</replaceable> text</optional>, <replaceable class="PARAMETER">document</replaceable> text, <replaceable class="PARAMETER">query</replaceable> TSQUERY, <optional> <replaceable class="PARAMETER">options</replaceable> text </optional>) returns text +</synopsis> +</term> + +<listitem> +<para> +The <function>headline()</function> function accepts a document along with +a query, and returns one or more ellipsis-separated excerpts from the +document in which terms from the query are highlighted. 
The configuration +used to parse the document can be specified by its +<replaceable>conf_name</replaceable>; if none is specified, the current +configuration is used. +</para> + + +</listitem> +</varlistentry> +</variablelist> + +<para> +If an <replaceable>options</replaceable> string is specified it should +consist of a comma-separated list of one or more 'option=value' pairs. +The available options are: + +<itemizedlist spacing="compact" mark="bullet"> +<listitem><para> +<literal>StartSel</>, <literal>StopSel</literal>: the strings with which +query words appearing in the document should be delimited to distinguish +them from other excerpted words. +</para></listitem> +<listitem><para> +<literal>MaxWords</>, <literal>MinWords</literal>: limit the shortest and +longest headlines to output +</para></listitem> +<listitem><para> +<literal>ShortWord</literal>: this prevents your headline from beginning +or ending with a word which has this many characters or less. The default +value of three eliminates the English articles. +</para></listitem> +<listitem><para> +<literal>HighlightAll</literal>: boolean flag; if +<literal>true</literal> the whole document will be highlighted +</para></listitem> +</itemizedlist> + +Any unspecified options receive these defaults: +<programlisting> +StartSel=<b>, StopSel=</b>, MaxWords=35, MinWords=15, ShortWord=3, HighlightAll=FALSE +</programlisting> +</para> + +<para> +For example: + +<programlisting> +SELECT headline('a b c', 'c'::tsquery); + headline +-------------- + a b <b>c</b> +SELECT headline('a b c', 'c'::tsquery, 'StartSel=<,StopSel=>'); + headline +---------- + a b <c> +</programlisting> +</para> + +<para> +<function>headline</> uses the original document, not +<type>tsvector</type>, so it can be slow and should be used with care. +A typical mistake is to call <function>headline()</function> for +<emphasis>every</emphasis> matching document when only ten documents are +shown. 
<acronym>SQL</acronym> subselects can help here; below is an +example: + +<programlisting> +SELECT id,headline(body,q), rank +FROM (SELECT id,body,q, rank_cd (ti,q) AS rank FROM apod, to_tsquery('stars') q + WHERE ti @@ q + ORDER BY rank DESC LIMIT 10) AS foo; +</programlisting> +</para> + +<para> +Note that the cascade dropping of the <function>parser</function> function +causes dropping of the <literal>headline</literal> used in the full text search +configuration <replaceable>conf_name</replaceable><!-- TODO I don't get this -->. +</para> + +</sect2> + +</sect1> + +<sect1 id="textsearch-dictionaries"> +<title>Dictionaries</title> + +<para> +Dictionaries are used to specify words that should not be considered in +a search and for the normalization of words to allow the user to use any +derived form of a word in a query. Also, normalization can reduce the size of +<type>tsvector</type>. Normalization does not always have linguistic +meaning and usually depends on application semantics. 
+</para> + +<para> +Some examples of normalization: + +<itemizedlist spacing="compact" mark="bullet"> + +<listitem> +<para> Linguistic - ispell dictionaries try to reduce input words to a +normalized form; stemmer dictionaries remove word endings +</para></listitem> +<listitem> +<para> Identical <acronym>URL</acronym> locations are identified and canonicalized: + +<itemizedlist spacing="compact" mark="bullet"> +<listitem><para> +http://www.pgsql.ru/db/mw/index.html +</para></listitem> +<listitem><para> +http://www.pgsql.ru/db/mw/ +</para></listitem> +<listitem><para> +http://www.pgsql.ru/db/../db/mw/index.html +</para></listitem> +</itemizedlist> + +</para></listitem> +<listitem><para> +Colour names are substituted by their hexadecimal values, e.g., +<literal>red, green, blue, magenta -> FF0000, 00FF00, 0000FF, FF00FF</literal> +</para></listitem> +<listitem><para> +Remove some numeric fractional digits to reduce the range of possible +numbers, so <emphasis>3.14</emphasis>159265359, +<emphasis>3.14</emphasis>15926, <emphasis>3.14</emphasis> will be the same +after normalization if only two digits are kept after the decimal point. +</para></listitem> +</itemizedlist> + +</para> + +<para> +A dictionary is a <emphasis>program</emphasis> which accepts lexemes as +input and returns: +<itemizedlist spacing="compact" mark="bullet"> +<listitem><para> +an array of lexemes if the input lexeme is known to the dictionary +</para></listitem> +<listitem><para> +a void array if the dictionary knows the lexeme, but it is a stop word +</para></listitem> +<listitem><para> +<literal>NULL</literal> if the dictionary does not recognize the input lexeme +</para></listitem> +</itemizedlist> + +<emphasis>WARNING:</emphasis> +Data files used by dictionaries should be in the <varname>server_encoding</varname> +so all encodings are consistent across databases. 
+</para>
+
+<para>
+Full text searching provides predefined dictionaries for many languages,
+and <acronym>SQL</acronym> commands to manipulate them. There are also
+several predefined template dictionaries that can be used to create new
+dictionaries by overriding their default parameters. Besides this, it is
+possible to develop custom dictionaries using an <acronym>API</acronym>;
+see the dictionary for integers (<xref
+linkend="textsearch-rule-dictionary-example">) as an example.
+</para>
+
+<para>
+The <literal>ALTER TEXT SEARCH CONFIGURATION public.pg ADD
+MAPPING</literal> command binds specific types of lexemes and a set of
+dictionaries to process them. (Mappings can also be specified as part of
+configuration creation.) Each lexeme is processed by a stack of dictionaries
+until some dictionary identifies it as a known word or it turns out to be
+a stop word. If no dictionary recognizes a lexeme, it will be discarded
+and not indexed. A general rule for configuring a stack of dictionaries
+is to place first the most narrow, most specific dictionary, then the more
+general dictionaries and finish it with a very general dictionary, like
+the <application>snowball</> stemmer or <literal>simple</>, which
+recognizes everything. For example, for an astronomy-specific search
+(<literal>astro_en</literal> configuration) one could bind
+<type>lword</type> (latin word) with a synonym dictionary of astronomical
+terms, a general English dictionary and a <application>snowball</> English
+stemmer:
+<programlisting>
+ALTER TEXT SEARCH CONFIGURATION astro_en ADD MAPPING FOR lword WITH astrosyn, en_ispell, en_stem;
+</programlisting>
+</para>
+
+<para>
+Function <function>lexize</function> can be used to test dictionaries,
+for example:
+<programlisting>
+SELECT lexize('en_stem', 'stars');
+ lexize
+--------
+ {star}
+(1 row)
+</programlisting>
+Also, the <function>ts_debug</function> function (<xref linkend="textsearch-debugging">)
+can be used for this.
+</para> + +<sect2 id="textsearch-stopwords"> +<title>Stop Words</title> +<para> +Stop words are words which are very common, appear in almost +every document, and have no discrimination value. Therefore, they can be ignored +in the context of full text searching. For example, every English text contains +words like <literal>a</literal> although it is useless to store them in an index. +However, stop words do affect the positions in <type>tsvector</type>, +which in turn, do affect ranking: +<programlisting> +SELECT to_tsvector('english','in the list of stop words'); + to_tsvector +---------------------------- + 'list':3 'stop':5 'word':6 +</programlisting> +The gaps between positions 1-3 and 3-5 are because of stop words, so ranks +calculated for documents with and without stop words are quite different: +<programlisting> +SELECT rank_cd ('{1,1,1,1}', to_tsvector('english','in the list of stop words'), to_tsquery('list & stop')); + rank_cd +--------- + 0.5 + +SELECT rank_cd ('{1,1,1,1}', to_tsvector('english','list stop words'), to_tsquery('list & stop')); + rank_cd +--------- + 1 +</programlisting> + +</para> + +<para> +It is up to the specific dictionary how it treats stop words. For example, +<literal>ispell</literal> dictionaries first normalize words and then +look at the list of stop words, while <literal>stemmers</literal> +first check the list of stop words. The reason for the different +behaviour is an attempt to decrease possible noise. +</para> + +<para> +Here is an example of a dictionary that returns the input word as lowercase +or <literal>NULL</literal> if it is a stop word; it also specifies the location +of the file of stop words. 
It uses the <literal>simple</> dictionary as +a template: +<programlisting> +CREATE TEXT SEARCH DICTIONARY public.simple_dict + TEMPLATE pg_catalog.simple + OPTION 'english.stop'; +</programlisting> +Relative paths in <literal>OPTION</literal> resolve relative to +<filename>share/</><!-- TODO and "share/" is relative to what? such +references occur elsewhere in this section -->. Now we can test our +dictionary: +<programlisting> +SELECT lexize('public.simple_dict','YeS'); + lexize +-------- + {yes} +SELECT lexize('public.simple_dict','The'); + lexize +-------- + {} +</programlisting> +</para> + +</sect2> + + +<sect2 id="textsearch-synonym-dictionary"> +<title>Synonym Dictionary</title> + +<para> +This dictionary template is used to create dictionaries which replace a +word with a synonym. Phrases are not supported (use the thesaurus +dictionary (<xref linkend="textsearch-thesaurus">) if you need them). Synonym +dictionary can be used to overcome linguistic problems, for example, to +prevent an English stemmer dictionary from reducing the word 'Paris' to +'pari'. 
In that case, it is enough to have a <literal>Paris +paris</literal> line in the synonym dictionary and put it before the +<literal>en_stem</> dictionary: +<programlisting> +SELECT * FROM ts_debug('english','Paris'); + Alias | Description | Token | Dicts list | Lexized token +-------+-------------+-------+----------------------+---------------------------- + lword | Latin word | Paris | {pg_catalog.en_stem} | pg_catalog.en_stem: {pari} +(1 row) +ALTER TEXT SEARCH CONFIGURATION ADD MAPPING ON english FOR lword WITH synonym, en_stem; +ALTER TEXT SEARCH MAPPING +Time: 340.867 ms +SELECT * FROM ts_debug('english','Paris'); + Alias | Description | Token | Dicts list | Lexized token +-------+-------------+-------+-----------------------------------------+----------------------------- + lword | Latin word | Paris | {pg_catalog.synonym,pg_catalog.en_stem} | pg_catalog.synonym: {paris} +(1 row) +</programlisting> +</para> + +</sect2> + +<sect2 id="textsearch-thesaurus"> +<title>Thesaurus Dictionary</title> + +<para> +A thesaurus dictionary (sometimes abbreviated as <acronym>TZ</acronym>) is +a collection of words which includes information about the relationships +of words and phrases, i.e., broader terms (<acronym>BT</acronym>), narrower +terms (<acronym>NT</acronym>), preferred terms, non-preferred terms, related +terms, etc. +</para> +<para> +Basically a thesaurus dictionary replaces all non-preferred terms by one +preferred term and, optionally, preserves them for indexing. Thesauruses +are used during indexing so any change in the thesaurus <emphasis>requires</emphasis> +reindexing. The current implementation of the thesaurus +dictionary is an extension of the synonym dictionary with added +<emphasis>phrase</emphasis> support. A thesaurus is a plain file of the +following format: +<programlisting> +# this is a comment +sample word(s) : indexed word(s) +............................... +</programlisting> +where the colon (<symbol>:</symbol>) symbol acts as a delimiter. 
+</para>
+
+<para>
+A thesaurus dictionary uses a <emphasis>subdictionary</emphasis> (which
+should be defined in the full text configuration) to normalize the
+thesaurus text. It is only possible to define one dictionary. Notice that
+the <emphasis>subdictionary</emphasis> will produce an error if it can
+not recognize a word. In that case, you should remove the definition of
+the word or teach the <emphasis>subdictionary</emphasis> about it.
+Use an asterisk (<symbol>*</symbol>) at the beginning of an indexed word to
+skip the subdictionary. It is still required that sample words are known.
+</para>
+
+<para>
+The thesaurus dictionary looks for the longest match.
+</para>
+
+<para>
+Stop words recognized by the subdictionary are replaced by a 'stop word
+placeholder' to record their position. To break possible ties the thesaurus
+uses the last definition. To illustrate this, consider a thesaurus (with
+a <parameter>simple</parameter> subdictionary) with pattern
+<literal>'swsw'</>, where <literal>'s'</> designates any stop word and
+<literal>'w'</>, any known word:
+<programlisting>
+a one the two : swsw
+the one a two : swsw2
+</programlisting>
+Words <literal>'a'</> and <literal>'the'</> are stop words defined in the
+configuration of a subdictionary. The thesaurus considers <literal>'the
+one the two'</literal> and <literal>'that one then two'</literal> as equal
+and will use definition 'swsw2'.
+</para>
+
+<para>
+As any normal dictionary, it can be assigned to the specific lexeme types.
+Since a thesaurus dictionary has the capability to recognize phrases it
+must remember its state and interact with the parser. A thesaurus dictionary
+uses these assignments to check if it should handle the next word or stop
+accumulation. The thesaurus dictionary compiler must be configured
+carefully.
For example, if the thesaurus dictionary is assigned to handle
+only the <token>lword</token> lexeme, then a thesaurus dictionary
+definition like ' one 7' will not work since lexeme type
+<token>digit</token> is not assigned to the thesaurus dictionary.
+</para>
+
+</sect2>
+
+<sect2 id="textsearch-thesaurus-config">
+<title>Thesaurus Configuration</title>
+
+<para>
+To define a new thesaurus dictionary one can use the thesaurus template.
+For example:
+
+<programlisting>
+CREATE TEXT SEARCH DICTIONARY thesaurus_simple
+ TEMPLATE thesaurus_template
+ OPTION 'DictFile="dicts_data/thesaurus.txt.sample", Dictionary="en_stem"';
+</programlisting>
+Here:
+<itemizedlist spacing="compact" mark="bullet">
+<listitem><para>
+<literal>thesaurus_simple</literal> is the thesaurus dictionary name
+</para></listitem>
+<listitem><para>
+<literal>DictFile="/path/to/thesaurus_simple.txt"</literal> is the location of the thesaurus file
+</para></listitem>
+<listitem><para>
+<literal>Dictionary="en_stem"</literal> defines the dictionary (snowball
+English stemmer) to use for thesaurus normalization. Notice that the
+<literal>en_stem</> dictionary has its own configuration (for example,
+stop words).
+</para></listitem>
+</itemizedlist>
+
+Now it is possible to bind the thesaurus dictionary <literal>thesaurus_simple</literal>
+and selected <literal>tokens</literal>, for example:
+
+<programlisting>
+ALTER TEXT SEARCH CONFIGURATION russian ADD MAPPING FOR lword, lhword, lpart_hword WITH thesaurus_simple;
+</programlisting>
+</para>
+
+</sect2>
+
+<sect2 id="textsearch-thesaurus-examples">
+<title>Thesaurus Example</title>
+
+<para>
+Consider a simple astronomical thesaurus <literal>thesaurus_astro</literal>,
+which contains some astronomical word combinations:
+<programlisting>
+supernovae stars : sn
+crab nebulae : crab
+</programlisting>
+Below we create a dictionary and bind some token types with
+an astronomical thesaurus and english stemmer:
+<programlisting>
+CREATE TEXT SEARCH DICTIONARY thesaurus_astro
+ TEMPLATE thesaurus_template
+ OPTION 'DictFile="dicts_data/thesaurus_astro.txt", Dictionary="en_stem"';
+ALTER TEXT SEARCH CONFIGURATION russian ADD MAPPING FOR lword, lhword, lpart_hword
+ WITH thesaurus_astro, en_stem;
+</programlisting>
+Now we can see how it works. Note that <function>lexize</function> cannot
+be used for testing the thesaurus (see description of
+<function>lexize</function>), but we can use
+<function>plainto_tsquery</function> and <function>to_tsvector</function>
+which accept <literal>text</literal> arguments, not lexemes:
+
+<programlisting>
+SELECT plainto_tsquery('supernova star');
+ plainto_tsquery
+-----------------
+ 'sn'
+SELECT to_tsvector('supernova star');
+ to_tsvector
+-------------
+ 'sn':1
+</programlisting>
+In principle, one can use <function>to_tsquery</function> if you quote
+the argument:
+<programlisting>
+SELECT to_tsquery('''supernova star''');
+ to_tsquery
+------------
+ 'sn'
+</programlisting>
+Notice that <literal>supernova star</literal> matches <literal>supernovae
+stars</literal> in <literal>thesaurus_astro</literal> because we specified the
+<literal>en_stem</literal> stemmer in the thesaurus definition.
+</para> +<para> +To keep an original phrase in full text indexing just add it to the right part +of the definition: +<programlisting> +supernovae stars : sn supernovae stars + +SELECT plainto_tsquery('supernova star'); + plainto_tsquery +----------------------------- + 'sn' & 'supernova' & 'star' +</programlisting> +</para> + +</sect2> + +<sect2 id="textsearch-ispell-dictionary"> +<title>Ispell Dictionary</title> + +<para> +The <application>Ispell</> template dictionary for full text allows the +creation of morphological dictionaries based on <ulink +url="http://ficus-www.cs.ucla.edu/geoff/ispell.html">Ispell</ulink>, which +supports a large number of languages. This dictionary tries to change an +input word to its normalized form. Also, more modern spelling dictionaries +are supported - <ulink +url="http://en.wikipedia.org/wiki/MySpell">MySpell</ulink> (OO < 2.0.1) +and <ulink url="http://sourceforge.net/projects/hunspell">Hunspell</ulink> +(OO >= 2.0.2). A large list of dictionaries is available on the <ulink +url="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice +Wiki</ulink>. +</para> + +<para> +The <application>Ispell</> dictionary allows searches without bothering +about different linguistic forms of a word. For example, a search on +<literal>bank</literal> would return hits of all declensions and +conjugations of the search term <literal>bank</literal>, e.g. +<literal>banking</>, <literal>banked</>, <literal>banks</>, +<literal>banks'</>, and <literal>bank's</>. +<programlisting> +SELECT lexize('en_ispell','banking'); + lexize +-------- + {bank} +SELECT lexize('en_ispell','bank''s'); + lexize +-------- + {bank} +SELECT lexize('en_ispell','banked'); + lexize +-------- + {bank} +</programlisting> + +</para> + +<para> +To create an ispell dictionary one should use the built-in +<literal>ispell_template</literal> dictionary and specify several +parameters. 
+</para> +<programlisting> +CREATE TEXT SEARCH DICTIONARY en_ispell + TEMPLATE ispell_template + OPTION 'DictFile="/usr/local/share/dicts/ispell/english.dict", + AffFile="/usr/local/share/dicts/ispell/english.aff", + StopFile="/usr/local/share/dicts/ispell/english.stop"'; +</programlisting> +<para> +Here, <literal>DictFile</>, <literal>AffFile</>, <literal>StopFile</> +specify the location of the dictionary and stop words files. +</para> + +<para> +Relative paths in <literal>OPTION</literal> resolve relative to +<filename>share/dicts_data</>: +<programlisting> +CREATE TEXT SEARCH DICTIONARY en_ispell + TEMPLATE ispell_template + OPTION 'DictFile="ispell/english.dict", + AffFile="ispell/english.aff", + StopFile="english.stop"'; +</programlisting> +</para> + +<para> +Ispell dictionaries usually recognize a restricted set of words so it +should be used in conjunction with another broader dictionary; for +example, a stemming dictionary, which recognizes everything. + +</para> + +<para> +Ispell dictionaries support splitting compound words based on an +ispell dictionary. This is a nice feature and full text searching +in <productname>PostgreSQL</productname> supports it. +Notice that the affix file should specify a special flag using the +<literal>compoundwords controlled</literal> statement that marks dictionary +words that can participate in compound formation: +<programlisting> +compoundwords controlled z +</programlisting> +Several examples for the Norwegian language: +<programlisting> +SELECT lexize('norwegian_ispell','overbuljongterningpakkmesterassistent'); + {over,buljong,terning,pakk,mester,assistent} +SELECT lexize('norwegian_ispell','sjokoladefabrikk'); + {sjokoladefabrikk,sjokolade,fabrikk} +</programlisting> +</para> + +<note> +<para> +<application>MySpell</> does not support compound words. +<application>Hunspell</> has sophisticated support for compound words. 
At +present, full text searching implements only the basic compound word +operations of Hunspell. +</para> +</note> + +</sect2> + +<sect2 id="textsearch-stemming-dictionary"> +<title><application>Snowball</> Stemming Dictionary</title> + +<para> +The <application>Snowball</> template dictionary is based on the project +of Martin Porter, an inventor of the popular Porter's stemming algorithm +for the English language and now supported in many languages (see the <ulink +url="http://snowball.tartarus.org">Snowball site</ulink> for more +information). Full text searching contains a large number of stemmers for +many languages. The only option that is accepted by a snowball stemmer is the +location of a file with stop words. It can be defined using the +<literal>ALTER TEXT SEARCH DICTIONARY</literal> command. +</para> +<para> +<programlisting> +ALTER TEXT SEARCH DICTIONARY en_stem + SET OPTION 'StopFile=english-utf8.stop, Language=english'; +</programlisting> +</para> + +<para> +Relative paths in <literal>OPTION</literal> resolve relative +<filename>share/dicts/data</>: +<programlisting> +ALTER TEXT SEARCH DICTIONARY en_stem OPTION 'english.stop'; +</programlisting> +</para> + +<para> +The <application>Snowball</> dictionary recognizes everything, so it is best +to place it at the end of the dictionary stack. It it useless to have it +before any other dictionary because a lexeme will not pass through its stemmer. 
+</para> + +</sect2> + +<sect2 id="textsearch-dictionary-testing"> +<title>Dictionary Testing</title> + +<para> +The <function>lexize</> function facilitates dictionary testing: + +<variablelist> +<varlistentry> + +<indexterm zone="textsearch-dictionaries"> +<primary>lexize</primary> +</indexterm> + +<term> +<synopsis> +lexize(<optional> <replaceable class="PARAMETER">dict_name</replaceable> text</optional>, <replaceable class="PARAMETER">lexeme</replaceable> text) returns text[] +</synopsis> +</term> + +<listitem> +<para> +Returns an array of lexemes if the input <replaceable>lexeme</replaceable> +is known to the dictionary <replaceable>dictname</replaceable>, or a void +array if the lexeme is known to the dictionary but it is a stop word, or +<literal>NULL</literal> if it is an unknown word. +</para> +<programlisting> +SELECT lexize('en_stem', 'stars'); + lexize +-------- + {star} +SELECT lexize('en_stem', 'a'); + lexize +-------- + {} +</programlisting> +</listitem> +</varlistentry> + +</variablelist> +</para> + +<note> +<para> +The <function>lexize</function> function expects a +<replaceable>lexeme</replaceable>, not text. Below is an example: +<programlisting> +SELECT lexize('thesaurus_astro','supernovae stars') is null; + ?column? +---------- + t +</programlisting> +Thesaurus dictionary <literal>thesaurus_astro</literal> does know +<literal>supernovae stars</literal>, but lexize fails since it does not +parse the input text and considers it as a single lexeme. 
Use +<function>plainto_tsquery</> and <function>to_tsvector</> to test thesaurus +dictionaries: +<programlisting> +SELECT plainto_tsquery('supernovae stars'); + plainto_tsquery +----------------- + 'sn' +</programlisting> +</para> +</note> + +</sect2> + +<sect2 id="textsearch-tables-configuration"> +<title>Configuration Example</title> + +<para> +A full text configuration specifies all options necessary to transform a +document into a <type>tsvector</type>: the parser breaks text into tokens, +and the dictionaries transform each token into a lexeme. Every call to +<function>to_tsvector()</function> and <function>to_tsquery()</function> +needs a configuration to perform its processing. To facilitate management +of full text searching objects, a set of <acronym>SQL</acronym> commands +is available, and there are several psql commands which display information +about full text searching objects (<xref linkend="textsearch-psql">). +</para> + +<para> +The <acronym>GUC</acronym> variable <varname>default_text_search_config</varname> +(optionally schema-qualified) defines the name of the <emphasis>current +active</emphasis> configuration. It can be defined in +<literal>postgresql.conf</literal> or using the <command>SET</> command. +</para> + +<para> +Predefined full text searching objects are available in the +<literal>pg_catalog</literal> schema. If you need a custom configuration +you can create a new full text searching object and modify it using SQL +commands. + +New full text searching objects are created in the current schema by default +(usually the <literal>public</literal> schema), but a schema-qualified +name can be used to create objects in the specified schema. It is owned +by the current user and can be changed using the <command>ALTER TEXT +SEARCH OWNER</> command. +</para> + +<para> +As an example, we will create a configuration +<literal>pg</literal> which starts as a duplicate of the +<literal>english</> configuration. 
To be safe, we do this in a transaction: +<programlisting> +BEGIN; + +CREATE TEXT SEARCH CONFIGURATION public.pg LIKE english WITH MAP; +</programlisting> +</para> + +<para> +We will use a PostgreSQL-specific <literal>synonym</literal> dictionary +and store it in the <literal>share/dicts_data</literal> directory. The +dictionary looks like: +<Programlisting> +postgres pg +pgsql pg +postgresql pg +</programlisting> + +<programlisting> +CREATE TEXT SEARCH DICTIONARY pg_dict + TEMPLATE synonym + OPTION 'pg_dict.txt'; +</programlisting> + +</para> + +<para> +Then register the <productname>ispell</> dictionary <literal>en_ispell</literal> using +the <literal>ispell_template</literal> template: + +<programlisting> +CREATE TEXT SEARCH DICTIONARY en_ispell + TEMPLATE ispell_template + OPTION 'DictFile="english-utf8.dict", + AffFile="english-utf8.aff", + StopFile="english-utf8.stop"'; +</programlisting> +</para> + +<para> +Use the same stop word list for the <application>Snowball</> stemmer <literal>en_stem</literal>, +which is available by default: + +<programlisting> +ALTER TEXT SEARCH DICTIONARY en_stem SET OPTION 'english-utf8.stop'; +</programlisting> +</para> + +<para> +Modify mappings for Latin words for configuration <literal>'pg'</>: + +<programlisting> +ALTER TEXT SEARCH CONFIGURATION pg ALTER MAPPING FOR lword, lhword, lpart_hword + WITH pg_dict, en_ispell, en_stem; +</programlisting> +</para> + +<para> +We do not index or search some tokens: + +<programlisting> +ALTER TEXT SEARCH CONFIGURATION pg DROP MAPPING FOR email, url, sfloat, uri, float; +</programlisting> +</para> + +<para> +Now, we can test our configuration: +<programlisting> +SELECT * FROM ts_debug('public.pg', ' +PostgreSQL, the highly scalable, SQL compliant, open source object-relational +database management system, is now undergoing beta testing of the next +version of our software: PostgreSQL 8.2. 
+'); + +COMMIT; +</programlisting> +</para> + +<para> +With the dictionaries and mappings set up, suppose we have a table +<literal>pgweb</literal> which contains 11239 documents from the +<productname>PostgreSQL</productname> web site. Only relevant columns +are shown: +<programlisting> +=> \d pgweb + Table "public.pgweb" + Column | Type | Modifiers +-----------+-------------------+----------- + tid | integer | not null + path | character varying | not null + body | character varying | + title | character varying | + dlm | integer | +</programlisting> +</para> + +<para> +The next step is to set the session to use the new configuration, which was +created in the <literal>public</> schema: +<programlisting> +=> \dF +postgres=# \dF public.* +List of fulltext configurations + Schema | Name | Description +--------+------+------------- + public | pg | + +SET default_text_search_config = 'public.pg'; +SET + +SHOW default_text_search_config; + default_text_search_config +---------------------------- + public.pg +</programlisting> +</para> + +</sect2> + +<sect2 id="textsearch-tables-multiconfig"> +<title>Managing Multiple Configurations</title> + +<para> +If you are using the same text search configuration for the entire cluster +just set the value in <filename>postgresql.conf</>. If using a single +text search configuration for an entire database, use <command>ALTER +DATABASE ... SET</>. +</para> + +<para> +However, if you need to use several text search configurations in the same +database you must be careful to reference the proper text search +configuration. This can be done by either setting +<varname>default_text_search_conf</> in each session or supplying the +configuration name in every function call, e.g. to_tsquery('pg', +'friend'), to_tsvector('pg', col). 
If you are using an expression index, +you must also be sure to use the proper text search configuration every +time an <command>INSERT</> or <command>UPDATE</> is executed because these +will modify the index, or you can embed the configuration name into the +expression index, e.g.: +<programlisting> +CREATE INDEX pgweb_idx ON pgweb USING gin(to_tsvector('pg', textcat(title, body))); +</programlisting> +And if you do that, make sure you specify the configuration name in the +<literal>WHERE</> clause as well so the expression index will be used. +</para> + +</sect2> + +</sect1> + +<sect1 id="textsearch-indexes"> +<title>GiST and GIN Index Types</title> + + <indexterm zone="textsearch-indexes"> + <primary>index</primary> + <secondary>full text</secondary> + </indexterm> + + +<para> +There are two kinds of indexes which can be used to speed up full text +operators (<xref linkend="textsearch-searches">). +Note that indexes are not mandatory for full text searching. + +<variablelist> + +<varlistentry> + + +<indexterm zone="textsearch-indexes"> +<primary>index</primary> +<secondary>GIST</secondary> +</indexterm> + +<term> +<synopsis> +CREATE INDEX <replaceable>name</replaceable> ON <replaceable>table</replaceable> USING gist(<replaceable>column</replaceable>); +</synopsis> +</term> + +<listitem> +<para> +Creates a GiST (Generalized Search Tree)-based index. +</para> + +</listitem> +</varlistentry> + +<varlistentry> + +<indexterm zone="textsearch-indexes"> +<primary>index</primary> +<secondary>GIN</secondary> +</indexterm> + +<term> +<synopsis> +CREATE INDEX <replaceable>name</replaceable> ON <replaceable>table</replaceable> USING gin(<replaceable>column</replaceable>); +</synopsis> +</term> + +<listitem> +<para> +Creates a GIN (Generalized Inverted Index)-based index. +<replaceable class="PARAMETER">column</replaceable> is a +<literal>TSVECTOR</literal>, <literal>TEXT</literal>, +<literal>VARCHAR</literal>, or <literal>CHAR</literal>-type column. 
+</para> + +</listitem> +</varlistentry> + +</variablelist> +</para> + +<para> +A GiST index is <literal>lossy</literal>, meaning it is necessary +to consult the <literal>heap</literal> to check for false results. +<productname>PostgreSQL</productname> does this automatically; see +<literal>Filter:</literal> in the example below: +<programlisting> +EXPLAIN SELECT * FROM apod WHERE textsearch @@ to_tsquery('supernovae'); + QUERY PLAN +------------------------------------------------------------------------- + Index Scan using textsearch_gidx on apod (cost=0.00..12.29 rows=2 width=1469) + Index Cond: (textsearch @@ '''supernova'''::tsquery) + Filter: (textsearch @@ '''supernova'''::tsquery) +</programlisting> +GiST index lossiness happens because each document is represented by a +fixed-length signature. The signature is generated by hashing (crc32) each +word into a random bit in an n-bit string and all words combine to produce +an n-bit document signature. Because of hashing there is a chance that +some words hash to the same position and could result in a false hit. +Signatures calculated for each document in a collection are stored in an +<literal>RD-tree</literal> (Russian Doll tree), invented by Hellerstein, +which is an adaptation of <literal>R-tree</literal> for sets. In our case +the transitive containment relation <!-- huh --> is realized by +superimposed coding (Knuth, 1973) of signatures, i.e., a parent is the +result of 'OR'-ing the bit-strings of all children. This is a second +factor of lossiness. It is clear that parents tend to be full of +<literal>'1'</>s (degenerates) and become quite useless because of the +limited selectivity. Searching is performed as a bit comparison of a +signature representing the query and an <literal>RD-tree</literal> entry. +If all <literal>'1'</>s of both signatures are in the same position we +say that this branch probably matches the query, but if there is even one +discrepancy we can definitely reject this branch. 
+</para> + +<para> +Lossiness causes serious performance degradation since random access of +<literal>heap</literal> records is slow and limits the usefulness of GiST +indexes. The likelihood of false hits depends on several factors, like +the number of unique words, so using dictionaries to reduce this number +is recommended. +</para> + +<para> +Actually, this is not the whole story. GiST indexes have an optimization +for storing small tsvectors (< <literal>TOAST_INDEX_TARGET</literal> +bytes, 512 bytes). On leaf pages small tsvectors are stored unchanged, +while longer ones are represented by their signatures, which introduces +some lossiness. Unfortunately, the existing index API does not allow for +a return value to say whether it found an exact value (tsvector) or whether +the result needs to be checked. This is why the GiST index is +currently marked as lossy. We hope to improve this in the future. +</para> + +<para> +GIN indexes are not lossy but their performance depends logarithmically on +the number of unique words. +</para> + +<para> +There is one side-effect of the non-lossiness of a GIN index when using +query labels/weights, like <literal>'supernovae:a'</literal>. A GIN index +has all the information necessary to determine a match, so the heap is +not accessed. However, if the query has label information it must access +the heap. Therefore, a special full text search operator <literal>@@@</literal> +was created which forces the use of the heap to get information about +labels. GiST indexes are lossy so it always reads the heap and there is +no need for a special operator. 
In the example below, +<literal>fulltext_idx</literal> is a GIN index:<!-- why isn't this +automatic --> +<programlisting> +EXPLAIN SELECT * FROM apod WHERE textsearch @@@ to_tsquery('supernovae:a'); + QUERY PLAN +------------------------------------------------------------------------ + Index Scan using textsearch_idx on apod (cost=0.00..12.30 rows=2 width=1469) + Index Cond: (textsearch @@@ '''supernova'':A'::tsquery) + Filter: (textsearch @@@ '''supernova'':A'::tsquery) +</programlisting> + +</para> + +<para> +In choosing which index type to use, GiST or GIN, consider these differences: +<itemizedlist spacing="compact" mark="bullet"> +<listitem><para> +GiN index lookups are three times faster than GiST +</para></listitem> +<listitem><para> +GiN indexes take three times longer to build than GiST +</para></listitem> +<listitem><para> +GiN is about ten times slower to update than GiST +</para></listitem> +<listitem><para> +GiN indexes are two-to-three times larger than GiST +</para></listitem> +</itemizedlist> +</para> + +<para> +In summary, <acronym>GIN</acronym> indexes are best for static data because +the indexes are faster for lookups. For dynamic data, GiST indexes are +faster to update. Specifically, <acronym>GiST</acronym> indexes are very +good for dynamic data and fast if the number of unique words (lexemes) is +under 100,000, while <acronym>GIN</acronym> handles +100,000 lexemes better +but is slower to update. +</para> + +<para> +Partitioning of big collections and the proper use of GiST and GIN indexes +allows the implementation of very fast searches with online update. +Partitioning can be done at the database level using table inheritance +and <varname>constraint_exclusion</>, or distributing documents over +servers and collecting search results using the <filename>contrib/dblink</> +extension module. The latter is possible because ranking functions use +only local information. 
+</para> + +</sect1> + +<sect1 id="textsearch-limitations"> +<title>Limitations</title> + +<para> +The current limitations of Full Text Searching are: +<itemizedlist spacing="compact" mark="bullet"> +<listitem><para>The length of each lexeme must be less than 2K bytes</para></listitem> +<listitem><para>The length of a <type>tsvector</type> (lexemes + positions) must be less than 1 megabyte</para></listitem> +<listitem><para>The number of lexemes must be less than 2<superscript>64</superscript></para></listitem> +<listitem><para>Positional information must be non-negative and less than 16,383</para></listitem> +<listitem><para>No more than 256 positions per lexeme</para></listitem> +<listitem><para>The number of nodes (lexemes + operations) in tsquery must be less than 32,768</para></listitem> +</itemizedlist> +</para> + +<para> +For comparison, the <productname>PostgreSQL</productname> 8.1 documentation +consists of 10,441 unique words, a total of 335,420 words, and the most frequent word +'postgresql' is mentioned 6,127 times in 655 documents. +</para> + +<para> +Another example - the <productname>PostgreSQL</productname> mailing list archives +consists of 910,989 unique words with 57,491,343 lexemes in 461,020 messages. +</para> + +</sect1> + +<sect1 id="textsearch-psql"> +<title><application>psql</> Support</title> + +<para> +Information about full text searching objects can be obtained +in <literal>psql</literal> using a set of commands: +<synopsis> +\dF{,d,p}<optional>+</optional> <optional>PATTERN</optional> +</synopsis> +An optional <literal>+</literal> produces more details. +</para> +<para> +The optional parameter <literal>PATTERN</literal> should be the name of +a full text searching object, optionally schema-qualified. If +<literal>PATTERN</literal> is not specified then information about all +visible objects will be displayed. 
<literal>PATTERN</literal> can be a +regular expression and can apply <emphasis>separately</emphasis> to schema +names and object names. The following examples illustrate this: +<programlisting> +=> \dF *fulltext* + List of fulltext configurations + Schema | Name | Description +--------+--------------+------------- + public | fulltext_cfg | +</programlisting> + +<programlisting> +=> \dF *.fulltext* + List of fulltext configurations + Schema | Name | Description +----------+---------------------------- + fulltext | fulltext_cfg | + public | fulltext_cfg | +</programlisting> +</para> + +<variablelist> + + <varlistentry> +<term>\dF[+] [PATTERN]</term> + + <listitem> + <para> + List full text searching configurations (add "+" for more detail) + </para> + <para> + By default (without <literal>PATTERN</literal>), information about + all <emphasis>visible</emphasis> full text configurations will be + displayed. + </para> +<para> +<programlisting> +=> \dF russian + List of fulltext configurations + Schema | Name | Description +------------+---------+----------------------------------- + pg_catalog | russian | default configuration for Russian + +=> \dF+ russian +Configuration "pg_catalog.russian" +Parser name: "pg_catalog.default" +Locale: 'ru_RU.UTF-8' (default) + Token | Dictionaries +--------------+------------------------- + email | pg_catalog.simple + file | pg_catalog.simple + float | pg_catalog.simple + host | pg_catalog.simple + hword | pg_catalog.ru_stem_utf8 + int | pg_catalog.simple + lhword | public.tz_simple + lpart_hword | public.tz_simple + lword | public.tz_simple + nlhword | pg_catalog.ru_stem_utf8 + nlpart_hword | pg_catalog.ru_stem_utf8 + nlword | pg_catalog.ru_stem_utf8 + part_hword | pg_catalog.simple + sfloat | pg_catalog.simple + uint | pg_catalog.simple + uri | pg_catalog.simple + url | pg_catalog.simple + version | pg_catalog.simple + word | pg_catalog.ru_stem_utf8 +</programlisting> +</para> + </listitem> + </varlistentry> + + <varlistentry> 
+<term>\dFd[+] [PATTERN]</term> + <listitem> + <para> + List full text dictionaries (add "+" for more detail). + </para> + <para> + By default (without <literal>PATTERN</literal>), information about + all <emphasis>visible</emphasis> dictionaries will be displayed. + </para> +<para> +<programlisting> +=> \dFd + List of fulltext dictionaries + Schema | Name | Description +------------+------------+----------------------------------------------------------- + pg_catalog | danish | Snowball stemmer for danish language + pg_catalog | dutch | Snowball stemmer for dutch language + pg_catalog | english | Snowball stemmer for english language + pg_catalog | finnish | Snowball stemmer for finnish language + pg_catalog | french | Snowball stemmer for french language + pg_catalog | german | Snowball stemmer for german language + pg_catalog | hungarian | Snowball stemmer for hungarian language + pg_catalog | italian | Snowball stemmer for italian language + pg_catalog | norwegian | Snowball stemmer for norwegian language + pg_catalog | portuguese | Snowball stemmer for portuguese language + pg_catalog | romanian | Snowball stemmer for romanian language + pg_catalog | russian | Snowball stemmer for russian language + pg_catalog | simple | simple dictionary: just lower case and check for stopword + pg_catalog | spanish | Snowball stemmer for spanish language + pg_catalog | swedish | Snowball stemmer for swedish language + pg_catalog | turkish | Snowball stemmer for turkish language +</programlisting> +</para> + </listitem> + </varlistentry> + + <varlistentry> + +<term>\dFp[+] [PATTERN]</term> + <listitem> + <para> + List full text parsers (add "+" for more detail) + </para> + <para> + By default (without <literal>PATTERN</literal>), information about + all <emphasis>visible</emphasis> full text parsers will be displayed. 
+ </para> +<para> +<programlisting> +=> \dFp + List of fulltext parsers + Schema | Name | Description +------------+---------+--------------------- + pg_catalog | default | default word parser +(1 row) +=> \dFp+ + Fulltext parser "pg_catalog.default" + Method | Function | Description +-------------------+---------------------------+------------- + Start parse | pg_catalog.prsd_start | + Get next token | pg_catalog.prsd_nexttoken | + End parse | pg_catalog.prsd_end | + Get headline | pg_catalog.prsd_headline | + Get lexeme's type | pg_catalog.prsd_lextype | + + Token's types for parser "pg_catalog.default" + Token name | Description +--------------+----------------------------------- + blank | Space symbols + email | Email + entity | HTML Entity + file | File or path name + float | Decimal notation + host | Host + hword | Hyphenated word + int | Signed integer + lhword | Latin hyphenated word + lpart_hword | Latin part of hyphenated word + lword | Latin word + nlhword | Non-latin hyphenated word + nlpart_hword | Non-latin part of hyphenated word + nlword | Non-latin word + part_hword | Part of hyphenated word + protocol | Protocol head + sfloat | Scientific notation + tag | HTML Tag + uint | Unsigned integer + uri | URI + url | URL + version | VERSION + word | Word +(23 rows) +</programlisting> +</para> + </listitem> + </varlistentry> + + + </variablelist> + +</sect1> + +<sect1 id="textsearch-debugging"> +<title>Debugging</title> + +<para> +Function <function>ts_debug</function> allows easy testing of your full text searching +configuration. 
+</para> + +<synopsis> +ts_debug(<optional><replaceable class="PARAMETER">conf_name</replaceable></optional>, <replaceable class="PARAMETER">document</replaceable> TEXT) returns SETOF tsdebug +</synopsis> + +<para> +<function>ts_debug</> displays information about every token of +<replaceable class="PARAMETER">document</replaceable> as produced by the +parser and processed by the configured dictionaries using the configuration +specified by <replaceable class="PARAMETER">conf_name</replaceable>. +</para> +<para> +<replaceable class="PARAMETER">tsdebug</replaceable> type defined as: +<programlisting> +CREATE TYPE tsdebug AS ( + "Alias" text, + "Description" text, + "Token" text, + "Dicts list" text[], + "Lexized token" text +</programlisting> +</para> + +<para> +For a demonstration of how function <function>ts_debug</function> works we +first create a <literal>public.english</literal> configuration and +ispell dictionary for the English language. You can skip the test step and +play with the standard <literal>english</literal> configuration. 
+</para> +<programlisting> +CREATE TEXT SEARCH CONFIGURATION public.english LIKE pg_catalog.english WITH MAP AS DEFAULT; +CREATE TEXT SEARCH DICTIONARY en_ispell + TEMPLATE ispell_template + OPTION 'DictFile="/usr/local/share/dicts/ispell/english-utf8.dict", + AffFile="/usr/local/share/dicts/ispell/english-utf8.aff", + StopFile="/usr/local/share/dicts/english.stop"'; +ALTER TEXT SEARCH MAPPING ON public.english FOR lword WITH en_ispell,en_stem; +</programlisting> + +<programlisting> +SELECT * FROM ts_debug('public.english','The Brightest supernovaes'); + Alias | Description | Token | Dicts list | Lexized token +-------+---------------+-------------+---------------------------------------+--------------------------------- + lword | Latin word | The | {public.en_ispell,pg_catalog.en_stem} | public.en_ispell: {} + blank | Space symbols | | | + lword | Latin word | Brightest | {public.en_ispell,pg_catalog.en_stem} | public.en_ispell: {bright} + blank | Space symbols | | | + lword | Latin word | supernovaes | {public.en_ispell,pg_catalog.en_stem} | pg_catalog.en_stem: {supernova} +(5 rows) +</programlisting> +<para> +In this example, the word <literal>'Brightest'</> was recognized by a +parser as a <literal>Latin word</literal> (alias <literal>lword</literal>) +and came through the dictionaries <literal>public.en_ispell</> and +<literal>pg_catalog.en_stem</literal>. It was recognized by +<literal>public.en_ispell</literal>, which reduced it to the noun +<literal>bright</literal>. The word <literal>supernovaes</literal> is unknown +by the <literal>public.en_ispell</literal> dictionary so it was passed to +the next dictionary, and, fortunately, was recognized (in fact, +<literal>public.en_stem</literal> is a stemming dictionary and recognizes +everything; that is why it was placed at the end of the dictionary stack). 
+</para> + +<para> +The word <literal>The</literal> was recognized by <literal>public.en_ispell</literal> +dictionary as a stop word (<xref linkend="textsearch-stopwords">) and will not be indexed. +</para> + +<para> +You can always explicitly specify which columns you want to see: +<programlisting> +SELECT "Alias", "Token", "Lexized token" +FROM ts_debug('public.english','The Brightest supernovaes'); + Alias | Token | Lexized token +-------+-------------+--------------------------------- + lword | The | public.en_ispell: {} + blank | | + lword | Brightest | public.en_ispell: {bright} + blank | | + lword | supernovaes | pg_catalog.en_stem: {supernova} +(5 rows) +</programlisting> +</para> + +</sect1> + +<sect1 id="textsearch-rule-dictionary-example"> +<title>Example of Creating a Rule-Based Dictionary</title> + +<para> +The motivation for this example dictionary is to control the indexing of +integers (signed and unsigned), and, consequently, to minimize the number +of unique words which greatly affects to performance of searching. +</para> + +<para> +The dictionary accepts two options: +<itemizedlist spacing="compact" mark="bullet"> + +<listitem><para> +The <LITERAL>MAXLEN</literal> parameter specifies the maximum length of the +number considered as a 'good' integer. The default value is 6. +</para></listitem> + +<listitem><para> +The <LITERAL>REJECTLONG</LITERAL> parameter specifies if a 'long' integer +should be indexed or treated as a stop word. If +<literal>REJECTLONG</literal>=<LITERAL>FALSE</LITERAL> (default), +the dictionary returns the prefixed part of the integer with length +<LITERAL>MAXLEN</literal>. If +<LITERAL>REJECTLONG</LITERAL>=<LITERAL>TRUE</LITERAL>, the dictionary +considers a long integer as a stop word. +</para></listitem> + +</itemizedlist> + +</para> + +<para> +A similar idea can be applied to the indexing of decimal numbers, for +example, in the <literal>DecDict</literal> dictionary. 
The dictionary +accepts two options: the <literal>MAXLENFRAC</literal> parameter specifies +the maximum length of the fractional part considered as a 'good' decimal. +The default value is 3. The <literal>REJECTLONG</literal> parameter +controls whether a decimal number with a 'long' fractional part should be indexed +or treated as a stop word. If +<literal>REJECTLONG</literal>=<literal>FALSE</literal> (default), +the dictionary returns the decimal number with the length of its fraction part +truncated to <literal>MAXLEN</literal>. If +<literal>REJECTLONG</literal>=<literal>TRUE</literal>, the dictionary +considers the number as a stop word. Notice that +<literal>REJECTLONG</literal>=<literal>FALSE</literal> allows the indexing +of 'shortened' numbers and search results will contain documents with +shortened numbers. +</para> + + +<para> +Examples: +<programlisting> +SELECT lexize('intdict', 11234567890); + lexize +---------- + {112345} +</programlisting> +</para> +<para> +Now, we want to ignore long integers: +<programlisting> + +ALTER TEXT SEARCH DICTIONARY intdict SET OPTION 'MAXLEN=6, REJECTLONG=TRUE'; +SELECT lexize('intdict', 11234567890); + lexize +-------- + {} +</programlisting> +</para> + +<para> +Create <filename>contrib/dict_intdict</> directory with files +<filename>dict_tmpl.c</>, <filename>Makefile</>, <filename>dict_intdict.sql.in</>: +<programlisting> +make && make install +psql DBNAME < dict_intdict.sql +</programlisting> +</para> + +<para> +This is a <filename>dict_tmpl.c</> file: +</para> + +<programlisting> +#include "postgres.h" +#include "utils/builtins.h" +#include "fmgr.h" + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +#include "utils/ts_locale.h" +#include "utils/ts_public.h" +#include "utils/ts_utils.h" + + typedef struct { + int maxlen; + bool rejectlong; + } DictInt; + + + PG_FUNCTION_INFO_V1(dinit_intdict); + Datum dinit_intdict(PG_FUNCTION_ARGS); + + Datum + dinit_intdict(PG_FUNCTION_ARGS) { + DictInt *d = (DictInt*)malloc( 
sizeof(DictInt) ); + Map *cfg, *pcfg; + text *in; + + if (!d) + elog(ERROR, "No memory"); + memset(d, 0, sizeof(DictInt)); + + /* Your INIT code */ +/* defaults */ + d->maxlen = 6; + d->rejectlong = false; + + if ( PG_ARGISNULL(0) || PG_GETARG_POINTER(0) == NULL ) /* no options */ + PG_RETURN_POINTER(d); + + in = PG_GETARG_TEXT_P(0); + parse_keyvalpairs(in, &cfg); + PG_FREE_IF_COPY(in, 0); + pcfg=cfg; + + while (pcfg->key) + { + if (strcasecmp("MAXLEN", pcfg->key) == 0) + d->maxlen=atoi(pcfg->value); + else if ( strcasecmp("REJECTLONG", pcfg->key) == 0) + { + if ( strcasecmp("true", pcfg->value) == 0 ) + d->rejectlong=true; + else if ( strcasecmp("false", pcfg->value) == 0) + d->rejectlong=false; + else + elog(ERROR,"Unknown value: %s => %s", pcfg->key, pcfg->value); + } + else + elog(ERROR,"Unknown option: %s => %s", pcfg->key, pcfg->value); + + pfree(pcfg->key); + pfree(pcfg->value); + pcfg++; + } + pfree(cfg); + + PG_RETURN_POINTER(d); + } + +PG_FUNCTION_INFO_V1(dlexize_intdict); +Datum dlexize_intdict(PG_FUNCTION_ARGS); +Datum +dlexize_intdict(PG_FUNCTION_ARGS) +{ + DictInt *d = (DictInt*)PG_GETARG_POINTER(0); + char *in = (char*)PG_GETARG_POINTER(1); + char *txt = pnstrdup(in, PG_GETARG_INT32(2)); + TSLexeme *res = palloc(sizeof(TSLexeme) * 2); + + /* Your INIT dictionary code */ + res[1].lexeme = NULL; + + if (PG_GETARG_INT32(2) > d->maxlen) + { + if (d->rejectlong) + { /* stop, return void array */ + pfree(txt); + res[0].lexeme = NULL; + } + else + { /* cut integer */ + txt[d->maxlen] = '\0'; + res[0].lexeme = txt; + } + } + else + res[0].lexeme = txt; + + PG_RETURN_POINTER(res); +} +</programlisting> + +<para> +This is the <literal>Makefile</literal>: +<programlisting> +subdir = contrib/dict_intdict +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global + +MODULE_big = dict_intdict +OBJS = dict_tmpl.o +DATA_built = dict_intdict.sql +DOCS = + +include $(top_srcdir)/contrib/contrib-global.mk +</programlisting> +</para> + +<para> +This is a <literal>dict_intdict.sql.in</literal>: +<programlisting> +SET default_text_search_config = 'english'; + +BEGIN; + +CREATE OR REPLACE FUNCTION dinit_intdict(internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE 'C'; + +CREATE OR REPLACE FUNCTION dlexize_intdict(internal,internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE 'C' +WITH (isstrict); + +CREATE TEXT SEARCH DICTIONARY intdict + LEXIZE 'dlexize_intdict' INIT 'dinit_intdict' + OPTION 'MAXLEN=6,REJECTLONG = false'; + +COMMENT ON TEXT SEARCH DICTIONARY intdict IS 'Dictionary for Integers'; + +END; +</programlisting> +</para> + +</sect1> + +<sect1 id="textsearch-parser-example"> +<title>Example of Creating a Parser</title> + +<para> +<acronym>SQL</acronym> command <literal>CREATE TEXT SEARCH PARSER</literal> creates +a parser for full text searching. In our example we will implement +a simple parser which recognizes space-delimited words and +has only two types (3, word, Word; 12, blank, Space symbols). Identifiers +were chosen to keep compatibility with the default <function>headline()</function> function +since we do not implement our own version. +</para> + +<para> +To implement a parser one needs to create a minimum of four functions. +</para> + +<variablelist> + +<varlistentry> +<term> +<synopsis> +START = <replaceable class="PARAMETER">start_function</replaceable> +</synopsis> +</term> +<listitem> +<para> +Initialize the parser. Arguments are a pointer to the parsed text and its +length. +</para> +<para> +Returns a pointer to the internal structure of a parser. Note that it should +be <function>malloc</>ed or <function>palloc</>ed in the +<literal>TopMemoryContext</>. We name it <literal>ParserState</>. 
+</para>
+</listitem>
+</varlistentry>
+
+<varlistentry>
+<term>
+<synopsis>
+GETTOKEN = <replaceable class="PARAMETER">gettoken_function</replaceable>
+</synopsis>
+</term>
+<listitem>
+<para>
+Returns the next token.
+Arguments are <literal>ParserState *, char **, int *</literal>.
+</para>
+<para>
+This procedure will be called repeatedly until it returns token type zero,
+which indicates that parsing is complete.
+</para>
+</listitem>
+</varlistentry>
+
+<varlistentry>
+<term>
+<synopsis>
+END = <replaceable class="PARAMETER">end_function</replaceable>
+</synopsis>
+</term>
+<listitem>
+<para>
+This void function will be called after parsing is finished to free
+any resources allocated by the parser (for example, <literal>ParserState</>).
+The argument is <literal>ParserState *</literal>.
+</para>
+</listitem>
+</varlistentry>
+
+<varlistentry>
+<term>
+<synopsis>
+LEXTYPES = <replaceable class="PARAMETER">lextypes_function</replaceable>
+</synopsis>
+</term>
+<listitem>
+<para>
+Returns an array containing the id, alias, and the description of the tokens
+in the parser. See <structname>LexDescr</structname> in <filename>src/include/utils/ts_public.h</>.
+</para>
+</listitem>
+</varlistentry>
+
+</variablelist>
+
+<para>
+Below is the source code of our test parser, organized as a <filename>contrib</> module.
+</para> + +<para> +Testing: +<programlisting> +SELECT * FROM parse('testparser','That''s my first own parser'); + tokid | token +-------+-------- + 3 | That's + 12 | + 3 | my + 12 | + 3 | first + 12 | + 3 | own + 12 | + 3 | parser +SELECT to_tsvector('testcfg','That''s my first own parser'); + to_tsvector +------------------------------------------------- + 'my':2 'own':4 'first':3 'parser':5 'that''s':1 +SELECT headline('testcfg','Supernovae stars are the brightest phenomena in galaxies', to_tsquery('testcfg', 'star')); + headline +----------------------------------------------------------------- + Supernovae <b>stars</b> are the brightest phenomena in galaxies +</programlisting> + +</para> + +<para> +This test parser is an example adopted from a tutorial by Valli, <ulink +url="http://www.sai.msu.su/~megera/postgres/gist/tsearch/V2/docs/HOWTO-parser-tsearch2.html">parser +HOWTO</ulink>. +</para> + +<para> +To compile the example just do: +<programlisting> +make +make install +psql regression < test_parser.sql +</programlisting> +</para> + +<para> +This is a <filename>test_parser.c</>: +<programlisting> + +#ifdef PG_MODULE_MAGIC +PG_MODULE_MAGIC; +#endif + +/* + * types + */ + +/* self-defined type */ +typedef struct { + char * buffer; /* text to parse */ + int len; /* length of the text in buffer */ + int pos; /* position of the parser */ +} ParserState; + +/* copy-paste from wparser.h of tsearch2 */ +typedef struct { + int lexid; + char *alias; + char *descr; +} LexDescr; + +/* + * prototypes + */ +PG_FUNCTION_INFO_V1(testprs_start); +Datum testprs_start(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(testprs_getlexeme); +Datum testprs_getlexeme(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(testprs_end); +Datum testprs_end(PG_FUNCTION_ARGS); + +PG_FUNCTION_INFO_V1(testprs_lextype); +Datum testprs_lextype(PG_FUNCTION_ARGS); + +/* + * functions + */ +Datum testprs_start(PG_FUNCTION_ARGS) +{ + ParserState *pst = (ParserState *) palloc(sizeof(ParserState)); + pst->buffer = 
(char *) PG_GETARG_POINTER(0);
+    pst->len = PG_GETARG_INT32(1);
+    pst->pos = 0;
+
+    PG_RETURN_POINTER(pst);
+}
+
+Datum testprs_getlexeme(PG_FUNCTION_ARGS)
+{
+    ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+    char       **t = (char **) PG_GETARG_POINTER(1);
+    int         *tlen = (int *) PG_GETARG_POINTER(2);
+    int          type;
+
+    *tlen = pst->pos;
+    *t = pst->buffer + pst->pos;
+
+    /*
+     * Check pst->pos < pst->len BEFORE reading buffer[pos]: the buffer
+     * is not NUL-terminated, so testing the character first reads one
+     * byte past the end of the text once the parser reaches the end.
+     */
+    if (pst->pos < pst->len &&
+        (pst->buffer)[pst->pos] == ' ')
+    {
+        /* blank type */
+        type = 12;
+        /* go to the next non-white-space character */
+        while (pst->pos < pst->len &&
+               (pst->buffer)[pst->pos] == ' ')
+            (pst->pos)++;
+    }
+    else
+    {
+        /* word type */
+        type = 3;
+        /* go to the next white-space character */
+        while (pst->pos < pst->len &&
+               (pst->buffer)[pst->pos] != ' ')
+            (pst->pos)++;
+    }
+
+    *tlen = pst->pos - *tlen;
+
+    /* we are finished if (*tlen == 0) */
+    if (*tlen == 0)
+        type=0;
+
+    PG_RETURN_INT32(type);
+}
+
+Datum testprs_end(PG_FUNCTION_ARGS)
+{
+    ParserState *pst = (ParserState *) PG_GETARG_POINTER(0);
+
+    pfree(pst);
+    PG_RETURN_VOID();
+}
+
+Datum testprs_lextype(PG_FUNCTION_ARGS)
+{
+    /*
+       Remarks:
+       - we have to return the blanks for headline reason
+       - we use the same lexids like Teodor in the default
+         word parser; in this way we can reuse the headline
+         function of the default word parser.
+    */
+    LexDescr   *descr = (LexDescr *) palloc(sizeof(LexDescr) * (2+1));
+
+    /* there are only two types in this parser */
+    descr[0].lexid = 3;
+    descr[0].alias = pstrdup("word");
+    descr[0].descr = pstrdup("Word");
+    descr[1].lexid = 12;
+    descr[1].alias = pstrdup("blank");
+    descr[1].descr = pstrdup("Space symbols");
+    descr[2].lexid = 0;
+
+    PG_RETURN_POINTER(descr);
+}
+
+</programlisting>
+
+This is a <literal>Makefile</literal>
+
+<programlisting>
+override CPPFLAGS := -I.
$(CPPFLAGS) + +MODULE_big = test_parser +OBJS = test_parser.o + +DATA_built = test_parser.sql +DATA = +DOCS = README.test_parser +REGRESS = test_parser + + +ifdef USE_PGXS +PGXS := $(shell pg_config --pgxs) +include $(PGXS) +else +subdir = contrib/test_parser +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif +</programlisting> + +This is a <literal>test_parser.sql.in</literal>: + +<programlisting> +SET default_text_search_config = 'english'; + +BEGIN; + +CREATE FUNCTION testprs_start(internal,int4) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE 'C' with (isstrict); + +CREATE FUNCTION testprs_getlexeme(internal,internal,internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE 'C' with (isstrict); + +CREATE FUNCTION testprs_end(internal) +RETURNS void +AS 'MODULE_PATHNAME' +LANGUAGE 'C' with (isstrict); + +CREATE FUNCTION testprs_lextype(internal) +RETURNS internal +AS 'MODULE_PATHNAME' +LANGUAGE 'C' with (isstrict); + + +CREATE TEXT SEARCH PARSER testparser + START 'testprs_start' + GETTOKEN 'testprs_getlexeme' + END 'testprs_end' + LEXTYPES 'testprs_lextype' +; + +CREATE TEXT SEARCH CONFIGURATION testcfg PARSER 'testparser'; +CREATE TEXT SEARCH CONFIGURATION testcfg ADD MAPPING FOR word WITH simple; + +END; +</programlisting> + +</para> + +</sect1> + +</chapter> -- GitLab