From 938236a29716c754a9a9238e377c3cd15db11dde Mon Sep 17 00:00:00 2001
From: Bruce Momjian <bruce@momjian.us>
Date: Wed, 1 Aug 2001 18:40:12 +0000
Subject: [PATCH] The fti.pl supplied with the fulltextindex module generates
 ALL possible substrings of two characters or greater, and is case-sensitive.

This patch makes it work correctly.  It generates only the suffixes of each
word, plus lowercases them - as specified by the README file.

This brings it into line with the fti.c function, makes it properly
case-insensitive, removes the problem of duplicate rows being returned from
an fti search, and greatly reduces the size of the generated index table.

It was written by my co-worker, Brett Toolin.

Christopher Kings-Lynne
---
 contrib/fulltextindex/fti.pl | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/contrib/fulltextindex/fti.pl b/contrib/fulltextindex/fti.pl
index 02bf057e94a..230ba927033 100644
--- a/contrib/fulltextindex/fti.pl
+++ b/contrib/fulltextindex/fti.pl
@@ -1,6 +1,6 @@
 #!/usr/bin/perl
 #
-# This script substracts all substrings out of a specific column in a table
+# This script extracts all suffixes of all words in a specific column in a table
 # and generates output that can be loaded into a new table with the
 # psql '\copy' command. The new table should have the following structure:
 #
@@ -52,27 +52,28 @@ $PGRES_BAD_RESPONSE   = 5 ;
 $PGRES_NONFATAL_ERROR = 6 ;
 $PGRES_FATAL_ERROR    = 7 ;
 
+# words must be longer than this many characters to be included in the full text index
+$MIN_WORD_LENGTH = 2;
+
+# the minimum length of the substrings in the full text index
+$MIN_SUBSTRING_LENGTH = 2;
+
 $[ = 0; # make sure string offsets start at 0
 
 sub break_up {
 	my $string = pop @_;
 
+	# convert strings to lower case
+	$string = lc($string);
 	@strings = split(/\W+/, $string);
 	@subs = ();
 
 	foreach $s (@strings) {
 		$len = length($s);
-		next if ($len < 4);
-
-		$lpos = $len-1;
-		while ($lpos >= 3) {
-			$fpos = $lpos - 3;
-			while ($fpos >= 0) {
-				$sub = substr($s, $fpos, $lpos - $fpos + 1);
-				push(@subs, $sub);
-				$fpos = $fpos - 1;
-			}
-			$lpos = $lpos - 1;
+		next if ($len <= $MIN_WORD_LENGTH);
+		for ($i = 0; $i <= $len - $MIN_SUBSTRING_LENGTH; $i++) {
+			$tmp = substr($s, $i);
+			push(@subs, $tmp);
 		}
 	}
 
-- 
GitLab