From c63c1946a270aa6b287a4760987b96cfd67cc3fb Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Mon, 17 Nov 2003 17:34:35 +0000
Subject: [PATCH] Optimize. Improve ispell support for compound words. This
 work was sponsored by ABC Startsiden AS.

---
 contrib/tsearch2/dict_ispell.c  |  12 +-
 contrib/tsearch2/ispell/spell.c | 905 ++++++++++++++++++++++----------
 contrib/tsearch2/ispell/spell.h |  83 ++-
 3 files changed, 706 insertions(+), 294 deletions(-)

diff --git a/contrib/tsearch2/dict_ispell.c b/contrib/tsearch2/dict_ispell.c
index e3a100fa013..a0e67a69e1d 100644
--- a/contrib/tsearch2/dict_ispell.c
+++ b/contrib/tsearch2/dict_ispell.c
@@ -27,7 +27,7 @@ Datum		spell_lexize(PG_FUNCTION_ARGS);
 static void
 freeDictISpell(DictISpell * d)
 {
-	FreeIspell(&(d->obj));
+	NIFree(&(d->obj));
 	freestoplist(&(d->stoplist));
 	free(d);
 }
@@ -71,7 +71,7 @@ spell_init(PG_FUNCTION_ARGS)
 					  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					   errmsg("dictionary already loaded")));
 			}
-			if (ImportDictionary(&(d->obj), pcfg->value))
+			if (NIImportDictionary(&(d->obj), pcfg->value))
 			{
 				freeDictISpell(d);
 				ereport(ERROR,
@@ -90,7 +90,7 @@ spell_init(PG_FUNCTION_ARGS)
 					  (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
 					   errmsg("affixes already loaded")));
 			}
-			if (ImportAffixes(&(d->obj), pcfg->value))
+			if (NIImportAffixes(&(d->obj), pcfg->value))
 			{
 				freeDictISpell(d);
 				ereport(ERROR,
@@ -132,8 +132,8 @@ spell_init(PG_FUNCTION_ARGS)
 
 	if (affloaded && dictloaded)
 	{
-		SortDictionary(&(d->obj));
-		SortAffixes(&(d->obj));
+		NISortDictionary(&(d->obj));
+		NISortAffixes(&(d->obj));
 	}
 	else if (!affloaded)
 	{
@@ -168,7 +168,7 @@ spell_lexize(PG_FUNCTION_ARGS)
 
 	res = palloc(sizeof(char *) * 2);
 	txt = pnstrdup(in, PG_GETARG_INT32(2));
-	res = NormalizeWord(&(d->obj), txt);
+	res = NINormalizeWord(&(d->obj), txt);
 	pfree(txt);
 
 	if (res == NULL)
diff --git a/contrib/tsearch2/ispell/spell.c b/contrib/tsearch2/ispell/spell.c
index 45786cca652..6f0fe423140 100644
--- a/contrib/tsearch2/ispell/spell.c
+++ b/contrib/tsearch2/ispell/spell.c
@@ -7,15 +7,26 @@
 
 #include "spell.h"
 
-#define MAXNORMLEN 56
+#define MAX_NORM 1024
+#define MAXNORMLEN 256
 
 #define STRNCASECMP(x,y)		(strncasecmp(x,y,strlen(y)))
+#define GETWCHAR(W,L,N,T) ( ((u_int8_t*)(W))[ ((T)=='p') ? (N) : ( (L) - 1 - (N) ) ] )
+#define GETCHAR(A,N,T)	  GETWCHAR( (A)->repl, (A)->replen, N, T )
+
+/* Raise an out-of-memory ERROR when X is NULL/zero; do/while(0) makes it a single statement, safe in unbraced if/else. */
+#define MEMOUT(X)  do { if ( !(X) ) ereport(ERROR, (errcode(ERRCODE_OUT_OF_MEMORY), errmsg("out of memory"))); } while (0)
 
 static int
 cmpspell(const void *s1, const void *s2)
 {
 	return (strcmp(((const SPELL *) s1)->word, ((const SPELL *) s2)->word));
 }
+static int		/* qsort()-style comparator: orders SPELL entries by their flag string */
+cmpspellaffix(const void *s1, const void *s2)
+{
+	return (strcmp(((const SPELL *) s1)->p.flag, ((const SPELL *) s2)->p.flag));
+}
 
 static void
 strlower(char *str)
@@ -29,6 +40,13 @@ strlower(char *str)
 	}
 }
 
+static char* 	/* returns a palloc'd, NUL-terminated copy of the first len bytes of s */
+strndup(char *s, int len) {	/* NOTE(review): same name as POSIX strndup() - confirm no clash with <string.h> */
+	char *d=(char*)palloc( len + 1 );
+	memcpy(d, s, len );
+	d[len]='\0';
+	return d;
+}
 /* backward string compaire for suffix tree operations */
 static int
 strbcmp(const char *s1, const char *s2)
@@ -92,7 +110,7 @@ cmpaffix(const void *s1, const void *s2)
 }
 
 int
-AddSpell(IspellDict * Conf, const char *word, const char *flag)
+NIAddSpell(IspellDict * Conf, const char *word, const char *flag)	/* appends a (word, flag) entry to Conf->Spell; returns 0 */
 {
 	if (Conf->nspell >= Conf->mspell)
 	{
@@ -106,24 +124,18 @@ AddSpell(IspellDict * Conf, const char *word, const char *flag)
 			Conf->mspell = 1024 * 20;
 			Conf->Spell = (SPELL *) malloc(Conf->mspell * sizeof(SPELL));
 		}
-		if (Conf->Spell == NULL)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		MEMOUT(Conf->Spell);
 	}
 	Conf->Spell[Conf->nspell].word = strdup(word);
-	if (!Conf->Spell[Conf->nspell].word)
-		ereport(ERROR,
-				(errcode(ERRCODE_OUT_OF_MEMORY),
-				 errmsg("out of memory")));
-	strncpy(Conf->Spell[Conf->nspell].flag, flag, 10);
+	MEMOUT(Conf->Spell[Conf->nspell].word);
+	snprintf(Conf->Spell[Conf->nspell].p.flag, sizeof(Conf->Spell[Conf->nspell].p.flag), "%s", flag);	/* always NUL-terminated; strncpy(.., 16) left flags of 16+ chars unterminated */
 	Conf->nspell++;
 	return (0);
 }
 
 
 int
-ImportDictionary(IspellDict * Conf, const char *filename)
+NIImportDictionary(IspellDict * Conf, const char *filename)
 {
 	unsigned char str[BUFSIZ];
 	FILE	   *dict;
@@ -143,7 +155,7 @@ ImportDictionary(IspellDict * Conf, const char *filename)
 			flag = s;
 			while (*s)
 			{
-				if (((*s >= 'A') && (*s <= 'Z')) || ((*s >= 'a') && (*s <= 'z')))
+				if (isprint(*s) && !isspace(*s))
 					s++;
 				else
 				{
@@ -166,65 +178,49 @@ ImportDictionary(IspellDict * Conf, const char *filename)
 				*s = 0;
 			s++;
 		}
-		AddSpell(Conf, str, flag);
+		NIAddSpell(Conf, str, flag);
 	}
 	fclose(dict);
 	return (0);
 }
 
 
-static SPELL *
-FindWord(IspellDict * Conf, const char *word, int affixflag)
+static int	/* 1 if word ends on an isword trie node that passes the compound/affix-flag checks below, else 0 */
+FindWord(IspellDict * Conf, const char *word, int affixflag, char compoundonly)
 {
-	int			l,
-				c,
-				r,
-				resc,
-				resl,
-				resr,
-				i;
-
-	i = (int) (*word) & 255;
-	l = Conf->SpellTree.Left[i];
-	r = Conf->SpellTree.Right[i];
-	if (l == -1)
-		return (NULL);
-	while (l <= r)
-	{
-		c = (l + r) >> 1;
-		resc = strcmp(Conf->Spell[c].word, word);
-		if ((resc == 0) &&
-			((affixflag == 0) || (strchr(Conf->Spell[c].flag, affixflag) != NULL)))
-			return (&Conf->Spell[c]);
-		resl = strcmp(Conf->Spell[l].word, word);
-		if ((resl == 0) &&
-			((affixflag == 0) || (strchr(Conf->Spell[l].flag, affixflag) != NULL)))
-			return (&Conf->Spell[l]);
-		resr = strcmp(Conf->Spell[r].word, word);
-		if ((resr == 0) &&
-			((affixflag == 0) || (strchr(Conf->Spell[r].flag, affixflag) != NULL)))
-			return (&Conf->Spell[r]);
-		if (resc < 0)
-		{
-			l = c + 1;
-			r--;
-		}
-		else if (resc > 0)
-		{
-			r = c - 1;
-			l++;
-		}
-		else
-		{
-			l++;
-			r--;
+	SPNode *node = Conf->Dictionary;
+	SPNodeData *StopLow, *StopHigh, *StopMiddle;
+	int level=0, wrdlen=strlen(word);	/* level = index of the byte of word being matched */
+
+	while( node && level<wrdlen) {	/* descend one trie level per byte of word */
+		StopLow = node->data;
+		StopHigh = node->data+node->length;
+		while (StopLow < StopHigh) {	/* binary search of this node's entries by byte value */
+			StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+			if ( StopMiddle->val == ((u_int8_t*)(word))[level] ) {
+				if ( wrdlen==level+1 && StopMiddle->isword ) {	/* last byte matched and a word ends here */
+					if ( compoundonly && !StopMiddle->compoundallow )
+						return 0;
+					if ( (affixflag == 0) || (strchr(Conf->AffixData[StopMiddle->affix], affixflag) != NULL))
+						return 1;	/* affixflag==0 accepts any word; otherwise the flag must be in its flag string */
+				}
+				node=StopMiddle->node;
+				level++;
+				break;	/* matched: leaves the search with StopLow < StopHigh */
+			} else if ( StopMiddle->val < ((u_int8_t*)(word))[level] ) {
+				StopLow = StopMiddle + 1;
+			} else {
+				StopHigh = StopMiddle;
+			}
 		}
+		if ( StopLow >= StopHigh )	/* search exhausted without a match */
+			break; 
 	}
-	return (NULL);
+	return 0;
 }
 
 int
-AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type)
+NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type)
 {
 	if (Conf->naffixes >= Conf->maffixes)
 	{
@@ -238,16 +234,14 @@ AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const
 			Conf->maffixes = 16;
 			Conf->Affix = (AFFIX *) malloc(Conf->maffixes * sizeof(AFFIX));
 		}
-		if (Conf->Affix == NULL)
-			ereport(ERROR,
-					(errcode(ERRCODE_OUT_OF_MEMORY),
-					 errmsg("out of memory")));
+		MEMOUT(Conf->Affix);
 	}
 	if (type == 's')
 		sprintf(Conf->Affix[Conf->naffixes].mask, "%s$", mask);
 	else
 		sprintf(Conf->Affix[Conf->naffixes].mask, "^%s", mask);
 	Conf->Affix[Conf->naffixes].compile = 1;
+	Conf->Affix[Conf->naffixes].flagflags = flagflags;
 	Conf->Affix[Conf->naffixes].flag = flag;
 	Conf->Affix[Conf->naffixes].type = type;
 
@@ -281,7 +275,7 @@ remove_spaces(char *dist, char *src)
 
 
 int
-ImportAffixes(IspellDict * Conf, const char *filename)
+NIImportAffixes(IspellDict * Conf, const char *filename)	/* reads the affix file into Conf; returns 1 if it cannot be opened, else 0 */
 {
 	unsigned char str[BUFSIZ];
 	unsigned char flag = 0;
@@ -292,13 +286,24 @@ ImportAffixes(IspellDict * Conf, const char *filename)
 	int			i;
 	int			suffixes = 0;
 	int			prefixes = 0;
+	unsigned char flagflags = 0;	/* FF_* modifiers of the most recent "flag" line */
 	FILE	   *affix;
 
 	if (!(affix = fopen(filename, "r")))
 		return (1);
+	Conf->compoundcontrol='\t';	/* '\t' = no compound flag configured */
 
 	while (fgets(str, sizeof(str), affix))
 	{
+		if (STRNCASECMP(str, "compoundwords")==0) {
+			s=strchr(str, 'l');
+			if ( s ) {
+				while( *s && *s!=' ' ) s++;	/* stop at NUL too: the line may contain no space after the keyword */
+				while( *s==' ' ) s++;
+				if ( *s && *s!='\n' ) Conf->compoundcontrol = *s;	/* ignore a line that names no flag character */
+				continue; 
+			}
+		}
 		if (!STRNCASECMP(str, "suffixes"))
 		{
 			suffixes = 1;
@@ -314,8 +319,18 @@ ImportAffixes(IspellDict * Conf, const char *filename)
 		if (!STRNCASECMP(str, "flag "))
 		{
 			s = str + 5;
-			while (strchr("* ", *s))
+			flagflags=0;
+			while( *s==' ' ) s++;
+			if ( *s=='*' ) {	/* '*' prefix sets FF_CROSSPRODUCT */
+				flagflags|=FF_CROSSPRODUCT;
+				s++;
+			} else if ( *s=='~' ) {	/* '~' prefix sets FF_COMPOUNDONLYAFX */
+				flagflags|=FF_COMPOUNDONLYAFX;
 				s++;
+			}
+
+			if ( *s=='\\' ) s++;	/* backslash escapes the flag character itself */
+		
 			flag = *s;
 			continue;
 		}
@@ -351,7 +366,7 @@ ImportAffixes(IspellDict * Conf, const char *filename)
 				continue;
 		}
 
-		AddAffix(Conf, (int) flag, mask, find, repl, suffixes ? 's' : 'p');
+		NIAddAffix(Conf, (int) flag, (char) flagflags, mask, find, repl, suffixes ? 's' : 'p');
 
 	}
 	fclose(affix);
 
+/* Append "AffixData[a1] AffixData[a2]" as a new entry of the NULL-terminated Conf->AffixData; return its index. */
+static int
+MergeAffix(IspellDict *Conf, int a1, int a2) {
+	int naffix=0;
+	char **ptr=Conf->AffixData;
+
+	while(*ptr) {	/* count the existing entries up to the NULL terminator */
+		naffix++;
+		ptr++;
+	}
+	/* grow by one: room for the new entry plus the terminator */
+	Conf->AffixData=(char**)realloc( Conf->AffixData, (naffix+2)*sizeof(char*) );
+	MEMOUT(Conf->AffixData);
+	ptr = Conf->AffixData + naffix;
+	*ptr=malloc( strlen(Conf->AffixData[a1]) + strlen(Conf->AffixData[a2]) + 1 /* space */ + 1 /* \0 */ );
+	MEMOUT(*ptr);	/* check the malloc result, not the address of the slot (which is never NULL) */
+	sprintf(*ptr, "%s %s", Conf->AffixData[a1], Conf->AffixData[a2]);
+	ptr[1]=NULL;	/* re-terminate the array */
+	return naffix;
+}
+
+/* Build the trie node for Conf->Spell[low..high) at character position `level`; returns NULL when no word in the range is longer than level. */
+static SPNode* 
+mkSPNode(IspellDict *Conf, int low, int high, int level) {
+	int i;
+	int nchar=0;	/* number of changes of character at this level = entries in the new node */
+	char lastchar='\0';
+	SPNode *rs;
+	SPNodeData *data;
+	int lownew=low;	/* first Spell index of the sub-range for the current character */
+
+	for(i=low; i<high; i++)	/* pass 1: count entries; NOTE(review): relies on the range being sorted by word - confirm at the caller */
+		if ( Conf->Spell[i].p.d.len>level && lastchar!=Conf->Spell[i].word[level] ) {
+			nchar++;
+			lastchar=Conf->Spell[i].word[level];
+		}
+
+	if (!nchar)
+		return NULL;
+
+	rs=(SPNode*)malloc(SPNHRDSZ+nchar*sizeof(SPNodeData));
+	MEMOUT(rs);
+	memset(rs,0,SPNHRDSZ+nchar*sizeof(SPNodeData));
+	rs->length = nchar;
+	data=rs->data;
+
+	lastchar='\0';
+	for(i=low; i<high; i++)	/* pass 2: fill entries and recurse into each character's sub-range */
+		if ( Conf->Spell[i].p.d.len>level ) {
+			if ( lastchar!=Conf->Spell[i].word[level] ) {
+				if ( lastchar ) {	/* character changed: close the previous entry with its subtree */
+					data->node = mkSPNode(Conf, lownew, i, level+1);
+					lownew=i;
+					data++;
+				}
+				lastchar=Conf->Spell[i].word[level];
+			}
+			data->val=((u_int8_t*)(Conf->Spell[i].word))[level];
+			if ( Conf->Spell[i].p.d.len == level+1 ) {	/* this word ends at the current entry */
+				if ( data->isword && data->affix!=Conf->Spell[i].p.d.affix) {
+					/* 
+					fprintf(stderr,"Word already exists: %s (affixes: '%s' and '%s')\n", 
+						Conf->Spell[i].word, 
+						Conf->AffixData[data->affix],
+						Conf->AffixData[Conf->Spell[i].p.d.affix]
+					); 
+					*/
+					/* MergeAffix called a few times */
+					data->affix = MergeAffix(Conf, data->affix, Conf->Spell[i].p.d.affix);
+				} else
+					data->affix = Conf->Spell[i].p.d.affix;
+				data->isword=1;
+				if ( strchr( Conf->AffixData[ data->affix ], Conf->compoundcontrol ) )
+					data->compoundallow=1;	/* the word's flag string contains the compound-control character */
+			}
+		}
+		
+	data->node = mkSPNode(Conf, lownew, high, level+1);	/* subtree of the last entry */
+
+	return rs;
+}
+
+
+
 void
-SortDictionary(IspellDict * Conf)
+NISortDictionary(IspellDict * Conf)
 {
-	int			CurLet = -1,
-				Let;
 	size_t		i;
-
+	int	naffix=3;	/* slots: entry 0 (""), the first flag string, and the NULL terminator */
+	if ( Conf->nspell <= 0 ) return;	/* empty dictionary: Conf->Spell[0] below would be out of bounds */
+	/* compress affixes */
+	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspellaffix);
+	for (i = 1; i < Conf->nspell; i++)
+		if ( strcmp(Conf->Spell[i].p.flag,Conf->Spell[i-1].p.flag) )
+			naffix++;
+
+	Conf->AffixData=(char**)malloc( naffix*sizeof(char*) );
+	MEMOUT(Conf->AffixData);
+	memset(Conf->AffixData, 0, naffix*sizeof(char*));
+	naffix=1;
+	Conf->AffixData[0]=strdup("");
+	MEMOUT(Conf->AffixData[0]);
+	Conf->AffixData[1]=strdup( Conf->Spell[0].p.flag );
+	MEMOUT(Conf->AffixData[1]);
+	Conf->Spell[0].p.d.affix = 1;	/* from here the union holds (affix index, word length) instead of the flag text */
+	Conf->Spell[0].p.d.len = strlen(Conf->Spell[0].word);
+	for (i = 1; i < Conf->nspell; i++) {
+		if ( strcmp(Conf->Spell[i].p.flag, Conf->AffixData[naffix]) ) {
+			naffix++;
+			Conf->AffixData[naffix] = strdup( Conf->Spell[i].p.flag );
+			MEMOUT(Conf->AffixData[naffix]);
+		}
+		Conf->Spell[i].p.d.affix = naffix;
+		Conf->Spell[i].p.d.len = strlen(Conf->Spell[i].word);
+	}
+	
 	qsort((void *) Conf->Spell, Conf->nspell, sizeof(SPELL), cmpspell);
+	Conf->Dictionary = mkSPNode(Conf, 0, Conf->nspell, 0);
+	/* the trie now holds everything needed: release the flat word list */
+	for (i = 0; i < Conf->nspell; i++) 
+		free( Conf->Spell[i].word );
+	free( Conf->Spell );
+	Conf->Spell=NULL;
+}
 
-	for (i = 0; i < 256; i++)
-		Conf->SpellTree.Left[i] = -1;
+static AffixNode*	/* trie node for Conf->Affix[low..high) at position `level` of repl; NULL if no affix is that long */
+mkANode(IspellDict *Conf, int low, int high, int level, int type) {
+	int i;
+	int nchar=0;
+	u_int8_t lastchar='\0';
+	AffixNode *rs;
+	AffixNodeData *data;
+	int lownew=low;
+	/* GETCHAR reads repl from the front for type 'p' and from the end otherwise */
+	for(i=low; i<high; i++)
+		if ( Conf->Affix[i].replen>level && lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) {
+			nchar++;
+			lastchar=GETCHAR( Conf->Affix + i, level, type );
+		}
 
-	for (i = 0; i < Conf->nspell; i++)
-	{
-		Let = (int) (*(Conf->Spell[i].word)) & 255;
-		if (CurLet != Let)
-		{
-			Conf->SpellTree.Left[Let] = i;
-			CurLet = Let;
+	if (!nchar)
+		return NULL;
+
+	rs=(AffixNode*)malloc(ANHRDSZ+nchar*sizeof(AffixNodeData));
+	MEMOUT(rs);
+	memset(rs,0,ANHRDSZ+nchar*sizeof(AffixNodeData));
+	rs->length = nchar;
+	data=rs->data;
+
+	lastchar='\0';
+	for(i=low; i<high; i++)
+		if ( Conf->Affix[i].replen>level ) {
+			if ( lastchar!=GETCHAR( Conf->Affix + i, level, type ) ) {
+				if ( lastchar ) {	/* character changed: close the previous entry with its subtree */
+					data->node = mkANode(Conf, lownew, i, level+1, type);
+					lownew=i;
+					data++;
+				}
+				lastchar=GETCHAR( Conf->Affix + i, level, type );
+			}
+			data->val=GETCHAR( Conf->Affix + i, level, type );
+			if ( Conf->Affix[i].replen == level+1 ) { /* affix stopped */
+				if ( !data->naff )	/* first affix ending here: allocate room for every remaining candidate */
+					data->aff=(AFFIX**)malloc(sizeof(AFFIX*)*(high-i+1));
+				MEMOUT(data->aff);	/* was MEMOUT(data): the malloc result itself was never checked */
+				data->aff[ data->naff ] = Conf->Affix + i;
+				data->naff++;
+			}
 		}
-		Conf->SpellTree.Right[Let] = i;
-	}
+		
+	data->node = mkANode(Conf, lownew, high, level+1, type);
+
+	return rs;
 }
 
 void
-SortAffixes(IspellDict * Conf)
+NISortAffixes(IspellDict * Conf)
 {
-	int			CurLetP = -1,
-				CurLetS = -1,
-				Let;
 	AFFIX	   *Affix;
 	size_t		i;
+	CMPDAffix* ptr;
+	int	firstsuffix=Conf->naffixes;	/* index of the first 's' affix; naffixes when there is none (was -1, which indexed Affix[-1]) */
 
 	if (Conf->naffixes > 1)
 		qsort((void *) Conf->Affix, Conf->naffixes, sizeof(AFFIX), cmpaffix);
-	for (i = 0; i < 256; i++)
-	{
-		Conf->PrefixTree.Left[i] = Conf->PrefixTree.Right[i] = -1;
-		Conf->SuffixTree.Left[i] = Conf->SuffixTree.Right[i] = -1;
-	}
 
-	for (i = 0; i < Conf->naffixes; i++)
-	{
+	Conf->CompoundAffix = ptr = (CMPDAffix*)malloc( sizeof(CMPDAffix) * (Conf->naffixes + 1) );	/* +1: room for the NULL-affix terminator */
+	MEMOUT(Conf->CompoundAffix);
+	ptr->affix=NULL;
+
+	for (i = 0; i < Conf->naffixes; i++) {
 		Affix = &(((AFFIX *) Conf->Affix)[i]);
-		if (Affix->type == 'p')
-		{
-			Let = (int) (*(Affix->repl)) & 255;
-			if (CurLetP != Let)
-			{
-				Conf->PrefixTree.Left[Let] = i;
-				CurLetP = Let;
+		if ( Affix->type == 's' ) {
+			if ( (int) i < firstsuffix ) firstsuffix=i;
+			if ( Affix->flagflags & FF_COMPOUNDONLYAFX ) {
+				if ( ptr == Conf->CompoundAffix || strbncmp((ptr-1)->affix, Affix->repl, (ptr-1)->len) ) {	/* first entry, or differs from the previous one; *ptr itself is uninitialized after ptr++ */
+					/* leave only unique and minimals suffixes */
+					ptr->affix=Affix->repl;
+					ptr->len=Affix->replen;
+					ptr++;
+				}
 			}
-			Conf->PrefixTree.Right[Let] = i;
 		}
-		else
-		{
-			Let = (Affix->replen) ? (int) (Affix->repl[Affix->replen - 1]) & 255 : 0;
-			if (CurLetS != Let)
-			{
-				Conf->SuffixTree.Left[Let] = i;
-				CurLetS = Let;
+	}
+	ptr->affix = NULL;
+	Conf->CompoundAffix = (CMPDAffix*)realloc( Conf->CompoundAffix, sizeof(CMPDAffix) * (ptr-Conf->CompoundAffix+1) );
+	MEMOUT(Conf->CompoundAffix);
+	Conf->Prefix = mkANode(Conf, 0, firstsuffix, 0, 'p');	/* NOTE(review): assumes cmpaffix sorts every prefix before the first suffix - confirm */
+	Conf->Suffix = mkANode(Conf, firstsuffix, Conf->naffixes, 0, 's');
+}
+/* Walk the affix trie from node along word, starting at *level; return the first entry holding affixes (naff != 0), or NULL. *level is advanced per matched byte without affixes. */
+static AffixNodeData*
+FinfAffixes(AffixNode *node, const char *word, int wrdlen, int *level, int type) {
+	AffixNodeData *StopLow, *StopHigh, *StopMiddle;
+	u_int8_t symbol;
+
+	while( node && *level<wrdlen) {
+		StopLow = node->data;
+		StopHigh = node->data+node->length;
+		while (StopLow < StopHigh) {	/* binary search of this node's entries by byte value */
+			StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+			symbol = GETWCHAR(word,wrdlen,*level,type);	/* byte *level from the front for type 'p', from the end otherwise */
+			if ( StopMiddle->val == symbol ) {
+				if ( StopMiddle->naff ) 
+					return StopMiddle;	/* *level is left on the matched byte; callers step past it themselves */
+				node=StopMiddle->node;
+				(*level)++;
+				break;
+			} else if ( StopMiddle->val < symbol ) {
+				StopLow = StopMiddle + 1;
+			} else {
+				StopHigh = StopMiddle;
 			}
-			Conf->SuffixTree.Right[Let] = i;
 		}
+		if ( StopLow >= StopHigh )	/* no entry for this byte */
+			break; 
 	}
+	return NULL;
 }
 
 static char *
-CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict * Conf)
-{
+CheckAffix(const char *word, size_t len, AFFIX * Affix, char flagflags, char *newword) {
 	regmatch_t	subs[2];		/* workaround for apache&linux */
-	char		newword[2 * MAXNORMLEN] = "";
 	int			err;
 
-	*res = strbncmp(word, Affix->repl, Affix->replen);
-	if (*res < 0)
-		return NULL;
-	if (*res > 0)
-		return NULL;
-	strcpy(newword, word);
-	strcpy(newword + len - Affix->replen, Affix->find);
+	if ( flagflags & FF_COMPOUNDONLYAFX ) {
+		if ( (Affix->flagflags & FF_COMPOUNDONLYAFX) == 0 )
+			return NULL;
+	} else {
+		if ( Affix->flagflags & FF_COMPOUNDONLYAFX )
+			return NULL;
+	} 
+
+	if ( Affix->type=='s' ) {
+		strcpy(newword, word);
+		strcpy(newword + len - Affix->replen, Affix->find);
+	} else {
+		strcpy(newword, Affix->find);
+		strcat(newword, word + Affix->replen);
+	}
 
 	if (Affix->compile)
 	{
@@ -452,205 +646,364 @@ CheckSuffix(const char *word, size_t len, AFFIX * Affix, int *res, IspellDict *
 		}
 		Affix->compile = 0;
 	}
-	if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0)))
-	{
-		if (FindWord(Conf, newword, Affix->flag))
-			return pstrdup(newword);
-	}
+	if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0))) 
+			return newword;
 	return NULL;
 }
 
-#define NS 1
-#define MAX_NORM 512
-static int
-CheckPrefix(const char *word, size_t len, AFFIX * Affix, IspellDict * Conf, int pi,
-			char **forms, char ***cur)
-{
-	regmatch_t	subs[NS * 2];
+/* Return a palloc'd, NULL-terminated array of dictionary forms of word found by undoing prefixes and suffixes, or NULL if none. word is lower-cased in place. */
+static char	  **
+NormalizeSubWord(IspellDict * Conf, char *word, char flag) {
+	AffixNodeData	*suffix=NULL, *prefix=NULL;
+	int 	slevel=0, plevel=0;
+	int wrdlen = strlen(word), swrdlen;
+	char	  **forms;
+	char	  **cur;
 	char		newword[2 * MAXNORMLEN] = "";
-	int			err,
-				ls,
-				res,
-				lres;
-	size_t		newlen;
-	AFFIX	   *CAffix = Conf->Affix;
-
-	res = strncmp(word, Affix->repl, Affix->replen);
-	if (res != 0)
-		return res;
-	strcpy(newword, Affix->find);
-	strcat(newword, word + Affix->replen);
+	char		pnewword[2 * MAXNORMLEN] = "";
+	AffixNode *snode = Conf->Suffix, *pnode;
+	int i,j;
 
-	if (Affix->compile)
-	{
-		err = regcomp(&(Affix->reg), Affix->mask, REG_EXTENDED | REG_ICASE | REG_NOSUB);
-		if (err)
-		{
-			/* regerror(err, &(Affix->reg), regerrstr, ERRSTRSIZE); */
-			regfree(&(Affix->reg));
-			return (0);
-		}
-		Affix->compile = 0;
+	if (wrdlen > MAXNORMLEN) return NULL;
+	strlower(word);	
+	cur = forms = (char **) palloc(MAX_NORM * sizeof(char *));
+	*cur = NULL;
+
+
+	/* Check that the word itself is normal form */
+	if (FindWord(Conf, word, 0, flag & FF_COMPOUNDWORD)) {
+		*cur = pstrdup(word);
+		cur++;
+		*cur = NULL;
 	}
-	if (!(err = regexec(&(Affix->reg), newword, 1, subs, 0)))
-	{
-		SPELL	   *curspell;
 
-		if ((curspell = FindWord(Conf, newword, Affix->flag)))
-		{
-			if ((*cur - forms) < (MAX_NORM - 1))
-			{
-				**cur = pstrdup(newword);
-				(*cur)++;
-				**cur = NULL;
+	/* Find all other NORMAL forms of the 'word' (check only prefix)*/
+	pnode=Conf->Prefix;
+	plevel=0;
+	while(pnode) {
+		prefix=FinfAffixes(pnode, word, wrdlen, &plevel,'p');
+		if (!prefix) break;
+		for(j=0;j<prefix->naff;j++) {	
+			if ( CheckAffix(word,wrdlen,prefix->aff[j], flag, newword) ) {
+				/* prefix success */
+				if ( FindWord(Conf, newword, prefix->aff[j]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
+					/* word search success */
+					*cur = pstrdup(newword);
+					cur++;
+					*cur=NULL;
+				}
 			}
 		}
-		newlen = strlen(newword);
-		ls = Conf->SuffixTree.Left[pi];
-		if (ls >= 0 && ((*cur - forms) < (MAX_NORM - 1)))
-		{
-			**cur = CheckSuffix(newword, newlen, &CAffix[ls], &lres, Conf);
-			if (**cur)
-			{
-				(*cur)++;
-				**cur = NULL;
+		pnode = prefix->node;
+		plevel++;
+	}
+ 
+	/* Find all other NORMAL forms of the 'word' (check suffix and then prefix)*/
+	while( snode ) {
+		/* find possible suffix */
+		suffix = FinfAffixes(snode, word, wrdlen, &slevel, 's');
+		if (!suffix) break;
+		/* foreach suffix check affix */
+		for(i=0;i<suffix->naff;i++) {
+			if ( CheckAffix(word, wrdlen, suffix->aff[i], flag, newword) ) {
+				/* suffix success */
+				if ( FindWord(Conf, newword, suffix->aff[i]->flag, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
+					/* word search success */
+					*cur = pstrdup(newword);
+					cur++;
+					*cur=NULL;
+				}
+				/* now we will look changed word with prefixes */
+				pnode=Conf->Prefix;
+				plevel=0;
+				swrdlen=strlen(newword);
+				while(pnode) {
+					prefix=FinfAffixes(pnode, newword, swrdlen, &plevel,'p');
+					if (!prefix) break;
+					for(j=0;j<prefix->naff;j++) {	
+						if ( CheckAffix(newword,swrdlen,prefix->aff[j], flag, pnewword) ) {
+							/* prefix success */
+							int ff=( prefix->aff[j]->flagflags & suffix->aff[i]->flagflags & FF_CROSSPRODUCT ) ?
+								 0 : prefix->aff[j]->flag; 
+							if ( FindWord(Conf, pnewword, ff, flag&FF_COMPOUNDWORD) && (cur - forms) < (MAX_NORM-1) ) {
+								/* word search success */
+								*cur = pstrdup(pnewword);
+								cur++;
+								*cur=NULL;
+							}
+						}
+					}
+					pnode = prefix->node;
+					plevel++;
+				} 
 			}
 		}
-	}
-	return 0;
-}
 
+		snode=suffix->node;
+		slevel++;
+	}
 
-char	  **
-NormalizeWord(IspellDict * Conf, char *word)
-{
-/*regmatch_t subs[NS];*/
-	size_t		len;
-	char	  **forms;
-	char	  **cur;
-	AFFIX	   *Affix;
-	int			ri,
-				pi,
-				ipi,
-				lp,
-				rp,
-				cp,
-				ls,
-				rs;
-	int			lres,
-				rres,
-				cres = 0;
-	SPELL	   *spell;
-
-	len = strlen(word);
-	if (len > MAXNORMLEN)
+	if (cur == forms) {
+		pfree(forms);	/* forms came from palloc(): it must be released with pfree(), not free() */
 		return (NULL);
+	}
+	return (forms);
+}
 
-	strlower(word);
-
-	forms = (char **) palloc(MAX_NORM * sizeof(char **));
-	cur = forms;
-	*cur = NULL;
-
-	ri = (int) (*word) & 255;
-	pi = (int) (word[strlen(word) - 1]) & 255;
-	Affix = (AFFIX *) Conf->Affix;
-
-	/* Check that the word itself is normal form */
-	if ((spell = FindWord(Conf, word, 0)))
-	{
-		*cur = pstrdup(word);
-		cur++;
-		*cur = NULL;
+typedef struct SplitVar {	/* one candidate split of a compound word; candidates form a singly linked list */
+	int	nstem;	/* number of entries used in stem[] */
+	char	**stem;	
+	struct	SplitVar *next;
+} SplitVar;
+/* Scan the NULL-affix-terminated list at *ptr for an entry that is a strict prefix of word; return its length and leave *ptr just past it, or return 0 at the end of the list. */
+static int 
+CheckCompoundAffixes(CMPDAffix **ptr, char *word, int len) {
+	while( (*ptr)->affix ) {
+		if ( len > (*ptr)->len && strncmp((*ptr)->affix, word, (*ptr)->len)==0 ) {
+			len = (*ptr)->len;
+			(*ptr)++;	/* advance so the next call resumes after this entry */
+			return len;
+		}
+		(*ptr)++;
 	}
+	return 0;
+}
 
-	/* Find all other NORMAL forms of the 'word' */
+static SplitVar*	/* palloc a new SplitVar with a MAX_NORM-slot stem array, optionally seeded from s */
+CopyVar(SplitVar *s, int makedup) {
+	SplitVar *v = (SplitVar*)palloc(sizeof(SplitVar));
+
+	v->stem=(char**)palloc( sizeof(char*) * (MAX_NORM) );
+	v->next=NULL;
+	if ( s ) {
+		int i;
+		v->nstem = s->nstem;
+		for(i=0;i<s->nstem;i++)	/* makedup: pstrdup each stem; otherwise share the source's string pointers */
+			v->stem[i] = (makedup) ? pstrdup( s->stem[i] ) : s->stem[i];
+	} else {
+		v->nstem=0;	/* s == NULL: start with an empty stem list */
+	}
+	return v;
+}
 
-	for (ipi = 0; ipi <= pi; ipi += pi)
-	{
 
-		/* check prefix */
-		lp = Conf->PrefixTree.Left[ri];
-		rp = Conf->PrefixTree.Right[ri];
-		while (lp >= 0 && lp <= rp)
-		{
-			cp = (lp + rp) >> 1;
-			cres = 0;
-			if ((cur - forms) < (MAX_NORM - 1))
-				cres = CheckPrefix(word, len, &Affix[cp], Conf, ipi, forms, &cur);
-			if ((lp < cp) && ((cur - forms) < (MAX_NORM - 1)))
-				lres = CheckPrefix(word, len, &Affix[lp], Conf, ipi, forms, &cur);
-			if ((rp > cp) && ((cur - forms) < (MAX_NORM - 1)))
-				rres = CheckPrefix(word, len, &Affix[rp], Conf, ipi, forms, &cur);
-			if (cres < 0)
-			{
-				rp = cp - 1;
-				lp++;
-			}
-			else if (cres > 0)
-			{
-				lp = cp + 1;
-				rp--;
+static SplitVar*	/* returns a linked list of candidate splits of word[startpos..] into compound-allowed stems, each seeded from orig */
+SplitToVariants( IspellDict * Conf, SPNode *snode, SplitVar * orig, char *word, int wordlen, int startpos, int minpos ) {
+	SplitVar *var=NULL;
+	SPNodeData *StopLow, *StopHigh, *StopMiddle;
+	SPNode *node = (snode) ? snode : Conf->Dictionary;
+	int level=(snode) ? minpos : startpos; /* recursive minpos==level*/
+	int lenaff;
+	CMPDAffix *caff;
+	char	notprobed[wordlen];	/* notprobed[k] != 0: no stem ending at byte k has been tried yet */
+
+	memset(notprobed,1,wordlen);
+	var = CopyVar(orig,1);
+
+	while( node && level<wordlen) {
+		StopLow = node->data;
+		StopHigh = node->data+node->length;
+		while (StopLow < StopHigh) {
+			StopMiddle = StopLow + (StopHigh - StopLow) / 2;
+			if ( StopMiddle->val == ((u_int8_t*)(word))[level] ) {
+				break;
+			} else if ( StopMiddle->val < ((u_int8_t*)(word))[level] ) {
+				StopLow = StopMiddle + 1;
+			} else {
+				StopHigh = StopMiddle;
 			}
-			else
-			{
-				lp++;
-				rp--;
+		}
+		if ( StopLow >= StopHigh )
+			break;
+
+		/* find word with epenthetic */
+		caff = Conf->CompoundAffix;
+		while ( level>startpos && (lenaff=CheckCompoundAffixes( &caff, word + level, wordlen - level ))>0 ) {
+			/* there is one of compound suffixes, so check word for existings */
+			char buf[MAXNORMLEN];
+			char **subres;
+
+			lenaff=level-startpos+lenaff;
+		
+			if ( !notprobed[startpos+lenaff-1] )
+				continue;
+				
+			if ( level+lenaff-1 <= minpos || lenaff >= MAXNORMLEN )	/* second test: candidate plus NUL must fit in buf */
+				continue;
+
+			memcpy(buf, word+startpos, lenaff);
+			buf[lenaff]='\0';
+
+			subres = NormalizeSubWord(Conf, buf, FF_COMPOUNDWORD | FF_COMPOUNDONLYAFX);
+			if ( subres ) {
+				/* Yes, it was a word from dictionary */
+				SplitVar *new=CopyVar(var,0);
+				SplitVar *ptr=var;
+				char **sptr=subres;
+			
+				notprobed[startpos+lenaff-1]=0;
+	
+				while(*sptr) {
+					new->stem[ new->nstem ] = *sptr;
+					new->nstem++;
+					sptr++;
+				}
+				pfree(subres);	/* palloc'd by NormalizeSubWord: release with pfree(), not free() */
+
+				while( ptr->next ) 
+					ptr = ptr->next;
+				ptr->next = SplitToVariants(Conf, NULL, new, word, wordlen, startpos+lenaff, startpos+lenaff);
+ 
+				pfree(new->stem);	/* CopyVar() allocates with palloc() */
+				pfree(new);
 			}
 		}
 
-		/* check suffix */
-		ls = Conf->SuffixTree.Left[ipi];
-		rs = Conf->SuffixTree.Right[ipi];
-		while (ls >= 0 && ls <= rs)
-		{
-			if (((cur - forms) < (MAX_NORM - 1)))
-			{
-				*cur = CheckSuffix(word, len, &Affix[ls], &lres, Conf);
-				if (*cur)
-				{
-					cur++;
-					*cur = NULL;
+		/* find infinitive */
+		if ( StopMiddle->isword && StopMiddle->compoundallow && notprobed[level] ) {
+			/* ok, we found full compoundallowed word*/
+			if ( level>minpos ) {
+				/* and its length more than minimal */
+				if ( wordlen==level+1 ) {
+					/* well, it was last word */
+					var->stem[ var->nstem ] = strndup(word + startpos, wordlen - startpos);
+					var->nstem++;
+					return var;
+				} else {
+					/* then we will search more big word at the same point */
+					SplitVar *ptr=var;
+					while( ptr->next ) 
+						ptr = ptr->next;
+					ptr->next=SplitToVariants(Conf, node, var, word, wordlen, startpos, level);
+					/* we can find next word */
+					level++;
+					var->stem[ var->nstem ] = strndup(word + startpos, level - startpos);
+					var->nstem++;
+					node = Conf->Dictionary;
+					startpos=level;
+					continue;
 				}
 			}
-			if ((rs > ls) && ((cur - forms) < (MAX_NORM - 1)))
-			{
-				*cur = CheckSuffix(word, len, &Affix[rs], &rres, Conf);
-				if (*cur)
-				{
-					cur++;
-					*cur = NULL;
+		}
+		level++;
+		node=StopMiddle->node;
+	}
+
+	var->stem[ var->nstem ] = strndup(word + startpos, wordlen - startpos);
+	var->nstem++;
+	return var;
+} 
+/* Return the normal forms of word, extended with compound-word splits when a compound flag is configured; NULL when nothing is found. */
+char  **
+NINormalizeWord(IspellDict * Conf, char *word) {
+	char **res= NormalizeSubWord(Conf, word, 0);
+
+	if ( Conf->compoundcontrol != '\t' ) {	/* '\t' is the "no compound flag" value set while importing affixes */
+		int wordlen=strlen(word);
+		SplitVar *ptr, *var = SplitToVariants(Conf,NULL,NULL, word, wordlen, 0, -1);
+		char **cur=res;
+		int i;
+	
+		while(var) {
+			if ( var->nstem > 1 ) {
+				char **subres = NormalizeSubWord(Conf, var->stem[ var->nstem-1 ], FF_COMPOUNDWORD);
+				if ( subres ) {
+					char **ptr=subres;
+	
+					if ( cur ) {
+						while(*cur) 
+							cur++;
+					} else {
+						res=cur=(char **) palloc(MAX_NORM * sizeof(char *));
+					}
+	
+					for(i=0;i<var->nstem-1;i++) {
+						*cur=var->stem[ i ];
+						cur++;
+					}
+					while(*ptr) {
+						*cur=*ptr;
+						cur++; ptr++;
+					}
+					*cur=NULL;
+					pfree(subres);	/* palloc'd by NormalizeSubWord: release with pfree(), not free() */
+					var->stem[ 0 ] = NULL;	/* stems were handed over to res: stop the release loop below at once */
 				}
 			}
-			ls++;
-			rs--;
-		}						/* end while */
+	
+			for(i=0;i<var->nstem && var->stem[ i ];i++)
+				pfree( var->stem[i] );	/* stems and SplitVar storage are palloc'd by SplitToVariants/CopyVar */
+			ptr = var->next;
+			pfree(var->stem);
+			pfree(var);
+			var=ptr;
+		}
+	}
+	return res;
+}
 
-	}							/* for ipi */
 
-	if (cur == forms)
-	{
-		pfree(forms);
-		return (NULL);
+static void freeSPNode(SPNode *node) {	/* recursively free() a dictionary trie; a NULL node is a no-op */
+	SPNodeData *data;
+
+	if (!node) return;
+	data=node->data;
+	while( node->length ) {	/* node->length is consumed as the loop counter */
+		freeSPNode(data->node);
+		data++;
+		node->length--;
 	}
-	return (forms);
+	free(node);
 }
+/* Recursively free() an affix trie, including each entry's aff[] pointer array; a NULL node is a no-op. */
+static void freeANode(AffixNode *node) {
+	AffixNodeData *data;
+
+	if (!node) return;
+	data=node->data;
+	while( node->length ) {	/* node->length is consumed as the loop counter */
+		freeANode(data->node);
+		if (data->naff)
+			free(data->aff);	
+		data++;
+		node->length--;
+	}
+	free(node);
+}
+	
 
 void
-FreeIspell(IspellDict * Conf)
+NIFree(IspellDict * Conf)	/* releases everything owned by Conf and zeroes the structure */
 {
 	int			i;
 	AFFIX	   *Affix = (AFFIX *) Conf->Affix;
+	char**     aff = Conf->AffixData;
+
+	if ( aff ) {
+		while(*aff) {	/* AffixData is a NULL-terminated array of strings */
+			free(*aff);
+			aff++;
+		}
+		free(Conf->AffixData);
+	}
 
+	
 	for (i = 0; i < Conf->naffixes; i++)
 	{
 		if (Affix[i].compile == 0)
 			regfree(&(Affix[i].reg));
 	}
-	for (i = 0; i < Conf->naffixes; i++)
-		free(Conf->Spell[i].word);
-	free(Conf->Affix);
-	free(Conf->Spell);
+	if (Conf->Spell) {	/* Spell is already NULL once NISortDictionary has built the trie */
+		for (i = 0; i < Conf->nspell; i++)
+			free(Conf->Spell[i].word);
+		free(Conf->Spell);
+	}
+
+	if (Conf->Affix) free(Conf->Affix);
+	if ( Conf->CompoundAffix ) free(Conf->CompoundAffix);
+	freeSPNode(Conf->Dictionary);
+	freeANode(Conf->Suffix);
+	freeANode(Conf->Prefix);
 	memset((void *) Conf, 0, sizeof(IspellDict));
 	return;
 }
diff --git a/contrib/tsearch2/ispell/spell.h b/contrib/tsearch2/ispell/spell.h
index baf5052f026..1d4ad1b97b1 100644
--- a/contrib/tsearch2/ispell/spell.h
+++ b/contrib/tsearch2/ispell/spell.h
@@ -4,15 +4,43 @@
 #include <sys/types.h>
 #include <regex.h>
 
+
+struct SPNode;
+
+/* One entry of a dictionary trie node: a byte value, word-end flags, and the child node. */
+typedef struct {
+	u_int32_t 
+		val:8,	/* byte value this entry matches */
+		isword:1,	/* set when a dictionary word ends at this entry */
+		compoundallow:1,	/* set when that word's flag string contains the compound-control character */
+		affix:22;	/* index into IspellDict.AffixData */
+	struct SPNode *node; 
+} SPNodeData;
+/* Dictionary trie node: `length` entries in data[], allocated as SPNHRDSZ + length*sizeof(SPNodeData). */
+typedef struct SPNode {
+	u_int32_t 	length;
+	SPNodeData	data[1];	
+} SPNode;
+
+#define SPNHRDSZ	(sizeof(SPNode) - sizeof(SPNodeData))	/* offset of data[]; sizeof(u_int32_t) under-allocates where SPNodeData is 8-byte aligned */
+
+
 typedef struct spell_struct
 {
 	char	   *word;
-	char		flag[10];
+	union {	/* flag text while loading; replaced by (affix, len) when the dictionary is sorted */
+		char		flag[16];
+		struct {
+			int		affix;	/* index into IspellDict.AffixData */
+			int 		len;	/* strlen(word) */
+		} d;
+	} p;
 }	SPELL;
 
 typedef struct aff_struct
 {
 	char		flag;
+	char		flagflags;
 	char		type;
 	char		mask[33];
 	char		find[16];
@@ -22,35 +50,66 @@ typedef struct aff_struct
 	char		compile;
 }	AFFIX;
 
+#define FF_CROSSPRODUCT 	0x01	/* set for affix flags introduced with '*' in the affix file */
+#define FF_COMPOUNDWORD 	0x02	/* lookup mode: accept only words that allow compounding */
+#define FF_COMPOUNDONLYAFX      0x04	/* set for affix flags introduced with '~'; such affixes are used only in compound lookups */
+
+struct AffixNode;
+/* One entry of an affix trie node: a byte value, the affixes ending there, and the child node. */
+typedef struct {
+	u_int32_t
+		val:8,	/* byte value this entry matches */
+		naff:24;	/* number of pointers stored in aff[] */
+	AFFIX   **aff;
+	struct AffixNode *node;
+} AffixNodeData;
+/* Affix trie node: `length` entries in data[], allocated as ANHRDSZ + length*sizeof(AffixNodeData). */
+typedef struct AffixNode {
+	u_int32_t length;
+	AffixNodeData	data[1];
+} AffixNode;
+
+#define ANHRDSZ        (sizeof(AffixNode) - sizeof(AffixNodeData))	/* offset of data[]; sizeof(u_int32_t) under-allocates where AffixNodeData is 8-byte aligned */
+
 typedef struct Tree_struct
 {
 	int			Left[256],
 				Right[256];
 }	Tree_struct;
 
+typedef struct {	/* entry of the compound-affix list; the list ends at the entry whose affix is NULL */
+	char *affix;	/* points at an AFFIX.repl string (not a separate copy) */
+	int len;
+} CMPDAffix;
+
 typedef struct
 {
 	int			maffixes;
 	int			naffixes;
 	AFFIX	   *Affix;
+	char			compoundcontrol;	/* flag character marking compound-capable words; '\t' = none configured */
 
 	int			nspell;
 	int			mspell;
 	SPELL	   *Spell;
-	Tree_struct SpellTree;
-	Tree_struct PrefixTree;
-	Tree_struct SuffixTree;
+
+	AffixNode	*Suffix;	/* suffix trie, keyed from the end of the replacement string */
+	AffixNode	*Prefix;	/* prefix trie, keyed from the start of the replacement string */
+
+	SPNode	*Dictionary;	/* word trie built by NISortDictionary */
+	char	**AffixData;	/* NULL-terminated array of distinct flag strings; entry 0 is "" */
+	CMPDAffix    *CompoundAffix;	/* list of compound-only suffixes, ended by a NULL affix */
 
 }	IspellDict;
 
-char	  **NormalizeWord(IspellDict * Conf, char *word);
-int			ImportAffixes(IspellDict * Conf, const char *filename);
-int			ImportDictionary(IspellDict * Conf, const char *filename);
+char	  **NINormalizeWord(IspellDict * Conf, char *word);
+int			NIImportAffixes(IspellDict * Conf, const char *filename);
+int			NIImportDictionary(IspellDict * Conf, const char *filename);
 
-int			AddSpell(IspellDict * Conf, const char *word, const char *flag);
-int			AddAffix(IspellDict * Conf, int flag, const char *mask, const char *find, const char *repl, int type);
-void		SortDictionary(IspellDict * Conf);
-void		SortAffixes(IspellDict * Conf);
-void		FreeIspell(IspellDict * Conf);
+int			NIAddSpell(IspellDict * Conf, const char *word, const char *flag);
+int			NIAddAffix(IspellDict * Conf, int flag, char flagflags, const char *mask, const char *find, const char *repl, int type);
+void		NISortDictionary(IspellDict * Conf);
+void		NISortAffixes(IspellDict * Conf);
+void		NIFree(IspellDict * Conf);
 
 #endif
-- 
GitLab