From 38c4fe87ac86965253235edf46a5908e49c8478e Mon Sep 17 00:00:00 2001
From: Teodor Sigaev <teodor@sigaev.ru>
Date: Thu, 2 Mar 2006 19:07:19 +0000
Subject: [PATCH] Significantly improve ranking: 1) rank_cd now use weight of
 lexemes 2) rank_cd and rank can use any combination of normalization methods:
         no normalization         normalization by log(length of document)    
     -----/------- by length of document         -----/------- by number of
 unique word in document         -----/------- by log(number of unique word in
 document)         -----/------- by number of covers (only rank_cd)

Improve cover's search.

TODO: changes in documentation
---
 contrib/tsearch2/expected/tsearch2.out |  14 +-
 contrib/tsearch2/rank.c                | 235 ++++++++++++++++---------
 contrib/tsearch2/tsearch.sql.in        |   4 +-
 3 files changed, 161 insertions(+), 92 deletions(-)

diff --git a/contrib/tsearch2/expected/tsearch2.out b/contrib/tsearch2/expected/tsearch2.out
index 8fb16b045d8..2a60e317cd6 100644
--- a/contrib/tsearch2/expected/tsearch2.out
+++ b/contrib/tsearch2/expected/tsearch2.out
@@ -2315,9 +2315,9 @@ An hour of storm to place
 The sculpture of these granite seams,
 Upon a woman s face. E.  J.  Pratt  (1882 1964)
 '), to_tsquery('sea&thousand&years'));
- rank_cd 
----------
-     1.2
+  rank_cd  
+-----------
+ 0.0555556
 (1 row)
 
 select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2329,9 +2329,9 @@ An hour of storm to place
 The sculpture of these granite seams,
 Upon a woman s face. E.  J.  Pratt  (1882 1964)
 '), to_tsquery('granite&sea'));
- rank_cd  
-----------
- 0.880303
+  rank_cd  
+-----------
+ 0.0238095
 (1 row)
 
 select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2345,7 +2345,7 @@ Upon a woman s face. E.  J.  Pratt  (1882 1964)
 '), to_tsquery('sea'));
  rank_cd 
 ---------
-       2
+     0.2
 (1 row)
 
 select get_covers(to_tsvector('Erosion It took the sea a thousand years,
diff --git a/contrib/tsearch2/rank.c b/contrib/tsearch2/rank.c
index a8c30bf3290..29732c1c91c 100644
--- a/contrib/tsearch2/rank.c
+++ b/contrib/tsearch2/rank.c
@@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0};
 
 #define wpos(wep)	( w[ WEP_GETWEIGHT(wep) ] )
 
-#define DEF_NORM_METHOD 0
+#define	RANK_NO_NORM		0x00
+#define RANK_NORM_LOGLENGTH 	0x01
+#define RANK_NORM_LENGTH 	0x02
+#define	RANK_NORM_EXTDIST	0x04
+#define RANK_NORM_UNIQ		0x08
+#define RANK_NORM_LOGUNIQ	0x10
+#define DEF_NORM_METHOD 	RANK_NO_NORM
 
 static float calc_rank_or(float *w, tsvector * t, QUERYTYPE * q);
 static float calc_rank_and(float *w, tsvector * t, QUERYTYPE * q);
@@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
 	if (res < 0)
 		res = 1e-20;
 
-	switch (method)
-	{
-		case 0:
-			break;
-		case 1:
-			res /= log((float) (cnt_length(t) + 1)) / log(2.0);
-			break;
-		case 2:
-			len = cnt_length(t);
-			if (len > 0)
-				res /= (float) len;
-			break;
-		default:
-			/* internal error */
-			elog(ERROR, "unrecognized normalization method: %d", method);
+	if ( (method & RANK_NORM_LOGLENGTH) && t->size>0 )
+		res /= log((double) (cnt_length(t) + 1)) / log(2.0);
+
+	if ( method & RANK_NORM_LENGTH ) {
+		len = cnt_length(t);
+		if ( len>0 )
+			res /= (float) len;
 	}
 
+	if ( (method & RANK_NORM_UNIQ) && t->size > 0 )
+		res /= (float)( t->size );
+
+	if ( (method & RANK_NORM_LOGUNIQ) && t->size > 0 )
+		res /= log((double) (t->size + 1)) / log(2.0);
+
 	return res;
 }
 
@@ -420,6 +424,7 @@ typedef struct
 	ITEM	  **item;
 	int16		nitem;
 	bool		needfree;
+	uint8		wclass;
 	int32		pos;
 }	DocRepresentation;
 
@@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query)
 	}
 }
 
+typedef struct {
+	int	pos;
+	int	p;
+	int	q;
+	DocRepresentation	*begin;
+	DocRepresentation	*end;
+} Extention;
+
+
 static bool
-Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int *q)
+Cover(DocRepresentation * doc, int len, QUERYTYPE * query, Extention *ext)
 {
 	DocRepresentation *ptr;
-	int			lastpos = *pos;
+	int			lastpos = ext->pos;
 	int			i;
 	bool		found = false;
 
 	reset_istrue_flag(query);
 
-	*p = 0x7fffffff;
-	*q = 0;
-	ptr = doc + *pos;
+	ext->p = 0x7fffffff;
+	ext->q = 0;
+	ptr = doc + ext->pos;
 
 	/* find upper bound of cover from current position, move up */
 	while (ptr - doc < len)
@@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
 			ptr->item[i]->istrue = 1;
 		if (TS_execute(GETQUERY(query), NULL, false, checkcondition_ITEM))
 		{
-			if (ptr->pos > *q)
+			if (ptr->pos > ext->q)
 			{
-				*q = ptr->pos;
+				ext->q = ptr->pos;
+				ext->end = ptr;
 				lastpos = ptr - doc;
 				found = true;
 			}
@@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
 			ptr->item[i]->istrue = 1;
 		if (TS_execute(GETQUERY(query), NULL, true, checkcondition_ITEM))
 		{
-			if (ptr->pos < *p)
-				*p = ptr->pos;
+			if (ptr->pos < ext->p) {
+				ext->begin = ptr;
+				ext->p = ptr->pos;
+			}
 			break;
 		}
 		ptr--;
 	}
 
-	if (*p <= *q)
+	if (ext->p <= ext->q)
 	{
 		/*
 		 * set position for next try to next lexeme after begining of founded
 		 * cover
 		 */
-		*pos = (ptr - doc) + 1;
+		ext->pos = (ptr - doc) + 1;
 		return true;
 	}
 
-	(*pos)++;
-	return Cover(doc, len, query, pos, p, q);
+	ext->pos++;
+	return Cover(doc, len, query, ext);
 }
 
 static DocRepresentation *
@@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
 				doc[cur].item = doc[cur - 1].item;
 			}
 			doc[cur].pos = WEP_GETPOS(post[j]);
+			doc[cur].wclass = WEP_GETWEIGHT(post[j]);
 			cur++;
 		}
 	}
@@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
 	return NULL;
 }
 
-
-Datum
-rank_cd(PG_FUNCTION_ARGS)
-{
-	int			K = PG_GETARG_INT32(0);
-	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
-	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
-	int			method = DEF_NORM_METHOD;
+static float4
+calc_rank_cd(float4 *arrdata, tsvector *txt, QUERYTYPE *query, int method) {
 	DocRepresentation *doc;
-	float		res = 0.0;
-	int			p = 0,
-				q = 0,
-				len,
-				cur,
+	int 			len,
 				i,
 				doclen = 0;
+	Extention	ext;
+	double		Wdoc = 0.0;
+	double		invws[lengthof(weights)];
+	double		SumDist=0.0, PrevExtPos=0.0, CurExtPos=0.0;
+	int		NExtent=0;
 
-	doc = get_docrep(txt, query, &doclen);
-	if (!doc)
+	for (i = 0; i < lengthof(weights); i++)
 	{
-		PG_FREE_IF_COPY(txt, 1);
-		PG_FREE_IF_COPY(query, 2);
-		PG_RETURN_FLOAT4(0.0);
+		invws[i] = ((double)((arrdata[i] >= 0) ? arrdata[i] : weights[i]));
+		if (invws[i] > 1.0)
+			ereport(ERROR,
+					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+					 errmsg("weight out of range")));
+		invws[i] = 1.0/invws[i]; 
 	}
 
-	cur = 0;
-	if (K <= 0)
-		K = 4;
-	while (Cover(doc, doclen, query, &cur, &p, &q))
-		res += (q - p + 1 > K) ? ((float) K) / ((float) (q - p + 1)) : 1.0;
+	doc = get_docrep(txt, query, &doclen);
+	if (!doc) 
+		return 0.0;
 
-	if (PG_NARGS() == 4)
-		method = PG_GETARG_INT32(3);
+	MemSet( &ext, 0, sizeof(Extention) );
+	while (Cover(doc, doclen, query, &ext)) {
+		double	Cpos = 0.0;
+		double	InvSum = 0.0;
+		DocRepresentation *ptr = ext.begin;
 
-	switch (method)
-	{
-		case 0:
-			break;
-		case 1:
-			res /= log((float) (cnt_length(txt) + 1));
-			break;
-		case 2:
-			len = cnt_length(txt);
-			if (len > 0)
-				res /= (float) len;
-			break;
-		default:
-			/* internal error */
-			elog(ERROR, "unrecognized normalization method: %d", method);
+		while ( ptr<=ext.end ) {
+			InvSum += invws[ ptr->wclass ];
+			ptr++;
+		}
+
+		Cpos = ((double)( ext.end-ext.begin+1 )) / InvSum;
+		Wdoc += Cpos / ( (double)(( 1 + (ext.q - ext.p) - (ext.end - ext.begin) )) ); 
+
+		CurExtPos = ((double)(ext.q + ext.p))/2.0; 
+		if ( NExtent>0 && CurExtPos > PrevExtPos /* prevent devision by zero in a case of multiple lexize */ ) 
+			SumDist += 1.0/( CurExtPos - PrevExtPos );
+
+		PrevExtPos = CurExtPos;
+		NExtent++; 
+	}
+
+	if ( (method & RANK_NORM_LOGLENGTH) && txt->size > 0 )
+		Wdoc /= log((double) (cnt_length(txt) + 1));
+
+	if ( method & RANK_NORM_LENGTH ) {
+		len = cnt_length(txt);
+		if ( len>0 )
+			Wdoc /= (double) len;
 	}
 
+	if ( (method & RANK_NORM_EXTDIST) && SumDist > 0 ) 
+		Wdoc /= ((double)NExtent) / SumDist;
+
+	if ( (method & RANK_NORM_UNIQ) && txt->size > 0 )
+		Wdoc /= (double)( txt->size );
+
+	if ( (method & RANK_NORM_LOGUNIQ) && txt->size > 0 )
+		Wdoc /= log((double) (txt->size + 1)) / log(2.0);
+
 	for (i = 0; i < doclen; i++)
 		if (doc[i].needfree)
 			pfree(doc[i].item);
 	pfree(doc);
+
+	return (float4)Wdoc;
+} 
+
+Datum
+rank_cd(PG_FUNCTION_ARGS)
+{
+	ArrayType *win =  (ArrayType *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
+	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
+	int			method = DEF_NORM_METHOD;
+	float4		res;
+
+	if (ARR_NDIM(win) != 1)
+		ereport(ERROR,
+				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
+				 errmsg("array of weight must be one-dimensional")));
+
+	if (ARRNELEMS(win) < lengthof(weights))
+		ereport(ERROR,
+				(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
+				 errmsg("array of weight is too short")));
+
+	if (ARR_HASNULL(win))
+		ereport(ERROR,
+				(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
+				 errmsg("array of weight must not contain nulls")));
+
+	if (PG_NARGS() == 4)
+		method = PG_GETARG_INT32(3);
+
+	res = calc_rank_cd( (float4 *) ARR_DATA_PTR(win), txt, query, method);
+
+	PG_FREE_IF_COPY(win, 0);
 	PG_FREE_IF_COPY(txt, 1);
 	PG_FREE_IF_COPY(query, 2);
 
@@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS)
 Datum
 rank_cd_def(PG_FUNCTION_ARGS)
 {
-	PG_RETURN_DATUM(DirectFunctionCall4(
-										rank_cd,
-										Int32GetDatum(-1),
-										PG_GETARG_DATUM(0),
-										PG_GETARG_DATUM(1),
-	  (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : Int32GetDatum(DEF_NORM_METHOD)
-										));
+	tsvector   *txt = (tsvector *) PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
+	QUERYTYPE  *query = (QUERYTYPE *) PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
+	float4 res;
+
+	res = calc_rank_cd( weights, txt, query, (PG_NARGS() == 3) ? PG_GETARG_DATUM(2) : DEF_NORM_METHOD);
+	
+	PG_FREE_IF_COPY(txt, 1);
+	PG_FREE_IF_COPY(query, 2);
+
+	PG_RETURN_FLOAT4(res);
 }
 
 /**************debug*************/
@@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS)
 	text	   *out;
 	char	   *cptr;
 	DocRepresentation *doc;
-	int			pos = 0,
-				p,
-				q,
-				olddwpos = 0;
+	int 			olddwpos = 0;
 	int			ncover = 1;
+	Extention	ext;
 
 	doc = get_docrep(txt, query, &rlen);
 
@@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS)
 	}
 	qsort((void *) dw, dlen, sizeof(DocWord), compareDocWord);
 
-	while (Cover(doc, rlen, query, &pos, &p, &q))
+	MemSet( &ext, 0, sizeof(Extention) );
+	while (Cover(doc, rlen, query, &ext))
 	{
 		dwptr = dw + olddwpos;
-		while (dwptr->pos < p && dwptr - dw < dlen)
+		while (dwptr->pos < ext.p && dwptr - dw < dlen)
 			dwptr++;
 		olddwpos = dwptr - dw;
 		dwptr->start = ncover;
-		while (dwptr->pos < q + 1 && dwptr - dw < dlen)
+		while (dwptr->pos < ext.q + 1 && dwptr - dw < dlen)
 			dwptr++;
 		(dwptr - 1)->finish = ncover;
 		len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;
diff --git a/contrib/tsearch2/tsearch.sql.in b/contrib/tsearch2/tsearch.sql.in
index c30a80e51ff..0ab0887b459 100644
--- a/contrib/tsearch2/tsearch.sql.in
+++ b/contrib/tsearch2/tsearch.sql.in
@@ -534,12 +534,12 @@ RETURNS float4
 AS 'MODULE_PATHNAME', 'rank_def'
 LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
 
-CREATE FUNCTION rank_cd(int4, tsvector, tsquery)
+CREATE FUNCTION rank_cd(float4[], tsvector, tsquery)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
 
-CREATE FUNCTION rank_cd(int4, tsvector, tsquery, int4)
+CREATE FUNCTION rank_cd(float4[], tsvector, tsquery, int4)
 RETURNS float4
 AS 'MODULE_PATHNAME'
 LANGUAGE C RETURNS NULL ON NULL INPUT IMMUTABLE;
-- 
GitLab