From b8c798ebc528f6b6300174929e66b8128ac5cef8 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 31 Jul 2010 03:27:40 +0000
Subject: [PATCH] Tweak tsmatchsel() so that it examines the structure of the
 tsquery whenever possible (i.e., whenever the tsquery is a constant), even when
 no statistics are available for the tsvector.  For example, foo @@ 'a &
 b'::tsquery can be expected to be more selective than foo @@ 'a'::tsquery,
 whether or not we know anything about foo.  We use DEFAULT_TS_MATCH_SEL as
 the assumed selectivity of individual query terms when no stats are
 available, then combine the terms according to the query's AND/OR structure
 as usual.

Per experimentation with Artur Dabrowski's example.  (The fact that there
are no stats available in that example is a problem in itself, but
nonetheless tsmatchsel should be smarter about the case.)

Back-patch to 8.4 to keep all versions of tsmatchsel() in sync.
---
 src/backend/tsearch/ts_selfuncs.c | 41 +++++++++++++++++++------------
 1 file changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
index a22dba55e5e..68d67c7a4e6 100644
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -7,7 +7,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.7 2010/01/04 02:44:39 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -52,6 +52,9 @@ static Selectivity tsquery_opr_selec(QueryItem *item, char *operand,
 				  TextFreq *lookup, int length, float4 minfreq);
 static int	compare_lexeme_textfreq(const void *e1, const void *e2);
 
+#define tsquery_opr_selec_no_stats(query) \
+	tsquery_opr_selec(GETQUERY(query), GETOPERAND(query), NULL, 0, 0)
+
 
 /*
  *	tsmatchsel -- Selectivity of "@@"
@@ -101,21 +104,20 @@ tsmatchsel(PG_FUNCTION_ARGS)
 	}
 
 	/*
-	 * OK, there's a Var and a Const we're dealing with here. We need the Var
-	 * to be a TSVector (or else we don't have any useful statistic for it).
-	 * We have to check this because the Var might be the TSQuery not the
-	 * TSVector.
+	 * OK, there's a Var and a Const we're dealing with here.  We need the
+	 * Const to be a TSQuery, else we can't do anything useful.  We have to
+	 * check this because the Var might be the TSQuery not the TSVector.
 	 */
-	if (vardata.vartype == TSVECTOROID)
+	if (((Const *) other)->consttype == TSQUERYOID)
 	{
 		/* tsvector @@ tsquery or the other way around */
-		Assert(((Const *) other)->consttype == TSQUERYOID);
+		Assert(vardata.vartype == TSVECTOROID);
 
 		selec = tsquerysel(&vardata, ((Const *) other)->constvalue);
 	}
 	else
 	{
-		/* The Var is something we don't have useful statistics for */
+		/* If we can't see the query structure, must punt */
 		selec = DEFAULT_TS_MATCH_SEL;
 	}
 
@@ -184,14 +186,14 @@ tsquerysel(VariableStatData *vardata, Datum constval)
 		}
 		else
 		{
-			/* No most-common-elements info, so we must punt */
-			selec = (Selectivity) DEFAULT_TS_MATCH_SEL;
+			/* No most-common-elements info, so do without */
+			selec = tsquery_opr_selec_no_stats(query);
 		}
 	}
 	else
 	{
-		/* No stats at all, so we must punt */
-		selec = (Selectivity) DEFAULT_TS_MATCH_SEL;
+		/* No stats at all, so do without */
+		selec = tsquery_opr_selec_no_stats(query);
 	}
 
 	return selec;
@@ -214,7 +216,7 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
 	 * cells are taken for minimal and maximal frequency.  Punt if not.
 	 */
 	if (nnumbers != nmcelem + 2)
-		return DEFAULT_TS_MATCH_SEL;
+		return tsquery_opr_selec_no_stats(query);
 
 	/*
 	 * Transpose the data into a single array so we can use bsearch().
@@ -258,9 +260,12 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
  *	 freq[val] in VAL nodes, if the value is in MCELEM
  *	 min(freq[MCELEM]) / 2 in VAL nodes, if it is not
  *
- *
  * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
  * binary search for determining freq[MCELEM].
+ *
+ * If we don't have stats for the tsvector, we still use this logic,
+ * except we always use DEFAULT_TS_MATCH_SEL for VAL nodes.  This case
+ * is signaled by lookup == NULL.
  */
 static Selectivity
 tsquery_opr_selec(QueryItem *item, char *operand,
@@ -279,6 +284,10 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 	{
 		QueryOperand *oper = (QueryOperand *) item;
 
+		/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
+		if (lookup == NULL)
+			return (Selectivity) DEFAULT_TS_MATCH_SEL;
+
 		/*
 		 * Prepare the key for bsearch().
 		 */
@@ -292,7 +301,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 		if (searchres)
 		{
 			/*
-			 * The element is in MCELEM. Return precise selectivity (or at
+			 * The element is in MCELEM.  Return precise selectivity (or at
 			 * least as precise as ANALYZE could find out).
 			 */
 			return (Selectivity) searchres->frequency;
@@ -300,7 +309,7 @@ tsquery_opr_selec(QueryItem *item, char *operand,
 		else
 		{
 			/*
-			 * The element is not in MCELEM. Punt, but assert that the
+			 * The element is not in MCELEM.  Punt, but assume that the
 			 * selectivity cannot be more than minfreq / 2.
 			 */
 			return (Selectivity) Min(DEFAULT_TS_MATCH_SEL, minfreq / 2);
-- 
GitLab