From 52b60530f257b1591d8b72264cd6c0dd9aabfd46 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Thu, 17 Feb 2011 19:00:49 -0500
Subject: [PATCH] Fix tsmatchsel() to account properly for null rows.

ts_typanalyze.c computes MCE statistics as fractions of the non-null rows,
which seems fairly reasonable, and anyway changing it in released versions
wouldn't be a good idea.  But then ts_selfuncs.c has to account for that.
Failure to do so results in overestimates in columns with a significant
fraction of null documents.  Back-patch to 8.4 where this stuff was
introduced.

Jesper Krogh
---
 src/backend/tsearch/ts_selfuncs.c  | 6 ++++++
 src/include/catalog/pg_statistic.h | 2 ++
 2 files changed, 8 insertions(+)

diff --git a/src/backend/tsearch/ts_selfuncs.c b/src/backend/tsearch/ts_selfuncs.c
index 8ce9fb46aa1..7f33c16a245 100644
--- a/src/backend/tsearch/ts_selfuncs.c
+++ b/src/backend/tsearch/ts_selfuncs.c
@@ -189,11 +189,17 @@ tsquerysel(VariableStatData *vardata, Datum constval)
 			/* No most-common-elements info, so do without */
 			selec = tsquery_opr_selec_no_stats(query);
 		}
+
+		/*
+		 * MCE stats count only non-null rows, so adjust for null rows.
+		 */
+		selec *= (1.0 - stats->stanullfrac);
 	}
 	else
 	{
 		/* No stats at all, so do without */
 		selec = tsquery_opr_selec_no_stats(query);
+		/* we assume no nulls here, so no stanullfrac correction */
 	}
 
 	return selec;
diff --git a/src/include/catalog/pg_statistic.h b/src/include/catalog/pg_statistic.h
index f38921f1c69..927cd0b0471 100644
--- a/src/include/catalog/pg_statistic.h
+++ b/src/include/catalog/pg_statistic.h
@@ -246,6 +246,8 @@ typedef FormData_pg_statistic *Form_pg_statistic;
  * type with identifiable elements (for instance, tsvector).  staop contains
  * the equality operator appropriate to the element type.  stavalues contains
  * the most common element values, and stanumbers their frequencies.  Unlike
+ * MCV slots, frequencies are measured as the fraction of non-null rows the
+ * element value appears in, not the fraction of all rows.  Also unlike
  * MCV slots, the values are sorted into order (to support binary search
  * for a particular value).  Since this puts the minimum and maximum
  * frequencies at unpredictable spots in stanumbers, there are two extra
-- 
GitLab