From 4c2777d0b733220d9029f78817af8ce671e4a5ad Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 4 Sep 2011 15:41:49 -0400
Subject: [PATCH] Change get_variable_numdistinct's API to flag default
 estimates explicitly.

Formerly, callers detected a default estimate by comparing the result to
DEFAULT_NUM_DISTINCT, which had the problem that a perfectly solid estimate
that merely happened to equal that value might be mistaken for a
content-free default.
---
 src/backend/utils/adt/selfuncs.c | 70 ++++++++++++++++++++------------
 src/include/utils/selfuncs.h     |  3 +-
 2 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 4f3eb8cd01e..5d999e6bfa2 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -244,6 +244,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
 			 bool varonleft)
 {
 	double		selec;
+	bool		isdefault;
 
 	/*
 	 * If the constant is NULL, assume operator is strict and return zero, ie,
@@ -344,7 +345,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
 			 * all the not-common values share this remaining fraction
 			 * equally, so we divide by the number of other distinct values.
 			 */
-			otherdistinct = get_variable_numdistinct(vardata) - nnumbers;
+			otherdistinct = get_variable_numdistinct(vardata, &isdefault) - nnumbers;
 			if (otherdistinct > 1)
 				selec /= otherdistinct;
 
@@ -366,7 +367,7 @@ var_eq_const(VariableStatData *vardata, Oid operator,
 		 * of distinct values and assuming they are equally common. (The guess
 		 * is unlikely to be very good, but we do know a few special cases.)
 		 */
-		selec = 1.0 / get_variable_numdistinct(vardata);
+		selec = 1.0 / get_variable_numdistinct(vardata, &isdefault);
 	}
 
 	/* result should be in range, but make sure... */
@@ -384,6 +385,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
 				 bool varonleft)
 {
 	double		selec;
+	bool		isdefault;
 
 	/*
 	 * If we matched the var to a unique index, assume there is exactly one
@@ -414,7 +416,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
 		 * idea?)
 		 */
 		selec = 1.0 - stats->stanullfrac;
-		ndistinct = get_variable_numdistinct(vardata);
+		ndistinct = get_variable_numdistinct(vardata, &isdefault);
 		if (ndistinct > 1)
 			selec /= ndistinct;
 
@@ -441,7 +443,7 @@ var_eq_non_const(VariableStatData *vardata, Oid operator,
 		 * of distinct values and assuming they are equally common. (The guess
 		 * is unlikely to be very good, but we do know a few special cases.)
 		 */
-		selec = 1.0 / get_variable_numdistinct(vardata);
+		selec = 1.0 / get_variable_numdistinct(vardata, &isdefault);
 	}
 
 	/* result should be in range, but make sure... */
@@ -2071,6 +2073,8 @@ eqjoinsel_inner(Oid operator,
 	double		selec;
 	double		nd1;
 	double		nd2;
+	bool		isdefault1;
+	bool		isdefault2;
 	Form_pg_statistic stats1 = NULL;
 	Form_pg_statistic stats2 = NULL;
 	bool		have_mcvs1 = false;
@@ -2084,8 +2088,8 @@ eqjoinsel_inner(Oid operator,
 	float4	   *numbers2 = NULL;
 	int			nnumbers2 = 0;
 
-	nd1 = get_variable_numdistinct(vardata1);
-	nd2 = get_variable_numdistinct(vardata2);
+	nd1 = get_variable_numdistinct(vardata1, &isdefault1);
+	nd2 = get_variable_numdistinct(vardata2, &isdefault2);
 
 	if (HeapTupleIsValid(vardata1->statsTuple))
 	{
@@ -2296,6 +2300,8 @@ eqjoinsel_semi(Oid operator,
 	double		selec;
 	double		nd1;
 	double		nd2;
+	bool		isdefault1;
+	bool		isdefault2;
 	Form_pg_statistic stats1 = NULL;
 	bool		have_mcvs1 = false;
 	Datum	   *values1 = NULL;
@@ -2308,8 +2314,8 @@ eqjoinsel_semi(Oid operator,
 	float4	   *numbers2 = NULL;
 	int			nnumbers2 = 0;
 
-	nd1 = get_variable_numdistinct(vardata1);
-	nd2 = get_variable_numdistinct(vardata2);
+	nd1 = get_variable_numdistinct(vardata1, &isdefault1);
+	nd2 = get_variable_numdistinct(vardata2, &isdefault2);
 
 	/*
 	 * We clamp nd2 to be not more than what we estimate the inner relation's
@@ -2441,7 +2447,7 @@ eqjoinsel_semi(Oid operator,
 		 * nd2 is default, punt and assume half of the uncertain rows have
 		 * join partners.
 		 */
-		if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
+		if (!isdefault1 && !isdefault2)
 		{
 			nd1 -= nmatches;
 			nd2 -= nmatches;
@@ -2464,7 +2470,7 @@ eqjoinsel_semi(Oid operator,
 		 */
 		double		nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
 
-		if (nd1 != DEFAULT_NUM_DISTINCT && nd2 != DEFAULT_NUM_DISTINCT)
+		if (!isdefault1 && !isdefault2)
 		{
 			if (nd1 <= nd2 || nd2 < 0)
 				selec = 1.0 - nullfrac1;
@@ -2955,9 +2961,10 @@ add_unique_group_var(PlannerInfo *root, List *varinfos,
 {
 	GroupVarInfo *varinfo;
 	double		ndistinct;
+	bool		isdefault;
 	ListCell   *lc;
 
-	ndistinct = get_variable_numdistinct(vardata);
+	ndistinct = get_variable_numdistinct(vardata, &isdefault);
 
 	/* cannot use foreach here because of possible list_delete */
 	lc = list_head(varinfos);
@@ -3292,14 +3299,23 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
 				stanullfrac,
 				mcvfreq,
 				avgfreq;
+	bool		isdefault;
 	float4	   *numbers;
 	int			nnumbers;
 
 	examine_variable(root, hashkey, 0, &vardata);
 
-	/* Get number of distinct values and fraction that are null */
-	ndistinct = get_variable_numdistinct(&vardata);
+	/* Get number of distinct values */
+	ndistinct = get_variable_numdistinct(&vardata, &isdefault);
 
+	/* If ndistinct isn't real, punt and return 0.1, per comments above */
+	if (isdefault)
+	{
+		ReleaseVariableStats(vardata);
+		return (Selectivity) 0.1;
+	}
+
+	/* Get fraction that are null */
 	if (HeapTupleIsValid(vardata.statsTuple))
 	{
 		Form_pg_statistic stats;
@@ -3308,19 +3324,7 @@ estimate_hash_bucketsize(PlannerInfo *root, Node *hashkey, double nbuckets)
 		stanullfrac = stats->stanullfrac;
 	}
 	else
-	{
-		/*
-		 * Believe a default ndistinct only if it came from stats. Otherwise
-		 * punt and return 0.1, per comments above.
-		 */
-		if (ndistinct == DEFAULT_NUM_DISTINCT)
-		{
-			ReleaseVariableStats(vardata);
-			return (Selectivity) 0.1;
-		}
-
 		stanullfrac = 0.0;
-	}
 
 	/* Compute avg freq of all distinct data values in raw relation */
 	avgfreq = (1.0 - stanullfrac) / ndistinct;
@@ -4414,16 +4418,20 @@ examine_simple_variable(PlannerInfo *root, Var *var,
  *	  Estimate the number of distinct values of a variable.
  *
  * vardata: results of examine_variable
+ * *isdefault: output flag; set to TRUE if the result is just the fallback
+ * DEFAULT_NUM_DISTINCT rather than based on anything meaningful, else FALSE.
  *
  * NB: be careful to produce an integral result, since callers may compare
  * the result to exact integer counts.
  */
 double
-get_variable_numdistinct(VariableStatData *vardata)
+get_variable_numdistinct(VariableStatData *vardata, bool *isdefault)
 {
 	double		stadistinct;
 	double		ntuples;
 
+	*isdefault = false;
+
 	/*
 	 * Determine the stadistinct value to use.	There are cases where we can
 	 * get an estimate even without a pg_statistic entry, or can get a better
@@ -4496,10 +4504,16 @@ get_variable_numdistinct(VariableStatData *vardata)
 	 * Otherwise we need to get the relation size; punt if not available.
 	 */
 	if (vardata->rel == NULL)
+	{
+		*isdefault = true;
 		return DEFAULT_NUM_DISTINCT;
+	}
 	ntuples = vardata->rel->tuples;
 	if (ntuples <= 0.0)
+	{
+		*isdefault = true;
 		return DEFAULT_NUM_DISTINCT;
+	}
 
 	/*
 	 * If we had a relative estimate, use that.
@@ -4509,11 +4523,13 @@ get_variable_numdistinct(VariableStatData *vardata)
 
 	/*
 	 * With no data, estimate ndistinct = ntuples if the table is small, else
-	 * use default.
+	 * use default.  We use DEFAULT_NUM_DISTINCT as the cutoff for "small"
+	 * so that the behavior isn't discontinuous.
 	 */
 	if (ntuples < DEFAULT_NUM_DISTINCT)
 		return ntuples;
 
+	*isdefault = true;
 	return DEFAULT_NUM_DISTINCT;
 }
 
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index 4208588c2eb..32d14b60290 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -121,7 +121,8 @@ extern void get_join_variables(PlannerInfo *root, List *args,
 				   VariableStatData *vardata1,
 				   VariableStatData *vardata2,
 				   bool *join_is_reversed);
-extern double get_variable_numdistinct(VariableStatData *vardata);
+extern double get_variable_numdistinct(VariableStatData *vardata,
+									   bool *isdefault);
 extern double mcv_selectivity(VariableStatData *vardata, FmgrInfo *opproc,
 				Datum constval, bool varonleft,
 				double *sumcommonp);
-- 
GitLab