From 84c7cef5eb0f0d08d40dbfcf396d4c830c1f1030 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sat, 18 Sep 2004 19:39:50 +0000
Subject: [PATCH] Fix estimate_num_groups to be able to use expression-index
 statistics when there is an expressional index matching a GROUP BY item.

---
 src/backend/utils/adt/selfuncs.c | 184 +++++++++++++++++++------------
 1 file changed, 112 insertions(+), 72 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 412a0a32310..24759bf5c09 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.165 2004/08/30 02:54:39 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.166 2004/09/18 19:39:50 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1869,6 +1869,71 @@ fail:
 	ReleaseVariableStats(rightvar);
 }
 
+
+/*
+ * Helper routine for estimate_num_groups: add an item to a list of
+ * GroupVarInfos, but only if it's not known equal to any of the existing
+ * entries.
+ */
+typedef struct
+{
+	Node	   *var;		/* might be an expression, not just a Var */
+	RelOptInfo *rel;		/* relation it belongs to */
+	double		ndistinct;	/* # distinct values */
+} GroupVarInfo;
+
+static List *
+add_unique_group_var(Query *root, List *varinfos,
+					 Node *var, VariableStatData *vardata)
+{
+	GroupVarInfo *varinfo;
+	double		ndistinct;
+	ListCell   *lc;
+
+	ndistinct = get_variable_numdistinct(vardata);
+
+	/* cannot use foreach here because of possible list_delete */
+	lc = list_head(varinfos);
+	while (lc)
+	{
+		varinfo = (GroupVarInfo *) lfirst(lc);
+
+		/* must advance lc before list_delete possibly pfree's it */
+		lc = lnext(lc);
+
+		/* Drop exact duplicates */
+		if (equal(var, varinfo->var))
+			return varinfos;
+
+		/*
+		 * Drop known-equal vars, but only if they belong to different
+		 * relations (see comments for estimate_num_groups)
+		 */
+		if (vardata->rel != varinfo->rel &&
+			exprs_known_equal(root, var, varinfo->var))
+		{
+			if (varinfo->ndistinct <= ndistinct)
+			{
+				/* Keep older item, forget new one */
+				return varinfos;
+			}
+			else
+			{
+				/* Delete the older item */
+				varinfos = list_delete_ptr(varinfos, varinfo);
+			}
+		}
+	}
+
+	varinfo = (GroupVarInfo *) palloc(sizeof(GroupVarInfo));
+
+	varinfo->var = var;
+	varinfo->rel = vardata->rel;
+	varinfo->ndistinct = ndistinct;
+	varinfos = lappend(varinfos, varinfo);
+	return varinfos;
+}
+
 /*
  * estimate_num_groups		- Estimate number of groups in a grouped query
  *
@@ -1900,6 +1965,9 @@ fail:
  *		increase the number of distinct values (unless it is volatile,
  *		which we consider unlikely for grouping), but it probably won't
  *		reduce the number of distinct values much either.
+ *		As a special case, if a GROUP BY expression can be matched to an
+ *		expressional index for which we have statistics, then we treat the
+ *		whole expression as though it were just a Var.
  *	2.	If the list contains Vars of different relations that are known equal
  *		due to equijoin clauses, then drop all but one of the Vars from each
  *		known-equal set, keeping the one with smallest estimated # of values
@@ -1926,25 +1994,44 @@ fail:
 double
 estimate_num_groups(Query *root, List *groupExprs, double input_rows)
 {
-	List	   *allvars = NIL;
 	List	   *varinfos = NIL;
 	double		numdistinct;
 	ListCell   *l;
-	typedef struct
-	{							/* varinfos is a List of these */
-		Var		   *var;
-		double		ndistinct;
-	} MyVarInfo;
 
 	/* We should not be called unless query has GROUP BY (or DISTINCT) */
 	Assert(groupExprs != NIL);
 
-	/* Step 1: get the unique Vars used */
+	/*
+	 * Steps 1/2: find the unique Vars used, treating an expression as a Var
+	 * if we can find stats for it.  For each one, record the statistical
+	 * estimate of number of distinct values (total in its table, without
+	 * regard for filtering).
+	 */
 	foreach(l, groupExprs)
 	{
 		Node	   *groupexpr = (Node *) lfirst(l);
+		VariableStatData vardata;
 		List	   *varshere;
+		ListCell   *l2;
+
+		/*
+		 * If examine_variable is able to deduce anything about the GROUP BY
+		 * expression, treat it as a single variable even if it's really more
+		 * complicated.
+		 */
+		examine_variable(root, groupexpr, 0, &vardata);
+		if (vardata.statsTuple != NULL || vardata.isunique)
+		{
+			varinfos = add_unique_group_var(root, varinfos,
+											groupexpr, &vardata);
+			ReleaseVariableStats(vardata);
+			continue;
+		}
+		ReleaseVariableStats(vardata);
 
+		/*
+		 * Else pull out the component Vars
+		 */
 		varshere = pull_var_clause(groupexpr, false);
 
 		/*
@@ -1959,70 +2046,24 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
 				return input_rows;
 			continue;
 		}
-		allvars = list_concat(allvars, varshere);
-	}
-
-	/* If now no Vars, we must have an all-constant GROUP BY list. */
-	if (allvars == NIL)
-		return 1.0;
-
-	/* Use list_union() to discard duplicates */
-	allvars = list_union(NIL, allvars);
-
-	/*
-	 * Step 2: acquire statistical estimate of number of distinct values
-	 * of each Var (total in its table, without regard for filtering).
-	 * Also, detect known-equal Vars and discard the ones we don't want.
-	 */
-	foreach(l, allvars)
-	{
-		Var		   *var = (Var *) lfirst(l);
-		VariableStatData vardata;
-		double		ndistinct;
-		bool		keep = true;
-		ListCell   *l2;
-
-		examine_variable(root, (Node *) var, 0, &vardata);
-		ndistinct = get_variable_numdistinct(&vardata);
-		ReleaseVariableStats(vardata);
-
-		/* cannot use foreach here because of possible list_delete */
-		l2 = list_head(varinfos);
-		while (l2)
-		{
-			MyVarInfo  *varinfo = (MyVarInfo *) lfirst(l2);
-
-			/* must advance l2 before list_delete possibly pfree's it */
-			l2 = lnext(l2);
-
-			if (var->varno != varinfo->var->varno &&
-			exprs_known_equal(root, (Node *) var, (Node *) varinfo->var))
-			{
-				/* Found a match */
-				if (varinfo->ndistinct <= ndistinct)
-				{
-					/* Keep older item, forget new one */
-					keep = false;
-					break;
-				}
-				else
-				{
-					/* Delete the older item */
-					varinfos = list_delete_ptr(varinfos, varinfo);
-				}
-			}
-		}
 
-		if (keep)
+		/*
+		 * Else add variables to varinfos list
+		 */
+		foreach(l2, varshere)
 		{
-			MyVarInfo  *varinfo = (MyVarInfo *) palloc(sizeof(MyVarInfo));
+			Node	   *var = (Node *) lfirst(l2);
 
-			varinfo->var = var;
-			varinfo->ndistinct = ndistinct;
-			varinfos = lcons(varinfo, varinfos);
+			examine_variable(root, var, 0, &vardata);
+			varinfos = add_unique_group_var(root, varinfos, var, &vardata);
+			ReleaseVariableStats(vardata);
 		}
 	}
 
+	/* If now no Vars, we must have an all-constant GROUP BY list. */
+	if (varinfos == NIL)
+		return 1.0;
+
 	/*
 	 * Steps 3/4: group Vars by relation and estimate total numdistinct.
 	 *
@@ -2031,25 +2072,24 @@ estimate_num_groups(Query *root, List *groupExprs, double input_rows)
 	 * these Vars from the newvarinfos list for the next iteration. This
 	 * is the easiest way to group Vars of same rel together.
 	 */
-	Assert(varinfos != NIL);
 	numdistinct = 1.0;
 
 	do
 	{
-		MyVarInfo  *varinfo1 = (MyVarInfo *) linitial(varinfos);
-		RelOptInfo *rel = find_base_rel(root, varinfo1->var->varno);
+		GroupVarInfo *varinfo1 = (GroupVarInfo *) linitial(varinfos);
+		RelOptInfo *rel = varinfo1->rel;
 		double		reldistinct = varinfo1->ndistinct;
 		List	   *newvarinfos = NIL;
 
 		/*
-		 * Get the largest numdistinct estimate of the Vars for this rel.
+		 * Get the product of numdistinct estimates of the Vars for this rel.
 		 * Also, construct new varinfos list of remaining Vars.
 		 */
 		for_each_cell(l, lnext(list_head(varinfos)))
 		{
-			MyVarInfo  *varinfo2 = (MyVarInfo *) lfirst(l);
+			GroupVarInfo *varinfo2 = (GroupVarInfo *) lfirst(l);
 
-			if (varinfo2->var->varno == varinfo1->var->varno)
+			if (varinfo2->rel == varinfo1->rel)
 				reldistinct *= varinfo2->ndistinct;
 			else
 			{
-- 
GitLab