From 97930cf578e28c01f67fe4006ffcdbb5aedf18c2 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 31 Aug 2011 16:04:48 -0400
Subject: [PATCH] Improve eqjoinsel's ndistinct clamping to work for multiple
 levels of join.

This patch fixes an oversight in my commit
7f3eba30c9d622d1981b1368f2d79ba0999cdff2 of 2008-10-23.  That patch
accounted for baserel restriction clauses that reduced the number of rows
coming out of a table (and hence the number of possibly-distinct values of
a join variable), but not for join restriction clauses that might have been
applied at a lower level of join.  To account for the latter, look up the
sizes of the min_lefthand and min_righthand inputs of the current join,
and clamp with those in the same way as for the base relations.

Noted while investigating a complaint from Ben Chobot, although this in
itself doesn't seem to explain his report.

Back-patch to 8.4; previous versions used different estimation methods
for which this heuristic isn't relevant.
---
 src/backend/utils/adt/selfuncs.c | 81 ++++++++++++++++++++++++++++----
 1 file changed, 73 insertions(+), 8 deletions(-)

diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index e0658265ecb..18bbfe9b8d7 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -142,9 +142,11 @@ static double ineq_histogram_selectivity(PlannerInfo *root,
 						   FmgrInfo *opproc, bool isgt,
 						   Datum constval, Oid consttype);
 static double eqjoinsel_inner(Oid operator,
-				VariableStatData *vardata1, VariableStatData *vardata2);
+				VariableStatData *vardata1, VariableStatData *vardata2,
+				RelOptInfo *rel1, RelOptInfo *rel2);
 static double eqjoinsel_semi(Oid operator,
-			   VariableStatData *vardata1, VariableStatData *vardata2);
+			   VariableStatData *vardata1, VariableStatData *vardata2,
+			   RelOptInfo *rel1, RelOptInfo *rel2);
 static bool convert_to_scalar(Datum value, Oid valuetypid, double *scaledvalue,
 				  Datum lobound, Datum hibound, Oid boundstypid,
 				  double *scaledlobound, double *scaledhibound);
@@ -173,6 +175,7 @@ static bool get_actual_variable_range(PlannerInfo *root,
 						  VariableStatData *vardata,
 						  Oid sortop,
 						  Datum *min, Datum *max);
+static RelOptInfo *find_join_input_rel(PlannerInfo *root, Relids relids);
 static Selectivity prefix_selectivity(PlannerInfo *root,
 				   VariableStatData *vardata,
 				   Oid vartype, Oid opfamily, Const *prefixcon);
@@ -2008,24 +2011,47 @@ eqjoinsel(PG_FUNCTION_ARGS)
 	VariableStatData vardata1;
 	VariableStatData vardata2;
 	bool		join_is_reversed;
+	RelOptInfo *rel1;
+	RelOptInfo *rel2;
 
 	get_join_variables(root, args, sjinfo,
 					   &vardata1, &vardata2, &join_is_reversed);
 
+	/*
+	 * Identify the join's direct input relations.  We use the min lefthand
+	 * and min righthand as the inputs, even though the join might actually
+	 * get done with larger input relations.  The min inputs are guaranteed to
+	 * have been formed by now, though, and always using them ensures
+	 * consistency of estimates.
+	 */
+	if (!join_is_reversed)
+	{
+		rel1 = find_join_input_rel(root, sjinfo->min_lefthand);
+		rel2 = find_join_input_rel(root, sjinfo->min_righthand);
+	}
+	else
+	{
+		rel1 = find_join_input_rel(root, sjinfo->min_righthand);
+		rel2 = find_join_input_rel(root, sjinfo->min_lefthand);
+	}
+
 	switch (sjinfo->jointype)
 	{
 		case JOIN_INNER:
 		case JOIN_LEFT:
 		case JOIN_FULL:
-			selec = eqjoinsel_inner(operator, &vardata1, &vardata2);
+			selec = eqjoinsel_inner(operator, &vardata1, &vardata2,
+									rel1, rel2);
 			break;
 		case JOIN_SEMI:
 		case JOIN_ANTI:
 			if (!join_is_reversed)
-				selec = eqjoinsel_semi(operator, &vardata1, &vardata2);
+				selec = eqjoinsel_semi(operator, &vardata1, &vardata2,
+									   rel1, rel2);
 			else
 				selec = eqjoinsel_semi(get_commutator(operator),
-									   &vardata2, &vardata1);
+									   &vardata2, &vardata1,
+									   rel2, rel1);
 			break;
 		default:
 			/* other values not expected here */
@@ -2051,7 +2077,8 @@ eqjoinsel(PG_FUNCTION_ARGS)
  */
 static double
 eqjoinsel_inner(Oid operator,
-				VariableStatData *vardata1, VariableStatData *vardata2)
+				VariableStatData *vardata1, VariableStatData *vardata2,
+				RelOptInfo *rel1, RelOptInfo *rel2)
 {
 	double		selec;
 	double		nd1;
@@ -2252,15 +2279,19 @@ eqjoinsel_inner(Oid operator,
 		 * be, providing a crude correction for the selectivity of restriction
 		 * clauses on those relations.	(We don't do that in the other path
 		 * since there we are comparing the nd values to stats for the whole
-		 * relations.)
+		 * relations.)  We can apply this clamp both with respect to the base
+		 * relations from which the join variables come, and to the immediate
+		 * input relations of the current join.
 		 */
 		double		nullfrac1 = stats1 ? stats1->stanullfrac : 0.0;
 		double		nullfrac2 = stats2 ? stats2->stanullfrac : 0.0;
 
 		if (vardata1->rel)
 			nd1 = Min(nd1, vardata1->rel->rows);
+		nd1 = Min(nd1, rel1->rows);
 		if (vardata2->rel)
 			nd2 = Min(nd2, vardata2->rel->rows);
+		nd2 = Min(nd2, rel2->rows);
 
 		selec = (1.0 - nullfrac1) * (1.0 - nullfrac2);
 		if (nd1 > nd2)
@@ -2287,7 +2318,8 @@ eqjoinsel_inner(Oid operator,
  */
 static double
 eqjoinsel_semi(Oid operator,
-			   VariableStatData *vardata1, VariableStatData *vardata2)
+			   VariableStatData *vardata1, VariableStatData *vardata2,
+			   RelOptInfo *rel1, RelOptInfo *rel2)
 {
 	double		selec;
 	double		nd1;
@@ -2435,8 +2467,10 @@ eqjoinsel_semi(Oid operator,
 		{
 			if (vardata1->rel)
 				nd1 = Min(nd1, vardata1->rel->rows);
+			nd1 = Min(nd1, rel1->rows);
 			if (vardata2->rel)
 				nd2 = Min(nd2, vardata2->rel->rows);
+			nd2 = Min(nd2, rel2->rows);
 
 			if (nd1 <= nd2 || nd2 <= 0)
 				selec = 1.0 - nullfrac1;
@@ -4759,6 +4793,37 @@ get_actual_variable_range(PlannerInfo *root, VariableStatData *vardata,
 	return have_data;
 }
 
+/*
+ * find_join_input_rel
+ *		Look up the input relation for a join.
+ *
+ * We assume that the input relation's RelOptInfo must have been constructed
+ * already.
+ */
+static RelOptInfo *
+find_join_input_rel(PlannerInfo *root, Relids relids)
+{
+	RelOptInfo *rel = NULL;
+
+	switch (bms_membership(relids))
+	{
+		case BMS_EMPTY_SET:
+			/* should not happen */
+			break;
+		case BMS_SINGLETON:
+			rel = find_base_rel(root, bms_singleton_member(relids));
+			break;
+		case BMS_MULTIPLE:
+			rel = find_join_rel(root, relids);
+			break;
+	}
+
+	if (rel == NULL)
+		elog(ERROR, "could not find RelOptInfo for given relids");
+
+	return rel;
+}
+
 
 /*-------------------------------------------------------------------------
  *
-- 
GitLab