From 0ee5a39862748b846786db48f41cc29eab38c015 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Fri, 31 Aug 2007 23:35:22 +0000
Subject: [PATCH] Apply a band-aid fix for the problem that 8.2 and up
 completely misestimate the number of rows likely to be produced by a query
 such as 	SELECT * FROM t1 LEFT JOIN t2 USING (key) WHERE t2.key IS
 NULL; What this is doing is selecting for t1 rows with no match in t2, and
 thus it may produce a significant number of rows even if the t2.key table
 column contains no nulls at all.  8.2 thinks the table column's null fraction
 is relevant and thus may estimate no rows out, which results in terrible
 plans if there are more joins above this one.  A proper fix for this will
 involve passing much more information about the context of a clause to the
 selectivity estimator functions than we ever have.  There's no time left to
 write such a patch for 8.3, and it wouldn't be back-patchable into 8.2
 anyway.  Instead, put in an ad-hoc test to defeat the normal
 table-stats-based estimation when an IS NULL test is evaluated at an outer
 join, and just use a constant estimate instead --- I went with 0.5 for lack
 of a better idea.  This won't catch every case but it will catch the typical
 ways of writing such queries, and it seems unlikely to make things worse for
 other queries.

---
 src/backend/optimizer/path/clausesel.c |  9 ++++++---
 src/backend/utils/adt/selfuncs.c       | 17 +++++++++++++++--
 src/include/utils/selfuncs.h           |  4 ++--
 3 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/backend/optimizer/path/clausesel.c b/src/backend/optimizer/path/clausesel.c
index 4b48ae1e260..f5ee929c56e 100644
--- a/src/backend/optimizer/path/clausesel.c
+++ b/src/backend/optimizer/path/clausesel.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.86 2007/06/11 01:16:22 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/clausesel.c,v 1.87 2007/08/31 23:35:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -219,7 +219,9 @@ clauselist_selectivity(PlannerInfo *root,
 				s2 = rqlist->hibound + rqlist->lobound - 1.0;
 
 				/* Adjust for double-exclusion of NULLs */
-				s2 += nulltestsel(root, IS_NULL, rqlist->var, varRelid);
+				/* HACK: disable nulltestsel's special outer-join logic */
+				s2 += nulltestsel(root, IS_NULL, rqlist->var,
+								  varRelid, JOIN_INNER);
 
 				/*
 				 * A zero or slightly negative s2 should be converted into a
@@ -702,7 +704,8 @@ clause_selectivity(PlannerInfo *root,
 		s1 = nulltestsel(root,
 						 ((NullTest *) clause)->nulltesttype,
 						 (Node *) ((NullTest *) clause)->arg,
-						 varRelid);
+						 varRelid,
+						 jointype);
 	}
 	else if (IsA(clause, BooleanTest))
 	{
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index 60da08b1ec4..ad9f662b68d 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.235 2007/08/21 01:11:18 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.236 2007/08/31 23:35:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1385,11 +1385,24 @@ booltestsel(PlannerInfo *root, BoolTestType booltesttype, Node *arg,
  */
 Selectivity
 nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
-			Node *arg, int varRelid)
+			Node *arg, int varRelid, JoinType jointype)
 {
 	VariableStatData vardata;
 	double		selec;
 
+	/*
+	 * Special hack: an IS NULL test being applied at an outer join should not
+	 * be taken at face value, since it's very likely being used to select the
+	 * outer-side rows that don't have a match, and thus its selectivity has
+	 * nothing whatever to do with the statistics of the original table
+	 * column.  We do not have nearly enough context here to determine its
+	 * true selectivity, so for the moment punt and guess at 0.5.  Eventually
+	 * the planner should be made to provide enough info about the clause's
+	 * context to let us do better.
+	 */
+	if (IS_OUTER_JOIN(jointype) && nulltesttype == IS_NULL)
+		return (Selectivity) 0.5;
+
 	examine_variable(root, arg, varRelid, &vardata);
 
 	if (HeapTupleIsValid(vardata.statsTuple))
diff --git a/src/include/utils/selfuncs.h b/src/include/utils/selfuncs.h
index f0c6f20427c..fc409d9bddd 100644
--- a/src/include/utils/selfuncs.h
+++ b/src/include/utils/selfuncs.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.39 2007/01/22 20:00:40 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/utils/selfuncs.h,v 1.40 2007/08/31 23:35:22 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -149,7 +149,7 @@ extern Datum icnlikejoinsel(PG_FUNCTION_ARGS);
 extern Selectivity booltestsel(PlannerInfo *root, BoolTestType booltesttype,
 			Node *arg, int varRelid, JoinType jointype);
 extern Selectivity nulltestsel(PlannerInfo *root, NullTestType nulltesttype,
-			Node *arg, int varRelid);
+			Node *arg, int varRelid, JoinType jointype);
 extern Selectivity scalararraysel(PlannerInfo *root,
 			   ScalarArrayOpExpr *clause,
 			   bool is_join_clause,
-- 
GitLab