From 54d20024c1dad339acc8624d7ca902b762fe0844 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Wed, 21 Mar 2007 22:18:12 +0000
Subject: [PATCH] Fix some problems with selectivity estimation for partial
 indexes.

First, genericcostestimate() was being way too liberal about including
partial-index conditions in its selectivity estimate, resulting in
substantial underestimates for situations such as an indexqual "x = 42"
used with an index on x "WHERE x >= 40 AND x < 50".  While the code is
intentionally set up to favor selecting partial indexes when available,
this was too much...

Second, choose_bitmap_and() was likewise easily fooled by cases of this
type, since it would similarly think that the partial index had selectivity
independent of the indexqual.

Fixed by using predicate_implied_by() rather than simple equality checks
to determine redundancy.  This is a good deal more expensive but I don't
see much alternative.  At least the extra cost is only paid when there's
actually a partial index under consideration.

Per report from Jeff Davis.  I'm not going to risk back-patching this,
though.
---
 src/backend/optimizer/path/indxpath.c | 109 ++++++++++++++++++--------
 src/backend/utils/adt/selfuncs.c      |  51 ++++++------
 src/test/regress/expected/select.out  |   9 ++-
 src/test/regress/sql/select.sql       |  11 ++-
 4 files changed, 120 insertions(+), 60 deletions(-)

diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index 04e029beb28..7197658ae9b 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -9,7 +9,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.217 2007/03/17 00:11:04 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/optimizer/path/indxpath.c,v 1.218 2007/03/21 22:18:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,8 +57,8 @@ static Path *choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 static int	bitmap_path_comparator(const void *a, const void *b);
 static Cost bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel,
 					List *paths, RelOptInfo *outer_rel);
-static List *pull_indexpath_quals(Path *bitmapqual);
-static bool lists_intersect_ptr(List *list1, List *list2);
+static void find_indexpath_quals(Path *bitmapqual, List **quals, List **preds);
+static bool lists_intersect(List *list1, List *list2);
 static bool match_clause_to_indexcol(IndexOptInfo *index,
 						 int indexcol, Oid opfamily,
 						 RestrictInfo *rinfo,
@@ -562,6 +562,7 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 	Path	  **patharray;
 	Cost		costsofar;
 	List	   *qualsofar;
+	List	   *firstpred;
 	ListCell   *lastcell;
 	int			i;
 	ListCell   *l;
@@ -586,8 +587,7 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 	 * consider an index redundant if any of its index conditions were already
 	 * used by earlier indexes.  (We could use predicate_implied_by to have a
 	 * more intelligent, but much more expensive, check --- but in most cases
-	 * simple pointer equality should suffice, since after all the index
-	 * conditions are all coming from the same RestrictInfo lists.)
+	 * simple equality should suffice.)
 	 *
 	 * You might think the condition for redundancy should be "all index
 	 * conditions already used", not "any", but this turns out to be wrong.
@@ -597,10 +597,27 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 	 * non-selective.  In any case, we'd surely be drastically misestimating
 	 * the selectivity if we count the same condition twice.
 	 *
-	 * We include index predicate conditions in the redundancy test.  Because
-	 * the test is just for pointer equality and not equal(), the effect is
-	 * that use of the same partial index in two different AND elements is
-	 * considered redundant.  (XXX is this too strong?)
+	 * We must also consider index predicate conditions in checking for
+	 * redundancy, because the estimated selectivity of a partial index
+	 * includes its predicate even if the explicit index conditions don't.
+	 * Here we have to work harder than just checking expression equality:
+	 * we check to see if any of the predicate clauses are implied by
+	 * index conditions or predicate clauses of previous paths.  This covers
+	 * cases such as a condition "x = 42" used with a plain index, followed
+	 * by a clauseless scan of a partial index "WHERE x >= 40 AND x < 50".
+	 * Also, we reject indexes that have a qual condition matching any
+	 * previously-used index's predicate (by including predicate conditions
+	 * into qualsofar).  It should be sufficient to check equality in this
+	 * case, not implication, since we've sorted the paths by selectivity
+	 * and so tighter conditions are seen first --- only for exactly equal
+	 * cases might the partial index come first.
+	 *
+	 * XXX the reason we need all these redundancy checks is that costsize.c
+	 * and clausesel.c aren't very smart about redundant clauses: they will
+	 * usually double-count the redundant clauses, producing a too-small
+	 * selectivity that makes a redundant AND look like it reduces the total
+	 * cost.  Perhaps someday that code will be smarter and we can remove
+	 * these heuristics.
 	 *
 	 * Note: outputting the selected sub-paths in selectivity order is a good
 	 * thing even if we weren't using that as part of the selection method,
@@ -619,18 +636,38 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 
 	paths = list_make1(patharray[0]);
 	costsofar = bitmap_and_cost_est(root, rel, paths, outer_rel);
-	qualsofar = pull_indexpath_quals(patharray[0]);
+	find_indexpath_quals(patharray[0], &qualsofar, &firstpred);
+	qualsofar = list_concat(qualsofar, firstpred);
 	lastcell = list_head(paths);	/* for quick deletions */
 
 	for (i = 1; i < npaths; i++)
 	{
 		Path	   *newpath = patharray[i];
 		List	   *newqual;
+		List	   *newpred;
 		Cost		newcost;
 
-		newqual = pull_indexpath_quals(newpath);
-		if (lists_intersect_ptr(newqual, qualsofar))
+		find_indexpath_quals(newpath, &newqual, &newpred);
+		if (lists_intersect(newqual, qualsofar))
 			continue;			/* consider it redundant */
+		if (newpred)
+		{
+			bool	redundant = false;
+
+			/* we check each predicate clause separately */
+			foreach(l, newpred)
+			{
+				Node	   *np = (Node *) lfirst(l);
+
+				if (predicate_implied_by(list_make1(np), qualsofar))
+				{
+					redundant = true;
+					break;		/* out of inner loop */
+				}
+			}
+			if (redundant)
+				continue;
+		}
 		/* tentatively add newpath to paths, so we can estimate cost */
 		paths = lappend(paths, newpath);
 		newcost = bitmap_and_cost_est(root, rel, paths, outer_rel);
@@ -638,7 +675,7 @@ choose_bitmap_and(PlannerInfo *root, RelOptInfo *rel,
 		{
 			/* keep newpath in paths, update subsidiary variables */
 			costsofar = newcost;
-			qualsofar = list_concat(qualsofar, newqual);
+			qualsofar = list_concat(list_concat(qualsofar, newqual), newpred);
 			lastcell = lnext(lastcell);
 		}
 		else
@@ -710,35 +747,39 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel,
 }
 
 /*
- * pull_indexpath_quals
+ * find_indexpath_quals
  *
- * Given the Path structure for a plain or bitmap indexscan, extract a list
+ * Given the Path structure for a plain or bitmap indexscan, extract lists
  * of all the indexquals and index predicate conditions used in the Path.
  *
  * This is sort of a simplified version of make_restrictinfo_from_bitmapqual;
  * here, we are not trying to produce an accurate representation of the AND/OR
  * semantics of the Path, but just find out all the base conditions used.
  *
- * The result list contains pointers to the expressions used in the Path,
+ * The result lists contain pointers to the expressions used in the Path,
  * but all the list cells are freshly built, so it's safe to destructively
- * modify the list (eg, by concat'ing it with other lists).
+ * modify the lists (eg, by concat'ing with other lists).
  */
-static List *
-pull_indexpath_quals(Path *bitmapqual)
+static void
+find_indexpath_quals(Path *bitmapqual, List **quals, List **preds)
 {
-	List	   *result = NIL;
 	ListCell   *l;
 
+	*quals = NIL;
+	*preds = NIL;
+
 	if (IsA(bitmapqual, BitmapAndPath))
 	{
 		BitmapAndPath *apath = (BitmapAndPath *) bitmapqual;
 
 		foreach(l, apath->bitmapquals)
 		{
-			List	   *sublist;
+			List	   *subquals;
+			List	   *subpreds;
 
-			sublist = pull_indexpath_quals((Path *) lfirst(l));
-			result = list_concat(result, sublist);
+			find_indexpath_quals((Path *) lfirst(l), &subquals, &subpreds);
+			*quals = list_concat(*quals, subquals);
+			*preds = list_concat(*preds, subpreds);
 		}
 	}
 	else if (IsA(bitmapqual, BitmapOrPath))
@@ -747,36 +788,36 @@ pull_indexpath_quals(Path *bitmapqual)
 
 		foreach(l, opath->bitmapquals)
 		{
-			List	   *sublist;
+			List	   *subquals;
+			List	   *subpreds;
 
-			sublist = pull_indexpath_quals((Path *) lfirst(l));
-			result = list_concat(result, sublist);
+			find_indexpath_quals((Path *) lfirst(l), &subquals, &subpreds);
+			*quals = list_concat(*quals, subquals);
+			*preds = list_concat(*preds, subpreds);
 		}
 	}
 	else if (IsA(bitmapqual, IndexPath))
 	{
 		IndexPath  *ipath = (IndexPath *) bitmapqual;
 
-		result = get_actual_clauses(ipath->indexclauses);
-		result = list_concat(result, list_copy(ipath->indexinfo->indpred));
+		*quals = get_actual_clauses(ipath->indexclauses);
+		*preds = list_copy(ipath->indexinfo->indpred);
 	}
 	else
 		elog(ERROR, "unrecognized node type: %d", nodeTag(bitmapqual));
-
-	return result;
 }
 
 
 /*
- * lists_intersect_ptr
+ * lists_intersect
  *		Detect whether two lists have a nonempty intersection,
- *		using pointer equality to compare members.
+ *		using equal() to compare members.
  *
  * This possibly should go into list.c, but it doesn't yet have any use
  * except in choose_bitmap_and.
  */
 static bool
-lists_intersect_ptr(List *list1, List *list2)
+lists_intersect(List *list1, List *list2)
 {
 	ListCell   *cell1;
 
@@ -787,7 +828,7 @@ lists_intersect_ptr(List *list1, List *list2)
 
 		foreach(cell2, list2)
 		{
-			if (lfirst(cell2) == datum1)
+			if (equal(lfirst(cell2), datum1))
 				return true;
 		}
 	}
diff --git a/src/backend/utils/adt/selfuncs.c b/src/backend/utils/adt/selfuncs.c
index df61fea567b..a92317aeac1 100644
--- a/src/backend/utils/adt/selfuncs.c
+++ b/src/backend/utils/adt/selfuncs.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.229 2007/03/17 00:11:05 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.230 2007/03/21 22:18:12 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -86,6 +86,7 @@
 #include "optimizer/pathnode.h"
 #include "optimizer/paths.h"
 #include "optimizer/plancat.h"
+#include "optimizer/predtest.h"
 #include "optimizer/restrictinfo.h"
 #include "optimizer/var.h"
 #include "parser/parse_coerce.h"
@@ -4775,36 +4776,38 @@ genericcostestimate(PlannerInfo *root,
 	List	   *selectivityQuals;
 	ListCell   *l;
 
-	/*
+	/*----------
 	 * If the index is partial, AND the index predicate with the explicitly
 	 * given indexquals to produce a more accurate idea of the index
-	 * selectivity.  This may produce redundant clauses.  We get rid of exact
-	 * duplicates in the code below.  We expect that most cases of partial
-	 * redundancy (such as "x < 4" from the qual and "x < 5" from the
-	 * predicate) will be recognized and handled correctly by
-	 * clauselist_selectivity().  This assumption is somewhat fragile, since
-	 * it depends on predicate_implied_by() and clauselist_selectivity()
-	 * having similar capabilities, and there are certainly many cases where
-	 * we will end up with a too-low selectivity estimate.	This will bias the
-	 * system in favor of using partial indexes where possible, which is not
-	 * necessarily a bad thing. But it'd be nice to do better someday.
+	 * selectivity.  However, we need to be careful not to insert redundant
+	 * clauses, because clauselist_selectivity() is easily fooled into
+	 * computing a too-low selectivity estimate.  Our approach is to add
+	 * only the index predicate clause(s) that cannot be proven to be implied
+	 * by the given indexquals.  This successfully handles cases such as a
+	 * qual "x = 42" used with a partial index "WHERE x >= 40 AND x < 50".
+	 * There are many other cases where we won't detect redundancy, leading
+	 * to a too-low selectivity estimate, which will bias the system in favor
+	 * of using partial indexes where possible.  That is not necessarily bad
+	 * though.
 	 *
-	 * Note that index->indpred and indexQuals are both in implicit-AND form,
-	 * so ANDing them together just takes merging the lists.  However,
-	 * eliminating duplicates is a bit trickier because indexQuals contains
-	 * RestrictInfo nodes and the indpred does not.  It is okay to pass a
-	 * mixed list to clauselist_selectivity, but we have to work a bit to
-	 * generate a list without logical duplicates.	(We could just list_union
-	 * indpred and strippedQuals, but then we'd not get caching of per-qual
-	 * selectivity estimates.)
+	 * Note that indexQuals contains RestrictInfo nodes while the indpred
+	 * does not.  This is OK for both predicate_implied_by() and
+	 * clauselist_selectivity().
+	 *----------
 	 */
 	if (index->indpred != NIL)
 	{
-		List	   *strippedQuals;
-		List	   *predExtraQuals;
+		List	   *predExtraQuals = NIL;
+
+		foreach(l, index->indpred)
+		{
+			Node   *predQual = (Node *) lfirst(l);
+			List   *oneQual = list_make1(predQual);
 
-		strippedQuals = get_actual_clauses(indexQuals);
-		predExtraQuals = list_difference(index->indpred, strippedQuals);
+			if (!predicate_implied_by(oneQual, indexQuals))
+				predExtraQuals = list_concat(predExtraQuals, oneQual);
+		}
+		/* list_concat avoids modifying the passed-in indexQuals list */
 		selectivityQuals = list_concat(predExtraQuals, indexQuals);
 	}
 	else
diff --git a/src/test/regress/expected/select.out b/src/test/regress/expected/select.out
index 0b3f546bdfb..d58c8d2811b 100644
--- a/src/test/regress/expected/select.out
+++ b/src/test/regress/expected/select.out
@@ -209,9 +209,13 @@ SELECT onek.unique1, onek.string4 FROM onek
 -- test partial btree indexes
 --
 -- As of 7.2, planner probably won't pick an indexscan without stats,
--- so ANALYZE first.
+-- so ANALYZE first.  Also, we want to prevent it from picking a bitmapscan
+-- followed by sort, because that could hide index ordering problems.
 --
 ANALYZE onek2;
+SET enable_seqscan TO off;
+SET enable_bitmapscan TO off;
+SET enable_sort TO off;
 --
 -- awk '{if($1<10){print $0;}else{next;}}' onek.data | sort +0n -1
 --
@@ -288,6 +292,9 @@ SELECT onek2.unique1, onek2.stringu1 FROM onek2
      999 | LMAAAA
 (19 rows)
 
+RESET enable_seqscan;
+RESET enable_bitmapscan;
+RESET enable_sort;
 SELECT two, stringu1, ten, string4
    INTO TABLE tmp
    FROM onek;
diff --git a/src/test/regress/sql/select.sql b/src/test/regress/sql/select.sql
index f23cccd24f9..8c92168c9b8 100644
--- a/src/test/regress/sql/select.sql
+++ b/src/test/regress/sql/select.sql
@@ -59,10 +59,15 @@ SELECT onek.unique1, onek.string4 FROM onek
 -- test partial btree indexes
 --
 -- As of 7.2, planner probably won't pick an indexscan without stats,
--- so ANALYZE first.
+-- so ANALYZE first.  Also, we want to prevent it from picking a bitmapscan
+-- followed by sort, because that could hide index ordering problems.
 --
 ANALYZE onek2;
 
+SET enable_seqscan TO off;
+SET enable_bitmapscan TO off;
+SET enable_sort TO off;
+
 --
 -- awk '{if($1<10){print $0;}else{next;}}' onek.data | sort +0n -1
 --
@@ -81,6 +86,10 @@ SELECT onek2.unique1, onek2.stringu1 FROM onek2
 SELECT onek2.unique1, onek2.stringu1 FROM onek2
    WHERE onek2.unique1 > 980;
 
+RESET enable_seqscan;
+RESET enable_bitmapscan;
+RESET enable_sort;
+
 
 SELECT two, stringu1, ten, string4
    INTO TABLE tmp
-- 
GitLab