From 4ab76b1c20500922aebfdd0c26aef4bdcc608e88 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Mon, 28 Nov 2005 23:46:03 +0000
Subject: [PATCH] Tweak hash join code to use an additional heuristic for
 deciding whether it's worth probing the outer relation for emptiness before
 building the hash table.  To wit, if we're rescanning a join previously
 performed, remember whether we found the outer relation nonempty the previous
 time, and don't bother with the probe if it was nonempty.  This buys back the
 performance lost in examples like Mario Weilguni's.

---
 src/backend/executor/nodeHashjoin.c | 43 ++++++++++++++++++++++++++---
 src/include/nodes/execnodes.h       |  4 ++-
 2 files changed, 42 insertions(+), 5 deletions(-)

diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index ee2809a8b45..7363ab2a2cd 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.78 2005/11/28 17:14:23 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.79 2005/11/28 23:46:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -120,16 +120,28 @@ ExecHashJoin(HashJoinState *node)
 		 * since we aren't going to be able to skip the join on the strength
 		 * of an empty inner relation anyway.)
 		 *
+		 * If we are rescanning the join, we make use of information gained
+		 * on the previous scan: don't bother to try the prefetch if the
+		 * previous scan found the outer relation nonempty.  This is not
+		 * 100% reliable since with new parameters the outer relation might
+		 * yield different results, but it's a good heuristic.
+		 *
 		 * The only way to make the check is to try to fetch a tuple from the
 		 * outer plan node.  If we succeed, we have to stash it away for later
 		 * consumption by ExecHashJoinOuterGetTuple.
 		 */
-		if (outerNode->plan->startup_cost < hashNode->ps.plan->total_cost ||
-			node->js.jointype == JOIN_LEFT)
+		if (node->js.jointype == JOIN_LEFT ||
+			(outerNode->plan->startup_cost < hashNode->ps.plan->total_cost &&
+			 !node->hj_OuterNotEmpty))
 		{
 			node->hj_FirstOuterTupleSlot = ExecProcNode(outerNode);
 			if (TupIsNull(node->hj_FirstOuterTupleSlot))
+			{
+				node->hj_OuterNotEmpty = false;
 				return NULL;
+			}
+			else
+				node->hj_OuterNotEmpty = true;
 		}
 		else
 			node->hj_FirstOuterTupleSlot = NULL;
@@ -159,6 +171,13 @@ ExecHashJoin(HashJoinState *node)
 		 * scanning the outer relation
 		 */
 		hashtable->nbatch_outstart = hashtable->nbatch;
+
+		/*
+		 * Reset OuterNotEmpty for scan.  (It's OK if we fetched a tuple
+		 * above, because ExecHashJoinOuterGetTuple will immediately
+		 * set it again.)
+		 */
+		node->hj_OuterNotEmpty = false;
 	}
 
 	/*
@@ -454,6 +473,7 @@ ExecInitHashJoin(HashJoin *node, EState *estate)
 	hjstate->js.ps.ps_TupFromTlist = false;
 	hjstate->hj_NeedNewOuter = true;
 	hjstate->hj_MatchedOuter = false;
+	hjstate->hj_OuterNotEmpty = false;
 
 	return hjstate;
 }
@@ -546,6 +566,9 @@ ExecHashJoinOuterGetTuple(PlanState *outerNode,
 			*hashvalue = ExecHashGetHashValue(hashtable, econtext,
 											  hjstate->hj_OuterHashKeys);
 
+			/* remember outer relation is not empty for possible rescan */
+			hjstate->hj_OuterNotEmpty = true;
+
 			return slot;
 		}
 
@@ -809,7 +832,19 @@ ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt)
 		if (node->hj_HashTable->nbatch == 1 &&
 			((PlanState *) node)->righttree->chgParam == NULL)
 		{
-			/* okay to reuse the hash table; needn't rescan inner, either */
+			/*
+			 * okay to reuse the hash table; needn't rescan inner, either.
+			 *
+			 * What we do need to do is reset our state about the emptiness
+			 * of the outer relation, so that the new scan of the outer will
+			 * update it correctly if it turns out to be empty this time.
+			 * (There's no harm in clearing it now because ExecHashJoin won't
+			 * need the info.  In the other cases, where the hash table
+			 * doesn't exist or we are destroying it, we leave this state
+			 * alone because ExecHashJoin will need it the first time
+			 * through.)
+			 */
+			node->hj_OuterNotEmpty = false;
 		}
 		else
 		{
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index e9fb41f653a..7371f950070 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2005, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.144 2005/11/26 22:14:57 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.145 2005/11/28 23:46:03 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1118,6 +1118,7 @@ typedef struct MergeJoinState
  *		hj_FirstOuterTupleSlot	first tuple retrieved from outer plan
  *		hj_NeedNewOuter			true if need new outer tuple on next call
  *		hj_MatchedOuter			true if found a join match for current outer
+ *		hj_OuterNotEmpty		true if outer relation known not empty
  * ----------------
  */
 
@@ -1142,6 +1143,7 @@ typedef struct HashJoinState
 	TupleTableSlot *hj_FirstOuterTupleSlot;
 	bool		hj_NeedNewOuter;
 	bool		hj_MatchedOuter;
+	bool		hj_OuterNotEmpty;
 } HashJoinState;
 
 
-- 
GitLab