From 69d0a15e2aa15a2ee851df33f5c15fa227c47bd7 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 27 Jun 2006 21:31:20 +0000
Subject: [PATCH] Convert hash join code to use MinimalTuple format in tuple
 hash table and batch files.  Should reduce memory and I/O demands for such
 joins.

---
 src/backend/executor/execTuples.c   | 51 +++++++++++++++++++-
 src/backend/executor/nodeHash.c     | 48 +++++++++----------
 src/backend/executor/nodeHashjoin.c | 72 +++++++++++++----------------
 src/include/executor/hashjoin.h     |  9 +++-
 src/include/executor/nodeHash.h     |  6 +--
 src/include/executor/nodeHashjoin.h |  4 +-
 src/include/executor/tuptable.h     |  3 +-
 7 files changed, 121 insertions(+), 72 deletions(-)

diff --git a/src/backend/executor/execTuples.c b/src/backend/executor/execTuples.c
index f03d738619d..fd54c3d03c1 100644
--- a/src/backend/executor/execTuples.c
+++ b/src/backend/executor/execTuples.c
@@ -15,7 +15,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/execTuples.c,v 1.95 2006/06/27 02:51:39 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/execTuples.c,v 1.96 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -718,6 +718,55 @@ ExecFetchSlotTuple(TupleTableSlot *slot)
 	return ExecMaterializeSlot(slot);
 }
 
+/* --------------------------------
+ *		ExecFetchSlotMinimalTuple
+ *			Fetch the slot's minimal physical tuple.
+ *
+ *		If the slot already holds a minimal tuple, that tuple is returned.
+ *		Otherwise its contents (virtual or regular tuple) are copied into
+ *		minimal form and stored back into the slot, which then owns the copy.
+ *
+ * As above, the result must be treated as read-only.
+ * --------------------------------
+ */
+MinimalTuple
+ExecFetchSlotMinimalTuple(TupleTableSlot *slot)
+{
+	MinimalTuple newTuple;
+	MemoryContext oldContext;
+
+	/*
+	 * sanity checks: the slot must exist and must not be empty
+	 */
+	Assert(slot != NULL);
+	Assert(!slot->tts_isempty);
+
+	/*
+	 * If we have a minimal physical tuple then just return it (no copying).
+	 */
+	if (slot->tts_mintuple)
+		return slot->tts_mintuple;
+
+	/*
+	 * Otherwise, build a minimal tuple, and then store it as the new slot
+	 * value.  (Note: tts_nvalid will be reset to zero here.  There are cases
+	 * in which this could be optimized but it's probably not worth worrying
+	 * about.)
+	 *
+	 * We may be called in a context that is shorter-lived than the tuple
+	 * slot, so the copy is made in slot->tts_mcxt to ensure that the
+	 * materialized tuple will survive anyway.
+	 */
+	oldContext = MemoryContextSwitchTo(slot->tts_mcxt);
+	newTuple = ExecCopySlotMinimalTuple(slot);
+	MemoryContextSwitchTo(oldContext);
+
+	ExecStoreMinimalTuple(newTuple, slot, true);	/* slot will pfree it */
+
+	Assert(slot->tts_mintuple);
+	return slot->tts_mintuple;
+}
+
 /* --------------------------------
  *		ExecMaterializeSlot
  *			Force a slot into the "materialized" state.
diff --git a/src/backend/executor/nodeHash.c b/src/backend/executor/nodeHash.c
index 5710afb2fca..3c8de3f5e7f 100644
--- a/src/backend/executor/nodeHash.c
+++ b/src/backend/executor/nodeHash.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.103 2006/05/30 14:01:58 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeHash.c,v 1.104 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -92,7 +92,7 @@ MultiExecHash(HashState *node)
 		/* We have to compute the hash value */
 		econtext->ecxt_innertuple = slot;
 		hashvalue = ExecHashGetHashValue(hashtable, econtext, hashkeys);
-		ExecHashTableInsert(hashtable, ExecFetchSlotTuple(slot), hashvalue);
+		ExecHashTableInsert(hashtable, slot, hashvalue);
 	}
 
 	/* must provide our own instrumentation support */
@@ -358,8 +358,8 @@ ExecChooseHashTableSize(double ntuples, int tupwidth,
 	 * does not allow for any palloc overhead.	The manipulations of spaceUsed
 	 * don't count palloc overhead either.
 	 */
-	tupsize = MAXALIGN(sizeof(HashJoinTupleData)) +
-		MAXALIGN(sizeof(HeapTupleHeaderData)) +
+	tupsize = HJTUPLE_OVERHEAD +
+		MAXALIGN(sizeof(MinimalTupleData)) +
 		MAXALIGN(tupwidth);
 	inner_rel_bytes = ntuples * tupsize;
 
@@ -548,7 +548,8 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 			{
 				/* dump it out */
 				Assert(batchno > curbatch);
-				ExecHashJoinSaveTuple(&tuple->htup, tuple->hashvalue,
+				ExecHashJoinSaveTuple(HJTUPLE_MINTUPLE(tuple),
+									  tuple->hashvalue,
 									  &hashtable->innerBatchFile[batchno]);
 				/* and remove from hash table */
 				if (prevtuple)
@@ -557,7 +558,7 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
 					hashtable->buckets[i] = nexttuple;
 				/* prevtuple doesn't change */
 				hashtable->spaceUsed -=
-					MAXALIGN(sizeof(HashJoinTupleData)) + tuple->htup.t_len;
+					HJTUPLE_OVERHEAD + HJTUPLE_MINTUPLE(tuple)->t_len;
 				pfree(tuple);
 				nfreed++;
 			}
@@ -592,12 +593,19 @@ ExecHashIncreaseNumBatches(HashJoinTable hashtable)
  * ExecHashTableInsert
  *		insert a tuple into the hash table depending on the hash value
  *		it may just go to a temp file for later batches
+ *
+ * Note: the passed TupleTableSlot may contain a regular, minimal, or virtual
+ * tuple; the minimal case in particular is certain to happen while reloading
+ * tuples from batch files.  We could save some cycles in the regular-tuple
+ * case by not forcing the slot contents into minimal form; not clear if it's
+ * worth the messiness required.
  */
 void
 ExecHashTableInsert(HashJoinTable hashtable,
-					HeapTuple tuple,
+					TupleTableSlot *slot,
 					uint32 hashvalue)
 {
+	MinimalTuple tuple = ExecFetchSlotMinimalTuple(slot);
 	int			bucketno;
 	int			batchno;
 
@@ -615,18 +623,11 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		HashJoinTuple hashTuple;
 		int			hashTupleSize;
 
-		hashTupleSize = MAXALIGN(sizeof(HashJoinTupleData)) + tuple->t_len;
+		hashTupleSize = HJTUPLE_OVERHEAD + tuple->t_len;
 		hashTuple = (HashJoinTuple) MemoryContextAlloc(hashtable->batchCxt,
 													   hashTupleSize);
 		hashTuple->hashvalue = hashvalue;
-		memcpy((char *) &hashTuple->htup,
-			   (char *) tuple,
-			   sizeof(hashTuple->htup));
-		hashTuple->htup.t_data = (HeapTupleHeader)
-			(((char *) hashTuple) + MAXALIGN(sizeof(HashJoinTupleData)));
-		memcpy((char *) hashTuple->htup.t_data,
-			   (char *) tuple->t_data,
-			   tuple->t_len);
+		memcpy(HJTUPLE_MINTUPLE(hashTuple), tuple, tuple->t_len);
 		hashTuple->next = hashtable->buckets[bucketno];
 		hashtable->buckets[bucketno] = hashTuple;
 		hashtable->spaceUsed += hashTupleSize;
@@ -639,7 +640,8 @@ ExecHashTableInsert(HashJoinTable hashtable,
 		 * put the tuple into a temp file for later batches
 		 */
 		Assert(batchno > hashtable->curbatch);
-		ExecHashJoinSaveTuple(tuple, hashvalue,
+		ExecHashJoinSaveTuple(tuple,
+							  hashvalue,
 							  &hashtable->innerBatchFile[batchno]);
 	}
 }
@@ -749,7 +751,7 @@ ExecHashGetBucketAndBatch(HashJoinTable hashtable,
  *
  * The current outer tuple must be stored in econtext->ecxt_outertuple.
  */
-HeapTuple
+HashJoinTuple
 ExecScanHashBucket(HashJoinState *hjstate,
 				   ExprContext *econtext)
 {
@@ -771,14 +773,12 @@ ExecScanHashBucket(HashJoinState *hjstate,
 	{
 		if (hashTuple->hashvalue == hashvalue)
 		{
-			HeapTuple	heapTuple = &hashTuple->htup;
 			TupleTableSlot *inntuple;
 
 			/* insert hashtable's tuple into exec slot so ExecQual sees it */
-			inntuple = ExecStoreTuple(heapTuple,
-									  hjstate->hj_HashTupleSlot,
-									  InvalidBuffer,
-									  false);	/* do not pfree */
+			inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(hashTuple),
+											 hjstate->hj_HashTupleSlot,
+											 false);	/* do not pfree */
 			econtext->ecxt_innertuple = inntuple;
 
 			/* reset temp memory each time to avoid leaks from qual expr */
@@ -787,7 +787,7 @@ ExecScanHashBucket(HashJoinState *hjstate,
 			if (ExecQual(hjclauses, econtext, false))
 			{
 				hjstate->hj_CurTuple = hashTuple;
-				return heapTuple;
+				return hashTuple;
 			}
 		}
 
diff --git a/src/backend/executor/nodeHashjoin.c b/src/backend/executor/nodeHashjoin.c
index 097343fd88c..572aa1a5911 100644
--- a/src/backend/executor/nodeHashjoin.c
+++ b/src/backend/executor/nodeHashjoin.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.82 2006/06/16 18:42:22 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/nodeHashjoin.c,v 1.83 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -54,7 +54,7 @@ ExecHashJoin(HashJoinState *node)
 	ExprContext *econtext;
 	ExprDoneCond isDone;
 	HashJoinTable hashtable;
-	HeapTuple	curtuple;
+	HashJoinTuple curtuple;
 	TupleTableSlot *outerTupleSlot;
 	uint32		hashvalue;
 	int			batchno;
@@ -224,7 +224,7 @@ ExecHashJoin(HashJoinState *node)
 				 * in the corresponding outer-batch file.
 				 */
 				Assert(batchno > hashtable->curbatch);
-				ExecHashJoinSaveTuple(ExecFetchSlotTuple(outerTupleSlot),
+				ExecHashJoinSaveTuple(ExecFetchSlotMinimalTuple(outerTupleSlot),
 									  hashvalue,
 									  &hashtable->outerBatchFile[batchno]);
 				node->hj_NeedNewOuter = true;
@@ -244,10 +244,9 @@ ExecHashJoin(HashJoinState *node)
 			/*
 			 * we've got a match, but still need to test non-hashed quals
 			 */
-			inntuple = ExecStoreTuple(curtuple,
-									  node->hj_HashTupleSlot,
-									  InvalidBuffer,
-									  false);	/* don't pfree this tuple */
+			inntuple = ExecStoreMinimalTuple(HJTUPLE_MINTUPLE(curtuple),
+											 node->hj_HashTupleSlot,
+											 false);	/* don't pfree */
 			econtext->ecxt_innertuple = inntuple;
 
 			/* reset temp memory each time to avoid leaks from qual expr */
@@ -706,9 +705,7 @@ start_over:
 			 * NOTE: some tuples may be sent to future batches.  Also, it is
 			 * possible for hashtable->nbatch to be increased here!
 			 */
-			ExecHashTableInsert(hashtable,
-								ExecFetchSlotTuple(slot),
-								hashvalue);
+			ExecHashTableInsert(hashtable, slot, hashvalue);
 		}
 
 		/*
@@ -741,15 +738,14 @@ start_over:
  *		save a tuple to a batch file.
  *
  * The data recorded in the file for each tuple is its hash value,
- * then an image of its HeapTupleData (with meaningless t_data pointer)
- * followed by the HeapTupleHeader and tuple data.
+ * then the tuple in MinimalTuple format.
  *
  * Note: it is important always to call this in the regular executor
  * context, not in a shorter-lived context; else the temp file buffers
  * will get messed up.
  */
 void
-ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
+ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
 					  BufFile **fileptr)
 {
 	BufFile    *file = *fileptr;
@@ -768,14 +764,8 @@ ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
 				(errcode_for_file_access(),
 				 errmsg("could not write to hash-join temporary file: %m")));
 
-	written = BufFileWrite(file, (void *) heapTuple, sizeof(HeapTupleData));
-	if (written != sizeof(HeapTupleData))
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not write to hash-join temporary file: %m")));
-
-	written = BufFileWrite(file, (void *) heapTuple->t_data, heapTuple->t_len);
-	if (written != (size_t) heapTuple->t_len)
+	written = BufFileWrite(file, (void *) tuple, tuple->t_len);
+	if (written != tuple->t_len)
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not write to hash-join temporary file: %m")));
@@ -794,32 +784,36 @@ ExecHashJoinGetSavedTuple(HashJoinState *hjstate,
 						  uint32 *hashvalue,
 						  TupleTableSlot *tupleSlot)
 {
-	HeapTupleData htup;
+	uint32		header[2];
 	size_t		nread;
-	HeapTuple	heapTuple;
+	MinimalTuple tuple;
 
-	nread = BufFileRead(file, (void *) hashvalue, sizeof(uint32));
-	if (nread == 0)
-		return NULL;			/* end of file */
-	if (nread != sizeof(uint32))
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not read from hash-join temporary file: %m")));
-	nread = BufFileRead(file, (void *) &htup, sizeof(HeapTupleData));
-	if (nread != sizeof(HeapTupleData))
+	/*
+	 * Since both the hash value and the MinimalTuple length word are
+	 * uint32, we can read them both in one BufFileRead() call without
+	 * any type cheating.
+	 */
+	nread = BufFileRead(file, (void *) header, sizeof(header));
+	if (nread == 0)			/* end of file */
+	{
+		ExecClearTuple(tupleSlot);
+		return NULL;
+	}
+	if (nread != sizeof(header))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not read from hash-join temporary file: %m")));
-	heapTuple = palloc(HEAPTUPLESIZE + htup.t_len);
-	memcpy((char *) heapTuple, (char *) &htup, sizeof(HeapTupleData));
-	heapTuple->t_data = (HeapTupleHeader)
-		((char *) heapTuple + HEAPTUPLESIZE);
-	nread = BufFileRead(file, (void *) heapTuple->t_data, htup.t_len);
-	if (nread != (size_t) htup.t_len)
+	*hashvalue = header[0];
+	tuple = (MinimalTuple) palloc(header[1]);
+	tuple->t_len = header[1];
+	nread = BufFileRead(file,
+						(void *) ((char *) tuple + sizeof(uint32)),
+						header[1] - sizeof(uint32));
+	if (nread != header[1] - sizeof(uint32))
 		ereport(ERROR,
 				(errcode_for_file_access(),
 				 errmsg("could not read from hash-join temporary file: %m")));
-	return ExecStoreTuple(heapTuple, tupleSlot, InvalidBuffer, true);
+	return ExecStoreMinimalTuple(tuple, tupleSlot, true);
 }
 
 
diff --git a/src/include/executor/hashjoin.h b/src/include/executor/hashjoin.h
index 38cae6251ec..c4e6e460fed 100644
--- a/src/include/executor/hashjoin.h
+++ b/src/include/executor/hashjoin.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.38 2006/03/05 15:58:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/hashjoin.h,v 1.39 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -65,9 +65,14 @@ typedef struct HashJoinTupleData
 {
 	struct HashJoinTupleData *next;		/* link to next tuple in same bucket */
 	uint32		hashvalue;		/* tuple's hash code */
-	HeapTupleData htup;			/* tuple header */
+	/* Tuple data, in MinimalTuple format, follows on a MAXALIGN boundary */
 } HashJoinTupleData;
 
+#define HJTUPLE_OVERHEAD  MAXALIGN(sizeof(HashJoinTupleData))	/* header bytes */
+#define HJTUPLE_MINTUPLE(hjtup)  \
+	((MinimalTuple) ((char *) (hjtup) + HJTUPLE_OVERHEAD))	/* tuple after header */
+
+
 typedef struct HashJoinTableData
 {
 	int			nbuckets;		/* # buckets in the in-memory hash table */
diff --git a/src/include/executor/nodeHash.h b/src/include/executor/nodeHash.h
index 9a413827d72..0e0a9b5ec5e 100644
--- a/src/include/executor/nodeHash.h
+++ b/src/include/executor/nodeHash.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.40 2006/03/05 15:58:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/nodeHash.h,v 1.41 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,7 +26,7 @@ extern void ExecReScanHash(HashState *node, ExprContext *exprCtxt);
 extern HashJoinTable ExecHashTableCreate(Hash *node, List *hashOperators);
 extern void ExecHashTableDestroy(HashJoinTable hashtable);
 extern void ExecHashTableInsert(HashJoinTable hashtable,
-					HeapTuple tuple,
+					TupleTableSlot *slot,
 					uint32 hashvalue);
 extern uint32 ExecHashGetHashValue(HashJoinTable hashtable,
 					 ExprContext *econtext,
@@ -35,7 +35,7 @@ extern void ExecHashGetBucketAndBatch(HashJoinTable hashtable,
 						  uint32 hashvalue,
 						  int *bucketno,
 						  int *batchno);
-extern HeapTuple ExecScanHashBucket(HashJoinState *hjstate,
+extern HashJoinTuple ExecScanHashBucket(HashJoinState *hjstate,
 				   ExprContext *econtext);
 extern void ExecHashTableReset(HashJoinTable hashtable);
 extern void ExecChooseHashTableSize(double ntuples, int tupwidth,
diff --git a/src/include/executor/nodeHashjoin.h b/src/include/executor/nodeHashjoin.h
index 84f07d36448..cbbb76230b6 100644
--- a/src/include/executor/nodeHashjoin.h
+++ b/src/include/executor/nodeHashjoin.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.32 2006/03/05 15:58:56 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/nodeHashjoin.h,v 1.33 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -23,7 +23,7 @@ extern TupleTableSlot *ExecHashJoin(HashJoinState *node);
 extern void ExecEndHashJoin(HashJoinState *node);
 extern void ExecReScanHashJoin(HashJoinState *node, ExprContext *exprCtxt);
 
-extern void ExecHashJoinSaveTuple(HeapTuple heapTuple, uint32 hashvalue,
+extern void ExecHashJoinSaveTuple(MinimalTuple tuple, uint32 hashvalue,
 					  BufFile **fileptr);
 
 #endif   /* NODEHASHJOIN_H */
diff --git a/src/include/executor/tuptable.h b/src/include/executor/tuptable.h
index 85318351340..6d5bc02b93d 100644
--- a/src/include/executor/tuptable.h
+++ b/src/include/executor/tuptable.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/executor/tuptable.h,v 1.32 2006/06/27 02:51:40 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/executor/tuptable.h,v 1.33 2006/06/27 21:31:20 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -159,6 +159,7 @@ extern TupleTableSlot *ExecStoreAllNullTuple(TupleTableSlot *slot);
 extern HeapTuple ExecCopySlotTuple(TupleTableSlot *slot);
 extern MinimalTuple ExecCopySlotMinimalTuple(TupleTableSlot *slot);
 extern HeapTuple ExecFetchSlotTuple(TupleTableSlot *slot);
+extern MinimalTuple ExecFetchSlotMinimalTuple(TupleTableSlot *slot);
 extern HeapTuple ExecMaterializeSlot(TupleTableSlot *slot);
 extern TupleTableSlot *ExecCopySlot(TupleTableSlot *dstslot,
 			 TupleTableSlot *srcslot);
-- 
GitLab