From 8db05ba411ecd3ce2cb0cb7c78fec87658e1a4ab Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Tue, 7 Mar 2006 19:06:50 +0000
Subject: [PATCH] Repair old performance bug in tuplesort.c/logtape.c.  In the
 case where we are doing the final merge pass on-the-fly, and not writing the
 data back onto a 'tape', the number of free blocks in the tape set will
 become large, leading to a lot of time wasted in ltsReleaseBlock().  There is
 really no need to track the free blocks anymore in this state, so add a
 simple shutoff switch.  Per report from Stefan Kaltenbrunner.

---
 src/backend/utils/sort/logtape.c   | 29 ++++++++++++++++-
 src/backend/utils/sort/tuplesort.c | 50 +++++++++++++++++++++---------
 src/include/utils/logtape.h        |  3 +-
 3 files changed, 65 insertions(+), 17 deletions(-)

diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c
index 4f280509cad..6dc203063c9 100644
--- a/src/backend/utils/sort/logtape.c
+++ b/src/backend/utils/sort/logtape.c
@@ -64,7 +64,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/logtape.c,v 1.19 2006/03/05 15:58:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/logtape.c,v 1.20 2006/03/07 19:06:49 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -146,7 +146,12 @@ struct LogicalTapeSet
 	 * When there are no such blocks, we extend the underlying file.  Note
 	 * that the block numbers in freeBlocks are always in *decreasing* order,
 	 * so that removing the last entry gives us the lowest free block.
+	 *
+	 * If forgetFreeSpace is true then any freed blocks are simply forgotten
+	 * rather than being remembered in freeBlocks[].  See notes for
+	 * LogicalTapeSetForgetFreeSpace().
 	 */
+	bool		forgetFreeSpace;	/* are we remembering free blocks? */
 	long	   *freeBlocks;		/* resizable array */
 	int			nFreeBlocks;	/* # of currently free blocks */
 	int			freeBlocksLen;	/* current allocated length of freeBlocks[] */
@@ -247,6 +252,12 @@ ltsReleaseBlock(LogicalTapeSet *lts, long blocknum)
 	int			ndx;
 	long	   *ptr;
 
+	/*
+	 * Do nothing if we're no longer interested in remembering free space.
+	 */
+	if (lts->forgetFreeSpace)
+		return;
+
 	/*
 	 * Enlarge freeBlocks array if full.
 	 */
@@ -491,6 +502,7 @@ LogicalTapeSetCreate(int ntapes)
 									(ntapes - 1) * sizeof(LogicalTape));
 	lts->pfile = BufFileCreateTemp(false);
 	lts->nFileBlocks = 0L;
+	lts->forgetFreeSpace = false;
 	lts->freeBlocksLen = 32;	/* reasonable initial guess */
 	lts->freeBlocks = (long *) palloc(lts->freeBlocksLen * sizeof(long));
 	lts->nFreeBlocks = 0;
@@ -546,6 +558,21 @@ LogicalTapeSetClose(LogicalTapeSet *lts)
 	pfree(lts);
 }
 
+/*
+ * Mark a logical tape set as not needing management of free space anymore.
+ *
+ * This should be called if the caller does not intend to write any more data
+ * into the tape set, but is reading from un-frozen tapes.  Since no more
+ * writes are planned, remembering free blocks is no longer useful.  Setting
+ * this flag lets us avoid wasting time and space in ltsReleaseBlock(), which
+ * is not designed to handle large numbers of free blocks.
+ */
+void
+LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts)
+{
+	lts->forgetFreeSpace = true;
+}
+
 /*
  * Dump the dirty buffer of a logical tape.
  */
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index a9d5d79e033..af0109fda65 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -91,7 +91,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.62 2006/03/05 15:58:49 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.63 2006/03/07 19:06:50 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1384,7 +1384,8 @@ mergeruns(Tuplesortstate *state)
 	/*
 	 * If we produced only one initial run (quite likely if the total data
 	 * volume is between 1X and 2X workMem), we can just use that tape as the
-	 * finished output, rather than doing a useless merge.
+	 * finished output, rather than doing a useless merge.  (This obvious
+	 * optimization is not in Knuth's algorithm.)
 	 */
 	if (state->currentRun == 1)
 	{
@@ -1401,33 +1402,51 @@ mergeruns(Tuplesortstate *state)
 
 	for (;;)
 	{
-		/* Step D5: merge runs onto tape[T] until tape[P] is empty */
-		while (state->tp_runs[state->tapeRange - 1] ||
-			   state->tp_dummy[state->tapeRange - 1])
+		/*
+		 * At this point we know that tape[T] is empty.  If there's just one
+		 * (real or dummy) run left on each input tape, then only one merge
+		 * pass remains.  If we don't have to produce a materialized sorted
+		 * tape, we can stop at this point and do the final merge on-the-fly.
+		 */
+		if (!state->randomAccess)
 		{
-			bool		allDummy = true;
 			bool		allOneRun = true;
 
+			Assert(state->tp_runs[state->tapeRange] == 0);
 			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
 			{
-				if (state->tp_dummy[tapenum] == 0)
-					allDummy = false;
 				if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1)
+				{
 					allOneRun = false;
+					break;
+				}
 			}
-
-			/*
-			 * If we don't have to produce a materialized sorted tape, quit as
-			 * soon as we're down to one real/dummy run per tape.
-			 */
-			if (!state->randomAccess && allOneRun)
+			if (allOneRun)
 			{
-				Assert(!allDummy);
+				/* Tell logtape.c we won't be writing anymore */
+				LogicalTapeSetForgetFreeSpace(state->tapeset);
 				/* Initialize for the final merge pass */
 				beginmerge(state);
 				state->status = TSS_FINALMERGE;
 				return;
 			}
+		}
+
+		/* Step D5: merge runs onto tape[T] until tape[P] is empty */
+		while (state->tp_runs[state->tapeRange - 1] ||
+			   state->tp_dummy[state->tapeRange - 1])
+		{
+			bool		allDummy = true;
+
+			for (tapenum = 0; tapenum < state->tapeRange; tapenum++)
+			{
+				if (state->tp_dummy[tapenum] == 0)
+				{
+					allDummy = false;
+					break;
+				}
+			}
+
 			if (allDummy)
 			{
 				state->tp_dummy[state->tapeRange]++;
@@ -1437,6 +1456,7 @@ mergeruns(Tuplesortstate *state)
 			else
 				mergeonerun(state);
 		}
+
 		/* Step D6: decrease level */
 		if (--state->Level == 0)
 			break;
diff --git a/src/include/utils/logtape.h b/src/include/utils/logtape.h
index 3791f03e9e3..7cd3db37774 100644
--- a/src/include/utils/logtape.h
+++ b/src/include/utils/logtape.h
@@ -8,7 +8,7 @@
  * Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/utils/logtape.h,v 1.14 2006/03/05 15:59:07 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/utils/logtape.h,v 1.15 2006/03/07 19:06:50 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -26,6 +26,7 @@ typedef struct LogicalTapeSet LogicalTapeSet;
 
 extern LogicalTapeSet *LogicalTapeSetCreate(int ntapes);
 extern void LogicalTapeSetClose(LogicalTapeSet *lts);
+extern void LogicalTapeSetForgetFreeSpace(LogicalTapeSet *lts);
 extern size_t LogicalTapeRead(LogicalTapeSet *lts, int tapenum,
 				void *ptr, size_t size);
 extern void LogicalTapeWrite(LogicalTapeSet *lts, int tapenum,
-- 
GitLab