From fc19c1801bd2dbee1043b0c0b62e07747d30ea1c Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Tue, 15 Nov 2016 10:28:46 -0500
Subject: [PATCH] Limit the number of number of tapes used for a sort to 501.

Gigantic numbers of tapes don't work out well.

Original patch by Peter Geoghegan; comments entirely rewritten by me.
---
 src/backend/utils/sort/tuplesort.c | 17 +++++++++++++++--
 1 file changed, 15 insertions(+), 2 deletions(-)

diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 46512cfca1a..3ad81862f35 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -108,7 +108,8 @@
  * code we determine the number of tapes M on the basis of workMem: we want
  * workMem/M to be large enough that we read a fair amount of data each time
  * we preread from a tape, so as to maintain the locality of access described
- * above.  Nonetheless, with large workMem we can have many tapes.
+ * above.  Nonetheless, with large workMem we can have many tapes (but not
+ * too many -- see the comments in tuplesort_merge_order).
  *
  *
  * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
@@ -247,6 +248,7 @@ typedef enum
  * tape during a preread cycle (see discussion at top of file).
  */
 #define MINORDER		6		/* minimum merge order */
+#define MAXORDER		500		/* maximum merge order */
 #define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
 #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
 
@@ -2313,8 +2315,19 @@ tuplesort_merge_order(int64 allowedMem)
 	mOrder = (allowedMem - TAPE_BUFFER_OVERHEAD) /
 		(MERGE_BUFFER_SIZE + TAPE_BUFFER_OVERHEAD);
 
-	/* Even in minimum memory, use at least a MINORDER merge */
+	/*
+	 * Even in minimum memory, use at least a MINORDER merge.  On the other
+	 * hand, even when we have lots of memory, do not use more than a MAXORDER
+	 * merge.  Tapes are pretty cheap, but they're not entirely free.  Each
+	 * additional tape reduces the amount of memory available to build runs,
+	 * which in turn can cause the same sort to need more runs, which makes
+	 * merging slower even if it can still be done in a single pass.  Also,
+	 * high order merges are quite slow due to CPU cache effects; it can be
+	 * faster to pay the I/O cost of a polyphase merge than to perform a single
+	 * merge pass across many hundreds of tapes.
+	 */
 	mOrder = Max(mOrder, MINORDER);
+	mOrder = Min(mOrder, MAXORDER);
 
 	return mOrder;
 }
-- 
GitLab