From 337b6f5ecf05b21b5e997986884d097d60e4e3d0 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Wed, 15 Feb 2012 12:13:32 -0500
Subject: [PATCH] Speed up in-memory tuplesorting.

Per recent work by Peter Geoghegan, it's significantly faster to
tuplesort on a single sortkey if ApplySortComparator is inlined into
quicksort rather than reached via a function pointer.  It's also faster
in general to have a version of quicksort which is specialized for
sorting SortTuple objects rather than objects of arbitrary size and
type.  This requires a couple of additional copies of the quicksort
logic, which in this patch are generated using a Perl script.  There
might be some benefit in adding further specializations here too,
but thus far it's not clear that those gains are worth their weight
in code footprint.
---
 src/backend/Makefile                      |   4 +-
 src/backend/utils/sort/.gitignore         |   1 +
 src/backend/utils/sort/Makefile           |   8 +
 src/backend/utils/sort/gen_qsort_tuple.pl | 232 ++++++++++++++++++++++
 src/backend/utils/sort/tuplesort.c        |  68 ++++---
 src/port/qsort.c                          |   2 +-
 src/port/qsort_arg.c                      |   2 +-
 src/tools/msvc/Solution.pm                |   8 +
 8 files changed, 289 insertions(+), 36 deletions(-)
 create mode 100644 src/backend/utils/sort/.gitignore
 create mode 100644 src/backend/utils/sort/gen_qsort_tuple.pl

diff --git a/src/backend/Makefile b/src/backend/Makefile
index 0c763dd3758..01bb6e1171d 100644
--- a/src/backend/Makefile
+++ b/src/backend/Makefile
@@ -202,6 +202,7 @@ distprep:
 	$(MAKE) -C replication	repl_gram.c repl_scanner.c
 	$(MAKE) -C utils	fmgrtab.c fmgroids.h errcodes.h
 	$(MAKE) -C utils/misc	guc-file.c
+	$(MAKE) -C utils/sort	qsort_tuple.c
 
 
 ##########################################################################
@@ -315,7 +316,8 @@ maintainer-clean: distclean
 	      utils/fmgroids.h \
 	      utils/fmgrtab.c \
 	      utils/errcodes.h \
-	      utils/misc/guc-file.c
+	      utils/misc/guc-file.c \
+	      utils/sort/qsort_tuple.c
 
 
 ##########################################################################
diff --git a/src/backend/utils/sort/.gitignore b/src/backend/utils/sort/.gitignore
new file mode 100644
index 00000000000..f2958633e61
--- /dev/null
+++ b/src/backend/utils/sort/.gitignore
@@ -0,0 +1 @@
+/qsort_tuple.c
diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile
index 2ef4965ee6d..f46ce416105 100644
--- a/src/backend/utils/sort/Makefile
+++ b/src/backend/utils/sort/Makefile
@@ -14,4 +14,12 @@ include $(top_builddir)/src/Makefile.global
 
 OBJS = logtape.o sortsupport.o tuplesort.o tuplestore.o
 
+tuplesort.o: qsort_tuple.c
+
+qsort_tuple.c: gen_qsort_tuple.pl
+	$(PERL) $(srcdir)/gen_qsort_tuple.pl > $@
+
 include $(top_srcdir)/src/backend/common.mk
+
+maintainer-clean:
+	rm -f qsort_tuple.c
diff --git a/src/backend/utils/sort/gen_qsort_tuple.pl b/src/backend/utils/sort/gen_qsort_tuple.pl
new file mode 100644
index 00000000000..40d55488f1a
--- /dev/null
+++ b/src/backend/utils/sort/gen_qsort_tuple.pl
@@ -0,0 +1,232 @@
+#!/usr/bin/perl -w
+
+#
+# gen_qsort_tuple.pl
+#
+# This script generates specialized versions of the quicksort algorithm for
+# tuple sorting.  The quicksort code is derived from the NetBSD code.  The
+# code generated by this script runs significantly faster than vanilla qsort
+# when used to sort tuples.  This speedup comes from a number of places.
+# The major effects are (1) inlining simple tuple comparators is much faster
+# than jumping through a function pointer and (2) swap and vecswap operations
+# specialized to the particular data type of interest (in this case, SortTuple)
+# are faster than the generic routines.
+#
+#	Modifications from vanilla NetBSD source:
+#	  Add do ... while() macro fix
+# 	  Remove __inline, _DIAGASSERTs, __P
+# 	  Remove ill-considered "swap_cnt" switch to insertion sort,
+# 	  in favor of a simple check for presorted input.
+#     Instead of sorting arbitrary objects, we're always sorting SortTuples
+#     Add CHECK_FOR_INTERRUPTS()
+#
+# CAUTION: if you change this file, see also qsort.c and qsort_arg.c
+#
+
+use strict;
+
+my $SUFFIX;
+my $EXTRAARGS;
+my $EXTRAPARAMS;
+my $CMPPARAMS;
+
+emit_qsort_boilerplate();
+
+$SUFFIX = 'tuple';
+$EXTRAARGS = ', SortTupleComparator cmp_tuple, Tuplesortstate *state';
+$EXTRAPARAMS = ', cmp_tuple, state';
+$CMPPARAMS = ', state';
+emit_qsort_implementation();
+
+$SUFFIX = 'ssup';
+$EXTRAARGS = ', SortSupport ssup';
+$EXTRAPARAMS = ', ssup';
+$CMPPARAMS = ', ssup';
+print <<'EOM';
+#define cmp_ssup(a, b, ssup) \
+	ApplySortComparator((a)->datum1, (a)->isnull1, \
+						(b)->datum1, (b)->isnull1, ssup)
+EOM
+emit_qsort_implementation();
+
+sub emit_qsort_boilerplate
+{
+	print <<'EOM';
+/*
+ * autogenerated by src/backend/utils/sort/gen_qsort_tuple.pl, do not edit
+ * This file is included by tuplesort.c, rather than compiled separately.
+ */
+
+/*	$NetBSD: qsort.c,v 1.13 2003/08/07 16:43:42 agc Exp $	*/
+
+/*-
+ * Copyright (c) 1992, 1993
+ *	The Regents of the University of California.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *	  notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *	  notice, this list of conditions and the following disclaimer in the
+ *	  documentation and/or other materials provided with the distribution.
+ * 3. Neither the name of the University nor the names of its contributors
+ *	  may be used to endorse or promote products derived from this software
+ *	  without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED.	IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ */
+
+/*
+ * Qsort routine based on J. L. Bentley and M. D. McIlroy,
+ * "Engineering a sort function",
+ * Software--Practice and Experience 23 (1993) 1249-1265.
+ * We have modified their original by adding a check for already-sorted input,
+ * which seems to be a win per discussions on pgsql-hackers around 2006-03-21.
+ */
+
+static void
+swapfunc(SortTuple *a, SortTuple *b, size_t n)
+{
+	do
+	{
+		SortTuple 	t = *a;
+		*a++ = *b;
+		*b++ = t;
+	} while (--n > 0);
+}
+
+#define swap(a, b)						\
+	do { 								\
+		SortTuple t = *(a);				\
+		*(a) = *(b);					\
+		*(b) = t;						\
+	} while (0)
+
+#define vecswap(a, b, n) if ((n) > 0) swapfunc((a), (b), (size_t)(n))
+EOM
+}
+
+sub emit_qsort_implementation
+{
+	print <<EOM;
+static SortTuple *
+med3_$SUFFIX(SortTuple *a, SortTuple *b, SortTuple *c$EXTRAARGS)
+{
+	return cmp_$SUFFIX(a, b$CMPPARAMS) < 0 ?
+		(cmp_$SUFFIX(b, c$CMPPARAMS) < 0 ? b :
+			(cmp_$SUFFIX(a, c$CMPPARAMS) < 0 ? c : a))
+		: (cmp_$SUFFIX(b, c$CMPPARAMS) > 0 ? b :
+			(cmp_$SUFFIX(a, c$CMPPARAMS) < 0 ? a : c));
+}
+
+static void
+qsort_$SUFFIX(SortTuple *a, size_t n$EXTRAARGS)
+{
+	SortTuple  *pa,
+			   *pb,
+			   *pc,
+			   *pd,
+			   *pl,
+			   *pm,
+			   *pn;
+	int			d,
+				r,
+				presorted;
+
+loop:
+	CHECK_FOR_INTERRUPTS();
+	if (n < 7)
+	{
+		for (pm = a + 1; pm < a + n; pm++)
+			for (pl = pm; pl > a && cmp_$SUFFIX(pl - 1, pl$CMPPARAMS) > 0; pl--)
+				swap(pl, pl - 1);
+		return;
+	}
+	presorted = 1;
+	for (pm = a + 1; pm < a + n; pm++)
+	{
+		CHECK_FOR_INTERRUPTS();
+		if (cmp_$SUFFIX(pm - 1, pm$CMPPARAMS) > 0)
+		{
+			presorted = 0;
+			break;
+		}
+	}
+	if (presorted)
+		return;
+	pm = a + (n / 2);
+	if (n > 7)
+	{
+		pl = a;
+		pn = a + (n - 1);
+		if (n > 40)
+		{
+			d = (n / 8);
+			pl = med3_$SUFFIX(pl, pl + d, pl + 2 * d$EXTRAPARAMS);
+			pm = med3_$SUFFIX(pm - d, pm, pm + d$EXTRAPARAMS);
+			pn = med3_$SUFFIX(pn - 2 * d, pn - d, pn$EXTRAPARAMS);
+		}
+		pm = med3_$SUFFIX(pl, pm, pn$EXTRAPARAMS);
+	}
+	swap(a, pm);
+	pa = pb = a + 1;
+	pc = pd = a + (n - 1);
+	for (;;)
+	{
+		while (pb <= pc && (r = cmp_$SUFFIX(pb, a$CMPPARAMS)) <= 0)
+		{
+			CHECK_FOR_INTERRUPTS();
+			if (r == 0)
+			{
+				swap(pa, pb);
+				pa++;
+			}
+			pb++;
+		}
+		while (pb <= pc && (r = cmp_$SUFFIX(pc, a$CMPPARAMS)) >= 0)
+		{
+			CHECK_FOR_INTERRUPTS();
+			if (r == 0)
+			{
+				swap(pc, pd);
+				pd--;
+			}
+			pc--;
+		}
+		if (pb > pc)
+			break;
+		swap(pb, pc);
+		pb++;
+		pc--;
+	}
+	pn = a + n;
+	r = Min(pa - a, pb - pa);
+	vecswap(a, pb - r, r);
+	r = Min(pd - pc, pn - pd - 1);
+	vecswap(pb, pn - r, r);
+	if ((r = pb - pa) > 1)
+		qsort_$SUFFIX(a, r$EXTRAPARAMS);
+	if ((r = pd - pc) > 1)
+	{
+		/* Iterate rather than recurse to save stack space */
+		a = pn - r;
+		n = r;
+		goto loop;
+	}
+/*		qsort_$SUFFIX(pn - r, r$EXTRAPARAMS);*/
+}
+
+EOM
+}
diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c
index 1452e8c7cfc..10b19c0b21f 100644
--- a/src/backend/utils/sort/tuplesort.c
+++ b/src/backend/utils/sort/tuplesort.c
@@ -195,6 +195,9 @@ typedef enum
 #define TAPE_BUFFER_OVERHEAD		(BLCKSZ * 3)
 #define MERGE_BUFFER_SIZE			(BLCKSZ * 32)
 
+typedef int	(*SortTupleComparator) (const SortTuple *a, const SortTuple *b,
+	Tuplesortstate *state);
+
 /*
  * Private state of a Tuplesort operation.
  */
@@ -223,8 +226,7 @@ struct Tuplesortstate
 	 * <0, 0, >0 according as a<b, a=b, a>b.  The API must match
 	 * qsort_arg_comparator.
 	 */
-	int			(*comparetup) (const SortTuple *a, const SortTuple *b,
-										   Tuplesortstate *state);
+	SortTupleComparator	comparetup;
 
 	/*
 	 * Function to copy a supplied input tuple into palloc'd space and set up
@@ -363,12 +365,14 @@ struct Tuplesortstate
 	/* These are specific to the index_hash subcase: */
 	uint32		hash_mask;		/* mask for sortable part of hash code */
 
+	/* This is initialized when, and only when, there's just one key. */
+	SortSupport	onlyKey;
+
 	/*
 	 * These variables are specific to the Datum case; they are set by
 	 * tuplesort_begin_datum and used only by the DatumTuple routines.
 	 */
 	Oid			datumType;
-	SortSupport	datumKey;
 	/* we need typelen and byval in order to know how to copy the Datums. */
 	int			datumTypeLen;
 	bool		datumTypeByVal;
@@ -492,6 +496,11 @@ static void readtup_datum(Tuplesortstate *state, SortTuple *stup,
 static void reversedirection_datum(Tuplesortstate *state);
 static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup);
 
+/*
+ * Special version of qsort, just for SortTuple objects.
+ */
+#include "qsort_tuple.c"
+
 
 /*
  *		tuplesort_begin_xxx
@@ -631,6 +640,9 @@ tuplesort_begin_heap(TupleDesc tupDesc,
 		PrepareSortSupportFromOrderingOp(sortOperators[i], sortKey);
 	}
 
+	if (nkeys == 1)
+		state->onlyKey = state->sortKeys;
+
 	MemoryContextSwitchTo(oldcontext);
 
 	return state;
@@ -809,13 +821,13 @@ tuplesort_begin_datum(Oid datumType, Oid sortOperator, Oid sortCollation,
 	state->datumType = datumType;
 
 	/* Prepare SortSupport data */
-	state->datumKey = (SortSupport) palloc0(sizeof(SortSupportData));
+	state->onlyKey = (SortSupport) palloc0(sizeof(SortSupportData));
 
-	state->datumKey->ssup_cxt = CurrentMemoryContext;
-	state->datumKey->ssup_collation = sortCollation;
-	state->datumKey->ssup_nulls_first = nullsFirstFlag;
+	state->onlyKey->ssup_cxt = CurrentMemoryContext;
+	state->onlyKey->ssup_collation = sortCollation;
+	state->onlyKey->ssup_nulls_first = nullsFirstFlag;
 
-	PrepareSortSupportFromOrderingOp(sortOperator, state->datumKey);
+	PrepareSortSupportFromOrderingOp(sortOperator, state->onlyKey);
 
 	/* lookup necessary attributes of the datum type */
 	get_typlenbyval(datumType, &typlen, &typbyval);
@@ -1222,11 +1234,16 @@ tuplesort_performsort(Tuplesortstate *state)
 			 * amount of memory.  Just qsort 'em and we're done.
 			 */
 			if (state->memtupcount > 1)
-				qsort_arg((void *) state->memtuples,
-						  state->memtupcount,
-						  sizeof(SortTuple),
-						  (qsort_arg_comparator) state->comparetup,
-						  (void *) state);
+			{
+				if (state->onlyKey != NULL)
+					qsort_ssup(state->memtuples, state->memtupcount,
+							   state->onlyKey);
+				else
+					qsort_tuple(state->memtuples,
+								state->memtupcount,
+								state->comparetup,
+								state);
+			}
 			state->current = 0;
 			state->eof_reached = false;
 			state->markpos_offset = 0;
@@ -2660,9 +2677,6 @@ comparetup_heap(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
 	int			nkey;
 	int32		compare;
 
-	/* Allow interrupting long sorts */
-	CHECK_FOR_INTERRUPTS();
-
 	/* Compare the leading sort key */
 	compare = ApplySortComparator(a->datum1, a->isnull1,
 								  b->datum1, b->isnull1,
@@ -2804,9 +2818,6 @@ comparetup_cluster(const SortTuple *a, const SortTuple *b,
 	int			nkey;
 	int32		compare;
 
-	/* Allow interrupting long sorts */
-	CHECK_FOR_INTERRUPTS();
-
 	/* Compare the leading sort key, if it's simple */
 	if (state->indexInfo->ii_KeyAttrNumbers[0] != 0)
 	{
@@ -2995,9 +3006,6 @@ comparetup_index_btree(const SortTuple *a, const SortTuple *b,
 	int			nkey;
 	int32		compare;
 
-	/* Allow interrupting long sorts */
-	CHECK_FOR_INTERRUPTS();
-
 	/* Compare the leading sort key */
 	compare = inlineApplySortFunction(&scanKey->sk_func, scanKey->sk_flags,
 									  scanKey->sk_collation,
@@ -3102,9 +3110,6 @@ comparetup_index_hash(const SortTuple *a, const SortTuple *b,
 	IndexTuple	tuple1;
 	IndexTuple	tuple2;
 
-	/* Allow interrupting long sorts */
-	CHECK_FOR_INTERRUPTS();
-
 	/*
 	 * Fetch hash keys and mask off bits we don't want to sort by. We know
 	 * that the first column of the index tuple is the hash key.
@@ -3231,12 +3236,9 @@ reversedirection_index_hash(Tuplesortstate *state)
 static int
 comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state)
 {
-	/* Allow interrupting long sorts */
-	CHECK_FOR_INTERRUPTS();
-
-	return ApplySortComparator(a->datum1, a->isnull1,
-							   b->datum1, b->isnull1,
-							   state->datumKey);
+	/* Compare on the single datum key, the same test cmp_ssup inlines */
+	return ApplySortComparator(a->datum1, a->isnull1,
+							   b->datum1, b->isnull1, state->onlyKey);
 }
 
 static void
@@ -3328,8 +3330,8 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup,
 static void
 reversedirection_datum(Tuplesortstate *state)
 {
-	state->datumKey->ssup_reverse = !state->datumKey->ssup_reverse;
-	state->datumKey->ssup_nulls_first = !state->datumKey->ssup_nulls_first;
+	state->onlyKey->ssup_reverse = !state->onlyKey->ssup_reverse;
+	state->onlyKey->ssup_nulls_first = !state->onlyKey->ssup_nulls_first;
 }
 
 /*
diff --git a/src/port/qsort.c b/src/port/qsort.c
index 8e2c6d92c2d..49d8fa7ab6c 100644
--- a/src/port/qsort.c
+++ b/src/port/qsort.c
@@ -7,7 +7,7 @@
  *	  Remove ill-considered "swap_cnt" switch to insertion sort,
  *	  in favor of a simple check for presorted input.
  *
- *	CAUTION: if you change this file, see also qsort_arg.c
+ *	CAUTION: if you change this file, see also qsort_arg.c, gen_qsort_tuple.pl
  *
  *	src/port/qsort.c
  */
diff --git a/src/port/qsort_arg.c b/src/port/qsort_arg.c
index 28d1894992b..3091eb09ead 100644
--- a/src/port/qsort_arg.c
+++ b/src/port/qsort_arg.c
@@ -7,7 +7,7 @@
  *	  Remove ill-considered "swap_cnt" switch to insertion sort,
  *	  in favor of a simple check for presorted input.
  *
- *	CAUTION: if you change this file, see also qsort.c
+ *	CAUTION: if you change this file, see also qsort.c, gen_qsort_tuple.pl
  *
  *	src/port/qsort_arg.c
  */
diff --git a/src/tools/msvc/Solution.pm b/src/tools/msvc/Solution.pm
index 1725fbb5330..e1d85c85ad0 100644
--- a/src/tools/msvc/Solution.pm
+++ b/src/tools/msvc/Solution.pm
@@ -287,6 +287,14 @@ s{PG_VERSION_STR "[^"]+"}{__STRINGIFY(x) #x\n#define __STRINGIFY2(z) __STRINGIFY
         );
     }
 
+    if (IsNewer('src\backend\utils\sort\qsort_tuple.c','src\backend\utils\sort\gen_qsort_tuple.pl'))
+    {
+        print "Generating qsort_tuple.c...\n";
+        system(
+'perl src\backend\utils\sort\gen_qsort_tuple.pl > src\backend\utils\sort\qsort_tuple.c'
+        );
+    }
+
     if (IsNewer('src\interfaces\libpq\libpq.rc','src\interfaces\libpq\libpq.rc.in'))
     {
         print "Generating libpq.rc...\n";
-- 
GitLab