From 26c48b5e8cffafaf3b8acf345ca9fd8a1e408a54 Mon Sep 17 00:00:00 2001 From: Tom Lane <tgl@sss.pgh.pa.us> Date: Sun, 17 Oct 1999 22:15:09 +0000 Subject: [PATCH] Final stage of psort reconstruction work: replace psort.c with a generalized module 'tuplesort.c' that can sort either HeapTuples or IndexTuples, and is not tied to execution of a Sort node. Clean up memory leakages in sorting, and replace nbtsort.c's private implementation of mergesorting with calls to tuplesort.c. --- src/backend/access/nbtree/nbtree.c | 30 +- src/backend/access/nbtree/nbtsort.c | 1000 ++---------------- src/backend/executor/nodeSort.c | 153 +-- src/backend/utils/sort/Makefile | 4 +- src/backend/utils/sort/logtape.c | 8 +- src/backend/utils/sort/tuplesort.c | 1465 +++++++++++++++++++++++++++ src/include/access/nbtree.h | 13 +- src/include/nodes/execnodes.h | 20 +- src/include/nodes/plannodes.h | 4 +- src/include/utils/tuplesort.h | 68 ++ 10 files changed, 1747 insertions(+), 1018 deletions(-) create mode 100644 src/backend/utils/sort/tuplesort.c create mode 100644 src/include/utils/tuplesort.h diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 11f527fc3ba..d8d835f424b 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -1,17 +1,17 @@ /*------------------------------------------------------------------------- * - * btree.c + * nbtree.c * Implementation of Lehman and Yao's btree management algorithm for * Postgres. * - * Copyright (c) 1994, Regents of the University of California + * NOTES + * This file contains only the public interface routines. * * - * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.46 1999/09/18 19:06:10 tgl Exp $ + * Copyright (c) 1994, Regents of the University of California * - * NOTES - * This file contains only the public interface routines. + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.47 1999/10/17 22:15:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -76,7 +76,7 @@ btbuild(Relation heap, #endif Node *pred, *oldPred; - void *spool = (void *) NULL; + BTSpool *spool = NULL; bool isunique; bool usefast; @@ -147,7 +147,7 @@ btbuild(Relation heap, if (usefast) { - spool = _bt_spoolinit(index, 7, isunique); + spool = _bt_spoolinit(index, isunique); res = (InsertIndexResult) NULL; } @@ -249,11 +249,11 @@ btbuild(Relation heap, /* * if we are doing bottom-up btree build, we insert the index into - * a spool page for subsequent processing. otherwise, we insert + * a spool file for subsequent processing. otherwise, we insert * into the btree. */ if (usefast) - _bt_spool(index, btitem, spool); + _bt_spool(btitem, spool); else res = _bt_doinsert(index, btitem, isunique, heap); @@ -275,15 +275,13 @@ btbuild(Relation heap, } /* - * if we are doing bottom-up btree build, we now have a bunch of - * sorted runs in the spool pages. finish the build by (1) merging - * the runs, (2) inserting the sorted tuples into btree pages and (3) - * building the upper levels. + * if we are doing bottom-up btree build, finish the build by + * (1) completing the sort of the spool file, (2) inserting the + * sorted tuples into btree pages and (3) building the upper levels. */ if (usefast) { - _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */ - _bt_leafbuild(index, spool); + _bt_leafbuild(spool); _bt_spooldestroy(spool); } diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index c1564544b03..48386c113f0 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -1,68 +1,47 @@ /*------------------------------------------------------------------------- - * btsort.c - * - * Copyright (c) 1994, Regents of the University of California - * - * - * IDENTIFICATION - * $Id: nbtsort.c,v 1.46 1999/07/19 07:07:19 momjian Exp $ + * nbtsort.c + * Build a btree from sorted input by loading leaf pages sequentially. * * NOTES * - * what we do is: - * - generate a set of initial one-block runs, distributed round-robin - * between the output tapes. - * - for each pass, - * - swap input and output tape sets, rewinding both and truncating - * the output tapes. - * - merge the current run in each input tape to the current output - * tape. - * - when each input run has been exhausted, switch to another output - * tape and start processing another run. - * - when we have fewer runs than tapes, we know we are ready to start - * merging into the btree leaf pages. (i.e., we do not have to wait - * until we have exactly one tape.) - * - as we extract tuples from the final runs, we build the pages for - * each level. when we have only one page on a level, it must be the - * root -- it can be attached to the btree metapage and we are done. - * - * conventions: - * - external interface routines take in and return "void *" for their - * opaque handles. this is for modularity reasons. + * We use tuplesort.c to sort the given index tuples into order. + * Then we scan the index tuples in order and build the btree pages + * for each level. When we have only one page on a level, it must be the + * root -- it can be attached to the btree metapage and we are done. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on * random data, however, the insertion build code is unusable -- the * difference on a 60MB heap is a factor of 15 because the random - * probes into the btree thrash the buffer pool. + * probes into the btree thrash the buffer pool. (NOTE: the above + * "10%" estimate is probably obsolete, since it refers to an old and + * not very good external sort implementation that used to exist in + * this module. tuplesort.c is almost certainly faster.) * * this code currently packs the pages to 100% of capacity. this is * not wise, since *any* insertion will cause splitting. filling to * something like the standard 70% steady-state load factor for btrees * would probably be better. * - * somebody desperately needs to figure out how to do a better job of - * balancing the merge passes -- the fan-in on the final merges can be - * pretty poor, which is bad for performance. + * + * Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtsort.c,v 1.47 1999/10/17 22:15:04 tgl Exp $ + * *------------------------------------------------------------------------- */ -#include <fcntl.h> - #include "postgres.h" #include "access/nbtree.h" +#include "utils/tuplesort.h" #ifdef BTREE_BUILD_STATS #define ShowExecutorStats pg_options[TRACE_EXECUTORSTATS] #endif -static BTItem _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags); -static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); -static void *_bt_pagestate(Relation index, int flags, int level, bool doupper); -static void _bt_uppershutdown(Relation index, BTPageState *state); - /* * turn on debugging output. * @@ -70,689 +49,108 @@ static void _bt_uppershutdown(Relation index, BTPageState *state); * only really useful for integer keys. */ /*#define FASTBUILD_DEBUG*/ -#define FASTBUILD_SPOOL -#define FASTBUILD_MERGE - -#define MAXTAPES (7) -#define TAPEBLCKSZ (BLCKSZ << 2) - -extern int NDirectFileRead; -extern int NDirectFileWrite; - -/* - * this is what we use to shovel BTItems in and out of memory. it's - * bigger than a standard block because we are doing a lot of strictly - * sequential i/o. this is obviously something of a tradeoff since we - * are potentially reading a bunch of zeroes off of disk in many - * cases. - * - * BTItems are packed in and MAXALIGN'd. - * - * the fd should not be going out to disk, strictly speaking, but it's - * the only thing like that so i'm not going to worry about wasting a - * few bytes. - */ -typedef struct -{ - int bttb_magic; /* magic number */ - File bttb_fd; /* file descriptor */ - int bttb_top; /* top of free space within bttb_data */ - short bttb_ntup; /* number of tuples in this block */ - short bttb_eor; /* End-Of-Run marker */ - char bttb_data[TAPEBLCKSZ - 2 * sizeof(double)]; -} BTTapeBlock; /* - * this structure holds the bookkeeping for a simple balanced multiway - * merge. (polyphase merging is hairier than i want to get into right - * now, and i don't see why i have to care how many "tapes" i use - * right now. though if psort was in a condition that i could hack it - * to do this, you bet i would.) + * Status record for spooling. */ -typedef struct +struct BTSpool { - int bts_ntapes; - int bts_tape; - BTTapeBlock **bts_itape; /* input tape blocks */ - BTTapeBlock **bts_otape; /* output tape blocks */ + Tuplesortstate *sortstate; /* state data for tuplesort.c */ + Relation index; bool isunique; -} BTSpool; - -/*------------------------------------------------------------------------- - * sorting comparison routine - returns {-1,0,1} depending on whether - * the key in the left BTItem is {<,=,>} the key in the right BTItem. - * - * we want to use _bt_isortcmp as a comparison function for qsort(3), - * but it needs extra arguments, so we "pass them in" as global - * variables. ick. fortunately, they are the same throughout the - * build, so we need do this only once. this is why you must call - * _bt_isortcmpinit before the call to qsort(3). - * - * a NULL BTItem is always assumed to be greater than any actual - * value; our heap routines (see below) assume that the smallest - * element in the heap is returned. that way, NULL values from the - * exhausted tapes can sift down to the bottom of the heap. in point - * of fact we just don't replace the elements of exhausted tapes, but - * what the heck. - * *------------------------------------------------------------------------- - */ -typedef struct -{ - Datum *btsk_datum; - char *btsk_nulls; - BTItem btsk_item; -} BTSortKey; - -static Relation _bt_sortrel; -static int _bt_nattr; -static BTSpool *_bt_inspool; - -static void -_bt_isortcmpinit(Relation index, BTSpool *spool) -{ - _bt_sortrel = index; - _bt_inspool = spool; - _bt_nattr = index->rd_att->natts; -} - -static int -_bt_isortcmp(BTSortKey *k1, BTSortKey *k2) -{ - Datum *k1_datum = k1->btsk_datum; - Datum *k2_datum = k2->btsk_datum; - char *k1_nulls = k1->btsk_nulls; - char *k2_nulls = k2->btsk_nulls; - bool equal_isnull = false; - int i; - - if (k1->btsk_item == (BTItem) NULL) - { - if (k2->btsk_item == (BTItem) NULL) - return 0; /* 1 = 2 */ - return 1; /* 1 > 2 */ - } - else if (k2->btsk_item == (BTItem) NULL) - return -1; /* 1 < 2 */ - - for (i = 0; i < _bt_nattr; i++) - { - if (k1_nulls[i] != ' ') /* k1 attr is NULL */ - { - if (k2_nulls[i] != ' ') /* the same for k2 */ - { - equal_isnull = true; - continue; - } - return 1; /* NULL ">" NOT_NULL */ - } - else if (k2_nulls[i] != ' ') /* k2 attr is NULL */ - return -1; /* NOT_NULL "<" NULL */ - - if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, - k1_datum[i], k2_datum[i])) - return 1; /* 1 > 2 */ - else if (_bt_invokestrat(_bt_sortrel, i + 1, BTGreaterStrategyNumber, - k2_datum[i], k1_datum[i])) - return -1; /* 1 < 2 */ - } - - if (_bt_inspool->isunique && !equal_isnull) - { - _bt_spooldestroy((void *) _bt_inspool); - elog(ERROR, "Cannot create unique index. Table contains non-unique values"); - } - return 0; /* 1 = 2 */ -} - -static void -_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk) -{ - sk->btsk_item = (BTItem) NULL; - sk->btsk_datum = (Datum *) NULL; - sk->btsk_nulls = (char *) NULL; - - if (bti != (BTItem) NULL) - { - IndexTuple it = &(bti->bti_itup); - TupleDesc itdesc = index->rd_att; - Datum *dp = (Datum *) palloc(_bt_nattr * sizeof(Datum)); - char *np = (char *) palloc(_bt_nattr * sizeof(char)); - bool isnull; - int i; - - for (i = 0; i < _bt_nattr; i++) - { - dp[i] = index_getattr(it, i + 1, itdesc, &isnull); - if (isnull) - np[i] = 'n'; - else - np[i] = ' '; - } - sk->btsk_item = bti; - sk->btsk_datum = dp; - sk->btsk_nulls = np; - } -} - -/*------------------------------------------------------------------------- - * priority queue methods - * - * these were more-or-less lifted from the heap section of the 1984 - * edition of gonnet's book on algorithms and data structures. they - * are coded so that the smallest element in the heap is returned (we - * use them for merging sorted runs). - * - * XXX these probably ought to be generic library functions. - *------------------------------------------------------------------------- - */ -typedef struct -{ - int btpqe_tape; /* tape identifier */ - BTSortKey btpqe_item; /* pointer to BTItem in tape buffer */ -} BTPriQueueElem; - -#define MAXELEM MAXTAPES -typedef struct -{ - int btpq_nelem; - BTPriQueueElem btpq_queue[MAXELEM]; - Relation btpq_rel; -} BTPriQueue; - -/* be sure to call _bt_isortcmpinit first */ -#define GREATER(a, b) \ - (_bt_isortcmp(&((a)->btpqe_item), &((b)->btpqe_item)) > 0) - -static void -_bt_pqsift(BTPriQueue *q, int parent) -{ - int child; - BTPriQueueElem e; - - for (child = parent * 2 + 1; - child < q->btpq_nelem; - child = parent * 2 + 1) - { - if (child < q->btpq_nelem - 1) - { - if (GREATER(&(q->btpq_queue[child]), &(q->btpq_queue[child + 1]))) - ++child; - } - if (GREATER(&(q->btpq_queue[parent]), &(q->btpq_queue[child]))) - { - e = q->btpq_queue[child]; /* struct = */ - q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ - q->btpq_queue[parent] = e; /* struct = */ - parent = child; - } - else - parent = child + 1; - } -} - -static int -_bt_pqnext(BTPriQueue *q, BTPriQueueElem *e) -{ - if (q->btpq_nelem < 1) - { /* already empty */ - return -1; - } - *e = q->btpq_queue[0]; /* struct = */ - - if (--q->btpq_nelem < 1) - { /* now empty, don't sift */ - return 0; - } - q->btpq_queue[0] = q->btpq_queue[q->btpq_nelem]; /* struct = */ - _bt_pqsift(q, 0); - return 0; -} - -static void -_bt_pqadd(BTPriQueue *q, BTPriQueueElem *e) -{ - int child, - parent; - - if (q->btpq_nelem >= MAXELEM) - elog(ERROR, "_bt_pqadd: queue overflow"); - - child = q->btpq_nelem++; - while (child > 0) - { - parent = child / 2; - if (GREATER(e, &(q->btpq_queue[parent]))) - break; - else - { - q->btpq_queue[child] = q->btpq_queue[parent]; /* struct = */ - child = parent; - } - } - - q->btpq_queue[child] = *e; /* struct = */ -} - -/*------------------------------------------------------------------------- - * tape methods - *------------------------------------------------------------------------- - */ +}; #define BTITEMSZ(btitem) \ ((btitem) ? \ (IndexTupleDSize((btitem)->bti_itup) + \ (sizeof(BTItemData) - sizeof(IndexTupleData))) : \ 0) -#define SPCLEFT(tape) \ - (sizeof((tape)->bttb_data) - (tape)->bttb_top) -#define EMPTYTAPE(tape) \ - ((tape)->bttb_ntup <= 0) -#define BTTAPEMAGIC 0x19660226 - -/* - * reset the tape header for its next use without doing anything to - * the physical tape file. (setting bttb_top to 0 makes the block - * empty.) - */ -static void -_bt_tapereset(BTTapeBlock *tape) -{ - tape->bttb_eor = 0; - tape->bttb_top = 0; - tape->bttb_ntup = 0; -} - -/* - * rewind the physical tape file. - */ -static void -_bt_taperewind(BTTapeBlock *tape) -{ - FileSeek(tape->bttb_fd, 0L, SEEK_SET); -} - -/* - * destroy the contents of the physical tape file without destroying - * the tape data structure or removing the physical tape file. - * - * we use the VFD version of ftruncate(2) to do this rather than - * unlinking and recreating the file. you still have to wait while - * the OS frees up all of the file system blocks and stuff, but at - * least you don't have to delete and reinsert the directory entries. - */ -static void -_bt_tapeclear(BTTapeBlock *tape) -{ - /* blow away the contents of the old file */ - _bt_taperewind(tape); -#ifdef NOT_USED - FileSync(tape->bttb_fd); -#endif - FileTruncate(tape->bttb_fd, 0); - - /* reset the buffer */ - _bt_tapereset(tape); -} - -/* - * create a new BTTapeBlock, allocating memory for the data structure - * as well as opening a physical tape file. - */ -static BTTapeBlock * -_bt_tapecreate(void) -{ - BTTapeBlock *tape = (BTTapeBlock *) palloc(sizeof(BTTapeBlock)); - - if (tape == (BTTapeBlock *) NULL) - elog(ERROR, "_bt_tapecreate: out of memory"); - - tape->bttb_magic = BTTAPEMAGIC; - - tape->bttb_fd = OpenTemporaryFile(); - Assert(tape->bttb_fd >= 0); - - /* initialize the buffer */ - _bt_tapereset(tape); - - return tape; -} - -/* - * destroy the BTTapeBlock structure and its physical tape file. - */ -static void -_bt_tapedestroy(BTTapeBlock *tape) -{ - FileUnlink(tape->bttb_fd); - pfree((void *) tape); -} - -/* - * flush the tape block to the file, marking End-Of-Run if requested. - */ -static void -_bt_tapewrite(BTTapeBlock *tape, int eor) -{ - tape->bttb_eor = eor; - FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); - NDirectFileWrite += TAPEBLCKSZ / BLCKSZ; - _bt_tapereset(tape); -} - -/* - * read a tape block from the file, overwriting the current contents - * of the buffer. - * - * returns: - * - 0 if there are no more blocks in the tape or in this run (call - * _bt_tapereset to clear the End-Of-Run marker) - * - 1 if a valid block was read - */ -static int -_bt_taperead(BTTapeBlock *tape) -{ - File fd; - int nread; - - if (tape->bttb_eor) - { - return 0; /* we are already at End-Of-Run */ - } - - /* - * we're clobbering the old tape block, but we do need to save the VFD - * (the one in the block we're reading is bogus). - */ - fd = tape->bttb_fd; - nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); - tape->bttb_fd = fd; - if (nread != TAPEBLCKSZ) - { - Assert(nread == 0); /* we are at EOF */ - return 0; - } - Assert(tape->bttb_magic == BTTAPEMAGIC); - NDirectFileRead += TAPEBLCKSZ / BLCKSZ; - return 1; -} -/* - * get the next BTItem from a tape block. - * - * returns: - * - NULL if we have run out of BTItems - * - a pointer to the BTItemData in the block otherwise - * - * side effects: - * - sets 'pos' to the current position within the block. - */ -static BTItem -_bt_tapenext(BTTapeBlock *tape, char **pos) -{ - Size itemsz; - BTItem bti; +static void _bt_load(Relation index, BTSpool *btspool); +static BTItem _bt_buildadd(Relation index, BTPageState *state, BTItem bti, + int flags); +static BTItem _bt_minitem(Page opage, BlockNumber oblkno, int atend); +static BTPageState *_bt_pagestate(Relation index, int flags, + int level, bool doupper); +static void _bt_uppershutdown(Relation index, BTPageState *state); - if (*pos >= tape->bttb_data + tape->bttb_top) - return (BTItem) NULL; - bti = (BTItem) *pos; - itemsz = BTITEMSZ(bti); - *pos += MAXALIGN(itemsz); - return bti; -} /* - * copy a BTItem into a tape block. - * - * assumes that we have already checked to see if the block has enough - * space for the item. - * - * side effects: - * - * - advances the 'top' pointer in the tape block header to point to - * the beginning of free space. + * Interface routines */ -static void -_bt_tapeadd(BTTapeBlock *tape, BTItem item, int itemsz) -{ - memcpy(tape->bttb_data + tape->bttb_top, item, itemsz); - ++tape->bttb_ntup; - tape->bttb_top += MAXALIGN(itemsz); -} -/*------------------------------------------------------------------------- - * spool methods - *------------------------------------------------------------------------- - */ /* - * create and initialize a spool structure, including the underlying - * files. + * create and initialize a spool structure */ -void * -_bt_spoolinit(Relation index, int ntapes, bool isunique) +BTSpool * +_bt_spoolinit(Relation index, bool isunique) { BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); - int i; - if (btspool == (BTSpool *) NULL) - elog(ERROR, "_bt_spoolinit: out of memory"); MemSet((char *) btspool, 0, sizeof(BTSpool)); - btspool->bts_ntapes = ntapes; - btspool->bts_tape = 0; - btspool->isunique = isunique; - btspool->bts_itape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); - btspool->bts_otape = (BTTapeBlock **) palloc(sizeof(BTTapeBlock *) * ntapes); - if (btspool->bts_itape == (BTTapeBlock **) NULL || - btspool->bts_otape == (BTTapeBlock **) NULL) - elog(ERROR, "_bt_spoolinit: out of memory"); + btspool->index = index; + btspool->isunique = isunique; - for (i = 0; i < ntapes; ++i) - { - btspool->bts_itape[i] = _bt_tapecreate(); - btspool->bts_otape[i] = _bt_tapecreate(); - } + btspool->sortstate = tuplesort_begin_index(index, isunique, false); - _bt_isortcmpinit(index, btspool); + /* + * Currently, tuplesort provides sort functions on IndexTuples. + * If we kept anything in a BTItem other than a regular IndexTuple, + * we'd need to modify tuplesort to understand BTItems as such. + */ + Assert(sizeof(BTItemData) == sizeof(IndexTupleData)); - return (void *) btspool; + return btspool; } /* * clean up a spool structure and its substructures. */ void -_bt_spooldestroy(void *spool) +_bt_spooldestroy(BTSpool *btspool) { - BTSpool *btspool = (BTSpool *) spool; - int i; - - for (i = 0; i < btspool->bts_ntapes; ++i) - { - _bt_tapedestroy(btspool->bts_otape[i]); - _bt_tapedestroy(btspool->bts_itape[i]); - } + tuplesort_end(btspool->sortstate); pfree((void *) btspool); } /* - * flush out any dirty output tape blocks + * spool a btitem into the sort file. */ -static void -_bt_spoolflush(BTSpool *btspool) +void +_bt_spool(BTItem btitem, BTSpool *btspool) { - int i; - - for (i = 0; i < btspool->bts_ntapes; ++i) - { - if (!EMPTYTAPE(btspool->bts_otape[i])) - _bt_tapewrite(btspool->bts_otape[i], 1); - } + /* A BTItem is really just an IndexTuple */ + tuplesort_puttuple(btspool->sortstate, (void *) btitem); } /* - * swap input tapes and output tapes by swapping their file - * descriptors. additional preparation for the next merge pass - * includes rewinding the new input tapes and clearing out the new - * output tapes. + * given a spool loaded by successive calls to _bt_spool, + * create an entire btree. */ -static void -_bt_spoolswap(BTSpool *btspool) +void +_bt_leafbuild(BTSpool *btspool) { - File tmpfd; - BTTapeBlock *itape; - BTTapeBlock *otape; - int i; - - for (i = 0; i < btspool->bts_ntapes; ++i) +#ifdef BTREE_BUILD_STATS + if (ShowExecutorStats) { - itape = btspool->bts_itape[i]; - otape = btspool->bts_otape[i]; - - /* - * swap the input and output VFDs. - */ - tmpfd = itape->bttb_fd; - itape->bttb_fd = otape->bttb_fd; - otape->bttb_fd = tmpfd; - - /* - * rewind the new input tape. - */ - _bt_taperewind(itape); - _bt_tapereset(itape); - - /* - * clear the new output tape -- it's ok to throw away the old - * inputs. - */ - _bt_tapeclear(otape); + fprintf(stderr, "! BtreeBuild (Spool) Stats:\n"); + ShowUsage(); + ResetUsage(); } +#endif + tuplesort_performsort(btspool->sortstate); + + _bt_load(btspool->index, btspool); } -/*------------------------------------------------------------------------- - * sorting routines - *------------------------------------------------------------------------- - */ /* - * spool 'btitem' into an initial run. as tape blocks are filled, the - * block BTItems are qsorted and written into some output tape (it - * doesn't matter which; we go round-robin for simplicity). the - * initial runs are therefore always just one block. + * Internal routines. */ -void -_bt_spool(Relation index, BTItem btitem, void *spool) -{ - BTSpool *btspool = (BTSpool *) spool; - BTTapeBlock *itape; - Size itemsz; - - _bt_isortcmpinit(index, btspool); - itape = btspool->bts_itape[btspool->bts_tape]; - itemsz = BTITEMSZ(btitem); - itemsz = MAXALIGN(itemsz); - - /* - * if this buffer is too full for this BTItemData, or if we have run - * out of BTItems, we need to sort the buffer and write it out. in - * this case, the BTItemData will go into the next tape's buffer. - */ - if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) - { - BTSortKey *parray = (BTSortKey *) NULL; - BTTapeBlock *otape; - BTItem bti; - char *pos; - int btisz; - int it_ntup = itape->bttb_ntup; - int i; - - /* - * build an array of pointers to the BTItemDatas on the input - * block. - */ - if (it_ntup > 0) - { - parray = (BTSortKey *) palloc(it_ntup * sizeof(BTSortKey)); - pos = itape->bttb_data; - for (i = 0; i < it_ntup; ++i) - _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i])); - - /* - * qsort the pointer array. - */ - qsort((void *) parray, it_ntup, sizeof(BTSortKey), - (int (*) (const void *, const void *)) _bt_isortcmp); - } - - /* - * write the spooled run into the output tape. we copy the - * BTItemDatas in the order dictated by the sorted array of - * BTItems, not the original order. - * - * (since everything was MAXALIGN'd and is all on a single tape - * block, everything had *better* still fit on one tape block..) - */ - otape = btspool->bts_otape[btspool->bts_tape]; - for (i = 0; i < it_ntup; ++i) - { - bti = parray[i].btsk_item; - btisz = BTITEMSZ(bti); - btisz = MAXALIGN(btisz); - _bt_tapeadd(otape, bti, btisz); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, - &isnull); - - printf("_bt_spool: inserted <%x> into output tape %d\n", - d, btspool->bts_tape); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */ - } - - /* - * the initial runs are always single tape blocks. flush the - * output block, marking End-Of-Run. - */ - _bt_tapewrite(otape, 1); - - /* - * reset the input buffer for the next run. we don't have to - * write it out or anything -- we only use it to hold the unsorted - * BTItemDatas, the output tape contains all the sorted stuff. - * - * changing bts_tape changes the output tape and input tape; we - * change itape for the code below. - */ - _bt_tapereset(itape); - btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; - itape = btspool->bts_itape[btspool->bts_tape]; - - /* - * destroy the pointer array. - */ - if (parray != (BTSortKey *) NULL) - { - for (i = 0; i < it_ntup; i++) - { - if (parray[i].btsk_datum != (Datum *) NULL) - pfree((void *) (parray[i].btsk_datum)); - if (parray[i].btsk_nulls != (char *) NULL) - pfree((void *) (parray[i].btsk_nulls)); - } - pfree((void *) parray); - } - } - - /* insert this item into the current buffer */ - if (btitem != (BTItem) NULL) - _bt_tapeadd(itape, btitem, itemsz); -} /* * allocate a new, clean btree page, not linked to any siblings. @@ -805,7 +203,7 @@ _bt_slideleft(Relation index, Buffer buf, Page page) * allocate and initialize a new BTPageState. the returned structure * is suitable for immediate use by _bt_buildadd. */ -static void * +static BTPageState * _bt_pagestate(Relation index, int flags, int level, bool doupper) { BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); @@ -819,7 +217,7 @@ _bt_pagestate(Relation index, int flags, int level, bool doupper) state->btps_level = level; state->btps_doupper = doupper; - return (void *) state; + return state; } /* @@ -883,9 +281,8 @@ _bt_minitem(Page opage, BlockNumber oblkno, int atend) * if all keys are unique, 'first' will always be the same as 'last'. */ static BTItem -_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) +_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) { - BTPageState *state = (BTPageState *) pstate; Buffer nbuf; Page npage; BTItem last_bti; @@ -944,8 +341,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) if (PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len, n, LP_USED) == InvalidOffsetNumber) elog(FATAL, "btree: failed to add item to the page in _bt_sort (1)"); -#ifdef NOT_USED -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) +#ifdef FASTBUILD_DEBUG { bool isnull; BTItem tmpbti = @@ -956,7 +352,6 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", d, n, state->btps_level); } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ #endif } @@ -989,7 +384,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) nopaque->btpo_prev = BufferGetBlockNumber(obuf); nopaque->btpo_next = P_NONE; - if (_bt_itemcmp(index, _bt_nattr, + if (_bt_itemcmp(index, index->rd_att->natts, (BTItem) PageGetItem(opage, PageGetItemId(opage, P_HIKEY)), (BTItem) PageGetItem(opage, PageGetItemId(opage, P_FIRSTKEY)), BTEqualStrategyNumber)) @@ -1030,8 +425,7 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) off = OffsetNumberNext(last_off); if (PageAddItem(npage, (Item) bti, btisz, off, LP_USED) == InvalidOffsetNumber) elog(FATAL, "btree: failed to add item to the page in _bt_sort (2)"); -#ifdef NOT_USED -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) +#ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); @@ -1039,11 +433,10 @@ _bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", d, off, state->btps_level); } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ #endif if (last_bti == (BTItem) NULL) first_off = P_FIRSTKEY; - else if (!_bt_itemcmp(index, _bt_nattr, + else if (!_bt_itemcmp(index, index->rd_att->natts, bti, last_bti, BTEqualStrategyNumber)) first_off = off; last_off = off; @@ -1103,224 +496,31 @@ _bt_uppershutdown(Relation index, BTPageState *state) } /* - * take the input tapes stored by 'btspool' and perform successive - * merging passes until at most one run is left in each tape. at that - * point, merge the final tape runs into a set of btree leaves. - * - * XXX three nested loops? gross. cut me up into smaller routines. + * Read tuples in correct sort order from tuplesort, and load them into + * btree leaves. */ static void -_bt_merge(Relation index, BTSpool *btspool) +_bt_load(Relation index, BTSpool *btspool) { BTPageState *state; - BTPriQueue q; - BTPriQueueElem e; - BTSortKey btsk; BTItem bti; - BTTapeBlock *itape; - BTTapeBlock *otape; - char *tapepos[MAXTAPES]; - int tapedone[MAXTAPES]; - int t; - int goodtapes; - int npass; - int nruns; - Size btisz; - bool doleaf = false; + bool should_free; /* * initialize state needed for the merge into the btree leaf pages. */ - state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true); - - npass = 0; - do - { /* pass */ - - /* - * each pass starts by flushing the previous outputs and swapping - * inputs and outputs. flushing sets End-of-Run for any dirty - * output tapes. swapping clears the new output tapes and rewinds - * the new input tapes. - */ - btspool->bts_tape = btspool->bts_ntapes - 1; - _bt_spoolflush(btspool); - _bt_spoolswap(btspool); - - ++npass; - nruns = 0; - - for (;;) - { /* run */ - - /* - * each run starts by selecting a new output tape. the merged - * results of a given run are always sent to this one tape. - */ - btspool->bts_tape = (btspool->bts_tape + 1) % btspool->bts_ntapes; - otape = btspool->bts_otape[btspool->bts_tape]; - - /* - * initialize the priority queue by loading it with the first - * element of the given run in each tape. since we are - * starting a new run, we reset the tape (clearing the - * End-Of-Run marker) before reading it. this means that - * _bt_taperead will return 0 only if the tape is actually at - * EOF. - */ - MemSet((char *) &q, 0, sizeof(BTPriQueue)); - goodtapes = 0; - for (t = 0; t < btspool->bts_ntapes; ++t) - { - itape = btspool->bts_itape[t]; - tapepos[t] = itape->bttb_data; - tapedone[t] = 0; - _bt_tapereset(itape); - do - { - if (_bt_taperead(itape) == 0) - tapedone[t] = 1; - } while (!tapedone[t] && EMPTYTAPE(itape)); - if (!tapedone[t]) - { - ++goodtapes; - e.btpqe_tape = t; - _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]), - &(e.btpqe_item)); - if (e.btpqe_item.btsk_item != (BTItem) NULL) - _bt_pqadd(&q, &e); - } - } - - /* - * if we don't have any tapes with any input (i.e., they are - * all at EOF), there is no work to do in this run -- we must - * be done with this pass. - */ - if (goodtapes == 0) - { - break; /* for */ - } - ++nruns; - - /* - * output the smallest element from the queue until there are - * no more. - */ - while (_bt_pqnext(&q, &e) >= 0) - { /* item */ - - /* - * replace the element taken from priority queue, fetching - * a new block if needed. a tape can run out if it hits - * either End-Of-Run or EOF. - */ - t = e.btpqe_tape; - btsk = e.btpqe_item; - bti = btsk.btsk_item; - if (bti != (BTItem) NULL) - { - btisz = BTITEMSZ(bti); - btisz = MAXALIGN(btisz); - if (doleaf) - { - _bt_buildadd(index, state, bti, BTP_LEAF); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - index->rd_att, &isnull); - - printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n", - npass, nruns, d, t, - BufferGetBlockNumber(state->btps_buf)); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - } - else - { - if (SPCLEFT(otape) < btisz) - { - - /* - * if it's full, write it out and add the item - * to the next block. (since we will be - * adding another tuple immediately after - * this, we can be sure that there will be at - * least one more block in this run and so we - * know we do *not* want to set End-Of-Run - * here.) - */ - _bt_tapewrite(otape, 0); - } - _bt_tapeadd(otape, bti, btisz); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - index->rd_att, &isnull); - - printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n", - npass, nruns, d, t, - btspool->bts_tape); - } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ - } - - if (btsk.btsk_datum != (Datum *) NULL) - pfree((void *) (btsk.btsk_datum)); - if (btsk.btsk_nulls != (char *) NULL) - pfree((void *) (btsk.btsk_nulls)); - - } - itape = btspool->bts_itape[t]; - if (!tapedone[t]) - { - BTItem newbti = _bt_tapenext(itape, &tapepos[t]); - - if (newbti == (BTItem) NULL) - { - do - { - if (_bt_taperead(itape) == 0) - tapedone[t] = 1; - } while (!tapedone[t] && EMPTYTAPE(itape)); - if (!tapedone[t]) - { - tapepos[t] = itape->bttb_data; - newbti = _bt_tapenext(itape, &tapepos[t]); - } - } - if (newbti != (BTItem) NULL) - { - BTPriQueueElem nexte; - - nexte.btpqe_tape = t; - _bt_setsortkey(index, newbti, &(nexte.btpqe_item)); - _bt_pqadd(&q, &nexte); - } - } - } /* item */ - - /* - * that's it for this run. flush the output tape, marking - * End-of-Run. - */ - _bt_tapewrite(otape, 1); - } /* run */ + state = _bt_pagestate(index, BTP_LEAF, 0, true); - /* - * we are here because we ran out of input on all of the input - * tapes. - * - * if this pass did not generate more actual output runs than we have - * tapes, we know we have at most one run in each tape. this - * means that we are ready to merge into the final btree leaf - * pages instead of merging into a tape file. - */ - if (nruns <= btspool->bts_ntapes) - doleaf = true; - } while (nruns > 0); /* pass */ + for (;;) + { + bti = (BTItem) tuplesort_getindextuple(btspool->sortstate, true, + &should_free); + if (bti == (BTItem) NULL) + break; + _bt_buildadd(index, state, bti, BTP_LEAF); + if (should_free) + pfree((void *) bti); + } _bt_uppershutdown(index, state); } @@ -1359,7 +559,7 @@ _bt_upperbuild(Relation index) ropaque->btpo_flags &= ~BTP_ROOT; _bt_wrtbuf(index, rbuf); - state = (BTPageState *) _bt_pagestate(index, 0, 0, true); + state = _bt_pagestate(index, 0, 0, true); /* for each page... */ do @@ -1380,7 +580,7 @@ _bt_upperbuild(Relation index) * the lower page and insert it into a page at this level. */ nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque)); -#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) +#ifdef FASTBUILD_DEBUG { bool isnull; Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att, @@ -1389,7 +589,7 @@ _bt_upperbuild(Relation index) printf("_bt_upperbuild: inserting <%x> at %d\n", d, state->btps_level); } -#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ +#endif _bt_buildadd(index, state, nbti, 0); pfree((void *) nbti); } @@ -1401,25 +601,3 @@ _bt_upperbuild(Relation index) } #endif - -/* - * given a spool loading by successive calls to _bt_spool, create an - * entire btree. - */ -void -_bt_leafbuild(Relation index, void *spool) -{ - _bt_isortcmpinit(index, (BTSpool *) spool); - -#ifdef BTREE_BUILD_STATS - if (ShowExecutorStats) - { - fprintf(stderr, "! BtreeBuild (Spool) Stats:\n"); - ShowUsage(); - ResetUsage(); - } -#endif - - _bt_merge(index, (BTSpool *) spool); - -} diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index f82fecf0d6f..14e8b46aa86 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -7,16 +7,17 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.23 1999/07/17 20:16:58 momjian Exp $ + * $Header: /cvsroot/pgsql/src/backend/executor/nodeSort.c,v 1.24 1999/10/17 22:15:02 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" + #include "executor/executor.h" #include "executor/execdebug.h" #include "executor/nodeSort.h" -#include "utils/psort.h" +#include "utils/tuplesort.h" /* ---------------------------------------------------------------- * FormSortKeys(node) @@ -83,11 +84,9 @@ FormSortKeys(Sort *sortnode) /* ---------------------------------------------------------------- * ExecSort * - * old comments - * Sorts tuples from the outer subtree of the node in psort, + * Sorts tuples from the outer subtree of the node using tuplesort, * which saves the results in a temporary file or memory. After the * initial call, returns a tuple from the file with each call. - * Assumes that heap access method is used. * * Conditions: * -- none. @@ -101,10 +100,8 @@ ExecSort(Sort *node) { EState *estate; SortState *sortstate; - Plan *outerNode; ScanDirection dir; - int keycount; - ScanKey sortkeys; + Tuplesortstate *tuplesortstate; HeapTuple heapTuple; TupleTableSlot *slot; bool should_free; @@ -119,43 +116,71 @@ ExecSort(Sort *node) sortstate = node->sortstate; estate = node->plan.state; dir = estate->es_direction; + tuplesortstate = (Tuplesortstate *) sortstate->tuplesortstate; /* ---------------- - * the first time we call this, psort sorts this into a file. - * Subsequent calls return tuples from psort. + * If first time through, read all tuples from outer plan and + * pass them to tuplesort.c. + * Subsequent calls just fetch tuples from tuplesort. * ---------------- */ - if (sortstate->sort_Flag == false) + if (! sortstate->sort_Done) { + Plan *outerNode; + TupleDesc tupDesc; + int keycount; + ScanKey sortkeys; + SO1_printf("ExecSort: %s\n", - "sortstate == false -> sorting subplan"); + "sorting subplan"); /* ---------------- - * set all relations to be scanned in the forward direction - * while creating the temporary relation. + * Want to scan subplan in the forward direction while creating + * the sorted data. (Does setting my direction actually affect + * the subplan? I bet this is useless code...) * ---------------- */ estate->es_direction = ForwardScanDirection; /* ---------------- - * prepare information for psort_begin() + * Initialize tuplesort module. * ---------------- */ - outerNode = outerPlan((Plan *) node); + SO1_printf("ExecSort: %s\n", + "calling tuplesort_begin"); + outerNode = outerPlan((Plan *) node); + tupDesc = ExecGetTupType(outerNode); keycount = node->keycount; sortkeys = (ScanKey) sortstate->sort_Keys; - SO1_printf("ExecSort: %s\n", - "calling psort_begin"); - if (!psort_begin(node, /* this node */ - keycount, /* number keys */ - sortkeys)) /* keys */ + tuplesortstate = tuplesort_begin_heap(tupDesc, keycount, sortkeys, + true /* randomAccess */); + + sortstate->tuplesortstate = (void *) tuplesortstate; + + /* ---------------- + * Scan the subplan and feed all the tuples to tuplesort. + * ---------------- + */ + + for (;;) { - /* Psort says, there are no tuples to be sorted */ - return NULL; + slot = ExecProcNode(outerNode, (Plan *) node); + + if (TupIsNull(slot)) + break; + + tuplesort_puttuple(tuplesortstate, (void *) slot->val); + ExecClearTuple(slot); } + /* ---------------- + * Complete the sort. + * ---------------- + */ + tuplesort_performsort(tuplesortstate); + /* ---------------- * restore to user specified direction * ---------------- @@ -167,25 +192,29 @@ ExecSort(Sort *node) * ---------------- */ slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot; - slot->ttc_tupleDescriptor = ExecGetTupType(outerNode); + slot->ttc_tupleDescriptor = tupDesc; + /* ---------------- * finally set the sorted flag to true * ---------------- */ - sortstate->sort_Flag = true; + sortstate->sort_Done = true; SO1_printf(stderr, "ExecSort: sorting done.\n"); } else slot = (TupleTableSlot *) sortstate->csstate.cstate.cs_ResultTupleSlot; SO1_printf("ExecSort: %s\n", - "retrieving tuple from sorted relation"); + "retrieving tuple from tuplesort"); /* ---------------- - * at this point we grab a tuple from psort + * Get the first or next tuple from tuplesort. + * Returns NULL if no more tuples. * ---------------- */ - heapTuple = psort_grabtuple(node, &should_free); + heapTuple = tuplesort_getheaptuple(tuplesortstate, + ScanDirectionIsForward(dir), + &should_free); return ExecStoreTuple(heapTuple, slot, InvalidBuffer, should_free); } @@ -193,7 +222,6 @@ ExecSort(Sort *node) /* ---------------------------------------------------------------- * ExecInitSort * - * old comments * Creates the run-time state information for the sort node * produced by the planner and initailizes its outer subtree. * ---------------------------------------------------------------- @@ -203,7 +231,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent) { SortState *sortstate; Plan *outerPlan; - ScanKey sortkeys; SO1_printf("ExecInitSort: %s\n", "initializing sort node"); @@ -219,14 +246,14 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent) * ---------------- */ sortstate = makeNode(SortState); - sortstate->sort_Flag = 0; + sortstate->sort_Done = false; sortstate->sort_Keys = NULL; - node->cleaned = FALSE; + sortstate->tuplesortstate = NULL; node->sortstate = sortstate; /* ---------------- - * Miscellanious initialization + * Miscellaneous initialization * * + assign node's base_id * + assign debugging hooks @@ -259,9 +286,7 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent) * initialize sortstate information * ---------------- */ - sortkeys = FormSortKeys(node); - sortstate->sort_Keys = sortkeys; - sortstate->sort_Flag = false; + sortstate->sort_Keys = FormSortKeys(node); /* ---------------- * initialize tuple type. no need to initialize projection @@ -275,11 +300,6 @@ ExecInitSort(Sort *node, EState *estate, Plan *parent) SO1_printf("ExecInitSort: %s\n", "sort node initialized"); - /* ---------------- - * return relation oid of temporary sort relation in a list - * (someday -- for now we return LispTrue... cim 10/12/89) - * ---------------- - */ return TRUE; } @@ -293,8 +313,6 @@ ExecCountSlotsSort(Sort *node) /* ---------------------------------------------------------------- * ExecEndSort(node) - * - * old comments * ---------------------------------------------------------------- */ void @@ -325,8 +343,13 @@ ExecEndSort(Sort *node) */ ExecClearTuple(sortstate->csstate.css_ScanTupleSlot); - /* Clean up after psort */ - psort_end(node); + /* ---------------- + * Release tuplesort resources + * ---------------- + */ + if (sortstate->tuplesortstate != NULL) + tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate); + sortstate->tuplesortstate = NULL; SO1_printf("ExecEndSort: %s\n", "sort node shutdown"); @@ -335,51 +358,47 @@ ExecEndSort(Sort *node) /* ---------------------------------------------------------------- * ExecSortMarkPos * - * Calls psort to save the current position in the sorted file. + * Calls tuplesort to save the current position in the sorted file. * ---------------------------------------------------------------- */ void ExecSortMarkPos(Sort *node) { - SortState *sortstate; + SortState *sortstate = node->sortstate; /* ---------------- * if we haven't sorted yet, just return * ---------------- */ - sortstate = node->sortstate; - if (sortstate->sort_Flag == false) + if (! sortstate->sort_Done) return; - psort_markpos(node); - - return; + tuplesort_markpos((Tuplesortstate *) sortstate->tuplesortstate); } /* ---------------------------------------------------------------- * ExecSortRestrPos * - * Calls psort to restore the last saved sort file position. + * Calls tuplesort to restore the last saved sort file position. * ---------------------------------------------------------------- */ void ExecSortRestrPos(Sort *node) { - SortState *sortstate; + SortState *sortstate = node->sortstate; /* ---------------- * if we haven't sorted yet, just return. * ---------------- */ - sortstate = node->sortstate; - if (sortstate->sort_Flag == false) + if (! sortstate->sort_Done) return; /* ---------------- * restore the scan to the previously marked position * ---------------- */ - psort_restorepos(node); + tuplesort_restorepos((Tuplesortstate *) sortstate->tuplesortstate); } void @@ -392,17 +411,25 @@ ExecReScanSort(Sort *node, ExprContext *exprCtxt, Plan *parent) * not NULL then it will be re-scanned by ExecProcNode, else - no * reason to re-scan it at all. */ - if (sortstate->sort_Flag == false) + if (! sortstate->sort_Done) return; ExecClearTuple(sortstate->csstate.cstate.cs_ResultTupleSlot); - psort_rescan(node); - /* - * If subnode is to be rescanned then we aren't sorted + * If subnode is to be rescanned then we forget previous sort + * results; we have to re-read the subplan and re-sort. + * + * Otherwise we can just rewind and rescan the sorted output. */ if (((Plan *) node)->lefttree->chgParam != NULL) - sortstate->sort_Flag = false; - + { + sortstate->sort_Done = false; + tuplesort_end((Tuplesortstate *) sortstate->tuplesortstate); + sortstate->tuplesortstate = NULL; + } + else + { + tuplesort_rescan((Tuplesortstate *) sortstate->tuplesortstate); + } } diff --git a/src/backend/utils/sort/Makefile b/src/backend/utils/sort/Makefile index d411a89c735..c680a089230 100644 --- a/src/backend/utils/sort/Makefile +++ b/src/backend/utils/sort/Makefile @@ -4,7 +4,7 @@ # Makefile for utils/sort # # IDENTIFICATION -# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.6 1999/10/16 19:49:27 tgl Exp $ +# $Header: /cvsroot/pgsql/src/backend/utils/sort/Makefile,v 1.7 1999/10/17 22:15:05 tgl Exp $ # #------------------------------------------------------------------------- @@ -13,7 +13,7 @@ include ../../../Makefile.global CFLAGS += -I../.. -OBJS = logtape.o lselect.o psort.o +OBJS = logtape.o tuplesort.o all: SUBSYS.o diff --git a/src/backend/utils/sort/logtape.c b/src/backend/utils/sort/logtape.c index 8d5d34c00a7..46497598b56 100644 --- a/src/backend/utils/sort/logtape.c +++ b/src/backend/utils/sort/logtape.c @@ -4,8 +4,8 @@ * Management of "logical tapes" within temporary files. * * This module exists to support sorting via multiple merge passes (see - * psort.c). Merging is an ideal algorithm for tape devices, but if we - * implement it on disk by creating a separate file for each "tape", + * tuplesort.c). Merging is an ideal algorithm for tape devices, but if + * we implement it on disk by creating a separate file for each "tape", * there is an annoying problem: the peak space usage is at least twice * the volume of actual data to be sorted. (This must be so because each * datum will appear in both the input and output tapes of the final @@ -23,7 +23,7 @@ * Few OSes allow arbitrary parts of a file to be released back to the OS, * so we have to implement this space-recycling ourselves within a single * logical file. logtape.c exists to perform this bookkeeping and provide - * the illusion of N independent tape devices to psort.c. Note that + * the illusion of N independent tape devices to tuplesort.c. Note that * logtape.c itself depends on buffile.c to provide a "logical file" of * larger size than the underlying OS may support. * @@ -63,7 +63,7 @@ * Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.1 1999/10/16 19:49:27 tgl Exp $ + * $Header: /cvsroot/pgsql/src/backend/utils/sort/logtape.c,v 1.2 1999/10/17 22:15:05 tgl Exp $ * *------------------------------------------------------------------------- */ diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c new file mode 100644 index 00000000000..2240564fa25 --- /dev/null +++ b/src/backend/utils/sort/tuplesort.c @@ -0,0 +1,1465 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.c + * Generalized tuple sorting routines. + * + * This module handles sorting of either heap tuples or index tuples + * (and could fairly easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * See Knuth, volume 3, for more than you want to know about the external + * sorting algorithm. We divide the input into sorted runs using replacement + * selection, in the form of a priority tree implemented as a heap + * (essentially his Algorithm 5.2.3H), then merge the runs using polyphase + * merge, Knuth's Algorithm 5.4.2D. The logical "tapes" used by Algorithm D + * are implemented by logtape.c, which avoids space wastage by recycling + * disk space as soon as each block is read from its "tape". + * + * We do not form the initial runs using Knuth's recommended replacement + * selection method (Algorithm 5.4.1R), because it uses a fixed number of + * records in memory at all times. Since we are dealing with tuples that + * may vary considerably in size, we want to be able to vary the number of + * records kept in memory to ensure full utilization of the allowed sort + * memory space. This is easily done by keeping a variable-size heap in + * which the records of the current run are stored, plus a variable-size + * unsorted array holding records that must go into the next run. + * + * The (approximate) amount of memory allowed for any one sort operation + * is given in kilobytes by the external variable SortMem. Initially, + * we absorb tuples and simply store them in an unsorted array as long as + * we haven't exceeded SortMem. If we reach the end of the input without + * exceeding SortMem, we sort the array using qsort() and subsequently return + * tuples just by scanning the tuple array sequentially. If we do exceed + * SortMem, we construct a heap using Algorithm H and begin to emit tuples + * into sorted runs in temporary tapes, emitting just enough tuples at each + * step to get back within the SortMem limit. New tuples are added to the + * heap if they can go into the current run, else they are temporarily added + * to the unsorted array. Whenever the heap empties, we construct a new heap + * from the current contents of the unsorted array, and begin a new run with a + * new output tape (selected per Algorithm D). After the end of the input + * is reached, we dump out remaining tuples in memory into a final run + * (or two), then merge the runs using Algorithm D. + * + * When the caller requests random access to the sort result, we form + * the final sorted run on a logical tape which is then "frozen", so + * that we can access it randomly. When the caller does not need random + * access, we return from tuplesort_performsort() as soon as we are down + * to one run per logical tape. The final merge is then performed + * on-the-fly as the caller repeatedly calls tuplesort_gettuple; this + * saves one cycle of writing all the data out to disk and reading it in. + * + * + * Copyright (c) 1994, Regents of the University of California + * + * IDENTIFICATION + * $Header: /cvsroot/pgsql/src/backend/utils/sort/tuplesort.c,v 1.1 1999/10/17 22:15:05 tgl Exp $ + * + *------------------------------------------------------------------------- + */ + +#include "postgres.h" + +#include "access/heapam.h" +#include "access/nbtree.h" +#include "miscadmin.h" +#include "utils/logtape.h" +#include "utils/tuplesort.h" + +/* + * Possible states of a Tuplesort object. These denote the states that + * persist between calls of Tuplesort routines. + */ +typedef enum +{ + TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BUILDRUNS, /* Loading tuples; writing to tape */ + TSS_SORTEDINMEM, /* Sort completed entirely in memory */ + TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ + TSS_FINALMERGE /* Performing final merge on-the-fly */ +} TupSortStatus; + +/* + * We use a seven-tape polyphase merge, which is the "sweet spot" on the + * tapes-to-passes curve according to Knuth's figure 70 (section 5.4.2). + */ +#define MAXTAPES 7 /* Knuth's T */ +#define TAPERANGE (MAXTAPES-1) /* Knuth's P */ + +/* + * Private state of a Tuplesort operation. + */ +struct Tuplesortstate +{ + TupSortStatus status; /* enumerated value as shown above */ + bool randomAccess; /* did caller request random access? */ + long availMem; /* remaining memory available, in bytes */ + LogicalTapeSet *tapeset; /* logtape.c object for tapes in a temp file */ + + /* + * These function pointers decouple the routines that must know what kind + * of tuple we are sorting from the routines that don't need to know it. + * They are set up by the tuplesort_begin_xxx routines. + * + * Function to compare two tuples; result is per qsort() convention, + * ie, <0, 0, >0 according as a<b, a=b, a>b. + */ + int (*comparetup) (Tuplesortstate *state, const void *a, const void *b); + /* + * Function to copy a supplied input tuple into palloc'd space. + * (NB: we assume that a single pfree() is enough to release the tuple + * later, so the representation must be "flat" in one palloc chunk.) + * state->availMem must be decreased by the amount of space used. + */ + void * (*copytup) (Tuplesortstate *state, void *tup); + /* + * Function to write a stored tuple onto tape. The representation of + * the tuple on tape need not be the same as it is in memory; requirements + * on the tape representation are given below. After writing the tuple, + * pfree() it, and increase state->availMem by the amount of memory space + * thereby released. + */ + void (*writetup) (Tuplesortstate *state, int tapenum, void *tup); + /* + * Function to read a stored tuple from tape back into memory. + * 'len' is the already-read length of the stored tuple. Create and + * return a palloc'd copy, and decrease state->availMem by the amount + * of memory space consumed. + */ + void * (*readtup) (Tuplesortstate *state, int tapenum, unsigned int len); + + /* + * This array holds "unsorted" tuples during the input phases. + * If we are able to complete the sort in memory, it holds the + * final sorted result as well. + */ + void **memtuples; /* array of pointers to palloc'd tuples */ + int memtupcount; /* number of tuples currently present */ + int memtupsize; /* allocated length of memtuples array */ + + /* + * This array holds the partially-sorted "heap" of tuples that will go + * out in the current run during BUILDRUNS state. While completing + * the sort, we use it to merge runs of tuples from input tapes. + * It is never allocated unless we need to use tapes. + */ + void **heaptuples; /* array of pointers to palloc'd tuples */ + int heaptupcount; /* number of tuples currently present */ + int heaptupsize; /* allocated length of heaptuples array */ + /* + * While merging, this array holds the actual number of the input tape + * that each tuple in heaptuples[] came from. + */ + int *heapsrctapes; + + /* + * Variables for Algorithm D. Note that destTape is a "logical" tape + * number, ie, an index into the tp_xxx[] arrays. Be careful to keep + * "logical" and "actual" tape numbers straight! + */ + int Level; /* Knuth's l */ + int destTape; /* current output tape (Knuth's j, less 1) */ + int tp_fib[MAXTAPES]; /* Target Fibonacci run counts (A[]) */ + int tp_runs[MAXTAPES]; /* # of real runs on each tape */ + int tp_dummy[MAXTAPES]; /* # of dummy runs for each tape (D[]) */ + int tp_tapenum[MAXTAPES]; /* Actual tape numbers (TAPE[]) */ + + bool multipleRuns; /* T if we have created more than 1 run */ + + /* + * These variables are used after completion of sorting to keep track + * of the next tuple to return. (In the tape case, the tape's current + * read position is also critical state.) + */ + int result_tape; /* actual tape number of finished output */ + int current; /* array index (only used if SORTEDINMEM) */ + bool eof_reached; /* reached EOF (needed for cursors) */ + + /* markpos_xxx holds marked position for mark and restore */ + long markpos_block; /* tape block# (only used if SORTEDONTAPE) */ + int markpos_offset; /* saved "current", or offset in tape block */ + bool markpos_eof; /* saved "eof_reached" */ + + /* + * These variables are specific to the HeapTuple case; they are set + * by tuplesort_begin_heap and used only by the HeapTuple routines. + */ + TupleDesc tupDesc; + int nKeys; + ScanKey scanKeys; + + /* + * These variables are specific to the IndexTuple case; they are set + * by tuplesort_begin_index and used only by the IndexTuple routines. + */ + Relation indexRel; + bool enforceUnique; /* complain if we find duplicate tuples */ +}; + +#define COMPARETUP(state,a,b) ((*(state)->comparetup) (state, a, b)) +#define COPYTUP(state,tup) ((*(state)->copytup) (state, tup)) +#define WRITETUP(state,tape,tup) ((*(state)->writetup) (state, tape, tup)) +#define READTUP(state,tape,len) ((*(state)->readtup) (state, tape, len)) +#define LACKMEM(state) ((state)->availMem < 0) +#define USEMEM(state,amt) ((state)->availMem -= (amt)) +#define FREEMEM(state,amt) ((state)->availMem += (amt)) + +/*-------------------- + * + * NOTES about on-tape representation of tuples: + * + * We require the first "unsigned int" of a stored tuple to be the total size + * on-tape of the tuple, including itself (so it is never zero; an all-zero + * unsigned int is used to delimit runs). The remainder of the stored tuple + * may or may not match the in-memory representation of the tuple --- + * any conversion needed is the job of the writetup and readtup routines. + * + * If state->randomAccess is true, then the stored representation of the + * tuple must be followed by another "unsigned int" that is a copy of the + * length --- so the total tape space used is actually sizeof(unsigned int) + * more than the stored length value. This allows read-backwards. When + * randomAccess is not true, the write/read routines may omit the extra + * length word. + * + * writetup is expected to write both length words as well as the tuple + * data. When readtup is called, the tape is positioned just after the + * front length word; readtup must read the tuple data and advance past + * the back length word (if present). + * + * The write/read routines can make use of the tuple description data + * stored in the Tuplesortstate record, if needed. They are also expected + * to adjust state->availMem by the amount of memory space (not tape space!) + * released or consumed. There is no error return from either writetup + * or readtup; they should elog() on failure. + * + * + * NOTES about memory consumption calculations: + * + * We count space requested for tuples against the SortMem limit. + * Fixed-size space (primarily the LogicalTapeSet I/O buffers) is not + * counted, nor do we count the variable-size memtuples and heaptuples + * arrays. (Even though those could grow pretty large, they should be + * small compared to the tuples proper, so this is not unreasonable.) + * + * The major deficiency in this approach is that it ignores palloc overhead. + * The memory space actually allocated for a palloc chunk is always more + * than the request size, and could be considerably more (as much as 2X + * larger, in the current aset.c implementation). So the space used could + * be considerably more than SortMem says. + * + * One way to fix this is to add a memory management function that, given + * a pointer to a palloc'd chunk, returns the actual space consumed by the + * chunk. This would be very easy in the current aset.c module, but I'm + * hesitant to do it because it might be unpleasant to support in future + * implementations of memory management. (For example, a direct + * implementation of palloc as malloc could not support such a function + * portably.) + * + * A cruder answer is just to apply a fudge factor, say by initializing + * availMem to only three-quarters of what SortMem indicates. This is + * probably the right answer if anyone complains that SortMem is not being + * obeyed very faithfully. + * + *-------------------- + */ + +static Tuplesortstate *tuplesort_begin_common(bool randomAccess); +static void inittapes(Tuplesortstate *state); +static void selectnewtape(Tuplesortstate *state); +static void mergeruns(Tuplesortstate *state); +static void mergeonerun(Tuplesortstate *state); +static void beginmerge(Tuplesortstate *state); +static void beginrun(Tuplesortstate *state); +static void dumptuples(Tuplesortstate *state, bool alltuples); +static void tuplesort_heap_insert(Tuplesortstate *state, void *tuple, + int tapenum); +static void tuplesort_heap_siftup(Tuplesortstate *state); +static unsigned int getlen(Tuplesortstate *state, int tapenum, bool eofOK); +static void markrunend(Tuplesortstate *state, int tapenum); +static int qsort_comparetup(const void *a, const void *b); +static int comparetup_heap(Tuplesortstate *state, + const void *a, const void *b); +static void *copytup_heap(Tuplesortstate *state, void *tup); +static void writetup_heap(Tuplesortstate *state, int tapenum, void *tup); +static void *readtup_heap(Tuplesortstate *state, int tapenum, + unsigned int len); +static int comparetup_index(Tuplesortstate *state, + const void *a, const void *b); +static void *copytup_index(Tuplesortstate *state, void *tup); +static void writetup_index(Tuplesortstate *state, int tapenum, void *tup); +static void *readtup_index(Tuplesortstate *state, int tapenum, + unsigned int len); + +/* + * Since qsort(3) will not pass any context info to qsort_comparetup(), + * we have to use this ugly static variable. It is set to point to the + * active Tuplesortstate object just before calling qsort. It should + * not be used directly by anything except qsort_comparetup(). + */ +static Tuplesortstate *qsort_tuplesortstate; + + +/* + * tuplesort_begin_xxx + * + * Initialize for a tuple sort operation. + * + * After calling tuplesort_begin, the caller should call tuplesort_puttuple + * zero or more times, then call tuplesort_performsort when all the tuples + * have been supplied. After performsort, retrieve the tuples in sorted + * order by calling tuplesort_gettuple until it returns NULL. (If random + * access was requested, rescan, markpos, and restorepos can also be called.) + * Call tuplesort_end to terminate the operation and release memory/disk space. + */ + +static Tuplesortstate * +tuplesort_begin_common(bool randomAccess) +{ + Tuplesortstate *state; + + state = (Tuplesortstate *) palloc(sizeof(Tuplesortstate)); + + MemSet((char *) state, 0, sizeof(Tuplesortstate)); + + state->status = TSS_INITIAL; + state->randomAccess = randomAccess; + state->availMem = SortMem * 1024L; + state->tapeset = NULL; + + state->memtupcount = 0; + state->memtupsize = 1024; /* initial guess */ + state->memtuples = (void **) palloc(state->memtupsize * sizeof(void *)); + + state->heaptuples = NULL; /* until and unless needed */ + state->heaptupcount = 0; + state->heaptupsize = 0; + state->heapsrctapes = NULL; + + /* Algorithm D variables will be initialized by inittapes, if needed */ + + state->result_tape = -1; /* flag that result tape has not been formed */ + + return state; +} + +Tuplesortstate * +tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, ScanKey keys, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(randomAccess); + + AssertArg(nkeys >= 1); + AssertArg(keys[0].sk_attno != 0); + AssertArg(keys[0].sk_procedure != 0); + + state->comparetup = comparetup_heap; + state->copytup = copytup_heap; + state->writetup = writetup_heap; + state->readtup = readtup_heap; + + state->tupDesc = tupDesc; + state->nKeys = nkeys; + state->scanKeys = keys; + + return state; +} + +Tuplesortstate * +tuplesort_begin_index(Relation indexRel, + bool enforceUnique, + bool randomAccess) +{ + Tuplesortstate *state = tuplesort_begin_common(randomAccess); + + state->comparetup = comparetup_index; + state->copytup = copytup_index; + state->writetup = writetup_index; + state->readtup = readtup_index; + + state->indexRel = indexRel; + state->enforceUnique = enforceUnique; + + return state; +} + +/* + * tuplesort_end + * + * Release resources and clean up. + */ +void +tuplesort_end(Tuplesortstate *state) +{ + int i; + + if (state->tapeset) + LogicalTapeSetClose(state->tapeset); + if (state->memtuples) + { + for (i = 0; i < state->memtupcount; i++) + pfree(state->memtuples[i]); + pfree(state->memtuples); + } + if (state->heaptuples) + { + for (i = 0; i < state->heaptupcount; i++) + pfree(state->heaptuples[i]); + pfree(state->heaptuples); + } + if (state->heapsrctapes) + pfree(state->heapsrctapes); +} + +/* + * Accept one tuple while collecting input data for sort. + * + * Note that the input tuple is always copied; the caller need not save it. + */ +void +tuplesort_puttuple(Tuplesortstate *state, void *tuple) +{ + /* + * Copy the given tuple into memory we control, and decrease availMem. + */ + tuple = COPYTUP(state, tuple); + + switch (state->status) + { + case TSS_INITIAL: + /* + * Save the copied tuple into the unsorted array. + */ + if (state->memtupcount >= state->memtupsize) + { + /* Grow the unsorted array as needed. */ + state->memtupsize *= 2; + state->memtuples = (void **) + repalloc(state->memtuples, + state->memtupsize * sizeof(void *)); + } + state->memtuples[state->memtupcount++] = tuple; + /* + * Done if we still fit in available memory. + */ + if (! LACKMEM(state)) + return; + /* + * Nope; time to switch to tape-based operation. + */ + inittapes(state); + beginrun(state); + /* + * Dump tuples until we are back under the limit. + */ + dumptuples(state, false); + break; + case TSS_BUILDRUNS: + /* + * Insert the copied tuple into the heap if it can go into the + * current run; otherwise add it to the unsorted array, whence + * it will go into the next run. + * + * The tuple can go into the current run if it is >= the first + * not-yet-output tuple. (Actually, it could go into the current + * run if it is >= the most recently output tuple ... but that + * would require keeping around the tuple we last output, and + * it's simplest to let writetup free the tuple when written.) + * + * Note there will always be at least one tuple in the heap + * at this point; see dumptuples. + */ + Assert(state->heaptupcount > 0); + if (COMPARETUP(state, tuple, state->heaptuples[0]) >= 0) + { + tuplesort_heap_insert(state, tuple, 0); + } + else + { + if (state->memtupcount >= state->memtupsize) + { + /* Grow the unsorted array as needed. */ + state->memtupsize *= 2; + state->memtuples = (void **) + repalloc(state->memtuples, + state->memtupsize * sizeof(void *)); + } + state->memtuples[state->memtupcount++] = tuple; + } + /* + * If we are over the memory limit, dump tuples till we're under. + */ + dumptuples(state, false); + break; + default: + elog(ERROR, "tuplesort_puttuple: invalid state"); + break; + } +} + +/* + * All tuples have been provided; finish the sort. + */ +void +tuplesort_performsort(Tuplesortstate *state) +{ + switch (state->status) + { + case TSS_INITIAL: + /* + * We were able to accumulate all the tuples within the + * allowed amount of memory. Just qsort 'em and we're done. + */ + if (state->memtupcount > 1) + { + qsort_tuplesortstate = state; + qsort((void *) state->memtuples, state->memtupcount, + sizeof(void *), qsort_comparetup); + } + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + case TSS_BUILDRUNS: + /* + * Finish tape-based sort. First, flush all tuples remaining + * in memory out to tape; then merge until we have a single + * remaining run (or, if !randomAccess, one run per tape). + * Note that mergeruns sets the correct status. + */ + dumptuples(state, true); + mergeruns(state); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "tuplesort_performsort: invalid state"); + break; + } +} + +/* + * Fetch the next tuple in either forward or back direction. + * Returns NULL if no more tuples. If should_free is set, the + * caller must pfree the returned tuple when done with it. + */ +void * +tuplesort_gettuple(Tuplesortstate *state, bool forward, + bool *should_free) +{ + unsigned int tuplen; + void *tup; + + switch (state->status) + { + case TSS_SORTEDINMEM: + Assert(forward || state->randomAccess); + *should_free = false; + if (forward) + { + if (state->current < state->memtupcount) + return state->memtuples[state->current++]; + state->eof_reached = true; + return NULL; + } + else + { + if (state->current <= 0) + return NULL; + /* + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + state->eof_reached = false; + else + { + state->current--; /* last returned tuple */ + if (state->current <= 0) + return NULL; + } + return state->memtuples[state->current - 1]; + } + break; + + case TSS_SORTEDONTAPE: + Assert(forward || state->randomAccess); + *should_free = true; + if (forward) + { + if (state->eof_reached) + return NULL; + if ((tuplen = getlen(state, state->result_tape, true)) != 0) + { + tup = READTUP(state, state->result_tape, tuplen); + return tup; + } + else + { + state->eof_reached = true; + return NULL; + } + } + /* Backward. + * + * if all tuples are fetched already then we return last tuple, + * else - tuple before last returned. + */ + if (state->eof_reached) + { + /* + * Seek position is pointing just past the zero tuplen + * at the end of file; back up to fetch last tuple's ending + * length word. If seek fails we must have a completely empty + * file. + */ + if (! LogicalTapeBackspace(state->tapeset, + state->result_tape, + 2 * sizeof(unsigned int))) + return NULL; + state->eof_reached = false; + } + else + { + /* + * Back up and fetch previously-returned tuple's ending length + * word. If seek fails, assume we are at start of file. + */ + if (! LogicalTapeBackspace(state->tapeset, + state->result_tape, + sizeof(unsigned int))) + return NULL; + tuplen = getlen(state, state->result_tape, false); + /* + * Back up to get ending length word of tuple before it. + */ + if (! LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + 2 * sizeof(unsigned int))) + { + /* If that fails, presumably the prev tuple is the first + * in the file. Back up so that it becomes next to read + * in forward direction (not obviously right, but that is + * what in-memory case does). + */ + if (! LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen + sizeof(unsigned int))) + elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan"); + return NULL; + } + } + + tuplen = getlen(state, state->result_tape, false); + /* + * Now we have the length of the prior tuple, back up and read it. + * Note: READTUP expects we are positioned after the initial + * length word of the tuple, so back up to that point. + */ + if (! LogicalTapeBackspace(state->tapeset, + state->result_tape, + tuplen)) + elog(ERROR, "tuplesort_gettuple: bogus tuple len in backward scan"); + tup = READTUP(state, state->result_tape, tuplen); + return tup; + + case TSS_FINALMERGE: + Assert(forward); + *should_free = true; + /* + * This code should match the inner loop of mergeonerun(). + */ + if (state->heaptupcount > 0) + { + int srcTape = state->heapsrctapes[0]; + + tup = state->heaptuples[0]; + tuplesort_heap_siftup(state); + if ((tuplen = getlen(state, srcTape, true)) != 0) + { + void *newtup = READTUP(state, srcTape, tuplen); + tuplesort_heap_insert(state, newtup, srcTape); + } + return tup; + } + return NULL; + + default: + elog(ERROR, "tuplesort_gettuple: invalid state"); + return NULL; /* keep compiler quiet */ + } +} + +/* + * inittapes - initialize for tape sorting. + * + * This is called only if we have found we don't have room to sort in memory. + */ +static void +inittapes(Tuplesortstate *state) +{ + int j; + + state->tapeset = LogicalTapeSetCreate(MAXTAPES); + + /* + * Initialize heaptuples array slightly larger than current memtuples + * usage; memtupcount is probably a good guess at how many tuples we + * will be able to have in the heap at once. + */ + state->heaptupcount = 0; + state->heaptupsize = state->memtupcount + state->memtupcount / 4; + state->heaptuples = (void **) palloc(state->heaptupsize * sizeof(void *)); + + /* + * Initialize variables of Algorithm D (step D1). + */ + for (j = 0; j < MAXTAPES; j++) + { + state->tp_fib[j] = 1; + state->tp_runs[j] = 0; + state->tp_dummy[j] = 1; + state->tp_tapenum[j] = j; + } + state->tp_fib[TAPERANGE] = 0; + state->tp_dummy[TAPERANGE] = 0; + + state->Level = 1; + state->destTape = 0; + + state->multipleRuns = false; + + state->status = TSS_BUILDRUNS; +} + +/* + * selectnewtape -- select new tape for new initial run. + * + * This is called after finishing a run when we know another run + * must be started. This implements steps D3, D4 of Algorithm D. + */ +static void +selectnewtape(Tuplesortstate *state) +{ + int j; + int a; + + /* We now have at least two initial runs */ + state->multipleRuns = true; + + /* Step D3: advance j (destTape) */ + if (state->tp_dummy[state->destTape] < state->tp_dummy[state->destTape+1]) + { + state->destTape++; + return; + } + if (state->tp_dummy[state->destTape] != 0) + { + state->destTape = 0; + return; + } + + /* Step D4: increase level */ + state->Level++; + a = state->tp_fib[0]; + for (j = 0; j < TAPERANGE; j++) + { + state->tp_dummy[j] = a + state->tp_fib[j+1] - state->tp_fib[j]; + state->tp_fib[j] = a + state->tp_fib[j+1]; + } + state->destTape = 0; +} + +/* + * mergeruns -- merge all the completed initial runs. + * + * This implements steps D5, D6 of Algorithm D. All input data has + * already been written to initial runs on tape (see dumptuples). + */ +static void +mergeruns(Tuplesortstate *state) +{ + int tapenum, + svTape, + svRuns, + svDummy; + + Assert(state->status == TSS_BUILDRUNS); + Assert(state->memtupcount == 0 && state->heaptupcount == 0); + /* + * If we produced only one initial run (quite likely if the total + * data volume is between 1X and 2X SortMem), we can just use that + * tape as the finished output, rather than doing a useless merge. + */ + if (! state->multipleRuns) + { + state->result_tape = state->tp_tapenum[state->destTape]; + /* must freeze and rewind the finished output tape */ + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; + return; + } + + /* End of step D2: rewind all output tapes to prepare for merging */ + for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + LogicalTapeRewind(state->tapeset, tapenum, false); + + for (;;) + { + /* Step D5: merge runs onto tape[T] until tape[P] is empty */ + while (state->tp_runs[TAPERANGE-1] || state->tp_dummy[TAPERANGE-1]) + { + bool allDummy = true; + bool allOneRun = true; + + for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + { + if (state->tp_dummy[tapenum] == 0) + allDummy = false; + if (state->tp_runs[tapenum] + state->tp_dummy[tapenum] != 1) + allOneRun = false; + } + /* + * If we don't have to produce a materialized sorted tape, + * quit as soon as we're down to one real/dummy run per tape. + */ + if (! state->randomAccess && allOneRun) + { + Assert(! allDummy); + /* Initialize for the final merge pass */ + beginmerge(state); + state->status = TSS_FINALMERGE; + return; + } + if (allDummy) + { + state->tp_dummy[TAPERANGE]++; + for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + state->tp_dummy[tapenum]--; + } + else + { + mergeonerun(state); + } + } + /* Step D6: decrease level */ + if (--state->Level == 0) + break; + /* rewind output tape T to use as new input */ + LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE], + false); + /* rewind used-up input tape P, and prepare it for write pass */ + LogicalTapeRewind(state->tapeset, state->tp_tapenum[TAPERANGE-1], + true); + state->tp_runs[TAPERANGE-1] = 0; + /* reassign tape units per step D6; note we no longer care about A[] */ + svTape = state->tp_tapenum[TAPERANGE]; + svDummy = state->tp_dummy[TAPERANGE]; + svRuns = state->tp_runs[TAPERANGE]; + for (tapenum = TAPERANGE; tapenum > 0; tapenum--) + { + state->tp_tapenum[tapenum] = state->tp_tapenum[tapenum-1]; + state->tp_dummy[tapenum] = state->tp_dummy[tapenum-1]; + state->tp_runs[tapenum] = state->tp_runs[tapenum-1]; + } + state->tp_tapenum[0] = svTape; + state->tp_dummy[0] = svDummy; + state->tp_runs[0] = svRuns; + } + /* + * Done. Knuth says that the result is on TAPE[1], but since we exited + * the loop without performing the last iteration of step D6, we have not + * rearranged the tape unit assignment, and therefore the result is on + * TAPE[T]. We need to do it this way so that we can freeze the final + * output tape while rewinding it. The last iteration of step D6 would + * be a waste of cycles anyway... + */ + state->result_tape = state->tp_tapenum[TAPERANGE]; + LogicalTapeFreeze(state->tapeset, state->result_tape); + state->status = TSS_SORTEDONTAPE; +} + +/* + * Merge one run from each input tape, except ones with dummy runs. + * + * This is the inner loop of Algorithm D step D5. We know that the + * output tape is TAPE[T]. + */ +static void +mergeonerun(Tuplesortstate *state) +{ + int destTape = state->tp_tapenum[TAPERANGE]; + int srcTape; + unsigned int tuplen; + void *tup; + + /* + * Start the merge by loading one tuple from each active source tape + * into the heap. We can also decrease the input run/dummy run counts. + */ + beginmerge(state); + + /* + * Execute merge by repeatedly extracting lowest tuple in heap, + * writing it out, and replacing it with next tuple from same tape + * (if there is another one). + */ + while (state->heaptupcount > 0) + { + WRITETUP(state, destTape, state->heaptuples[0]); + srcTape = state->heapsrctapes[0]; + tuplesort_heap_siftup(state); + if ((tuplen = getlen(state, srcTape, true)) != 0) + { + tup = READTUP(state, srcTape, tuplen); + tuplesort_heap_insert(state, tup, srcTape); + } + } + + /* + * When the heap empties, we're done. Write an end-of-run marker + * on the output tape, and increment its count of real runs. + */ + markrunend(state, destTape); + state->tp_runs[TAPERANGE]++; +} + +/* + * beginmerge - initialize for a merge pass + * + * We load the first tuple from each nondummy input run into the heap. + * We also decrease the counts of real and dummy runs for each tape. + */ +static void +beginmerge(Tuplesortstate *state) +{ + int tapenum; + int srcTape; + unsigned int tuplen; + void *tup; + + Assert(state->heaptuples != NULL && state->heaptupcount == 0); + if (state->heapsrctapes == NULL) + state->heapsrctapes = (int *) palloc(MAXTAPES * sizeof(int)); + + for (tapenum = 0; tapenum < TAPERANGE; tapenum++) + { + if (state->tp_dummy[tapenum] > 0) + { + state->tp_dummy[tapenum]--; + } + else + { + Assert(state->tp_runs[tapenum] > 0); + state->tp_runs[tapenum]--; + srcTape = state->tp_tapenum[tapenum]; + tuplen = getlen(state, srcTape, false); + tup = READTUP(state, srcTape, tuplen); + tuplesort_heap_insert(state, tup, srcTape); + } + } + +} + +/* + * beginrun - start a new initial run + * + * The tuples presently in the unsorted memory array are moved into + * the heap. + */ +static void +beginrun(Tuplesortstate *state) +{ + int i; + + Assert(state->heaptupcount == 0 && state->memtupcount > 0); + for (i = 0; i < state->memtupcount; i++) + tuplesort_heap_insert(state, state->memtuples[i], 0); + state->memtupcount = 0; +} + +/* + * dumptuples - remove tuples from heap and write to tape + * + * When alltuples = false, dump only enough tuples to get under the + * availMem limit (and leave at least one tuple in the heap in any case, + * since puttuple assumes it always has a tuple to compare to). + * + * When alltuples = true, dump everything currently in memory. + * (This case is only used at end of input data.) + * + * If we empty the heap, then start a new run using the tuples that + * have accumulated in memtuples[] (if any). + */ +static void +dumptuples(Tuplesortstate *state, bool alltuples) +{ + while (alltuples || + (LACKMEM(state) && + (state->heaptupcount > 0 || state->memtupcount > 0))) + { + /* + * Dump the heap's frontmost entry, and sift up to remove it + * from the heap. + */ + Assert(state->heaptupcount > 0); + WRITETUP(state, state->tp_tapenum[state->destTape], + state->heaptuples[0]); + tuplesort_heap_siftup(state); + /* + * If the heap is now empty, we've finished a run. + */ + if (state->heaptupcount == 0) + { + markrunend(state, state->tp_tapenum[state->destTape]); + state->tp_runs[state->destTape]++; + state->tp_dummy[state->destTape]--; /* per Alg D step D2 */ + if (state->memtupcount == 0) + break; /* all input data has been written to tape */ + /* Select new output tape and start a new run */ + selectnewtape(state); + beginrun(state); + } + } +} + +/* + * tuplesort_rescan - rewind and replay the scan + */ +void +tuplesort_rescan(Tuplesortstate *state) +{ + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + case TSS_SORTEDONTAPE: + LogicalTapeRewind(state->tapeset, + state->result_tape, + false); + state->eof_reached = false; + state->markpos_block = 0L; + state->markpos_offset = 0; + state->markpos_eof = false; + break; + default: + elog(ERROR, "tuplesort_rescan: invalid state"); + break; + } +} + +/* + * tuplesort_markpos - saves current position in the merged sort file + */ +void +tuplesort_markpos(Tuplesortstate *state) +{ + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->markpos_offset = state->current; + state->markpos_eof = state->eof_reached; + break; + case TSS_SORTEDONTAPE: + LogicalTapeTell(state->tapeset, + state->result_tape, + & state->markpos_block, + & state->markpos_offset); + state->markpos_eof = state->eof_reached; + break; + default: + elog(ERROR, "tuplesort_markpos: invalid state"); + break; + } +} + +/* + * tuplesort_restorepos - restores current position in merged sort file to + * last saved position + */ +void +tuplesort_restorepos(Tuplesortstate *state) +{ + Assert(state->randomAccess); + + switch (state->status) + { + case TSS_SORTEDINMEM: + state->current = state->markpos_offset; + state->eof_reached = state->markpos_eof; + break; + case TSS_SORTEDONTAPE: + if (! LogicalTapeSeek(state->tapeset, + state->result_tape, + state->markpos_block, + state->markpos_offset)) + elog(ERROR, "tuplesort_restorepos failed"); + state->eof_reached = state->markpos_eof; + break; + default: + elog(ERROR, "tuplesort_restorepos: invalid state"); + break; + } +} + + +/* + * Heap manipulation routines, per Knuth's Algorithm 5.2.3H. + */ + +/* + * Insert a new tuple into an empty or existing heap, maintaining the + * heap invariant. The heap lives in state->heaptuples[]. Also, if + * state->heapsrctapes is not NULL, we store each tuple's source tapenum + * in the corresponding element of state->heapsrctapes[]. + */ +static void +tuplesort_heap_insert(Tuplesortstate *state, void *tuple, + int tapenum) +{ + int j; + + /* + * Make sure heaptuples[] can handle another entry. + * NOTE: we do not enlarge heapsrctapes[]; it's supposed + * to be big enough when created. + */ + if (state->heaptupcount >= state->heaptupsize) + { + /* Grow the unsorted array as needed. */ + state->heaptupsize *= 2; + state->heaptuples = (void **) + repalloc(state->heaptuples, + state->heaptupsize * sizeof(void *)); + } + /* + * Sift-up the new entry, per Knuth 5.2.3 exercise 16. + * Note that Knuth is using 1-based array indexes, not 0-based. + */ + j = state->heaptupcount++; + while (j > 0) { + int i = (j-1) >> 1; + + if (COMPARETUP(state, tuple, state->heaptuples[i]) >= 0) + break; + state->heaptuples[j] = state->heaptuples[i]; + if (state->heapsrctapes) + state->heapsrctapes[j] = state->heapsrctapes[i]; + j = i; + } + state->heaptuples[j] = tuple; + if (state->heapsrctapes) + state->heapsrctapes[j] = tapenum; +} + +/* + * The tuple at state->heaptuples[0] has been removed from the heap. + * Decrement heaptupcount, and sift up to maintain the heap invariant. + */ +static void +tuplesort_heap_siftup(Tuplesortstate *state) +{ + void **heaptuples = state->heaptuples; + void *tuple; + int i, + n; + + if (--state->heaptupcount <= 0) + return; + n = state->heaptupcount; + tuple = heaptuples[n]; /* tuple that must be reinserted */ + i = 0; /* i is where the "hole" is */ + for (;;) { + int j = 2*i + 1; + + if (j >= n) + break; + if (j+1 < n && + COMPARETUP(state, heaptuples[j], heaptuples[j+1]) > 0) + j++; + if (COMPARETUP(state, tuple, heaptuples[j]) <= 0) + break; + heaptuples[i] = heaptuples[j]; + if (state->heapsrctapes) + state->heapsrctapes[i] = state->heapsrctapes[j]; + i = j; + } + heaptuples[i] = tuple; + if (state->heapsrctapes) + state->heapsrctapes[i] = state->heapsrctapes[n]; +} + + +/* + * Tape interface routines + */ + +static unsigned int +getlen(Tuplesortstate *state, int tapenum, bool eofOK) +{ + unsigned int len; + + if (LogicalTapeRead(state->tapeset, tapenum, (void *) &len, + sizeof(len)) != sizeof(len)) + elog(ERROR, "tuplesort: unexpected end of tape"); + if (len == 0 && !eofOK) + elog(ERROR, "tuplesort: unexpected end of data"); + return len; +} + +static void +markrunend(Tuplesortstate *state, int tapenum) +{ + unsigned int len = 0; + + LogicalTapeWrite(state->tapeset, tapenum, (void *) &len, sizeof(len)); +} + + +/* + * qsort interface + */ + +static int +qsort_comparetup(const void *a, const void *b) +{ + /* The passed pointers are pointers to void * ... */ + + return COMPARETUP(qsort_tuplesortstate, * (void **) a, * (void **) b); +} + + +/* + * Routines specialized for HeapTuple case + */ + +static int +comparetup_heap(Tuplesortstate *state, const void *a, const void *b) +{ + HeapTuple ltup = (HeapTuple) a; + HeapTuple rtup = (HeapTuple) b; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++) + { + ScanKey scanKey = state->scanKeys + nkey; + Datum lattr, + rattr; + bool isnull1, + isnull2; + int result; + + lattr = heap_getattr(ltup, + scanKey->sk_attno, + state->tupDesc, + &isnull1); + rattr = heap_getattr(rtup, + scanKey->sk_attno, + state->tupDesc, + &isnull2); + if (isnull1) + { + if (!isnull2) + return 1; /* NULL sorts after non-NULL */ + } + else if (isnull2) + return -1; + else if (scanKey->sk_flags & SK_COMMUTE) + { + if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr))) + result = (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr); + if (result) + return result; + } + else + { + if (!(result = - (int) (*fmgr_faddr(&scanKey->sk_func)) (lattr, rattr))) + result = (int) (*fmgr_faddr(&scanKey->sk_func)) (rattr, lattr); + if (result) + return result; + } + } + + return 0; +} + +static void * +copytup_heap(Tuplesortstate *state, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + + USEMEM(state, HEAPTUPLESIZE + tuple->t_len); + return (void *) heap_copytuple(tuple); +} + +/* + * We don't bother to write the HeapTupleData part of the tuple. + */ + +static void +writetup_heap(Tuplesortstate *state, int tapenum, void *tup) +{ + HeapTuple tuple = (HeapTuple) tup; + unsigned int tuplen; + + tuplen = tuple->t_len + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void*) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void*) tuple->t_data, tuple->t_len); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void*) &tuplen, sizeof(tuplen)); + + FREEMEM(state, HEAPTUPLESIZE + tuple->t_len); + pfree(tuple); +} + +static void * +readtup_heap(Tuplesortstate *state, int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int) + HEAPTUPLESIZE; + HeapTuple tuple = (HeapTuple) palloc(tuplen); + + USEMEM(state, tuplen); + /* reconstruct the HeapTupleData portion */ + tuple->t_len = len - sizeof(unsigned int); + ItemPointerSetInvalid(&(tuple->t_self)); + tuple->t_data = (HeapTupleHeader) (((char *) tuple) + HEAPTUPLESIZE); + /* read in the tuple proper */ + if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple->t_data, + tuple->t_len) != tuple->t_len) + elog(ERROR, "tuplesort: unexpected end of data"); + if (state->randomAccess) /* need trailing length word? */ + if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen, + sizeof(tuplen)) != sizeof(tuplen)) + elog(ERROR, "tuplesort: unexpected end of data"); + return (void *) tuple; +} + + +/* + * Routines specialized for IndexTuple case + * + * NOTE: actually, these are specialized for the btree case; it's not + * clear whether you could use them for a non-btree index. Possibly + * you'd need to make another set of routines if you needed to sort + * according to another kind of index. + */ + +static int +comparetup_index(Tuplesortstate *state, const void *a, const void *b) +{ + IndexTuple ltup = (IndexTuple) a; + IndexTuple rtup = (IndexTuple) b; + TupleDesc itdesc = state->indexRel->rd_att; + bool equal_isnull = false; + Datum lattr, + rattr; + bool isnull1, + isnull2; + int i; + + for (i = 0; i < itdesc->natts; i++) + { + lattr = index_getattr(ltup, i + 1, itdesc, &isnull1); + rattr = index_getattr(rtup, i + 1, itdesc, &isnull2); + + if (isnull1) + { + if (!isnull2) + return 1; /* NULL sorts after non-NULL */ + equal_isnull = true; + continue; + } + else if (isnull2) + return -1; + + if (_bt_invokestrat(state->indexRel, i + 1, + BTGreaterStrategyNumber, + lattr, rattr)) + return 1; + if (_bt_invokestrat(state->indexRel, i + 1, + BTGreaterStrategyNumber, + rattr, lattr)) + return -1; + } + + /* + * If btree has asked us to enforce uniqueness, complain if two equal + * tuples are detected (unless there was at least one NULL field). + * + * It is sufficient to make the test here, because if two tuples are + * equal they *must* get compared at some stage of the sort --- otherwise + * the sort algorithm wouldn't have checked whether one must appear + * before the other. + */ + if (state->enforceUnique && !equal_isnull) + elog(ERROR, "Cannot create unique index. Table contains non-unique values"); + + return 0; +} + +static void * +copytup_index(Tuplesortstate *state, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen = IndexTupleSize(tuple); + IndexTuple newtuple; + + USEMEM(state, tuplen); + newtuple = (IndexTuple) palloc(tuplen); + memcpy(newtuple, tuple, tuplen); + + return (void *) newtuple; +} + +static void +writetup_index(Tuplesortstate *state, int tapenum, void *tup) +{ + IndexTuple tuple = (IndexTuple) tup; + unsigned int tuplen; + + tuplen = IndexTupleSize(tuple) + sizeof(tuplen); + LogicalTapeWrite(state->tapeset, tapenum, + (void*) &tuplen, sizeof(tuplen)); + LogicalTapeWrite(state->tapeset, tapenum, + (void*) tuple, IndexTupleSize(tuple)); + if (state->randomAccess) /* need trailing length word? */ + LogicalTapeWrite(state->tapeset, tapenum, + (void*) &tuplen, sizeof(tuplen)); + + FREEMEM(state, IndexTupleSize(tuple)); + pfree(tuple); +} + +static void * +readtup_index(Tuplesortstate *state, int tapenum, unsigned int len) +{ + unsigned int tuplen = len - sizeof(unsigned int); + IndexTuple tuple = (IndexTuple) palloc(tuplen); + + USEMEM(state, tuplen); + if (LogicalTapeRead(state->tapeset, tapenum, (void *) tuple, + tuplen) != tuplen) + elog(ERROR, "tuplesort: unexpected end of data"); + if (state->randomAccess) /* need trailing length word? */ + if (LogicalTapeRead(state->tapeset, tapenum, (void *) &tuplen, + sizeof(tuplen)) != sizeof(tuplen)) + elog(ERROR, "tuplesort: unexpected end of data"); + return (void *) tuple; +} diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 7c57a9a4f99..613595febf4 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: nbtree.h,v 1.31 1999/08/08 20:12:49 tgl Exp $ + * $Id: nbtree.h,v 1.32 1999/10/17 22:15:03 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -288,9 +288,12 @@ extern BTItem _bt_formitem(IndexTuple itup); /* * prototypes for functions in nbtsort.c */ -extern void *_bt_spoolinit(Relation index, int ntapes, bool isunique); -extern void _bt_spooldestroy(void *spool); -extern void _bt_spool(Relation index, BTItem btitem, void *spool); -extern void _bt_leafbuild(Relation index, void *spool); + +typedef struct BTSpool BTSpool; /* opaque type known only within nbtsort.c */ + +extern BTSpool *_bt_spoolinit(Relation index, bool isunique); +extern void _bt_spooldestroy(BTSpool *btspool); +extern void _bt_spool(BTItem btitem, BTSpool *btspool); +extern void _bt_leafbuild(BTSpool *btspool); #endif /* NBTREE_H */ diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 092fa57acb1..44aa8b8ace5 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: execnodes.h,v 1.36 1999/09/26 21:21:04 tgl Exp $ + * $Id: execnodes.h,v 1.37 1999/10/17 22:15:07 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -597,17 +597,9 @@ typedef struct GroupState /* ---------------- * SortState information * - *| sort nodes are really just a kind of a scan since - *| we implement sorts by retrieving the entire subplan - *| into a temp relation, sorting the temp relation into - *| another sorted relation, and then preforming a simple - *| unqualified sequential scan on the sorted relation.. - *| -cim 10/15/89 - * - * Flag indicated whether relation has been sorted - * Keys scan key structures used to keep info on sort keys - * TempRelation temporary relation containing result of executing - * the subplan. + * sort_Done indicates whether sort has been performed yet + * sort_Keys scan key structures describing the sort keys + * tuplesortstate private state of tuplesort.c * * CommonScanState information * @@ -628,9 +620,9 @@ typedef struct GroupState typedef struct SortState { CommonScanState csstate; /* its first field is NodeTag */ - bool sort_Flag; + bool sort_Done; ScanKey sort_Keys; - bool cleaned; + void *tuplesortstate; } SortState; /* ---------------- diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h index 095ee074d38..a03dacfb02b 100644 --- a/src/include/nodes/plannodes.h +++ b/src/include/nodes/plannodes.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: plannodes.h,v 1.30 1999/08/21 03:49:09 tgl Exp $ + * $Id: plannodes.h,v 1.31 1999/10/17 22:15:07 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -288,8 +288,6 @@ typedef struct Sort Oid nonameid; int keycount; SortState *sortstate; - void *psortstate; - bool cleaned; } Sort; /* ---------------- diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h new file mode 100644 index 00000000000..7c5a3209897 --- /dev/null +++ b/src/include/utils/tuplesort.h @@ -0,0 +1,68 @@ +/*------------------------------------------------------------------------- + * + * tuplesort.h + * Generalized tuple sorting routines. + * + * This module handles sorting of either heap tuples or index tuples + * (and could fairly easily support other kinds of sortable objects, + * if necessary). It works efficiently for both small and large amounts + * of data. Small amounts are sorted in-memory using qsort(). Large + * amounts are sorted using temporary files and a standard external sort + * algorithm. + * + * Copyright (c) 1994, Regents of the University of California + * + * $Id: tuplesort.h,v 1.1 1999/10/17 22:15:09 tgl Exp $ + * + *------------------------------------------------------------------------- + */ +#ifndef TUPLESORT_H +#define TUPLESORT_H + +#include "access/htup.h" +#include "access/itup.h" +#include "access/skey.h" +#include "access/tupdesc.h" +#include "utils/rel.h" + +/* Tuplesortstate is an opaque type whose details are not known outside tuplesort.c. */ + +typedef struct Tuplesortstate Tuplesortstate; + +/* + * We provide two different interfaces to what is essentially the same + * code: one for sorting HeapTuples and one for sorting IndexTuples. + * They differ primarily in the way that the sort key information is + * supplied. + */ + +extern Tuplesortstate *tuplesort_begin_heap(TupleDesc tupDesc, + int nkeys, ScanKey keys, + bool randomAccess); +extern Tuplesortstate *tuplesort_begin_index(Relation indexRel, + bool enforceUnique, + bool randomAccess); + +extern void tuplesort_puttuple(Tuplesortstate *state, void *tuple); + +extern void tuplesort_performsort(Tuplesortstate *state); + +extern void *tuplesort_gettuple(Tuplesortstate *state, bool forward, + bool *should_free); +#define tuplesort_getheaptuple(state, forward, should_free) \ + ((HeapTuple) tuplesort_gettuple(state, forward, should_free)) +#define tuplesort_getindextuple(state, forward, should_free) \ + ((IndexTuple) tuplesort_gettuple(state, forward, should_free)) + +extern void tuplesort_end(Tuplesortstate *state); + +/* + * These routines may only be called if randomAccess was specified 'true'. + * Backwards scan in gettuple is likewise only allowed if randomAccess. + */ + +extern void tuplesort_rescan(Tuplesortstate *state); +extern void tuplesort_markpos(Tuplesortstate *state); +extern void tuplesort_restorepos(Tuplesortstate *state); + +#endif /* TUPLESORT_H */ -- GitLab