diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c index 2ecce3086392673b23eca030cc0a19a0018d1147..0624bd06a87acc4014302827b2f2b2f70284299f 100644 --- a/src/backend/access/nbtree/nbtree.c +++ b/src/backend/access/nbtree/nbtree.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.12 1997/01/10 09:46:33 vadim Exp $ + * $Header: /cvsroot/pgsql/src/backend/access/nbtree/nbtree.c,v 1.13 1997/02/12 05:04:17 scrappy Exp $ * * NOTES * This file contains only the public interface routines. @@ -33,8 +33,8 @@ # include <string.h> #endif -bool BuildingBtree = false; -bool FastBuild = false; /* turn this on to make bulk builds work*/ +bool BuildingBtree = false; /* see comment in btbuild() */ +bool FastBuild = true; /* use sort/build instead of insertion build */ /* * btbuild() -- build a new btree index. @@ -67,21 +67,34 @@ btbuild(Relation heap, int i; BTItem btitem; #ifndef OMIT_PARTIAL_INDEX - ExprContext *econtext; - TupleTable tupleTable; - TupleTableSlot *slot; + ExprContext *econtext = (ExprContext *) NULL; + TupleTable tupleTable = (TupleTable) NULL; + TupleTableSlot *slot = (TupleTableSlot *) NULL; #endif Oid hrelid, irelid; Node *pred, *oldPred; - void *spool; + void *spool = (void *) NULL; bool isunique; - + bool usefast; + +#if 0 + ResetBufferUsage(); +#endif + /* note that this is a new btree */ BuildingBtree = true; pred = predInfo->pred; oldPred = predInfo->oldPred; + /* + * bootstrap processing does something strange, so don't use + * sort/build for initial catalog indices. at some point i need + * to look harder at this. (there is some kind of incremental + * processing going on there.) 
-- pma 08/29/95 + */ + usefast = (FastBuild && IsNormalProcessingMode()); + /* see if index is unique */ isunique = IndexIsUniqueNoCache(RelationGetRelationId(index)); @@ -110,13 +123,16 @@ btbuild(Relation heap, slot = ExecAllocTableSlot(tupleTable); econtext = makeNode(ExprContext); FillDummyExprContext(econtext, slot, htupdesc, InvalidBuffer); + + /* + * we never want to use sort/build if we are extending an + * existing partial index -- it works by inserting the + * newly-qualifying tuples into the existing index. + * (sort/build would overwrite the existing index with one + * consisting of the newly-qualifying tuples.) + */ + usefast = false; } - else - { - econtext = NULL; - tupleTable = NULL; - slot = NULL; - } #endif /* OMIT_PARTIAL_INDEX */ /* start a heap scan */ @@ -126,12 +142,10 @@ btbuild(Relation heap, /* build the index */ nhtups = nitups = 0; - if (FastBuild) { + if (usefast) { spool = _bt_spoolinit(index, 7); res = (InsertIndexResult) NULL; } - else - spool = NULL; for (; HeapTupleIsValid(htup); htup = heap_getnext(hscan, 0, &buffer)) { @@ -219,7 +233,7 @@ btbuild(Relation heap, * into a spool page for subsequent processing. otherwise, we * insert into the btree. */ - if (FastBuild) { + if (usefast) { _bt_spool(index, btitem, spool); } else { res = _bt_doinsert(index, btitem, isunique, heap); @@ -248,12 +262,24 @@ btbuild(Relation heap, * merging the runs, (2) inserting the sorted tuples into btree * pages and (3) building the upper levels. 
*/ - if (FastBuild) { - _bt_spool(index, (BTItem) NULL, spool); /* flush spool */ + if (usefast) { + _bt_spool(index, (BTItem) NULL, spool); /* flush the spool */ _bt_leafbuild(index, spool); _bt_spooldestroy(spool); } +#if 0 + { + extern int ReadBufferCount, BufferHitCount, BufferFlushCount; + extern long NDirectFileRead, NDirectFileWrite; + + printf("buffer(%d): r=%d w=%d\n", heap->rd_rel->relblocksz, + ReadBufferCount - BufferHitCount, BufferFlushCount); + printf("direct(%d): r=%d w=%d\n", LocalBlockSize, + NDirectFileRead, NDirectFileWrite); + } +#endif + /* * Since we just counted the tuples in the heap, we update its * stats in pg_class to guarantee that the planner takes advantage @@ -312,7 +338,10 @@ btinsert(Relation rel, Datum *datum, char *nulls, ItemPointer ht_ctid, Relation pfree(btitem); pfree(itup); - + + /* adjust any active scans that will be affected by this insertion */ + _bt_adjscans(rel, &(res->pointerData), BT_INSERT); + return (res); } @@ -533,7 +562,7 @@ void btdelete(Relation rel, ItemPointer tid) { /* adjust any active scans that will be affected by this deletion */ - _bt_adjscans(rel, tid); + _bt_adjscans(rel, tid, BT_DELETE); /* delete the data from the page */ _bt_pagedel(rel, tid); diff --git a/src/backend/access/nbtree/nbtsort.c b/src/backend/access/nbtree/nbtsort.c index 6bf86d6f6e092ca92625843dfddebdf604bdcc18..00bf3bb85273c2dbaa1aa70286f152ec1079cde4 100644 --- a/src/backend/access/nbtree/nbtsort.c +++ b/src/backend/access/nbtree/nbtsort.c @@ -5,7 +5,7 @@ * * * IDENTIFICATION - * $Id: nbtsort.c,v 1.8 1996/11/05 10:35:35 scrappy Exp $ + * $Id: nbtsort.c,v 1.9 1997/02/12 05:04:20 scrappy Exp $ * * NOTES * @@ -20,15 +20,15 @@ * - when each input run has been exhausted, switch to another output * tape and start processing another run. * - when we have fewer runs than tapes, we know we are ready to start - * merging into the btree leaf pages. - * - every time we complete a level of the btree, we can construct the - * next level up. 
when we have only one page on a level, it can be - * attached to the btree metapage and we are done. + * merging into the btree leaf pages. (i.e., we do not have to wait + * until we have exactly one tape.) + * - as we extract tuples from the final runs, we build the pages for + * each level. when we have only one page on a level, it must be the + * root -- it can be attached to the btree metapage and we are done. * * conventions: * - external interface routines take in and return "void *" for their - * opaque handles. this is for modularity reasons (i prefer not to - * export these structures without good reason). + * opaque handles. this is for modularity reasons. * * this code is moderately slow (~10% slower) compared to the regular * btree (insertion) build code on sorted or well-clustered data. on @@ -63,12 +63,23 @@ # include <string.h> #endif -#ifdef FASTBUILD +/* + * turn on debugging output. + * + * XXX this code just does a numeric printf of the index key, so it's + * only really useful for integer keys. + */ +/*#define FASTBUILD_DEBUG*/ +#define FASTBUILD_SPOOL +#define FASTBUILD_MERGE #define MAXTAPES (7) -#define TAPEBLCKSZ (BLCKSZ << 2) +#define TAPEBLCKSZ (MAXBLCKSZ << 2) #define TAPETEMP "pg_btsortXXXXXX" +extern int NDirectFileRead; +extern int NDirectFileWrite; +extern char *mktemp(char *template); /*------------------------------------------------------------------------- * sorting comparison routine - returns {-1,0,1} depending on whether @@ -88,6 +99,11 @@ * what the heck. 
* *------------------------------------------------------------------------- */ +typedef struct { + Datum btsk_datum; + BTItem btsk_item; +} BTSortKey; + static Relation _bt_sortrel; static void @@ -97,28 +113,41 @@ _bt_isortcmpinit(Relation index) } static int -_bt_isortcmp(const void *bti1p,const void *bti2p) +_bt_isortcmp(BTSortKey *k1, BTSortKey *k2) { - BTItem bti1 = *(BTItem *)bti1p; - BTItem bti2 = *(BTItem *)bti2p; - - if (bti1 == (BTItem) NULL) { - if (bti2 == (BTItem) NULL) { + if (k1->btsk_item == (BTItem) NULL) { + if (k2->btsk_item == (BTItem) NULL) { return(0); /* 1 = 2 */ } return(1); /* 1 > 2 */ - } else if (bti2 == (BTItem) NULL) { + } else if (k2->btsk_item == (BTItem) NULL) { return(-1); /* 1 < 2 */ - } else if (_bt_itemcmp(_bt_sortrel, 1, bti1, bti2, - BTGreaterStrategyNumber)) { + } else if (_bt_invokestrat(_bt_sortrel, 1, BTGreaterStrategyNumber, + k1->btsk_datum, k2->btsk_datum)) { return(1); /* 1 > 2 */ - } else if (_bt_itemcmp(_bt_sortrel, 1, bti2, bti1, - BTGreaterStrategyNumber)) { + } else if (_bt_invokestrat(_bt_sortrel, 1, BTGreaterStrategyNumber, + k2->btsk_datum, k1->btsk_datum)) { return(-1); /* 1 < 2 */ } return(0); /* 1 = 2 */ } +static void +_bt_setsortkey(Relation index, BTItem bti, BTSortKey *sk) +{ + sk->btsk_item = (BTItem) NULL; + sk->btsk_datum = (Datum) NULL; + if (bti != (BTItem) NULL) { + bool isnull; + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); + + if (!isnull) { + sk->btsk_item = bti; + sk->btsk_datum = d; + } + } +} + /*------------------------------------------------------------------------- * priority queue methods * @@ -133,7 +162,7 @@ _bt_isortcmp(const void *bti1p,const void *bti2p) typedef struct { int btpqe_tape; /* tape identifier */ - BTItem btpqe_item; /* pointer to BTItem in tape buffer */ + BTSortKey btpqe_item; /* sort key (first-attribute datum plus pointer to BTItem in tape buffer) */ } BTPriQueueElem; #define MAXELEM MAXTAPES @@ -336,7 +365,8 @@ static void _bt_tapewrite(BTTapeBlock *tape, int eor) { 
tape->bttb_eor = eor; - FileWrite(tape->bttb_fd, (char*)tape, TAPEBLCKSZ); + FileWrite(tape->bttb_fd, (char *) tape, TAPEBLCKSZ); + NDirectFileWrite += TAPEBLCKSZ; _bt_tapereset(tape); } @@ -356,7 +386,7 @@ _bt_taperead(BTTapeBlock *tape) int nread; if (tape->bttb_eor) { - return(0); /* we are at End-Of-Run */ + return(0); /* we are already at End-Of-Run */ } /* @@ -364,7 +394,7 @@ _bt_taperead(BTTapeBlock *tape) * VFD (the one in the block we're reading is bogus). */ fd = tape->bttb_fd; - nread = FileRead(fd, (char*) tape, TAPEBLCKSZ); + nread = FileRead(fd, (char *) tape, TAPEBLCKSZ); tape->bttb_fd = fd; if (nread != TAPEBLCKSZ) { @@ -372,6 +402,7 @@ _bt_taperead(BTTapeBlock *tape) return(0); } Assert(tape->bttb_magic == BTTAPEMAGIC); + NDirectFileRead += TAPEBLCKSZ; return(1); } @@ -445,8 +476,6 @@ typedef struct { void * _bt_spoolinit(Relation index, int ntapes) { - char *mktemp(); - BTSpool *btspool = (BTSpool *) palloc(sizeof(BTSpool)); int i; char *fname = (char *) palloc(sizeof(TAPETEMP) + 1); @@ -567,6 +596,7 @@ _bt_spool(Relation index, BTItem btitem, void *spool) BTSpool *btspool = (BTSpool *) spool; BTTapeBlock *itape; Size itemsz; + int i; itape = btspool->bts_itape[btspool->bts_tape]; itemsz = BTITEMSZ(btitem); @@ -579,7 +609,7 @@ _bt_spool(Relation index, BTItem btitem, void *spool) * buffer. */ if (btitem == (BTItem) NULL || SPCLEFT(itape) < itemsz) { - BTItem *parray; + BTSortKey *parray = (BTSortKey *) NULL; BTTapeBlock *otape; BTItem bti; char *pos; @@ -590,45 +620,49 @@ _bt_spool(Relation index, BTItem btitem, void *spool) * build an array of pointers to the BTItemDatas on the input * block. 
*/ - parray = (BTItem *) palloc(itape->bttb_ntup * sizeof(BTItem)); - if (parray == (BTItem *) NULL) { - elog(WARN, "_bt_spool: out of memory"); - } - pos = itape->bttb_data; - for (i = 0; i < itape->bttb_ntup; ++i) { - parray[i] = _bt_tapenext(itape, &pos); + if (itape->bttb_ntup > 0) { + parray = + (BTSortKey *) palloc(itape->bttb_ntup * sizeof(BTSortKey)); + if (parray == (BTSortKey *) NULL) { + elog(WARN, "_bt_spool: out of memory"); + } + pos = itape->bttb_data; + for (i = 0; i < itape->bttb_ntup; ++i) { + _bt_setsortkey(index, _bt_tapenext(itape, &pos), &(parray[i])); + } + + /* + * qsort the pointer array. + */ + _bt_isortcmpinit(index); + qsort((void *) parray, itape->bttb_ntup, sizeof(BTSortKey), + _bt_isortcmp); } - /* - * qsort the pointer array. - */ - _bt_isortcmpinit(index); - qsort((void *) parray, itape->bttb_ntup, sizeof(BTItem), _bt_isortcmp); - /* * write the spooled run into the output tape. we copy the * BTItemDatas in the order dictated by the sorted array of * BTItems, not the original order. * * (since everything was DOUBLEALIGN'd and is all on a single - * page, everything had *better* still fit on one page..) + * tape block, everything had *better* still fit on one tape + * block..) */ otape = btspool->bts_otape[btspool->bts_tape]; for (i = 0; i < itape->bttb_ntup; ++i) { - bti = parray[i]; + bti = parray[i].btsk_item; btisz = BTITEMSZ(bti); btisz = DOUBLEALIGN(btisz); _bt_tapeadd(otape, bti, btisz); -#ifdef FASTBUILD_DEBUG +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_SPOOL) { bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - RelationGetTupleDescriptor(index), - &isnull); + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, + &isnull); printf("_bt_spool: inserted <%x> into output tape %d\n", d, btspool->bts_tape); } -#endif /* FASTBUILD_DEBUG */ +#endif /* FASTBUILD_DEBUG && FASTBUILD_SPOOL */ } /* @@ -653,7 +687,9 @@ _bt_spool(Relation index, BTItem btitem, void *spool) /* * destroy the pointer array. 
*/ - pfree((void *) parray); + if (parray != (BTSortKey *) NULL) { + pfree((void *) parray); + } } /* insert this item into the current buffer */ @@ -671,6 +707,9 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) BTPageOpaque opaque; *buf = _bt_getbuf(index, P_NEW, BT_WRITE); +#if 0 + printf("\tblk=%d\n", BufferGetBlockNumber(*buf)); +#endif *page = BufferGetPage(*buf); _bt_pageinit(*page, BufferGetPageSize(*buf)); opaque = (BTPageOpaque) PageGetSpecialPointer(*page); @@ -680,8 +719,9 @@ _bt_blnewpage(Relation index, Buffer *buf, Page *page, int flags) /* * slide an array of ItemIds back one slot (from P_FIRSTKEY to - * P_HIKEY). we need to do this when we discover that we have built - * an ItemId array in what has turned out to be a P_RIGHTMOST page. + * P_HIKEY, overwriting P_HIKEY). we need to do this when we discover + * that we have built an ItemId array in what has turned out to be a + * P_RIGHTMOST page. */ static void _bt_slideleft(Relation index, Buffer buf, Page page) @@ -691,24 +731,71 @@ _bt_slideleft(Relation index, Buffer buf, Page page) ItemId previi; ItemId thisii; - maxoff = PageGetMaxOffsetNumber(page); - previi = PageGetItemId(page, P_HIKEY); - for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { - thisii = PageGetItemId(page, off); - *previi = *thisii; - previi = thisii; + if (!PageIsEmpty(page)) { + maxoff = PageGetMaxOffsetNumber(page); + previi = PageGetItemId(page, P_HIKEY); + for (off = P_FIRSTKEY; off <= maxoff; off = OffsetNumberNext(off)) { + thisii = PageGetItemId(page, off); + *previi = *thisii; + previi = thisii; + } + ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); } - ((PageHeader) page)->pd_lower -= sizeof(ItemIdData); } -typedef struct { +typedef struct BTPageState { Buffer btps_buf; Page btps_page; BTItem btps_lastbti; OffsetNumber btps_lastoff; OffsetNumber btps_firstoff; + int btps_level; + bool btps_doupper; + struct BTPageState *btps_next; } BTPageState; +/* + * allocate and initialize 
a new BTPageState. the returned structure + * is suitable for immediate use by _bt_buildadd. + */ +void * +_bt_pagestate(Relation index, int flags, int level, bool doupper) +{ + BTPageState *state = (BTPageState *) palloc(sizeof(BTPageState)); + + (void) memset((char *) state, 0, sizeof(BTPageState)); + _bt_blnewpage(index, &(state->btps_buf), &(state->btps_page), flags); + state->btps_firstoff = InvalidOffsetNumber; + state->btps_lastoff = P_HIKEY; + state->btps_lastbti = (BTItem) NULL; + state->btps_next = (BTPageState *) NULL; + state->btps_level = level; + state->btps_doupper = doupper; + + return((void *) state); +} + +/* + * return a copy of the minimum (P_HIKEY or P_FIRSTKEY) item on + * 'opage'. the copy is modified to point to 'opage' (as opposed to + * the page to which the item used to point, e.g., a heap page if + * 'opage' is a leaf page). + */ +BTItem +_bt_minitem(Page opage, BlockNumber oblkno, int atend) +{ + OffsetNumber off; + BTItem obti; + BTItem nbti; + + off = atend ? P_HIKEY : P_FIRSTKEY; + obti = (BTItem) PageGetItem(opage, PageGetItemId(opage, off)); + nbti = _bt_formitem(&(obti->bti_itup)); + ItemPointerSet(&(nbti->bti_itup.t_tid), oblkno, P_HIKEY); + + return(nbti); +} + /* * add an item to a disk page from a merge tape block. * @@ -748,11 +835,13 @@ typedef struct { * * if all keys are unique, 'first' will always be the same as 'last'. 
*/ -static void -_bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) +BTItem +_bt_buildadd(Relation index, void *pstate, BTItem bti, int flags) { + BTPageState *state = (BTPageState *) pstate; Buffer nbuf; Page npage; + char *pos; BTItem last_bti; OffsetNumber first_off; OffsetNumber last_off; @@ -804,19 +893,26 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) ii = PageGetItemId(opage, o); (void) PageAddItem(npage, PageGetItem(opage, ii), ii->lp_len, n, LP_USED); -#ifdef FASTBUILD_DEBUG +#if 0 +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) { bool isnull; BTItem tmpbti = (BTItem) PageGetItem(npage, PageGetItemId(npage, n)); Datum d = index_getattr(&(tmpbti->bti_itup), 1, - RelationGetTupleDescriptor(index), - &isnull); - printf("_bt_buildadd: moved <%x> to offset %d\n", - d, n); + index->rd_att, &isnull); + printf("_bt_buildadd: moved <%x> to offset %d at level %d\n", + d, n, state->btps_level); } -#endif /* FASTBUILD_DEBUG */ +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ +#endif } + /* + * this loop is backward because PageIndexTupleDelete shuffles + * the tuples to fill holes in the page -- by starting at the + * end and working back, we won't create holes (and thereby + * avoid shuffling). + */ for (o = last_off; o > first_off; o = OffsetNumberPrev(o)) { PageIndexTupleDelete(opage, o); } @@ -842,6 +938,23 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) nopaque->btpo_next = P_NONE; } + /* + * copy the old buffer's minimum key to its parent. if we + * don't have a parent, we have to create one; this adds a new + * btree level. + */ + if (state->btps_doupper) { + BTItem nbti; + + if (state->btps_next == (BTPageState *) NULL) { + state->btps_next = + _bt_pagestate(index, 0, state->btps_level + 1, true); + } + nbti = _bt_minitem(opage, BufferGetBlockNumber(obuf), 0); + (void) _bt_buildadd(index, state->btps_next, nbti, 0); + pfree((void *) nbti); + } + /* * write out the old stuff. 
we never want to see it again, so * we can give up our lock (if we had one; BuildingBtree is @@ -856,16 +969,16 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) */ off = OffsetNumberNext(last_off); (void) PageAddItem(npage, (Item) bti, btisz, off, LP_USED); -#ifdef FASTBUILD_DEBUG +#if 0 +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) { bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - RelationGetTupleDescriptor(index), - &isnull); - printf("_bt_buildadd: inserted <%x> at offset %d\n", - d, off); + Datum d = index_getattr(&(bti->bti_itup), 1, index->rd_att, &isnull); + printf("_bt_buildadd: inserted <%x> at offset %d at level %d\n", + d, off, state->btps_level); } -#endif /* FASTBUILD_DEBUG */ +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ +#endif if (last_bti == (BTItem) NULL) { first_off = P_FIRSTKEY; } else if (!_bt_itemcmp(index, 1, bti, last_bti, BTEqualStrategyNumber)) { @@ -879,6 +992,48 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) state->btps_lastbti = last_bti; state->btps_lastoff = last_off; state->btps_firstoff = first_off; + + return(last_bti); +} + +void +_bt_uppershutdown(Relation index, BTPageState *state) +{ + BTPageState *s; + BlockNumber blkno; + BTPageOpaque opaque; + BTItem bti; + + for (s = state; s != (BTPageState *) NULL; s = s->btps_next) { + blkno = BufferGetBlockNumber(s->btps_buf); + opaque = (BTPageOpaque) PageGetSpecialPointer(s->btps_page); + + /* + * if this is the root, attach it to the metapage. otherwise, + * stick the minimum key of the last page on this level (which + * has not been split, or else it wouldn't be the last page) + * into its parent. this may cause the last page of upper + * levels to split, but that's not a problem -- we haven't + * gotten to them yet. 
+ */ + if (s->btps_doupper) { + if (s->btps_next == (BTPageState *) NULL) { + opaque->btpo_flags |= BTP_ROOT; + _bt_metaproot(index, blkno); + } else { + bti = _bt_minitem(s->btps_page, blkno, 0); + (void) _bt_buildadd(index, s->btps_next, bti, 0); + pfree((void *) bti); + } + } + + /* + * this is the rightmost page, so the ItemId array needs to be + * slid back one slot. + */ + _bt_slideleft(index, s->btps_buf, s->btps_page); + _bt_wrtbuf(index, s->btps_buf); + } } /* @@ -888,11 +1043,10 @@ _bt_buildadd(Relation index, BTPageState *state, BTItem bti, int flags) * * XXX three nested loops? gross. cut me up into smaller routines. */ -static BlockNumber +static void _bt_merge(Relation index, BTSpool *btspool) { - BTPageState state; - BlockNumber firstblk; + BTPageState *state; BTPriQueue q; BTPriQueueElem e; BTItem bti; @@ -902,29 +1056,31 @@ _bt_merge(Relation index, BTSpool *btspool) int tapedone[MAXTAPES]; int t; int goodtapes; + int npass; int nruns; Size btisz; bool doleaf = false; + BTPageState *s; + BTPageOpaque *opaque; /* * initialize state needed for the merge into the btree leaf pages. */ - (void) memset((char *) &state, 0, sizeof(BTPageState)); - _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), BTP_LEAF); - state.btps_lastoff = P_HIKEY; - state.btps_lastbti = (BTItem) NULL; - firstblk = BufferGetBlockNumber(state.btps_buf); + state = (BTPageState *) _bt_pagestate(index, BTP_LEAF, 0, true); + npass = 0; do { /* pass */ /* * each pass starts by flushing the previous outputs and - * swapping inputs and outputs. this process also clears the - * new output tapes and rewinds the new input tapes. + * swapping inputs and outputs. flushing sets End-of-Run for + * any dirty output tapes. swapping clears the new output + * tapes and rewinds the new input tapes. 
*/ btspool->bts_tape = btspool->bts_ntapes - 1; _bt_spoolflush(btspool); _bt_spoolswap(btspool); + ++npass; nruns = 0; for (;;) { /* run */ @@ -949,22 +1105,27 @@ _bt_merge(Relation index, BTSpool *btspool) for (t = 0; t < btspool->bts_ntapes; ++t) { itape = btspool->bts_itape[t]; tapepos[t] = itape->bttb_data; + tapedone[t] = 0; _bt_tapereset(itape); - if (_bt_taperead(itape) == 0) { - tapedone[t] = 1; - } else { + do { + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } + } while (!tapedone[t] && EMPTYTAPE(itape)); + if (!tapedone[t]) { ++goodtapes; - tapedone[t] = 0; e.btpqe_tape = t; - e.btpqe_item = _bt_tapenext(itape, &tapepos[t]); - if (e.btpqe_item != (BTItem) NULL) { + _bt_setsortkey(index, _bt_tapenext(itape, &tapepos[t]), + &(e.btpqe_item)); + if (e.btpqe_item.btsk_item != (BTItem) NULL) { _bt_pqadd(&q, &e); } } } /* * if we don't have any tapes with any input (i.e., they - * are all at EOF), we must be done with this pass. + * are all at EOF), there is no work to do in this run -- + * we must be done with this pass. */ if (goodtapes == 0) { break; /* for */ @@ -972,8 +1133,8 @@ _bt_merge(Relation index, BTSpool *btspool) ++nruns; /* - * output the smallest element from the queue until there are no - * more. + * output the smallest element from the queue until there + * are no more. */ while (_bt_pqnext(&q, &e) >= 0) { /* item */ /* @@ -982,63 +1143,59 @@ _bt_merge(Relation index, BTSpool *btspool) * if it hits either End-Of-Run or EOF. 
*/ t = e.btpqe_tape; - bti = e.btpqe_item; + bti = e.btpqe_item.btsk_item; if (bti != (BTItem) NULL) { btisz = BTITEMSZ(bti); btisz = DOUBLEALIGN(btisz); if (doleaf) { - _bt_buildadd(index, &state, bti, BTP_LEAF); -#ifdef FASTBUILD_DEBUG + (void) _bt_buildadd(index, state, bti, BTP_LEAF); +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, - RelationGetTupleDescriptor(index), - &isnull); - printf("_bt_merge: inserted <%x> into block %d\n", - d, BufferGetBlockNumber(state.btps_buf)); + index->rd_att, &isnull); + printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into block %d\n", + npass, nruns, d, t, + BufferGetBlockNumber(state->btps_buf)); } -#endif /* FASTBUILD_DEBUG */ +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ } else { if (SPCLEFT(otape) < btisz) { /* * if it's full, write it out and add the - * item to the next block. (since we know - * there will be at least one more block, - * we know we do *not* want to set - * End-Of-Run here!) + * item to the next block. (since we will + * be adding another tuple immediately + * after this, we can be sure that there + * will be at least one more block in this + * run and so we know we do *not* want to + * set End-Of-Run here.) 
*/ _bt_tapewrite(otape, 0); } _bt_tapeadd(otape, bti, btisz); -#ifdef FASTBUILD_DEBUG +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) { bool isnull; Datum d = index_getattr(&(bti->bti_itup), 1, - RelationGetTupleDescriptor(index), &isnull); - printf("_bt_merge: inserted <%x> into tape %d\n", - d, btspool->bts_tape); + index->rd_att, &isnull); + printf("_bt_merge: [pass %d run %d] inserted <%x> from tape %d into output tape %d\n", + npass, nruns, d, t, + btspool->bts_tape); } -#endif /* FASTBUILD_DEBUG */ +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ } } -#ifdef FASTBUILD_DEBUG - { - bool isnull; - Datum d = index_getattr(&(bti->bti_itup), 1, - RelationGetTupleDescriptor(index), - &isnull); - printf("_bt_merge: got <%x> from tape %d\n", d, t); - } -#endif /* FASTBUILD_DEBUG */ - itape = btspool->bts_itape[t]; if (!tapedone[t]) { BTItem newbti = _bt_tapenext(itape, &tapepos[t]); if (newbti == (BTItem) NULL) { - if (_bt_taperead(itape) == 0) { - tapedone[t] = 1; - } else { + do { + if (_bt_taperead(itape) == 0) { + tapedone[t] = 1; + } + } while (!tapedone[t] && EMPTYTAPE(itape)); + if (!tapedone[t]) { tapepos[t] = itape->bttb_data; newbti = _bt_tapenext(itape, &tapepos[t]); } @@ -1047,11 +1204,17 @@ _bt_merge(Relation index, BTSpool *btspool) BTPriQueueElem nexte; nexte.btpqe_tape = t; - nexte.btpqe_item = newbti; + _bt_setsortkey(index, newbti, &(nexte.btpqe_item)); _bt_pqadd(&q, &nexte); } } } /* item */ + + /* + * that's it for this run. flush the output tape, marking + * End-of-Run. + */ + _bt_tapewrite(otape, 1); } /* run */ /* @@ -1068,60 +1231,50 @@ _bt_merge(Relation index, BTSpool *btspool) } } while (nruns > 0); /* pass */ - /* - * this is the rightmost page, so the ItemId array needs to be - * slid back one slot. 
- */ - _bt_slideleft(index, state.btps_buf, state.btps_page); - _bt_wrtbuf(index, state.btps_buf); - - return(firstblk); + _bt_uppershutdown(index, state); } /* - * given the block number 'blk' of the first page of a set of linked - * siblings (i.e., the start of an entire level of the btree), - * construct the corresponding next level of the btree. we do this by - * placing minimum keys from each page into this page. the format of - * the internal pages is otherwise the same as for leaf pages. + * given the (appropriately side-linked) leaf pages of a btree, + * construct the corresponding upper levels. we do this by inserting + * minimum keys from each page into parent pages as needed. the + * format of the internal pages is otherwise the same as for leaf + * pages. + * + * this routine is not called during conventional bulk-loading (in + * which case we can just build the upper levels as we create the + * sorted bottom level). it is only used for index recycling. */ void -_bt_upperbuild(Relation index, BlockNumber blk, int level) +_bt_upperbuild(Relation index) { Buffer rbuf; + BlockNumber blk; Page rpage; BTPageOpaque ropaque; - BTPageState state; - BlockNumber firstblk; - BTItem bti; + BTPageState *state; BTItem nbti; - OffsetNumber off; - - rbuf = _bt_getbuf(index, blk, BT_WRITE); - rpage = BufferGetPage(rbuf); - ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); /* - * if we only have one page on a level, we can just make it the - * root. + * find the first leaf block. while we're at it, clear the + * BTP_ROOT flag that we set while building it (so we could find + * it later). 
*/ - if (P_RIGHTMOST(ropaque)) { - ropaque->btpo_flags |= BTP_ROOT; - _bt_wrtbuf(index, rbuf); - _bt_metaproot(index, blk); - return; - } - _bt_relbuf(index, rbuf, BT_WRITE); + rbuf = _bt_getroot(index, BT_WRITE); + blk = BufferGetBlockNumber(rbuf); + rpage = BufferGetPage(rbuf); + ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); + ropaque->btpo_flags &= ~BTP_ROOT; + _bt_wrtbuf(index, rbuf); - (void) memset((char *) &state, 0, sizeof(BTPageState)); - _bt_blnewpage(index, &(state.btps_buf), &(state.btps_page), 0); - state.btps_lastoff = P_HIKEY; - state.btps_lastbti = (BTItem) NULL; - firstblk = BufferGetBlockNumber(state.btps_buf); + state = (BTPageState *) _bt_pagestate(index, 0, 0, true); /* for each page... */ do { +#if 0 + printf("\t\tblk=%d\n", blk); +#endif rbuf = _bt_getbuf(index, blk, BT_READ); rpage = BufferGetPage(rbuf); ropaque = (BTPageOpaque) PageGetSpecialPointer(rpage); @@ -1133,35 +1286,24 @@ _bt_upperbuild(Relation index, BlockNumber blk, int level) * of the lower page and insert it into a page at this * level. */ - off = P_RIGHTMOST(ropaque) ? 
P_HIKEY : P_FIRSTKEY; - bti = (BTItem) PageGetItem(rpage, PageGetItemId(rpage, off)); - nbti = _bt_formitem(&(bti->bti_itup)); - ItemPointerSet(&(nbti->bti_itup.t_tid), blk, P_HIKEY); -#ifdef FASTBUILD_DEBUG + nbti = _bt_minitem(rpage, blk, P_RIGHTMOST(ropaque)); +#if defined(FASTBUILD_DEBUG) && defined(FASTBUILD_MERGE) { bool isnull; - Datum d = index_getattr(&(nbti->bti_itup), 1, - RelationGetTupleDescriptor(index), + Datum d = index_getattr(&(nbti->bti_itup), 1, index->rd_att, &isnull); printf("_bt_upperbuild: inserting <%x> at %d\n", - d, level); + d, state->btps_level); } -#endif /* FASTBUILD_DEBUG */ - _bt_buildadd(index, &state, nbti, 0); +#endif /* FASTBUILD_DEBUG && FASTBUILD_MERGE */ + (void) _bt_buildadd(index, state, nbti, 0); pfree((void *) nbti); } blk = ropaque->btpo_next; _bt_relbuf(index, rbuf, BT_READ); } while (blk != P_NONE); - /* - * this is the rightmost page, so the ItemId array needs to be - * slid back one slot. - */ - _bt_slideleft(index, state.btps_buf, state.btps_page); - _bt_wrtbuf(index, state.btps_buf); - - _bt_upperbuild(index, firstblk, level + 1); + _bt_uppershutdown(index, state); } /* @@ -1171,26 +1313,5 @@ _bt_upperbuild(Relation index, BlockNumber blk, int level) void _bt_leafbuild(Relation index, void *spool) { - BTSpool *btspool = (BTSpool *) spool; - BlockNumber firstblk; - - /* - * merge the runs into btree leaf pages. - */ - firstblk = _bt_merge(index, btspool); - - /* - * build the upper levels of the btree. 
- */ - _bt_upperbuild(index, firstblk, 0); + _bt_merge(index, (BTSpool *) spool); } - -#else /* !FASTBUILD */ - -void *_bt_spoolinit(Relation index, int ntapes) { return((void *) NULL); } -void _bt_spooldestroy(void *spool) { } -void _bt_spool(Relation index, BTItem btitem, void *spool) { } -void _bt_upperbuild(Relation index, BlockNumber blk, int level) { } -void _bt_leafbuild(Relation index, void *spool) { } - -#endif /* !FASTBUILD */ diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index affe0d584450da43ff3a0e501186201bfb472e1c..582f9933c09b88005f2a47f3c78bbf7441b298a1 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -6,7 +6,7 @@ * * Copyright (c) 1994, Regents of the University of California * - * $Id: nbtree.h,v 1.5 1997/01/10 09:36:33 vadim Exp $ + * $Id: nbtree.h,v 1.6 1997/02/12 05:04:28 scrappy Exp $ * *------------------------------------------------------------------------- */ @@ -131,6 +131,13 @@ typedef BTStackData *BTStack; #define BT_INSERTION 0 #define BT_DESCENT 1 +/* + * We must classify index modification types for the benefit of + * _bt_adjscans. + */ +#define BT_INSERT 0 +#define BT_DELETE 1 + /* * In general, the btree code tries to localize its knowledge about * page layout to a couple of routines. 
However, we need a special @@ -220,11 +227,7 @@ extern void btdelete(Relation rel, ItemPointer tid); */ extern void _bt_regscan(IndexScanDesc scan); extern void _bt_dropscan(IndexScanDesc scan); -extern void _bt_adjscans(Relation rel, ItemPointer tid); -extern void _bt_scandel(IndexScanDesc scan, BlockNumber blkno, - OffsetNumber offno); -extern bool _bt_scantouched(IndexScanDesc scan, BlockNumber blkno, - OffsetNumber offno); +extern void _bt_adjscans(Relation rel, ItemPointer tid, int op); /* * prototypes for functions in nbtsearch.c @@ -267,7 +270,7 @@ extern BTItem _bt_formitem(IndexTuple itup); extern void *_bt_spoolinit(Relation index, int ntapes); extern void _bt_spooldestroy(void *spool); extern void _bt_spool(Relation index, BTItem btitem, void *spool); -extern void _bt_upperbuild(Relation index, BlockNumber blk, int level); +extern void _bt_upperbuild(Relation index); extern void _bt_leafbuild(Relation index, void *spool); #endif /* NBTREE_H */ diff --git a/src/port/BSD44_derived.h b/src/port/BSD44_derived.h new file mode 100644 index 0000000000000000000000000000000000000000..919b38cffe053b38592789a65222fd5981379704 --- /dev/null +++ b/src/port/BSD44_derived.h @@ -0,0 +1,7 @@ +# define USE_POSIX_TIME +# define NEED_I386_TAS_ASM +# define HAS_TEST_AND_SET +# if defined(__mips__) +/* # undef HAS_TEST_AND_SET */ +# endif + typedef unsigned char slock_t;