diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index ca3b634b07b43e824afae4789db456509ee0438d..b23006460956bff90bc4bda6436b1aaa99b5d2ab 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.109 2003/11/29 19:51:40 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.110 2003/12/21 01:23:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -86,8 +86,8 @@ _bt_doinsert(Relation rel, BTItem btitem, itup_scankey = _bt_mkscankey(rel, itup); top: - /* find the page containing this key */ - stack = _bt_search(rel, natts, itup_scankey, &buf, BT_WRITE); + /* find the first page containing this key */ + stack = _bt_search(rel, natts, itup_scankey, false, &buf, BT_WRITE); /* trade in our read lock for a write lock */ LockBuffer(buf, BUFFER_LOCK_UNLOCK); @@ -100,7 +100,7 @@ top: * need to move right in the tree. See Lehman and Yao for an * excruciatingly precise description. */ - buf = _bt_moveright(rel, buf, natts, itup_scankey, BT_WRITE); + buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE); /* * If we're not allowing duplicates, make sure the key isn't already @@ -175,7 +175,7 @@ _bt_check_unique(Relation rel, BTItem btitem, Relation heapRel, * Find first item >= proposed new item. Note we could also get a * pointer to end-of-page here. */ - offset = _bt_binsrch(rel, buf, natts, itup_scankey); + offset = _bt_binsrch(rel, buf, natts, itup_scankey, false); /* * Scan over all equal tuples, looking for live conflicts. @@ -478,7 +478,7 @@ _bt_insertonpg(Relation rel, if (movedright) newitemoff = P_FIRSTDATAKEY(lpageop); else - newitemoff = _bt_binsrch(rel, buf, keysz, scankey); + newitemoff = _bt_binsrch(rel, buf, keysz, scankey, false); } /* diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c index 4004f02f02d03dbb50e0da7d0d1236082d75de73..bc0ed6641f7eac25a98eeacd77610485ba89e3f8 100644 --- a/src/backend/access/nbtree/nbtpage.c +++ b/src/backend/access/nbtree/nbtpage.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.73 2003/11/29 19:51:40 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtpage.c,v 1.74 2003/12/21 01:23:06 tgl Exp $ * * NOTES * Postgres btree pages look like ordinary relation pages. The opaque @@ -804,7 +804,7 @@ _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full) /* we need a scan key to do our search, so build one */ itup_scankey = _bt_mkscankey(rel, &(targetkey->bti_itup)); /* find the leftmost leaf page containing this key */ - stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, + stack = _bt_search(rel, rel->rd_rel->relnatts, itup_scankey, false, &lbuf, BT_READ); /* don't need a pin on that either */ _bt_relbuf(rel, lbuf); diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c index 87d5d0a22a852321116ab2ad847b272273fda6ad..554959fc79ef8fd67809087302fe3d3d4d5256ed 100644 --- a/src/backend/access/nbtree/nbtsearch.c +++ b/src/backend/access/nbtree/nbtsearch.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.83 2003/11/29 19:51:40 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtsearch.c,v 1.84 2003/12/21 01:23:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -28,6 +28,10 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * _bt_search() -- Search the tree for a particular scankey, * or more precisely for the first leaf page it could be on. * + * When nextkey is false (the usual case), we are looking for the first + * item >= scankey. When nextkey is true, we are looking for the first + * item strictly greater than scankey. + * * Return value is a stack of parent-page pointers. *bufP is set to the * address of the leaf-page buffer, which is read-locked and pinned. * No locks are held on the parent pages, however! @@ -38,7 +42,7 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir); * will result in *bufP being set to InvalidBuffer. */ BTStack -_bt_search(Relation rel, int keysz, ScanKey scankey, +_bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey, Buffer *bufP, int access) { BTStack stack_in = NULL; @@ -68,7 +72,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, * its pointer in the parent (or metapage). If it has, we may * need to move right to its new sibling. Do that. */ - *bufP = _bt_moveright(rel, *bufP, keysz, scankey, BT_READ); + *bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ); /* if this is a leaf page, we're done */ page = BufferGetPage(*bufP); @@ -80,7 +84,7 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, * Find the appropriate item on the internal page, and get the * child page that it points to. */ - offnum = _bt_binsrch(rel, *bufP, keysz, scankey); + offnum = _bt_binsrch(rel, *bufP, keysz, scankey, nextkey); itemid = PageGetItemId(page, offnum); btitem = (BTItem) PageGetItem(page, itemid); itup = &(btitem->bti_itup); @@ -117,47 +121,59 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, /* * _bt_moveright() -- move right in the btree if necessary. * - * When we follow a pointer to reach a page, it is possible that - * the page has changed in the meanwhile. If this happens, we're - * guaranteed that the page has "split right" -- that is, that any - * data that appeared on the page originally is either on the page - * or strictly to the right of it. + * When we follow a pointer to reach a page, it is possible that + * the page has changed in the meanwhile. If this happens, we're + * guaranteed that the page has "split right" -- that is, that any + * data that appeared on the page originally is either on the page + * or strictly to the right of it. + * + * When nextkey is false (the usual case), we are looking for the first + * item >= scankey. When nextkey is true, we are looking for the first + * item strictly greater than scankey. * - * This routine decides whether or not we need to move right in the - * tree by examining the high key entry on the page. If that entry - * is strictly less than one we expect to be on the page, then our - * picture of the page is incorrect and we need to move right. + * This routine decides whether or not we need to move right in the + * tree by examining the high key entry on the page. If that entry + * is strictly less than the scankey, or <= the scankey in the nextkey=true + * case, then we followed the wrong link and we need to move right. * - * On entry, we have the buffer pinned and a lock of the proper type. - * If we move right, we release the buffer and lock and acquire the - * same on the right sibling. Return value is the buffer we stop at. + * On entry, we have the buffer pinned and a lock of the type specified by + * 'access'. If we move right, we release the buffer and lock and acquire + * the same on the right sibling. Return value is the buffer we stop at. */ Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, ScanKey scankey, + bool nextkey, int access) { Page page; BTPageOpaque opaque; + int32 cmpval; page = BufferGetPage(buf); opaque = (BTPageOpaque) PageGetSpecialPointer(page); /* - * If the scan key that brought us to this page is > the high key - * stored on the page, then the page has split and we need to move - * right. (If the scan key is equal to the high key, we might or - * might not need to move right; have to scan the page first anyway.) - * It could even have split more than once, so scan as far as needed. + * When nextkey = false (normal case): if the scan key that brought us to + * this page is > the high key stored on the page, then the page has split + * and we need to move right. (If the scan key is equal to the high key, + * we might or might not need to move right; have to scan the page first + * anyway.) + * + * When nextkey = true: move right if the scan key is >= page's high key. + * + * The page could even have split more than once, so scan as far as needed. * * We also have to move right if we followed a link that brought us to a * dead page. */ + cmpval = nextkey ? 0 : 1; + while (!P_RIGHTMOST(opaque) && (P_IGNORE(opaque) || - _bt_compare(rel, keysz, scankey, page, P_HIKEY) > 0)) + _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)) { /* step right one page */ BlockNumber rblkno = opaque->btpo_next; @@ -178,21 +194,26 @@ _bt_moveright(Relation rel, /* * _bt_binsrch() -- Do a binary search for a key on a particular page. * + * When nextkey is false (the usual case), we are looking for the first + * item >= scankey. When nextkey is true, we are looking for the first + * item strictly greater than scankey. + * * The scankey we get has the compare function stored in the procedure * entry of each data struct. We invoke this regproc to do the * comparison for every key in the scankey. * * On a leaf page, _bt_binsrch() returns the OffsetNumber of the first - * key >= given scankey. (NOTE: in particular, this means it is possible - * to return a value 1 greater than the number of keys on the page, - * if the scankey is > all keys on the page.) + * key >= given scankey, or > scankey if nextkey is true. (NOTE: in + * particular, this means it is possible to return a value 1 greater than the + * number of keys on the page, if the scankey is > all keys on the page.) * * On an internal (non-leaf) page, _bt_binsrch() returns the OffsetNumber - * of the last key < given scankey. (Since _bt_compare treats the first - * data key of such a page as minus infinity, there will be at least one - * key < scankey, so the result always points at one of the keys on the - * page.) This key indicates the right place to descend to be sure we - * find all leaf keys >= given scankey. + * of the last key < given scankey, or last key <= given scankey if nextkey + * is true. (Since _bt_compare treats the first data key of such a page as + * minus infinity, there will be at least one key < scankey, so the result + * always points at one of the keys on the page.) This key indicates the + * right place to descend to be sure we find all leaf keys >= given scankey + * (or leaf keys > given scankey when nextkey is true). * * This procedure is not responsible for walking right, it just examines * the given page. _bt_binsrch() has no lock or refcount side effects @@ -202,14 +223,16 @@ OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, - ScanKey scankey) + ScanKey scankey, + bool nextkey) { TupleDesc itupdesc; Page page; BTPageOpaque opaque; OffsetNumber low, high; - int32 result; + int32 result, + cmpval; itupdesc = RelationGetDescr(rel); page = BufferGetPage(buf); @@ -229,12 +252,23 @@ _bt_binsrch(Relation rel, return low; /* - * Binary search to find the first key on the page >= scan key. Loop - * invariant: all slots before 'low' are < scan key, all slots at or - * after 'high' are >= scan key. We can fall out when high == low. + * Binary search to find the first key on the page >= scan key, or + * first key > scankey when nextkey is true. + * + * For nextkey=false (cmpval=1), the loop invariant is: all slots + * before 'low' are < scan key, all slots at or after 'high' + * are >= scan key. + * + * For nextkey=true (cmpval=0), the loop invariant is: all slots + * before 'low' are <= scan key, all slots at or after 'high' + * are > scan key. + * + * We can fall out when high == low. */ high++; /* establish the loop invariant for high */ + cmpval = nextkey ? 0 : 1; /* select comparison value */ + while (high > low) { OffsetNumber mid = low + ((high - low) / 2); @@ -243,7 +277,7 @@ _bt_binsrch(Relation rel, result = _bt_compare(rel, keysz, scankey, page, mid); - if (result > 0) + if (result >= cmpval) low = mid + 1; else high = mid; @@ -253,15 +287,15 @@ _bt_binsrch(Relation rel, * At this point we have high == low, but be careful: they could point * past the last slot on the page. * - * On a leaf page, we always return the first key >= scan key (which - * could be the last slot + 1). + * On a leaf page, we always return the first key >= scan key (resp. + * > scan key), which could be the last slot + 1. */ if (P_ISLEAF(opaque)) return low; /* - * On a non-leaf page, return the last key < scan key. There must be - * one if _bt_compare() is playing by the rules. + * On a non-leaf page, return the last key < scan key (resp. <= scan key). + * There must be one if _bt_compare() is playing by the rules. */ Assert(low > P_FIRSTDATAKEY(opaque)); @@ -462,6 +496,7 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) StrategyNumber strat; bool res; int32 result; + bool nextkey; bool continuescan; ScanKey scankeys = NULL; ScanKey *startKeys = NULL; @@ -662,11 +697,51 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) current = &(scan->currentItemData); + /* + * We want to locate either the first item >= boundary point, or + * first item > boundary point, depending on the initial-positioning + * strategy we just chose. + */ + switch (strat_total) + { + case BTLessStrategyNumber: + nextkey = false; + break; + + case BTLessEqualStrategyNumber: + nextkey = true; + break; + + case BTEqualStrategyNumber: + /* + * If a backward scan was specified, need to start with last + * equal item not first one. + */ + if (ScanDirectionIsBackward(dir)) + nextkey = true; + else + nextkey = false; + break; + + case BTGreaterEqualStrategyNumber: + nextkey = false; + break; + + case BTGreaterStrategyNumber: + nextkey = true; + break; + + default: + /* can't get here, but keep compiler quiet */ + elog(ERROR, "unrecognized strat_total: %d", (int) strat_total); + return false; + } + /* * Use the manufactured scan key to descend the tree and position * ourselves on the target leaf page. */ - stack = _bt_search(rel, keysCount, scankeys, &buf, BT_READ); + stack = _bt_search(rel, keysCount, scankeys, nextkey, &buf, BT_READ); /* don't need to keep the stack around... */ _bt_freestack(stack); @@ -686,35 +761,45 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) page = BufferGetPage(buf); /* position to the precise item on the page */ - offnum = _bt_binsrch(rel, buf, keysCount, scankeys); + offnum = _bt_binsrch(rel, buf, keysCount, scankeys, nextkey); ItemPointerSet(current, blkno, offnum); /* - * At this point we are positioned at the first item >= scan key, or - * possibly at the end of a page on which all the existing items are + * It's now time to examine the initial-positioning strategy to find the + * exact place to start the scan. + * + * If nextkey = false, we are positioned at the first item >= scan key, + * or possibly at the end of a page on which all the existing items are * less than the scan key and we know that everything on later pages * is greater than or equal to scan key. * - * We could step forward in the latter case, but that'd be a waste of - * time if we want to scan backwards. So, it's now time to examine - * the initial-positioning strategy to find the exact place to start - * the scan. + * If nextkey = true, we are positioned at the first item > scan key, + * or possibly at the end of a page on which all the existing items are + * less than or equal to the scan key and we know that everything on + * later pages is greater than scan key. + * + * The actually desired starting point is either this item or an adjacent + * one, or in the end-of-page case it's the last item on this page or + * the first item on the next. We apply _bt_step if needed to get to + * the right place. * * Note: if _bt_step fails (meaning we fell off the end of the index in - * one direction or the other), we either return false (no matches) or - * call _bt_endpoint() to set up a scan starting at that index - * endpoint, as appropriate for the desired scan type. + * one direction or the other), then there are no matches so we just + * return false. * * it's yet other place to add some code later for is(not)null ... */ - switch (strat_total) { case BTLessStrategyNumber: /* - * Back up one to arrive at last item < scankey + * We are on first item >= scankey. + * + * Back up one to arrive at last item < scankey. (Note: this + * positioning strategy is only used for a backward scan, so + * that is always the correct starting position.) */ if (!_bt_step(scan, &buf, BackwardScanDirection)) { @@ -726,30 +811,12 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTLessEqualStrategyNumber: /* - * We need to find the last item <= scankey, so step forward - * till we find one > scankey, then step back one. + * We are on first item > scankey. + * + * Back up one to arrive at last item <= scankey. (Note: this + * positioning strategy is only used for a backward scan, so + * that is always the correct starting position.) */ - if (offnum > PageGetMaxOffsetNumber(page)) - { - if (!_bt_step(scan, &buf, ForwardScanDirection)) - { - pfree(scankeys); - return _bt_endpoint(scan, dir); - } - } - for (;;) - { - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, keysCount, scankeys, page, offnum); - if (result < 0) - break; - if (!_bt_step(scan, &buf, ForwardScanDirection)) - { - pfree(scankeys); - return _bt_endpoint(scan, dir); - } - } if (!_bt_step(scan, &buf, BackwardScanDirection)) { pfree(scankeys); @@ -758,45 +825,49 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) break; case BTEqualStrategyNumber: - /* - * Make sure we are on the first equal item; might have to - * step forward if currently at end of page. + * If a backward scan was specified, need to start with last + * equal item not first one. */ - if (offnum > PageGetMaxOffsetNumber(page)) + if (ScanDirectionIsBackward(dir)) { - if (!_bt_step(scan, &buf, ForwardScanDirection)) + /* + * We are on first item > scankey. + * + * Back up one to arrive at last item <= scankey, then + * check to see if it is equal to scankey. + */ + if (!_bt_step(scan, &buf, BackwardScanDirection)) { pfree(scankeys); return false; } - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); } - result = _bt_compare(rel, keysCount, scankeys, page, offnum); - if (result != 0) - goto nomatches; /* no equal items! */ - - /* - * If a backward scan was specified, need to start with last - * equal item not first one. - */ - if (ScanDirectionIsBackward(dir)) + else { - do + /* + * We are on first item >= scankey. + * + * Make sure we are on a real item; might have to + * step forward if currently at end of page. Then check + * to see if it is equal to scankey. + */ + if (offnum > PageGetMaxOffsetNumber(page)) { if (!_bt_step(scan, &buf, ForwardScanDirection)) { pfree(scankeys); - return _bt_endpoint(scan, dir); + return false; } - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, keysCount, scankeys, page, offnum); - } while (result == 0); - if (!_bt_step(scan, &buf, BackwardScanDirection)) - elog(ERROR, "equal items disappeared?"); + } } + + /* If we are not now on an equal item, then there ain't any. */ + offnum = ItemPointerGetOffsetNumber(current); + page = BufferGetPage(buf); + result = _bt_compare(rel, keysCount, scankeys, page, offnum); + if (result != 0) + goto nomatches; /* no equal items! */ break; case BTGreaterEqualStrategyNumber: @@ -818,8 +889,8 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) case BTGreaterStrategyNumber: /* - * We want the first item > scankey, so make sure we are on an - * item and then step over any equal items. + * We want the first item > scankey, which is where we are... + * unless we're not anywhere at all... */ if (offnum > PageGetMaxOffsetNumber(page)) { @@ -828,20 +899,6 @@ _bt_first(IndexScanDesc scan, ScanDirection dir) pfree(scankeys); return false; } - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - } - result = _bt_compare(rel, keysCount, scankeys, page, offnum); - while (result == 0) - { - if (!_bt_step(scan, &buf, ForwardScanDirection)) - { - pfree(scankeys); - return false; - } - offnum = ItemPointerGetOffsetNumber(current); - page = BufferGetPage(buf); - result = _bt_compare(rel, keysCount, scankeys, page, offnum); } break; } diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 6fc61a90999bb81879e2c0e6d3064ff61fe0a9bc..62af3ca30e9f602e5693a7130d8983790e406416 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.74 2003/11/29 22:40:55 pgsql Exp $ + * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.75 2003/12/21 01:23:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -459,12 +459,13 @@ extern int _bt_pagedel(Relation rel, Buffer buf, bool vacuum_full); /* * prototypes for functions in nbtsearch.c */ -extern BTStack _bt_search(Relation rel, int keysz, ScanKey scankey, - Buffer *bufP, int access); +extern BTStack _bt_search(Relation rel, + int keysz, ScanKey scankey, bool nextkey, + Buffer *bufP, int access); extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz, - ScanKey scankey, int access); + ScanKey scankey, bool nextkey, int access); extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz, - ScanKey scankey); + ScanKey scankey, bool nextkey); extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey, Page page, OffsetNumber offnum); extern bool _bt_next(IndexScanDesc scan, ScanDirection dir);