diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c index 413767ffeec1ce0800f2e9e27941137516052e03..9a3ca695d6de5c58a439649e80750a9e307a23e1 100644 --- a/src/backend/access/nbtree/nbtinsert.c +++ b/src/backend/access/nbtree/nbtinsert.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.161 2007/11/15 21:14:32 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtinsert.c,v 1.162 2007/11/16 19:53:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -371,13 +371,13 @@ _bt_check_unique(Relation rel, IndexTuple itup, Relation heapRel, * removing any LP_DEAD tuples. * * On entry, *buf and *offsetptr point to the first legal position - * where the new tuple could be inserted. The caller should hold an - * exclusive lock on *buf. *offsetptr can also be set to - * InvalidOffsetNumber, in which case the function will search the right - * location within the page if needed. On exit, they point to the chosen - * insert location. If findinsertloc decided to move right, the lock and - * pin on the original page will be released and the new page returned to - * the caller is exclusively locked instead. + * where the new tuple could be inserted. The caller should hold an + * exclusive lock on *buf. *offsetptr can also be set to + * InvalidOffsetNumber, in which case the function will search for the + * right location within the page if needed. On exit, they point to the + * chosen insert location. If _bt_findinsertloc decides to move right, + * the lock and pin on the original page will be released and the new + * page returned to the caller is exclusively locked instead. * * newtup is the new tuple we're inserting, and scankey is an insertion * type scan key for it. @@ -422,8 +422,6 @@ _bt_findinsertloc(Relation rel, "Consider a function index of an MD5 hash of the value, " "or use full text indexing."))); - - /*---------- * If we will need to split the page to put the item on this page, * check whether we can put the tuple somewhere to the right, @@ -1004,7 +1002,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, xl_btree_split xlrec; uint8 xlinfo; XLogRecPtr recptr; - XLogRecData rdata[6]; + XLogRecData rdata[7]; XLogRecData *lastrdata; xlrec.node = rel->rd_node; @@ -1020,15 +1018,32 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata = &rdata[0]; - /* Log downlink on non-leaf pages. */ if (ropaque->btpo.level > 0) { + /* Log downlink on non-leaf pages */ lastrdata->next = lastrdata + 1; lastrdata++; lastrdata->data = (char *) &newitem->t_tid.ip_blkid; lastrdata->len = sizeof(BlockIdData); lastrdata->buffer = InvalidBuffer; + + /* + * We must also log the left page's high key, because the right + * page's leftmost key is suppressed on non-leaf levels. Show it + * as belonging to the left page buffer, so that it is not stored + * if XLogInsert decides it needs a full-page image of the left + * page. + */ + lastrdata->next = lastrdata + 1; + lastrdata++; + + itemid = PageGetItemId(origpage, P_HIKEY); + item = (IndexTuple) PageGetItem(origpage, itemid); + lastrdata->data = (char *) item; + lastrdata->len = MAXALIGN(IndexTupleSize(item)); + lastrdata->buffer = buf; /* backup block 1 */ + lastrdata->buffer_std = true; } /* @@ -1057,7 +1072,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright, lastrdata->buffer = buf; /* backup block 1 */ lastrdata->buffer_std = true; } - else + else if (ropaque->btpo.level == 0) { /* * Although we don't need to WAL-log the new item, we still need diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c index cdea9c1652529a1804abb650e2f6c6635f9b428a..37bcfb90e025af35ac009d1753715c95f82662a2 100644 --- a/src/backend/access/nbtree/nbtxlog.c +++ b/src/backend/access/nbtree/nbtxlog.c @@ -8,7 +8,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.48 2007/11/15 22:25:15 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/nbtree/nbtxlog.c,v 1.49 2007/11/16 19:53:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -273,6 +273,8 @@ btree_xlog_split(bool onleft, bool isroot, OffsetNumber newitemoff = 0; Item newitem = NULL; Size newitemsz = 0; + Item left_hikey = NULL; + Size left_hikeysz = 0; reln = XLogOpenRelation(xlrec->node); @@ -289,6 +291,17 @@ btree_xlog_split(bool onleft, bool isroot, datalen -= sizeof(BlockIdData); forget_matching_split(xlrec->node, downlink, false); + + /* Extract left hikey and its size (still assuming 16-bit alignment) */ + if (!(record->xl_info & XLR_BKP_BLOCK_1)) + { + /* We assume 16-bit alignment is enough for IndexTupleSize */ + left_hikey = (Item) datapos; + left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey)); + + datapos += left_hikeysz; + datalen -= left_hikeysz; + } } /* Extract newitem and newitemoff, if present */ @@ -302,17 +315,13 @@ btree_xlog_split(bool onleft, bool isroot, if (onleft && !(record->xl_info & XLR_BKP_BLOCK_1)) { - IndexTupleData itupdata; - /* - * We need to copy the tuple header to apply IndexTupleDSize, because - * of alignment considerations. However, we assume that PageAddItem - * doesn't care about the alignment of the newitem pointer it's given. + * We assume that 16-bit alignment is enough to apply IndexTupleSize + * (since it's fetching from a uint16 field) and also enough for + * PageAddItem to insert the tuple. */ - newitem = datapos; - memcpy(&itupdata, datapos, sizeof(IndexTupleData)); - newitemsz = IndexTupleDSize(itupdata); - newitemsz = MAXALIGN(newitemsz); + newitem = (Item) datapos; + newitemsz = MAXALIGN(IndexTupleSize(newitem)); datapos += newitemsz; datalen -= newitemsz; } @@ -333,6 +342,18 @@ btree_xlog_split(bool onleft, bool isroot, _bt_restore_page(rpage, datapos, datalen); + /* + * On leaf level, the high key of the left page is equal to the + * first key on the right page. + */ + if (xlrec->level == 0) + { + ItemId hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); + + left_hikey = PageGetItem(rpage, hiItemId); + left_hikeysz = ItemIdGetLength(hiItemId); + } + PageSetLSN(rpage, lsn); PageSetTLI(rpage, ThisTimeLineID); MarkBufferDirty(rbuf); @@ -360,8 +381,6 @@ btree_xlog_split(bool onleft, bool isroot, OffsetNumber maxoff = PageGetMaxOffsetNumber(lpage); OffsetNumber deletable[MaxOffsetNumber]; int ndeletable = 0; - ItemId hiItemId; - Item hiItem; /* * Remove the items from the left page that were copied to the @@ -394,11 +413,8 @@ btree_xlog_split(bool onleft, bool isroot, elog(PANIC, "failed to add new item to left page after split"); } - /* Set high key equal to the first key on the right page */ - hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque)); - hiItem = PageGetItem(rpage, hiItemId); - - if (PageAddItem(lpage, hiItem, ItemIdGetLength(hiItemId), + /* Set high key */ + if (PageAddItem(lpage, left_hikey, left_hikeysz, P_HIKEY, false, false) == InvalidOffsetNumber) elog(PANIC, "failed to add high key to left page after split"); diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h index 34041b8da3c196624652b0a45ff359e8bd0a1649..31bea4e98fef73494010e9e5de6d853d00f91563 100644 --- a/src/include/access/nbtree.h +++ b/src/include/access/nbtree.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.114 2007/11/15 21:14:42 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/nbtree.h,v 1.115 2007/11/16 19:53:50 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -289,8 +289,15 @@ typedef struct xl_btree_split * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit * aligned.) * + * If level > 0, an IndexTuple representing the HIKEY of the left page + * follows. We don't need this on leaf pages, because it's the same + * as the leftmost key in the new right page. Also, it's suppressed if + * XLogInsert chooses to store the left page's whole page image. + * * In the _L variants, next are OffsetNumber newitemoff and the new item. * (In the _R variants, the new item is one of the right page's tuples.) + * The new item, but not newitemoff, is suppressed if XLogInsert chooses + * to store the left page's whole page image. * * Last are the right page's tuples in the form used by _bt_restore_page. */