diff --git a/src/backend/access/nbtree/README b/src/backend/access/nbtree/README
index cd110df332ca29150538486326243a258db79897..4820f76e3bb80cae26e1a1ad7113aea860314403 100644
--- a/src/backend/access/nbtree/README
+++ b/src/backend/access/nbtree/README
@@ -404,12 +404,41 @@ an additional insertion above that, etc).
 For a root split, the followon WAL entry is a "new root" entry rather than
 an "insertion" entry, but details are otherwise much the same.
 
-Because insertion involves multiple atomic actions, the WAL replay logic
-has to detect the case where a page split isn't followed by a matching
-insertion on the parent level, and then do that insertion on its own (and
-recursively for any subsequent parent insertion, of course).  This is
-feasible because the WAL entry for the split contains enough info to know
-what must be inserted in the parent level.
+Because splitting involves multiple atomic actions, it's possible that the
+system crashes between splitting a page and inserting the downlink for the
+new half to the parent.  After recovery, the downlink for the new page will
+be missing.  The search algorithm works correctly, as the page will be found
+by following the right-link from its left sibling, although if a lot of
+downlinks in the tree are missing, performance will suffer.  A more serious
+consequence is that if the page without a downlink gets split again, the
+insertion algorithm will fail to find the location in the parent level to
+insert the downlink.
+
+Our approach is to create any missing downlinks on-the-fly, when searching
+the tree for a new insertion.  It could be done during searches, too, but
+it seems best not to put any extra updates in what would otherwise be a
+read-only operation (updating is not possible in hot standby mode anyway).
+It would seem natural to add the missing downlinks in VACUUM, but since
+inserting a downlink might require splitting a page, it might fail if you
+run out of disk space.  That would be bad during VACUUM - the reason for
+running VACUUM in the first place might be that you ran out of disk space,
+and now VACUUM won't finish because you're out of disk space.  In contrast,
+an insertion can require enlarging the physical file anyway.
+
+To identify missing downlinks, when a page is split, the left page is
+flagged to indicate that the split is not yet complete (INCOMPLETE_SPLIT).
+When the downlink is inserted to the parent, the flag is cleared atomically
+with the insertion.  The child page is kept locked until the insertion in
+the parent is finished and the flag in the child cleared, but can be
+released immediately after that, before recursing up the tree if the parent
+also needs to be split.  This ensures that incompletely split pages are
+not seen under normal circumstances; one is seen only when insertion to
+the parent has failed for some reason.
+
+We flag the left page, even though it's the right page that's missing the
+downlink, because it's more convenient to know already when following the
+right-link from the left page to the right page that it will need to have
+its downlink inserted to the parent.
 
 When splitting a non-root page that is alone on its level, the required
 metapage update (of the "fast root" link) is performed and logged as part
@@ -419,8 +448,16 @@ metapage update is handled as part of the "new root" action.
 Each step in page deletion are logged as separate WAL entries: marking the
 leaf as half-dead and removing the downlink is one record, and unlinking a
 page is a second record.  If vacuum is interrupted for some reason, or the
-system crashes, the tree is consistent for searches and insertions.  The next
-VACUUM will find the half-dead leaf page and continue the deletion.
+system crashes, the tree is consistent for searches and insertions.  The
+next VACUUM will find the half-dead leaf page and continue the deletion.
+
+Before 9.4, we used to keep track of incomplete splits and page deletions
+during recovery and finish them immediately at end of recovery, instead of
+doing it lazily at the next insertion or vacuum.  However, that made the
+recovery much more complicated, and only fixed the problem when crash
+recovery was performed.  An incomplete split can also occur if an otherwise
+recoverable error, like out-of-memory or out-of-disk-space, happens while
+inserting the downlink to the parent.
 
 Scans during Recovery
 ---------------------
diff --git a/src/backend/access/nbtree/nbtinsert.c b/src/backend/access/nbtree/nbtinsert.c
index 5f7953f5b38a7632af871ad5d4ab8b0adba4880d..3cd70ad4cfe7799decd6b44bf6d0bae608e06243 100644
--- a/src/backend/access/nbtree/nbtinsert.c
+++ b/src/backend/access/nbtree/nbtinsert.c
@@ -57,15 +57,18 @@ static void _bt_findinsertloc(Relation rel,
 				  int keysz,
 				  ScanKey scankey,
 				  IndexTuple newtup,
+				  BTStack stack,
 				  Relation heapRel);
-static void _bt_insertonpg(Relation rel, Buffer buf,
+static void _bt_insertonpg(Relation rel, Buffer buf, Buffer cbuf,
 			   BTStack stack,
 			   IndexTuple itup,
 			   OffsetNumber newitemoff,
 			   bool split_only_page);
-static Buffer _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
-		  OffsetNumber newitemoff, Size newitemsz,
+static Buffer _bt_split(Relation rel, Buffer buf, Buffer cbuf,
+		  OffsetNumber firstright, OffsetNumber newitemoff, Size newitemsz,
 		  IndexTuple newitem, bool newitemonleft);
+static void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
+				  BTStack stack, bool is_root, bool is_only);
 static OffsetNumber _bt_findsplitloc(Relation rel, Page page,
 				 OffsetNumber newitemoff,
 				 Size newitemsz,
@@ -129,7 +132,8 @@ top:
 	 * move right in the tree.	See Lehman and Yao for an excruciatingly
 	 * precise description.
 	 */
-	buf = _bt_moveright(rel, buf, natts, itup_scankey, false, BT_WRITE);
+	buf = _bt_moveright(rel, buf, natts, itup_scankey, false,
+						true, stack, BT_WRITE);
 
 	/*
 	 * If we're not allowing duplicates, make sure the key isn't already in
@@ -182,8 +186,9 @@ top:
 		 */
 		CheckForSerializableConflictIn(rel, NULL, buf);
 		/* do the insertion */
-		_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup, heapRel);
-		_bt_insertonpg(rel, buf, stack, itup, offset, false);
+		_bt_findinsertloc(rel, &buf, &offset, natts, itup_scankey, itup,
+						  stack, heapRel);
+		_bt_insertonpg(rel, buf, InvalidBuffer, stack, itup, offset, false);
 	}
 	else
 	{
@@ -507,6 +512,7 @@ _bt_findinsertloc(Relation rel,
 				  int keysz,
 				  ScanKey scankey,
 				  IndexTuple newtup,
+				  BTStack stack,
 				  Relation heapRel)
 {
 	Buffer		buf = *bufptr;
@@ -568,6 +574,7 @@ _bt_findinsertloc(Relation rel,
 	while (PageGetFreeSpace(page) < itemsz)
 	{
 		Buffer		rbuf;
+		BlockNumber	rblkno;
 
 		/*
 		 * before considering moving right, see if we can obtain enough space
@@ -605,18 +612,33 @@ _bt_findinsertloc(Relation rel,
 		 */
 		rbuf = InvalidBuffer;
 
+		rblkno = lpageop->btpo_next;
 		for (;;)
 		{
-			BlockNumber rblkno = lpageop->btpo_next;
-
 			rbuf = _bt_relandgetbuf(rel, rbuf, rblkno, BT_WRITE);
 			page = BufferGetPage(rbuf);
 			lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
+
+			/*
+			 * If this page was incompletely split, finish the split now.
+			 * We do this while holding a lock on the left sibling, which
+			 * is not good because finishing the split could be a fairly
+			 * lengthy operation.  But this should happen very seldom.
+			 */
+			if (P_INCOMPLETE_SPLIT(lpageop))
+			{
+				_bt_finish_split(rel, rbuf, stack);
+				rbuf = InvalidBuffer;
+				continue;
+			}
+
 			if (!P_IGNORE(lpageop))
 				break;
 			if (P_RIGHTMOST(lpageop))
 				elog(ERROR, "fell off the end of index \"%s\"",
 					 RelationGetRelationName(rel));
+
+			rblkno = lpageop->btpo_next;
 		}
 		_bt_relbuf(rel, buf);
 		buf = rbuf;
@@ -658,10 +680,14 @@ _bt_findinsertloc(Relation rel,
  *			   child page on the parent.
  *			+  updates the metapage if a true root or fast root is split.
  *
- *		On entry, we must have the right buffer in which to do the
+ *		On entry, we must have the correct buffer in which to do the
  *		insertion, and the buffer must be pinned and write-locked.	On return,
  *		we will have dropped both the pin and the lock on the buffer.
  *
+ *		When inserting to a non-leaf page, 'cbuf' is the left-sibling of the
+ *		page we're inserting the downlink for.  This function will clear the
+ *		INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *
  *		The locking interactions in this code are critical.  You should
  *		grok Lehman and Yao's paper before making any changes.  In addition,
  *		you need to understand how we disambiguate duplicate keys in this
@@ -675,6 +701,7 @@ _bt_findinsertloc(Relation rel,
 static void
 _bt_insertonpg(Relation rel,
 			   Buffer buf,
+			   Buffer cbuf,
 			   BTStack stack,
 			   IndexTuple itup,
 			   OffsetNumber newitemoff,
@@ -688,6 +715,14 @@ _bt_insertonpg(Relation rel,
 	page = BufferGetPage(buf);
 	lpageop = (BTPageOpaque) PageGetSpecialPointer(page);
 
+	/* child buffer must be given iff inserting on an internal page */
+	Assert(P_ISLEAF(lpageop) == !BufferIsValid(cbuf));
+
+	/* The caller should've finished any incomplete splits already. */
+	if (P_INCOMPLETE_SPLIT(lpageop))
+		elog(ERROR, "cannot insert to incompletely split page %u",
+			 BufferGetBlockNumber(buf));
+
 	itemsz = IndexTupleDSize(*itup);
 	itemsz = MAXALIGN(itemsz);	/* be safe, PageAddItem will do this but we
 								 * need to be consistent */
@@ -712,7 +747,7 @@ _bt_insertonpg(Relation rel,
 									  &newitemonleft);
 
 		/* split the buffer into left and right halves */
-		rbuf = _bt_split(rel, buf, firstright,
+		rbuf = _bt_split(rel, buf, cbuf, firstright,
 						 newitemoff, itemsz, itup, newitemonleft);
 		PredicateLockPageSplit(rel,
 							   BufferGetBlockNumber(buf),
@@ -786,11 +821,22 @@ _bt_insertonpg(Relation rel,
 			MarkBufferDirty(metabuf);
 		}
 
+		/* clear INCOMPLETE_SPLIT flag on child if inserting a downlink */
+		if (BufferIsValid(cbuf))
+		{
+			Page		cpage = BufferGetPage(cbuf);
+			BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+			Assert(P_INCOMPLETE_SPLIT(cpageop));
+			cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+			MarkBufferDirty(cbuf);
+		}
+
 		/* XLOG stuff */
 		if (RelationNeedsWAL(rel))
 		{
 			xl_btree_insert xlrec;
-			BlockNumber xldownlink;
+			BlockNumber xlleftchild;
 			xl_btree_metadata xlmeta;
 			uint8		xlinfo;
 			XLogRecPtr	recptr;
@@ -810,12 +856,15 @@ _bt_insertonpg(Relation rel,
 				xlinfo = XLOG_BTREE_INSERT_LEAF;
 			else
 			{
-				xldownlink = ItemPointerGetBlockNumber(&(itup->t_tid));
-				Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
-
-				nextrdata->data = (char *) &xldownlink;
+				/*
+				 * Include the block number of the left child, whose
+				 * INCOMPLETE_SPLIT flag was cleared.
+				 */
+				xlleftchild = BufferGetBlockNumber(cbuf);
+				nextrdata->data = (char *) &xlleftchild;
 				nextrdata->len = sizeof(BlockNumber);
-				nextrdata->buffer = InvalidBuffer;
+				nextrdata->buffer = cbuf;
+				nextrdata->buffer_std = true;
 				nextrdata->next = nextrdata + 1;
 				nextrdata++;
 
@@ -870,7 +919,8 @@ _bt_insertonpg(Relation rel,
 		/* release buffers */
 		if (BufferIsValid(metabuf))
 			_bt_relbuf(rel, metabuf);
-
+		if (BufferIsValid(cbuf))
+			_bt_relbuf(rel, cbuf);
 		_bt_relbuf(rel, buf);
 	}
 }
@@ -883,11 +933,15 @@ _bt_insertonpg(Relation rel,
  *		new right page.  newitemoff etc. tell us about the new item that
  *		must be inserted along with the data from the old page.
  *
+ *		When splitting a non-leaf page, 'cbuf' is the left-sibling of the
+ *		page we're inserting the downlink for.  This function will clear the
+ *		INCOMPLETE_SPLIT flag on it, and release the buffer.
+ *
  *		Returns the new right sibling of buf, pinned and write-locked.
  *		The pin and lock on buf are maintained.
  */
 static Buffer
-_bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
+_bt_split(Relation rel, Buffer buf, Buffer cbuf, OffsetNumber firstright,
 		  OffsetNumber newitemoff, Size newitemsz, IndexTuple newitem,
 		  bool newitemonleft)
 {
@@ -911,6 +965,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	OffsetNumber maxoff;
 	OffsetNumber i;
 	bool		isroot;
+	bool		isleaf;
 
 	/* Acquire a new page to split into */
 	rbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
@@ -949,12 +1004,15 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	ropaque = (BTPageOpaque) PageGetSpecialPointer(rightpage);
 
 	isroot = P_ISROOT(oopaque);
+	isleaf = P_ISLEAF(oopaque);
 
 	/* if we're splitting this page, it won't be the root when we're done */
 	/* also, clear the SPLIT_END and HAS_GARBAGE flags in both pages */
 	lopaque->btpo_flags = oopaque->btpo_flags;
 	lopaque->btpo_flags &= ~(BTP_ROOT | BTP_SPLIT_END | BTP_HAS_GARBAGE);
 	ropaque->btpo_flags = lopaque->btpo_flags;
+	/* set flag in left page indicating that the right page has no downlink */
+	lopaque->btpo_flags |= BTP_INCOMPLETE_SPLIT;
 	lopaque->btpo_prev = oopaque->btpo_prev;
 	lopaque->btpo_next = rightpagenumber;
 	ropaque->btpo_prev = origpagenumber;
@@ -1178,6 +1236,19 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		MarkBufferDirty(sbuf);
 	}
 
+	/*
+	 * Clear INCOMPLETE_SPLIT flag on child if inserting the new item finishes
+	 * a split.
+	 */
+	if (!isleaf)
+	{
+		Page		cpage = BufferGetPage(cbuf);
+		BTPageOpaque cpageop = (BTPageOpaque) PageGetSpecialPointer(cpage);
+
+		cpageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+		MarkBufferDirty(cbuf);
+	}
+
 	/* XLOG stuff */
 	if (RelationNeedsWAL(rel))
 	{
@@ -1186,6 +1257,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 		XLogRecPtr	recptr;
 		XLogRecData rdata[7];
 		XLogRecData *lastrdata;
+		BlockNumber cblkno;
 
 		xlrec.node = rel->rd_node;
 		xlrec.leftsib = origpagenumber;
@@ -1200,34 +1272,6 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 
 		lastrdata = &rdata[0];
 
-		if (ropaque->btpo.level > 0)
-		{
-			/* Log downlink on non-leaf pages */
-			lastrdata->next = lastrdata + 1;
-			lastrdata++;
-
-			lastrdata->data = (char *) &newitem->t_tid.ip_blkid;
-			lastrdata->len = sizeof(BlockIdData);
-			lastrdata->buffer = InvalidBuffer;
-
-			/*
-			 * We must also log the left page's high key, because the right
-			 * page's leftmost key is suppressed on non-leaf levels.  Show it
-			 * as belonging to the left page buffer, so that it is not stored
-			 * if XLogInsert decides it needs a full-page image of the left
-			 * page.
-			 */
-			lastrdata->next = lastrdata + 1;
-			lastrdata++;
-
-			itemid = PageGetItemId(origpage, P_HIKEY);
-			item = (IndexTuple) PageGetItem(origpage, itemid);
-			lastrdata->data = (char *) item;
-			lastrdata->len = MAXALIGN(IndexTupleSize(item));
-			lastrdata->buffer = buf;	/* backup block 1 */
-			lastrdata->buffer_std = true;
-		}
-
 		/*
 		 * Log the new item and its offset, if it was inserted on the left
 		 * page. (If it was put on the right page, we don't need to explicitly
@@ -1254,23 +1298,62 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 			lastrdata->buffer = buf;	/* backup block 1 */
 			lastrdata->buffer_std = true;
 		}
-		else if (ropaque->btpo.level == 0)
+
+		/* Log left page */
+		if (!isleaf)
 		{
+			lastrdata->next = lastrdata + 1;
+			lastrdata++;
+
 			/*
-			 * Although we don't need to WAL-log the new item, we still need
-			 * XLogInsert to consider storing a full-page image of the left
-			 * page, so make an empty entry referencing that buffer. This also
-			 * ensures that the left page is always backup block 1.
+			 * We must also log the left page's high key, because the right
+			 * page's leftmost key is suppressed on non-leaf levels.  Show it
+			 * as belonging to the left page buffer, so that it is not stored
+			 * if XLogInsert decides it needs a full-page image of the left
+			 * page.
 			 */
+			itemid = PageGetItemId(origpage, P_HIKEY);
+			item = (IndexTuple) PageGetItem(origpage, itemid);
+			lastrdata->data = (char *) item;
+			lastrdata->len = MAXALIGN(IndexTupleSize(item));
+			lastrdata->buffer = buf;	/* backup block 1 */
+			lastrdata->buffer_std = true;
+		}
+
+		if (isleaf && !newitemonleft)
+		{
 			lastrdata->next = lastrdata + 1;
 			lastrdata++;
 
+			/*
+			 * Although we don't need to WAL-log anything on the left page,
+			 * we still need XLogInsert to consider storing a full-page image
+			 * of the left page, so make an empty entry referencing that
+			 * buffer. This also ensures that the left page is always backup
+			 * block 1.
+			 */
 			lastrdata->data = NULL;
 			lastrdata->len = 0;
 			lastrdata->buffer = buf;	/* backup block 1 */
 			lastrdata->buffer_std = true;
 		}
 
+		/*
+		 * Log block number of left child, whose INCOMPLETE_SPLIT flag this
+		 * insertion clears.
+		 */
+		if (!isleaf)
+		{
+			lastrdata->next = lastrdata + 1;
+			lastrdata++;
+
+			cblkno = BufferGetBlockNumber(cbuf);
+			lastrdata->data = (char *) &cblkno;
+			lastrdata->len = sizeof(BlockNumber);
+			lastrdata->buffer = cbuf;	/* backup block 2 */
+			lastrdata->buffer_std = true;
+		}
+
 		/*
 		 * Log the contents of the right page in the format understood by
 		 * _bt_restore_page(). We set lastrdata->buffer to InvalidBuffer,
@@ -1300,7 +1383,7 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 
 			lastrdata->data = NULL;
 			lastrdata->len = 0;
-			lastrdata->buffer = sbuf;	/* backup block 2 */
+			lastrdata->buffer = sbuf;	/* bkp block 2 (leaf) or 3 (non-leaf) */
 			lastrdata->buffer_std = true;
 		}
 
@@ -1327,6 +1410,10 @@ _bt_split(Relation rel, Buffer buf, OffsetNumber firstright,
 	if (!P_RIGHTMOST(ropaque))
 		_bt_relbuf(rel, sbuf);
 
+	/* release the child */
+	if (!isleaf)
+		_bt_relbuf(rel, cbuf);
+
 	/* split's done */
 	return rbuf;
 }
@@ -1597,10 +1684,8 @@ _bt_checksplitloc(FindSplitData *state,
  *			have to be efficient (concurrent ROOT split, WAL recovery)
  * is_root - we split the true root
  * is_only - we split a page alone on its level (might have been fast root)
- *
- * This is exported so it can be called by nbtxlog.c.
  */
-void
+static void
 _bt_insert_parent(Relation rel,
 				  Buffer buf,
 				  Buffer rbuf,
@@ -1619,8 +1704,7 @@ _bt_insert_parent(Relation rel,
 	 *
 	 * If we have to search for the parent level, we do so by re-descending
 	 * from the root.  This is not super-efficient, but it's rare enough not
-	 * to matter.  (This path is also taken when called from WAL recovery ---
-	 * we have no stack in that case.)
+	 * to matter.
 	 */
 	if (is_root)
 	{
@@ -1679,12 +1763,13 @@ _bt_insert_parent(Relation rel,
 		 * 05/27/97
 		 */
 		ItemPointerSet(&(stack->bts_btentry.t_tid), bknum, P_HIKEY);
-
 		pbuf = _bt_getstackbuf(rel, stack, BT_WRITE);
 
-		/* Now we can unlock the children */
+		/*
+		 * Now we can unlock the right child. The left child will be unlocked
+		 * by _bt_insertonpg().
+		 */
 		_bt_relbuf(rel, rbuf);
-		_bt_relbuf(rel, buf);
 
 		/* Check for error only after writing children */
 		if (pbuf == InvalidBuffer)
@@ -1692,7 +1777,7 @@ _bt_insert_parent(Relation rel,
 				 RelationGetRelationName(rel), bknum, rbknum);
 
 		/* Recursively update the parent */
-		_bt_insertonpg(rel, pbuf, stack->bts_parent,
+		_bt_insertonpg(rel, pbuf, buf, stack->bts_parent,
 					   new_item, stack->bts_offset + 1,
 					   is_only);
 
@@ -1701,6 +1786,62 @@ _bt_insert_parent(Relation rel,
 	}
 }
 
+/*
+ * _bt_finish_split() -- Finish an incomplete split
+ *
+ * A crash or other failure can leave a split incomplete.  The insertion
+ * routines won't allow to insert on a page that is incompletely split.
+ * Before inserting on such a page, call _bt_finish_split().
+ *
+ * On entry, 'lbuf' must be locked in write-mode.  On exit, it is unlocked
+ * and unpinned.
+ */
+void
+_bt_finish_split(Relation rel, Buffer lbuf, BTStack stack)
+{
+	Page		lpage = BufferGetPage(lbuf);
+	BTPageOpaque lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
+	Buffer		rbuf;
+	Page		rpage;
+	BTPageOpaque rpageop;
+	bool		was_root;
+	bool		was_only;
+
+	Assert(P_INCOMPLETE_SPLIT(lpageop));
+
+	/* Lock right sibling, the one missing the downlink */
+	rbuf = _bt_getbuf(rel, lpageop->btpo_next, BT_WRITE);
+	rpage = BufferGetPage(rbuf);
+	rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
+
+	/* Could this be a root split? */
+	if (!stack)
+	{
+		Buffer		metabuf;
+		Page		metapg;
+		BTMetaPageData *metad;
+
+		/* acquire lock on the metapage */
+		metabuf = _bt_getbuf(rel, BTREE_METAPAGE, BT_WRITE);
+		metapg = BufferGetPage(metabuf);
+		metad = BTPageGetMeta(metapg);
+
+		was_root = (metad->btm_root == BufferGetBlockNumber(lbuf));
+
+		_bt_relbuf(rel, metabuf);
+	}
+	else
+		was_root = false;
+
+	/* Was this the only page on the level before split? */
+	was_only = (P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop));
+
+	elog(DEBUG1, "finishing incomplete split of %u/%u",
+		 BufferGetBlockNumber(lbuf), BufferGetBlockNumber(rbuf));
+
+	_bt_insert_parent(rel, lbuf, rbuf, stack, was_root, was_only);
+}
+
 /*
  *	_bt_getstackbuf() -- Walk back up the tree one step, and find the item
  *						 we last looked at in the parent.
@@ -1733,6 +1874,12 @@ _bt_getstackbuf(Relation rel, BTStack stack, int access)
 		page = BufferGetPage(buf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
 
+		if (access == BT_WRITE && P_INCOMPLETE_SPLIT(opaque))
+		{
+			_bt_finish_split(rel, buf, stack->bts_parent);
+			continue;
+		}
+
 		if (!P_IGNORE(opaque))
 		{
 			OffsetNumber offnum,
@@ -1837,6 +1984,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 				rbkno;
 	BlockNumber rootblknum;
 	BTPageOpaque rootopaque;
+	BTPageOpaque lopaque;
 	ItemId		itemid;
 	IndexTuple	item;
 	Size		itemsz;
@@ -1848,6 +1996,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	lbkno = BufferGetBlockNumber(lbuf);
 	rbkno = BufferGetBlockNumber(rbuf);
 	lpage = BufferGetPage(lbuf);
+	lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
 
 	/* get a new root page */
 	rootbuf = _bt_getbuf(rel, P_NEW, BT_WRITE);
@@ -1921,6 +2070,11 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 			 BufferGetBlockNumber(lbuf), RelationGetRelationName(rel));
 	pfree(new_item);
 
+	/* Clear the incomplete-split flag in the left child */
+	Assert(P_INCOMPLETE_SPLIT(lopaque));
+	lopaque->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+	MarkBufferDirty(lbuf);
+
 	MarkBufferDirty(rootbuf);
 	MarkBufferDirty(metabuf);
 
@@ -1929,7 +2083,7 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 	{
 		xl_btree_newroot xlrec;
 		XLogRecPtr	recptr;
-		XLogRecData rdata[2];
+		XLogRecData rdata[3];
 
 		xlrec.node = rel->rd_node;
 		xlrec.rootblk = rootblknum;
@@ -1948,7 +2102,13 @@ _bt_newroot(Relation rel, Buffer lbuf, Buffer rbuf)
 		rdata[1].len = ((PageHeader) rootpage)->pd_special -
 			((PageHeader) rootpage)->pd_upper;
 		rdata[1].buffer = InvalidBuffer;
-		rdata[1].next = NULL;
+		rdata[1].next = &(rdata[2]);
+
+		/* Make a full-page image of the left child if needed */
+		rdata[2].data = NULL;
+		rdata[2].len = 0;
+		rdata[2].buffer = lbuf;
+		rdata[2].next = NULL;
 
 		recptr = XLogInsert(RM_BTREE_ID, XLOG_BTREE_NEWROOT, rdata);
 
diff --git a/src/backend/access/nbtree/nbtpage.c b/src/backend/access/nbtree/nbtpage.c
index 14e422a1d36d643192035dc5f022ec8c7bb6dc65..87ac5f4aafba7db496470cbbb3c19a27707373d7 100644
--- a/src/backend/access/nbtree/nbtpage.c
+++ b/src/backend/access/nbtree/nbtpage.c
@@ -992,6 +992,7 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 	Buffer		pbuf;
 	Page		page;
 	BTPageOpaque opaque;
+	BlockNumber leftsib;
 
 	/* Locate the parent's downlink (updating the stack entry if needed) */
 	ItemPointerSet(&(stack->bts_btentry.t_tid), child, P_HIKEY);
@@ -1020,7 +1021,8 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 			 * We have to check the parent itself, and then recurse to test
 			 * the conditions at the parent's parent.
 			 */
-			if (P_RIGHTMOST(opaque) || P_ISROOT(opaque))
+			if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) ||
+				P_INCOMPLETE_SPLIT(opaque))
 			{
 				_bt_relbuf(rel, pbuf);
 				return false;
@@ -1028,8 +1030,41 @@ _bt_lock_branch_parent(Relation rel, BlockNumber child, BTStack stack,
 
 			*target = parent;
 			*rightsib = opaque->btpo_next;
+			leftsib = opaque->btpo_prev;
 
 			_bt_relbuf(rel, pbuf);
+
+			/*
+			 * Like in _bt_pagedel, check that the left sibling is not marked
+			 * with INCOMPLETE_SPLIT flag.  That would mean that there is no
+			 * downlink to the page to be deleted, and the page deletion
+			 * algorithm isn't prepared to handle that.
+			 */
+			if (leftsib != P_NONE)
+			{
+				Buffer		lbuf;
+				Page		lpage;
+				BTPageOpaque lopaque;
+
+				lbuf = _bt_getbuf(rel, leftsib, BT_READ);
+				lpage = BufferGetPage(lbuf);
+				lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+				/*
+				 * If the left sibling was concurrently split, so that its
+				 * next-pointer doesn't point to the current page anymore,
+				 * the split that created the current page must be completed.
+				 * (We don't allow splitting an incompletely split page again
+				 * until the previous split has been completed)
+				 */
+				if (lopaque->btpo_next == parent &&
+					P_INCOMPLETE_SPLIT(lopaque))
+				{
+					_bt_relbuf(rel, lbuf);
+					return false;
+				}
+				_bt_relbuf(rel, lbuf);
+			}
+
 			return _bt_lock_branch_parent(rel, parent, stack->bts_parent,
 										  topparent, topoff, target, rightsib);
 		}
@@ -1081,6 +1116,10 @@ _bt_pagedel(Relation rel, Buffer buf)
 	 * "stack" is a search stack leading (approximately) to the target page.
 	 * It is initially NULL, but when iterating, we keep it to avoid
 	 * duplicated search effort.
+	 *
+	 * Also, when "stack" is not NULL, we have already checked that the
+	 * current page is not the right half of an incomplete split, i.e. the
+	 * left sibling does not have its INCOMPLETE_SPLIT flag set.
 	 */
 	BTStack		stack = NULL;
 
@@ -1117,11 +1156,25 @@ _bt_pagedel(Relation rel, Buffer buf)
 		}
 
 		/*
-		 * We can never delete rightmost pages nor root pages.	While at it,
-		 * check that page is not already deleted and is empty.
+		 * We can never delete rightmost pages nor root pages.  While at
+		 * it, check that page is not already deleted and is empty.
+		 *
+		 * To keep the algorithm simple, we also never delete an incompletely
+		 * split page (they should be rare enough that this doesn't make any
+		 * meaningful difference to disk usage):
+		 *
+		 * The INCOMPLETE_SPLIT flag on the page tells us if the page is the
+		 * left half of an incomplete split, but ensuring that it's not the
+		 * right half is more complicated.  For that, we have to check that
+		 * the left sibling doesn't have its INCOMPLETE_SPLIT flag set.  On
+		 * the first iteration, we temporarily release the lock on the
+		 * current page, and check the left sibling and also construct a
+		 * search stack to the current page.  On subsequent iterations, we
+		 * know we stepped right from a page that passed these tests, so it's OK.
 		 */
 		if (P_RIGHTMOST(opaque) || P_ISROOT(opaque) || P_ISDELETED(opaque) ||
-			P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page))
+			P_FIRSTDATAKEY(opaque) <= PageGetMaxOffsetNumber(page) ||
+			P_INCOMPLETE_SPLIT(opaque))
 		{
 			/* Should never fail to delete a half-dead page */
 			Assert(!P_ISHALFDEAD(opaque));
@@ -1142,6 +1195,9 @@ _bt_pagedel(Relation rel, Buffer buf)
 			 * use the standard search mechanism to search for the page's high
 			 * key; this will give us a link to either the current parent or
 			 * someplace to its left (if there are multiple equal high keys).
+			 *
+			 * Also check if this is the right-half of an incomplete split
+			 * (see comment above).
 			 */
 			if (!stack)
 			{
@@ -1149,16 +1205,43 @@ _bt_pagedel(Relation rel, Buffer buf)
 				ItemId		itemid;
 				IndexTuple	targetkey;
 				Buffer		lbuf;
+				BlockNumber	leftsib;
 
 				itemid = PageGetItemId(page, P_HIKEY);
 				targetkey = CopyIndexTuple((IndexTuple) PageGetItem(page, itemid));
 
+				leftsib = opaque->btpo_prev;
+
 				/*
 				 * To avoid deadlocks, we'd better drop the leaf page lock
 				 * before going further.
 				 */
 				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 
+				/*
+				 * Fetch the left sibling, to check that it's not marked
+				 * with INCOMPLETE_SPLIT flag.  That would mean that the
+				 * page to-be-deleted doesn't have a downlink, and the page
+				 * deletion algorithm isn't prepared to handle that.
+				 */
+				if (!P_LEFTMOST(opaque))
+				{
+					BTPageOpaque lopaque;
+					Page		lpage;
+
+					lbuf = _bt_getbuf(rel, leftsib, BT_READ);
+					lpage = BufferGetPage(lbuf);
+					lopaque = (BTPageOpaque) PageGetSpecialPointer(lpage);
+					if (lopaque->btpo_next == BufferGetBlockNumber(buf) &&
+						P_INCOMPLETE_SPLIT(lopaque))
+					{
+						ReleaseBuffer(buf);
+						_bt_relbuf(rel, lbuf);
+						return ndeleted;
+					}
+					_bt_relbuf(rel, lbuf);
+				}
+
 				/* we need an insertion scan key for the search, so build one */
 				itup_scankey = _bt_mkscankey(rel, targetkey);
 				/* find the leftmost leaf page containing this key */
diff --git a/src/backend/access/nbtree/nbtsearch.c b/src/backend/access/nbtree/nbtsearch.c
index bc62fbc02e6c189438f1b693b1ca97f5b6068f88..0bf12f0e107a214d17b7432020c147311f8ea8c8 100644
--- a/src/backend/access/nbtree/nbtsearch.c
+++ b/src/backend/access/nbtree/nbtsearch.c
@@ -51,7 +51,8 @@ static bool _bt_endpoint(IndexScanDesc scan, ScanDirection dir);
  * NOTE that the returned buffer is read-locked regardless of the access
  * parameter.  However, access = BT_WRITE will allow an empty root page
  * to be created and returned.	When access = BT_READ, an empty index
- * will result in *bufP being set to InvalidBuffer.
+ * will result in *bufP being set to InvalidBuffer.  Also, in BT_WRITE mode,
+ * any incomplete splits encountered during the search will be finished.
  */
 BTStack
 _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
@@ -82,8 +83,17 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
 		 * Race -- the page we just grabbed may have split since we read its
 		 * pointer in the parent (or metapage).  If it has, we may need to
 		 * move right to its new sibling.  Do that.
+		 *
+		 * In write-mode, allow _bt_moveright to finish any incomplete splits
+		 * along the way.  Strictly speaking, we'd only need to finish an
+		 * incomplete split on the leaf page we're about to insert to, not on
+ * any of the upper levels (they are taken care of in _bt_getstackbuf,
+		 * if the leaf page is split and we insert to the parent page).  But
+		 * this is a good opportunity to finish splits of internal pages too.
 		 */
-		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey, BT_READ);
+		*bufP = _bt_moveright(rel, *bufP, keysz, scankey, nextkey,
+							  (access == BT_WRITE), stack_in,
+							  BT_READ);
 
 		/* if this is a leaf page, we're done */
 		page = BufferGetPage(*bufP);
@@ -148,6 +158,11 @@ _bt_search(Relation rel, int keysz, ScanKey scankey, bool nextkey,
  * item >= scankey.  When nextkey is true, we are looking for the first
  * item strictly greater than scankey.
  *
+ * If forupdate is true, we will attempt to finish any incomplete splits
+ * that we encounter.  This is required when locking a target page for an
+ * insertion, because we don't allow inserting on a page before the split
+ * is completed.  'stack' is only used if forupdate is true.
+ *
  * On entry, we have the buffer pinned and a lock of the type specified by
  * 'access'.  If we move right, we release the buffer and lock and acquire
  * the same on the right sibling.  Return value is the buffer we stop at.
@@ -158,15 +173,14 @@ _bt_moveright(Relation rel,
 			  int keysz,
 			  ScanKey scankey,
 			  bool nextkey,
+			  bool forupdate,
+			  BTStack stack,
 			  int access)
 {
 	Page		page;
 	BTPageOpaque opaque;
 	int32		cmpval;
 
-	page = BufferGetPage(buf);
-	opaque = (BTPageOpaque) PageGetSpecialPointer(page);
-
 	/*
 	 * When nextkey = false (normal case): if the scan key that brought us to
 	 * this page is > the high key stored on the page, then the page has split
@@ -184,16 +198,46 @@ _bt_moveright(Relation rel,
 	 */
 	cmpval = nextkey ? 0 : 1;
 
-	while (!P_RIGHTMOST(opaque) &&
-		   (P_IGNORE(opaque) ||
-			_bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval))
+	for (;;)
 	{
-		/* step right one page */
-		BlockNumber rblkno = opaque->btpo_next;
-
-		buf = _bt_relandgetbuf(rel, buf, rblkno, access);
 		page = BufferGetPage(buf);
 		opaque = (BTPageOpaque) PageGetSpecialPointer(page);
+
+		if (P_RIGHTMOST(opaque))
+			break;
+
+		/*
+		 * Finish any incomplete splits we encounter along the way.
+		 */
+		if (forupdate && P_INCOMPLETE_SPLIT(opaque))
+		{
+			BlockNumber blkno = BufferGetBlockNumber(buf);
+
+			/* upgrade our lock if necessary */
+			if (access == BT_READ)
+			{
+				LockBuffer(buf, BUFFER_LOCK_UNLOCK);
+				LockBuffer(buf, BT_WRITE);
+			}
+
+			if (P_INCOMPLETE_SPLIT(opaque))
+				_bt_finish_split(rel, buf, stack);
+			else
+				_bt_relbuf(rel, buf);
+
+			/* re-acquire the lock in the right mode, and re-check */
+			buf = _bt_getbuf(rel, blkno, access);
+			continue;
+		}
+
+		if (P_IGNORE(opaque) || _bt_compare(rel, keysz, scankey, page, P_HIKEY) >= cmpval)
+		{
+			/* step right one page */
+			buf = _bt_relandgetbuf(rel, buf, opaque->btpo_next, access);
+			continue;
+		}
+		else
+			break;
 	}
 
 	if (P_IGNORE(opaque))
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index bc376fd4094f2489ead1d8022675c978c6fbea4a..665e60b7bd2f78feb4a856f556a9039f7715443e 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -20,61 +20,6 @@
 #include "storage/procarray.h"
 #include "miscadmin.h"
 
-/*
- * We must keep track of expected insertions due to page splits, and apply
- * them manually if they are not seen in the WAL log during replay.  This
- * makes it safe for page insertion to be a multiple-WAL-action process.
- *
- * The data structure is a simple linked list --- this should be good enough,
- * since we don't expect a page split to remain incomplete for long.  In any
- * case we need to respect the order of operations.
- */
-typedef struct bt_incomplete_split
-{
-	RelFileNode node;			/* the index */
-	bool		is_root;		/* we split the root */
-	BlockNumber leftblk;		/* left half of split */
-	BlockNumber rightblk;		/* right half of split */
-} bt_incomplete_split;
-
-static List *incomplete_splits;
-
-
-static void
-log_incomplete_split(RelFileNode node, BlockNumber leftblk,
-					 BlockNumber rightblk, bool is_root)
-{
-	bt_incomplete_split *action = palloc(sizeof(bt_incomplete_split));
-
-	action->node = node;
-	action->is_root = is_root;
-	action->leftblk = leftblk;
-	action->rightblk = rightblk;
-	incomplete_splits = lappend(incomplete_splits, action);
-}
-
-static void
-forget_matching_split(RelFileNode node, BlockNumber downlink, bool is_root)
-{
-	ListCell   *l;
-
-	foreach(l, incomplete_splits)
-	{
-		bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l);
-
-		if (RelFileNodeEquals(node, action->node) &&
-			downlink == action->rightblk)
-		{
-			if (is_root != action->is_root)
-				elog(LOG, "forget_matching_split: fishy is_root data (expected %d, got %d)",
-					 action->is_root, is_root);
-			incomplete_splits = list_delete_ptr(incomplete_splits, action);
-			pfree(action);
-			break;				/* need not look further */
-		}
-	}
-}
-
 /*
  * _bt_restore_page -- re-enter all the index tuples on a page
  *
@@ -149,23 +94,60 @@ _bt_restore_meta(RelFileNode rnode, XLogRecPtr lsn,
 	UnlockReleaseBuffer(metabuf);
 }
 
+/*
+ * _bt_clear_incomplete_split -- clear INCOMPLETE_SPLIT flag on a page
+ *
+ * This is a common subroutine of the redo functions of all the WAL record
+ * types that can insert a downlink: insert, split, and newroot.
+ */
+static void
+_bt_clear_incomplete_split(XLogRecPtr lsn, XLogRecord *record,
+						   RelFileNode rnode, BlockNumber cblock)
+{
+	Buffer buf;
+
+	buf = XLogReadBuffer(rnode, cblock, false);
+	if (BufferIsValid(buf))
+	{
+		Page		page = (Page) BufferGetPage(buf);
+
+		if (lsn > PageGetLSN(page))
+		{
+			BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+			Assert((pageop->btpo_flags & BTP_INCOMPLETE_SPLIT) != 0);
+			pageop->btpo_flags &= ~BTP_INCOMPLETE_SPLIT;
+
+			PageSetLSN(page, lsn);
+			MarkBufferDirty(buf);
+		}
+		UnlockReleaseBuffer(buf);
+	}
+}
+
 static void
 btree_xlog_insert(bool isleaf, bool ismeta,
 				  XLogRecPtr lsn, XLogRecord *record)
 {
 	xl_btree_insert *xlrec = (xl_btree_insert *) XLogRecGetData(record);
 	Buffer		buffer;
+	Buffer		cbuffer = InvalidBuffer;
 	Page		page;
 	char	   *datapos;
 	int			datalen;
 	xl_btree_metadata md;
-	BlockNumber downlink = 0;
+	BlockNumber cblkno = 0;
+	int			main_blk_index;
 
 	datapos = (char *) xlrec + SizeOfBtreeInsert;
 	datalen = record->xl_len - SizeOfBtreeInsert;
-	if (!isleaf)
+	/*
+	 * If this insert finishes a split at a lower level, extract the block
+	 * number of the (left) child.
+	 */
+	if (!isleaf && (record->xl_info & XLR_BKP_BLOCK(0)) == 0)
 	{
-		memcpy(&downlink, datapos, sizeof(BlockNumber));
+		memcpy(&cblkno, datapos, sizeof(BlockNumber));
+		Assert(cblkno != 0);
 		datapos += sizeof(BlockNumber);
 		datalen -= sizeof(BlockNumber);
 	}
@@ -176,8 +158,19 @@ btree_xlog_insert(bool isleaf, bool ismeta,
 		datalen -= sizeof(xl_btree_metadata);
 	}
 
-	if (record->xl_info & XLR_BKP_BLOCK(0))
-		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+	if (!isleaf)
+	{
+		if (record->xl_info & XLR_BKP_BLOCK(0))
+			(void) RestoreBackupBlock(lsn, record, 0, false, false);
+		else
+			_bt_clear_incomplete_split(lsn, record, xlrec->target.node, cblkno);
+		main_blk_index = 1;
+	}
+	else
+		main_blk_index = 0;
+
+	if (record->xl_info & XLR_BKP_BLOCK(main_blk_index))
+		(void) RestoreBackupBlock(lsn, record, main_blk_index, false, false);
 	else
 	{
 		buffer = XLogReadBuffer(xlrec->target.node,
@@ -187,11 +180,7 @@ btree_xlog_insert(bool isleaf, bool ismeta,
 		{
 			page = (Page) BufferGetPage(buffer);
 
-			if (lsn <= PageGetLSN(page))
-			{
-				UnlockReleaseBuffer(buffer);
-			}
-			else
+			if (lsn > PageGetLSN(page))
 			{
 				if (PageAddItem(page, (Item) datapos, datalen,
 							ItemPointerGetOffsetNumber(&(xlrec->target.tid)),
@@ -200,11 +189,14 @@ btree_xlog_insert(bool isleaf, bool ismeta,
 
 				PageSetLSN(page, lsn);
 				MarkBufferDirty(buffer);
-				UnlockReleaseBuffer(buffer);
 			}
+			UnlockReleaseBuffer(buffer);
 		}
 	}
 
+	if (BufferIsValid(cbuffer))
+		UnlockReleaseBuffer(cbuffer);
+
 	/*
 	 * Note: in normal operation, we'd update the metapage while still holding
 	 * lock on the page we inserted into.  But during replay it's not
@@ -216,10 +208,6 @@ btree_xlog_insert(bool isleaf, bool ismeta,
 		_bt_restore_meta(xlrec->target.node, lsn,
 						 md.root, md.level,
 						 md.fastroot, md.fastlevel);
-
-	/* Forget any split this insertion completes */
-	if (!isleaf)
-		forget_matching_split(xlrec->target.node, downlink, false);
 }
 
 static void
@@ -227,6 +215,8 @@ btree_xlog_split(bool onleft, bool isroot,
 				 XLogRecPtr lsn, XLogRecord *record)
 {
 	xl_btree_split *xlrec = (xl_btree_split *) XLogRecGetData(record);
+	bool		isleaf = (xlrec->level == 0);
+	Buffer		lbuf;
 	Buffer		rbuf;
 	Page		rpage;
 	BTPageOpaque ropaque;
@@ -237,42 +227,18 @@ btree_xlog_split(bool onleft, bool isroot,
 	Size		newitemsz = 0;
 	Item		left_hikey = NULL;
 	Size		left_hikeysz = 0;
+	BlockNumber cblkno = InvalidBlockNumber;
 
 	datapos = (char *) xlrec + SizeOfBtreeSplit;
 	datalen = record->xl_len - SizeOfBtreeSplit;
 
-	/* Forget any split this insertion completes */
-	if (xlrec->level > 0)
-	{
-		/* we assume SizeOfBtreeSplit is at least 16-bit aligned */
-		BlockNumber downlink = BlockIdGetBlockNumber((BlockId) datapos);
-
-		datapos += sizeof(BlockIdData);
-		datalen -= sizeof(BlockIdData);
-
-		forget_matching_split(xlrec->node, downlink, false);
-
-		/* Extract left hikey and its size (still assuming 16-bit alignment) */
-		if (!(record->xl_info & XLR_BKP_BLOCK(0)))
-		{
-			/* We assume 16-bit alignment is enough for IndexTupleSize */
-			left_hikey = (Item) datapos;
-			left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
-
-			datapos += left_hikeysz;
-			datalen -= left_hikeysz;
-		}
-	}
-
-	/* Extract newitem and newitemoff, if present */
+	/* Extract newitemoff and newitem, if present */
 	if (onleft)
 	{
-		/* Extract the offset (still assuming 16-bit alignment) */
 		memcpy(&newitemoff, datapos, sizeof(OffsetNumber));
 		datapos += sizeof(OffsetNumber);
 		datalen -= sizeof(OffsetNumber);
 	}
-
 	if (onleft && !(record->xl_info & XLR_BKP_BLOCK(0)))
 	{
 		/*
@@ -286,6 +252,37 @@ btree_xlog_split(bool onleft, bool isroot,
 		datalen -= newitemsz;
 	}
 
+	/* Extract left hikey and its size (still assuming 16-bit alignment) */
+	if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(0)))
+	{
+		left_hikey = (Item) datapos;
+		left_hikeysz = MAXALIGN(IndexTupleSize(left_hikey));
+		datapos += left_hikeysz;
+		datalen -= left_hikeysz;
+	}
+	/*
+	 * If this insertion finishes an incomplete split, get the block number
+	 * of the child.
+	 */
+	if (!isleaf && !(record->xl_info & XLR_BKP_BLOCK(1)))
+	{
+		memcpy(&cblkno, datapos, sizeof(BlockNumber));
+		datapos += sizeof(BlockNumber);
+		datalen -= sizeof(BlockNumber);
+	}
+
+	/*
+	 * Clear the incomplete split flag on the left sibling of the child page
+	 * this is a downlink for.
+	 */
+	if (!isleaf)
+	{
+		if (record->xl_info & XLR_BKP_BLOCK(2))
+			(void) RestoreBackupBlock(lsn, record, 2, false, false);
+		else
+			_bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno);
+	}
+
 	/* Reconstruct right (new) sibling page from scratch */
 	rbuf = XLogReadBuffer(xlrec->node, xlrec->rightsib, true);
 	Assert(BufferIsValid(rbuf));
@@ -297,7 +294,7 @@ btree_xlog_split(bool onleft, bool isroot,
 	ropaque->btpo_prev = xlrec->leftsib;
 	ropaque->btpo_next = xlrec->rnext;
 	ropaque->btpo.level = xlrec->level;
-	ropaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+	ropaque->btpo_flags = isleaf ? BTP_LEAF : 0;
 	ropaque->btpo_cycleid = 0;
 
 	_bt_restore_page(rpage, datapos, datalen);
@@ -306,7 +303,7 @@ btree_xlog_split(bool onleft, bool isroot,
 	 * On leaf level, the high key of the left page is equal to the first key
 	 * on the right page.
 	 */
-	if (xlrec->level == 0)
+	if (isleaf)
 	{
 		ItemId		hiItemId = PageGetItemId(rpage, P_FIRSTDATAKEY(ropaque));
 
@@ -321,10 +318,10 @@ btree_xlog_split(bool onleft, bool isroot,
 
 	/* Now reconstruct left (original) sibling page */
 	if (record->xl_info & XLR_BKP_BLOCK(0))
-		(void) RestoreBackupBlock(lsn, record, 0, false, false);
+		lbuf = RestoreBackupBlock(lsn, record, 0, false, true);
 	else
 	{
-		Buffer		lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);
+		lbuf = XLogReadBuffer(xlrec->node, xlrec->leftsib, false);
 
 		if (BufferIsValid(lbuf))
 		{
@@ -381,19 +378,21 @@ btree_xlog_split(bool onleft, bool isroot,
 					elog(PANIC, "failed to add high key to left page after split");
 
 				/* Fix opaque fields */
-				lopaque->btpo_flags = (xlrec->level == 0) ? BTP_LEAF : 0;
+				lopaque->btpo_flags = BTP_INCOMPLETE_SPLIT;
+				if (isleaf)
+					lopaque->btpo_flags |= BTP_LEAF;
 				lopaque->btpo_next = xlrec->rightsib;
 				lopaque->btpo_cycleid = 0;
 
 				PageSetLSN(lpage, lsn);
 				MarkBufferDirty(lbuf);
 			}
-
-			UnlockReleaseBuffer(lbuf);
 		}
 	}
 
-	/* We no longer need the right buffer */
+	/* We no longer need the buffers */
+	if (BufferIsValid(lbuf))
+		UnlockReleaseBuffer(lbuf);
 	UnlockReleaseBuffer(rbuf);
 
 	/*
@@ -404,32 +403,39 @@ btree_xlog_split(bool onleft, bool isroot,
 	 * replay, because no other index update can be in progress, and readers
 	 * will cope properly when following an obsolete left-link.
 	 */
-	if (record->xl_info & XLR_BKP_BLOCK(1))
-		(void) RestoreBackupBlock(lsn, record, 1, false, false);
-	else if (xlrec->rnext != P_NONE)
+	if (xlrec->rnext != P_NONE)
 	{
-		Buffer		buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);
+		/*
+		 * The backup block containing the right sibling is 2 or 3, depending
+		 * on whether this was a leaf or internal page.
+		 */
+		int		rnext_index = isleaf ? 2 : 3;
 
-		if (BufferIsValid(buffer))
+		if (record->xl_info & XLR_BKP_BLOCK(rnext_index))
+			(void) RestoreBackupBlock(lsn, record, rnext_index, false, false);
+		else
 		{
-			Page		page = (Page) BufferGetPage(buffer);
+			Buffer		buffer;
 
-			if (lsn > PageGetLSN(page))
+			buffer = XLogReadBuffer(xlrec->node, xlrec->rnext, false);
+
+			if (BufferIsValid(buffer))
 			{
-				BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
+				Page		page = (Page) BufferGetPage(buffer);
 
-				pageop->btpo_prev = xlrec->rightsib;
+				if (lsn > PageGetLSN(page))
+				{
+					BTPageOpaque pageop = (BTPageOpaque) PageGetSpecialPointer(page);
 
-				PageSetLSN(page, lsn);
-				MarkBufferDirty(buffer);
+					pageop->btpo_prev = xlrec->rightsib;
+
+					PageSetLSN(page, lsn);
+					MarkBufferDirty(buffer);
+				}
+				UnlockReleaseBuffer(buffer);
 			}
-			UnlockReleaseBuffer(buffer);
 		}
 	}
-
-	/* The job ain't done till the parent link is inserted... */
-	log_incomplete_split(xlrec->node,
-						 xlrec->leftsib, xlrec->rightsib, isroot);
 }
 
 static void
@@ -1003,10 +1009,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
 	Buffer		buffer;
 	Page		page;
 	BTPageOpaque pageop;
-	BlockNumber downlink = 0;
-
-	/* Backup blocks are not used in newroot records */
-	Assert(!(record->xl_info & XLR_BKP_BLOCK_MASK));
 
 	buffer = XLogReadBuffer(xlrec->node, xlrec->rootblk, true);
 	Assert(BufferIsValid(buffer));
@@ -1025,14 +1027,21 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
 	if (record->xl_len > SizeOfBtreeNewroot)
 	{
 		IndexTuple	itup;
+		BlockNumber cblkno;
 
 		_bt_restore_page(page,
 						 (char *) xlrec + SizeOfBtreeNewroot,
 						 record->xl_len - SizeOfBtreeNewroot);
-		/* extract downlink to the right-hand split page */
-		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_FIRSTKEY));
-		downlink = ItemPointerGetBlockNumber(&(itup->t_tid));
+		/* extract block number of the left-hand split page */
+		itup = (IndexTuple) PageGetItem(page, PageGetItemId(page, P_HIKEY));
+		cblkno = ItemPointerGetBlockNumber(&(itup->t_tid));
 		Assert(ItemPointerGetOffsetNumber(&(itup->t_tid)) == P_HIKEY);
+
+		/* Clear the incomplete-split flag in left child */
+		if (record->xl_info & XLR_BKP_BLOCK(0))
+			(void) RestoreBackupBlock(lsn, record, 0, false, false);
+		else
+			_bt_clear_incomplete_split(lsn, record, xlrec->node, cblkno);
 	}
 
 	PageSetLSN(page, lsn);
@@ -1042,10 +1051,6 @@ btree_xlog_newroot(XLogRecPtr lsn, XLogRecord *record)
 	_bt_restore_meta(xlrec->node, lsn,
 					 xlrec->rootblk, xlrec->level,
 					 xlrec->rootblk, xlrec->level);
-
-	/* Check to see if this satisfies any incomplete insertions */
-	if (record->xl_len > SizeOfBtreeNewroot)
-		forget_matching_split(xlrec->node, downlink, true);
 }
 
 static void
@@ -1125,59 +1130,3 @@ btree_redo(XLogRecPtr lsn, XLogRecord *record)
 			elog(PANIC, "btree_redo: unknown op code %u", info);
 	}
 }
-
-void
-btree_xlog_startup(void)
-{
-	incomplete_splits = NIL;
-}
-
-void
-btree_xlog_cleanup(void)
-{
-	ListCell   *l;
-
-	foreach(l, incomplete_splits)
-	{
-		/* finish an incomplete split */
-		bt_incomplete_split *action = (bt_incomplete_split *) lfirst(l);
-		Buffer		lbuf,
-					rbuf;
-		Page		lpage,
-					rpage;
-		BTPageOpaque lpageop,
-					rpageop;
-		bool		is_only;
-		Relation	reln;
-
-		lbuf = XLogReadBuffer(action->node, action->leftblk, false);
-		/* failure is impossible because we wrote this page earlier */
-		if (!BufferIsValid(lbuf))
-			elog(PANIC, "btree_xlog_cleanup: left block unfound");
-		lpage = (Page) BufferGetPage(lbuf);
-		lpageop = (BTPageOpaque) PageGetSpecialPointer(lpage);
-		rbuf = XLogReadBuffer(action->node, action->rightblk, false);
-		/* failure is impossible because we wrote this page earlier */
-		if (!BufferIsValid(rbuf))
-			elog(PANIC, "btree_xlog_cleanup: right block unfound");
-		rpage = (Page) BufferGetPage(rbuf);
-		rpageop = (BTPageOpaque) PageGetSpecialPointer(rpage);
-
-		/* if the pages are all of their level, it's a only-page split */
-		is_only = P_LEFTMOST(lpageop) && P_RIGHTMOST(rpageop);
-
-		reln = CreateFakeRelcacheEntry(action->node);
-		_bt_insert_parent(reln, lbuf, rbuf, NULL,
-						  action->is_root, is_only);
-		FreeFakeRelcacheEntry(reln);
-	}
-	incomplete_splits = NIL;
-}
-
-bool
-btree_safe_restartpoint(void)
-{
-	if (incomplete_splits)
-		return false;
-	return true;
-}
diff --git a/src/include/access/nbtree.h b/src/include/access/nbtree.h
index 779a42229b7cde79b206608612e0c7a2c6d6c89f..64c6982f50e69e943b3f27daabdf55283018fcf4 100644
--- a/src/include/access/nbtree.h
+++ b/src/include/access/nbtree.h
@@ -73,6 +73,7 @@ typedef BTPageOpaqueData *BTPageOpaque;
 #define BTP_HALF_DEAD	(1 << 4)	/* empty, but still in tree */
 #define BTP_SPLIT_END	(1 << 5)	/* rightmost page of split group */
 #define BTP_HAS_GARBAGE (1 << 6)	/* page has LP_DEAD tuples */
+#define BTP_INCOMPLETE_SPLIT (1 << 7) /* right sibling's downlink is missing */
 
 /*
  * The max allowed value of a cycle ID is a bit less than 64K.	This is
@@ -178,6 +179,7 @@ typedef struct BTMetaPageData
 #define P_ISHALFDEAD(opaque)	((opaque)->btpo_flags & BTP_HALF_DEAD)
 #define P_IGNORE(opaque)		((opaque)->btpo_flags & (BTP_DELETED|BTP_HALF_DEAD))
 #define P_HAS_GARBAGE(opaque)	((opaque)->btpo_flags & BTP_HAS_GARBAGE)
+#define P_INCOMPLETE_SPLIT(opaque)	((opaque)->btpo_flags & BTP_INCOMPLETE_SPLIT)
 
 /*
  *	Lehman and Yao's algorithm requires a ``high key'' on every non-rightmost
@@ -253,7 +255,7 @@ typedef struct xl_btree_metadata
 typedef struct xl_btree_insert
 {
 	xl_btreetid target;			/* inserted tuple id */
-	/* BlockNumber downlink field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
+	/* BlockNumber finishes_split field FOLLOWS IF NOT XLOG_BTREE_INSERT_LEAF */
 	/* xl_btree_metadata FOLLOWS IF XLOG_BTREE_INSERT_META */
 	/* INDEX TUPLE FOLLOWS AT END OF STRUCT */
 } xl_btree_insert;
@@ -286,19 +288,18 @@ typedef struct xl_btree_split
 	OffsetNumber firstright;	/* first item moved to right page */
 
 	/*
-	 * If level > 0, BlockIdData downlink follows.	(We use BlockIdData rather
-	 * than BlockNumber for alignment reasons: SizeOfBtreeSplit is only 16-bit
-	 * aligned.)
+	 * In the _L variants, next are OffsetNumber newitemoff and the new item.
+	 * (In the _R variants, the new item is one of the right page's tuples.)
+	 * The new item, but not newitemoff, is suppressed if XLogInsert chooses
+	 * to store the left page's whole page image.
 	 *
 	 * If level > 0, an IndexTuple representing the HIKEY of the left page
 	 * follows.  We don't need this on leaf pages, because it's the same as
 	 * the leftmost key in the new right page.	Also, it's suppressed if
 	 * XLogInsert chooses to store the left page's whole page image.
 	 *
-	 * In the _L variants, next are OffsetNumber newitemoff and the new item.
-	 * (In the _R variants, the new item is one of the right page's tuples.)
-	 * The new item, but not newitemoff, is suppressed if XLogInsert chooses
-	 * to store the left page's whole page image.
+	 * If level > 0, BlockNumber of the page whose incomplete-split flag
+	 * this insertion clears. (not aligned)
 	 *
 	 * Last are the right page's tuples in the form used by _bt_restore_page.
 	 */
@@ -642,8 +643,7 @@ extern Datum btoptions(PG_FUNCTION_ARGS);
 extern bool _bt_doinsert(Relation rel, IndexTuple itup,
 			 IndexUniqueCheck checkUnique, Relation heapRel);
 extern Buffer _bt_getstackbuf(Relation rel, BTStack stack, int access);
-extern void _bt_insert_parent(Relation rel, Buffer buf, Buffer rbuf,
-				  BTStack stack, bool is_root, bool is_only);
+extern void _bt_finish_split(Relation rel, Buffer bbuf, BTStack stack);
 
 /*
  * prototypes for functions in nbtpage.c
@@ -673,7 +673,8 @@ extern BTStack _bt_search(Relation rel,
 		   int keysz, ScanKey scankey, bool nextkey,
 		   Buffer *bufP, int access);
 extern Buffer _bt_moveright(Relation rel, Buffer buf, int keysz,
-			  ScanKey scankey, bool nextkey, int access);
+			  ScanKey scankey, bool nextkey, bool forupdate, BTStack stack,
+			  int access);
 extern OffsetNumber _bt_binsrch(Relation rel, Buffer buf, int keysz,
 			ScanKey scankey, bool nextkey);
 extern int32 _bt_compare(Relation rel, int keysz, ScanKey scankey,
@@ -722,8 +723,5 @@ extern void _bt_leafbuild(BTSpool *btspool, BTSpool *spool2);
  */
 extern void btree_redo(XLogRecPtr lsn, XLogRecord *record);
 extern void btree_desc(StringInfo buf, uint8 xl_info, char *rec);
-extern void btree_xlog_startup(void);
-extern void btree_xlog_cleanup(void);
-extern bool btree_safe_restartpoint(void);
 
 #endif   /* NBTREE_H */
diff --git a/src/include/access/rmgrlist.h b/src/include/access/rmgrlist.h
index c968101e8a2eaaf8b54c433046cff615294c847f..d9ee57d6da26127a2f003a308850233331eb6d68 100644
--- a/src/include/access/rmgrlist.h
+++ b/src/include/access/rmgrlist.h
@@ -36,7 +36,7 @@ PG_RMGR(RM_RELMAP_ID, "RelMap", relmap_redo, relmap_desc, NULL, NULL, NULL)
 PG_RMGR(RM_STANDBY_ID, "Standby", standby_redo, standby_desc, NULL, NULL, NULL)
 PG_RMGR(RM_HEAP2_ID, "Heap2", heap2_redo, heap2_desc, NULL, NULL, NULL)
 PG_RMGR(RM_HEAP_ID, "Heap", heap_redo, heap_desc, NULL, NULL, NULL)
-PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, btree_xlog_startup, btree_xlog_cleanup, btree_safe_restartpoint)
+PG_RMGR(RM_BTREE_ID, "Btree", btree_redo, btree_desc, NULL, NULL, NULL)
 PG_RMGR(RM_HASH_ID, "Hash", hash_redo, hash_desc, NULL, NULL, NULL)
 PG_RMGR(RM_GIN_ID, "Gin", gin_redo, gin_desc, gin_xlog_startup, gin_xlog_cleanup, NULL)
 PG_RMGR(RM_GIST_ID, "Gist", gist_redo, gist_desc, gist_xlog_startup, gist_xlog_cleanup, NULL)
diff --git a/src/include/access/xlog_internal.h b/src/include/access/xlog_internal.h
index 297e4505db9e359cef227cde3f65ffec17d97588..ec021fe29aeac5d3b93148241cf21f55d46c5c7c 100644
--- a/src/include/access/xlog_internal.h
+++ b/src/include/access/xlog_internal.h
@@ -55,7 +55,7 @@ typedef struct BkpBlock
 /*
  * Each page of XLOG file has a header like this:
  */
-#define XLOG_PAGE_MAGIC 0xD07C	/* can be used as WAL version indicator */
+#define XLOG_PAGE_MAGIC 0xD07D	/* can be used as WAL version indicator */
 
 typedef struct XLogPageHeaderData
 {