diff --git a/src/backend/access/nbtree/nbtree.c b/src/backend/access/nbtree/nbtree.c
index 02f12c618a51fc9a39f7afe50eeb391bbb41ee31..b60d6e6002060ec933e0ae80262fae6463558668 100644
--- a/src/backend/access/nbtree/nbtree.c
+++ b/src/backend/access/nbtree/nbtree.c
@@ -55,8 +55,8 @@ typedef struct
 	IndexBulkDeleteCallback callback;
 	void	   *callback_state;
 	BTCycleId	cycleid;
-	BlockNumber lastBlockVacuumed;		/* last blkno reached by Vacuum scan */
-	BlockNumber lastUsedPage;	/* blkno of last non-recyclable page */
+	BlockNumber lastBlockVacuumed;		/* highest blkno actually vacuumed */
+	BlockNumber lastBlockLocked;	/* highest blkno we've cleanup-locked */
 	BlockNumber totFreePages;	/* true total # of free pages */
 	MemoryContext pagedelcontext;
 } BTVacState;
@@ -761,7 +761,7 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	vstate.callback_state = callback_state;
 	vstate.cycleid = cycleid;
 	vstate.lastBlockVacuumed = BTREE_METAPAGE;	/* Initialise at first block */
-	vstate.lastUsedPage = BTREE_METAPAGE;
+	vstate.lastBlockLocked = BTREE_METAPAGE;
 	vstate.totFreePages = 0;
 
 	/* Create a temporary memory context to run _bt_pagedel in */
@@ -817,27 +817,30 @@ btvacuumscan(IndexVacuumInfo *info, IndexBulkDeleteResult *stats,
 	}
 
 	/*
-	 * InHotStandby we need to scan right up to the end of the index for
-	 * correct locking, so we may need to write a WAL record for the final
-	 * block in the index if it was not vacuumed. It's possible that VACUUMing
-	 * has actually removed zeroed pages at the end of the index so we need to
-	 * take care to issue the record for last actual block and not for the
-	 * last block that was scanned. Ignore empty indexes.
+	 * If the WAL is replayed in hot standby, the replay process needs to get
+	 * cleanup locks on all index leaf pages, just as we've been doing here.
+	 * However, we won't issue any WAL records about pages that have no items
+	 * to be deleted.  For pages between pages we've vacuumed, the replay code
+	 * will take locks under the direction of the lastBlockVacuumed fields in
+	 * the XLOG_BTREE_VACUUM WAL records.  To cover pages after the last one
+	 * we vacuum, we need to issue a dummy XLOG_BTREE_VACUUM WAL record
+	 * against the last leaf page in the index, if that one wasn't vacuumed.
 	 */
 	if (XLogStandbyInfoActive() &&
-		num_pages > 1 && vstate.lastBlockVacuumed < (num_pages - 1))
+		vstate.lastBlockVacuumed < vstate.lastBlockLocked)
 	{
 		Buffer		buf;
 
 		/*
-		 * We can't use _bt_getbuf() here because it always applies
-		 * _bt_checkpage(), which will barf on an all-zero page. We want to
-		 * recycle all-zero pages, not fail.  Also, we want to use a
-		 * nondefault buffer access strategy.
+		 * The page should be valid, but we can't use _bt_getbuf() because we
+		 * want to use a nondefault buffer access strategy.  Since we aren't
+		 * going to delete any items, getting cleanup lock again is probably
+		 * overkill, but for consistency do that anyway.
 		 */
-		buf = ReadBufferExtended(rel, MAIN_FORKNUM, num_pages - 1, RBM_NORMAL,
-								 info->strategy);
+		buf = ReadBufferExtended(rel, MAIN_FORKNUM, vstate.lastBlockLocked,
+								 RBM_NORMAL, info->strategy);
 		LockBufferForCleanup(buf);
+		_bt_checkpage(rel, buf);
 		_bt_delitems_vacuum(rel, buf, NULL, 0, vstate.lastBlockVacuumed);
 		_bt_relbuf(rel, buf);
 	}
@@ -912,10 +915,6 @@ restart:
 		}
 	}
 
-	/* If the page is in use, update lastUsedPage */
-	if (!_bt_page_recyclable(page) && vstate->lastUsedPage < blkno)
-		vstate->lastUsedPage = blkno;
-
 	/* Page is valid, see what to do with it */
 	if (_bt_page_recyclable(page))
 	{
@@ -951,6 +950,13 @@ restart:
 		LockBuffer(buf, BUFFER_LOCK_UNLOCK);
 		LockBufferForCleanup(buf);
 
+		/*
+		 * Remember highest leaf page number we've taken cleanup lock on; see
+		 * notes in btvacuumscan
+		 */
+		if (blkno > vstate->lastBlockLocked)
+			vstate->lastBlockLocked = blkno;
+
 		/*
 		 * Check whether we need to recurse back to earlier pages.	What we
 		 * are concerned about is a page split that happened since we started
@@ -1017,19 +1023,26 @@ restart:
 		 */
 		if (ndeletable > 0)
 		{
-			BlockNumber lastBlockVacuumed = BufferGetBlockNumber(buf);
-
+			/*
+			 * Notice that the issued XLOG_BTREE_VACUUM WAL record includes an
+			 * instruction to the replay code to get cleanup lock on all pages
+			 * between the previous lastBlockVacuumed and this page.  This
+			 * ensures that WAL replay locks all leaf pages at some point.
+			 *
+			 * Since we can visit leaf pages out-of-order when recursing,
+			 * replay might end up locking such pages an extra time, but it
+			 * doesn't seem worth the amount of bookkeeping it'd take to avoid
+			 * that.
+			 */
 			_bt_delitems_vacuum(rel, buf, deletable, ndeletable,
 								vstate->lastBlockVacuumed);
 
 			/*
-			 * Keep track of the block number of the lastBlockVacuumed, so we
-			 * can scan those blocks as well during WAL replay. This then
-			 * provides concurrency protection and allows btrees to be used
-			 * while in recovery.
+			 * Remember highest leaf page number we've issued an
+			 * XLOG_BTREE_VACUUM WAL record for.
 			 */
-			if (lastBlockVacuumed > vstate->lastBlockVacuumed)
-				vstate->lastBlockVacuumed = lastBlockVacuumed;
+			if (blkno > vstate->lastBlockVacuumed)
+				vstate->lastBlockVacuumed = blkno;
 
 			stats->tuples_removed += ndeletable;
 			/* must recompute maxoff */
diff --git a/src/backend/access/nbtree/nbtxlog.c b/src/backend/access/nbtree/nbtxlog.c
index 7736caaae275bad530f7baafa80983546394a261..5c6a7d5d980b1cf65aa02a7a9a932a0209cd11d0 100644
--- a/src/backend/access/nbtree/nbtxlog.c
+++ b/src/backend/access/nbtree/nbtxlog.c
@@ -486,28 +486,47 @@ btree_xlog_vacuum(XLogRecPtr lsn, XLogRecord *record)
 	BTPageOpaque opaque;
 
 	/*
-	 * If queries might be active then we need to ensure every block is
+	 * If queries might be active then we need to ensure every leaf page is
 	 * unpinned between the lastBlockVacuumed and the current block, if there
-	 * are any. This ensures that every block in the index is touched during
-	 * VACUUM as required to ensure scans work correctly.
+	 * are any.  This prevents replay of the VACUUM from reaching the stage of
+	 * removing heap tuples while there could still be indexscans "in flight"
+	 * to those particular tuples (see nbtree/README).
+	 *
+	 * It might be worth checking if there are actually any backends running;
+	 * if not, we could just skip this.
+	 *
+	 * Since VACUUM can visit leaf pages out-of-order, it might issue records
+	 * with lastBlockVacuumed >= block; that's not an error, it just means
+	 * nothing to do now.
+	 *
+	 * Note: since we touch all pages in the range, we will lock non-leaf
+	 * pages, and also any empty (all-zero) pages that may be in the index. It
+	 * doesn't seem worth the complexity to avoid that.  But it's important
+	 * that HotStandbyActiveInReplay() will not return true if the database
+	 * isn't yet consistent; so we need not fear reading still-corrupt blocks
+	 * here during crash recovery.
 	 */
-	if (standbyState == STANDBY_SNAPSHOT_READY &&
-		(xlrec->lastBlockVacuumed + 1) != xlrec->block)
+	if (HotStandbyActiveInReplay())
 	{
-		BlockNumber blkno = xlrec->lastBlockVacuumed + 1;
+		BlockNumber blkno;
 
-		for (; blkno < xlrec->block; blkno++)
+		for (blkno = xlrec->lastBlockVacuumed + 1; blkno < xlrec->block; blkno++)
 		{
 			/*
+			 * We use RBM_NORMAL_NO_LOG mode because it's not an error
+			 * condition to see all-zero pages.  The original btvacuumpage
+			 * scan would have skipped over all-zero pages, noting them in FSM
+			 * but not bothering to initialize them just yet; so we mustn't
+			 * throw an error here.  (We could skip acquiring the cleanup lock
+			 * if PageIsNew, but it's probably not worth the cycles to test.)
+			 *
 			 * XXX we don't actually need to read the block, we just need to
 			 * confirm it is unpinned. If we had a special call into the
 			 * buffer manager we could optimise this so that if the block is
 			 * not in shared_buffers we confirm it as unpinned.
-			 *
-			 * Another simple optimization would be to check if there's any
-			 * backends running; if not, we could just skip this.
 			 */
-			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno, RBM_NORMAL);
+			buffer = XLogReadBufferExtended(xlrec->node, MAIN_FORKNUM, blkno,
+											RBM_NORMAL_NO_LOG);
 			if (BufferIsValid(buffer))
 			{
 				LockBufferForCleanup(buffer);
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 679720a33ee0279f07675ec5d130f747c1500372..d639c4ac43a8f7e94fabcbe618be4ee2f4cc9550 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7438,7 +7438,8 @@ RecoveryInProgress(void)
  * true. Postmaster knows this by way of signal, not via shared memory.
  *
  * Unlike testing standbyState, this works in any process that's connected to
- * shared memory.
+ * shared memory.  (And note that standbyState alone doesn't tell the truth
+ * anyway.)
  */
 bool
 HotStandbyActive(void)
@@ -7464,6 +7465,17 @@ HotStandbyActive(void)
 	}
 }
 
+/*
+ * Like HotStandbyActive(), but for use only in WAL replay code (the startup
+ * process, as asserted), which needn't ask any other process for the state.
+ */
+bool
+HotStandbyActiveInReplay(void)
+{
+	Assert(AmStartupProcess());
+	return LocalHotStandbyActive;
+}
+
 /*
  * Is this process allowed to insert new WAL records?
  *
diff --git a/src/backend/access/transam/xlogutils.c b/src/backend/access/transam/xlogutils.c
index 5676120a86310fb167de380810cc738a60397f43..ee70340d7a6d4781442a8783094c2f68d78bf305 100644
--- a/src/backend/access/transam/xlogutils.c
+++ b/src/backend/access/transam/xlogutils.c
@@ -287,6 +287,10 @@ XLogReadBuffer(RelFileNode rnode, BlockNumber blkno, bool init)
  *
  * In RBM_ZERO and RBM_ZERO_ON_ERROR modes, if the page doesn't exist, the
  * relation is extended with all-zeroes pages up to the given block number.
+ *
+ * In RBM_NORMAL_NO_LOG mode, we return InvalidBuffer if the page doesn't
+ * exist, and we don't check for all-zeroes.  Thus, no log entry is made
+ * to imply that the page should be dropped or truncated later.
  */
 Buffer
 XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
@@ -327,6 +331,8 @@ XLogReadBufferExtended(RelFileNode rnode, ForkNumber forknum,
 			log_invalid_page(rnode, forknum, blkno, false);
 			return InvalidBuffer;
 		}
+		if (mode == RBM_NORMAL_NO_LOG)
+			return InvalidBuffer;
 		/* OK to extend the file */
 		/* we do this in recovery only - no rel-extension lock needed */
 		Assert(InRecovery);
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index 91ef1a35e3e043de780320c941663f0f17ef3746..9b0283fa518bdc1a28c30017143af01be1110eff 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -203,7 +203,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * Assume when this function is called, that reln has been opened already.
  *
  * In RBM_NORMAL mode, the page is read from disk, and the page header is
- * validated. An error is thrown if the page header is not valid.
+ * validated.  An error is thrown if the page header is not valid.  (But
+ * note that an all-zero page is considered "valid"; see PageIsVerified().)
  *
  * RBM_ZERO_ON_ERROR is like the normal mode, but if the page header is not
  * valid, the page is zeroed instead of throwing an error. This is intended
@@ -217,6 +218,8 @@ ReadBuffer(Relation reln, BlockNumber blockNum)
  * current physical EOF; that is likely to cause problems in md.c when
  * the page is modified and written out. P_NEW is OK, though.
  *
+ * RBM_NORMAL_NO_LOG mode is treated the same as RBM_NORMAL here.
+ *
  * If strategy is not NULL, a nondefault buffer access strategy is used.
  * See buffer/README for details.
  */
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index c21e43ae146bdc8a43d33dd83f162cc800a9614f..f61f24101f94ef989ca8c86bd4b2fa9c37a0fc3c 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -289,6 +289,7 @@ extern void issue_xlog_fsync(int fd, uint32 log, uint32 seg);
 
 extern bool RecoveryInProgress(void);
 extern bool HotStandbyActive(void);
+extern bool HotStandbyActiveInReplay(void);
 extern bool XLogInsertAllowed(void);
 extern void GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream);
 extern XLogRecPtr GetXLogReplayRecPtr(TimeLineID *targetTLI);
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 51eb77b689566712293e805edd462668f4cafb41..50c324957c3a61add916093cadb17a7c9b5ba523 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -38,7 +38,9 @@ typedef enum
 	RBM_NORMAL,					/* Normal read */
 	RBM_ZERO,					/* Don't read from disk, caller will
 								 * initialize */
-	RBM_ZERO_ON_ERROR			/* Read, but return an all-zeros page on error */
+	RBM_ZERO_ON_ERROR,			/* Read, but return an all-zeros page on error */
+	RBM_NORMAL_NO_LOG			/* Don't log page as invalid during WAL
+								 * replay; otherwise same as RBM_NORMAL */
 } ReadBufferMode;
 
 /* in globals.c ... this duplicates miscadmin.h */