From 45e191e3aa62d47a8bc1a33f784286b2051f45cb Mon Sep 17 00:00:00 2001
From: Andres Freund <andres@anarazel.de>
Date: Thu, 8 Sep 2016 17:02:43 -0700
Subject: [PATCH] Improve scalability of md.c for large relations.

So far md.c used a linked list of segments. That proved to be a problem
when processing large relations, because every smgr.c/md.c level access
to a page incurred walking through a linked list of all preceding
segments. Thus making accessing pages O(#segments).

Replace the linked list of segments hanging off SMgrRelationData with an
array of opened segments. That allows O(1) access to individual
segments, if they've previously been opened.

Discussion: <20140331101001.GE13135@alap3.anarazel.de>
Reviewed-By: Peter Geoghegan, Tom Lane (in an older version)
---
 src/backend/storage/smgr/md.c   | 393 ++++++++++++++++++--------------
 src/backend/storage/smgr/smgr.c |   4 +-
 src/include/storage/smgr.h      |   8 +-
 3 files changed, 229 insertions(+), 176 deletions(-)

diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index a94828b32c7..1360f773df1 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -92,27 +92,23 @@
  *	out to an unlinked old copy of a segment file that will eventually
  *	disappear.
  *
- *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
- *	cache is, therefore, just the head of a list of MdfdVec objects, one
- *	per segment.  But note the md_fd pointer can be NULL, indicating
- *	relation not open.
- *
- *	Also note that mdfd_chain == NULL does not necessarily mean the relation
- *	doesn't have another segment after this one; we may just not have
- *	opened the next segment yet.  (We could not have "all segments are
- *	in the chain" as an invariant anyway, since another backend could
- *	extend the relation when we weren't looking.)  We do not make chain
+ *	File descriptors are stored in the per-fork md_seg_fds arrays inside
+ *	SMgrRelation. The length of these arrays is stored in md_num_open_segs.
+ *	Note that a fork's md_num_open_segs having a specific value does not
+ *	necessarily mean the relation doesn't have additional segments; we may
+ *	just not have opened the next segment yet.  (We could not have "all
+ *	segments are in the array" as an invariant anyway, since another backend
+ *	could extend the relation while we aren't looking.)  We do not have
  *	entries for inactive segments, however; as soon as we find a partial
  *	segment, we assume that any subsequent segments are inactive.
  *
- *	All MdfdVec objects are palloc'd in the MdCxt memory context.
+ *	The entire MdfdVec array is palloc'd in the MdCxt memory context.
  */
 
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;		/* fd number in fd.c's pool */
 	BlockNumber mdfd_segno;		/* segment number, from 0 */
-	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all MdfdVec objects */
@@ -189,7 +185,9 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior);
 static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum,
 					   MdfdVec *seg);
 static void register_unlink(RelFileNodeBackend rnode);
-static MdfdVec *_fdvec_alloc(void);
+static void _fdvec_resize(SMgrRelation reln,
+			  ForkNumber forknum,
+			  int nseg);
 static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum,
 			  BlockNumber segno);
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno,
@@ -294,13 +292,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum)
 void
 mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 {
+	MdfdVec    *mdfd;
 	char	   *path;
 	File		fd;
 
-	if (isRedo && reln->md_fd[forkNum] != NULL)
+	if (isRedo && reln->md_num_open_segs[forkNum] > 0)
 		return;					/* created and opened already... */
 
-	Assert(reln->md_fd[forkNum] == NULL);
+	Assert(reln->md_num_open_segs[forkNum] == 0);
 
 	path = relpath(reln->smgr_rnode, forkNum);
 
@@ -330,11 +329,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo)
 
 	pfree(path);
 
-	reln->md_fd[forkNum] = _fdvec_alloc();
-
-	reln->md_fd[forkNum]->mdfd_vfd = fd;
-	reln->md_fd[forkNum]->mdfd_segno = 0;
-	reln->md_fd[forkNum]->mdfd_chain = NULL;
+	_fdvec_resize(reln, forkNum, 1);
+	mdfd = &reln->md_seg_fds[forkNum][0];
+	mdfd->mdfd_vfd = fd;
+	mdfd->mdfd_segno = 0;
 }
 
 /*
@@ -579,8 +577,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 	File		fd;
 
 	/* No work if already open */
-	if (reln->md_fd[forknum])
-		return reln->md_fd[forknum];
+	if (reln->md_num_open_segs[forknum] > 0)
+		return &reln->md_seg_fds[forknum][0];
 
 	path = relpath(reln->smgr_rnode, forknum);
 
@@ -612,11 +610,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 
 	pfree(path);
 
-	reln->md_fd[forknum] = mdfd = _fdvec_alloc();
-
+	_fdvec_resize(reln, forknum, 1);
+	mdfd = &reln->md_seg_fds[forknum][0];
 	mdfd->mdfd_vfd = fd;
 	mdfd->mdfd_segno = 0;
-	mdfd->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE));
 
 	return mdfd;
@@ -628,25 +626,29 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior)
 void
 mdclose(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v = reln->md_fd[forknum];
+	int			nopensegs = reln->md_num_open_segs[forknum];
 
 	/* No work if already closed */
-	if (v == NULL)
+	if (nopensegs == 0)
 		return;
 
-	reln->md_fd[forknum] = NULL;	/* prevent dangling pointer after error */
-
-	while (v != NULL)
+	/* close segments starting from the end */
+	while (nopensegs > 0)
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v = &reln->md_seg_fds[forknum][nopensegs - 1];
 
 		/* if not closed already */
 		if (v->mdfd_vfd >= 0)
+		{
 			FileClose(v->mdfd_vfd);
-		/* Now free vector */
-		v = v->mdfd_chain;
-		pfree(ov);
+			v->mdfd_vfd = -1;
+		}
+
+		nopensegs--;
 	}
+
+	/* resize just once, avoids pointless reallocations */
+	_fdvec_resize(reln, forknum, 0);
 }
 
 /*
@@ -862,9 +864,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum,
  *	mdnblocks() -- Get the number of blocks stored in a relation.
  *
  *		Important side effect: all active segments of the relation are opened
- *		and added to the mdfd_chain list.  If this routine has not been
+ *		and added to the mdfd_seg_fds array.  If this routine has not been
  *		called, then only segments up to the last one actually touched
- *		are present in the chain.
+ *		are present in the array.
  */
 BlockNumber
 mdnblocks(SMgrRelation reln, ForkNumber forknum)
@@ -873,24 +875,24 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 	BlockNumber nblocks;
 	BlockNumber segno = 0;
 
+	/* mdopen has opened the first segment */
+	Assert(reln->md_num_open_segs[forknum] > 0);
+
 	/*
-	 * Skip through any segments that aren't the last one, to avoid redundant
-	 * seeks on them.  We have previously verified that these segments are
-	 * exactly RELSEG_SIZE long, and it's useless to recheck that each time.
+	 * Start from the last open segments, to avoid redundant seeks.  We have
+	 * previously verified that these segments are exactly RELSEG_SIZE long,
+	 * and it's useless to recheck that each time.
 	 *
 	 * NOTE: this assumption could only be wrong if another backend has
 	 * truncated the relation.  We rely on higher code levels to handle that
 	 * scenario by closing and re-opening the md fd, which is handled via
 	 * relcache flush.  (Since the checkpointer doesn't participate in
-	 * relcache flush, it could have segment chain entries for inactive
-	 * segments; that's OK because the checkpointer never needs to compute
-	 * relation size.)
+	 * relcache flush, it could have segment entries for inactive segments;
+	 * that's OK because the checkpointer never needs to compute relation
+	 * size.)
 	 */
-	while (v->mdfd_chain != NULL)
-	{
-		segno++;
-		v = v->mdfd_chain;
-	}
+	segno = reln->md_num_open_segs[forknum] - 1;
+	v = &reln->md_seg_fds[forknum][segno];
 
 	for (;;)
 	{
@@ -905,21 +907,16 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 		 */
 		segno++;
 
-		if (v->mdfd_chain == NULL)
-		{
-			/*
-			 * We used to pass O_CREAT here, but that's has the disadvantage
-			 * that it might create a segment which has vanished through some
-			 * operating system misadventure.  In such a case, creating the
-			 * segment here undermines _mdfd_getseg's attempts to notice and
-			 * report an error upon access to a missing segment.
-			 */
-			v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, 0);
-			if (v->mdfd_chain == NULL)
-				return segno * ((BlockNumber) RELSEG_SIZE);
-		}
-
-		v = v->mdfd_chain;
+		/*
+		 * We used to pass O_CREAT here, but that's has the disadvantage that
+		 * it might create a segment which has vanished through some operating
+		 * system misadventure.  In such a case, creating the segment here
+		 * undermines _mdfd_getseg's attempts to notice and report an error
+		 * upon access to a missing segment.
+		 */
+		v = _mdfd_openseg(reln, forknum, segno, 0);
+		if (v == NULL)
+			return segno * ((BlockNumber) RELSEG_SIZE);
 	}
 }
 
@@ -929,9 +926,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum)
 void
 mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 {
-	MdfdVec    *v;
 	BlockNumber curnblk;
 	BlockNumber priorblocks;
+	int			curopensegs;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -951,19 +948,24 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 	if (nblocks == curnblk)
 		return;					/* no work */
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
-
-	priorblocks = 0;
-	while (v != NULL)
+	/*
+	 * Truncate segments, starting at the last one. Starting at the end makes
+	 * managing the memory for the fd array easier, should there be errors.
+	 */
+	curopensegs = reln->md_num_open_segs[forknum];
+	while (curopensegs > 0)
 	{
-		MdfdVec    *ov = v;
+		MdfdVec    *v;
+
+		priorblocks = (curopensegs - 1) * RELSEG_SIZE;
+
+		v = &reln->md_seg_fds[forknum][curopensegs - 1];
 
 		if (priorblocks > nblocks)
 		{
 			/*
-			 * This segment is no longer active (and has already been unlinked
-			 * from the mdfd_chain). We truncate the file, but do not delete
-			 * it, for reasons explained in the header comments.
+			 * This segment is no longer active. We truncate the file, but do
+			 * not delete it, for reasons explained in the header comments.
 			 */
 			if (FileTruncate(v->mdfd_vfd, 0) < 0)
 				ereport(ERROR,
@@ -973,21 +975,21 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st
-												 * segment */
-			FileClose(ov->mdfd_vfd);
-			pfree(ov);
+
+			/* we never drop the 1st segment */
+			Assert(v != &reln->md_seg_fds[forknum][0]);
+
+			FileClose(v->mdfd_vfd);
+			_fdvec_resize(reln, forknum, curopensegs - 1);
 		}
 		else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks)
 		{
 			/*
 			 * This is the last segment we want to keep. Truncate the file to
-			 * the right length, and clear chain link that points to any
-			 * remaining segments (which we shall zap). NOTE: if nblocks is
-			 * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st
-			 * segment to 0 length but keep it. This adheres to the invariant
-			 * given in the header comments.
+			 * the right length. NOTE: if nblocks is exactly a multiple K of
+			 * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but
+			 * keep it. This adheres to the invariant given in the header
+			 * comments.
 			 */
 			BlockNumber lastsegblocks = nblocks - priorblocks;
 
@@ -999,18 +1001,16 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 						   nblocks)));
 			if (!SmgrIsTemp(reln))
 				register_dirty_segment(reln, forknum, v);
-			v = v->mdfd_chain;
-			ov->mdfd_chain = NULL;
 		}
 		else
 		{
 			/*
-			 * We still need this segment and 0 or more blocks beyond it, so
-			 * nothing to do here.
+			 * We still need this segment, so nothing to do for this and any
+			 * earlier segment.
 			 */
-			v = v->mdfd_chain;
+			break;
 		}
-		priorblocks += RELSEG_SIZE;
+		curopensegs--;
 	}
 }
 
@@ -1023,7 +1023,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks)
 void
 mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 {
-	MdfdVec    *v;
+	int			segno;
 
 	/*
 	 * NOTE: mdnblocks makes sure we have opened all active segments, so that
@@ -1031,16 +1031,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum)
 	 */
 	mdnblocks(reln, forknum);
 
-	v = mdopen(reln, forknum, EXTENSION_FAIL);
+	segno = reln->md_num_open_segs[forknum];
 
-	while (v != NULL)
+	while (segno > 0)
 	{
+		MdfdVec    *v = &reln->md_seg_fds[forknum][segno - 1];
+
 		if (FileSync(v->mdfd_vfd) < 0)
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not fsync file \"%s\": %m",
 							FilePathName(v->mdfd_vfd))));
-		v = v->mdfd_chain;
+		segno--;
 	}
 }
 
@@ -1703,12 +1705,40 @@ ForgetDatabaseFsyncRequests(Oid dbid)
 
 
 /*
- *	_fdvec_alloc() -- Make a MdfdVec object.
+ *	_fdvec_resize() -- Resize the fork's open segments array
  */
-static MdfdVec *
-_fdvec_alloc(void)
+static void
+_fdvec_resize(SMgrRelation reln,
+			  ForkNumber forknum,
+			  int nseg)
 {
-	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
+	if (nseg == 0)
+	{
+		if (reln->md_num_open_segs[forknum] > 0)
+		{
+			pfree(reln->md_seg_fds[forknum]);
+			reln->md_seg_fds[forknum] = NULL;
+		}
+	}
+	else if (reln->md_num_open_segs[forknum] == 0)
+	{
+		reln->md_seg_fds[forknum] =
+			MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg);
+	}
+	else
+	{
+		/*
+		 * It doesn't seem worthwile complicating the code by having a more
+		 * aggressive growth strategy here; the number of segments doesn't
+		 * grow that fast, and the memory context internally will sometimes
+		 * avoid doing an actual reallocation.
+		 */
+		reln->md_seg_fds[forknum] =
+			repalloc(reln->md_seg_fds[forknum],
+					 sizeof(MdfdVec) * nseg);
+	}
+
+	reln->md_num_open_segs[forknum] = nseg;
 }
 
 /*
@@ -1756,13 +1786,14 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno,
 	if (fd < 0)
 		return NULL;
 
-	/* allocate an mdfdvec entry for it */
-	v = _fdvec_alloc();
+	if (segno <= reln->md_num_open_segs[forknum])
+		_fdvec_resize(reln, forknum, segno + 1);
 
 	/* fill the entry */
+	v = &reln->md_seg_fds[forknum][segno];
 	v->mdfd_vfd = fd;
 	v->mdfd_segno = segno;
-	v->mdfd_chain = NULL;
+
 	Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE));
 
 	/* all done */
@@ -1781,7 +1812,7 @@ static MdfdVec *
 _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 			 bool skipFsync, int behavior)
 {
-	MdfdVec    *v = mdopen(reln, forknum, behavior);
+	MdfdVec    *v;
 	BlockNumber targetseg;
 	BlockNumber nextsegno;
 
@@ -1789,98 +1820,116 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno,
 	Assert(behavior &
 		   (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL));
 
-	if (!v)
-		return NULL;			/* if behavior & EXTENSION_RETURN_NULL */
-
 	targetseg = blkno / ((BlockNumber) RELSEG_SIZE);
-	for (nextsegno = 1; nextsegno <= targetseg; nextsegno++)
+
+	/* if an existing and opened segment, we're done */
+	if (targetseg < reln->md_num_open_segs[forknum])
 	{
-		Assert(nextsegno == v->mdfd_segno + 1);
+		v = &reln->md_seg_fds[forknum][targetseg];
+		return v;
+	}
 
-		if (v->mdfd_chain == NULL)
-		{
-			BlockNumber nblocks = _mdnblocks(reln, forknum, v);
-			int			flags = 0;
+	/*
+	 * The target segment is not yet open. Iterate over all the segments
+	 * between the last opened and the target segment. This way missing
+	 * segments either raise an error, or get created (according to
+	 * 'behavior'). Start with either the last opened, or the first segment if
+	 * none was opened before.
+	 */
+	if (reln->md_num_open_segs[forknum] > 0)
+		v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1];
+	else
+	{
+		v = mdopen(reln, forknum, behavior);
+		if (!v)
+			return NULL;		/* if behavior & EXTENSION_RETURN_NULL */
+	}
+
+	for (nextsegno = reln->md_num_open_segs[forknum];
+		 nextsegno <= targetseg; nextsegno++)
+	{
+		BlockNumber nblocks = _mdnblocks(reln, forknum, v);
+		int			flags = 0;
 
-			if (nblocks > ((BlockNumber) RELSEG_SIZE))
-				elog(FATAL, "segment too big");
+		Assert(nextsegno == v->mdfd_segno + 1);
+
+		if (nblocks > ((BlockNumber) RELSEG_SIZE))
+			elog(FATAL, "segment too big");
 
-			if ((behavior & EXTENSION_CREATE) ||
-				(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
+		if ((behavior & EXTENSION_CREATE) ||
+			(InRecovery && (behavior & EXTENSION_CREATE_RECOVERY)))
+		{
+			/*
+			 * Normally we will create new segments only if authorized by the
+			 * caller (i.e., we are doing mdextend()).  But when doing WAL
+			 * recovery, create segments anyway; this allows cases such as
+			 * replaying WAL data that has a write into a high-numbered
+			 * segment of a relation that was later deleted. We want to go
+			 * ahead and create the segments so we can finish out the replay.
+			 * However if the caller has specified
+			 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
+			 * even in recovery; we won't reach this point in that case.
+			 *
+			 * We have to maintain the invariant that segments before the last
+			 * active segment are of size RELSEG_SIZE; therefore, if
+			 * extending, pad them out with zeroes if needed.  (This only
+			 * matters if in recovery, or if the caller is extending the
+			 * relation discontiguously, but that can happen in hash indexes.)
+			 */
+			if (nblocks < ((BlockNumber) RELSEG_SIZE))
 			{
-				/*
-				 * Normally we will create new segments only if authorized by
-				 * the caller (i.e., we are doing mdextend()).  But when doing
-				 * WAL recovery, create segments anyway; this allows cases
-				 * such as replaying WAL data that has a write into a
-				 * high-numbered segment of a relation that was later deleted.
-				 * We want to go ahead and create the segments so we can
-				 * finish out the replay.  However if the caller has specified
-				 * EXTENSION_REALLY_RETURN_NULL, then extension is not desired
-				 * even in recovery; we won't reach this point in that case.
-				 *
-				 * We have to maintain the invariant that segments before the
-				 * last active segment are of size RELSEG_SIZE; therefore, if
-				 * extending, pad them out with zeroes if needed.  (This only
-				 * matters if in recovery, or if the caller is extending the
-				 * relation discontiguously, but that can happen in hash
-				 * indexes.)
-				 */
-				if (nblocks < ((BlockNumber) RELSEG_SIZE))
-				{
-					char	   *zerobuf = palloc0(BLCKSZ);
+				char	   *zerobuf = palloc0(BLCKSZ);
 
-					mdextend(reln, forknum,
-							 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
-							 zerobuf, skipFsync);
-					pfree(zerobuf);
-				}
-				flags = O_CREAT;
+				mdextend(reln, forknum,
+						 nextsegno * ((BlockNumber) RELSEG_SIZE) - 1,
+						 zerobuf, skipFsync);
+				pfree(zerobuf);
 			}
-			else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
-					 nblocks < ((BlockNumber) RELSEG_SIZE))
+			flags = O_CREAT;
+		}
+		else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) &&
+				 nblocks < ((BlockNumber) RELSEG_SIZE))
+		{
+			/*
+			 * When not extending (or explicitly including truncated
+			 * segments), only open the next segment if the current one is
+			 * exactly RELSEG_SIZE.  If not (this branch), either return NULL
+			 * or fail.
+			 */
+			if (behavior & EXTENSION_RETURN_NULL)
 			{
 				/*
-				 * When not extending (or explicitly including truncated
-				 * segments), only open the next segment if the current one is
-				 * exactly RELSEG_SIZE.  If not (this branch), either return
-				 * NULL or fail.
+				 * Some callers discern between reasons for _mdfd_getseg()
+				 * returning NULL based on errno. As there's no failing
+				 * syscall involved in this case, explicitly set errno to
+				 * ENOENT, as that seems the closest interpretation.
 				 */
-				if (behavior & EXTENSION_RETURN_NULL)
-				{
-					/*
-					 * Some callers discern between reasons for _mdfd_getseg()
-					 * returning NULL based on errno. As there's no failing
-					 * syscall involved in this case, explicitly set errno to
-					 * ENOENT, as that seems the closest interpretation.
-					 */
-					errno = ENOENT;
-					return NULL;
-				}
-
-				ereport(ERROR,
-						(errcode_for_file_access(),
-						 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
-								_mdfd_segpath(reln, forknum, nextsegno),
-								blkno, nblocks)));
+				errno = ENOENT;
+				return NULL;
 			}
 
-			v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, flags);
+			ereport(ERROR,
+					(errcode_for_file_access(),
+					 errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks",
+							_mdfd_segpath(reln, forknum, nextsegno),
+							blkno, nblocks)));
+		}
 
-			if (v->mdfd_chain == NULL)
-			{
-				if ((behavior & EXTENSION_RETURN_NULL) &&
-					FILE_POSSIBLY_DELETED(errno))
-					return NULL;
-				ereport(ERROR,
-						(errcode_for_file_access(),
+		v = _mdfd_openseg(reln, forknum, nextsegno, flags);
+
+		if (v == NULL)
+		{
+			if ((behavior & EXTENSION_RETURN_NULL) &&
+				FILE_POSSIBLY_DELETED(errno))
+				return NULL;
+			ereport(ERROR,
+					(errcode_for_file_access(),
 				   errmsg("could not open file \"%s\" (target block %u): %m",
 						  _mdfd_segpath(reln, forknum, nextsegno),
 						  blkno)));
-			}
 		}
-		v = v->mdfd_chain;
 	}
+
 	return v;
 }
 
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index 94aa952bcc8..93f0080a60c 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -174,7 +174,7 @@ smgropen(RelFileNode rnode, BackendId backend)
 
 		/* mark it not open */
 		for (forknum = 0; forknum <= MAX_FORKNUM; forknum++)
-			reln->md_fd[forknum] = NULL;
+			reln->md_num_open_segs[forknum] = 0;
 
 		/* it has no owner yet */
 		add_to_unowned_list(reln);
@@ -379,7 +379,7 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo)
 	 * Exit quickly in WAL replay mode if we've already opened the file. If
 	 * it's open, it surely must exist.
 	 */
-	if (isRedo && reln->md_fd[forknum] != NULL)
+	if (isRedo && reln->md_num_open_segs[forknum] > 0)
 		return;
 
 	/*
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index a8e7877f704..3430d8665e4 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -64,8 +64,12 @@ typedef struct SMgrRelationData
 	 */
 	int			smgr_which;		/* storage manager selector */
 
-	/* for md.c; NULL for forks that are not open */
-	struct _MdfdVec *md_fd[MAX_FORKNUM + 1];
+	/*
+	 * for md.c; per-fork arrays of the number of open segments
+	 * (md_num_open_segs) and the segments themselves (md_seg_fds).
+	 */
+	int			md_num_open_segs[MAX_FORKNUM + 1];
+	struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1];
 
 	/* if unowned, list link in list of all unowned SMgrRelations */
 	struct SMgrRelationData *next_unowned_reln;
-- 
GitLab