From 45e191e3aa62d47a8bc1a33f784286b2051f45cb Mon Sep 17 00:00:00 2001 From: Andres Freund <andres@anarazel.de> Date: Thu, 8 Sep 2016 17:02:43 -0700 Subject: [PATCH] Improve scalability of md.c for large relations. So far md.c used a linked list of segments. That proved to be a problem when processing large relations, because every smgr.c/md.c level access to a page incurred walking through a linked list of all preceding segments. Thus making accessing pages O(#segments). Replace the linked list of segments hanging off SMgrRelationData with an array of opened segments. That allows O(1) access to individual segments, if they've previously been opened. Discussion: <20140331101001.GE13135@alap3.anarazel.de> Reviewed-By: Peter Geoghegan, Tom Lane (in an older version) --- src/backend/storage/smgr/md.c | 393 ++++++++++++++++++-------------- src/backend/storage/smgr/smgr.c | 4 +- src/include/storage/smgr.h | 8 +- 3 files changed, 229 insertions(+), 176 deletions(-) diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index a94828b32c7..1360f773df1 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -92,27 +92,23 @@ * out to an unlinked old copy of a segment file that will eventually * disappear. * - * The file descriptor pointer (md_fd field) stored in the SMgrRelation - * cache is, therefore, just the head of a list of MdfdVec objects, one - * per segment. But note the md_fd pointer can be NULL, indicating - * relation not open. - * - * Also note that mdfd_chain == NULL does not necessarily mean the relation - * doesn't have another segment after this one; we may just not have - * opened the next segment yet. (We could not have "all segments are - * in the chain" as an invariant anyway, since another backend could - * extend the relation when we weren't looking.) We do not make chain + * File descriptors are stored in the per-fork md_seg_fds arrays inside + * SMgrRelation. The length of these arrays is stored in md_num_open_segs. + * Note that a fork's md_num_open_segs having a specific value does not + * necessarily mean the relation doesn't have additional segments; we may + * just not have opened the next segment yet. (We could not have "all + * segments are in the array" as an invariant anyway, since another backend + * could extend the relation while we aren't looking.) We do not have * entries for inactive segments, however; as soon as we find a partial * segment, we assume that any subsequent segments are inactive. * - * All MdfdVec objects are palloc'd in the MdCxt memory context. + * The entire MdfdVec array is palloc'd in the MdCxt memory context. */ typedef struct _MdfdVec { File mdfd_vfd; /* fd number in fd.c's pool */ BlockNumber mdfd_segno; /* segment number, from 0 */ - struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ } MdfdVec; static MemoryContext MdCxt; /* context for all MdfdVec objects */ @@ -189,7 +185,9 @@ static MdfdVec *mdopen(SMgrRelation reln, ForkNumber forknum, int behavior); static void register_dirty_segment(SMgrRelation reln, ForkNumber forknum, MdfdVec *seg); static void register_unlink(RelFileNodeBackend rnode); -static MdfdVec *_fdvec_alloc(void); +static void _fdvec_resize(SMgrRelation reln, + ForkNumber forknum, + int nseg); static char *_mdfd_segpath(SMgrRelation reln, ForkNumber forknum, BlockNumber segno); static MdfdVec *_mdfd_openseg(SMgrRelation reln, ForkNumber forkno, @@ -294,13 +292,14 @@ mdexists(SMgrRelation reln, ForkNumber forkNum) void mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) { + MdfdVec *mdfd; char *path; File fd; - if (isRedo && reln->md_fd[forkNum] != NULL) + if (isRedo && reln->md_num_open_segs[forkNum] > 0) return; /* created and opened already... */ - Assert(reln->md_fd[forkNum] == NULL); + Assert(reln->md_num_open_segs[forkNum] == 0); path = relpath(reln->smgr_rnode, forkNum); @@ -330,11 +329,10 @@ mdcreate(SMgrRelation reln, ForkNumber forkNum, bool isRedo) pfree(path); - reln->md_fd[forkNum] = _fdvec_alloc(); - - reln->md_fd[forkNum]->mdfd_vfd = fd; - reln->md_fd[forkNum]->mdfd_segno = 0; - reln->md_fd[forkNum]->mdfd_chain = NULL; + _fdvec_resize(reln, forkNum, 1); + mdfd = &reln->md_seg_fds[forkNum][0]; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; } /* @@ -579,8 +577,8 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) File fd; /* No work if already open */ - if (reln->md_fd[forknum]) - return reln->md_fd[forknum]; + if (reln->md_num_open_segs[forknum] > 0) + return &reln->md_seg_fds[forknum][0]; path = relpath(reln->smgr_rnode, forknum); @@ -612,11 +610,11 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) pfree(path); - reln->md_fd[forknum] = mdfd = _fdvec_alloc(); - + _fdvec_resize(reln, forknum, 1); + mdfd = &reln->md_seg_fds[forknum][0]; mdfd->mdfd_vfd = fd; mdfd->mdfd_segno = 0; - mdfd->mdfd_chain = NULL; + Assert(_mdnblocks(reln, forknum, mdfd) <= ((BlockNumber) RELSEG_SIZE)); return mdfd; @@ -628,25 +626,29 @@ mdopen(SMgrRelation reln, ForkNumber forknum, int behavior) void mdclose(SMgrRelation reln, ForkNumber forknum) { - MdfdVec *v = reln->md_fd[forknum]; + int nopensegs = reln->md_num_open_segs[forknum]; /* No work if already closed */ - if (v == NULL) + if (nopensegs == 0) return; - reln->md_fd[forknum] = NULL; /* prevent dangling pointer after error */ - - while (v != NULL) + /* close segments starting from the end */ + while (nopensegs > 0) { - MdfdVec *ov = v; + MdfdVec *v = &reln->md_seg_fds[forknum][nopensegs - 1]; /* if not closed already */ if (v->mdfd_vfd >= 0) + { FileClose(v->mdfd_vfd); - /* Now free vector */ - v = v->mdfd_chain; - pfree(ov); + v->mdfd_vfd = -1; + } + + nopensegs--; } + + /* resize just once, avoids pointless reallocations */ + _fdvec_resize(reln, forknum, 0); } /* @@ -862,9 +864,9 @@ mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, * mdnblocks() -- Get the number of blocks stored in a relation. * * Important side effect: all active segments of the relation are opened - * and added to the mdfd_chain list. If this routine has not been + * and added to the mdfd_seg_fds array. If this routine has not been * called, then only segments up to the last one actually touched - * are present in the chain. + * are present in the array. */ BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum) @@ -873,24 +875,24 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) BlockNumber nblocks; BlockNumber segno = 0; + /* mdopen has opened the first segment */ + Assert(reln->md_num_open_segs[forknum] > 0); + /* - * Skip through any segments that aren't the last one, to avoid redundant - * seeks on them. We have previously verified that these segments are - * exactly RELSEG_SIZE long, and it's useless to recheck that each time. + * Start from the last open segments, to avoid redundant seeks. We have + * previously verified that these segments are exactly RELSEG_SIZE long, + * and it's useless to recheck that each time. * * NOTE: this assumption could only be wrong if another backend has * truncated the relation. We rely on higher code levels to handle that * scenario by closing and re-opening the md fd, which is handled via * relcache flush. (Since the checkpointer doesn't participate in - * relcache flush, it could have segment chain entries for inactive - * segments; that's OK because the checkpointer never needs to compute - * relation size.) + * relcache flush, it could have segment entries for inactive segments; + * that's OK because the checkpointer never needs to compute relation + * size.) */ - while (v->mdfd_chain != NULL) - { - segno++; - v = v->mdfd_chain; - } + segno = reln->md_num_open_segs[forknum] - 1; + v = &reln->md_seg_fds[forknum][segno]; for (;;) { @@ -905,21 +907,16 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) */ segno++; - if (v->mdfd_chain == NULL) - { - /* - * We used to pass O_CREAT here, but that's has the disadvantage - * that it might create a segment which has vanished through some - * operating system misadventure. In such a case, creating the - * segment here undermines _mdfd_getseg's attempts to notice and - * report an error upon access to a missing segment. - */ - v->mdfd_chain = _mdfd_openseg(reln, forknum, segno, 0); - if (v->mdfd_chain == NULL) - return segno * ((BlockNumber) RELSEG_SIZE); - } - - v = v->mdfd_chain; + /* + * We used to pass O_CREAT here, but that's has the disadvantage that + * it might create a segment which has vanished through some operating + * system misadventure. In such a case, creating the segment here + * undermines _mdfd_getseg's attempts to notice and report an error + * upon access to a missing segment. + */ + v = _mdfd_openseg(reln, forknum, segno, 0); + if (v == NULL) + return segno * ((BlockNumber) RELSEG_SIZE); } } @@ -929,9 +926,9 @@ mdnblocks(SMgrRelation reln, ForkNumber forknum) void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) { - MdfdVec *v; BlockNumber curnblk; BlockNumber priorblocks; + int curopensegs; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -951,19 +948,24 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) if (nblocks == curnblk) return; /* no work */ - v = mdopen(reln, forknum, EXTENSION_FAIL); - - priorblocks = 0; - while (v != NULL) + /* + * Truncate segments, starting at the last one. Starting at the end makes + * managing the memory for the fd array easier, should there be errors. + */ + curopensegs = reln->md_num_open_segs[forknum]; + while (curopensegs > 0) { - MdfdVec *ov = v; + MdfdVec *v; + + priorblocks = (curopensegs - 1) * RELSEG_SIZE; + + v = &reln->md_seg_fds[forknum][curopensegs - 1]; if (priorblocks > nblocks) { /* - * This segment is no longer active (and has already been unlinked - * from the mdfd_chain). We truncate the file, but do not delete - * it, for reasons explained in the header comments. + * This segment is no longer active. We truncate the file, but do + * not delete it, for reasons explained in the header comments. */ if (FileTruncate(v->mdfd_vfd, 0) < 0) ereport(ERROR, @@ -973,21 +975,21 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) if (!SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - v = v->mdfd_chain; - Assert(ov != reln->md_fd[forknum]); /* we never drop the 1st - * segment */ - FileClose(ov->mdfd_vfd); - pfree(ov); + + /* we never drop the 1st segment */ + Assert(v != &reln->md_seg_fds[forknum][0]); + + FileClose(v->mdfd_vfd); + _fdvec_resize(reln, forknum, curopensegs - 1); } else if (priorblocks + ((BlockNumber) RELSEG_SIZE) > nblocks) { /* * This is the last segment we want to keep. Truncate the file to - * the right length, and clear chain link that points to any - * remaining segments (which we shall zap). NOTE: if nblocks is - * exactly a multiple K of RELSEG_SIZE, we will truncate the K+1st - * segment to 0 length but keep it. This adheres to the invariant - * given in the header comments. + * the right length. NOTE: if nblocks is exactly a multiple K of + * RELSEG_SIZE, we will truncate the K+1st segment to 0 length but + * keep it. This adheres to the invariant given in the header + * comments. */ BlockNumber lastsegblocks = nblocks - priorblocks; @@ -999,18 +1001,16 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) nblocks))); if (!SmgrIsTemp(reln)) register_dirty_segment(reln, forknum, v); - v = v->mdfd_chain; - ov->mdfd_chain = NULL; } else { /* - * We still need this segment and 0 or more blocks beyond it, so - * nothing to do here. + * We still need this segment, so nothing to do for this and any + * earlier segment. */ - v = v->mdfd_chain; + break; } - priorblocks += RELSEG_SIZE; + curopensegs--; } } @@ -1023,7 +1023,7 @@ mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks) void mdimmedsync(SMgrRelation reln, ForkNumber forknum) { - MdfdVec *v; + int segno; /* * NOTE: mdnblocks makes sure we have opened all active segments, so that @@ -1031,16 +1031,18 @@ mdimmedsync(SMgrRelation reln, ForkNumber forknum) */ mdnblocks(reln, forknum); - v = mdopen(reln, forknum, EXTENSION_FAIL); + segno = reln->md_num_open_segs[forknum]; - while (v != NULL) + while (segno > 0) { + MdfdVec *v = &reln->md_seg_fds[forknum][segno - 1]; + if (FileSync(v->mdfd_vfd) < 0) ereport(ERROR, (errcode_for_file_access(), errmsg("could not fsync file \"%s\": %m", FilePathName(v->mdfd_vfd)))); - v = v->mdfd_chain; + segno--; } } @@ -1703,12 +1705,40 @@ ForgetDatabaseFsyncRequests(Oid dbid) /* - * _fdvec_alloc() -- Make a MdfdVec object. + * _fdvec_resize() -- Resize the fork's open segments array */ -static MdfdVec * -_fdvec_alloc(void) +static void +_fdvec_resize(SMgrRelation reln, + ForkNumber forknum, + int nseg) { - return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); + if (nseg == 0) + { + if (reln->md_num_open_segs[forknum] > 0) + { + pfree(reln->md_seg_fds[forknum]); + reln->md_seg_fds[forknum] = NULL; + } + } + else if (reln->md_num_open_segs[forknum] == 0) + { + reln->md_seg_fds[forknum] = + MemoryContextAlloc(MdCxt, sizeof(MdfdVec) * nseg); + } + else + { + /* + * It doesn't seem worthwile complicating the code by having a more + * aggressive growth strategy here; the number of segments doesn't + * grow that fast, and the memory context internally will sometimes + * avoid doing an actual reallocation. + */ + reln->md_seg_fds[forknum] = + repalloc(reln->md_seg_fds[forknum], + sizeof(MdfdVec) * nseg); + } + + reln->md_num_open_segs[forknum] = nseg; } /* @@ -1756,13 +1786,14 @@ _mdfd_openseg(SMgrRelation reln, ForkNumber forknum, BlockNumber segno, if (fd < 0) return NULL; - /* allocate an mdfdvec entry for it */ - v = _fdvec_alloc(); + if (segno <= reln->md_num_open_segs[forknum]) + _fdvec_resize(reln, forknum, segno + 1); /* fill the entry */ + v = &reln->md_seg_fds[forknum][segno]; v->mdfd_vfd = fd; v->mdfd_segno = segno; - v->mdfd_chain = NULL; + Assert(_mdnblocks(reln, forknum, v) <= ((BlockNumber) RELSEG_SIZE)); /* all done */ @@ -1781,7 +1812,7 @@ static MdfdVec * _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, bool skipFsync, int behavior) { - MdfdVec *v = mdopen(reln, forknum, behavior); + MdfdVec *v; BlockNumber targetseg; BlockNumber nextsegno; @@ -1789,98 +1820,116 @@ _mdfd_getseg(SMgrRelation reln, ForkNumber forknum, BlockNumber blkno, Assert(behavior & (EXTENSION_FAIL | EXTENSION_CREATE | EXTENSION_RETURN_NULL)); - if (!v) - return NULL; /* if behavior & EXTENSION_RETURN_NULL */ - targetseg = blkno / ((BlockNumber) RELSEG_SIZE); - for (nextsegno = 1; nextsegno <= targetseg; nextsegno++) + + /* if an existing and opened segment, we're done */ + if (targetseg < reln->md_num_open_segs[forknum]) { - Assert(nextsegno == v->mdfd_segno + 1); + v = &reln->md_seg_fds[forknum][targetseg]; + return v; + } - if (v->mdfd_chain == NULL) - { - BlockNumber nblocks = _mdnblocks(reln, forknum, v); - int flags = 0; + /* + * The target segment is not yet open. Iterate over all the segments + * between the last opened and the target segment. This way missing + * segments either raise an error, or get created (according to + * 'behavior'). Start with either the last opened, or the first segment if + * none was opened before. + */ + if (reln->md_num_open_segs[forknum] > 0) + v = &reln->md_seg_fds[forknum][reln->md_num_open_segs[forknum] - 1]; + else + { + v = mdopen(reln, forknum, behavior); + if (!v) + return NULL; /* if behavior & EXTENSION_RETURN_NULL */ + } + + for (nextsegno = reln->md_num_open_segs[forknum]; + nextsegno <= targetseg; nextsegno++) + { + BlockNumber nblocks = _mdnblocks(reln, forknum, v); + int flags = 0; - if (nblocks > ((BlockNumber) RELSEG_SIZE)) - elog(FATAL, "segment too big"); + Assert(nextsegno == v->mdfd_segno + 1); + + if (nblocks > ((BlockNumber) RELSEG_SIZE)) + elog(FATAL, "segment too big"); - if ((behavior & EXTENSION_CREATE) || - (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY))) + if ((behavior & EXTENSION_CREATE) || + (InRecovery && (behavior & EXTENSION_CREATE_RECOVERY))) + { + /* + * Normally we will create new segments only if authorized by the + * caller (i.e., we are doing mdextend()). But when doing WAL + * recovery, create segments anyway; this allows cases such as + * replaying WAL data that has a write into a high-numbered + * segment of a relation that was later deleted. We want to go + * ahead and create the segments so we can finish out the replay. + * However if the caller has specified + * EXTENSION_REALLY_RETURN_NULL, then extension is not desired + * even in recovery; we won't reach this point in that case. + * + * We have to maintain the invariant that segments before the last + * active segment are of size RELSEG_SIZE; therefore, if + * extending, pad them out with zeroes if needed. (This only + * matters if in recovery, or if the caller is extending the + * relation discontiguously, but that can happen in hash indexes.) + */ + if (nblocks < ((BlockNumber) RELSEG_SIZE)) { - /* - * Normally we will create new segments only if authorized by - * the caller (i.e., we are doing mdextend()). But when doing - * WAL recovery, create segments anyway; this allows cases - * such as replaying WAL data that has a write into a - * high-numbered segment of a relation that was later deleted. - * We want to go ahead and create the segments so we can - * finish out the replay. However if the caller has specified - * EXTENSION_REALLY_RETURN_NULL, then extension is not desired - * even in recovery; we won't reach this point in that case. - * - * We have to maintain the invariant that segments before the - * last active segment are of size RELSEG_SIZE; therefore, if - * extending, pad them out with zeroes if needed. (This only - * matters if in recovery, or if the caller is extending the - * relation discontiguously, but that can happen in hash - * indexes.) - */ - if (nblocks < ((BlockNumber) RELSEG_SIZE)) - { - char *zerobuf = palloc0(BLCKSZ); + char *zerobuf = palloc0(BLCKSZ); - mdextend(reln, forknum, - nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, - zerobuf, skipFsync); - pfree(zerobuf); - } - flags = O_CREAT; + mdextend(reln, forknum, + nextsegno * ((BlockNumber) RELSEG_SIZE) - 1, + zerobuf, skipFsync); + pfree(zerobuf); } - else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && - nblocks < ((BlockNumber) RELSEG_SIZE)) + flags = O_CREAT; + } + else if (!(behavior & EXTENSION_DONT_CHECK_SIZE) && + nblocks < ((BlockNumber) RELSEG_SIZE)) + { + /* + * When not extending (or explicitly including truncated + * segments), only open the next segment if the current one is + * exactly RELSEG_SIZE. If not (this branch), either return NULL + * or fail. + */ + if (behavior & EXTENSION_RETURN_NULL) { /* - * When not extending (or explicitly including truncated - * segments), only open the next segment if the current one is - * exactly RELSEG_SIZE. If not (this branch), either return - * NULL or fail. + * Some callers discern between reasons for _mdfd_getseg() + * returning NULL based on errno. As there's no failing + * syscall involved in this case, explicitly set errno to + * ENOENT, as that seems the closest interpretation. */ - if (behavior & EXTENSION_RETURN_NULL) - { - /* - * Some callers discern between reasons for _mdfd_getseg() - * returning NULL based on errno. As there's no failing - * syscall involved in this case, explicitly set errno to - * ENOENT, as that seems the closest interpretation. - */ - errno = ENOENT; - return NULL; - } - - ereport(ERROR, - (errcode_for_file_access(), - errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks", - _mdfd_segpath(reln, forknum, nextsegno), - blkno, nblocks))); + errno = ENOENT; + return NULL; } - v->mdfd_chain = _mdfd_openseg(reln, forknum, nextsegno, flags); + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not open file \"%s\" (target block %u): previous segment is only %u blocks", + _mdfd_segpath(reln, forknum, nextsegno), + blkno, nblocks))); + } - if (v->mdfd_chain == NULL) - { - if ((behavior & EXTENSION_RETURN_NULL) && - FILE_POSSIBLY_DELETED(errno)) - return NULL; - ereport(ERROR, - (errcode_for_file_access(), + v = _mdfd_openseg(reln, forknum, nextsegno, flags); + + if (v == NULL) + { + if ((behavior & EXTENSION_RETURN_NULL) && + FILE_POSSIBLY_DELETED(errno)) + return NULL; + ereport(ERROR, + (errcode_for_file_access(), errmsg("could not open file \"%s\" (target block %u): %m", _mdfd_segpath(reln, forknum, nextsegno), blkno))); - } } - v = v->mdfd_chain; } + return v; } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 94aa952bcc8..93f0080a60c 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -174,7 +174,7 @@ smgropen(RelFileNode rnode, BackendId backend) /* mark it not open */ for (forknum = 0; forknum <= MAX_FORKNUM; forknum++) - reln->md_fd[forknum] = NULL; + reln->md_num_open_segs[forknum] = 0; /* it has no owner yet */ add_to_unowned_list(reln); @@ -379,7 +379,7 @@ smgrcreate(SMgrRelation reln, ForkNumber forknum, bool isRedo) * Exit quickly in WAL replay mode if we've already opened the file. If * it's open, it surely must exist. */ - if (isRedo && reln->md_fd[forknum] != NULL) + if (isRedo && reln->md_num_open_segs[forknum] > 0) return; /* diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a8e7877f704..3430d8665e4 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -64,8 +64,12 @@ typedef struct SMgrRelationData */ int smgr_which; /* storage manager selector */ - /* for md.c; NULL for forks that are not open */ - struct _MdfdVec *md_fd[MAX_FORKNUM + 1]; + /* + * for md.c; per-fork arrays of the number of open segments + * (md_num_open_segs) and the segments themselves (md_seg_fds). + */ + int md_num_open_segs[MAX_FORKNUM + 1]; + struct _MdfdVec *md_seg_fds[MAX_FORKNUM + 1]; /* if unowned, list link in list of all unowned SMgrRelations */ struct SMgrRelationData *next_unowned_reln; -- GitLab