diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 2f726331ac4d42748e15858cdeb7903739f9bfb1..6c73fb439cc4ea6e23e841926adaa6652665c821 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1843,6 +1843,35 @@ include_dir 'conf.d' </para> </listitem> </varlistentry> + + <varlistentry id="guc-bgwriter-flush-after" xreflabel="bgwriter_flush_after"> + <term><varname>bgwriter_flush_after</varname> (<type>int</type>) + <indexterm> + <primary><varname>bgwriter_flush_after</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Whenever more than <varname>bgwriter_flush_after</varname> bytes have + been written by the bgwriter, attempt to force the OS to issue these + writes to the underlying storage. Doing so will limit the amount of + dirty data in the kernel's page cache, reducing the likelihood of + stalls when an fsync is issued at the end of a checkpoint, or when + the OS writes data back in larger batches in the background. Often + that will result in greatly reduced transaction latency, but there + also are some cases, especially with workloads that are bigger than + <xref linkend="guc-shared-buffers">, but smaller than the OS's page + cache, where performance might degrade. This setting may have no + effect on some platforms. The valid range is between + <literal>0</literal>, which disables controlled writeback, and + <literal>2MB</literal>. The default is <literal>512Kb</> on Linux, + <literal>0</> elsewhere. (Non-default values of + <symbol>BLCKSZ</symbol> change the default and maximum.) + This parameter can only be set in the <filename>postgresql.conf</> + file or on the server command line. + </para> + </listitem> + </varlistentry> </variablelist> <para> @@ -1944,6 +1973,35 @@ include_dir 'conf.d' </para> </listitem> </varlistentry> + + <varlistentry id="guc-backend-flush-after" xreflabel="backend_flush_after"> + <term><varname>backend_flush_after</varname> (<type>int</type>) + <indexterm> + <primary><varname>backend_flush_after</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Whenever more than <varname>backend_flush_after</varname> bytes have + been written by a single backend, attempt to force the OS to issue + these writes to the underlying storage. Doing so will limit the + amount of dirty data in the kernel's page cache, reducing the + likelihood of stalls when an fsync is issued at the end of a + checkpoint, or when the OS writes data back in larger batches in the + background. Often that will result in greatly reduced transaction + latency, but there also are some cases, especially with workloads + that are bigger than <xref linkend="guc-shared-buffers">, but smaller + than the OS's page cache, where performance might degrade. This + setting may have no effect on some platforms. The valid range is + between <literal>0</literal>, which disables controlled writeback, + and <literal>2MB</literal>. The default is <literal>128Kb</> on + Linux, <literal>0</> elsewhere. (Non-default values of + <symbol>BLCKSZ</symbol> change the default and maximum.) + This parameter can only be set in the <filename>postgresql.conf</> + file or on the server command line. + </para> + </listitem> + </varlistentry> </variablelist> </sect2> </sect1> @@ -2475,6 +2533,35 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-checkpoint-flush-after" xreflabel="checkpoint_flush_after"> + <term><varname>checkpoint_flush_after</varname> (<type>int</type>) + <indexterm> + <primary><varname>checkpoint_flush_after</> configuration parameter</primary> + </indexterm> + </term> + <listitem> + <para> + Whenever more than <varname>checkpoint_flush_after</varname> bytes + have been written while performing a checkpoint, attempt to force the + OS to issue these writes to the underlying storage. Doing so will + limit the amount of dirty data in the kernel's page cache, reducing + the likelihood of stalls when an fsync is issued at the end of the + checkpoint, or when the OS writes data back in larger batches in the + background. Often that will result in greatly reduced transaction + latency, but there also are some cases, especially with workloads + that are bigger than <xref linkend="guc-shared-buffers">, but smaller + than the OS's page cache, where performance might degrade. This + setting may have no effect on some platforms. The valid range is + between <literal>0</literal>, which disables controlled writeback, + and <literal>2MB</literal>. The default is <literal>128Kb</> on + Linux, <literal>0</> elsewhere. (Non-default values of + <symbol>BLCKSZ</symbol> change the default and maximum.) + This parameter can only be set in the <filename>postgresql.conf</> + file or on the server command line. + </para> + </listitem> + </varlistentry> + <varlistentry id="guc-checkpoint-warning" xreflabel="checkpoint_warning"> <term><varname>checkpoint_warning</varname> (<type>integer</type>) <indexterm> diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index e3941c9391b7fe4f277e673e7a2f31ba7a168714..503ea8a2a7e913d1be7df792f2b1e7b092ee5c7f 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -545,6 +545,17 @@ unexpected variation in the number of WAL segments needed. </para> + <para> + On Linux and POSIX platforms <xref linkend="guc-checkpoint-flush-after"> + allows to force the OS that pages written by the checkpoint should be + flushed to disk after a configurable number of bytes. Otherwise, these + pages may be kept in the OS's page cache, inducing a stall when + <literal>fsync</> is issued at the end of a checkpoint. This setting will + often help to reduce transaction latency, but it also can an adverse effect + on performance; particularly for workloads that are bigger than + <xref linkend="guc-shared-buffers">, but smaller than the OS's page cache. + </para> + <para> The number of WAL segment files in <filename>pg_xlog</> directory depends on <varname>min_wal_size</>, <varname>max_wal_size</> and diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index ad948168a706064939ff091caecc8a43d29a82e7..00f03d8acbe7ba54b4fc91aa28c80f2c508850fb 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -111,6 +111,7 @@ BackgroundWriterMain(void) sigjmp_buf local_sigjmp_buf; MemoryContext bgwriter_context; bool prev_hibernate; + WritebackContext wb_context; /* * Properly accept or ignore signals the postmaster might send us. @@ -164,6 +165,8 @@ BackgroundWriterMain(void) ALLOCSET_DEFAULT_MAXSIZE); MemoryContextSwitchTo(bgwriter_context); + WritebackContextInit(&wb_context, &bgwriter_flush_after); + /* * If an exception is encountered, processing resumes here. * @@ -208,6 +211,9 @@ BackgroundWriterMain(void) /* Flush any leaked data in the top-level context */ MemoryContextResetAndDeleteChildren(bgwriter_context); + /* re-initilialize to avoid repeated errors causing problems */ + WritebackContextInit(&wb_context, &bgwriter_flush_after); + /* Now we can allow interrupts again */ RESUME_INTERRUPTS(); @@ -272,7 +278,7 @@ BackgroundWriterMain(void) /* * Do one cycle of dirty-buffer writing. */ - can_hibernate = BgBufferSync(); + can_hibernate = BgBufferSync(&wb_context); /* * Send off activity statistics to the stats collector diff --git a/src/backend/storage/buffer/buf_init.c b/src/backend/storage/buffer/buf_init.c index f013a4d9581553bf30bf15b9b2bd745d20e138c9..e10071d9c0ed76d3b17176238e62f535bc99f9ca 100644 --- a/src/backend/storage/buffer/buf_init.c +++ b/src/backend/storage/buffer/buf_init.c @@ -23,6 +23,7 @@ char *BufferBlocks; LWLockMinimallyPadded *BufferIOLWLockArray = NULL; LWLockTranche BufferIOLWLockTranche; LWLockTranche BufferContentLWLockTranche; +WritebackContext BackendWritebackContext; /* @@ -149,6 +150,10 @@ InitBufferPool(void) /* Init other shared buffer-management stuff */ StrategyInitialize(!foundDescs); + + /* Initialize per-backend file flush context */ + WritebackContextInit(&BackendWritebackContext, + &backend_flush_after); } /* diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index e8e0825eb0c5590241e8cfdff9a4a6da5c47f1cd..5b9192ed450b089b9fa4d404f06d6759eb0cfd85 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -82,6 +82,14 @@ double bgwriter_lru_multiplier = 2.0; bool track_io_timing = false; int effective_io_concurrency = 0; +/* + * GUC variables about triggering kernel writeback for buffers written; OS + * dependant defaults are set via the GUC mechanism. + */ +int checkpoint_flush_after = 0; +int bgwriter_flush_after = 0; +int backend_flush_after = 0; + /* * How many buffers PrefetchBuffer callers should try to stay ahead of their * ReadBuffer calls by. This is maintained by the assign hook for @@ -399,7 +407,7 @@ static bool PinBuffer(BufferDesc *buf, BufferAccessStrategy strategy); static void PinBuffer_Locked(BufferDesc *buf); static void UnpinBuffer(BufferDesc *buf, bool fixOwner); static void BufferSync(int flags); -static int SyncOneBuffer(int buf_id, bool skip_recently_used); +static int SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *flush_context); static void WaitIO(BufferDesc *buf); static bool StartBufferIO(BufferDesc *buf, bool forInput); static void TerminateBufferIO(BufferDesc *buf, bool clear_dirty, @@ -416,6 +424,7 @@ static void FlushBuffer(BufferDesc *buf, SMgrRelation reln); static void AtProcExit_Buffers(int code, Datum arg); static void CheckForBufferLeaks(void); static int rnode_comparator(const void *p1, const void *p2); +static int buffertag_comparator(const void *p1, const void *p2); /* @@ -818,6 +827,13 @@ ReadBuffer_common(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, MemSet((char *) bufBlock, 0, BLCKSZ); /* don't set checksum for all-zero page */ smgrextend(smgr, forkNum, blockNum, (char *) bufBlock, false); + + /* + * NB: we're *not* doing a ScheduleBufferTagForWriteback here; + * although we're essentially performing a write. At least on linux + * doing so defeats the 'delayed allocation' mechanism, leading to + * increased file fragmentation. + */ } else { @@ -1084,6 +1100,9 @@ BufferAlloc(SMgrRelation smgr, char relpersistence, ForkNumber forkNum, FlushBuffer(buf, NULL); LWLockRelease(BufferDescriptorGetContentLock(buf)); + ScheduleBufferTagForWriteback(&BackendWritebackContext, + &buf->tag); + TRACE_POSTGRESQL_BUFFER_WRITE_DIRTY_DONE(forkNum, blockNum, smgr->smgr_rnode.node.spcNode, smgr->smgr_rnode.node.dbNode, @@ -1642,6 +1661,7 @@ BufferSync(int flags) int num_to_write; int num_written; int mask = BM_DIRTY; + WritebackContext wb_context; /* Make sure we can handle the pin inside SyncOneBuffer */ ResourceOwnerEnlargeBuffers(CurrentResourceOwner); @@ -1694,6 +1714,8 @@ BufferSync(int flags) if (num_to_write == 0) return; /* nothing to do */ + WritebackContextInit(&wb_context, &checkpoint_flush_after); + TRACE_POSTGRESQL_BUFFER_SYNC_START(NBuffers, num_to_write); /* @@ -1725,7 +1747,7 @@ BufferSync(int flags) */ if (bufHdr->flags & BM_CHECKPOINT_NEEDED) { - if (SyncOneBuffer(buf_id, false) & BUF_WRITTEN) + if (SyncOneBuffer(buf_id, false, &wb_context) & BUF_WRITTEN) { TRACE_POSTGRESQL_BUFFER_SYNC_WRITTEN(buf_id); BgWriterStats.m_buf_written_checkpoints++; @@ -1756,6 +1778,9 @@ BufferSync(int flags) buf_id = 0; } + /* issue all pending flushes */ + IssuePendingWritebacks(&wb_context); + /* * Update checkpoint statistics. As noted above, this doesn't include * buffers written by other backends or bgwriter scan. @@ -1777,7 +1802,7 @@ BufferSync(int flags) * bgwriter_lru_maxpages to 0.) */ bool -BgBufferSync(void) +BgBufferSync(WritebackContext *wb_context) { /* info obtained from freelist.c */ int strategy_buf_id; @@ -2002,7 +2027,8 @@ BgBufferSync(void) /* Execute the LRU scan */ while (num_to_scan > 0 && reusable_buffers < upcoming_alloc_est) { - int buffer_state = SyncOneBuffer(next_to_clean, true); + int buffer_state = SyncOneBuffer(next_to_clean, true, + wb_context); if (++next_to_clean >= NBuffers) { @@ -2079,10 +2105,11 @@ BgBufferSync(void) * Note: caller must have done ResourceOwnerEnlargeBuffers. */ static int -SyncOneBuffer(int buf_id, bool skip_recently_used) +SyncOneBuffer(int buf_id, bool skip_recently_used, WritebackContext *wb_context) { BufferDesc *bufHdr = GetBufferDescriptor(buf_id); int result = 0; + BufferTag tag; ReservePrivateRefCountEntry(); @@ -2123,8 +2150,13 @@ SyncOneBuffer(int buf_id, bool skip_recently_used) FlushBuffer(bufHdr, NULL); LWLockRelease(BufferDescriptorGetContentLock(bufHdr)); + + tag = bufHdr->tag; + UnpinBuffer(bufHdr, true); + ScheduleBufferTagForWriteback(wb_context, &tag); + return result | BUF_WRITTEN; } @@ -3729,3 +3761,154 @@ rnode_comparator(const void *p1, const void *p2) else return 0; } + +/* + * BufferTag comparator. + */ +static int +buffertag_comparator(const void *a, const void *b) +{ + const BufferTag *ba = (const BufferTag *) a; + const BufferTag *bb = (const BufferTag *) b; + int ret; + + ret = rnode_comparator(&ba->rnode, &bb->rnode); + + if (ret != 0) + return ret; + + if (ba->forkNum < bb->forkNum) + return -1; + if (ba->forkNum > bb->forkNum) + return 1; + + if (ba->blockNum < bb->blockNum) + return -1; + if (ba->blockNum > bb->blockNum) + return 1; + + return 0; +} + +/* + * Initialize a writeback context, discarding potential previous state. + * + * *max_coalesce is a pointer to a variable containing the current maximum + * number of writeback requests that will be coalesced into a bigger one. A + * value <= 0 means that no writeback control will be performed. max_pending + * is a pointer instead of an immediate value, so the coalesce limits can + * easily changed by the GUC mechanism, and so calling code does not have to + * check the current configuration. + */ +void +WritebackContextInit(WritebackContext *context, int *max_pending) +{ + Assert(*max_pending <= WRITEBACK_MAX_PENDING_FLUSHES); + + context->max_pending = max_pending; + context->nr_pending = 0; +} + +/* + * Add buffer to list of pending writeback requests. + */ +void +ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag) +{ + PendingWriteback *pending; + + /* + * Add buffer to the pending writeback array, unless writeback control is + * disabled. + */ + if (*context->max_pending > 0) + { + Assert(*context->max_pending <= WRITEBACK_MAX_PENDING_FLUSHES); + + pending = &context->pending_writebacks[context->nr_pending++]; + + pending->tag = *tag; + } + + /* + * Perform pending flushes if the writeback limit is exceeded. This + * includes the case where previously an item has been added, but control + * is now disabled. + */ + if (context->nr_pending >= *context->max_pending) + IssuePendingWritebacks(context); +} + +/* + * Issue all pending writeback requests, previously scheduled with + * ScheduleBufferTagForWriteback, to the OS. + * + * Because this is only used to improve the OSs IO scheduling we try to never + * error out - it's just a hint. + */ +void +IssuePendingWritebacks(WritebackContext *context) +{ + int i; + + if (context->nr_pending == 0) + return; + + /* + * Executing the writes in-order can make them a lot faster, and allows to + * merge writeback requests to consecutive blocks into larger writebacks. + */ + qsort(&context->pending_writebacks, context->nr_pending, + sizeof(PendingWriteback), buffertag_comparator); + + /* + * Coalesce neighbouring writes, but nothing else. For that we iterate + * through the, now sorted, array of pending flushes, and look forward to + * find all neighbouring (or identical) writes. + */ + for (i = 0; i < context->nr_pending; i++) + { + PendingWriteback *cur; + PendingWriteback *next; + SMgrRelation reln; + int ahead; + BufferTag tag; + Size nblocks = 1; + + cur = &context->pending_writebacks[i]; + tag = cur->tag; + + /* + * Peek ahead, into following writeback requests, to see if they can + * be combined with the current one. + */ + for (ahead = 0; i + ahead + 1 < context->nr_pending; ahead++) + { + next = &context->pending_writebacks[i + ahead + 1]; + + /* different file, stop */ + if (!RelFileNodeEquals(cur->tag.rnode, next->tag.rnode) || + cur->tag.forkNum != next->tag.forkNum) + break; + + /* ok, block queued twice, skip */ + if (cur->tag.blockNum == next->tag.blockNum) + continue; + + /* only merge consecutive writes */ + if (cur->tag.blockNum + 1 != next->tag.blockNum) + break; + + nblocks++; + cur = next; + } + + i += ahead; + + /* and finally tell the kernel to write the data to storage */ + reln = smgropen(tag.rnode, InvalidBackendId); + smgrwriteback(reln, tag.forkNum, tag.blockNum, nblocks); + } + + context->nr_pending = 0; +} diff --git a/src/backend/storage/file/copydir.c b/src/backend/storage/file/copydir.c index 522f42079ed3bfcceef3bd2dfec0ab3f334d4c4f..a51ee815662d4b0dbb37cec9c7f28a723dd46852 100644 --- a/src/backend/storage/file/copydir.c +++ b/src/backend/storage/file/copydir.c @@ -190,9 +190,9 @@ copy_file(char *fromfile, char *tofile) /* * We fsync the files later but first flush them to avoid spamming the * cache and hopefully get the kernel to start writing them out before - * the fsync comes. Ignore any error, since it's only a hint. + * the fsync comes. */ - (void) pg_flush_data(dstfd, offset, nbytes); + pg_flush_data(dstfd, offset, nbytes); } if (CloseTransientFile(dstfd)) diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index c1076992a332304b65137c3451f103efd6124de7..046d1b3cc309bc1b3b15a25dd9279e0c6d662f75 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -61,6 +61,9 @@ #include <sys/file.h> #include <sys/param.h> #include <sys/stat.h> +#ifndef WIN32 +#include <sys/mman.h> +#endif #include <unistd.h> #include <fcntl.h> #ifdef HAVE_SYS_RESOURCE_H @@ -82,6 +85,8 @@ /* Define PG_FLUSH_DATA_WORKS if we have an implementation for pg_flush_data */ #if defined(HAVE_SYNC_FILE_RANGE) #define PG_FLUSH_DATA_WORKS 1 +#elif !defined(WIN32) && defined(MS_ASYNC) +#define PG_FLUSH_DATA_WORKS 1 #elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) #define PG_FLUSH_DATA_WORKS 1 #endif @@ -383,29 +388,126 @@ pg_fdatasync(int fd) } /* - * pg_flush_data --- advise OS that the data described won't be needed soon + * pg_flush_data --- advise OS that the described dirty data should be flushed * - * Not all platforms have sync_file_range or posix_fadvise; treat as no-op - * if not available. Also, treat as no-op if enableFsync is off; this is - * because the call isn't free, and some platforms such as Linux will actually - * block the requestor until the write is scheduled. + * An offset of 0 with an nbytes 0 means that the entire file should be + * flushed. */ -int -pg_flush_data(int fd, off_t offset, off_t amount) +void +pg_flush_data(int fd, off_t offset, off_t nbytes) { -#ifdef PG_FLUSH_DATA_WORKS - if (enableFsync) - { + /* + * Right now file flushing is primarily used to avoid making later + * fsync()/fdatasync() calls have a less impact. Thus don't trigger + * flushes if fsyncs are disabled - that's a decision we might want to + * make configurable at some point. + */ + if (!enableFsync) + return; + + /* + * XXX: compile all alternatives, to find portability problems more easily + */ #if defined(HAVE_SYNC_FILE_RANGE) - return sync_file_range(fd, offset, amount, SYNC_FILE_RANGE_WRITE); -#elif defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) - return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED); -#else -#error PG_FLUSH_DATA_WORKS should not have been defined + { + int rc = 0; + + /* + * sync_file_range(SYNC_FILE_RANGE_WRITE), currently linux specific, + * tells the OS that writeback for the passed in blocks should be + * started, but that we don't want to wait for completion. Note that + * this call might block if too much dirty data exists in the range. + * This is the preferrable method on OSs supporting it, as it works + * reliably when available (contrast to msync()) and doesn't flush out + * clean data (like FADV_DONTNEED). + */ + rc = sync_file_range(fd, offset, nbytes, + SYNC_FILE_RANGE_WRITE); + + /* don't error out, this is just a performance optimization */ + if (rc != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + } + + return; + } #endif +#if !defined(WIN32) && defined(MS_ASYNC) + { + int rc = 0; + void *p; + + /* + * On several OSs msync(MS_ASYNC) on a mmap'ed file triggers + * writeback. On linux it only does so with MS_SYNC is specified, but + * then it does the writeback synchronously. Luckily all common linux + * systems have sync_file_range(). This is preferrable over + * FADV_DONTNEED because it doesn't flush out clean data. + * + * We map the file (mmap()), tell the kernel to sync back the contents + * (msync()), and then remove the mapping again (munmap()). + */ + p = mmap(NULL, nbytes, + PROT_READ | PROT_WRITE, MAP_SHARED, + fd, offset); + if (p == MAP_FAILED) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not mmap while flushing dirty data: %m"))); + return; + } + + rc = msync(p, nbytes, MS_ASYNC); + if (rc != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + /* NB: need to fall through to munmap()! */ + } + + rc = munmap(p, nbytes); + if (rc != 0) + { + /* FATAL error because mapping would remain */ + ereport(FATAL, + (errcode_for_file_access(), + errmsg("could not munmap while flushing blocks: %m"))); + } + + return; + } +#endif +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED) + { + int rc = 0; + + /* + * Signal the kernel that the passed in range should not be cached + * anymore. This has the, desired, side effect of writing out dirty + * data, and the, undesired, side effect of likely discarding useful + * clean cached blocks. For the latter reason this is the least + * preferrable method. + */ + + rc = posix_fadvise(fd, offset, nbytes, POSIX_FADV_DONTNEED); + + /* don't error out, this is just a performance optimization */ + if (rc != 0) + { + ereport(WARNING, + (errcode_for_file_access(), + errmsg("could not flush dirty data: %m"))); + return; + } + + return; } #endif - return 0; } @@ -1396,6 +1498,24 @@ FilePrefetch(File file, off_t offset, int amount) #endif } +void +FileWriteback(File file, off_t offset, int amount) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileWriteback: %d (%s) " INT64_FORMAT " %d", + file, VfdCache[file].fileName, + (int64) offset, amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return; + + pg_flush_data(VfdCache[file].fd, offset, amount); +} + int FileRead(File file, char *buffer, int amount) { @@ -2796,9 +2916,10 @@ pre_sync_fname(const char *fname, bool isdir, int elevel) } /* - * We ignore errors from pg_flush_data() because this is only a hint. + * pg_flush_data() ignores errors, which is ok because this is only a + * hint. */ - (void) pg_flush_data(fd, 0, 0); + pg_flush_data(fd, 0, 0); (void) CloseTransientFile(fd); } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index f6b79a9968969dda533777e610e046fd4bab01b8..764cfb539460f20fc965c73f590f06079eafb111 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -662,6 +662,56 @@ mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) #endif /* USE_PREFETCH */ } +/* + * mdwriteback() -- Tell the kernel to write pages back to storage. + * + * This accepts a range of blocks because flushing several pages at once is + * considerably more efficient than doing so individually. + */ +void +mdwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, int nblocks) +{ + /* + * Issue flush requests in as few requests as possible; have to split at + * segment boundaries though, since those are actually separate files. + */ + while (nblocks != 0) + { + int nflush = nblocks; + off_t seekpos; + MdfdVec *v; + int segnum_start, + segnum_end; + + v = _mdfd_getseg(reln, forknum, blocknum, false, + EXTENSION_RETURN_NULL); + + /* + * We might be flushing buffers of already removed relations, that's + * ok, just ignore that case. + */ + if (!v) + return; + + /* compute offset inside the current segment */ + segnum_start = blocknum / RELSEG_SIZE; + + /* compute number of desired writes within the current segment */ + segnum_end = (blocknum + nblocks - 1) / RELSEG_SIZE; + if (segnum_start != segnum_end) + nflush = RELSEG_SIZE - (blocknum % ((BlockNumber) RELSEG_SIZE)); + + Assert(nflush >= 1); + Assert(nflush <= nblocks); + + seekpos = (off_t) BLCKSZ *(blocknum % ((BlockNumber) RELSEG_SIZE)); + + FileWriteback(v->mdfd_vfd, seekpos, BLCKSZ * nflush); + + nblocks -= nflush; + blocknum += nflush; + } +} /* * mdread() -- Read the specified block from a relation. diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 87ff3583ff89746973ea1bef7bfead6c7b26f682..c0915c8d889de21b0ec61ac639a902369181be48 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -53,6 +53,8 @@ typedef struct f_smgr BlockNumber blocknum, char *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); + void (*smgr_writeback) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); BlockNumber (*smgr_nblocks) (SMgrRelation reln, ForkNumber forknum); void (*smgr_truncate) (SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); @@ -66,8 +68,8 @@ typedef struct f_smgr static const f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend, - mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync, - mdpreckpt, mdsync, mdpostckpt + mdprefetch, mdread, mdwrite, mdwriteback, mdnblocks, mdtruncate, + mdimmedsync, mdpreckpt, mdsync, mdpostckpt } }; @@ -649,6 +651,19 @@ smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, buffer, skipFsync); } + +/* + * smgrwriteback() -- Trigger kernel writeback for the supplied range of + * blocks. + */ +void +smgrwriteback(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, + int nblocks) +{ + (*(smgrsw[reln->smgr_which].smgr_writeback)) (reln, forknum, blocknum, + nblocks); +} + /* * smgrnblocks() -- Calculate the number of blocks in the * supplied relation. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 0be64a1c9f3cc65ac0a10686f13b6e809df79bca..edcafce2065c5eb6b57c27679bec31b6ac50a0bf 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -2384,6 +2384,42 @@ static struct config_int ConfigureNamesInt[] = check_effective_io_concurrency, assign_effective_io_concurrency, NULL }, + { + {"checkpoint_flush_after", PGC_SIGHUP, RESOURCES_ASYNCHRONOUS, + gettext_noop("Number of pages after which previously performed writes are flushed to disk."), + NULL, + GUC_UNIT_BLOCKS + }, + &checkpoint_flush_after, + /* see bufmgr.h: OS dependant default */ + DEFAULT_CHECKPOINT_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES, + NULL, NULL, NULL + }, + + { + {"backend_flush_after", PGC_USERSET, WAL_CHECKPOINTS, + gettext_noop("Number of pages after which previously performed writes are flushed to disk."), + NULL, + GUC_UNIT_BLOCKS + }, + &backend_flush_after, + /* see bufmgr.h: OS dependant default */ + DEFAULT_BACKEND_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES, + NULL, NULL, NULL + }, + + { + {"bgwriter_flush_after", PGC_SIGHUP, WAL_CHECKPOINTS, + gettext_noop("Number of pages after which previously performed writes are flushed to disk."), + NULL, + GUC_UNIT_BLOCKS + }, + &bgwriter_flush_after, + /* see bufmgr.h: 16 on Linux, 0 otherwise */ + DEFAULT_BGWRITER_FLUSH_AFTER, 0, WRITEBACK_MAX_PENDING_FLUSHES, + NULL, NULL, NULL + }, + { {"max_worker_processes", PGC_POSTMASTER, diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index cbc4843377bf3a9ff11a488c6df75f9273817eaf..af050419f7fb2be47e81e2be0bdb3a05c55170c3 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -16,6 +16,7 @@ #define BUFMGR_INTERNALS_H #include "storage/buf.h" +#include "storage/bufmgr.h" #include "storage/latch.h" #include "storage/lwlock.h" #include "storage/shmem.h" @@ -208,16 +209,44 @@ extern PGDLLIMPORT LWLockMinimallyPadded *BufferIOLWLockArray; #define UnlockBufHdr(bufHdr) SpinLockRelease(&(bufHdr)->buf_hdr_lock) +/* + * The PendingWriteback & WritebackContext structure are used to keep + * information about pending flush requests to be issued to the OS. + */ +typedef struct PendingWriteback +{ + /* could store different types of pending flushes here */ + BufferTag tag; +} PendingWriteback; + +/* struct forward declared in bufmgr.h */ +typedef struct WritebackContext +{ + /* pointer to the max number of writeback requests to coalesce */ + int *max_pending; + + /* current number of pending writeback requests */ + int nr_pending; + + /* pending requests */ + PendingWriteback pending_writebacks[WRITEBACK_MAX_PENDING_FLUSHES]; +} WritebackContext; + /* in buf_init.c */ extern PGDLLIMPORT BufferDescPadded *BufferDescriptors; +extern PGDLLIMPORT WritebackContext BackendWritebackContext; /* in localbuf.c */ extern BufferDesc *LocalBufferDescriptors; /* - * Internal routines: only called by bufmgr + * Internal buffer management routines */ +/* bufmgr.c */ +extern void WritebackContextInit(WritebackContext *context, int *max_coalesce); +extern void IssuePendingWritebacks(WritebackContext *context); +extern void ScheduleBufferTagForWriteback(WritebackContext *context, BufferTag *tag); /* freelist.c */ extern BufferDesc *StrategyGetBuffer(BufferAccessStrategy strategy); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 92c4bc543e234eda7f46e0e172a1a1b22cdb36e8..7d57c048714e557eb1ff636279ca4e7647c7a6b0 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -45,16 +45,36 @@ typedef enum * replay; otherwise same as RBM_NORMAL */ } ReadBufferMode; +/* forward declared, to avoid having to expose buf_internals.h here */ +struct WritebackContext; + /* in globals.c ... this duplicates miscadmin.h */ extern PGDLLIMPORT int NBuffers; /* in bufmgr.c */ +#define WRITEBACK_MAX_PENDING_FLUSHES 256 + +/* FIXME: Also default to on for mmap && msync(MS_ASYNC)? */ +#ifdef HAVE_SYNC_FILE_RANGE +#define DEFAULT_CHECKPOINT_FLUSH_AFTER 32 +#define DEFAULT_BACKEND_FLUSH_AFTER 16 +#define DEFAULT_BGWRITER_FLUSH_AFTER 64 +#else +#define DEFAULT_CHECKPOINT_FLUSH_AFTER 0 +#define DEFAULT_BACKEND_FLUSH_AFTER 0 +#define DEFAULT_BGWRITER_FLUSH_AFTER 0 +#endif /* HAVE_SYNC_FILE_RANGE */ + extern bool zero_damaged_pages; extern int bgwriter_lru_maxpages; extern double bgwriter_lru_multiplier; extern bool track_io_timing; extern int target_prefetch_pages; +extern int checkpoint_flush_after; +extern int backend_flush_after; +extern int bgwriter_flush_after; + /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -209,7 +229,7 @@ extern bool HoldingBufferPinThatDelaysRecovery(void); extern void AbortBufferIO(void); extern void BufmgrCommit(void); -extern bool BgBufferSync(void); +extern bool BgBufferSync(struct WritebackContext *wb_context); extern void AtProcExit_LocalBuffers(void); diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 1a7f8ae7c5f68a10de934198afb692fc78a76322..be243694cd9cf7ddcb1442371919ff9fcf9a7ef8 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -74,6 +74,7 @@ extern int FileWrite(File file, char *buffer, int amount); extern int FileSync(File file); extern off_t FileSeek(File file, off_t offset, int whence); extern int FileTruncate(File file, off_t offset); +extern void FileWriteback(File file, off_t offset, int amount); extern char *FilePathName(File file); extern int FileGetRawDesc(File file); extern int FileGetRawFlags(File file); @@ -115,7 +116,7 @@ extern int pg_fsync(int fd); extern int pg_fsync_no_writethrough(int fd); extern int pg_fsync_writethrough(int fd); extern int pg_fdatasync(int fd); -extern int pg_flush_data(int fd, off_t offset, off_t amount); +extern void pg_flush_data(int fd, off_t offset, off_t amount); extern void fsync_fname(const char *fname, bool isdir); extern int durable_rename(const char *oldfile, const char *newfile, int loglevel); extern int durable_link_or_rename(const char *oldfile, const char *newfile, int loglevel); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index a7267ea7e2d2062e9482b0dba79bb7f27bfbfabb..776b0d001fbf6e13441c96bff9af5dcd3e8ed799 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -96,6 +96,8 @@ extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +extern void smgrwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); extern BlockNumber smgrnblocks(SMgrRelation reln, ForkNumber forknum); extern void smgrtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); @@ -122,6 +124,8 @@ extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool skipFsync); +extern void mdwriteback(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum, int nblocks); extern BlockNumber mdnblocks(SMgrRelation reln, ForkNumber forknum); extern void mdtruncate(SMgrRelation reln, ForkNumber forknum, BlockNumber nblocks); diff --git a/src/tools/pgindent/typedefs.list b/src/tools/pgindent/typedefs.list index d96896b4c27d83eabf3fe8228b31318e8da89af8..f501f556af872e4f88db4976f6d0a4c7ad1c397c 100644 --- a/src/tools/pgindent/typedefs.list +++ b/src/tools/pgindent/typedefs.list @@ -1411,6 +1411,7 @@ Pattern_Type PendingOperationEntry PendingRelDelete PendingUnlinkEntry +PendingWriteback PerlInterpreter Perl_ppaddr_t Permutation @@ -2142,6 +2143,7 @@ WriteBytePtr WriteDataPtr WriteExtraTocPtr WriteFunc +WritebackContext X509 X509_NAME X509_NAME_ENTRY