diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 074afee494eca127d31617551676bbebf74906fd..4e0492b939399a167df87d448560d5b1712386bb 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1866,23 +1866,26 @@ SET ENABLE_SEQSCAN TO OFF; </indexterm> <listitem> <para> - When the commit data for a transaction is flushed to disk, any - additional commits ready at that time are also flushed out. <varname>commit_delay</varname> adds a time delay, set in - microseconds, before a transaction attempts to - flush the WAL buffer out to disk. A nonzero delay can allow more - transactions to be committed with only one flush operation, if - system load is high enough that additional transactions become - ready to commit within the given interval. But the delay is - just wasted if no other transactions become ready to - commit. Therefore, the delay is only performed if at least - <varname>commit_siblings</varname> other transactions are - active at the instant that a server process has written its - commit record. - The default <varname>commit_delay</> is zero (no delay). - Since all pending commit data will be written at every flush - regardless of this setting, it is rare that adding delay - by increasing this parameter will actually improve performance. + microseconds, before a WAL flush is initiated. This can improve + group commit throughput by allowing a larger number of transactions + to commit via a single WAL flush, if system load is high enough + that additional transactions become ready to commit within the + given interval. However, it also increases latency by up to + <varname>commit_delay</varname> microseconds for each WAL + flush. Because the delay is just wasted if no other transactions + become ready to commit, it is only performed if at least + <varname>commit_siblings</varname> other transactions are active + immediately before a flush would otherwise have been initiated. 
+ In <productname>PostgreSQL</> releases prior to 9.3, + <varname>commit_delay</varname> behaved differently and was much + less effective: it affected only commits, rather than all WAL flushes, + and waited for the entire configured delay even if the WAL flush + was completed sooner. Beginning in <productname>PostgreSQL</> 9.3, + the first process that becomes ready to flush waits for the configured + interval, while subsequent processes wait only until the leader + completes the flush. The default <varname>commit_delay</> is zero + (no delay). </para> </listitem> </varlistentry> diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 0afb9d6af600da01c943534f0089f97dc311c8d4..a98132d3f2a1f8c16d3a15d26d5d6251d87ca3cb 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -376,9 +376,7 @@ <acronym>WAL</acronym> to disk, in the hope that a single flush executed by one such transaction can also serve other transactions committing at about the same time. Setting <varname>commit_delay</varname> - can only help when there are many concurrently committing transactions, - and it is difficult to tune it to a value that actually helps rather - than hurt throughput. + can only help when there are many concurrently committing transactions. </para> </sect1> diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 86b1afa80d9330bcf5ac5adb223b7ebc00c08f53..49def6abbb66a2738c256dc6ae962f2837676eea 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -68,9 +68,6 @@ bool XactDeferrable; int synchronous_commit = SYNCHRONOUS_COMMIT_ON; -int CommitDelay = 0; /* precommit delay in microseconds */ -int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ - /* * MyXactAccessedTempRel is set when a temporary relation is accessed. * We don't allow PREPARE TRANSACTION in that case. 
(This is global @@ -1123,22 +1120,6 @@ RecordTransactionCommit(void) if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) || forceSyncCommit || nrels > 0) { - /* - * Synchronous commit case: - * - * Sleep before flush! So we can flush more than one commit records - * per single fsync. (The idea is some other backend may do the - * XLogFlush while we're sleeping. This needs work still, because on - * most Unixen, the minimum select() delay is 10msec or more, which is - * way too long.) - * - * We do not sleep if enableFsync is not turned on, nor if there are - * fewer than CommitSiblings other backends with active transactions. - */ - if (CommitDelay > 0 && enableFsync && - MinimumActiveBackends(CommitSiblings)) - pg_usleep(CommitDelay); - XLogFlush(XactLastRecEnd); /* diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index a43e2eeaf306eb15146abf8e7a253fa9f38cdb50..6ee50d01d52a1abf892790841c07d646929ebb32 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -80,6 +80,8 @@ bool fullPageWrites = true; bool log_checkpoints = false; int sync_method = DEFAULT_SYNC_METHOD; int wal_level = WAL_LEVEL_MINIMAL; +int CommitDelay = 0; /* precommit delay in microseconds */ +int CommitSiblings = 5; /* # concurrent xacts needed to sleep */ #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -2098,34 +2100,53 @@ XLogFlush(XLogRecPtr record) */ continue; } - /* Got the lock */ + + /* Got the lock; recheck whether request is satisfied */ LogwrtResult = XLogCtl->LogwrtResult; - if (!XLByteLE(record, LogwrtResult.Flush)) + if (XLByteLE(record, LogwrtResult.Flush)) + { + /* Already flushed far enough; release WALWriteLock before leaving the loop */ + LWLockRelease(WALWriteLock); + break; + } + + /* + * Sleep before flush! By adding a delay here, we may give further + * backends the opportunity to join the backlog of group commit + * followers; this can significantly improve transaction throughput, at + * the risk of increasing transaction latency. 
+ * + * We do not sleep if enableFsync is not turned on, nor if there are + * fewer than CommitSiblings other backends with active transactions. + */ + if (CommitDelay > 0 && enableFsync && + MinimumActiveBackends(CommitSiblings)) + pg_usleep(CommitDelay); + + /* try to write/flush later additions to XLOG as well */ + if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE)) { - /* try to write/flush later additions to XLOG as well */ - if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE)) - { - XLogCtlInsert *Insert = &XLogCtl->Insert; - uint32 freespace = INSERT_FREESPACE(Insert); + XLogCtlInsert *Insert = &XLogCtl->Insert; + uint32 freespace = INSERT_FREESPACE(Insert); - if (freespace == 0) /* buffer is full */ - WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; - else - { - WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; - WriteRqstPtr -= freespace; - } - LWLockRelease(WALInsertLock); - WriteRqst.Write = WriteRqstPtr; - WriteRqst.Flush = WriteRqstPtr; - } + if (freespace == 0) /* buffer is full */ + WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; else { - WriteRqst.Write = WriteRqstPtr; - WriteRqst.Flush = record; + WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx]; + WriteRqstPtr -= freespace; } - XLogWrite(WriteRqst, false, false); + LWLockRelease(WALInsertLock); + WriteRqst.Write = WriteRqstPtr; + WriteRqst.Flush = WriteRqstPtr; } + else + { + WriteRqst.Write = WriteRqstPtr; + WriteRqst.Flush = record; + } + XLogWrite(WriteRqst, false, false); + LWLockRelease(WALWriteLock); /* done */ break;