diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index 074afee494eca127d31617551676bbebf74906fd..4e0492b939399a167df87d448560d5b1712386bb 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1866,23 +1866,26 @@ SET ENABLE_SEQSCAN TO OFF;
       </indexterm>
       <listitem>
        <para>
-        When the commit data for a transaction is flushed to disk, any
-        additional commits ready at that time are also flushed out.
         <varname>commit_delay</varname> adds a time delay, set in
-        microseconds, before a transaction attempts to
-        flush the WAL buffer out to disk.  A nonzero delay can allow more
-        transactions to be committed with only one flush operation, if
-        system load is high enough that additional transactions become
-        ready to commit within the given interval. But the delay is
-        just wasted if no other transactions become ready to
-        commit. Therefore, the delay is only performed if at least
-        <varname>commit_siblings</varname> other transactions are
-        active at the instant that a server process has written its
-        commit record.
-        The default <varname>commit_delay</> is zero (no delay).
-        Since all pending commit data will be written at every flush
-        regardless of this setting, it is rare that adding delay
-        by increasing this parameter will actually improve performance.
+        microseconds, before a WAL flush is initiated.  This can improve
+        group commit throughput by allowing a larger number of transactions
+        to commit via a single WAL flush, if system load is high enough
+        that additional transactions become ready to commit within the
+        given interval.  However, it also increases latency by up to
+        <varname>commit_delay</varname> microseconds for each WAL
+        flush.  Because the delay is just wasted if no other transactions
+        become ready to commit, it is only performed if at least
+        <varname>commit_siblings</varname> other transactions are active
+        immediately before a flush would otherwise have been initiated.
+        In <productname>PostgreSQL</> releases prior to 9.3,
+        <varname>commit_delay</varname> behaved differently and was much
+        less effective: it affected only commits, rather than all WAL flushes,
+        and waited for the entire configured delay even if the WAL flush
+        was completed sooner.  Beginning in <productname>PostgreSQL</> 9.3,
+        the first process that becomes ready to flush waits for the configured
+        interval, while subsequent processes wait only until the leader
+        completes the flush.  No delay is performed if <varname>fsync</varname>
+        is disabled.  The default <varname>commit_delay</> is zero (no delay).
        </para>
       </listitem>
      </varlistentry>
diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml
index 0afb9d6af600da01c943534f0089f97dc311c8d4..a98132d3f2a1f8c16d3a15d26d5d6251d87ca3cb 100644
--- a/doc/src/sgml/wal.sgml
+++ b/doc/src/sgml/wal.sgml
@@ -376,9 +376,7 @@
    <acronym>WAL</acronym> to disk, in the hope that a single flush
    executed by one such transaction can also serve other transactions
    committing at about the same time.  Setting <varname>commit_delay</varname>
-   can only help when there are many concurrently committing transactions,
-   and it is difficult to tune it to a value that actually helps rather
-   than hurt throughput.
+   can only help when there are many concurrently committing transactions.
   </para>
 
  </sect1>
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index 86b1afa80d9330bcf5ac5adb223b7ebc00c08f53..49def6abbb66a2738c256dc6ae962f2837676eea 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -68,9 +68,6 @@ bool		XactDeferrable;
 
 int			synchronous_commit = SYNCHRONOUS_COMMIT_ON;
 
-int			CommitDelay = 0;	/* precommit delay in microseconds */
-int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
-
 /*
  * MyXactAccessedTempRel is set when a temporary relation is accessed.
  * We don't allow PREPARE TRANSACTION in that case.  (This is global
@@ -1123,22 +1120,6 @@ RecordTransactionCommit(void)
 	if ((wrote_xlog && synchronous_commit > SYNCHRONOUS_COMMIT_OFF) ||
 		forceSyncCommit || nrels > 0)
 	{
-		/*
-		 * Synchronous commit case:
-		 *
-		 * Sleep before flush! So we can flush more than one commit records
-		 * per single fsync.  (The idea is some other backend may do the
-		 * XLogFlush while we're sleeping.  This needs work still, because on
-		 * most Unixen, the minimum select() delay is 10msec or more, which is
-		 * way too long.)
-		 *
-		 * We do not sleep if enableFsync is not turned on, nor if there are
-		 * fewer than CommitSiblings other backends with active transactions.
-		 */
-		if (CommitDelay > 0 && enableFsync &&
-			MinimumActiveBackends(CommitSiblings))
-			pg_usleep(CommitDelay);
-
 		XLogFlush(XactLastRecEnd);
 
 		/*
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index a43e2eeaf306eb15146abf8e7a253fa9f38cdb50..6ee50d01d52a1abf892790841c07d646929ebb32 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -80,6 +80,8 @@ bool		fullPageWrites = true;
 bool		log_checkpoints = false;
 int			sync_method = DEFAULT_SYNC_METHOD;
 int			wal_level = WAL_LEVEL_MINIMAL;
+int			CommitDelay = 0;	/* precommit delay in microseconds */
+int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
 
 #ifdef WAL_DEBUG
 bool		XLOG_DEBUG = false;
@@ -2098,34 +2100,56 @@ XLogFlush(XLogRecPtr record)
 			 */
 			continue;
 		}
-		/* Got the lock */
+
+		/* Got the lock; recheck whether request is satisfied */
 		LogwrtResult = XLogCtl->LogwrtResult;
-		if (!XLByteLE(record, LogwrtResult.Flush))
+		if (XLByteLE(record, LogwrtResult.Flush))
+		{
+			/*
+			 * The request is already satisfied, so there is nothing to
+			 * write.  We still hold WALWriteLock; release it before exiting.
+			 */
+			LWLockRelease(WALWriteLock);
+			break;
+		}
+
+		/*
+		 * Sleep before flush! By adding a delay here, we may give further
+		 * backends the opportunity to join the backlog of group commit
+		 * followers; this can significantly improve transaction throughput, at
+		 * the risk of increasing transaction latency.
+		 *
+		 * We do not sleep if enableFsync is not turned on, nor if there are
+		 * fewer than CommitSiblings other backends with active transactions.
+		 */
+		if (CommitDelay > 0 && enableFsync &&
+			MinimumActiveBackends(CommitSiblings))
+			pg_usleep(CommitDelay);
+
+		/* try to write/flush later additions to XLOG as well */
+		if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
 		{
-			/* try to write/flush later additions to XLOG as well */
-			if (LWLockConditionalAcquire(WALInsertLock, LW_EXCLUSIVE))
-			{
-				XLogCtlInsert *Insert = &XLogCtl->Insert;
-				uint32		freespace = INSERT_FREESPACE(Insert);
+			XLogCtlInsert *Insert = &XLogCtl->Insert;
+			uint32		freespace = INSERT_FREESPACE(Insert);
 
-				if (freespace == 0)		/* buffer is full */
-					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-				else
-				{
-					WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
-					WriteRqstPtr -= freespace;
-				}
-				LWLockRelease(WALInsertLock);
-				WriteRqst.Write = WriteRqstPtr;
-				WriteRqst.Flush = WriteRqstPtr;
-			}
+			if (freespace == 0)		/* buffer is full */
+				WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
 			else
 			{
-				WriteRqst.Write = WriteRqstPtr;
-				WriteRqst.Flush = record;
+				WriteRqstPtr = XLogCtl->xlblocks[Insert->curridx];
+				WriteRqstPtr -= freespace;
 			}
-			XLogWrite(WriteRqst, false, false);
+			LWLockRelease(WALInsertLock);
+			WriteRqst.Write = WriteRqstPtr;
+			WriteRqst.Flush = WriteRqstPtr;
 		}
+		else
+		{
+			WriteRqst.Write = WriteRqstPtr;
+			WriteRqst.Flush = record;
+		}
+		XLogWrite(WriteRqst, false, false);
+
 		LWLockRelease(WALWriteLock);
 		/* done */
 		break;