From 34c70c7ac4926495b7ab5ebfe748704313a4c822 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Tue, 14 Dec 2010 09:25:25 -0500
Subject: [PATCH] Instrument checkpoint sync calls.

Greg Smith, reviewed by Jeff Janes
---
 src/backend/access/transam/xlog.c | 41 ++++++++++++++++++++++++++-----
 src/backend/storage/smgr/md.c     | 37 ++++++++++++++++++++++++++++
 src/include/access/xlog.h         |  7 ++++++
 3 files changed, 79 insertions(+), 6 deletions(-)

diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 49764581049..b49b933de3e 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -6953,10 +6953,15 @@ LogCheckpointEnd(bool restartpoint)
 {
 	long		write_secs,
 				sync_secs,
-				total_secs;
+				total_secs,
+				longest_secs,
+				average_secs;
 	int			write_usecs,
 				sync_usecs,
-				total_usecs;
+				total_usecs,
+				longest_usecs,
+				average_usecs;
+	uint64		average_sync_time;
 
 	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
 
@@ -6972,18 +6977,39 @@ LogCheckpointEnd(bool restartpoint)
 						CheckpointStats.ckpt_sync_end_t,
 						&sync_secs, &sync_usecs);
 
+	/*
+	 * Timing values returned from CheckpointStats are in microseconds.
+	 * Convert to the second plus microsecond form that TimestampDifference
+	 * returns for homogeneous printing.
+	 */
+	longest_secs = (long) (CheckpointStats.ckpt_longest_sync / 1000000);
+	longest_usecs = CheckpointStats.ckpt_longest_sync -
+		(uint64) longest_secs * 1000000;
+
+	average_sync_time = 0;
+	if (CheckpointStats.ckpt_sync_rels > 0) 
+		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
+			CheckpointStats.ckpt_sync_rels;
+	average_secs = (long) (average_sync_time / 1000000);
+	average_usecs = average_sync_time - (uint64) average_secs * 1000000;
+
 	if (restartpoint)
 		elog(LOG, "restartpoint complete: wrote %d buffers (%.1f%%); "
-			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
 			 CheckpointStats.ckpt_bufs_written,
 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 			 write_secs, write_usecs / 1000,
 			 sync_secs, sync_usecs / 1000,
-			 total_secs, total_usecs / 1000);
+			 total_secs, total_usecs / 1000,
+			 CheckpointStats.ckpt_sync_rels,
+			 longest_secs, longest_usecs / 1000,
+			 average_secs, average_usecs / 1000);
 	else
 		elog(LOG, "checkpoint complete: wrote %d buffers (%.1f%%); "
 			 "%d transaction log file(s) added, %d removed, %d recycled; "
-			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s",
+			 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
+			 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s",
 			 CheckpointStats.ckpt_bufs_written,
 			 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
 			 CheckpointStats.ckpt_segs_added,
@@ -6991,7 +7017,10 @@ LogCheckpointEnd(bool restartpoint)
 			 CheckpointStats.ckpt_segs_recycled,
 			 write_secs, write_usecs / 1000,
 			 sync_secs, sync_usecs / 1000,
-			 total_secs, total_usecs / 1000);
+			 total_secs, total_usecs / 1000,
+			 CheckpointStats.ckpt_sync_rels,
+			 longest_secs, longest_usecs / 1000,
+			 average_secs, average_usecs / 1000);
 }
 
 /*
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 604db218eeb..944eed83c5a 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -20,6 +20,7 @@
 
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "portability/instr_time.h"
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/bufmgr.h"
@@ -927,6 +928,15 @@ mdsync(void)
 	PendingOperationEntry *entry;
 	int			absorb_counter;
 
+	/* Statistics on sync times */
+	int			processed = 0;
+	instr_time	sync_start,
+				sync_end,
+				sync_diff; 
+	uint64		elapsed;
+	uint64		longest = 0;
+	uint64		total_elapsed = 0;
+
 	/*
 	 * This is only called during checkpoints, and checkpoints should only
 	 * occur in processes that have created a pendingOpsTable.
@@ -1069,9 +1079,31 @@ mdsync(void)
 				seg = _mdfd_getseg(reln, entry->tag.forknum,
 							  entry->tag.segno * ((BlockNumber) RELSEG_SIZE),
 								   false, EXTENSION_RETURN_NULL);
+
+				if (log_checkpoints)
+					INSTR_TIME_SET_CURRENT(sync_start);
+				else
+					INSTR_TIME_SET_ZERO(sync_start);
+
 				if (seg != NULL &&
 					FileSync(seg->mdfd_vfd) >= 0)
+				{
+					if (log_checkpoints && (! INSTR_TIME_IS_ZERO(sync_start)))
+					{
+						INSTR_TIME_SET_CURRENT(sync_end);
+						sync_diff = sync_end;
+						INSTR_TIME_SUBTRACT(sync_diff, sync_start);
+						elapsed = INSTR_TIME_GET_MICROSEC(sync_diff);
+						if (elapsed > longest)
+							longest = elapsed;
+						total_elapsed += elapsed;
+						processed++;
+						elog(DEBUG1, "checkpoint sync: number=%d file=%s time=%.3f msec", 
+							processed, FilePathName(seg->mdfd_vfd), (double) elapsed / 1000);
+					}
+
 					break;		/* success; break out of retry loop */
+				}
 
 				/*
 				 * XXX is there any point in allowing more than one retry?
@@ -1113,6 +1145,11 @@ mdsync(void)
 			elog(ERROR, "pendingOpsTable corrupted");
 	}							/* end loop over hashtable entries */
 
+	/* Return sync performance metrics for report at checkpoint end */
+	CheckpointStats.ckpt_sync_rels = processed;
+	CheckpointStats.ckpt_longest_sync = longest;
+	CheckpointStats.ckpt_agg_sync_time = total_elapsed;
+
 	/* Flag successful completion of mdsync */
 	mdsync_in_progress = false;
 }
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index fa7ae2ac126..e9d8d15814a 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -257,6 +257,13 @@ typedef struct CheckpointStatsData
 	int			ckpt_segs_added;	/* # of new xlog segments created */
 	int			ckpt_segs_removed;		/* # of xlog segments deleted */
 	int			ckpt_segs_recycled;		/* # of xlog segments recycled */
+
+	int			ckpt_sync_rels;		/* # of relations synced */
+	uint64		ckpt_longest_sync;	/* Longest sync for one relation */
+	uint64		ckpt_agg_sync_time;	/* The sum of all the individual sync
+									 * times, which is not necessarily the
+									 * same as the total elapsed time for
+									 * the entire sync phase. */
 } CheckpointStatsData;
 
 extern CheckpointStatsData CheckpointStats;
-- 
GitLab