From f9ed327f760e7794faecfda54b19515a95ba901d Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 2 May 2010 02:10:33 +0000
Subject: [PATCH] Clean up some awkward, inaccurate, and inefficient processing
 around MaxStandbyDelay.  Use the GUC units mechanism for the value, and
 choose more appropriate timestamp functions for performing tests with it. 
 Make the ps_activity manipulation in ResolveRecoveryConflictWithVirtualXIDs
 have behavior similar to ps_activity code elsewhere, notably not updating the
 display when update_process_title is off and not truncating the display
 contents at an arbitrarily-chosen length.  Improve the docs to be explicit
 about what MaxStandbyDelay actually measures, viz the difference between
 primary and standby servers' clocks, and the possible hazards if their clocks
 aren't in sync.

---
 doc/src/sgml/config.sgml                      |  13 +-
 doc/src/sgml/high-availability.sgml           |  34 ++++--
 src/backend/access/transam/xlog.c             |   4 +-
 src/backend/storage/ipc/standby.c             | 112 +++++++-----------
 src/backend/utils/misc/guc.c                  |   7 +-
 src/backend/utils/misc/postgresql.conf.sample |   2 +-
 6 files changed, 78 insertions(+), 94 deletions(-)

diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml
index c84529f267f..f9c13e7c491 100644
--- a/doc/src/sgml/config.sgml
+++ b/doc/src/sgml/config.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.272 2010/04/29 21:36:18 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.273 2010/05/02 02:10:32 tgl Exp $ -->
 
 <chapter Id="runtime-config">
   <title>Server Configuration</title>
@@ -1947,13 +1947,12 @@ SET ENABLE_SEQSCAN TO OFF;
        <para>
         When Hot Standby is active, this parameter specifies a wait policy
         for applying WAL entries that conflict with active queries.
-        If a conflict should occur the server will delay up to this number
-        of seconds before it cancels conflicting queries, as
+        If a conflict should occur the server will delay up to this long
+        before it cancels conflicting queries, as
         described in <xref linkend="hot-standby-conflict">.
-        Typically, this parameter is used only during replication.
-        The value is specified in seconds, and -1 causes the standby to wait
-        forever for a conflicting query to complete.
         The default is 30 seconds.
+        A value of -1 causes the standby to wait forever for a conflicting
+        query to complete.
         This parameter can only be set in the <filename>postgresql.conf</>
         file or on the server command line.
        </para>
@@ -1964,7 +1963,7 @@ SET ENABLE_SEQSCAN TO OFF;
       </para>
       <para>
        While it is tempting to believe that <varname>max_standby_delay</>
-       is the maximum number of seconds a query can run before
+       is the maximum length of time a query can run before
        cancellation is possible, this is not true.  When a long-running
        query ends, there is a finite time required to apply backlogged
        WAL logs.  If a second long-running query appears before the
diff --git a/doc/src/sgml/high-availability.sgml b/doc/src/sgml/high-availability.sgml
index b2f1a583fc3..ef4c1bb2c21 100644
--- a/doc/src/sgml/high-availability.sgml
+++ b/doc/src/sgml/high-availability.sgml
@@ -1,4 +1,4 @@
-<!-- $PostgreSQL: pgsql/doc/src/sgml/high-availability.sgml,v 1.65 2010/04/29 21:36:18 tgl Exp $ -->
+<!-- $PostgreSQL: pgsql/doc/src/sgml/high-availability.sgml,v 1.66 2010/05/02 02:10:32 tgl Exp $ -->
 
 <chapter id="high-availability">
  <title>High Availability, Load Balancing, and Replication</title>
@@ -1480,7 +1480,8 @@ if (!triggered)
     There are a number of choices for resolving query conflicts.  The default
     is to wait and hope the query finishes. The server will wait
     automatically until the lag between primary and standby is at most
-    <varname>max_standby_delay</> seconds. Once that grace period expires,
+    <xref linkend="guc-max-standby-delay"> (30 seconds by default).
+    Once that grace period expires,
     one of the following actions is taken:
 
       <itemizedlist>
@@ -1514,15 +1515,28 @@ if (!triggered)
    </para>
 
    <para>
-    <varname>max_standby_delay</> is set in <filename>postgresql.conf</>.
-    The parameter applies to the server as a whole, so if the delay is consumed
-    by a single query then there may be little or no waiting for queries that
-    follow, though they will have benefited equally from the initial
-    waiting period. The server may take time to catch up again before the grace
-    period is available again, though if there is a heavy and constant stream
-    of conflicts it may seldom catch up fully.
+    Keep in mind that <varname>max_standby_delay</> is compared to the
+    difference between the standby server's clock and the transaction
+    commit timestamps read from the WAL log.  Thus, the grace period
+    allowed to any one query on the standby is never more than
+    <varname>max_standby_delay</>, and could be considerably less if the
+    standby has already fallen behind as a result of waiting for previous
+    queries to complete, or as a result of being unable to keep up with a
+    heavy update load.
    </para>
 
+   <caution>
+    <para>
+     Be sure that the primary and standby servers' clocks are kept in sync;
+     otherwise the values compared to <varname>max_standby_delay</> will be
+     erroneous, possibly leading to undesirable query cancellations.
+     If the clocks are intentionally not in sync, or if there is a large
+     propagation delay from primary to standby, it is advisable to set
+     <varname>max_standby_delay</> to -1.  In any case the value should be
+     larger than the largest expected clock skew between primary and standby.
+    </para>
+   </caution>
+
    <para>
     Users should be clear that tables that are regularly and heavily updated on the
     primary server will quickly cause cancellation of longer running queries on
@@ -1656,7 +1670,7 @@ LOG:  database system is ready to accept read only connections
     <varname>max_standby_delay</> or even set it to zero, though that is a
     very aggressive setting. If the standby server is tasked as an additional
     server for decision support queries then it might be acceptable to set this
-    to a value of many hours (in seconds).  It is also possible to set
+    to a value of many hours.  It is also possible to set
     <varname>max_standby_delay</> to -1 which means wait forever for queries
     to complete; this will be useful when performing
     an archive recovery from a backup.
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 61dfb20117f..99e53b76321 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.407 2010/04/29 21:49:03 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.408 2010/05/02 02:10:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -72,7 +72,7 @@ int			XLogArchiveTimeout = 0;
 bool		XLogArchiveMode = false;
 char	   *XLogArchiveCommand = NULL;
 bool		EnableHotStandby = false;
-int			MaxStandbyDelay = 30;
+int			MaxStandbyDelay = 30 * 1000;
 bool		fullPageWrites = true;
 bool		log_checkpoints = false;
 int			sync_method = DEFAULT_SYNC_METHOD;
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 30723327cd2..6bd156845fb 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -11,7 +11,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.20 2010/04/28 16:10:42 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/standby.c,v 1.21 2010/05/02 02:10:33 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -124,31 +124,24 @@ static int	standbyWait_us = STANDBY_INITIAL_WAIT_US;
 static bool
 WaitExceedsMaxStandbyDelay(void)
 {
-	long		delay_secs;
-	int			delay_usecs;
-
-	if (MaxStandbyDelay == -1)
-		return false;
-
 	/* Are we past max_standby_delay? */
-	TimestampDifference(GetLatestXLogTime(), GetCurrentTimestamp(),
-						&delay_secs, &delay_usecs);
-	if (delay_secs > MaxStandbyDelay)
+	if (MaxStandbyDelay >= 0 &&
+		TimestampDifferenceExceeds(GetLatestXLogTime(), GetCurrentTimestamp(),
+								   MaxStandbyDelay))
 		return true;
 
 	/*
-	 * Sleep, then do bookkeeping.
+	 * Sleep a bit (this is essential to avoid busy-waiting).
 	 */
 	pg_usleep(standbyWait_us);
 
 	/*
-	 * Progressively increase the sleep times.
+	 * Progressively increase the sleep times, but not to more than 1s,
+	 * since pg_usleep isn't interruptable on some platforms.
 	 */
 	standbyWait_us *= 2;
 	if (standbyWait_us > 1000000)
 		standbyWait_us = 1000000;
-	if (standbyWait_us > MaxStandbyDelay * 1000000 / 4)
-		standbyWait_us = MaxStandbyDelay * 1000000 / 4;
 
 	return false;
 }
@@ -163,50 +156,41 @@ static void
 ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
 									   ProcSignalReason reason)
 {
-	char		waitactivitymsg[100];
-	char		oldactivitymsg[101];
-
 	while (VirtualTransactionIdIsValid(*waitlist))
 	{
-		long		wait_s;
-		int			wait_us;	/* wait in microseconds (us) */
 		TimestampTz waitStart;
-		bool		logged;
+		char	   *new_status;
+
+		pgstat_report_waiting(true);
 
 		waitStart = GetCurrentTimestamp();
+		new_status = NULL;		/* we haven't changed the ps display */
+
+		/* reset standbyWait_us for each xact we wait for */
 		standbyWait_us = STANDBY_INITIAL_WAIT_US;
-		logged = false;
 
 		/* wait until the virtual xid is gone */
 		while (!ConditionalVirtualXactLockTableWait(*waitlist))
 		{
 			/*
-			 * Report if we have been waiting for a while now...
+			 * Report via ps if we have been waiting for more than 500 msec
+			 * (should that be configurable?)
 			 */
-			TimestampTz now = GetCurrentTimestamp();
-
-			TimestampDifference(waitStart, now, &wait_s, &wait_us);
-			if (!logged && (wait_s > 0 || wait_us > 500000))
+			if (update_process_title && new_status == NULL &&
+				TimestampDifferenceExceeds(waitStart, GetCurrentTimestamp(),
+										   500))
 			{
-				const char *oldactivitymsgp;
+				const char *old_status;
 				int			len;
 
-				oldactivitymsgp = get_ps_display(&len);
-
-				if (len > 100)
-					len = 100;
-
-				memcpy(oldactivitymsg, oldactivitymsgp, len);
-				oldactivitymsg[len] = 0;
-
-				snprintf(waitactivitymsg, sizeof(waitactivitymsg),
-						 "waiting for max_standby_delay (%u s)",
+				old_status = get_ps_display(&len);
+				new_status = (char *) palloc(len + 50);
+				memcpy(new_status, old_status, len);
+				snprintf(new_status + len, 50,
+						 " waiting for max_standby_delay (%d ms)",
 						 MaxStandbyDelay);
-				set_ps_display(waitactivitymsg, false);
-
-				pgstat_report_waiting(true);
-
-				logged = true;
+				set_ps_display(new_status, false);
+				new_status[len] = '\0'; /* truncate off " waiting" */
 			}
 
 			/* Is it time to kill it? */
@@ -225,16 +209,17 @@ ResolveRecoveryConflictWithVirtualXIDs(VirtualTransactionId *waitlist,
 				 * unresponsive backend when system is heavily loaded.
 				 */
 				if (pid != 0)
-					pg_usleep(5000);
+					pg_usleep(5000L);
 			}
 		}
 
-		/* Reset ps display */
-		if (logged)
+		/* Reset ps display if we changed it */
+		if (new_status)
 		{
-			set_ps_display(oldactivitymsg, false);
-			pgstat_report_waiting(false);
+			set_ps_display(new_status, false);
+			pfree(new_status);
 		}
+		pgstat_report_waiting(false);
 
 		/* The virtual transaction is gone now, wait for the next one */
 		waitlist++;
@@ -401,7 +386,7 @@ ResolveRecoveryConflictWithBufferPin(void)
 		 */
 		SendRecoveryConflictWithBufferPin(PROCSIG_RECOVERY_CONFLICT_BUFFERPIN);
 	}
-	else if (MaxStandbyDelay == -1)
+	else if (MaxStandbyDelay < 0)
 	{
 		/*
 		 * Send out a request to check for buffer pin deadlocks before we
@@ -412,17 +397,11 @@ ResolveRecoveryConflictWithBufferPin(void)
 	}
 	else
 	{
-		TimestampTz now;
-		long		standby_delay_secs; /* How far Startup process is lagging */
-		int			standby_delay_usecs;
-
-		now = GetCurrentTimestamp();
+		TimestampTz then = GetLatestXLogTime();
+		TimestampTz now = GetCurrentTimestamp();
 
 		/* Are we past max_standby_delay? */
-		TimestampDifference(GetLatestXLogTime(), now,
-							&standby_delay_secs, &standby_delay_usecs);
-
-		if (standby_delay_secs >= MaxStandbyDelay)
+		if (TimestampDifferenceExceeds(then, now, MaxStandbyDelay))
 		{
 			/*
 			 * We're already behind, so clear a path as quickly as possible.
@@ -434,7 +413,7 @@ ResolveRecoveryConflictWithBufferPin(void)
 			TimestampTz fin_time;		/* Expected wake-up time by timer */
 			long		timer_delay_secs;		/* Amount of time we set timer
 												 * for */
-			int			timer_delay_usecs = 0;
+			int			timer_delay_usecs;
 
 			/*
 			 * Send out a request to check for buffer pin deadlocks before we
@@ -446,12 +425,10 @@ ResolveRecoveryConflictWithBufferPin(void)
 			/*
 			 * How much longer we should wait?
 			 */
-			timer_delay_secs = MaxStandbyDelay - standby_delay_secs;
-			if (standby_delay_usecs > 0)
-			{
-				timer_delay_secs -= 1;
-				timer_delay_usecs = 1000000 - standby_delay_usecs;
-			}
+			fin_time = TimestampTzPlusMilliseconds(then, MaxStandbyDelay);
+
+			TimestampDifference(now, fin_time,
+								&timer_delay_secs, &timer_delay_usecs);
 
 			/*
 			 * It's possible that the difference is less than a microsecond;
@@ -460,13 +437,6 @@ ResolveRecoveryConflictWithBufferPin(void)
 			if (timer_delay_secs == 0 && timer_delay_usecs == 0)
 				timer_delay_usecs = 1;
 
-			/*
-			 * When is the finish time? We recheck this if we are woken early.
-			 */
-			fin_time = TimestampTzPlusMilliseconds(now,
-												   (timer_delay_secs * 1000) +
-												 (timer_delay_usecs / 1000));
-
 			if (enable_standby_sig_alarm(timer_delay_secs, timer_delay_usecs, fin_time))
 				sig_alarm_enabled = true;
 			else
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c
index d913137b49d..430c37526d9 100644
--- a/src/backend/utils/misc/guc.c
+++ b/src/backend/utils/misc/guc.c
@@ -10,7 +10,7 @@
  * Written by Peter Eisentraut <peter_e@gmx.net>.
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.553 2010/04/29 21:36:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.554 2010/05/02 02:10:33 tgl Exp $
  *
  *--------------------------------------------------------------------
  */
@@ -1386,10 +1386,11 @@ static struct config_int ConfigureNamesInt[] =
 	{
 		{"max_standby_delay", PGC_SIGHUP, WAL_SETTINGS,
 			gettext_noop("Sets the maximum delay to avoid conflict processing on hot standby servers."),
-			NULL
+			NULL,
+			GUC_UNIT_MS
 		},
 		&MaxStandbyDelay,
-		30, -1, INT_MAX, NULL, NULL
+		30 * 1000, -1, INT_MAX / 1000, NULL, NULL
 	},
 
 	{
diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample
index 2ed84127f78..94a4e69c6cb 100644
--- a/src/backend/utils/misc/postgresql.conf.sample
+++ b/src/backend/utils/misc/postgresql.conf.sample
@@ -186,7 +186,7 @@
 # - Hot Standby -
 
 #hot_standby = off		# allows queries during recovery
-#max_standby_delay = 30s	# max acceptable lag (s) to allow queries to
+#max_standby_delay = 30s	# max acceptable lag to allow queries to
 				# complete without conflict; -1 means forever
 #vacuum_defer_cleanup_age = 0	# num transactions by which cleanup is deferred
 
-- 
GitLab