From 6f6a6d8b140393c974ec5ae65c6c605e70d08034 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 23 Nov 2008 01:40:19 +0000
Subject: [PATCH] Teach RequestCheckpoint() to wait and retry a few times if it
 can't signal the bgwriter immediately.  This covers the case where the
 bgwriter is still starting up, as seen in a recent buildfarm failure.  In
 future it might also assist with clean recovery after a bgwriter termination
 and restart --- right now the postmaster treats early bgwriter exit as a
 system crash, but that might not always be so.

---
 src/backend/postmaster/bgwriter.c | 42 ++++++++++++++++++++++++-------
 1 file changed, 33 insertions(+), 9 deletions(-)

diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index 7d25811afd4..03f8f7e30f2 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -37,7 +37,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.53 2008/10/14 08:06:39 heikki Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.54 2008/11/23 01:40:19 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -864,6 +864,7 @@ RequestCheckpoint(int flags)
 {
 	/* use volatile pointer to prevent code rearrangement */
 	volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+	int			ntries;
 	int			old_failed,
 				old_started;
 
@@ -905,15 +906,38 @@ RequestCheckpoint(int flags)
 	SpinLockRelease(&bgs->ckpt_lck);
 
 	/*
-	 * Send signal to request checkpoint.  When not waiting, we consider
-	 * failure to send the signal to be nonfatal.
+	 * Send signal to request checkpoint.  It's possible that the bgwriter
+	 * hasn't started yet, or is in the process of restarting, so we will
+	 * retry a few times if needed.  Also, if not told to wait for the
+	 * checkpoint to occur, we consider failure to send the signal to be
+	 * nonfatal and merely LOG it.
 	 */
-	if (BgWriterShmem->bgwriter_pid == 0)
-		elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
-			 "could not request checkpoint because bgwriter not running");
-	if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
-		elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
-			 "could not signal for checkpoint: %m");
+	for (ntries = 0; ; ntries++)
+	{
+		if (BgWriterShmem->bgwriter_pid == 0)
+		{
+			if (ntries >= 20)		/* max wait 2.0 sec */
+			{
+				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
+					 "could not request checkpoint because bgwriter not running");
+				break;
+			}
+		}
+		else if (kill(BgWriterShmem->bgwriter_pid, SIGINT) != 0)
+		{
+			if (ntries >= 20)		/* max wait 2.0 sec */
+			{
+				elog((flags & CHECKPOINT_WAIT) ? ERROR : LOG,
+					 "could not signal for checkpoint: %m");
+				break;
+			}
+		}
+		else
+			break;				/* signal sent successfully */
+
+		CHECK_FOR_INTERRUPTS();
+		pg_usleep(100000L);		/* wait 0.1 sec, then retry */
+	}
 
 	/*
 	 * If requested, wait for completion.  We detect completion according to
-- 
GitLab