diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c index 73d5b2e39c88ed0f29f4927e3de709930b406595..243dc8bc53e08958f5a8cc798a2133dd45d77668 100644 --- a/src/backend/postmaster/pgstat.c +++ b/src/backend/postmaster/pgstat.c @@ -1404,16 +1404,16 @@ pgstat_ping(void) * pgstat_send_inquiry() - * * Notify collector that we need fresh data. - * ts specifies the minimum acceptable timestamp for the stats file. * ---------- */ static void -pgstat_send_inquiry(TimestampTz ts) +pgstat_send_inquiry(TimestampTz clock_time, TimestampTz cutoff_time) { PgStat_MsgInquiry msg; pgstat_setheader(&msg.m_hdr, PGSTAT_MTYPE_INQUIRY); - msg.inquiry_time = ts; + msg.clock_time = clock_time; + msg.cutoff_time = cutoff_time; pgstat_send(&msg, sizeof(msg)); } @@ -3633,7 +3633,7 @@ pgstat_read_statsfile(Oid onlydb, bool permanent) /* * Set the current timestamp (will be kept only in case we can't load an - * existing statsfile. + * existing statsfile). */ globalStats.stat_reset_timestamp = GetCurrentTimestamp(); @@ -3922,8 +3922,8 @@ pgstat_read_statsfile_timestamp(bool permanent, TimestampTz *ts) static void backend_read_statsfile(void) { - TimestampTz cur_ts; - TimestampTz min_ts; + TimestampTz min_ts = 0; + TimestampTz ref_ts = 0; int count; /* already read it? */ @@ -3931,26 +3931,6 @@ backend_read_statsfile(void) return; Assert(!pgStatRunningInCollector); - /* - * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL msec - * before now. This indirectly ensures that the collector needn't write - * the file more often than PGSTAT_STAT_INTERVAL. In an autovacuum - * worker, however, we want a lower delay to avoid using stale data, so we - * use PGSTAT_RETRY_DELAY (since the number of worker is low, this - * shouldn't be a problem). - * - * Note that we don't recompute min_ts after sleeping; so we might end up - * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In practice - * that shouldn't happen, though, as long as the sleep time is less than - * PGSTAT_STAT_INTERVAL; and we don't want to lie to the collector about - * what our cutoff time really is. - */ - cur_ts = GetCurrentTimestamp(); - if (IsAutoVacuumWorkerProcess()) - min_ts = TimestampTzPlusMilliseconds(cur_ts, -PGSTAT_RETRY_DELAY); - else - min_ts = TimestampTzPlusMilliseconds(cur_ts, -PGSTAT_STAT_INTERVAL); - /* * Loop until fresh enough stats file is available or we ran out of time. * The stats inquiry message is sent repeatedly in case collector drops @@ -3958,17 +3938,82 @@ backend_read_statsfile(void) */ for (count = 0; count < PGSTAT_POLL_LOOP_COUNT; count++) { + bool ok; TimestampTz file_ts = 0; + TimestampTz cur_ts; CHECK_FOR_INTERRUPTS(); - if (pgstat_read_statsfile_timestamp(false, &file_ts) && - file_ts >= min_ts) + ok = pgstat_read_statsfile_timestamp(false, &file_ts); + + cur_ts = GetCurrentTimestamp(); + /* Calculate min acceptable timestamp, if we didn't already */ + if (count == 0 || cur_ts < ref_ts) + { + /* + * We set the minimum acceptable timestamp to PGSTAT_STAT_INTERVAL + * msec before now. This indirectly ensures that the collector + * needn't write the file more often than PGSTAT_STAT_INTERVAL. + * In an autovacuum worker, however, we want a lower delay to + * avoid using stale data, so we use PGSTAT_RETRY_DELAY (since the + * number of workers is low, this shouldn't be a problem). + * + * We don't recompute min_ts after sleeping, except in the + * unlikely case that cur_ts went backwards. So we might end up + * accepting a file a bit older than PGSTAT_STAT_INTERVAL. In + * practice that shouldn't happen, though, as long as the sleep + * time is less than PGSTAT_STAT_INTERVAL; and we don't want to + * tell the collector that our cutoff time is less than what we'd + * actually accept. + */ + ref_ts = cur_ts; + if (IsAutoVacuumWorkerProcess()) + min_ts = TimestampTzPlusMilliseconds(ref_ts, + -PGSTAT_RETRY_DELAY); + else + min_ts = TimestampTzPlusMilliseconds(ref_ts, + -PGSTAT_STAT_INTERVAL); + } + + /* + * If the file timestamp is actually newer than cur_ts, we must have + * had a clock glitch (system time went backwards) or there is clock + * skew between our processor and the stats collector's processor. + * Accept the file, but send an inquiry message anyway to make + * pgstat_recv_inquiry do a sanity check on the collector's time. + */ + if (ok && file_ts > cur_ts) + { + /* + * A small amount of clock skew between processors isn't terribly + * surprising, but a large difference is worth logging. We + * arbitrarily define "large" as 1000 msec. + */ + if (file_ts >= TimestampTzPlusMilliseconds(cur_ts, 1000)) + { + char *filetime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + filetime = pstrdup(timestamptz_to_str(file_ts)); + mytime = pstrdup(timestamptz_to_str(cur_ts)); + elog(LOG, "stats collector's time %s is later than backend local time %s", + filetime, mytime); + pfree(filetime); + pfree(mytime); + } + + pgstat_send_inquiry(cur_ts, min_ts); + break; + } + + /* Normal acceptance case: file is not older than cutoff time */ + if (ok && file_ts >= min_ts) break; /* Not there or too old, so kick the collector and wait a bit */ if ((count % PGSTAT_INQ_LOOP_COUNT) == 0) - pgstat_send_inquiry(min_ts); + pgstat_send_inquiry(cur_ts, min_ts); pg_usleep(PGSTAT_RETRY_DELAY * 1000L); } @@ -4036,8 +4081,46 @@ pgstat_clear_snapshot(void) static void pgstat_recv_inquiry(PgStat_MsgInquiry *msg, int len) { - if (msg->inquiry_time > last_statrequest) - last_statrequest = msg->inquiry_time; + /* + * Advance last_statrequest if this requestor has a newer cutoff time + * than any previous request. + */ + if (msg->cutoff_time > last_statrequest) + last_statrequest = msg->cutoff_time; + + /* + * If the requestor's local clock time is older than last_statwrite, we + * should suspect a clock glitch, ie system time going backwards; though + * the more likely explanation is just delayed message receipt. It is + * worth expending a GetCurrentTimestamp call to be sure, since a large + * retreat in the system clock reading could otherwise cause us to neglect + * to update the stats file for a long time. + */ + if (msg->clock_time < last_statwrite) + { + TimestampTz cur_ts = GetCurrentTimestamp(); + + if (cur_ts < last_statwrite) + { + /* + * Sure enough, time went backwards. Force a new stats file write + * to get back in sync; but first, log a complaint. + */ + char *writetime; + char *mytime; + + /* Copy because timestamptz_to_str returns a static buffer */ + writetime = pstrdup(timestamptz_to_str(last_statwrite)); + mytime = pstrdup(timestamptz_to_str(cur_ts)); + elog(LOG, "last_statwrite %s is later than collector's time %s", + writetime, mytime); + pfree(writetime); + pfree(mytime); + + last_statrequest = cur_ts; + last_statwrite = last_statrequest - 1; + } + } } diff --git a/src/include/pgstat.h b/src/include/pgstat.h index dd978d79c3dad830a5925341177a08cc57eeef61..cad9b54e7b50aadbc43ad66dd36e2262383ee24b 100644 --- a/src/include/pgstat.h +++ b/src/include/pgstat.h @@ -203,7 +203,8 @@ typedef struct PgStat_MsgDummy typedef struct PgStat_MsgInquiry { PgStat_MsgHdr m_hdr; - TimestampTz inquiry_time; /* minimum acceptable file timestamp */ + TimestampTz clock_time; /* observed local clock time */ + TimestampTz cutoff_time; /* minimum acceptable file timestamp */ } PgStat_MsgInquiry;