diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index bab048d38a15703a36b4b92c7305b20515a28a77..b467b5c89d1c0ebffe5cf45e2df062408fdf9a3f 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -34,6 +34,7 @@ #include "commands/trigger.h" #include "executor/spi.h" #include "libpq/be-fsstubs.h" +#include "libpq/pqsignal.h" #include "miscadmin.h" #include "pgstat.h" #include "replication/walsender.h" @@ -52,6 +53,7 @@ #include "utils/memutils.h" #include "utils/relmapper.h" #include "utils/snapmgr.h" +#include "utils/timeout.h" #include "utils/timestamp.h" #include "pg_trace.h" @@ -2296,6 +2298,22 @@ AbortTransaction(void) */ LockErrorCleanup(); + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + /* * check the current transaction state */ @@ -4222,8 +4240,28 @@ AbortSubTransaction(void) AbortBufferIO(); UnlockBuffers(); + /* + * Also clean up any open wait for lock, since the lock manager will choke + * if we try to wait for another lock before doing this. + */ LockErrorCleanup(); + /* + * If any timeout events are still active, make sure the timeout interrupt + * is scheduled. This covers possible loss of a timeout interrupt due to + * longjmp'ing out of the SIGINT handler (see notes in handle_sig_alarm). + * We delay this till after LockErrorCleanup so that we don't uselessly + * reschedule lock or deadlock check timeouts. + */ + reschedule_timeouts(); + + /* + * Re-enable signals, in case we got here by longjmp'ing out of a signal + * handler. We do this fairly early in the sequence so that the timeout + * infrastructure will be functional if needed while aborting. + */ + PG_SETMASK(&UnBlockSig); + /* * check the current transaction state */ diff --git a/src/backend/postmaster/autovacuum.c b/src/backend/postmaster/autovacuum.c index 8c14d0f4e4441446d189b2826fca50f94bd905de..88ecd3834b383a51a8417ab5b0b0fa10190aa16b 100644 --- a/src/backend/postmaster/autovacuum.c +++ b/src/backend/postmaster/autovacuum.c @@ -493,9 +493,8 @@ AutoVacLauncherMain(int argc, char *argv[]) HOLD_INTERRUPTS(); /* Forget any pending QueryCancel or timeout request */ - QueryCancelPending = false; disable_all_timeouts(false); - QueryCancelPending = false; /* again in case timeout occurred */ + QueryCancelPending = false; /* second to avoid race condition */ /* Report the error to the server log */ EmitErrorReport(); diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index 222251df6599dfbe84aa83bf18e11803798c721c..122afb24a8b1811638dd7611331aed3a2e3d4c14 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -1266,7 +1266,10 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) } while (myWaitStatus == STATUS_WAITING); /* - * Disable the timers, if they are still running + * Disable the timers, if they are still running. As in LockErrorCleanup, + * we must preserve the LOCK_TIMEOUT indicator flag: if a lock timeout has + * already caused QueryCancelPending to become set, we want the cancel to + * be reported as a lock timeout, not a user cancel. */ if (LockTimeout > 0) { @@ -1275,7 +1278,7 @@ ProcSleep(LOCALLOCK *locallock, LockMethod lockMethodTable) timeouts[0].id = DEADLOCK_TIMEOUT; timeouts[0].keep_indicator = false; timeouts[1].id = LOCK_TIMEOUT; - timeouts[1].keep_indicator = false; + timeouts[1].keep_indicator = true; disable_timeouts(timeouts, 2); } else diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 3d74654c82737c1ac6653d50dacb6a4154797e12..e6428b69f43a63e513ac4c513ca0d7fd8cf15fd9 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -3803,6 +3803,13 @@ PostgresMain(int argc, char *argv[], * always active, we have at least some chance of recovering from an error * during error recovery. (If we get into an infinite loop thereby, it * will soon be stopped by overflow of elog.c's internal state stack.) + * + * Note that we use sigsetjmp(..., 1), so that this function's signal mask + * (to wit, UnBlockSig) will be restored when longjmp'ing to here. This + * is essential in case we longjmp'd out of a signal handler on a platform + * where that leaves the signal blocked. It's not redundant with the + * unblock in AbortTransaction() because the latter is only called if we + * were inside a transaction. */ if (sigsetjmp(local_sigjmp_buf, 1) != 0) @@ -3823,11 +3830,17 @@ PostgresMain(int argc, char *argv[], /* * Forget any pending QueryCancel request, since we're returning to - * the idle loop anyway, and cancel any active timeout requests. + * the idle loop anyway, and cancel any active timeout requests. (In + * future we might want to allow some timeout requests to survive, but + * at minimum it'd be necessary to do reschedule_timeouts(), in case + * we got here because of a query cancel interrupting the SIGALRM + * interrupt handler.) Note in particular that we must clear the + * statement and lock timeout indicators, to prevent any future plain + * query cancels from being misreported as timeouts in case we're + * forgetting a timeout cancel. */ - QueryCancelPending = false; disable_all_timeouts(false); - QueryCancelPending = false; /* again in case timeout occurred */ + QueryCancelPending = false; /* second to avoid race condition */ /* * Turn off these interrupts too. This is only needed here and not in diff --git a/src/backend/utils/misc/timeout.c b/src/backend/utils/misc/timeout.c index 3c3e41eca91d13535554f1e285295f71c1468246..aae947e601b483b4ab0281cc884c6fe60a6f5380 100644 --- a/src/backend/utils/misc/timeout.c +++ b/src/backend/utils/misc/timeout.c @@ -16,6 +16,7 @@ #include <sys/time.h> +#include "miscadmin.h" #include "storage/proc.h" #include "utils/timeout.h" #include "utils/timestamp.h" @@ -259,6 +260,23 @@ handle_sig_alarm(SIGNAL_ARGS) { int save_errno = errno; + /* + * We may be executing while ImmediateInterruptOK is true (e.g., when + * mainline is waiting for a lock). If SIGINT or similar arrives while + * this code is running, we'd lose control and perhaps leave our data + * structures in an inconsistent state. Hold off interrupts to prevent + * that. + * + * Note: it's possible for a SIGINT to interrupt handle_sig_alarm even + * before we reach HOLD_INTERRUPTS(); the net effect would be as if the + * SIGALRM event had been silently lost. Therefore error recovery must + * include some action that will allow any lost interrupt to be + * rescheduled. Disabling some or all timeouts is sufficient, or if + * that's not appropriate, reschedule_timeouts() can be called. Also, the + * signal blocking hazard described below applies here too. + */ + HOLD_INTERRUPTS(); + /* * SIGALRM is always cause for waking anything waiting on the process * latch. Cope with MyProc not being there, as the startup process also @@ -311,6 +329,20 @@ handle_sig_alarm(SIGNAL_ARGS) } } + /* + * Re-allow query cancel, and then service any cancel request that arrived + * meanwhile (this might in particular include a cancel request fired by + * one of the timeout handlers). + * + * Note: a longjmp from here is safe so far as our own data structures are + * concerned; but on platforms that block a signal before calling the + * handler and then un-block it on return, longjmping out of the signal + * handler leaves SIGALRM still blocked. Error cleanup is responsible for + * unblocking any blocked signals. + */ + RESUME_INTERRUPTS(); + CHECK_FOR_INTERRUPTS(); + errno = save_errno; } @@ -387,6 +419,30 @@ RegisterTimeout(TimeoutId id, timeout_handler_proc handler) return id; } +/* + * Reschedule any pending SIGALRM interrupt. + * + * This can be used during error recovery in case query cancel resulted in loss + * of a SIGALRM event (due to longjmp'ing out of handle_sig_alarm before it + * could do anything). But note it's not necessary if any of the public + * enable_ or disable_timeout functions are called in the same area, since + * those all do schedule_alarm() internally if needed. + */ +void +reschedule_timeouts(void) +{ + /* For flexibility, allow this to be called before we're initialized. */ + if (!all_timeouts_initialized) + return; + + /* Disable timeout interrupts for safety. */ + disable_alarm(); + + /* Reschedule the interrupt, if any timeouts remain active. */ + if (num_active_timeouts > 0) + schedule_alarm(GetCurrentTimestamp()); +} + /* * Enable the specified timeout to fire after the specified delay. * diff --git a/src/include/utils/timeout.h b/src/include/utils/timeout.h index 06bcc5a0f3d92aabdb86b51570a398b19839bf55..759ee24774bb8ba0079150c1d710726f9629fbf4 100644 --- a/src/include/utils/timeout.h +++ b/src/include/utils/timeout.h @@ -67,6 +67,7 @@ typedef struct /* timeout setup */ extern void InitializeTimeouts(void); extern TimeoutId RegisterTimeout(TimeoutId id, timeout_handler_proc handler); +extern void reschedule_timeouts(void); /* timeout operation */ extern void enable_timeout_after(TimeoutId id, int delay_ms);