/*-------------------------------------------------------------------------
 *
 * autovacuum.c
 *
 * PostgreSQL Integrated Autovacuum Daemon
 *
 *
 * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group
 * Portions Copyright (c) 1994, Regents of the University of California
 *
 *
 * IDENTIFICATION
 *	  $PostgreSQL: pgsql/src/backend/postmaster/autovacuum.c,v 1.43 2007/05/02 15:47:14 alvherre Exp $
 *
 *-------------------------------------------------------------------------
 */
#include "postgres.h"
#include <signal.h>
#include <sys/types.h>
#include <time.h>
#include <unistd.h>
#include "access/genam.h"
#include "access/heapam.h"
#include "access/transam.h"
#include "access/xact.h"
#include "catalog/indexing.h"
#include "catalog/namespace.h"
#include "catalog/pg_autovacuum.h"
#include "catalog/pg_database.h"
#include "commands/vacuum.h"
#include "libpq/hba.h"
#include "libpq/pqsignal.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "postmaster/autovacuum.h"
#include "postmaster/fork_process.h"
#include "postmaster/postmaster.h"
#include "storage/fd.h"
#include "storage/ipc.h"
#include "storage/pmsignal.h"
#include "storage/proc.h"
#include "storage/procarray.h"
#include "storage/sinval.h"
#include "tcop/tcopprot.h"
#include "utils/flatfiles.h"
#include "utils/fmgroids.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
#include "utils/syscache.h"
/*
 * Flags polled by the launcher's main loop (AutoVacLauncherMain).  The loop
 * clears got_SIGHUP / got_SIGUSR1 after acting on them and exits when
 * avlauncher_shutdown_request is set.
 *
 * NOTE(review): presumably set by avl_sighup_handler, avl_sigusr1_handler and
 * avlauncher_shutdown respectively; the handlers are not visible in this part
 * of the file -- confirm.
 */
static volatile sig_atomic_t got_SIGUSR1 = false;
static volatile sig_atomic_t got_SIGHUP = false;
static volatile sig_atomic_t avlauncher_shutdown_request = false;
/*
 * GUC parameters
 *
 * Only the uses visible in this file are described here:
 *
 * autovacuum_start_daemon	when false, the launcher starts a single worker
 *							and exits ("emergency mode" in
 *							AutoVacLauncherMain)
 * autovacuum_naptime		in seconds; used as the launcher's default sleep
 *							time and as the interval over which databases
 *							are spread by rebuild_database_list
 * autovacuum_freeze_max_age	subtracted from the next XID in
 *							do_start_worker to obtain the forced-vacuum limit
 *
 * NOTE(review): the remaining variables are presumably the usual threshold,
 * scale-factor, cost-delay and logging settings; their consumers are not
 * visible in this part of the file.
 */
bool autovacuum_start_daemon = false;
int autovacuum_max_workers;
int autovacuum_naptime;
int autovacuum_vac_thresh;
double autovacuum_vac_scale;
int autovacuum_anl_thresh;
double autovacuum_anl_scale;
int autovacuum_freeze_max_age;
int autovacuum_vac_cost_delay;
int autovacuum_vac_cost_limit;
int Log_autovacuum = -1;
/* Flags to tell if we are in an autovacuum process */
static bool am_autovacuum_launcher = false;
static bool am_autovacuum_worker = false;
/* Comparison point for determining whether freeze_max_age is exceeded */
static TransactionId recentXid;
/* Default freeze_min_age to use for autovacuum (varies by database) */
static int default_freeze_min_age;
/* Memory context for long-lived data */
/* struct to keep track of databases in launcher */
typedef struct avl_dbase
Oid adl_datid; /* hash key -- must be first */
TimestampTz adl_next_worker;
int adl_score;
} avl_dbase;
/*
 * struct to keep track of databases in worker
 *
 * The elements of the list returned by get_database_list() are read as
 * avw_dbase (see rebuild_database_list, which uses adw_datid).
 *
 * NOTE(review): adw_name, adw_frozenxid and adw_entry are presumably the
 * database name, its datfrozenxid and its pgstat entry; their producers are
 * not visible here -- confirm.
 */
typedef struct avw_dbase
{
Oid adw_datid;
char *adw_name;
TransactionId adw_frozenxid;
PgStat_StatDBEntry *adw_entry;
} avw_dbase;
/*
 * struct to keep track of tables to vacuum and/or analyze, in 1st pass
 *
 * NOTE(review): ar_toastrelid is presumably the OID of the table's TOAST
 * relation (if any); the code filling it is not visible here -- confirm.
 */
typedef struct av_relation
{
Oid ar_relid;
Oid ar_toastrelid;
} av_relation;
Alvaro Herrera
committed
/* struct to keep track of tables to vacuum and/or analyze, after rechecking */
typedef struct autovac_table
{
Alvaro Herrera
committed
Oid at_relid;
Oid at_toastrelid;
bool at_dovacuum;
bool at_doanalyze;
int at_freeze_min_age;
int at_vacuum_cost_delay;
int at_vacuum_cost_limit;
} autovac_table;
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
/*-------------
 * This struct holds information about a single worker's whereabouts.  We keep
 * an array of these in shared memory, sized according to
 * autovacuum_max_workers.
 *
 * wi_links			entry into free list or running list
 * wi_dboid			OID of the database this worker is supposed to work on
 * wi_tableoid		OID of the table currently being vacuumed
 * wi_workerpid		PID of the running worker, 0 if not yet started
 * wi_launchtime	Time at which this worker was launched
 * wi_cost_*		Vacuum cost-based delay parameters current in this worker
 *
 * All fields are protected by AutovacuumLock, except for wi_tableoid which is
 * protected by AutovacuumScheduleLock (which is read-only for everyone except
 * that worker itself).
 *
 * The launcher resets wi_dboid, wi_tableoid, wi_workerpid and wi_launchtime
 * and reuses wi_links.next as the free-list link when it reclaims a worker
 * that failed to start (see AutoVacLauncherMain).
 *-------------
 */
typedef struct WorkerInfoData
{
SHM_QUEUE wi_links;
Oid wi_dboid;
Oid wi_tableoid;
int wi_workerpid;
TimestampTz wi_launchtime;
int wi_cost_delay;
int wi_cost_limit;
int wi_cost_limit_base;
} WorkerInfoData;
/* Pointer alias used throughout this file for entries of the worker array */
typedef struct WorkerInfoData *WorkerInfo;
/*-------------
 * The main autovacuum shmem struct.  On shared memory we store this main
 * struct and the array of WorkerInfo structs.  This struct keeps:
 *
 * av_launcherpid		the PID of the autovacuum launcher
 * av_freeWorkers		the WorkerInfo freelist
 * av_runningWorkers	the WorkerInfo non-free queue
 * av_startingWorker	pointer to WorkerInfo currently being started (cleared by
 *						the worker itself as soon as it's up and running)
 * av_rebalance			true when a worker determines that cost limits must be
 *						rebalanced
 *
 * av_freeWorkers and av_startingWorker are shared-memory offsets; the value
 * INVALID_OFFSET means "none" and MAKE_PTR/MAKE_OFFSET convert to and from
 * WorkerInfo pointers (see AutoVacLauncherMain).
 *
 * This struct is protected by AutovacuumLock.
 *-------------
 */
typedef struct
{
pid_t av_launcherpid;
SHMEM_OFFSET av_freeWorkers;
SHM_QUEUE av_runningWorkers;
SHMEM_OFFSET av_startingWorker;
bool av_rebalance;
} AutoVacuumShmemStruct;
/* Process-local pointer to the shared struct described above */
static AutoVacuumShmemStruct *AutoVacuumShmem;
/*
 * the database list in the launcher, and the context that contains it
 *
 * Both are (re)created by rebuild_database_list and set back to NULL by the
 * launcher's error-recovery code after it resets AutovacMemCxt.
 */
static Dllist *DatabaseList = NULL;
static MemoryContext DatabaseListCxt = NULL;
/* Pointer to my own WorkerInfo, valid on each worker */
static WorkerInfo MyWorkerInfo = NULL;
/* PID of launcher, valid only in worker while shutting down */
int AutovacuumLauncherPid = 0;
#ifdef EXEC_BACKEND
static pid_t avlauncher_forkexec(void);
static pid_t avworker_forkexec(void);
NON_EXEC_STATIC void AutoVacWorkerMain(int argc, char *argv[]);
NON_EXEC_STATIC void AutoVacLauncherMain(int argc, char *argv[]);
static Oid do_start_worker(void);
static uint64 launcher_determine_sleep(bool canlaunch, bool recursing);
static void launch_worker(TimestampTz now);
static List *get_database_list(void);
static void rebuild_database_list(Oid newdb);
static int db_comparator(const void *a, const void *b);
static void autovac_balance_cost(void);
static void do_autovacuum(void);
static void FreeWorkerInfo(int code, Datum arg);
static void relation_check_autovac(Oid relid, Form_pg_class classForm,
Form_pg_autovacuum avForm, PgStat_StatTabEntry *tabentry,
List **table_oids, List **table_toast_list,
List **toast_oids);
static autovac_table *table_recheck_autovac(Oid relid);
static void relation_needs_vacanalyze(Oid relid, Form_pg_autovacuum avForm,
Form_pg_class classForm,
PgStat_StatTabEntry *tabentry, bool *dovacuum,
bool *doanalyze);
static void autovacuum_do_vac_analyze(Oid relid, bool dovacuum,
bool doanalyze, int freeze_min_age);
static HeapTuple get_pg_autovacuum_tuple_relid(Relation avRel, Oid relid);
Alvaro Herrera
committed
static PgStat_StatTabEntry *get_pgstat_tabentry_relid(Oid relid, bool isshared,
PgStat_StatDBEntry *shared,
PgStat_StatDBEntry *dbentry);
static void autovac_report_activity(VacuumStmt *vacstmt, Oid relid);
static void avl_sighup_handler(SIGNAL_ARGS);
static void avl_sigusr1_handler(SIGNAL_ARGS);
static void avlauncher_shutdown(SIGNAL_ARGS);
static void avl_quickdie(SIGNAL_ARGS);
/********************************************************************
* AUTOVACUUM LAUNCHER CODE
********************************************************************/
#ifdef EXEC_BACKEND
* forkexec routine for the autovacuum launcher process.
* Format up the arglist, then fork and exec.
static pid_t
avlauncher_forkexec(void)
char *av[10];
int ac = 0;
av[ac++] = "postgres";
av[ac++] = "--forkavlauncher";
av[ac++] = NULL; /* filled in by postmaster_forkexec */
av[ac] = NULL;
Alvaro Herrera
committed
Assert(ac < lengthof(av));
return postmaster_forkexec(ac, av);
}
/*
 * Mark this process as the autovacuum launcher.
 *
 * The flag has to be set from the outside, before InitProcess is called.
 */
void
AutovacuumLauncherIAm(void)
{
	am_autovacuum_launcher = true;
}
#endif
/*
 * Main entry point for autovacuum launcher process, to be called from the
 * postmaster.
 *
 * Returns the PID of the new launcher process in the parent, or 0 if the
 * process could not be created (the failure is logged).  In a non-EXEC_BACKEND
 * build the child runs AutoVacLauncherMain directly.
 */
int
StartAutoVacLauncher(void)
{
	pid_t		AutoVacPID;

#ifdef EXEC_BACKEND
	switch ((AutoVacPID = avlauncher_forkexec()))
#else
	switch ((AutoVacPID = fork_process()))
#endif
	{
		case -1:
			ereport(LOG,
					(errmsg("could not fork autovacuum process: %m")));
			return 0;

#ifndef EXEC_BACKEND
		case 0:
			/* in postmaster child ... */
			/* Close the postmaster's sockets */
			ClosePostmasterPorts(false);

			/* Lose the postmaster's on-exit routines */
			on_exit_reset();

			AutoVacLauncherMain(0, NULL);
			break;
#endif
		default:
			return (int) AutoVacPID;
	}

	/* shouldn't get here */
	return 0;
}
/*
* Main loop for the autovacuum launcher process.
*
* The signalling between launcher and worker is as follows:
*
* When the worker has finished starting up, it stores its PID in wi_workerpid
* and sends a SIGUSR1 signal to the launcher. The launcher then knows that
* the postmaster is ready to start a new worker. We do it this way because
* otherwise we risk calling SendPostmasterSignal() when the postmaster hasn't
* yet processed the last one, in which case the second signal would be lost.
* This is only useful when two workers need to be started close to one
* another, which should be rare but it's possible.
*
* When a worker exits, it resets the WorkerInfo struct and puts it back into
* the free list. If there is no free worker slot, it will also signal the
* launcher, which then wakes up and can launch a new worker if it needs to.
* Note that we only need to do it when there's no free worker slot, because
* otherwise there is no need -- the launcher would be awakened normally per
* schedule.
*
* There is a potential problem if, for some reason, a worker starts and is not
* able to bootstrap itself correctly. To prevent this situation from starving
* the whole system, the launcher checks the launch time of the "starting
* worker". If it's too old (older than autovacuum_naptime seconds), it resets
* the worker entry and puts it back into the free list.
NON_EXEC_STATIC void
AutoVacLauncherMain(int argc, char *argv[])
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
sigjmp_buf local_sigjmp_buf;
/* we are a postmaster subprocess now */
IsUnderPostmaster = true;
am_autovacuum_launcher = true;
/* reset MyProcPid */
MyProcPid = getpid();
/* Identify myself via ps */
init_ps_display("autovacuum launcher process", "", "", "");
SetProcessingMode(InitProcessing);
/*
* If possible, make this process a group leader, so that the postmaster
* can signal any child processes too. (autovacuum probably never has
* any child processes, but for consistency we make all postmaster
* child processes do this.)
*/
#ifdef HAVE_SETSID
if (setsid() < 0)
elog(FATAL, "setsid() failed: %m");
#endif
/*
* Set up signal handlers. Since this is an auxiliary process, it has
* particular signal requirements -- no deadlock checker or sinval
* catchup, for example.
*/
pqsignal(SIGHUP, avl_sighup_handler);
pqsignal(SIGINT, SIG_IGN);
pqsignal(SIGTERM, avlauncher_shutdown);
pqsignal(SIGQUIT, avl_quickdie);
pqsignal(SIGALRM, SIG_IGN);
pqsignal(SIGPIPE, SIG_IGN);
pqsignal(SIGUSR1, avl_sigusr1_handler);
/* We don't listen for async notifies */
pqsignal(SIGUSR2, SIG_IGN);
pqsignal(SIGFPE, FloatExceptionHandler);
pqsignal(SIGCHLD, SIG_DFL);
/* Early initialization */
BaseInit();
/*
* Create a per-backend PGPROC struct in shared memory, except in the
* EXEC_BACKEND case where this was done in SubPostmasterMain. We must do
* this before we can use LWLocks (and in the EXEC_BACKEND case we already
* had to do some stuff with LWLocks).
*/
#ifndef EXEC_BACKEND
InitAuxiliaryProcess();
#endif
/*
* Create a memory context that we will do all our work in. We do this so
* that we can reset the context during error recovery and thereby avoid
* possible memory leaks.
*/
AutovacMemCxt = AllocSetContextCreate(TopMemoryContext,
"Autovacuum Launcher",
ALLOCSET_DEFAULT_MINSIZE,
ALLOCSET_DEFAULT_INITSIZE,
ALLOCSET_DEFAULT_MAXSIZE);
MemoryContextSwitchTo(AutovacMemCxt);
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
/*
* If an exception is encountered, processing resumes here.
*
* This code is heavily based on bgwriter.c, q.v.
*/
if (sigsetjmp(local_sigjmp_buf, 1) != 0)
{
/* since not using PG_TRY, must reset error stack by hand */
error_context_stack = NULL;
/* Prevents interrupts while cleaning up */
HOLD_INTERRUPTS();
/* Report the error to the server log */
EmitErrorReport();
/*
* These operations are really just a minimal subset of
* AbortTransaction(). We don't have very many resources to worry
* about, but we do have LWLocks.
*/
LWLockReleaseAll();
AtEOXact_Files();
/*
* Now return to normal top-level context and clear ErrorContext for
* next time.
*/
MemoryContextSwitchTo(AutovacMemCxt);
FlushErrorState();
/* Flush any leaked data in the top-level context */
MemoryContextResetAndDeleteChildren(AutovacMemCxt);
/* don't leave dangling pointers to freed memory */
DatabaseListCxt = NULL;
DatabaseList = NULL;
/* Make sure pgstat also considers our stat data as gone */
pgstat_clear_snapshot();
/* Now we can allow interrupts again */
RESUME_INTERRUPTS();
/*
* Sleep at least 1 second after any error. We don't want to be
* filling the error logs as fast as we can.
*/
pg_usleep(1000000L);
}
/* We can now handle ereport(ERROR) */
PG_exception_stack = &local_sigjmp_buf;
ereport(LOG,
(errmsg("autovacuum launcher started")));
/* must unblock signals before calling rebuild_database_list */
PG_SETMASK(&UnBlockSig);
/* in emergency mode, just start a worker and go away */
if (!autovacuum_start_daemon)
{
do_start_worker();
proc_exit(0); /* done */
}
AutoVacuumShmem->av_launcherpid = MyProcPid;
/*
* Create the initial database list. The invariant we want this list to
* keep is that it's ordered by decreasing next_time. As soon as an entry
* is updated to a higher time, it will be moved to the front (which is
* correct because the only operation is to add autovacuum_naptime to the
* entry, and time always increases).
*/
rebuild_database_list(InvalidOid);
for (;;)
{
uint64 micros;
bool can_launch;
TimestampTz current_time = 0;
/*
* Emergency bailout if postmaster has died. This is to avoid the
* necessity for manual cleanup of all postmaster children.
*/
if (!PostmasterIsAlive(true))
exit(1);
micros = launcher_determine_sleep(AutoVacuumShmem->av_freeWorkers !=
INVALID_OFFSET, false);
/* Sleep for a while according to schedule */
pg_usleep(micros);
/* the normal shutdown case */
if (avlauncher_shutdown_request)
break;
if (got_SIGHUP)
{
got_SIGHUP = false;
ProcessConfigFile(PGC_SIGHUP);
/* rebalance in case the default cost parameters changed */
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
autovac_balance_cost();
LWLockRelease(AutovacuumLock);
/* rebuild the list in case the naptime changed */
rebuild_database_list(InvalidOid);
}
/* a worker started up or finished */
if (got_SIGUSR1)
{
got_SIGUSR1 = false;
/* rebalance cost limits, if needed */
if (AutoVacuumShmem->av_rebalance)
{
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
AutoVacuumShmem->av_rebalance = false;
autovac_balance_cost();
LWLockRelease(AutovacuumLock);
}
}
/*
* There are some conditions that we need to check before trying to
* start a launcher. First, we need to make sure that there is a
* launcher slot available. Second, we need to make sure that no other
* worker is still starting up.
*/
LWLockAcquire(AutovacuumLock, LW_SHARED);
can_launch = (AutoVacuumShmem->av_freeWorkers != INVALID_OFFSET);
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
if (can_launch && AutoVacuumShmem->av_startingWorker != INVALID_OFFSET)
{
long secs;
int usecs;
WorkerInfo worker = (WorkerInfo) MAKE_PTR(AutoVacuumShmem->av_startingWorker);
if (current_time == 0)
current_time = GetCurrentTimestamp();
/*
* We can't launch another worker when another one is still
* starting up, so just sleep for a bit more; that worker will wake
* us up again as soon as it's ready. We will only wait
* autovacuum_naptime seconds for this to happen however. Note
* that failure to connect to a particular database is not a
* problem here, because the worker removes itself from the
* startingWorker pointer before trying to connect; only low-level
* problems, like fork() failure, can get us here.
*/
TimestampDifference(worker->wi_launchtime, current_time,
&secs, &usecs);
/* ignore microseconds, as they cannot make any difference */
if (secs > autovacuum_naptime)
{
LWLockRelease(AutovacuumLock);
LWLockAcquire(AutovacuumLock, LW_EXCLUSIVE);
/*
* No other process can put a worker in starting mode, so if
* startingWorker is still INVALID after exchanging our lock,
* we assume it's the same one we saw above (so we don't
* recheck the launch time).
*/
if (AutoVacuumShmem->av_startingWorker != INVALID_OFFSET)
{
worker = (WorkerInfo) MAKE_PTR(AutoVacuumShmem->av_startingWorker);
worker->wi_dboid = InvalidOid;
worker->wi_tableoid = InvalidOid;
worker->wi_workerpid = 0;
worker->wi_launchtime = 0;
worker->wi_links.next = AutoVacuumShmem->av_freeWorkers;
AutoVacuumShmem->av_freeWorkers = MAKE_OFFSET(worker);
AutoVacuumShmem->av_startingWorker = INVALID_OFFSET;
}
}
else
{
/*
* maybe the postmaster neglected this start signal --
* resend it. Note: the constraints in
* launcher_determine_sleep keep us from delivering signals too
* quickly (at most once every 100ms).
*/
SendPostmasterSignal(PMSIGNAL_START_AUTOVAC_WORKER);
can_launch = false;
}
}
LWLockRelease(AutovacuumLock); /* either shared or exclusive */
if (can_launch)
{
Dlelem *elem;
elem = DLGetTail(DatabaseList);
if (current_time == 0)
current_time = GetCurrentTimestamp();
if (elem != NULL)
{
avl_dbase *avdb = DLE_VAL(elem);
long secs;
int usecs;
TimestampDifference(current_time, avdb->adl_next_worker, &secs, &usecs);
/* do we have to start a worker? */
if (secs <= 0 && usecs <= 0)
launch_worker(current_time);
}
else
{
/*
* Special case when the list is empty: start a worker right
* away. This covers the initial case, when no database is in
* pgstats (thus the list is empty). Note that the constraints
* in launcher_determine_sleep keep us from starting workers
* too quickly (at most once every autovacuum_naptime when the
* list is empty).
*/
launch_worker(current_time);
}
}
}
/* Normal exit from the autovac launcher is here */
ereport(LOG,
(errmsg("autovacuum launcher shutting down")));
AutoVacuumShmem->av_launcherpid = 0;
proc_exit(0); /* done */
}
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
/*
 * Determine the time to sleep, in microseconds, based on the database list.
 *
 * The "canlaunch" parameter indicates whether we can start a worker right now;
 * it is false when, for example, all the workers are busy.  In that case we
 * simply sleep for autovacuum_naptime seconds.
 *
 * "recursing" must be false on the outer call; it is set on the single
 * recursive call made after rebuilding the database list, so that we never
 * recurse more than once.
 *
 * The result is never less than 100ms.
 */
static uint64
launcher_determine_sleep(bool canlaunch, bool recursing)
{
	long		secs;
	int			usecs;
	Dlelem	   *elem;

	/*
	 * We sleep until the next scheduled vacuum.  We trust that when the
	 * database list was built, care was taken so that no entries have times in
	 * the past; if the first entry has too close a next_worker value, or a
	 * time in the past, we will sleep a small nominal time.
	 */
	if (!canlaunch)
	{
		secs = autovacuum_naptime;
		usecs = 0;
	}
	else if ((elem = DLGetTail(DatabaseList)) != NULL)
	{
		avl_dbase  *avdb = DLE_VAL(elem);
		TimestampTz current_time = GetCurrentTimestamp();
		TimestampTz next_wakeup;

		next_wakeup = avdb->adl_next_worker;
		TimestampDifference(current_time, next_wakeup, &secs, &usecs);
	}
	else
	{
		/* list is empty, sleep for whole autovacuum_naptime seconds  */
		secs = autovacuum_naptime;
		usecs = 0;
	}

	/*
	 * If the result is exactly zero, it means a database had an entry with
	 * time in the past.  Rebuild the list so that the databases are evenly
	 * distributed again, and recalculate the time to sleep.  This can happen
	 * if there are more tables needing vacuum than workers, and they all take
	 * longer to vacuum than autovacuum_naptime.
	 *
	 * We only recurse once.  rebuild_database_list should always return times
	 * in the future, but it seems best not to trust too much on that.
	 */
	if (secs == 0L && usecs == 0 && !recursing)
	{
		rebuild_database_list(InvalidOid);
		return launcher_determine_sleep(canlaunch, true);
	}

	/* 100ms is the smallest time we'll allow the launcher to sleep */
	if (secs <= 0L && usecs <= 100000)
	{
		secs = 0L;
		usecs = 100000;	/* 100 ms */
	}

	/*
	 * Widen before multiplying: where "long" is 32 bits, secs * 1000000
	 * overflows for any naptime above roughly 2147 seconds.
	 */
	return (uint64) secs * 1000000 + (uint64) usecs;
}
/*
 * Build an updated DatabaseList.  It must only contain databases that appear
 * in pgstats, and must be sorted by next_worker from highest to lowest,
 * distributed regularly across the next autovacuum_naptime interval.
 *
 * Receives the Oid of the database that made this list be generated (we call
 * this the "new" database, because when the database was already present on
 * the list, we expect that this function is not called at all).  The
 * preexisting list, if any, will be used to preserve the order of the
 * databases in the autovacuum_naptime period.  The new database is put at the
 * end of the interval.  The actual values are not saved, which should not be
 * much of a problem.
 *
 * Pass InvalidOid when there is no "new" database.
 *
 * On return, DatabaseList and DatabaseListCxt point at the freshly built
 * list and its memory context; the previous context, if any, is deleted.
 */
static void
rebuild_database_list(Oid newdb)
{
	/* smallest sleep the launcher allows itself, in milliseconds */
	const int	min_sleep_millis = 100;

	List	   *dblist;
	ListCell   *cell;
	MemoryContext newcxt;
	MemoryContext oldcxt;
	MemoryContext tmpcxt;
	HASHCTL		hctl;
	int			score;
	int			nelems;
	HTAB	   *dbhash;

	/* use fresh stats */
	pgstat_clear_snapshot();

	newcxt = AllocSetContextCreate(AutovacMemCxt,
								   "AV dblist",
								   ALLOCSET_DEFAULT_MINSIZE,
								   ALLOCSET_DEFAULT_INITSIZE,
								   ALLOCSET_DEFAULT_MAXSIZE);
	tmpcxt = AllocSetContextCreate(newcxt,
								   "tmp AV dblist",
								   ALLOCSET_DEFAULT_MINSIZE,
								   ALLOCSET_DEFAULT_INITSIZE,
								   ALLOCSET_DEFAULT_MAXSIZE);
	oldcxt = MemoryContextSwitchTo(tmpcxt);

	/*
	 * Implementing this is not as simple as it sounds, because we need to put
	 * the new database at the end of the list; next the databases that were
	 * already on the list, and finally (at the tail of the list) all the other
	 * databases that are not on the existing list.
	 *
	 * To do this, we build an empty hash table of scored databases.  We will
	 * start with the lowest score (zero) for the new database, then increasing
	 * scores for the databases in the existing list, in order, and lastly
	 * increasing scores for all databases gotten via get_database_list() that
	 * are not already on the hash.
	 *
	 * Then we will put all the hash elements into an array, sort the array by
	 * score, and finally put the array elements into the new doubly linked
	 * list.
	 */
	hctl.keysize = sizeof(Oid);
	hctl.entrysize = sizeof(avl_dbase);
	hctl.hash = oid_hash;
	hctl.hcxt = tmpcxt;
	dbhash = hash_create("db hash", 20, &hctl,	/* magic number here FIXME */
						 HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);

	/* start by inserting the new database */
	score = 0;
	if (OidIsValid(newdb))
	{
		avl_dbase  *db;
		PgStat_StatDBEntry *entry;

		/* only consider this database if it has a pgstat entry */
		entry = pgstat_fetch_stat_dbentry(newdb);
		if (entry != NULL)
		{
			/* we assume it isn't found because the hash was just created */
			db = hash_search(dbhash, &newdb, HASH_ENTER, NULL);

			/* hash_search already filled in the key */
			db->adl_score = score++;
			/* next_worker is filled in later */
		}
	}

	/* Now insert the databases from the existing list */
	if (DatabaseList != NULL)
	{
		Dlelem	   *elem;

		elem = DLGetHead(DatabaseList);
		while (elem != NULL)
		{
			avl_dbase  *avdb = DLE_VAL(elem);
			avl_dbase  *db;
			bool		found;
			PgStat_StatDBEntry *entry;

			/* advance first, so "continue" below cannot loop forever */
			elem = DLGetSucc(elem);

			/*
			 * skip databases with no stat entries -- in particular, this
			 * gets rid of dropped databases
			 */
			entry = pgstat_fetch_stat_dbentry(avdb->adl_datid);
			if (entry == NULL)
				continue;

			db = hash_search(dbhash, &(avdb->adl_datid), HASH_ENTER, &found);

			if (!found)
			{
				/* hash_search already filled in the key */
				db->adl_score = score++;
				/* next_worker is filled in later */
			}
		}
	}

	/* finally, insert all qualifying databases not previously inserted */
	dblist = get_database_list();
	foreach(cell, dblist)
	{
		avw_dbase  *avdb = lfirst(cell);
		avl_dbase  *db;
		bool		found;
		PgStat_StatDBEntry *entry;

		/* only consider databases with a pgstat entry */
		entry = pgstat_fetch_stat_dbentry(avdb->adw_datid);
		if (entry == NULL)
			continue;

		db = hash_search(dbhash, &(avdb->adw_datid), HASH_ENTER, &found);
		/* only update the score if the database was not already on the hash */
		if (!found)
		{
			/* hash_search already filled in the key */
			db->adl_score = score++;
			/* next_worker is filled in later */
		}
	}
	/* every newly entered hash entry consumed exactly one score value */
	nelems = score;

	/* from here on, the allocated memory belongs to the new list */
	MemoryContextSwitchTo(newcxt);
	DatabaseList = DLNewList();

	if (nelems > 0)
	{
		TimestampTz current_time;
		int			millis_increment;
		avl_dbase  *dbary;
		avl_dbase  *hashent;
		HASH_SEQ_STATUS seq;
		int			i;

		/* put all the hash elements into an array */
		dbary = palloc(nelems * sizeof(avl_dbase));

		i = 0;
		hash_seq_init(&seq, dbhash);
		while ((hashent = hash_seq_search(&seq)) != NULL)
			memcpy(&(dbary[i++]), hashent, sizeof(avl_dbase));

		/* sort the array */
		qsort(dbary, nelems, sizeof(avl_dbase), db_comparator);

		/*
		 * This is the time interval between databases in the schedule.  With
		 * many databases and a short naptime the division can truncate to
		 * zero (or to less than the launcher's minimum sleep), which would
		 * schedule every database at essentially the same instant and make
		 * launcher_determine_sleep see entries "in the past" right away.  In
		 * that case silently stretch the schedule so that consecutive entries
		 * are slightly more than one minimum sleep apart.
		 */
		millis_increment = 1000.0 * autovacuum_naptime / nelems;
		if (millis_increment <= min_sleep_millis)
			millis_increment = min_sleep_millis * 1.1;

		current_time = GetCurrentTimestamp();

		/*
		 * move the elements from the array into the dllist, setting the
		 * next_worker while walking the array
		 */
		for (i = 0; i < nelems; i++)
		{
			avl_dbase  *slot = &(dbary[i]);
			Dlelem	   *elem;

			current_time = TimestampTzPlusMilliseconds(current_time,
													   millis_increment);
			slot->adl_next_worker = current_time;

			elem = DLNewElem(slot);
			/* later elements should go closer to the head of the list */
			DLAddHead(DatabaseList, elem);
		}
	}

	/* all done, clean up memory */
	if (DatabaseListCxt != NULL)
		MemoryContextDelete(DatabaseListCxt);
	MemoryContextDelete(tmpcxt);
	DatabaseListCxt = newcxt;
	MemoryContextSwitchTo(oldcxt);
}
/* qsort comparator for avl_dbase, using adl_score */
static int
db_comparator(const void *a, const void *b)
{
if (((avl_dbase *) a)->adl_score == ((avl_dbase *) b)->adl_score)
return 0;
else
return (((avl_dbase *) a)->adl_score < ((avl_dbase *) b)->adl_score) ? 1 : -1;
}
/*
* do_start_worker
*
* Bare-bones procedure for starting an autovacuum worker from the launcher.
* It determines what database to work on, sets up shared memory stuff and
* signals postmaster to start the worker. It fails gracefully if invoked when
* autovacuum_workers are already active.
*
* Return value is the OID of the database that the worker is going to process,
* or InvalidOid if no worker was actually started.
*/
static Oid
do_start_worker(void)
{
List *dblist;
ListCell *cell;
TransactionId xidForceLimit;
bool for_xid_wrap;
avw_dbase *avdb;
TimestampTz current_time;
bool skipit = false;
/* return quickly when there are no free workers */
LWLockAcquire(AutovacuumLock, LW_SHARED);
if (AutoVacuumShmem->av_freeWorkers == INVALID_OFFSET)
{
LWLockRelease(AutovacuumLock);
return InvalidOid;
}
LWLockRelease(AutovacuumLock);
/* use fresh stats */
pgstat_clear_snapshot();
/* Get a list of databases */
dblist = get_database_list();
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
/*
* Determine the oldest datfrozenxid/relfrozenxid that we will allow
* to pass without forcing a vacuum. (This limit can be tightened for
* particular tables, but not loosened.)
*/
recentXid = ReadNewTransactionId();
xidForceLimit = recentXid - autovacuum_freeze_max_age;
/* ensure it's a "normal" XID, else TransactionIdPrecedes misbehaves */
if (xidForceLimit < FirstNormalTransactionId)
xidForceLimit -= FirstNormalTransactionId;
/*
* Choose a database to connect to. We pick the database that was least
* recently auto-vacuumed, or one that needs vacuuming to prevent Xid
* wraparound-related data loss. If any db at risk of wraparound is
* found, we pick the one with oldest datfrozenxid, independently of
* autovacuum times.
*
* Note that a database with no stats entry is not considered, except for
* Xid wraparound purposes. The theory is that if no one has ever
* connected to it since the stats were last initialized, it doesn't need
* vacuuming.
*
* XXX This could be improved if we had more info about whether it needs
* vacuuming before connecting to it. Perhaps look through the pgstats
* data for the database's tables? One idea is to keep track of the
* number of new and dead tuples per database in pgstats. However it
* isn't clear how to construct a metric that measures that and not cause
* starvation for less busy databases.
*/
avdb = NULL;
for_xid_wrap = false;
current_time = GetCurrentTimestamp();
foreach(cell, dblist)
{
avw_dbase *tmp = lfirst(cell);
Dlelem *elem;