diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index a3917aac7855bc3c71f7e6bb6b3c78078d981314..5ada5c8a1c2d092b9881cfee9925f4c1750f3a39 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1325,7 +1325,7 @@ include_dir 'conf.d' 40% of RAM to <varname>shared_buffers</varname> will work better than a smaller amount. Larger settings for <varname>shared_buffers</varname> usually require a corresponding increase in - <varname>checkpoint_segments</varname>, in order to spread out the + <varname>max_wal_size</varname>, in order to spread out the process of writing large quantities of new or changed data over a longer period of time. </para> @@ -2394,18 +2394,20 @@ include_dir 'conf.d' <title>Checkpoints</title> <variablelist> - <varlistentry id="guc-checkpoint-segments" xreflabel="checkpoint_segments"> - <term><varname>checkpoint_segments</varname> (<type>integer</type>) + <varlistentry id="guc-max-wal-size" xreflabel="max_wal_size"> + <term><varname>max_wal_size</varname> (<type>integer</type>)</term> <indexterm> - <primary><varname>checkpoint_segments</> configuration parameter</primary> + <primary><varname>max_wal_size</> configuration parameter</primary> </indexterm> - </term> <listitem> <para> - Maximum number of log file segments between automatic WAL - checkpoints (each segment is normally 16 megabytes). The default - is three segments. Increasing this parameter can increase the - amount of time needed for crash recovery. + Maximum size to let the WAL grow to between automatic WAL + checkpoints. This is a soft limit; WAL size can exceed + <varname>max_wal_size</> under special circumstances, like + under heavy load, a failing <varname>archive_command</>, or a high + <varname>wal_keep_segments</> setting. The default is 128 MB. + Increasing this parameter can increase the amount of time needed for + crash recovery. This parameter can only be set in the <filename>postgresql.conf</> file or on the server command line. 
</para> @@ -2458,7 +2460,7 @@ include_dir 'conf.d' Write a message to the server log if checkpoints caused by the filling of checkpoint segment files happen closer together than this many seconds (which suggests that - <varname>checkpoint_segments</> ought to be raised). The default is + <varname>max_wal_size</> ought to be raised). The default is 30 seconds (<literal>30s</>). Zero disables the warning. No warnings will be generated if <varname>checkpoint_timeout</varname> is less than <varname>checkpoint_warning</varname>. @@ -2468,6 +2470,24 @@ include_dir 'conf.d' </listitem> </varlistentry> + <varlistentry id="guc-min-wal-size" xreflabel="min_wal_size"> + <term><varname>min_wal_size</varname> (<type>integer</type>)</term> + <indexterm> + <primary><varname>min_wal_size</> configuration parameter</primary> + </indexterm> + <listitem> + <para> + As long as WAL disk usage stays below this setting, old WAL files are + always recycled for future use at a checkpoint, rather than removed. + This can be used to ensure that enough WAL space is reserved to + handle spikes in WAL usage, for example when running large batch + jobs. The default is 80 MB. + This parameter can only be set in the <filename>postgresql.conf</> + file or on the server command line. 
+ </para> + </listitem> + </varlistentry> + </variablelist> </sect2> <sect2 id="runtime-config-wal-archiving"> diff --git a/doc/src/sgml/perform.sgml b/doc/src/sgml/perform.sgml index 5a087fbe6a098f8bfb2ac55c99cf1e767a486409..c73580ed460c77806203dffd3ad7fb22b66597f7 100644 --- a/doc/src/sgml/perform.sgml +++ b/doc/src/sgml/perform.sgml @@ -1328,19 +1328,19 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; </para> </sect2> - <sect2 id="populate-checkpoint-segments"> - <title>Increase <varname>checkpoint_segments</varname></title> + <sect2 id="populate-max-wal-size"> + <title>Increase <varname>max_wal_size</varname></title> <para> - Temporarily increasing the <xref - linkend="guc-checkpoint-segments"> configuration variable can also + Temporarily increasing the <xref linkend="guc-max-wal-size"> + configuration variable can also make large data loads faster. This is because loading a large amount of data into <productname>PostgreSQL</productname> will cause checkpoints to occur more often than the normal checkpoint frequency (specified by the <varname>checkpoint_timeout</varname> configuration variable). Whenever a checkpoint occurs, all dirty pages must be flushed to disk. By increasing - <varname>checkpoint_segments</varname> temporarily during bulk + <varname>max_wal_size</varname> temporarily during bulk data loads, the number of checkpoints that are required can be reduced. </para> @@ -1445,7 +1445,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; <para> Set appropriate (i.e., larger than normal) values for <varname>maintenance_work_mem</varname> and - <varname>checkpoint_segments</varname>. + <varname>max_wal_size</varname>. </para> </listitem> <listitem> @@ -1512,7 +1512,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; So when loading a data-only dump, it is up to you to drop and recreate indexes and foreign keys if you wish to use those techniques. 
- It's still useful to increase <varname>checkpoint_segments</varname> + It's still useful to increase <varname>max_wal_size</varname> while loading the data, but don't bother increasing <varname>maintenance_work_mem</varname>; rather, you'd do that while manually recreating indexes and foreign keys afterwards. @@ -1577,7 +1577,7 @@ SELECT * FROM x, y, a, b, c WHERE something AND somethingelse; <listitem> <para> - Increase <xref linkend="guc-checkpoint-segments"> and <xref + Increase <xref linkend="guc-max-wal-size"> and <xref linkend="guc-checkpoint-timeout"> ; this reduces the frequency of checkpoints, but increases the storage requirements of <filename>/pg_xlog</>. diff --git a/doc/src/sgml/wal.sgml b/doc/src/sgml/wal.sgml index 1254c03f80e3315198803575ea768f6eb413d8e0..b57749fdbc3f79947cf50b0834c42efdc7c41e91 100644 --- a/doc/src/sgml/wal.sgml +++ b/doc/src/sgml/wal.sgml @@ -472,9 +472,10 @@ <para> The server's checkpointer process automatically performs a checkpoint every so often. A checkpoint is begun every <xref - linkend="guc-checkpoint-segments"> log segments, or every <xref - linkend="guc-checkpoint-timeout"> seconds, whichever comes first. - The default settings are 3 segments and 300 seconds (5 minutes), respectively. + linkend="guc-checkpoint-timeout"> seconds, or if + <xref linkend="guc-max-wal-size"> is about to be exceeded, + whichever comes first. + The default settings are 5 minutes and 128 MB, respectively. If no WAL has been written since the previous checkpoint, new checkpoints will be skipped even if <varname>checkpoint_timeout</> has passed. (If WAL archiving is being used and you want to put a lower limit on how @@ -486,8 +487,8 @@ </para> <para> - Reducing <varname>checkpoint_segments</varname> and/or - <varname>checkpoint_timeout</varname> causes checkpoints to occur + Reducing <varname>checkpoint_timeout</varname> and/or + <varname>max_wal_size</varname> causes checkpoints to occur more often. 
This allows faster after-crash recovery, since less work will need to be redone. However, one must balance this against the increased cost of flushing dirty data pages more often. If @@ -510,11 +511,11 @@ parameter. If checkpoints happen closer together than <varname>checkpoint_warning</> seconds, a message will be output to the server log recommending increasing - <varname>checkpoint_segments</varname>. Occasional appearance of such + <varname>max_wal_size</varname>. Occasional appearance of such a message is not cause for alarm, but if it appears often then the checkpoint control parameters should be increased. Bulk operations such as large <command>COPY</> transfers might cause a number of such warnings - to appear if you have not set <varname>checkpoint_segments</> high + to appear if you have not set <varname>max_wal_size</> high enough. </para> @@ -525,10 +526,10 @@ <xref linkend="guc-checkpoint-completion-target">, which is given as a fraction of the checkpoint interval. The I/O rate is adjusted so that the checkpoint finishes when the - given fraction of <varname>checkpoint_segments</varname> WAL segments - have been consumed since checkpoint start, or the given fraction of - <varname>checkpoint_timeout</varname> seconds have elapsed, - whichever is sooner. With the default value of 0.5, + given fraction of + <varname>checkpoint_timeout</varname> seconds have elapsed, or before + <varname>max_wal_size</varname> is exceeded, whichever is sooner. + With the default value of 0.5, <productname>PostgreSQL</> can be expected to complete each checkpoint in about half the time before the next checkpoint starts. 
On a system that's very close to maximum I/O throughput during normal operation, @@ -545,18 +546,35 @@ </para> <para> - There will always be at least one WAL segment file, and will normally - not be more than (2 + <varname>checkpoint_completion_target</varname>) * <varname>checkpoint_segments</varname> + 1 - or <varname>checkpoint_segments</> + <xref linkend="guc-wal-keep-segments"> + 1 - files. Each segment file is normally 16 MB (though this size can be - altered when building the server). You can use this to estimate space - requirements for <acronym>WAL</acronym>. - Ordinarily, when old log segment files are no longer needed, they - are recycled (that is, renamed to become future segments in the numbered - sequence). If, due to a short-term peak of log output rate, there - are more than 3 * <varname>checkpoint_segments</varname> + 1 - segment files, the unneeded segment files will be deleted instead - of recycled until the system gets back under this limit. + The number of WAL segment files in the <filename>pg_xlog</> directory depends on + <varname>min_wal_size</>, <varname>max_wal_size</> and + the amount of WAL generated in previous checkpoint cycles. When old log + segment files are no longer needed, they are removed or recycled (that is, + renamed to become future segments in the numbered sequence). If, due to a + short-term peak of log output rate, <varname>max_wal_size</> is + exceeded, the unneeded segment files will be removed until the system + gets back under this limit. Below that limit, the system recycles enough + WAL files to cover the estimated need until the next checkpoint, and + removes the rest. The estimate is based on a moving average of the number + of WAL files used in previous checkpoint cycles. The moving average + is increased immediately if the actual usage exceeds the estimate, so it + accommodates peak usage rather than average usage to some extent.
+ <varname>min_wal_size</> puts a minimum on the number of WAL files + recycled for future usage; that much WAL is always recycled for future use, + even if the system is idle and the WAL usage estimate suggests that little + WAL is needed. + </para> + + <para> + Independently of <varname>max_wal_size</varname>, + the <xref linkend="guc-wal-keep-segments"> + 1 most recent WAL files are + kept at all times. Also, if WAL archiving is used, old segments cannot be + removed or recycled until they are archived. If WAL archiving cannot keep up + with the pace at which WAL is generated, or if <varname>archive_command</varname> + fails repeatedly, old WAL files will accumulate in <filename>pg_xlog</> + until the situation is resolved. A slow or failed standby server that + uses a replication slot will have the same effect (see + <xref linkend="streaming-replication-slots">). </para> <para> @@ -571,9 +589,8 @@ master because restartpoints can only be performed at checkpoint records. A restartpoint is triggered when a checkpoint record is reached if at least <varname>checkpoint_timeout</> seconds have passed since the last - restartpoint. In standby mode, a restartpoint is also triggered if at - least <varname>checkpoint_segments</> log segments have been replayed - since the last restartpoint. + restartpoint, or if WAL size is about to exceed + <varname>max_wal_size</>.
</para> <para> diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index f68f82b255c0608d7a7f958865a2abe3d179911c..a28155f977d67bac2fe8fe05461c730ea93cbca1 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -79,7 +79,8 @@ extern uint32 bootstrap_data_checksum_version; /* User-settable parameters */ -int CheckPointSegments = 3; +int max_wal_size = 8; /* 128 MB */ +int min_wal_size = 5; /* 80 MB */ int wal_keep_segments = 0; int XLOGbuffers = -1; int XLogArchiveTimeout = 0; @@ -107,18 +108,14 @@ bool XLOG_DEBUG = false; #define NUM_XLOGINSERT_LOCKS 8 /* - * XLOGfileslop is the maximum number of preallocated future XLOG segments. - * When we are done with an old XLOG segment file, we will recycle it as a - * future XLOG segment as long as there aren't already XLOGfileslop future - * segments; else we'll delete it. This could be made a separate GUC - * variable, but at present I think it's sufficient to hardwire it as - * 2*CheckPointSegments+1. Under normal conditions, a checkpoint will free - * no more than 2*CheckPointSegments log segments, and we want to recycle all - * of them; the +1 allows boundary cases to happen without wasting a - * delete/create-segment cycle. + * Max distance from last checkpoint, before triggering a new xlog-based + * checkpoint. 
*/ -#define XLOGfileslop (2*CheckPointSegments + 1) +int CheckPointSegments; +/* Estimated distance between checkpoints, in bytes */ +static double CheckPointDistanceEstimate = 0; +static double PrevCheckPointDistance = 0; /* * GUC support @@ -779,7 +776,7 @@ static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic); static bool XLogCheckpointNeeded(XLogSegNo new_segno); static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible); static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, - bool find_free, int *max_advance, + bool find_free, XLogSegNo max_segno, bool use_lock); static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli, int source, bool notexistOk); @@ -792,7 +789,7 @@ static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess, static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr); static void XLogFileClose(void); static void PreallocXlogFiles(XLogRecPtr endptr); -static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr); +static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr); static void UpdateLastRemovedPtr(char *filename); static void ValidateXLOGDirectoryStructure(void); static void CleanupBackupHistory(void); @@ -1958,6 +1955,104 @@ AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic) #endif } +/* + * Calculate CheckPointSegments based on max_wal_size and + * checkpoint_completion_target. + */ +static void +CalculateCheckpointSegments(void) +{ + double target; + + /*------- + * Calculate the distance at which to trigger a checkpoint, to avoid + * exceeding max_wal_size. This is based on two assumptions: + * + * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint. + * b) during checkpoint, we consume checkpoint_completion_target * + * number of segments consumed between checkpoints. 
+ *------- + */ + target = (double ) max_wal_size / (2.0 + CheckPointCompletionTarget); + + /* round down */ + CheckPointSegments = (int) target; + + if (CheckPointSegments < 1) + CheckPointSegments = 1; +} + +void +assign_max_wal_size(int newval, void *extra) +{ + max_wal_size = newval; + CalculateCheckpointSegments(); +} + +void +assign_checkpoint_completion_target(double newval, void *extra) +{ + CheckPointCompletionTarget = newval; + CalculateCheckpointSegments(); +} + +/* + * At a checkpoint, how many WAL segments to recycle as preallocated future + * XLOG segments? Returns the highest segment that should be preallocated. + */ +static XLogSegNo +XLOGfileslop(XLogRecPtr PriorRedoPtr) +{ + XLogSegNo minSegNo; + XLogSegNo maxSegNo; + double distance; + XLogSegNo recycleSegNo; + + /* + * Calculate the segment numbers that min_wal_size and max_wal_size + * correspond to. Always recycle enough segments to meet the minimum, and + * remove enough segments to stay below the maximum. + */ + minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1; + maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1; + + /* + * Between those limits, recycle enough segments to get us through to the + * estimated end of next checkpoint. + * + * To estimate where the next checkpoint will finish, assume that the + * system runs steadily consuming CheckPointDistanceEstimate + * bytes between every checkpoint. + * + * The reason this calculation is done from the prior checkpoint, not the + * one that just finished, is that this behaves better if some checkpoint + * cycles are abnormally short, like if you perform a manual checkpoint + * right after a timed one. The manual checkpoint will make almost a full + * cycle's worth of WAL segments available for recycling, because the + * segments from the prior's prior, fully-sized checkpoint cycle are no + * longer needed. 
However, the next checkpoint will make only few segments + * available for recycling, the ones generated between the timed + * checkpoint and the manual one right after that. If at the manual + * checkpoint we only retained enough segments to get us to the next timed + * one, and removed the rest, then at the next checkpoint we would not + * have enough segments around for recycling, to get us to the checkpoint + * after that. Basing the calculations on the distance from the prior redo + * pointer largely fixes that problem. + */ + distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate; + /* add 10% for good measure. */ + distance *= 1.10; + + recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE); + + if (recycleSegNo < minSegNo) + recycleSegNo = minSegNo; + if (recycleSegNo > maxSegNo) + recycleSegNo = maxSegNo; + + return recycleSegNo; +} + /* * Check whether we've consumed enough xlog space that a checkpoint is needed. * @@ -2765,7 +2860,7 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) char zbuffer_raw[XLOG_BLCKSZ + MAXIMUM_ALIGNOF]; char *zbuffer; XLogSegNo installed_segno; - int max_advance; + XLogSegNo max_segno; int fd; int nbytes; @@ -2868,9 +2963,19 @@ XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock) * pre-create a future log segment. */ installed_segno = logsegno; - max_advance = XLOGfileslop; + + /* + * XXX: What should we use as max_segno? We used to use XLOGfileslop when + * that was a constant, but that was always a bit dubious: normally, at a + * checkpoint, XLOGfileslop was the offset from the checkpoint record, + * but here, it was the offset from the insert location. We can't do the + * normal XLOGfileslop calculation here because we don't have access to + * the prior checkpoint's redo location. So somewhat arbitrarily, just + * use CheckPointSegments. 
+ */ + max_segno = logsegno + CheckPointSegments; if (!InstallXLogFileSegment(&installed_segno, tmppath, - *use_existent, &max_advance, + *use_existent, max_segno, use_lock)) { /* @@ -3011,7 +3116,7 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, /* * Now move the segment into place with its final name. */ - if (!InstallXLogFileSegment(&destsegno, tmppath, false, NULL, false)) + if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false)) elog(ERROR, "InstallXLogFileSegment should not have failed"); } @@ -3031,22 +3136,21 @@ XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno, * number at or after the passed numbers. If FALSE, install the new segment * exactly where specified, deleting any existing segment file there. * - * *max_advance: maximum number of segno slots to advance past the starting - * point. Fail if no free slot is found in this range. On return, reduced - * by the number of slots skipped over. (Irrelevant, and may be NULL, - * when find_free is FALSE.) + * max_segno: maximum segment number to install the new file as. Fail if no + * free slot is found between *segno and max_segno. (Ignored when find_free + * is FALSE.) * * use_lock: if TRUE, acquire ControlFileLock while moving file into * place. This should be TRUE except during bootstrap log creation. The * caller must *not* hold the lock at call. * * Returns TRUE if the file was installed successfully. FALSE indicates that - * max_advance limit was exceeded, or an error occurred while renaming the + * max_segno limit was exceeded, or an error occurred while renaming the * file into place. 
*/ static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, - bool find_free, int *max_advance, + bool find_free, XLogSegNo max_segno, bool use_lock) { char path[MAXPGPATH]; @@ -3070,7 +3174,7 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, /* Find a free slot to put it in */ while (stat(path, &stat_buf) == 0) { - if (*max_advance <= 0) + if ((*segno) >= max_segno) { /* Failed to find a free slot within specified range */ if (use_lock) @@ -3078,7 +3182,6 @@ InstallXLogFileSegment(XLogSegNo *segno, char *tmppath, return false; } (*segno)++; - (*max_advance)--; XLogFilePath(path, ThisTimeLineID, *segno); } } @@ -3426,14 +3529,15 @@ UpdateLastRemovedPtr(char *filename) /* * Recycle or remove all log files older or equal to passed segno * - * endptr is current (or recent) end of xlog; this is used to determine + * endptr is current (or recent) end of xlog, and PriorRedoPtr is the + * redo pointer of the previous checkpoint. These are used to determine * whether we want to recycle rather than delete no-longer-wanted log files. */ static void -RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr) +RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr) { XLogSegNo endlogSegNo; - int max_advance; + XLogSegNo recycleSegNo; DIR *xldir; struct dirent *xlde; char lastoff[MAXFNAMELEN]; @@ -3445,11 +3549,10 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr) struct stat statbuf; /* - * Initialize info about where to try to recycle to. We allow recycling - * segments up to XLOGfileslop segments beyond the current XLOG location. + * Initialize info about where to try to recycle to. */ XLByteToPrevSeg(endptr, endlogSegNo); - max_advance = XLOGfileslop; + recycleSegNo = XLOGfileslop(PriorRedoPtr); xldir = AllocateDir(XLOGDIR); if (xldir == NULL) @@ -3498,20 +3601,17 @@ RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr endptr) * for example can create symbolic links pointing to a * separate archive directory.
*/ - if (lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && + if (endlogSegNo <= recycleSegNo && + lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) && InstallXLogFileSegment(&endlogSegNo, path, - true, &max_advance, true)) + true, recycleSegNo, true)) { ereport(DEBUG2, (errmsg("recycled transaction log file \"%s\"", xlde->d_name))); CheckpointStats.ckpt_segs_recycled++; /* Needn't recheck that slot on future iterations */ - if (max_advance > 0) - { - endlogSegNo++; - max_advance--; - } + endlogSegNo++; } else { @@ -7594,7 +7694,8 @@ LogCheckpointEnd(bool restartpoint) elog(LOG, "%s complete: wrote %d buffers (%.1f%%); " "%d transaction log file(s) added, %d removed, %d recycled; " "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; " - "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s", + "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; " + "distance=%d kB, estimate=%d kB", restartpoint ? "restartpoint" : "checkpoint", CheckpointStats.ckpt_bufs_written, (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers, @@ -7606,7 +7707,48 @@ LogCheckpointEnd(bool restartpoint) total_secs, total_usecs / 1000, CheckpointStats.ckpt_sync_rels, longest_secs, longest_usecs / 1000, - average_secs, average_usecs / 1000); + average_secs, average_usecs / 1000, + (int) (PrevCheckPointDistance / 1024.0), + (int) (CheckPointDistanceEstimate / 1024.0)); +} + +/* + * Update the estimate of distance between checkpoints. + * + * The estimate is used to calculate the number of WAL segments to keep + * preallocated, see XLOGfileslop(). + */ +static void +UpdateCheckPointDistanceEstimate(uint64 nbytes) +{ + /* + * To estimate the number of segments consumed between checkpoints, keep + * a moving average of the amount of WAL generated in previous checkpoint + * cycles. However, if the load is bursty, with quiet periods and busy + * periods, we want to cater for the peak load.
So instead of a plain + * moving average, let the average decline slowly if the previous cycle + * used less WAL than estimated, but bump it up immediately if it used + * more. + * + * When checkpoints are triggered by max_wal_size, this should converge to + * CheckPointSegments * XLOG_SEG_SIZE. + * + * Note: This doesn't pay any attention to what caused the checkpoint. + * Checkpoints triggered manually with CHECKPOINT command, or by e.g. + * starting a base backup, are counted the same as those created + * automatically. The slow-decline will largely mask them out, if they are + * not frequent. If they are frequent, it seems reasonable to count them + * in as any others; if you issue a manual checkpoint every 5 minutes and + * never let a timed checkpoint happen, it makes sense to base the + * preallocation on that 5 minute interval rather than whatever + * checkpoint_timeout is set to. + */ + PrevCheckPointDistance = nbytes; + if (CheckPointDistanceEstimate < nbytes) + CheckPointDistanceEstimate = nbytes; + else + CheckPointDistanceEstimate = + (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes); } /* @@ -7646,7 +7788,7 @@ CreateCheckPoint(int flags) XLogRecPtr recptr; XLogCtlInsert *Insert = &XLogCtl->Insert; uint32 freespace; - XLogSegNo _logSegNo; + XLogRecPtr PriorRedoPtr; XLogRecPtr curInsert; VirtualTransactionId *vxids; int nvxids; @@ -7961,10 +8103,10 @@ CreateCheckPoint(int flags) (errmsg("concurrent transaction log activity while database system is shutting down"))); /* - * Select point at which we can truncate the log, which we base on the - * prior checkpoint's earliest info. + * Remember the prior checkpoint's redo pointer, used later to determine + * the point where the log can be truncated. */ - XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo); + PriorRedoPtr = ControlFile->checkPointCopy.redo; /* * Update the control file.
@@ -8019,11 +8161,17 @@ CreateCheckPoint(int flags) * Delete old log files (those no longer needed even for previous * checkpoint or the standbys in XLOG streaming). */ - if (_logSegNo) + if (PriorRedoPtr != InvalidXLogRecPtr) { + XLogSegNo _logSegNo; + + /* Update the average distance between checkpoints. */ + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + XLByteToSeg(PriorRedoPtr, _logSegNo); KeepLogSeg(recptr, &_logSegNo); _logSegNo--; - RemoveOldXlogFiles(_logSegNo, recptr); + RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr); } /* @@ -8191,7 +8339,7 @@ CreateRestartPoint(int flags) { XLogRecPtr lastCheckPointRecPtr; CheckPoint lastCheckPoint; - XLogSegNo _logSegNo; + XLogRecPtr PriorRedoPtr; TimestampTz xtime; /* @@ -8256,14 +8404,14 @@ CreateRestartPoint(int flags) /* * Update the shared RedoRecPtr so that the startup process can calculate * the number of segments replayed since last restartpoint, and request a - * restartpoint if it exceeds checkpoint_segments. + * restartpoint if it exceeds CheckPointSegments. * * Like in CreateCheckPoint(), hold off insertions to update it, although * during recovery this is just pro forma, because no WAL insertions are * happening. */ WALInsertLockAcquireExclusive(); - XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo; + RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo; WALInsertLockRelease(); /* Also update the info_lck-protected copy */ @@ -8287,10 +8435,10 @@ CreateRestartPoint(int flags) CheckPointGuts(lastCheckPoint.redo, flags); /* - * Select point at which we can truncate the xlog, which we base on the - * prior checkpoint's earliest info. + * Remember the prior checkpoint's redo pointer, used later to determine + * the point at which we can truncate the log. */ - XLByteToSeg(ControlFile->checkPointCopy.redo, _logSegNo); + PriorRedoPtr = ControlFile->checkPointCopy.redo; /* * Update pg_control, using current time. 
Check that it still shows @@ -8317,12 +8465,18 @@ CreateRestartPoint(int flags) * checkpoint/restartpoint) to prevent the disk holding the xlog from * growing full. */ - if (_logSegNo) + if (PriorRedoPtr != InvalidXLogRecPtr) { XLogRecPtr receivePtr; XLogRecPtr replayPtr; TimeLineID replayTLI; XLogRecPtr endptr; + XLogSegNo _logSegNo; + + /* Update the average distance between checkpoints/restartpoints. */ + UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr); + + XLByteToSeg(PriorRedoPtr, _logSegNo); /* * Get the current end of xlog replayed or received, whichever is @@ -8351,7 +8505,7 @@ CreateRestartPoint(int flags) if (RecoveryInProgress()) ThisTimeLineID = replayTLI; - RemoveOldXlogFiles(_logSegNo, endptr); + RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr); /* * Make more log segments if needed. (Do this after recycling old log diff --git a/src/backend/postmaster/checkpointer.c b/src/backend/postmaster/checkpointer.c index cfad08d5528e992191c89139278ed4c159dad3ef..0dce6a8ffaa3f23a88bc55b22c0cbc98fff13268 100644 --- a/src/backend/postmaster/checkpointer.c +++ b/src/backend/postmaster/checkpointer.c @@ -471,7 +471,7 @@ CheckpointerMain(void) "checkpoints are occurring too frequently (%d seconds apart)", elapsed_secs, elapsed_secs), - errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); + errhint("Consider increasing the configuration parameter \"max_wal_size\"."))); /* * Initialize checkpointer-private variables used during @@ -749,11 +749,11 @@ IsCheckpointOnSchedule(double progress) return false; /* - * Check progress against WAL segments written and checkpoint_segments. + * Check progress against WAL segments written and CheckPointSegments. * * We compare the current WAL insert location against the location * computed before calling CreateCheckPoint. 
The code in XLogInsert that - * actually triggers a checkpoint when checkpoint_segments is exceeded + * actually triggers a checkpoint when CheckPointSegments is exceeded * compares against RedoRecptr, so this is not completely accurate. * However, it's good enough for our purposes, we're only calculating an * estimate anyway. diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 2499bee7399c50dc7c5d129bd10258e84b7a97b7..d84dba7732ee78fdbe4ac96af0cef07ca2efcc4b 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -685,6 +685,9 @@ typedef struct #if XLOG_BLCKSZ < 1024 || XLOG_BLCKSZ > (1024*1024) #error XLOG_BLCKSZ must be between 1KB and 1MB #endif +#if XLOG_SEG_SIZE < (1024*1024) || XLOG_SEG_SIZE > (1024*1024*1024) +#error XLOG_SEG_SIZE must be between 1MB and 1GB +#endif static const char *memory_units_hint = gettext_noop("Valid units for this parameter are \"kB\", \"MB\", \"GB\", and \"TB\"."); @@ -706,6 +709,11 @@ static const unit_conversion memory_unit_conversion_table[] = { "MB", GUC_UNIT_XBLOCKS, 1024 / (XLOG_BLCKSZ / 1024) }, { "kB", GUC_UNIT_XBLOCKS, -(XLOG_BLCKSZ / 1024) }, + { "TB", GUC_UNIT_XSEGS, (1024*1024*1024) / (XLOG_SEG_SIZE / 1024) }, + { "GB", GUC_UNIT_XSEGS, (1024*1024) / (XLOG_SEG_SIZE / 1024) }, + { "MB", GUC_UNIT_XSEGS, -(XLOG_SEG_SIZE / (1024 * 1024)) }, + { "kB", GUC_UNIT_XSEGS, -(XLOG_SEG_SIZE / 1024) }, + { "" } /* end of table marker */ }; @@ -2146,15 +2154,27 @@ static struct config_int ConfigureNamesInt[] = }, { - {"checkpoint_segments", PGC_SIGHUP, WAL_CHECKPOINTS, - gettext_noop("Sets the maximum distance in log segments between automatic WAL checkpoints."), - NULL + {"min_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS, + gettext_noop("Sets the minimum size to shrink the WAL to."), + NULL, + GUC_UNIT_XSEGS }, - &CheckPointSegments, - 3, 1, INT_MAX, + &min_wal_size, + 5, 2, INT_MAX, NULL, NULL, NULL }, + { + {"max_wal_size", PGC_SIGHUP, WAL_CHECKPOINTS, + gettext_noop("Sets the WAL size that
triggers a checkpoint."), + NULL, + GUC_UNIT_XSEGS + }, + &max_wal_size, + 8, 2, INT_MAX, + NULL, assign_max_wal_size, NULL + }, + { {"checkpoint_timeout", PGC_SIGHUP, WAL_CHECKPOINTS, gettext_noop("Sets the maximum time between automatic WAL checkpoints."), diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index 29d8485964d696cccc0d45e63c1644485973b1f1..f8f9ce18eca30803f16e5b75ac915436e0a91a16 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -197,8 +197,9 @@ # - Checkpoints - -#checkpoint_segments = 3 # in logfile segments, min 1, 16MB each #checkpoint_timeout = 5min # range 30s-1h +#max_wal_size = 128MB # in logfile segments +#min_wal_size = 80MB #checkpoint_completion_target = 0.5 # checkpoint target duration, 0.0 - 1.0 #checkpoint_warning = 30s # 0 disables diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index be27a85648665ba7aaa9f4d61c516feaf7dcb675..0e8e5873cc20c57323e866e6b2e57ab41703fa65 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -89,7 +89,8 @@ extern XLogRecPtr XactLastRecEnd; extern bool reachedConsistency; /* these variables are GUC parameters related to XLOG */ -extern int CheckPointSegments; +extern int min_wal_size; +extern int max_wal_size; extern int wal_keep_segments; extern int XLOGbuffers; extern int XLogArchiveTimeout; @@ -101,6 +102,8 @@ extern bool fullPageWrites; extern bool wal_log_hints; extern bool log_checkpoints; +extern int CheckPointSegments; + /* WAL levels */ typedef enum WalLevel { @@ -246,6 +249,9 @@ extern bool CheckPromoteSignal(void); extern void WakeupRecovery(void); extern void SetWalWriterSleeping(bool sleeping); +extern void assign_max_wal_size(int newval, void *extra); +extern void assign_checkpoint_completion_target(double newval, void *extra); + /* * Starting/stopping a base backup */ diff --git a/src/include/utils/guc.h b/src/include/utils/guc.h index 
22d3a6faea40b18993560f7572b2f6e0827650c9..d3100d1781ff0f49ca7da081062927be730c59e9 100644 --- a/src/include/utils/guc.h +++ b/src/include/utils/guc.h @@ -207,6 +207,7 @@ typedef enum #define GUC_UNIT_KB 0x1000 /* value is in kilobytes */ #define GUC_UNIT_BLOCKS 0x2000 /* value is in blocks */ #define GUC_UNIT_XBLOCKS 0x3000 /* value is in xlog blocks */ +#define GUC_UNIT_XSEGS 0x4000 /* value is in xlog segments */ #define GUC_UNIT_MEMORY 0xF000 /* mask for KB, BLOCKS, XBLOCKS */ #define GUC_UNIT_MS 0x10000 /* value is in milliseconds */