diff --git a/contrib/start-scripts/linux b/contrib/start-scripts/linux index b950cf512c3c519c1813f69af98a7a0ce7422769..bab8b0efc677e8815b85e06f35f7951811b059db 100644 --- a/contrib/start-scripts/linux +++ b/contrib/start-scripts/linux @@ -43,14 +43,17 @@ PGLOG="$PGDATA/serverlog" # It's often a good idea to protect the postmaster from being killed by the # OOM killer (which will tend to preferentially kill the postmaster because # of the way it accounts for shared memory). Setting the OOM_SCORE_ADJ value -# to -1000 will disable OOM kill altogether. If you enable this, you probably -# want to compile PostgreSQL with "-DLINUX_OOM_SCORE_ADJ=0", so that -# individual backends can still be killed by the OOM killer. +# to -1000 will disable OOM kill altogether, which is a good thing for the +# postmaster, but not so much for individual backends. If you enable this, +# also uncomment the DAEMON_ENV line, which will instruct backends to set +# their OOM adjustments back to the default setting of zero. #OOM_SCORE_ADJ=-1000 +#DAEMON_ENV="PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj" # Older Linux kernels may not have /proc/self/oom_score_adj, but instead # /proc/self/oom_adj, which works similarly except the disable value is -17. -# For such a system, enable this and compile with "-DLINUX_OOM_ADJ=0". +# For such a system, uncomment these two lines instead. #OOM_ADJ=-17 +#DAEMON_ENV="PG_OOM_ADJUST_FILE=/proc/self/oom_adj" ## STOP EDITING HERE @@ -84,7 +87,7 @@ case $1 in echo -n "Starting PostgreSQL: " test x"$OOM_SCORE_ADJ" != x && echo "$OOM_SCORE_ADJ" > /proc/self/oom_score_adj test x"$OOM_ADJ" != x && echo "$OOM_ADJ" > /proc/self/oom_adj - su - $PGUSER -c "$DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1 + su - $PGUSER -c "$DAEMON_ENV $DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1 echo "ok" ;; stop) @@ -97,7 +100,7 @@ case $1 in su - $PGUSER -c "$PGCTL stop -D '$PGDATA' -s -m fast -w" test x"$OOM_SCORE_ADJ" != x && echo "$OOM_SCORE_ADJ" > /proc/self/oom_score_adj test x"$OOM_ADJ" != x && echo "$OOM_ADJ" > /proc/self/oom_adj - su - $PGUSER -c "$DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1 + su - $PGUSER -c "$DAEMON_ENV $DAEMON -D '$PGDATA' &" >>$PGLOG 2>&1 echo "ok" ;; reload) diff --git a/doc/src/sgml/runtime.sgml b/doc/src/sgml/runtime.sgml index 9fadef5c9da4e2db845fb0cf1d845a01b0b22e70..a2081c4b13a538b4b6dd7d47bf8a0008d95f1d99 100644 --- a/doc/src/sgml/runtime.sgml +++ b/doc/src/sgml/runtime.sgml @@ -1275,7 +1275,7 @@ sysctl -w vm.overcommit_memory=2 <para> Another approach, which can be used with or without altering <varname>vm.overcommit_memory</>, is to set the process-specific - <varname>oom_score_adj</> value for the postmaster process to + <firstterm>OOM score adjustment</> value for the postmaster process to <literal>-1000</>, thereby guaranteeing it will not be targeted by the OOM killer. The simplest way to do this is to execute <programlisting> @@ -1284,20 +1284,28 @@ echo -1000 > /proc/self/oom_score_adj in the postmaster's startup script just before invoking the postmaster. Note that this action must be done as root, or it will have no effect; so a root-owned startup script is the easiest place to do it. If you - do this, you may also wish to build <productname>PostgreSQL</> - with <literal>-DLINUX_OOM_SCORE_ADJ=0</> added to <varname>CPPFLAGS</>. - That will cause postmaster child processes to run with the normal - <varname>oom_score_adj</> value of zero, so that the OOM killer can still - target them at need. + do this, you should also set these environment variables in the startup + script before invoking the postmaster: +<programlisting> +export PG_OOM_ADJUST_FILE=/proc/self/oom_score_adj +export PG_OOM_ADJUST_VALUE=0 +</programlisting> + These settings will cause postmaster child processes to run with the + normal OOM score adjustment of zero, so that the OOM killer can still + target them at need. You could use some other value for + <envar>PG_OOM_ADJUST_VALUE</> if you want the child processes to run + with some other OOM score adjustment. (<envar>PG_OOM_ADJUST_VALUE</> + can also be omitted, in which case it defaults to zero.) If you do not + set <envar>PG_OOM_ADJUST_FILE</>, the child processes will run with the + same OOM score adjustment as the postmaster, which is unwise since the + whole point is to ensure that the postmaster has a preferential setting. </para> <para> Older Linux kernels do not offer <filename>/proc/self/oom_score_adj</>, but may have a previous version of the same functionality called <filename>/proc/self/oom_adj</>. This works the same except the disable - value is <literal>-17</> not <literal>-1000</>. The corresponding - build flag for <productname>PostgreSQL</> is - <literal>-DLINUX_OOM_ADJ=0</>. + value is <literal>-17</> not <literal>-1000</>. </para> <note> diff --git a/src/backend/postmaster/fork_process.c b/src/backend/postmaster/fork_process.c index f6df2de8706afeca537e2b15b7398864ca6b43e5..5e5bd35e7e3c78149498ebd11bfb84d9e7483d21 100644 --- a/src/backend/postmaster/fork_process.c +++ b/src/backend/postmaster/fork_process.c @@ -31,6 +31,7 @@ pid_t fork_process(void) { pid_t result; + const char *oomfilename; #ifdef LINUX_PROFILE struct itimerval prof_itimer; @@ -71,62 +72,40 @@ fork_process(void) * process sizes *including shared memory*. (This is unbelievably * stupid, but the kernel hackers seem uninterested in improving it.) * Therefore it's often a good idea to protect the postmaster by - * setting its oom_score_adj value negative (which has to be done in a - * root-owned startup script). If you just do that much, all child - * processes will also be protected against OOM kill, which might not - * be desirable. You can then choose to build with - * LINUX_OOM_SCORE_ADJ #defined to 0, or to some other value that you - * want child processes to adopt here. + * setting its OOM score adjustment negative (which has to be done in + * a root-owned startup script). Since the adjustment is inherited by + * child processes, this would ordinarily mean that all the + * postmaster's children are equally protected against OOM kill, which + * is not such a good idea. So we provide this code to allow the + * children to change their OOM score adjustments again. Both the + * file name to write to and the value to write are controlled by + * environment variables, which can be set by the same startup script + * that did the original adjustment. */ -#ifdef LINUX_OOM_SCORE_ADJ - { - /* - * Use open() not stdio, to ensure we control the open flags. Some - * Linux security environments reject anything but O_WRONLY. - */ - int fd = open("/proc/self/oom_score_adj", O_WRONLY, 0); - - /* We ignore all errors */ - if (fd >= 0) - { - char buf[16]; - int rc; + oomfilename = getenv("PG_OOM_ADJUST_FILE"); - snprintf(buf, sizeof(buf), "%d\n", LINUX_OOM_SCORE_ADJ); - rc = write(fd, buf, strlen(buf)); - (void) rc; - close(fd); - } - } -#endif /* LINUX_OOM_SCORE_ADJ */ - - /* - * Older Linux kernels have oom_adj not oom_score_adj. This works - * similarly except with a different scale of adjustment values. If - * it's necessary to build Postgres to work with either API, you can - * define both LINUX_OOM_SCORE_ADJ and LINUX_OOM_ADJ. - */ -#ifdef LINUX_OOM_ADJ + if (oomfilename != NULL) { /* * Use open() not stdio, to ensure we control the open flags. Some * Linux security environments reject anything but O_WRONLY. */ - int fd = open("/proc/self/oom_adj", O_WRONLY, 0); + int fd = open(oomfilename, O_WRONLY, 0); /* We ignore all errors */ if (fd >= 0) { - char buf[16]; + const char *oomvalue = getenv("PG_OOM_ADJUST_VALUE"); int rc; - snprintf(buf, sizeof(buf), "%d\n", LINUX_OOM_ADJ); - rc = write(fd, buf, strlen(buf)); + if (oomvalue == NULL) /* supply a useful default */ + oomvalue = "0"; + + rc = write(fd, oomvalue, strlen(oomvalue)); (void) rc; close(fd); } } -#endif /* LINUX_OOM_ADJ */ /* * Make sure processes do not share OpenSSL randomness state.