From f8c183a1ac02aef14832c1f29946ef2bcb5866b7 Mon Sep 17 00:00:00 2001
From: Greg Stark <stark@mit.edu>
Date: Mon, 15 Feb 2010 00:50:57 +0000
Subject: [PATCH] Speed up CREATE DATABASE by deferring the fsyncs until after
 copying all the data and using posix_fadvise to nudge the OS into flushing it
 earlier. This also hopefully makes CREATE DATABASE avoid spamming the cache.

Tests show a big speedup on Linux at least on some filesystems.

Idea and patch from Andres Freund.
---
 src/backend/storage/file/fd.c | 18 +++++++-
 src/include/storage/fd.h      |  3 +-
 src/port/copydir.c            | 84 ++++++++++++++++++++++++-----------
 3 files changed, 76 insertions(+), 29 deletions(-)

diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index ec27859e606..adea849ab05 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.153 2010/01/12 02:42:52 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.154 2010/02/15 00:50:57 stark Exp $
  *
  * NOTES:
  *
@@ -319,6 +319,22 @@ pg_fdatasync(int fd)
 		return 0;
 }
 
+/*
+ * pg_flush_data --- advise OS that the data described won't be needed soon
+ *
+ * Not all platforms have posix_fadvise; treat as noop if not available.
+ */
+int
+pg_flush_data(int fd, off_t offset, off_t amount)
+{
+#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
+	return posix_fadvise(fd, offset, amount, POSIX_FADV_DONTNEED);
+#else
+	return 0;
+#endif
+}
+
+
 /*
  * InitFileAccess --- initialize this module during backend startup
  *
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 20f60918afb..9dd240e34cf 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2010, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.66 2010/01/02 16:58:08 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.67 2010/02/15 00:50:57 stark Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -98,6 +98,7 @@ extern int	pg_fsync(int fd);
 extern int	pg_fsync_no_writethrough(int fd);
 extern int	pg_fsync_writethrough(int fd);
 extern int	pg_fdatasync(int fd);
+extern int  pg_flush_data(int fd, off_t offset, off_t amount);
 
 /* Filename components for OpenTemporaryFile */
 #define PG_TEMP_FILES_DIR "pgsql_tmp"
diff --git a/src/port/copydir.c b/src/port/copydir.c
index 0bf764ecffc..a52b1f71a1b 100644
--- a/src/port/copydir.c
+++ b/src/port/copydir.c
@@ -11,7 +11,7 @@
  *	as a service.
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/port/copydir.c,v 1.25 2010/02/14 17:50:52 stark Exp $
+ *	  $PostgreSQL: pgsql/src/port/copydir.c,v 1.26 2010/02/15 00:50:57 stark Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,6 +37,7 @@
 
 
 static void copy_file(char *fromfile, char *tofile);
+static void fsync_fname(char *fname);
 
 
 /*
@@ -91,27 +92,32 @@ copydir(char *fromdir, char *todir, bool recurse)
 			copy_file(fromfile, tofile);
 	}
 
-	FreeDir(xldir);
-
 	/*
-	 * fsync the directory to make sure not just the data but also the
-	 * new directory file entries have reached the disk. While needed
-	 * by most filesystems, the window got bigger with newer ones like
-	 * ext4.
+	 * Be paranoid here and fsync all files to ensure we catch problems.
 	 */
-	dirfd = BasicOpenFile(todir,
-	                      O_RDONLY | PG_BINARY,
-	                      S_IRUSR | S_IWUSR);
-	if(dirfd == -1)
-		ereport(ERROR,
-		        (errcode_for_file_access(),
-		         errmsg("could not open directory for fsync \"%s\": %m", todir)));
-
-	if(pg_fsync(dirfd) == -1)
+	if (xldir == NULL)
 		ereport(ERROR,
 				(errcode_for_file_access(),
-				 errmsg("could not fsync directory \"%s\": %m", todir)));
-	close(dirfd);
+				 errmsg("could not open directory \"%s\": %m", fromdir)));
+
+	while ((xlde = ReadDir(xldir, fromdir)) != NULL)
+	{
+		if (strcmp(xlde->d_name, ".") == 0 ||
+			strcmp(xlde->d_name, "..") == 0)
+			continue;
+
+		snprintf(tofile, MAXPGPATH, "%s/%s", todir, xlde->d_name);
+		fsync_fname(tofile);
+	}
+	FreeDir(xldir);
+
+	/* It's important to fsync the destination directory itself as
+	 * individual file fsyncs don't guarantee that the directory entry
+	 * for the file is synced. Recent versions of ext4 have made the
+	 * window much wider but it's been true for ext3 and other
+	 * filesystems in the past.
+	 */
+	fsync_fname(todir);
 }
 
 /*
@@ -124,6 +130,7 @@ copy_file(char *fromfile, char *tofile)
 	int			srcfd;
 	int			dstfd;
 	int			nbytes;
+	off_t		offset;
 
 	/* Use palloc to ensure we get a maxaligned buffer */
 #define COPY_BUF_SIZE (8 * BLCKSZ)
@@ -149,7 +156,7 @@ copy_file(char *fromfile, char *tofile)
 	/*
 	 * Do the data copying.
 	 */
-	for (;;)
+	for (offset=0; ; offset+=nbytes)
 	{
 		nbytes = read(srcfd, buffer, COPY_BUF_SIZE);
 		if (nbytes < 0)
@@ -168,15 +175,14 @@ copy_file(char *fromfile, char *tofile)
 					(errcode_for_file_access(),
 					 errmsg("could not write to file \"%s\": %m", tofile)));
 		}
-	}
 
-	/*
-	 * Be paranoid here to ensure we catch problems.
-	 */
-	if (pg_fsync(dstfd) != 0)
-		ereport(ERROR,
-				(errcode_for_file_access(),
-				 errmsg("could not fsync file \"%s\": %m", tofile)));
+		/*
+		 * We fsync the files later but first flush them to avoid spamming
+		 * the cache and hopefully get the kernel to start writing them
+		 * out before the fsync comes.
+		 */
+		pg_flush_data(dstfd, offset, nbytes);
+	}
 
 	if (close(dstfd))
 		ereport(ERROR,
@@ -187,3 +193,27 @@ copy_file(char *fromfile, char *tofile)
 
 	pfree(buffer);
 }
+
+
+
+/*
+ * fsync a file or directory, given its path name
+ */
+static void
+fsync_fname(char *fname)
+{
+	int	fd = BasicOpenFile(fname, 
+						   O_RDONLY | PG_BINARY,
+						   S_IRUSR | S_IWUSR);
+
+	if (fd < 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not open file \"%s\": %m", fname)));
+
+	if (pg_fsync(fd) != 0)
+		ereport(ERROR,
+				(errcode_for_file_access(),
+				 errmsg("could not fsync file \"%s\": %m", fname)));
+	close(fd);
+}
-- 
GitLab