diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 88e1f1256ad80a8f007341325e83fa25db75b9c1..97f887d0a06ce234e256dda5bf204333ec7033b0 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -13,7 +13,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -97,7 +97,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 	Assert(status == TRANSACTION_STATUS_COMMITTED ||
 		   status == TRANSACTION_STATUS_ABORTED);
 
-	LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
 	byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
 	byteptr += byteno;
@@ -110,7 +110,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 
 	/* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */
 
-	LWLockRelease(ClogCtl->locks->ControlLock);
+	LWLockRelease(ClogCtl->ControlLock);
 }
 
 /*
@@ -128,14 +128,14 @@ TransactionIdGetStatus(TransactionId xid)
 	char	   *byteptr;
 	XidStatus	status;
 
-	LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
 	byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
 	byteptr += byteno;
 
 	status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
 
-	LWLockRelease(ClogCtl->locks->ControlLock);
+	LWLockRelease(ClogCtl->ControlLock);
 
 	return status;
 }
@@ -169,16 +169,16 @@ BootStrapCLOG(void)
 {
 	int			slotno;
 
-	LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
 	/* Create and zero the first page of the commit log */
 	slotno = ZeroCLOGPage(0, false);
 
 	/* Make sure it's written out */
-	SimpleLruWritePage(ClogCtl, slotno);
+	SimpleLruWritePage(ClogCtl, slotno, NULL);
 	/* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */
 
-	LWLockRelease(ClogCtl->locks->ControlLock);
+	LWLockRelease(ClogCtl->ControlLock);
 }
 
 /*
@@ -256,12 +256,12 @@ ExtendCLOG(TransactionId newestXact)
 
 	pageno = TransactionIdToPage(newestXact);
 
-	LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
 	/* Zero the page and make an XLOG entry about it */
 	ZeroCLOGPage(pageno, true);
 
-	LWLockRelease(ClogCtl->locks->ControlLock);
+	LWLockRelease(ClogCtl->ControlLock);
 }
 
 
@@ -351,13 +351,13 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record)
 
 		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
 
-		LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE);
+		LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
 
 		slotno = ZeroCLOGPage(pageno, false);
-		SimpleLruWritePage(ClogCtl, slotno);
+		SimpleLruWritePage(ClogCtl, slotno, NULL);
 		/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
 
-		LWLockRelease(ClogCtl->locks->ControlLock);
+		LWLockRelease(ClogCtl->ControlLock);
 	}
 }
 
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index 57dcd2b33798c3e9230681bb5cfaee593903aa5a..58798d0f07fcb56d230f886258c94bc07e5a4f4a 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.15 2004/05/29 22:48:18 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -17,6 +17,7 @@
 #include <unistd.h>
 
 #include "access/slru.h"
+#include "access/clog.h"		/* only for NUM_CLOG_BUFFERS */
 #include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/lwlock.h"
@@ -100,6 +101,8 @@ typedef enum
  */
 typedef struct SlruSharedData
 {
+	LWLockId	ControlLock;
+
 	/*
 	 * Info for each buffer slot.  Page number is undefined when status is
 	 * EMPTY.  lru_count is essentially the number of page switches since
@@ -110,6 +113,7 @@ typedef struct SlruSharedData
 	SlruPageStatus page_status[NUM_CLOG_BUFFERS];
 	int			page_number[NUM_CLOG_BUFFERS];
 	unsigned int page_lru_count[NUM_CLOG_BUFFERS];
+	LWLockId	BufferLocks[NUM_CLOG_BUFFERS];	/* Per-buffer I/O locks */
 
 	/*
 	 * latest_page_number is the page number of the current end of the
@@ -118,12 +122,24 @@ typedef struct SlruSharedData
 	 */
 	int			latest_page_number;
 } SlruSharedData;
-typedef SlruSharedData *SlruShared;
-
 
 #define SlruFileName(ctl, path, seg) \
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
 
+/*
+ * During SimpleLruFlush(), we will usually not need to write/fsync more
+ * than one or two physical files, but we may need to write several pages
+ * per file.  We can consolidate the I/O requests by leaving files open
+ * until control returns to SimpleLruFlush().  This data structure remembers
+ * which files are open.
+ */
+typedef struct SlruFlushData
+{
+	int			num_files;					/* # files actually open */
+	int			fd[NUM_CLOG_BUFFERS];		/* their FD's */
+	int			segno[NUM_CLOG_BUFFERS];	/* their clog seg#s */
+} SlruFlushData;
+
 /*
  * Macro to mark a buffer slot "most recently used".
  */
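
The SlruFlushData structure above lets SimpleLruFlush() (further down in this
patch) write many dirty pages while opening each physical segment file only
once, then fsync and close the accumulated descriptors in one pass.  A minimal
sketch of that calling pattern, assuming an already-initialized SlruCtl "ctl"
(simplified from the real SimpleLruFlush code below; error handling omitted):

	SlruFlushData fdata;
	int			slotno;
	int			i;

	fdata.num_files = 0;

	LWLockAcquire(ctl->ControlLock, LW_EXCLUSIVE);
	for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
		SimpleLruWritePage(ctl, slotno, &fdata);	/* records each fd it opens */
	LWLockRelease(ctl->ControlLock);

	/* one fsync and close per physical file, not per page written */
	for (i = 0; i < fdata.num_files; i++)
	{
		(void) pg_fsync(fdata.fd[i]);	/* error handling omitted in this sketch */
		close(fdata.fd[i]);
	}
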
@@ -145,14 +161,17 @@ typedef enum
 	SLRU_SEEK_FAILED,
 	SLRU_READ_FAILED,
 	SLRU_WRITE_FAILED,
+	SLRU_FSYNC_FAILED,
 	SLRU_CLOSE_FAILED
 } SlruErrorCause;
+
 static SlruErrorCause slru_errcause;
 static int	slru_errno;
 
 
 static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno);
-static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno);
+static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
+								  SlruFlush fdata);
 static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
 static int	SlruSelectLRUPage(SlruCtl ctl, int pageno);
 static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
@@ -165,24 +184,16 @@ static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
 int
 SimpleLruShmemSize(void)
 {
-	return MAXALIGN(sizeof(SlruSharedData))
-		+ BLCKSZ * NUM_CLOG_BUFFERS
-		+ MAXALIGN(sizeof(SlruLockData))
-		;
+	return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS;
 }
 
 void
 SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 {
-	bool		found;
-	char	   *ptr;
 	SlruShared	shared;
-	SlruLock	locks;
+	bool		found;
 
-	ptr = ShmemInitStruct(name, SimpleLruShmemSize(), &found);
-	shared = (SlruShared) ptr;
-	locks = (SlruLock) (ptr + MAXALIGN(sizeof(SlruSharedData)) +
-						BLCKSZ * NUM_CLOG_BUFFERS);
+	shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(), &found);
 
 	if (!IsUnderPostmaster)
 	{
@@ -192,18 +203,18 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 
 		Assert(!found);
 
-		locks->ControlLock = LWLockAssign();
-
 		memset(shared, 0, sizeof(SlruSharedData));
 
+		shared->ControlLock = LWLockAssign();
+
 		bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData));
 
 		for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
 		{
-			locks->BufferLocks[slotno] = LWLockAssign();
 			shared->page_buffer[slotno] = bufptr;
 			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
 			shared->page_lru_count[slotno] = 1;
+			shared->BufferLocks[slotno] = LWLockAssign();
 			bufptr += BLCKSZ;
 		}
 
@@ -213,10 +224,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 		Assert(found);
 
 	/* Initialize the unshared control struct */
-	ctl->locks = locks;
 	ctl->shared = shared;
+	ctl->ControlLock = shared->ControlLock;
 
-	/* Init directory path */
+	/* Initialize unshared copy of directory path */
 	snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir);
 }
 
@@ -232,7 +243,7 @@ int
 SimpleLruZeroPage(SlruCtl ctl, int pageno)
 {
 	int			slotno;
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 
 	/* Find a suitable buffer slot for the page */
 	slotno = SlruSelectLRUPage(ctl, pageno);
@@ -270,7 +281,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
 char *
 SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 {
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 
 	/* Outer loop handles restart if we lose the buffer to someone else */
 	for (;;)
@@ -313,8 +324,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 		SlruRecentlyUsed(shared, slotno);
 
 		/* Release shared lock, grab per-buffer lock instead */
-		LWLockRelease(ctl->locks->ControlLock);
-		LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+		LWLockRelease(shared->ControlLock);
+		LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
 
 		/*
 		 * Check to see if someone else already did the read, or took the
@@ -323,8 +334,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 		if (shared->page_number[slotno] != pageno ||
 			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
 		{
-			LWLockRelease(ctl->locks->BufferLocks[slotno]);
-			LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+			LWLockRelease(shared->BufferLocks[slotno]);
+			LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 			continue;
 		}
 
@@ -332,14 +343,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 		ok = SlruPhysicalReadPage(ctl, pageno, slotno);
 
 		/* Re-acquire shared control lock and update page state */
-		LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
 		Assert(shared->page_number[slotno] == pageno &&
 			   shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS);
 
 		shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;
 
-		LWLockRelease(ctl->locks->BufferLocks[slotno]);
+		LWLockRelease(shared->BufferLocks[slotno]);
 
 		/* Now it's okay to ereport if we failed */
 		if (!ok)
@@ -364,11 +375,11 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
  * Control lock must be held at entry, and will be held at exit.
  */
 void
-SimpleLruWritePage(SlruCtl ctl, int slotno)
+SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 {
 	int			pageno;
 	bool		ok;
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 
 	/* Do nothing if page does not need writing */
 	if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
@@ -378,8 +389,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
 	pageno = shared->page_number[slotno];
 
 	/* Release shared lock, grab per-buffer lock instead */
-	LWLockRelease(ctl->locks->ControlLock);
-	LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE);
+	LWLockRelease(shared->ControlLock);
+	LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
 
 	/*
 	 * Check to see if someone else already did the write, or took the
@@ -392,8 +403,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
 		(shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
 		 shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
 	{
-		LWLockRelease(ctl->locks->BufferLocks[slotno]);
-		LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+		LWLockRelease(shared->BufferLocks[slotno]);
+		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 		return;
 	}
 
@@ -412,10 +423,19 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
 	shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS;
 
 	/* Okay, do the write */
-	ok = SlruPhysicalWritePage(ctl, pageno, slotno);
+	ok = SlruPhysicalWritePage(ctl, pageno, slotno, fdata);
+
+	/* If we failed, and we're in a flush, better close the files */
+	if (!ok && fdata)
+	{
+		int		i;
+
+		for (i = 0; i < fdata->num_files; i++)
+			close(fdata->fd[i]);
+	}
 
 	/* Re-acquire shared control lock and update page state */
-	LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
 	Assert(shared->page_number[slotno] == pageno &&
 		   (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS ||
@@ -425,7 +445,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
 	if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
 		shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;
 
-	LWLockRelease(ctl->locks->BufferLocks[slotno]);
+	LWLockRelease(shared->BufferLocks[slotno]);
 
 	/* Now it's okay to ereport if we failed */
 	if (!ok)
@@ -445,7 +465,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno)
 static bool
 SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 {
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
 	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
 	int			offset = rpageno * BLCKSZ;
@@ -482,6 +502,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 	{
 		slru_errcause = SLRU_SEEK_FAILED;
 		slru_errno = errno;
+		close(fd);
 		return false;
 	}
 
@@ -490,6 +511,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
 	{
 		slru_errcause = SLRU_READ_FAILED;
 		slru_errno = errno;
+		close(fd);
 		return false;
 	}
 
@@ -511,50 +533,80 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno)
  * info in static variables to let SlruReportIOError make the report.
  *
  * For now, assume it's not worth keeping a file pointer open across
- * read/write operations.  We could cache one virtual file pointer ...
+ * independent read/write operations.  We do batch operations during
+ * SimpleLruFlush, though.
+ *
+ * fdata is NULL for a standalone write, pointer to open-file info during
+ * SimpleLruFlush.
  */
 static bool
-SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
+SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 {
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 	int			segno = pageno / SLRU_PAGES_PER_SEGMENT;
 	int			rpageno = pageno % SLRU_PAGES_PER_SEGMENT;
 	int			offset = rpageno * BLCKSZ;
 	char		path[MAXPGPATH];
-	int			fd;
-
-	SlruFileName(ctl, path, segno);
+	int			fd = -1;
 
 	/*
-	 * If the file doesn't already exist, we should create it.  It is
-	 * possible for this to need to happen when writing a page that's not
-	 * first in its segment; we assume the OS can cope with that.  (Note:
-	 * it might seem that it'd be okay to create files only when
-	 * SimpleLruZeroPage is called for the first page of a segment.
-	 * However, if after a crash and restart the REDO logic elects to
-	 * replay the log from a checkpoint before the latest one, then it's
-	 * possible that we will get commands to set transaction status of
-	 * transactions that have already been truncated from the commit log.
-	 * Easiest way to deal with that is to accept references to
-	 * nonexistent files here and in SlruPhysicalReadPage.)
+	 * During a Flush, we may already have the desired file open.
 	 */
-	fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
-	if (fd < 0)
+	if (fdata)
 	{
-		if (errno != ENOENT)
+		int		i;
+
+		for (i = 0; i < fdata->num_files; i++)
 		{
-			slru_errcause = SLRU_OPEN_FAILED;
-			slru_errno = errno;
-			return false;
+			if (fdata->segno[i] == segno)
+			{
+				fd = fdata->fd[i];
+				break;
+			}
 		}
+	}
 
-		fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
-						   S_IRUSR | S_IWUSR);
+	if (fd < 0)
+	{
+		/*
+		 * If the file doesn't already exist, we should create it.  It is
+		 * possible for this to need to happen when writing a page that's not
+		 * first in its segment; we assume the OS can cope with that.
+		 * (Note: it might seem that it'd be okay to create files only when
+		 * SimpleLruZeroPage is called for the first page of a segment.
+		 * However, if after a crash and restart the REDO logic elects to
+		 * replay the log from a checkpoint before the latest one, then it's
+		 * possible that we will get commands to set transaction status of
+		 * transactions that have already been truncated from the commit log.
+		 * Easiest way to deal with that is to accept references to
+		 * nonexistent files here and in SlruPhysicalReadPage.)
+		 */
+		SlruFileName(ctl, path, segno);
+		fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR);
 		if (fd < 0)
 		{
-			slru_errcause = SLRU_CREATE_FAILED;
-			slru_errno = errno;
-			return false;
+			if (errno != ENOENT)
+			{
+				slru_errcause = SLRU_OPEN_FAILED;
+				slru_errno = errno;
+				return false;
+			}
+
+			fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
+							   S_IRUSR | S_IWUSR);
+			if (fd < 0)
+			{
+				slru_errcause = SLRU_CREATE_FAILED;
+				slru_errno = errno;
+				return false;
+			}
+		}
+
+		if (fdata)
+		{
+			fdata->fd[fdata->num_files] = fd;
+			fdata->segno[fdata->num_files] = segno;
+			fdata->num_files++;
 		}
 	}
 
@@ -562,6 +614,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
 	{
 		slru_errcause = SLRU_SEEK_FAILED;
 		slru_errno = errno;
+		if (!fdata)
+			close(fd);
 		return false;
 	}
 
@@ -573,14 +627,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno)
 			errno = ENOSPC;
 		slru_errcause = SLRU_WRITE_FAILED;
 		slru_errno = errno;
+		if (!fdata)
+			close(fd);
 		return false;
 	}
 
-	if (close(fd))
+	/*
+	 * If not part of Flush, need to fsync now.  We assume this happens
+	 * infrequently enough that it's not a performance issue.
+	 */
+	if (!fdata)
 	{
-		slru_errcause = SLRU_CLOSE_FAILED;
-		slru_errno = errno;
-		return false;
+		if (pg_fsync(fd))
+		{
+			slru_errcause = SLRU_FSYNC_FAILED;
+			slru_errno = errno;
+			close(fd);
+			return false;
+		}
+
+		if (close(fd))
+		{
+			slru_errcause = SLRU_CLOSE_FAILED;
+			slru_errno = errno;
+			return false;
+		}
 	}
 
 	return true;
@@ -637,6 +708,13 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
 				  errdetail("could not write to file \"%s\" at offset %u: %m",
 							path, offset)));
 			break;
+		case SLRU_FSYNC_FAILED:
+			ereport(ERROR,
+					(errcode_for_file_access(),
+				errmsg("could not access status of transaction %u", xid),
+				  errdetail("could not fsync file \"%s\": %m",
+							path)));
+			break;
 		case SLRU_CLOSE_FAILED:
 			ereport(ERROR,
 					(errcode_for_file_access(),
@@ -668,7 +746,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid)
 static int
 SlruSelectLRUPage(SlruCtl ctl, int pageno)
 {
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 
 	/* Outer loop handles restart after I/O */
 	for (;;)
@@ -717,7 +795,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 			(void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
 									 InvalidTransactionId, false);
 		else
-			SimpleLruWritePage(ctl, bestslot);
+			SimpleLruWritePage(ctl, bestslot, NULL);
 
 		/*
 		 * Now loop back and try again.  This is the easiest way of
@@ -733,7 +811,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 void
 SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
 {
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 
 	shared->latest_page_number = pageno;
 }
@@ -744,16 +822,20 @@ SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
 void
 SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 {
-#ifdef USE_ASSERT_CHECKING		/* only used in Assert() */
-	SlruShared	shared = (SlruShared) ctl->shared;
-#endif
+	SlruShared	shared = ctl->shared;
+	SlruFlushData fdata;
 	int			slotno;
+	int			pageno = 0;
+	int			i;
+	bool		ok;
+
+	fdata.num_files = 0;
 
-	LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
 	for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
 	{
-		SimpleLruWritePage(ctl, slotno);
+		SimpleLruWritePage(ctl, slotno, &fdata);
 
 		/*
 		 * When called during a checkpoint, we cannot assert that the slot
@@ -765,7 +847,32 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 			   shared->page_status[slotno] == SLRU_PAGE_CLEAN);
 	}
 
-	LWLockRelease(ctl->locks->ControlLock);
+	LWLockRelease(shared->ControlLock);
+
+	/*
+	 * Now fsync and close any files that were open
+	 */
+	ok = true;
+	for (i = 0; i < fdata.num_files; i++)
+	{
+		if (pg_fsync(fdata.fd[i]))
+		{
+			slru_errcause = SLRU_FSYNC_FAILED;
+			slru_errno = errno;
+			pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+			ok = false;
+		}
+
+		if (close(fdata.fd[i]))
+		{
+			slru_errcause = SLRU_CLOSE_FAILED;
+			slru_errno = errno;
+			pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT;
+			ok = false;
+		}
+	}
+	if (!ok)
+		SlruReportIOError(ctl, pageno, InvalidTransactionId);
 }
 
 /*
@@ -786,7 +893,7 @@ void
 SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
 {
 	int			slotno;
-	SlruShared	shared = (SlruShared) ctl->shared;
+	SlruShared	shared = ctl->shared;
 
 	/*
 	 * The cutoff point is the start of the segment containing cutoffPage.
@@ -805,7 +912,7 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
 	 * have been flushed already during the checkpoint, we're just being
 	 * extra careful here.)
 	 */
-	LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
 restart:;
 
@@ -817,7 +924,7 @@ restart:;
 	 */
 	if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage))
 	{
-		LWLockRelease(ctl->locks->ControlLock);
+		LWLockRelease(shared->ControlLock);
 		ereport(LOG,
 				(errmsg("could not truncate directory \"%s\": apparent wraparound",
 						ctl->Dir)));
@@ -849,11 +956,11 @@ restart:;
 			(void) SimpleLruReadPage(ctl, shared->page_number[slotno],
 									 InvalidTransactionId, false);
 		else
-			SimpleLruWritePage(ctl, slotno);
+			SimpleLruWritePage(ctl, slotno, NULL);
 		goto restart;
 	}
 
-	LWLockRelease(ctl->locks->ControlLock);
+	LWLockRelease(shared->ControlLock);
 
 	/* Now we can remove the old segment(s) */
 	(void) SlruScanDirectory(ctl, cutoffPage, true);
@@ -878,7 +985,8 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
 	if (cldir == NULL)
 		ereport(ERROR,
 				(errcode_for_file_access(),
-			   errmsg("could not open directory \"%s\": %m", ctl->Dir)));
+				 errmsg("could not open directory \"%s\": %m",
+						ctl->Dir)));
 
 	errno = 0;
 	while ((clde = readdir(cldir)) != NULL)
diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c
index ce80a4feff7a6454152d6c154cca5c50296c975d..6bb683386a2f109d2b413f73d76a4b860044dd10 100644
--- a/src/backend/postmaster/bgwriter.c
+++ b/src/backend/postmaster/bgwriter.c
@@ -27,14 +27,17 @@
  *
  * If the bgwriter exits unexpectedly, the postmaster treats that the same
  * as a backend crash: shared memory may be corrupted, so remaining backends
- * should be killed by SIGQUIT and then a recovery cycle started.
+ * should be killed by SIGQUIT and then a recovery cycle started.  (Even if
+ * shared memory isn't corrupted, we have lost information about which
+ * files need to be fsync'd for the next checkpoint, and so a system
+ * restart needs to be forced.)
  *
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -55,13 +58,54 @@
 #include "utils/guc.h"
 
 
-/*
+/*----------
  * Shared memory area for communication between bgwriter and backends
+ *
+ * The ckpt counters allow backends to watch for completion of a checkpoint
+ * request they send.  Here's how it works:
+ *	* At start of a checkpoint, bgwriter increments ckpt_started.
+ *	* On completion of a checkpoint, bgwriter sets ckpt_done to
+ *	  equal ckpt_started.
+ *	* On failure of a checkpoint, bgwriter first increments ckpt_failed,
+ *	  then sets ckpt_done to equal ckpt_started.
+ * All three fields are declared sig_atomic_t to ensure they can be read
+ * and written without explicit locking.  The algorithm for backends is:
+ *	1. Record current values of ckpt_failed and ckpt_started (in that
+ *	   order!).
+ *	2. Send signal to request checkpoint.
+ *	3. Sleep until ckpt_started changes.  Now you know a checkpoint has
+ *	   begun since you started this algorithm (although *not* that it was
+ *	   specifically initiated by your signal).
+ *	4. Record new value of ckpt_started.
+ *	5. Sleep until ckpt_done >= saved value of ckpt_started.  (Use modulo
+ *	   arithmetic here in case counters wrap around.)  Now you know a
+ *	   checkpoint has started and completed, but not whether it was
+ *	   successful.
+ *	6. If ckpt_failed is different from the originally saved value,
+ *	   assume request failed; otherwise it was definitely successful.
+ *
+ * The requests array holds fsync requests sent by backends and not yet
+ * absorbed by the bgwriter.
+ *----------
  */
+typedef struct
+{
+	RelFileNode		rnode;
+	BlockNumber		segno;
+	/* might add a request-type field later */
+} BgWriterRequest;
+
 typedef struct
 {
 	pid_t	bgwriter_pid;		/* PID of bgwriter (0 if not started) */
-	sig_atomic_t	checkpoint_count; /* advances when checkpoint done */
+
+	sig_atomic_t	ckpt_started;	/* advances when checkpoint starts */
+	sig_atomic_t	ckpt_done;		/* advances when checkpoint done */
+	sig_atomic_t	ckpt_failed;	/* advances when checkpoint fails */
+
+	int				num_requests;	/* current # of requests */
+	int				max_requests;	/* allocated array size */
+	BgWriterRequest	requests[1];	/* VARIABLE LENGTH ARRAY */
 } BgWriterShmemStruct;
 
 static BgWriterShmemStruct *BgWriterShmem;
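
To make step 5 of the comment above concrete: RequestCheckpoint() (later in
this file) tests completion with ((signed char) (ckpt_done - old_started)) < 0,
which stays correct across counter wraparound as long as no more than 127
checkpoints complete while the backend sleeps.  A worked illustration, assuming
a typical two's-complement machine (the values are hypothetical):

	sig_atomic_t	old_started = 255;	/* saved just before the counter wrapped */
	sig_atomic_t	ckpt_done = 2;		/* counter has since wrapped past zero */

	Assert(((signed char) (ckpt_done - old_started)) >= 0);	/* 2 - 255 wraps to 3: done */

	old_started = 10;
	ckpt_done = 8;						/* not caught up yet */
	Assert(((signed char) (ckpt_done - old_started)) < 0);		/* -2: keep sleeping */
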
@@ -86,6 +130,10 @@ static volatile sig_atomic_t shutdown_requested = false;
 /*
  * Private state
  */
+static bool		am_bg_writer = false;
+
+static bool		ckpt_active = false;
+
 static time_t	last_checkpoint_time;
 
 
@@ -106,6 +154,7 @@ BackgroundWriterMain(void)
 {
 	Assert(BgWriterShmem != NULL);
 	BgWriterShmem->bgwriter_pid = MyProcPid;
+	am_bg_writer = true;
 
 	/*
 	 * Properly accept or ignore signals the postmaster might send us
@@ -180,6 +229,17 @@ BackgroundWriterMain(void)
 		 */
 		InError = false;
 
+		/* Warn any waiting backends that the checkpoint failed. */
+		if (ckpt_active)
+		{
+			/* use volatile pointer to prevent code rearrangement */
+			volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+
+			bgs->ckpt_failed++;
+			bgs->ckpt_done = bgs->ckpt_started;
+			ckpt_active = false;
+		}
+
 		/*
 		 * Exit interrupt holdoff section we implicitly established above.
 		 */
@@ -214,8 +274,17 @@ BackgroundWriterMain(void)
 		long		udelay;
 
 		/*
-		 * Process any signals received recently.
+		 * Emergency bailout if postmaster has died.  This is to avoid the
+		 * necessity for manual cleanup of all postmaster children.
 		 */
+		if (!PostmasterIsAlive(true))
+			exit(1);
+
+		/*
+		 * Process any requests or signals received recently.
+		 */
+		AbsorbFsyncRequests();
+
 		if (got_SIGHUP)
 		{
 			got_SIGHUP = false;
@@ -265,8 +334,20 @@ BackgroundWriterMain(void)
 							 errhint("Consider increasing the configuration parameter \"checkpoint_segments\".")));
 			}
 
+			/*
+			 * Indicate checkpoint start to any waiting backends.
+			 */
+			ckpt_active = true;
+			BgWriterShmem->ckpt_started++;
+
 			CreateCheckPoint(false, force_checkpoint);
 
+			/*
+			 * Indicate checkpoint completion to any waiting backends.
+			 */
+			BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started;
+			ckpt_active = false;
+
 			/*
 			 * Note we record the checkpoint start time not end time as
 			 * last_checkpoint_time.  This is so that time-driven checkpoints
@@ -274,14 +355,11 @@ BackgroundWriterMain(void)
 			 */
 			last_checkpoint_time = now;
 
-			/*
-			 * Indicate checkpoint completion to any waiting backends.
-			 */
-			BgWriterShmem->checkpoint_count++;
-
 			/*
 			 * After any checkpoint, close all smgr files.  This is so we
 			 * won't hang onto smgr references to deleted files indefinitely.
+			 * (It is safe to do this because this process does not have a
+			 * relcache, and so no dangling references could remain.)
 			 */
 			smgrcloseall();
 
@@ -301,6 +379,8 @@ BackgroundWriterMain(void)
 		 * we respond reasonably promptly when someone signals us,
 		 * break down the sleep into 1-second increments, and check for
 		 * interrupts after each nap.
+		 *
+		 * We absorb pending requests after each short sleep.
 		 */
 		udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L;
 		while (udelay > 1000000L)
@@ -308,17 +388,11 @@ BackgroundWriterMain(void)
 			if (got_SIGHUP || checkpoint_requested || shutdown_requested)
 				break;
 			pg_usleep(1000000L);
+			AbsorbFsyncRequests();
 			udelay -= 1000000L;
 		}
 		if (!(got_SIGHUP || checkpoint_requested || shutdown_requested))
 			pg_usleep(udelay);
-
-		/*
-		 * Emergency bailout if postmaster has died.  This is to avoid the
-		 * necessity for manual cleanup of all postmaster children.
-		 */
-		if (!PostmasterIsAlive(true))
-			exit(1);
 	}
 }
 
@@ -387,10 +461,11 @@ int
 BgWriterShmemSize(void)
 {
 	/*
-	 * This is not worth measuring right now, but may become so after we
-	 * add fsync signaling ...
+	 * Currently, the size of the requests[] array is arbitrarily set
+	 * equal to NBuffers.  This may prove too large or small ...
 	 */
-	return MAXALIGN(sizeof(BgWriterShmemStruct));
+	return MAXALIGN(sizeof(BgWriterShmemStruct) +
+					(NBuffers - 1) * sizeof(BgWriterRequest));
 }
 
 /*
@@ -404,7 +479,7 @@ BgWriterShmemInit(void)
 
 	BgWriterShmem = (BgWriterShmemStruct *)
 		ShmemInitStruct("Background Writer Data",
-						sizeof(BgWriterShmemStruct),
+						BgWriterShmemSize(),
 						&found);
 	if (BgWriterShmem == NULL)
 		ereport(FATAL,
@@ -414,6 +489,7 @@ BgWriterShmemInit(void)
 		return;					/* already initialized */
 
 	MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct));
+	BgWriterShmem->max_requests = NBuffers;
 }
 
 /*
@@ -427,8 +503,10 @@ BgWriterShmemInit(void)
 void
 RequestCheckpoint(bool waitforit)
 {
-	volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count;
-	sig_atomic_t	old_count = *count_ptr;
+	/* use volatile pointer to prevent code rearrangement */
+	volatile BgWriterShmemStruct *bgs = BgWriterShmem;
+	sig_atomic_t	old_failed = bgs->ckpt_failed;
+	sig_atomic_t	old_started = bgs->ckpt_started;
 
 	/*
 	 * Send signal to request checkpoint.  When waitforit is false,
@@ -442,15 +520,119 @@ RequestCheckpoint(bool waitforit)
 			 "could not signal for checkpoint: %m");
 
 	/*
-	 * If requested, wait for completion.  We detect completion by
-	 * observing a change in checkpoint_count in shared memory.
+	 * If requested, wait for completion.  We detect completion according
+	 * to the algorithm given above.
 	 */
 	if (waitforit)
 	{
-		while (*count_ptr == old_count)
+		while (bgs->ckpt_started == old_started)
 		{
 			CHECK_FOR_INTERRUPTS();
-			pg_usleep(1000000L);
+			pg_usleep(100000L);
+		}
+		old_started = bgs->ckpt_started;
+		/*
+		 * We are waiting for ckpt_done >= old_started, in a modulo
+		 * sense.  This is a little tricky since we don't know the
+		 * width or signedness of sig_atomic_t.  We make the lowest
+		 * common denominator assumption that it is only as wide
+		 * as "char".  This means that this algorithm will cope
+		 * correctly as long as we don't sleep for more than 127
+		 * completed checkpoints.  (If we do, we will get another
+		 * chance to exit after 128 more checkpoints...)
+		 */
+		while (((signed char) (bgs->ckpt_done - old_started)) < 0)
+		{
+			CHECK_FOR_INTERRUPTS();
+			pg_usleep(100000L);
 		}
+		if (bgs->ckpt_failed != old_failed)
+			ereport(ERROR,
+					(errmsg("checkpoint request failed"),
+					 errhint("Consult the postmaster log for details.")));
+	}
+}
+
+/*
+ * ForwardFsyncRequest
+ *		Forward a file-fsync request from a backend to the bgwriter
+ *
+ * Whenever a backend is compelled to write directly to a relation
+ * (which should be seldom, if the bgwriter is getting its job done),
+ * the backend calls this routine to pass over knowledge that the relation
+ * is dirty and must be fsync'd before next checkpoint.
+ *
+ * If we are unable to pass over the request (at present, this can happen
+ * if the shared memory queue is full), we return false.  That forces
+ * the backend to do its own fsync.  We hope that will be even more seldom.
+ *
+ * Note: we presently make no attempt to eliminate duplicate requests
+ * in the requests[] queue.  The bgwriter will have to eliminate dups
+ * internally anyway, so we may as well avoid holding the lock longer
+ * than we have to here.
+ */
+bool
+ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno)
+{
+	BgWriterRequest *request;
+
+	if (!IsUnderPostmaster)
+		return false;			/* probably shouldn't even get here */
+	Assert(BgWriterShmem != NULL);
+
+	LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+	if (BgWriterShmem->bgwriter_pid == 0 ||
+		BgWriterShmem->num_requests >= BgWriterShmem->max_requests)
+	{
+		LWLockRelease(BgWriterCommLock);
+		return false;
+	}
+	request = &BgWriterShmem->requests[BgWriterShmem->num_requests++];
+	request->rnode = rnode;
+	request->segno = segno;
+	LWLockRelease(BgWriterCommLock);
+	return true;
+}
+
+/*
+ * AbsorbFsyncRequests
+ *		Retrieve queued fsync requests and pass them to local smgr.
+ *
+ * This is exported because it must be called during CreateCheckpoint;
+ * we have to be sure we have accepted all pending requests *after* we
+ * establish the checkpoint redo pointer.  Since CreateCheckpoint
+ * sometimes runs in non-bgwriter processes, do nothing if not bgwriter.
+ */
+void
+AbsorbFsyncRequests(void)
+{
+	BgWriterRequest *requests = NULL;
+	BgWriterRequest *request;
+	int			n;
+
+	if (!am_bg_writer)
+		return;
+
+	/*
+	 * We try to avoid holding the lock for a long time by copying the
+	 * request array.
+	 */
+	LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE);
+
+	n = BgWriterShmem->num_requests;
+	if (n > 0)
+	{
+		requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest));
+		memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest));
+	}
+	BgWriterShmem->num_requests = 0;
+
+	LWLockRelease(BgWriterCommLock);
+
+	for (request = requests; n > 0; request++, n--)
+	{
+		RememberFsyncRequest(request->rnode, request->segno);
 	}
+	if (requests)
+		pfree(requests);
 }
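
Tying the two routines above together: the intended backend-side pattern (the
actual caller added by this patch is register_dirty_segment() in md.c, below)
is to try handing the fsync obligation to the bgwriter and to fall back to a
local fsync only if the queue is full or no bgwriter is running.  A rough
sketch, assuming an open segment "seg" of relation "reln":

	if (!ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
	{
		/* queue full or no bgwriter: do the fsync ourselves */
		if (FileSync(seg->mdfd_vfd) < 0)
			return false;		/* errno is set; caller reports the error */
	}
	return true;
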
diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c
index f718e33cd598beddb09c76f8467e18f16017c162..2386bc89bf3b7eaf22be77b5448924e89529412f 100644
--- a/src/backend/storage/buffer/bufmgr.c
+++ b/src/backend/storage/buffer/bufmgr.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
  *		bothering to write them out first.	This is NOT rollback-able,
  *		and so should be used only with extreme caution!
  *
+ *		There is no particularly good reason why this doesn't have a
+ *		firstDelBlock parameter, except that current callers don't need it.
+ *
  *		We assume that the caller holds an exclusive lock on the relation,
  *		which should assure that no new buffers will be acquired for the rel
  *		meanwhile.
@@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks)
 void
 DropRelationBuffers(Relation rel)
 {
-	DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp);
+	DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0);
 }
 
 /* ---------------------------------------------------------------------
  *		DropRelFileNodeBuffers
  *
  *		This is the same as DropRelationBuffers, except that the target
- *		relation is specified by RelFileNode and temp status.
+ *		relation is specified by RelFileNode and temp status, and one
+ *		may specify the first block to drop.
  *
  *		This is NOT rollback-able.	One legitimate use is to clear the
  *		buffer cache of buffers for a relation that is being deleted
@@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel)
  * --------------------------------------------------------------------
  */
 void
-DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
+DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+					   BlockNumber firstDelBlock)
 {
 	int			i;
 	BufferDesc *bufHdr;
@@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
 		for (i = 0; i < NLocBuffer; i++)
 		{
 			bufHdr = &LocalBufferDescriptors[i];
-			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+			if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+				bufHdr->tag.blockNum >= firstDelBlock)
 			{
 				bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED);
 				bufHdr->cntxDirty = false;
@@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp)
 	{
 		bufHdr = &BufferDescriptors[i - 1];
 recheck:
-		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode))
+		if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) &&
+			bufHdr->tag.blockNum >= firstDelBlock)
 		{
 			/*
 			 * If there is I/O in progress, better wait till it's done;
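
Both call patterns for the new firstDelBlock parameter appear later in this
patch (smgr.c): passing 0 discards every buffer of the relation, while a
nonzero value keeps blocks below it, which is what truncation replay needs.

	DropRelFileNodeBuffers(rnode, isTemp, 0);					/* drop the whole relation */
	DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);	/* truncate to blkno */
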
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 5ef12de949518be73314b6341308886733f6b730..96de54110cfaab2f21d42120c63f3c09fbb6d961 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $
  *
  * NOTES:
  *
@@ -484,6 +484,7 @@ Insert(File file)
 	DO_DB(_dump_lru());
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 LruInsert(File file)
 {
@@ -685,6 +686,7 @@ filepath(const char *filename)
 	return buf;
 }
 
+/* returns 0 on success, -1 on re-open failure (with errno set) */
 static int
 FileAccess(File file)
 {
@@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount)
 			   file, VfdCache[file].fileName,
 			   VfdCache[file].seekPos, amount, buffer));
 
-	FileAccess(file);
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
 	returnCode = read(VfdCache[file].fd, buffer, amount);
 	if (returnCode > 0)
 		VfdCache[file].seekPos += returnCode;
@@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount)
 			   file, VfdCache[file].fileName,
 			   VfdCache[file].seekPos, amount, buffer));
 
-	FileAccess(file);
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
 
 	errno = 0;
 	returnCode = write(VfdCache[file].fd, buffer, amount);
@@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount)
 	return returnCode;
 }
 
+int
+FileSync(File file)
+{
+	int			returnCode;
+
+	Assert(FileIsValid(file));
+
+	DO_DB(elog(LOG, "FileSync: %d (%s)",
+			   file, VfdCache[file].fileName));
+
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
+	return pg_fsync(VfdCache[file].fd);
+}
+
 long
 FileSeek(File file, long offset, int whence)
 {
+	int			returnCode;
+
 	Assert(FileIsValid(file));
 
 	DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d",
@@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence)
 				VfdCache[file].seekPos += offset;
 				break;
 			case SEEK_END:
-				FileAccess(file);
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+				returnCode = FileAccess(file);
+				if (returnCode < 0)
+					return returnCode;
+				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+											   offset, whence);
 				break;
 			default:
 				elog(ERROR, "invalid whence: %d", whence);
@@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence)
 				if (offset < 0)
 					elog(ERROR, "invalid seek offset: %ld", offset);
 				if (VfdCache[file].seekPos != offset)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+												   offset, whence);
 				break;
 			case SEEK_CUR:
 				if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos)
-					VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+					VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+												   offset, whence);
 				break;
 			case SEEK_END:
-				VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence);
+				VfdCache[file].seekPos = lseek(VfdCache[file].fd,
+											   offset, whence);
 				break;
 			default:
 				elog(ERROR, "invalid whence: %d", whence);
@@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset)
 	DO_DB(elog(LOG, "FileTruncate %d (%s)",
 			   file, VfdCache[file].fileName));
 
-	FileAccess(file);
+	returnCode = FileAccess(file);
+	if (returnCode < 0)
+		return returnCode;
+
 	returnCode = ftruncate(VfdCache[file].fd, (size_t) offset);
 	return returnCode;
 }
diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c
index 2122a243207b12049b5d655e3a01305e70b3aade..5ac5868f690b32196f7eca791674d2de3c6ca4b0 100644
--- a/src/backend/storage/smgr/md.c
+++ b/src/backend/storage/smgr/md.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -21,8 +21,10 @@
 
 #include "catalog/catalog.h"
 #include "miscadmin.h"
+#include "postmaster/bgwriter.h"
 #include "storage/fd.h"
 #include "storage/smgr.h"
+#include "utils/hsearch.h"
 #include "utils/memutils.h"
 
 
@@ -33,37 +35,68 @@
  *	system's file size limit (often 2GBytes).  In order to do that,
  *	we break relations up into chunks of < 2GBytes and store one chunk
  *	in each of several files that represent the relation.  See the
- *	BLCKSZ and RELSEG_SIZE configuration constants in
- *	include/pg_config.h.  All chunks except the last MUST have size exactly
- *	equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate().
+ *	BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h.
+ *	All chunks except the last MUST have size exactly equal to RELSEG_SIZE
+ *	blocks --- see mdnblocks() and mdtruncate().
  *
  *	The file descriptor pointer (md_fd field) stored in the SMgrRelation
  *	cache is, therefore, just the head of a list of MdfdVec objects.
  *	But note the md_fd pointer can be NULL, indicating relation not open.
  *
+ *	Note that mdfd_chain == NULL does not necessarily mean the relation
+ *	doesn't have another segment after this one; we may just not have
+ *	opened the next segment yet.  (We could not have "all segments are
+ *	in the chain" as an invariant anyway, since another backend could
+ *	extend the relation when we weren't looking.)
+ *
  *	All MdfdVec objects are palloc'd in the MdCxt memory context.
  */
 
 typedef struct _MdfdVec
 {
 	File		mdfd_vfd;			/* fd number in fd.c's pool */
-
-#ifndef LET_OS_MANAGE_FILESIZE
-	struct _MdfdVec *mdfd_chain;	/* for large relations */
+	BlockNumber	mdfd_segno;			/* segment number, from 0 */
+#ifndef LET_OS_MANAGE_FILESIZE		/* for large relations */
+	struct _MdfdVec *mdfd_chain;	/* next segment, or NULL */
 #endif
 } MdfdVec;
 
 static MemoryContext MdCxt;		/* context for all md.c allocations */
 
 
-/* routines declared here */
-static MdfdVec *mdopen(SMgrRelation reln);
+/*
+ * In some contexts (currently, standalone backends and the bgwriter process)
+ * we keep track of pending fsync operations: we need to remember all relation
+ * segments that have been written since the last checkpoint, so that we can
+ * fsync them down to disk before completing the next checkpoint.  This hash
+ * table remembers the pending operations.  We use a hash table not because
+ * we want to look up individual operations, but simply as a convenient way
+ * of eliminating duplicate requests.
+ *
+ * (Regular backends do not track pending operations locally, but forward
+ * them to the bgwriter.)
+ *
+ * XXX for WIN32, may want to expand this to track pending deletes, too.
+ */
+typedef struct
+{
+	RelFileNode	rnode;			/* the targeted relation */
+	BlockNumber	segno;			/* which segment */
+} PendingOperationEntry;
+
+static HTAB *pendingOpsTable = NULL;
+
+
+/* local routines */
+static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound);
+static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg);
 static MdfdVec *_fdvec_alloc(void);
 #ifndef LET_OS_MANAGE_FILESIZE
 static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno,
 							  int oflags);
 #endif
-static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno);
+static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno,
+							 bool allowNotFound);
 static BlockNumber _mdnblocks(File file, Size blcksz);
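
The pendingOpsTable declared above is used purely as a duplicate-eliminating
set: entering the same (rnode, segno) key any number of times leaves a single
entry, so a segment dirtied repeatedly between checkpoints is fsync'd only once
by mdsync().  A hypothetical illustration (not patch code):

	PendingOperationEntry key;

	MemSet(&key, 0, sizeof(key));	/* zero pad bytes so hashing and comparison are stable */
	key.rnode = rnode;
	key.segno = segno;

	(void) hash_search(pendingOpsTable, &key, HASH_ENTER, NULL);	/* first write: adds an entry */
	(void) hash_search(pendingOpsTable, &key, HASH_ENTER, NULL);	/* repeat write: finds the same entry */
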
 
 
@@ -79,6 +112,31 @@ mdinit(void)
 								  ALLOCSET_DEFAULT_INITSIZE,
 								  ALLOCSET_DEFAULT_MAXSIZE);
 
+	/*
+	 * Create pending-operations hashtable if we need it.  Currently,
+	 * we need it if we are standalone (not under a postmaster) OR
+	 * if we are a bootstrap-mode subprocess of a postmaster (that is,
+	 * a startup or bgwriter process).
+	 */
+	if (!IsUnderPostmaster || IsBootstrapProcessingMode())
+	{
+		HASHCTL		hash_ctl;
+
+		MemSet(&hash_ctl, 0, sizeof(hash_ctl));
+		hash_ctl.keysize = sizeof(PendingOperationEntry);
+		hash_ctl.entrysize = sizeof(PendingOperationEntry);
+		hash_ctl.hash = tag_hash;
+		hash_ctl.hcxt = MdCxt;
+		pendingOpsTable = hash_create("Pending Ops Table",
+									  100L,
+									  &hash_ctl,
+									  HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT);
+		if (pendingOpsTable == NULL)
+			ereport(FATAL,
+					(errcode(ERRCODE_OUT_OF_MEMORY),
+					 errmsg("out of memory")));
+	}
+
 	return true;
 }
 
@@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo)
 	reln->md_fd = _fdvec_alloc();
 
 	reln->md_fd->mdfd_vfd = fd;
+	reln->md_fd->mdfd_segno = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
 	reln->md_fd->mdfd_chain = NULL;
 #endif
@@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	int			nbytes;
 	MdfdVec    *v;
 
-	v = _mdfd_getseg(reln, blocknum);
+	v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 		return false;
 	}
 
+	if (!register_dirty_segment(reln, v))
+		return false;
+
 #ifndef LET_OS_MANAGE_FILESIZE
 	Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 #endif
@@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 
 /*
  *	mdopen() -- Open the specified relation.  ereport's on failure.
+ *		(Optionally, can return NULL instead of ereport for ENOENT.)
  *
  * Note we only open the first segment, when there are multiple segments.
  */
 static MdfdVec *
-mdopen(SMgrRelation reln)
+mdopen(SMgrRelation reln, bool allowNotFound)
 {
+	MdfdVec	   *mdfd;
 	char	   *path;
 	File		fd;
 
@@ -292,6 +356,8 @@ mdopen(SMgrRelation reln)
 		if (fd < 0)
 		{
 			pfree(path);
+			if (allowNotFound && errno == ENOENT)
+				return NULL;
 			ereport(ERROR,
 					(errcode_for_file_access(),
 					 errmsg("could not open relation %u/%u: %m",
@@ -302,15 +368,16 @@ mdopen(SMgrRelation reln)
 
 	pfree(path);
 
-	reln->md_fd = _fdvec_alloc();
+	reln->md_fd = mdfd = _fdvec_alloc();
 
-	reln->md_fd->mdfd_vfd = fd;
+	mdfd->mdfd_vfd = fd;
+	mdfd->mdfd_segno = 0;
 #ifndef LET_OS_MANAGE_FILESIZE
-	reln->md_fd->mdfd_chain = NULL;
+	mdfd->mdfd_chain = NULL;
 	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 #endif
 
-	return reln->md_fd;
+	return mdfd;
 }
 
 /*
@@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	int			nbytes;
 	MdfdVec    *v;
 
-	v = _mdfd_getseg(reln, blocknum);
+	v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	long		seekpos;
 	MdfdVec    *v;
 
-	v = _mdfd_getseg(reln, blocknum);
+	v = _mdfd_getseg(reln, blocknum, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)));
@@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 	if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ)
 		return false;
 
+	if (!register_dirty_segment(reln, v))
+		return false;
+
 	return true;
 }
 
@@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer)
 BlockNumber
 mdnblocks(SMgrRelation reln)
 {
-	MdfdVec    *v = mdopen(reln);
+	MdfdVec    *v = mdopen(reln, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	BlockNumber nblocks;
@@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
 	if (nblocks == curnblk)
 		return nblocks;			/* no work */
 
-	v = mdopen(reln);
+	v = mdopen(reln, false);
 
 #ifndef LET_OS_MANAGE_FILESIZE
 	priorblocks = 0;
@@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks)
 }
 
 /*
- *	mdcommit() -- Commit a transaction.
+ *	mdsync() -- Sync previous writes to stable storage.
+ *
+ * This is only called during checkpoints, and checkpoints should only
+ * occur in processes that have created a pendingOpsTable.
  */
 bool
-mdcommit(void)
+mdsync(void)
 {
+	HASH_SEQ_STATUS hstat;
+	PendingOperationEntry *entry;
+
+	if (!pendingOpsTable)
+		return false;
+
 	/*
-	 * We don't actually have to do anything here...
+	 * If we are in the bgwriter, the sync had better include all fsync
+	 * requests that were queued by backends before the checkpoint REDO
+	 * point was determined.  We go that a little better by accepting
+	 * all requests queued up to the point where we start fsync'ing.
 	 */
+	AbsorbFsyncRequests();
+
+	hash_seq_init(&hstat, pendingOpsTable);
+	while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL)
+	{
+		/*
+		 * If fsync is off then we don't have to bother opening the file
+		 * at all.  (We delay checking until this point so that changing
+		 * fsync on the fly behaves sensibly.)
+		 */
+		if (enableFsync)
+		{
+			SMgrRelation reln;
+			MdfdVec *seg;
+
+			/*
+			 * Find or create an smgr hash entry for this relation.
+			 * This may seem a bit unclean -- md calling smgr?  But it's
+			 * really the best solution.  It ensures that the open file
+			 * reference isn't permanently leaked if we get an error here.
+			 * (You may say "but an unreferenced SMgrRelation is still a
+			 * leak!"  Not really, because the only case in which a checkpoint
+			 * is done by a process that isn't about to shut down is in the
+			 * bgwriter, and it will periodically do smgrcloseall().  This
+			 * fact justifies our not closing the reln in the success path
+			 * either, which is a good thing since in non-bgwriter cases
+			 * we couldn't safely do that.)  Furthermore, in many cases
+			 * the relation will have been dirtied through this same smgr
+			 * relation, and so we can save a file open/close cycle.
+			 */
+			reln = smgropen(entry->rnode);
+
+			/*
+			 * It is possible that the relation has been dropped or truncated
+			 * since the fsync request was entered.  Therefore, we have to
+			 * allow file-not-found errors.  This applies both during
+			 * _mdfd_getseg() and during FileSync, since fd.c might have
+			 * closed the file behind our back.
+			 */
+			seg = _mdfd_getseg(reln,
+							   entry->segno * ((BlockNumber) RELSEG_SIZE),
+							   true);
+			if (seg)
+			{
+				if (FileSync(seg->mdfd_vfd) < 0 &&
+					errno != ENOENT)
+				{
+					ereport(LOG,
+							(errcode_for_file_access(),
+							 errmsg("could not fsync segment %u of relation %u/%u: %m",
+									entry->segno,
+									entry->rnode.tblNode,
+									entry->rnode.relNode)));
+					return false;
+				}
+			}
+		}
+
+		/* Okay, delete this entry */
+		if (hash_search(pendingOpsTable, entry,
+						HASH_REMOVE, NULL) == NULL)
+			elog(ERROR, "pendingOpsTable corrupted");
+	}
+
 	return true;
 }
 
 /*
- *	mdabort() -- Abort a transaction.
+ * register_dirty_segment() -- Mark a relation segment as needing fsync
+ *
+ * If there is a local pending-ops table, just make an entry in it for
+ * mdsync to process later.  Otherwise, try to pass off the fsync request
+ * to the background writer process.  If that fails, just do the fsync
+ * locally before returning (we expect this will not happen often enough
+ * to be a performance problem).
+ *
+ * A false result implies I/O failure during local fsync.  errno will be
+ * valid for error reporting.
  */
-bool
-mdabort(void)
+static bool
+register_dirty_segment(SMgrRelation reln, MdfdVec *seg)
 {
-	/*
-	 * We don't actually have to do anything here...
-	 */
+	if (pendingOpsTable)
+	{
+		PendingOperationEntry entry;
+
+		/* ensure any pad bytes in the struct are zeroed */
+		MemSet(&entry, 0, sizeof(entry));
+		entry.rnode = reln->smgr_rnode;
+		entry.segno = seg->mdfd_segno;
+
+		if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL)
+			return true;
+		/* out of memory: fall through to do it locally */
+	}
+	else
+	{
+		if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno))
+			return true;
+	}
+
+	if (FileSync(seg->mdfd_vfd) < 0)
+		return false;
 	return true;
 }
 
 /*
- *	mdsync() -- Sync previous writes to stable storage.
+ * RememberFsyncRequest() -- callback from bgwriter side of fsync request
+ *
+ * We stuff the fsync request into the local hash table for execution
+ * during the bgwriter's next checkpoint.
  */
-bool
-mdsync(void)
+void
+RememberFsyncRequest(RelFileNode rnode, BlockNumber segno)
 {
-	sync();
-	if (IsUnderPostmaster)
-		pg_usleep(2000000L);
-	sync();
-	return true;
+	PendingOperationEntry entry;
+
+	Assert(pendingOpsTable);
+
+	/* ensure any pad bytes in the struct are zeroed */
+	MemSet(&entry, 0, sizeof(entry));
+	entry.rnode = rnode;
+	entry.segno = segno;
+
+	if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL)
+		ereport(FATAL,
+				(errcode(ERRCODE_OUT_OF_MEMORY),
+				 errmsg("out of memory")));
 }
 
 /*
@@ -618,18 +802,11 @@ mdsync(void)
 static MdfdVec *
 _fdvec_alloc(void)
 {
-	MdfdVec *v;
-
-	v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
-	v->mdfd_vfd = -1;
-#ifndef LET_OS_MANAGE_FILESIZE
-	v->mdfd_chain = NULL;
-#endif
-
-	return v;
+	return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec));
 }
 
 #ifndef LET_OS_MANAGE_FILESIZE
+
 /*
  * Open the specified segment of the relation,
  * and make a MdfdVec object for it.  Returns NULL on failure.
@@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 	char	   *path,
 			   *fullpath;
 
-	/* be sure we have enough space for the '.segno', if any */
 	path = relpath(reln->smgr_rnode);
 
 	if (segno > 0)
 	{
+		/* be sure we have enough space for the '.segno' */
 		fullpath = (char *) palloc(strlen(path) + 12);
 		sprintf(fullpath, "%s.%u", path, segno);
 		pfree(path);
@@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags)
 
 	/* fill the entry */
 	v->mdfd_vfd = fd;
+	v->mdfd_segno = segno;
 	v->mdfd_chain = NULL;
 	Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE));
 
 	/* all done */
 	return v;
 }
-#endif
+
+#endif /* LET_OS_MANAGE_FILESIZE */
 
 /*
  *	_mdfd_getseg() -- Find the segment of the relation holding the
- *					  specified block.  ereport's on failure.
+ *		specified block.  ereport's on failure.
+ *		(Optionally, can return NULL instead of ereport for ENOENT.)
  */
 static MdfdVec *
-_mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
+_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound)
 {
-	MdfdVec    *v = mdopen(reln);
-
+	MdfdVec    *v = mdopen(reln, allowNotFound);
 #ifndef LET_OS_MANAGE_FILESIZE
-	BlockNumber segno;
-	BlockNumber i;
+	BlockNumber segstogo;
+	BlockNumber nextsegno;
 
-	for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1;
-		 segno > 0;
-		 i++, segno--)
-	{
+	if (!v)
+		return NULL;			/* only possible if allowNotFound */
 
+	for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1;
+		 segstogo > 0;
+		 nextsegno++, segstogo--)
+	{
 		if (v->mdfd_chain == NULL)
 		{
 			/*
@@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno)
 			 * one new segment per call, so this restriction seems
 			 * reasonable.
 			 */
-			v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0);
-
+			v->mdfd_chain = _mdfd_openseg(reln,
+										  nextsegno,
+										  (segstogo == 1) ? O_CREAT : 0);
 			if (v->mdfd_chain == NULL)
+			{
+				if (allowNotFound && errno == ENOENT)
+					return NULL;
 				ereport(ERROR,
 						(errcode_for_file_access(),
 						 errmsg("could not open segment %u of relation %u/%u (target block %u): %m",
-								i,
+								nextsegno,
 								reln->smgr_rnode.tblNode,
 								reln->smgr_rnode.relNode,
 								blkno)));
+			}
 		}
 		v = v->mdfd_chain;
 	}
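
The md.c hunks above replace the old sync-the-whole-world mdsync (sync(), sleep, sync()) with per-segment fsyncs that are remembered and carried out later: the dirty-segment registration code at the top of this section either forwards the (relation, segment) pair to the bgwriter or records it in the local pendingOpsTable, and RememberFsyncRequest is the bgwriter-side callback that stores the pair until the next checkpoint. Below is a minimal standalone sketch of that remember-then-flush pattern; it assumes a plain array instead of PostgreSQL's dynahash table, and the names PendingFsync, remember_fsync and flush_pending_fsyncs are illustrative only.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

typedef struct
{
    unsigned int rel;       /* stand-in for a RelFileNode */
    unsigned int segno;     /* segment number within the relation */
    int fd;                 /* open descriptor for that segment */
} PendingFsync;

#define MAX_PENDING 128

static PendingFsync pending_fsyncs[MAX_PENDING];
static int npending = 0;

/* Record that a segment was dirtied; duplicate requests are collapsed. */
static bool
remember_fsync(unsigned int rel, unsigned int segno, int fd)
{
    int i;

    for (i = 0; i < npending; i++)
        if (pending_fsyncs[i].rel == rel && pending_fsyncs[i].segno == segno)
            return true;        /* already queued */
    if (npending >= MAX_PENDING)
        return false;           /* table full: caller should fsync now */
    pending_fsyncs[npending].rel = rel;
    pending_fsyncs[npending].segno = segno;
    pending_fsyncs[npending].fd = fd;
    npending++;
    return true;
}

/* "Checkpoint" time: fsync every remembered segment exactly once. */
static bool
flush_pending_fsyncs(void)
{
    bool ok = true;
    int i;

    for (i = 0; i < npending; i++)
        if (fsync(pending_fsyncs[i].fd) < 0)
            ok = false;
    npending = 0;
    return ok;
}

int
main(void)
{
    FILE *f = tmpfile();

    if (f == NULL)
        return 1;
    /* Two writes to the same segment produce only one queued request. */
    remember_fsync(42, 0, fileno(f));
    remember_fsync(42, 0, fileno(f));
    printf("queued %d fsync request(s)\n", npending);
    flush_pending_fsyncs();
    fclose(f);
    return 0;
}

The payoff of collapsing duplicates is the same as in the patch: no matter how many times a segment is written between checkpoints, it is fsync'd once.
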
diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c
index d242744a4d7c47d2cc6700e1c3af8ecc23ddc522..c204e2796c4b2125b5f83a1a3fb574e2a9bd85a1 100644
--- a/src/backend/storage/smgr/smgr.c
+++ b/src/backend/storage/smgr/smgr.c
@@ -11,7 +11,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -56,7 +56,7 @@ typedef struct f_smgr
 static const f_smgr smgrsw[] = {
 	/* magnetic disk */
 	{mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend,
-	 mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync
+	 mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync
 	}
 };
 
@@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo)
 	 * Get rid of any leftover buffers for the rel (shouldn't be any in the
 	 * commit case, but there can be in the abort case).
 	 */
-	DropRelFileNodeBuffers(rnode, isTemp);
+	DropRelFileNodeBuffers(rnode, isTemp, 0);
 
 	/*
 	 * Tell the free space map to forget this relation.  It won't be accessed
@@ -638,7 +638,7 @@ smgrcommit(void)
 		if (smgrsw[i].smgr_commit)
 		{
 			if (! (*(smgrsw[i].smgr_commit)) ())
-				elog(FATAL, "transaction commit failed on %s: %m",
+				elog(ERROR, "transaction commit failed on %s: %m",
 					 DatumGetCString(DirectFunctionCall1(smgrout,
 													 Int16GetDatum(i))));
 		}
@@ -658,7 +658,7 @@ smgrabort(void)
 		if (smgrsw[i].smgr_abort)
 		{
 			if (! (*(smgrsw[i].smgr_abort)) ())
-				elog(FATAL, "transaction abort failed on %s: %m",
+				elog(ERROR, "transaction abort failed on %s: %m",
 					 DatumGetCString(DirectFunctionCall1(smgrout,
 													 Int16GetDatum(i))));
 		}
@@ -678,7 +678,7 @@ smgrsync(void)
 		if (smgrsw[i].smgr_sync)
 		{
 			if (! (*(smgrsw[i].smgr_sync)) ())
-				elog(PANIC, "storage sync failed on %s: %m",
+				elog(ERROR, "storage sync failed on %s: %m",
 					 DatumGetCString(DirectFunctionCall1(smgrout,
 													 Int16GetDatum(i))));
 		}
@@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record)
 
 		reln = smgropen(xlrec->rnode);
 
+		/*
+		 * First, force bufmgr to drop any buffers it has for the to-be-
+		 * truncated blocks.  We must do this, else subsequent XLogReadBuffer
+		 * operations will not re-extend the file properly.
+		 */
+		DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno);
+
 		/* Can't use smgrtruncate because it would try to xlog */
 
 		/*
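
In the smgrsw table above, mdcommit and mdabort are replaced by NULL entries, and smgrcommit, smgrabort and smgrsync already test each slot before calling it, so a NULL simply means the storage manager has no work to do at that point. Here is a minimal sketch of the same optional-hook dispatch pattern; StorageHooks, managers, md_sync and sync_all are made-up names, not PostgreSQL's.

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    const char *name;
    bool (*sync) (void);        /* may be NULL: nothing to sync */
    bool (*commit) (void);      /* may be NULL: no commit-time work */
} StorageHooks;

static bool
md_sync(void)
{
    printf("md: syncing dirty segments\n");
    return true;
}

/* The disk manager provides a sync hook but no commit hook. */
static const StorageHooks managers[] = {
    {"md", md_sync, NULL},
};

static bool
sync_all(void)
{
    size_t i;

    for (i = 0; i < sizeof(managers) / sizeof(managers[0]); i++)
    {
        if (managers[i].sync == NULL)
            continue;           /* hook not implemented: skip it */
        if (!managers[i].sync())
        {
            fprintf(stderr, "storage sync failed on %s\n", managers[i].name);
            return false;
        }
    }
    return true;
}

int
main(void)
{
    return sync_all() ? 0 : 1;
}
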
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index fec968e7a202c4b53146e0e451c64afdfe9d8fdf..213cca5c21654510ba3f78ed90f20ed981bc6e7a 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -6,26 +6,17 @@
  * Portions Copyright (c) 2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $
  */
 #ifndef SLRU_H
 #define SLRU_H
 
-#include "access/xlog.h"
+#include "storage/lwlock.h"
 
-/* exported because lwlock.c needs it */
-#define NUM_CLOG_BUFFERS	8
 
-/*
- * Note: the separation between SlruLockData and SlruSharedData is purely
- * historical; the structs could be combined.
- */
-typedef struct SlruLockData
-{
-	LWLockId	ControlLock;
-	LWLockId	BufferLocks[NUM_CLOG_BUFFERS];	/* Per-buffer I/O locks */
-} SlruLockData;
-typedef SlruLockData *SlruLock;
+/* Opaque structs known only in slru.c */
+typedef struct SlruSharedData *SlruShared;
+typedef struct SlruFlushData *SlruFlush;
 
 /*
  * SlruCtlData is an unshared structure that points to the active information
@@ -33,13 +24,13 @@ typedef SlruLockData *SlruLock;
  */
 typedef struct SlruCtlData
 {
-	void	   *shared;			/* pointer to SlruSharedData */
-	SlruLock	locks;
+	SlruShared	shared;
+
+	LWLockId	ControlLock;
 
 	/*
-	 * Dir is set during SimpleLruShmemInit and does not change thereafter.
-	 * The value is automatically inherited by backends via fork, and
-	 * doesn't need to be in shared memory.
+	 * Dir is set during SimpleLruInit and does not change thereafter.
+	 * Since it's always the same, it doesn't need to be in shared memory.
 	 */
 	char		Dir[MAXPGPATH];
 
@@ -51,13 +42,16 @@ typedef struct SlruCtlData
 	bool		(*PagePrecedes) (int, int);
 
 } SlruCtlData;
+
 typedef SlruCtlData *SlruCtl;
 
+
 extern int	SimpleLruShmemSize(void);
 extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
 extern int	SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite);
-extern void SimpleLruWritePage(SlruCtl ctl, int slotno);
+extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
+							   TransactionId xid, bool forwrite);
+extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
 extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
 extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
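
The slru.h hunk above turns SlruSharedData and SlruFlushData into opaque types: the header only forward-declares the structs and typedefs pointers to them, so callers can pass an SlruShared or SlruFlush around without knowing the layout, and slru.c can change that layout without forcing other translation units to recompile their assumptions. A small sketch of the same idiom with an invented Counter type follows; in a real build the header and implementation halves would live in separate files, and only the implementation file would see the struct body.

/* counter.h -- public interface; the layout stays private to counter.c */
typedef struct CounterData *Counter;    /* opaque handle, like SlruShared */

extern Counter counter_create(void);
extern void counter_bump(Counter c);
extern int counter_value(Counter c);

/* counter.c -- the only place that knows what CounterData contains */
#include <stdio.h>
#include <stdlib.h>

struct CounterData
{
    int value;
};

Counter
counter_create(void)
{
    Counter c = malloc(sizeof(struct CounterData));

    if (c != NULL)
        c->value = 0;
    return c;
}

void
counter_bump(Counter c)
{
    c->value++;
}

int
counter_value(Counter c)
{
    return c->value;
}

/* A caller only ever sees the typedef, never the struct body. */
int
main(void)
{
    Counter c = counter_create();

    if (c == NULL)
        return 1;
    counter_bump(c);
    printf("%d\n", counter_value(c));   /* prints 1 */
    free(c);
    return 0;
}
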
diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h
index c11af72e78945f91dfe76bfab00eeab7eabea886..ed56e9639e889ec64fe0dd2c45476561d9b0dcaf 100644
--- a/src/include/postmaster/bgwriter.h
+++ b/src/include/postmaster/bgwriter.h
@@ -5,13 +5,17 @@
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #ifndef _BGWRITER_H
 #define _BGWRITER_H
 
+#include "storage/block.h"
+#include "storage/relfilenode.h"
+
+
 /* GUC options */
 extern int	BgWriterDelay;
 extern int	BgWriterPercent;
@@ -23,6 +27,9 @@ extern void BackgroundWriterMain(void);
 
 extern void RequestCheckpoint(bool waitforit);
 
+extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno);
+extern void AbsorbFsyncRequests(void);
+
 extern int	BgWriterShmemSize(void);
 extern void BgWriterShmemInit(void);
 
diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h
index 27752d412b56435ddbe61bf77adcd83554d1df92..95b426bb8b93f21faa93c2069167eb8fe048ea49 100644
--- a/src/include/storage/bufmgr.h
+++ b/src/include/storage/bufmgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -154,7 +154,8 @@ extern BlockNumber RelationGetNumberOfBlocks(Relation relation);
 extern void RelationTruncate(Relation rel, BlockNumber nblocks);
 extern int	FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock);
 extern void DropRelationBuffers(Relation rel);
-extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp);
+extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp,
+								   BlockNumber firstDelBlock);
 extern void DropBuffers(Oid dbid);
 
 #ifdef NOT_USED
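
DropRelFileNodeBuffers now takes a firstDelBlock argument, which lets one routine serve both callers seen in the smgr.c hunks: passing 0 discards every buffer of the relation (the unlink path), while passing the truncation point discards only buffers at or beyond it (the smgr_redo truncate path). Here is a minimal sketch of that filter over an in-memory buffer table; BufferSlot and drop_buffers are invented names, not the bufmgr implementation.

#include <stdbool.h>
#include <stdio.h>

typedef struct
{
    unsigned int rel;       /* which relation the page belongs to */
    unsigned int blockno;   /* which block of that relation */
    bool valid;             /* is this buffer slot in use? */
} BufferSlot;

/* Invalidate every buffer of 'rel' whose block number is >= first_del_block. */
static void
drop_buffers(BufferSlot *buffers, int nbuffers,
             unsigned int rel, unsigned int first_del_block)
{
    int i;

    for (i = 0; i < nbuffers; i++)
    {
        if (buffers[i].valid &&
            buffers[i].rel == rel &&
            buffers[i].blockno >= first_del_block)
            buffers[i].valid = false;
    }
}

int
main(void)
{
    BufferSlot buffers[] = {
        {42, 0, true}, {42, 5, true}, {42, 9, true}, {7, 3, true}
    };
    int i;

    /* Truncating relation 42 to 5 blocks drops its buffers for blocks >= 5. */
    drop_buffers(buffers, 4, 42, 5);

    for (i = 0; i < 4; i++)
        printf("rel %u blk %u: %s\n", buffers[i].rel, buffers[i].blockno,
               buffers[i].valid ? "kept" : "dropped");
    return 0;
}
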
diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h
index 177925cf3e80776dbc34dd543ca8e36b5fcbea76..430ed5d8c74181f11b10a41428453f123159f06b 100644
--- a/src/include/storage/fd.h
+++ b/src/include/storage/fd.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -15,7 +15,7 @@
 /*
  * calls:
  *
- *	File {Close, Read, Write, Seek, Tell, MarkDirty, Sync}
+ *	File {Close, Read, Write, Seek, Tell, Sync}
  *	{File Name Open, Allocate, Free} File
  *
  * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES.
@@ -66,6 +66,7 @@ extern void FileClose(File file);
 extern void FileUnlink(File file);
 extern int	FileRead(File file, char *buffer, int amount);
 extern int	FileWrite(File file, char *buffer, int amount);
+extern int	FileSync(File file);
 extern long FileSeek(File file, long offset, int whence);
 extern int	FileTruncate(File file, long offset);
 
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 34f9c6613c709f9122a054dfb37deab756bede29..e06d9a4bf77f43af92bdb216ad5b009c75a868c1 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,6 +37,7 @@ typedef enum LWLockId
 	ControlFileLock,
 	CheckpointLock,
 	RelCacheInitLock,
+	BgWriterCommLock,
 
 	NumFixedLWLocks,			/* must be last except for
 								 * MaxDynamicLWLock */
diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h
index 41367d35e819b92ff1a5b889753e1f193006366a..6a28c3824fad8e6b5f1a90cabb455e8219b8b53f 100644
--- a/src/include/storage/smgr.h
+++ b/src/include/storage/smgr.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -83,10 +83,10 @@ extern bool mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer);
 extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer);
 extern BlockNumber mdnblocks(SMgrRelation reln);
 extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks);
-extern bool mdcommit(void);
-extern bool mdabort(void);
 extern bool mdsync(void);
 
+extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno);
+
 /* smgrtype.c */
 extern Datum smgrout(PG_FUNCTION_ARGS);
 extern Datum smgrin(PG_FUNCTION_ARGS);