diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c
index 6de10d16a129ded8741837e56ba18344aedee6f6..3c121b1bba2d43b2fa9ff88b873af260d33466ea 100644
--- a/src/backend/access/transam/clog.c
+++ b/src/backend/access/transam/clog.c
@@ -10,29 +10,34 @@
  * looked up again.  Now we use specialized access code so that the commit
  * log can be broken into relatively small, independent segments.
  *
+ * XLOG interactions: this module generates an XLOG record whenever a new
+ * CLOG page is initialized to zeroes.  Other writes of CLOG come from
+ * recording of transaction commit or abort in xact.c, which generates its
+ * own XLOG records for these events and will re-perform the status update
+ * on redo; so we need make no additional XLOG entry here.  Also, the XLOG
+ * is guaranteed flushed through the XLOG commit record before we are called
+ * to log a commit, so the WAL rule "write xlog before data" is satisfied
+ * automatically for commits, and we don't really care for aborts.  Therefore,
+ * we don't need to mark CLOG pages with LSN information; we have enough
+ * synchronization already.
+ *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.22 2004/07/03 02:55:56 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.23 2004/08/23 23:22:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <fcntl.h>
-#include <dirent.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
 #include "access/clog.h"
 #include "access/slru.h"
-#include "miscadmin.h"
-#include "storage/lwlock.h"
+#include "postmaster/bgwriter.h"
 
 
 /*
- * Defines for CLOG page and segment sizes.  A page is the same BLCKSZ
- * as is used everywhere else in Postgres.
+ * Defines for CLOG page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
  *
  * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
  * CLOG page numbering also wraps around at 0xFFFFFFFF/CLOG_XACTS_PER_PAGE,
@@ -53,25 +58,11 @@
 #define TransactionIdToBIndex(xid)	((xid) % (TransactionId) CLOG_XACTS_PER_BYTE)
 
 
-/*----------
- * Shared-memory data structures for CLOG control
- *
- * XLOG interactions: this module generates an XLOG record whenever a new
- * CLOG page is initialized to zeroes.	Other writes of CLOG come from
- * recording of transaction commit or abort in xact.c, which generates its
- * own XLOG records for these events and will re-perform the status update
- * on redo; so we need make no additional XLOG entry here.	Also, the XLOG
- * is guaranteed flushed through the XLOG commit record before we are called
- * to log a commit, so the WAL rule "write xlog before data" is satisfied
- * automatically for commits, and we don't really care for aborts.  Therefore,
- * we don't need to mark CLOG pages with LSN information; we have enough
- * synchronization already.
- *----------
+/*
+ * Link to shared-memory data structures for CLOG control
  */
-
-
 static SlruCtlData ClogCtlData;
-static SlruCtl ClogCtl = &ClogCtlData;
+#define ClogCtl (&ClogCtlData)
 
 
 static int	ZeroCLOGPage(int pageno, bool writeXlog);
@@ -91,6 +82,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 	int			pageno = TransactionIdToPage(xid);
 	int			byteno = TransactionIdToByte(xid);
 	int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
+	int			slotno;
 	char	   *byteptr;
 	char		byteval;
 
@@ -98,10 +90,10 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 		   status == TRANSACTION_STATUS_ABORTED ||
 		   status == TRANSACTION_STATUS_SUB_COMMITTED);
 
-	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
-	byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true);
-	byteptr += byteno;
+	slotno = SimpleLruReadPage(ClogCtl, pageno, xid);
+	byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
 
 	/* Current state should be 0, subcommitted or target state */
 	Assert(((*byteptr >> bshift) & CLOG_XACT_BITMASK) == 0 ||
@@ -114,9 +106,9 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status)
 	byteval |= (status << bshift);
 	*byteptr = byteval;
 
-	/* ...->page_status[slotno] = SLRU_PAGE_DIRTY; already done */
+	ClogCtl->shared->page_status[slotno] = SLRU_PAGE_DIRTY;
 
-	LWLockRelease(ClogCtl->ControlLock);
+	LWLockRelease(CLogControlLock);
 }
 
 /*
@@ -131,17 +123,18 @@ TransactionIdGetStatus(TransactionId xid)
 	int			pageno = TransactionIdToPage(xid);
 	int			byteno = TransactionIdToByte(xid);
 	int			bshift = TransactionIdToBIndex(xid) * CLOG_BITS_PER_XACT;
+	int			slotno;
 	char	   *byteptr;
 	XidStatus	status;
 
-	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
-	byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false);
-	byteptr += byteno;
+	slotno = SimpleLruReadPage(ClogCtl, pageno, xid);
+	byteptr = ClogCtl->shared->page_buffer[slotno] + byteno;
 
 	status = (*byteptr >> bshift) & CLOG_XACT_BITMASK;
 
-	LWLockRelease(ClogCtl->ControlLock);
+	LWLockRelease(CLogControlLock);
 
 	return status;
 }
@@ -160,8 +153,8 @@ CLOGShmemSize(void)
 void
 CLOGShmemInit(void)
 {
-	SimpleLruInit(ClogCtl, "CLOG Ctl", "pg_clog");
 	ClogCtl->PagePrecedes = CLOGPagePrecedes;
+	SimpleLruInit(ClogCtl, "CLOG Ctl", CLogControlLock, "pg_clog");
 }
 
 /*
@@ -175,16 +168,16 @@ BootStrapCLOG(void)
 {
 	int			slotno;
 
-	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
 	/* Create and zero the first page of the commit log */
 	slotno = ZeroCLOGPage(0, false);
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(ClogCtl, slotno, NULL);
-	/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
+	Assert(ClogCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN);
 
-	LWLockRelease(ClogCtl->ControlLock);
+	LWLockRelease(CLogControlLock);
 }
 
 /*
@@ -199,7 +192,9 @@ BootStrapCLOG(void)
 static int
 ZeroCLOGPage(int pageno, bool writeXlog)
 {
-	int			slotno = SimpleLruZeroPage(ClogCtl, pageno);
+	int			slotno;
+
+	slotno = SimpleLruZeroPage(ClogCtl, pageno);
 
 	if (writeXlog)
 		WriteZeroPageXlogRec(pageno);
@@ -217,8 +212,7 @@ StartupCLOG(void)
 	/*
 	 * Initialize our idea of the latest page number.
 	 */
-	SimpleLruSetLatestPage(ClogCtl,
-						   TransactionIdToPage(ShmemVariableCache->nextXid));
+	ClogCtl->shared->latest_page_number = TransactionIdToPage(ShmemVariableCache->nextXid);
 }
 
 /*
@@ -227,6 +221,7 @@ StartupCLOG(void)
 void
 ShutdownCLOG(void)
 {
+	/* Flush dirty CLOG pages to disk */
 	SimpleLruFlush(ClogCtl, false);
 }
 
@@ -236,6 +231,7 @@ ShutdownCLOG(void)
 void
 CheckPointCLOG(void)
 {
+	/* Flush dirty CLOG pages to disk */
 	SimpleLruFlush(ClogCtl, true);
 }
 
@@ -263,12 +259,12 @@ ExtendCLOG(TransactionId newestXact)
 
 	pageno = TransactionIdToPage(newestXact);
 
-	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
 
 	/* Zero the page and make an XLOG entry about it */
 	ZeroCLOGPage(pageno, true);
 
-	LWLockRelease(ClogCtl->ControlLock);
+	LWLockRelease(CLogControlLock);
 }
 
 
@@ -296,6 +292,15 @@ TruncateCLOG(TransactionId oldestXact)
 	 * We pass the *page* containing oldestXact to SimpleLruTruncate.
 	 */
 	cutoffPage = TransactionIdToPage(oldestXact);
+
+	/* Check to see if there are any files that could be removed */
+	if (!SlruScanDirectory(ClogCtl, cutoffPage, false))
+		return;					/* nothing to remove */
+
+	/* Perform a CHECKPOINT */
+	RequestCheckpoint(true);
+
+	/* Now we can remove the old CLOG segment(s) */
 	SimpleLruTruncate(ClogCtl, cutoffPage);
 }
 
@@ -340,20 +345,51 @@ WriteZeroPageXlogRec(int pageno)
 	rdata.data = (char *) (&pageno);
 	rdata.len = sizeof(int);
 	rdata.next = NULL;
-	(void) XLogInsert(RM_SLRU_ID, CLOG_ZEROPAGE | XLOG_NO_TRAN, &rdata);
+	(void) XLogInsert(RM_CLOG_ID, CLOG_ZEROPAGE | XLOG_NO_TRAN, &rdata);
 }
 
-/* Redo a ZEROPAGE action during WAL replay */
+/*
+ * CLOG resource manager's routines
+ */
 void
-clog_zeropage_redo(int pageno)
+clog_redo(XLogRecPtr lsn, XLogRecord *record)
 {
-	int			slotno;
+	uint8		info = record->xl_info & ~XLR_INFO_MASK;
 
-	LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE);
+	if (info == CLOG_ZEROPAGE)
+	{
+		int			pageno;
+		int			slotno;
 
-	slotno = ZeroCLOGPage(pageno, false);
-	SimpleLruWritePage(ClogCtl, slotno, NULL);
-	/* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
+		memcpy(&pageno, XLogRecGetData(record), sizeof(int));
+
+		LWLockAcquire(CLogControlLock, LW_EXCLUSIVE);
+
+		slotno = ZeroCLOGPage(pageno, false);
+		SimpleLruWritePage(ClogCtl, slotno, NULL);
+		Assert(ClogCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN);
+
+		LWLockRelease(CLogControlLock);
+	}
+}
+
+void
+clog_undo(XLogRecPtr lsn, XLogRecord *record)
+{
+}
+
+void
+clog_desc(char *buf, uint8 xl_info, char *rec)
+{
+	uint8		info = xl_info & ~XLR_INFO_MASK;
+
+	if (info == CLOG_ZEROPAGE)
+	{
+		int			pageno;
 
-	LWLockRelease(ClogCtl->ControlLock);
+		memcpy(&pageno, rec, sizeof(int));
+		sprintf(buf + strlen(buf), "zeropage: %d", pageno);
+	}
+	else
+		strcat(buf, "UNKNOWN");
 }
diff --git a/src/backend/access/transam/rmgr.c b/src/backend/access/transam/rmgr.c
index ad68e4c99baef77a5b4023085babaae737c7f119..575ad7a08912897b58b1bfb818ea72bd4562b5d5 100644
--- a/src/backend/access/transam/rmgr.c
+++ b/src/backend/access/transam/rmgr.c
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.14 2004/07/21 22:31:20 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/rmgr.c,v 1.15 2004/08/23 23:22:44 tgl Exp $
  */
 #include "postgres.h"
 
@@ -12,7 +12,7 @@
 #include "access/heapam.h"
 #include "access/nbtree.h"
 #include "access/rtree.h"
-#include "access/slru.h"
+#include "access/clog.h"
 #include "access/xact.h"
 #include "access/xlog_internal.h"
 #include "storage/smgr.h"
@@ -23,7 +23,7 @@ const RmgrData RmgrTable[RM_MAX_ID + 1] = {
 	{"XLOG", xlog_redo, xlog_undo, xlog_desc, NULL, NULL},
 	{"Transaction", xact_redo, xact_undo, xact_desc, NULL, NULL},
 	{"Storage", smgr_redo, smgr_undo, smgr_desc, NULL, NULL},
-	{"SLRU", slru_redo, slru_undo, slru_desc, NULL, NULL},
+	{"CLOG", clog_redo, clog_undo, clog_desc, NULL, NULL},
 	{"Reserved 4", NULL, NULL, NULL, NULL, NULL},
 	{"Reserved 5", NULL, NULL, NULL, NULL, NULL},
 	{"Reserved 6", NULL, NULL, NULL, NULL, NULL},
diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c
index d45a7d9f6141e60d3e13eb8b982702c38c34e431..5d51f69a53123f2c550c35a5aa5398b54d287a65 100644
--- a/src/backend/access/transam/slru.c
+++ b/src/backend/access/transam/slru.c
@@ -3,49 +3,6 @@
  * slru.c
  *		Simple LRU buffering for transaction status logfiles
  *
- * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
- * Portions Copyright (c) 1994, Regents of the University of California
- *
- * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.18 2004/07/21 22:31:20 tgl Exp $
- *
- *-------------------------------------------------------------------------
- */
-#include "postgres.h"
-
-#include <fcntl.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
-#include "access/clog.h"
-#include "access/slru.h"
-#include "access/subtrans.h"
-#include "postmaster/bgwriter.h"
-#include "storage/fd.h"
-#include "storage/lwlock.h"
-#include "storage/shmem.h"
-#include "miscadmin.h"
-
-
-/*
- * Define segment size.  A page is the same BLCKSZ as is used everywhere
- * else in Postgres.  The segment size can be chosen somewhat arbitrarily;
- * we make it 32 pages by default, or 256Kb, i.e. 1M transactions for CLOG
- * or 64K transactions for SUBTRANS.
- *
- * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
- * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where
- * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at
- * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need
- * take no explicit notice of that fact in this module, except when comparing
- * segment and page numbers in SimpleLruTruncate (see PagePrecedes()).
- */
-
-#define SLRU_PAGES_PER_SEGMENT	32
-
-
-/*----------
- * Shared-memory data structures for SLRU control
- *
  * We use a simple least-recently-used scheme to manage a pool of page
  * buffers.  Under ordinary circumstances we expect that write
  * traffic will occur mostly to the latest page (and to the just-prior
@@ -86,44 +43,46 @@
  * to re-dirty a page that is currently being written out.	This is handled
  * by setting the page's state from WRITE_IN_PROGRESS to DIRTY.  The writing
  * process must notice this and not mark the page CLEAN when it's done.
- *----------
+ *
+ *
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1994, Regents of the University of California
+ *
+ * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.19 2004/08/23 23:22:44 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
  */
+#include "postgres.h"
 
-typedef enum
-{
-	SLRU_PAGE_EMPTY,			/* buffer is not in use */
-	SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */
-	SLRU_PAGE_CLEAN,			/* page is valid and not dirty */
-	SLRU_PAGE_DIRTY,			/* page is valid but needs write */
-	SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */
-} SlruPageStatus;
+#include <fcntl.h>
+#include <sys/stat.h>
+#include <unistd.h>
 
-/*
- * Shared-memory state
- */
-typedef struct SlruSharedData
-{
-	LWLockId	ControlLock;
+#include "access/slru.h"
+#include "access/xlog.h"
+#include "storage/fd.h"
+#include "storage/shmem.h"
+#include "miscadmin.h"
 
-	/*
-	 * Info for each buffer slot.  Page number is undefined when status is
-	 * EMPTY.  lru_count is essentially the number of page switches since
-	 * last use of this page; the page with highest lru_count is the best
-	 * candidate to replace.
-	 */
-	char	   *page_buffer[NUM_CLOG_BUFFERS];
-	SlruPageStatus page_status[NUM_CLOG_BUFFERS];
-	int			page_number[NUM_CLOG_BUFFERS];
-	unsigned int page_lru_count[NUM_CLOG_BUFFERS];
-	LWLockId	BufferLocks[NUM_CLOG_BUFFERS];	/* Per-buffer I/O locks */
 
-	/*
-	 * latest_page_number is the page number of the current end of the
-	 * CLOG; this is not critical data, since we use it only to avoid
-	 * swapping out the latest page.
-	 */
-	int			latest_page_number;
-} SlruSharedData;
+/*
+ * Define segment size.  A page is the same BLCKSZ as is used everywhere
+ * else in Postgres.  The segment size can be chosen somewhat arbitrarily;
+ * we make it 32 pages by default, or 256Kb, i.e. 1M transactions for CLOG
+ * or 64K transactions for SUBTRANS.
+ *
+ * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
+ * page numbering also wraps around at 0xFFFFFFFF/xxxx_XACTS_PER_PAGE (where
+ * xxxx is CLOG or SUBTRANS, respectively), and segment numbering at
+ * 0xFFFFFFFF/xxxx_XACTS_PER_PAGE/SLRU_PAGES_PER_SEGMENT.  We need
+ * take no explicit notice of that fact in this module, except when comparing
+ * segment and page numbers in SimpleLruTruncate (see PagePrecedes()).
+ *
+ * Note: this file currently assumes that segment file names will be four
+ * hex digits.  This sets a lower bound on the segment size (64K transactions
+ * for 32-bit TransactionIds).
+ */
+#define SLRU_PAGES_PER_SEGMENT	32
 
 #define SlruFileName(ctl, path, seg) \
 	snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg)
@@ -138,8 +97,8 @@ typedef struct SlruSharedData
 typedef struct SlruFlushData
 {
 	int			num_files;					/* # files actually open */
-	int			fd[NUM_CLOG_BUFFERS];		/* their FD's */
-	int			segno[NUM_CLOG_BUFFERS];	/* their clog seg#s */
+	int			fd[NUM_SLRU_BUFFERS];		/* their FD's */
+	int			segno[NUM_SLRU_BUFFERS];	/* their log seg#s */
 } SlruFlushData;
 
 /*
@@ -149,7 +108,7 @@ typedef struct SlruFlushData
 	do { \
 		if ((shared)->page_lru_count[slotno] != 0) { \
 			int		iilru; \
-			for (iilru = 0; iilru < NUM_CLOG_BUFFERS; iilru++) \
+			for (iilru = 0; iilru < NUM_SLRU_BUFFERS; iilru++) \
 				(shared)->page_lru_count[iilru]++; \
 			(shared)->page_lru_count[slotno] = 0; \
 		} \
@@ -176,7 +135,6 @@ static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno,
 								  SlruFlush fdata);
 static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid);
 static int	SlruSelectLRUPage(SlruCtl ctl, int pageno);
-static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
 
 
 /*
@@ -186,11 +144,12 @@ static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
 int
 SimpleLruShmemSize(void)
 {
-	return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS;
+	return BUFFERALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_SLRU_BUFFERS;
 }
 
 void
-SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
+SimpleLruInit(SlruCtl ctl, const char *name,
+			  LWLockId ctllock, const char *subdir)
 {
 	SlruShared	shared;
 	bool		found;
@@ -207,16 +166,16 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 
 		memset(shared, 0, sizeof(SlruSharedData));
 
-		shared->ControlLock = LWLockAssign();
+		shared->ControlLock = ctllock;
 
-		bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData));
+		bufptr = (char *) shared + BUFFERALIGN(sizeof(SlruSharedData));
 
-		for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
+		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
 		{
 			shared->page_buffer[slotno] = bufptr;
 			shared->page_status[slotno] = SLRU_PAGE_EMPTY;
 			shared->page_lru_count[slotno] = 1;
-			shared->BufferLocks[slotno] = LWLockAssign();
+			shared->buffer_locks[slotno] = LWLockAssign();
 			bufptr += BLCKSZ;
 		}
 
@@ -225,11 +184,12 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 	else
 		Assert(found);
 
-	/* Initialize the unshared control struct */
+	/*
+	 * Initialize the unshared control struct, including directory path.
+	 * We assume caller set PagePrecedes.
+	 */
 	ctl->shared = shared;
-	ctl->ControlLock = shared->ControlLock;
-
-	/* Initialize unshared copy of directory path */
+	ctl->do_fsync = true;		/* default behavior */
 	snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir);
 }
 
@@ -244,8 +204,8 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir)
 int
 SimpleLruZeroPage(SlruCtl ctl, int pageno)
 {
-	int			slotno;
 	SlruShared	shared = ctl->shared;
+	int			slotno;
 
 	/* Find a suitable buffer slot for the page */
 	slotno = SlruSelectLRUPage(ctl, pageno);
@@ -274,14 +234,13 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno)
  * The passed-in xid is used only for error reporting, and may be
  * InvalidTransactionId if no specific xid is associated with the action.
  *
- * Return value is the shared-buffer address of the page.
+ * Return value is the shared-buffer slot number now holding the page.
  * The buffer's LRU access info is updated.
- * If forwrite is true, the buffer is marked as dirty.
  *
  * Control lock must be held at entry, and will be held at exit.
  */
-char *
-SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
+int
+SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid)
 {
 	SlruShared	shared = ctl->shared;
 
@@ -303,9 +262,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 			{
 				/* otherwise, it's ready to use */
 				SlruRecentlyUsed(shared, slotno);
-				if (forwrite)
-					shared->page_status[slotno] = SLRU_PAGE_DIRTY;
-				return shared->page_buffer[slotno];
+				return slotno;
 			}
 		}
 		else
@@ -327,7 +284,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 
 		/* Release shared lock, grab per-buffer lock instead */
 		LWLockRelease(shared->ControlLock);
-		LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
+		LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);
 
 		/*
 		 * Check to see if someone else already did the read, or took the
@@ -336,7 +293,7 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 		if (shared->page_number[slotno] != pageno ||
 			shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS)
 		{
-			LWLockRelease(shared->BufferLocks[slotno]);
+			LWLockRelease(shared->buffer_locks[slotno]);
 			LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 			continue;
 		}
@@ -352,16 +309,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 
 		shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY;
 
-		LWLockRelease(shared->BufferLocks[slotno]);
+		LWLockRelease(shared->buffer_locks[slotno]);
 
 		/* Now it's okay to ereport if we failed */
 		if (!ok)
 			SlruReportIOError(ctl, pageno, xid);
 
 		SlruRecentlyUsed(shared, slotno);
-		if (forwrite)
-			shared->page_status[slotno] = SLRU_PAGE_DIRTY;
-		return shared->page_buffer[slotno];
+		return slotno;
 	}
 }
 
@@ -379,9 +334,9 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite)
 void
 SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 {
+	SlruShared	shared = ctl->shared;
 	int			pageno;
 	bool		ok;
-	SlruShared	shared = ctl->shared;
 
 	/* Do nothing if page does not need writing */
 	if (shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
@@ -392,7 +347,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 
 	/* Release shared lock, grab per-buffer lock instead */
 	LWLockRelease(shared->ControlLock);
-	LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE);
+	LWLockAcquire(shared->buffer_locks[slotno], LW_EXCLUSIVE);
 
 	/*
 	 * Check to see if someone else already did the write, or took the
@@ -405,7 +360,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 		(shared->page_status[slotno] != SLRU_PAGE_DIRTY &&
 		 shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS))
 	{
-		LWLockRelease(shared->BufferLocks[slotno]);
+		LWLockRelease(shared->buffer_locks[slotno]);
 		LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 		return;
 	}
@@ -447,7 +402,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata)
 	if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS)
 		shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY;
 
-	LWLockRelease(shared->BufferLocks[slotno]);
+	LWLockRelease(shared->buffer_locks[slotno]);
 
 	/* Now it's okay to ereport if we failed */
 	if (!ok)
@@ -640,7 +595,7 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata)
 	 */
 	if (!fdata)
 	{
-		if (pg_fsync(fd))
+		if (ctl->do_fsync && pg_fsync(fd))
 		{
 			slru_errcause = SLRU_FSYNC_FAILED;
 			slru_errno = errno;
@@ -758,7 +713,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		unsigned int bestcount = 0;
 
 		/* See if page already has a buffer assigned */
-		for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
+		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
 		{
 			if (shared->page_number[slotno] == pageno &&
 				shared->page_status[slotno] != SLRU_PAGE_EMPTY)
@@ -769,7 +724,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		 * If we find any EMPTY slot, just select that one. Else locate
 		 * the least-recently-used slot that isn't the latest page.
 		 */
-		for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
+		for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
 		{
 			if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
 				return slotno;
@@ -795,7 +750,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 		 */
 		if (shared->page_status[bestslot] == SLRU_PAGE_READ_IN_PROGRESS)
 			(void) SimpleLruReadPage(ctl, shared->page_number[bestslot],
-									 InvalidTransactionId, false);
+									 InvalidTransactionId);
 		else
 			SimpleLruWritePage(ctl, bestslot, NULL);
 
@@ -808,18 +763,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno)
 }
 
 /*
- * This must be called ONCE during postmaster or standalone-backend startup
- */
-void
-SimpleLruSetLatestPage(SlruCtl ctl, int pageno)
-{
-	SlruShared	shared = ctl->shared;
-
-	shared->latest_page_number = pageno;
-}
-
-/*
- * This is called during checkpoint and postmaster/standalone-backend shutdown
+ * Flush dirty pages to disk during checkpoint or database shutdown
  */
 void
 SimpleLruFlush(SlruCtl ctl, bool checkpoint)
@@ -831,11 +775,14 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 	int			i;
 	bool		ok;
 
+	/*
+	 * Find and write dirty pages
+	 */
 	fdata.num_files = 0;
 
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
-	for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
+	for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
 	{
 		SimpleLruWritePage(ctl, slotno, &fdata);
 
@@ -857,7 +804,7 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 	ok = true;
 	for (i = 0; i < fdata.num_files; i++)
 	{
-		if (pg_fsync(fdata.fd[i]))
+		if (ctl->do_fsync && pg_fsync(fdata.fd[i]))
 		{
 			slru_errcause = SLRU_FSYNC_FAILED;
 			slru_errno = errno;
@@ -879,40 +826,23 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint)
 
 /*
  * Remove all segments before the one holding the passed page number
- *
- * When this is called, we know that the database logically contains no
- * reference to transaction IDs older than oldestXact.	However, we must
- * not remove any segment until we have performed a checkpoint, to ensure
- * that no such references remain on disk either; else a crash just after
- * the truncation might leave us with a problem.  Since CLOG segments hold
- * a large number of transactions, the opportunity to actually remove a
- * segment is fairly rare, and so it seems best not to do the checkpoint
- * unless we have confirmed that there is a removable segment.	Therefore
- * we issue the checkpoint command here, not in higher-level code as might
- * seem cleaner.
  */
 void
 SimpleLruTruncate(SlruCtl ctl, int cutoffPage)
 {
-	int			slotno;
 	SlruShared	shared = ctl->shared;
+	int			slotno;
 
 	/*
 	 * The cutoff point is the start of the segment containing cutoffPage.
 	 */
 	cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
 
-	if (!SlruScanDirectory(ctl, cutoffPage, false))
-		return;					/* nothing to remove */
-
-	/* Perform a CHECKPOINT */
-	RequestCheckpoint(true);
-
 	/*
 	 * Scan shared memory and remove any pages preceding the cutoff page,
-	 * to ensure we won't rewrite them later.  (Any dirty pages should
-	 * have been flushed already during the checkpoint, we're just being
-	 * extra careful here.)
+	 * to ensure we won't rewrite them later.  (Since this is normally
+	 * called in or just after a checkpoint, any dirty pages should
+	 * have been flushed already ... we're just being extra careful here.)
 	 */
 	LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE);
 
@@ -933,7 +863,7 @@ restart:;
 		return;
 	}
 
-	for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++)
+	for (slotno = 0; slotno < NUM_SLRU_BUFFERS; slotno++)
 	{
 		if (shared->page_status[slotno] == SLRU_PAGE_EMPTY)
 			continue;
@@ -956,7 +886,7 @@ restart:;
 		 */
 		if (shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS)
 			(void) SimpleLruReadPage(ctl, shared->page_number[slotno],
-									 InvalidTransactionId, false);
+									 InvalidTransactionId);
 		else
 			SimpleLruWritePage(ctl, slotno, NULL);
 		goto restart;
@@ -969,11 +899,13 @@ restart:;
 }
 
 /*
- * SlruTruncate subroutine: scan directory for removable segments.
+ * SimpleLruTruncate subroutine: scan directory for removable segments.
  * Actually remove them iff doDeletions is true.  Return TRUE iff any
  * removable segments were found.  Note: no locking is needed.
+ *
+ * This can be called directly from clog.c, for reasons explained there.
  */
-static bool
+bool
 SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
 {
 	bool		found = false;
@@ -983,6 +915,13 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
 	int			segpage;
 	char		path[MAXPGPATH];
 
+	/*
+	 * The cutoff point is the start of the segment containing cutoffPage.
+	 * (This is redundant when called from SimpleLruTruncate, but not when
+	 * called directly from clog.c.)
+	 */
+	cutoffPage -= cutoffPage % SLRU_PAGES_PER_SEGMENT;
+
 	cldir = AllocateDir(ctl->Dir);
 	if (cldir == NULL)
 		ereport(ERROR,
@@ -1003,10 +942,9 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
 				found = true;
 				if (doDeletions)
 				{
-					ereport(LOG,
-							(errmsg("removing file \"%s/%s\"",
-									ctl->Dir, clde->d_name)));
 					snprintf(path, MAXPGPATH, "%s/%s", ctl->Dir, clde->d_name);
+					ereport(LOG,
+							(errmsg("removing file \"%s\"", path)));
 					unlink(path);
 				}
 			}
@@ -1027,55 +965,3 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions)
 
 	return found;
 }
-
-/*
- * SLRU resource manager's routines
- */
-void
-slru_redo(XLogRecPtr lsn, XLogRecord *record)
-{
-	uint8		info = record->xl_info & ~XLR_INFO_MASK;
-	int			pageno;
-
-	memcpy(&pageno, XLogRecGetData(record), sizeof(int));
-
-	switch (info)
-	{
-		case CLOG_ZEROPAGE:
-			clog_zeropage_redo(pageno);
-			break;
-		case SUBTRANS_ZEROPAGE:
-			subtrans_zeropage_redo(pageno);
-			break;
-		default:
-			elog(PANIC, "slru_redo: unknown op code %u", info);
-	}
-}
-
-void
-slru_undo(XLogRecPtr lsn, XLogRecord *record)
-{
-}
-
-void
-slru_desc(char *buf, uint8 xl_info, char *rec)
-{
-	uint8		info = xl_info & ~XLR_INFO_MASK;
-
-	if (info == CLOG_ZEROPAGE)
-	{
-		int			pageno;
-
-		memcpy(&pageno, rec, sizeof(int));
-		sprintf(buf + strlen(buf), "clog zeropage: %d", pageno);
-	}
-	else if (info == SUBTRANS_ZEROPAGE)
-	{
-		int			pageno;
-
-		memcpy(&pageno, rec, sizeof(int));
-		sprintf(buf + strlen(buf), "subtrans zeropage: %d", pageno);
-	}
-	else
-		strcat(buf, "UNKNOWN");
-}
diff --git a/src/backend/access/transam/subtrans.c b/src/backend/access/transam/subtrans.c
index ace1bb1434a868cf7a2429d58589cee5f7a497bb..539dee98759ffd70b7248c299dc27b2eb26ea116 100644
--- a/src/backend/access/transam/subtrans.c
+++ b/src/backend/access/transam/subtrans.c
@@ -1,48 +1,49 @@
 /*-------------------------------------------------------------------------
  *
  * subtrans.c
- *		PostgreSQL subtrans-log manager
+ *		PostgreSQL subtransaction-log manager
  *
- * The pg_subtrans manager is a pg_clog-like manager which stores the parent
+ * The pg_subtrans manager is a pg_clog-like manager that stores the parent
  * transaction Id for each transaction.  It is a fundamental part of the
  * nested transactions implementation.  A main transaction has a parent
  * of InvalidTransactionId, and each subtransaction has its immediate parent.
  * The tree can easily be walked from child to parent, but not in the
  * opposite direction.
  *
- * This code is mostly derived from clog.c.
+ * This code is based on clog.c, but the robustness requirements
+ * are completely different from pg_clog, because we only need to remember
+ * pg_subtrans information for currently-open transactions.  Thus, there is
+ * no need to preserve data over a crash and restart.
+ *
+ * There are no XLOG interactions since we do not care about preserving
+ * data across crashes.  During database startup, we simply force the
+ * currently-active page of SUBTRANS to zeroes.
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.2 2004/08/22 02:41:57 tgl Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/subtrans.c,v 1.3 2004/08/23 23:22:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <fcntl.h>
-#include <dirent.h>
-#include <sys/stat.h>
-#include <unistd.h>
-
 #include "access/slru.h"
 #include "access/subtrans.h"
-#include "miscadmin.h"
-#include "storage/lwlock.h"
+#include "storage/sinval.h"
 #include "utils/tqual.h"
 
 
 /*
- * Defines for SubTrans page and segment sizes.  A page is the same BLCKSZ
- * as is used everywhere else in Postgres.
+ * Defines for SubTrans page sizes.  A page is the same BLCKSZ as is used
+ * everywhere else in Postgres.
  *
  * Note: because TransactionIds are 32 bits and wrap around at 0xFFFFFFFF,
  * SubTrans page numbering also wraps around at
  * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE, and segment numbering at
  * 0xFFFFFFFF/SUBTRANS_XACTS_PER_PAGE/SLRU_SEGMENTS_PER_PAGE.  We need take no
  * explicit notice of that fact in this module, except when comparing segment
- * and page numbers in TruncateSubTrans (see SubTransPagePrecedes).
+ * and page numbers in TruncateSUBTRANS (see SubTransPagePrecedes).
  */
 
 /* We need four bytes per xact */
@@ -52,30 +53,15 @@
 #define TransactionIdToEntry(xid) ((xid) % (TransactionId) SUBTRANS_XACTS_PER_PAGE)
 
 
-/*----------
- * Shared-memory data structures for SUBTRANS control
- *
- * XLOG interactions: this module generates an XLOG record whenever a new
- * SUBTRANS page is initialized to zeroes.	Other writes of SUBTRANS come from
- * recording of transaction commit or abort in xact.c, which generates its
- * own XLOG records for these events and will re-perform the status update
- * on redo; so we need make no additional XLOG entry here.	Also, the XLOG
- * is guaranteed flushed through the XLOG commit record before we are called
- * to log a commit, so the WAL rule "write xlog before data" is satisfied
- * automatically for commits, and we don't really care for aborts.  Therefore,
- * we don't need to mark SUBTRANS pages with LSN information; we have enough
- * synchronization already.
- *----------
+/*
+ * Link to shared-memory data structures for SUBTRANS control
  */
-
-
 static SlruCtlData SubTransCtlData;
-static SlruCtl SubTransCtl = &SubTransCtlData;
+#define SubTransCtl  (&SubTransCtlData)
 
 
-static int	ZeroSUBTRANSPage(int pageno, bool writeXlog);
+static int	ZeroSUBTRANSPage(int pageno);
 static bool SubTransPagePrecedes(int page1, int page2);
-static void WriteZeroPageXlogRec(int pageno);
 
 
 /*
@@ -86,21 +72,23 @@ SubTransSetParent(TransactionId xid, TransactionId parent)
 {
 	int			pageno = TransactionIdToPage(xid);
 	int			entryno = TransactionIdToEntry(xid);
+	int			slotno;
 	TransactionId *ptr;
 
-	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
-	ptr = (TransactionId *) SimpleLruReadPage(SubTransCtl, pageno, xid, true);
+	slotno = SimpleLruReadPage(SubTransCtl, pageno, xid);
+	ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
 	ptr += entryno;
 
-	/* Current state should be 0 or target state */
-	Assert(*ptr == InvalidTransactionId || *ptr == parent);
+	/* Current state should be 0 */
+	Assert(*ptr == InvalidTransactionId);
 
 	*ptr = parent;
 
-	/* ...->page_status[slotno] = SLRU_PAGE_DIRTY; already done */
+	SubTransCtl->shared->page_status[slotno] = SLRU_PAGE_DIRTY;
 
-	LWLockRelease(SubTransCtl->ControlLock);
+	LWLockRelease(SubtransControlLock);
 }
 
 /*
@@ -111,6 +99,7 @@ SubTransGetParent(TransactionId xid)
 {
 	int			pageno = TransactionIdToPage(xid);
 	int			entryno = TransactionIdToEntry(xid);
+	int			slotno;
 	TransactionId *ptr;
 	TransactionId	parent;
 
@@ -121,14 +110,15 @@ SubTransGetParent(TransactionId xid)
 	if (!TransactionIdIsNormal(xid))
 		return InvalidTransactionId;
 
-	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
-	ptr = (TransactionId *) SimpleLruReadPage(SubTransCtl, pageno, xid, false);
+	slotno = SimpleLruReadPage(SubTransCtl, pageno, xid);
+	ptr = (TransactionId *) SubTransCtl->shared->page_buffer[slotno];
 	ptr += entryno;
 
 	parent = *ptr;
 
-	LWLockRelease(SubTransCtl->ControlLock);
+	LWLockRelease(SubtransControlLock);
 
 	return parent;
 }
@@ -169,7 +159,7 @@ SubTransGetTopmostTransaction(TransactionId xid)
 
 
 /*
- * Initialization of shared memory for Subtrans
+ * Initialization of shared memory for SUBTRANS
  */
 
 int
@@ -181,36 +171,42 @@ SUBTRANSShmemSize(void)
 void
 SUBTRANSShmemInit(void)
 {
-	SimpleLruInit(SubTransCtl, "SUBTRANS Ctl", "pg_subtrans");
 	SubTransCtl->PagePrecedes = SubTransPagePrecedes;
+	SimpleLruInit(SubTransCtl, "SUBTRANS Ctl",
+				  SubtransControlLock, "pg_subtrans");
+	/* Override default assumption that writes should be fsync'd */
+	SubTransCtl->do_fsync = false;
 }
 
 /*
  * This func must be called ONCE on system install.  It creates
- * the initial SubTrans segment.  (The SubTrans directory is assumed to
- * have been created by initdb, and SubTransShmemInit must have been called
- * already.)
+ * the initial SUBTRANS segment.  (The SUBTRANS directory is assumed to
+ * have been created by initdb, and SUBTRANSShmemInit must have been
+ * called already.)
+ *
+ * Note: it's not really necessary to create the initial segment now,
+ * since slru.c would create it on first write anyway.  But we may as well
+ * do it to be sure the directory is set up correctly.
  */
 void
 BootStrapSUBTRANS(void)
 {
 	int			slotno;
 
-	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
-	/* Create and zero the first page of the commit log */
-	slotno = ZeroSUBTRANSPage(0, false);
+	/* Create and zero the first page of the subtrans log */
+	slotno = ZeroSUBTRANSPage(0);
 
 	/* Make sure it's written out */
 	SimpleLruWritePage(SubTransCtl, slotno, NULL);
-	/* Assert(SubTransCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
+	Assert(SubTransCtl->shared->page_status[slotno] == SLRU_PAGE_CLEAN);
 
-	LWLockRelease(SubTransCtl->ControlLock);
+	LWLockRelease(SubtransControlLock);
 }
 
 /*
- * Initialize (or reinitialize) a page of SubTrans to zeroes.
- * If writeXlog is TRUE, also emit an XLOG record saying we did this.
+ * Initialize (or reinitialize) a page of SUBTRANS to zeroes.
  *
  * The page is not actually written, just set up in shared memory.
  * The slot number of the new page is returned.
@@ -218,14 +214,9 @@ BootStrapSUBTRANS(void)
  * Control lock must be held at entry, and will be held at exit.
  */
 static int
-ZeroSUBTRANSPage(int pageno, bool writeXlog)
+ZeroSUBTRANSPage(int pageno)
 {
-	int			slotno = SimpleLruZeroPage(SubTransCtl, pageno);
-
-	if (writeXlog)
-		WriteZeroPageXlogRec(pageno);
-
-	return slotno;
+	return SimpleLruZeroPage(SubTransCtl, pageno);
 }
 
 /*
@@ -235,11 +226,20 @@ ZeroSUBTRANSPage(int pageno, bool writeXlog)
 void
 StartupSUBTRANS(void)
 {
+	int			startPage;
+
 	/*
-	 * Initialize our idea of the latest page number.
+	 * Since we don't expect pg_subtrans to be valid across crashes,
+	 * we initialize the currently-active page to zeroes during startup.
+	 * Whenever we advance into a new page, ExtendSUBTRANS will likewise
+	 * zero the new page without regard to whatever was previously on disk.
 	 */
-	SimpleLruSetLatestPage(SubTransCtl,
-						   TransactionIdToPage(ShmemVariableCache->nextXid));
+	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
+
+	startPage = TransactionIdToPage(ShmemVariableCache->nextXid);
+	(void) ZeroSUBTRANSPage(startPage);
+
+	LWLockRelease(SubtransControlLock);
 }
 
 /*
@@ -248,6 +248,12 @@ StartupSUBTRANS(void)
 void
 ShutdownSUBTRANS(void)
 {
+	/*
+	 * Flush dirty SUBTRANS pages to disk
+	 *
+	 * This is not actually necessary from a correctness point of view.
+	 * We do it merely as a debugging aid.
+	 */
 	SimpleLruFlush(SubTransCtl, false);
 }
 
@@ -257,16 +263,23 @@ ShutdownSUBTRANS(void)
 void
 CheckPointSUBTRANS(void)
 {
+	/*
+	 * Flush dirty SUBTRANS pages to disk
+	 *
+	 * This is not actually necessary from a correctness point of view.
+	 * We do it merely to improve the odds that writing of dirty pages is done
+	 * by the checkpoint process and not by backends.
+	 */
 	SimpleLruFlush(SubTransCtl, true);
 }
 
 
 /*
- * Make sure that SubTrans has room for a newly-allocated XID.
+ * Make sure that SUBTRANS has room for a newly-allocated XID.
  *
  * NB: this is called while holding XidGenLock.  We want it to be very fast
  * most of the time; even when it's not so fast, no actual I/O need happen
- * unless we're forced to write out a dirty subtrans or xlog page to make room
+ * unless we're forced to write out a dirty subtrans page to make room
  * in shared memory.
  */
 void
@@ -284,28 +297,20 @@ ExtendSUBTRANS(TransactionId newestXact)
 
 	pageno = TransactionIdToPage(newestXact);
 
-	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
+	LWLockAcquire(SubtransControlLock, LW_EXCLUSIVE);
 
-	/* Zero the page and make an XLOG entry about it */
-	ZeroSUBTRANSPage(pageno, true);
+	/* Zero the page */
+	ZeroSUBTRANSPage(pageno);
 
-	LWLockRelease(SubTransCtl->ControlLock);
+	LWLockRelease(SubtransControlLock);
 }
 
 
 /*
- * Remove all SubTrans segments before the one holding the passed transaction ID
+ * Remove all SUBTRANS segments before the one holding the passed transaction ID
  *
- * When this is called, we know that the database logically contains no
- * reference to transaction IDs older than oldestXact.	However, we must
- * not truncate the SubTrans until we have performed a checkpoint, to ensure
- * that no such references remain on disk either; else a crash just after
- * the truncation might leave us with a problem.  Since SubTrans segments hold
- * a large number of transactions, the opportunity to actually remove a
- * segment is fairly rare, and so it seems best not to do the checkpoint
- * unless we have confirmed that there is a removable segment.	Therefore
- * we issue the checkpoint command here, not in higher-level code as might
- * seem cleaner.
+ * This is normally called during checkpoint, with oldestXact being the
+ * oldest XMIN of any running transaction.
  */
 void
 TruncateSUBTRANS(TransactionId oldestXact)
@@ -317,12 +322,13 @@ TruncateSUBTRANS(TransactionId oldestXact)
 	 * We pass the *page* containing oldestXact to SimpleLruTruncate.
 	 */
 	cutoffPage = TransactionIdToPage(oldestXact);
+
 	SimpleLruTruncate(SubTransCtl, cutoffPage);
 }
 
 
 /*
- * Decide which of two SubTrans page numbers is "older" for truncation purposes.
+ * Decide which of two SUBTRANS page numbers is "older" for truncation purposes.
  *
  * We need to use comparison of TransactionIds here in order to do the right
  * thing with wraparound XID arithmetic.  However, if we are asked about
@@ -343,38 +349,3 @@ SubTransPagePrecedes(int page1, int page2)
 
 	return TransactionIdPrecedes(xid1, xid2);
 }
-
-
-/*
- * Write a ZEROPAGE xlog record
- *
- * Note: xlog record is marked as outside transaction control, since we
- * want it to be redone whether the invoking transaction commits or not.
- * (Besides which, this is normally done just before entering a transaction.)
- */
-static void
-WriteZeroPageXlogRec(int pageno)
-{
-	XLogRecData rdata;
-
-	rdata.buffer = InvalidBuffer;
-	rdata.data = (char *) (&pageno);
-	rdata.len = sizeof(int);
-	rdata.next = NULL;
-	(void) XLogInsert(RM_SLRU_ID, SUBTRANS_ZEROPAGE | XLOG_NO_TRAN, &rdata);
-}
-
-/* Redo a ZEROPAGE action during WAL replay */
-void
-subtrans_zeropage_redo(int pageno)
-{
-	int			slotno;
-
-	LWLockAcquire(SubTransCtl->ControlLock, LW_EXCLUSIVE);
-
-	slotno = ZeroSUBTRANSPage(pageno, false);
-	SimpleLruWritePage(SubTransCtl, slotno, NULL);
-	/* Assert(SubTransCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */
-
-	LWLockRelease(SubTransCtl->ControlLock);
-}
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index b9d0398b621188fe2eee1ecb14de6662efddf8d1..5c07795c8a90ec5cbafaad7c1d0df7538b998e60 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.162 2004/08/12 19:03:23 momjian Exp $
+ * $PostgreSQL: pgsql/src/backend/access/transam/xlog.c,v 1.163 2004/08/23 23:22:44 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -4918,6 +4918,14 @@ CreateCheckPoint(bool shutdown, bool force)
 	if (!shutdown)
 		PreallocXlogFiles(recptr);
 
+	/*
+	 * Truncate pg_subtrans if possible.  We can throw away all data before
+	 * the oldest XMIN of any running transaction.  No future transaction will
+	 * attempt to reference any pg_subtrans entry older than that (see Asserts
+	 * in subtrans.c).
+	 */
+	TruncateSUBTRANS(GetOldestXmin(true));
+
 	LWLockRelease(CheckpointLock);
 }
 
diff --git a/src/backend/commands/vacuum.c b/src/backend/commands/vacuum.c
index aa2708fcab1a2356bfd7ae72d64ac10d2f8e532a..fc80f8efcf2936a03d4c6ffdaf4099fd09e12c24 100644
--- a/src/backend/commands/vacuum.c
+++ b/src/backend/commands/vacuum.c
@@ -13,7 +13,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.286 2004/08/06 04:15:07 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/commands/vacuum.c,v 1.287 2004/08/23 23:22:45 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -810,9 +810,8 @@ vac_truncate_clog(TransactionId vacuumXID, TransactionId frozenXID)
 		return;
 	}
 
-	/* Truncate CLOG and SUBTRANS to the oldest vacuumxid */
+	/* Truncate CLOG to the oldest vacuumxid */
 	TruncateCLOG(vacuumXID);
-	TruncateSUBTRANS(vacuumXID);
 
 	/* Give warning about impending wraparound problems */
 	if (frozenAlreadyWrapped)
diff --git a/src/backend/storage/ipc/sinval.c b/src/backend/storage/ipc/sinval.c
index f28a883572eadc1eb4c7f5e83321d69b55f009c0..dd9ca8244f3f43a8a001d48eae6f3aa57128050b 100644
--- a/src/backend/storage/ipc/sinval.c
+++ b/src/backend/storage/ipc/sinval.c
@@ -8,7 +8,7 @@
  *
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/ipc/sinval.c,v 1.69 2004/08/22 02:41:57 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/ipc/sinval.c,v 1.70 2004/08/23 23:22:45 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -661,6 +661,9 @@ result_known:
  * FALSE is sufficient for non-shared relations, since only backends in my
  * own database could ever see the tuples in them.
  *
+ * This is also used to determine where to truncate pg_subtrans.  allDbs
+ * must be TRUE for that case.
+ *
  * Note: we include the currently running xids in the set of considered xids.
  * This ensures that if a just-started xact has not yet set its snapshot,
  * when it does set the snapshot it cannot set xmin less than what we compute.
@@ -673,7 +676,17 @@ GetOldestXmin(bool allDbs)
 	TransactionId result;
 	int			index;
 
-	result = GetTopTransactionId();
+	/*
+	 * Normally we start the min() calculation with our own XID.  But
+	 * if called by checkpointer, we will not be inside a transaction,
+	 * so use next XID as starting point for min() calculation.  (Note
+	 * that if there are no xacts running at all, that will be the subtrans
+	 * truncation point!)
+	 */
+	if (IsTransactionState())
+		result = GetTopTransactionId();
+	else
+		result = ReadNewTransactionId();
 
 	LWLockAcquire(SInvalLock, LW_SHARED);
 
diff --git a/src/backend/storage/lmgr/lwlock.c b/src/backend/storage/lmgr/lwlock.c
index e48531c10accdcb93cf68b85c59fb6a670d291b3..f3ee1173a54a676549eae366c44d3a8ab9430d41 100644
--- a/src/backend/storage/lmgr/lwlock.c
+++ b/src/backend/storage/lmgr/lwlock.c
@@ -15,14 +15,13 @@
  * Portions Copyright (c) 1994, Regents of the University of California
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.21 2004/07/01 00:50:59 tgl Exp $
+ *	  $PostgreSQL: pgsql/src/backend/storage/lmgr/lwlock.c,v 1.22 2004/08/23 23:22:45 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include "access/clog.h"
-#include "access/subtrans.h"
+#include "access/slru.h"
 #include "storage/lwlock.h"
 #include "storage/proc.h"
 #include "storage/spin.h"
@@ -109,11 +108,11 @@ NumLWLocks(void)
 	/* bufmgr.c needs two for each shared buffer */
 	numLocks += 2 * NBuffers;
 
-	/* clog.c needs one per CLOG buffer + one control lock */
-	numLocks += NUM_CLOG_BUFFERS + 1;
+	/* clog.c needs one per CLOG buffer */
+	numLocks += NUM_SLRU_BUFFERS;
 
-	/* subtrans.c needs one per SubTrans buffer + one control lock */
-	numLocks += NUM_SUBTRANS_BUFFERS + 1;
+	/* subtrans.c needs one per SubTrans buffer */
+	numLocks += NUM_SLRU_BUFFERS;
 
 	/* Perhaps create a few more for use by user-defined modules? */
 
diff --git a/src/include/access/clog.h b/src/include/access/clog.h
index 2df1cedc1c9f21286ce070286e5c12ba0308cc63..0b8fa120756e7bef021734bbdfdc245b1f0733bd 100644
--- a/src/include/access/clog.h
+++ b/src/include/access/clog.h
@@ -6,7 +6,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.9 2004/07/01 00:51:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/clog.h,v 1.10 2004/08/23 23:22:45 tgl Exp $
  */
 #ifndef CLOG_H
 #define CLOG_H
@@ -27,9 +27,6 @@ typedef int XidStatus;
 #define TRANSACTION_STATUS_ABORTED			0x02
 #define TRANSACTION_STATUS_SUB_COMMITTED	0x03
 
-/* exported because lwlock.c needs it */
-#define NUM_CLOG_BUFFERS	8
-
 
 extern void TransactionIdSetStatus(TransactionId xid, XidStatus status);
 extern XidStatus TransactionIdGetStatus(TransactionId xid);
@@ -42,6 +39,12 @@ extern void ShutdownCLOG(void);
 extern void CheckPointCLOG(void);
 extern void ExtendCLOG(TransactionId newestXact);
 extern void TruncateCLOG(TransactionId oldestXact);
-extern void clog_zeropage_redo(int pageno);
+
+/* XLOG stuff */
+#define CLOG_ZEROPAGE		0x00
+
+extern void clog_redo(XLogRecPtr lsn, XLogRecord *record);
+extern void clog_undo(XLogRecPtr lsn, XLogRecord *record);
+extern void clog_desc(char *buf, uint8 xl_info, char *rec);
 
 #endif   /* CLOG_H */
diff --git a/src/include/access/rmgr.h b/src/include/access/rmgr.h
index 7ea3134031d7d72948e90b6e282203cf30329468..d43f6fdcaf151d43aa025d4d2e85cd3a2c274a94 100644
--- a/src/include/access/rmgr.h
+++ b/src/include/access/rmgr.h
@@ -3,7 +3,7 @@
  *
  * Resource managers definition
  *
- * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.11 2004/07/01 00:51:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/rmgr.h,v 1.12 2004/08/23 23:22:45 tgl Exp $
  */
 #ifndef RMGR_H
 #define RMGR_H
@@ -16,7 +16,7 @@ typedef uint8 RmgrId;
 #define RM_XLOG_ID				0
 #define RM_XACT_ID				1
 #define RM_SMGR_ID				2
-#define RM_SLRU_ID				3
+#define RM_CLOG_ID				3
 #define RM_HEAP_ID				10
 #define RM_BTREE_ID				11
 #define RM_HASH_ID				12
diff --git a/src/include/access/slru.h b/src/include/access/slru.h
index e3245fac6583e3b972a649cacc5eb352389fb463..79abb2899c43c4271993c460176a5a2bd746d7b3 100644
--- a/src/include/access/slru.h
+++ b/src/include/access/slru.h
@@ -1,23 +1,66 @@
-/*
- * slru.h
+/*-------------------------------------------------------------------------
  *
- * Simple LRU
+ * slru.h
+ *		Simple LRU buffering for transaction status logfiles
  *
- * Portions Copyright (c) 2003, PostgreSQL Global Development Group
+ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.7 2004/07/01 00:51:38 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.8 2004/08/23 23:22:45 tgl Exp $
+ *
+ *-------------------------------------------------------------------------
  */
 #ifndef SLRU_H
 #define SLRU_H
 
-#include "access/xlog.h"
 #include "storage/lwlock.h"
 
 
-/* Opaque structs known only in slru.c */
-typedef struct SlruSharedData *SlruShared;
-typedef struct SlruFlushData *SlruFlush;
+/*
+ * Number of page buffers.  Ideally this could be different for CLOG and
+ * SUBTRANS, but the benefit doesn't seem to be worth any additional
+ * notational cruft.
+ */
+#define NUM_SLRU_BUFFERS	8
+
+/* Page status codes */
+typedef enum
+{
+	SLRU_PAGE_EMPTY,			/* buffer is not in use */
+	SLRU_PAGE_READ_IN_PROGRESS, /* page is being read in */
+	SLRU_PAGE_CLEAN,			/* page is valid and not dirty */
+	SLRU_PAGE_DIRTY,			/* page is valid but needs write */
+	SLRU_PAGE_WRITE_IN_PROGRESS /* page is being written out */
+} SlruPageStatus;
+
+/*
+ * Shared-memory state
+ */
+typedef struct SlruSharedData
+{
+	LWLockId	ControlLock;
+
+	/*
+	 * Info for each buffer slot.  Page number is undefined when status is
+	 * EMPTY.  lru_count is essentially the number of page switches since
+	 * last use of this page; the page with highest lru_count is the best
+	 * candidate to replace.
+	 */
+	char	   *page_buffer[NUM_SLRU_BUFFERS];
+	SlruPageStatus page_status[NUM_SLRU_BUFFERS];
+	int			page_number[NUM_SLRU_BUFFERS];
+	unsigned int page_lru_count[NUM_SLRU_BUFFERS];
+	LWLockId	buffer_locks[NUM_SLRU_BUFFERS];
+
+	/*
+	 * latest_page_number is the page number of the current end of the
+	 * log; this is not critical data, since we use it only to avoid
+	 * swapping out the latest page.
+	 */
+	int			latest_page_number;
+} SlruSharedData;
+
+typedef SlruSharedData *SlruShared;
 
 /*
  * SlruCtlData is an unshared structure that points to the active information
@@ -27,13 +70,11 @@ typedef struct SlruCtlData
 {
 	SlruShared	shared;
 
-	LWLockId	ControlLock;
-
 	/*
-	 * Dir is set during SimpleLruInit and does not change thereafter.
-	 * Since it's always the same, it doesn't need to be in shared memory.
+	 * This flag tells whether to fsync writes (true for pg_clog,
+	 * false for pg_subtrans).
 	 */
-	char		Dir[MAXPGPATH];
+	bool		do_fsync;
 
 	/*
 	 * Decide which of two page numbers is "older" for truncation purposes.
@@ -42,27 +83,27 @@ typedef struct SlruCtlData
 	 */
 	bool		(*PagePrecedes) (int, int);
 
+	/*
+	 * Dir is set during SimpleLruInit and does not change thereafter.
+	 * Since it's always the same, it doesn't need to be in shared memory.
+	 */
+	char		Dir[MAXPGPATH];
 } SlruCtlData;
 
 typedef SlruCtlData *SlruCtl;
 
+/* Opaque struct known only in slru.c */
+typedef struct SlruFlushData *SlruFlush;
+
 
 extern int	SimpleLruShmemSize(void);
-extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir);
+extern void SimpleLruInit(SlruCtl ctl, const char *name,
+						  LWLockId ctllock, const char *subdir);
 extern int	SimpleLruZeroPage(SlruCtl ctl, int pageno);
-extern char *SimpleLruReadPage(SlruCtl ctl, int pageno,
-							   TransactionId xid, bool forwrite);
+extern int	SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid);
 extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata);
-extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno);
 extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint);
 extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage);
-
-/* XLOG stuff */
-#define CLOG_ZEROPAGE		0x00
-#define SUBTRANS_ZEROPAGE	0x10
-
-extern void slru_redo(XLogRecPtr lsn, XLogRecord *record);
-extern void slru_undo(XLogRecPtr lsn, XLogRecord *record);
-extern void slru_desc(char *buf, uint8 xl_info, char *rec);
+extern bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions);
 
 #endif   /* SLRU_H */
diff --git a/src/include/access/subtrans.h b/src/include/access/subtrans.h
index bf6cec64ec748126f415a74399d78eafa1cc6ce2..28a16fbee5816f3f3d5f70bca4f6e14ecd84cf9b 100644
--- a/src/include/access/subtrans.h
+++ b/src/include/access/subtrans.h
@@ -1,22 +1,16 @@
 /*
  * subtrans.h
  *
- * PostgreSQL subtrans-log manager
+ * PostgreSQL subtransaction-log manager
  *
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/access/subtrans.h,v 1.2 2004/08/22 02:41:58 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/access/subtrans.h,v 1.3 2004/08/23 23:22:45 tgl Exp $
  */
 #ifndef SUBTRANS_H
 #define SUBTRANS_H
 
-#include "access/xlog.h"
-
-/* exported because lwlock.c needs it */
-/* cannot be different from NUM_CLOG_BUFFERS without slru.c changes */
-#define NUM_SUBTRANS_BUFFERS	NUM_CLOG_BUFFERS
-
 extern void SubTransSetParent(TransactionId xid, TransactionId parent);
 extern TransactionId SubTransGetParent(TransactionId xid);
 extern TransactionId SubTransGetTopmostTransaction(TransactionId xid);
@@ -29,6 +23,5 @@ extern void ShutdownSUBTRANS(void);
 extern void CheckPointSUBTRANS(void);
 extern void ExtendSUBTRANS(TransactionId newestXact);
 extern void TruncateSUBTRANS(TransactionId oldestXact);
-extern void subtrans_zeropage_redo(int pageno);
 
 #endif   /* SUBTRANS_H */
diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h
index 7b08231e5103e9faa56a04d388f143f93478a560..cef886c3849305236263695c34515b9979f9fdfb 100644
--- a/src/include/storage/lwlock.h
+++ b/src/include/storage/lwlock.h
@@ -7,7 +7,7 @@
  * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group
  * Portions Copyright (c) 1994, Regents of the University of California
  *
- * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.13 2004/08/11 04:07:16 tgl Exp $
+ * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.14 2004/08/23 23:22:45 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -37,6 +37,8 @@ typedef enum LWLockId
 	ControlFileLock,
 	CheckpointLock,
 	CheckpointStartLock,
+	CLogControlLock,
+	SubtransControlLock,
 	RelCacheInitLock,
 	BgWriterCommLock,