diff --git a/src/backend/access/transam/clog.c b/src/backend/access/transam/clog.c index 88e1f1256ad80a8f007341325e83fa25db75b9c1..97f887d0a06ce234e256dda5bf204333ec7033b0 100644 --- a/src/backend/access/transam/clog.c +++ b/src/backend/access/transam/clog.c @@ -13,7 +13,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.19 2003/11/29 19:51:40 pgsql Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/clog.c,v 1.20 2004/05/31 03:47:54 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -97,7 +97,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status) Assert(status == TRANSACTION_STATUS_COMMITTED || status == TRANSACTION_STATUS_ABORTED); - LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE); byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, true); byteptr += byteno; @@ -110,7 +110,7 @@ TransactionIdSetStatus(TransactionId xid, XidStatus status) /* ...->page_status[slotno] = CLOG_PAGE_DIRTY; already done */ - LWLockRelease(ClogCtl->locks->ControlLock); + LWLockRelease(ClogCtl->ControlLock); } /* @@ -128,14 +128,14 @@ TransactionIdGetStatus(TransactionId xid) char *byteptr; XidStatus status; - LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE); byteptr = SimpleLruReadPage(ClogCtl, pageno, xid, false); byteptr += byteno; status = (*byteptr >> bshift) & CLOG_XACT_BITMASK; - LWLockRelease(ClogCtl->locks->ControlLock); + LWLockRelease(ClogCtl->ControlLock); return status; } @@ -169,16 +169,16 @@ BootStrapCLOG(void) { int slotno; - LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE); /* Create and zero the first page of the commit log */ slotno = ZeroCLOGPage(0, false); /* Make sure it's 
written out */ - SimpleLruWritePage(ClogCtl, slotno); + SimpleLruWritePage(ClogCtl, slotno, NULL); /* Assert(ClogCtl->page_status[slotno] == CLOG_PAGE_CLEAN); */ - LWLockRelease(ClogCtl->locks->ControlLock); + LWLockRelease(ClogCtl->ControlLock); } /* @@ -256,12 +256,12 @@ ExtendCLOG(TransactionId newestXact) pageno = TransactionIdToPage(newestXact); - LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE); /* Zero the page and make an XLOG entry about it */ ZeroCLOGPage(pageno, true); - LWLockRelease(ClogCtl->locks->ControlLock); + LWLockRelease(ClogCtl->ControlLock); } @@ -351,13 +351,13 @@ clog_redo(XLogRecPtr lsn, XLogRecord *record) memcpy(&pageno, XLogRecGetData(record), sizeof(int)); - LWLockAcquire(ClogCtl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(ClogCtl->ControlLock, LW_EXCLUSIVE); slotno = ZeroCLOGPage(pageno, false); - SimpleLruWritePage(ClogCtl, slotno); + SimpleLruWritePage(ClogCtl, slotno, NULL); /* Assert(ClogCtl->page_status[slotno] == SLRU_PAGE_CLEAN); */ - LWLockRelease(ClogCtl->locks->ControlLock); + LWLockRelease(ClogCtl->ControlLock); } } diff --git a/src/backend/access/transam/slru.c b/src/backend/access/transam/slru.c index 57dcd2b33798c3e9230681bb5cfaee593903aa5a..58798d0f07fcb56d230f886258c94bc07e5a4f4a 100644 --- a/src/backend/access/transam/slru.c +++ b/src/backend/access/transam/slru.c @@ -6,7 +6,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.15 2004/05/29 22:48:18 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/access/transam/slru.c,v 1.16 2004/05/31 03:47:54 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -17,6 +17,7 @@ #include <unistd.h> #include "access/slru.h" +#include "access/clog.h" /* only for NUM_CLOG_BUFFERS */ #include "postmaster/bgwriter.h" #include 
"storage/fd.h" #include "storage/lwlock.h" @@ -100,6 +101,8 @@ typedef enum */ typedef struct SlruSharedData { + LWLockId ControlLock; + /* * Info for each buffer slot. Page number is undefined when status is * EMPTY. lru_count is essentially the number of page switches since @@ -110,6 +113,7 @@ typedef struct SlruSharedData SlruPageStatus page_status[NUM_CLOG_BUFFERS]; int page_number[NUM_CLOG_BUFFERS]; unsigned int page_lru_count[NUM_CLOG_BUFFERS]; + LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */ /* * latest_page_number is the page number of the current end of the @@ -118,12 +122,24 @@ typedef struct SlruSharedData */ int latest_page_number; } SlruSharedData; -typedef SlruSharedData *SlruShared; - #define SlruFileName(ctl, path, seg) \ snprintf(path, MAXPGPATH, "%s/%04X", (ctl)->Dir, seg) +/* + * During SimpleLruFlush(), we will usually not need to write/fsync more + * than one or two physical files, but we may need to write several pages + * per file. We can consolidate the I/O requests by leaving files open + * until control returns to SimpleLruFlush(). This data structure remembers + * which files are open. + */ +typedef struct SlruFlushData +{ + int num_files; /* # files actually open */ + int fd[NUM_CLOG_BUFFERS]; /* their FD's */ + int segno[NUM_CLOG_BUFFERS]; /* their clog seg#s */ +} SlruFlushData; + /* * Macro to mark a buffer slot "most recently used". 
*/ @@ -145,14 +161,17 @@ typedef enum SLRU_SEEK_FAILED, SLRU_READ_FAILED, SLRU_WRITE_FAILED, + SLRU_FSYNC_FAILED, SLRU_CLOSE_FAILED } SlruErrorCause; + static SlruErrorCause slru_errcause; static int slru_errno; static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno); -static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno); +static bool SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, + SlruFlush fdata); static void SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid); static int SlruSelectLRUPage(SlruCtl ctl, int pageno); static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions); @@ -165,24 +184,16 @@ static bool SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions); int SimpleLruShmemSize(void) { - return MAXALIGN(sizeof(SlruSharedData)) - + BLCKSZ * NUM_CLOG_BUFFERS - + MAXALIGN(sizeof(SlruLockData)) - ; + return MAXALIGN(sizeof(SlruSharedData)) + BLCKSZ * NUM_CLOG_BUFFERS; } void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir) { - bool found; - char *ptr; SlruShared shared; - SlruLock locks; + bool found; - ptr = ShmemInitStruct(name, SimpleLruShmemSize(), &found); - shared = (SlruShared) ptr; - locks = (SlruLock) (ptr + MAXALIGN(sizeof(SlruSharedData)) + - BLCKSZ * NUM_CLOG_BUFFERS); + shared = (SlruShared) ShmemInitStruct(name, SimpleLruShmemSize(), &found); if (!IsUnderPostmaster) { @@ -192,18 +203,18 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir) Assert(!found); - locks->ControlLock = LWLockAssign(); - memset(shared, 0, sizeof(SlruSharedData)); + shared->ControlLock = LWLockAssign(); + bufptr = (char *) shared + MAXALIGN(sizeof(SlruSharedData)); for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++) { - locks->BufferLocks[slotno] = LWLockAssign(); shared->page_buffer[slotno] = bufptr; shared->page_status[slotno] = SLRU_PAGE_EMPTY; shared->page_lru_count[slotno] = 1; + shared->BufferLocks[slotno] = LWLockAssign(); bufptr += BLCKSZ; } @@ -213,10 
+224,10 @@ SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir) Assert(found); /* Initialize the unshared control struct */ - ctl->locks = locks; ctl->shared = shared; + ctl->ControlLock = shared->ControlLock; - /* Init directory path */ + /* Initialize unshared copy of directory path */ snprintf(ctl->Dir, MAXPGPATH, "%s/%s", DataDir, subdir); } @@ -232,7 +243,7 @@ int SimpleLruZeroPage(SlruCtl ctl, int pageno) { int slotno; - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; /* Find a suitable buffer slot for the page */ slotno = SlruSelectLRUPage(ctl, pageno); @@ -270,7 +281,7 @@ SimpleLruZeroPage(SlruCtl ctl, int pageno) char * SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite) { - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; /* Outer loop handles restart if we lose the buffer to someone else */ for (;;) @@ -313,8 +324,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite) SlruRecentlyUsed(shared, slotno); /* Release shared lock, grab per-buffer lock instead */ - LWLockRelease(ctl->locks->ControlLock); - LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE); + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE); /* * Check to see if someone else already did the read, or took the @@ -323,8 +334,8 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite) if (shared->page_number[slotno] != pageno || shared->page_status[slotno] != SLRU_PAGE_READ_IN_PROGRESS) { - LWLockRelease(ctl->locks->BufferLocks[slotno]); - LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE); + LWLockRelease(shared->BufferLocks[slotno]); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); continue; } @@ -332,14 +343,14 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite) ok = SlruPhysicalReadPage(ctl, pageno, slotno); /* Re-acquire shared control lock and update page 
state */ - LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); Assert(shared->page_number[slotno] == pageno && shared->page_status[slotno] == SLRU_PAGE_READ_IN_PROGRESS); shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_EMPTY; - LWLockRelease(ctl->locks->BufferLocks[slotno]); + LWLockRelease(shared->BufferLocks[slotno]); /* Now it's okay to ereport if we failed */ if (!ok) @@ -364,11 +375,11 @@ SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite) * Control lock must be held at entry, and will be held at exit. */ void -SimpleLruWritePage(SlruCtl ctl, int slotno) +SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata) { int pageno; bool ok; - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; /* Do nothing if page does not need writing */ if (shared->page_status[slotno] != SLRU_PAGE_DIRTY && @@ -378,8 +389,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) pageno = shared->page_number[slotno]; /* Release shared lock, grab per-buffer lock instead */ - LWLockRelease(ctl->locks->ControlLock); - LWLockAcquire(ctl->locks->BufferLocks[slotno], LW_EXCLUSIVE); + LWLockRelease(shared->ControlLock); + LWLockAcquire(shared->BufferLocks[slotno], LW_EXCLUSIVE); /* * Check to see if someone else already did the write, or took the @@ -392,8 +403,8 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) (shared->page_status[slotno] != SLRU_PAGE_DIRTY && shared->page_status[slotno] != SLRU_PAGE_WRITE_IN_PROGRESS)) { - LWLockRelease(ctl->locks->BufferLocks[slotno]); - LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE); + LWLockRelease(shared->BufferLocks[slotno]); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); return; } @@ -412,10 +423,19 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) shared->page_status[slotno] = SLRU_PAGE_WRITE_IN_PROGRESS; /* Okay, do the write */ - ok = SlruPhysicalWritePage(ctl, pageno, slotno); + ok = SlruPhysicalWritePage(ctl, pageno, 
slotno, fdata); + + /* If we failed, and we're in a flush, better close the files */ + if (!ok && fdata) + { + int i; + + for (i = 0; i < fdata->num_files; i++) + close(fdata->fd[i]); + } /* Re-acquire shared control lock and update page state */ - LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); Assert(shared->page_number[slotno] == pageno && (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS || @@ -425,7 +445,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) if (shared->page_status[slotno] == SLRU_PAGE_WRITE_IN_PROGRESS) shared->page_status[slotno] = ok ? SLRU_PAGE_CLEAN : SLRU_PAGE_DIRTY; - LWLockRelease(ctl->locks->BufferLocks[slotno]); + LWLockRelease(shared->BufferLocks[slotno]); /* Now it's okay to ereport if we failed */ if (!ok) @@ -445,7 +465,7 @@ SimpleLruWritePage(SlruCtl ctl, int slotno) static bool SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) { - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; int offset = rpageno * BLCKSZ; @@ -482,6 +502,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) { slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; + close(fd); return false; } @@ -490,6 +511,7 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) { slru_errcause = SLRU_READ_FAILED; slru_errno = errno; + close(fd); return false; } @@ -511,50 +533,80 @@ SlruPhysicalReadPage(SlruCtl ctl, int pageno, int slotno) * info in static variables to let SlruReportIOError make the report. * * For now, assume it's not worth keeping a file pointer open across - * read/write operations. We could cache one virtual file pointer ... + * independent read/write operations. We do batch operations during + * SimpleLruFlush, though. + * + * fdata is NULL for a standalone write, pointer to open-file info during + * SimpleLruFlush. 
*/ static bool -SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno) +SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno, SlruFlush fdata) { - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; int segno = pageno / SLRU_PAGES_PER_SEGMENT; int rpageno = pageno % SLRU_PAGES_PER_SEGMENT; int offset = rpageno * BLCKSZ; char path[MAXPGPATH]; - int fd; - - SlruFileName(ctl, path, segno); + int fd = -1; /* - * If the file doesn't already exist, we should create it. It is - * possible for this to need to happen when writing a page that's not - * first in its segment; we assume the OS can cope with that. (Note: - * it might seem that it'd be okay to create files only when - * SimpleLruZeroPage is called for the first page of a segment. - * However, if after a crash and restart the REDO logic elects to - * replay the log from a checkpoint before the latest one, then it's - * possible that we will get commands to set transaction status of - * transactions that have already been truncated from the commit log. - * Easiest way to deal with that is to accept references to - * nonexistent files here and in SlruPhysicalReadPage.) + * During a Flush, we may already have the desired file open. */ - fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); - if (fd < 0) + if (fdata) { - if (errno != ENOENT) + int i; + + for (i = 0; i < fdata->num_files; i++) { - slru_errcause = SLRU_OPEN_FAILED; - slru_errno = errno; - return false; + if (fdata->segno[i] == segno) + { + fd = fdata->fd[i]; + break; + } } + } - fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, - S_IRUSR | S_IWUSR); + if (fd < 0) + { + /* + * If the file doesn't already exist, we should create it. It is + * possible for this to need to happen when writing a page that's not + * first in its segment; we assume the OS can cope with that. 
+ * (Note: it might seem that it'd be okay to create files only when + * SimpleLruZeroPage is called for the first page of a segment. + * However, if after a crash and restart the REDO logic elects to + * replay the log from a checkpoint before the latest one, then it's + * possible that we will get commands to set transaction status of + * transactions that have already been truncated from the commit log. + * Easiest way to deal with that is to accept references to + * nonexistent files here and in SlruPhysicalReadPage.) + */ + SlruFileName(ctl, path, segno); + fd = BasicOpenFile(path, O_RDWR | PG_BINARY, S_IRUSR | S_IWUSR); if (fd < 0) { - slru_errcause = SLRU_CREATE_FAILED; - slru_errno = errno; - return false; + if (errno != ENOENT) + { + slru_errcause = SLRU_OPEN_FAILED; + slru_errno = errno; + return false; + } + + fd = BasicOpenFile(path, O_RDWR | O_CREAT | O_EXCL | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd < 0) + { + slru_errcause = SLRU_CREATE_FAILED; + slru_errno = errno; + return false; + } + } + + if (fdata) + { + fdata->fd[fdata->num_files] = fd; + fdata->segno[fdata->num_files] = segno; + fdata->num_files++; } } @@ -562,6 +614,8 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno) { slru_errcause = SLRU_SEEK_FAILED; slru_errno = errno; + if (!fdata) + close(fd); return false; } @@ -573,14 +627,31 @@ SlruPhysicalWritePage(SlruCtl ctl, int pageno, int slotno) errno = ENOSPC; slru_errcause = SLRU_WRITE_FAILED; slru_errno = errno; + if (!fdata) + close(fd); return false; } - if (close(fd)) + /* + * If not part of Flush, need to fsync now. We assume this happens + * infrequently enough that it's not a performance issue. 
+ */ + if (!fdata) { - slru_errcause = SLRU_CLOSE_FAILED; - slru_errno = errno; - return false; + if (pg_fsync(fd)) + { + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + close(fd); + return false; + } + + if (close(fd)) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + return false; + } } return true; @@ -637,6 +708,13 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) errdetail("could not write to file \"%s\" at offset %u: %m", path, offset))); break; + case SLRU_FSYNC_FAILED: + ereport(ERROR, + (errcode_for_file_access(), + errmsg("could not access status of transaction %u", xid), + errdetail("could not fsync file \"%s\": %m", + path))); + break; case SLRU_CLOSE_FAILED: ereport(ERROR, (errcode_for_file_access(), @@ -668,7 +746,7 @@ SlruReportIOError(SlruCtl ctl, int pageno, TransactionId xid) static int SlruSelectLRUPage(SlruCtl ctl, int pageno) { - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; /* Outer loop handles restart after I/O */ for (;;) @@ -717,7 +795,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) (void) SimpleLruReadPage(ctl, shared->page_number[bestslot], InvalidTransactionId, false); else - SimpleLruWritePage(ctl, bestslot); + SimpleLruWritePage(ctl, bestslot, NULL); /* * Now loop back and try again. 
This is the easiest way of @@ -733,7 +811,7 @@ SlruSelectLRUPage(SlruCtl ctl, int pageno) void SimpleLruSetLatestPage(SlruCtl ctl, int pageno) { - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; shared->latest_page_number = pageno; } @@ -744,16 +822,20 @@ SimpleLruSetLatestPage(SlruCtl ctl, int pageno) void SimpleLruFlush(SlruCtl ctl, bool checkpoint) { -#ifdef USE_ASSERT_CHECKING /* only used in Assert() */ - SlruShared shared = (SlruShared) ctl->shared; -#endif + SlruShared shared = ctl->shared; + SlruFlushData fdata; int slotno; + int pageno = 0; + int i; + bool ok; + + fdata.num_files = 0; - LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); for (slotno = 0; slotno < NUM_CLOG_BUFFERS; slotno++) { - SimpleLruWritePage(ctl, slotno); + SimpleLruWritePage(ctl, slotno, &fdata); /* * When called during a checkpoint, we cannot assert that the slot @@ -765,7 +847,32 @@ SimpleLruFlush(SlruCtl ctl, bool checkpoint) shared->page_status[slotno] == SLRU_PAGE_CLEAN); } - LWLockRelease(ctl->locks->ControlLock); + LWLockRelease(shared->ControlLock); + + /* + * Now fsync and close any files that were open + */ + ok = true; + for (i = 0; i < fdata.num_files; i++) + { + if (pg_fsync(fdata.fd[i])) + { + slru_errcause = SLRU_FSYNC_FAILED; + slru_errno = errno; + pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; + ok = false; + } + + if (close(fdata.fd[i])) + { + slru_errcause = SLRU_CLOSE_FAILED; + slru_errno = errno; + pageno = fdata.segno[i] * SLRU_PAGES_PER_SEGMENT; + ok = false; + } + } + if (!ok) + SlruReportIOError(ctl, pageno, InvalidTransactionId); } /* @@ -786,7 +893,7 @@ void SimpleLruTruncate(SlruCtl ctl, int cutoffPage) { int slotno; - SlruShared shared = (SlruShared) ctl->shared; + SlruShared shared = ctl->shared; /* * The cutoff point is the start of the segment containing cutoffPage. 
@@ -805,7 +912,7 @@ SimpleLruTruncate(SlruCtl ctl, int cutoffPage) * have been flushed already during the checkpoint, we're just being * extra careful here.) */ - LWLockAcquire(ctl->locks->ControlLock, LW_EXCLUSIVE); + LWLockAcquire(shared->ControlLock, LW_EXCLUSIVE); restart:; @@ -817,7 +924,7 @@ restart:; */ if (ctl->PagePrecedes(shared->latest_page_number, cutoffPage)) { - LWLockRelease(ctl->locks->ControlLock); + LWLockRelease(shared->ControlLock); ereport(LOG, (errmsg("could not truncate directory \"%s\": apparent wraparound", ctl->Dir))); @@ -849,11 +956,11 @@ restart:; (void) SimpleLruReadPage(ctl, shared->page_number[slotno], InvalidTransactionId, false); else - SimpleLruWritePage(ctl, slotno); + SimpleLruWritePage(ctl, slotno, NULL); goto restart; } - LWLockRelease(ctl->locks->ControlLock); + LWLockRelease(shared->ControlLock); /* Now we can remove the old segment(s) */ (void) SlruScanDirectory(ctl, cutoffPage, true); @@ -878,7 +985,8 @@ SlruScanDirectory(SlruCtl ctl, int cutoffPage, bool doDeletions) if (cldir == NULL) ereport(ERROR, (errcode_for_file_access(), - errmsg("could not open directory \"%s\": %m", ctl->Dir))); + errmsg("could not open directory \"%s\": %m", + ctl->Dir))); errno = 0; while ((clde = readdir(cldir)) != NULL) diff --git a/src/backend/postmaster/bgwriter.c b/src/backend/postmaster/bgwriter.c index ce80a4feff7a6454152d6c154cca5c50296c975d..6bb683386a2f109d2b413f73d76a4b860044dd10 100644 --- a/src/backend/postmaster/bgwriter.c +++ b/src/backend/postmaster/bgwriter.c @@ -27,14 +27,17 @@ * * If the bgwriter exits unexpectedly, the postmaster treats that the same * as a backend crash: shared memory may be corrupted, so remaining backends - * should be killed by SIGQUIT and then a recovery cycle started. + * should be killed by SIGQUIT and then a recovery cycle started. 
(Even if + * shared memory isn't corrupted, we have lost information about which + * files need to be fsync'd for the next checkpoint, and so a system + * restart needs to be forced.) * * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.1 2004/05/29 22:48:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/postmaster/bgwriter.c,v 1.2 2004/05/31 03:47:59 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -55,13 +58,54 @@ #include "utils/guc.h" -/* +/*---------- * Shared memory area for communication between bgwriter and backends + * + * The ckpt counters allow backends to watch for completion of a checkpoint + * request they send. Here's how it works: + * * At start of a checkpoint, bgwriter increments ckpt_started. + * * On completion of a checkpoint, bgwriter sets ckpt_done to + * equal ckpt_started. + * * On failure of a checkpoint, bgwriter first increments ckpt_failed, + * then sets ckpt_done to equal ckpt_started. + * All three fields are declared sig_atomic_t to ensure they can be read + * and written without explicit locking. The algorithm for backends is: + * 1. Record current values of ckpt_failed and ckpt_started (in that + * order!). + * 2. Send signal to request checkpoint. + * 3. Sleep until ckpt_started changes. Now you know a checkpoint has + * begun since you started this algorithm (although *not* that it was + * specifically initiated by your signal). + * 4. Record new value of ckpt_started. + * 5. Sleep until ckpt_done >= saved value of ckpt_started. (Use modulo + * arithmetic here in case counters wrap around.) Now you know a + * checkpoint has started and completed, but not whether it was + * successful. + * 6. If ckpt_failed is different from the originally saved value, + * assume request failed; otherwise it was definitely successful. 
+ * + * The requests array holds fsync requests sent by backends and not yet + * absorbed by the bgwriter. + *---------- */ +typedef struct +{ + RelFileNode rnode; + BlockNumber segno; + /* might add a request-type field later */ +} BgWriterRequest; + typedef struct { pid_t bgwriter_pid; /* PID of bgwriter (0 if not started) */ - sig_atomic_t checkpoint_count; /* advances when checkpoint done */ + + sig_atomic_t ckpt_started; /* advances when checkpoint starts */ + sig_atomic_t ckpt_done; /* advances when checkpoint done */ + sig_atomic_t ckpt_failed; /* advances when checkpoint fails */ + + int num_requests; /* current # of requests */ + int max_requests; /* allocated array size */ + BgWriterRequest requests[1]; /* VARIABLE LENGTH ARRAY */ } BgWriterShmemStruct; static BgWriterShmemStruct *BgWriterShmem; @@ -86,6 +130,10 @@ static volatile sig_atomic_t shutdown_requested = false; /* * Private state */ +static bool am_bg_writer = false; + +static bool ckpt_active = false; + static time_t last_checkpoint_time; @@ -106,6 +154,7 @@ BackgroundWriterMain(void) { Assert(BgWriterShmem != NULL); BgWriterShmem->bgwriter_pid = MyProcPid; + am_bg_writer = true; /* * Properly accept or ignore signals the postmaster might send us @@ -180,6 +229,17 @@ BackgroundWriterMain(void) */ InError = false; + /* Warn any waiting backends that the checkpoint failed. */ + if (ckpt_active) + { + /* use volatile pointer to prevent code rearrangement */ + volatile BgWriterShmemStruct *bgs = BgWriterShmem; + + bgs->ckpt_failed++; + bgs->ckpt_done = bgs->ckpt_started; + ckpt_active = false; + } + /* * Exit interrupt holdoff section we implicitly established above. */ @@ -214,8 +274,17 @@ BackgroundWriterMain(void) long udelay; /* - * Process any signals received recently. + * Emergency bailout if postmaster has died. This is to avoid the + * necessity for manual cleanup of all postmaster children. 
*/ + if (!PostmasterIsAlive(true)) + exit(1); + + /* + * Process any requests or signals received recently. + */ + AbsorbFsyncRequests(); + if (got_SIGHUP) { got_SIGHUP = false; @@ -265,8 +334,20 @@ BackgroundWriterMain(void) errhint("Consider increasing the configuration parameter \"checkpoint_segments\"."))); } + /* + * Indicate checkpoint start to any waiting backends. + */ + ckpt_active = true; + BgWriterShmem->ckpt_started++; + CreateCheckPoint(false, force_checkpoint); + /* + * Indicate checkpoint completion to any waiting backends. + */ + BgWriterShmem->ckpt_done = BgWriterShmem->ckpt_started; + ckpt_active = false; + /* * Note we record the checkpoint start time not end time as * last_checkpoint_time. This is so that time-driven checkpoints @@ -274,14 +355,11 @@ BackgroundWriterMain(void) */ last_checkpoint_time = now; - /* - * Indicate checkpoint completion to any waiting backends. - */ - BgWriterShmem->checkpoint_count++; - /* * After any checkpoint, close all smgr files. This is so we * won't hang onto smgr references to deleted files indefinitely. + * (It is safe to do this because this process does not have a + * relcache, and so no dangling references could remain.) */ smgrcloseall(); @@ -301,6 +379,8 @@ BackgroundWriterMain(void) * we respond reasonably promptly when someone signals us, * break down the sleep into 1-second increments, and check for * interrupts after each nap. + * + * We absorb pending requests after each short sleep. */ udelay = ((n > 0) ? BgWriterDelay : 10000) * 1000L; while (udelay > 1000000L) @@ -308,17 +388,11 @@ BackgroundWriterMain(void) if (got_SIGHUP || checkpoint_requested || shutdown_requested) break; pg_usleep(1000000L); + AbsorbFsyncRequests(); udelay -= 1000000L; } if (!(got_SIGHUP || checkpoint_requested || shutdown_requested)) pg_usleep(udelay); - - /* - * Emergency bailout if postmaster has died. This is to avoid the - * necessity for manual cleanup of all postmaster children. 
- */ - if (!PostmasterIsAlive(true)) - exit(1); } } @@ -387,10 +461,11 @@ int BgWriterShmemSize(void) { /* - * This is not worth measuring right now, but may become so after we - * add fsync signaling ... + * Currently, the size of the requests[] array is arbitrarily set + * equal to NBuffers. This may prove too large or small ... */ - return MAXALIGN(sizeof(BgWriterShmemStruct)); + return MAXALIGN(sizeof(BgWriterShmemStruct) + + (NBuffers - 1) * sizeof(BgWriterRequest)); } /* @@ -404,7 +479,7 @@ BgWriterShmemInit(void) BgWriterShmem = (BgWriterShmemStruct *) ShmemInitStruct("Background Writer Data", - sizeof(BgWriterShmemStruct), + BgWriterShmemSize(), &found); if (BgWriterShmem == NULL) ereport(FATAL, @@ -414,6 +489,7 @@ BgWriterShmemInit(void) return; /* already initialized */ MemSet(BgWriterShmem, 0, sizeof(BgWriterShmemStruct)); + BgWriterShmem->max_requests = NBuffers; } /* @@ -427,8 +503,10 @@ BgWriterShmemInit(void) void RequestCheckpoint(bool waitforit) { - volatile sig_atomic_t *count_ptr = &BgWriterShmem->checkpoint_count; - sig_atomic_t old_count = *count_ptr; + /* use volatile pointer to prevent code rearrangement */ + volatile BgWriterShmemStruct *bgs = BgWriterShmem; + sig_atomic_t old_failed = bgs->ckpt_failed; + sig_atomic_t old_started = bgs->ckpt_started; /* * Send signal to request checkpoint. When waitforit is false, @@ -442,15 +520,119 @@ RequestCheckpoint(bool waitforit) "could not signal for checkpoint: %m"); /* - * If requested, wait for completion. We detect completion by - * observing a change in checkpoint_count in shared memory. + * If requested, wait for completion. We detect completion according + * to the algorithm given above. */ if (waitforit) { - while (*count_ptr == old_count) + while (bgs->ckpt_started == old_started) { CHECK_FOR_INTERRUPTS(); - pg_usleep(1000000L); + pg_usleep(100000L); + } + old_started = bgs->ckpt_started; + /* + * We are waiting for ckpt_done >= old_started, in a modulo + * sense. 
This is a little tricky since we don't know the + * width or signedness of sig_atomic_t. We make the lowest + * common denominator assumption that it is only as wide + * as "char". This means that this algorithm will cope + * correctly as long as we don't sleep for more than 127 + * completed checkpoints. (If we do, we will get another + * chance to exit after 128 more checkpoints...) + */ + while (((signed char) (bgs->ckpt_done - old_started)) < 0) + { + CHECK_FOR_INTERRUPTS(); + pg_usleep(100000L); } + if (bgs->ckpt_failed != old_failed) + ereport(ERROR, + (errmsg("checkpoint request failed"), + errhint("Consult the postmaster log for details."))); + } +} + +/* + * ForwardFsyncRequest + * Forward a file-fsync request from a backend to the bgwriter + * + * Whenever a backend is compelled to write directly to a relation + * (which should be seldom, if the bgwriter is getting its job done), + * the backend calls this routine to pass over knowledge that the relation + * is dirty and must be fsync'd before next checkpoint. + * + * If we are unable to pass over the request (at present, this can happen + * if the shared memory queue is full), we return false. That forces + * the backend to do its own fsync. We hope that will be even more seldom. + * + * Note: we presently make no attempt to eliminate duplicate requests + * in the requests[] queue. The bgwriter will have to eliminate dups + * internally anyway, so we may as well avoid holding the lock longer + * than we have to here. 
+ */ +bool +ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno) +{ + BgWriterRequest *request; + + if (!IsUnderPostmaster) + return false; /* probably shouldn't even get here */ + Assert(BgWriterShmem != NULL); + + LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE); + if (BgWriterShmem->bgwriter_pid == 0 || + BgWriterShmem->num_requests >= BgWriterShmem->max_requests) + { + LWLockRelease(BgWriterCommLock); + return false; + } + request = &BgWriterShmem->requests[BgWriterShmem->num_requests++]; + request->rnode = rnode; + request->segno = segno; + LWLockRelease(BgWriterCommLock); + return true; +} + +/* + * AbsorbFsyncRequests + * Retrieve queued fsync requests and pass them to local smgr. + * + * This is exported because it must be called during CreateCheckpoint; + * we have to be sure we have accepted all pending requests *after* we + * establish the checkpoint redo pointer. Since CreateCheckpoint + * sometimes runs in non-bgwriter processes, do nothing if not bgwriter. + */ +void +AbsorbFsyncRequests(void) +{ + BgWriterRequest *requests = NULL; + BgWriterRequest *request; + int n; + + if (!am_bg_writer) + return; + + /* + * We try to avoid holding the lock for a long time by copying the + * request array. 
+ */ + LWLockAcquire(BgWriterCommLock, LW_EXCLUSIVE); + + n = BgWriterShmem->num_requests; + if (n > 0) + { + requests = (BgWriterRequest *) palloc(n * sizeof(BgWriterRequest)); + memcpy(requests, BgWriterShmem->requests, n * sizeof(BgWriterRequest)); + } + BgWriterShmem->num_requests = 0; + + LWLockRelease(BgWriterCommLock); + + for (request = requests; n > 0; request++, n--) + { + RememberFsyncRequest(request->rnode, request->segno); } + if (requests) + pfree(requests); } diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index f718e33cd598beddb09c76f8467e18f16017c162..2386bc89bf3b7eaf22be77b5448924e89529412f 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.166 2004/05/29 22:48:19 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.167 2004/05/31 03:48:02 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1044,6 +1044,9 @@ RelationTruncate(Relation rel, BlockNumber nblocks) * bothering to write them out first. This is NOT rollback-able, * and so should be used only with extreme caution! * + * There is no particularly good reason why this doesn't have a + * firstDelBlock parameter, except that current callers don't need it. + * * We assume that the caller holds an exclusive lock on the relation, * which should assure that no new buffers will be acquired for the rel * meanwhile. @@ -1052,14 +1055,15 @@ RelationTruncate(Relation rel, BlockNumber nblocks) void DropRelationBuffers(Relation rel) { - DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp); + DropRelFileNodeBuffers(rel->rd_node, rel->rd_istemp, 0); } /* --------------------------------------------------------------------- * DropRelFileNodeBuffers * * This is the same as DropRelationBuffers, except that the target - * relation is specified by RelFileNode and temp status. 
+ * relation is specified by RelFileNode and temp status, and one + * may specify the first block to drop. * * This is NOT rollback-able. One legitimate use is to clear the * buffer cache of buffers for a relation that is being deleted @@ -1067,7 +1071,8 @@ DropRelationBuffers(Relation rel) * -------------------------------------------------------------------- */ void -DropRelFileNodeBuffers(RelFileNode rnode, bool istemp) +DropRelFileNodeBuffers(RelFileNode rnode, bool istemp, + BlockNumber firstDelBlock) { int i; BufferDesc *bufHdr; @@ -1077,7 +1082,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp) for (i = 0; i < NLocBuffer; i++) { bufHdr = &LocalBufferDescriptors[i]; - if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && + bufHdr->tag.blockNum >= firstDelBlock) { bufHdr->flags &= ~(BM_DIRTY | BM_JUST_DIRTIED); bufHdr->cntxDirty = false; @@ -1094,7 +1100,8 @@ DropRelFileNodeBuffers(RelFileNode rnode, bool istemp) { bufHdr = &BufferDescriptors[i - 1]; recheck: - if (RelFileNodeEquals(bufHdr->tag.rnode, rnode)) + if (RelFileNodeEquals(bufHdr->tag.rnode, rnode) && + bufHdr->tag.blockNum >= firstDelBlock) { /* * If there is I/O in progress, better wait till it's done; diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 5ef12de949518be73314b6341308886733f6b730..96de54110cfaab2f21d42120c63f3c09fbb6d961 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.108 2004/02/23 23:03:10 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.109 2004/05/31 03:48:04 tgl Exp $ * * NOTES: * @@ -484,6 +484,7 @@ Insert(File file) DO_DB(_dump_lru()); } +/* returns 0 on success, -1 on re-open failure (with errno set) */ static int LruInsert(File file) { @@ -685,6 +686,7 @@ filepath(const char 
*filename) return buf; } +/* returns 0 on success, -1 on re-open failure (with errno set) */ static int FileAccess(File file) { @@ -954,7 +956,10 @@ FileRead(File file, char *buffer, int amount) file, VfdCache[file].fileName, VfdCache[file].seekPos, amount, buffer)); - FileAccess(file); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + returnCode = read(VfdCache[file].fd, buffer, amount); if (returnCode > 0) VfdCache[file].seekPos += returnCode; @@ -975,7 +980,9 @@ FileWrite(File file, char *buffer, int amount) file, VfdCache[file].fileName, VfdCache[file].seekPos, amount, buffer)); - FileAccess(file); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; errno = 0; returnCode = write(VfdCache[file].fd, buffer, amount); @@ -992,9 +999,28 @@ FileWrite(File file, char *buffer, int amount) return returnCode; } +int +FileSync(File file) +{ + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FileSync: %d (%s)", + file, VfdCache[file].fileName)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + return pg_fsync(VfdCache[file].fd); +} + long FileSeek(File file, long offset, int whence) { + int returnCode; + Assert(FileIsValid(file)); DO_DB(elog(LOG, "FileSeek: %d (%s) %ld %ld %d", @@ -1014,8 +1040,11 @@ FileSeek(File file, long offset, int whence) VfdCache[file].seekPos += offset; break; case SEEK_END: - FileAccess(file); - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; default: elog(ERROR, "invalid whence: %d", whence); @@ -1030,14 +1059,17 @@ FileSeek(File file, long offset, int whence) if (offset < 0) elog(ERROR, "invalid seek offset: %ld", offset); if (VfdCache[file].seekPos != offset) - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + VfdCache[file].seekPos = 
lseek(VfdCache[file].fd, + offset, whence); break; case SEEK_CUR: if (offset != 0 || VfdCache[file].seekPos == FileUnknownPos) - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; case SEEK_END: - VfdCache[file].seekPos = lseek(VfdCache[file].fd, offset, whence); + VfdCache[file].seekPos = lseek(VfdCache[file].fd, + offset, whence); break; default: elog(ERROR, "invalid whence: %d", whence); @@ -1071,7 +1103,10 @@ FileTruncate(File file, long offset) DO_DB(elog(LOG, "FileTruncate %d (%s)", file, VfdCache[file].fileName)); - FileAccess(file); + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + returnCode = ftruncate(VfdCache[file].fd, (size_t) offset); return returnCode; } diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index 2122a243207b12049b5d655e3a01305e70b3aade..5ac5868f690b32196f7eca791674d2de3c6ca4b0 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.104 2004/04/19 17:42:58 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.105 2004/05/31 03:48:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,8 +21,10 @@ #include "catalog/catalog.h" #include "miscadmin.h" +#include "postmaster/bgwriter.h" #include "storage/fd.h" #include "storage/smgr.h" +#include "utils/hsearch.h" #include "utils/memutils.h" @@ -33,37 +35,68 @@ * system's file size limit (often 2GBytes). In order to do that, * we break relations up into chunks of < 2GBytes and store one chunk * in each of several files that represent the relation. See the - * BLCKSZ and RELSEG_SIZE configuration constants in - * include/pg_config.h. All chunks except the last MUST have size exactly - * equal to RELSEG_SIZE blocks --- see mdnblocks() and mdtruncate(). 
+ * BLCKSZ and RELSEG_SIZE configuration constants in pg_config_manual.h. + * All chunks except the last MUST have size exactly equal to RELSEG_SIZE + * blocks --- see mdnblocks() and mdtruncate(). * * The file descriptor pointer (md_fd field) stored in the SMgrRelation * cache is, therefore, just the head of a list of MdfdVec objects. * But note the md_fd pointer can be NULL, indicating relation not open. * + * Note that mdfd_chain == NULL does not necessarily mean the relation + * doesn't have another segment after this one; we may just not have + * opened the next segment yet. (We could not have "all segments are + * in the chain" as an invariant anyway, since another backend could + * extend the relation when we weren't looking.) + * * All MdfdVec objects are palloc'd in the MdCxt memory context. */ typedef struct _MdfdVec { File mdfd_vfd; /* fd number in fd.c's pool */ - -#ifndef LET_OS_MANAGE_FILESIZE - struct _MdfdVec *mdfd_chain; /* for large relations */ + BlockNumber mdfd_segno; /* segment number, from 0 */ +#ifndef LET_OS_MANAGE_FILESIZE /* for large relations */ + struct _MdfdVec *mdfd_chain; /* next segment, or NULL */ #endif } MdfdVec; static MemoryContext MdCxt; /* context for all md.c allocations */ -/* routines declared here */ -static MdfdVec *mdopen(SMgrRelation reln); +/* + * In some contexts (currently, standalone backends and the bgwriter process) + * we keep track of pending fsync operations: we need to remember all relation + * segments that have been written since the last checkpoint, so that we can + * fsync them down to disk before completing the next checkpoint. This hash + * table remembers the pending operations. We use a hash table not because + * we want to look up individual operations, but simply as a convenient way + * of eliminating duplicate requests. + * + * (Regular backends do not track pending operations locally, but forward + * them to the bgwriter.) 
+ * + * XXX for WIN32, may want to expand this to track pending deletes, too. + */ +typedef struct +{ + RelFileNode rnode; /* the targeted relation */ + BlockNumber segno; /* which segment */ +} PendingOperationEntry; + +static HTAB *pendingOpsTable = NULL; + + +/* local routines */ +static MdfdVec *mdopen(SMgrRelation reln, bool allowNotFound); +static bool register_dirty_segment(SMgrRelation reln, MdfdVec *seg); static MdfdVec *_fdvec_alloc(void); #ifndef LET_OS_MANAGE_FILESIZE static MdfdVec *_mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags); #endif -static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno); +static MdfdVec *_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, + bool allowNotFound); static BlockNumber _mdnblocks(File file, Size blcksz); @@ -79,6 +112,31 @@ mdinit(void) ALLOCSET_DEFAULT_INITSIZE, ALLOCSET_DEFAULT_MAXSIZE); + /* + * Create pending-operations hashtable if we need it. Currently, + * we need it if we are standalone (not under a postmaster) OR + * if we are a bootstrap-mode subprocess of a postmaster (that is, + * a startup or bgwriter process). 
+ */ + if (!IsUnderPostmaster || IsBootstrapProcessingMode()) + { + HASHCTL hash_ctl; + + MemSet(&hash_ctl, 0, sizeof(hash_ctl)); + hash_ctl.keysize = sizeof(PendingOperationEntry); + hash_ctl.entrysize = sizeof(PendingOperationEntry); + hash_ctl.hash = tag_hash; + hash_ctl.hcxt = MdCxt; + pendingOpsTable = hash_create("Pending Ops Table", + 100L, + &hash_ctl, + HASH_ELEM | HASH_FUNCTION | HASH_CONTEXT); + if (pendingOpsTable == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + } + return true; } @@ -130,6 +188,7 @@ mdcreate(SMgrRelation reln, bool isRedo) reln->md_fd = _fdvec_alloc(); reln->md_fd->mdfd_vfd = fd; + reln->md_fd->mdfd_segno = 0; #ifndef LET_OS_MANAGE_FILESIZE reln->md_fd->mdfd_chain = NULL; #endif @@ -217,7 +276,7 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) int nbytes; MdfdVec *v; - v = _mdfd_getseg(reln, blocknum); + v = _mdfd_getseg(reln, blocknum, false); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); @@ -252,6 +311,9 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) return false; } + if (!register_dirty_segment(reln, v)) + return false; + #ifndef LET_OS_MANAGE_FILESIZE Assert(_mdnblocks(v->mdfd_vfd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); #endif @@ -261,12 +323,14 @@ mdextend(SMgrRelation reln, BlockNumber blocknum, char *buffer) /* * mdopen() -- Open the specified relation. ereport's on failure. + * (Optionally, can return NULL instead of ereport for ENOENT.) * * Note we only open the first segment, when there are multiple segments. 
*/ static MdfdVec * -mdopen(SMgrRelation reln) +mdopen(SMgrRelation reln, bool allowNotFound) { + MdfdVec *mdfd; char *path; File fd; @@ -292,6 +356,8 @@ mdopen(SMgrRelation reln) if (fd < 0) { pfree(path); + if (allowNotFound && errno == ENOENT) + return NULL; ereport(ERROR, (errcode_for_file_access(), errmsg("could not open relation %u/%u: %m", @@ -302,15 +368,16 @@ mdopen(SMgrRelation reln) pfree(path); - reln->md_fd = _fdvec_alloc(); + reln->md_fd = mdfd = _fdvec_alloc(); - reln->md_fd->mdfd_vfd = fd; + mdfd->mdfd_vfd = fd; + mdfd->mdfd_segno = 0; #ifndef LET_OS_MANAGE_FILESIZE - reln->md_fd->mdfd_chain = NULL; + mdfd->mdfd_chain = NULL; Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); #endif - return reln->md_fd; + return mdfd; } /* @@ -361,7 +428,7 @@ mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer) int nbytes; MdfdVec *v; - v = _mdfd_getseg(reln, blocknum); + v = _mdfd_getseg(reln, blocknum, false); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); @@ -403,7 +470,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) long seekpos; MdfdVec *v; - v = _mdfd_getseg(reln, blocknum); + v = _mdfd_getseg(reln, blocknum, false); #ifndef LET_OS_MANAGE_FILESIZE seekpos = (long) (BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE))); @@ -418,6 +485,9 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) if (FileWrite(v->mdfd_vfd, buffer, BLCKSZ) != BLCKSZ) return false; + if (!register_dirty_segment(reln, v)) + return false; + return true; } @@ -434,7 +504,7 @@ mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer) BlockNumber mdnblocks(SMgrRelation reln) { - MdfdVec *v = mdopen(reln); + MdfdVec *v = mdopen(reln, false); #ifndef LET_OS_MANAGE_FILESIZE BlockNumber nblocks; @@ -516,7 +586,7 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks) if (nblocks == curnblk) return nblocks; /* no work */ - v = mdopen(reln); + v = mdopen(reln, false); #ifndef 
LET_OS_MANAGE_FILESIZE priorblocks = 0; @@ -576,40 +646,154 @@ mdtruncate(SMgrRelation reln, BlockNumber nblocks) } /* - * mdcommit() -- Commit a transaction. + * mdsync() -- Sync previous writes to stable storage. + * + * This is only called during checkpoints, and checkpoints should only + * occur in processes that have created a pendingOpsTable. */ bool -mdcommit(void) +mdsync(void) { + HASH_SEQ_STATUS hstat; + PendingOperationEntry *entry; + + if (!pendingOpsTable) + return false; + /* - * We don't actually have to do anything here... + * If we are in the bgwriter, the sync had better include all fsync + * requests that were queued by backends before the checkpoint REDO + * point was determined. We go that a little better by accepting + * all requests queued up to the point where we start fsync'ing. */ + AbsorbFsyncRequests(); + + hash_seq_init(&hstat, pendingOpsTable); + while ((entry = (PendingOperationEntry *) hash_seq_search(&hstat)) != NULL) + { + /* + * If fsync is off then we don't have to bother opening the file + * at all. (We delay checking until this point so that changing + * fsync on the fly behaves sensibly.) + */ + if (enableFsync) + { + SMgrRelation reln; + MdfdVec *seg; + + /* + * Find or create an smgr hash entry for this relation. + * This may seem a bit unclean -- md calling smgr? But it's + * really the best solution. It ensures that the open file + * reference isn't permanently leaked if we get an error here. + * (You may say "but an unreferenced SMgrRelation is still a + * leak!" Not really, because the only case in which a checkpoint + * is done by a process that isn't about to shut down is in the + * bgwriter, and it will periodically do smgrcloseall(). This + * fact justifies our not closing the reln in the success path + * either, which is a good thing since in non-bgwriter cases + * we couldn't safely do that.) 
Furthermore, in many cases + * the relation will have been dirtied through this same smgr + * relation, and so we can save a file open/close cycle. + */ + reln = smgropen(entry->rnode); + + /* + * It is possible that the relation has been dropped or truncated + * since the fsync request was entered. Therefore, we have to + * allow file-not-found errors. This applies both during + * _mdfd_getseg() and during FileSync, since fd.c might have + * closed the file behind our back. + */ + seg = _mdfd_getseg(reln, + entry->segno * ((BlockNumber) RELSEG_SIZE), + true); + if (seg) + { + if (FileSync(seg->mdfd_vfd) < 0 && + errno != ENOENT) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not fsync segment %u of relation %u/%u: %m", + entry->segno, + entry->rnode.tblNode, + entry->rnode.relNode))); + return false; + } + } + } + + /* Okay, delete this entry */ + if (hash_search(pendingOpsTable, entry, + HASH_REMOVE, NULL) == NULL) + elog(ERROR, "pendingOpsTable corrupted"); + } + return true; } /* - * mdabort() -- Abort a transaction. + * register_dirty_segment() -- Mark a relation segment as needing fsync + * + * If there is a local pending-ops table, just make an entry in it for + * mdsync to process later. Otherwise, try to pass off the fsync request + * to the background writer process. If that fails, just do the fsync + * locally before returning (we expect this will not happen often enough + * to be a performance problem). + * + * A false result implies I/O failure during local fsync. errno will be + * valid for error reporting. */ -bool -mdabort(void) +static bool +register_dirty_segment(SMgrRelation reln, MdfdVec *seg) { - /* - * We don't actually have to do anything here... 
- */ + if (pendingOpsTable) + { + PendingOperationEntry entry; + + /* ensure any pad bytes in the struct are zeroed */ + MemSet(&entry, 0, sizeof(entry)); + entry.rnode = reln->smgr_rnode; + entry.segno = seg->mdfd_segno; + + if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) != NULL) + return true; + /* out of memory: fall through to do it locally */ + } + else + { + if (ForwardFsyncRequest(reln->smgr_rnode, seg->mdfd_segno)) + return true; + } + + if (FileSync(seg->mdfd_vfd) < 0) + return false; return true; } /* - * mdsync() -- Sync previous writes to stable storage. + * RememberFsyncRequest() -- callback from bgwriter side of fsync request + * + * We stuff the fsync request into the local hash table for execution + * during the bgwriter's next checkpoint. */ -bool -mdsync(void) +void +RememberFsyncRequest(RelFileNode rnode, BlockNumber segno) { - sync(); - if (IsUnderPostmaster) - pg_usleep(2000000L); - sync(); - return true; + PendingOperationEntry entry; + + Assert(pendingOpsTable); + + /* ensure any pad bytes in the struct are zeroed */ + MemSet(&entry, 0, sizeof(entry)); + entry.rnode = rnode; + entry.segno = segno; + + if (hash_search(pendingOpsTable, &entry, HASH_ENTER, NULL) == NULL) + ereport(FATAL, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); } /* @@ -618,18 +802,11 @@ mdsync(void) static MdfdVec * _fdvec_alloc(void) { - MdfdVec *v; - - v = (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); - v->mdfd_vfd = -1; -#ifndef LET_OS_MANAGE_FILESIZE - v->mdfd_chain = NULL; -#endif - - return v; + return (MdfdVec *) MemoryContextAlloc(MdCxt, sizeof(MdfdVec)); } #ifndef LET_OS_MANAGE_FILESIZE + /* * Open the specified segment of the relation, * and make a MdfdVec object for it. Returns NULL on failure. 
@@ -642,11 +819,11 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags) char *path, *fullpath; - /* be sure we have enough space for the '.segno', if any */ path = relpath(reln->smgr_rnode); if (segno > 0) { + /* be sure we have enough space for the '.segno' */ fullpath = (char *) palloc(strlen(path) + 12); sprintf(fullpath, "%s.%u", path, segno); pfree(path); @@ -667,32 +844,36 @@ _mdfd_openseg(SMgrRelation reln, BlockNumber segno, int oflags) /* fill the entry */ v->mdfd_vfd = fd; + v->mdfd_segno = segno; v->mdfd_chain = NULL; Assert(_mdnblocks(fd, BLCKSZ) <= ((BlockNumber) RELSEG_SIZE)); /* all done */ return v; } -#endif + +#endif /* LET_OS_MANAGE_FILESIZE */ /* * _mdfd_getseg() -- Find the segment of the relation holding the - * specified block. ereport's on failure. + * specified block. ereport's on failure. + * (Optionally, can return NULL instead of ereport for ENOENT.) */ static MdfdVec * -_mdfd_getseg(SMgrRelation reln, BlockNumber blkno) +_mdfd_getseg(SMgrRelation reln, BlockNumber blkno, bool allowNotFound) { - MdfdVec *v = mdopen(reln); - + MdfdVec *v = mdopen(reln, allowNotFound); #ifndef LET_OS_MANAGE_FILESIZE - BlockNumber segno; - BlockNumber i; + BlockNumber segstogo; + BlockNumber nextsegno; - for (segno = blkno / ((BlockNumber) RELSEG_SIZE), i = 1; - segno > 0; - i++, segno--) - { + if (!v) + return NULL; /* only possible if allowNotFound */ + for (segstogo = blkno / ((BlockNumber) RELSEG_SIZE), nextsegno = 1; + segstogo > 0; + nextsegno++, segstogo--) + { if (v->mdfd_chain == NULL) { /* @@ -705,16 +886,21 @@ _mdfd_getseg(SMgrRelation reln, BlockNumber blkno) * one new segment per call, so this restriction seems * reasonable. */ - v->mdfd_chain = _mdfd_openseg(reln, i, (segno == 1) ? O_CREAT : 0); - + v->mdfd_chain = _mdfd_openseg(reln, + nextsegno, + (segstogo == 1) ? 
O_CREAT : 0); if (v->mdfd_chain == NULL) + { + if (allowNotFound && errno == ENOENT) + return NULL; ereport(ERROR, (errcode_for_file_access(), errmsg("could not open segment %u of relation %u/%u (target block %u): %m", - i, + nextsegno, reln->smgr_rnode.tblNode, reln->smgr_rnode.relNode, blkno))); + } } v = v->mdfd_chain; } diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index d242744a4d7c47d2cc6700e1c3af8ecc23ddc522..c204e2796c4b2125b5f83a1a3fb574e2a9bd85a1 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.70 2004/02/11 22:55:25 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.71 2004/05/31 03:48:06 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -56,7 +56,7 @@ typedef struct f_smgr static const f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdclose, mdcreate, mdunlink, mdextend, - mdread, mdwrite, mdnblocks, mdtruncate, mdcommit, mdabort, mdsync + mdread, mdwrite, mdnblocks, mdtruncate, NULL, NULL, mdsync } }; @@ -407,7 +407,7 @@ smgr_internal_unlink(RelFileNode rnode, int which, bool isTemp, bool isRedo) * Get rid of any leftover buffers for the rel (shouldn't be any in the * commit case, but there can be in the abort case). */ - DropRelFileNodeBuffers(rnode, isTemp); + DropRelFileNodeBuffers(rnode, isTemp, 0); /* * Tell the free space map to forget this relation. It won't be accessed @@ -638,7 +638,7 @@ smgrcommit(void) if (smgrsw[i].smgr_commit) { if (! (*(smgrsw[i].smgr_commit)) ()) - elog(FATAL, "transaction commit failed on %s: %m", + elog(ERROR, "transaction commit failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } @@ -658,7 +658,7 @@ smgrabort(void) if (smgrsw[i].smgr_abort) { if (! 
(*(smgrsw[i].smgr_abort)) ()) - elog(FATAL, "transaction abort failed on %s: %m", + elog(ERROR, "transaction abort failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } @@ -678,7 +678,7 @@ smgrsync(void) if (smgrsw[i].smgr_sync) { if (! (*(smgrsw[i].smgr_sync)) ()) - elog(PANIC, "storage sync failed on %s: %m", + elog(ERROR, "storage sync failed on %s: %m", DatumGetCString(DirectFunctionCall1(smgrout, Int16GetDatum(i)))); } @@ -707,6 +707,13 @@ smgr_redo(XLogRecPtr lsn, XLogRecord *record) reln = smgropen(xlrec->rnode); + /* + * First, force bufmgr to drop any buffers it has for the to-be- + * truncated blocks. We must do this, else subsequent XLogReadBuffer + * operations will not re-extend the file properly. + */ + DropRelFileNodeBuffers(xlrec->rnode, false, xlrec->blkno); + /* Can't use smgrtruncate because it would try to xlog */ /* diff --git a/src/include/access/slru.h b/src/include/access/slru.h index fec968e7a202c4b53146e0e451c64afdfe9d8fdf..213cca5c21654510ba3f78ed90f20ed981bc6e7a 100644 --- a/src/include/access/slru.h +++ b/src/include/access/slru.h @@ -6,26 +6,17 @@ * Portions Copyright (c) 2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.5 2004/05/28 05:13:17 tgl Exp $ + * $PostgreSQL: pgsql/src/include/access/slru.h,v 1.6 2004/05/31 03:48:08 tgl Exp $ */ #ifndef SLRU_H #define SLRU_H -#include "access/xlog.h" +#include "storage/lwlock.h" -/* exported because lwlock.c needs it */ -#define NUM_CLOG_BUFFERS 8 -/* - * Note: the separation between SlruLockData and SlruSharedData is purely - * historical; the structs could be combined. 
- */ -typedef struct SlruLockData -{ - LWLockId ControlLock; - LWLockId BufferLocks[NUM_CLOG_BUFFERS]; /* Per-buffer I/O locks */ -} SlruLockData; -typedef SlruLockData *SlruLock; +/* Opaque structs known only in slru.c */ +typedef struct SlruSharedData *SlruShared; +typedef struct SlruFlushData *SlruFlush; /* * SlruCtlData is an unshared structure that points to the active information @@ -33,13 +24,13 @@ typedef SlruLockData *SlruLock; */ typedef struct SlruCtlData { - void *shared; /* pointer to SlruSharedData */ - SlruLock locks; + SlruShared shared; + + LWLockId ControlLock; /* - * Dir is set during SimpleLruShmemInit and does not change thereafter. - * The value is automatically inherited by backends via fork, and - * doesn't need to be in shared memory. + * Dir is set during SimpleLruInit and does not change thereafter. + * Since it's always the same, it doesn't need to be in shared memory. */ char Dir[MAXPGPATH]; @@ -51,13 +42,16 @@ typedef struct SlruCtlData bool (*PagePrecedes) (int, int); } SlruCtlData; + typedef SlruCtlData *SlruCtl; + extern int SimpleLruShmemSize(void); extern void SimpleLruInit(SlruCtl ctl, const char *name, const char *subdir); extern int SimpleLruZeroPage(SlruCtl ctl, int pageno); -extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, TransactionId xid, bool forwrite); -extern void SimpleLruWritePage(SlruCtl ctl, int slotno); +extern char *SimpleLruReadPage(SlruCtl ctl, int pageno, + TransactionId xid, bool forwrite); +extern void SimpleLruWritePage(SlruCtl ctl, int slotno, SlruFlush fdata); extern void SimpleLruSetLatestPage(SlruCtl ctl, int pageno); extern void SimpleLruFlush(SlruCtl ctl, bool checkpoint); extern void SimpleLruTruncate(SlruCtl ctl, int cutoffPage); diff --git a/src/include/postmaster/bgwriter.h b/src/include/postmaster/bgwriter.h index c11af72e78945f91dfe76bfab00eeab7eabea886..ed56e9639e889ec64fe0dd2c45476561d9b0dcaf 100644 --- a/src/include/postmaster/bgwriter.h +++ b/src/include/postmaster/bgwriter.h @@ -5,13 
+5,17 @@ * * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * - * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.1 2004/05/29 22:48:23 tgl Exp $ + * $PostgreSQL: pgsql/src/include/postmaster/bgwriter.h,v 1.2 2004/05/31 03:48:09 tgl Exp $ * *------------------------------------------------------------------------- */ #ifndef _BGWRITER_H #define _BGWRITER_H +#include "storage/block.h" +#include "storage/relfilenode.h" + + /* GUC options */ extern int BgWriterDelay; extern int BgWriterPercent; @@ -23,6 +27,9 @@ extern void BackgroundWriterMain(void); extern void RequestCheckpoint(bool waitforit); +extern bool ForwardFsyncRequest(RelFileNode rnode, BlockNumber segno); +extern void AbsorbFsyncRequests(void); + extern int BgWriterShmemSize(void); extern void BgWriterShmemInit(void); diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 27752d412b56435ddbe61bf77adcd83554d1df92..95b426bb8b93f21faa93c2069167eb8fe048ea49 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.80 2004/05/29 22:48:23 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.81 2004/05/31 03:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -154,7 +154,8 @@ extern BlockNumber RelationGetNumberOfBlocks(Relation relation); extern void RelationTruncate(Relation rel, BlockNumber nblocks); extern int FlushRelationBuffers(Relation rel, BlockNumber firstDelBlock); extern void DropRelationBuffers(Relation rel); -extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp); +extern void DropRelFileNodeBuffers(RelFileNode rnode, bool istemp, + BlockNumber firstDelBlock); extern void DropBuffers(Oid dbid); #ifdef NOT_USED diff --git 
a/src/include/storage/fd.h b/src/include/storage/fd.h index 177925cf3e80776dbc34dd543ca8e36b5fcbea76..430ed5d8c74181f11b10a41428453f123159f06b 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.44 2004/02/23 23:03:10 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.45 2004/05/31 03:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -15,7 +15,7 @@ /* * calls: * - * File {Close, Read, Write, Seek, Tell, MarkDirty, Sync} + * File {Close, Read, Write, Seek, Tell, Sync} * {File Name Open, Allocate, Free} File * * These are NOT JUST RENAMINGS OF THE UNIX ROUTINES. @@ -66,6 +66,7 @@ extern void FileClose(File file); extern void FileUnlink(File file); extern int FileRead(File file, char *buffer, int amount); extern int FileWrite(File file, char *buffer, int amount); +extern int FileSync(File file); extern long FileSeek(File file, long offset, int whence); extern int FileTruncate(File file, long offset); diff --git a/src/include/storage/lwlock.h b/src/include/storage/lwlock.h index 34f9c6613c709f9122a054dfb37deab756bede29..e06d9a4bf77f43af92bdb216ad5b009c75a868c1 100644 --- a/src/include/storage/lwlock.h +++ b/src/include/storage/lwlock.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.10 2003/12/20 17:31:21 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/lwlock.h,v 1.11 2004/05/31 03:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -37,6 +37,7 @@ typedef enum LWLockId ControlFileLock, CheckpointLock, RelCacheInitLock, + BgWriterCommLock, NumFixedLWLocks, /* must 
be last except for * MaxDynamicLWLock */ diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 41367d35e819b92ff1a5b889753e1f193006366a..6a28c3824fad8e6b5f1a90cabb455e8219b8b53f 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2003, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.41 2004/02/11 22:55:26 tgl Exp $ + * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.42 2004/05/31 03:48:10 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -83,10 +83,10 @@ extern bool mdread(SMgrRelation reln, BlockNumber blocknum, char *buffer); extern bool mdwrite(SMgrRelation reln, BlockNumber blocknum, char *buffer); extern BlockNumber mdnblocks(SMgrRelation reln); extern BlockNumber mdtruncate(SMgrRelation reln, BlockNumber nblocks); -extern bool mdcommit(void); -extern bool mdabort(void); extern bool mdsync(void); +extern void RememberFsyncRequest(RelFileNode rnode, BlockNumber segno); + /* smgrtype.c */ extern Datum smgrout(PG_FUNCTION_ARGS); extern Datum smgrin(PG_FUNCTION_ARGS);