diff --git a/doc/src/sgml/config.sgml b/doc/src/sgml/config.sgml index 0d8cf0a84cb52f707f6cb529e42ce8510c8fd4aa..7d21e31bdb5143c249d8d14f28a0bfab462928d4 100644 --- a/doc/src/sgml/config.sgml +++ b/doc/src/sgml/config.sgml @@ -1,4 +1,4 @@ -<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.204 2009/01/09 10:13:18 mha Exp $ --> +<!-- $PostgreSQL: pgsql/doc/src/sgml/config.sgml,v 1.205 2009/01/12 05:10:44 tgl Exp $ --> <chapter Id="runtime-config"> <title>Server Configuration</title> @@ -1203,6 +1203,55 @@ SET ENABLE_SEQSCAN TO OFF; queries. </para> </sect2> + + <sect2 id="runtime-config-resource-async-behavior"> + <title>Asynchronous Behavior</title> + + <variablelist> + <varlistentry id="guc-effective-io-concurrency" xreflabel="effective_io_concurrency"> + <term><varname>effective_io_concurrency</varname> (<type>integer</type>)</term> + <indexterm> + <primary><varname>effective_io_concurrency</> configuration parameter</primary> + </indexterm> + <listitem> + <para> + Sets the number of concurrent disk I/O operations that + <productname>PostgreSQL</> expects can be executed + simultaneously. Raising this value will increase the number of I/O + operations that any individual <productname>PostgreSQL</> session + attempts to initiate in parallel. The allowed range is 1 to 1000, + or zero to disable issuance of asynchronous I/O requests. + </para> + + <para> + A good starting point for this setting is the number of separate + drives comprising a RAID 0 stripe or RAID 1 mirror being used for the + database. (For RAID 5 the parity drive should not be counted.) + However, if the database is often busy with multiple queries issued in + concurrent sessions, lower values may be sufficient to keep the disk + array busy. A value higher than needed to keep the disks busy will + only result in extra CPU overhead. 
+ </para> + + <para> + For more exotic systems, such as memory-based storage or a RAID array + that is limited by bus bandwidth, the correct value might be the + number of I/O paths available. Some experimentation may be needed + to find the best value. + </para> + + <para> + Asynchronous I/O depends on an effective <function>posix_fadvise</> + function, which some operating systems lack. If the function is not + present then setting this parameter to anything but zero will result + in an error. On some operating systems the function is present but + does not actually do anything. On such systems setting a nonzero + value will add CPU overhead without improving performance. + </para> + </listitem> + </varlistentry> + </variablelist> + </sect2> </sect1> <sect1 id="runtime-config-wal"> diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c index 880b9c9590eae67d2b1e5fa55cf2acf6c7e7a783..2ba8b89ee359c2910ce582d810ea51679fd39a60 100644 --- a/src/backend/executor/nodeBitmapHeapscan.c +++ b/src/backend/executor/nodeBitmapHeapscan.c @@ -21,7 +21,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.32 2009/01/10 21:08:36 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeBitmapHeapscan.c,v 1.33 2009/01/12 05:10:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -67,6 +67,7 @@ BitmapHeapNext(BitmapHeapScanState *node) TIDBitmap *tbm; TBMIterator *tbmiterator; TBMIterateResult *tbmres; + TBMIterator *prefetch_iterator; OffsetNumber targoffset; TupleTableSlot *slot; @@ -81,6 +82,7 @@ BitmapHeapNext(BitmapHeapScanState *node) tbm = node->tbm; tbmiterator = node->tbmiterator; tbmres = node->tbmres; + prefetch_iterator = node->prefetch_iterator; /* * Check if we are evaluating PlanQual for tuple of this relation. 
@@ -114,6 +116,15 @@ BitmapHeapNext(BitmapHeapScanState *node) /* * If we haven't yet performed the underlying index scan, do it, and * begin the iteration over the bitmap. + * + * For prefetching, we use *two* iterators, one for the pages we are + * actually scanning and another that runs ahead of the first for + * prefetching. node->prefetch_pages tracks exactly how many pages + * ahead the prefetch iterator is. Also, node->prefetch_target tracks + * the desired prefetch distance, which starts small and increases up + * to the GUC-controlled maximum, target_prefetch_pages. This is to + * avoid doing a lot of prefetching in a scan that stops after a few + * tuples because of a LIMIT. */ if (tbm == NULL) { @@ -125,6 +136,15 @@ BitmapHeapNext(BitmapHeapScanState *node) node->tbm = tbm; node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm); node->tbmres = tbmres = NULL; + +#ifdef USE_PREFETCH + if (target_prefetch_pages > 0) + { + node->prefetch_iterator = prefetch_iterator = tbm_begin_iterate(tbm); + node->prefetch_pages = 0; + node->prefetch_target = -1; + } +#endif /* USE_PREFETCH */ } for (;;) @@ -144,6 +164,22 @@ BitmapHeapNext(BitmapHeapScanState *node) break; } +#ifdef USE_PREFETCH + if (node->prefetch_pages > 0) + { + /* The main iterator has closed the distance by one page */ + node->prefetch_pages--; + } + else if (prefetch_iterator) + { + /* Do not let the prefetch iterator get behind the main one */ + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + + if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno) + elog(ERROR, "prefetch and main iterators are out of sync"); + } +#endif /* USE_PREFETCH */ + /* * Ignore any claimed entries past what we think is the end of the * relation. 
(This is probably not necessary given that we got at @@ -165,6 +201,23 @@ BitmapHeapNext(BitmapHeapScanState *node) * Set rs_cindex to first slot to examine */ scan->rs_cindex = 0; + +#ifdef USE_PREFETCH + /* + * Increase prefetch target if it's not yet at the max. Note + * that we will increase it to zero after fetching the very + * first page/tuple, then to one after the second tuple is + * fetched, then it doubles as later pages are fetched. + */ + if (node->prefetch_target >= target_prefetch_pages) + /* don't increase any further */ ; + else if (node->prefetch_target >= target_prefetch_pages / 2) + node->prefetch_target = target_prefetch_pages; + else if (node->prefetch_target > 0) + node->prefetch_target *= 2; + else + node->prefetch_target++; +#endif /* USE_PREFETCH */ } else { @@ -172,7 +225,40 @@ BitmapHeapNext(BitmapHeapScanState *node) * Continuing in previously obtained page; advance rs_cindex */ scan->rs_cindex++; + +#ifdef USE_PREFETCH + /* + * Try to prefetch at least a few pages even before we get to the + * second page if we don't stop reading after the first tuple. + */ + if (node->prefetch_target < target_prefetch_pages) + node->prefetch_target++; +#endif /* USE_PREFETCH */ + } + +#ifdef USE_PREFETCH + /* + * We issue prefetch requests *after* fetching the current page + * to try to avoid having prefetching interfere with the main I/O. + */ + if (prefetch_iterator) + { + while (node->prefetch_pages < node->prefetch_target) + { + TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator); + + if (tbmpre == NULL) + { + /* No more pages to prefetch */ + tbm_end_iterate(prefetch_iterator); + node->prefetch_iterator = prefetch_iterator = NULL; + break; + } + node->prefetch_pages++; + PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno); + } } +#endif /* USE_PREFETCH */ /* * Out of range? 
If so, nothing more to look at on this page @@ -379,11 +465,14 @@ ExecBitmapHeapReScan(BitmapHeapScanState *node, ExprContext *exprCtxt) if (node->tbmiterator) tbm_end_iterate(node->tbmiterator); + if (node->prefetch_iterator) + tbm_end_iterate(node->prefetch_iterator); if (node->tbm) tbm_free(node->tbm); node->tbm = NULL; node->tbmiterator = NULL; node->tbmres = NULL; + node->prefetch_iterator = NULL; /* * Always rescan the input immediately, to ensure we can pass down any @@ -429,6 +518,8 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node) */ if (node->tbmiterator) tbm_end_iterate(node->tbmiterator); + if (node->prefetch_iterator) + tbm_end_iterate(node->prefetch_iterator); if (node->tbm) tbm_free(node->tbm); @@ -474,6 +565,9 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags) scanstate->tbm = NULL; scanstate->tbmiterator = NULL; scanstate->tbmres = NULL; + scanstate->prefetch_iterator = NULL; + scanstate->prefetch_pages = 0; + scanstate->prefetch_target = 0; /* * Miscellaneous initialization diff --git a/src/backend/storage/buffer/bufmgr.c b/src/backend/storage/buffer/bufmgr.c index 6046f6ef6aa325fb258f48eaa6a707865b54e398..534c7516f78e1030a89b5692bc83ca06dd9e2a49 100644 --- a/src/backend/storage/buffer/bufmgr.c +++ b/src/backend/storage/buffer/bufmgr.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.244 2009/01/01 17:23:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/bufmgr.c,v 1.245 2009/01/12 05:10:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -65,6 +65,13 @@ bool zero_damaged_pages = false; int bgwriter_lru_maxpages = 100; double bgwriter_lru_multiplier = 2.0; +/* + * How many buffers PrefetchBuffer callers should try to stay ahead of their + * ReadBuffer calls by. This is maintained by the assign hook for + * effective_io_concurrency. Zero means "never prefetch". 
+ */ +int target_prefetch_pages = 0; + /* local state for StartBufferIO and related functions */ static volatile BufferDesc *InProgressBuf = NULL; static bool IsForInput; @@ -95,6 +102,56 @@ static void FlushBuffer(volatile BufferDesc *buf, SMgrRelation reln); static void AtProcExit_Buffers(int code, Datum arg); +/* + * PrefetchBuffer -- initiate asynchronous read of a block of a relation + * + * This is named by analogy to ReadBuffer but doesn't actually allocate a + * buffer. Instead it tries to ensure that a future ReadBuffer for the given + * block will not be delayed by the I/O. Prefetching is optional. + * No-op if prefetching isn't compiled in. + */ +void +PrefetchBuffer(Relation reln, ForkNumber forkNum, BlockNumber blockNum) +{ +#ifdef USE_PREFETCH + Assert(RelationIsValid(reln)); + Assert(BlockNumberIsValid(blockNum)); + + /* Open it at the smgr level if not already done */ + RelationOpenSmgr(reln); + + if (reln->rd_istemp) + { + /* pass it off to localbuf.c */ + LocalPrefetchBuffer(reln->rd_smgr, forkNum, blockNum); + } + else + { + BufferTag newTag; /* identity of requested block */ + uint32 newHash; /* hash value for newTag */ + LWLockId newPartitionLock; /* buffer partition lock for it */ + int buf_id; + + /* create a tag so we can lookup the buffer */ + INIT_BUFFERTAG(newTag, reln->rd_smgr->smgr_rnode, forkNum, blockNum); + + /* determine its hash code and partition lock ID */ + newHash = BufTableHashCode(&newTag); + newPartitionLock = BufMappingPartitionLock(newHash); + + /* see if the block is in the buffer pool already */ + LWLockAcquire(newPartitionLock, LW_SHARED); + buf_id = BufTableLookup(&newTag, newHash); + LWLockRelease(newPartitionLock); + + /* If not in buffers, initiate prefetch */ + if (buf_id < 0) + smgrprefetch(reln->rd_smgr, forkNum, blockNum); + } +#endif /* USE_PREFETCH */ +} + + /* * ReadBuffer -- a shorthand for ReadBufferExtended, for reading from main * fork with RBM_NORMAL mode and default strategy. 
diff --git a/src/backend/storage/buffer/localbuf.c b/src/backend/storage/buffer/localbuf.c index 4dd5619f39fb668cab7b3e1ecddfb86e2bce319c..5431419cfe6705def7b0dcde6e5e9833a76fca80 100644 --- a/src/backend/storage/buffer/localbuf.c +++ b/src/backend/storage/buffer/localbuf.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.85 2009/01/01 17:23:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/buffer/localbuf.c,v 1.86 2009/01/12 05:10:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -52,6 +52,43 @@ static void InitLocalBuffers(void); static Block GetLocalBufferStorage(void); +/* + * LocalPrefetchBuffer - + * initiate asynchronous read of a block of a relation + * + * Do PrefetchBuffer's work for temporary relations. + * No-op if prefetching isn't compiled in. + */ +void +LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, + BlockNumber blockNum) +{ +#ifdef USE_PREFETCH + BufferTag newTag; /* identity of requested block */ + LocalBufferLookupEnt *hresult; + + INIT_BUFFERTAG(newTag, smgr->smgr_rnode, forkNum, blockNum); + + /* Initialize local buffers if first request in this session */ + if (LocalBufHash == NULL) + InitLocalBuffers(); + + /* See if the desired buffer already exists */ + hresult = (LocalBufferLookupEnt *) + hash_search(LocalBufHash, (void *) &newTag, HASH_FIND, NULL); + + if (hresult) + { + /* Yes, so nothing to do */ + return; + } + + /* Not in buffers, so initiate prefetch */ + smgrprefetch(smgr, forkNum, blockNum); +#endif /* USE_PREFETCH */ +} + + /* * LocalBufferAlloc - * Find or create a local buffer for the given page of the given relation. 
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index f67ab94fd526886cd2e04207a003fc7ae3a70c29..b91946a035052926415453c4eb22e99e6826fd40 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -7,7 +7,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.146 2009/01/01 17:23:47 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/file/fd.c,v 1.147 2009/01/12 05:10:44 tgl Exp $ * * NOTES: * @@ -1029,6 +1029,42 @@ FileClose(File file) FreeVfd(file); } +/* + * FilePrefetch - initiate asynchronous read of a given range of the file. + * The logical seek position is unaffected. + * + * Currently the only implementation of this function uses posix_fadvise, + * which is the simplest standardized interface that accomplishes this. + * We could add an implementation using libaio in the future; but note that + * this API is inappropriate for libaio, which wants to have a buffer provided + * to read into. 
+ */ +int +FilePrefetch(File file, off_t offset, int amount) +{ +#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_WILLNEED) + int returnCode; + + Assert(FileIsValid(file)); + + DO_DB(elog(LOG, "FilePrefetch: %d (%s) " INT64_FORMAT " %d", + file, VfdCache[file].fileName, + (int64) offset, amount)); + + returnCode = FileAccess(file); + if (returnCode < 0) + return returnCode; + + returnCode = posix_fadvise(VfdCache[file].fd, offset, amount, + POSIX_FADV_WILLNEED); + + return returnCode; +#else + Assert(FileIsValid(file)); + return 0; +#endif +} + int FileRead(File file, char *buffer, int amount) { diff --git a/src/backend/storage/smgr/md.c b/src/backend/storage/smgr/md.c index b9c1273702fea6bfafa965cb539b85404f1cd47b..643c75e538b0b2a278a8d22eab84df7d6975ad5b 100644 --- a/src/backend/storage/smgr/md.c +++ b/src/backend/storage/smgr/md.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.143 2009/01/01 17:23:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/md.c,v 1.144 2009/01/12 05:10:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -550,6 +550,26 @@ mdclose(SMgrRelation reln, ForkNumber forknum) } } +/* + * mdprefetch() -- Initiate asynchronous read of the specified block of a relation + */ +void +mdprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ +#ifdef USE_PREFETCH + off_t seekpos; + MdfdVec *v; + + v = _mdfd_getseg(reln, forknum, blocknum, false, EXTENSION_FAIL); + + seekpos = (off_t) BLCKSZ * (blocknum % ((BlockNumber) RELSEG_SIZE)); + Assert(seekpos < (off_t) BLCKSZ * RELSEG_SIZE); + + (void) FilePrefetch(v->mdfd_vfd, seekpos, BLCKSZ); +#endif /* USE_PREFETCH */ +} + + /* * mdread() -- Read the specified block from a relation. 
*/ diff --git a/src/backend/storage/smgr/smgr.c b/src/backend/storage/smgr/smgr.c index 6ed91bd96ff4e6183973a857a2fd95a7df619e0f..f2cc449f175de04958911e5ebc00241baea2b2c3 100644 --- a/src/backend/storage/smgr/smgr.c +++ b/src/backend/storage/smgr/smgr.c @@ -11,7 +11,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.115 2009/01/01 17:23:48 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/storage/smgr/smgr.c,v 1.116 2009/01/12 05:10:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -48,6 +48,8 @@ typedef struct f_smgr bool isRedo); void (*smgr_extend) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp); + void (*smgr_prefetch) (SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); void (*smgr_read) (SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); void (*smgr_write) (SMgrRelation reln, ForkNumber forknum, @@ -65,7 +67,7 @@ typedef struct f_smgr static const f_smgr smgrsw[] = { /* magnetic disk */ {mdinit, NULL, mdclose, mdcreate, mdexists, mdunlink, mdextend, - mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync, + mdprefetch, mdread, mdwrite, mdnblocks, mdtruncate, mdimmedsync, mdpreckpt, mdsync, mdpostckpt } }; @@ -375,6 +377,15 @@ smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, buffer, isTemp); } +/* + * smgrprefetch() -- Initiate asynchronous read of the specified block of a relation. + */ +void +smgrprefetch(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum) +{ + (*(smgrsw[reln->smgr_which].smgr_prefetch)) (reln, forknum, blocknum); +} + /* * smgrread() -- read a particular block from a relation into the supplied * buffer. 
diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index 8d927ae138778bf2a4f6b78cbdb8da2e910bd0fb..63e9628a5dcfa8d4acafd0dbd405454dc43fc001 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -10,7 +10,7 @@ * Written by Peter Eisentraut <peter_e@gmx.net>. * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.492 2009/01/09 10:13:18 mha Exp $ + * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.493 2009/01/12 05:10:44 tgl Exp $ * *-------------------------------------------------------------------- */ @@ -18,6 +18,7 @@ #include <ctype.h> #include <float.h> +#include <math.h> #include <limits.h> #include <unistd.h> #include <sys/stat.h> @@ -163,8 +164,9 @@ static bool assign_tcp_keepalives_count(int newval, bool doit, GucSource source) static const char *show_tcp_keepalives_idle(void); static const char *show_tcp_keepalives_interval(void); static const char *show_tcp_keepalives_count(void); -static bool assign_autovacuum_max_workers(int newval, bool doit, GucSource source); static bool assign_maxconnections(int newval, bool doit, GucSource source); +static bool assign_autovacuum_max_workers(int newval, bool doit, GucSource source); +static bool assign_effective_io_concurrency(int newval, bool doit, GucSource source); static const char *assign_pgstat_temp_directory(const char *newval, bool doit, GucSource source); static char *config_enum_get_options(struct config_enum *record, @@ -413,6 +415,7 @@ static int segment_size; static int wal_block_size; static int wal_segment_size; static bool integer_datetimes; +static int effective_io_concurrency; /* should be static, but commands/variable.c needs to get at these */ char *role_string; @@ -1700,6 +1703,20 @@ static struct config_int ConfigureNamesInt[] = 100, 0, 1000, NULL, NULL }, + { + {"effective_io_concurrency", PGC_USERSET, RESOURCES, + gettext_noop("Number of simultaneous requests that can be handled efficiently by the disk subsystem."), + 
gettext_noop("For RAID arrays, this should be approximately the number of drive spindles in the array.") + }, + &effective_io_concurrency, +#ifdef USE_PREFETCH + 1, 0, 1000, +#else + 0, 0, 0, +#endif + assign_effective_io_concurrency, NULL + }, + { {"log_rotation_age", PGC_SIGHUP, LOGGING_WHERE, gettext_noop("Automatic log file rotation will occur after N minutes."), @@ -7587,6 +7604,61 @@ assign_autovacuum_max_workers(int newval, bool doit, GucSource source) return true; } +static bool +assign_effective_io_concurrency(int newval, bool doit, GucSource source) +{ +#ifdef USE_PREFETCH + double new_prefetch_pages = 0.0; + int i; + + /*---------- + * The user-visible GUC parameter is the number of drives (spindles), + * which we need to translate to a number-of-pages-to-prefetch target. + * + * The expected number of prefetch pages needed to keep N drives busy is: + * + * drives | I/O requests + * -------+---------------- + * 1 | 1 + * 2 | 2/1 + 2/2 = 3 + * 3 | 3/1 + 3/2 + 3/3 = 5 1/2 + * 4 | 4/1 + 4/2 + 4/3 + 4/4 = 8 1/3 + * n | n * H(n) + * + * This is called the "coupon collector problem" and H(n) is the n'th + * harmonic number. This could be approximated by n * ln(n), but for + * reasonable numbers of drives we might as well just compute the series. + * + * Alternatively we could set the target to the number of pages necessary + * so that the expected number of active spindles is some arbitrary + * percentage of the total. This sounds the same but is actually slightly + * different. The result ends up being ln(1-P)/ln((n-1)/n) where P is + * that desired fraction. + * + * Experimental results show that neither of these formulas is aggressive + * enough, but we don't really have any better proposals. + * + * Note that if newval = 0 (disabled), we must set target = 0. 
+ *---------- + */ + + for (i = 1; i <= newval; i++) + new_prefetch_pages += (double) newval / (double) i; + + /* This range check shouldn't fail, but let's be paranoid */ + if (new_prefetch_pages >= 0.0 && new_prefetch_pages < (double) INT_MAX) + { + if (doit) + target_prefetch_pages = (int) rint(new_prefetch_pages); + return true; + } + else + return false; +#else + return true; +#endif /* USE_PREFETCH */ +} + static const char * assign_pgstat_temp_directory(const char *newval, bool doit, GucSource source) { diff --git a/src/backend/utils/misc/postgresql.conf.sample b/src/backend/utils/misc/postgresql.conf.sample index ffa5055b76889b56bcdfd09a81f1fc29ee8af975..977e13e0aff54e700567b211cf9d734524453294 100644 --- a/src/backend/utils/misc/postgresql.conf.sample +++ b/src/backend/utils/misc/postgresql.conf.sample @@ -131,6 +131,10 @@ #bgwriter_lru_maxpages = 100 # 0-1000 max buffers written/round #bgwriter_lru_multiplier = 2.0 # 0-10.0 multipler on buffers scanned/round +# - Asynchronous Behavior - + +#effective_io_concurrency = 1 # 1-1000, or 0 to disable prefetching + #------------------------------------------------------------------------------ # WRITE AHEAD LOG diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 506605df0014676ded4f42770ea11469d4087148..8d87ec19e1d09bb365eb3bb9ea598076d57675e7 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.200 2009/01/10 21:08:36 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.201 2009/01/12 05:10:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1154,6 +1154,9 @@ typedef struct BitmapIndexScanState * tbm bitmap obtained from child index scan(s) * tbmiterator iterator for scanning current 
pages * tbmres current-page data + * prefetch_iterator iterator for prefetching ahead of current page + * prefetch_pages # pages prefetch iterator is ahead of current + * prefetch_target target prefetch distance * ---------------- */ typedef struct BitmapHeapScanState @@ -1163,6 +1166,9 @@ typedef struct BitmapHeapScanState TIDBitmap *tbm; TBMIterator *tbmiterator; TBMIterateResult *tbmres; + TBMIterator *prefetch_iterator; + int prefetch_pages; + int prefetch_target; } BitmapHeapScanState; /* ---------------- diff --git a/src/include/pg_config_manual.h b/src/include/pg_config_manual.h index ff9d6ce45decd23ebeeb96e7b940ac9a36355988..bc66df2eb340060620c1ff7ff74af01ee1560e56 100644 --- a/src/include/pg_config_manual.h +++ b/src/include/pg_config_manual.h @@ -6,7 +6,7 @@ * for developers. If you edit any of these, be sure to do a *full* * rebuild (and an initdb if noted). * - * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.36 2009/01/11 18:02:17 tgl Exp $ + * $PostgreSQL: pgsql/src/include/pg_config_manual.h,v 1.37 2009/01/12 05:10:45 tgl Exp $ *------------------------------------------------------------------------ */ @@ -135,6 +135,15 @@ #define USE_POSIX_FADVISE #endif +/* + * USE_PREFETCH code should be compiled only if we have a way to implement + * prefetching. (This is decoupled from USE_POSIX_FADVISE because there + * might in future be support for alternative low-level prefetch APIs.) + */ +#ifdef USE_POSIX_FADVISE +#define USE_PREFETCH +#endif + /* * This is the default directory in which AF_UNIX socket files are * placed. 
Caution: changing this risks breaking your existing client diff --git a/src/include/storage/buf_internals.h b/src/include/storage/buf_internals.h index 9ec9fcb98a57332234af7c46f3641abb95276813..12512d7428b01084de5bf053d44b53aebaf12069 100644 --- a/src/include/storage/buf_internals.h +++ b/src/include/storage/buf_internals.h @@ -8,7 +8,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.100 2009/01/01 17:24:01 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/buf_internals.h,v 1.101 2009/01/12 05:10:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -208,7 +208,9 @@ extern int BufTableInsert(BufferTag *tagPtr, uint32 hashcode, int buf_id); extern void BufTableDelete(BufferTag *tagPtr, uint32 hashcode); /* localbuf.c */ -extern BufferDesc *LocalBufferAlloc(SMgrRelation reln, ForkNumber forkNum, +extern void LocalPrefetchBuffer(SMgrRelation smgr, ForkNumber forkNum, + BlockNumber blockNum); +extern BufferDesc *LocalBufferAlloc(SMgrRelation smgr, ForkNumber forkNum, BlockNumber blockNum, bool *foundPtr); extern void MarkLocalBufferDirty(Buffer buffer); extern void DropRelFileNodeLocalBuffers(RelFileNode rnode, ForkNumber forkNum, diff --git a/src/include/storage/bufmgr.h b/src/include/storage/bufmgr.h index 9d1f47d58a2665b9da0f32ec8d3243d5b34f389e..0ee09ced6d214d984aa3d1a445ffa80932a8a344 100644 --- a/src/include/storage/bufmgr.h +++ b/src/include/storage/bufmgr.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.119 2009/01/01 17:24:01 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/bufmgr.h,v 1.120 2009/01/12 05:10:45 tgl Exp $ * 
*------------------------------------------------------------------------- */ @@ -47,6 +47,7 @@ extern PGDLLIMPORT int NBuffers; extern bool zero_damaged_pages; extern int bgwriter_lru_maxpages; extern double bgwriter_lru_multiplier; +extern int target_prefetch_pages; /* in buf_init.c */ extern PGDLLIMPORT char *BufferBlocks; @@ -152,6 +153,8 @@ extern PGDLLIMPORT int32 *LocalRefCount; /* * prototypes for functions in bufmgr.c */ +extern void PrefetchBuffer(Relation reln, ForkNumber forkNum, + BlockNumber blockNum); extern Buffer ReadBuffer(Relation reln, BlockNumber blockNum); extern Buffer ReadBufferExtended(Relation reln, ForkNumber forkNum, BlockNumber blockNum, ReadBufferMode mode, diff --git a/src/include/storage/fd.h b/src/include/storage/fd.h index 17aa150aa03ea33e2bdc4be9f522ec0c7ddcfa48..98d091c97872db05de18c1dcb99eb6cc77d6b9e5 100644 --- a/src/include/storage/fd.h +++ b/src/include/storage/fd.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.63 2009/01/01 17:24:01 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/fd.h,v 1.64 2009/01/12 05:10:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -62,6 +62,7 @@ extern int max_files_per_process; extern File PathNameOpenFile(FileName fileName, int fileFlags, int fileMode); extern File OpenTemporaryFile(bool interXact); extern void FileClose(File file); +extern int FilePrefetch(File file, off_t offset, int amount); extern int FileRead(File file, char *buffer, int amount); extern int FileWrite(File file, char *buffer, int amount); extern int FileSync(File file); diff --git a/src/include/storage/smgr.h b/src/include/storage/smgr.h index 0392fdf81a3c1d73e99fcb5ab1653b7627552dfd..e753af76dde2a9ac8ac6a1903beaf566807a1a6e 100644 --- a/src/include/storage/smgr.h +++ b/src/include/storage/smgr.h @@ -7,7 
+7,7 @@ * Portions Copyright (c) 1996-2009, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.65 2009/01/01 17:24:01 momjian Exp $ + * $PostgreSQL: pgsql/src/include/storage/smgr.h,v 1.66 2009/01/12 05:10:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -70,6 +70,8 @@ extern void smgrdounlink(SMgrRelation reln, ForkNumber forknum, bool isTemp, bool isRedo); extern void smgrextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp); +extern void smgrprefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); extern void smgrread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void smgrwrite(SMgrRelation reln, ForkNumber forknum, @@ -93,6 +95,8 @@ extern bool mdexists(SMgrRelation reln, ForkNumber forknum); extern void mdunlink(RelFileNode rnode, ForkNumber forknum, bool isRedo); extern void mdextend(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer, bool isTemp); +extern void mdprefetch(SMgrRelation reln, ForkNumber forknum, + BlockNumber blocknum); extern void mdread(SMgrRelation reln, ForkNumber forknum, BlockNumber blocknum, char *buffer); extern void mdwrite(SMgrRelation reln, ForkNumber forknum,