diff --git a/src/backend/access/hash/README b/src/backend/access/hash/README index 014f742c252ce34921e2e2fe532e5bd0388774f5..9f34fd73c483d3cd738d0b239b7aa09f35dab913 100644 --- a/src/backend/access/hash/README +++ b/src/backend/access/hash/README @@ -1,4 +1,4 @@ -$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.6 2007/04/19 20:24:04 tgl Exp $ +$PostgreSQL: pgsql/src/backend/access/hash/README,v 1.7 2008/03/15 20:46:31 tgl Exp $ This directory contains an implementation of hash indexing for Postgres. Most of the core ideas are taken from Margo Seltzer and Ozan Yigit, A New Hashing @@ -65,6 +65,11 @@ hashm_spares[N] <= hashm_spares[N+1], since the latter count includes the former. The difference between the two represents the number of overflow pages appearing between the bucket page groups of splitpoints N and N+1. +(Note: the above describes what happens when filling an initially minimally +sized hash index. In practice, we try to estimate the required index size +and allocate a suitable number of splitpoints immediately, to avoid +expensive re-splitting during initial index build.) + When S splitpoints exist altogether, the array entries hashm_spares[0] through hashm_spares[S] are valid; hashm_spares[S] records the current total number of overflow pages. New overflow pages are created as needed @@ -101,9 +106,9 @@ includes the bitmap pages, which is the reason for saying that bitmap pages are a subset of the overflow pages. It turns out in fact that each bitmap page's first bit represents itself --- this is not an essential property, but falls out of the fact that we only allocate another bitmap -page when we really need one. Bit number zero always corresponds to block -number 3, which is the first bitmap page and is allocated during index -creation. +page when we really need one. Bit number zero always corresponds to the +first bitmap page, which is allocated during index creation just after all +the initially created buckets. Lock definitions diff --git a/src/backend/access/hash/hash.c b/src/backend/access/hash/hash.c index f6c4d5705d4284a6393d3ebcb81ba09be88b66bd..b008c0aa4a7abde67c91cc158abc16ae34b5d888 100644 --- a/src/backend/access/hash/hash.c +++ b/src/backend/access/hash/hash.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.98 2008/01/01 19:45:46 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hash.c,v 1.99 2008/03/15 20:46:31 tgl Exp $ * * NOTES * This file contains only the public interface routines. @@ -22,6 +22,7 @@ #include "access/hash.h" #include "catalog/index.h" #include "commands/vacuum.h" +#include "optimizer/plancat.h" /* Working state for hashbuild and its callback */ @@ -48,6 +49,7 @@ hashbuild(PG_FUNCTION_ARGS) Relation index = (Relation) PG_GETARG_POINTER(1); IndexInfo *indexInfo = (IndexInfo *) PG_GETARG_POINTER(2); IndexBuildResult *result; + BlockNumber relpages; double reltuples; HashBuildState buildstate; @@ -59,8 +61,11 @@ hashbuild(PG_FUNCTION_ARGS) elog(ERROR, "index \"%s\" already contains data", RelationGetRelationName(index)); - /* initialize the hash index metadata page */ - _hash_metapinit(index); + /* estimate the number of rows currently present in the table */ + estimate_rel_size(heap, NULL, &relpages, &reltuples); + + /* initialize the hash index metadata page and initial buckets */ + _hash_metapinit(index, reltuples); /* build the index */ buildstate.indtuples = 0; diff --git a/src/backend/access/hash/hashpage.c b/src/backend/access/hash/hashpage.c index 68861087585e97c873fe90eff1286d477d9af9d8..ec6f4b390fd944305e908aaebb035e325b733860 100644 --- a/src/backend/access/hash/hashpage.c +++ b/src/backend/access/hash/hashpage.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.72 2008/01/01 19:45:46 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/access/hash/hashpage.c,v 1.73 2008/03/15 20:46:31 tgl Exp $ * * NOTES * Postgres hash pages look like ordinary relation pages. The opaque @@ -312,15 +312,17 @@ _hash_chgbufaccess(Relation rel, /* * _hash_metapinit() -- Initialize the metadata page of a hash index, - * the two buckets that we begin with and the initial - * bitmap page. + * the initial buckets, and the initial bitmap page. + * + * The initial number of buckets is dependent on num_tuples, an estimate + * of the number of tuples to be loaded into the index initially. * * We are fairly cavalier about locking here, since we know that no one else * could be accessing this index. In particular the rule about not holding * multiple buffer locks is ignored. */ void -_hash_metapinit(Relation rel) +_hash_metapinit(Relation rel, double num_tuples) { HashMetaPage metap; HashPageOpaque pageopaque; @@ -330,7 +332,10 @@ _hash_metapinit(Relation rel) int32 data_width; int32 item_width; int32 ffactor; - uint16 i; + double dnumbuckets; + uint32 num_buckets; + uint32 log2_num_buckets; + uint32 i; /* safety check */ if (RelationGetNumberOfBlocks(rel) != 0) @@ -354,7 +359,26 @@ _hash_metapinit(Relation rel) ffactor = 10; /* - * We initialize the metapage, the first two bucket pages, and the first + * Choose the number of initial bucket pages to match the fill factor + * given the estimated number of tuples. We round up the result to the + * next power of 2, however, and always force at least 2 bucket pages. + * The upper limit is determined by considerations explained in + * _hash_expandtable(). + */ + dnumbuckets = num_tuples / ffactor; + if (dnumbuckets <= 2.0) + num_buckets = 2; + else if (dnumbuckets >= (double) 0x40000000) + num_buckets = 0x40000000; + else + num_buckets = ((uint32) 1) << _hash_log2((uint32) dnumbuckets); + + log2_num_buckets = _hash_log2(num_buckets); + Assert(num_buckets == (((uint32) 1) << log2_num_buckets)); + Assert(log2_num_buckets < HASH_MAX_SPLITPOINTS); + + /* + * We initialize the metapage, the first N bucket pages, and the first * bitmap page in sequence, using _hash_getnewbuf to cause smgrextend() * calls to occur. This ensures that the smgr level has the right idea of * the physical index length. @@ -398,23 +422,25 @@ _hash_metapinit(Relation rel) metap->hashm_procid = index_getprocid(rel, 1, HASHPROC); /* - * We initialize the index with two buckets, 0 and 1, occupying physical - * blocks 1 and 2. The first freespace bitmap page is in block 3. + * We initialize the index with N buckets, 0 .. N-1, occupying physical + * blocks 1 to N. The first freespace bitmap page is in block N+1. + * Since N is a power of 2, we can set the masks this way: */ - metap->hashm_maxbucket = metap->hashm_lowmask = 1; /* nbuckets - 1 */ - metap->hashm_highmask = 3; /* (nbuckets << 1) - 1 */ + metap->hashm_maxbucket = metap->hashm_lowmask = num_buckets - 1; + metap->hashm_highmask = (num_buckets << 1) - 1; MemSet(metap->hashm_spares, 0, sizeof(metap->hashm_spares)); MemSet(metap->hashm_mapp, 0, sizeof(metap->hashm_mapp)); - metap->hashm_spares[1] = 1; /* the first bitmap page is only spare */ - metap->hashm_ovflpoint = 1; + /* Set up mapping for one spare page after the initial splitpoints */ + metap->hashm_spares[log2_num_buckets] = 1; + metap->hashm_ovflpoint = log2_num_buckets; metap->hashm_firstfree = 0; /* - * Initialize the first two buckets + * Initialize the first N buckets */ - for (i = 0; i <= 1; i++) + for (i = 0; i < num_buckets; i++) { buf = _hash_getnewbuf(rel, BUCKET_TO_BLKNO(metap, i)); pg = BufferGetPage(buf); @@ -430,7 +456,7 @@ _hash_metapinit(Relation rel) /* * Initialize first bitmap page */ - _hash_initbitmap(rel, metap, 3); + _hash_initbitmap(rel, metap, num_buckets + 1); /* all done */ _hash_wrtbuf(rel, metabuf); @@ -511,6 +537,9 @@ _hash_expandtable(Relation rel, Buffer metabuf) * index with 2^32 buckets would certainly overflow BlockNumber and hence * _hash_alloc_buckets() would fail, but if we supported buckets smaller * than a disk block then this would be an independent constraint. + * + * If you change this, see also the maximum initial number of buckets + * in _hash_metapinit(). */ if (metap->hashm_maxbucket >= (uint32) 0x7FFFFFFE) goto fail; diff --git a/src/backend/optimizer/util/plancat.c b/src/backend/optimizer/util/plancat.c index 5f927095edcd9198f831652b32db52a64bf63346..a56dccd2ff5d9482736cd06e5c9769662948c9ba 100644 --- a/src/backend/optimizer/util/plancat.c +++ b/src/backend/optimizer/util/plancat.c @@ -9,7 +9,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.140 2008/01/12 00:11:39 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/plancat.c,v 1.141 2008/03/15 20:46:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -45,8 +45,6 @@ bool constraint_exclusion = false; get_relation_info_hook_type get_relation_info_hook = NULL; -static void estimate_rel_size(Relation rel, int32 *attr_widths, - BlockNumber *pages, double *tuples); static List *get_relation_constraints(Oid relationObjectId, RelOptInfo *rel, bool include_notnull); @@ -319,7 +317,7 @@ get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, * relation's attr_width[] cache; we fill this in if we have need to compute * the attribute widths for estimation purposes. */ -static void +void estimate_rel_size(Relation rel, int32 *attr_widths, BlockNumber *pages, double *tuples) { diff --git a/src/include/access/hash.h b/src/include/access/hash.h index ac54c47f014ca704dd97e898ae8ad39d4c67d373..fd7b68e9aebeb18382e99628a041d66a78a0e3c2 100644 --- a/src/include/access/hash.h +++ b/src/include/access/hash.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.84 2008/01/01 19:45:56 momjian Exp $ + * $PostgreSQL: pgsql/src/include/access/hash.h,v 1.85 2008/03/15 20:46:31 tgl Exp $ * * NOTES * modeled after Margo Seltzer's hash implementation for unix. @@ -298,7 +298,7 @@ extern void _hash_dropbuf(Relation rel, Buffer buf); extern void _hash_wrtbuf(Relation rel, Buffer buf); extern void _hash_chgbufaccess(Relation rel, Buffer buf, int from_access, int to_access); -extern void _hash_metapinit(Relation rel); +extern void _hash_metapinit(Relation rel, double num_tuples); extern void _hash_pageinit(Page page, Size size); extern void _hash_expandtable(Relation rel, Buffer metabuf); diff --git a/src/include/optimizer/plancat.h b/src/include/optimizer/plancat.h index 42e24832495bdee61fd2c3c8cfadf59a04f2bb17..82b4c2200ad9d0b3e397db1ee30fd3b63d08e83c 100644 --- a/src/include/optimizer/plancat.h +++ b/src/include/optimizer/plancat.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.47 2008/01/01 19:45:58 momjian Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/plancat.h,v 1.48 2008/03/15 20:46:31 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -15,6 +15,7 @@ #define PLANCAT_H #include "nodes/relation.h" +#include "utils/rel.h" /* Hook for plugins to get control in get_relation_info() */ typedef void (*get_relation_info_hook_type) (PlannerInfo *root, @@ -27,6 +28,9 @@ extern PGDLLIMPORT get_relation_info_hook_type get_relation_info_hook; extern void get_relation_info(PlannerInfo *root, Oid relationObjectId, bool inhparent, RelOptInfo *rel); +extern void estimate_rel_size(Relation rel, int32 *attr_widths, + BlockNumber *pages, double *tuples); + extern bool relation_excluded_by_constraints(RelOptInfo *rel, RangeTblEntry *rte);