diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.1--1.2.sql b/contrib/pg_stat_statements/pg_stat_statements--1.1--1.2.sql index 74aa561afe29176fa0e31625ba394697825bdb60..74ae43868d13fe69aaa2bc437c8b47cbedcde5c7 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.1--1.2.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.1--1.2.sql @@ -12,7 +12,7 @@ DROP VIEW pg_stat_statements; DROP FUNCTION pg_stat_statements(); /* Now redefine */ -CREATE FUNCTION pg_stat_statements( +CREATE FUNCTION pg_stat_statements(IN showtext boolean, OUT userid oid, OUT dbid oid, OUT queryid bigint, @@ -34,10 +34,10 @@ CREATE FUNCTION pg_stat_statements( OUT blk_write_time float8 ) RETURNS SETOF record -AS 'MODULE_PATHNAME' -LANGUAGE C; +AS 'MODULE_PATHNAME', 'pg_stat_statements_1_2' +LANGUAGE C STRICT VOLATILE; CREATE VIEW pg_stat_statements AS - SELECT * FROM pg_stat_statements(); + SELECT * FROM pg_stat_statements(true); GRANT SELECT ON pg_stat_statements TO PUBLIC; diff --git a/contrib/pg_stat_statements/pg_stat_statements--1.2.sql b/contrib/pg_stat_statements/pg_stat_statements--1.2.sql index 80b74a140b68587822a41471519478f6b64b3a37..5bfa9a55d03e7a0e212aa759e0962a0b40e64716 100644 --- a/contrib/pg_stat_statements/pg_stat_statements--1.2.sql +++ b/contrib/pg_stat_statements/pg_stat_statements--1.2.sql @@ -9,7 +9,7 @@ RETURNS void AS 'MODULE_PATHNAME' LANGUAGE C; -CREATE FUNCTION pg_stat_statements( +CREATE FUNCTION pg_stat_statements(IN showtext boolean, OUT userid oid, OUT dbid oid, OUT queryid bigint, @@ -31,12 +31,12 @@ CREATE FUNCTION pg_stat_statements( OUT blk_write_time float8 ) RETURNS SETOF record -AS 'MODULE_PATHNAME' -LANGUAGE C; +AS 'MODULE_PATHNAME', 'pg_stat_statements_1_2' +LANGUAGE C STRICT VOLATILE; -- Register a view on the function for ease of use. CREATE VIEW pg_stat_statements AS - SELECT * FROM pg_stat_statements(); + SELECT * FROM pg_stat_statements(true); GRANT SELECT ON pg_stat_statements TO PUBLIC; diff --git a/contrib/pg_stat_statements/pg_stat_statements.c b/contrib/pg_stat_statements/pg_stat_statements.c index 858cce34576e2bbeff29dd94d50baa52cad5db23..aa6ae375773a6a550ec96af72ff7851c9b48a2e6 100644 --- a/contrib/pg_stat_statements/pg_stat_statements.c +++ b/contrib/pg_stat_statements/pg_stat_statements.c @@ -26,12 +26,26 @@ * tree(s) generated from the query. The executor can then use this value * to blame query costs on the proper queryId. * + * To facilitate presenting entries to users, we create "representative" query + * strings in which constants are replaced with '?' characters, to make it + * clearer what a normalized entry can represent. To save on shared memory, + * and to avoid having to truncate oversized query strings, we store these + * strings in a temporary external query-texts file. Offsets into this + * file are kept in shared memory. + * * Note about locking issues: to create or delete an entry in the shared * hashtable, one must hold pgss->lock exclusively. Modifying any field * in an entry except the counters requires the same. To look up an entry, * one must hold the lock shared. To read or update the counters within * an entry, one must hold the lock shared or exclusive (so the entry doesn't * disappear!) and also take the entry's mutex spinlock. + * The shared state variable pgss->extent (the next free spot in the external + * query-text file) should be accessed only while holding either the + * pgss->mutex spinlock, or exclusive lock on pgss->lock. 
We use the mutex to + * allow reserving file space while holding only shared lock on pgss->lock. + * Rewriting the entire external query-text file, eg for garbage collection, + * requires holding pgss->lock exclusively; this allows individual entries + * in the file to be read or written while holding only shared lock. * * * Copyright (c) 2008-2014, PostgreSQL Global Development Group @@ -43,6 +57,7 @@ */ #include "postgres.h" +#include <sys/stat.h> #include <unistd.h> #include "access/hash.h" @@ -53,21 +68,32 @@ #include "parser/analyze.h" #include "parser/parsetree.h" #include "parser/scanner.h" -#include "pgstat.h" #include "storage/fd.h" #include "storage/ipc.h" #include "storage/spin.h" #include "tcop/utility.h" #include "utils/builtins.h" +#include "utils/memutils.h" PG_MODULE_MAGIC; -/* Location of stats file */ +/* Location of permanent stats file (valid when database is shut down) */ #define PGSS_DUMP_FILE "global/pg_stat_statements.stat" -/* This constant defines the magic number in the stats file header */ -static const uint32 PGSS_FILE_HEADER = 0x20131115; +/* + * Location of external query text file. We don't keep it in the core + * system's stats_temp_directory. The core system can safely use that GUC + * setting, because the statistics collector temp file paths are set only once + * as part of changing the GUC, but pg_stat_statements has no way of avoiding + * race conditions. Besides, we only expect modest, infrequent I/O for query + * strings, so placing the file on a faster filesystem is not compelling. + */ +#define PGSS_TEXT_FILE "pg_stat_tmp/pgss_query_texts.stat" + +/* Magic number identifying the stats file format */ +static const uint32 PGSS_FILE_HEADER = 0x20140125; + /* PostgreSQL major version number, changes in which invalidate all entries */ static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100; @@ -75,6 +101,7 @@ static const uint32 PGSS_PG_MAJOR_VERSION = PG_VERSION_NUM / 100; #define USAGE_EXEC(duration) (1.0) #define USAGE_INIT (1.0) /* including initial planning */ #define ASSUMED_MEDIAN_INIT (10.0) /* initial assumed median usage */ +#define ASSUMED_LENGTH_INIT 1024 /* initial assumed mean query length */ #define USAGE_DECREASE_FACTOR (0.99) /* decreased every entry_dealloc */ #define STICKY_DECREASE_FACTOR (0.50) /* factor for sticky entries */ #define USAGE_DEALLOC_PERCENT 5 /* free this % of entries at once */ @@ -94,16 +121,11 @@ typedef enum pgssVersion /* * Hashtable key that defines the identity of a hashtable entry. We separate * queries by user and by database even if they are otherwise identical. - * - * Presently, the query encoding is fully determined by the source database - * and so we don't really need it to be in the key. But that might not always - * be true. Anyway it's notationally convenient to pass it as part of the key. */ typedef struct pgssHashKey { Oid userid; /* user OID */ Oid dbid; /* database OID */ - int encoding; /* query encoding */ uint32 queryid; /* query identifier */ } pgssHashKey; @@ -133,16 +155,18 @@ typedef struct Counters /* * Statistics per statement * - * NB: see the file read/write code before changing field order here. + * Note: in event of a failure in garbage collection of the query text file, + * we reset query_offset to zero and query_len to -1. This will be seen as + * an invalid state by qtext_fetch(). 
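+ * (Specifically, qtext_fetch() rejects any negative length, so readers of
+ * an entry in that state get a NULL query text rather than garbage.)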
*/ typedef struct pgssEntry { pgssHashKey key; /* hash key of entry - MUST BE FIRST */ Counters counters; /* the statistics for this query */ + Size query_offset; /* query text offset in external file */ int query_len; /* # of valid bytes in query string */ + int encoding; /* query text encoding */ slock_t mutex; /* protects the counters only */ - char query[1]; /* VARIABLE LENGTH ARRAY - MUST BE LAST */ - /* Note: the allocated length of query[] is actually pgss->query_size */ } pgssEntry; /* @@ -151,8 +175,12 @@ typedef struct pgssEntry typedef struct pgssSharedState { LWLock *lock; /* protects hashtable search/modification */ - int query_size; /* max query length in bytes */ double cur_median_usage; /* current median usage in hashtable */ + Size mean_query_len; /* current mean entry text length */ + slock_t mutex; /* protects following fields only: */ + Size extent; /* current extent of query file */ + int n_writers; /* number of active writers to query file */ + int gc_count; /* query file garbage collection cycle count */ } pgssSharedState; /* @@ -231,15 +259,25 @@ static bool pgss_save; /* whether to save stats across shutdown */ (pgss_track == PGSS_TRACK_ALL || \ (pgss_track == PGSS_TRACK_TOP && nested_level == 0)) +#define record_gc_qtexts() \ + do { \ + volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; \ + SpinLockAcquire(&s->mutex); \ + s->gc_count++; \ + SpinLockRelease(&s->mutex); \ + } while(0) + /*---- Function declarations ----*/ void _PG_init(void); void _PG_fini(void); Datum pg_stat_statements_reset(PG_FUNCTION_ARGS); +Datum pg_stat_statements_1_2(PG_FUNCTION_ARGS); Datum pg_stat_statements(PG_FUNCTION_ARGS); PG_FUNCTION_INFO_V1(pg_stat_statements_reset); +PG_FUNCTION_INFO_V1(pg_stat_statements_1_2); PG_FUNCTION_INFO_V1(pg_stat_statements); static void pgss_shmem_startup(void); @@ -261,10 +299,20 @@ static void pgss_store(const char *query, uint32 queryId, double total_time, uint64 rows, const BufferUsage *bufusage, pgssJumbleState *jstate); +static void pg_stat_statements_internal(FunctionCallInfo fcinfo, + pgssVersion api_version, + bool showtext); static Size pgss_memsize(void); -static pgssEntry *entry_alloc(pgssHashKey *key, const char *query, - int query_len, bool sticky); +static pgssEntry *entry_alloc(pgssHashKey *key, Size query_offset, int query_len, + int encoding, bool sticky); static void entry_dealloc(void); +static bool qtext_store(const char *query, int query_len, + Size *query_offset, int *gc_count); +static char *qtext_load_file(Size *buffer_size); +static char *qtext_fetch(Size query_offset, int query_len, + char *buffer, Size buffer_size); +static bool need_gc_qtexts(void); +static void gc_qtexts(void); static void entry_reset(void); static void AppendJumble(pgssJumbleState *jstate, const unsigned char *item, Size size); @@ -302,7 +350,7 @@ _PG_init(void) "Sets the maximum number of statements tracked by pg_stat_statements.", NULL, &pgss_max, - 1000, + 5000, 100, INT_MAX, PGC_POSTMASTER, @@ -393,18 +441,20 @@ _PG_fini(void) /* * shmem_startup hook: allocate or attach to shared memory, * then load any pre-existing statistics from file. + * Also create and load the query-texts file, which is expected to exist + * (even if empty) while the module is enabled. 
*/ static void pgss_shmem_startup(void) { bool found; HASHCTL info; - FILE *file; + FILE *file = NULL; + FILE *qfile = NULL; uint32 header; int32 num; int32 pgver; int32 i; - int query_size; int buffer_size; char *buffer = NULL; @@ -428,16 +478,17 @@ pgss_shmem_startup(void) { /* First time through ... */ pgss->lock = LWLockAssign(); - pgss->query_size = pgstat_track_activity_query_size; pgss->cur_median_usage = ASSUMED_MEDIAN_INIT; + pgss->mean_query_len = ASSUMED_LENGTH_INIT; + SpinLockInit(&pgss->mutex); + pgss->extent = 0; + pgss->n_writers = 0; + pgss->gc_count = 0; } - /* Be sure everyone agrees on the hash table entry size */ - query_size = pgss->query_size; - memset(&info, 0, sizeof(info)); info.keysize = sizeof(pgssHashKey); - info.entrysize = offsetof(pgssEntry, query) +query_size; + info.entrysize = sizeof(pgssEntry); info.hash = pgss_hash_fn; info.match = pgss_match_fn; pgss_hash = ShmemInitHash("pg_stat_statements hash", @@ -455,70 +506,100 @@ pgss_shmem_startup(void) on_shmem_exit(pgss_shmem_shutdown, (Datum) 0); /* - * Attempt to load old statistics from the dump file, if this is the first - * time through and we weren't told not to. + * Done if some other process already completed our initialization. */ - if (found || !pgss_save) + if (found) return; /* * Note: we don't bother with locks here, because there should be no other * processes running when this code is reached. */ + + /* Unlink query text file possibly left over from crash */ + unlink(PGSS_TEXT_FILE); + + /* Allocate new query text temp file */ + qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); + if (qfile == NULL) + goto write_error; + + /* + * If we were told not to load old statistics, we're done. (Note we do + * not try to unlink any old dump file in this case. This seems a bit + * questionable but it's the historical behavior.) + */ + if (!pgss_save) + { + FreeFile(qfile); + return; + } + + /* + * Attempt to load old statistics from the dump file. 
+ */ file = AllocateFile(PGSS_DUMP_FILE, PG_BINARY_R); if (file == NULL) { - if (errno == ENOENT) - return; /* ignore not-found error */ - goto error; + if (errno != ENOENT) + goto read_error; + /* No existing persisted stats file, so we're done */ + FreeFile(qfile); + return; } - buffer_size = query_size; + buffer_size = 2048; buffer = (char *) palloc(buffer_size); if (fread(&header, sizeof(uint32), 1, file) != 1 || - header != PGSS_FILE_HEADER || fread(&pgver, sizeof(uint32), 1, file) != 1 || - pgver != PGSS_PG_MAJOR_VERSION || fread(&num, sizeof(int32), 1, file) != 1) - goto error; + goto read_error; + + if (header != PGSS_FILE_HEADER || + pgver != PGSS_PG_MAJOR_VERSION) + goto data_error; for (i = 0; i < num; i++) { pgssEntry temp; pgssEntry *entry; + Size query_offset; - if (fread(&temp, offsetof(pgssEntry, mutex), 1, file) != 1) - goto error; + if (fread(&temp, sizeof(pgssEntry), 1, file) != 1) + goto read_error; /* Encoding is the only field we can easily sanity-check */ - if (!PG_VALID_BE_ENCODING(temp.key.encoding)) - goto error; + if (!PG_VALID_BE_ENCODING(temp.encoding)) + goto data_error; - /* Previous incarnation might have had a larger query_size */ + /* Resize buffer as needed */ if (temp.query_len >= buffer_size) { - buffer = (char *) repalloc(buffer, temp.query_len + 1); - buffer_size = temp.query_len + 1; + buffer_size = Max(buffer_size * 2, temp.query_len + 1); + buffer = repalloc(buffer, buffer_size); } - if (fread(buffer, 1, temp.query_len, file) != temp.query_len) - goto error; + if (fread(buffer, 1, temp.query_len + 1, file) != temp.query_len + 1) + goto read_error; + + /* Should have a trailing null, but let's make sure */ buffer[temp.query_len] = '\0'; /* Skip loading "sticky" entries */ if (temp.counters.calls == 0) continue; - /* Clip to available length if needed */ - if (temp.query_len >= query_size) - temp.query_len = pg_encoding_mbcliplen(temp.key.encoding, - buffer, - temp.query_len, - query_size - 1); + /* Store the query text */ + query_offset = pgss->extent; + if (fwrite(buffer, 1, temp.query_len + 1, qfile) != temp.query_len + 1) + goto write_error; + pgss->extent += temp.query_len + 1; /* make the hashtable entry (discards old entries if too many) */ - entry = entry_alloc(&temp.key, buffer, temp.query_len, false); + entry = entry_alloc(&temp.key, query_offset, temp.query_len, + temp.encoding, + false); /* copy in the actual stats */ entry->counters = temp.counters; @@ -526,26 +607,56 @@ pgss_shmem_startup(void) pfree(buffer); FreeFile(file); + FreeFile(qfile); /* - * Remove the file so it's not included in backups/replication slaves, - * etc. A new file will be written on next shutdown. + * Remove the persisted stats file so it's not included in + * backups/replication slaves, etc. A new file will be written on next + * shutdown. + * + * Note: it's okay if the PGSS_TEXT_FILE is included in a basebackup, + * because we remove that file on startup; it acts inversely to + * PGSS_DUMP_FILE, in that it is only supposed to be around when the + * server is running, whereas PGSS_DUMP_FILE is only supposed to be around + * when the server is not running. Leaving the file creates no danger of + * a newly restored database having a spurious record of execution costs, + * which is what we're really concerned about here. 
*/ unlink(PGSS_DUMP_FILE); return; -error: +read_error: ereport(LOG, (errcode_for_file_access(), errmsg("could not read pg_stat_statement file \"%s\": %m", PGSS_DUMP_FILE))); + goto fail; +data_error: + ereport(LOG, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("ignoring invalid data in pg_stat_statement file \"%s\"", + PGSS_DUMP_FILE))); + goto fail; +write_error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); +fail: if (buffer) pfree(buffer); if (file) FreeFile(file); + if (qfile) + FreeFile(qfile); /* If possible, throw away the bogus file; ignore any error */ unlink(PGSS_DUMP_FILE); + + /* + * Don't unlink PGSS_TEXT_FILE here; it should always be around while the + * server is running with pg_stat_statements enabled + */ } /* @@ -558,6 +669,8 @@ static void pgss_shmem_shutdown(int code, Datum arg) { FILE *file; + char *qbuffer = NULL; + Size qbuffer_size = 0; HASH_SEQ_STATUS hash_seq; int32 num_entries; pgssEntry *entry; @@ -586,16 +699,36 @@ pgss_shmem_shutdown(int code, Datum arg) if (fwrite(&num_entries, sizeof(int32), 1, file) != 1) goto error; + qbuffer = qtext_load_file(&qbuffer_size); + if (qbuffer == NULL) + goto error; + + /* + * When serializing to disk, we store query texts immediately after their + * entry data. Any orphaned query texts are thereby excluded. + */ hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { int len = entry->query_len; + char *qstr = qtext_fetch(entry->query_offset, len, + qbuffer, qbuffer_size); - if (fwrite(entry, offsetof(pgssEntry, mutex), 1, file) != 1 || - fwrite(entry->query, 1, len, file) != len) + if (qstr == NULL) + continue; /* Ignore any entries with bogus texts */ + + if (fwrite(entry, sizeof(pgssEntry), 1, file) != 1 || + fwrite(qstr, 1, len + 1, file) != len + 1) + { + /* note: we assume hash_seq_term won't change errno */ + hash_seq_term(&hash_seq); goto error; + } } + free(qbuffer); + qbuffer = NULL; + if (FreeFile(file)) { file = NULL; @@ -603,7 +736,7 @@ pgss_shmem_shutdown(int code, Datum arg) } /* - * Rename file into place, so we atomically replace the old one. + * Rename file into place, so we atomically replace any old one. 
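+	 * (This is the usual write-temp-then-rename trick: rename() replaces
+	 * its target atomically, so no onlooker can ever observe a partially
+	 * written dump file.)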
*/ if (rename(PGSS_DUMP_FILE ".tmp", PGSS_DUMP_FILE) != 0) ereport(LOG, @@ -611,6 +744,9 @@ pgss_shmem_shutdown(int code, Datum arg) errmsg("could not rename pg_stat_statement file \"%s\": %m", PGSS_DUMP_FILE ".tmp"))); + /* Unlink query-texts file; it's not needed while shutdown */ + unlink(PGSS_TEXT_FILE); + return; error: @@ -618,9 +754,12 @@ error: (errcode_for_file_access(), errmsg("could not write pg_stat_statement file \"%s\": %m", PGSS_DUMP_FILE ".tmp"))); + if (qbuffer) + free(qbuffer); if (file) FreeFile(file); unlink(PGSS_DUMP_FILE ".tmp"); + unlink(PGSS_TEXT_FILE); } /* @@ -916,7 +1055,6 @@ pgss_hash_fn(const void *key, Size keysize) { const pgssHashKey *k = (const pgssHashKey *) key; - /* we don't bother to include encoding in the hash */ return hash_uint32((uint32) k->userid) ^ hash_uint32((uint32) k->dbid) ^ hash_uint32((uint32) k->queryid); @@ -933,7 +1071,6 @@ pgss_match_fn(const void *key1, const void *key2, Size keysize) if (k1->userid == k2->userid && k1->dbid == k2->dbid && - k1->encoding == k2->encoding && k1->queryid == k2->queryid) return 0; else @@ -967,6 +1104,8 @@ pgss_store(const char *query, uint32 queryId, pgssHashKey key; pgssEntry *entry; char *norm_query = NULL; + int encoding = GetDatabaseEncoding(); + int query_len; Assert(query != NULL); @@ -974,10 +1113,11 @@ pgss_store(const char *query, uint32 queryId, if (!pgss || !pgss_hash) return; + query_len = strlen(query); + /* Set up key for hashtable search */ key.userid = GetUserId(); key.dbid = MyDatabaseId; - key.encoding = GetDatabaseEncoding(); key.queryid = queryId; /* Lookup the hash table entry with shared lock. */ @@ -988,45 +1128,64 @@ pgss_store(const char *query, uint32 queryId, /* Create new entry, if not present */ if (!entry) { - int query_len; + Size query_offset; + int gc_count; + bool stored; + bool do_gc; /* - * We'll need exclusive lock to make a new entry. There is no point - * in holding shared lock while we normalize the string, though. + * Create a new, normalized query string if caller asked. We don't + * need to hold the lock while doing this work. (Note: in any case, + * it's possible that someone else creates a duplicate hashtable entry + * in the interval where we don't hold the lock below. That case is + * handled by entry_alloc.) */ - LWLockRelease(pgss->lock); - - query_len = strlen(query); - if (jstate) { - /* Normalize the string if enabled */ + LWLockRelease(pgss->lock); norm_query = generate_normalized_query(jstate, query, &query_len, - key.encoding); + encoding); + LWLockAcquire(pgss->lock, LW_SHARED); + } - /* Acquire exclusive lock as required by entry_alloc() */ - LWLockAcquire(pgss->lock, LW_EXCLUSIVE); + /* Append new query text to file with only shared lock held */ + stored = qtext_store(norm_query ? norm_query : query, query_len, + &query_offset, &gc_count); - entry = entry_alloc(&key, norm_query, query_len, true); - } - else - { - /* - * We're just going to store the query string as-is; but we have - * to truncate it if over-length. - */ - if (query_len >= pgss->query_size) - query_len = pg_encoding_mbcliplen(key.encoding, - query, - query_len, - pgss->query_size - 1); + /* + * Determine whether we need to garbage collect external query texts + * while the shared lock is still held. This micro-optimization + * avoids taking the time to decide this while holding exclusive lock. 
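+	 * In outline, the protocol used below is: append the text to the file
+	 * under shared lock, remembering gc_count; retake pgss->lock in
+	 * exclusive mode; if gc_count has advanced in the interim, the text
+	 * may have been garbage-collected away, so append it again before
+	 * creating the hashtable entry.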
+ */ + do_gc = need_gc_qtexts(); - /* Acquire exclusive lock as required by entry_alloc() */ - LWLockAcquire(pgss->lock, LW_EXCLUSIVE); + /* Need exclusive lock to make a new hashtable entry - promote */ + LWLockRelease(pgss->lock); + LWLockAcquire(pgss->lock, LW_EXCLUSIVE); - entry = entry_alloc(&key, query, query_len, false); - } + /* + * A garbage collection may have occurred while we weren't holding the + * lock. In the unlikely event that this happens, the query text we + * stored above will have been garbage collected, so write it again. + * This should be infrequent enough that doing it while holding + * exclusive lock isn't a performance problem. + */ + if (!stored || pgss->gc_count != gc_count) + stored = qtext_store(norm_query ? norm_query : query, query_len, + &query_offset, NULL); + + /* If we failed to write to the text file, give up */ + if (!stored) + goto done; + + /* OK to create a new hashtable entry */ + entry = entry_alloc(&key, query_offset, query_len, encoding, + jstate != NULL); + + /* If needed, perform garbage collection while exclusive lock held */ + if (do_gc) + gc_qtexts(); } /* Increment the counts, except when jstate is not NULL */ @@ -1064,9 +1223,10 @@ pgss_store(const char *query, uint32 queryId, SpinLockRelease(&e->mutex); } +done: LWLockRelease(pgss->lock); - /* We postpone this pfree until we're out of the lock */ + /* We postpone this clean-up until we're out of the lock */ if (norm_query) pfree(norm_query); } @@ -1085,15 +1245,50 @@ pg_stat_statements_reset(PG_FUNCTION_ARGS) PG_RETURN_VOID(); } +/* Number of output arguments (columns) for various API versions */ #define PG_STAT_STATEMENTS_COLS_V1_0 14 #define PG_STAT_STATEMENTS_COLS_V1_1 18 -#define PG_STAT_STATEMENTS_COLS 19 +#define PG_STAT_STATEMENTS_COLS_V1_2 19 +#define PG_STAT_STATEMENTS_COLS 19 /* maximum of above */ /* * Retrieve statement statistics. + * + * The SQL API of this function has changed multiple times, and will likely + * do so again in future. To support the case where a newer version of this + * loadable module is being used with an old SQL declaration of the function, + * we continue to support the older API versions. For 1.2 and later, the + * expected API version is identified by embedding it in the C name of the + * function. Unfortunately we weren't bright enough to do that for 1.1. + */ +Datum +pg_stat_statements_1_2(PG_FUNCTION_ARGS) +{ + bool showtext = PG_GETARG_BOOL(0); + + pg_stat_statements_internal(fcinfo, PGSS_V1_2, showtext); + + return (Datum) 0; +} + +/* + * Legacy entry point for pg_stat_statements() API versions 1.0 and 1.1. + * This can be removed someday, perhaps. 
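+ * (SQL-level declarations from those API versions name no explicit C
+ * symbol, so their AS 'MODULE_PATHNAME' clause binds to this function by
+ * its SQL-visible name.)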
*/ Datum pg_stat_statements(PG_FUNCTION_ARGS) +{ + /* If it's really API 1.1, we'll figure that out below */ + pg_stat_statements_internal(fcinfo, PGSS_V1_0, true); + + return (Datum) 0; +} + +/* Common code for all versions of pg_stat_statements() */ +static void +pg_stat_statements_internal(FunctionCallInfo fcinfo, + pgssVersion api_version, + bool showtext) { ReturnSetInfo *rsinfo = (ReturnSetInfo *) fcinfo->resultinfo; TupleDesc tupdesc; @@ -1102,10 +1297,14 @@ pg_stat_statements(PG_FUNCTION_ARGS) MemoryContext oldcontext; Oid userid = GetUserId(); bool is_superuser = superuser(); + char *qbuffer = NULL; + Size qbuffer_size = 0; + Size extent = 0; + int gc_count = 0; HASH_SEQ_STATUS hash_seq; pgssEntry *entry; - pgssVersion detected_version; + /* hash table must exist already */ if (!pgss || !pgss_hash) ereport(ERROR, (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE), @@ -1122,28 +1321,39 @@ pg_stat_statements(PG_FUNCTION_ARGS) errmsg("materialize mode required, but it is not " \ "allowed in this context"))); + /* Switch into long-lived context to construct returned data structures */ + per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; + oldcontext = MemoryContextSwitchTo(per_query_ctx); + /* Build a tuple descriptor for our result type */ if (get_call_result_type(fcinfo, NULL, &tupdesc) != TYPEFUNC_COMPOSITE) elog(ERROR, "return type must be a row type"); + /* + * Check we have the expected number of output arguments. Aside from + * being a good safety check, we need a kluge here to detect API version + * 1.1, which was wedged into the code in an ill-considered way. + */ switch (tupdesc->natts) { case PG_STAT_STATEMENTS_COLS_V1_0: - detected_version = PGSS_V1_0; + if (api_version != PGSS_V1_0) + elog(ERROR, "incorrect number of output arguments"); break; case PG_STAT_STATEMENTS_COLS_V1_1: - detected_version = PGSS_V1_1; + /* pg_stat_statements() should have told us 1.0 */ + if (api_version != PGSS_V1_0) + elog(ERROR, "incorrect number of output arguments"); + api_version = PGSS_V1_1; break; - case PG_STAT_STATEMENTS_COLS: - detected_version = PGSS_V1_2; + case PG_STAT_STATEMENTS_COLS_V1_2: + if (api_version != PGSS_V1_2) + elog(ERROR, "incorrect number of output arguments"); break; default: - elog(ERROR, "pgss version unrecognized from tuple descriptor"); + elog(ERROR, "incorrect number of output arguments"); } - per_query_ctx = rsinfo->econtext->ecxt_per_query_memory; - oldcontext = MemoryContextSwitchTo(per_query_ctx); - tupstore = tuplestore_begin_heap(true, false, work_mem); rsinfo->returnMode = SFRM_Materialize; rsinfo->setResult = tupstore; @@ -1151,8 +1361,71 @@ pg_stat_statements(PG_FUNCTION_ARGS) MemoryContextSwitchTo(oldcontext); + /* + * We'd like to load the query text file (if needed) while not holding any + * lock on pgss->lock. In the worst case we'll have to do this again + * after we have the lock, but it's unlikely enough to make this a win + * despite occasional duplicated work. We need to reload if anybody + * writes to the file (either a retail qtext_store(), or a garbage + * collection) between this point and where we've gotten shared lock. If + * a qtext_store is actually in progress when we look, we might as well + * skip the speculative load entirely. 
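+	 * Concretely: read extent, n_writers and gc_count under the spinlock;
+	 * if no writer is active, load the file before taking pgss->lock;
+	 * then, once we hold shared lock, reload only if extent or gc_count
+	 * has moved.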
+ */ + if (showtext) + { + int n_writers; + + /* Take the mutex so we can examine variables */ + { + volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; + + SpinLockAcquire(&s->mutex); + extent = s->extent; + n_writers = s->n_writers; + gc_count = s->gc_count; + SpinLockRelease(&s->mutex); + } + + /* No point in loading file now if there are active writers */ + if (n_writers == 0) + qbuffer = qtext_load_file(&qbuffer_size); + } + + /* + * Get shared lock, load or reload the query text file if we must, and + * iterate over the hashtable entries. + * + * With a large hash table, we might be holding the lock rather longer + * than one could wish. However, this only blocks creation of new hash + * table entries, and the larger the hash table the less likely that is to + * be needed. So we can hope this is okay. Perhaps someday we'll decide + * we need to partition the hash table to limit the time spent holding any + * one lock. + */ LWLockAcquire(pgss->lock, LW_SHARED); + if (showtext) + { + /* + * Here it is safe to examine extent and gc_count without taking the + * mutex. Note that although other processes might change + * pgss->extent just after we look at it, the strings they then write + * into the file cannot yet be referenced in the hashtable, so we + * don't care whether we see them or not. + * + * If qtext_load_file fails, we just press on; we'll return NULL for + * every query text. + */ + if (qbuffer == NULL || + pgss->extent != extent || + pgss->gc_count != gc_count) + { + if (qbuffer) + free(qbuffer); + qbuffer = qtext_load_file(&qbuffer_size); + } + } + hash_seq_init(&hash_seq, pgss_hash); while ((entry = hash_seq_search(&hash_seq)) != NULL) { @@ -1170,26 +1443,57 @@ pg_stat_statements(PG_FUNCTION_ARGS) if (is_superuser || entry->key.userid == userid) { - char *qstr; - - if (detected_version >= PGSS_V1_2) + if (api_version >= PGSS_V1_2) values[i++] = Int64GetDatumFast(queryid); - qstr = (char *) - pg_do_encoding_conversion((unsigned char *) entry->query, - entry->query_len, - entry->key.encoding, - GetDatabaseEncoding()); - values[i++] = CStringGetTextDatum(qstr); - if (qstr != entry->query) - pfree(qstr); + if (showtext) + { + char *qstr = qtext_fetch(entry->query_offset, + entry->query_len, + qbuffer, + qbuffer_size); + + if (qstr) + { + char *enc; + + enc = (char *) + pg_do_encoding_conversion((unsigned char *) qstr, + entry->query_len, + entry->encoding, + GetDatabaseEncoding()); + + values[i++] = CStringGetTextDatum(enc); + + if (enc != qstr) + pfree(enc); + } + else + { + /* Just return a null if we fail to find the text */ + nulls[i++] = true; + } + } + else + { + /* Query text not requested */ + nulls[i++] = true; + } } else { - if (detected_version >= PGSS_V1_2) + /* Don't show queryid */ + if (api_version >= PGSS_V1_2) nulls[i++] = true; - values[i++] = CStringGetTextDatum("<insufficient privilege>"); + /* + * Don't show query text, but hint as to the reason for not doing + * so if it was requested + */ + if (showtext) + values[i++] = CStringGetTextDatum("<insufficient privilege>"); + else + nulls[i++] = true; } /* copy counters to a local variable to keep locking time short */ @@ -1210,37 +1514,37 @@ pg_stat_statements(PG_FUNCTION_ARGS) values[i++] = Int64GetDatumFast(tmp.rows); values[i++] = Int64GetDatumFast(tmp.shared_blks_hit); values[i++] = Int64GetDatumFast(tmp.shared_blks_read); - if (detected_version >= PGSS_V1_1) + if (api_version >= PGSS_V1_1) values[i++] = Int64GetDatumFast(tmp.shared_blks_dirtied); values[i++] = 
Int64GetDatumFast(tmp.shared_blks_written); values[i++] = Int64GetDatumFast(tmp.local_blks_hit); values[i++] = Int64GetDatumFast(tmp.local_blks_read); - if (detected_version >= PGSS_V1_1) + if (api_version >= PGSS_V1_1) values[i++] = Int64GetDatumFast(tmp.local_blks_dirtied); values[i++] = Int64GetDatumFast(tmp.local_blks_written); values[i++] = Int64GetDatumFast(tmp.temp_blks_read); values[i++] = Int64GetDatumFast(tmp.temp_blks_written); - if (detected_version >= PGSS_V1_1) + if (api_version >= PGSS_V1_1) { values[i++] = Float8GetDatumFast(tmp.blk_read_time); values[i++] = Float8GetDatumFast(tmp.blk_write_time); } - Assert(i == (detected_version == PGSS_V1_0? - PG_STAT_STATEMENTS_COLS_V1_0: - detected_version == PGSS_V1_1? - PG_STAT_STATEMENTS_COLS_V1_1: - PG_STAT_STATEMENTS_COLS)); + Assert(i == (api_version == PGSS_V1_0 ? PG_STAT_STATEMENTS_COLS_V1_0 : + api_version == PGSS_V1_1 ? PG_STAT_STATEMENTS_COLS_V1_1 : + api_version == PGSS_V1_2 ? PG_STAT_STATEMENTS_COLS_V1_2 : + -1 /* fail if you forget to update this assert */ )); tuplestore_putvalues(tupstore, tupdesc, values, nulls); } + /* clean up and return the tuplestore */ LWLockRelease(pgss->lock); - /* clean up and return the tuplestore */ - tuplestore_donestoring(tupstore); + if (qbuffer) + free(qbuffer); - return (Datum) 0; + tuplestore_donestoring(tupstore); } /* @@ -1250,11 +1554,9 @@ static Size pgss_memsize(void) { Size size; - Size entrysize; size = MAXALIGN(sizeof(pgssSharedState)); - entrysize = offsetof(pgssEntry, query) +pgstat_track_activity_query_size; - size = add_size(size, hash_estimate_size(pgss_max, entrysize)); + size = add_size(size, hash_estimate_size(pgss_max, sizeof(pgssEntry))); return size; } @@ -1277,7 +1579,8 @@ pgss_memsize(void) * have made the entry while we waited to get exclusive lock. */ static pgssEntry * -entry_alloc(pgssHashKey *key, const char *query, int query_len, bool sticky) +entry_alloc(pgssHashKey *key, Size query_offset, int query_len, int encoding, + bool sticky) { pgssEntry *entry; bool found; @@ -1299,11 +1602,11 @@ entry_alloc(pgssHashKey *key, const char *query, int query_len, bool sticky) entry->counters.usage = sticky ? pgss->cur_median_usage : USAGE_INIT; /* re-initialize the mutex each time ... we assume no one using it */ SpinLockInit(&entry->mutex); - /* ... and don't forget the query text */ - Assert(query_len >= 0 && query_len < pgss->query_size); + /* ... and don't forget the query text metadata */ + Assert(query_len >= 0); + entry->query_offset = query_offset; entry->query_len = query_len; - memcpy(entry->query, query, query_len); - entry->query[query_len] = '\0'; + entry->encoding = encoding; } return entry; @@ -1338,6 +1641,7 @@ entry_dealloc(void) pgssEntry *entry; int nvictims; int i; + Size totlen = 0; /* * Sort entries by usage and deallocate USAGE_DEALLOC_PERCENT of them. @@ -1357,13 +1661,19 @@ entry_dealloc(void) entry->counters.usage *= STICKY_DECREASE_FACTOR; else entry->counters.usage *= USAGE_DECREASE_FACTOR; + /* Accumulate total size, too. 
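+		 * The mean_query_len computed from this total is what
+		 * need_gc_qtexts() consults to decide whether the external
+		 * query-text file has become bloated.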
*/ + totlen += entry->query_len + 1; } qsort(entries, i, sizeof(pgssEntry *), entry_cmp); - /* Also, record the (approximate) median usage */ if (i > 0) + { + /* Record the (approximate) median usage */ pgss->cur_median_usage = entries[i / 2]->counters.usage; + /* Record the mean query length */ + pgss->mean_query_len = totlen / i; + } nvictims = Max(10, i * USAGE_DEALLOC_PERCENT / 100); nvictims = Min(nvictims, i); @@ -1376,6 +1686,396 @@ entry_dealloc(void) pfree(entries); } +/* + * Given a null-terminated string, allocate a new entry in the external query + * text file and store the string there. + * + * Although we could compute the string length via strlen(), callers already + * have it handy, so we require them to pass it too. + * + * If successful, returns true, and stores the new entry's offset in the file + * into *query_offset. Also, if gc_count isn't NULL, *gc_count is set to the + * number of garbage collections that have occurred so far. + * + * On failure, returns false. + * + * At least a shared lock on pgss->lock must be held by the caller, so as + * to prevent a concurrent garbage collection. Share-lock-holding callers + * should pass a gc_count pointer to obtain the number of garbage collections, + * so that they can recheck the count after obtaining exclusive lock to + * detect whether a garbage collection occurred (and removed this entry). + */ +static bool +qtext_store(const char *query, int query_len, + Size *query_offset, int *gc_count) +{ + Size off; + int fd; + + /* + * We use a spinlock to protect extent/n_writers/gc_count, so that + * multiple processes may execute this function concurrently. + */ + { + volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; + + SpinLockAcquire(&s->mutex); + off = s->extent; + s->extent += query_len + 1; + s->n_writers++; + if (gc_count) + *gc_count = s->gc_count; + SpinLockRelease(&s->mutex); + } + + *query_offset = off; + + /* Now write the data into the successfully-reserved part of the file */ + fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDWR | O_CREAT | PG_BINARY, + S_IRUSR | S_IWUSR); + if (fd < 0) + goto error; + + if (lseek(fd, off, SEEK_SET) != off) + goto error; + + if (write(fd, query, query_len + 1) != query_len + 1) + goto error; + + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + return true; + +error: + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + + if (fd >= 0) + CloseTransientFile(fd); + + /* Mark our write complete */ + { + volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; + + SpinLockAcquire(&s->mutex); + s->n_writers--; + SpinLockRelease(&s->mutex); + } + + return false; +} + +/* + * Read the external query text file into a malloc'd buffer. + * + * Returns NULL (without throwing an error) if unable to read, eg + * file not there or insufficient memory. + * + * On success, the buffer size is also returned into *buffer_size. + * + * This can be called without any lock on pgss->lock, but in that case + * the caller is responsible for verifying that the result is sane. 
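+ * (The view code does exactly that: it notes pgss->extent and
+ * pgss->gc_count before an unlocked load, and reloads under shared lock
+ * if either value has changed in the meantime.)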
+ */ +static char * +qtext_load_file(Size *buffer_size) +{ + char *buf; + int fd; + struct stat stat; + + fd = OpenTransientFile(PGSS_TEXT_FILE, O_RDONLY | PG_BINARY, 0); + if (fd < 0) + { + if (errno != ENOENT) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + return NULL; + } + + /* Get file length */ + if (fstat(fd, &stat)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not stat pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + CloseTransientFile(fd); + return NULL; + } + + /* Allocate buffer; beware that off_t might be wider than size_t */ + if (stat.st_size <= MaxAllocSize) + buf = (char *) malloc(stat.st_size); + else + buf = NULL; + if (buf == NULL) + { + ereport(LOG, + (errcode(ERRCODE_OUT_OF_MEMORY), + errmsg("out of memory"))); + CloseTransientFile(fd); + return NULL; + } + + /* + * OK, slurp in the file. If we get a short read and errno doesn't get + * set, the reason is probably that garbage collection truncated the file + * since we did the fstat(), so we don't log a complaint --- but we don't + * return the data, either, since it's most likely corrupt due to + * concurrent writes from garbage collection. + */ + errno = 0; + if (read(fd, buf, stat.st_size) != stat.st_size) + { + if (errno) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not read pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + free(buf); + CloseTransientFile(fd); + return NULL; + } + + CloseTransientFile(fd); + + *buffer_size = stat.st_size; + return buf; +} + +/* + * Locate a query text in the file image previously read by qtext_load_file(). + * + * We validate the given offset/length, and return NULL if bogus. Otherwise, + * the result points to a null-terminated string within the buffer. + */ +static char * +qtext_fetch(Size query_offset, int query_len, + char *buffer, Size buffer_size) +{ + /* File read failed? */ + if (buffer == NULL) + return NULL; + /* Bogus offset/length? */ + if (query_len < 0 || + query_offset + query_len >= buffer_size) + return NULL; + /* As a further sanity check, make sure there's a trailing null */ + if (buffer[query_offset + query_len] != '\0') + return NULL; + /* Looks OK */ + return buffer + query_offset; +} + +/* + * Do we need to garbage-collect the external query text file? + * + * Caller should hold at least a shared lock on pgss->lock. + */ +static bool +need_gc_qtexts(void) +{ + Size extent; + + /* Read shared extent pointer */ + { + volatile pgssSharedState *s = (volatile pgssSharedState *) pgss; + + SpinLockAcquire(&s->mutex); + extent = s->extent; + SpinLockRelease(&s->mutex); + } + + /* Don't proceed if file does not exceed 512 bytes per possible entry */ + if (extent < 512 * pgss_max) + return false; + + /* + * Don't proceed if file is less than about 50% bloat. Nothing can or + * should be done in the event of unusually large query texts accounting + * for file's large size. We go to the trouble of maintaining the mean + * query length in order to prevent garbage collection from thrashing + * uselessly. + */ + if (extent < pgss->mean_query_len * pgss_max * 2) + return false; + + return true; +} + +/* + * Garbage-collect orphaned query texts in external file. + * + * This won't be called often in the typical case, since it's likely that + * there won't be too much churn, and besides, a similar compaction process + * occurs when serializing to disk at shutdown or as part of resetting. 
+ * Despite this, it seems prudent to plan for the edge case where the file + * becomes unreasonably large, with no other method of compaction likely to + * occur in the foreseeable future. + * + * The caller must hold an exclusive lock on pgss->lock. + */ +static void +gc_qtexts(void) +{ + char *qbuffer; + Size qbuffer_size; + FILE *qfile; + HASH_SEQ_STATUS hash_seq; + pgssEntry *entry; + Size extent; + int nentries; + + /* + * When called from pgss_store, some other session might have proceeded + * with garbage collection in the no-lock-held interim of lock strength + * escalation. Check once more that this is actually necessary. + */ + if (!need_gc_qtexts()) + return; + + /* + * Load the old texts file. If we fail (out of memory, for instance) just + * skip the garbage collection. + */ + qbuffer = qtext_load_file(&qbuffer_size); + if (qbuffer == NULL) + return; + + /* + * We overwrite the query texts file in place, so as to reduce the risk of + * an out-of-disk-space failure. Since the file is guaranteed not to get + * larger, this should always work on traditional filesystems; though we + * could still lose on copy-on-write filesystems. + */ + qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); + if (qfile == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + goto gc_fail; + } + + extent = 0; + nentries = 0; + + hash_seq_init(&hash_seq, pgss_hash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + int query_len = entry->query_len; + char *qry = qtext_fetch(entry->query_offset, + query_len, + qbuffer, + qbuffer_size); + + if (qry == NULL) + { + /* Trouble ... drop the text */ + entry->query_offset = 0; + entry->query_len = -1; + continue; + } + + if (fwrite(qry, 1, query_len + 1, qfile) != query_len + 1) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + hash_seq_term(&hash_seq); + goto gc_fail; + } + + entry->query_offset = extent; + extent += query_len + 1; + nentries++; + } + + /* + * Truncate away any now-unused space. If this fails for some odd reason, + * we log it, but there's no need to fail. + */ + if (ftruncate(fileno(qfile), extent) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not truncate pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + + if (FreeFile(qfile)) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not write pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + qfile = NULL; + goto gc_fail; + } + + elog(DEBUG1, "pgss gc of queries file shrunk size from %zu to %zu", + pgss->extent, extent); + + /* Reset the shared extent pointer */ + pgss->extent = extent; + + /* + * Also update the mean query length, to be sure that need_gc_qtexts() + * won't still think we have a problem. + */ + if (nentries > 0) + pgss->mean_query_len = extent / nentries; + else + pgss->mean_query_len = ASSUMED_LENGTH_INIT; + + free(qbuffer); + + /* + * OK, count a garbage collection cycle. (Note: even though we have + * exclusive lock on pgss->lock, we must take pgss->mutex for this, since + * other processes may examine gc_count while holding only the mutex. + * Also, we have to advance the count *after* we've rewritten the file, + * else other processes might not realize they read a stale file.) 
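+	 * That ordering is also what keeps the view code's unlocked read safe:
+	 * a reader that captured gc_count before our rewrite will notice the
+	 * change and discard its speculatively-loaded copy of the file.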
+ */ + record_gc_qtexts(); + + return; + +gc_fail: + /* clean up resources */ + if (qfile) + FreeFile(qfile); + if (qbuffer) + free(qbuffer); + + /* + * Since the contents of the external file are now uncertain, mark all + * hashtable entries as having invalid texts. + */ + hash_seq_init(&hash_seq, pgss_hash); + while ((entry = hash_seq_search(&hash_seq)) != NULL) + { + entry->query_offset = 0; + entry->query_len = -1; + } + + /* Seems like a good idea to bump the GC count even though we failed */ + record_gc_qtexts(); +} + /* * Release all entries. */ @@ -1384,6 +2084,7 @@ entry_reset(void) { HASH_SEQ_STATUS hash_seq; pgssEntry *entry; + FILE *qfile; LWLockAcquire(pgss->lock, LW_EXCLUSIVE); @@ -1393,6 +2094,34 @@ entry_reset(void) hash_search(pgss_hash, &entry->key, HASH_REMOVE, NULL); } + /* + * Write new empty query file, perhaps even creating a new one to recover + * if the file was missing. + */ + qfile = AllocateFile(PGSS_TEXT_FILE, PG_BINARY_W); + if (qfile == NULL) + { + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not create pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + goto done; + } + + /* If ftruncate fails, log it, but it's not a fatal problem */ + if (ftruncate(fileno(qfile), 0) != 0) + ereport(LOG, + (errcode_for_file_access(), + errmsg("could not truncate pg_stat_statement file \"%s\": %m", + PGSS_TEXT_FILE))); + + FreeFile(qfile); + +done: + pgss->extent = 0; + /* This counts as a query text garbage collection for our purposes */ + record_gc_qtexts(); + LWLockRelease(pgss->lock); } @@ -1962,7 +2691,7 @@ RecordConstLocation(pgssJumbleState *jstate, int location) * *query_len_p contains the input string length, and is updated with * the result string length (which cannot be longer) on exit. * - * Returns a palloc'd string, which is not necessarily null-terminated. + * Returns a palloc'd string. */ static char * generate_normalized_query(pgssJumbleState *jstate, const char *query, @@ -1970,7 +2699,6 @@ generate_normalized_query(pgssJumbleState *jstate, const char *query, { char *norm_query; int query_len = *query_len_p; - int max_output_len; int i, len_to_wrt, /* Length (in bytes) to write */ quer_loc = 0, /* Source query byte location */ @@ -1984,9 +2712,8 @@ generate_normalized_query(pgssJumbleState *jstate, const char *query, */ fill_in_constant_lengths(jstate, query); - /* Allocate result buffer, ensuring we limit result to allowed size */ - max_output_len = Min(query_len, pgss->query_size - 1); - norm_query = palloc(max_output_len); + /* Allocate result buffer */ + norm_query = palloc(query_len + 1); for (i = 0; i < jstate->clocations_count; i++) { @@ -2002,49 +2729,32 @@ generate_normalized_query(pgssJumbleState *jstate, const char *query, /* Copy next chunk, or as much as will fit */ len_to_wrt = off - last_off; len_to_wrt -= last_tok_len; - len_to_wrt = Min(len_to_wrt, max_output_len - n_quer_loc); Assert(len_to_wrt >= 0); memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); n_quer_loc += len_to_wrt; - if (n_quer_loc < max_output_len) - norm_query[n_quer_loc++] = '?'; + norm_query[n_quer_loc++] = '?'; quer_loc = off + tok_len; last_off = off; last_tok_len = tok_len; - - /* If we run out of space, might as well stop iterating */ - if (n_quer_loc >= max_output_len) - break; } /* * We've copied up until the last ignorable constant. Copy over the - * remaining bytes of the original query string, or at least as much as - * will fit. + * remaining bytes of the original query string. 
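+	 * As an illustration, a source query such as
+	 *		SELECT * FROM tab WHERE col = 42 AND other = 'foo'
+	 * has by this point been emitted as
+	 *		SELECT * FROM tab WHERE col = ? AND other = ?
+	 * except for the text following the last replaced constant.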
*/ len_to_wrt = query_len - quer_loc; - len_to_wrt = Min(len_to_wrt, max_output_len - n_quer_loc); Assert(len_to_wrt >= 0); memcpy(norm_query + n_quer_loc, query + quer_loc, len_to_wrt); n_quer_loc += len_to_wrt; - /* - * If we ran out of space, we need to do an encoding-aware truncation, - * just to make sure we don't have an incomplete character at the end. - */ - if (n_quer_loc >= max_output_len) - query_len = pg_encoding_mbcliplen(encoding, - norm_query, - n_quer_loc, - pgss->query_size - 1); - else - query_len = n_quer_loc; + Assert(n_quer_loc <= query_len); + norm_query[n_quer_loc] = '\0'; - *query_len_p = query_len; + *query_len_p = n_quer_loc; return norm_query; } diff --git a/doc/src/sgml/pgstatstatements.sgml b/doc/src/sgml/pgstatstatements.sgml index 6ea0415d861caf4582d72b4d3fbb26b915fbeb7f..d39f5a03ca635fcbb7dabac2e6ea87cbe988f09c 100644 --- a/doc/src/sgml/pgstatstatements.sgml +++ b/doc/src/sgml/pgstatstatements.sgml @@ -62,14 +62,14 @@ <entry><structfield>queryid</structfield></entry> <entry><type>bigint</type></entry> <entry></entry> - <entry>Internal hash identifier, computed from the entry's post-parse-analysis tree</entry> + <entry>Internal hash code, computed from the statement's parse tree</entry> </row> <row> <entry><structfield>query</structfield></entry> <entry><type>text</type></entry> <entry></entry> - <entry>Text of a representative statement (up to <xref linkend="guc-track-activity-query-size"> bytes)</entry> + <entry>Text of a representative statement</entry> </row> <row> @@ -188,9 +188,10 @@ </table> <para> - This view, and the function <function>pg_stat_statements_reset</>, - are available only in databases they have been specifically installed into - by installing the <literal>pg_stat_statements</> extension. + This view, and the functions <function>pg_stat_statements_reset</> + and <function>pg_stat_statements</>, are available only in + databases they have been specifically installed into by installing + the <literal>pg_stat_statements</> extension. However, statistics are tracked across all databases of the server whenever the <filename>pg_stat_statements</filename> module is loaded into the server, regardless of presence of the view. @@ -242,36 +243,34 @@ <para> Consumers of <literal>pg_stat_statements</> may wish to use - <structfield>queryid</> (perhaps in composite with + <structfield>queryid</> (perhaps in combination with <structfield>dbid</> and <structfield>userid</>) as a more stable and reliable identifier for each entry than its query text. However, it is important to understand that there are only limited guarantees around the stability of the <structfield>queryid</> hash value. Since the identifier is derived from the post-parse-analysis tree, its value is a function of, among other - things, the internal identifiers that comprise this representation. - This has some counterintuitive implications. For example, a query - against a table that is fingerprinted by - <literal>pg_stat_statements</> will appear distinct to a - subsequently executed query that a reasonable observer might judge - to be a non-distinct, if in the interim the table was dropped and - re-created. The hashing process is sensitive to difference in + things, the internal object identifiers appearing in this representation. + This has some counterintuitive implications. 
For example, + <literal>pg_stat_statements</> will consider two apparently-identical + queries to be distinct, if they reference a table that was dropped + and recreated between the executions of the two queries. + The hashing process is also sensitive to differences in machine architecture and other facets of the platform. Furthermore, it is not safe to assume that <structfield>queryid</> will be stable across major versions of <productname>PostgreSQL</>. </para> <para> - As a rule of thumb, an assumption of the stability or comparability - of <structfield>queryid</> values should be predicated on the - underlying catalog metadata and hash function implementation - details exactly matching. Any two servers participating in any - variety of replication based on physical WAL-replay can be expected + As a rule of thumb, <structfield>queryid</> values can be assumed to be + stable and comparable only so long as the underlying server version and + catalog metadata details stay exactly the same. Two servers + participating in replication based on physical WAL replay can be expected to have identical <structfield>queryid</> values for the same query. - Logical replication schemes do not have replicas comparable in all - relevant regards, and so <structfield>queryid</> will not be a - useful identifier for accumulating costs for the entire replica - set. If in doubt, direct testing is recommended. + However, logical replication schemes do not promise to keep replicas + identical in all relevant details, so <structfield>queryid</> will + not be a useful identifier for accumulating costs across a set of logical + replicas. If in doubt, direct testing is recommended. </para> </sect2> @@ -297,6 +296,36 @@ </listitem> </varlistentry> + <varlistentry> + <indexterm> + <primary>pg_stat_statements</primary> + <secondary>function</secondary> + </indexterm> + + <term> + <function>pg_stat_statements(showtext boolean) returns setof record</function> + </term> + + <listitem> + <para> + The <structname>pg_stat_statements</structname> view is defined in + terms of a function also named <function>pg_stat_statements</>. + It is possible for clients to call + the <function>pg_stat_statements</function> function directly, and by + specifying <literal>showtext := false</literal> have query text be + omitted (that is, the <literal>OUT</literal> argument that corresponds + to the view's <structfield>query</> column will return nulls). This + feature is intended to support external tools that might wish to avoid + the overhead of repeatedly retrieving query texts of indeterminate + length. Such tools can instead cache the first query text observed + for each entry themselves, since that is + all <filename>pg_stat_statements</> itself does, and then retrieve + query texts only as needed. Since the server stores query texts in a + file, this approach may reduce physical I/O for repeated examination + of the <structname>pg_stat_statements</structname> data. + </para> + </listitem> + </varlistentry> </variablelist> </sect2> @@ -316,7 +345,7 @@ in the <structname>pg_stat_statements</> view). If more distinct statements than that are observed, information about the least-executed statements is discarded. - The default value is 1000. + The default value is 5000. This parameter can only be set at server start. 
</para> </listitem> @@ -378,9 +407,8 @@ </variablelist> <para> - The module requires additional shared memory amounting to about - <varname>pg_stat_statements.max</varname> <literal>*</> - <xref linkend="guc-track-activity-query-size"> bytes. Note that this + The module requires additional shared memory proportional to + <varname>pg_stat_statements.max</varname>. Note that this memory is consumed whenever the module is loaded, even if <varname>pg_stat_statements.track</> is set to <literal>none</>. </para>
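A minimal usage sketch for the new showtext parameter, assuming the 1.2 SQL definitions created above: a monitoring tool can poll the counters cheaply, without making the server read the external query-text file,

    SELECT userid, dbid, queryid, calls, total_time, rows
    FROM pg_stat_statements(false);

and then fetch texts only once per entry, for queryid values it has not yet cached:

    SELECT queryid, query
    FROM pg_stat_statements(true);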