diff --git a/src/backend/executor/nodeLimit.c b/src/backend/executor/nodeLimit.c index d2ecb3722f7083d3ad54e931b99014d14346018a..9d40952647d7bd425a0b4092f4a6aa0c3a5508fb 100644 --- a/src/backend/executor/nodeLimit.c +++ b/src/backend/executor/nodeLimit.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.29 2007/01/05 22:19:28 momjian Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeLimit.c,v 1.30 2007/05/04 01:13:43 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -280,6 +280,37 @@ recompute_limits(LimitState *node) /* Reset position to start-of-scan */ node->position = 0; node->subSlot = NULL; + + /* + * If we have a COUNT, and our input is a Sort node, notify it that it can + * use bounded sort. + * + * This is a bit of a kluge, but we don't have any more-abstract way of + * communicating between the two nodes; and it doesn't seem worth trying + * to invent one without some more examples of special communication needs. + * + * Note: it is the responsibility of nodeSort.c to react properly to + * changes of these parameters. If we ever do redesign this, it'd be + * a good idea to integrate this signaling with the parameter-change + * mechanism. + */ + if (IsA(outerPlanState(node), SortState)) + { + SortState *sortState = (SortState *) outerPlanState(node); + int64 tuples_needed = node->count + node->offset; + + /* negative test checks for overflow */ + if (node->noCount || tuples_needed < 0) + { + /* make sure flag gets reset if needed upon rescan */ + sortState->bounded = false; + } + else + { + sortState->bounded = true; + sortState->bound = tuples_needed; + } + } } /* ---------------------------------------------------------------- diff --git a/src/backend/executor/nodeSort.c b/src/backend/executor/nodeSort.c index 97b5c4ff15045f9a095aca5d802854a349fca109..5b18235258f4e8e96db5d2d840bf877e2b0ed0e0 100644 --- a/src/backend/executor/nodeSort.c +++ b/src/backend/executor/nodeSort.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.60 2007/01/09 02:14:11 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/executor/nodeSort.c,v 1.61 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -89,6 +89,8 @@ ExecSort(SortState *node) plannode->nullsFirst, work_mem, node->randomAccess); + if (node->bounded) + tuplesort_set_bound(tuplesortstate, node->bound); node->tuplesortstate = (void *) tuplesortstate; /* @@ -119,6 +121,8 @@ ExecSort(SortState *node) * finally set the sorted flag to true */ node->sort_Done = true; + node->bounded_Done = node->bounded; + node->bound_Done = node->bound; SO1_printf("ExecSort: %s\n", "sorting done"); } @@ -167,6 +171,7 @@ ExecInitSort(Sort *node, EState *estate, int eflags) EXEC_FLAG_BACKWARD | EXEC_FLAG_MARK)) != 0; + sortstate->bounded = false; sortstate->sort_Done = false; sortstate->tuplesortstate = NULL; @@ -307,11 +312,14 @@ ExecReScanSort(SortState *node, ExprContext *exprCtxt) /* * If subnode is to be rescanned then we forget previous sort results; we - * have to re-read the subplan and re-sort. + * have to re-read the subplan and re-sort. Also must re-sort if the + * bounded-sort parameters changed or we didn't select randomAccess. * * Otherwise we can just rewind and rescan the sorted output. 
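+	 *
+	 * (We re-sort on any change of the bound parameters: a bounded sort
+	 * discards tuples beyond the old bound, so the stored output is
+	 * incomplete for any larger bound.  Re-sorting whenever the parameters
+	 * differ keeps the logic simple, rather than special-casing a shrinking
+	 * bound.)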
*/ if (((PlanState *) node)->lefttree->chgParam != NULL || + node->bounded != node->bounded_Done || + node->bound != node->bound_Done || !node->randomAccess) { node->sort_Done = false; diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c index 9cd3a51d8976c99b9157d21341df745f1145fa4a..82dfc77f065e3c93fe3739cdc597c17194cdcb04 100644 --- a/src/backend/optimizer/path/costsize.c +++ b/src/backend/optimizer/path/costsize.c @@ -54,7 +54,7 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.181 2007/04/21 21:01:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/path/costsize.c,v 1.182 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -922,6 +922,10 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel) * disk traffic = 2 * relsize * ceil(logM(p / (2*work_mem))) * cpu = comparison_cost * t * log2(t) * + * If the sort is bounded (i.e., only the first k result tuples are needed) + * and k tuples can fit into work_mem, we use a heap method that keeps only + * k tuples in the heap; this will require about t*log2(k) tuple comparisons. + * * The disk traffic is assumed to be 3/4ths sequential and 1/4th random * accesses (XXX can't we refine that guess?) * @@ -932,6 +936,7 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel) * 'input_cost' is the total cost for reading the input data * 'tuples' is the number of tuples in the relation * 'width' is the average tuple width in bytes + * 'limit_tuples' is the bound on the number of output tuples; -1 if no bound * * NOTE: some callers currently pass NIL for pathkeys because they * can't conveniently supply the sort keys. Since this routine doesn't @@ -942,11 +947,14 @@ cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel) */ void cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width) + List *pathkeys, Cost input_cost, double tuples, int width, + double limit_tuples) { Cost startup_cost = input_cost; Cost run_cost = 0; - double nbytes = relation_byte_size(tuples, width); + double input_bytes = relation_byte_size(tuples, width); + double output_bytes; + double output_tuples; long work_mem_bytes = work_mem * 1024L; if (!enable_sort) @@ -959,23 +967,39 @@ cost_sort(Path *path, PlannerInfo *root, if (tuples < 2.0) tuples = 2.0; - /* - * CPU costs - * - * Assume about two operator evals per tuple comparison and N log2 N - * comparisons - */ - startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples); + /* Do we have a useful LIMIT? 
*/ + if (limit_tuples > 0 && limit_tuples < tuples) + { + output_tuples = limit_tuples; + output_bytes = relation_byte_size(output_tuples, width); + } + else + { + output_tuples = tuples; + output_bytes = input_bytes; + } - /* disk costs */ - if (nbytes > work_mem_bytes) + if (output_bytes > work_mem_bytes) { - double npages = ceil(nbytes / BLCKSZ); - double nruns = (nbytes / work_mem_bytes) * 0.5; + /* + * We'll have to use a disk-based sort of all the tuples + */ + double npages = ceil(input_bytes / BLCKSZ); + double nruns = (input_bytes / work_mem_bytes) * 0.5; double mergeorder = tuplesort_merge_order(work_mem_bytes); double log_runs; double npageaccesses; + /* + * CPU costs + * + * Assume about two operator evals per tuple comparison and N log2 N + * comparisons + */ + startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples); + + /* Disk costs */ + /* Compute logM(r) as log(r) / log(M) */ if (nruns > mergeorder) log_runs = ceil(log(nruns) / log(mergeorder)); @@ -986,10 +1010,27 @@ cost_sort(Path *path, PlannerInfo *root, startup_cost += npageaccesses * (seq_page_cost * 0.75 + random_page_cost * 0.25); } + else if (tuples > 2 * output_tuples || input_bytes > work_mem_bytes) + { + /* + * We'll use a bounded heap-sort keeping just K tuples in memory, + * for a total number of tuple comparisons of N log2 K; but the + * constant factor is a bit higher than for quicksort. Tweak it + * so that the cost curve is continuous at the crossover point. + */ + startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples); + } + else + { + /* We'll use plain quicksort on all the input tuples */ + startup_cost += 2.0 * cpu_operator_cost * tuples * LOG2(tuples); + } /* * Also charge a small amount (arbitrarily set equal to operator cost) per - * extracted tuple. + * extracted tuple. Note it's correct to use tuples not output_tuples + * here --- the upper LIMIT will pro-rate the run cost so we'd be double + * counting the LIMIT otherwise. 
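+	 * (For example, with tuples = 1000 and limit_tuples = 10, the LIMIT
+	 * node will pro-rate this run cost down to 10/1000ths; if we charged
+	 * only output_tuples here, the limit would be applied twice.)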
*/ run_cost += cpu_operator_cost * tuples; @@ -1431,7 +1472,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root) outersortkeys, outer_path->total_cost, outer_path_rows, - outer_path->parent->width); + outer_path->parent->width, + -1.0); startup_cost += sort_path.startup_cost; run_cost += (sort_path.total_cost - sort_path.startup_cost) * outerscansel; @@ -1450,7 +1492,8 @@ cost_mergejoin(MergePath *path, PlannerInfo *root) innersortkeys, inner_path->total_cost, inner_path_rows, - inner_path->parent->width); + inner_path->parent->width, + -1.0); startup_cost += sort_path.startup_cost; run_cost += (sort_path.total_cost - sort_path.startup_cost) * innerscansel * rescanratio; diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c index 74b89125665f65c66094e71bfb87d7f80399ad5e..be0162406bd18e9ce7a5157948d498dadc9dabfd 100644 --- a/src/backend/optimizer/plan/createplan.c +++ b/src/backend/optimizer/plan/createplan.c @@ -10,7 +10,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.229 2007/04/21 21:01:44 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/createplan.c,v 1.230 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -122,7 +122,8 @@ static MergeJoin *make_mergejoin(List *tlist, Plan *lefttree, Plan *righttree, JoinType jointype); static Sort *make_sort(PlannerInfo *root, Plan *lefttree, int numCols, - AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst); + AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst, + double limit_tuples); /* @@ -1579,7 +1580,8 @@ create_mergejoin_plan(PlannerInfo *root, outer_plan = (Plan *) make_sort_from_pathkeys(root, outer_plan, - best_path->outersortkeys); + best_path->outersortkeys, + -1.0); outerpathkeys = best_path->outersortkeys; } else @@ -1591,7 +1593,8 @@ create_mergejoin_plan(PlannerInfo *root, inner_plan = (Plan *) make_sort_from_pathkeys(root, inner_plan, - best_path->innersortkeys); + best_path->innersortkeys, + -1.0); innerpathkeys = best_path->innersortkeys; } else @@ -2589,11 +2592,13 @@ make_mergejoin(List *tlist, * make_sort --- basic routine to build a Sort plan node * * Caller must have built the sortColIdx, sortOperators, and nullsFirst - * arrays already. + * arrays already. limit_tuples is as for cost_sort (in particular, pass + * -1 if no limit) */ static Sort * make_sort(PlannerInfo *root, Plan *lefttree, int numCols, - AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst) + AttrNumber *sortColIdx, Oid *sortOperators, bool *nullsFirst, + double limit_tuples) { Sort *node = makeNode(Sort); Plan *plan = &node->plan; @@ -2603,7 +2608,8 @@ make_sort(PlannerInfo *root, Plan *lefttree, int numCols, cost_sort(&sort_path, root, NIL, lefttree->total_cost, lefttree->plan_rows, - lefttree->plan_width); + lefttree->plan_width, + limit_tuples); plan->startup_cost = sort_path.startup_cost; plan->total_cost = sort_path.total_cost; plan->targetlist = lefttree->targetlist; @@ -2664,6 +2670,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first, * * 'lefttree' is the node which yields input tuples * 'pathkeys' is the list of pathkeys by which the result is to be sorted + * 'limit_tuples' is the bound on the number of output tuples; + * -1 if no bound * * We must convert the pathkey information into arrays of sort key column * numbers and sort operator OIDs. 
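The three-way CPU-cost split that cost_sort() now implements, shown in the costsize.c hunks above, can be condensed into a few lines. The sketch below is not part of the patch: it borrows LOG2() and the 2.0 * cpu_operator_cost comparison factor from costsize.c, collapses the byte-size tests into flags, and exists mainly to show that the branches meet continuously. When tuples == 2 * output_tuples, LOG2(2.0 * output_tuples) equals LOG2(tuples), so the heapsort and quicksort estimates agree exactly at the crossover point the comment mentions.

    #include <math.h>

    #define LOG2(x)  (log(x) / 0.693147180559945)   /* as in costsize.c */

    /* Simplified stand-in for the CPU portion of cost_sort() */
    static double
    sort_cpu_cost(double cpu_operator_cost, double tuples,
                  double output_tuples,
                  int output_fits_in_work_mem, int input_fits_in_work_mem)
    {
        if (!output_fits_in_work_mem)
            /* disk-based sort of all tuples: about N log2 N comparisons */
            return 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
        else if (tuples > 2 * output_tuples || !input_fits_in_work_mem)
            /* bounded heapsort keeping K tuples: N log2 K, tweaked to 2K */
            return 2.0 * cpu_operator_cost * tuples * LOG2(2.0 * output_tuples);
        else
            /* plain quicksort on all the input tuples: N log2 N */
            return 2.0 * cpu_operator_cost * tuples * LOG2(tuples);
    }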
@@ -2675,7 +2683,8 @@ add_sort_column(AttrNumber colIdx, Oid sortOp, bool nulls_first, * adding a Result node just to do the projection. */ Sort * -make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys) +make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys, + double limit_tuples) { List *tlist = lefttree->targetlist; ListCell *i; @@ -2810,7 +2819,7 @@ make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, List *pathkeys) Assert(numsortkeys > 0); return make_sort(root, lefttree, numsortkeys, - sortColIdx, sortOperators, nullsFirst); + sortColIdx, sortOperators, nullsFirst, limit_tuples); } /* @@ -2859,7 +2868,7 @@ make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree) Assert(numsortkeys > 0); return make_sort(root, lefttree, numsortkeys, - sortColIdx, sortOperators, nullsFirst); + sortColIdx, sortOperators, nullsFirst, -1.0); } /* @@ -2919,7 +2928,7 @@ make_sort_from_groupcols(PlannerInfo *root, Assert(numsortkeys > 0); return make_sort(root, lefttree, numsortkeys, - sortColIdx, sortOperators, nullsFirst); + sortColIdx, sortOperators, nullsFirst, -1.0); } Material * diff --git a/src/backend/optimizer/plan/planmain.c b/src/backend/optimizer/plan/planmain.c index f12e388d7235881c96a7cda4995300f8763d26f4..c31f7d5aa656884cff392d828eb86c653715b0d5 100644 --- a/src/backend/optimizer/plan/planmain.c +++ b/src/backend/optimizer/plan/planmain.c @@ -14,7 +14,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.100 2007/04/21 21:01:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planmain.c,v 1.101 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -47,6 +47,8 @@ * tlist is the target list the query should produce * (this is NOT necessarily root->parse->targetList!) * tuple_fraction is the fraction of tuples we expect will be retrieved + * limit_tuples is a hard limit on number of tuples to retrieve, + * or -1 if no limit * * Output parameters: * *cheapest_path receives the overall-cheapest path for the query @@ -74,9 +76,13 @@ * from the plan to be retrieved * tuple_fraction >= 1: tuple_fraction is the absolute number of tuples * expected to be retrieved (ie, a LIMIT specification) + * Note that a nonzero tuple_fraction could come from outer context; it is + * therefore not redundant with limit_tuples. We use limit_tuples to determine + * whether a bounded sort can be used at runtime. 
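+ * For example, "... ORDER BY x LIMIT 10 OFFSET 20" arrives here with
+ * limit_tuples = 30, since the sort underneath the LIMIT must deliver
+ * offset + count tuples; a caller that merely expects ten percent of the
+ * output to be fetched would instead pass tuple_fraction = 0.10 with
+ * limit_tuples = -1, which does not permit a bounded sort.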
*/ void -query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, +query_planner(PlannerInfo *root, List *tlist, + double tuple_fraction, double limit_tuples, Path **cheapest_path, Path **sorted_path, double *num_groups) { @@ -354,7 +360,8 @@ query_planner(PlannerInfo *root, List *tlist, double tuple_fraction, /* Figure cost for sorting */ cost_sort(&sort_path, root, root->query_pathkeys, cheapestpath->total_cost, - final_rel->rows, final_rel->width); + final_rel->rows, final_rel->width, + limit_tuples); } if (compare_fractional_path_costs(sortedpath, &sort_path, diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c index d5dc6f7c7e6fb31b824e59a1f3c7153fbfc6a51e..366c50f2730f215b9e68cf6842a308837542335c 100644 --- a/src/backend/optimizer/plan/planner.c +++ b/src/backend/optimizer/plan/planner.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.218 2007/04/27 22:05:47 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/plan/planner.c,v 1.219 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -61,7 +61,8 @@ static double preprocess_limit(PlannerInfo *root, double tuple_fraction, int64 *offset_est, int64 *count_est); static Oid *extract_grouping_ops(List *groupClause); -static bool choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, +static bool choose_hashed_grouping(PlannerInfo *root, + double tuple_fraction, double limit_tuples, Path *cheapest_path, Path *sorted_path, Oid *groupOperators, double dNumGroups, AggClauseCounts *agg_counts); @@ -696,6 +697,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) List *tlist = parse->targetList; int64 offset_est = 0; int64 count_est = 0; + double limit_tuples = -1.0; Plan *result_plan; List *current_pathkeys; List *sort_pathkeys; @@ -703,8 +705,16 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) /* Tweak caller-supplied tuple_fraction if have LIMIT/OFFSET */ if (parse->limitCount || parse->limitOffset) + { tuple_fraction = preprocess_limit(root, tuple_fraction, &offset_est, &count_est); + /* + * If we have a known LIMIT, and don't have an unknown OFFSET, + * we can estimate the effects of using a bounded sort. + */ + if (count_est > 0 && offset_est >= 0) + limit_tuples = (double) count_est + (double) offset_est; + } if (parse->setOperations) { @@ -850,7 +860,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) * estimate the number of groups in the query, and canonicalize all * the pathkeys. */ - query_planner(root, sub_tlist, tuple_fraction, + query_planner(root, sub_tlist, tuple_fraction, limit_tuples, &cheapest_path, &sorted_path, &dNumGroups); group_pathkeys = root->group_pathkeys; @@ -864,7 +874,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) { groupOperators = extract_grouping_ops(parse->groupClause); use_hashed_grouping = - choose_hashed_grouping(root, tuple_fraction, + choose_hashed_grouping(root, tuple_fraction, limit_tuples, cheapest_path, sorted_path, groupOperators, dNumGroups, &agg_counts); @@ -1099,7 +1109,8 @@ grouping_planner(PlannerInfo *root, double tuple_fraction) { result_plan = (Plan *) make_sort_from_pathkeys(root, result_plan, - sort_pathkeys); + sort_pathkeys, + limit_tuples); current_pathkeys = sort_pathkeys; } } @@ -1414,7 +1425,8 @@ extract_grouping_ops(List *groupClause) * choose_hashed_grouping - should we use hashed grouping? 
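+ *
+ * Note that limit_tuples bounds only the final ORDER BY sort below the
+ * LIMIT; a sort that feeds grouping must consume every input row, so those
+ * cost_sort() calls pass -1 (see below).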
*/ static bool -choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, +choose_hashed_grouping(PlannerInfo *root, + double tuple_fraction, double limit_tuples, Path *cheapest_path, Path *sorted_path, Oid *groupOperators, double dNumGroups, AggClauseCounts *agg_counts) @@ -1499,7 +1511,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, /* Result of hashed agg is always unsorted */ if (root->sort_pathkeys) cost_sort(&hashed_p, root, root->sort_pathkeys, hashed_p.total_cost, - dNumGroups, cheapest_path_width); + dNumGroups, cheapest_path_width, limit_tuples); if (sorted_path) { @@ -1516,7 +1528,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, if (!pathkeys_contained_in(root->group_pathkeys, current_pathkeys)) { cost_sort(&sorted_p, root, root->group_pathkeys, sorted_p.total_cost, - cheapest_path_rows, cheapest_path_width); + cheapest_path_rows, cheapest_path_width, -1.0); current_pathkeys = root->group_pathkeys; } @@ -1533,7 +1545,7 @@ choose_hashed_grouping(PlannerInfo *root, double tuple_fraction, if (root->sort_pathkeys && !pathkeys_contained_in(root->sort_pathkeys, current_pathkeys)) cost_sort(&sorted_p, root, root->sort_pathkeys, sorted_p.total_cost, - dNumGroups, cheapest_path_width); + dNumGroups, cheapest_path_width, limit_tuples); /* * Now make the decision using the top-level tuple fraction. First we diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c index 6d440001d6b2a28a0f6080cbe6805c9c91d8e66c..bd95a0e0e232ae1663f52b3344d56bceaf5c89c0 100644 --- a/src/backend/optimizer/util/pathnode.c +++ b/src/backend/optimizer/util/pathnode.c @@ -8,7 +8,7 @@ * * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.139 2007/04/21 21:01:45 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/optimizer/util/pathnode.c,v 1.140 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -842,7 +842,8 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath) cost_sort(&sort_path, root, NIL, subpath->total_cost, rel->rows, - rel->width); + rel->width, + -1.0); /* * Charge one cpu_operator_cost per comparison per input tuple. We assume diff --git a/src/backend/utils/misc/guc.c b/src/backend/utils/misc/guc.c index a3a8f79a5d1299363f5bf1c001e5071c0e977be0..9f9d1e1bcbf9d5925ef0d61980345ac3d6ea75b8 100644 --- a/src/backend/utils/misc/guc.c +++ b/src/backend/utils/misc/guc.c @@ -10,7 +10,7 @@ * Written by Peter Eisentraut <peter_e@gmx.net>. 
* * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.389 2007/04/26 16:13:12 neilc Exp $ + * $PostgreSQL: pgsql/src/backend/utils/misc/guc.c,v 1.390 2007/05/04 01:13:44 tgl Exp $ * *-------------------------------------------------------------------- */ @@ -108,6 +108,9 @@ extern bool fullPageWrites; #ifdef TRACE_SORT extern bool trace_sort; #endif +#ifdef DEBUG_BOUNDED_SORT +extern bool optimize_bounded_sort; +#endif #ifdef USE_SSL extern char *SSLCipherSuites; @@ -966,6 +969,20 @@ static struct config_bool ConfigureNamesBool[] = }, #endif +#ifdef DEBUG_BOUNDED_SORT + /* this is undocumented because not exposed in a standard build */ + { + { + "optimize_bounded_sort", PGC_USERSET, QUERY_TUNING_METHOD, + gettext_noop("Enable bounded sorting using heap sort."), + NULL, + GUC_NOT_IN_SAMPLE + }, + &optimize_bounded_sort, + true, NULL, NULL + }, +#endif + #ifdef WAL_DEBUG { {"wal_debug", PGC_SUSET, DEVELOPER_OPTIONS, @@ -1711,7 +1728,7 @@ static struct config_int ConfigureNamesInt[] = &server_version_num, PG_VERSION_NUM, PG_VERSION_NUM, PG_VERSION_NUM, NULL, NULL }, - + { {"log_temp_files", PGC_USERSET, LOGGING_WHAT, gettext_noop("Log the use of temporary files larger than this number of kilobytes."), @@ -2883,7 +2900,7 @@ InitializeGUCOptions(void) PGC_S_DEFAULT)) elog(FATAL, "failed to initialize %s to %d", conf->gen.name, conf->boot_val); - *conf->variable = conf->reset_val = conf->boot_val; + *conf->variable = conf->reset_val = conf->boot_val; break; } case PGC_REAL: @@ -2897,7 +2914,7 @@ InitializeGUCOptions(void) PGC_S_DEFAULT)) elog(FATAL, "failed to initialize %s to %g", conf->gen.name, conf->boot_val); - *conf->variable = conf->reset_val = conf->boot_val; + *conf->variable = conf->reset_val = conf->boot_val; break; } case PGC_STRING: diff --git a/src/backend/utils/sort/tuplesort.c b/src/backend/utils/sort/tuplesort.c index 76c59a423821363adfdb97324814d4c74d4f0191..e4fc74b3e933b22d95c9c603d2272315fc5d14ff 100644 --- a/src/backend/utils/sort/tuplesort.c +++ b/src/backend/utils/sort/tuplesort.c @@ -91,13 +91,15 @@ * Portions Copyright (c) 1994, Regents of the University of California * * IDENTIFICATION - * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.74 2007/01/10 18:06:04 tgl Exp $ + * $PostgreSQL: pgsql/src/backend/utils/sort/tuplesort.c,v 1.75 2007/05/04 01:13:44 tgl Exp $ * *------------------------------------------------------------------------- */ #include "postgres.h" +#include <limits.h> + #include "access/heapam.h" #include "access/nbtree.h" #include "catalog/pg_amop.h" @@ -112,10 +114,13 @@ #include "utils/tuplesort.h" -/* GUC variable */ +/* GUC variables */ #ifdef TRACE_SORT bool trace_sort = false; #endif +#ifdef DEBUG_BOUNDED_SORT +bool optimize_bounded_sort = true; +#endif /* @@ -159,6 +164,7 @@ typedef struct typedef enum { TSS_INITIAL, /* Loading tuples; still within memory limit */ + TSS_BOUNDED, /* Loading tuples into bounded-size heap */ TSS_BUILDRUNS, /* Loading tuples; writing to tape */ TSS_SORTEDINMEM, /* Sort completed entirely in memory */ TSS_SORTEDONTAPE, /* Sort completed, final run is on tape */ @@ -188,6 +194,9 @@ struct Tuplesortstate TupSortStatus status; /* enumerated value as shown above */ int nKeys; /* number of columns in sort key */ bool randomAccess; /* did caller request random access? */ + bool bounded; /* did caller specify a maximum number of + * tuples to return? 
*/ + int bound; /* if bounded, the maximum number of tuples */ long availMem; /* remaining memory available, in bytes */ long allowedMem; /* total memory allowed, in bytes */ int maxTapes; /* number of tapes (Knuth's T) */ @@ -234,6 +243,13 @@ struct Tuplesortstate void (*readtup) (Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); + /* + * Function to reverse the sort direction from its current state. + * (We could dispense with this if we wanted to enforce that all variants + * represent the sort key information alike.) + */ + void (*reversedirection) (Tuplesortstate *state); + /* * This array holds the tuples now in sort memory. If we are in state * INITIAL, the tuples are in no particular order; if we are in state @@ -347,6 +363,7 @@ struct Tuplesortstate #define COPYTUP(state,stup,tup) ((*(state)->copytup) (state, stup, tup)) #define WRITETUP(state,tape,stup) ((*(state)->writetup) (state, tape, stup)) #define READTUP(state,stup,tape,len) ((*(state)->readtup) (state, stup, tape, len)) +#define REVERSEDIRECTION(state) ((*(state)->reversedirection) (state)) #define LACKMEM(state) ((state)->availMem < 0) #define USEMEM(state,amt) ((state)->availMem -= (amt)) #define FREEMEM(state,amt) ((state)->availMem += (amt)) @@ -403,6 +420,8 @@ static void beginmerge(Tuplesortstate *state); static void mergepreread(Tuplesortstate *state); static void mergeprereadone(Tuplesortstate *state, int srcTape); static void dumptuples(Tuplesortstate *state, bool alltuples); +static void make_bounded_heap(Tuplesortstate *state); +static void sort_bounded_heap(Tuplesortstate *state); static void tuplesort_heap_insert(Tuplesortstate *state, SortTuple *tuple, int tupleindex, bool checkIndex); static void tuplesort_heap_siftup(Tuplesortstate *state, bool checkIndex); @@ -415,6 +434,7 @@ static void writetup_heap(Tuplesortstate *state, int tapenum, SortTuple *stup); static void readtup_heap(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); +static void reversedirection_heap(Tuplesortstate *state); static int comparetup_index(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void copytup_index(Tuplesortstate *state, SortTuple *stup, void *tup); @@ -422,6 +442,7 @@ static void writetup_index(Tuplesortstate *state, int tapenum, SortTuple *stup); static void readtup_index(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); +static void reversedirection_index(Tuplesortstate *state); static int comparetup_datum(const SortTuple *a, const SortTuple *b, Tuplesortstate *state); static void copytup_datum(Tuplesortstate *state, SortTuple *stup, void *tup); @@ -429,6 +450,8 @@ static void writetup_datum(Tuplesortstate *state, int tapenum, SortTuple *stup); static void readtup_datum(Tuplesortstate *state, SortTuple *stup, int tapenum, unsigned int len); +static void reversedirection_datum(Tuplesortstate *state); +static void free_sort_tuple(Tuplesortstate *state, SortTuple *stup); /* @@ -538,6 +561,7 @@ tuplesort_begin_heap(TupleDesc tupDesc, state->copytup = copytup_heap; state->writetup = writetup_heap; state->readtup = readtup_heap; + state->reversedirection = reversedirection_heap; state->tupDesc = tupDesc; /* assume we need not copy tupDesc */ state->scanKeys = (ScanKey) palloc0(nkeys * sizeof(ScanKeyData)); @@ -601,6 +625,7 @@ tuplesort_begin_index(Relation indexRel, state->copytup = copytup_index; state->writetup = writetup_index; state->readtup = readtup_index; + state->reversedirection = reversedirection_index; state->indexRel = indexRel; 
/* see comments below about btree dependence of this code... */ @@ -639,6 +664,7 @@ tuplesort_begin_datum(Oid datumType, state->copytup = copytup_datum; state->writetup = writetup_datum; state->readtup = readtup_datum; + state->reversedirection = reversedirection_datum; state->datumType = datumType; @@ -664,6 +690,40 @@ tuplesort_begin_datum(Oid datumType, return state; } +/* + * tuplesort_set_bound + * + * Advise tuplesort that at most the first N result tuples are required. + * + * Must be called before inserting any tuples. (Actually, we could allow it + * as long as the sort hasn't spilled to disk, but there seems no need for + * delayed calls at the moment.) + * + * This is a hint only. The tuplesort may still return more tuples than + * requested. + */ +void +tuplesort_set_bound(Tuplesortstate *state, int64 bound) +{ + /* Assert we're called before loading any tuples */ + Assert(state->status == TSS_INITIAL); + Assert(state->memtupcount == 0); + Assert(!state->bounded); + +#ifdef DEBUG_BOUNDED_SORT + /* Honor GUC setting that disables the feature (for easy testing) */ + if (!optimize_bounded_sort) + return; +#endif + + /* We want to be able to compute bound * 2, so limit the setting */ + if (bound > (int64) (INT_MAX/2)) + return; + + state->bounded = true; + state->bound = (int) bound; +} + /* * tuplesort_end * @@ -862,6 +922,32 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) } state->memtuples[state->memtupcount++] = *tuple; + /* + * Check if it's time to switch over to a bounded heapsort. + * We do so if the input tuple count exceeds twice the desired + * tuple count (this is a heuristic for where heapsort becomes + * cheaper than a quicksort), or if we've just filled workMem + * and have enough tuples to meet the bound. + * + * Note that once we enter TSS_BOUNDED state we will always try + * to complete the sort that way. In the worst case, if later + * input tuples are larger than earlier ones, this might cause + * us to exceed workMem significantly. + */ + if (state->bounded && + (state->memtupcount > state->bound * 2 || + (state->memtupcount > state->bound && LACKMEM(state)))) + { +#ifdef TRACE_SORT + if (trace_sort) + elog(LOG, "switching to bounded heapsort at %d tuples: %s", + state->memtupcount, + pg_rusage_show(&state->ru_start)); +#endif + make_bounded_heap(state); + return; + } + /* * Done if we still fit in available memory and have array slots. */ @@ -878,6 +964,31 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) */ dumptuples(state, false); break; + + case TSS_BOUNDED: + /* + * We don't want to grow the array here, so check whether the + * new tuple can be discarded before putting it in. This should + * be a good speed optimization, too, since when there are many + * more input tuples than the bound, most input tuples can be + * discarded with just this one comparison. Note that because + * we currently have the sort direction reversed, we must check + * for <= not >=. 
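+			 * For example, with bound = 3 and an ascending sort whose
+			 * heap currently holds {5, 2, 4} (root = 5, the largest kept
+			 * value under the reversed direction), an incoming 7 compares
+			 * <= the root and is discarded immediately, while an incoming
+			 * 1 evicts the 5.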
+ */ + if (COMPARETUP(state, tuple, &state->memtuples[0]) <= 0) + { + /* new tuple <= top of the heap, so we can discard it */ + free_sort_tuple(state, tuple); + } + else + { + /* discard top of heap, sift up, insert new tuple */ + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_siftup(state, false); + tuplesort_heap_insert(state, tuple, 0, false); + } + break; + case TSS_BUILDRUNS: /* @@ -904,6 +1015,7 @@ puttuple_common(Tuplesortstate *state, SortTuple *tuple) */ dumptuples(state, false); break; + default: elog(ERROR, "invalid tuplesort state"); break; @@ -944,6 +1056,23 @@ tuplesort_performsort(Tuplesortstate *state) state->markpos_eof = false; state->status = TSS_SORTEDINMEM; break; + + case TSS_BOUNDED: + + /* + * We were able to accumulate all the tuples required for output + * in memory, using a heap to eliminate excess tuples. Now we have + * to transform the heap to a properly-sorted array. + */ + if (state->memtupcount > 1) + sort_bounded_heap(state); + state->current = 0; + state->eof_reached = false; + state->markpos_offset = 0; + state->markpos_eof = false; + state->status = TSS_SORTEDINMEM; + break; + case TSS_BUILDRUNS: /* @@ -959,6 +1088,7 @@ tuplesort_performsort(Tuplesortstate *state) state->markpos_offset = 0; state->markpos_eof = false; break; + default: elog(ERROR, "invalid tuplesort state"); break; @@ -1004,6 +1134,15 @@ tuplesort_gettuple_common(Tuplesortstate *state, bool forward, return true; } state->eof_reached = true; + + /* + * Complain if caller tries to retrieve more tuples than + * originally asked for in a bounded sort. This is because + * returning EOF here might be the wrong thing. + */ + if (state->bounded && state->current >= state->bound) + elog(ERROR, "retrieved too many tuples in a bounded sort"); + return false; } else @@ -1987,6 +2126,98 @@ tuplesort_restorepos(Tuplesortstate *state) ((tup1)->tupindex) - ((tup2)->tupindex) : \ COMPARETUP(state, tup1, tup2)) +/* + * Convert the existing unordered array of SortTuples to a bounded heap, + * discarding all but the smallest "state->bound" tuples. + * + * When working with a bounded heap, we want to keep the largest entry + * at the root (array entry zero), instead of the smallest as in the normal + * sort case. This allows us to discard the largest entry cheaply. + * Therefore, we temporarily reverse the sort direction. + * + * We assume that all entries in a bounded heap will always have tupindex + * zero; it therefore doesn't matter that HEAPCOMPARE() doesn't reverse + * the direction of comparison for tupindexes. 
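+ *
+ * For example, with bound = 2 over inputs {5, 1, 4, 2}: 5 and 1 are simply
+ * heapified (root = 5 under the reversed direction), then 4 evicts 5 and
+ * 2 evicts 4, leaving {1, 2} --- the two smallest inputs.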
+ */ +static void +make_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + int i; + + Assert(state->status == TSS_INITIAL); + Assert(state->bounded); + Assert(tupcount >= state->bound); + + /* Reverse sort direction so largest entry will be at root */ + REVERSEDIRECTION(state); + + state->memtupcount = 0; /* make the heap empty */ + for (i=0; i<tupcount; i++) + { + if (state->memtupcount >= state->bound && + COMPARETUP(state, &state->memtuples[i], &state->memtuples[0]) <= 0) + { + /* New tuple would just get thrown out, so skip it */ + free_sort_tuple(state, &state->memtuples[i]); + } + else + { + /* Insert next tuple into heap */ + /* Must copy source tuple to avoid possible overwrite */ + SortTuple stup = state->memtuples[i]; + + tuplesort_heap_insert(state, &stup, 0, false); + + /* If heap too full, discard largest entry */ + if (state->memtupcount > state->bound) + { + free_sort_tuple(state, &state->memtuples[0]); + tuplesort_heap_siftup(state, false); + } + } + } + + Assert(state->memtupcount == state->bound); + state->status = TSS_BOUNDED; +} + +/* + * Convert the bounded heap to a properly-sorted array + */ +static void +sort_bounded_heap(Tuplesortstate *state) +{ + int tupcount = state->memtupcount; + + Assert(state->status == TSS_BOUNDED); + Assert(state->bounded); + Assert(tupcount == state->bound); + + /* + * We can unheapify in place because each sift-up will remove the largest + * entry, which we can promptly store in the newly freed slot at the end. + * Once we're down to a single-entry heap, we're done. + */ + while (state->memtupcount > 1) + { + SortTuple stup = state->memtuples[0]; + + /* this sifts-up the next-largest entry and decreases memtupcount */ + tuplesort_heap_siftup(state, false); + state->memtuples[state->memtupcount] = stup; + } + state->memtupcount = tupcount; + + /* + * Reverse sort direction back to the original state. This is not + * actually necessary but seems like a good idea for tidiness. + */ + REVERSEDIRECTION(state); + + state->status = TSS_SORTEDINMEM; +} + /* * Insert a new tuple into an empty or existing heap, maintaining the * heap invariant. Caller is responsible for ensuring there's room. 
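The heap manipulations above are easier to see in miniature. The program below is not part of the patch: it reproduces the same discard-or-replace technique over plain ints with a hand-rolled max-heap (all names here are invented for the example), keeping the K smallest values of a stream just as TSS_BOUNDED keeps the first `bound` output tuples.

    #include <stdio.h>

    #define K 3

    /* Max-heap of at most K ints: the root is the largest value kept. */
    static int heap[K];
    static int nheap = 0;

    static void
    swap_entries(int i, int j)
    {
        int tmp = heap[i];

        heap[i] = heap[j];
        heap[j] = tmp;
    }

    static void
    sift_down(int i)
    {
        for (;;)
        {
            int largest = i,
                l = 2 * i + 1,
                r = 2 * i + 2;

            if (l < nheap && heap[l] > heap[largest])
                largest = l;
            if (r < nheap && heap[r] > heap[largest])
                largest = r;
            if (largest == i)
                break;
            swap_entries(i, largest);
            i = largest;
        }
    }

    /* Keep the K smallest values seen so far, as TSS_BOUNDED does. */
    static void
    top_k_add(int v)
    {
        if (nheap < K)
        {
            int i = nheap++;

            heap[i] = v;            /* append, then sift up */
            while (i > 0 && heap[(i - 1) / 2] < heap[i])
            {
                swap_entries(i, (i - 1) / 2);
                i = (i - 1) / 2;
            }
        }
        else if (v < heap[0])
        {
            heap[0] = v;            /* evict the largest kept value */
            sift_down(0);
        }
        /* else: discard v after a single comparison with the root */
    }

    int
    main(void)
    {
        int input[] = {9, 1, 8, 3, 7, 2, 6};
        int i;

        for (i = 0; i < 7; i++)
            top_k_add(input[i]);
        for (i = 0; i < nheap; i++)
            printf("%d\n", heap[i]);    /* 3 1 2: the three smallest */
        return 0;
    }

Note the parallel to puttuple_common(): once the heap is full, a value no better than the root costs exactly one comparison, which is why most inputs are rejected cheaply when the stream is much longer than K.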
@@ -2322,6 +2553,18 @@ readtup_heap(Tuplesortstate *state, SortTuple *stup, &stup->isnull1); } +static void +reversedirection_heap(Tuplesortstate *state) +{ + ScanKey scanKey = state->scanKeys; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++) + { + scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST); + } +} + /* * Routines specialized for IndexTuple case @@ -2497,6 +2740,18 @@ readtup_index(Tuplesortstate *state, SortTuple *stup, &stup->isnull1); } +static void +reversedirection_index(Tuplesortstate *state) +{ + ScanKey scanKey = state->indexScanKey; + int nkey; + + for (nkey = 0; nkey < state->nKeys; nkey++, scanKey++) + { + scanKey->sk_flags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST); + } +} + /* * Routines specialized for DatumTuple case @@ -2601,3 +2856,19 @@ readtup_datum(Tuplesortstate *state, SortTuple *stup, sizeof(tuplen)) != sizeof(tuplen)) elog(ERROR, "unexpected end of data"); } + +static void +reversedirection_datum(Tuplesortstate *state) +{ + state->sortFnFlags ^= (SK_BT_DESC | SK_BT_NULLS_FIRST); +} + +/* + * Convenience routine to free a tuple previously loaded into sort memory + */ +static void +free_sort_tuple(Tuplesortstate *state, SortTuple *stup) +{ + FREEMEM(state, GetMemoryChunkSpace(stup->tuple)); + pfree(stup->tuple); +} diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h index 726ee5bdae6845b1ab28e876f32ce6669d4336a3..12219b883ebc8886bf04b606e069732250fafa58 100644 --- a/src/include/nodes/execnodes.h +++ b/src/include/nodes/execnodes.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.172 2007/04/26 23:24:44 tgl Exp $ + * $PostgreSQL: pgsql/src/include/nodes/execnodes.h,v 1.173 2007/05/04 01:13:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -1294,7 +1294,11 @@ typedef struct SortState { ScanState ss; /* its first field is NodeTag */ bool randomAccess; /* need random access to sort output? */ + bool bounded; /* is the result set bounded? */ + int64 bound; /* if bounded, how many tuples are needed */ bool sort_Done; /* sort completed yet? 
*/ + bool bounded_Done; /* value of bounded we did the sort with */ + int64 bound_Done; /* value of bound we did the sort with */ void *tuplesortstate; /* private state of tuplesort.c */ } SortState; diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h index ef92d685eb4a433ef75bd77e5a7fbd9448bacef3..641555b70686e6e4a4049c8eda8379375a539c42 100644 --- a/src/include/optimizer/cost.h +++ b/src/include/optimizer/cost.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.85 2007/02/22 22:00:26 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/cost.h,v 1.86 2007/05/04 01:13:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -73,7 +73,8 @@ extern void cost_functionscan(Path *path, PlannerInfo *root, extern void cost_valuesscan(Path *path, PlannerInfo *root, RelOptInfo *baserel); extern void cost_sort(Path *path, PlannerInfo *root, - List *pathkeys, Cost input_cost, double tuples, int width); + List *pathkeys, Cost input_cost, double tuples, int width, + double limit_tuples); extern void cost_material(Path *path, Cost input_cost, double tuples, int width); extern void cost_agg(Path *path, PlannerInfo *root, diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h index b5ad4b3c3b503a5ae93db14f8a613477963e1505..319c38c66b50fe2b0627eda46605b3a92233a137 100644 --- a/src/include/optimizer/planmain.h +++ b/src/include/optimizer/planmain.h @@ -7,7 +7,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.100 2007/02/22 22:00:26 tgl Exp $ + * $PostgreSQL: pgsql/src/include/optimizer/planmain.h,v 1.101 2007/05/04 01:13:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -21,7 +21,7 @@ * prototypes for plan/planmain.c */ extern void query_planner(PlannerInfo *root, List *tlist, - double tuple_fraction, + double tuple_fraction, double limit_tuples, Path **cheapest_path, Path **sorted_path, double *num_groups); @@ -39,7 +39,7 @@ extern SubqueryScan *make_subqueryscan(List *qptlist, List *qpqual, Index scanrelid, Plan *subplan, List *subrtable); extern Append *make_append(List *appendplans, bool isTarget, List *tlist); extern Sort *make_sort_from_pathkeys(PlannerInfo *root, Plan *lefttree, - List *pathkeys); + List *pathkeys, double limit_tuples); extern Sort *make_sort_from_sortclauses(PlannerInfo *root, List *sortcls, Plan *lefttree); extern Sort *make_sort_from_groupcols(PlannerInfo *root, List *groupcls, diff --git a/src/include/utils/tuplesort.h b/src/include/utils/tuplesort.h index cea50b4836ba1a4fd7850ba79ef68824caae5b52..c736896a9a984c78bb552e5319568e0dbb84f2a9 100644 --- a/src/include/utils/tuplesort.h +++ b/src/include/utils/tuplesort.h @@ -13,7 +13,7 @@ * Portions Copyright (c) 1996-2007, PostgreSQL Global Development Group * Portions Copyright (c) 1994, Regents of the University of California * - * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.25 2007/01/09 02:14:16 tgl Exp $ + * $PostgreSQL: pgsql/src/include/utils/tuplesort.h,v 1.26 2007/05/04 01:13:45 tgl Exp $ * *------------------------------------------------------------------------- */ @@ -55,6 +55,8 @@ extern Tuplesortstate *tuplesort_begin_datum(Oid datumType, Oid sortOperator, bool 
nullsFirstFlag, int workMem, bool randomAccess); +extern void tuplesort_set_bound(Tuplesortstate *state, int64 bound); + extern void tuplesort_puttupleslot(Tuplesortstate *state, TupleTableSlot *slot); extern void tuplesort_putindextuple(Tuplesortstate *state, IndexTuple tuple);
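Taken together, the new entry point slots into a caller the way ExecSort() uses it in the nodeSort.c hunk above. The fragment below is a sketch, not executor code from the patch: the surrounding function, the reuse of one slot for input and output, and the bound of 30 (as from LIMIT 10 OFFSET 20) are all invented for illustration, but the tuplesort calls are the ones declared in this header.

    /*
     * Hypothetical caller: sort the outer node's output, keeping only the
     * first 30 tuples (e.g., LIMIT 10 OFFSET 20 sitting above the sort).
     */
    static void
    bounded_sort_example(TupleDesc tupDesc, int nkeys, AttrNumber *attNums,
                         Oid *sortOperators, bool *nullsFirst,
                         PlanState *outerNode, TupleTableSlot *slot)
    {
        Tuplesortstate *ts;

        ts = tuplesort_begin_heap(tupDesc, nkeys, attNums,
                                  sortOperators, nullsFirst,
                                  work_mem, false);  /* no random access */
        tuplesort_set_bound(ts, 30);    /* must precede the first insert */

        for (;;)
        {
            slot = ExecProcNode(outerNode);
            if (TupIsNull(slot))
                break;
            tuplesort_puttupleslot(ts, slot);   /* may enter TSS_BOUNDED */
        }

        tuplesort_performsort(ts);  /* unheapifies via sort_bounded_heap() */

        while (tuplesort_gettupleslot(ts, true, slot))
            ;                       /* at most 30 tuples come back */

        tuplesort_end(ts);
    }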