diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index d3455228660e43962fe451eab03e310947c66065..c4404b1bd1712548071d1d7b80b72e65dd45d77b 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -1335,25 +1335,40 @@ cost_material(Path *path,
  *		Determines and returns the cost of performing an Agg plan node,
  *		including the cost of its input.
  *
+ * aggcosts can be NULL when there are no actual aggregate functions (i.e.,
+ * we are using a hashed Agg node just to do grouping).
+ *
  * Note: when aggstrategy == AGG_SORTED, caller must ensure that input costs
  * are for appropriately-sorted input.
  */
 void
 cost_agg(Path *path, PlannerInfo *root,
-		 AggStrategy aggstrategy, int numAggs,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, double numGroups,
 		 Cost input_startup_cost, Cost input_total_cost,
 		 double input_tuples)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
+	AggClauseCosts dummy_aggcosts;
+
+	/* Use all-zero per-aggregate costs if NULL is passed */
+	if (aggcosts == NULL)
+	{
+		Assert(aggstrategy == AGG_HASHED);
+		MemSet(&dummy_aggcosts, 0, sizeof(AggClauseCosts));
+		aggcosts = &dummy_aggcosts;
+	}
 
 	/*
-	 * We charge one cpu_operator_cost per aggregate function per input tuple,
-	 * and another one per output tuple (corresponding to transfn and finalfn
-	 * calls respectively).  If we are grouping, we charge an additional
-	 * cpu_operator_cost per grouping column per input tuple for grouping
-	 * comparisons.
+	 * The transCost.per_tuple component of aggcosts should be charged once
+	 * per input tuple, corresponding to the costs of evaluating the aggregate
+	 * transfns and their input expressions (with any startup cost of course
+	 * charged but once).  The finalCost component is charged once per output
+	 * tuple, corresponding to the costs of evaluating the finalfns.
+	 *
+	 * If we are grouping, we charge an additional cpu_operator_cost per
+	 * grouping column per input tuple for grouping comparisons.
 	 *
 	 * We will produce a single output tuple if not grouping, and a tuple per
 	 * group otherwise.  We charge cpu_tuple_cost for each output tuple.
@@ -1366,15 +1381,13 @@ cost_agg(Path *path, PlannerInfo *root,
 	 * there's roundoff error we might do the wrong thing.  So be sure that
 	 * the computations below form the same intermediate values in the same
 	 * order.
-	 *
-	 * Note: ideally we should use the pg_proc.procost costs of each
-	 * aggregate's component functions, but for now that seems like an
-	 * excessive amount of work.
 	 */
 	if (aggstrategy == AGG_PLAIN)
 	{
 		startup_cost = input_total_cost;
-		startup_cost += cpu_operator_cost * (input_tuples + 1) * numAggs;
+		startup_cost += aggcosts->transCost.startup;
+		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+		startup_cost += aggcosts->finalCost;
 		/* we aren't grouping */
 		total_cost = startup_cost + cpu_tuple_cost;
 	}
@@ -1384,19 +1397,21 @@ cost_agg(Path *path, PlannerInfo *root,
 		startup_cost = input_startup_cost;
 		total_cost = input_total_cost;
 		/* calcs phrased this way to match HASHED case, see note above */
-		total_cost += cpu_operator_cost * input_tuples * numGroupCols;
-		total_cost += cpu_operator_cost * input_tuples * numAggs;
-		total_cost += cpu_operator_cost * numGroups * numAggs;
+		total_cost += aggcosts->transCost.startup;
+		total_cost += aggcosts->transCost.per_tuple * input_tuples;
+		total_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
+		total_cost += aggcosts->finalCost * numGroups;
 		total_cost += cpu_tuple_cost * numGroups;
 	}
 	else
 	{
 		/* must be AGG_HASHED */
 		startup_cost = input_total_cost;
-		startup_cost += cpu_operator_cost * input_tuples * numGroupCols;
-		startup_cost += cpu_operator_cost * input_tuples * numAggs;
+		startup_cost += aggcosts->transCost.startup;
+		startup_cost += aggcosts->transCost.per_tuple * input_tuples;
+		startup_cost += (cpu_operator_cost * numGroupCols) * input_tuples;
 		total_cost = startup_cost;
-		total_cost += cpu_operator_cost * numGroups * numAggs;
+		total_cost += aggcosts->finalCost * numGroups;
 		total_cost += cpu_tuple_cost * numGroups;
 	}
 
@@ -1413,25 +1428,53 @@ cost_agg(Path *path, PlannerInfo *root,
  */
 void
 cost_windowagg(Path *path, PlannerInfo *root,
-			   int numWindowFuncs, int numPartCols, int numOrderCols,
+			   List *windowFuncs, int numPartCols, int numOrderCols,
 			   Cost input_startup_cost, Cost input_total_cost,
 			   double input_tuples)
 {
 	Cost		startup_cost;
 	Cost		total_cost;
+	ListCell   *lc;
 
 	startup_cost = input_startup_cost;
 	total_cost = input_total_cost;
 
 	/*
-	 * We charge one cpu_operator_cost per window function per tuple (often a
-	 * drastic underestimate, but without a way to gauge how many tuples the
-	 * window function will fetch, it's hard to do better).  We also charge
-	 * cpu_operator_cost per grouping column per tuple for grouping
-	 * comparisons, plus cpu_tuple_cost per tuple for general overhead.
-	 */
-	total_cost += cpu_operator_cost * input_tuples * numWindowFuncs;
-	total_cost += cpu_operator_cost * input_tuples * (numPartCols + numOrderCols);
+	 * Window functions are assumed to cost their stated execution cost, plus
+	 * the cost of evaluating their input expressions, per tuple.  Since they
+	 * may in fact evaluate their inputs at multiple rows during each cycle,
+	 * this could be a drastic underestimate; but without a way to know how
+	 * many rows the window function will fetch, it's hard to do better.  In
+	 * any case, it's a good estimate for all the built-in window functions,
+	 * so we'll just do this for now.
+	 */
+	foreach(lc, windowFuncs)
+	{
+		WindowFunc *wfunc = (WindowFunc *) lfirst(lc);
+		Cost		wfunccost;
+		QualCost	argcosts;
+
+		Assert(IsA(wfunc, WindowFunc));
+
+		wfunccost = get_func_cost(wfunc->winfnoid) * cpu_operator_cost;
+
+		/* also add the input expressions' cost to per-input-row costs */
+		cost_qual_eval_node(&argcosts, (Node *) wfunc->args, root);
+		startup_cost += argcosts.startup;
+		wfunccost += argcosts.per_tuple;
+
+		total_cost += wfunccost * input_tuples;
+	}
+
+	/*
+	 * We also charge cpu_operator_cost per grouping column per tuple for
+	 * grouping comparisons, plus cpu_tuple_cost per tuple for general
+	 * overhead.
+	 *
+	 * XXX this neglects costs of spooling the data to disk when it overflows
+	 * work_mem.  Sooner or later that should get accounted for.
+	 */
+	total_cost += cpu_operator_cost * (numPartCols + numOrderCols) * input_tuples;
 	total_cost += cpu_tuple_cost * input_tuples;
 
 	path->startup_cost = startup_cost;
@@ -2640,17 +2683,12 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 	 * Vars and Consts are charged zero, and so are boolean operators (AND,
 	 * OR, NOT). Simplistic, but a lot better than no model at all.
 	 *
-	 * Note that Aggref and WindowFunc nodes are (and should be) treated like
-	 * Vars --- whatever execution cost they have is absorbed into
-	 * plan-node-specific costing.	As far as expression evaluation is
-	 * concerned they're just like Vars.
-	 *
 	 * Should we try to account for the possibility of short-circuit
 	 * evaluation of AND/OR?  Probably *not*, because that would make the
 	 * results depend on the clause ordering, and we are not in any position
 	 * to expect that the current ordering of the clauses is the one that's
-	 * going to end up being used.	(Is it worth applying order_qual_clauses
-	 * much earlier in the planning process to fix this?)
+	 * going to end up being used.  The above per-RestrictInfo caching would
+	 * not mix well with trying to re-order clauses anyway.
 	 */
 	if (IsA(node, FuncExpr))
 	{
@@ -2679,6 +2717,20 @@ cost_qual_eval_walker(Node *node, cost_qual_eval_context *context)
 		context->total.per_tuple += get_func_cost(saop->opfuncid) *
 			cpu_operator_cost * estimate_array_length(arraynode) * 0.5;
 	}
+	else if (IsA(node, Aggref) ||
+			 IsA(node, WindowFunc))
+	{
+		/*
+		 * Aggref and WindowFunc nodes are (and should be) treated like Vars,
+		 * ie, zero execution cost in the current model, because they behave
+		 * essentially like Vars in execQual.c.  We disregard the costs of
+		 * their input expressions for the same reason.  The actual execution
+		 * costs of the aggregate/window functions and their arguments have to
+		 * be factored into plan-node-specific costing of the Agg or WindowAgg
+		 * plan node.
+		 */
+		return false;			/* don't recurse into children */
+	}
 	else if (IsA(node, CoerceViaIO))
 	{
 		CoerceViaIO *iocoerce = (CoerceViaIO *) node;
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index bbff4f26f8d6b93456eb3ad5175cbba1f65353ad..ac8051147852345a54702ff31b4ef9017c30679b 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -939,11 +939,11 @@ create_unique_plan(PlannerInfo *root, UniquePath *best_path)
 								 build_relation_tlist(best_path->path.parent),
 								 NIL,
 								 AGG_HASHED,
+								 NULL,
 								 numGroupCols,
 								 groupColIdx,
 								 groupOperators,
 								 numGroups,
-								 0,
 								 subplan);
 	}
 	else
@@ -3841,9 +3841,9 @@ materialize_finished_plan(Plan *subplan)
 
 Agg *
 make_agg(PlannerInfo *root, List *tlist, List *qual,
-		 AggStrategy aggstrategy,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators,
-		 long numGroups, int numAggs,
+		 long numGroups,
 		 Plan *lefttree)
 {
 	Agg		   *node = makeNode(Agg);
@@ -3859,7 +3859,7 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,
 
 	copy_plan_costsize(plan, lefttree); /* only care about copying size */
 	cost_agg(&agg_path, root,
-			 aggstrategy, numAggs,
+			 aggstrategy, aggcosts,
 			 numGroupCols, numGroups,
 			 lefttree->startup_cost,
 			 lefttree->total_cost,
@@ -3907,7 +3907,7 @@ make_agg(PlannerInfo *root, List *tlist, List *qual,
 
 WindowAgg *
 make_windowagg(PlannerInfo *root, List *tlist,
-			   int numWindowFuncs, Index winref,
+			   List *windowFuncs, Index winref,
 			   int partNumCols, AttrNumber *partColIdx, Oid *partOperators,
 			   int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators,
 			   int frameOptions, Node *startOffset, Node *endOffset,
@@ -3931,7 +3931,7 @@ make_windowagg(PlannerInfo *root, List *tlist,
 
 	copy_plan_costsize(plan, lefttree); /* only care about copying size */
 	cost_windowagg(&windowagg_path, root,
-				   numWindowFuncs, partNumCols, ordNumCols,
+				   windowFuncs, partNumCols, ordNumCols,
 				   lefttree->startup_cost,
 				   lefttree->total_cost,
 				   lefttree->plan_rows);
diff --git a/src/backend/optimizer/plan/planagg.c b/src/backend/optimizer/plan/planagg.c
index 7fce92c2f1561b762e77511dd83f8550032aca21..2f5955706a04cf9b93f37dbc7c56117c44d237c6 100644
--- a/src/backend/optimizer/plan/planagg.c
+++ b/src/backend/optimizer/plan/planagg.c
@@ -187,11 +187,13 @@ preprocess_minmax_aggregates(PlannerInfo *root, List *tlist)
  * Should we skip even trying to build the standard plan, if
  * preprocess_minmax_aggregates succeeds?
  *
- * We are passed the preprocessed tlist, as well as the best path devised for
+ * We are passed the preprocessed tlist, as well as the estimated costs for
+ * doing the aggregates the regular way, and the best path devised for
  * computing the input of a standard Agg node.
  */
 Plan *
-optimize_minmax_aggregates(PlannerInfo *root, List *tlist, Path *best_path)
+optimize_minmax_aggregates(PlannerInfo *root, List *tlist,
+						   const AggClauseCosts *aggcosts, Path *best_path)
 {
 	Query	   *parse = root->parse;
 	Cost		total_cost;
@@ -221,7 +223,7 @@ optimize_minmax_aggregates(PlannerInfo *root, List *tlist, Path *best_path)
 		total_cost += mminfo->pathcost;
 	}
 
-	cost_agg(&agg_p, root, AGG_PLAIN, list_length(root->minmax_aggs),
+	cost_agg(&agg_p, root, AGG_PLAIN, aggcosts,
 			 0, 0,
 			 best_path->startup_cost, best_path->total_cost,
 			 best_path->parent->rows);
diff --git a/src/backend/optimizer/plan/planner.c b/src/backend/optimizer/plan/planner.c
index 4b0b633c03b14253f386fbb6b1cafbdd319725d9..7b2b40f62984fdf93d23ab01b70cfdf23a3a9f61 100644
--- a/src/backend/optimizer/plan/planner.c
+++ b/src/backend/optimizer/plan/planner.c
@@ -74,7 +74,7 @@ static bool choose_hashed_grouping(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   double path_rows, int path_width,
 					   Path *cheapest_path, Path *sorted_path,
-					   double dNumGroups, AggClauseCounts *agg_counts);
+					   double dNumGroups, AggClauseCosts *agg_costs);
 static bool choose_hashed_distinct(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   double path_rows, int path_width,
@@ -979,7 +979,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		Path	   *sorted_path;
 		Path	   *best_path;
 		long		numGroups = 0;
-		AggClauseCounts agg_counts;
+		AggClauseCosts agg_costs;
 		int			numGroupCols;
 		double		path_rows;
 		int			path_width;
@@ -987,7 +987,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		WindowFuncLists *wflists = NULL;
 		List	   *activeWindows = NIL;
 
-		MemSet(&agg_counts, 0, sizeof(AggClauseCounts));
+		MemSet(&agg_costs, 0, sizeof(AggClauseCosts));
 
 		/* A recursive query should always have setOperations */
 		Assert(!root->hasRecursion);
@@ -1034,12 +1034,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		if (parse->hasAggs)
 		{
 			/*
-			 * Will need actual number of aggregates for estimating costs.
+			 * Collect statistics about aggregates for estimating costs.
 			 * Note: we do not attempt to detect duplicate aggregates here; a
-			 * somewhat-overestimated count is okay for our present purposes.
+			 * somewhat-overestimated cost is okay for our present purposes.
 			 */
-			count_agg_clauses((Node *) tlist, &agg_counts);
-			count_agg_clauses(parse->havingQual, &agg_counts);
+			count_agg_clauses(root, (Node *) tlist, &agg_costs);
+			count_agg_clauses(root, parse->havingQual, &agg_costs);
 
 			/*
 			 * Preprocess MIN/MAX aggregates, if any.  Note: be careful about
@@ -1176,7 +1176,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 									   tuple_fraction, limit_tuples,
 									   path_rows, path_width,
 									   cheapest_path, sorted_path,
-									   dNumGroups, &agg_counts);
+									   dNumGroups, &agg_costs);
 			/* Also convert # groups to long int --- but 'ware overflow! */
 			numGroups = (long) Min(dNumGroups, (double) LONG_MAX);
 		}
@@ -1219,6 +1219,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 		 */
 		result_plan = optimize_minmax_aggregates(root,
 												 tlist,
+												 &agg_costs,
 												 best_path);
 		if (result_plan != NULL)
 		{
@@ -1330,11 +1331,11 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 												tlist,
 												(List *) parse->havingQual,
 												AGG_HASHED,
+												&agg_costs,
 												numGroupCols,
 												groupColIdx,
 									extract_grouping_ops(parse->groupClause),
 												numGroups,
-												agg_counts.numAggs,
 												result_plan);
 				/* Hashed aggregation produces randomly-ordered results */
 				current_pathkeys = NIL;
@@ -1373,11 +1374,11 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 												tlist,
 												(List *) parse->havingQual,
 												aggstrategy,
+												&agg_costs,
 												numGroupCols,
 												groupColIdx,
 									extract_grouping_ops(parse->groupClause),
 												numGroups,
-												agg_counts.numAggs,
 												result_plan);
 			}
 			else if (parse->groupClause)
@@ -1559,7 +1560,7 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 				result_plan = (Plan *)
 					make_windowagg(root,
 								   (List *) copyObject(window_tlist),
-							   list_length(wflists->windowFuncs[wc->winref]),
+								   wflists->windowFuncs[wc->winref],
 								   wc->winref,
 								   partNumCols,
 								   partColIdx,
@@ -1625,12 +1626,12 @@ grouping_planner(PlannerInfo *root, double tuple_fraction)
 											result_plan->targetlist,
 											NIL,
 											AGG_HASHED,
+											NULL,
 										  list_length(parse->distinctClause),
 								 extract_grouping_cols(parse->distinctClause,
 													result_plan->targetlist),
 								 extract_grouping_ops(parse->distinctClause),
 											numDistinctRows,
-											0,
 											result_plan);
 			/* Hashed aggregation produces randomly-ordered results */
 			current_pathkeys = NIL;
@@ -2213,7 +2214,7 @@ choose_hashed_grouping(PlannerInfo *root,
 					   double tuple_fraction, double limit_tuples,
 					   double path_rows, int path_width,
 					   Path *cheapest_path, Path *sorted_path,
-					   double dNumGroups, AggClauseCounts *agg_counts)
+					   double dNumGroups, AggClauseCosts *agg_costs)
 {
 	Query	   *parse = root->parse;
 	int			numGroupCols = list_length(parse->groupClause);
@@ -2231,7 +2232,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	 * the hash table, and/or running many sorts in parallel, either of which
 	 * seems like a certain loser.)
 	 */
-	can_hash = (agg_counts->numOrderedAggs == 0 &&
+	can_hash = (agg_costs->numOrderedAggs == 0 &&
 				grouping_is_hashable(parse->groupClause));
 	can_sort = grouping_is_sortable(parse->groupClause);
 
@@ -2261,9 +2262,9 @@ choose_hashed_grouping(PlannerInfo *root,
 	/* Estimate per-hash-entry space at tuple width... */
 	hashentrysize = MAXALIGN(path_width) + MAXALIGN(sizeof(MinimalTupleData));
 	/* plus space for pass-by-ref transition values... */
-	hashentrysize += agg_counts->transitionSpace;
+	hashentrysize += agg_costs->transitionSpace;
 	/* plus the per-hash-entry overhead */
-	hashentrysize += hash_agg_entry_size(agg_counts->numAggs);
+	hashentrysize += hash_agg_entry_size(agg_costs->numAggs);
 
 	if (hashentrysize * dNumGroups > work_mem * 1024L)
 		return false;
@@ -2297,7 +2298,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
-	cost_agg(&hashed_p, root, AGG_HASHED, agg_counts->numAggs,
+	cost_agg(&hashed_p, root, AGG_HASHED, agg_costs,
 			 numGroupCols, dNumGroups,
 			 cheapest_path->startup_cost, cheapest_path->total_cost,
 			 path_rows);
@@ -2328,7 +2329,7 @@ choose_hashed_grouping(PlannerInfo *root,
 	}
 
 	if (parse->hasAggs)
-		cost_agg(&sorted_p, root, AGG_SORTED, agg_counts->numAggs,
+		cost_agg(&sorted_p, root, AGG_SORTED, agg_costs,
 				 numGroupCols, dNumGroups,
 				 sorted_p.startup_cost, sorted_p.total_cost,
 				 path_rows);
@@ -2447,7 +2448,7 @@ choose_hashed_distinct(PlannerInfo *root,
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
-	cost_agg(&hashed_p, root, AGG_HASHED, 0,
+	cost_agg(&hashed_p, root, AGG_HASHED, NULL,
 			 numDistinctCols, dNumDistinctRows,
 			 cheapest_startup_cost, cheapest_total_cost,
 			 path_rows);
diff --git a/src/backend/optimizer/prep/prepunion.c b/src/backend/optimizer/prep/prepunion.c
index 76adb7cdaec3c20b7e372a29440c9fff5736539a..fcff015cdffc1d3833a85d8279790fbd102391bd 100644
--- a/src/backend/optimizer/prep/prepunion.c
+++ b/src/backend/optimizer/prep/prepunion.c
@@ -732,12 +732,12 @@ make_union_unique(SetOperationStmt *op, Plan *plan,
 								 plan->targetlist,
 								 NIL,
 								 AGG_HASHED,
+								 NULL,
 								 list_length(groupList),
 								 extract_grouping_cols(groupList,
 													   plan->targetlist),
 								 extract_grouping_ops(groupList),
 								 numGroups,
-								 0,
 								 plan);
 		/* Hashed aggregation produces randomly-ordered results */
 		*sortClauses = NIL;
@@ -814,7 +814,7 @@ choose_hashed_setop(PlannerInfo *root, List *groupClauses,
 	 * These path variables are dummies that just hold cost fields; we don't
 	 * make actual Paths for these steps.
 	 */
-	cost_agg(&hashed_p, root, AGG_HASHED, 0,
+	cost_agg(&hashed_p, root, AGG_HASHED, NULL,
 			 numGroupCols, dNumGroups,
 			 input_plan->startup_cost, input_plan->total_cost,
 			 input_plan->plan_rows);
diff --git a/src/backend/optimizer/util/clauses.c b/src/backend/optimizer/util/clauses.c
index b3c2aec97b00f64e607536b94363f778a055bd23..8b0d8623db6d775000f65da7579a70075456662d 100644
--- a/src/backend/optimizer/util/clauses.c
+++ b/src/backend/optimizer/util/clauses.c
@@ -48,6 +48,12 @@
 #include "utils/typcache.h"
 
 
+typedef struct
+{
+	PlannerInfo *root;
+	AggClauseCosts *costs;
+} count_agg_clauses_context;
+
 typedef struct
 {
 	ParamListInfo boundParams;
@@ -79,7 +85,8 @@ typedef struct
 
 static bool contain_agg_clause_walker(Node *node, void *context);
 static bool pull_agg_clause_walker(Node *node, List **context);
-static bool count_agg_clauses_walker(Node *node, AggClauseCounts *counts);
+static bool count_agg_clauses_walker(Node *node,
+									 count_agg_clauses_context *context);
 static bool find_window_functions_walker(Node *node, WindowFuncLists *lists);
 static bool expression_returns_set_rows_walker(Node *node, double *count);
 static bool contain_subplans_walker(Node *node, void *context);
@@ -448,48 +455,80 @@ pull_agg_clause_walker(Node *node, List **context)
 
 /*
  * count_agg_clauses
- *	  Recursively count the Aggref nodes in an expression tree.
+ *	  Recursively count the Aggref nodes in an expression tree, and
+ *	  accumulate other cost information about them too.
  *
  *	  Note: this also checks for nested aggregates, which are an error.
  *
- * We not only count the nodes, but attempt to estimate the total space
- * needed for their transition state values if all are evaluated in parallel
- * (as would be done in a HashAgg plan).  See AggClauseCounts for the exact
- * set of statistics returned.
+ * We not only count the nodes, but estimate their execution costs, and
+ * attempt to estimate the total space needed for their transition state
+ * values if all are evaluated in parallel (as would be done in a HashAgg
+ * plan).  See AggClauseCosts for the exact set of statistics collected.
  *
- * NOTE that the counts are ADDED to those already in *counts ... so the
- * caller is responsible for zeroing the struct initially.
+ * NOTE that the counts/costs are ADDED to those already in *costs ... so
+ * the caller is responsible for zeroing the struct initially.
  *
  * This does not descend into subqueries, and so should be used only after
  * reduction of sublinks to subplans, or in contexts where it's known there
  * are no subqueries.  There mustn't be outer-aggregate references either.
  */
 void
-count_agg_clauses(Node *clause, AggClauseCounts *counts)
+count_agg_clauses(PlannerInfo *root, Node *clause, AggClauseCosts *costs)
 {
-	/* no setup needed */
-	count_agg_clauses_walker(clause, counts);
+	count_agg_clauses_context context;
+
+	context.root = root;
+	context.costs = costs;
+	(void) count_agg_clauses_walker(clause, &context);
 }
 
 static bool
-count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
+count_agg_clauses_walker(Node *node, count_agg_clauses_context *context)
 {
 	if (node == NULL)
 		return false;
 	if (IsA(node, Aggref))
 	{
 		Aggref	   *aggref = (Aggref *) node;
-		Oid		   *inputTypes;
-		int			numArguments;
+		AggClauseCosts *costs = context->costs;
 		HeapTuple	aggTuple;
 		Form_pg_aggregate aggform;
+		Oid			aggtransfn;
+		Oid			aggfinalfn;
 		Oid			aggtranstype;
+		QualCost	argcosts;
+		Oid		   *inputTypes;
+		int			numArguments;
 		ListCell   *l;
 
 		Assert(aggref->agglevelsup == 0);
-		counts->numAggs++;
+
+		/* fetch info about aggregate from pg_aggregate */
+		aggTuple = SearchSysCache1(AGGFNOID,
+								   ObjectIdGetDatum(aggref->aggfnoid));
+		if (!HeapTupleIsValid(aggTuple))
+			elog(ERROR, "cache lookup failed for aggregate %u",
+				 aggref->aggfnoid);
+		aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
+		aggtransfn = aggform->aggtransfn;
+		aggfinalfn = aggform->aggfinalfn;
+		aggtranstype = aggform->aggtranstype;
+		ReleaseSysCache(aggTuple);
+
+		/* count it */
+		costs->numAggs++;
 		if (aggref->aggorder != NIL || aggref->aggdistinct != NIL)
-			counts->numOrderedAggs++;
+			costs->numOrderedAggs++;
+
+		/* add component function execution costs to appropriate totals */
+		costs->transCost.per_tuple += get_func_cost(aggtransfn) * cpu_operator_cost;
+		if (OidIsValid(aggfinalfn))
+			costs->finalCost += get_func_cost(aggfinalfn) * cpu_operator_cost;
+
+		/* also add the input expressions' cost to per-input-row costs */
+		cost_qual_eval_node(&argcosts, (Node *) aggref->args, context->root);
+		costs->transCost.startup += argcosts.startup;
+		costs->transCost.per_tuple += argcosts.per_tuple;
 
 		/* extract argument types (ignoring any ORDER BY expressions) */
 		inputTypes = (Oid *) palloc(sizeof(Oid) * list_length(aggref->args));
@@ -502,16 +541,6 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 				inputTypes[numArguments++] = exprType((Node *) tle->expr);
 		}
 
-		/* fetch aggregate transition datatype from pg_aggregate */
-		aggTuple = SearchSysCache1(AGGFNOID,
-								   ObjectIdGetDatum(aggref->aggfnoid));
-		if (!HeapTupleIsValid(aggTuple))
-			elog(ERROR, "cache lookup failed for aggregate %u",
-				 aggref->aggfnoid);
-		aggform = (Form_pg_aggregate) GETSTRUCT(aggTuple);
-		aggtranstype = aggform->aggtranstype;
-		ReleaseSysCache(aggTuple);
-
 		/* resolve actual type of transition state, if polymorphic */
 		if (IsPolymorphicType(aggtranstype))
 		{
@@ -554,7 +583,7 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 			avgwidth = get_typavgwidth(aggtranstype, aggtranstypmod);
 			avgwidth = MAXALIGN(avgwidth);
 
-			counts->transitionSpace += avgwidth + 2 * sizeof(void *);
+			costs->transitionSpace += avgwidth + 2 * sizeof(void *);
 		}
 		else if (aggtranstype == INTERNALOID)
 		{
@@ -566,7 +595,7 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 			 * being kept in a private memory context, as is done by
 			 * array_agg() for instance.
 			 */
-			counts->transitionSpace += ALLOCSET_DEFAULT_INITSIZE;
+			costs->transitionSpace += ALLOCSET_DEFAULT_INITSIZE;
 		}
 
 		/*
@@ -585,7 +614,7 @@ count_agg_clauses_walker(Node *node, AggClauseCounts *counts)
 	}
 	Assert(!IsA(node, SubLink));
 	return expression_tree_walker(node, count_agg_clauses_walker,
-								  (void *) counts);
+								  (void *) context);
 }
 
 
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 55218b58694bb43e7af654016ac53ac9d58d0a93..161d5ab122eea1f8b04ae88039b1913a1aae60cf 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1103,7 +1103,7 @@ create_unique_path(PlannerInfo *root, RelOptInfo *rel, Path *subpath,
 			all_hash = false;	/* don't try to hash */
 		else
 			cost_agg(&agg_path, root,
-					 AGG_HASHED, 0,
+					 AGG_HASHED, NULL,
 					 numCols, pathnode->rows,
 					 subpath->startup_cost,
 					 subpath->total_cost,
diff --git a/src/include/nodes/relation.h b/src/include/nodes/relation.h
index 78b03e024e8e51eba7849e3f8fb597c3f70853d8..f6592697e44557d2b0850a3bcea0de7d4507df83 100644
--- a/src/include/nodes/relation.h
+++ b/src/include/nodes/relation.h
@@ -46,6 +46,20 @@ typedef struct QualCost
 	Cost		per_tuple;		/* per-evaluation cost */
 } QualCost;
 
+/*
+ * Costing aggregate function execution requires these statistics about
+ * the aggregates to be executed by a given Agg node.  Note that transCost
+ * includes the execution costs of the aggregates' input expressions.
+ */
+typedef struct AggClauseCosts
+{
+	int			numAggs;		/* total number of aggregate functions */
+	int			numOrderedAggs; /* number that use DISTINCT or ORDER BY */
+	QualCost	transCost;		/* total per-input-row execution costs */
+	Cost		finalCost;		/* total costs of agg final functions */
+	Size		transitionSpace;	/* space for pass-by-ref transition data */
+} AggClauseCosts;
+
 
 /*----------
  * PlannerGlobal
diff --git a/src/include/optimizer/clauses.h b/src/include/optimizer/clauses.h
index 7ae236d167c15fee98b1d6b3ff664ca72be2c260..4af772d255cbc7b178814d3ecf980a6e7223d017 100644
--- a/src/include/optimizer/clauses.h
+++ b/src/include/optimizer/clauses.h
@@ -20,13 +20,6 @@
 #define is_opclause(clause)		((clause) != NULL && IsA(clause, OpExpr))
 #define is_funcclause(clause)	((clause) != NULL && IsA(clause, FuncExpr))
 
-typedef struct
-{
-	int			numAggs;		/* total number of aggregate calls */
-	int			numOrderedAggs; /* number that use DISTINCT or ORDER BY */
-	Size		transitionSpace;	/* for pass-by-ref transition data */
-} AggClauseCounts;
-
 typedef struct
 {
 	int			numWindowFuncs; /* total number of WindowFuncs found */
@@ -56,7 +49,8 @@ extern List *make_ands_implicit(Expr *clause);
 
 extern bool contain_agg_clause(Node *clause);
 extern List *pull_agg_clause(Node *clause);
-extern void count_agg_clauses(Node *clause, AggClauseCounts *counts);
+extern void count_agg_clauses(PlannerInfo *root, Node *clause,
+							  AggClauseCosts *costs);
 
 extern bool contain_window_function(Node *clause);
 extern WindowFuncLists *find_window_functions(Node *clause, Index maxWinRef);
diff --git a/src/include/optimizer/cost.h b/src/include/optimizer/cost.h
index 08898c13b9fbad8c9baafed4b45e7603228d08d8..2763863af2724b3bc04df79ea0e2e45dcf4dfb2d 100644
--- a/src/include/optimizer/cost.h
+++ b/src/include/optimizer/cost.h
@@ -94,12 +94,12 @@ extern void cost_material(Path *path,
 			  Cost input_startup_cost, Cost input_total_cost,
 			  double tuples, int width);
 extern void cost_agg(Path *path, PlannerInfo *root,
-		 AggStrategy aggstrategy, int numAggs,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, double numGroups,
 		 Cost input_startup_cost, Cost input_total_cost,
 		 double input_tuples);
 extern void cost_windowagg(Path *path, PlannerInfo *root,
-			   int numWindowFuncs, int numPartCols, int numOrderCols,
+			   List *windowFuncs, int numPartCols, int numOrderCols,
 			   Cost input_startup_cost, Cost input_total_cost,
 			   double input_tuples);
 extern void cost_group(Path *path, PlannerInfo *root,
diff --git a/src/include/optimizer/planmain.h b/src/include/optimizer/planmain.h
index d48bf39e410656a6a56eb048ea8434beefb5131e..5261691af69ca2e1e015fb6a439c5aad15bfd8d2 100644
--- a/src/include/optimizer/planmain.h
+++ b/src/include/optimizer/planmain.h
@@ -34,7 +34,7 @@ extern void query_planner(PlannerInfo *root, List *tlist,
  */
 extern void preprocess_minmax_aggregates(PlannerInfo *root, List *tlist);
 extern Plan *optimize_minmax_aggregates(PlannerInfo *root, List *tlist,
-						   Path *best_path);
+						   const AggClauseCosts *aggcosts, Path *best_path);
 
 /*
  * prototypes for plan/createplan.c
@@ -54,12 +54,12 @@ extern Sort *make_sort_from_sortclauses(PlannerInfo *root, List *sortcls,
 extern Sort *make_sort_from_groupcols(PlannerInfo *root, List *groupcls,
 						 AttrNumber *grpColIdx, Plan *lefttree);
 extern Agg *make_agg(PlannerInfo *root, List *tlist, List *qual,
-		 AggStrategy aggstrategy,
+		 AggStrategy aggstrategy, const AggClauseCosts *aggcosts,
 		 int numGroupCols, AttrNumber *grpColIdx, Oid *grpOperators,
-		 long numGroups, int numAggs,
+		 long numGroups,
 		 Plan *lefttree);
 extern WindowAgg *make_windowagg(PlannerInfo *root, List *tlist,
-			   int numWindowFuncs, Index winref,
+			   List *windowFuncs, Index winref,
 			   int partNumCols, AttrNumber *partColIdx, Oid *partOperators,
 			   int ordNumCols, AttrNumber *ordColIdx, Oid *ordOperators,
 			   int frameOptions, Node *startOffset, Node *endOffset,