From 4fb694aebc524f2085152d8c98a85e01ef6136f4 Mon Sep 17 00:00:00 2001
From: Tom Lane <tgl@sss.pgh.pa.us>
Date: Sun, 4 Mar 2012 15:40:16 -0500
Subject: [PATCH] Improve histogram-filling loop in new compute_array_stats()
 code.

Do "frac" arithmetic in int64 to prevent overflow with large statistics
targets, and improve the comments so people have some chance of
understanding how it works.

Alexander Korotkov and Tom Lane
---
 src/backend/utils/adt/array_typanalyze.c | 63 +++++++++++++++++-------
 1 file changed, 44 insertions(+), 19 deletions(-)

diff --git a/src/backend/utils/adt/array_typanalyze.c b/src/backend/utils/adt/array_typanalyze.c
index 941e2adb038..ba9873905e2 100644
--- a/src/backend/utils/adt/array_typanalyze.c
+++ b/src/backend/utils/adt/array_typanalyze.c
@@ -579,9 +579,9 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
 		{
 			int			num_hist = stats->attr->attstattarget;
 			DECountItem **sorted_count_items;
-			int			count_item_index;
+			int			j;
 			int			delta;
-			int			frac;
+			int64		frac;
 			float4	   *hist;
 
 			/* num_hist must be at least 2 for the loop below to work */
@@ -594,45 +594,70 @@ compute_array_stats(VacAttrStats *stats, AnalyzeAttrFetchFunc fetchfunc,
 			sorted_count_items = (DECountItem **)
 				palloc(sizeof(DECountItem *) * count_items_count);
 			hash_seq_init(&scan_status, count_tab);
-			count_item_index = 0;
+			j = 0;
 			while ((count_item = (DECountItem *) hash_seq_search(&scan_status)) != NULL)
 			{
-				sorted_count_items[count_item_index++] = count_item;
+				sorted_count_items[j++] = count_item;
 			}
 			qsort(sorted_count_items, count_items_count,
 				  sizeof(DECountItem *), countitem_compare_count);
 
 			/*
-			 * Fill stanumbers with the histogram, followed by the average
-			 * count.  This array must be stored in anl_context.
+			 * Prepare to fill stanumbers with the histogram, followed by the
+			 * average count.  This array must be stored in anl_context.
 			 */
 			hist = (float4 *)
 				MemoryContextAlloc(stats->anl_context,
 								   sizeof(float4) * (num_hist + 1));
 			hist[num_hist] = (double) element_no / (double) nonnull_cnt;
 
-			/*
-			 * Construct the histogram.
+			/*----------
+			 * Construct the histogram of distinct-element counts (DECs).
+			 *
+			 * The object of this loop is to copy the min and max DECs to
+			 * hist[0] and hist[num_hist - 1], along with evenly-spaced DECs
+			 * in between (where "evenly-spaced" is with reference to the
+			 * whole input population of arrays).  If we had a complete sorted
+			 * array of DECs, one per analyzed row, the i'th hist value would
+			 * come from DECs[i * (analyzed_rows - 1) / (num_hist - 1)]
+			 * (compare the histogram-making loop in compute_scalar_stats()).
+			 * But instead of that we have the sorted_count_items[] array,
+			 * which holds unique DEC values with their frequencies (that is,
+			 * a run-length-compressed version of the full array).  So we
+			 * control advancing through sorted_count_items[] with the
+			 * variable "frac", which is defined as (x - y) * (num_hist - 1),
+			 * where x is the index in the notional DECs array corresponding
+			 * to the start of the next sorted_count_items[] element's run,
+			 * and y is the index in DECs from which we should take the next
+			 * histogram value.  We have to advance whenever x <= y, that is
+			 * frac <= 0.  The x component is the sum of the frequencies seen
+			 * so far (up through the current sorted_count_items[] element),
+			 * and of course y * (num_hist - 1) = i * (analyzed_rows - 1),
+			 * per the subscript calculation above.  (The subscript calculation
+			 * implies dropping any fractional part of y; in this formulation
+			 * that's handled by not advancing until frac reaches 1.)
 			 *
-			 * XXX this needs work: frac could overflow, and it's not clear
-			 * how or why the code works.  Even if it does work, it needs
-			 * documented.
+			 * Even though frac has a bounded range, it could overflow int32
+			 * when working with very large statistics targets, so we do that
+			 * math in int64.
+			 *----------
 			 */
 			delta = analyzed_rows - 1;
-			count_item_index = 0;
-			frac = sorted_count_items[0]->frequency * (num_hist - 1);
+			j = 0;				/* current index in sorted_count_items */
+			/* Initialize frac for sorted_count_items[0]; y is initially 0 */
+			frac = (int64) sorted_count_items[0]->frequency * (num_hist - 1);
 			for (i = 0; i < num_hist; i++)
 			{
 				while (frac <= 0)
 				{
-					count_item_index++;
-					Assert(count_item_index < count_items_count);
-					frac += sorted_count_items[count_item_index]->frequency * (num_hist - 1);
+					/* Advance, and update x component of frac */
+					j++;
+					frac += (int64) sorted_count_items[j]->frequency * (num_hist - 1);
 				}
-				hist[i] = sorted_count_items[count_item_index]->count;
-				frac -= delta;
+				hist[i] = sorted_count_items[j]->count;
+				frac -= delta;		/* update y for upcoming i increment */
 			}
-			Assert(count_item_index == count_items_count - 1);
+			Assert(j == count_items_count - 1);
 
 			stats->stakind[slot_idx] = STATISTIC_KIND_DECHIST;
 			stats->staop[slot_idx] = extra_data->eq_opr;
-- 
GitLab