diff --git a/src/backend/executor/instrument.c b/src/backend/executor/instrument.c
index 41b061bd526c6f4129c4b0d00e5e752193844553..d094ffbbcf6fd8d0acd3b4969fa725a99333e797 100644
--- a/src/backend/executor/instrument.c
+++ b/src/backend/executor/instrument.c
@@ -7,90 +7,117 @@
  * Copyright (c) 2001-2006, PostgreSQL Global Development Group
  *
  * IDENTIFICATION
- *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.15 2006/05/30 14:01:58 momjian Exp $
+ *	  $PostgreSQL: pgsql/src/backend/executor/instrument.c,v 1.16 2006/05/30 19:24:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
 #include "postgres.h"
 
-#include <unistd.h>
 #include <math.h>
+#include <unistd.h>
 
 #include "executor/instrument.h"
 
-/* This is the function that is used to determine the sampling intervals. In
- * general, if the function is f(x), then for N tuples we will take on the
- * order of integral(1/f(x), x=0..N) samples. Some examples follow, with the
- * number of samples that would be collected over 1,000,000 tuples.
 
-  f(x) = x         =>   log2(N)                        20
-  f(x) = x^(1/2)   =>   2 * N^(1/2)                  2000
-  f(x) = x^(1/3)   =>   1.5 * N^(2/3)               15000
-  
+/*
+ * As of PostgreSQL 8.2, we try to reduce the overhead of EXPLAIN ANALYZE
+ * by not calling INSTR_TIME_SET_CURRENT() for every single node execution.
+ * (Because that requires a kernel call on most systems, it's expensive.)
+ *
+ * This macro determines the sampling interval: that is, how many more
+ * iterations must occur before we take the next time sample, given that
+ * niters iterations have occurred already.  In general, if the function is
+ * f(x), then over N iterations we will take on the order of
+ * integral(1/f(x), x=0..N) samples.  Some examples follow, with the number
+ * of samples that would be collected over 1,000,000 iterations:
+ *
+ *		f(x) = x         =>   log2(N)                        20
+ *		f(x) = x^(1/2)   =>   2 * N^(1/2)                  2000
+ *		f(x) = x^(1/3)   =>   1.5 * N^(2/3)               15000
+ *
  * I've chosen the last one as it seems to provide a good compromise between
  * low overhead and still getting a meaningful number of samples. However,
  * not all machines have the cbrt() function, so on those we substitute
  * sqrt(). The difference is not very significant in the tests I made.
-*/ 
+ *
+ * The actual sampling interval is randomized with the SampleFunc() value
+ * as the mean; this hopefully will reduce any measurement bias due to
+ * cyclic variation in the node execution time.
+ */
 #ifdef HAVE_CBRT
-#define SampleFunc cbrt
+#define SampleFunc(niters) cbrt(niters)
 #else
-#define SampleFunc sqrt
+#define SampleFunc(niters) sqrt(niters)
 #endif
 
+#define SampleInterval(niters) \
+	(SampleFunc(niters) * (double) random() / (double) (MAX_RANDOM_VALUE/2))
+
+/*
+ * We sample at every node iteration until we've reached this threshold,
+ * so that nodes not called a large number of times are completely accurate.
+ * (Perhaps this should be a GUC variable?  But beware of messing up
+ * CalculateSampleOverhead if value is too small.)
+ */
 #define SAMPLE_THRESHOLD 50
 
+/*
+ * Determine the sampling overhead.  This only needs to be done once per
+ * backend (actually, probably once per postmaster would do ...)
+ */
 static double SampleOverhead;
-static bool SampleOverheadCalculated;
+static bool SampleOverheadCalculated = false;
 
 static void
-CalculateSampleOverhead()
+CalculateSampleOverhead(void)
 {
 	Instrumentation instr;
 	int i;
-	
-	/* We want to determine the sampling overhead, to correct
-	 * calculations later. This only needs to be done once per backend.
-	 * Is this the place? A wrong value here (due to a mistimed
-	 * task-switch) will cause bad calculations later.
-	 *
+
+	/*
+	 * We could get a wrong result due to being interrupted by a task switch.
 	 * To minimize the risk we do it a few times and take the lowest.
 	 */
-	
 	SampleOverhead = 1.0e6;
-	
-	for( i = 0; i<5; i++ )
+
+	for (i = 0; i < 5; i++)
 	{
 		int j;
 		double overhead;
-		
-		memset( &instr, 0, sizeof(instr) );
-	
-		/* Loop SAMPLE_THRESHOLD times or 100 microseconds, whichever is faster */
-		for( j=0; j<SAMPLE_THRESHOLD && INSTR_TIME_GET_DOUBLE(instr.counter) < 100e-6; i++ )
+
+		memset(&instr, 0, sizeof(instr));
+		/*
+		 * We know that a sample is actually taken on each of the first
+		 * SAMPLE_THRESHOLD iterations, so that's as far as we can test.
+		 */
+		for (j = 0; j < SAMPLE_THRESHOLD; j++)
 		{
-			InstrStartNode( &instr );
-			InstrStopNode( &instr, 1 );
+			InstrStartNode(&instr);
+			InstrStopNode(&instr, 1);
 		}
 		overhead = INSTR_TIME_GET_DOUBLE(instr.counter) / instr.samplecount;
-		if( overhead < SampleOverhead )
+		if (overhead < SampleOverhead)
 			SampleOverhead = overhead;
 	}
-		
+
 	SampleOverheadCalculated = true;
 }
 
+
 /* Allocate new instrumentation structure(s) */
 Instrumentation *
 InstrAlloc(int n)
 {
-	Instrumentation *instr = palloc0(n * sizeof(Instrumentation));
+	Instrumentation *instr;
 
-	/* we don't need to do any initialization except zero 'em */
-	
-	/* Calculate overhead, if not done yet */
-	if( !SampleOverheadCalculated )
+	/* Calculate sampling overhead, if not done yet in this backend */
+	if (!SampleOverheadCalculated)
 		CalculateSampleOverhead();
+
+	instr = palloc0(n * sizeof(Instrumentation));
+
+	/* we don't need to do any initialization except zero 'em */
+
 	return instr;
 }
 
@@ -100,18 +127,17 @@ InstrStartNode(Instrumentation *instr)
 {
 	if (INSTR_TIME_IS_ZERO(instr->starttime))
 	{
-		/* We always sample the first SAMPLE_THRESHOLD tuples, so small nodes are always accurate */
-		if (instr->tuplecount < SAMPLE_THRESHOLD)
+		/*
+		 * Always sample if not yet up to the threshold, else check whether
+		 * the next sampling point has been reached
+		 */
+		if (instr->itercount < SAMPLE_THRESHOLD)
 			instr->sampling = true;
-		else
+		else if (instr->itercount >= instr->nextsample)
 		{
-			/* Otherwise we go to sampling, see the comments on SampleFunc at the top of the file */
-			if( instr->tuplecount > instr->nextsample )
-			{
-				instr->sampling = true;
-				/* The doubling is so the random will average 1 over time */
-				instr->nextsample += 2.0 * SampleFunc(instr->tuplecount) * (double)rand() / (double)RAND_MAX;
-			}
+			instr->sampling = true;
+			instr->nextsample =
+				instr->itercount + SampleInterval(instr->itercount);
 		}
 		if (instr->sampling)
 			INSTR_TIME_SET_CURRENT(instr->starttime);
@@ -124,13 +150,15 @@ InstrStartNode(Instrumentation *instr)
 void
 InstrStopNode(Instrumentation *instr, double nTuples)
 {
-	instr_time	endtime;
-
-	/* count the returned tuples */
+	/* count the returned tuples and iterations */
 	instr->tuplecount += nTuples;
+	instr->itercount += 1;
 
+	/* measure runtime if appropriate */
 	if (instr->sampling)
 	{
+		instr_time	endtime;
+
 		if (INSTR_TIME_IS_ZERO(instr->starttime))
 		{
 			elog(DEBUG2, "InstrStopNode called without start");
@@ -159,7 +187,8 @@ InstrStopNode(Instrumentation *instr, double nTuples)
 #endif
 
 		INSTR_TIME_SET_ZERO(instr->starttime);
-		instr->samplecount += nTuples;
+
+		instr->samplecount += 1;
 		instr->sampling = false;
 	}
 
@@ -184,35 +213,44 @@ InstrEndLoop(Instrumentation *instr)
 	if (!INSTR_TIME_IS_ZERO(instr->starttime))
 		elog(DEBUG2, "InstrEndLoop called on running node");
 
-	/* Accumulate per-cycle statistics into totals */
+	/* Compute time spent in node */
 	totaltime = INSTR_TIME_GET_DOUBLE(instr->counter);
 
-	instr->startup += instr->firsttuple;
-
-	/* Here we take into account sampling effects. Doing it naively ends
-	 * up assuming the sampling overhead applies to all tuples, even the
-	 * ones we didn't measure. We've calculated an overhead, so we
-	 * subtract that for all samples we didn't measure. The first tuple
-	 * is also special cased, because it usually takes longer. */
-
-	if( instr->samplecount < instr->tuplecount )
+	/*
+	 * If we didn't measure runtime on every iteration, then we must increase
+	 * the measured total to account for the other iterations.  Naively
+	 * multiplying totaltime by itercount/samplecount would be wrong because
+	 * it effectively assumes the sampling overhead applies to all iterations,
+	 * even the ones we didn't measure.  (Note that what we are trying to
+	 * estimate here is the actual time spent in the node, including the
+	 * actual measurement overhead; not the time exclusive of measurement
+	 * overhead.)  We exclude the first iteration from the correction basis,
+	 * because it often takes longer than others.
+	 */
+	if (instr->itercount > instr->samplecount)
 	{
-		double pertuple = (totaltime - instr->firsttuple) / (instr->samplecount - 1);
-		instr->total += instr->firsttuple + (pertuple * (instr->samplecount - 1))
-		                                  + ((pertuple - SampleOverhead) * (instr->tuplecount - instr->samplecount));
+		double per_iter;
+
+		per_iter = (totaltime - instr->firsttuple) / (instr->samplecount - 1)
+			- SampleOverhead;
+		if (per_iter > 0)				/* sanity check */
+			totaltime += per_iter * (instr->itercount - instr->samplecount);
 	}
-	else
-		instr->total += totaltime;
-		
+
+	/* Accumulate per-cycle statistics into totals */
+	instr->startup += instr->firsttuple;
+	instr->total += totaltime;
 	instr->ntuples += instr->tuplecount;
-	instr->nsamples += instr->samplecount;
 	instr->nloops += 1;
 
 	/* Reset for next cycle (if any) */
 	instr->running = false;
+	instr->sampling = false;
 	INSTR_TIME_SET_ZERO(instr->starttime);
 	INSTR_TIME_SET_ZERO(instr->counter);
 	instr->firsttuple = 0;
-	instr->samplecount = 0;
 	instr->tuplecount = 0;
+	instr->itercount = 0;
+	instr->samplecount = 0;
+	instr->nextsample = 0;
 }
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index fbf578d473f8c7001f92ee633b0fa20d6ea518e6..fd57f8657884697510adf9d342dbb7eefa9d8e15 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -6,7 +6,7 @@
  *
  * Copyright (c) 2001-2006, PostgreSQL Global Development Group
  *
- * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.14 2006/05/30 14:01:58 momjian Exp $
+ * $PostgreSQL: pgsql/src/include/executor/instrument.h,v 1.15 2006/05/30 19:24:25 tgl Exp $
  *
  *-------------------------------------------------------------------------
  */
@@ -57,20 +57,19 @@ typedef struct Instrumentation
 {
 	/* Info about current plan cycle: */
 	bool		running;		/* TRUE if we've completed first tuple */
+	bool		sampling;		/* Are we sampling in current iteration? */
 	instr_time	starttime;		/* Start time of current iteration of node */
 	instr_time	counter;		/* Accumulated runtime for this node */
 	double		firsttuple;		/* Time for first tuple of this cycle */
 	double		tuplecount;		/* Tuples emitted so far this cycle */
-	double		samplecount;		/* Samples collected this cycle */
+	double		itercount;		/* Plan node iterations this cycle */
+	double		samplecount;	/* Iterations in which we sampled runtime */
+	double		nextsample;		/* Next itercount to sample at */
 	/* Accumulated statistics across all completed cycles: */
 	double		startup;		/* Total startup time (in seconds) */
 	double		total;			/* Total total time (in seconds) */
 	double		ntuples;		/* Total tuples produced */
 	double		nloops;			/* # of run cycles for this node */
-	double		nsamples;		/* # of samples taken */
-	/* Tracking for sampling */
-	bool		sampling;		/* Are we sampling this iteration */
-	double		nextsample;		/* The next tuplecount we're going to sample */
 } Instrumentation;
 
 extern Instrumentation *InstrAlloc(int n);
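
To make the extrapolation in InstrEndLoop easier to follow, here is a
hypothetical standalone restatement of its arithmetic in terms of the fields
above (extrapolate_total() is an editorial name, not a backend function).  It
assumes samplecount >= 2, which holds whenever itercount > samplecount since
the first SAMPLE_THRESHOLD iterations are always sampled:

	/* Editorial sketch of the InstrEndLoop correction; not backend code */
	double
	extrapolate_total(double totaltime,		/* time measured in sampled iterations */
					  double firsttuple,	/* time for the first tuple of the cycle */
					  double itercount,		/* iterations run this cycle */
					  double samplecount,	/* iterations actually timed */
					  double sample_overhead)	/* per-sample timing cost */
	{
		if (itercount > samplecount)
		{
			/*
			 * Average cost of a sampled iteration, leaving out the first one
			 * (often atypically slow) and the per-sample measurement
			 * overhead.  Each unsampled iteration is assumed to have cost
			 * this much.
			 */
			double		per_iter = (totaltime - firsttuple) / (samplecount - 1)
				- sample_overhead;

			if (per_iter > 0)	/* sanity check, as in InstrEndLoop */
				totaltime += per_iter * (itercount - samplecount);
		}
		return totaltime;
	}

For example, with itercount = 1000, samplecount = 100, firsttuple = 0.001 s,
totaltime = 0.011 s and a 10 microsecond sampling overhead, per_iter works out
to roughly 91 microseconds and the corrected total to about 0.093 s, where a
naive itercount/samplecount scaling would have reported 0.11 s.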