diff --git a/src/backend/commands/explain.c b/src/backend/commands/explain.c
index 183d3d9bcb77af298986812f84bc4502214cb563..12dae778a76493f46a4df0ab795789ca315beca8 100644
--- a/src/backend/commands/explain.c
+++ b/src/backend/commands/explain.c
@@ -103,6 +103,7 @@ static void show_instrumentation_count(const char *qlabel, int which,
 						   PlanState *planstate, ExplainState *es);
 static void show_foreignscan_info(ForeignScanState *fsstate, ExplainState *es);
 static const char *explain_get_index_name(Oid indexId);
+static void show_buffer_usage(ExplainState *es, const BufferUsage *usage);
 static void ExplainIndexScanDetails(Oid indexid, ScanDirection indexorderdir,
 						ExplainState *es);
 static void ExplainScanTarget(Scan *plan, ExplainState *es);
@@ -1437,108 +1438,73 @@ ExplainNode(PlanState *planstate, List *ancestors,
 
 	/* Show buffer usage */
 	if (es->buffers && planstate->instrument)
+		show_buffer_usage(es, &planstate->instrument->bufusage);
+
+	/* Show worker detail */
+	if (es->analyze && es->verbose && planstate->worker_instrument)
 	{
-		const BufferUsage *usage = &planstate->instrument->bufusage;
+		WorkerInstrumentation *w = planstate->worker_instrument;
+		bool		opened_group = false;
+		int			n;
 
-		if (es->format == EXPLAIN_FORMAT_TEXT)
+		for (n = 0; n < w->num_workers; ++n)
 		{
-			bool		has_shared = (usage->shared_blks_hit > 0 ||
-									  usage->shared_blks_read > 0 ||
-									  usage->shared_blks_dirtied > 0 ||
-									  usage->shared_blks_written > 0);
-			bool		has_local = (usage->local_blks_hit > 0 ||
-									 usage->local_blks_read > 0 ||
-									 usage->local_blks_dirtied > 0 ||
-									 usage->local_blks_written > 0);
-			bool		has_temp = (usage->temp_blks_read > 0 ||
-									usage->temp_blks_written > 0);
-			bool		has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
-								 !INSTR_TIME_IS_ZERO(usage->blk_write_time));
+			Instrumentation *instrument = &w->instrument[n];
+			double		nloops = instrument->nloops;
+			double		startup_sec;
+			double		total_sec;
+			double		rows;
+
+			if (nloops <= 0)
+				continue;
+			startup_sec = 1000.0 * instrument->startup / nloops;
+			total_sec = 1000.0 * instrument->total / nloops;
+			rows = instrument->ntuples / nloops;
 
-			/* Show only positive counter values. */
-			if (has_shared || has_local || has_temp)
+			if (es->format == EXPLAIN_FORMAT_TEXT)
 			{
 				appendStringInfoSpaces(es->str, es->indent * 2);
-				appendStringInfoString(es->str, "Buffers:");
-
-				if (has_shared)
-				{
-					appendStringInfoString(es->str, " shared");
-					if (usage->shared_blks_hit > 0)
-						appendStringInfo(es->str, " hit=%ld",
-										 usage->shared_blks_hit);
-					if (usage->shared_blks_read > 0)
-						appendStringInfo(es->str, " read=%ld",
-										 usage->shared_blks_read);
-					if (usage->shared_blks_dirtied > 0)
-						appendStringInfo(es->str, " dirtied=%ld",
-										 usage->shared_blks_dirtied);
-					if (usage->shared_blks_written > 0)
-						appendStringInfo(es->str, " written=%ld",
-										 usage->shared_blks_written);
-					if (has_local || has_temp)
-						appendStringInfoChar(es->str, ',');
-				}
-				if (has_local)
+				appendStringInfo(es->str, "Worker %d: ", n);
+				if (es->timing)
+					appendStringInfo(es->str,
+							"actual time=%.3f..%.3f rows=%.0f loops=%.0f\n",
+								 startup_sec, total_sec, rows, nloops);
+				else
+					appendStringInfo(es->str,
+									 "actual rows=%.0f loops=%.0f\n",
+									 rows, nloops);
+				es->indent++;
+				if (es->buffers)
+					show_buffer_usage(es, &instrument->bufusage);
+				es->indent--;
+			}
+			else
+			{
+				if (!opened_group)
 				{
-					appendStringInfoString(es->str, " local");
-					if (usage->local_blks_hit > 0)
-						appendStringInfo(es->str, " hit=%ld",
-										 usage->local_blks_hit);
-					if (usage->local_blks_read > 0)
-						appendStringInfo(es->str, " read=%ld",
-										 usage->local_blks_read);
-					if (usage->local_blks_dirtied > 0)
-						appendStringInfo(es->str, " dirtied=%ld",
-										 usage->local_blks_dirtied);
-					if (usage->local_blks_written > 0)
-						appendStringInfo(es->str, " written=%ld",
-										 usage->local_blks_written);
-					if (has_temp)
-						appendStringInfoChar(es->str, ',');
+					ExplainOpenGroup("Workers", "Workers", false, es);
+					opened_group = true;
 				}
-				if (has_temp)
+				ExplainOpenGroup("Worker", NULL, true, es);
+				ExplainPropertyInteger("Worker Number", n, es);
+
+				if (es->timing)
 				{
-					appendStringInfoString(es->str, " temp");
-					if (usage->temp_blks_read > 0)
-						appendStringInfo(es->str, " read=%ld",
-										 usage->temp_blks_read);
-					if (usage->temp_blks_written > 0)
-						appendStringInfo(es->str, " written=%ld",
-										 usage->temp_blks_written);
+					ExplainPropertyFloat("Actual Startup Time", startup_sec, 3, es);
+					ExplainPropertyFloat("Actual Total Time", total_sec, 3, es);
 				}
-				appendStringInfoChar(es->str, '\n');
-			}
+				ExplainPropertyFloat("Actual Rows", rows, 0, es);
+				ExplainPropertyFloat("Actual Loops", nloops, 0, es);
 
-			/* As above, show only positive counter values. */
-			if (has_timing)
-			{
-				appendStringInfoSpaces(es->str, es->indent * 2);
-				appendStringInfoString(es->str, "I/O Timings:");
-				if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
-					appendStringInfo(es->str, " read=%0.3f",
-							  INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
-				if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
-					appendStringInfo(es->str, " write=%0.3f",
-							 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
-				appendStringInfoChar(es->str, '\n');
+				if (es->buffers)
+					show_buffer_usage(es, &instrument->bufusage);
+
+				ExplainCloseGroup("Worker", NULL, true, es);
 			}
 		}
-		else
-		{
-			ExplainPropertyLong("Shared Hit Blocks", usage->shared_blks_hit, es);
-			ExplainPropertyLong("Shared Read Blocks", usage->shared_blks_read, es);
-			ExplainPropertyLong("Shared Dirtied Blocks", usage->shared_blks_dirtied, es);
-			ExplainPropertyLong("Shared Written Blocks", usage->shared_blks_written, es);
-			ExplainPropertyLong("Local Hit Blocks", usage->local_blks_hit, es);
-			ExplainPropertyLong("Local Read Blocks", usage->local_blks_read, es);
-			ExplainPropertyLong("Local Dirtied Blocks", usage->local_blks_dirtied, es);
-			ExplainPropertyLong("Local Written Blocks", usage->local_blks_written, es);
-			ExplainPropertyLong("Temp Read Blocks", usage->temp_blks_read, es);
-			ExplainPropertyLong("Temp Written Blocks", usage->temp_blks_written, es);
-			ExplainPropertyFloat("I/O Read Time", INSTR_TIME_GET_MILLISEC(usage->blk_read_time), 3, es);
-			ExplainPropertyFloat("I/O Write Time", INSTR_TIME_GET_MILLISEC(usage->blk_write_time), 3, es);
-		}
+
+		if (opened_group)
+			ExplainCloseGroup("Workers", "Workers", false, es);
 	}
 
 	/* Get ready to display the child plans */
@@ -2276,6 +2242,113 @@ explain_get_index_name(Oid indexId)
 	return result;
 }
 
+/*
+ * Show buffer usage details.
+ */
+static void
+show_buffer_usage(ExplainState *es, const BufferUsage *usage)
+{
+	if (es->format == EXPLAIN_FORMAT_TEXT)
+	{
+		bool		has_shared = (usage->shared_blks_hit > 0 ||
+								  usage->shared_blks_read > 0 ||
+								  usage->shared_blks_dirtied > 0 ||
+								  usage->shared_blks_written > 0);
+		bool		has_local = (usage->local_blks_hit > 0 ||
+								 usage->local_blks_read > 0 ||
+								 usage->local_blks_dirtied > 0 ||
+								 usage->local_blks_written > 0);
+		bool		has_temp = (usage->temp_blks_read > 0 ||
+								usage->temp_blks_written > 0);
+		bool		has_timing = (!INSTR_TIME_IS_ZERO(usage->blk_read_time) ||
+								 !INSTR_TIME_IS_ZERO(usage->blk_write_time));
+
+		/* Show only positive counter values. */
+		if (has_shared || has_local || has_temp)
+		{
+			appendStringInfoSpaces(es->str, es->indent * 2);
+			appendStringInfoString(es->str, "Buffers:");
+
+			if (has_shared)
+			{
+				appendStringInfoString(es->str, " shared");
+				if (usage->shared_blks_hit > 0)
+					appendStringInfo(es->str, " hit=%ld",
+									 usage->shared_blks_hit);
+				if (usage->shared_blks_read > 0)
+					appendStringInfo(es->str, " read=%ld",
+									 usage->shared_blks_read);
+				if (usage->shared_blks_dirtied > 0)
+					appendStringInfo(es->str, " dirtied=%ld",
+									 usage->shared_blks_dirtied);
+				if (usage->shared_blks_written > 0)
+					appendStringInfo(es->str, " written=%ld",
+									 usage->shared_blks_written);
+				if (has_local || has_temp)
+					appendStringInfoChar(es->str, ',');
+			}
+			if (has_local)
+			{
+				appendStringInfoString(es->str, " local");
+				if (usage->local_blks_hit > 0)
+					appendStringInfo(es->str, " hit=%ld",
+									 usage->local_blks_hit);
+				if (usage->local_blks_read > 0)
+					appendStringInfo(es->str, " read=%ld",
+									 usage->local_blks_read);
+				if (usage->local_blks_dirtied > 0)
+					appendStringInfo(es->str, " dirtied=%ld",
+									 usage->local_blks_dirtied);
+				if (usage->local_blks_written > 0)
+					appendStringInfo(es->str, " written=%ld",
+									 usage->local_blks_written);
+				if (has_temp)
+					appendStringInfoChar(es->str, ',');
+			}
+			if (has_temp)
+			{
+				appendStringInfoString(es->str, " temp");
+				if (usage->temp_blks_read > 0)
+					appendStringInfo(es->str, " read=%ld",
+									 usage->temp_blks_read);
+				if (usage->temp_blks_written > 0)
+					appendStringInfo(es->str, " written=%ld",
+									 usage->temp_blks_written);
+			}
+			appendStringInfoChar(es->str, '\n');
+		}
+
+		/* As above, show only positive counter values. */
+		if (has_timing)
+		{
+			appendStringInfoSpaces(es->str, es->indent * 2);
+			appendStringInfoString(es->str, "I/O Timings:");
+			if (!INSTR_TIME_IS_ZERO(usage->blk_read_time))
+				appendStringInfo(es->str, " read=%0.3f",
+						  INSTR_TIME_GET_MILLISEC(usage->blk_read_time));
+			if (!INSTR_TIME_IS_ZERO(usage->blk_write_time))
+				appendStringInfo(es->str, " write=%0.3f",
+						 INSTR_TIME_GET_MILLISEC(usage->blk_write_time));
+			appendStringInfoChar(es->str, '\n');
+		}
+	}
+	else
+	{
+		ExplainPropertyLong("Shared Hit Blocks", usage->shared_blks_hit, es);
+		ExplainPropertyLong("Shared Read Blocks", usage->shared_blks_read, es);
+		ExplainPropertyLong("Shared Dirtied Blocks", usage->shared_blks_dirtied, es);
+		ExplainPropertyLong("Shared Written Blocks", usage->shared_blks_written, es);
+		ExplainPropertyLong("Local Hit Blocks", usage->local_blks_hit, es);
+		ExplainPropertyLong("Local Read Blocks", usage->local_blks_read, es);
+		ExplainPropertyLong("Local Dirtied Blocks", usage->local_blks_dirtied, es);
+		ExplainPropertyLong("Local Written Blocks", usage->local_blks_written, es);
+		ExplainPropertyLong("Temp Read Blocks", usage->temp_blks_read, es);
+		ExplainPropertyLong("Temp Written Blocks", usage->temp_blks_written, es);
+		ExplainPropertyFloat("I/O Read Time", INSTR_TIME_GET_MILLISEC(usage->blk_read_time), 3, es);
+		ExplainPropertyFloat("I/O Write Time", INSTR_TIME_GET_MILLISEC(usage->blk_write_time), 3, es);
+	}
+}
+
 /*
  * Add some additional details about an IndexScan or IndexOnlyScan
  */
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index 6730037710912edc917913be650a820cfe6c2a91..30e6b3d2a7275271908dd152d44a7d85ba51350b 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -48,21 +48,19 @@
 
 #define PARALLEL_TUPLE_QUEUE_SIZE		65536
 
-/* DSM structure for accumulating per-PlanState instrumentation. */
-typedef struct SharedPlanStateInstrumentation
-{
-	int plan_node_id;
-	slock_t mutex;
-	Instrumentation	instr;
-} SharedPlanStateInstrumentation;
-
 /* DSM structure for accumulating per-PlanState instrumentation. */
 struct SharedExecutorInstrumentation
 {
 	int instrument_options;
-	int ps_ninstrument;			/* # of ps_instrument structures following */
-	SharedPlanStateInstrumentation ps_instrument[FLEXIBLE_ARRAY_MEMBER];
+	int instrument_offset;		/* offset of first Instrumentation struct */
+	int num_workers;							/* # of workers */
+	int num_plan_nodes;							/* # of plan nodes */
+	int plan_node_id[FLEXIBLE_ARRAY_MEMBER];	/* array of plan node IDs */
+	/* array of num_plan_nodes * num_workers Instrumentation objects follows */
 };
+#define GetInstrumentationArray(sei) \
+	(AssertVariableIsOfTypeMacro(sei, SharedExecutorInstrumentation *), \
+	 (Instrumentation *) (((char *) sei) + sei->instrument_offset))
 
 /* Context object for ExecParallelEstimate. */
 typedef struct ExecParallelEstimateContext
@@ -196,18 +194,10 @@ ExecParallelInitializeDSM(PlanState *planstate,
 	if (planstate == NULL)
 		return false;
 
-	/* If instrumentation is enabled, initialize array slot for this node. */
+	/* If instrumentation is enabled, initialize slot for this node. */
 	if (d->instrumentation != NULL)
-	{
-		SharedPlanStateInstrumentation *instrumentation;
-
-		instrumentation = &d->instrumentation->ps_instrument[d->nnodes];
-		Assert(d->nnodes < d->instrumentation->ps_ninstrument);
-		instrumentation->plan_node_id = planstate->plan->plan_node_id;
-		SpinLockInit(&instrumentation->mutex);
-		InstrInit(&instrumentation->instr,
-				  d->instrumentation->instrument_options);
-	}
+		d->instrumentation->plan_node_id[d->nnodes] =
+			planstate->plan->plan_node_id;
 
 	/* Count this node. */
 	d->nnodes++;
@@ -307,6 +297,7 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers)
 	int			pstmt_len;
 	int			param_len;
 	int			instrumentation_len = 0;
+	int			instrument_offset = 0;
 
 	/* Allocate object for return value. */
 	pei = palloc0(sizeof(ParallelExecutorInfo));
@@ -364,8 +355,11 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers)
 	if (estate->es_instrument)
 	{
 		instrumentation_len =
-			offsetof(SharedExecutorInstrumentation, ps_instrument)
-			+ sizeof(SharedPlanStateInstrumentation) * e.nnodes;
+			offsetof(SharedExecutorInstrumentation, plan_node_id)
+			+ sizeof(int) * e.nnodes;
+		instrumentation_len = MAXALIGN(instrumentation_len);
+		instrument_offset = instrumentation_len;
+		instrumentation_len += sizeof(Instrumentation) * e.nnodes * nworkers;
 		shm_toc_estimate_chunk(&pcxt->estimator, instrumentation_len);
 		shm_toc_estimate_keys(&pcxt->estimator, 1);
 	}
@@ -407,9 +401,17 @@ ExecInitParallelPlan(PlanState *planstate, EState *estate, int nworkers)
 	 */
 	if (estate->es_instrument)
 	{
+		Instrumentation *instrument;
+		int		i;
+
 		instrumentation = shm_toc_allocate(pcxt->toc, instrumentation_len);
 		instrumentation->instrument_options = estate->es_instrument;
-		instrumentation->ps_ninstrument = e.nnodes;
+		instrumentation->instrument_offset = instrument_offset;
+		instrumentation->num_workers = nworkers;
+		instrumentation->num_plan_nodes = e.nnodes;
+		instrument = GetInstrumentationArray(instrumentation);
+		for (i = 0; i < nworkers * e.nnodes; ++i)
+			InstrInit(&instrument[i], estate->es_instrument);
 		shm_toc_insert(pcxt->toc, PARALLEL_KEY_INSTRUMENTATION,
 					   instrumentation);
 		pei->instrumentation = instrumentation;
@@ -444,20 +446,31 @@ static bool
 ExecParallelRetrieveInstrumentation(PlanState *planstate,
 						  SharedExecutorInstrumentation *instrumentation)
 {
+	Instrumentation *instrument;
 	int		i;
+	int		n;
+	int		ibytes;
 	int		plan_node_id = planstate->plan->plan_node_id;
-	SharedPlanStateInstrumentation *ps_instrument;
 
 	/* Find the instumentation for this node. */
-	for (i = 0; i < instrumentation->ps_ninstrument; ++i)
-		if (instrumentation->ps_instrument[i].plan_node_id == plan_node_id)
+	for (i = 0; i < instrumentation->num_plan_nodes; ++i)
+		if (instrumentation->plan_node_id[i] == plan_node_id)
 			break;
-	if (i >= instrumentation->ps_ninstrument)
+	if (i >= instrumentation->num_plan_nodes)
 		elog(ERROR, "plan node %d not found", plan_node_id);
 
-	/* No need to acquire the spinlock here; workers have exited already. */
-	ps_instrument = &instrumentation->ps_instrument[i];
-	InstrAggNode(planstate->instrument, &ps_instrument->instr);
+	/* Accumulate the statistics from all workers. */
+	instrument = GetInstrumentationArray(instrumentation);
+	instrument += i * instrumentation->num_workers;
+	for (n = 0; n < instrumentation->num_workers; ++n)
+		InstrAggNode(planstate->instrument, &instrument[n]);
+
+	/* Also store the per-worker detail. */
+	ibytes = instrumentation->num_workers * sizeof(Instrumentation);
+	planstate->worker_instrument =
+		palloc(offsetof(WorkerInstrumentation, instrument) + ibytes);
+	planstate->worker_instrument->num_workers = instrumentation->num_workers;
+	memcpy(&planstate->worker_instrument->instrument, instrument, ibytes);
 
 	return planstate_tree_walker(planstate, ExecParallelRetrieveInstrumentation,
 								 instrumentation);
@@ -568,7 +581,9 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 {
 	int		i;
 	int		plan_node_id = planstate->plan->plan_node_id;
-	SharedPlanStateInstrumentation *ps_instrument;
+	Instrumentation *instrument;
+
+	InstrEndLoop(planstate->instrument);
 
 	/*
 	 * If we shuffled the plan_node_id values in ps_instrument into sorted
@@ -576,20 +591,21 @@ ExecParallelReportInstrumentation(PlanState *planstate,
 	 * if we're pushing down sufficiently large plan trees.  For now, do it
 	 * the slow, dumb way.
 	 */
-	for (i = 0; i < instrumentation->ps_ninstrument; ++i)
-		if (instrumentation->ps_instrument[i].plan_node_id == plan_node_id)
+	for (i = 0; i < instrumentation->num_plan_nodes; ++i)
+		if (instrumentation->plan_node_id[i] == plan_node_id)
 			break;
-	if (i >= instrumentation->ps_ninstrument)
+	if (i >= instrumentation->num_plan_nodes)
 		elog(ERROR, "plan node %d not found", plan_node_id);
 
 	/*
-	 * There's one SharedPlanStateInstrumentation per plan_node_id, so we
-	 * must use a spinlock in case multiple workers report at the same time.
+	 * Add our statistics to the per-node, per-worker totals.  It's possible
+	 * that this could happen more than once if we relaunched workers.
 	 */
-	ps_instrument = &instrumentation->ps_instrument[i];
-	SpinLockAcquire(&ps_instrument->mutex);
-	InstrAggNode(&ps_instrument->instr, planstate->instrument);
-	SpinLockRelease(&ps_instrument->mutex);
+	instrument = GetInstrumentationArray(instrumentation);
+	instrument += i * instrumentation->num_workers;
+	Assert(IsParallelWorker());
+	Assert(ParallelWorkerNumber < instrumentation->num_workers);
+	InstrAggNode(&instrument[ParallelWorkerNumber], planstate->instrument);
 
 	return planstate_tree_walker(planstate, ExecParallelReportInstrumentation,
 								 instrumentation);
diff --git a/src/include/executor/instrument.h b/src/include/executor/instrument.h
index f28e56ce48c36c1bd00f691527d234006bf970af..52d3c8182825b5c39693937a6a8d9a06efa17721 100644
--- a/src/include/executor/instrument.h
+++ b/src/include/executor/instrument.h
@@ -63,6 +63,12 @@ typedef struct Instrumentation
 	BufferUsage bufusage;		/* Total buffer usage */
 } Instrumentation;
 
+typedef struct WorkerInstrumentation
+{
+	int			num_workers;	/* # of structures that follow */
+	Instrumentation instrument[FLEXIBLE_ARRAY_MEMBER];
+} WorkerInstrumentation;
+
 extern PGDLLIMPORT BufferUsage pgBufferUsage;
 
 extern Instrumentation *InstrAlloc(int n, int instrument_options);
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index eb3591a663f5d316242f9fa669c582bf76c1f339..5ccf4700afd113273b5e9fa90d8be3d4485c3d18 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -1029,6 +1029,7 @@ typedef struct PlanState
 								 * top-level plan */
 
 	Instrumentation *instrument;	/* Optional runtime stats for this node */
+	WorkerInstrumentation *worker_instrument; /* per-worker instrumentation */
 
 	/*
 	 * Common structural data for all Plan types.  These links to subsidiary