From f35742ccb7aa53ee3ed8416bbb378b0c3eeb6bb9 Mon Sep 17 00:00:00 2001
From: Robert Haas <rhaas@postgresql.org>
Date: Wed, 8 Mar 2017 12:05:43 -0500
Subject: [PATCH] Support parallel bitmap heap scans.

The index is scanned by a single process, but then all cooperating
processes can iterate jointly over the resulting set of heap blocks.
In the future, we might also want to support using a parallel bitmap
index scan to set up for a parallel bitmap heap scan, but that's a
job for another day.

Dilip Kumar, with some corrections and cosmetic changes by me.  The
larger patch set of which this is a part has been reviewed and tested
by (at least) Andres Freund, Amit Khandekar, Tushar Ahuja, Rafia
Sabih, Haribabu Kommi, Thomas Munro, and me.

Discussion: http://postgr.es/m/CAFiTN-uc4=0WxRGfCzs-xfkMYcSEWUC-Fon6thkJGjkh9i=13A@mail.gmail.com
---
 doc/src/sgml/monitoring.sgml                  |   6 +-
 src/backend/access/heap/heapam.c              |  16 +
 src/backend/executor/execParallel.c           |  14 +
 src/backend/executor/nodeBitmapHeapscan.c     | 422 ++++++++++++++++--
 src/backend/executor/nodeBitmapIndexscan.c    |   4 +-
 src/backend/executor/nodeBitmapOr.c           |   4 +-
 src/backend/nodes/copyfuncs.c                 |   2 +
 src/backend/nodes/outfuncs.c                  |   2 +
 src/backend/nodes/readfuncs.c                 |   2 +
 src/backend/optimizer/path/allpaths.c         |  24 +
 src/backend/optimizer/path/costsize.c         |  16 +-
 src/backend/optimizer/path/indxpath.c         |  19 +-
 src/backend/optimizer/plan/createplan.c       |  22 +
 src/backend/optimizer/util/pathnode.c         |   9 +-
 src/backend/postmaster/pgstat.c               |   3 +
 src/include/access/heapam.h                   |   1 +
 src/include/executor/nodeBitmapHeapscan.h     |   7 +
 src/include/nodes/execnodes.h                 |  57 +++
 src/include/nodes/plannodes.h                 |   2 +
 src/include/optimizer/pathnode.h              |   3 +-
 src/include/optimizer/paths.h                 |   2 +
 src/include/pgstat.h                          |   1 +
 src/test/regress/expected/select_parallel.out |  19 +
 src/test/regress/sql/select_parallel.sql      |  10 +
 24 files changed, 612 insertions(+), 55 deletions(-)

diff --git a/doc/src/sgml/monitoring.sgml b/doc/src/sgml/monitoring.sgml
index 27ed35f0a7b..4d03531cc15 100644
--- a/doc/src/sgml/monitoring.sgml
+++ b/doc/src/sgml/monitoring.sgml
@@ -1211,7 +1211,7 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry>Waiting in an extension.</entry>
         </row>
         <row>
-         <entry morerows="10"><literal>IPC</></entry>
+         <entry morerows="11"><literal>IPC</></entry>
          <entry><literal>BgWorkerShutdown</></entry>
          <entry>Waiting for background worker to shut down.</entry>
         </row>
@@ -1247,6 +1247,10 @@ postgres   27093  0.0  0.0  30096  2752 ?        Ss   11:34   0:00 postgres: ser
          <entry><literal>ParallelFinish</></entry>
          <entry>Waiting for parallel workers to finish computing.</entry>
         </row>
+        <row>
+         <entry><literal>ParallelBitmapPopulate</></entry>
+         <entry>Waiting for the leader to populate the TidBitmap.</entry>
+        </row>
         <row>
          <entry><literal>SafeSnapshot</></entry>
          <entry>Waiting for a snapshot for a <literal>READ ONLY DEFERRABLE</> transaction.</entry>
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index af258366a20..bffc971d689 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -1753,6 +1753,22 @@ retry:
 	return page;
 }
 
+/* ----------------
+ *		heap_update_snapshot
+ *
+ *		Update snapshot info in heap scan descriptor.
+ * ----------------
+ */
+void
+heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot)
+{
+	Assert(IsMVCCSnapshot(snapshot));
+
+	RegisterSnapshot(snapshot);
+	scan->rs_snapshot = snapshot;
+	scan->rs_temp_snap = true;
+}
+
 /* ----------------
  *		heap_getnext	- retrieve next tuple in scan
  *
diff --git a/src/backend/executor/execParallel.c b/src/backend/executor/execParallel.c
index de0e2bafe60..a1289e5f12e 100644
--- a/src/backend/executor/execParallel.c
+++ b/src/backend/executor/execParallel.c
@@ -25,6 +25,7 @@
 
 #include "executor/execParallel.h"
 #include "executor/executor.h"
+#include "executor/nodeBitmapHeapscan.h"
 #include "executor/nodeCustom.h"
 #include "executor/nodeForeignscan.h"
 #include "executor/nodeSeqscan.h"
@@ -217,6 +218,10 @@ ExecParallelEstimate(PlanState *planstate, ExecParallelEstimateContext *e)
 				ExecCustomScanEstimate((CustomScanState *) planstate,
 									   e->pcxt);
 				break;
+			case T_BitmapHeapScanState:
+				ExecBitmapHeapEstimate((BitmapHeapScanState *) planstate,
+									   e->pcxt);
+				break;
 			default:
 				break;
 		}
@@ -277,6 +282,11 @@ ExecParallelInitializeDSM(PlanState *planstate,
 				ExecCustomScanInitializeDSM((CustomScanState *) planstate,
 											d->pcxt);
 				break;
+			case T_BitmapHeapScanState:
+				ExecBitmapHeapInitializeDSM((BitmapHeapScanState *) planstate,
+											d->pcxt);
+				break;
+
 			default:
 				break;
 		}
@@ -775,6 +785,10 @@ ExecParallelInitializeWorker(PlanState *planstate, shm_toc *toc)
 				ExecCustomScanInitializeWorker((CustomScanState *) planstate,
 											   toc);
 				break;
+			case T_BitmapHeapScanState:
+				ExecBitmapHeapInitializeWorker(
+									 (BitmapHeapScanState *) planstate, toc);
+				break;
 			default:
 				break;
 		}
diff --git a/src/backend/executor/nodeBitmapHeapscan.c b/src/backend/executor/nodeBitmapHeapscan.c
index c1aa9f13bdf..833a93e1b7d 100644
--- a/src/backend/executor/nodeBitmapHeapscan.c
+++ b/src/backend/executor/nodeBitmapHeapscan.c
@@ -53,11 +53,15 @@
 
 static TupleTableSlot *BitmapHeapNext(BitmapHeapScanState *node);
 static void bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres);
+static inline void BitmapDoneInitializingSharedState(
+								  ParallelBitmapHeapState *pstate);
 static inline void BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
 							 TBMIterateResult *tbmres);
 static inline void BitmapAdjustPrefetchTarget(BitmapHeapScanState *node);
 static inline void BitmapPrefetch(BitmapHeapScanState *node,
 			   HeapScanDesc scan);
+static bool BitmapShouldInitializeSharedState(
+								  ParallelBitmapHeapState *pstate);
 
 
 /* ----------------------------------------------------------------
@@ -73,9 +77,12 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	HeapScanDesc scan;
 	TIDBitmap  *tbm;
 	TBMIterator *tbmiterator;
+	TBMSharedIterator *shared_tbmiterator;
 	TBMIterateResult *tbmres;
 	OffsetNumber targoffset;
 	TupleTableSlot *slot;
+	ParallelBitmapHeapState *pstate = node->pstate;
+	dsa_area   *dsa = node->ss.ps.state->es_query_dsa;
 
 	/*
 	 * extract necessary information from index scan node
@@ -84,7 +91,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	slot = node->ss.ss_ScanTupleSlot;
 	scan = node->ss.ss_currentScanDesc;
 	tbm = node->tbm;
-	tbmiterator = node->tbmiterator;
+	if (pstate == NULL)
+		tbmiterator = node->tbmiterator;
+	else
+		shared_tbmiterator = node->shared_tbmiterator;
 	tbmres = node->tbmres;
 
 	/*
@@ -99,25 +109,82 @@ BitmapHeapNext(BitmapHeapScanState *node)
 	 * node->prefetch_maximum.  This is to avoid doing a lot of prefetching in
 	 * a scan that stops after a few tuples because of a LIMIT.
 	 */
-	if (tbm == NULL)
+	if (!node->initialized)
 	{
-		tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+		if (!pstate)
+		{
+			tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
 
-		if (!tbm || !IsA(tbm, TIDBitmap))
-			elog(ERROR, "unrecognized result from subplan");
+			if (!tbm || !IsA(tbm, TIDBitmap))
+				elog(ERROR, "unrecognized result from subplan");
 
-		node->tbm = tbm;
-		node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
-		node->tbmres = tbmres = NULL;
+			node->tbm = tbm;
+			node->tbmiterator = tbmiterator = tbm_begin_iterate(tbm);
+			node->tbmres = tbmres = NULL;
 
 #ifdef USE_PREFETCH
-		if (node->prefetch_maximum > 0)
-		{
-			node->prefetch_iterator = tbm_begin_iterate(tbm);
-			node->prefetch_pages = 0;
-			node->prefetch_target = -1;
+			if (node->prefetch_maximum > 0)
+			{
+				node->prefetch_iterator = tbm_begin_iterate(tbm);
+				node->prefetch_pages = 0;
+				node->prefetch_target = -1;
+			}
+#endif   /* USE_PREFETCH */
 		}
+		else
+		{
+			/*
+			 * The leader will immediately come out of the function, but
+			 * others will be blocked until leader populates the TBM and wakes
+			 * them up.
+			 */
+			if (BitmapShouldInitializeSharedState(pstate))
+			{
+				tbm = (TIDBitmap *) MultiExecProcNode(outerPlanState(node));
+				if (!tbm || !IsA(tbm, TIDBitmap))
+					elog(ERROR, "unrecognized result from subplan");
+
+				node->tbm = tbm;
+
+				/*
+				 * Prepare to iterate over the TBM. This will return the
+				 * dsa_pointer of the iterator state which will be used by
+				 * multiple processes to iterate jointly.
+				 */
+				pstate->tbmiterator = tbm_prepare_shared_iterate(tbm);
+#ifdef USE_PREFETCH
+				if (node->prefetch_maximum > 0)
+				{
+					pstate->prefetch_iterator =
+						tbm_prepare_shared_iterate(tbm);
+
+					/*
+					 * We don't need the mutex here as we haven't yet woke up
+					 * others.
+					 */
+					pstate->prefetch_pages = 0;
+					pstate->prefetch_target = -1;
+				}
+#endif
+
+				/* We have initialized the shared state so wake up others. */
+				BitmapDoneInitializingSharedState(pstate);
+			}
+
+			/* Allocate a private iterator and attach the shared state to it */
+			node->shared_tbmiterator = shared_tbmiterator =
+				tbm_attach_shared_iterate(dsa, pstate->tbmiterator);
+			node->tbmres = tbmres = NULL;
+
+#ifdef USE_PREFETCH
+			if (node->prefetch_maximum > 0)
+			{
+				node->shared_prefetch_iterator =
+					tbm_attach_shared_iterate(dsa, pstate->prefetch_iterator);
+			}
 #endif   /* USE_PREFETCH */
+		}
+		node->initialized = true;
 	}
 
 	for (;;)
@@ -130,7 +197,10 @@ BitmapHeapNext(BitmapHeapScanState *node)
 		 */
 		if (tbmres == NULL)
 		{
-			node->tbmres = tbmres = tbm_iterate(tbmiterator);
+			if (!pstate)
+				node->tbmres = tbmres = tbm_iterate(tbmiterator);
+			else
+				node->tbmres = tbmres = tbm_shared_iterate(shared_tbmiterator);
 			if (tbmres == NULL)
 			{
 				/* no more entries in the bitmap */
@@ -182,8 +252,19 @@ BitmapHeapNext(BitmapHeapScanState *node)
 			 * Try to prefetch at least a few pages even before we get to the
 			 * second page if we don't stop reading after the first tuple.
 			 */
-			if (node->prefetch_target < node->prefetch_maximum)
-				node->prefetch_target++;
+			if (!pstate)
+			{
+				if (node->prefetch_target < node->prefetch_maximum)
+					node->prefetch_target++;
+			}
+			else if (pstate->prefetch_target < node->prefetch_maximum)
+			{
+				/* take spinlock while updating shared state */
+				SpinLockAcquire(&pstate->mutex);
+				if (pstate->prefetch_target < node->prefetch_maximum)
+					pstate->prefetch_target++;
+				SpinLockRelease(&pstate->mutex);
+			}
 #endif   /* USE_PREFETCH */
 		}
 
@@ -361,6 +442,21 @@ bitgetpage(HeapScanDesc scan, TBMIterateResult *tbmres)
 	scan->rs_ntuples = ntup;
 }
 
+/*
+ *	BitmapDoneInitializingSharedState - Shared state is initialized
+ *
+ *	By this time the leader has already populated the TBM and initialized the
+ *	shared state so wake up other processes.
+ */
+static inline void
+BitmapDoneInitializingSharedState(ParallelBitmapHeapState *pstate)
+{
+	SpinLockAcquire(&pstate->mutex);
+	pstate->state = BM_FINISHED;
+	SpinLockRelease(&pstate->mutex);
+	ConditionVariableBroadcast(&pstate->cv);
+}
+
 /*
  *	BitmapAdjustPrefetchIterator - Adjust the prefetch iterator
  */
@@ -369,20 +465,53 @@ BitmapAdjustPrefetchIterator(BitmapHeapScanState *node,
 							 TBMIterateResult *tbmres)
 {
 #ifdef USE_PREFETCH
-	TBMIterator *prefetch_iterator = node->prefetch_iterator;
+	ParallelBitmapHeapState *pstate = node->pstate;
 
-	if (node->prefetch_pages > 0)
+	if (pstate == NULL)
 	{
-		/* The main iterator has closed the distance by one page */
-		node->prefetch_pages--;
+		TBMIterator *prefetch_iterator = node->prefetch_iterator;
+
+		if (node->prefetch_pages > 0)
+		{
+			/* The main iterator has closed the distance by one page */
+			node->prefetch_pages--;
+		}
+		else if (prefetch_iterator)
+		{
+			/* Do not let the prefetch iterator get behind the main one */
+			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+			if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
+				elog(ERROR, "prefetch and main iterators are out of sync");
+		}
+		return;
 	}
-	else if (prefetch_iterator)
+
+	if (node->prefetch_maximum > 0)
 	{
-		/* Do not let the prefetch iterator get behind the main one */
-		TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
+
+		SpinLockAcquire(&pstate->mutex);
+		if (pstate->prefetch_pages > 0)
+		{
+			node->prefetch_pages--;
+			SpinLockRelease(&pstate->mutex);
+		}
+		else
+		{
+			/* Release the mutex before iterating */
+			SpinLockRelease(&pstate->mutex);
 
-		if (tbmpre == NULL || tbmpre->blockno != tbmres->blockno)
-			elog(ERROR, "prefetch and main iterators are out of sync");
+			/*
+			 * In case of shared mode, we can not ensure that the current
+			 * blockno of the main iterator and that of the prefetch iterator
+			 * are same.  It's possible that whatever blockno we are
+			 * prefetching will be processed by another process.  Therefore, we
+			 * don't validate the blockno here as we do in non-parallel case.
+			 */
+			if (prefetch_iterator)
+				tbm_shared_iterate(prefetch_iterator);
+		}
 	}
 #endif   /* USE_PREFETCH */
 }
@@ -399,14 +528,35 @@ static inline void
 BitmapAdjustPrefetchTarget(BitmapHeapScanState *node)
 {
 #ifdef USE_PREFETCH
-	if (node->prefetch_target >= node->prefetch_maximum)
-		 /* don't increase any further */ ;
-	else if (node->prefetch_target >= node->prefetch_maximum / 2)
-		node->prefetch_target = node->prefetch_maximum;
-	else if (node->prefetch_target > 0)
-		node->prefetch_target *= 2;
-	else
-		node->prefetch_target++;
+	ParallelBitmapHeapState *pstate = node->pstate;
+
+	if (pstate == NULL)
+	{
+		if (node->prefetch_target >= node->prefetch_maximum)
+			 /* don't increase any further */ ;
+		else if (node->prefetch_target >= node->prefetch_maximum / 2)
+			node->prefetch_target = node->prefetch_maximum;
+		else if (node->prefetch_target > 0)
+			node->prefetch_target *= 2;
+		else
+			node->prefetch_target++;
+		return;
+	}
+
+	/* Do an unlocked check first to save spinlock acquisitions. */
+	if (pstate->prefetch_target < node->prefetch_maximum)
+	{
+		SpinLockAcquire(&pstate->mutex);
+		if (pstate->prefetch_target >= node->prefetch_maximum)
+			 /* don't increase any further */ ;
+		else if (pstate->prefetch_target >= node->prefetch_maximum / 2)
+			pstate->prefetch_target = node->prefetch_maximum;
+		else if (pstate->prefetch_target > 0)
+			pstate->prefetch_target *= 2;
+		else
+			pstate->prefetch_target++;
+		SpinLockRelease(&pstate->mutex);
+	}
 #endif   /* USE_PREFETCH */
 }
 
@@ -417,23 +567,70 @@ static inline void
 BitmapPrefetch(BitmapHeapScanState *node, HeapScanDesc scan)
 {
 #ifdef USE_PREFETCH
-	TBMIterator *prefetch_iterator = node->prefetch_iterator;
+	ParallelBitmapHeapState *pstate = node->pstate;
 
-	if (prefetch_iterator)
+	if (pstate == NULL)
 	{
-		while (node->prefetch_pages < node->prefetch_target)
+		TBMIterator *prefetch_iterator = node->prefetch_iterator;
+
+		if (prefetch_iterator)
 		{
-			TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+			while (node->prefetch_pages < node->prefetch_target)
+			{
+				TBMIterateResult *tbmpre = tbm_iterate(prefetch_iterator);
+
+				if (tbmpre == NULL)
+				{
+					/* No more pages to prefetch */
+					tbm_end_iterate(prefetch_iterator);
+					node->prefetch_iterator = NULL;
+					break;
+				}
+				node->prefetch_pages++;
+				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
+			}
+		}
+
+		return;
+	}
 
-			if (tbmpre == NULL)
+	if (pstate->prefetch_pages < pstate->prefetch_target)
+	{
+		TBMSharedIterator *prefetch_iterator = node->shared_prefetch_iterator;
+
+		if (prefetch_iterator)
+		{
+			while (1)
 			{
-				/* No more pages to prefetch */
-				tbm_end_iterate(prefetch_iterator);
-				node->prefetch_iterator = NULL;
-				break;
+				TBMIterateResult *tbmpre;
+				bool		do_prefetch = false;
+
+				/*
+				 * Recheck under the mutex. If some other process has already
+				 * done enough prefetching then we need not to do anything.
+				 */
+				SpinLockAcquire(&pstate->mutex);
+				if (pstate->prefetch_pages < pstate->prefetch_target)
+				{
+					pstate->prefetch_pages++;
+					do_prefetch = true;
+				}
+				SpinLockRelease(&pstate->mutex);
+
+				if (!do_prefetch)
+					return;
+
+				tbmpre = tbm_shared_iterate(prefetch_iterator);
+				if (tbmpre == NULL)
+				{
+					/* No more pages to prefetch */
+					tbm_end_shared_iterate(prefetch_iterator);
+					node->shared_prefetch_iterator = NULL;
+					break;
+				}
+
+				PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
 			}
-			node->prefetch_pages++;
-			PrefetchBuffer(scan->rs_rd, MAIN_FORKNUM, tbmpre->blockno);
 		}
 	}
 #endif   /* USE_PREFETCH */
@@ -488,12 +685,36 @@ ExecReScanBitmapHeapScan(BitmapHeapScanState *node)
 		tbm_end_iterate(node->tbmiterator);
 	if (node->prefetch_iterator)
 		tbm_end_iterate(node->prefetch_iterator);
+	if (node->shared_tbmiterator)
+		tbm_end_shared_iterate(node->shared_tbmiterator);
+	if (node->shared_prefetch_iterator)
+		tbm_end_shared_iterate(node->shared_prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
 	node->tbm = NULL;
 	node->tbmiterator = NULL;
 	node->tbmres = NULL;
 	node->prefetch_iterator = NULL;
+	node->initialized = false;
+	node->shared_tbmiterator = NULL;
+	node->shared_prefetch_iterator = NULL;
+
+	/* Reset parallel bitmap state, if present */
+	if (node->pstate)
+	{
+		dsa_area   *dsa = node->ss.ps.state->es_query_dsa;
+
+		node->pstate->state = BM_INITIAL;
+
+		if (DsaPointerIsValid(node->pstate->tbmiterator))
+			tbm_free_shared_area(dsa, node->pstate->tbmiterator);
+
+		if (DsaPointerIsValid(node->pstate->prefetch_iterator))
+			tbm_free_shared_area(dsa, node->pstate->prefetch_iterator);
+
+		node->pstate->tbmiterator = InvalidDsaPointer;
+		node->pstate->prefetch_iterator = InvalidDsaPointer;
+	}
 
 	ExecScanReScan(&node->ss);
 
@@ -546,6 +767,10 @@ ExecEndBitmapHeapScan(BitmapHeapScanState *node)
 		tbm_end_iterate(node->prefetch_iterator);
 	if (node->tbm)
 		tbm_free(node->tbm);
+	if (node->shared_tbmiterator)
+		tbm_end_shared_iterate(node->shared_tbmiterator);
+	if (node->shared_prefetch_iterator)
+		tbm_end_shared_iterate(node->shared_prefetch_iterator);
 
 	/*
 	 * close heap scan
@@ -597,6 +822,10 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	scanstate->prefetch_target = 0;
 	/* may be updated below */
 	scanstate->prefetch_maximum = target_prefetch_pages;
+	scanstate->pscan_len = 0;
+	scanstate->initialized = false;
+	scanstate->shared_tbmiterator = NULL;
+	scanstate->pstate = NULL;
 
 	/*
 	 * Miscellaneous initialization
@@ -681,3 +910,108 @@ ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags)
 	 */
 	return scanstate;
 }
+
+/*----------------
+ *		BitmapShouldInitializeSharedState
+ *
+ *		The first process to come here and see the state to the BM_INITIAL
+ *		will become the leader for the parallel bitmap scan and will be
+ *		responsible for populating the TIDBitmap.  The other processes will
+ *		be blocked by the condition variable until the leader wakes them up.
+ * ---------------
+ */
+static bool
+BitmapShouldInitializeSharedState(ParallelBitmapHeapState *pstate)
+{
+	SharedBitmapState state;
+
+	while (1)
+	{
+		SpinLockAcquire(&pstate->mutex);
+		state = pstate->state;
+		if (pstate->state == BM_INITIAL)
+			pstate->state = BM_INPROGRESS;
+		SpinLockRelease(&pstate->mutex);
+
+		/* Exit if bitmap is done, or if we're the leader. */
+		if (state != BM_INPROGRESS)
+			break;
+
+		/* Wait for the leader to wake us up. */
+		ConditionVariableSleep(&pstate->cv, WAIT_EVENT_PARALLEL_BITMAP_SCAN);
+	}
+
+	ConditionVariableCancelSleep();
+
+	return (state == BM_INITIAL);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapHeapEstimate
+ *
+ *		estimates the space required to serialize bitmap scan node.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapEstimate(BitmapHeapScanState *node,
+					   ParallelContext *pcxt)
+{
+	EState	   *estate = node->ss.ps.state;
+
+	node->pscan_len = add_size(offsetof(ParallelBitmapHeapState,
+										phs_snapshot_data),
+							   EstimateSnapshotSpace(estate->es_snapshot));
+
+	shm_toc_estimate_chunk(&pcxt->estimator, node->pscan_len);
+	shm_toc_estimate_keys(&pcxt->estimator, 1);
+}
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapHeapInitializeDSM
+ *
+ *		Set up a parallel bitmap heap scan descriptor.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
+							ParallelContext *pcxt)
+{
+	ParallelBitmapHeapState *pstate;
+	EState	   *estate = node->ss.ps.state;
+
+	pstate = shm_toc_allocate(pcxt->toc, node->pscan_len);
+
+	pstate->tbmiterator = 0;
+	pstate->prefetch_iterator = 0;
+
+	/* Initialize the mutex */
+	SpinLockInit(&pstate->mutex);
+	pstate->prefetch_pages = 0;
+	pstate->prefetch_target = 0;
+	pstate->state = BM_INITIAL;
+
+	ConditionVariableInit(&pstate->cv);
+	SerializeSnapshot(estate->es_snapshot, pstate->phs_snapshot_data);
+
+	shm_toc_insert(pcxt->toc, node->ss.ps.plan->plan_node_id, pstate);
+	node->pstate = pstate;
+}
+
+/* ----------------------------------------------------------------
+ *		ExecBitmapHeapInitializeWorker
+ *
+ *		Copy relevant information from TOC into planstate.
+ * ----------------------------------------------------------------
+ */
+void
+ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node, shm_toc *toc)
+{
+	ParallelBitmapHeapState *pstate;
+	Snapshot	snapshot;
+
+	pstate = shm_toc_lookup(toc, node->ss.ps.plan->plan_node_id);
+	node->pstate = pstate;
+
+	snapshot = RestoreSnapshot(pstate->phs_snapshot_data);
+	heap_update_snapshot(node->ss.ss_currentScanDesc, snapshot);
+}
diff --git a/src/backend/executor/nodeBitmapIndexscan.c b/src/backend/executor/nodeBitmapIndexscan.c
index 94bb289f1bb..ce2f3210a4a 100644
--- a/src/backend/executor/nodeBitmapIndexscan.c
+++ b/src/backend/executor/nodeBitmapIndexscan.c
@@ -78,7 +78,9 @@ MultiExecBitmapIndexScan(BitmapIndexScanState *node)
 	else
 	{
 		/* XXX should we use less than work_mem for this? */
-		tbm = tbm_create(work_mem * 1024L, NULL);
+		tbm = tbm_create(work_mem * 1024L,
+						 ((BitmapIndexScan *) node->ss.ps.plan)->isshared ?
+						 node->ss.ps.state->es_query_dsa : NULL);
 	}
 
 	/*
diff --git a/src/backend/executor/nodeBitmapOr.c b/src/backend/executor/nodeBitmapOr.c
index 1d280beddc1..c0f261407bf 100644
--- a/src/backend/executor/nodeBitmapOr.c
+++ b/src/backend/executor/nodeBitmapOr.c
@@ -129,7 +129,9 @@ MultiExecBitmapOr(BitmapOrState *node)
 			if (result == NULL) /* first subplan */
 			{
 				/* XXX should we use less than work_mem for this? */
-				result = tbm_create(work_mem * 1024L, NULL);
+				result = tbm_create(work_mem * 1024L,
+									((BitmapOr *) node->ps.plan)->isshared ?
+									node->ps.state->es_query_dsa : NULL);
 			}
 
 			((BitmapIndexScanState *) subnode)->biss_result = result;
diff --git a/src/backend/nodes/copyfuncs.c b/src/backend/nodes/copyfuncs.c
index b3eac06c50e..ac8e50ef1dc 100644
--- a/src/backend/nodes/copyfuncs.c
+++ b/src/backend/nodes/copyfuncs.c
@@ -331,6 +331,7 @@ _copyBitmapOr(const BitmapOr *from)
 	/*
 	 * copy remainder of node
 	 */
+	COPY_SCALAR_FIELD(isshared);
 	COPY_NODE_FIELD(bitmapplans);
 
 	return newnode;
@@ -496,6 +497,7 @@ _copyBitmapIndexScan(const BitmapIndexScan *from)
 	 * copy remainder of node
 	 */
 	COPY_SCALAR_FIELD(indexid);
+	COPY_SCALAR_FIELD(isshared);
 	COPY_NODE_FIELD(indexqual);
 	COPY_NODE_FIELD(indexqualorig);
 
diff --git a/src/backend/nodes/outfuncs.c b/src/backend/nodes/outfuncs.c
index d4297d11b18..825a7b283a3 100644
--- a/src/backend/nodes/outfuncs.c
+++ b/src/backend/nodes/outfuncs.c
@@ -441,6 +441,7 @@ _outBitmapOr(StringInfo str, const BitmapOr *node)
 
 	_outPlanInfo(str, (const Plan *) node);
 
+	WRITE_BOOL_FIELD(isshared);
 	WRITE_NODE_FIELD(bitmapplans);
 }
 
@@ -520,6 +521,7 @@ _outBitmapIndexScan(StringInfo str, const BitmapIndexScan *node)
 	_outScanInfo(str, (const Scan *) node);
 
 	WRITE_OID_FIELD(indexid);
+	WRITE_BOOL_FIELD(isshared);
 	WRITE_NODE_FIELD(indexqual);
 	WRITE_NODE_FIELD(indexqualorig);
 }
diff --git a/src/backend/nodes/readfuncs.c b/src/backend/nodes/readfuncs.c
index b02d9fa2469..8f39d93a123 100644
--- a/src/backend/nodes/readfuncs.c
+++ b/src/backend/nodes/readfuncs.c
@@ -1633,6 +1633,7 @@ _readBitmapOr(void)
 
 	ReadCommonPlan(&local_node->plan);
 
+	READ_BOOL_FIELD(isshared);
 	READ_NODE_FIELD(bitmapplans);
 
 	READ_DONE();
@@ -1744,6 +1745,7 @@ _readBitmapIndexScan(void)
 	ReadCommonScan(&local_node->scan);
 
 	READ_OID_FIELD(indexid);
+	READ_BOOL_FIELD(isshared);
 	READ_NODE_FIELD(indexqual);
 	READ_NODE_FIELD(indexqualorig);
 
diff --git a/src/backend/optimizer/path/allpaths.c b/src/backend/optimizer/path/allpaths.c
index 932c84c949b..fbb2cda9d73 100644
--- a/src/backend/optimizer/path/allpaths.c
+++ b/src/backend/optimizer/path/allpaths.c
@@ -2911,6 +2911,30 @@ remove_unused_subquery_outputs(Query *subquery, RelOptInfo *rel)
 	}
 }
 
+/*
+ * create_partial_bitmap_paths
+ *	  Build partial bitmap heap path for the relation
+ */
+void
+create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel,
+							Path *bitmapqual)
+{
+	int			parallel_workers;
+	double		pages_fetched;
+
+	/* Compute heap pages for bitmap heap scan */
+	pages_fetched = compute_bitmap_pages(root, rel, bitmapqual, 1.0,
+										 NULL, NULL);
+
+	parallel_workers = compute_parallel_worker(rel, pages_fetched, 0);
+
+	if (parallel_workers <= 0)
+		return;
+
+	add_partial_path(rel, (Path *) create_bitmap_heap_path(root, rel,
+					bitmapqual, rel->lateral_relids, 1.0, parallel_workers));
+}
+
 /*
  * Compute the number of parallel workers that should be used to scan a
  * relation.  We compute the parallel workers based on the size of the heap to
diff --git a/src/backend/optimizer/path/costsize.c b/src/backend/optimizer/path/costsize.c
index 3eaed5af7a5..627e3f1b954 100644
--- a/src/backend/optimizer/path/costsize.c
+++ b/src/backend/optimizer/path/costsize.c
@@ -860,6 +860,7 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 	QualCost	qpqual_cost;
 	Cost		cpu_per_tuple;
 	Cost		cost_per_page;
+	Cost		cpu_run_cost;
 	double		tuples_fetched;
 	double		pages_fetched;
 	double		spc_seq_page_cost,
@@ -921,8 +922,21 @@ cost_bitmap_heap_scan(Path *path, PlannerInfo *root, RelOptInfo *baserel,
 
 	startup_cost += qpqual_cost.startup;
 	cpu_per_tuple = cpu_tuple_cost + qpqual_cost.per_tuple;
+	cpu_run_cost = cpu_per_tuple * tuples_fetched;
+
+	/* Adjust costing for parallelism, if used. */
+	if (path->parallel_workers > 0)
+	{
+		double		parallel_divisor = get_parallel_divisor(path);
+
+		/* The CPU cost is divided among all the workers. */
+		cpu_run_cost /= parallel_divisor;
 
-	run_cost += cpu_per_tuple * tuples_fetched;
+		path->rows = clamp_row_est(path->rows / parallel_divisor);
+	}
+
+
+	run_cost += cpu_run_cost;
 
 	/* tlist eval costs are paid per output row, not per tuple scanned */
 	startup_cost += path->pathtarget->cost.startup;
diff --git a/src/backend/optimizer/path/indxpath.c b/src/backend/optimizer/path/indxpath.c
index d8e5b811268..c2b72d410af 100644
--- a/src/backend/optimizer/path/indxpath.c
+++ b/src/backend/optimizer/path/indxpath.c
@@ -337,8 +337,12 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
 
 		bitmapqual = choose_bitmap_and(root, rel, bitindexpaths);
 		bpath = create_bitmap_heap_path(root, rel, bitmapqual,
-										rel->lateral_relids, 1.0);
+										rel->lateral_relids, 1.0, 0);
 		add_path(rel, (Path *) bpath);
+
+		/* create a partial bitmap heap path */
+		if (rel->consider_parallel && rel->lateral_relids == NULL)
+			create_partial_bitmap_paths(root, rel, bitmapqual);
 	}
 
 	/*
@@ -410,7 +414,7 @@ create_index_paths(PlannerInfo *root, RelOptInfo *rel)
 			required_outer = get_bitmap_tree_required_outer(bitmapqual);
 			loop_count = get_loop_count(root, rel->relid, required_outer);
 			bpath = create_bitmap_heap_path(root, rel, bitmapqual,
-											required_outer, loop_count);
+											required_outer, loop_count, 0);
 			add_path(rel, (Path *) bpath);
 		}
 	}
@@ -1617,6 +1621,11 @@ bitmap_scan_cost_est(PlannerInfo *root, RelOptInfo *rel, Path *ipath)
 	bpath.path.pathkeys = NIL;
 	bpath.bitmapqual = ipath;
 
+	/*
+	 * Check the cost of temporary path without considering parallelism.
+	 * Parallel bitmap heap path will be considered at later stage.
+	 */
+	bpath.path.parallel_workers = 0;
 	cost_bitmap_heap_scan(&bpath.path, root, rel,
 						  bpath.path.param_info,
 						  ipath,
@@ -1659,6 +1668,12 @@ bitmap_and_cost_est(PlannerInfo *root, RelOptInfo *rel, List *paths)
 	bpath.path.pathkeys = NIL;
 	bpath.bitmapqual = (Path *) &apath;
 
+	/*
+	 * Check the cost of temporary path without considering parallelism.
+	 * Parallel bitmap heap path will be considered at later stage.
+	 */
+	bpath.path.parallel_workers = 0;
+
 	/* Now we can do cost_bitmap_heap_scan */
 	cost_bitmap_heap_scan(&bpath.path, root, rel,
 						  bpath.path.param_info,
diff --git a/src/backend/optimizer/plan/createplan.c b/src/backend/optimizer/plan/createplan.c
index f1c7f609c0a..8f8663c1e14 100644
--- a/src/backend/optimizer/plan/createplan.c
+++ b/src/backend/optimizer/plan/createplan.c
@@ -125,6 +125,7 @@ static BitmapHeapScan *create_bitmap_scan_plan(PlannerInfo *root,
 						List *tlist, List *scan_clauses);
 static Plan *create_bitmap_subplan(PlannerInfo *root, Path *bitmapqual,
 					  List **qual, List **indexqual, List **indexECs);
+static void bitmap_subplan_mark_shared(Plan *plan);
 static TidScan *create_tidscan_plan(PlannerInfo *root, TidPath *best_path,
 					List *tlist, List *scan_clauses);
 static SubqueryScan *create_subqueryscan_plan(PlannerInfo *root,
@@ -2590,6 +2591,9 @@ create_bitmap_scan_plan(PlannerInfo *root,
 										   &bitmapqualorig, &indexquals,
 										   &indexECs);
 
+	if (best_path->path.parallel_aware)
+		bitmap_subplan_mark_shared(bitmapqualplan);
+
 	/*
 	 * The qpqual list must contain all restrictions not automatically handled
 	 * by the index, other than pseudoconstant clauses which will be handled
@@ -4756,6 +4760,24 @@ label_sort_with_costsize(PlannerInfo *root, Sort *plan, double limit_tuples)
 	plan->plan.parallel_aware = false;
 }
 
+/*
+ * bitmap_subplan_mark_shared
+ *   Set isshared flag in bitmap subplan so that it will be created in
+ *	 shared memory.
+ */
+static void
+bitmap_subplan_mark_shared(Plan *plan)
+{
+	if (IsA(plan, BitmapAnd))
+		bitmap_subplan_mark_shared(
+								linitial(((BitmapAnd *) plan)->bitmapplans));
+	else if (IsA(plan, BitmapOr))
+		((BitmapOr *) plan)->isshared = true;
+	else if (IsA(plan, BitmapIndexScan))
+		((BitmapIndexScan *) plan)->isshared = true;
+	else
+		elog(ERROR, "unrecognized node type: %d", nodeTag(plan));
+}
 
 /*****************************************************************************
  *
diff --git a/src/backend/optimizer/util/pathnode.c b/src/backend/optimizer/util/pathnode.c
index 86aee2f8ec3..0d925c6fcbf 100644
--- a/src/backend/optimizer/util/pathnode.c
+++ b/src/backend/optimizer/util/pathnode.c
@@ -1068,7 +1068,8 @@ create_bitmap_heap_path(PlannerInfo *root,
 						RelOptInfo *rel,
 						Path *bitmapqual,
 						Relids required_outer,
-						double loop_count)
+						double loop_count,
+						int parallel_degree)
 {
 	BitmapHeapPath *pathnode = makeNode(BitmapHeapPath);
 
@@ -1077,9 +1078,9 @@ create_bitmap_heap_path(PlannerInfo *root,
 	pathnode->path.pathtarget = rel->reltarget;
 	pathnode->path.param_info = get_baserel_parampathinfo(root, rel,
 														  required_outer);
-	pathnode->path.parallel_aware = false;
+	pathnode->path.parallel_aware = parallel_degree > 0 ? true : false;
 	pathnode->path.parallel_safe = rel->consider_parallel;
-	pathnode->path.parallel_workers = 0;
+	pathnode->path.parallel_workers = parallel_degree;
 	pathnode->path.pathkeys = NIL;		/* always unordered */
 
 	pathnode->bitmapqual = bitmapqual;
@@ -3281,7 +3282,7 @@ reparameterize_path(PlannerInfo *root, Path *path,
 														rel,
 														bpath->bitmapqual,
 														required_outer,
-														loop_count);
+														loop_count, 0);
 			}
 		case T_SubqueryScan:
 			{
diff --git a/src/backend/postmaster/pgstat.c b/src/backend/postmaster/pgstat.c
index 2fb9a8bf580..7cacb1e9b24 100644
--- a/src/backend/postmaster/pgstat.c
+++ b/src/backend/postmaster/pgstat.c
@@ -3395,6 +3395,9 @@ pgstat_get_wait_ipc(WaitEventIPC w)
 		case WAIT_EVENT_PARALLEL_FINISH:
 			event_name = "ParallelFinish";
 			break;
+		case WAIT_EVENT_PARALLEL_BITMAP_SCAN:
+			event_name = "ParallelBitmapScan";
+			break;
 		case WAIT_EVENT_SAFE_SNAPSHOT:
 			event_name = "SafeSnapshot";
 			break;
diff --git a/src/include/access/heapam.h b/src/include/access/heapam.h
index a864f7860d8..7e85510d2fd 100644
--- a/src/include/access/heapam.h
+++ b/src/include/access/heapam.h
@@ -179,6 +179,7 @@ extern void simple_heap_update(Relation relation, ItemPointer otid,
 				   HeapTuple tup);
 
 extern void heap_sync(Relation relation);
+extern void heap_update_snapshot(HeapScanDesc scan, Snapshot snapshot);
 
 /* in heap/pruneheap.c */
 extern void heap_page_prune_opt(Relation relation, Buffer buffer);
diff --git a/src/include/executor/nodeBitmapHeapscan.h b/src/include/executor/nodeBitmapHeapscan.h
index d7659b94e61..465c58e6ee5 100644
--- a/src/include/executor/nodeBitmapHeapscan.h
+++ b/src/include/executor/nodeBitmapHeapscan.h
@@ -15,10 +15,17 @@
 #define NODEBITMAPHEAPSCAN_H
 
 #include "nodes/execnodes.h"
+#include "access/parallel.h"
 
 extern BitmapHeapScanState *ExecInitBitmapHeapScan(BitmapHeapScan *node, EState *estate, int eflags);
 extern TupleTableSlot *ExecBitmapHeapScan(BitmapHeapScanState *node);
 extern void ExecEndBitmapHeapScan(BitmapHeapScanState *node);
 extern void ExecReScanBitmapHeapScan(BitmapHeapScanState *node);
+extern void ExecBitmapHeapEstimate(BitmapHeapScanState *node,
+					   ParallelContext *pcxt);
+extern void ExecBitmapHeapInitializeDSM(BitmapHeapScanState *node,
+							ParallelContext *pcxt);
+extern void ExecBitmapHeapInitializeWorker(BitmapHeapScanState *node,
+							   shm_toc *toc);
 
 #endif   /* NODEBITMAPHEAPSCAN_H */
diff --git a/src/include/nodes/execnodes.h b/src/include/nodes/execnodes.h
index 2fde67a9c8d..6a0d590ef29 100644
--- a/src/include/nodes/execnodes.h
+++ b/src/include/nodes/execnodes.h
@@ -26,6 +26,8 @@
 #include "utils/sortsupport.h"
 #include "utils/tuplestore.h"
 #include "utils/tuplesort.h"
+#include "nodes/tidbitmap.h"
+#include "storage/condition_variable.h"
 
 
 /* ----------------
@@ -1464,6 +1466,51 @@ typedef struct BitmapIndexScanState
 	IndexScanDesc biss_ScanDesc;
 } BitmapIndexScanState;
 
+/* ----------------
+ *	 SharedBitmapState information
+ *
+ *		BM_INITIAL		TIDBitmap creation is not yet started, so first worker
+ *						to see this state will set the state to BM_INPROGRESS
+ *						and that process will be responsible for creating
+ *						TIDBitmap.
+ *		BM_INPROGRESS	TIDBitmap creation is in progress; workers need to
+ *						sleep until it's finished.
+ *		BM_FINISHED		TIDBitmap creation is done, so now all workers can
+ *						proceed to iterate over TIDBitmap.
+ * ----------------
+ */
+typedef enum
+{
+	BM_INITIAL,
+	BM_INPROGRESS,
+	BM_FINISHED
+} SharedBitmapState;
+
+/* ----------------
+ *	 ParallelBitmapHeapState information
+ *		tbmiterator				iterator for scanning current pages
+ *		prefetch_iterator		iterator for prefetching ahead of current page
+ *		mutex					mutual exclusion for the prefetching variable
+ *								and state
+ *		prefetch_pages			# pages prefetch iterator is ahead of current
+ *		prefetch_target			current target prefetch distance
+ *		state					current state of the TIDBitmap
+ *		cv						conditional wait variable
+ *		phs_snapshot_data		snapshot data shared to workers
+ * ----------------
+ */
+typedef struct ParallelBitmapHeapState
+{
+	dsa_pointer tbmiterator;
+	dsa_pointer prefetch_iterator;
+	slock_t		mutex;
+	int			prefetch_pages;
+	int			prefetch_target;
+	SharedBitmapState state;
+	ConditionVariable cv;
+	char		phs_snapshot_data[FLEXIBLE_ARRAY_MEMBER];
+} ParallelBitmapHeapState;
+
 /* ----------------
  *	 BitmapHeapScanState information
  *
@@ -1477,6 +1524,11 @@ typedef struct BitmapIndexScanState
  *		prefetch_pages	   # pages prefetch iterator is ahead of current
  *		prefetch_target    current target prefetch distance
  *		prefetch_maximum   maximum value for prefetch_target
+ *		pscan_len		   size of the shared memory for parallel bitmap
+ *		initialized		   is node is ready to iterate
+ *		shared_tbmiterator	   shared iterator
+ *		shared_prefetch_iterator shared iterator for prefetching
+ *		pstate			   shared state for parallel bitmap scan
  * ----------------
  */
 typedef struct BitmapHeapScanState
@@ -1492,6 +1544,11 @@ typedef struct BitmapHeapScanState
 	int			prefetch_pages;
 	int			prefetch_target;
 	int			prefetch_maximum;
+	Size		pscan_len;
+	bool		initialized;
+	TBMSharedIterator *shared_tbmiterator;
+	TBMSharedIterator *shared_prefetch_iterator;
+	ParallelBitmapHeapState *pstate;
 } BitmapHeapScanState;
 
 /* ----------------
diff --git a/src/include/nodes/plannodes.h b/src/include/nodes/plannodes.h
index e30ed6aa29b..7fbb0c2c77e 100644
--- a/src/include/nodes/plannodes.h
+++ b/src/include/nodes/plannodes.h
@@ -292,6 +292,7 @@ typedef struct BitmapAnd
 typedef struct BitmapOr
 {
 	Plan		plan;
+	bool		isshared;
 	List	   *bitmapplans;
 } BitmapOr;
 
@@ -420,6 +421,7 @@ typedef struct BitmapIndexScan
 {
 	Scan		scan;
 	Oid			indexid;		/* OID of index to scan */
+	bool		isshared;		/* Create shared bitmap if set */
 	List	   *indexqual;		/* list of index quals (OpExprs) */
 	List	   *indexqualorig;	/* the same in original form */
 } BitmapIndexScan;
diff --git a/src/include/optimizer/pathnode.h b/src/include/optimizer/pathnode.h
index befe5781416..f0fe8307224 100644
--- a/src/include/optimizer/pathnode.h
+++ b/src/include/optimizer/pathnode.h
@@ -53,7 +53,8 @@ extern BitmapHeapPath *create_bitmap_heap_path(PlannerInfo *root,
 						RelOptInfo *rel,
 						Path *bitmapqual,
 						Relids required_outer,
-						double loop_count);
+						double loop_count,
+						int parallel_degree);
 extern BitmapAndPath *create_bitmap_and_path(PlannerInfo *root,
 					   RelOptInfo *rel,
 					   List *bitmapquals);
diff --git a/src/include/optimizer/paths.h b/src/include/optimizer/paths.h
index bc0dcf4468f..247fd118793 100644
--- a/src/include/optimizer/paths.h
+++ b/src/include/optimizer/paths.h
@@ -56,6 +56,8 @@ extern RelOptInfo *standard_join_search(PlannerInfo *root, int levels_needed,
 extern void generate_gather_paths(PlannerInfo *root, RelOptInfo *rel);
 extern int compute_parallel_worker(RelOptInfo *rel, BlockNumber heap_pages,
 						BlockNumber index_pages);
+extern void create_partial_bitmap_paths(PlannerInfo *root, RelOptInfo *rel,
+										Path *bitmapqual);
 
 #ifdef OPTIMIZER_DEBUG
 extern void debug_print_rel(PlannerInfo *root, RelOptInfo *rel);
diff --git a/src/include/pgstat.h b/src/include/pgstat.h
index 0062fb8af24..60c78d118f9 100644
--- a/src/include/pgstat.h
+++ b/src/include/pgstat.h
@@ -787,6 +787,7 @@ typedef enum
 	WAIT_EVENT_MQ_RECEIVE,
 	WAIT_EVENT_MQ_SEND,
 	WAIT_EVENT_PARALLEL_FINISH,
+	WAIT_EVENT_PARALLEL_BITMAP_SCAN,
 	WAIT_EVENT_SAFE_SNAPSHOT,
 	WAIT_EVENT_SYNC_REP
 } WaitEventIPC;
diff --git a/src/test/regress/expected/select_parallel.out b/src/test/regress/expected/select_parallel.out
index 75558d05e08..290b735b6b5 100644
--- a/src/test/regress/expected/select_parallel.out
+++ b/src/test/regress/expected/select_parallel.out
@@ -169,6 +169,25 @@ select  count(*) from tenk1 where thousand > 95;
 
 reset enable_seqscan;
 reset enable_bitmapscan;
+-- test parallel bitmap heap scan.
+set enable_seqscan to off;
+set enable_indexscan to off;
+explain (costs off)
+	select  count((unique1)) from tenk1 where hundred > 1;
+                         QUERY PLAN                         
+------------------------------------------------------------
+ Finalize Aggregate
+   ->  Gather
+         Workers Planned: 4
+         ->  Partial Aggregate
+               ->  Parallel Bitmap Heap Scan on tenk1
+                     Recheck Cond: (hundred > 1)
+                     ->  Bitmap Index Scan on tenk1_hundred
+                           Index Cond: (hundred > 1)
+(8 rows)
+
+reset enable_seqscan;
+reset enable_indexscan;
 -- test parallel merge join path.
 set enable_hashjoin to off;
 set enable_nestloop to off;
diff --git a/src/test/regress/sql/select_parallel.sql b/src/test/regress/sql/select_parallel.sql
index ebdae7e9391..80412b990d2 100644
--- a/src/test/regress/sql/select_parallel.sql
+++ b/src/test/regress/sql/select_parallel.sql
@@ -64,6 +64,16 @@ select  count(*) from tenk1 where thousand > 95;
 reset enable_seqscan;
 reset enable_bitmapscan;
 
+-- test parallel bitmap heap scan.
+set enable_seqscan to off;
+set enable_indexscan to off;
+
+explain (costs off)
+	select  count((unique1)) from tenk1 where hundred > 1;
+
+reset enable_seqscan;
+reset enable_indexscan;
+
 -- test parallel merge join path.
 set enable_hashjoin to off;
 set enable_nestloop to off;
-- 
GitLab