From 19f492779f445a4a9d96d68d36ae119fa40f9923 Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Thu, 30 Nov 2023 21:29:08 -0800 Subject: [PATCH 01/11] Enable hot standby dispatch This is the initial commit to support hot standby dispatch in GPDB. In this commit, hot standby dispatch is enabled when the hot_standby GUC is set to ON, and the standby coordinator can be connected to and run queries on. Basic query dispatching and error handling cases are covered, please see the isolation2/hot_standby tests for those cases. Current limitations that will be addressed in upcoming work: * No read-committed isolation from global transaction, so e.g. a SELECT on standby QD could see partial INSERT results on the primary QD. * No repeatable-read isolation, so e.g., a UDF that runs multiple SELECTs on the standby QD could see different results from the SELECTs even if they are the same. * No transaction block BEGIN ... END, and as a result, no cursor support or other things that depend on BEGIN...END. * Query conflict between primary and standby has not been tested yet. This will be done with/after the isolation work. 
Co-authored-by: Soumyadeep Chakraborty Co-authored-by: Jimmy Yih --- src/backend/access/transam/xlog.c | 2 + src/backend/cdb/cdbfts.c | 4 + src/backend/cdb/cdbtm.c | 1 + src/backend/cdb/cdbutil.c | 34 +- src/backend/cdb/dispatcher/cdbdisp_query.c | 6 + src/backend/cdb/dispatcher/cdbgang.c | 3 +- .../cdb/dispatcher/test/cdbdisp_query_test.c | 341 ++++++++++++++++++ src/backend/fts/fts.c | 2 +- src/backend/storage/lmgr/proc.c | 10 +- src/backend/tcop/postgres.c | 15 + src/include/access/xlog.h | 2 + src/include/cdb/cdbvars.h | 3 + src/test/isolation2/Makefile | 3 + .../isolation2/expected/hot_standby/basic.out | 223 ++++++++++++ .../expected/hot_standby/faults.out | 296 +++++++++++++++ .../isolation2/expected/hot_standby/setup.out | 8 + .../expected/hot_standby/teardown.out | 7 + src/test/isolation2/hot_standby_schedule | 4 + src/test/isolation2/sql/hot_standby/basic.sql | 88 +++++ .../isolation2/sql/hot_standby/faults.sql | 118 ++++++ src/test/isolation2/sql/hot_standby/setup.sql | 5 + .../isolation2/sql/hot_standby/teardown.sql | 4 + 22 files changed, 1160 insertions(+), 19 deletions(-) create mode 100644 src/backend/cdb/dispatcher/test/cdbdisp_query_test.c create mode 100644 src/test/isolation2/expected/hot_standby/basic.out create mode 100644 src/test/isolation2/expected/hot_standby/faults.out create mode 100644 src/test/isolation2/expected/hot_standby/setup.out create mode 100644 src/test/isolation2/expected/hot_standby/teardown.out create mode 100644 src/test/isolation2/hot_standby_schedule create mode 100644 src/test/isolation2/sql/hot_standby/basic.sql create mode 100644 src/test/isolation2/sql/hot_standby/faults.sql create mode 100644 src/test/isolation2/sql/hot_standby/setup.sql create mode 100644 src/test/isolation2/sql/hot_standby/teardown.sql diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 3fb9f121b93..469f078aa59 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -8331,6 
+8331,8 @@ StartupXLOG(void) */ InRecovery = false; + SIMPLE_FAULT_INJECTOR("out_of_recovery_in_startupxlog"); + /* * Hook for plugins to do additional startup works. * diff --git a/src/backend/cdb/cdbfts.c b/src/backend/cdb/cdbfts.c index 754d3054cbb..de268b6f662 100644 --- a/src/backend/cdb/cdbfts.c +++ b/src/backend/cdb/cdbfts.c @@ -84,6 +84,10 @@ FtsNotifyProber(void) int32 started; int32 done; + /* Ignore if we don't have a FTS probe process, like a standby QD in a mirrored cluster. */ + if (FtsProbePID() == 0) + return; + if (am_ftsprobe) return; diff --git a/src/backend/cdb/cdbtm.c b/src/backend/cdb/cdbtm.c index f0cd5fcb3f6..fbbecaa117c 100644 --- a/src/backend/cdb/cdbtm.c +++ b/src/backend/cdb/cdbtm.c @@ -2158,6 +2158,7 @@ performDtxProtocolCommitPrepared(const char *gid, bool raiseErrorIfNotFound) sendWaitGxidsToQD(waitGxids); finishDistributedTransactionContext("performDtxProtocolCommitPrepared -- Commit Prepared", false); + SIMPLE_FAULT_INJECTOR("finish_commit_prepared"); } /** diff --git a/src/backend/cdb/cdbutil.c b/src/backend/cdb/cdbutil.c index 1671b17223b..f732368d725 100644 --- a/src/backend/cdb/cdbutil.c +++ b/src/backend/cdb/cdbutil.c @@ -565,7 +565,7 @@ getCdbComponentInfo(void) { cdbInfo = &component_databases->segment_db_info[i]; - if (cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY) + if (!IS_HOT_STANDBY_QD() && cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY) continue; hsEntry = (HostPrimaryCountEntry *) hash_search(hostPrimaryCountHash, cdbInfo->config->hostname, HASH_FIND, &found); @@ -577,7 +577,7 @@ getCdbComponentInfo(void) { cdbInfo = &component_databases->entry_db_info[i]; - if (cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY) + if (!IS_HOT_STANDBY_QD() && cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY) continue; hsEntry = (HostPrimaryCountEntry *) hash_search(hostPrimaryCountHash, cdbInfo->config->hostname, HASH_FIND, &found); @@ -1005,7 +1005,16 @@ 
cdbcomponent_getComponentInfo(int contentId) /* entry db */ if (contentId == -1) { - cdbInfo = &cdbs->entry_db_info[0]; + Assert(cdbs->total_entry_dbs == 1 || cdbs->total_entry_dbs == 2); + /* + * For a standby QD, get the last entry db which can be the first (on + * a replica cluster) or the second (on a mirrored cluster) entry. + */ + if (IS_HOT_STANDBY_QD()) + cdbInfo = &cdbs->entry_db_info[cdbs->total_entry_dbs - 1]; + else + cdbInfo = &cdbs->entry_db_info[0]; + return cdbInfo; } @@ -1022,10 +1031,10 @@ cdbcomponent_getComponentInfo(int contentId) Assert(cdbs->total_segment_dbs == cdbs->total_segments * 2); cdbInfo = &cdbs->segment_db_info[2 * contentId]; - if (!SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo)) - { + /* use the other segment if it is not what the QD wants */ + if ((IS_HOT_STANDBY_QD() && SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo)) + || (!IS_HOT_STANDBY_QD() && !SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo))) cdbInfo = &cdbs->segment_db_info[2 * contentId + 1]; - } return cdbInfo; } @@ -1124,10 +1133,21 @@ cdb_setup(void) * * Ignore background worker because bgworker_should_start_mpp() already did * the check. + * + * Ignore if we are the standby coordinator started in hot standby mode. + * We don't expect dtx recovery to have finished, as dtx recovery is + * performed at the end of startup. In hot standby, we are recovering + * continuously and should allow queries much earlier. Since a hot standby + * won't proceed dtx, it is not required to wait for recovery of the dtx + * that has been prepared but not committed (i.e. to commit them); on the + * other hand, the recovery of any in-doubt transactions (i.e. not prepared) + * won't bother a hot standby either, just like they can be recovered in the + * background when a primary instance is running. 
*/ if (!IsBackgroundWorker && Gp_role == GP_ROLE_DISPATCH && - !*shmDtmStarted) + !*shmDtmStarted && + !IS_HOT_STANDBY_QD()) { ereport(FATAL, (errcode(ERRCODE_CANNOT_CONNECT_NOW), diff --git a/src/backend/cdb/dispatcher/cdbdisp_query.c b/src/backend/cdb/dispatcher/cdbdisp_query.c index ef4b84f0db0..c21cfd935f9 100644 --- a/src/backend/cdb/dispatcher/cdbdisp_query.c +++ b/src/backend/cdb/dispatcher/cdbdisp_query.c @@ -867,6 +867,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms, { const char *command = pQueryParms->strCommand; int command_len; + int is_hs_dispatch = IS_HOT_STANDBY_QD() ? 1 : 0; const char *plantree = pQueryParms->serializedPlantree; int plantree_len = pQueryParms->serializedPlantreelen; const char *sddesc = pQueryParms->serializedQueryDispatchDesc; @@ -921,6 +922,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms, sizeof(outerUserId) /* outerUserIsSuper */ + sizeof(currentUserId) + sizeof(n32) * 2 /* currentStatementStartTimestamp */ + + sizeof(is_hs_dispatch) + sizeof(command_len) + sizeof(plantree_len) + sizeof(sddesc_len) + @@ -976,6 +978,10 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms, memcpy(pos, &n32, sizeof(n32)); pos += sizeof(n32); + tmp = htonl(is_hs_dispatch); + memcpy(pos, &tmp, sizeof(is_hs_dispatch)); + pos += sizeof(is_hs_dispatch); + tmp = htonl(command_len); memcpy(pos, &tmp, sizeof(command_len)); pos += sizeof(command_len); diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c index 780ddef0f42..87ce88504b0 100644 --- a/src/backend/cdb/dispatcher/cdbgang.c +++ b/src/backend/cdb/dispatcher/cdbgang.c @@ -698,8 +698,7 @@ getCdbProcessesForQD(int isPrimary) qdinfo = cdbcomponent_getComponentInfo(MASTER_CONTENT_ID); - Assert(qdinfo->config->segindex == -1); - Assert(SEGMENT_IS_ACTIVE_PRIMARY(qdinfo)); + Assert((qdinfo->config->segindex == -1 && SEGMENT_IS_ACTIVE_PRIMARY(qdinfo)) || IS_HOT_STANDBY_QD()); Assert(qdinfo->config->hostip != NULL); proc = 
makeNode(CdbProcess); diff --git a/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c b/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c new file mode 100644 index 00000000000..6e07aebcc96 --- /dev/null +++ b/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c @@ -0,0 +1,341 @@ +#include +#include +#include +#include "cmockery.h" +#include "postgres.h" + +#include "storage/ipc.h" +#include "storage/proc.h" + +#include "../cdbdisp_query.c" + + +#undef PG_RE_THROW +#define PG_RE_THROW() siglongjmp(*PG_exception_stack, 1) + + +int __wrap_errmsg(const char *fmt,...); +int __wrap_errcode(int sqlerrcode); +bool __wrap_errstart(int elevel, const char *filename, int lineno, + const char *funcname, const char *domain); +void __wrap_errfinish(int dummy __attribute__((unused)),...); +Gang *__wrap_cdbgang_createGang_async(List *segments, SegmentType segmentType); +int __wrap_pqPutMsgStart(char msg_type, bool force_len, PGconn *conn); +int __wrap_PQcancel(PGcancel *cancel, char *errbuf, int errbufsize); +char *__wrap_serializeNode(Node *node, int *size, int *uncompressed_size_out); +char *__wrap_qdSerializeDtxContextInfo(int *size, bool wantSnapshot, bool inCursor, int txnOptions, char *debugCaller); +void __wrap_VirtualXactLockTableInsert(VirtualTransactionId vxid); +void __wrap_AcceptInvalidationMessages(void); +static void terminate_process(); + + +int +__wrap_errmsg(const char *fmt,...) 
+{ + check_expected(fmt); + optional_assignment(fmt); + return (int) mock(); +} + + +int +__wrap_errcode(int sqlerrcode) +{ + check_expected(sqlerrcode); + return (int) mock(); +} + + +bool +__wrap_errstart(int elevel, const char *filename, int lineno, + const char *funcname, const char *domain) +{ + if (elevel < LOG) + return false; + + check_expected(elevel); + check_expected(filename); + check_expected(lineno); + check_expected(funcname); + check_expected(domain); + optional_assignment(filename); + optional_assignment(funcname); + optional_assignment(domain); + return (bool) mock(); +} + + +void +__wrap_errfinish(int dummy __attribute__((unused)),...) +{ + PG_RE_THROW(); +} + + +static void +expect_ereport(int expect_elevel) +{ + expect_any(__wrap_errmsg, fmt); + will_be_called(__wrap_errmsg); + + expect_any(__wrap_errcode, sqlerrcode); + will_be_called(__wrap_errcode); + + expect_value(__wrap_errstart, elevel, expect_elevel); + expect_any(__wrap_errstart, filename); + expect_any(__wrap_errstart, lineno); + expect_any(__wrap_errstart, funcname); + expect_any(__wrap_errstart, domain); + if (expect_elevel < ERROR) + { + will_return(__wrap_errstart, false); + } + else + { + will_return(__wrap_errstart, true); + } +} + + +Gang * +__wrap_cdbgang_createGang_async(List *segments, SegmentType segmentType) +{ + MemoryContext oldContext = MemoryContextSwitchTo(DispatcherContext); + Gang *gang = buildGangDefinition(segments, segmentType); + + MemoryContextSwitchTo(oldContext); + + PGconn *conn = (PGconn *) malloc(sizeof(PGconn)); + + MemSet(conn, 0, sizeof(PGconn)); + initPQExpBuffer(&conn->errorMessage); + initPQExpBuffer(&conn->workBuffer); + gang->db_descriptors[0]->conn = conn; + + return gang; +} + + +int +__wrap_pqPutMsgStart(char msg_type, bool force_len, PGconn *conn) +{ + if (conn->outBuffer_shared) + fail_msg("Mustn't send something else during dispatch!"); + check_expected(msg_type); + check_expected(force_len); + check_expected(conn); + 
optional_assignment(conn); + return (int) mock(); +} + + +int +__wrap_PQcancel(PGcancel *cancel, char *errbuf, int errbufsize) +{ + return (int) mock(); +} + + +char * +__wrap_serializeNode(Node *node, int *size, int *uncompressed_size_out) +{ + const int alloc_size = 1024; + + if (size != NULL) + *size = alloc_size; + if (uncompressed_size_out != NULL) + *uncompressed_size_out = alloc_size; + + return (char *) palloc(alloc_size); +} + + +char * +__wrap_qdSerializeDtxContextInfo(int *size, bool wantSnapshot, bool inCursor, int txnOptions, char *debugCaller) +{ + const int alloc_size = 1024; + + assert_int_not_equal(size, NULL); + *size = alloc_size; + + return (char *) palloc(alloc_size); +} + + +void +__wrap_VirtualXactLockTableInsert(VirtualTransactionId vxid) +{ + mock(); +} + +void +__wrap_AcceptInvalidationMessages(void) +{ + mock(); +} + + +static void +terminate_process() +{ + die(SIGTERM); +} + +/* + * Test query may be interrupted during plan dispatching + */ +static void +test__CdbDispatchPlan_may_be_interrupted(void **state) +{ + PlannedStmt *plannedstmt = (PlannedStmt *) palloc(sizeof(PlannedStmt)); + + /* slice table is needed to allocate gang */ + plannedstmt->slices = palloc0(sizeof(PlanSlice)); + plannedstmt->numSlices = 1; + PlanSlice *slice = &plannedstmt->slices[0]; + + slice->sliceIndex = 1; + slice->gangType = GANGTYPE_PRIMARY_READER; + slice->numsegments = 1; + slice->parentIndex = -1; + slice->segindex = 0; + + QueryDesc *queryDesc = (QueryDesc *) palloc(sizeof(QueryDesc)); + + queryDesc->plannedstmt = plannedstmt; + /* ddesc->secContext is filled in cdbdisp_buildPlanQueryParms() */ + queryDesc->ddesc = (QueryDispatchDesc *) palloc(sizeof(QueryDispatchDesc)); + /* source text is required for buildGpQueryString() */ + queryDesc->sourceText = "select a from t1;"; + + queryDesc->estate = CreateExecutorState(); + + /* will be called multiple times in e.g. 
FtsNotifyProber/getCdbComponentInfo */ + will_return_count(RecoveryInProgress, false, -1); + + /* cdbcomponent_getCdbComponents() mocks */ + will_be_called(FtsNotifyProber); + will_return(getFtsVersion, 1); + will_return(GetGpExpandVersion, 1); + + /* StartTransactionCommand() mocks */ + will_be_called(__wrap_VirtualXactLockTableInsert); + will_be_called(__wrap_AcceptInvalidationMessages); + will_be_called(initialize_wal_bytes_written); + + /* + * cdbdisp_dispatchToGang() + * + * start sending MPP query to QE inside PQsendGpQuery_shared() replace + * connection buffer with the shared one + */ + expect_any(PQsendQueryStart, conn); + will_return(PQsendQueryStart, true); + + /* first try to flush MPP query inside PQsendGpQuery_shared() */ + expect_any(pqFlushNonBlocking, conn); + will_return(pqFlushNonBlocking, 1); + + /* + * cdbdisp_waitDispatchFinish() + * + * query will be interrupted before poll() + */ + expect_any_count(ResetWaitEventSet, pset, 2); + expect_any_count(ResetWaitEventSet, context, 2); + expect_any_count(ResetWaitEventSet, nevents, 2); + will_be_called_count(ResetWaitEventSet, 2); + + expect_any(pqFlushNonBlocking, conn); + will_return_with_sideeffect(pqFlushNonBlocking, 1, &terminate_process, NULL); + + expect_any(SetLatch, latch); + will_be_called(SetLatch); + + expect_any(AddWaitEventToSet, set); + expect_any(AddWaitEventToSet, events); + expect_any(AddWaitEventToSet, fd); + expect_any(AddWaitEventToSet, latch); + expect_any(AddWaitEventToSet, user_data); + will_be_called(AddWaitEventToSet); + + will_return(IsLogicalLauncher, false); + + /* process was terminated by administrative command */ + expect_ereport(FATAL); + + /* QD will trying to cancel queries on QEs */ + will_return(__wrap_PQcancel, true); + + /* during close and free connection */ + expect_any_count(pqClearAsyncResult, conn, 2); + will_be_called_count(pqClearAsyncResult, 2); + + /* + * BUT! 
pqPutMsgStart mustn't be called + * + * we can't send termination message (X) until shared message isn't sent + * out the buffer completely + */ + + /* + * dirty hack. cluster topology needed to allocate gangs is loaded from + * gpsegconfig_dump outside of transaction + */ + cdbcomponent_getCdbComponents(); + + StartTransactionCommand(); + + PG_TRY(); + { + queryDesc->estate->es_sliceTable = InitSliceTable(queryDesc->estate, plannedstmt); + + CdbDispatchPlan(queryDesc, queryDesc->estate->es_param_exec_vals, + false, false); + fail(); + } + PG_CATCH(); + { + /* + * SIGTERM handling emulation gpdb bail out from CheckDispatchResult + * without flushing unsent messages in case of process exit in + * progress AtAbort_DispatcherState will be called during transaction + * abort + */ + proc_exit_inprogress = true; + + AtAbort_DispatcherState(); + } + PG_END_TRY(); +} + +int +main(int argc, char *argv[]) +{ + cmockery_parse_arguments(argc, argv); + + const UnitTest tests[] = + { + unit_test(test__CdbDispatchPlan_may_be_interrupted) + }; + + Gp_role = GP_ROLE_DISPATCH; + /* to start transaction */ + PGPROC proc; + + MyBackendId = 7; + proc.backendId = MyBackendId; + MyProc = &proc; + /* to build cdb components info */ + GpIdentity.dbid = 1; + GpIdentity.segindex = -1; + + MemoryContextInit(); + + /* to avoid mocking cdbtm.c functions */ + MyTmGxactLocal = (TMGXACTLOCAL *) MemoryContextAllocZero(TopMemoryContext, sizeof(TMGXACTLOCAL)); + + SetSessionUserId(1000, true); + + return run_tests(tests); +} diff --git a/src/backend/fts/fts.c b/src/backend/fts/fts.c index 719e8fbca1c..c7c1711e97f 100644 --- a/src/backend/fts/fts.c +++ b/src/backend/fts/fts.c @@ -102,7 +102,7 @@ sigIntHandler(SIGNAL_ARGS) pid_t FtsProbePID(void) { - return *shmFtsProbePID; + return shmFtsProbePID ? 
*shmFtsProbePID : 0; } bool diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c index a174e981b1f..37d917a1f3e 100644 --- a/src/backend/storage/lmgr/proc.c +++ b/src/backend/storage/lmgr/proc.c @@ -354,17 +354,9 @@ InitProcess(void) * WAL sender, etc are marked as GP_ROLE_UTILITY to prevent unwanted * GP_ROLE_DISPATCH MyProc settings such as mppSessionId being valid and * mppIsWriter set to true. - * - * RecoveryInProgress() to see if we are in hot standby, because - * HotStandbyActive() is still true after promotion. */ - if (am_walsender || am_ftshandler || am_faulthandler || - (GpIdentity.segindex == -1 && RecoveryInProgress())) - { + if (am_walsender || am_ftshandler || am_faulthandler) Gp_role = GP_ROLE_UTILITY; - if (GpIdentity.segindex == -1 && RecoveryInProgress()) - elog(WARNING, "Force to run in utility mode in hot standby"); - } /* * ProcGlobal should be set up already (if we are a backend, we inherit diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index f29c9c2e606..37dfe9978c0 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5714,6 +5714,7 @@ PostgresMain(int argc, char *argv[], const char *serializedQueryDispatchDesc = NULL; const char *resgroupInfoBuf = NULL; + int is_hs_dispatch; int query_string_len = 0; int serializedDtxContextInfolen = 0; int serializedPlantreelen = 0; @@ -5750,6 +5751,20 @@ PostgresMain(int argc, char *argv[], cuid = pq_getmsgint(&input_message, 4); statementStart = pq_getmsgint64(&input_message); + + /* check if the message is from standby QD and is expected */ + is_hs_dispatch = pq_getmsgint(&input_message, 4); + if (is_hs_dispatch == 0 && IS_STANDBY_QE()) + ereport(ERROR, + (errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("mirror segments can only process MPP protocol messages from standby QD"), + errhint("Exit the current session and re-connect."))); + else if (is_hs_dispatch != 0 && !IS_STANDBY_QE()) + ereport(ERROR, + 
(errcode(ERRCODE_PROTOCOL_VIOLATION), + errmsg("primary segments can only process MPP protocol messages from primary QD"), + errhint("Exit the current session and re-connect."))); + query_string_len = pq_getmsgint(&input_message, 4); serializedPlantreelen = pq_getmsgint(&input_message, 4); serializedQueryDispatchDesclen = pq_getmsgint(&input_message, 4); diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index 2dfad411b7a..e8a73ceb201 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -11,6 +11,8 @@ #ifndef XLOG_H #define XLOG_H +#include "postgres.h" /* for Datum */ + #include "access/rmgr.h" #include "access/xlogdefs.h" #include "access/xloginsert.h" diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index 90af5177ce0..ab5963a0661 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -19,6 +19,7 @@ #ifndef CDBVARS_H #define CDBVARS_H +#include "access/xlog.h" /*RecoveryInProgress*/ #include "access/xlogdefs.h" /*XLogRecPtr*/ #include "cdb/cdbutil.h" /* MASTER_CONTENT_ID */ #ifdef USE_INTERNAL_FTS @@ -757,8 +758,10 @@ extern GpId GpIdentity; #define UNINITIALIZED_GP_IDENTITY_VALUE (-10000) #define IS_QUERY_DISPATCHER() (GpIdentity.segindex == MASTER_CONTENT_ID) +#define IS_HOT_STANDBY_QD() (EnableHotStandby && IS_QUERY_DISPATCHER() && RecoveryInProgress()) #define IS_QUERY_EXECUTOR_BACKEND() (Gp_role == GP_ROLE_EXECUTE && gp_session_id > 0) +#define IS_STANDBY_QE() (EnableHotStandby && IS_QUERY_EXECUTOR_BACKEND() && RecoveryInProgress()) /* Stores the listener port that this process uses to listen for incoming * Interconnect connections from other Motion nodes. 
diff --git a/src/test/isolation2/Makefile b/src/test/isolation2/Makefile index 759b2855513..bc1e0f66be0 100644 --- a/src/test/isolation2/Makefile +++ b/src/test/isolation2/Makefile @@ -90,3 +90,6 @@ installcheck-cbdb-parallel: install export PGOPTIONS='-c optimizer=off -c enable_parallel=true'; \ $(pg_isolation2_regress_installcheck) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/isolation2_schedule \ ) + +installcheck-hot-standby: install + $(pg_isolation2_regress_installcheck) $(EXTRA_REGRESS_OPTS) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/hot_standby_schedule --dbname=isolation2-hot-standby diff --git a/src/test/isolation2/expected/hot_standby/basic.out b/src/test/isolation2/expected/hot_standby/basic.out new file mode 100644 index 00000000000..704814217f7 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/basic.out @@ -0,0 +1,223 @@ +-- Tests for basic query dispatch on a hot standy. + +-- must show on +-1S: show hot_standby; + hot_standby +------------- + on +(1 row) + +-- will be checking if QD/QE info looks good +-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 +(1 row) + +---------------------------------------------------------------- +-- Test: basic query dispatch +---------------------------------------------------------------- +create table hs_t1(a int); +CREATE TABLE +create table hs_t2(a int); +CREATE TABLE +insert into hs_t1 select * from generate_series(1,10); +INSERT 0 10 + +-- standby should see the result +-1S: select * from hs_t1; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +-- we have three QEs launched on the mirror segments. +-- note that the first QE on a segment is still a "writer" because we +-- need it to manage locks, same as read-only queries on a primary QD. 
+-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 + 0 | w | 0 | 7005 + 1 | w | 1 | 7006 + 2 | w | 2 | 7007 +(4 rows) + +-- should have parallel readers launched +-1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2); + a | a +---+--- +(0 rows) +-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 + 0 | w | 0 | 7005 + 1 | w | 1 | 7006 + 2 | w | 2 | 7007 + 3 | r | 0 | 7005 + 4 | r | 1 | 7006 + 5 | r | 2 | 7007 +(7 rows) + +-- now a singleton reader added too +-1S: select * from hs_t1 join (select oid::int from pg_class) hs_t2 on (hs_t1 = hs_t2); + a | oid +---+----- +(0 rows) +-1S: select id, type, content, port from gp_backend_info(); + id | type | content | port +----+------+---------+------ + -1 | Q | -1 | 7001 + 0 | w | 0 | 7005 + 1 | w | 1 | 7006 + 2 | w | 2 | 7007 + 3 | r | 0 | 7005 + 4 | r | 1 | 7006 + 5 | r | 2 | 7007 + 6 | R | -1 | 7001 +(8 rows) + +-- un-committed result should not be seen by the standby +begin; +BEGIN +insert into hs_t1 select * from generate_series(11,20); +INSERT 0 10 + +-- standby should only see 1...10 +-1S: select * from hs_t1; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +end; +COMMIT + +-- standby should see 1...20 now +-1S: select * from hs_t1; + a +---- + 2 + 3 + 4 + 7 + 8 + 16 + 18 + 19 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 + 1 + 12 + 15 + 20 +(20 rows) + +---------------------------------------------------------------- +-- Test: other things that a hot standby can do +---------------------------------------------------------------- +-- set/reset and show GUC +-1S: set optimizer = on; +SET +-1S: show optimizer; + optimizer +----------- + on +(1 row) +-1S: reset optimizer; +RESET +-- copy command +-1S: copy hs_t1 to '/tmp/hs_copyto.csv' csv null ''; +COPY 20 +-- query catalogs +-1S: select count(*) from pg_class 
where relname = 'hs_t1'; + count +------- + 1 +(1 row) +-1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 8 | -1 | m | m | s | u +(1 row) + +-- Here are the things hot standby in PG can do but currently cannot in GPDB: +-- transaction block BEGIN...END; +-1S: begin; +ERROR: cannot make new WAL entries during recovery (xloginsert.c:135) +-1S: end; +COMMIT +-- cursor operation due to not supporting BEGIN...END yet; + +-- checkpoint is allowed on standby but a restart point is created instead +-1S: checkpoint; +CHECKPOINT + +---------------------------------------------------------------- +-- Test: things that can't be done on a hot standby in both PG and GDPB: +-- no DML, DDL or anything that generates WAL +---------------------------------------------------------------- +-1S: insert into hs_t1 values(1); +ERROR: cannot execute INSERT in a read-only transaction +-1S: delete from hs_t1; +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: delete from hs_t1; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. +-1S: update hs_t1 set a = 0; +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: update hs_t1 set a = 0; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. +-1S: create table hs_t2(a int); +ERROR: cannot execute CREATE TABLE in a read-only transaction +-1S: create database hs_db; +ERROR: cannot execute CREATE DATABASE in a read-only transaction +-1S: vacuum hs_t1; +ERROR: cannot execute VACUUM during recovery + +-- +-- No hintbit WAL generation in SELECT. 
+-- +create table hs_nohintbit(a int) distributed by (a); +CREATE TABLE +insert into hs_nohintbit select generate_series (1, 10); +INSERT 0 10 +-- flush the data to disk +checkpoint; +CHECKPOINT + +-1S: set gp_disable_tuple_hints=off; +SET +-- no WAL is being generated (otherwise an error would occur "cannot make new WAL entries during recovery") +-1S: SELECT count(*) FROM hs_nohintbit; + count +------- + 10 +(1 row) + diff --git a/src/test/isolation2/expected/hot_standby/faults.out b/src/test/isolation2/expected/hot_standby/faults.out new file mode 100644 index 00000000000..5fdade0d37d --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/faults.out @@ -0,0 +1,296 @@ +-- Test system faults scenarios + +-- start_matchsubs +-- +-- m/Is the server running on host.*/ +-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host and accepting/ +-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/ +-- s/(.*)/(seg IP:PORT)/ +-- m/ERROR: connection to dbid 1 .*:7000 failed .*/ +-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 :7000 failed/ +-- +-- end_matchsubs + +-- Let FTS detect/declare failure sooner +!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +create table hs_failover(a int); +CREATE TABLE +insert into hs_failover select * from generate_series(1,10); +INSERT 0 10 +-1S: select * from hs_failover; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) + +---------------------------------------------------------------- +-- Mirror segment fails +---------------------------------------------------------------- +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm'; + pg_ctl +-------- + OK +(1 row) + +-- make sure mirror is detected down +create temp table hs_tt(a int); +CREATE TABLE +select gp_request_fts_probe_scan(); + gp_request_fts_probe_scan 
+--------------------------- + t +(1 row) + +-- will not succeed +-1S: select * from hs_failover; +ERROR: Error on receive from seg1 slice1 127.0.1.1:7006 pid=26942: server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... + +-- recovery +!\retcode gprecoverseg -aF; +(exited with code 0) + +-- sync-up +select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) + +-- works now +-1S: select * from hs_failover; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +---------------------------------------------------------------- +-- Primary segment fails +---------------------------------------------------------------- +-- inject a fault where the mirror gets out of recovery +select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm'; + gp_inject_fault +----------------- + Success: +(1 row) + +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p'; + pg_ctl +-------- + OK +(1 row) +select gp_request_fts_probe_scan(); + gp_request_fts_probe_scan +--------------------------- + t +(1 row) + +-- make sure failover happens +select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 3 | 1 | m | p | n | d + 6 | 1 | p | m | n | u +(2 rows) +select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p'; + 
gp_inject_fault +----------------- + Success: +(1 row) + +-- On an existing standby connection, query will run but it is dispatched to the previous mirror +-- in an existing gang. That mirror is now a primary, so it will complain and the query fails. +-1S: select * from hs_failover; +ERROR: primary segments can only process MPP protocol messages from primary QD (seg1 slice1 127.0.1.1:7006 pid=14671) +HINT: Exit the current session and re-connect. +-1Sq: ... + +-- will fail due to downed mirror (previous primary) +-1S: select * from hs_failover; +ERROR: failed to acquire resources on one or more segments +DETAIL: could not connect to server: Connection refused + Is the server running on host "127.0.1.1" and accepting + TCP/IP connections on port 7003? + (seg1 127.0.1.1:7003) +-1Sq: ... + +-- bring the downed mirror up +!\retcode gprecoverseg -aF; +(exited with code 0) +select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) + +-- mirror is up +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 6 | 1 | p | m | s | u + 3 | 1 | m | p | s | u +(2 rows) + +-- now the query will succeed +-1S: select * from hs_failover; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) +-1Sq: ... 
+ +-- re-balance, bring the segments to their preferred roles +!\retcode gprecoverseg -ar; +(exited with code 0) +select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + dbid | content | role | preferred_role | mode | status +------+---------+------+----------------+------+-------- + 3 | 1 | p | p | s | u + 6 | 1 | m | m | s | u +(2 rows) + +-- query runs fine still +-1S: select * from hs_failover; + a +---- + 5 + 6 + 9 + 10 + 1 + 2 + 3 + 4 + 7 + 8 +(10 rows) + +---------------------------------------------------------------- +-- DTX recovery +---------------------------------------------------------------- +-- skip FTS probe to prevent unexpected mirror promotion +1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- inject fault to cripple QE right after a QE finished the prepare phase of 2PC +select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE +1: begin; +BEGIN +1: create table tt_hs_dtx(a int); +CREATE TABLE +1&: end; + +-- inject a panic on primary QD, essentially restarts the primary QD +2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2: select 1; +PANIC: fault triggered, fault name:'before_read_command' fault type:'panic' +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while 
processing the request. + +1<: <... completed> +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +1q: ... +2q: ... + +-- standby QD can still run query +-1S: select * from hs_failover; + a +---- + 1 + 10 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) +-- XXX: currently it sees the in-doubt DTX but it shouldnt' when we supported DTX isolation. +-1S: select * from tt_hs_dtx; + a +--- +(0 rows) + +-- resets the fault +-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault_infinite +-------------------------- + Success: +(1 row) + +-1S: select wait_until_all_segments_synchronized(); + wait_until_all_segments_synchronized +-------------------------------------- + OK +(1 row) +1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; + gp_inject_fault +----------------- + Success: +(1 row) diff --git a/src/test/isolation2/expected/hot_standby/setup.out b/src/test/isolation2/expected/hot_standby/setup.out new file mode 100644 index 00000000000..65fa2164584 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/setup.out @@ -0,0 +1,8 @@ +-- setup for hot standby tests +!\retcode gpconfig -c hot_standby -v on; +(exited with code 0) +-- let primary wait for standby to apply changes, make test less flaky +!\retcode gpconfig -c synchronous_commit -v remote_apply; +(exited with code 0) +!\retcode gpstop -ar; +(exited with code 0) diff --git a/src/test/isolation2/expected/hot_standby/teardown.out b/src/test/isolation2/expected/hot_standby/teardown.out new file mode 100644 index 00000000000..d118ce08fa8 --- /dev/null +++ 
b/src/test/isolation2/expected/hot_standby/teardown.out @@ -0,0 +1,7 @@ +-- reset the setup for hot standby tests +!\retcode gpconfig -r hot_standby; +(exited with code 0) +!\retcode gpconfig -r synchronous_commit; +(exited with code 0) +!\retcode gpstop -ar; +(exited with code 0) diff --git a/src/test/isolation2/hot_standby_schedule b/src/test/isolation2/hot_standby_schedule new file mode 100644 index 00000000000..884610d9870 --- /dev/null +++ b/src/test/isolation2/hot_standby_schedule @@ -0,0 +1,4 @@ +test: hot_standby/setup +test: hot_standby/basic +test: hot_standby/faults +test: hot_standby/teardown diff --git a/src/test/isolation2/sql/hot_standby/basic.sql b/src/test/isolation2/sql/hot_standby/basic.sql new file mode 100644 index 00000000000..64a0bb5f0c6 --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/basic.sql @@ -0,0 +1,88 @@ +-- Tests for basic query dispatch on a hot standy. + +-- must show on +-1S: show hot_standby; + +-- will be checking if QD/QE info looks good +-1S: select id, type, content, port from gp_backend_info(); + +---------------------------------------------------------------- +-- Test: basic query dispatch +---------------------------------------------------------------- +create table hs_t1(a int); +create table hs_t2(a int); +insert into hs_t1 select * from generate_series(1,10); + +-- standby should see the result +-1S: select * from hs_t1; + +-- we have three QEs launched on the mirror segments. +-- note that the first QE on a segment is still a "writer" because we +-- need it to manage locks, same as read-only queries on a primary QD. 
+-1S: select id, type, content, port from gp_backend_info(); + +-- should have parallel readers launched +-1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2); +-1S: select id, type, content, port from gp_backend_info(); + +-- now a singleton reader added too +-1S: select * from hs_t1 join (select oid::int from pg_class) hs_t2 on (hs_t1 = hs_t2); +-1S: select id, type, content, port from gp_backend_info(); + +-- un-committed result should not be seen by the standby +begin; +insert into hs_t1 select * from generate_series(11,20); + +-- standby should only see 1...10 +-1S: select * from hs_t1; + +end; + +-- standby should see 1...20 now +-1S: select * from hs_t1; + +---------------------------------------------------------------- +-- Test: other things that a hot standby can do +---------------------------------------------------------------- +-- set/reset and show GUC +-1S: set optimizer = on; +-1S: show optimizer; +-1S: reset optimizer; +-- copy command +-1S: copy hs_t1 to '/tmp/hs_copyto.csv' csv null ''; +-- query catalogs +-1S: select count(*) from pg_class where relname = 'hs_t1'; +-1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer; + +-- Here are the things hot standby in PG can do but currently cannot in GPDB: +-- transaction block BEGIN...END; +-1S: begin; +-1S: end; +-- cursor operation due to not supporting BEGIN...END yet; + +-- checkpoint is allowed on standby but a restart point is created instead +-1S: checkpoint; + +---------------------------------------------------------------- +-- Test: things that can't be done on a hot standby in both PG and GDPB: +-- no DML, DDL or anything that generates WAL +---------------------------------------------------------------- +-1S: insert into hs_t1 values(1); +-1S: delete from hs_t1; +-1S: update hs_t1 set a = 0; +-1S: create table hs_t2(a int); +-1S: create database hs_db; +-1S: vacuum hs_t1; + +-- +-- No 
hintbit WAL generation in SELECT. +-- +create table hs_nohintbit(a int) distributed by (a); +insert into hs_nohintbit select generate_series (1, 10); +-- flush the data to disk +checkpoint; + +-1S: set gp_disable_tuple_hints=off; +-- no WAL is being generated (otherwise an error would occur "cannot make new WAL entries during recovery") +-1S: SELECT count(*) FROM hs_nohintbit; + diff --git a/src/test/isolation2/sql/hot_standby/faults.sql b/src/test/isolation2/sql/hot_standby/faults.sql new file mode 100644 index 00000000000..ed82753615e --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/faults.sql @@ -0,0 +1,118 @@ +-- Test system faults scenarios + +-- start_matchsubs +-- +-- m/Is the server running on host.*/ +-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host and accepting/ +-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/ +-- s/(.*)/(seg IP:PORT)/ +-- m/ERROR: connection to dbid 1 .*:7000 failed .*/ +-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 :7000 failed/ +-- +-- end_matchsubs + +-- Let FTS detect/declare failure sooner +!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly; +!\retcode gpstop -u; + +create table hs_failover(a int); +insert into hs_failover select * from generate_series(1,10); +-1S: select * from hs_failover; + +---------------------------------------------------------------- +-- Mirror segment fails +---------------------------------------------------------------- +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm'; + +-- make sure mirror is detected down +create temp table hs_tt(a int); +select gp_request_fts_probe_scan(); + +-- will not succeed +-1S: select * from hs_failover; +-1Sq: + +-- recovery +!\retcode gprecoverseg -aF; + +-- sync-up +select wait_until_all_segments_synchronized(); + +-- works now +-1S: select * from hs_failover; + +---------------------------------------------------------------- 
+-- Primary segment fails +---------------------------------------------------------------- +-- inject a fault where the mirror gets out of recovery +select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm'; + +select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p'; +select gp_request_fts_probe_scan(); + +-- make sure failover happens +select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; +select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p'; +select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p'; + +-- On an existing standby connection, query will run but it is dispatched to the previous mirror +-- in an existing gang. That mirror is now a primary, so it will complain and the query fails. 
+-1S: select * from hs_failover; +-1Sq: + +-- will fail due to downed mirror (previous primary) +-1S: select * from hs_failover; +-1Sq: + +-- bring the downed mirror up +!\retcode gprecoverseg -aF; +select wait_until_all_segments_synchronized(); + +-- mirror is up +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + +-- now the query will succeed +-1S: select * from hs_failover; +-1Sq: + +-- re-balance, bring the segments to their preferred roles +!\retcode gprecoverseg -ar; +select wait_until_all_segments_synchronized(); +-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1; + +-- query runs fine still +-1S: select * from hs_failover; + +---------------------------------------------------------------- +-- DTX recovery +---------------------------------------------------------------- +-- skip FTS probe to prevent unexpected mirror promotion +1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1; + +-- inject fault to cripple QE right after a QE finished the prepare phase of 2PC +select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; + +-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE +1: begin; +1: create table tt_hs_dtx(a int); +1&: end; + +-- inject a panic on primary QD, essentially restarts the primary QD +2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p'; +2: select 1; + +1<: +1q: +2q: + +-- standby QD can still run query +-1S: select * from hs_failover; +-- XXX: currently it sees the in-doubt DTX but it shouldnt' when we supported DTX isolation. 
+-1S: select * from tt_hs_dtx; + +-- resets the fault +-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p'; + +-1S: select wait_until_all_segments_synchronized(); +1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; +1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; diff --git a/src/test/isolation2/sql/hot_standby/setup.sql b/src/test/isolation2/sql/hot_standby/setup.sql new file mode 100644 index 00000000000..cdf4ec67bd1 --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/setup.sql @@ -0,0 +1,5 @@ +-- setup for hot standby tests +!\retcode gpconfig -c hot_standby -v on; +-- let primary wait for standby to apply changes, make test less flaky +!\retcode gpconfig -c synchronous_commit -v remote_apply; +!\retcode gpstop -ar; diff --git a/src/test/isolation2/sql/hot_standby/teardown.sql b/src/test/isolation2/sql/hot_standby/teardown.sql new file mode 100644 index 00000000000..3544c1d9beb --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/teardown.sql @@ -0,0 +1,4 @@ +-- reset the setup for hot standby tests +!\retcode gpconfig -r hot_standby; +!\retcode gpconfig -r synchronous_commit; +!\retcode gpstop -ar; From 7daf1a0e478ab3589ea455c73a4b18e2f950a56f Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Wed, 4 Jun 2025 11:03:43 +0800 Subject: [PATCH 02/11] Revert "Revert "Include distributed xid in transaction commit WAL in all cases"" This reverts commit e1c99e4d77be44d5400530fd32ce1ace992b2fb1. 
--- src/backend/access/transam/xact.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index f3f2a035281..2ac0af034ec 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -6880,8 +6880,8 @@ XactLogCommitRecord(TimestampTz commit_time, xl_xact_distrib xl_distrib; xl_xact_deldbs xl_deldbs; XLogRecPtr recptr; - bool isOnePhaseQE = (Gp_role == GP_ROLE_EXECUTE && MyTmGxactLocal->isOnePhaseCommit); bool isDtxPrepared = isPreparedDtxTransaction(); + DistributedTransactionId distrib_xid = getDistributedTransactionId(); uint8 info; @@ -6971,10 +6971,11 @@ XactLogCommitRecord(TimestampTz commit_time, xl_origin.origin_timestamp = replorigin_session_origin_timestamp; } - if (isDtxPrepared || isOnePhaseQE) + /* include distributed xid if there's one */ + if (distrib_xid != InvalidDistributedTransactionId) { xl_xinfo.xinfo |= XACT_XINFO_HAS_DISTRIB; - xl_distrib.distrib_xid = getDistributedTransactionId(); + xl_distrib.distrib_xid = distrib_xid; } #if 0 From 299aa1565e2e2ea2c8ae104d1d5520208626e230 Mon Sep 17 00:00:00 2001 From: Kate Dontsova Date: Thu, 28 Mar 2024 16:05:56 -0700 Subject: [PATCH 03/11] Refactor restore point pausing logic for continuous archive recovery We currently have the GPDB-specific gp_pause_on_restore_point_replay hidden developer GUC which allows us to pause when replaying a restore point record. The logic was a bit flawed and needed some refactoring to accommodate the current hot standby work. These are the changes that were made: * The gp_pause_on_restore_point_replay GUC has been changed from a boolean type to a string type. This allows us to set exactly which restore point to pause on (assuming the restore points provided are unique). The user/application can update the GUC, do a reload, and resume WAL replay to advance towards the next restore point to pause on. 
* The pausing logic has been moved out of the xlog_redo() function and into its own separate function. If WAL replay has reached the restore point designated in the gp_pause_on_restore_point_replay GUC, it will now pause near the end of the main redo apply loop. When resumed (via a `SELECT pg_wal_replay_resume()` call), we check if a promotion has been requested. If there is a promotion request, then the continuous recovery target has been reached where we will then stop recovery and go through promotion by piggybacking on the existing recovery target logic. Co-authored-by: Jimmy Yih Co-authored-by: Soumyadeep Chakraborty --- .abi-check/7.1.0/postgres.symbols.ignore | 8 + src/backend/access/transam/xlog.c | 82 ++++++++-- src/backend/utils/misc/guc_gp.c | 21 +-- src/include/access/transam.h | 2 +- src/test/perl/PostgresNode.pm | 44 ++++++ .../t/101_restore_point_and_startup_pause.pl | 140 +++++++++++++----- 6 files changed, 242 insertions(+), 55 deletions(-) diff --git a/.abi-check/7.1.0/postgres.symbols.ignore b/.abi-check/7.1.0/postgres.symbols.ignore index 848dbf2841d..2f629d94fdc 100644 --- a/.abi-check/7.1.0/postgres.symbols.ignore +++ b/.abi-check/7.1.0/postgres.symbols.ignore @@ -1 +1,9 @@ pgarch_start +ConfigureNamesInt_gp +child_triggers +has_update_triggers +ConfigureNamesBool_gp +aocs_beginscan +AppendOnlyBlockDirectory_GetEntry +ConfigureNamesString_gp +gp_pause_on_restore_point_replay diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 469f078aa59..857fa336433 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -133,7 +133,14 @@ bool track_wal_io_timing = false; int FileEncryptionEnabled = false; /* GPDB specific */ -bool gp_pause_on_restore_point_replay = false; +char *gp_pause_on_restore_point_replay = ""; + +/* + * GPDB: Have we reached a specific continuous recovery target? 
We set this to + * true if WAL replay has found a restore point matching the GPDB-specific GUC + * gp_pause_on_restore_point_replay and a promotion has been requested. + */ +static bool reachedContinuousRecoveryTarget = false; #ifdef WAL_DEBUG bool XLOG_DEBUG = false; @@ -6012,6 +6019,59 @@ recoveryStopsBefore(XLogReaderState *record) return stopsHere; } +/* + * GPDB: Restore point records can act as a point of synchronization to ensure + * cluster-wide consistency during WAL replay. If a restore point is specified + * in the gp_pause_on_restore_point_replay GUC, WAL replay will be paused at + * that restore point until replay is explicitly resumed. + */ +static void +pauseRecoveryOnRestorePoint(XLogReaderState *record) +{ + uint8 info; + uint8 rmid; + + /* + * Ignore recovery target settings when not in archive recovery (meaning + * we are in crash recovery). + */ + if (!ArchiveRecoveryRequested) + return; + + info = XLogRecGetInfo(record) & ~XLR_INFO_MASK; + rmid = XLogRecGetRmid(record); + + if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) + { + xl_restore_point *recordRestorePointData; + + recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); + + if (strcmp(recordRestorePointData->rp_name, gp_pause_on_restore_point_replay) == 0) + { + ereport(LOG, + (errmsg("setting recovery pause at restore point \"%s\", time %s", + recordRestorePointData->rp_name, + timestamptz_to_str(recordRestorePointData->rp_time)))); + + SetRecoveryPause(true); + recoveryPausesHere(); + + /* + * If we've unpaused and there is a promotion request, then we've + * reached our continuous recovery target and need to immediately + * promote. We piggyback on the existing recovery target logic to + * do this. See recoveryStopsAfter(). + */ + if (CheckForStandbyTrigger()) + { + reachedContinuousRecoveryTarget = true; + recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE; + } + } + } +} + /* * Same as recoveryStopsBefore, but called after applying the record. 
* @@ -6039,15 +6099,19 @@ recoveryStopsAfter(XLogReaderState *record) /* * There can be many restore points that share the same name; we stop at * the first one. + * + * GPDB: If we've reached the continuous recovery target, we'll use the + * below logic to immediately stop recovery. */ - if (recoveryTarget == RECOVERY_TARGET_NAME && + if ((reachedContinuousRecoveryTarget || recoveryTarget == RECOVERY_TARGET_NAME) && rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT) { xl_restore_point *recordRestorePointData; recordRestorePointData = (xl_restore_point *) XLogRecGetData(record); - if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) + if (reachedContinuousRecoveryTarget || + strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0) { recoveryStopAfter = true; recoveryStopXid = InvalidTransactionId; @@ -7900,6 +7964,9 @@ StartupXLOG(void) WalSndWakeup(); } + if (gp_pause_on_restore_point_replay) + pauseRecoveryOnRestorePoint(xlogreader); + /* Exit loop if we reached inclusive recovery target */ if (recoveryStopsAfter(xlogreader)) { @@ -11128,14 +11195,7 @@ xlog_redo(XLogReaderState *record) } else if (info == XLOG_RESTORE_POINT) { - /* - * GPDB: Restore point records can act as a point of - * synchronization to ensure cluster-wide consistency during WAL - * replay. WAL replay is paused at each restore point until it is - * explicitly resumed. 
- */ - if (gp_pause_on_restore_point_replay) - SetRecoveryPause(true); + /* nothing to do here */ } else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT) { diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index 70d51316875..df83e03b766 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -3095,16 +3095,6 @@ struct config_bool ConfigureNamesBool_gp[] = NULL, NULL, NULL }, - { - {"gp_pause_on_restore_point_replay", PGC_SIGHUP, DEVELOPER_OPTIONS, - gettext_noop("Pause recovery when a restore point is replayed."), - NULL, - GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE - }, - &gp_pause_on_restore_point_replay, - false, - NULL, NULL, NULL - }, { {"gp_autostats_allow_nonowner", PGC_SUSET, DEVELOPER_OPTIONS, gettext_noop("Allow automatic stats collection on tables even for users who are not the owner of the relation."), @@ -5052,6 +5042,17 @@ struct config_string ConfigureNamesString_gp[] = "udpifc", check_gp_interconnect_type, assign_gp_interconnect_type, show_gp_interconnect_type }, + { + {"gp_pause_on_restore_point_replay", PGC_SUSET, DEVELOPER_OPTIONS, + gettext_noop("Specifies the restore point to pause replay on."), + gettext_noop("Unlike recovery_target_name, this can be used to continuously set/reset " + "how much a standby should replay up to."), + GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE + }, + &gp_pause_on_restore_point_replay, + "", + NULL, NULL, NULL + }, /* End-of-list marker */ { diff --git a/src/include/access/transam.h b/src/include/access/transam.h index cec3e5f4cb7..687799bec9f 100644 --- a/src/include/access/transam.h +++ b/src/include/access/transam.h @@ -301,7 +301,7 @@ extern int xid_stop_limit; extern int xid_warn_limit; /* GPDB-specific */ -extern bool gp_pause_on_restore_point_replay; +extern char *gp_pause_on_restore_point_replay; /* hook for plugins to assign new relfilenode */ typedef Oid (*NewSegRelfilenode_assign_hook_type)(void); diff --git a/src/test/perl/PostgresNode.pm 
b/src/test/perl/PostgresNode.pm index 262e4e74fbe..9e6d4c653b9 100644 --- a/src/test/perl/PostgresNode.pm +++ b/src/test/perl/PostgresNode.pm @@ -603,6 +603,50 @@ sub append_conf =pod +=item $node->adjust_conf(filename, setting, value, skip_equals) + +Modify the named config file setting with the value. If the value is undefined, +instead delete the setting. If the setting is not present no action is taken. + +This will write "$setting = $value\n" in place of the existing line, +unless skip_equals is true, in which case it will write +"$setting $value\n". If the value needs to be quoted it is the caller's +responsibility to do that. + +=cut + +sub adjust_conf +{ + my ($self, $filename, $setting, $value, $skip_equals) = @_; + + my $conffile = $self->data_dir . '/' . $filename; + + my $contents = PostgreSQL::Test::Utils::slurp_file($conffile); + my @lines = split(/\n/, $contents); + my @result; + my $eq = $skip_equals ? '' : '= '; + foreach my $line (@lines) + { + if ($line !~ /^$setting\W/) + { + push(@result, "$line\n"); + } + elsif (defined $value) + { + push(@result, "$setting $eq$value\n"); + } + } + open my $fh, ">", $conffile + or croak "could not write \"$conffile\": $!"; + print $fh @result; + close $fh; + + chmod($self->group_access() ? 0640 : 0600, $conffile) + or die("unable to set permissions for $conffile"); +} + +=pod + =item $node->backup(backup_name) Create a hot backup with B in subdirectory B of diff --git a/src/test/recovery/t/101_restore_point_and_startup_pause.pl b/src/test/recovery/t/101_restore_point_and_startup_pause.pl index cda572524c1..f59acffb7ad 100644 --- a/src/test/recovery/t/101_restore_point_and_startup_pause.pl +++ b/src/test/recovery/t/101_restore_point_and_startup_pause.pl @@ -1,48 +1,122 @@ -# test for pausing on startup and on a specified restore point +# Test for pausing and resuming recovery at specific restore points, +# both at initial startup and in a continuous fashion by advancing +# gp_pause_on_restore_point_replay. 
+ use strict; use warnings; use PostgresNode; use TestLib; -use Test::More tests => 1; +use Test::More tests => 12; use File::Copy; -# Initialize primary node with WAL archiving setup +# Initialize and start primary node my $node_primary = get_new_node('primary'); -$node_primary->init( - has_archiving => 1, - allows_streaming => 1); -$node_primary->append_conf('postgresql.conf', "wal_level = 'replica'"); -$node_primary->append_conf('postgresql.conf', "max_wal_senders = 10"); -my $backup_name = 'my_backup'; - -# Start primary +$node_primary->init(has_archiving => 1, allows_streaming => 1); $node_primary->start; -# Initialize standby node from backup, fetching WAL from archives -$node_primary->backup($backup_name); -my $node_standby = get_new_node('standby'); -$node_standby->init_from_backup($node_primary, $backup_name, - has_restoring => 1); -$node_standby->append_conf('postgresql.conf', "gp_pause_on_restore_point_replay = on"); +my $node_standby = get_new_node("standby"); + +sub test_pause_in_recovery +{ + my ($restore_point, $test_lsn, $num_rows) = @_; + + # Wait until standby has replayed enough data + my $caughtup_query = "SELECT pg_last_wal_replay_lsn() = '$test_lsn'::pg_lsn"; + $node_standby->poll_query_until('postgres', $caughtup_query) + or die "Timed out while waiting for standby to catch up"; + + # Check data has been replayed + my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;"); + is($result, $num_rows, "check standby content for $restore_point"); + ok($node_standby->safe_psql('postgres', 'SELECT pg_is_wal_replay_paused();') eq 't', + "standby is paused in recovery on $restore_point"); +} + +# Create data before taking the backup +$node_primary->safe_psql('postgres', "CREATE TABLE table_foo AS SELECT generate_series(1,1000);"); +# Take backup from which all operations will be run +$node_primary->backup('my_backup'); +my $lsn0 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp0');"); +# Switching 
WAL guarantees that the restore point is available to the standby +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# Add more data, create restore points and switch wal to guarantee +# that the restore point is available to the standby + +# rp1 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(1001,2000))"); +my $lsn1 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp1');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# rp2 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(2001, 3000))"); +my $lsn2 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp2');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# rp3 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(3001, 4000))"); +$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp3');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); -# Start standby +# rp4 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(4001, 5000))"); +my $lsn4 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp4');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# rp5 +$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(5001, 6000))"); +$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp5');"); +$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();"); + +# Restore the backup +$node_standby->init_from_backup($node_primary, 'my_backup', has_restoring => 1); +# Enable `hot_standby` +$node_standby->append_conf('postgresql.conf', qq(hot_standby = 'on')); + +# Set rp0 as a restore point to pause on start up +$node_standby->append_conf('postgresql.conf', qq(gp_pause_on_restore_point_replay = 'rp0')); +# Start the standby $node_standby->start; 
+test_pause_in_recovery('rp0', $lsn0, 1000); + +# Advance to rp1 +$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp1"); +$node_standby->reload; +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); +test_pause_in_recovery('rp1', $lsn1, 2000); + +# Advance to rp2 +$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp2"); +$node_standby->reload; +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); +test_pause_in_recovery('rp2', $lsn2, 3000); + +# Verify that a restart will bring us back to rp2 +$node_standby->restart; +test_pause_in_recovery('rp2', $lsn2, 3000); + +# Skip rp3 and advance to rp4 +$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp4"); +$node_standby->reload; +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); +test_pause_in_recovery('rp4', $lsn4, 5000); + +# Do not advance to rp5; signal promote and then resume recovery +$node_standby->safe_psql('postgres', "SELECT pg_promote(false);"); +$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();"); -# Create a restore point on the primary -my $restore_point_lsn = - $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp')"); +# Wait for standby to promote +$node_standby->poll_query_until('postgres', "SELECT NOT pg_is_in_recovery();") + or die "Timed out while waiting for standby to exit recovery"; -# Force archival of WAL file to make it present on standby -$node_primary->safe_psql('postgres', "SELECT pg_switch_wal()"); +# Check that we promoted with rp4's table count and not rp5's +my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;"); +is($result, 5000, "check standby content after promotion"); -# Wait until enough replay has been done on the standby before checking if replay -# is paused at the restore point -my $caughtup_query = - "SELECT '$restore_point_lsn'::pg_lsn <= 
pg_last_wal_replay_lsn()"; -$node_standby->poll_query_until('postgres', $caughtup_query) - or die "Timed out while waiting for standby to catch up"; +# Make sure the former standby is now writable +$node_standby->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(6001, 7000));"); +$result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;"); +is($result, 6000, "check standby is writable after promotion"); -my $paused_at_restore_point_query = - "SELECT pg_is_wal_replay_paused() and pg_last_wal_replay_lsn() = '$restore_point_lsn'::pg_lsn"; -my $result2 = $node_standby->safe_psql('postgres', $paused_at_restore_point_query); -is($result2, qq(t), 'check if WAL replay is paused at restore point'); +$node_primary->teardown_node; +$node_standby->teardown_node; From fac75fa87843961f04cb9a5aefe6278a700c9ccb Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Tue, 9 Apr 2024 08:08:14 -0700 Subject: [PATCH 04/11] Add XLOG_LATESTCOMPLETED_GXID To support hot standby, we need to reconstruct the state of running dtx at the time of checkpoint on the standby. It is key to the correctness of distributed snapshot the standby will use. One key piece of information is latestCompletedGxid - it provides the xmax of the snapshot. But unlike primary who just sets latestCompletedGxid = nextGxid - 1, the standby cannot use nextGxid. This is because nextGxid was bumped in the checkpoint and cannot represent the xmax of running dtx (see CreateCheckPoint). It is OK for the primary since it does not need to reconstruct the running dtx. So now we introduce a new XLOG type XLOG_LATESTCOMPLETED_GXID which directly writes the latestCompletedGxid at the checkpoint time. It is only written on QD and when hot standby is active. P.S. the alternative is to bump nextGxid at the startup instead of checkpoint, so its value can be used for the standby to initialize latestCompletedGxid. 
But for the primary, it would be impossible to know the correct number of gxid to bump, since gp_gxid_prefetch_num can change before restart. CBDB: Change the rmgr from XLOG to STANDBY as there is no room in the 4 high bits in xl_info. And it makes sense to put it into STANDBY rmgr since it is used to make hot_standby snapshot. --- src/backend/access/rmgrdesc/standbydesc.c | 11 ++++++++ src/backend/access/transam/xlog.c | 5 +++- src/backend/replication/logical/decode.c | 3 +++ src/backend/storage/ipc/standby.c | 33 +++++++++++++++++++++++ src/include/storage/standbydefs.h | 1 + 5 files changed, 52 insertions(+), 1 deletion(-) diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c index 01ee7ac6d2c..899c621b240 100644 --- a/src/backend/access/rmgrdesc/standbydesc.c +++ b/src/backend/access/rmgrdesc/standbydesc.c @@ -66,6 +66,14 @@ standby_desc(StringInfo buf, XLogReaderState *record) xlrec->dbId, xlrec->tsId, xlrec->relcacheInitFileInval); } + else if (info == XLOG_LATESTCOMPLETED_GXID) + { + DistributedTransactionId gxid; + + gxid = *((DistributedTransactionId *) rec); + appendStringInfo(buf, UINT64_FORMAT, gxid); + } + } const char * @@ -84,6 +92,9 @@ standby_identify(uint8 info) case XLOG_INVALIDATIONS: id = "INVALIDATIONS"; break; + case XLOG_LATESTCOMPLETED_GXID: + id = "XLOG_LATESTCOMPLETED_GXID"; + break; } return id; diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index 857fa336433..ffc8714cf62 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -6055,7 +6055,7 @@ pauseRecoveryOnRestorePoint(XLogReaderState *record) timestamptz_to_str(recordRestorePointData->rp_time)))); SetRecoveryPause(true); - recoveryPausesHere(); + recoveryPausesHere(false); /* * If we've unpaused and there is a promotion request, then we've @@ -9870,8 +9870,11 @@ CreateCheckPoint(int flags) * recovery we don't need to write running xact data. 
*/ if (!shutdown && XLogStandbyInfoActive()) + { LogStandbySnapshot(); + } + SIMPLE_FAULT_INJECTOR("checkpoint_after_redo_calculated"); START_CRIT_SECTION(); diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c index 1a835983222..68524222d71 100644 --- a/src/backend/replication/logical/decode.c +++ b/src/backend/replication/logical/decode.c @@ -371,6 +371,9 @@ standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf) * XLOG_XACT_INVALIDATIONS. So we don't need to do anything here. */ break; + case XLOG_LATESTCOMPLETED_GXID: + /* FIXME: need to decode this part? */ + break; default: elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info); } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 687ce03767d..8891b4dbcb4 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -21,6 +21,7 @@ #include "access/xact.h" #include "access/xlog.h" #include "access/xloginsert.h" +#include "cdb/cdbvars.h" #include "miscadmin.h" #include "pgstat.h" #include "storage/bufmgr.h" @@ -1148,6 +1149,23 @@ standby_redo(XLogReaderState *record) xlrec->dbId, xlrec->tsId); } + else if (info == XLOG_LATESTCOMPLETED_GXID) + { + /* + * This record is only logged by coordinator. But the segment in + * some situation might see it too (e.g. gpexpand), but segment + * doesn't need to update latestCompletedGxid. 
+ */ + if (IS_QUERY_DISPATCHER()) + { + DistributedTransactionId gxid; + + gxid = *((DistributedTransactionId *) XLogRecGetData(record)); + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + ShmemVariableCache->latestCompletedGxid = gxid; + LWLockRelease(ProcArrayLock); + } + } else elog(PANIC, "standby_redo: unknown op code %u", info); } @@ -1265,6 +1283,21 @@ LogStandbySnapshot(void) /* GetRunningTransactionData() acquired XidGenLock, we must release it */ LWLockRelease(XidGenLock); + if (IS_QUERY_DISPATCHER()) + { + /* + * GPDB: write latestCompletedGxid too, because the standby needs this + * value for creating distributed snapshot. The standby cannot rely on + * the nextGxid value to set latestCompletedGxid during restart (which + * the primary does) because nextGxid was bumped in the checkpoint. + */ + LWLockAcquire(ProcArrayLock, LW_SHARED); + DistributedTransactionId lcgxid = ShmemVariableCache->latestCompletedGxid; + LWLockRelease(ProcArrayLock); + XLogBeginInsert(); + XLogRegisterData((char *) (&lcgxid), sizeof(lcgxid)); + recptr = XLogInsert(RM_STANDBY_ID, XLOG_LATESTCOMPLETED_GXID); + } return recptr; } diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h index d99e6f40c6d..f007fe25245 100644 --- a/src/include/storage/standbydefs.h +++ b/src/include/storage/standbydefs.h @@ -34,6 +34,7 @@ extern void standby_desc_invalidations(StringInfo buf, #define XLOG_STANDBY_LOCK 0x00 #define XLOG_RUNNING_XACTS 0x10 #define XLOG_INVALIDATIONS 0x20 +#define XLOG_LATESTCOMPLETED_GXID 0xF0 typedef struct xl_standby_locks { From 5bf5cb15c54fb4eba6a3193423f8828505bf13d1 Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Thu, 11 Jan 2024 21:13:55 -0800 Subject: [PATCH 05/11] Support read-committed dtx isolation for hot standby The previous few commits have removed some road blocks for supporting it. 
This commit mainly deals with two more aspects of distributed transactions: * Initialize latestCompletedGxid during StartupXLOG, and update it while the standby replays new transactions. * Construct an in-progress dtx array when creating distributed snapshot according to the shmCommittedGxidArray[] we already keep in the standby. It was pondered whether or not to add a new WAL type XLOG_RUNNING_DISTRIBUTED_XACTS similar to XLOG_RUNNING_XACTS. But it seems unnecessary at the moment: we already have the running dtx information in the checkpoint record. The other information in the XLOG_RUNNING_XACTS record does not seem to be needed to support read-committed isolation. There are a few other callers of ProcArrayApplyRecoveryInfo() that rely on the XLOG_RUNNING_XACTS, but it doesn't seem we have a need to emulate them for dtx. --- .abi-check/7.1.0/postgres.symbols.ignore | 3 + src/backend/access/transam/README | 45 ++ src/backend/access/transam/xact.c | 5 +- src/backend/cdb/cdbdtxrecovery.c | 47 +- src/backend/cdb/cdbtm.c | 5 +- src/backend/storage/ipc/procarray.c | 41 +- src/backend/storage/lmgr/lwlocknames.txt | 1 + src/backend/tcop/postgres.c | 2 + src/backend/tcop/pquery.c | 4 + .../isolation2/expected/hot_standby/basic.out | 50 +- .../expected/hot_standby/faults.out | 47 +- .../hot_standby/transaction_isolation.out | 755 ++++++++++++++++++ src/test/isolation2/hot_standby_schedule | 1 + src/test/isolation2/sql/hot_standby/basic.sql | 16 +- .../isolation2/sql/hot_standby/faults.sql | 19 +- .../sql/hot_standby/transaction_isolation.sql | 244 ++++++ 16 files changed, 1247 insertions(+), 38 deletions(-) create mode 100644 src/test/isolation2/expected/hot_standby/transaction_isolation.out create mode 100644 src/test/isolation2/sql/hot_standby/transaction_isolation.sql diff --git a/.abi-check/7.1.0/postgres.symbols.ignore b/.abi-check/7.1.0/postgres.symbols.ignore index 2f629d94fdc..d42d77c4039 100644 --- a/.abi-check/7.1.0/postgres.symbols.ignore +++ 
b/.abi-check/7.1.0/postgres.symbols.ignore @@ -7,3 +7,6 @@ aocs_beginscan AppendOnlyBlockDirectory_GetEntry ConfigureNamesString_gp gp_pause_on_restore_point_replay +ConfigureNamesReal_gp +TableAmRoutine +MainLWLockNames diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README index f3112ff3070..efac0cb505e 100644 --- a/src/backend/access/transam/README +++ b/src/backend/access/transam/README @@ -897,3 +897,48 @@ yet simplifies emulation of subtransactions considerably. Further details on locking mechanics in recovery are given in comments with the Lock rmgr code. + +Distributed Transaction Emulation during Recovery +------------------------------------- + +In GPDB, the MVCC snapshot also includes distributed transactions (aka dtx). +Accordingly, on a hot standby we also emulate running dtx. The way to do that +is to re-use the shmCommittedGxidArray which has been used on a primary for dtx +recovery: it tracks all the 2PC dtx that have their PREPARE phase done, +but for which the COMMIT phase hasn't finished (i.e. window between the +XLOG_XACT_DISTRIBUTED_COMMIT record being written and the +XLOG_XACT_DISTRIBUTED_FORGET record being written on the QD). On a hot standby, +any dtx shown in that array are regarded as in-progress. The MVCC snapshot does +not really need to account for dtx not in that array: for a dtx that hasn't +done PREPARE, we know no segment has committed any data yet; for a dtx that +hasn't done COMMIT, we know all segments have committed their data. + +Note: dtxes that are preparing will not be tracked in this array, and thus will +not be included in this snapshot. This is slightly different from a primary QD, +where such transactions would have been included in the distributed snapshot's +inProgressXidArray (as we construct the inProgressXidArray from the PGXACTs that +would contain the dummy entries for prepared transactions). 
However, as +mentioned in CreateDistributedSnapshot, including these is not a requirement for +correctness. + +Note: aborted/aborting dtxes are not accounted for by the standby either. Those +are the dtxes that encountered error during preparing. As with the previous +point, the standby does not need to be aware of them for correctness. Worth also +noting that if a dtx encountered error after being prepared, it cannot be +aborted anymore and must be committed by the dtx recovery process. Until +committed, such a dtx will be seen as in-progress to the standby. + +For 1PC dtx, however, there is a known limitation where the hot standby won't +see the last 1PC (or the last few 1PCs if they are all 1PC). This is because, +since 1PC does not have any WAL on QD, the standby QD won't advance its +latestCompletedGxid, so its distributed snapshot horizon does not include the +last 1PC - it would view the last 1PC as not yet started or at best still in +progress. Only when another 2PC comes will the standby advance its +latestCompletedGxid and its distributed snapshot will include the previous 1PC. + +We don't emulate the full architecture of "running transaction" for dtx because +that is unnecessary, at least ATM. For example, we don't create a dtx-version +of XLOG_RUNNING_XACTS, because we already have that information as part of the +extended checkpoint (see TMGXACT_CHECKPOINT). We also don't need to emulate +other members in RunningTransactionsData, like subxid or xid-pruning related +variables because those do not apply to dtx. diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 2ac0af034ec..5fdbcf70953 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2475,11 +2475,10 @@ StartTransaction(void) /* * Transactions may be started while recovery is in progress, if - * hot standby is enabled. This mode is not supported in - * Cloudberry yet. + * hot standby is enabled. 
*/ AssertImply(DistributedTransactionContext != DTX_CONTEXT_LOCAL_ONLY, - !s->startedInRecovery); + EnableHotStandby || !s->startedInRecovery); /* * MPP Modification * diff --git a/src/backend/cdb/cdbdtxrecovery.c b/src/backend/cdb/cdbdtxrecovery.c index 186b01ff214..605ce323ddb 100644 --- a/src/backend/cdb/cdbdtxrecovery.c +++ b/src/backend/cdb/cdbdtxrecovery.c @@ -202,6 +202,11 @@ recoverInDoubtTransactions(void) for (i = 0; i < *shmNumCommittedGxacts; i++) { + /* + * No need to acquire CommittedGxidArrayLock since dtx recovery + * only happens on primary, but not hot standby where concurrent + * access to this array is possible from CreateDistributedSnapshot. + */ DistributedTransactionId gxid = shmCommittedGxidArray[i]; char gid[TMGIDSIZE]; @@ -486,7 +491,12 @@ void redoDistributedCommitRecord(DistributedTransactionId gxid) { int i; + bool is_hot_standby_qd = IS_HOT_STANDBY_QD(); + /* + * Only the startup process can be modifying shmNumCommittedGxacts + * and shmCommittedGxidArray. So should be OK reading the value w/o lock. + */ for (i = 0; i < *shmNumCommittedGxacts; i++) { if (gxid == shmCommittedGxidArray[i]) @@ -526,7 +536,18 @@ redoDistributedCommitRecord(DistributedTransactionId gxid) "around this issue and then report a bug"))); } + /* + * only on hot standby there might be backends that call CreateDistributedSnapshot() + * to access the committed gxid array concurrently. + */ + if (is_hot_standby_qd) + LWLockAcquire(CommittedGxidArrayLock, LW_EXCLUSIVE); + shmCommittedGxidArray[(*shmNumCommittedGxacts)++] = gxid; + + if (is_hot_standby_qd) + LWLockRelease(CommittedGxidArrayLock); + elog((Debug_print_full_dtm ? 
LOG : DEBUG5), "Crash recovery redo added committed distributed transaction gid = "UINT64_FORMAT, gxid); } @@ -539,7 +560,13 @@ void redoDistributedForgetCommitRecord(DistributedTransactionId gxid) { int i; - + bool is_hot_standby_qd = IS_HOT_STANDBY_QD(); + + SIMPLE_FAULT_INJECTOR("redoDistributedForgetCommitRecord"); + /* + * Only the startup process can be modifying shmNumCommittedGxacts + * and shmCommittedGxidArray. So should be OK reading the value w/o lock. + */ for (i = 0; i < *shmNumCommittedGxacts; i++) { if (gxid == shmCommittedGxidArray[i]) @@ -550,13 +577,27 @@ redoDistributedForgetCommitRecord(DistributedTransactionId gxid) gxid); /* - * there's no concurrent access to shmCommittedGxidArray during - * recovery + * only on hot standby there might be backends that call CreateDistributedSnapshot() + * to access the committed gxid array concurrently. */ + if (is_hot_standby_qd) + LWLockAcquire(CommittedGxidArrayLock, LW_EXCLUSIVE); + (*shmNumCommittedGxacts)--; if (i != *shmNumCommittedGxacts) shmCommittedGxidArray[i] = shmCommittedGxidArray[*shmNumCommittedGxacts]; + if (is_hot_standby_qd) + LWLockRelease(CommittedGxidArrayLock); + + /* on the hot standby, we rely on the forget record to advance latestCompletedGxid */ + if (is_hot_standby_qd) + { + LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE); + if (gxid > ShmemVariableCache->latestCompletedGxid) + ShmemVariableCache->latestCompletedGxid = gxid; + LWLockRelease(ProcArrayLock); + } return; } } diff --git a/src/backend/cdb/cdbtm.c b/src/backend/cdb/cdbtm.c index fbbecaa117c..e25cca64873 100644 --- a/src/backend/cdb/cdbtm.c +++ b/src/backend/cdb/cdbtm.c @@ -1644,7 +1644,7 @@ isDtxQueryDispatcher(void) isSharedLocalSnapshotSlotPresent = (SharedLocalSnapshotSlot != NULL); return (Gp_role == GP_ROLE_DISPATCH && - isDtmStarted && + (isDtmStarted || EnableHotStandby) && isSharedLocalSnapshotSlotPresent); } @@ -2047,6 +2047,8 @@ sendDtxExplicitBegin(void) static void performDtxProtocolPrepare(const char *gid) { + 
SIMPLE_FAULT_INJECTOR("qe_start_prepared"); + StartTransactionCommand(); elog(DTM_DEBUG5, "performDtxProtocolCommand going to call PrepareTransactionBlock for distributed transaction (id = '%s')", gid); @@ -2126,6 +2128,7 @@ performDtxProtocolCommitOnePhase(const char *gid) static void performDtxProtocolCommitPrepared(const char *gid, bool raiseErrorIfNotFound) { + SIMPLE_FAULT_INJECTOR("qe_start_commit_prepared"); Assert(Gp_role == GP_ROLE_EXECUTE); elog(DTM_DEBUG5, diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c index 3154caba1bd..57c03cce7d9 100644 --- a/src/backend/storage/ipc/procarray.c +++ b/src/backend/storage/ipc/procarray.c @@ -2530,8 +2530,10 @@ getDtxCheckPointInfo(char **result, int *result_size) gxid_array = &gxact_checkpoint->committedGxidArray[0]; actual = 0; + LWLockAcquire(CommittedGxidArrayLock, LW_SHARED); for (; actual < *shmNumCommittedGxacts; actual++) gxid_array[actual] = shmCommittedGxidArray[actual]; + LWLockRelease(CommittedGxidArrayLock); SIMPLE_FAULT_INJECTOR("checkpoint_dtx_info"); @@ -2609,7 +2611,8 @@ CreateDistributedSnapshot(DistributedSnapshot *ds) ProcArrayStruct *arrayP = procArray; Assert(LWLockHeldByMe(ProcArrayLock)); - if (*shmNumCommittedGxacts != 0) + /* Hot standby accepts query while constantly replaying dtx, so this ERROR doesn't apply. */ + if (!IS_HOT_STANDBY_QD() && *shmNumCommittedGxacts != 0) elog(ERROR, "Create distributed snapshot before DTM recovery finish"); xmin = xmax = ShmemVariableCache->latestCompletedGxid + 1; @@ -2623,9 +2626,45 @@ CreateDistributedSnapshot(DistributedSnapshot *ds) Assert(ds->inProgressXidArray != NULL); + /* + * For a hot standby QD, check shmCommittedGxidArray to build the knowledge. + * Need to acquire shared lock to access the committed gxid array as the + * startup process might modify it. 
+ */ + if (IS_HOT_STANDBY_QD()) + { + LWLockAcquire(CommittedGxidArrayLock, LW_SHARED); + for (i = 0; i < *shmNumCommittedGxacts; i++) + { + DistributedTransactionId gxid; + + gxid = shmCommittedGxidArray[i]; + + if (gxid == InvalidDistributedTransactionId || gxid >= xmax) + continue; + + if (gxid < xmin) + xmin = gxid; + + ds->inProgressXidArray[count++] = gxid; + } + LWLockRelease(CommittedGxidArrayLock); + } + /* * Gather up current in-progress global transactions for the distributed * snapshot. + * + * Note: The inProgressXidArray built below may contain transactions that + * have been prepared on some/all segments, and for which the QD hasn't + * begun the COMMIT phase (by writing a XLOG_XACT_DISTRIBUTED_COMMIT record). + * The gxids of these transactions don't necessarily have to be placed into + * inProgressXidArray, for correctness. This is because for visibility + * checks on the QEs, a state of DISTRIBUTEDSNAPSHOT_COMMITTED_UNKNOWN will + * be encountered for such txs, prompting a local check. The local check will + * always find these txs in progress (due to the dummy PGXACTs being + * recorded for prepared txs). So, hypothetically we could exclude these txs + * here, but we don't currently track them on the QD, so we can't. 
*/ for (i = 0; i < arrayP->numProcs; i++) { diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt index c8f283198ce..c3583b146d7 100644 --- a/src/backend/storage/lmgr/lwlocknames.txt +++ b/src/backend/storage/lmgr/lwlocknames.txt @@ -75,3 +75,4 @@ LoginFailedControlLock 65 LoginFailedSharedMemoryLock 66 GPIVMResLock 67 DirectoryTableLock 68 +CommittedGxidArrayLock 69 diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 37dfe9978c0..7dcb1f55917 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -1529,6 +1529,8 @@ exec_mpp_dtx_protocol_command(DtxProtocolCommand dtxProtocolCommand, qc.commandTag = GetCommandTagEnum(loggingStr); qc.nprocessed = 1; + SIMPLE_FAULT_INJECTOR("exec_dtx_protocol_start"); + if (log_statement == LOGSTMT_ALL) elog(LOG,"DTM protocol command '%s' for gid = %s", loggingStr, gid); diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c index 728d12c604a..532690f1d51 100644 --- a/src/backend/tcop/pquery.c +++ b/src/backend/tcop/pquery.c @@ -617,6 +617,8 @@ PortalStart(Portal portal, ParamListInfo params, needDistributedSnapshot = false; } + SIMPLE_FAULT_INJECTOR("select_before_qd_create_snapshot"); + /* Must set snapshot before starting executor. */ if (snapshot) PushActiveSnapshot(snapshot); @@ -626,6 +628,8 @@ PortalStart(Portal portal, ParamListInfo params, /* reset value */ needDistributedSnapshot = true; + SIMPLE_FAULT_INJECTOR("select_after_qd_create_snapshot"); + /* * We could remember the snapshot in portal->portalSnapshot, * but presently there seems no need to, as this code path diff --git a/src/test/isolation2/expected/hot_standby/basic.out b/src/test/isolation2/expected/hot_standby/basic.out index 704814217f7..4a8396c3be7 100644 --- a/src/test/isolation2/expected/hot_standby/basic.out +++ b/src/test/isolation2/expected/hot_standby/basic.out @@ -1,11 +1,16 @@ -- Tests for basic query dispatch on a hot standy. 
--- must show on +-- hot standby must show on and the sync mode is remote_apply for the tests to make sense -1S: show hot_standby; hot_standby ------------- on (1 row) +-1S: show synchronous_commit; + synchronous_commit +-------------------- + remote_apply +(1 row) -- will be checking if QD/QE info looks good -1S: select id, type, content, port from gp_backend_info(); @@ -21,10 +26,10 @@ create table hs_t1(a int); CREATE TABLE create table hs_t2(a int); CREATE TABLE + +-- standby should see the results for 2pc immediately. insert into hs_t1 select * from generate_series(1,10); INSERT 0 10 - --- standby should see the result -1S: select * from hs_t1; a ---- @@ -39,6 +44,24 @@ INSERT 0 10 10 1 (10 rows) +-- standby won't see results for the last 1pc immediately because the standby QD +-- isn't aware of of it so its distributed snapshot doesn't include the 1pc, but +-- as long as another 2pc comes it will be able to see the previous 1pc. Wee +-- tolerate this case in the mirrored cluster setup. +insert into hs_t2 values(1); +INSERT 0 1 +-1S: select * from hs_t2; + a +--- +(0 rows) +-- any following 2pc will make the 1pc visible +create temp table tt(a int); +CREATE TABLE +-1S: select * from hs_t2; + a +--- + 1 +(1 row) -- we have three QEs launched on the mirror segments. 
-- note that the first QE on a segment is still a "writer" because we @@ -56,7 +79,8 @@ INSERT 0 10 -1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2); a | a ---+--- -(0 rows) + 1 | 1 +(1 row) -1S: select id, type, content, port from gp_backend_info(); id | type | content | port ----+------+---------+------ @@ -97,15 +121,15 @@ INSERT 0 10 -1S: select * from hs_t1; a ---- + 5 + 6 + 9 + 10 2 3 4 7 8 - 5 - 6 - 9 - 10 1 (10 rows) @@ -124,6 +148,10 @@ COMMIT 16 18 19 + 1 + 12 + 15 + 20 5 6 9 @@ -132,10 +160,6 @@ COMMIT 13 14 17 - 1 - 12 - 15 - 20 (20 rows) ---------------------------------------------------------------- @@ -169,7 +193,7 @@ COPY 20 -- Here are the things hot standby in PG can do but currently cannot in GPDB: -- transaction block BEGIN...END; -1S: begin; -ERROR: cannot make new WAL entries during recovery (xloginsert.c:135) +ERROR: cannot setup distributed transaction during recovery (cdbtm.c:392) -1S: end; COMMIT -- cursor operation due to not supporting BEGIN...END yet; diff --git a/src/test/isolation2/expected/hot_standby/faults.out b/src/test/isolation2/expected/hot_standby/faults.out index 5fdade0d37d..f9f58eab83f 100644 --- a/src/test/isolation2/expected/hot_standby/faults.out +++ b/src/test/isolation2/expected/hot_standby/faults.out @@ -218,7 +218,10 @@ select wait_until_all_segments_synchronized(); Success: (1 row) --- inject fault to cripple QE right after a QE finished the prepare phase of 2PC +1: create table tt_hs_dtx(a int); +CREATE TABLE + +-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process. 
select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; gp_inject_fault_infinite -------------------------- @@ -226,11 +229,7 @@ select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp (1 row) -- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE -1: begin; -BEGIN -1: create table tt_hs_dtx(a int); -CREATE TABLE -1&: end; +1&: insert into tt_hs_dtx select * from generate_series(1,10); -- inject a panic on primary QD, essentially restarts the primary QD 2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p'; @@ -266,18 +265,49 @@ server closed the connection unexpectedly 8 9 (10 rows) --- XXX: currently it sees the in-doubt DTX but it shouldnt' when we supported DTX isolation. +-- it cannot see rows from the in-doubt DTX -1S: select * from tt_hs_dtx; a --- (0 rows) --- resets the fault +-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) -1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p'; gp_inject_fault_infinite -------------------------- Success: (1 row) +-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- standby should see 
the rows from the in-doubt DTX now +-1S: select * from tt_hs_dtx; + a +---- + 1 + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 +(10 rows) -1S: select wait_until_all_segments_synchronized(); wait_until_all_segments_synchronized @@ -294,3 +324,4 @@ server closed the connection unexpectedly ----------------- Success: (1 row) + diff --git a/src/test/isolation2/expected/hot_standby/transaction_isolation.out b/src/test/isolation2/expected/hot_standby/transaction_isolation.out new file mode 100644 index 00000000000..ca69d615967 --- /dev/null +++ b/src/test/isolation2/expected/hot_standby/transaction_isolation.out @@ -0,0 +1,755 @@ +---------------------------------------------------------------- +-- Test transaction isolation in general, not specific to dtx +---------------------------------------------------------------- +1: create table hs_tx(a int); +CREATE TABLE +1: insert into hs_tx select * from generate_series(1,10); +INSERT 0 10 + +1: begin; +BEGIN +1: insert into hs_tx select * from generate_series(11,20); +INSERT 0 10 +2: begin; +BEGIN +2: insert into hs_tx select * from generate_series(21,30); +INSERT 0 10 +2: abort; +ROLLBACK + +-- standby should only see completed transactions, not in-progress transactions, nor aborted transactions +-1S: select * from hs_tx; + a +---- + 1 + 5 + 6 + 9 + 10 + 2 + 3 + 4 + 7 + 8 +(10 rows) + +1: end; +COMMIT +-1S: select * from hs_tx; + a +---- + 2 + 3 + 4 + 7 + 8 + 16 + 18 + 19 + 1 + 12 + 15 + 20 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(20 rows) + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx +---------------------------------------------------------------- + +1: create table hs_dtx1(a int); +CREATE TABLE +1: create table hs_dtx2(a int); +CREATE TABLE + +-- inject two suspend faults: +-- 1. 
on seg0, suspend before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: insert into hs_dtx1 select * from generate_series(1,10); +-- 2. on seg1, suspend before COMMIT phase of 2PC +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2&: insert into hs_dtx2 select * from generate_series(1,10); + +-- standby should not see any rows from either dtx +-1S: select * from hs_dtx1; + a +--- +(0 rows) +-1S: select * from hs_dtx2; + a +--- +(0 rows) + +-- reset +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 0 10 +2<: <... completed> +INSERT 0 10 + +-- standby should see the results from the dtx now +-1S: select * from hs_dtx1; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: select * from hs_dtx2; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +---------------------------------------------------------------- +-- Test DTX abort that happens in different phases +---------------------------------------------------------------- + +1: create table hs_abort_dtx1(a int); +CREATE TABLE +1: create table hs_abort_dtx2(a int); +CREATE TABLE + +-- inject two errors: +-- 1. 
on seg0, error out before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'error', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1: insert into hs_abort_dtx1 select * from generate_series(1,10); +ERROR: fault triggered, fault name:'qe_start_prepared' fault type:'error' (seg0 127.0.1.1:7002 pid=343) +1: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +-- 2. on seg1, error out before COMMIT phase of 2PC +1: select gp_inject_fault('qe_start_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1: insert into hs_abort_dtx2 select * from generate_series(1,10); +INSERT 0 10 +1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- standby should not see dtx1 which is aborted but should see dtx2 which is recovered +-1S: select * from hs_abort_dtx1; + a +--- +(0 rows) +-1S: select * from hs_abort_dtx2; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx, +-- but also run more queries in between +---------------------------------------------------------------- +1: create table hs_dtx3(a int); +CREATE TABLE + +-- inject faults to suspend segments in 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: insert into hs_dtx3 select * from generate_series(1,10); +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from 
gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2&: insert into hs_dtx3 select * from generate_series(11,20); + +-- standby should not see rows in the in-progress dtx +-1S: select * from hs_dtx3; + a +--- +(0 rows) + +-- now run some dtx and completed +3: insert into hs_dtx3 values(99); +INSERT 0 1 +3: create table hs_dtx4(a int); +CREATE TABLE +3: insert into hs_dtx4 select * from generate_series(1,10); +INSERT 0 10 + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_dtx3; + a +---- + 99 +(1 row) +-1S: select * from hs_dtx4; + a +---- + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 + 1 +(10 rows) + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... completed> +INSERT 0 10 +2<: <... completed> +INSERT 0 10 + +-- standby should see all rows now +-1S: select * from hs_dtx3; + a +---- + 1 + 12 + 15 + 20 + 2 + 3 + 4 + 7 + 8 + 16 + 18 + 19 + 99 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(21 rows) + +---------------------------------------------------------------- +-- Test isolation between standby QD and in-progress dtx, +-- but after standby QD resets and gets running DTX from checkpoint. +---------------------------------------------------------------- +1: create table hs_t5(a int, b text); +CREATE TABLE +1: create table hs_t6(a int, b text); +CREATE TABLE + +-- inject fault to suspend a primary right before it conducts the commit phase of 2PC, +-- so in the subsequent INSERT, all local transactions will be committed but the dtx is not. 
+1: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: insert into hs_t5 select i, 'in-progress' from generate_series(1,10) i; + +-- now run some dtx and completed, and primary conducts a checkpoint +2: insert into hs_t5 values(1, 'commited'); +INSERT 0 1 +2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i; +INSERT 0 10 +2: begin; +BEGIN +2: insert into hs_t5 values(99, 'aborted'); +INSERT 0 1 +2: abort; +ROLLBACK +2: checkpoint; +CHECKPOINT + +-- now make the standby QD resets itself +-1S: select gp_inject_fault('exec_simple_query_start', 'panic', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S: select 1; +PANIC: fault triggered, fault name:'exec_simple_query_start' fault type:'panic' +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_t5; + a | b +---+---------- + 1 | commited +(1 row) +-1S: select * from hs_t6; + a | b +----+----------- + 1 | committed + 2 | committed + 3 | committed + 4 | committed + 7 | committed + 8 | committed + 5 | committed + 6 | committed + 9 | committed + 10 | committed +(10 rows) + +2: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1<: <... 
completed> +INSERT 0 10 + +-- standby should see all rows now +-1S: select * from hs_t5; + a | b +----+------------- + 1 | in-progress + 1 | commited + 5 | in-progress + 6 | in-progress + 9 | in-progress + 10 | in-progress + 2 | in-progress + 3 | in-progress + 4 | in-progress + 7 | in-progress + 8 | in-progress +(11 rows) +-1S: select * from hs_t6; + a | b +----+----------- + 5 | committed + 6 | committed + 9 | committed + 10 | committed + 1 | committed + 2 | committed + 3 | committed + 4 | committed + 7 | committed + 8 | committed +(10 rows) + +-- standby should correctly see more in-progress dtx on the primary. +-- context: previously this would be fail because the standby updates latestCompletedGxid to the +-- bumped nextGxid from checkpoint, which is too far (so that it thinks the new dtx already completed). +1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +1&: delete from hs_t5; +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +2&: delete from hs_t6; + +-- standby should not see the effect of the deletes +-1S: select * from hs_t5; + a | b +----+------------- + 2 | in-progress + 3 | in-progress + 4 | in-progress + 7 | in-progress + 8 | in-progress + 1 | in-progress + 1 | commited + 5 | in-progress + 6 | in-progress + 9 | in-progress + 10 | in-progress +(11 rows) +-1S: select * from hs_t6; + a | b +----+----------- + 1 | committed + 2 | committed + 3 | committed + 4 | committed + 7 | committed + 8 | committed + 5 | committed + 6 | committed + 9 | committed + 10 | committed +(10 rows) + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) +3: select 
gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + gp_inject_fault +----------------- + Success: +(1 row) + +1<: <... completed> +DELETE 11 +2<: <... completed> +DELETE 10 + +-- standby now see those deletes +-1S: select * from hs_t5; + a | b +---+--- +(0 rows) +-1S: select * from hs_t6; + a | b +---+--- +(0 rows) + +---------------------------------------------------------------- +-- Read-committed isolation: query on hot standby should not see dtx that completed after it +-- created distributed snapshot, but should see dtx that completed before that. +---------------------------------------------------------------- + +1: create table hs_rc(a int); +CREATE TABLE +1: insert into hs_rc select * from generate_series(1,10); +INSERT 0 10 + +-- case 1: suspend SELECT on the standby QD right after it created snapshot +-1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S&: select * from hs_rc; + +-- new INSERT or DELETE won't be observed by the standby +1: insert into hs_rc select * from generate_series(11,20); +INSERT 0 10 +1: delete from hs_rc where a < 5; +DELETE 4 +1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- should only see the rows at the time when SELECT started (1...10). +-1S<: <... 
completed> + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) + +-- SELECT again, should see the effect from the INSERT and DELETE now +-1S: select * from hs_rc; + a +---- + 12 + 15 + 20 + 7 + 8 + 16 + 18 + 19 + 5 + 6 + 9 + 10 + 11 + 13 + 14 + 17 +(16 rows) + +-- case 2: suspend SELECT on the standby QD before creating snapshot +-1S: select gp_inject_fault('select_before_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S&: select * from hs_rc; + +1: insert into hs_rc select * from generate_series(21,30); +INSERT 0 10 +1: delete from hs_rc where a < 21; +DELETE 16 +1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- standby should see the effect of the INSERT and DELETE +-1S<: <... completed> + a +---- + 23 + 26 + 30 + 22 + 24 + 27 + 29 + 21 + 25 + 28 +(10 rows) + +---------------------------------------------------------------- +-- Read-committed isolation in the BEGIN...END block +---------------------------------------------------------------- + +1: truncate hs_rc; +TRUNCATE TABLE +1: insert into hs_rc select * from generate_series(1,30); +INSERT 0 30 + +-1S: begin; +BEGIN +-1S: select count(*) from hs_rc; + count +------- + 30 +(1 row) + +-- have some concurrent sessions on primary QD: +-- 1. a completed transaction +1: delete from hs_rc where a <= 10; +DELETE 10 +-- 3. an aborted transaction +2: begin; +BEGIN +2: delete from hs_rc where a > 10 and a <= 20; +DELETE 10 +2: abort; +ROLLBACK +-- 3. 
an ongoing transaction +3: begin; +BEGIN +3: delete from hs_rc where a > 20 and a <= 30; +DELETE 10 + +-- the standby should see results accordingly +-1S: select * from hs_rc; + a +---- + 12 + 15 + 20 + 23 + 26 + 30 + 11 + 13 + 14 + 17 + 21 + 25 + 28 + 16 + 18 + 19 + 22 + 24 + 27 + 29 +(20 rows) +-1S: end; +COMMIT + +3: end; +COMMIT +-1S: select * from hs_rc; + a +---- + 12 + 15 + 20 + 11 + 13 + 14 + 17 + 16 + 18 + 19 +(10 rows) + +---------------------------------------------------------------- +-- Various isolation tests that involve AO/CO table. +---------------------------------------------------------------- +1: create table hs_ao(a int, id int unique) using ao_row; +CREATE TABLE +1: insert into hs_ao select 1,i from generate_series(1,10) i; +INSERT 0 10 +1: begin; +BEGIN +1: insert into hs_ao select 2,i from generate_series(11,20) i; +INSERT 0 10 + +-- standby sees the same AO metadata as primary +2: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 0 | 1 | 128 | 5 | 1 | 128 | 1 | 3 | 1 + 1 | 1 | 40 | 1 | 1 | 40 | 1 | 3 | 1 + 2 | 1 | 104 | 4 | 1 | 104 | 1 | 3 | 1 +(3 rows) +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 0 | 1 | 128 | 5 | 1 | 128 | 1 | 3 | 1 + 1 | 1 | 40 | 1 | 1 | 40 | 1 | 3 | 1 + 2 | 1 | 104 | 4 | 1 | 104 | 1 | 3 | 1 +(3 rows) +2: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 4 + 
(0,2) | 1 | 0 | 0 | 1 | 0 | 1 + (0,2) | 1 | 0 | 0 | 1 | 0 | 5 +(3 rows) +-1S: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); + tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count +---------+-------+----------------+----------+--------------+-------------+----------- + (0,2) | 1 | 0 | 0 | 1 | 0 | 5 + (0,2) | 1 | 0 | 0 | 1 | 0 | 1 + (0,2) | 1 | 0 | 0 | 1 | 0 | 4 +(3 rows) + +-- standby sees correct table data +-1S: select * from hs_ao; + a | id +---+---- + 1 | 2 + 1 | 3 + 1 | 4 + 1 | 7 + 1 | 8 + 1 | 1 + 1 | 5 + 1 | 6 + 1 | 9 + 1 | 10 +(10 rows) + +-- standby sees the effect of vacuum +1: end; +COMMIT +1: delete from hs_ao where a = 1; +DELETE 10 +1: vacuum hs_ao; +VACUUM +1: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 2 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 2 | 2 | 104 | 4 | 1 | 104 | 0 | 3 | 1 + 0 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 0 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 + 1 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 1 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 +(6 rows) +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); + segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state +------------+-------+-----+----------+---------------+------------------+----------+---------------+------- + 2 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 2 | 2 | 104 | 4 | 1 | 104 | 0 | 3 | 1 + 0 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 0 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 + 1 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1 + 1 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1 +(6 rows) +-1S: select * from hs_ao; + a | id +---+---- + 2 | 11 + 2 | 13 + 2 | 14 + 2 | 17 + 2 | 12 + 2 | 15 + 2 | 20 + 2 | 16 + 2 | 18 + 2 | 19 +(10 rows) diff --git a/src/test/isolation2/hot_standby_schedule b/src/test/isolation2/hot_standby_schedule index 
884610d9870..3a120bf8582 100644 --- a/src/test/isolation2/hot_standby_schedule +++ b/src/test/isolation2/hot_standby_schedule @@ -1,4 +1,5 @@ test: hot_standby/setup test: hot_standby/basic +test: hot_standby/transaction_isolation test: hot_standby/faults test: hot_standby/teardown diff --git a/src/test/isolation2/sql/hot_standby/basic.sql b/src/test/isolation2/sql/hot_standby/basic.sql index 64a0bb5f0c6..a315366ac3f 100644 --- a/src/test/isolation2/sql/hot_standby/basic.sql +++ b/src/test/isolation2/sql/hot_standby/basic.sql @@ -1,7 +1,8 @@ -- Tests for basic query dispatch on a hot standy. --- must show on +-- hot standby must show on and the sync mode is remote_apply for the tests to make sense -1S: show hot_standby; +-1S: show synchronous_commit; -- will be checking if QD/QE info looks good -1S: select id, type, content, port from gp_backend_info(); @@ -11,10 +12,19 @@ ---------------------------------------------------------------- create table hs_t1(a int); create table hs_t2(a int); -insert into hs_t1 select * from generate_series(1,10); --- standby should see the result +-- standby should see the results for 2pc immediately. +insert into hs_t1 select * from generate_series(1,10); -1S: select * from hs_t1; +-- standby won't see results for the last 1pc immediately because the standby QD +-- isn't aware of of it so its distributed snapshot doesn't include the 1pc, but +-- as long as another 2pc comes it will be able to see the previous 1pc. Wee +-- tolerate this case in the mirrored cluster setup. +insert into hs_t2 values(1); +-1S: select * from hs_t2; +-- any following 2pc will make the 1pc visible +create temp table tt(a int); +-1S: select * from hs_t2; -- we have three QEs launched on the mirror segments. 
-- note that the first QE on a segment is still a "writer" because we diff --git a/src/test/isolation2/sql/hot_standby/faults.sql b/src/test/isolation2/sql/hot_standby/faults.sql index ed82753615e..6e25bcba272 100644 --- a/src/test/isolation2/sql/hot_standby/faults.sql +++ b/src/test/isolation2/sql/hot_standby/faults.sql @@ -89,13 +89,13 @@ select wait_until_all_segments_synchronized(); -- skip FTS probe to prevent unexpected mirror promotion 1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1; --- inject fault to cripple QE right after a QE finished the prepare phase of 2PC +1: create table tt_hs_dtx(a int); + +-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process. select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; -- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE -1: begin; -1: create table tt_hs_dtx(a int); -1&: end; +1&: insert into tt_hs_dtx select * from generate_series(1,10); -- inject a panic on primary QD, essentially restarts the primary QD 2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p'; @@ -107,12 +107,19 @@ select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp -- standby QD can still run query -1S: select * from hs_failover; --- XXX: currently it sees the in-doubt DTX but it shouldnt' when we supported DTX isolation. 
+-- it cannot see rows from the in-doubt DTX -1S: select * from tt_hs_dtx; --- resets the fault +-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m'; -1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p'; +-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + +-- standby should see the rows from the in-doubt DTX now +-1S: select * from tt_hs_dtx; -1S: select wait_until_all_segments_synchronized(); 1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p'; 1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1; + diff --git a/src/test/isolation2/sql/hot_standby/transaction_isolation.sql b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql new file mode 100644 index 00000000000..6f0508d82b4 --- /dev/null +++ b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql @@ -0,0 +1,244 @@ +---------------------------------------------------------------- +-- Test transaction isolation in general, not specific to dtx +---------------------------------------------------------------- +1: create table hs_tx(a int); +1: insert into hs_tx select * from generate_series(1,10); + +1: begin; +1: insert into hs_tx select * from generate_series(11,20); +2: begin; +2: insert into hs_tx select * from generate_series(21,30); +2: abort; + +-- standby should only see completed transactions, not in-progress transactions, 
nor aborted transactions +-1S: select * from hs_tx; + +1: end; +-1S: select * from hs_tx; + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx +---------------------------------------------------------------- + +1: create table hs_dtx1(a int); +1: create table hs_dtx2(a int); + +-- inject two suspend faults: +-- 1. on seg0, suspend before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend',dbid) from gp_segment_configuration where content=0 and role='p'; +1&: insert into hs_dtx1 select * from generate_series(1,10); +-- 2. on seg1, suspend before COMMIT phase of 2PC +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend',dbid) from gp_segment_configuration where content=1 and role='p'; +2&: insert into hs_dtx2 select * from generate_series(1,10); + +-- standby should not see any rows from either dtx +-1S: select * from hs_dtx1; +-1S: select * from hs_dtx2; + +-- reset +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; +1<: +2<: + +-- standby should see the results from the dtx now +-1S: select * from hs_dtx1; +-1S: select * from hs_dtx2; + +---------------------------------------------------------------- +-- Test DTX abort that happens in different phases +---------------------------------------------------------------- + +1: create table hs_abort_dtx1(a int); +1: create table hs_abort_dtx2(a int); + +-- inject two errors: +-- 1. 
on seg0, error out before PREPARE phase of 2PC +1: select gp_inject_fault('qe_start_prepared', 'error', dbid) from gp_segment_configuration where content=0 and role='p'; +1: insert into hs_abort_dtx1 select * from generate_series(1,10); +1: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +-- 2. on seg1, error out before COMMIT phase of 2PC +1: select gp_inject_fault('qe_start_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; +1: insert into hs_abort_dtx2 select * from generate_series(1,10); +1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + +-- standby should not see dtx1 which is aborted but should see dtx2 which is recovered +-1S: select * from hs_abort_dtx1; +-1S: select * from hs_abort_dtx2; + +---------------------------------------------------------------- +-- Test isolation between hot standby query and in-progress dtx, +-- but also run more queries in between +---------------------------------------------------------------- +1: create table hs_dtx3(a int); + +-- inject faults to suspend segments in 2PC +1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; +1&: insert into hs_dtx3 select * from generate_series(1,10); +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p'; +2&: insert into hs_dtx3 select * from generate_series(11,20); + +-- standby should not see rows in the in-progress dtx +-1S: select * from hs_dtx3; + +-- now run some dtx and completed +3: insert into hs_dtx3 values(99); +3: create table hs_dtx4(a int); +3: insert into hs_dtx4 select * from generate_series(1,10); + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_dtx3; +-1S: select * 
from hs_dtx4; + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; +1<: +2<: + +-- standby should see all rows now +-1S: select * from hs_dtx3; + +---------------------------------------------------------------- +-- Test isolation between standby QD and in-progress dtx, +-- but after standby QD resets and gets running DTX from checkpoint. +---------------------------------------------------------------- +1: create table hs_t5(a int, b text); +1: create table hs_t6(a int, b text); + +-- inject fault to suspend a primary right before it conducts the commit phase of 2PC, +-- so in the subsequent INSERT, all local transactions will be committed but the dtx is not. +1: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; +1&: insert into hs_t5 select i, 'in-progress' from generate_series(1,10) i; + +-- now run some dtx and completed, and primary conducts a checkpoint +2: insert into hs_t5 values(1, 'commited'); +2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i; +2: begin; +2: insert into hs_t5 values(99, 'aborted'); +2: abort; +2: checkpoint; + +-- now make the standby QD resets itself +-1S: select gp_inject_fault('exec_simple_query_start', 'panic', dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S: select 1; +-1Sq: + +-- standby should still not see rows in the in-progress DTX, but should see the completed ones +-1S: select * from hs_t5; +-1S: select * from hs_t6; + +2: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +1<: + +-- standby should see all rows now +-1S: select * from hs_t5; +-1S: select * from hs_t6; + +-- standby should correctly see more in-progress dtx on the 
primary. +-- context: previously this would be fail because the standby updates latestCompletedGxid to the +-- bumped nextGxid from checkpoint, which is too far (so that it thinks the new dtx already completed). +1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; +1&: delete from hs_t5; +2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p'; +2&: delete from hs_t6; + +-- standby should not see the effect of the deletes +-1S: select * from hs_t5; +-1S: select * from hs_t6; + +3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p'; +3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; + +1<: +2<: + +-- standby now see those deletes +-1S: select * from hs_t5; +-1S: select * from hs_t6; + +---------------------------------------------------------------- +-- Read-committed isolation: query on hot standby should not see dtx that completed after it +-- created distributed snapshot, but should see dtx that completed before that. +---------------------------------------------------------------- + +1: create table hs_rc(a int); +1: insert into hs_rc select * from generate_series(1,10); + +-- case 1: suspend SELECT on the standby QD right after it created snapshot +-1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S&: select * from hs_rc; + +-- new INSERT or DELETE won't be observed by the standby +1: insert into hs_rc select * from generate_series(11,20); +1: delete from hs_rc where a < 5; +1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + +-- should only see the rows at the time when SELECT started (1...10). 
+-1S<: + +-- SELECT again, should see the effect from the INSERT and DELETE now +-1S: select * from hs_rc; + +-- case 2: suspend SELECT on the standby QD before creating snapshot +-1S: select gp_inject_fault('select_before_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; +-1S&: select * from hs_rc; + +1: insert into hs_rc select * from generate_series(21,30); +1: delete from hs_rc where a < 21; +1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; + +-- standby should see the effect of the INSERT and DELETE +-1S<: + +---------------------------------------------------------------- +-- Read-committed isolation in the BEGIN...END block +---------------------------------------------------------------- + +1: truncate hs_rc; +1: insert into hs_rc select * from generate_series(1,30); + +-1S: begin; +-1S: select count(*) from hs_rc; + +-- have some concurrent sessions on primary QD: +-- 1. a completed transaction +1: delete from hs_rc where a <= 10; +-- 3. an aborted transaction +2: begin; +2: delete from hs_rc where a > 10 and a <= 20; +2: abort; +-- 3. an ongoing transaction +3: begin; +3: delete from hs_rc where a > 20 and a <= 30; + +-- the standby should see results accordingly +-1S: select * from hs_rc; +-1S: end; + +3: end; +-1S: select * from hs_rc; + +---------------------------------------------------------------- +-- Various isolation tests that involve AO/CO table. 
+---------------------------------------------------------------- +1: create table hs_ao(a int, id int unique) using ao_row; +1: insert into hs_ao select 1,i from generate_series(1,10) i; +1: begin; +1: insert into hs_ao select 2,i from generate_series(11,20) i; + +-- standby sees the same AO metadata as primary +2: select * from gp_toolkit.__gp_aoseg('hs_ao'); +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); +2: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); +-1S: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id'); + +-- standby sees correct table data +-1S: select * from hs_ao; + +-- standby sees the effect of vacuum +1: end; +1: delete from hs_ao where a = 1; +1: vacuum hs_ao; +1: select * from gp_toolkit.__gp_aoseg('hs_ao'); +-1S: select * from gp_toolkit.__gp_aoseg('hs_ao'); +-1S: select * from hs_ao; From a8c3bc28fb392a447bf6d8218363649a4f52f18f Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Wed, 14 Feb 2024 10:58:21 -0800 Subject: [PATCH 06/11] Support repeatable-read dtx isolation for hot standby In previous commits we've supported hot standby dispatch and read-committed isolation. In order to support repeatable-read isolation, the only real complication is just to support the BEGIN...END block. The snapshot selection and usage for repeatable-read on a hot standby is exactly the same as a primary. And, the main difference between a single-statement transaction and a BEGIN...END block is just the DTX context of the QEs: in the former case the QEs are DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT, but in the latter case they are DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER (see setupQEDtxContext()). We had Assert/ERROR in the code to assume that for EXPLICIT_WRITER, there's always a valid distributed xid for that transaction. However, that is not the case for hot standby: a standby never allocates an xid and there's definitely no use of an xid in its BEGIN...END block. 
Therefore, all we need to do is just make sure to not apply this assumption to hot standby. After that, supporting repeatable-read is a no-op. Another small change is to rename IS_STANDBY_QE to IS_HOT_STANDBY_QE to better correspond to IS_HOT_STANDBY_QD. --- src/backend/access/transam/xact.c | 55 +++-- src/backend/cdb/cdbdtxcontextinfo.c | 2 +- src/backend/cdb/cdbtm.c | 15 ++ src/backend/tcop/postgres.c | 4 +- src/include/cdb/cdbtm.h | 9 +- src/include/cdb/cdbvars.h | 2 +- .../isolation2/expected/hot_standby/basic.out | 19 +- .../hot_standby/transaction_isolation.out | 229 ++++++++++++++++++ src/test/isolation2/sql/hot_standby/basic.sql | 17 +- .../sql/hot_standby/transaction_isolation.sql | 75 ++++++ 10 files changed, 374 insertions(+), 53 deletions(-) diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c index 5fdbcf70953..ed655baf989 100644 --- a/src/backend/access/transam/xact.c +++ b/src/backend/access/transam/xact.c @@ -2525,20 +2525,39 @@ StartTransaction(void) case DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER: case DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER: + /* + * Sanity check for the global xid. + * + * Note for hot standby dispatch: the standby QEs are still + * writers, just like primary QEs for SELECT queries. But + * hot standby dispatch never has a valid gxid, so we skip + * the gxid checks for the standby QEs. + */ + if (!IS_HOT_STANDBY_QE()) + { + if (QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId) + elog(ERROR, + "distributed transaction id is invalid in context %s", + DtxContextToString(DistributedTransactionContext)); + + /* + * Update distributed XID info, this is only used for + * debugging. 
+ */ + LocalDistribXactData *ele = &MyProc->localDistribXactData; + ele->distribXid = QEDtxContextInfo.distributedXid; + ele->state = LOCALDISTRIBXACT_STATE_ACTIVE; + } + else + Assert(QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId); + + /* fall through */ case DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT: { /* If we're running in test-mode insert a delay in writer. */ if (gp_enable_slow_writer_testmode) pg_usleep(500000); - if (DistributedTransactionContext != DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT && - QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId) - { - elog(ERROR, - "distributed transaction id is invalid in context %s", - DtxContextToString(DistributedTransactionContext)); - } - /* * Snapshot must not be created before setting transaction * isolation level. @@ -2551,28 +2570,14 @@ StartTransaction(void) XactReadOnly = isMppTxOptions_ReadOnly( QEDtxContextInfo.distributedTxnOptions); + /* a hot standby transaction must be read-only */ + AssertImply(IS_HOT_STANDBY_QE(), XactReadOnly); + /* * MPP: we're a QE Writer. */ MyTmGxact->gxid = QEDtxContextInfo.distributedXid; - if (DistributedTransactionContext == - DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER || - DistributedTransactionContext == - DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER) - { - Assert(QEDtxContextInfo.distributedXid != - InvalidDistributedTransactionId); - - /* - * Update distributed XID info, this is only used for - * debugging. 
- */ - LocalDistribXactData *ele = &MyProc->localDistribXactData; - ele->distribXid = QEDtxContextInfo.distributedXid; - ele->state = LOCALDISTRIBXACT_STATE_ACTIVE; - } - if (SharedLocalSnapshotSlot != NULL) { LWLockAcquire(SharedLocalSnapshotSlot->slotLock, LW_EXCLUSIVE); diff --git a/src/backend/cdb/cdbdtxcontextinfo.c b/src/backend/cdb/cdbdtxcontextinfo.c index 1a3c1b8f295..2994821f8df 100644 --- a/src/backend/cdb/cdbdtxcontextinfo.c +++ b/src/backend/cdb/cdbdtxcontextinfo.c @@ -60,7 +60,7 @@ DtxContextInfo_CreateOnMaster(DtxContextInfo *dtxContextInfo, bool inCursor, (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED), errmsg("cannot have more than 2^32-2 commands in a session"))); - AssertImply(inCursor, + AssertImply(inCursor && !IS_HOT_STANDBY_QD(), dtxContextInfo->distributedXid != InvalidDistributedTransactionId && gp_command_count == MySessionState->latestCursorCommandId); diff --git a/src/backend/cdb/cdbtm.c b/src/backend/cdb/cdbtm.c index e25cca64873..37550261149 100644 --- a/src/backend/cdb/cdbtm.c +++ b/src/backend/cdb/cdbtm.c @@ -264,6 +264,21 @@ currentDtxActivate(void) { bool signal_dtx_recovery; + /* + * A hot standby transaction does not have a valid gxid, so can skip + * most of the things in this function. We still explicitly set some + * fields that are irrelevant to hot standby for cleanness. 
+ */ + if (IS_HOT_STANDBY_QD()) + { + /* standby QD will stay in this state until the transaction completes */ + setCurrentDtxState(DTX_STATE_ACTIVE_DISTRIBUTED); + MyTmGxact->sessionId = gp_session_id; + MyTmGxact->gxid = InvalidDistributedTransactionId; + MyTmGxact->includeInCkpt = false; + return; + } + if (ShmemVariableCache->GxidCount <= GXID_PRETCH_THRESHOLD && (GetDtxRecoveryEvent() & DTX_RECOVERY_EVENT_BUMP_GXID) == 0) { diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c index 7dcb1f55917..62ded58aafb 100644 --- a/src/backend/tcop/postgres.c +++ b/src/backend/tcop/postgres.c @@ -5756,12 +5756,12 @@ PostgresMain(int argc, char *argv[], /* check if the message is from standby QD and is expected */ is_hs_dispatch = pq_getmsgint(&input_message, 4); - if (is_hs_dispatch == 0 && IS_STANDBY_QE()) + if (is_hs_dispatch == 0 && IS_HOT_STANDBY_QE()) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("mirror segments can only process MPP protocol messages from standby QD"), errhint("Exit the current session and re-connect."))); - else if (is_hs_dispatch != 0 && !IS_STANDBY_QE()) + else if (is_hs_dispatch != 0 && !IS_HOT_STANDBY_QE()) ereport(ERROR, (errcode(ERRCODE_PROTOCOL_VIOLATION), errmsg("primary segments can only process MPP protocol messages from primary QD"), diff --git a/src/include/cdb/cdbtm.h b/src/include/cdb/cdbtm.h index 951b9013c00..2bf259a8744 100644 --- a/src/include/cdb/cdbtm.h +++ b/src/include/cdb/cdbtm.h @@ -35,8 +35,12 @@ typedef enum DTX_STATE_NONE = 0, /** - * The distributed transaction is active and requires distributed coordination - * (because it is explicit or an implicit writer transaction) + * The distributed transaction is active. + * For a primary, this state means the transaction requires distributed + * coordination (because it is explicit or an implicit writer transaction), + * and it will switch to other dtx states in different phases.
+ * For a hot standby, there is no coordination necessary so transaction + * will stay in this state until the end of the commit. */ DTX_STATE_ACTIVE_DISTRIBUTED, @@ -232,6 +236,7 @@ typedef struct TMGXACTLOCAL { /* * Memory only fields. + * If we are in hot standby, only 'state' is relevant. */ DtxState state; diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index ab5963a0661..2393384ec3a 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -761,7 +761,7 @@ extern GpId GpIdentity; #define IS_HOT_STANDBY_QD() (EnableHotStandby && IS_QUERY_DISPATCHER() && RecoveryInProgress()) #define IS_QUERY_EXECUTOR_BACKEND() (Gp_role == GP_ROLE_EXECUTE && gp_session_id > 0) -#define IS_STANDBY_QE() (EnableHotStandby && IS_QUERY_EXECUTOR_BACKEND() && RecoveryInProgress()) +#define IS_HOT_STANDBY_QE() (EnableHotStandby && IS_QUERY_EXECUTOR_BACKEND() && RecoveryInProgress()) /* Stores the listener port that this process uses to listen for incoming * Interconnect connections from other Motion nodes. diff --git a/src/test/isolation2/expected/hot_standby/basic.out b/src/test/isolation2/expected/hot_standby/basic.out index 4a8396c3be7..99d9f407b01 100644 --- a/src/test/isolation2/expected/hot_standby/basic.out +++ b/src/test/isolation2/expected/hot_standby/basic.out @@ -163,7 +163,9 @@ COMMIT (20 rows) ---------------------------------------------------------------- --- Test: other things that a hot standby can do +-- Test: other things that a hot standby can do. +-- +-- More refer to regress test 'hs_standby_allowed'. 
---------------------------------------------------------------- -- set/reset and show GUC -1S: set optimizer = on; @@ -189,22 +191,15 @@ COPY 20 ------+---------+------+----------------+------+-------- 8 | -1 | m | m | s | u (1 row) - --- Here are the things hot standby in PG can do but currently cannot in GPDB: --- transaction block BEGIN...END; --1S: begin; -ERROR: cannot setup distributed transaction during recovery (cdbtm.c:392) --1S: end; -COMMIT --- cursor operation due to not supporting BEGIN...END yet; - -- checkpoint is allowed on standby but a restart point is created instead -1S: checkpoint; CHECKPOINT ---------------------------------------------------------------- --- Test: things that can't be done on a hot standby in both PG and GDPB: --- no DML, DDL or anything that generates WAL +-- Test: things that can't be done on a hot standby: +-- no DML, DDL or anything that generates WAL. +-- +-- More refer to regress test 'hs_standby_disallowed'. ---------------------------------------------------------------- -1S: insert into hs_t1 values(1); ERROR: cannot execute INSERT in a read-only transaction diff --git a/src/test/isolation2/expected/hot_standby/transaction_isolation.out b/src/test/isolation2/expected/hot_standby/transaction_isolation.out index ca69d615967..139cca8e88e 100644 --- a/src/test/isolation2/expected/hot_standby/transaction_isolation.out +++ b/src/test/isolation2/expected/hot_standby/transaction_isolation.out @@ -654,6 +654,235 @@ COMMIT 19 (10 rows) +---------------------------------------------------------------- +-- Repeatable-read isolation: distributed snapshot is created at time of the +-- first query in transaction block. All queries in the transaction block +-- should only see results committed before the distributed snapshot creation. 
+---------------------------------------------------------------- + +1: create table hs_rr(a int); +CREATE TABLE +1: insert into hs_rr select * from generate_series(1,10); +INSERT 0 10 + +-1S: begin isolation level repeatable read; +BEGIN +-- should see 10 +-1S: select count(*) from hs_rr; + count +------- + 10 +(1 row) + +-- do some more INSERT, DELETE and UPDATE +1: insert into hs_rr select * from generate_series(11,20); +INSERT 0 10 +1: delete from hs_rr where a <= 10; +DELETE 10 +1: update hs_rr set a = a + 100; +UPDATE 10 + +-- should still the initial rows {1...10} +-1S: select * from hs_rr; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: end; +COMMIT + +-- should see the results from the INSERT, DELETE and UPDATE +-1S: begin isolation level repeatable read; +BEGIN +-1S: select * from hs_rr; + a +----- + 115 + 120 + 118 + 113 + 114 + 112 + 116 + 119 + 111 + 117 +(10 rows) + +-- standby won't see ongoing or aborted transactions either +1: begin; +BEGIN +1: insert into hs_rr select * from generate_series(1,10); +INSERT 0 10 +2: begin; +BEGIN +2: insert into hs_rr select * from generate_series(1,10); +INSERT 0 10 +2: abort; +ROLLBACK + +-1S: select * from hs_rr; + a +----- + 114 + 115 + 120 + 118 + 113 + 112 + 116 + 119 + 111 + 117 +(10 rows) + +1: end; +COMMIT +-1S: end; +COMMIT + +---------------------------------------------------------------- +-- Transaction isolation is respected in subtransactions too +---------------------------------------------------------------- + +1: create table hs_subtrx(a int); +CREATE TABLE + +-- (1) read-committed +-1S: begin; +BEGIN +-1S: select count(*) from hs_subtrx; + count +------- + 0 +(1 row) +-1S: savepoint s1; +SAVEPOINT + +1: insert into hs_subtrx select * from generate_series(1,10); +INSERT 0 10 + +-1S: select count(*) from hs_subtrx; + count +------- + 10 +(1 row) +-1S: savepoint s2; +SAVEPOINT +-1S: select count(*) from hs_subtrx; + count +------- + 10 +(1 row) +-1S: rollback to savepoint s1; 
+ROLLBACK +-1S: select count(*) from hs_subtrx; + count +------- + 10 +(1 row) +-1S: end; +COMMIT + +-- (2) repeatable-read +-1S: begin isolation level repeatable read; +BEGIN +-1S: select * from hs_subtrx; + a +---- + 1 + 2 + 3 + 4 + 7 + 8 + 5 + 6 + 9 + 10 +(10 rows) +-1S: savepoint s1; +SAVEPOINT + +1: insert into hs_subtrx select * from generate_series(11,20); +INSERT 0 10 +1: delete from hs_subtrx where a <= 10; +DELETE 10 +1: update hs_subtrx set a = a + 100; +UPDATE 10 + +-1S: select * from hs_subtrx; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: savepoint s2; +SAVEPOINT +-1S: select * from hs_subtrx; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: rollback to savepoint s1; +ROLLBACK +-1S: select * from hs_subtrx; + a +---- + 2 + 3 + 4 + 7 + 8 + 1 + 5 + 6 + 9 + 10 +(10 rows) +-1S: end; +COMMIT +-1S: select * from hs_subtrx; + a +----- + 114 + 115 + 120 + 118 + 113 + 112 + 116 + 119 + 111 + 117 +(10 rows) + ---------------------------------------------------------------- -- Various isolation tests that involve AO/CO table. ---------------------------------------------------------------- diff --git a/src/test/isolation2/sql/hot_standby/basic.sql b/src/test/isolation2/sql/hot_standby/basic.sql index a315366ac3f..a900b38a29c 100644 --- a/src/test/isolation2/sql/hot_standby/basic.sql +++ b/src/test/isolation2/sql/hot_standby/basic.sql @@ -52,7 +52,9 @@ end; -1S: select * from hs_t1; ---------------------------------------------------------------- --- Test: other things that a hot standby can do +-- Test: other things that a hot standby can do. +-- +-- More refer to regress test 'hs_standby_allowed'. 
---------------------------------------------------------------- -- set/reset and show GUC -1S: set optimizer = on; @@ -63,19 +65,14 @@ end; -- query catalogs -1S: select count(*) from pg_class where relname = 'hs_t1'; -1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer; - --- Here are the things hot standby in PG can do but currently cannot in GPDB: --- transaction block BEGIN...END; --1S: begin; --1S: end; --- cursor operation due to not supporting BEGIN...END yet; - -- checkpoint is allowed on standby but a restart point is created instead -1S: checkpoint; ---------------------------------------------------------------- --- Test: things that can't be done on a hot standby in both PG and GDPB: --- no DML, DDL or anything that generates WAL +-- Test: things that can't be done on a hot standby: +-- no DML, DDL or anything that generates WAL. +-- +-- More refer to regress test 'hs_standby_disallowed'. ---------------------------------------------------------------- -1S: insert into hs_t1 values(1); -1S: delete from hs_t1; diff --git a/src/test/isolation2/sql/hot_standby/transaction_isolation.sql b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql index 6f0508d82b4..68945228313 100644 --- a/src/test/isolation2/sql/hot_standby/transaction_isolation.sql +++ b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql @@ -218,6 +218,81 @@ 3: end; -1S: select * from hs_rc; +---------------------------------------------------------------- +-- Repeatable-read isolation: distributed snapshot is created at time of the +-- first query in transaction block. All queries in the transaction block +-- should only see results committed before the distributed snapshot creation. 
+---------------------------------------------------------------- + +1: create table hs_rr(a int); +1: insert into hs_rr select * from generate_series(1,10); + +-1S: begin isolation level repeatable read; +-- should see 10 +-1S: select count(*) from hs_rr; + +-- do some more INSERT, DELETE and UPDATE +1: insert into hs_rr select * from generate_series(11,20); +1: delete from hs_rr where a <= 10; +1: update hs_rr set a = a + 100; + +-- should still the initial rows {1...10} +-1S: select * from hs_rr; +-1S: end; + +-- should see the results from the INSERT, DELETE and UPDATE +-1S: begin isolation level repeatable read; +-1S: select * from hs_rr; + +-- standby won't see ongoing or aborted transactions either +1: begin; +1: insert into hs_rr select * from generate_series(1,10); +2: begin; +2: insert into hs_rr select * from generate_series(1,10); +2: abort; + +-1S: select * from hs_rr; + +1: end; +-1S: end; + +---------------------------------------------------------------- +-- Transaction isolation is respected in subtransactions too +---------------------------------------------------------------- + +1: create table hs_subtrx(a int); + +-- (1) read-committed +-1S: begin; +-1S: select count(*) from hs_subtrx; +-1S: savepoint s1; + +1: insert into hs_subtrx select * from generate_series(1,10); + +-1S: select count(*) from hs_subtrx; +-1S: savepoint s2; +-1S: select count(*) from hs_subtrx; +-1S: rollback to savepoint s1; +-1S: select count(*) from hs_subtrx; +-1S: end; + +-- (2) repeatable-read +-1S: begin isolation level repeatable read; +-1S: select * from hs_subtrx; +-1S: savepoint s1; + +1: insert into hs_subtrx select * from generate_series(11,20); +1: delete from hs_subtrx where a <= 10; +1: update hs_subtrx set a = a + 100; + +-1S: select * from hs_subtrx; +-1S: savepoint s2; +-1S: select * from hs_subtrx; +-1S: rollback to savepoint s1; +-1S: select * from hs_subtrx; +-1S: end; +-1S: select * from hs_subtrx; + 
---------------------------------------------------------------- -- Various isolation tests that involve AO/CO table. ---------------------------------------------------------------- From 06f15f1732c95ab869dd6ddbfb2faaee2b1b5f4b Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Thu, 15 Feb 2024 08:37:56 -0800 Subject: [PATCH 07/11] Enable upstream hot standby tests Fixed result differences and non-runnable tests. The notable ones are: 1. Backward cursor fetch is not supported in GPDB. Move to the "_disallowed" test. 2. Some ERROR messages are replaced by "not supported" ones in GPDB which should be fine. 3. "cannot execute SELECT FOR SHARE in a read-only transaction" is replaced by "cannot acquire lock mode ExclusiveLock ... during recovery". The reason is that the QD needs to acquire the lock if GDD isn't enabled. If later we found it needed we may try to change the error message for standby just to be a little more informative. 4. The "test_setup" was added to the standby schedule by mistake. Removing it. With that, we can add this test schedule to the hot standby pipeline job. 
--- GNUmakefile.in | 5 +++ src/test/regress/GNUmakefile | 10 +++++ .../regress/expected/hs_primary_setup.out | 19 +++++++++ .../regress/expected/hs_standby_allowed.out | 24 +++++------ .../expected/hs_standby_disallowed.out | 40 +++++++++++++++---- .../regress/expected/hs_standby_functions.out | 15 ++++--- src/test/regress/sql/hs_primary_setup.sql | 7 ++++ src/test/regress/sql/hs_standby_allowed.sql | 7 ++-- .../regress/sql/hs_standby_disallowed.sql | 8 ++++ src/test/regress/sql/hs_standby_functions.sql | 4 +- 10 files changed, 106 insertions(+), 33 deletions(-) create mode 100644 src/test/regress/expected/hs_primary_setup.out diff --git a/GNUmakefile.in b/GNUmakefile.in index bde27f24aa4..d836f7ba525 100644 --- a/GNUmakefile.in +++ b/GNUmakefile.in @@ -212,6 +212,11 @@ installcheck-gpcheckcat: $(call recurse,installcheck-world,gpcontrib/gp_replica_check,installcheck) $(call recurse,installcheck-world,src/bin/pg_upgrade,check) +.PHONY: installcheck-hot-standby +installcheck-hot-standby: submake-generated-headers + $(MAKE) -C src/test/regress installcheck-hot-standby + $(MAKE) -C src/test/isolation2 installcheck-hot-standby + # Run mock tests, that don't require a running server. Arguably these should # be part of [install]check-world, but we treat them more like part of # compilation than regression testing, in the CI. But they are too heavy-weight diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile index 91d24a8292b..9bc8b67591e 100644 --- a/src/test/regress/GNUmakefile +++ b/src/test/regress/GNUmakefile @@ -241,6 +241,16 @@ endif standbycheck: all $(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/standby_schedule $(EXTRA_TESTS) +# GPDB: installcheck for hot standby. This is essentially same as the upstream 'standbycheck' +# above but we just make sure that we do the primary preparation and use the desired standby port. +# If no standby port is given, just use the demo cluster's standby port 7001. 
+ifeq ($(STANDBY_PGPORT),) + STANDBY_PGPORT = 7001 +endif +installcheck-hot-standby: all + $(pg_regress_installcheck) $(REGRESS_OPTS) hs_primary_setup + $(pg_regress_installcheck) $(REGRESS_OPTS) --port=$(STANDBY_PGPORT) --use-existing --schedule=$(srcdir)/standby_schedule $(EXTRA_TESTS) + # old interfaces follow... runcheck: check diff --git a/src/test/regress/expected/hs_primary_setup.out b/src/test/regress/expected/hs_primary_setup.out new file mode 100644 index 00000000000..0184b2b73e9 --- /dev/null +++ b/src/test/regress/expected/hs_primary_setup.out @@ -0,0 +1,19 @@ +-- +-- Hot Standby tests +-- +-- hs_primary_setup.sql +-- +drop table if exists hs1; +create table hs1 (col1 integer primary key); +insert into hs1 values (1); +drop table if exists hs2; +create table hs2 (col1 integer primary key); +insert into hs2 values (12); +insert into hs2 values (13); +drop table if exists hs3; +create table hs3 (col1 integer primary key); +insert into hs3 values (113); +insert into hs3 values (114); +insert into hs3 values (115); +DROP sequence if exists hsseq; +create sequence hsseq; diff --git a/src/test/regress/expected/hs_standby_allowed.out b/src/test/regress/expected/hs_standby_allowed.out index 00b8faf9eb6..e6b6514642f 100644 --- a/src/test/regress/expected/hs_standby_allowed.out +++ b/src/test/regress/expected/hs_standby_allowed.out @@ -164,31 +164,25 @@ show synchronous_commit; reset synchronous_commit; discard temp; discard all; +NOTICE: command without clusterwide effect +HINT: Consider alternatives as DEALLOCATE ALL, or DISCARD TEMP if a clusterwide effect is desired. 
-- CURSOR commands BEGIN; -DECLARE hsc CURSOR FOR select * from hs3; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; FETCH next from hsc; col1 ------ 113 (1 row) -fetch first from hsc; - col1 ------- - 113 -(1 row) - -fetch last from hsc; - col1 ------- - 115 -(1 row) - +-- GPDB: backward fetch isn't allowed, moved to hs_standby_disallowed +-- fetch first from hsc; +-- fetch last from hsc; fetch 1 from hsc; col1 ------ -(0 rows) + 114 +(1 row) CLOSE hsc; COMMIT; @@ -216,3 +210,5 @@ UNLISTEN *; -- ALLOWED COMMANDS CHECKPOINT; discard all; +NOTICE: command without clusterwide effect +HINT: Consider alternatives as DEALLOCATE ALL, or DISCARD TEMP if a clusterwide effect is desired. diff --git a/src/test/regress/expected/hs_standby_disallowed.out b/src/test/regress/expected/hs_standby_disallowed.out index 8d3cafa5cec..853fa853c81 100644 --- a/src/test/regress/expected/hs_standby_disallowed.out +++ b/src/test/regress/expected/hs_standby_disallowed.out @@ -11,9 +11,15 @@ commit; WARNING: there is no transaction in progress -- SELECT select * from hs1 FOR SHARE; -ERROR: cannot execute SELECT FOR SHARE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: select * from hs1 FOR SHARE; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. select * from hs1 FOR UPDATE; -ERROR: cannot execute SELECT FOR UPDATE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: select * from hs1 FOR UPDATE; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. 
-- DML BEGIN; insert into hs1 values (37); @@ -21,11 +27,17 @@ ERROR: cannot execute INSERT in a read-only transaction ROLLBACK; BEGIN; delete from hs1 where col1 = 1; -ERROR: cannot execute DELETE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: delete from hs1 where col1 = 1; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. ROLLBACK; BEGIN; update hs1 set col1 = NULL where col1 > 0; -ERROR: cannot execute UPDATE in a read-only transaction +ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress +LINE 1: update hs1 set col1 = NULL where col1 > 0; + ^ +HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery. ROLLBACK; BEGIN; truncate hs3; @@ -54,7 +66,7 @@ SELECT count(*) FROM hs1; (1 row) PREPARE TRANSACTION 'foobar'; -ERROR: cannot execute PREPARE TRANSACTION during recovery +ERROR: PREPARE TRANSACTION is not yet supported in Greenplum Database ROLLBACK; BEGIN; SELECT count(*) FROM hs1; @@ -64,7 +76,7 @@ SELECT count(*) FROM hs1; (1 row) COMMIT PREPARED 'foobar'; -ERROR: cannot execute COMMIT PREPARED during recovery +ERROR: COMMIT PREPARED is not yet supported in Greenplum Database ROLLBACK; BEGIN; SELECT count(*) FROM hs1; @@ -74,7 +86,7 @@ SELECT count(*) FROM hs1; (1 row) PREPARE TRANSACTION 'foobar'; -ERROR: cannot execute PREPARE TRANSACTION during recovery +ERROR: PREPARE TRANSACTION is not yet supported in Greenplum Database ROLLBACK PREPARED 'foobar'; ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; @@ -86,7 +98,7 @@ SELECT count(*) FROM hs1; (1 row) ROLLBACK PREPARED 'foobar'; -ERROR: cannot execute ROLLBACK PREPARED during recovery +ERROR: ROLLBACK PREPARED is not yet supported in Greenplum Database ROLLBACK; -- Locks BEGIN; @@ -131,3 +143,15 @@ REVOKE SELECT ON hs1 FROM PUBLIC; ERROR: cannot execute REVOKE 
in a read-only transaction GRANT SELECT ON hs1 TO PUBLIC; ERROR: cannot execute GRANT in a read-only transaction +-- GPDB: backward fetch is not supported, moved from hs_standby_allowed. +BEGIN; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; +fetch next from hsc; + col1 +------ + 113 +(1 row) + +fetch first from hsc; +ERROR: backward scan is not supported in this version of Greenplum Database +COMMIT; diff --git a/src/test/regress/expected/hs_standby_functions.out b/src/test/regress/expected/hs_standby_functions.out index ce846b758bf..48cb480f47a 100644 --- a/src/test/regress/expected/hs_standby_functions.out +++ b/src/test/regress/expected/hs_standby_functions.out @@ -27,13 +27,16 @@ select * from pg_prepared_xacts; -------------+-----+----------+-------+---------- (0 rows) --- just the startup process -select locktype, virtualxid, virtualtransaction, mode, granted +-- just the startup processes of all standby coordinator and segments, since pg_locks show cluster-wide view +select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted from pg_locks where virtualxid = '1/1'; - locktype | virtualxid | virtualtransaction | mode | granted -------------+------------+--------------------+---------------+--------- - virtualxid | 1/1 | 1/0 | ExclusiveLock | t -(1 row) + gp_segment_id | locktype | virtualxid | virtualtransaction | mode | granted +---------------+------------+------------+--------------------+---------------+--------- + -1 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t + 0 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t + 1 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t + 2 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t +(4 rows) -- suicide is painless select pg_cancel_backend(pg_backend_pid()); diff --git a/src/test/regress/sql/hs_primary_setup.sql b/src/test/regress/sql/hs_primary_setup.sql index eeb4421307f..a90979a89ba 100644 --- a/src/test/regress/sql/hs_primary_setup.sql +++ b/src/test/regress/sql/hs_primary_setup.sql @@ -22,4 
+22,11 @@ insert into hs3 values (115); DROP sequence if exists hsseq; create sequence hsseq; +-- start_ignore SELECT pg_switch_wal(); + +-- GPDB: enable hot_standby for this cluster +\! gpconfig -c hot_standby -v on; +\! gpstop -ar; + +-- end_ignore diff --git a/src/test/regress/sql/hs_standby_allowed.sql b/src/test/regress/sql/hs_standby_allowed.sql index 6debddc5e99..873f3ef8643 100644 --- a/src/test/regress/sql/hs_standby_allowed.sql +++ b/src/test/regress/sql/hs_standby_allowed.sql @@ -82,11 +82,12 @@ discard all; BEGIN; -DECLARE hsc CURSOR FOR select * from hs3; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; FETCH next from hsc; -fetch first from hsc; -fetch last from hsc; +-- GPDB: backward fetch isn't allowed, moved to hs_standby_disallowed +-- fetch first from hsc; +-- fetch last from hsc; fetch 1 from hsc; CLOSE hsc; diff --git a/src/test/regress/sql/hs_standby_disallowed.sql b/src/test/regress/sql/hs_standby_disallowed.sql index a470600eec8..72066e2d40b 100644 --- a/src/test/regress/sql/hs_standby_disallowed.sql +++ b/src/test/regress/sql/hs_standby_disallowed.sql @@ -101,3 +101,11 @@ REINDEX TABLE hs2; REVOKE SELECT ON hs1 FROM PUBLIC; GRANT SELECT ON hs1 TO PUBLIC; + +-- GPDB: backward fetch is not supported, moved from hs_standby_allowed. 
+BEGIN; +DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc; +fetch next from hsc; +fetch first from hsc; +COMMIT; + diff --git a/src/test/regress/sql/hs_standby_functions.sql b/src/test/regress/sql/hs_standby_functions.sql index b57f67ff8b5..903c8f96037 100644 --- a/src/test/regress/sql/hs_standby_functions.sql +++ b/src/test/regress/sql/hs_standby_functions.sql @@ -16,8 +16,8 @@ select pg_stop_backup(); -- should return no rows select * from pg_prepared_xacts; --- just the startup process -select locktype, virtualxid, virtualtransaction, mode, granted +-- just the startup processes of all standby coordinator and segments, since pg_locks show cluster-wide view +select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted from pg_locks where virtualxid = '1/1'; -- suicide is painless From 773bbba416e285b271f9e5f36c8e872722cb6193 Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Thu, 7 Mar 2024 15:11:27 -0800 Subject: [PATCH 08/11] Make sure query conflict on the standby works as expected For the most part, query conflict on standby works w/o any changes in GPDB. Add tests for the expected beavior. One notable issue is that we are not considering distributed snapshot in the snapshot conflict detection at this point. We added test for that behavior too. Add these tests: 1. All the query conflict types mentioned in https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT. There is actually one that's not mentioned there which is deadlock conflict. Still yet to produce a test for that. 2. GUCs hot_standby_feedback and vacuum_defer_cleanup_age. 3. System view gp_stat_database_conflicts which is a cluster-wide view of pg_stat_database_conflicts. Note that, in the test we need to get the max of conflict count among all segments to avoid flakiness. Ideally we should just have something like gp_stat_database_conflicts_summary to print the max counts, but we are not allowed to change catalog now. So leaving that as a FIXME item. 4. 
A test case showing distributed snapshot isn't taken into account when detecting snapshot conflict. This is a limitation that we'll address with a restore-point based dtx snapshot creation approach later. --- src/backend/access/heap/heapam.c | 8 + src/backend/catalog/system_views.sql | 4 + src/backend/storage/file/fd.c | 1 + src/backend/storage/ipc/standby.c | 3 + .../isolation2/expected/hot_standby/setup.out | 6 + .../expected/hot_standby/teardown.out | 2 + src/test/isolation2/hot_standby_schedule | 1 + .../input/hot_standby/query_conflict.source | 225 +++++++++ .../output/hot_standby/query_conflict.source | 470 ++++++++++++++++++ src/test/isolation2/sql/.gitignore | 1 + src/test/isolation2/sql/hot_standby/setup.sql | 4 + .../isolation2/sql/hot_standby/teardown.sql | 1 + 12 files changed, 726 insertions(+) create mode 100644 src/test/isolation2/input/hot_standby/query_conflict.source create mode 100644 src/test/isolation2/output/hot_standby/query_conflict.source diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c index 048ce9231a9..0003425b79f 100644 --- a/src/backend/access/heap/heapam.c +++ b/src/backend/access/heap/heapam.c @@ -530,6 +530,14 @@ heapgetpage(TableScanDesc sscan, BlockNumber page) LockBuffer(buffer, BUFFER_LOCK_UNLOCK); +#ifdef FAULT_INJECTOR + FaultInjector_InjectFaultIfSet( + "heapgetpage_after_unlock_buffer", + DDLNotSpecified, + "", /* databaseName */ + RelationGetRelationName(scan->rs_base.rs_rd)); /* tableName */ +#endif + Assert(ntup <= MaxHeapTuplesPerPage); scan->rs_ntuples = ntup; } diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 6b0b604ab5e..4cb9a9b57d6 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1498,6 +1498,10 @@ rq.oid=rc.resqueueid AND rc.restypid = rt.restypid ORDER BY rsqname, restypid ; +-- FIXME: we have a cluster-wide view gp_stat_database_conflicts, but that is +-- only showing conflicts of every 
segment. Some conflict might be encountered +-- on just part of the segments. Ideally we should have a view like +-- gp_stat_database_conflicts_summary that prints the overall conflicts and types. CREATE VIEW pg_stat_database_conflicts AS SELECT D.oid AS datid, diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c index 73a53822b3d..e2953686b8e 100644 --- a/src/backend/storage/file/fd.c +++ b/src/backend/storage/file/fd.c @@ -1712,6 +1712,7 @@ OpenTemporaryFile(bool interXact, const char *filePrefix) if (!interXact) RegisterTemporaryFile(file); + SIMPLE_FAULT_INJECTOR("after_open_temp_file"); return file; } diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c index 8891b4dbcb4..13dc551ca54 100644 --- a/src/backend/storage/ipc/standby.c +++ b/src/backend/storage/ipc/standby.c @@ -30,6 +30,7 @@ #include "storage/procarray.h" #include "storage/sinvaladt.h" #include "storage/standby.h" +#include "utils/faultinjector.h" #include "utils/hsearch.h" #include "utils/memutils.h" #include "utils/ps_status.h" @@ -849,6 +850,8 @@ SendRecoveryConflictWithBufferPin(ProcSignalReason reason) * SIGUSR1 handling in each backend decide their own fate. 
*/ CancelDBBackends(InvalidOid, reason, false); + + SIMPLE_FAULT_INJECTOR("recovery_conflict_bufferpin_signal_sent"); } /* diff --git a/src/test/isolation2/expected/hot_standby/setup.out b/src/test/isolation2/expected/hot_standby/setup.out index 65fa2164584..f8f1e02fe40 100644 --- a/src/test/isolation2/expected/hot_standby/setup.out +++ b/src/test/isolation2/expected/hot_standby/setup.out @@ -4,5 +4,11 @@ -- let primary wait for standby to apply changes, make test less flaky !\retcode gpconfig -c synchronous_commit -v remote_apply; (exited with code 0) +-- make it faster to handle query conflict +!\retcode gpconfig -c max_standby_streaming_delay -v 1000; +(exited with code 0) +-- disable autovacuum, to not affect the manual VACUUM in the tests +!\retcode gpconfig -c autovacuum -v off; +(exited with code 0) !\retcode gpstop -ar; (exited with code 0) diff --git a/src/test/isolation2/expected/hot_standby/teardown.out b/src/test/isolation2/expected/hot_standby/teardown.out index d118ce08fa8..8b4e1271610 100644 --- a/src/test/isolation2/expected/hot_standby/teardown.out +++ b/src/test/isolation2/expected/hot_standby/teardown.out @@ -3,5 +3,7 @@ (exited with code 0) !\retcode gpconfig -r synchronous_commit; (exited with code 0) +!\retcode gpconfig -r max_standby_streaming_delay; +(exited with code 0) !\retcode gpstop -ar; (exited with code 0) diff --git a/src/test/isolation2/hot_standby_schedule b/src/test/isolation2/hot_standby_schedule index 3a120bf8582..73e0f71a84c 100644 --- a/src/test/isolation2/hot_standby_schedule +++ b/src/test/isolation2/hot_standby_schedule @@ -1,5 +1,6 @@ test: hot_standby/setup test: hot_standby/basic test: hot_standby/transaction_isolation +test: hot_standby/query_conflict test: hot_standby/faults test: hot_standby/teardown diff --git a/src/test/isolation2/input/hot_standby/query_conflict.source b/src/test/isolation2/input/hot_standby/query_conflict.source new file mode 100644 index 00000000000..0e2706bfa5a --- /dev/null +++ 
b/src/test/isolation2/input/hot_standby/query_conflict.source @@ -0,0 +1,225 @@ +-- Tests for query conflict detection and cancellation on the hot standby. + +---------------------------------------------------------------- +-- Various query conflcit cases for hot standy. +-- +-- All cases are written in this pattern: +-- 1. Start a standby transaction that will be conflicted and cancelled; +-- 2. Start a primary transaction that will conflict it; +-- 3. Commit the primary transaction. Since we are using remote_apply, it will +-- wait until the WAL is applied on the standby, which would happen only +-- after the standby query is cancelled; +-- 4. Run something on the standby transaction and see the conflict error, which +-- in some cases it's ERROR, in others it's FATAL. +-- 5. Quit, establish a new connection, and re-run +-- 6. Check the system view gp_stat_database_conflicts to see that the conflict +-- has been recorded. Note that we print the max count among all segments +-- to avoid flakiness. +-- See https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT for more details. 
+---------------------------------------------------------------- + +-- We assume we start the test with clean records +-1S: select max(confl_tablespace), max(confl_lock), max(confl_snapshot), max(confl_bufferpin), max(confl_deadlock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with explicit lock +--------------------------------------------------------------------- +create table hs_qc_lock(a int); +insert into hs_qc_lock select * from generate_series(1,5); +-1S: begin; +-1S: select * from hs_qc_lock; +1: begin; +1: lock table hs_qc_lock in access exclusive mode; +1: end; +-1S: select * from hs_qc_lock; +-1Sq: +-1S: select * from hs_qc_lock; +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with implicit lock +--------------------------------------------------------------------- +-1S: begin; +-1S: select * from hs_qc_lock; +1: alter table hs_qc_lock set access method ao_row; +-1S: select * from hs_qc_lock; +-1Sq: +-1S: select * from hs_qc_lock; +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with drop database +--------------------------------------------------------------------- +1: create database hs_qc_dropdb; +-1Sq: +-1S:@db_name hs_qc_dropdb: select 1; +1: drop database hs_qc_dropdb; +-1S: select 1; +-1Sq: +-- Stats aren't counted for database conflicts. 
See: pgstat_recv_recoveryconflict + +--------------------------------------------------------------------- +-- Conflict with VACUUM (snapshot) +--------------------------------------------------------------------- +1: create table hs_qc_vac1(a int); +1: insert into hs_qc_vac1 select * from generate_series(1,10); +-1S: begin transaction isolation level repeatable read; +-1S: select count(*) from hs_qc_vac1; +1: delete from hs_qc_vac1; +1: vacuum hs_qc_vac1; +-1S: select count(*) from hs_qc_vac1; +-1Sq: +-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with VACUUM (buffer pin) +-- VACUUM of page that the standby is still holding buffer pin on, the difference with +-- the previous case is that here the deleted row is already invisible to the standby. +--------------------------------------------------------------------- +1: create table hs_qc_vac2(a int); +1: insert into hs_qc_vac2 values(2); +1: delete from hs_qc_vac2; +-- run select once on the standby, so the next select will fetch data from buffer +-1S: select * from hs_qc_vac2; +-- suspend the standby at where it just unlocks the buffer but still holds the pin +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'suspend','','','hs_qc_vac2',1,1,0,dbid) from gp_segment_configuration where content=0 and role='m'; +-- we'll also make sure the startup process has sent out the signal before we let the standby backend release the pin +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'skip',dbid) from gp_segment_configuration where content=0 and role='m'; +-1S&: select * from hs_qc_vac2; +1: vacuum hs_qc_vac2; +-- as mentioned before, make sure startup process has sent the signal, and then let the standby proceed +1: select gp_wait_until_triggered_fault('recovery_conflict_bufferpin_signal_sent', 1,dbid) from gp_segment_configuration where content=0 
and role='m'; +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; +-- should see the conflict +-1S<: +-1Sq: +-- XXX: sometimes it shows the number is 2 instead of 1. It still validates the test but it would be nice to know why. +-1S: select max(confl_bufferpin) > 0 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +--------------------------------------------------------------------- +-- Conflict with drop (temp) tablespace +-- Note: regular user tablespaces won't cause conflict on the standby since the standby cannot create any objects under them. +--------------------------------------------------------------------- +-- create tablespace +!\retcode mkdir -p @testtablespace@/hs_tablespace_directory; +create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory'; + +-- some preparation on the primary +create table hs_ts_foo (i int, j int) distributed by(i); +insert into hs_ts_foo select i, i from generate_series(1,800000)i; +analyze hs_ts_foo; + +-- make sure the standby won't run too fast and delete the temp files +select gp_inject_fault('after_open_temp_file', 'suspend',dbid) from gp_segment_configuration where content=1 and role='m'; + +-- on the standby, run some query that requires workfile, this example is taken +-- from regress/temp_tablespaces test +-1S: set temp_tablespaces = hs_ts; +-1S: set default_tablespace = hs_ts; +-1S: set statement_mem='2MB'; +-1S&: with a1 as (select * from hs_ts_foo), a2 as (select * from hs_ts_foo) select a1.i xx from a1 inner join a2 on a2.i = a1.i union all select count(a1.i) from a1 inner join a2 on a2.i = a1.i order by xx limit 5; + +-- drop tablespace, should see conflict on the hot standby +drop tablespace hs_ts; +select gp_inject_fault('after_open_temp_file',
'reset',dbid) from gp_segment_configuration where content=1 and role='m'; +-1S<: +-1Sq: + +-- conflict has been recorded +-1S: select max(confl_tablespace) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +-- cleanup +!\retcode rm -rf @testtablespace@/hs_tablespace_directory; +-- Do one checkpoint. Otherwise if server restarts w/o doing checkpoint (some subsequent +-- tests might do that), the server would complain it cannot find the directory for hs_ts. +checkpoint; + +---------------------------------------------------------------- +-- Additional case to show that distributed transaction is not taken into +-- account w/o the help of restore-point-based distributed snapshot creation. +---------------------------------------------------------------- + +1: create table hs_qc_ds1(a int); +1: insert into hs_qc_ds1 select * from generate_series(1,10); +-- standby starts a repeatable read transaction, runs a local query that +-- creates a distributed snapshot w/o creating QE. +-1S: select count(*) from hs_qc_ds1; +-1S: begin transaction isolation level repeatable read; +-1S: select relname from pg_class where relname = 'hs_qc_ds1'; +-- primary runs VACUUM +1: delete from hs_qc_ds1; +1: vacuum hs_qc_ds1; +-- The standby query in theory should be cancelled, because it started before +-- the VACUUM. But in reality, it doesn't, and sees 0 rows, because the QE for the +-- SELECT below will create more recent local snapshot that does not conflict with +-- the VACUUM, and sees the result of DELETE+VACUUM. +-- Note: with the help of restore point, we would be able to create local snapshot +-- precisely corresponding to each distributed snapshot, and do conflict detection accordingly. 
+-1S: select count(*) from hs_qc_ds1; +-1S: end; + +---------------------------------------------------------------- +-- Test GUC hot_standby_feedback +---------------------------------------------------------------- +!\retcode gpconfig -c hot_standby_feedback -v on; +!\retcode gpstop -u; + +1: create table hs_qc_guc1(a int); +1: insert into hs_qc_guc1 select * from generate_series(1,10); + +-1S: begin transaction isolation level repeatable read; +-1S: select * from hs_qc_guc1; + +-- VACUUM won't cleanup this table since the standby still sees it +1: delete from hs_qc_guc1; +1: vacuum hs_qc_guc1; + +-- hot standby can still see those rows +-1S: select * from hs_qc_guc1; + +-- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table +-1S: end; +1: vacuum hs_qc_guc1; +-1S: select * from hs_qc_guc1; +-1Sq: + +!\retcode gpconfig -r hot_standby_feedback; +!\retcode gpstop -u; + +---------------------------------------------------------------- +-- Test GUC vacuum_defer_cleanup_age +---------------------------------------------------------------- +-- Use a GUC value that's not 0, so VACUUM does not clean up +-- recent dead rows that the hot standby might be still seeing. 
+!\retcode gpconfig -c vacuum_defer_cleanup_age -v 1; +!\retcode gpstop -u; + +1: create table hs_qc_guc2(a int); +1: insert into hs_qc_guc2 select * from generate_series(1,10); + +-1S: begin transaction isolation level repeatable read; +-1S: select count(*) from hs_qc_guc2; + +-- VACUUM won't cleanup this table since the DELETE is still within vacuum_defer_cleanup_age +1: delete from hs_qc_guc2; +1: vacuum hs_qc_guc2; + +-- showing all rows are deleted but not vacuumed +1: select count(*) from hs_qc_guc2; +1: set gp_select_invisible to on; +1: select count(*) from hs_qc_guc2; + +-- hot standby can still query the table +-1S: select count(*) from hs_qc_guc2; + +-- only if the age is reached, hot standby will see the same conflict as before +1: create temp table tt1(a int); +1: vacuum hs_qc_guc2; +-1S: select count(*) from hs_qc_guc2; +-1Sq: +-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + +!\retcode gpconfig -r vacuum_defer_cleanup_age; +!\retcode gpstop -u; + diff --git a/src/test/isolation2/output/hot_standby/query_conflict.source b/src/test/isolation2/output/hot_standby/query_conflict.source new file mode 100644 index 00000000000..a709f257007 --- /dev/null +++ b/src/test/isolation2/output/hot_standby/query_conflict.source @@ -0,0 +1,470 @@ +-- Tests for query conflict detection and cancellation on the hot standby. + +---------------------------------------------------------------- +-- Various query conflict cases for hot standby. +-- +-- All cases are written in this pattern: +-- 1. Start a standby transaction that will be conflicted and cancelled; +-- 2. Start a primary transaction that will conflict it; +-- 3. Commit the primary transaction. Since we are using remote_apply, it will +-- wait until the WAL is applied on the standby, which would happen only +-- after the standby query is cancelled; +-- 4.
Run something on the standby transaction and see the conflict error, which +-- in some cases it's ERROR, in others it's FATAL. +-- 5. Quit, establish a new connection, and re-run +-- 6. Check the system view gp_stat_database_conflicts to see that the conflict +-- has been recorded. Note that we print the max count among all segments +-- to avoid flakiness. +-- See https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT for more details. +---------------------------------------------------------------- + +-- We assume we start the test with clean records +-1S: select max(confl_tablespace), max(confl_lock), max(confl_snapshot), max(confl_bufferpin), max(confl_deadlock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max | max | max | max | max +-----+-----+-----+-----+----- + 0 | 0 | 0 | 0 | 0 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with explicit lock +--------------------------------------------------------------------- +create table hs_qc_lock(a int); +CREATE TABLE +insert into hs_qc_lock select * from generate_series(1,5); +INSERT 0 5 +-1S: begin; +BEGIN +-1S: select * from hs_qc_lock; + a +--- + 2 + 3 + 4 + 1 + 5 +(5 rows) +1: begin; +BEGIN +1: lock table hs_qc_lock in access exclusive mode; +LOCK TABLE +1: end; +COMMIT +-1S: select * from hs_qc_lock; +FATAL: terminating connection due to conflict with recovery +DETAIL: User was holding a relation lock for too long. +HINT: In a moment you should be able to reconnect to the database and repeat your command. +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... 
+-1S: select * from hs_qc_lock; + a +--- + 1 + 5 + 2 + 3 + 4 +(5 rows) +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 1 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with implicit lock +--------------------------------------------------------------------- +-1S: begin; +BEGIN +-1S: select * from hs_qc_lock; + a +--- + 1 + 5 + 2 + 3 + 4 +(5 rows) +1: alter table hs_qc_lock set access method ao_row; +ALTER TABLE +-1S: select * from hs_qc_lock; +FATAL: terminating connection due to conflict with recovery +DETAIL: User was holding a relation lock for too long. +HINT: In a moment you should be able to reconnect to the database and repeat your command. +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... +-1S: select * from hs_qc_lock; + a +--- + 1 + 5 + 2 + 3 + 4 +(5 rows) +-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 2 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with drop database +--------------------------------------------------------------------- +1: create database hs_qc_dropdb; +CREATE DATABASE +-1Sq: ... +-1S:@db_name hs_qc_dropdb: select 1; + ?column? +---------- + 1 +(1 row) +1: drop database hs_qc_dropdb; +DROP DATABASE +-1S: select 1; +FATAL: terminating connection due to conflict with recovery +DETAIL: User was connected to a database that must be dropped. +server closed the connection unexpectedly + This probably means the server terminated abnormally + before or while processing the request. +-1Sq: ... +-- Stats aren't counted for database conflicts. 
See: pgstat_recv_recoveryconflict + +--------------------------------------------------------------------- +-- Conflict with VACUUM (snapshot) +--------------------------------------------------------------------- +1: create table hs_qc_vac1(a int); +CREATE TABLE +1: insert into hs_qc_vac1 select * from generate_series(1,10); +INSERT 0 10 +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select count(*) from hs_qc_vac1; + count +------- + 10 +(1 row) +1: delete from hs_qc_vac1; +DELETE 10 +1: vacuum hs_qc_vac1; +VACUUM +-1S: select count(*) from hs_qc_vac1; +DETAIL: User query might have needed to see row versions that must be removed. +ERROR: terminating connection due to conflict with recovery +HINT: In a moment you should be able to reconnect to the database and repeat your command. +-1Sq: ... +-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 1 +(1 row) + +--------------------------------------------------------------------- +-- Conflict with VACUUM (buffer pin) +-- VACUUM of page that the standby is still holding buffer pin on, the difference with +-- the previous case is that here the deleted row is already invisible to the standby. 
+--------------------------------------------------------------------- +1: create table hs_qc_vac2(a int); +CREATE TABLE +1: insert into hs_qc_vac2 values(2); +INSERT 0 1 +1: delete from hs_qc_vac2; +DELETE 1 +-- run select once on the standby, so the next select will fetch data from buffer +-1S: select * from hs_qc_vac2; + a +--- +(0 rows) +-- suspend the standby at where it just unlocks the buffer but still holds the pin +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'suspend','','','hs_qc_vac2',1,1,0,dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-- we'll also make sure the startup process has sent out the signal before we let the standby backend release the pin +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'skip',dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-1S&: select * from hs_qc_vac2; +1: vacuum hs_qc_vac2; +VACUUM +-- as mentioned before, make sure startup process has sent the signal, and then let the standby proceed +1: select gp_wait_until_triggered_fault('recovery_conflict_bufferpin_signal_sent', 1,dbid) from gp_segment_configuration where content=0 and role='m'; + gp_wait_until_triggered_fault +------------------------------- + Success: +(1 row) +1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'reset',dbid) from gp_segment_configuration where content=0 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) +-- should see the conflict +-1S<: <... completed> +ERROR: canceling statement due to conflict with recovery (seg0 slice1 127.0.1.1:7005 pid=17044) +DETAIL: User was holding shared buffer pin for too long. +-1Sq: ... 
+-- XXX: sometimes it shows the number is 2 instead of 1. It still validates the test but it would be nice to know why. +-1S: select max(confl_bufferpin) > 0 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + ?column? +---------- + t +(1 row) + +--------------------------------------------------------------------- +-- Conflict with drop (temp) tablespace +-- Note: regular user tablespaces won't cause conflict on the standby since the standby cannot create any objects under them. +--------------------------------------------------------------------- +-- create tablespace +!\retcode mkdir -p @testtablespace@/hs_tablespace_directory; +(exited with code 0) +create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory'; +CREATE TABLESPACE + +-- some preparation on the primary +create table hs_ts_foo (i int, j int) distributed by(i); +CREATE TABLE +insert into hs_ts_foo select i, i from generate_series(1,800000)i; +INSERT 0 800000 +analyze hs_ts_foo; +ANALYZE + +-- make sure the standby won't run too fast and delete the temp files +select gp_inject_fault('after_open_temp_file', 'suspend',dbid) from gp_segment_configuration where content=1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row) + +-- on the standby, run some query that requires workfile, this example is taken +-- from regress/temp_tablespaces test +-1S: set temp_tablespaces = hs_ts; +SET +-1S: set default_tablespace = hs_ts; +SET +-1S: set statement_mem='2MB'; +SET +-1S&: with a1 as (select * from hs_ts_foo), a2 as (select * from hs_ts_foo) select a1.i xx from a1 inner join a2 on a2.i = a1.i union all select count(a1.i) from a1 inner join a2 on a2.i = a1.i order by xx limit 5; + +-- drop tablespace, should see conflict on the hot standby +drop tablespace hs_ts; +DROP TABLESPACE +select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_configuration where content=1 and role='m'; + gp_inject_fault +----------------- + Success: +(1 row)
+-1S<: <... completed> +ERROR: canceling statement due to conflict with recovery (seg1 slice3 127.0.1.1:7006 pid=990) +DETAIL: User was or might have been using tablespace that must be dropped. +-1Sq: ... + +-- conflict has been recorded +-1S: select max(confl_tablespace) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 1 +(1 row) + +-- cleanup +!\retcode rm -rf @testtablespace@/hs_tablespace_directory; +GP_IGNORE:-- start_ignore +GP_IGNORE: +GP_IGNORE:-- end_ignore +(exited with code 0) +-- Do one checkpoint. Otherwise if server restarts w/o doing checkpoint (some subsequent +-- tests might do that), the server would complain it cannot find the directory for hs_ts. +checkpoint; +CHECKPOINT + +---------------------------------------------------------------- +-- Additional case to show that distributed transaction is not taken into +-- account w/o the help of restore-point-based distributed snapshot creation. +---------------------------------------------------------------- + +1: create table hs_qc_ds1(a int); +CREATE TABLE +1: insert into hs_qc_ds1 select * from generate_series(1,10); +INSERT 0 10 +-- standby starts a repeatable read transaction, runs a local query that +-- creates a distributed snapshot w/o creating QE. +-1S: select count(*) from hs_qc_ds1; + count +------- + 10 +(1 row) +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select relname from pg_class where relname = 'hs_qc_ds1'; + relname +----------- + hs_qc_ds1 +(1 row) +-- primary runs VACUUM +1: delete from hs_qc_ds1; +DELETE 10 +1: vacuum hs_qc_ds1; +VACUUM +-- The standby query in theory should be cancelled, because it started before +-- the VACUUM. But in reality, it doesn't, and sees 0 rows, because the QE for the +-- SELECT below will create more recent local snapshot that does not conflict with +-- the VACUUM, and sees the result of DELETE+VACUUM. 
+-- Note: with the help of restore point, we would be able to create local snapshot +-- precisely corresponding to each distributed snapshot, and do conflict detection accordingly. +-1S: select count(*) from hs_qc_ds1; + count +------- + 0 +(1 row) +-1S: end; +COMMIT + +---------------------------------------------------------------- +-- Test GUC hot_standby_feedback +---------------------------------------------------------------- +!\retcode gpconfig -c hot_standby_feedback -v on; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +1: create table hs_qc_guc1(a int); +CREATE TABLE +1: insert into hs_qc_guc1 select * from generate_series(1,10); +INSERT 0 10 + +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select * from hs_qc_guc1; + a +---- + 1 + 10 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) + +-- VACUUM won't cleanup this table since the standby still sees it +1: delete from hs_qc_guc1; +DELETE 10 +1: vacuum hs_qc_guc1; +VACUUM + +-- hot standby can still see those rows +-1S: select * from hs_qc_guc1; + a +---- + 1 + 10 + 2 + 3 + 4 + 5 + 6 + 7 + 8 + 9 +(10 rows) + +-- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table +-1S: end; +COMMIT +1: vacuum hs_qc_guc1; +VACUUM +-1S: select * from hs_qc_guc1; + a +--- +(0 rows) +-1Sq: ... + +!\retcode gpconfig -r hot_standby_feedback; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +---------------------------------------------------------------- +-- Test GUC vacuum_defer_cleanup_age +---------------------------------------------------------------- +-- Use a GUC value that's not 0, so VACUUM does not clean up +-- recent dead rows that the hot standby might be still seeing. 
+!\retcode gpconfig -c vacuum_defer_cleanup_age -v 1; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + +1: create table hs_qc_guc2(a int); +CREATE TABLE +1: insert into hs_qc_guc2 select * from generate_series(1,10); +INSERT 0 10 + +-1S: begin transaction isolation level repeatable read; +BEGIN +-1S: select count(*) from hs_qc_guc2; + count +------- + 10 +(1 row) + +-- VACUUM won't cleanup this table since the DELETE is still within vacuum_defer_cleanup_age +1: delete from hs_qc_guc2; +DELETE 10 +1: vacuum hs_qc_guc2; +VACUUM + +-- showing all rows are deleted but not vacuumed +1: select count(*) from hs_qc_guc2; + count +------- + 0 +(1 row) +1: set gp_select_invisible to on; +SET +1: select count(*) from hs_qc_guc2; + count +------- + 10 +(1 row) + +-- hot standby can still query the table +-1S: select count(*) from hs_qc_guc2; + count +------- + 10 +(1 row) + +-- only if the age is reached, hot standby will see the same conflict as before +1: create temp table tt1(a int); +CREATE TABLE +1: vacuum hs_qc_guc2; +VACUUM +-1S: select count(*) from hs_qc_guc2; +ERROR: terminating connection due to conflict with recovery (seg0 slice1 127.0.1.1:7005 pid=18713) +DETAIL: User query might have needed to see row versions that must be removed. +HINT: In a moment you should be able to reconnect to the database and repeat your command. +-1Sq: ... 
+-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + max +----- + 2 +(1 row) + +!\retcode gpconfig -r vacuum_defer_cleanup_age; +(exited with code 0) +!\retcode gpstop -u; +(exited with code 0) + diff --git a/src/test/isolation2/sql/.gitignore b/src/test/isolation2/sql/.gitignore index 361b986e18d..bfc3709082c 100644 --- a/src/test/isolation2/sql/.gitignore +++ b/src/test/isolation2/sql/.gitignore @@ -7,6 +7,7 @@ /pt_io_in_progress_deadlock.sql /distributed_snapshot.sql /local_directory_table_mixed.sql +/hot_standby/query_conflict.sql # ignores including sub-directories autovacuum-analyze.sql diff --git a/src/test/isolation2/sql/hot_standby/setup.sql b/src/test/isolation2/sql/hot_standby/setup.sql index cdf4ec67bd1..aa15f468b7d 100644 --- a/src/test/isolation2/sql/hot_standby/setup.sql +++ b/src/test/isolation2/sql/hot_standby/setup.sql @@ -2,4 +2,8 @@ !\retcode gpconfig -c hot_standby -v on; -- let primary wait for standby to apply changes, make test less flaky !\retcode gpconfig -c synchronous_commit -v remote_apply; +-- make it faster to handle query conflict +!\retcode gpconfig -c max_standby_streaming_delay -v 1000; +-- disable autovacuum, to not affect the manual VACUUM in the tests +!\retcode gpconfig -c autovacuum -v off; !\retcode gpstop -ar; diff --git a/src/test/isolation2/sql/hot_standby/teardown.sql b/src/test/isolation2/sql/hot_standby/teardown.sql index 3544c1d9beb..af6fba50aed 100644 --- a/src/test/isolation2/sql/hot_standby/teardown.sql +++ b/src/test/isolation2/sql/hot_standby/teardown.sql @@ -1,4 +1,5 @@ -- reset the setup for hot standby tests !\retcode gpconfig -r hot_standby; !\retcode gpconfig -r synchronous_commit; +!\retcode gpconfig -r max_standby_streaming_delay; !\retcode gpstop -ar; From dd683dff71c7088aa64c1cbd02129404bc6f3e46 Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Thu, 5 Jun 2025 18:16:40 +0800 Subject: [PATCH 09/11] Fix hot_standby isolation2 and regress tests --- 
.../isolation2/expected/hot_standby/basic.out | 18 +-- .../expected/hot_standby/faults.out | 15 ++- .../hot_standby/transaction_isolation.out | 106 +++++++++--------- .../output/hot_standby/query_conflict.source | 48 ++++---- .../expected/hs_standby_disallowed.out | 10 +- src/test/regress/pg_regress.c | 13 ++- src/test/regress/sql/hs_primary_setup.sql | 2 +- 7 files changed, 111 insertions(+), 101 deletions(-) diff --git a/src/test/isolation2/expected/hot_standby/basic.out b/src/test/isolation2/expected/hot_standby/basic.out index 99d9f407b01..5318a35d7d7 100644 --- a/src/test/isolation2/expected/hot_standby/basic.out +++ b/src/test/isolation2/expected/hot_standby/basic.out @@ -23,13 +23,13 @@ -- Test: basic query dispatch ---------------------------------------------------------------- create table hs_t1(a int); -CREATE TABLE +CREATE create table hs_t2(a int); -CREATE TABLE +CREATE -- standby should see the results for 2pc immediately. insert into hs_t1 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: select * from hs_t1; a ---- @@ -49,14 +49,14 @@ INSERT 0 10 -- as long as another 2pc comes it will be able to see the previous 1pc. Wee -- tolerate this case in the mirrored cluster setup. insert into hs_t2 values(1); -INSERT 0 1 +INSERT 1 -1S: select * from hs_t2; a --- (0 rows) -- any following 2pc will make the 1pc visible create temp table tt(a int); -CREATE TABLE +CREATE -1S: select * from hs_t2; a --- @@ -115,7 +115,7 @@ CREATE TABLE begin; BEGIN insert into hs_t1 select * from generate_series(11,20); -INSERT 0 10 +INSERT 10 -- standby should only see 1...10 -1S: select * from hs_t1; @@ -134,7 +134,7 @@ INSERT 0 10 (10 rows) end; -COMMIT +END -- standby should see 1...20 now -1S: select * from hs_t1; @@ -224,9 +224,9 @@ ERROR: cannot execute VACUUM during recovery -- No hintbit WAL generation in SELECT. 
-- create table hs_nohintbit(a int) distributed by (a); -CREATE TABLE +CREATE insert into hs_nohintbit select generate_series (1, 10); -INSERT 0 10 +INSERT 10 -- flush the data to disk checkpoint; CHECKPOINT diff --git a/src/test/isolation2/expected/hot_standby/faults.out b/src/test/isolation2/expected/hot_standby/faults.out index f9f58eab83f..39f3a06cca6 100644 --- a/src/test/isolation2/expected/hot_standby/faults.out +++ b/src/test/isolation2/expected/hot_standby/faults.out @@ -18,9 +18,9 @@ (exited with code 0) create table hs_failover(a int); -CREATE TABLE +CREATE insert into hs_failover select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: select * from hs_failover; a ---- @@ -47,7 +47,7 @@ select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where -- make sure mirror is detected down create temp table hs_tt(a int); -CREATE TABLE +CREATE select gp_request_fts_probe_scan(); gp_request_fts_probe_scan --------------------------- @@ -137,10 +137,9 @@ HINT: Exit the current session and re-connect. -- will fail due to downed mirror (previous primary) -1S: select * from hs_failover; ERROR: failed to acquire resources on one or more segments -DETAIL: could not connect to server: Connection refused - Is the server running on host "127.0.1.1" and accepting - TCP/IP connections on port 7003? - (seg1 127.0.1.1:7003) +DETAIL: connection to server at "10.13.9.74", port 7003 failed: Connection refused + Is the server running on that host and accepting TCP/IP connections? + (seg1 10.13.9.74:7003) -1Sq: ... -- bring the downed mirror up @@ -219,7 +218,7 @@ select wait_until_all_segments_synchronized(); (1 row) 1: create table tt_hs_dtx(a int); -CREATE TABLE +CREATE -- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process. 
select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p'; diff --git a/src/test/isolation2/expected/hot_standby/transaction_isolation.out b/src/test/isolation2/expected/hot_standby/transaction_isolation.out index 139cca8e88e..3990bd7cd56 100644 --- a/src/test/isolation2/expected/hot_standby/transaction_isolation.out +++ b/src/test/isolation2/expected/hot_standby/transaction_isolation.out @@ -2,20 +2,20 @@ -- Test transaction isolation in general, not specific to dtx ---------------------------------------------------------------- 1: create table hs_tx(a int); -CREATE TABLE +CREATE 1: insert into hs_tx select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 1: begin; BEGIN 1: insert into hs_tx select * from generate_series(11,20); -INSERT 0 10 +INSERT 10 2: begin; BEGIN 2: insert into hs_tx select * from generate_series(21,30); -INSERT 0 10 +INSERT 10 2: abort; -ROLLBACK +ABORT -- standby should only see completed transactions, not in-progress transactions, nor aborted transactions -1S: select * from hs_tx; @@ -34,7 +34,7 @@ ROLLBACK (10 rows) 1: end; -COMMIT +END -1S: select * from hs_tx; a ---- @@ -65,9 +65,9 @@ COMMIT ---------------------------------------------------------------- 1: create table hs_dtx1(a int); -CREATE TABLE +CREATE 1: create table hs_dtx2(a int); -CREATE TABLE +CREATE -- inject two suspend faults: -- 1. on seg0, suspend before PREPARE phase of 2PC @@ -107,9 +107,9 @@ CREATE TABLE Success: (1 row) 1<: <... completed> -INSERT 0 10 +INSERT 10 2<: <... completed> -INSERT 0 10 +INSERT 10 -- standby should see the results from the dtx now -1S: select * from hs_dtx1; @@ -146,9 +146,9 @@ INSERT 0 10 ---------------------------------------------------------------- 1: create table hs_abort_dtx1(a int); -CREATE TABLE +CREATE 1: create table hs_abort_dtx2(a int); -CREATE TABLE +CREATE -- inject two errors: -- 1. 
on seg0, error out before PREPARE phase of 2PC @@ -171,7 +171,7 @@ ERROR: fault triggered, fault name:'qe_start_prepared' fault type:'error' (seg Success: (1 row) 1: insert into hs_abort_dtx2 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p'; gp_inject_fault ----------------- @@ -203,7 +203,7 @@ INSERT 0 10 -- but also run more queries in between ---------------------------------------------------------------- 1: create table hs_dtx3(a int); -CREATE TABLE +CREATE -- inject faults to suspend segments in 2PC 1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p'; @@ -227,11 +227,11 @@ CREATE TABLE -- now run some dtx and completed 3: insert into hs_dtx3 values(99); -INSERT 0 1 +INSERT 1 3: create table hs_dtx4(a int); -CREATE TABLE +CREATE 3: insert into hs_dtx4 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -- standby should still not see rows in the in-progress DTX, but should see the completed ones -1S: select * from hs_dtx3; @@ -265,9 +265,9 @@ INSERT 0 10 Success: (1 row) 1<: <... completed> -INSERT 0 10 +INSERT 10 2<: <... completed> -INSERT 0 10 +INSERT 10 -- standby should see all rows now -1S: select * from hs_dtx3; @@ -301,9 +301,9 @@ INSERT 0 10 -- but after standby QD resets and gets running DTX from checkpoint. ---------------------------------------------------------------- 1: create table hs_t5(a int, b text); -CREATE TABLE +CREATE 1: create table hs_t6(a int, b text); -CREATE TABLE +CREATE -- inject fault to suspend a primary right before it conducts the commit phase of 2PC, -- so in the subsequent INSERT, all local transactions will be committed but the dtx is not. 
@@ -316,15 +316,15 @@ CREATE TABLE -- now run some dtx and completed, and primary conducts a checkpoint 2: insert into hs_t5 values(1, 'commited'); -INSERT 0 1 +INSERT 1 2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i; -INSERT 0 10 +INSERT 10 2: begin; BEGIN 2: insert into hs_t5 values(99, 'aborted'); -INSERT 0 1 +INSERT 1 2: abort; -ROLLBACK +ABORT 2: checkpoint; CHECKPOINT @@ -368,7 +368,7 @@ server closed the connection unexpectedly Success: (1 row) 1<: <... completed> -INSERT 0 10 +INSERT 10 -- standby should see all rows now -1S: select * from hs_t5; @@ -480,9 +480,9 @@ DELETE 10 ---------------------------------------------------------------- 1: create table hs_rc(a int); -CREATE TABLE +CREATE 1: insert into hs_rc select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -- case 1: suspend SELECT on the standby QD right after it created snapshot -1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m'; @@ -494,7 +494,7 @@ INSERT 0 10 -- new INSERT or DELETE won't be observed by the standby 1: insert into hs_rc select * from generate_series(11,20); -INSERT 0 10 +INSERT 10 1: delete from hs_rc where a < 5; DELETE 4 1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; @@ -550,7 +550,7 @@ DELETE 4 -1S&: select * from hs_rc; 1: insert into hs_rc select * from generate_series(21,30); -INSERT 0 10 +INSERT 10 1: delete from hs_rc where a < 21; DELETE 16 1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m'; @@ -580,9 +580,9 @@ DELETE 16 ---------------------------------------------------------------- 1: truncate hs_rc; -TRUNCATE TABLE +TRUNCATE 1: insert into hs_rc select * from generate_series(1,30); -INSERT 0 30 +INSERT 30 -1S: begin; BEGIN @@ -602,7 +602,7 @@ BEGIN 2: delete from hs_rc where 
a > 10 and a <= 20; DELETE 10 2: abort; -ROLLBACK +ABORT -- 3. an ongoing transaction 3: begin; BEGIN @@ -635,10 +635,10 @@ DELETE 10 29 (20 rows) -1S: end; -COMMIT +END 3: end; -COMMIT +END -1S: select * from hs_rc; a ---- @@ -661,9 +661,9 @@ COMMIT ---------------------------------------------------------------- 1: create table hs_rr(a int); -CREATE TABLE +CREATE 1: insert into hs_rr select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: begin isolation level repeatable read; BEGIN @@ -676,7 +676,7 @@ BEGIN -- do some more INSERT, DELETE and UPDATE 1: insert into hs_rr select * from generate_series(11,20); -INSERT 0 10 +INSERT 10 1: delete from hs_rr where a <= 10; DELETE 10 1: update hs_rr set a = a + 100; @@ -698,7 +698,7 @@ UPDATE 10 10 (10 rows) -1S: end; -COMMIT +END -- should see the results from the INSERT, DELETE and UPDATE -1S: begin isolation level repeatable read; @@ -722,13 +722,13 @@ BEGIN 1: begin; BEGIN 1: insert into hs_rr select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 2: begin; BEGIN 2: insert into hs_rr select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 2: abort; -ROLLBACK +ABORT -1S: select * from hs_rr; a @@ -746,16 +746,16 @@ ROLLBACK (10 rows) 1: end; -COMMIT +END -1S: end; -COMMIT +END ---------------------------------------------------------------- -- Transaction isolation is respected in subtransactions too ---------------------------------------------------------------- 1: create table hs_subtrx(a int); -CREATE TABLE +CREATE -- (1) read-committed -1S: begin; @@ -769,7 +769,7 @@ BEGIN SAVEPOINT 1: insert into hs_subtrx select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: select count(*) from hs_subtrx; count @@ -791,7 +791,7 @@ ROLLBACK 10 (1 row) -1S: end; -COMMIT +END -- (2) repeatable-read -1S: begin isolation level repeatable read; @@ -814,7 +814,7 @@ BEGIN SAVEPOINT 1: insert into hs_subtrx select * from generate_series(11,20); -INSERT 0 10 +INSERT 10 1: delete from hs_subtrx where a <= 
10; DELETE 10 1: update hs_subtrx set a = a + 100; @@ -867,7 +867,7 @@ ROLLBACK 10 (10 rows) -1S: end; -COMMIT +END -1S: select * from hs_subtrx; a ----- @@ -887,13 +887,13 @@ COMMIT -- Various isolation tests that involve AO/CO table. ---------------------------------------------------------------- 1: create table hs_ao(a int, id int unique) using ao_row; -CREATE TABLE +CREATE 1: insert into hs_ao select 1,i from generate_series(1,10) i; -INSERT 0 10 +INSERT 10 1: begin; BEGIN 1: insert into hs_ao select 2,i from generate_series(11,20) i; -INSERT 0 10 +INSERT 10 -- standby sees the same AO metadata as primary 2: select * from gp_toolkit.__gp_aoseg('hs_ao'); @@ -943,7 +943,7 @@ INSERT 0 10 -- standby sees the effect of vacuum 1: end; -COMMIT +END 1: delete from hs_ao where a = 1; DELETE 10 1: vacuum hs_ao; diff --git a/src/test/isolation2/output/hot_standby/query_conflict.source b/src/test/isolation2/output/hot_standby/query_conflict.source index a709f257007..397e3977d12 100644 --- a/src/test/isolation2/output/hot_standby/query_conflict.source +++ b/src/test/isolation2/output/hot_standby/query_conflict.source @@ -29,9 +29,9 @@ -- Conflict with explicit lock --------------------------------------------------------------------- create table hs_qc_lock(a int); -CREATE TABLE +CREATE insert into hs_qc_lock select * from generate_series(1,5); -INSERT 0 5 +INSERT 5 -1S: begin; BEGIN -1S: select * from hs_qc_lock; @@ -46,9 +46,9 @@ BEGIN 1: begin; BEGIN 1: lock table hs_qc_lock in access exclusive mode; -LOCK TABLE +LOCK 1: end; -COMMIT +END -1S: select * from hs_qc_lock; FATAL: terminating connection due to conflict with recovery DETAIL: User was holding a relation lock for too long. @@ -87,7 +87,7 @@ BEGIN 4 (5 rows) 1: alter table hs_qc_lock set access method ao_row; -ALTER TABLE +ALTER -1S: select * from hs_qc_lock; FATAL: terminating connection due to conflict with recovery DETAIL: User was holding a relation lock for too long. 
@@ -115,7 +115,7 @@ server closed the connection unexpectedly -- Conflict with drop database --------------------------------------------------------------------- 1: create database hs_qc_dropdb; -CREATE DATABASE +CREATE -1Sq: ... -1S:@db_name hs_qc_dropdb: select 1; ?column? @@ -123,7 +123,7 @@ CREATE DATABASE 1 (1 row) 1: drop database hs_qc_dropdb; -DROP DATABASE +DROP -1S: select 1; FATAL: terminating connection due to conflict with recovery DETAIL: User was connected to a database that must be dropped. @@ -137,9 +137,9 @@ server closed the connection unexpectedly -- Conflict with VACUUM (snapshot) --------------------------------------------------------------------- 1: create table hs_qc_vac1(a int); -CREATE TABLE +CREATE 1: insert into hs_qc_vac1 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: begin transaction isolation level repeatable read; BEGIN -1S: select count(*) from hs_qc_vac1; @@ -168,9 +168,9 @@ HINT: In a moment you should be able to reconnect to the database and repeat yo -- the previous case is that here the deleted row is already invisible to the standby. --------------------------------------------------------------------- 1: create table hs_qc_vac2(a int); -CREATE TABLE +CREATE 1: insert into hs_qc_vac2 values(2); -INSERT 0 1 +INSERT 1 1: delete from hs_qc_vac2; DELETE 1 -- run select once on the standby, so the next select will fetch data from buffer @@ -229,13 +229,13 @@ DETAIL: User was holding shared buffer pin for too long. 
!\retcode mkdir -p @testtablespace@/hs_tablespace_directory; (exited with code 0) create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory'; -CREATE TABLESPACE +CREATE -- some prepartion on the primary create table hs_ts_foo (i int, j int) distributed by(i); -CREATE TABLE +CREATE insert into hs_ts_foo select i, i from generate_series(1,800000)i; -INSERT 0 800000 +INSERT 800000 analyze hs_ts_foo; ANALYZE @@ -258,7 +258,7 @@ SET -- drop tablespace, should see conflict on the hot standby drop tablespace hs_ts; -DROP TABLESPACE +DROP select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_configuration where content=1 and role='m'; gp_inject_fault ----------------- @@ -293,9 +293,9 @@ CHECKPOINT ---------------------------------------------------------------- 1: create table hs_qc_ds1(a int); -CREATE TABLE +CREATE 1: insert into hs_qc_ds1 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -- standby starts a repeatable read transaction, runs a local query that -- creates a distributed snapshot w/o creating QE. 
-1S: select count(*) from hs_qc_ds1; @@ -327,7 +327,7 @@ VACUUM 0 (1 row) -1S: end; -COMMIT +END ---------------------------------------------------------------- -- Test GUC hot_standby_feedback @@ -338,9 +338,9 @@ COMMIT (exited with code 0) 1: create table hs_qc_guc1(a int); -CREATE TABLE +CREATE 1: insert into hs_qc_guc1 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: begin transaction isolation level repeatable read; BEGIN @@ -383,7 +383,7 @@ VACUUM -- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table -1S: end; -COMMIT +END 1: vacuum hs_qc_guc1; VACUUM -1S: select * from hs_qc_guc1; @@ -408,9 +408,9 @@ VACUUM (exited with code 0) 1: create table hs_qc_guc2(a int); -CREATE TABLE +CREATE 1: insert into hs_qc_guc2 select * from generate_series(1,10); -INSERT 0 10 +INSERT 10 -1S: begin transaction isolation level repeatable read; BEGIN @@ -449,7 +449,7 @@ SET -- only if the age is reached, hot standby will see the same conflict as before 1: create temp table tt1(a int); -CREATE TABLE +CREATE 1: vacuum hs_qc_guc2; VACUUM -1S: select count(*) from hs_qc_guc2; diff --git a/src/test/regress/expected/hs_standby_disallowed.out b/src/test/regress/expected/hs_standby_disallowed.out index 853fa853c81..0a62e40e743 100644 --- a/src/test/regress/expected/hs_standby_disallowed.out +++ b/src/test/regress/expected/hs_standby_disallowed.out @@ -66,7 +66,7 @@ SELECT count(*) FROM hs1; (1 row) PREPARE TRANSACTION 'foobar'; -ERROR: PREPARE TRANSACTION is not yet supported in Greenplum Database +ERROR: cannot execute PREPARE TRANSACTION during recovery ROLLBACK; BEGIN; SELECT count(*) FROM hs1; @@ -76,7 +76,7 @@ SELECT count(*) FROM hs1; (1 row) COMMIT PREPARED 'foobar'; -ERROR: COMMIT PREPARED is not yet supported in Greenplum Database +ERROR: cannot execute COMMIT PREPARED during recovery ROLLBACK; BEGIN; SELECT count(*) FROM hs1; @@ -86,7 +86,7 @@ SELECT count(*) FROM hs1; (1 row) PREPARE TRANSACTION 'foobar'; -ERROR: 
PREPARE TRANSACTION is not yet supported in Greenplum Database +ERROR: cannot execute PREPARE TRANSACTION during recovery ROLLBACK PREPARED 'foobar'; ERROR: current transaction is aborted, commands ignored until end of transaction block ROLLBACK; @@ -98,7 +98,7 @@ SELECT count(*) FROM hs1; (1 row) ROLLBACK PREPARED 'foobar'; -ERROR: ROLLBACK PREPARED is not yet supported in Greenplum Database +ERROR: cannot execute ROLLBACK PREPARED during recovery ROLLBACK; -- Locks BEGIN; @@ -153,5 +153,5 @@ fetch next from hsc; (1 row) fetch first from hsc; -ERROR: backward scan is not supported in this version of Greenplum Database +ERROR: backward scan is not supported in this version of Apache Cloudberry COMMIT; diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c index 3b9e91136d4..9320cf0aeec 100644 --- a/src/test/regress/pg_regress.c +++ b/src/test/regress/pg_regress.c @@ -3615,9 +3615,20 @@ cluster_healthy(void) return false; } + char *p; + /* skip if the instance is hot standby */ + psql_command_output("postgres", line, sizeof(line), + "SELECT pg_is_in_recovery();"); + p = &line[0]; + while (*p == ' ') + p++; + if (*p == 't') + { + return !halt_work; + } + i = 120; do { - char *p; /* check for the health for standby coordinator */ psql_command_output("postgres", line, sizeof(line), "SELECT sync_state FROM pg_stat_get_wal_senders();"); diff --git a/src/test/regress/sql/hs_primary_setup.sql b/src/test/regress/sql/hs_primary_setup.sql index a90979a89ba..83403299fd5 100644 --- a/src/test/regress/sql/hs_primary_setup.sql +++ b/src/test/regress/sql/hs_primary_setup.sql @@ -27,6 +27,6 @@ SELECT pg_switch_wal(); -- GPDB: enable hot_standby for this cluster \! gpconfig -c hot_standby -v on; -\! gpstop -ar; +\! 
gpstop -ari; -- end_ignore From fc32db7da23e642e3331ac7e0a6e61869d94dfed Mon Sep 17 00:00:00 2001 From: Huansong Fu Date: Mon, 20 Mar 2023 07:51:33 -0700 Subject: [PATCH 10/11] Generate gp_ view for desired pg_ system views For a selected list of PG system views (starting with the 'pg_' prefix), we will create a corresponding 'gp_' view for each one in the list. Each 'gp_' view is basically a UNION ALL of the results of running the corresponding 'pg_' view on all segments (including the coordinator). Note that these views do not aggregate the results. The aggregated version of the views will be named with a '_summary' suffix (such as 'gp_stat_all_tables_summary'). To add a new 'pg_' view to this list, simply put the name in file 'src/backend/catalog/system_views_gp.in'. This commit adds an initial list of views that we think make sense to have 'gp_' views. With this change, we also remove the existing definition of the gp_stat_archiver view and let it be generated automatically. We also had gp_stat_replication, but it carries an additional column compared to pg_stat_replication, so it cannot use the automatic way.
--- src/backend/catalog/.gitignore | 1 + src/backend/catalog/Makefile | 8 +++-- src/backend/catalog/system_views.sql | 6 +--- src/backend/catalog/system_views_gp.in | 48 ++++++++++++++++++++++++++ src/bin/initdb/initdb.c | 4 +++ src/include/catalog/catversion.h | 2 +- 6 files changed, 61 insertions(+), 8 deletions(-) create mode 100644 src/backend/catalog/system_views_gp.in diff --git a/src/backend/catalog/.gitignore b/src/backend/catalog/.gitignore index 6c4c6d228db..3912b022a03 100644 --- a/src/backend/catalog/.gitignore +++ b/src/backend/catalog/.gitignore @@ -8,3 +8,4 @@ /pg_*_d.h /gp_*_d.h /bki-stamp +/system_views_gp.sql diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile index 8a58b8e5897..260bd608d50 100644 --- a/src/backend/catalog/Makefile +++ b/src/backend/catalog/Makefile @@ -56,6 +56,9 @@ OBJS += pg_extprotocol.o \ gp_matview_aux.o \ pg_directory_table.o storage_directory_table.o +GP_SYSVIEW_IN = system_views_gp.in +GP_SYSVIEW_SQL = system_views_gp.sql + CATALOG_JSON:= $(addprefix $(top_srcdir)/gpMgmt/bin/gppylib/data/, $(addsuffix .json,$(GP_MAJORVERSION))) include $(top_srcdir)/src/backend/common.mk @@ -133,7 +136,7 @@ POSTGRES_BKI_DATA += $(addprefix $(top_srcdir)/src/include/catalog/,\ $(top_builddir)/src/include/catalog/gp_version_at_initdb.dat -all: distprep generated-header-symlinks +all: distprep generated-header-symlinks $(GP_SYSVIEW_SQL) distprep: bki-stamp @@ -197,6 +200,7 @@ ifeq ($(USE_INTERNAL_FTS_FOUND), false) endif $(INSTALL_DATA) $(srcdir)/system_functions.sql '$(DESTDIR)$(datadir)/system_functions.sql' $(INSTALL_DATA) $(srcdir)/system_views.sql '$(DESTDIR)$(datadir)/system_views.sql' + $(INSTALL_DATA) $(srcdir)/$(GP_SYSVIEW_SQL) '$(DESTDIR)$(datadir)/$(GP_SYSVIEW_SQL)' $(INSTALL_DATA) $(srcdir)/information_schema.sql '$(DESTDIR)$(datadir)/information_schema.sql' $(INSTALL_DATA) $(call vpathsearch,cdb_schema.sql) '$(DESTDIR)$(datadir)/cdb_init.d/cdb_schema.sql' $(INSTALL_DATA) $(srcdir)/sql_features.txt 
'$(DESTDIR)$(datadir)/sql_features.txt' @@ -216,4 +220,4 @@ endif clean: maintainer-clean: clean - rm -f bki-stamp postgres.bki system_constraints.sql $(GENERATED_HEADERS) + rm -f bki-stamp postgres.bki system_constraints.sql $(GENERATED_HEADERS) $(GP_SYSVIEW_SQL) diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql index 4cb9a9b57d6..d5b7b81e8a2 100644 --- a/src/backend/catalog/system_views.sql +++ b/src/backend/catalog/system_views.sql @@ -1098,6 +1098,7 @@ $$ $$ LANGUAGE SQL EXECUTE ON ALL SEGMENTS; +-- This view has an additional column than pg_stat_replication so cannot be generated using system_views_gp.in CREATE VIEW gp_stat_replication AS SELECT *, pg_catalog.gp_replication_error() AS sync_error FROM pg_catalog.gp_stat_get_master_replication() AS R @@ -1805,11 +1806,6 @@ UNION ALL SELECT gp_segment_id, gp_get_suboverflowed_backends() FROM gp_dist_random('gp_id') order by 1; -CREATE OR REPLACE VIEW gp_stat_archiver AS - SELECT -1 AS gp_segment_id, * FROM pg_stat_archiver - UNION - SELECT gp_execution_segment() AS gp_segment_id, * FROM gp_dist_random('pg_stat_archiver'); - CREATE FUNCTION gp_get_session_endpoints (OUT gp_segment_id int, OUT auth_token text, OUT cursorname text, OUT sessionid int, OUT hostname varchar(64), OUT port int, OUT username text, OUT state text, diff --git a/src/backend/catalog/system_views_gp.in b/src/backend/catalog/system_views_gp.in new file mode 100644 index 00000000000..5c0a8c2dc7e --- /dev/null +++ b/src/backend/catalog/system_views_gp.in @@ -0,0 +1,48 @@ +# This file lists all the PG system views 'pg_%' that we would like to create an +# MPP-aware view 'gp_%' out of. The generated 'gp_%' view definitions will be placed +# in system_views_gp.sql, and initialized at the same time as system_views.sql. 
+pg_backend_memory_contexts +pg_config +pg_cursors +pg_file_settings +pg_replication_origin_status +pg_replication_slots +pg_settings +pg_stat_activity +pg_stat_archiver +pg_stat_bgwriter +pg_stat_database +pg_stat_database_conflicts +pg_stat_gssapi +pg_stat_operations +pg_stat_progress_analyze +pg_stat_progress_basebackup +pg_stat_progress_cluster +pg_stat_progress_copy +pg_stat_progress_create_index +pg_stat_progress_vacuum +pg_stat_slru +pg_stat_ssl +pg_stat_subscription +pg_stat_sys_indexes +pg_stat_sys_tables +pg_stat_user_functions +pg_stat_user_indexes +pg_stat_user_tables +pg_stat_wal +pg_stat_wal_receiver +pg_stat_xact_all_tables +pg_stat_xact_sys_tables +pg_stat_xact_user_functions +pg_stat_xact_user_tables +pg_statio_all_indexes +pg_statio_all_sequences +pg_statio_all_tables +pg_statio_sys_indexes +pg_statio_sys_sequences +pg_statio_sys_tables +pg_statio_user_indexes +pg_statio_user_sequences +pg_statio_user_tables +pg_stats +pg_stats_ext diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c index 53c3a82a45e..8525d0ca0d9 100644 --- a/src/bin/initdb/initdb.c +++ b/src/bin/initdb/initdb.c @@ -174,6 +174,7 @@ static char *external_fts_files; #endif static char *system_functions_file; static char *system_views_file; +static char *system_views_gp_file; static bool success = false; static bool made_new_pgdata = false; static bool found_existing_pgdata = false; @@ -2831,6 +2832,7 @@ setup_data_file_paths(void) set_input(&system_constraints_file, "system_constraints.sql"); set_input(&system_functions_file, "system_functions.sql"); set_input(&system_views_file, "system_views.sql"); + set_input(&system_views_gp_file, "system_views_gp.sql"); set_input(&cdb_init_d_dir, "cdb_init.d"); @@ -2864,6 +2866,7 @@ setup_data_file_paths(void) #endif check_input(system_functions_file); check_input(system_views_file); + check_input(system_views_gp_file); } @@ -3231,6 +3234,7 @@ initialize_data_directory(void) */ setup_run_file(cmdfd, system_views_file); + 
setup_run_file(cmdfd, system_views_gp_file); setup_description(cmdfd); diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h index 86910a0dada..026192b3674 100644 --- a/src/include/catalog/catversion.h +++ b/src/include/catalog/catversion.h @@ -56,6 +56,6 @@ */ /* 3yyymmddN */ -#define CATALOG_VERSION_NO 302502091 +#define CATALOG_VERSION_NO 302506101 #endif From 85d792009131c8bb76ea63637f914ee8280e409e Mon Sep 17 00:00:00 2001 From: wangxiaoran Date: Tue, 10 Jun 2025 17:03:18 +0800 Subject: [PATCH 11/11] Fix system_views_gp.in and fix test query_conflict Some pg_ views have been modified by cbdb: the gp_segment_id column has been added to them. So they fail to be transformed from the pg_ views to gp_ views (see commit 5028222620d410fe3d4c60f732a599e269006968). So just remove them from system_views_gp.in. Maybe better to fix them later. --- pom.xml | 2 ++ src/backend/catalog/system_views_gp.in | 20 +++++++++---------- .../input/hot_standby/query_conflict.source | 4 ++-- .../output/hot_standby/query_conflict.source | 10 +++++----- 4 files changed, 19 insertions(+), 17 deletions(-) diff --git a/pom.xml b/pom.xml index ef1a10a6c7f..97ebb23bb70 100644 --- a/pom.xml +++ b/pom.xml @@ -996,6 +996,7 @@ code or new licensing patterns. src/template/win32 src/template/cygwin src/template/aix + src/backend/cdb/dispatcher/test/cdbdisp_query_test.c src/backend/cdb/cdbdistributedxid.c src/backend/cdb/test/cdbdistributedsnapshot_test.c src/backend/cdb/test/cdbbufferedread_test.c @@ -1046,6 +1047,7 @@ code or new licensing patterns. 
src/backend/postmaster/test/checkpointer_test.c src/backend/postmaster/README.auto-ANALYZE src/backend/mock.mk + src/backend/catalog/system_views_gp.in src/backend/catalog/storage_tablespace.c src/backend/catalog/test/storage_tablespace_test.c src/backend/catalog/sql_features.txt diff --git a/src/backend/catalog/system_views_gp.in b/src/backend/catalog/system_views_gp.in index 5c0a8c2dc7e..d46dde3191e 100644 --- a/src/backend/catalog/system_views_gp.in +++ b/src/backend/catalog/system_views_gp.in @@ -1,7 +1,7 @@ # This file lists all the PG system views 'pg_%' that we would like to create an # MPP-aware view 'gp_%' out of. The generated 'gp_%' view definitions will be placed # in system_views_gp.sql, and initialized at the same time as system_views.sql. -pg_backend_memory_contexts +#pg_backend_memory_contexts pg_config pg_cursors pg_file_settings @@ -11,16 +11,16 @@ pg_settings pg_stat_activity pg_stat_archiver pg_stat_bgwriter -pg_stat_database +#pg_stat_database pg_stat_database_conflicts pg_stat_gssapi pg_stat_operations -pg_stat_progress_analyze -pg_stat_progress_basebackup -pg_stat_progress_cluster -pg_stat_progress_copy -pg_stat_progress_create_index -pg_stat_progress_vacuum +#pg_stat_progress_analyze +#pg_stat_progress_basebackup +#pg_stat_progress_cluster +#pg_stat_progress_copy +#pg_stat_progress_create_index +#pg_stat_progress_vacuum pg_stat_slru pg_stat_ssl pg_stat_subscription @@ -29,7 +29,7 @@ pg_stat_sys_tables pg_stat_user_functions pg_stat_user_indexes pg_stat_user_tables -pg_stat_wal +#pg_stat_wal pg_stat_wal_receiver pg_stat_xact_all_tables pg_stat_xact_sys_tables @@ -44,5 +44,5 @@ pg_statio_sys_tables pg_statio_user_indexes pg_statio_user_sequences pg_statio_user_tables -pg_stats +#pg_stats ERROR: column "most_common_vals" has pseudo-type anyarray pg_stats_ext diff --git a/src/test/isolation2/input/hot_standby/query_conflict.source b/src/test/isolation2/input/hot_standby/query_conflict.source index 0e2706bfa5a..5f2aee3be53 100644 --- 
a/src/test/isolation2/input/hot_standby/query_conflict.source +++ b/src/test/isolation2/input/hot_standby/query_conflict.source @@ -126,8 +126,8 @@ select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_con -1S<: -1Sq: --- conflict has been recorded --1S: select max(confl_tablespace) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; +-- conflict has been recorded. The query has multiple slices +-1S: select max(confl_tablespace) >= 1 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; -- cleanup !\retcode rm -rf @testtablespace@/hs_tablespace_directory; diff --git a/src/test/isolation2/output/hot_standby/query_conflict.source b/src/test/isolation2/output/hot_standby/query_conflict.source index 397e3977d12..909d2532df3 100644 --- a/src/test/isolation2/output/hot_standby/query_conflict.source +++ b/src/test/isolation2/output/hot_standby/query_conflict.source @@ -269,11 +269,11 @@ ERROR: canceling statement due to conflict with recovery (seg1 slice3 127.0.1. DETAIL: User was or might have been using tablespace that must be dropped. -1Sq: ... --- conflict has been recorded --1S: select max(confl_tablespace) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; - max ------ - 1 +-- conflict has been recorded. The query has multiple slices +-1S: select max(confl_tablespace) >= 1 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby'; + ?column? +---------- + t (1 row) -- cleanup