diff --git a/.abi-check/7.1.0/postgres.symbols.ignore b/.abi-check/7.1.0/postgres.symbols.ignore
index 848dbf2841d..d42d77c4039 100644
--- a/.abi-check/7.1.0/postgres.symbols.ignore
+++ b/.abi-check/7.1.0/postgres.symbols.ignore
@@ -1 +1,12 @@
pgarch_start
+ConfigureNamesInt_gp
+child_triggers
+has_update_triggers
+ConfigureNamesBool_gp
+aocs_beginscan
+AppendOnlyBlockDirectory_GetEntry
+ConfigureNamesString_gp
+gp_pause_on_restore_point_replay
+ConfigureNamesReal_gp
+TableAmRoutine
+MainLWLockNames
diff --git a/GNUmakefile.in b/GNUmakefile.in
index bde27f24aa4..d836f7ba525 100644
--- a/GNUmakefile.in
+++ b/GNUmakefile.in
@@ -212,6 +212,11 @@ installcheck-gpcheckcat:
$(call recurse,installcheck-world,gpcontrib/gp_replica_check,installcheck)
$(call recurse,installcheck-world,src/bin/pg_upgrade,check)
+.PHONY: installcheck-hot-standby
+installcheck-hot-standby: submake-generated-headers
+ $(MAKE) -C src/test/regress installcheck-hot-standby
+ $(MAKE) -C src/test/isolation2 installcheck-hot-standby
+
# Run mock tests, that don't require a running server. Arguably these should
# be part of [install]check-world, but we treat them more like part of
# compilation than regression testing, in the CI. But they are too heavy-weight
diff --git a/pom.xml b/pom.xml
index ef1a10a6c7f..97ebb23bb70 100644
--- a/pom.xml
+++ b/pom.xml
@@ -996,6 +996,7 @@ code or new licensing patterns.
src/template/win32
src/template/cygwin
src/template/aix
+ src/backend/cdb/dispatcher/test/cdbdisp_query_test.c
src/backend/cdb/cdbdistributedxid.c
src/backend/cdb/test/cdbdistributedsnapshot_test.c
src/backend/cdb/test/cdbbufferedread_test.c
@@ -1046,6 +1047,7 @@ code or new licensing patterns.
src/backend/postmaster/test/checkpointer_test.c
src/backend/postmaster/README.auto-ANALYZE
src/backend/mock.mk
+ src/backend/catalog/system_views_gp.in
src/backend/catalog/storage_tablespace.c
src/backend/catalog/test/storage_tablespace_test.c
src/backend/catalog/sql_features.txt
diff --git a/src/backend/access/heap/heapam.c b/src/backend/access/heap/heapam.c
index 048ce9231a9..0003425b79f 100644
--- a/src/backend/access/heap/heapam.c
+++ b/src/backend/access/heap/heapam.c
@@ -530,6 +530,14 @@ heapgetpage(TableScanDesc sscan, BlockNumber page)
LockBuffer(buffer, BUFFER_LOCK_UNLOCK);
+#ifdef FAULT_INJECTOR
+ FaultInjector_InjectFaultIfSet(
+ "heapgetpage_after_unlock_buffer",
+ DDLNotSpecified,
+ "", /* databaseName */
+ RelationGetRelationName(scan->rs_base.rs_rd)); /* tableName */
+#endif
+
Assert(ntup <= MaxHeapTuplesPerPage);
scan->rs_ntuples = ntup;
}
diff --git a/src/backend/access/rmgrdesc/standbydesc.c b/src/backend/access/rmgrdesc/standbydesc.c
index 01ee7ac6d2c..899c621b240 100644
--- a/src/backend/access/rmgrdesc/standbydesc.c
+++ b/src/backend/access/rmgrdesc/standbydesc.c
@@ -66,6 +66,14 @@ standby_desc(StringInfo buf, XLogReaderState *record)
xlrec->dbId, xlrec->tsId,
xlrec->relcacheInitFileInval);
}
+ else if (info == XLOG_LATESTCOMPLETED_GXID)
+ {
+ DistributedTransactionId gxid;
+
+ gxid = *((DistributedTransactionId *) rec);
+ appendStringInfo(buf, UINT64_FORMAT, gxid);
+ }
+
}
const char *
@@ -84,6 +92,9 @@ standby_identify(uint8 info)
case XLOG_INVALIDATIONS:
id = "INVALIDATIONS";
break;
+ case XLOG_LATESTCOMPLETED_GXID:
+ id = "XLOG_LATESTCOMPLETED_GXID";
+ break;
}
return id;
diff --git a/src/backend/access/transam/README b/src/backend/access/transam/README
index f3112ff3070..efac0cb505e 100644
--- a/src/backend/access/transam/README
+++ b/src/backend/access/transam/README
@@ -897,3 +897,48 @@ yet simplifies emulation of subtransactions considerably.
Further details on locking mechanics in recovery are given in comments
with the Lock rmgr code.
+
+Distributed Transaction Emulation during Recovery
+-------------------------------------------------
+
+In GPDB, the MVCC snapshot also includes distributed transactions (aka dtx).
+Accordingly, on a hot standby we also emulate running dtx. The way to do that
+is to re-use the shmCommittedGxidArray which has been used on a primary for dtx
+recovery: it tracks all the 2PC dtx that have their PREPARE phase done,
+but for which the COMMIT phase hasn't finished (i.e. window between the
+XLOG_XACT_DISTRIBUTED_COMMIT record being written and the
+XLOG_XACT_DISTRIBUTED_FORGET record being written on the QD). On a hot standby,
+any dtx shown in that array are regarded as in-progress. The MVCC snapshot does
+not really need to account for dtx not in that array: for a dtx that hasn't
+done PREPARE, we know no segment has committed any data yet; for a dtx that
+hasn't done COMMIT, we know all segments have committed their data.
+
+Note: dtxes that are preparing will not be tracked in this array, and thus will
+not be included in this snapshot. This is slightly different from a primary QD,
+where such transactions would have been included in the distributed snapshot's
+inProgressXidArray (as we construct the inProgressXidArray from the PGXACTs that
+would contain the dummy entries for prepared transactions). However, as
+mentioned in CreateDistributedSnapshot, including these is not a requirement for
+correctness.
+
+Note: aborted/aborting dtxes are not accounted for by the standby either. Those
+are the dtxes that encountered an error while being prepared. As with the previous
+point, the standby does not need to be aware of them for correctness. Worth also
+noting that if a dtx encountered error after being prepared, it cannot be
+aborted anymore and must be committed by the dtx recovery process. Until
+committed, such a dtx will be seen as in-progress to the standby.
+
+For 1PC dtx, however, there is a known limitation where the hot standby won't
+see the last 1PC (or the last few, if they are all 1PC). This is because
+a 1PC does not write any WAL on the QD, so the standby QD won't advance its
+latestCompletedGxid, so its distributed snapshot horizon does not include the
+last 1PC - it would view the last 1PC not yet started or at best still in
+progress. Only when another 2PC arrives will the standby advance its
+latestCompletedGxid and its distributed snapshot will include the previous 1PC.
+
+We don't emulate the full architecture of "running transaction" for dtx because
+that is unnecessary, at least for now. For example, we don't create a dtx-version
+of XLOG_RUNNING_XACTS, because we already have that information as part of the
+extended checkpoint (see TMGXACT_CHECKPOINT). We also don't need to emulate
+other members in RunningTransactionsData, like subxid or xid-pruning related
+variables because those do not apply to dtx.
diff --git a/src/backend/access/transam/xact.c b/src/backend/access/transam/xact.c
index f3f2a035281..ed655baf989 100644
--- a/src/backend/access/transam/xact.c
+++ b/src/backend/access/transam/xact.c
@@ -2475,11 +2475,10 @@ StartTransaction(void)
/*
* Transactions may be started while recovery is in progress, if
- * hot standby is enabled. This mode is not supported in
- * Cloudberry yet.
+ * hot standby is enabled.
*/
AssertImply(DistributedTransactionContext != DTX_CONTEXT_LOCAL_ONLY,
- !s->startedInRecovery);
+ EnableHotStandby || !s->startedInRecovery);
/*
* MPP Modification
*
@@ -2526,20 +2525,39 @@ StartTransaction(void)
case DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER:
case DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER:
+ /*
+ * Sanity check for the global xid.
+ *
+ * Note for hot standby dispatch: the standby QEs are still
+ * writers, just like primary QEs for SELECT queries. But
+ * hot standby dispatch never has a valid gxid, so we skip
+ * the gxid checks for the standby QEs.
+ */
+ if (!IS_HOT_STANDBY_QE())
+ {
+ if (QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId)
+ elog(ERROR,
+ "distributed transaction id is invalid in context %s",
+ DtxContextToString(DistributedTransactionContext));
+
+ /*
+ * Update distributed XID info, this is only used for
+ * debugging.
+ */
+ LocalDistribXactData *ele = &MyProc->localDistribXactData;
+ ele->distribXid = QEDtxContextInfo.distributedXid;
+ ele->state = LOCALDISTRIBXACT_STATE_ACTIVE;
+ }
+ else
+ Assert(QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId);
+
+ /* fall through */
case DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT:
{
/* If we're running in test-mode insert a delay in writer. */
if (gp_enable_slow_writer_testmode)
pg_usleep(500000);
- if (DistributedTransactionContext != DTX_CONTEXT_QE_AUTO_COMMIT_IMPLICIT &&
- QEDtxContextInfo.distributedXid == InvalidDistributedTransactionId)
- {
- elog(ERROR,
- "distributed transaction id is invalid in context %s",
- DtxContextToString(DistributedTransactionContext));
- }
-
/*
* Snapshot must not be created before setting transaction
* isolation level.
@@ -2552,28 +2570,14 @@ StartTransaction(void)
XactReadOnly = isMppTxOptions_ReadOnly(
QEDtxContextInfo.distributedTxnOptions);
+ /* a hot standby transaction must be read-only */
+ AssertImply(IS_HOT_STANDBY_QE(), XactReadOnly);
+
/*
* MPP: we're a QE Writer.
*/
MyTmGxact->gxid = QEDtxContextInfo.distributedXid;
- if (DistributedTransactionContext ==
- DTX_CONTEXT_QE_TWO_PHASE_EXPLICIT_WRITER ||
- DistributedTransactionContext ==
- DTX_CONTEXT_QE_TWO_PHASE_IMPLICIT_WRITER)
- {
- Assert(QEDtxContextInfo.distributedXid !=
- InvalidDistributedTransactionId);
-
- /*
- * Update distributed XID info, this is only used for
- * debugging.
- */
- LocalDistribXactData *ele = &MyProc->localDistribXactData;
- ele->distribXid = QEDtxContextInfo.distributedXid;
- ele->state = LOCALDISTRIBXACT_STATE_ACTIVE;
- }
-
if (SharedLocalSnapshotSlot != NULL)
{
LWLockAcquire(SharedLocalSnapshotSlot->slotLock, LW_EXCLUSIVE);
@@ -6880,8 +6884,8 @@ XactLogCommitRecord(TimestampTz commit_time,
xl_xact_distrib xl_distrib;
xl_xact_deldbs xl_deldbs;
XLogRecPtr recptr;
- bool isOnePhaseQE = (Gp_role == GP_ROLE_EXECUTE && MyTmGxactLocal->isOnePhaseCommit);
bool isDtxPrepared = isPreparedDtxTransaction();
+ DistributedTransactionId distrib_xid = getDistributedTransactionId();
uint8 info;
@@ -6971,10 +6975,11 @@ XactLogCommitRecord(TimestampTz commit_time,
xl_origin.origin_timestamp = replorigin_session_origin_timestamp;
}
- if (isDtxPrepared || isOnePhaseQE)
+ /* include distributed xid if there's one */
+ if (distrib_xid != InvalidDistributedTransactionId)
{
xl_xinfo.xinfo |= XACT_XINFO_HAS_DISTRIB;
- xl_distrib.distrib_xid = getDistributedTransactionId();
+ xl_distrib.distrib_xid = distrib_xid;
}
#if 0
diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c
index 3fb9f121b93..ffc8714cf62 100644
--- a/src/backend/access/transam/xlog.c
+++ b/src/backend/access/transam/xlog.c
@@ -133,7 +133,14 @@ bool track_wal_io_timing = false;
int FileEncryptionEnabled = false;
/* GPDB specific */
-bool gp_pause_on_restore_point_replay = false;
+char *gp_pause_on_restore_point_replay = "";
+
+/*
+ * GPDB: Have we reached a specific continuous recovery target? We set this to
+ * true if WAL replay has found a restore point matching the GPDB-specific GUC
+ * gp_pause_on_restore_point_replay and a promotion has been requested.
+ */
+static bool reachedContinuousRecoveryTarget = false;
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
@@ -6012,6 +6019,59 @@ recoveryStopsBefore(XLogReaderState *record)
return stopsHere;
}
+/*
+ * GPDB: Restore point records can act as a point of synchronization to ensure
+ * cluster-wide consistency during WAL replay. If a restore point is specified
+ * in the gp_pause_on_restore_point_replay GUC, WAL replay will be paused at
+ * that restore point until replay is explicitly resumed.
+ */
+static void
+pauseRecoveryOnRestorePoint(XLogReaderState *record)
+{
+ uint8 info;
+ uint8 rmid;
+
+ /*
+ * Ignore recovery target settings when not in archive recovery (meaning
+ * we are in crash recovery).
+ */
+ if (!ArchiveRecoveryRequested)
+ return;
+
+ info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
+ rmid = XLogRecGetRmid(record);
+
+ if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
+ {
+ xl_restore_point *recordRestorePointData;
+
+ recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
+
+ if (strcmp(recordRestorePointData->rp_name, gp_pause_on_restore_point_replay) == 0)
+ {
+ ereport(LOG,
+ (errmsg("setting recovery pause at restore point \"%s\", time %s",
+ recordRestorePointData->rp_name,
+ timestamptz_to_str(recordRestorePointData->rp_time))));
+
+ SetRecoveryPause(true);
+ recoveryPausesHere(false);
+
+ /*
+ * If we've unpaused and there is a promotion request, then we've
+ * reached our continuous recovery target and need to immediately
+ * promote. We piggyback on the existing recovery target logic to
+ * do this. See recoveryStopsAfter().
+ */
+ if (CheckForStandbyTrigger())
+ {
+ reachedContinuousRecoveryTarget = true;
+ recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
+ }
+ }
+ }
+}
+
/*
* Same as recoveryStopsBefore, but called after applying the record.
*
@@ -6039,15 +6099,19 @@ recoveryStopsAfter(XLogReaderState *record)
/*
* There can be many restore points that share the same name; we stop at
* the first one.
+ *
+ * GPDB: If we've reached the continuous recovery target, we'll use the
+ * below logic to immediately stop recovery.
*/
- if (recoveryTarget == RECOVERY_TARGET_NAME &&
+ if ((reachedContinuousRecoveryTarget || recoveryTarget == RECOVERY_TARGET_NAME) &&
rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
{
xl_restore_point *recordRestorePointData;
recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
- if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
+ if (reachedContinuousRecoveryTarget ||
+ strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
{
recoveryStopAfter = true;
recoveryStopXid = InvalidTransactionId;
@@ -7900,6 +7964,9 @@ StartupXLOG(void)
WalSndWakeup();
}
+ if (gp_pause_on_restore_point_replay)
+ pauseRecoveryOnRestorePoint(xlogreader);
+
/* Exit loop if we reached inclusive recovery target */
if (recoveryStopsAfter(xlogreader))
{
@@ -8331,6 +8398,8 @@ StartupXLOG(void)
*/
InRecovery = false;
+ SIMPLE_FAULT_INJECTOR("out_of_recovery_in_startupxlog");
+
/*
* Hook for plugins to do additional startup works.
*
@@ -9801,8 +9870,11 @@ CreateCheckPoint(int flags)
* recovery we don't need to write running xact data.
*/
if (!shutdown && XLogStandbyInfoActive())
+ {
LogStandbySnapshot();
+ }
+
SIMPLE_FAULT_INJECTOR("checkpoint_after_redo_calculated");
START_CRIT_SECTION();
@@ -11126,14 +11198,7 @@ xlog_redo(XLogReaderState *record)
}
else if (info == XLOG_RESTORE_POINT)
{
- /*
- * GPDB: Restore point records can act as a point of
- * synchronization to ensure cluster-wide consistency during WAL
- * replay. WAL replay is paused at each restore point until it is
- * explicitly resumed.
- */
- if (gp_pause_on_restore_point_replay)
- SetRecoveryPause(true);
+ /* nothing to do here */
}
else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
{
diff --git a/src/backend/catalog/.gitignore b/src/backend/catalog/.gitignore
index 6c4c6d228db..3912b022a03 100644
--- a/src/backend/catalog/.gitignore
+++ b/src/backend/catalog/.gitignore
@@ -8,3 +8,4 @@
/pg_*_d.h
/gp_*_d.h
/bki-stamp
+/system_views_gp.sql
diff --git a/src/backend/catalog/Makefile b/src/backend/catalog/Makefile
index 8a58b8e5897..260bd608d50 100644
--- a/src/backend/catalog/Makefile
+++ b/src/backend/catalog/Makefile
@@ -56,6 +56,9 @@ OBJS += pg_extprotocol.o \
gp_matview_aux.o \
pg_directory_table.o storage_directory_table.o
+GP_SYSVIEW_IN = system_views_gp.in
+GP_SYSVIEW_SQL = system_views_gp.sql
+
CATALOG_JSON:= $(addprefix $(top_srcdir)/gpMgmt/bin/gppylib/data/, $(addsuffix .json,$(GP_MAJORVERSION)))
include $(top_srcdir)/src/backend/common.mk
@@ -133,7 +136,7 @@ POSTGRES_BKI_DATA += $(addprefix $(top_srcdir)/src/include/catalog/,\
$(top_builddir)/src/include/catalog/gp_version_at_initdb.dat
-all: distprep generated-header-symlinks
+all: distprep generated-header-symlinks $(GP_SYSVIEW_SQL)
distprep: bki-stamp
@@ -197,6 +200,7 @@ ifeq ($(USE_INTERNAL_FTS_FOUND), false)
endif
$(INSTALL_DATA) $(srcdir)/system_functions.sql '$(DESTDIR)$(datadir)/system_functions.sql'
$(INSTALL_DATA) $(srcdir)/system_views.sql '$(DESTDIR)$(datadir)/system_views.sql'
+ $(INSTALL_DATA) $(srcdir)/$(GP_SYSVIEW_SQL) '$(DESTDIR)$(datadir)/$(GP_SYSVIEW_SQL)'
$(INSTALL_DATA) $(srcdir)/information_schema.sql '$(DESTDIR)$(datadir)/information_schema.sql'
$(INSTALL_DATA) $(call vpathsearch,cdb_schema.sql) '$(DESTDIR)$(datadir)/cdb_init.d/cdb_schema.sql'
$(INSTALL_DATA) $(srcdir)/sql_features.txt '$(DESTDIR)$(datadir)/sql_features.txt'
@@ -216,4 +220,4 @@ endif
clean:
maintainer-clean: clean
- rm -f bki-stamp postgres.bki system_constraints.sql $(GENERATED_HEADERS)
+ rm -f bki-stamp postgres.bki system_constraints.sql $(GENERATED_HEADERS) $(GP_SYSVIEW_SQL)
diff --git a/src/backend/catalog/system_views.sql b/src/backend/catalog/system_views.sql
index 6b0b604ab5e..d5b7b81e8a2 100644
--- a/src/backend/catalog/system_views.sql
+++ b/src/backend/catalog/system_views.sql
@@ -1098,6 +1098,7 @@ $$
$$
LANGUAGE SQL EXECUTE ON ALL SEGMENTS;
+-- This view has an additional column compared to pg_stat_replication, so it cannot be generated using system_views_gp.in
CREATE VIEW gp_stat_replication AS
SELECT *, pg_catalog.gp_replication_error() AS sync_error
FROM pg_catalog.gp_stat_get_master_replication() AS R
@@ -1498,6 +1499,10 @@ rq.oid=rc.resqueueid AND rc.restypid = rt.restypid
ORDER BY rsqname, restypid
;
+-- FIXME: we have a cluster-wide view gp_stat_database_conflicts, but it only
+-- shows the per-segment conflict counts. A conflict might occur on only a
+-- subset of the segments. Ideally we should also have a view like
+-- gp_stat_database_conflicts_summary that reports overall conflict counts and types.
CREATE VIEW pg_stat_database_conflicts AS
SELECT
D.oid AS datid,
@@ -1801,11 +1806,6 @@ UNION ALL
SELECT gp_segment_id, gp_get_suboverflowed_backends() FROM gp_dist_random('gp_id') order by 1;
-CREATE OR REPLACE VIEW gp_stat_archiver AS
- SELECT -1 AS gp_segment_id, * FROM pg_stat_archiver
- UNION
- SELECT gp_execution_segment() AS gp_segment_id, * FROM gp_dist_random('pg_stat_archiver');
-
CREATE FUNCTION gp_get_session_endpoints (OUT gp_segment_id int, OUT auth_token text,
OUT cursorname text, OUT sessionid int, OUT hostname varchar(64),
OUT port int, OUT username text, OUT state text,
diff --git a/src/backend/catalog/system_views_gp.in b/src/backend/catalog/system_views_gp.in
new file mode 100644
index 00000000000..d46dde3191e
--- /dev/null
+++ b/src/backend/catalog/system_views_gp.in
@@ -0,0 +1,48 @@
+# This file lists all the PG system views 'pg_%' that we would like to create an
+# MPP-aware view 'gp_%' out of. The generated 'gp_%' view definitions will be placed
+# in system_views_gp.sql, and initialized at the same time as system_views.sql.
+#pg_backend_memory_contexts
+pg_config
+pg_cursors
+pg_file_settings
+pg_replication_origin_status
+pg_replication_slots
+pg_settings
+pg_stat_activity
+pg_stat_archiver
+pg_stat_bgwriter
+#pg_stat_database
+pg_stat_database_conflicts
+pg_stat_gssapi
+pg_stat_operations
+#pg_stat_progress_analyze
+#pg_stat_progress_basebackup
+#pg_stat_progress_cluster
+#pg_stat_progress_copy
+#pg_stat_progress_create_index
+#pg_stat_progress_vacuum
+pg_stat_slru
+pg_stat_ssl
+pg_stat_subscription
+pg_stat_sys_indexes
+pg_stat_sys_tables
+pg_stat_user_functions
+pg_stat_user_indexes
+pg_stat_user_tables
+#pg_stat_wal
+pg_stat_wal_receiver
+pg_stat_xact_all_tables
+pg_stat_xact_sys_tables
+pg_stat_xact_user_functions
+pg_stat_xact_user_tables
+pg_statio_all_indexes
+pg_statio_all_sequences
+pg_statio_all_tables
+pg_statio_sys_indexes
+pg_statio_sys_sequences
+pg_statio_sys_tables
+pg_statio_user_indexes
+pg_statio_user_sequences
+pg_statio_user_tables
+#pg_stats ERROR: column "most_common_vals" has pseudo-type anyarray
+pg_stats_ext
diff --git a/src/backend/cdb/cdbdtxcontextinfo.c b/src/backend/cdb/cdbdtxcontextinfo.c
index 1a3c1b8f295..2994821f8df 100644
--- a/src/backend/cdb/cdbdtxcontextinfo.c
+++ b/src/backend/cdb/cdbdtxcontextinfo.c
@@ -60,7 +60,7 @@ DtxContextInfo_CreateOnMaster(DtxContextInfo *dtxContextInfo, bool inCursor,
(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
errmsg("cannot have more than 2^32-2 commands in a session")));
- AssertImply(inCursor,
+ AssertImply(inCursor && !IS_HOT_STANDBY_QD(),
dtxContextInfo->distributedXid != InvalidDistributedTransactionId &&
gp_command_count == MySessionState->latestCursorCommandId);
diff --git a/src/backend/cdb/cdbdtxrecovery.c b/src/backend/cdb/cdbdtxrecovery.c
index 186b01ff214..605ce323ddb 100644
--- a/src/backend/cdb/cdbdtxrecovery.c
+++ b/src/backend/cdb/cdbdtxrecovery.c
@@ -202,6 +202,11 @@ recoverInDoubtTransactions(void)
for (i = 0; i < *shmNumCommittedGxacts; i++)
{
+ /*
+ * No need to acquire CommittedGxidArrayLock since dtx recovery
+ * only happens on primary, but not hot standby where concurrent
+ * access to this array is possible from CreateDistributedSnapshot.
+ */
DistributedTransactionId gxid = shmCommittedGxidArray[i];
char gid[TMGIDSIZE];
@@ -486,7 +491,12 @@ void
redoDistributedCommitRecord(DistributedTransactionId gxid)
{
int i;
+ bool is_hot_standby_qd = IS_HOT_STANDBY_QD();
+ /*
+ * Only the startup process can be modifying shmNumCommittedGxacts
+ * and shmCommittedGxidArray. So should be OK reading the value w/o lock.
+ */
for (i = 0; i < *shmNumCommittedGxacts; i++)
{
if (gxid == shmCommittedGxidArray[i])
@@ -526,7 +536,18 @@ redoDistributedCommitRecord(DistributedTransactionId gxid)
"around this issue and then report a bug")));
}
+ /*
+ * only on hot standby there might be backends that call CreateDistributedSnapshot()
+ * to access the committed gxid array concurrently.
+ */
+ if (is_hot_standby_qd)
+ LWLockAcquire(CommittedGxidArrayLock, LW_EXCLUSIVE);
+
shmCommittedGxidArray[(*shmNumCommittedGxacts)++] = gxid;
+
+ if (is_hot_standby_qd)
+ LWLockRelease(CommittedGxidArrayLock);
+
elog((Debug_print_full_dtm ? LOG : DEBUG5),
"Crash recovery redo added committed distributed transaction gid = "UINT64_FORMAT, gxid);
}
@@ -539,7 +560,13 @@ void
redoDistributedForgetCommitRecord(DistributedTransactionId gxid)
{
int i;
-
+ bool is_hot_standby_qd = IS_HOT_STANDBY_QD();
+
+ SIMPLE_FAULT_INJECTOR("redoDistributedForgetCommitRecord");
+ /*
+ * Only the startup process can be modifying shmNumCommittedGxacts
+ * and shmCommittedGxidArray. So should be OK reading the value w/o lock.
+ */
for (i = 0; i < *shmNumCommittedGxacts; i++)
{
if (gxid == shmCommittedGxidArray[i])
@@ -550,13 +577,27 @@ redoDistributedForgetCommitRecord(DistributedTransactionId gxid)
gxid);
/*
- * there's no concurrent access to shmCommittedGxidArray during
- * recovery
+ * only on hot standby there might be backends that call CreateDistributedSnapshot()
+ * to access the committed gxid array concurrently.
*/
+ if (is_hot_standby_qd)
+ LWLockAcquire(CommittedGxidArrayLock, LW_EXCLUSIVE);
+
(*shmNumCommittedGxacts)--;
if (i != *shmNumCommittedGxacts)
shmCommittedGxidArray[i] = shmCommittedGxidArray[*shmNumCommittedGxacts];
+ if (is_hot_standby_qd)
+ LWLockRelease(CommittedGxidArrayLock);
+
+ /* on the hot standby, we rely on the forget record to advance latestCompletedGxid */
+ if (is_hot_standby_qd)
+ {
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ if (gxid > ShmemVariableCache->latestCompletedGxid)
+ ShmemVariableCache->latestCompletedGxid = gxid;
+ LWLockRelease(ProcArrayLock);
+ }
return;
}
}
diff --git a/src/backend/cdb/cdbfts.c b/src/backend/cdb/cdbfts.c
index 754d3054cbb..de268b6f662 100644
--- a/src/backend/cdb/cdbfts.c
+++ b/src/backend/cdb/cdbfts.c
@@ -84,6 +84,10 @@ FtsNotifyProber(void)
int32 started;
int32 done;
+ /* Ignore if we don't have a FTS probe process, like a standby QD in a mirrored cluster. */
+ if (FtsProbePID() == 0)
+ return;
+
if (am_ftsprobe)
return;
diff --git a/src/backend/cdb/cdbtm.c b/src/backend/cdb/cdbtm.c
index f0cd5fcb3f6..37550261149 100644
--- a/src/backend/cdb/cdbtm.c
+++ b/src/backend/cdb/cdbtm.c
@@ -264,6 +264,21 @@ currentDtxActivate(void)
{
bool signal_dtx_recovery;
+ /*
+ * A hot standby transaction does not have a valid gxid, so can skip
+ * most of the things in this function. We still explicitly set some
+ * fields that are irrelevant to hot standby for cleanness.
+ */
+ if (IS_HOT_STANDBY_QD())
+ {
+ /* standby QD will stay in this state until transaction completed */
+ setCurrentDtxState(DTX_STATE_ACTIVE_DISTRIBUTED);
+ MyTmGxact->sessionId = gp_session_id;
+ MyTmGxact->gxid = InvalidDistributedTransactionId;
+ MyTmGxact->includeInCkpt = false;
+ return;
+ }
+
if (ShmemVariableCache->GxidCount <= GXID_PRETCH_THRESHOLD &&
(GetDtxRecoveryEvent() & DTX_RECOVERY_EVENT_BUMP_GXID) == 0)
{
@@ -1644,7 +1659,7 @@ isDtxQueryDispatcher(void)
isSharedLocalSnapshotSlotPresent = (SharedLocalSnapshotSlot != NULL);
return (Gp_role == GP_ROLE_DISPATCH &&
- isDtmStarted &&
+ (isDtmStarted || EnableHotStandby) &&
isSharedLocalSnapshotSlotPresent);
}
@@ -2047,6 +2062,8 @@ sendDtxExplicitBegin(void)
static void
performDtxProtocolPrepare(const char *gid)
{
+ SIMPLE_FAULT_INJECTOR("qe_start_prepared");
+
StartTransactionCommand();
elog(DTM_DEBUG5, "performDtxProtocolCommand going to call PrepareTransactionBlock for distributed transaction (id = '%s')", gid);
@@ -2126,6 +2143,7 @@ performDtxProtocolCommitOnePhase(const char *gid)
static void
performDtxProtocolCommitPrepared(const char *gid, bool raiseErrorIfNotFound)
{
+ SIMPLE_FAULT_INJECTOR("qe_start_commit_prepared");
Assert(Gp_role == GP_ROLE_EXECUTE);
elog(DTM_DEBUG5,
@@ -2158,6 +2176,7 @@ performDtxProtocolCommitPrepared(const char *gid, bool raiseErrorIfNotFound)
sendWaitGxidsToQD(waitGxids);
finishDistributedTransactionContext("performDtxProtocolCommitPrepared -- Commit Prepared", false);
+ SIMPLE_FAULT_INJECTOR("finish_commit_prepared");
}
/**
diff --git a/src/backend/cdb/cdbutil.c b/src/backend/cdb/cdbutil.c
index 1671b17223b..f732368d725 100644
--- a/src/backend/cdb/cdbutil.c
+++ b/src/backend/cdb/cdbutil.c
@@ -565,7 +565,7 @@ getCdbComponentInfo(void)
{
cdbInfo = &component_databases->segment_db_info[i];
- if (cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
+ if (!IS_HOT_STANDBY_QD() && cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
continue;
hsEntry = (HostPrimaryCountEntry *) hash_search(hostPrimaryCountHash, cdbInfo->config->hostname, HASH_FIND, &found);
@@ -577,7 +577,7 @@ getCdbComponentInfo(void)
{
cdbInfo = &component_databases->entry_db_info[i];
- if (cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
+ if (!IS_HOT_STANDBY_QD() && cdbInfo->config->role != GP_SEGMENT_CONFIGURATION_ROLE_PRIMARY)
continue;
hsEntry = (HostPrimaryCountEntry *) hash_search(hostPrimaryCountHash, cdbInfo->config->hostname, HASH_FIND, &found);
@@ -1005,7 +1005,16 @@ cdbcomponent_getComponentInfo(int contentId)
/* entry db */
if (contentId == -1)
{
- cdbInfo = &cdbs->entry_db_info[0];
+ Assert(cdbs->total_entry_dbs == 1 || cdbs->total_entry_dbs == 2);
+ /*
+ * For a standby QD, get the last entry db which can be the first (on
+ * a replica cluster) or the second (on a mirrored cluster) entry.
+ */
+ if (IS_HOT_STANDBY_QD())
+ cdbInfo = &cdbs->entry_db_info[cdbs->total_entry_dbs - 1];
+ else
+ cdbInfo = &cdbs->entry_db_info[0];
+
return cdbInfo;
}
@@ -1022,10 +1031,10 @@ cdbcomponent_getComponentInfo(int contentId)
Assert(cdbs->total_segment_dbs == cdbs->total_segments * 2);
cdbInfo = &cdbs->segment_db_info[2 * contentId];
- if (!SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo))
- {
+ /* use the other segment if it is not what the QD wants */
+ if ((IS_HOT_STANDBY_QD() && SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo))
+ || (!IS_HOT_STANDBY_QD() && !SEGMENT_IS_ACTIVE_PRIMARY(cdbInfo)))
cdbInfo = &cdbs->segment_db_info[2 * contentId + 1];
- }
return cdbInfo;
}
@@ -1124,10 +1133,21 @@ cdb_setup(void)
*
* Ignore background worker because bgworker_should_start_mpp() already did
* the check.
+ *
+ * Ignore if we are the standby coordinator started in hot standby mode.
+ * We don't expect dtx recovery to have finished, as dtx recovery is
+ * performed at the end of startup. In hot standby, we are recovering
+ * continuously and should allow queries much earlier. Since a hot standby
+ * won't proceed dtx, it is not required to wait for recovery of the dtx
+ * that has been prepared but not committed (i.e. to commit them); on the
+ * other hand, the recovery of any in-doubt transactions (i.e. not prepared)
+ * won't bother a hot standby either, just like they can be recovered in the
+ * background when a primary instance is running.
*/
if (!IsBackgroundWorker &&
Gp_role == GP_ROLE_DISPATCH &&
- !*shmDtmStarted)
+ !*shmDtmStarted &&
+ !IS_HOT_STANDBY_QD())
{
ereport(FATAL,
(errcode(ERRCODE_CANNOT_CONNECT_NOW),
diff --git a/src/backend/cdb/dispatcher/cdbdisp_query.c b/src/backend/cdb/dispatcher/cdbdisp_query.c
index ef4b84f0db0..c21cfd935f9 100644
--- a/src/backend/cdb/dispatcher/cdbdisp_query.c
+++ b/src/backend/cdb/dispatcher/cdbdisp_query.c
@@ -867,6 +867,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms,
{
const char *command = pQueryParms->strCommand;
int command_len;
+ int is_hs_dispatch = IS_HOT_STANDBY_QD() ? 1 : 0;
const char *plantree = pQueryParms->serializedPlantree;
int plantree_len = pQueryParms->serializedPlantreelen;
const char *sddesc = pQueryParms->serializedQueryDispatchDesc;
@@ -921,6 +922,7 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms,
sizeof(outerUserId) /* outerUserIsSuper */ +
sizeof(currentUserId) +
sizeof(n32) * 2 /* currentStatementStartTimestamp */ +
+ sizeof(is_hs_dispatch) +
sizeof(command_len) +
sizeof(plantree_len) +
sizeof(sddesc_len) +
@@ -976,6 +978,10 @@ buildGpQueryString(DispatchCommandQueryParms *pQueryParms,
memcpy(pos, &n32, sizeof(n32));
pos += sizeof(n32);
+ tmp = htonl(is_hs_dispatch);
+ memcpy(pos, &tmp, sizeof(is_hs_dispatch));
+ pos += sizeof(is_hs_dispatch);
+
tmp = htonl(command_len);
memcpy(pos, &tmp, sizeof(command_len));
pos += sizeof(command_len);
diff --git a/src/backend/cdb/dispatcher/cdbgang.c b/src/backend/cdb/dispatcher/cdbgang.c
index 780ddef0f42..87ce88504b0 100644
--- a/src/backend/cdb/dispatcher/cdbgang.c
+++ b/src/backend/cdb/dispatcher/cdbgang.c
@@ -698,8 +698,7 @@ getCdbProcessesForQD(int isPrimary)
qdinfo = cdbcomponent_getComponentInfo(MASTER_CONTENT_ID);
- Assert(qdinfo->config->segindex == -1);
- Assert(SEGMENT_IS_ACTIVE_PRIMARY(qdinfo));
+ Assert((qdinfo->config->segindex == -1 && SEGMENT_IS_ACTIVE_PRIMARY(qdinfo)) || IS_HOT_STANDBY_QD());
Assert(qdinfo->config->hostip != NULL);
proc = makeNode(CdbProcess);
diff --git a/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c b/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c
new file mode 100644
index 00000000000..6e07aebcc96
--- /dev/null
+++ b/src/backend/cdb/dispatcher/test/cdbdisp_query_test.c
@@ -0,0 +1,341 @@
+#include
+#include
+#include
+#include "cmockery.h"
+#include "postgres.h"
+
+#include "storage/ipc.h"
+#include "storage/proc.h"
+
+#include "../cdbdisp_query.c"
+
+
+#undef PG_RE_THROW
+#define PG_RE_THROW() siglongjmp(*PG_exception_stack, 1)
+
+
+int __wrap_errmsg(const char *fmt,...);
+int __wrap_errcode(int sqlerrcode);
+bool __wrap_errstart(int elevel, const char *filename, int lineno,
+ const char *funcname, const char *domain);
+void __wrap_errfinish(int dummy __attribute__((unused)),...);
+Gang *__wrap_cdbgang_createGang_async(List *segments, SegmentType segmentType);
+int __wrap_pqPutMsgStart(char msg_type, bool force_len, PGconn *conn);
+int __wrap_PQcancel(PGcancel *cancel, char *errbuf, int errbufsize);
+char *__wrap_serializeNode(Node *node, int *size, int *uncompressed_size_out);
+char *__wrap_qdSerializeDtxContextInfo(int *size, bool wantSnapshot, bool inCursor, int txnOptions, char *debugCaller);
+void __wrap_VirtualXactLockTableInsert(VirtualTransactionId vxid);
+void __wrap_AcceptInvalidationMessages(void);
+static void terminate_process();
+
+
+int
+__wrap_errmsg(const char *fmt,...)
+{
+ check_expected(fmt);
+ optional_assignment(fmt);
+ return (int) mock();
+}
+
+
+int
+__wrap_errcode(int sqlerrcode)
+{
+ check_expected(sqlerrcode);
+ return (int) mock();
+}
+
+
+bool
+__wrap_errstart(int elevel, const char *filename, int lineno,
+ const char *funcname, const char *domain)
+{
+ if (elevel < LOG)
+ return false;
+
+ check_expected(elevel);
+ check_expected(filename);
+ check_expected(lineno);
+ check_expected(funcname);
+ check_expected(domain);
+ optional_assignment(filename);
+ optional_assignment(funcname);
+ optional_assignment(domain);
+ return (bool) mock();
+}
+
+
+void
+__wrap_errfinish(int dummy __attribute__((unused)),...)
+{
+ PG_RE_THROW();
+}
+
+
+static void
+expect_ereport(int expect_elevel)
+{
+ expect_any(__wrap_errmsg, fmt);
+ will_be_called(__wrap_errmsg);
+
+ expect_any(__wrap_errcode, sqlerrcode);
+ will_be_called(__wrap_errcode);
+
+ expect_value(__wrap_errstart, elevel, expect_elevel);
+ expect_any(__wrap_errstart, filename);
+ expect_any(__wrap_errstart, lineno);
+ expect_any(__wrap_errstart, funcname);
+ expect_any(__wrap_errstart, domain);
+ if (expect_elevel < ERROR)
+ {
+ will_return(__wrap_errstart, false);
+ }
+ else
+ {
+ will_return(__wrap_errstart, true);
+ }
+}
+
+
+Gang *
+__wrap_cdbgang_createGang_async(List *segments, SegmentType segmentType)
+{
+ MemoryContext oldContext = MemoryContextSwitchTo(DispatcherContext);
+ Gang *gang = buildGangDefinition(segments, segmentType);
+
+ MemoryContextSwitchTo(oldContext);
+
+ PGconn *conn = (PGconn *) malloc(sizeof(PGconn));
+
+ MemSet(conn, 0, sizeof(PGconn));
+ initPQExpBuffer(&conn->errorMessage);
+ initPQExpBuffer(&conn->workBuffer);
+ gang->db_descriptors[0]->conn = conn;
+
+ return gang;
+}
+
+
+int
+__wrap_pqPutMsgStart(char msg_type, bool force_len, PGconn *conn)
+{
+ if (conn->outBuffer_shared)
+ fail_msg("Mustn't send something else during dispatch!");
+ check_expected(msg_type);
+ check_expected(force_len);
+ check_expected(conn);
+ optional_assignment(conn);
+ return (int) mock();
+}
+
+
+int
+__wrap_PQcancel(PGcancel *cancel, char *errbuf, int errbufsize)
+{
+ return (int) mock();
+}
+
+
+char *
+__wrap_serializeNode(Node *node, int *size, int *uncompressed_size_out)
+{
+ const int alloc_size = 1024;
+
+ if (size != NULL)
+ *size = alloc_size;
+ if (uncompressed_size_out != NULL)
+ *uncompressed_size_out = alloc_size;
+
+ return (char *) palloc(alloc_size);
+}
+
+
+char *
+__wrap_qdSerializeDtxContextInfo(int *size, bool wantSnapshot, bool inCursor, int txnOptions, char *debugCaller)
+{
+ const int alloc_size = 1024;
+
+ assert_int_not_equal(size, NULL);
+ *size = alloc_size;
+
+ return (char *) palloc(alloc_size);
+}
+
+
+void
+__wrap_VirtualXactLockTableInsert(VirtualTransactionId vxid)
+{
+ mock();
+}
+
+void
+__wrap_AcceptInvalidationMessages(void)
+{
+ mock();
+}
+
+
+static void
+terminate_process()
+{
+ die(SIGTERM);
+}
+
+/*
+ * Test query may be interrupted during plan dispatching
+ */
+static void
+test__CdbDispatchPlan_may_be_interrupted(void **state)
+{
+ PlannedStmt *plannedstmt = (PlannedStmt *) palloc(sizeof(PlannedStmt));
+
+ /* slice table is needed to allocate gang */
+ plannedstmt->slices = palloc0(sizeof(PlanSlice));
+ plannedstmt->numSlices = 1;
+ PlanSlice *slice = &plannedstmt->slices[0];
+
+ slice->sliceIndex = 1;
+ slice->gangType = GANGTYPE_PRIMARY_READER;
+ slice->numsegments = 1;
+ slice->parentIndex = -1;
+ slice->segindex = 0;
+
+ QueryDesc *queryDesc = (QueryDesc *) palloc(sizeof(QueryDesc));
+
+ queryDesc->plannedstmt = plannedstmt;
+ /* ddesc->secContext is filled in cdbdisp_buildPlanQueryParms() */
+ queryDesc->ddesc = (QueryDispatchDesc *) palloc(sizeof(QueryDispatchDesc));
+ /* source text is required for buildGpQueryString() */
+ queryDesc->sourceText = "select a from t1;";
+
+ queryDesc->estate = CreateExecutorState();
+
+ /* will be called multiple times in e.g. FtsNotifyProber/getCdbComponentInfo */
+ will_return_count(RecoveryInProgress, false, -1);
+
+ /* cdbcomponent_getCdbComponents() mocks */
+ will_be_called(FtsNotifyProber);
+ will_return(getFtsVersion, 1);
+ will_return(GetGpExpandVersion, 1);
+
+ /* StartTransactionCommand() mocks */
+ will_be_called(__wrap_VirtualXactLockTableInsert);
+ will_be_called(__wrap_AcceptInvalidationMessages);
+ will_be_called(initialize_wal_bytes_written);
+
+ /*
+ * cdbdisp_dispatchToGang()
+ *
+ * start sending MPP query to QE inside PQsendGpQuery_shared() replace
+ * connection buffer with the shared one
+ */
+ expect_any(PQsendQueryStart, conn);
+ will_return(PQsendQueryStart, true);
+
+ /* first try to flush MPP query inside PQsendGpQuery_shared() */
+ expect_any(pqFlushNonBlocking, conn);
+ will_return(pqFlushNonBlocking, 1);
+
+ /*
+ * cdbdisp_waitDispatchFinish()
+ *
+ * query will be interrupted before poll()
+ */
+ expect_any_count(ResetWaitEventSet, pset, 2);
+ expect_any_count(ResetWaitEventSet, context, 2);
+ expect_any_count(ResetWaitEventSet, nevents, 2);
+ will_be_called_count(ResetWaitEventSet, 2);
+
+ expect_any(pqFlushNonBlocking, conn);
+ will_return_with_sideeffect(pqFlushNonBlocking, 1, &terminate_process, NULL);
+
+ expect_any(SetLatch, latch);
+ will_be_called(SetLatch);
+
+ expect_any(AddWaitEventToSet, set);
+ expect_any(AddWaitEventToSet, events);
+ expect_any(AddWaitEventToSet, fd);
+ expect_any(AddWaitEventToSet, latch);
+ expect_any(AddWaitEventToSet, user_data);
+ will_be_called(AddWaitEventToSet);
+
+ will_return(IsLogicalLauncher, false);
+
+ /* process was terminated by administrative command */
+ expect_ereport(FATAL);
+
+ /* QD will trying to cancel queries on QEs */
+ will_return(__wrap_PQcancel, true);
+
+ /* during close and free connection */
+ expect_any_count(pqClearAsyncResult, conn, 2);
+ will_be_called_count(pqClearAsyncResult, 2);
+
+	/*
+	 * BUT! pqPutMsgStart mustn't be called:
+	 *
+	 * we must not send the termination message (X) until the shared
+	 * message has been completely flushed out of the buffer.
+	 */
+
+ /*
+ * dirty hack. cluster topology needed to allocate gangs is loaded from
+ * gpsegconfig_dump outside of transaction
+ */
+ cdbcomponent_getCdbComponents();
+
+ StartTransactionCommand();
+
+ PG_TRY();
+ {
+ queryDesc->estate->es_sliceTable = InitSliceTable(queryDesc->estate, plannedstmt);
+
+ CdbDispatchPlan(queryDesc, queryDesc->estate->es_param_exec_vals,
+ false, false);
+ fail();
+ }
+ PG_CATCH();
+ {
+		/*
+		 * Emulate SIGTERM handling: GPDB bails out from CheckDispatchResult
+		 * without flushing unsent messages when a process exit is in
+		 * progress; AtAbort_DispatcherState will be called during the
+		 * transaction abort.
+		 */
+ proc_exit_inprogress = true;
+
+ AtAbort_DispatcherState();
+ }
+ PG_END_TRY();
+}
+
+int
+main(int argc, char *argv[])
+{
+ cmockery_parse_arguments(argc, argv);
+
+ const UnitTest tests[] =
+ {
+ unit_test(test__CdbDispatchPlan_may_be_interrupted)
+ };
+
+ Gp_role = GP_ROLE_DISPATCH;
+ /* to start transaction */
+ PGPROC proc;
+
+ MyBackendId = 7;
+ proc.backendId = MyBackendId;
+ MyProc = &proc;
+ /* to build cdb components info */
+ GpIdentity.dbid = 1;
+ GpIdentity.segindex = -1;
+
+ MemoryContextInit();
+
+ /* to avoid mocking cdbtm.c functions */
+ MyTmGxactLocal = (TMGXACTLOCAL *) MemoryContextAllocZero(TopMemoryContext, sizeof(TMGXACTLOCAL));
+
+ SetSessionUserId(1000, true);
+
+ return run_tests(tests);
+}
diff --git a/src/backend/fts/fts.c b/src/backend/fts/fts.c
index 719e8fbca1c..c7c1711e97f 100644
--- a/src/backend/fts/fts.c
+++ b/src/backend/fts/fts.c
@@ -102,7 +102,7 @@ sigIntHandler(SIGNAL_ARGS)
pid_t
FtsProbePID(void)
{
- return *shmFtsProbePID;
+ return shmFtsProbePID ? *shmFtsProbePID : 0;
}
bool
diff --git a/src/backend/replication/logical/decode.c b/src/backend/replication/logical/decode.c
index 1a835983222..68524222d71 100644
--- a/src/backend/replication/logical/decode.c
+++ b/src/backend/replication/logical/decode.c
@@ -371,6 +371,9 @@ standby_decode(LogicalDecodingContext *ctx, XLogRecordBuffer *buf)
* XLOG_XACT_INVALIDATIONS. So we don't need to do anything here.
*/
break;
+ case XLOG_LATESTCOMPLETED_GXID:
+ /* FIXME: need to decode this part? */
+ break;
default:
elog(ERROR, "unexpected RM_STANDBY_ID record type: %u", info);
}
diff --git a/src/backend/storage/file/fd.c b/src/backend/storage/file/fd.c
index 73a53822b3d..e2953686b8e 100644
--- a/src/backend/storage/file/fd.c
+++ b/src/backend/storage/file/fd.c
@@ -1712,6 +1712,7 @@ OpenTemporaryFile(bool interXact, const char *filePrefix)
if (!interXact)
RegisterTemporaryFile(file);
+ SIMPLE_FAULT_INJECTOR("after_open_temp_file");
return file;
}
diff --git a/src/backend/storage/ipc/procarray.c b/src/backend/storage/ipc/procarray.c
index 3154caba1bd..57c03cce7d9 100644
--- a/src/backend/storage/ipc/procarray.c
+++ b/src/backend/storage/ipc/procarray.c
@@ -2530,8 +2530,10 @@ getDtxCheckPointInfo(char **result, int *result_size)
gxid_array = &gxact_checkpoint->committedGxidArray[0];
actual = 0;
+ LWLockAcquire(CommittedGxidArrayLock, LW_SHARED);
for (; actual < *shmNumCommittedGxacts; actual++)
gxid_array[actual] = shmCommittedGxidArray[actual];
+ LWLockRelease(CommittedGxidArrayLock);
SIMPLE_FAULT_INJECTOR("checkpoint_dtx_info");
@@ -2609,7 +2611,8 @@ CreateDistributedSnapshot(DistributedSnapshot *ds)
ProcArrayStruct *arrayP = procArray;
Assert(LWLockHeldByMe(ProcArrayLock));
- if (*shmNumCommittedGxacts != 0)
+ /* Hot standby accepts query while constantly replaying dtx, so this ERROR doesn't apply. */
+ if (!IS_HOT_STANDBY_QD() && *shmNumCommittedGxacts != 0)
elog(ERROR, "Create distributed snapshot before DTM recovery finish");
xmin = xmax = ShmemVariableCache->latestCompletedGxid + 1;
@@ -2623,9 +2626,45 @@ CreateDistributedSnapshot(DistributedSnapshot *ds)
Assert(ds->inProgressXidArray != NULL);
+ /*
+ * For a hot standby QD, check shmCommittedGxidArray to build the knowledge.
+ * Need to acquire shared lock to access the committed gxid array as the
+ * startup process might modify it.
+ */
+ if (IS_HOT_STANDBY_QD())
+ {
+ LWLockAcquire(CommittedGxidArrayLock, LW_SHARED);
+ for (i = 0; i < *shmNumCommittedGxacts; i++)
+ {
+ DistributedTransactionId gxid;
+
+ gxid = shmCommittedGxidArray[i];
+
+ if (gxid == InvalidDistributedTransactionId || gxid >= xmax)
+ continue;
+
+ if (gxid < xmin)
+ xmin = gxid;
+
+ ds->inProgressXidArray[count++] = gxid;
+ }
+ LWLockRelease(CommittedGxidArrayLock);
+ }
+
/*
* Gather up current in-progress global transactions for the distributed
* snapshot.
+ *
+ * Note: The inProgressXidArray built below may contain transactions that
+ * have been prepared on some/all segments, and for which the QD hasn't
+ * begun the COMMIT phase (by writing a XLOG_XACT_DISTRIBUTED_COMMIT record).
+ * The gxids of these transactions don't necessarily have to be placed into
+ * inProgressXidArray, for correctness. This is because for visibility
+ * checks on the QEs, a state of DISTRIBUTEDSNAPSHOT_COMMITTED_UNKNOWN will
+ * be encountered for such txs, prompting a local check. The local check will
+ * always find these txs in progress (due to the dummy PGXACTs being
+ * recorded for prepared txs). So, hypothetically we could exclude these txs
+ * here, but we don't currently track them on the QD, so we can't.
*/
for (i = 0; i < arrayP->numProcs; i++)
{
diff --git a/src/backend/storage/ipc/standby.c b/src/backend/storage/ipc/standby.c
index 687ce03767d..13dc551ca54 100644
--- a/src/backend/storage/ipc/standby.c
+++ b/src/backend/storage/ipc/standby.c
@@ -21,6 +21,7 @@
#include "access/xact.h"
#include "access/xlog.h"
#include "access/xloginsert.h"
+#include "cdb/cdbvars.h"
#include "miscadmin.h"
#include "pgstat.h"
#include "storage/bufmgr.h"
@@ -29,6 +30,7 @@
#include "storage/procarray.h"
#include "storage/sinvaladt.h"
#include "storage/standby.h"
+#include "utils/faultinjector.h"
#include "utils/hsearch.h"
#include "utils/memutils.h"
#include "utils/ps_status.h"
@@ -848,6 +850,8 @@ SendRecoveryConflictWithBufferPin(ProcSignalReason reason)
* SIGUSR1 handling in each backend decide their own fate.
*/
CancelDBBackends(InvalidOid, reason, false);
+
+ SIMPLE_FAULT_INJECTOR("recovery_conflict_bufferpin_signal_sent");
}
/*
@@ -1148,6 +1152,23 @@ standby_redo(XLogReaderState *record)
xlrec->dbId,
xlrec->tsId);
}
+ else if (info == XLOG_LATESTCOMPLETED_GXID)
+ {
+ /*
+ * This record is only logged by coordinator. But the segment in
+ * some situation might see it too (e.g. gpexpand), but segment
+ * doesn't need to update latestCompletedGxid.
+ */
+ if (IS_QUERY_DISPATCHER())
+ {
+ DistributedTransactionId gxid;
+
+ gxid = *((DistributedTransactionId *) XLogRecGetData(record));
+ LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
+ ShmemVariableCache->latestCompletedGxid = gxid;
+ LWLockRelease(ProcArrayLock);
+ }
+ }
else
elog(PANIC, "standby_redo: unknown op code %u", info);
}
@@ -1265,6 +1286,21 @@ LogStandbySnapshot(void)
/* GetRunningTransactionData() acquired XidGenLock, we must release it */
LWLockRelease(XidGenLock);
+ if (IS_QUERY_DISPATCHER())
+ {
+ /*
+ * GPDB: write latestCompletedGxid too, because the standby needs this
+ * value for creating distributed snapshot. The standby cannot rely on
+ * the nextGxid value to set latestCompletedGxid during restart (which
+ * the primary does) because nextGxid was bumped in the checkpoint.
+ */
+ LWLockAcquire(ProcArrayLock, LW_SHARED);
+ DistributedTransactionId lcgxid = ShmemVariableCache->latestCompletedGxid;
+ LWLockRelease(ProcArrayLock);
+ XLogBeginInsert();
+ XLogRegisterData((char *) (&lcgxid), sizeof(lcgxid));
+ recptr = XLogInsert(RM_STANDBY_ID, XLOG_LATESTCOMPLETED_GXID);
+ }
return recptr;
}
diff --git a/src/backend/storage/lmgr/lwlocknames.txt b/src/backend/storage/lmgr/lwlocknames.txt
index c8f283198ce..c3583b146d7 100644
--- a/src/backend/storage/lmgr/lwlocknames.txt
+++ b/src/backend/storage/lmgr/lwlocknames.txt
@@ -75,3 +75,4 @@ LoginFailedControlLock 65
LoginFailedSharedMemoryLock 66
GPIVMResLock 67
DirectoryTableLock 68
+CommittedGxidArrayLock 69
diff --git a/src/backend/storage/lmgr/proc.c b/src/backend/storage/lmgr/proc.c
index a174e981b1f..37d917a1f3e 100644
--- a/src/backend/storage/lmgr/proc.c
+++ b/src/backend/storage/lmgr/proc.c
@@ -354,17 +354,9 @@ InitProcess(void)
* WAL sender, etc are marked as GP_ROLE_UTILITY to prevent unwanted
* GP_ROLE_DISPATCH MyProc settings such as mppSessionId being valid and
* mppIsWriter set to true.
- *
- * RecoveryInProgress() to see if we are in hot standby, because
- * HotStandbyActive() is still true after promotion.
*/
- if (am_walsender || am_ftshandler || am_faulthandler ||
- (GpIdentity.segindex == -1 && RecoveryInProgress()))
- {
+ if (am_walsender || am_ftshandler || am_faulthandler)
Gp_role = GP_ROLE_UTILITY;
- if (GpIdentity.segindex == -1 && RecoveryInProgress())
- elog(WARNING, "Force to run in utility mode in hot standby");
- }
/*
* ProcGlobal should be set up already (if we are a backend, we inherit
diff --git a/src/backend/tcop/postgres.c b/src/backend/tcop/postgres.c
index f29c9c2e606..62ded58aafb 100644
--- a/src/backend/tcop/postgres.c
+++ b/src/backend/tcop/postgres.c
@@ -1529,6 +1529,8 @@ exec_mpp_dtx_protocol_command(DtxProtocolCommand dtxProtocolCommand,
qc.commandTag = GetCommandTagEnum(loggingStr);
qc.nprocessed = 1;
+ SIMPLE_FAULT_INJECTOR("exec_dtx_protocol_start");
+
if (log_statement == LOGSTMT_ALL)
elog(LOG,"DTM protocol command '%s' for gid = %s", loggingStr, gid);
@@ -5714,6 +5716,7 @@ PostgresMain(int argc, char *argv[],
const char *serializedQueryDispatchDesc = NULL;
const char *resgroupInfoBuf = NULL;
+ int is_hs_dispatch;
int query_string_len = 0;
int serializedDtxContextInfolen = 0;
int serializedPlantreelen = 0;
@@ -5750,6 +5753,20 @@ PostgresMain(int argc, char *argv[],
cuid = pq_getmsgint(&input_message, 4);
statementStart = pq_getmsgint64(&input_message);
+
+ /* check if the message is from standby QD and is expected */
+ is_hs_dispatch = pq_getmsgint(&input_message, 4);
+ if (is_hs_dispatch == 0 && IS_HOT_STANDBY_QE())
+ ereport(ERROR,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("mirror segments can only process MPP protocol messages from standby QD"),
+ errhint("Exit the current session and re-connect.")));
+ else if (is_hs_dispatch != 0 && !IS_HOT_STANDBY_QE())
+ ereport(ERROR,
+ (errcode(ERRCODE_PROTOCOL_VIOLATION),
+ errmsg("primary segments can only process MPP protocol messages from primary QD"),
+ errhint("Exit the current session and re-connect.")));
+
query_string_len = pq_getmsgint(&input_message, 4);
serializedPlantreelen = pq_getmsgint(&input_message, 4);
serializedQueryDispatchDesclen = pq_getmsgint(&input_message, 4);
diff --git a/src/backend/tcop/pquery.c b/src/backend/tcop/pquery.c
index 728d12c604a..532690f1d51 100644
--- a/src/backend/tcop/pquery.c
+++ b/src/backend/tcop/pquery.c
@@ -617,6 +617,8 @@ PortalStart(Portal portal, ParamListInfo params,
needDistributedSnapshot = false;
}
+ SIMPLE_FAULT_INJECTOR("select_before_qd_create_snapshot");
+
/* Must set snapshot before starting executor. */
if (snapshot)
PushActiveSnapshot(snapshot);
@@ -626,6 +628,8 @@ PortalStart(Portal portal, ParamListInfo params,
/* reset value */
needDistributedSnapshot = true;
+ SIMPLE_FAULT_INJECTOR("select_after_qd_create_snapshot");
+
/*
* We could remember the snapshot in portal->portalSnapshot,
* but presently there seems no need to, as this code path
diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c
index 70d51316875..df83e03b766 100644
--- a/src/backend/utils/misc/guc_gp.c
+++ b/src/backend/utils/misc/guc_gp.c
@@ -3095,16 +3095,6 @@ struct config_bool ConfigureNamesBool_gp[] =
NULL, NULL, NULL
},
- {
- {"gp_pause_on_restore_point_replay", PGC_SIGHUP, DEVELOPER_OPTIONS,
- gettext_noop("Pause recovery when a restore point is replayed."),
- NULL,
- GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
- },
- &gp_pause_on_restore_point_replay,
- false,
- NULL, NULL, NULL
- },
{
{"gp_autostats_allow_nonowner", PGC_SUSET, DEVELOPER_OPTIONS,
gettext_noop("Allow automatic stats collection on tables even for users who are not the owner of the relation."),
@@ -5052,6 +5042,17 @@ struct config_string ConfigureNamesString_gp[] =
"udpifc",
check_gp_interconnect_type, assign_gp_interconnect_type, show_gp_interconnect_type
},
+ {
+ {"gp_pause_on_restore_point_replay", PGC_SUSET, DEVELOPER_OPTIONS,
+ gettext_noop("Specifies the restore point to pause replay on."),
+ gettext_noop("Unlike recovery_target_name, this can be used to continuously set/reset "
+ "how much a standby should replay up to."),
+ GUC_NO_SHOW_ALL | GUC_NOT_IN_SAMPLE
+ },
+ &gp_pause_on_restore_point_replay,
+ "",
+ NULL, NULL, NULL
+ },
/* End-of-list marker */
{
diff --git a/src/bin/initdb/initdb.c b/src/bin/initdb/initdb.c
index 53c3a82a45e..8525d0ca0d9 100644
--- a/src/bin/initdb/initdb.c
+++ b/src/bin/initdb/initdb.c
@@ -174,6 +174,7 @@ static char *external_fts_files;
#endif
static char *system_functions_file;
static char *system_views_file;
+static char *system_views_gp_file;
static bool success = false;
static bool made_new_pgdata = false;
static bool found_existing_pgdata = false;
@@ -2831,6 +2832,7 @@ setup_data_file_paths(void)
set_input(&system_constraints_file, "system_constraints.sql");
set_input(&system_functions_file, "system_functions.sql");
set_input(&system_views_file, "system_views.sql");
+ set_input(&system_views_gp_file, "system_views_gp.sql");
set_input(&cdb_init_d_dir, "cdb_init.d");
@@ -2864,6 +2866,7 @@ setup_data_file_paths(void)
#endif
check_input(system_functions_file);
check_input(system_views_file);
+ check_input(system_views_gp_file);
}
@@ -3231,6 +3234,7 @@ initialize_data_directory(void)
*/
setup_run_file(cmdfd, system_views_file);
+ setup_run_file(cmdfd, system_views_gp_file);
setup_description(cmdfd);
diff --git a/src/include/access/transam.h b/src/include/access/transam.h
index cec3e5f4cb7..687799bec9f 100644
--- a/src/include/access/transam.h
+++ b/src/include/access/transam.h
@@ -301,7 +301,7 @@ extern int xid_stop_limit;
extern int xid_warn_limit;
/* GPDB-specific */
-extern bool gp_pause_on_restore_point_replay;
+extern char *gp_pause_on_restore_point_replay;
/* hook for plugins to assign new relfilenode */
typedef Oid (*NewSegRelfilenode_assign_hook_type)(void);
diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h
index 2dfad411b7a..e8a73ceb201 100644
--- a/src/include/access/xlog.h
+++ b/src/include/access/xlog.h
@@ -11,6 +11,8 @@
#ifndef XLOG_H
#define XLOG_H
+#include "postgres.h" /* for Datum */
+
#include "access/rmgr.h"
#include "access/xlogdefs.h"
#include "access/xloginsert.h"
diff --git a/src/include/catalog/catversion.h b/src/include/catalog/catversion.h
index 86910a0dada..026192b3674 100644
--- a/src/include/catalog/catversion.h
+++ b/src/include/catalog/catversion.h
@@ -56,6 +56,6 @@
*/
/* 3yyymmddN */
-#define CATALOG_VERSION_NO 302502091
+#define CATALOG_VERSION_NO 302506101
#endif
diff --git a/src/include/cdb/cdbtm.h b/src/include/cdb/cdbtm.h
index 951b9013c00..2bf259a8744 100644
--- a/src/include/cdb/cdbtm.h
+++ b/src/include/cdb/cdbtm.h
@@ -35,8 +35,12 @@ typedef enum
DTX_STATE_NONE = 0,
/**
- * The distributed transaction is active and requires distributed coordination
- * (because it is explicit or an implicit writer transaction)
+ * The distributed transaction is active.
+ * For a primary, this state means the transaction requires distributed
+ * coordination (because it is explicit or an implicit writer transaction),
+ * and it will switch to other dtx states in different phases.
+ * For a hot standby, there is no coordination necessary so transaction
+ * will stay in this state until the end of the commit.
*/
DTX_STATE_ACTIVE_DISTRIBUTED,
@@ -232,6 +236,7 @@ typedef struct TMGXACTLOCAL
{
/*
* Memory only fields.
+ * If we are in hot standby, only 'state' is relevant.
*/
DtxState state;
diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h
index 90af5177ce0..2393384ec3a 100644
--- a/src/include/cdb/cdbvars.h
+++ b/src/include/cdb/cdbvars.h
@@ -19,6 +19,7 @@
#ifndef CDBVARS_H
#define CDBVARS_H
+#include "access/xlog.h" /*RecoveryInProgress*/
#include "access/xlogdefs.h" /*XLogRecPtr*/
#include "cdb/cdbutil.h" /* MASTER_CONTENT_ID */
#ifdef USE_INTERNAL_FTS
@@ -757,8 +758,10 @@ extern GpId GpIdentity;
#define UNINITIALIZED_GP_IDENTITY_VALUE (-10000)
#define IS_QUERY_DISPATCHER() (GpIdentity.segindex == MASTER_CONTENT_ID)
+#define IS_HOT_STANDBY_QD() (EnableHotStandby && IS_QUERY_DISPATCHER() && RecoveryInProgress())
#define IS_QUERY_EXECUTOR_BACKEND() (Gp_role == GP_ROLE_EXECUTE && gp_session_id > 0)
+#define IS_HOT_STANDBY_QE() (EnableHotStandby && IS_QUERY_EXECUTOR_BACKEND() && RecoveryInProgress())
/* Stores the listener port that this process uses to listen for incoming
* Interconnect connections from other Motion nodes.
diff --git a/src/include/storage/standbydefs.h b/src/include/storage/standbydefs.h
index d99e6f40c6d..f007fe25245 100644
--- a/src/include/storage/standbydefs.h
+++ b/src/include/storage/standbydefs.h
@@ -34,6 +34,7 @@ extern void standby_desc_invalidations(StringInfo buf,
#define XLOG_STANDBY_LOCK 0x00
#define XLOG_RUNNING_XACTS 0x10
#define XLOG_INVALIDATIONS 0x20
+#define XLOG_LATESTCOMPLETED_GXID 0xF0
typedef struct xl_standby_locks
{
diff --git a/src/test/isolation2/Makefile b/src/test/isolation2/Makefile
index 759b2855513..bc1e0f66be0 100644
--- a/src/test/isolation2/Makefile
+++ b/src/test/isolation2/Makefile
@@ -90,3 +90,6 @@ installcheck-cbdb-parallel: install
export PGOPTIONS='-c optimizer=off -c enable_parallel=true'; \
$(pg_isolation2_regress_installcheck) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/isolation2_schedule \
)
+
+installcheck-hot-standby: install
+ $(pg_isolation2_regress_installcheck) $(EXTRA_REGRESS_OPTS) --init-file=$(top_builddir)/src/test/regress/init_file --init-file=./init_file_isolation2 --schedule=$(srcdir)/hot_standby_schedule --dbname=isolation2-hot-standby
diff --git a/src/test/isolation2/expected/hot_standby/basic.out b/src/test/isolation2/expected/hot_standby/basic.out
new file mode 100644
index 00000000000..5318a35d7d7
--- /dev/null
+++ b/src/test/isolation2/expected/hot_standby/basic.out
@@ -0,0 +1,242 @@
+-- Tests for basic query dispatch on a hot standy.
+
+-- hot standby must show on and the sync mode is remote_apply for the tests to make sense
+-1S: show hot_standby;
+ hot_standby
+-------------
+ on
+(1 row)
+-1S: show synchronous_commit;
+ synchronous_commit
+--------------------
+ remote_apply
+(1 row)
+
+-- will be checking if QD/QE info looks good
+-1S: select id, type, content, port from gp_backend_info();
+ id | type | content | port
+----+------+---------+------
+ -1 | Q | -1 | 7001
+(1 row)
+
+----------------------------------------------------------------
+-- Test: basic query dispatch
+----------------------------------------------------------------
+create table hs_t1(a int);
+CREATE
+create table hs_t2(a int);
+CREATE
+
+-- standby should see the results for 2pc immediately.
+insert into hs_t1 select * from generate_series(1,10);
+INSERT 10
+-1S: select * from hs_t1;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+ 1
+(10 rows)
+-- standby won't see results for the last 1pc immediately because the standby QD
+-- isn't aware of of it so its distributed snapshot doesn't include the 1pc, but
+-- as long as another 2pc comes it will be able to see the previous 1pc. Wee
+-- tolerate this case in the mirrored cluster setup.
+insert into hs_t2 values(1);
+INSERT 1
+-1S: select * from hs_t2;
+ a
+---
+(0 rows)
+-- any following 2pc will make the 1pc visible
+create temp table tt(a int);
+CREATE
+-1S: select * from hs_t2;
+ a
+---
+ 1
+(1 row)
+
+-- we have three QEs launched on the mirror segments.
+-- note that the first QE on a segment is still a "writer" because we
+-- need it to manage locks, same as read-only queries on a primary QD.
+-1S: select id, type, content, port from gp_backend_info();
+ id | type | content | port
+----+------+---------+------
+ -1 | Q | -1 | 7001
+ 0 | w | 0 | 7005
+ 1 | w | 1 | 7006
+ 2 | w | 2 | 7007
+(4 rows)
+
+-- should have parallel readers launched
+-1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2);
+ a | a
+---+---
+ 1 | 1
+(1 row)
+-1S: select id, type, content, port from gp_backend_info();
+ id | type | content | port
+----+------+---------+------
+ -1 | Q | -1 | 7001
+ 0 | w | 0 | 7005
+ 1 | w | 1 | 7006
+ 2 | w | 2 | 7007
+ 3 | r | 0 | 7005
+ 4 | r | 1 | 7006
+ 5 | r | 2 | 7007
+(7 rows)
+
+-- now a singleton reader added too
+-1S: select * from hs_t1 join (select oid::int from pg_class) hs_t2 on (hs_t1 = hs_t2);
+ a | oid
+---+-----
+(0 rows)
+-1S: select id, type, content, port from gp_backend_info();
+ id | type | content | port
+----+------+---------+------
+ -1 | Q | -1 | 7001
+ 0 | w | 0 | 7005
+ 1 | w | 1 | 7006
+ 2 | w | 2 | 7007
+ 3 | r | 0 | 7005
+ 4 | r | 1 | 7006
+ 5 | r | 2 | 7007
+ 6 | R | -1 | 7001
+(8 rows)
+
+-- un-committed result should not be seen by the standby
+begin;
+BEGIN
+insert into hs_t1 select * from generate_series(11,20);
+INSERT 10
+
+-- standby should only see 1...10
+-1S: select * from hs_t1;
+ a
+----
+ 5
+ 6
+ 9
+ 10
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+(10 rows)
+
+end;
+END
+
+-- standby should see 1...20 now
+-1S: select * from hs_t1;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 16
+ 18
+ 19
+ 1
+ 12
+ 15
+ 20
+ 5
+ 6
+ 9
+ 10
+ 11
+ 13
+ 14
+ 17
+(20 rows)
+
+----------------------------------------------------------------
+-- Test: other things that a hot standby can do.
+--
+-- More refer to regress test 'hs_standby_allowed'.
+----------------------------------------------------------------
+-- set/reset and show GUC
+-1S: set optimizer = on;
+SET
+-1S: show optimizer;
+ optimizer
+-----------
+ on
+(1 row)
+-1S: reset optimizer;
+RESET
+-- copy command
+-1S: copy hs_t1 to '/tmp/hs_copyto.csv' csv null '';
+COPY 20
+-- query catalogs
+-1S: select count(*) from pg_class where relname = 'hs_t1';
+ count
+-------
+ 1
+(1 row)
+-1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer;
+ dbid | content | role | preferred_role | mode | status
+------+---------+------+----------------+------+--------
+ 8 | -1 | m | m | s | u
+(1 row)
+-- checkpoint is allowed on standby but a restart point is created instead
+-1S: checkpoint;
+CHECKPOINT
+
+----------------------------------------------------------------
+-- Test: things that can't be done on a hot standby:
+-- no DML, DDL or anything that generates WAL.
+--
+-- More refer to regress test 'hs_standby_disallowed'.
+----------------------------------------------------------------
+-1S: insert into hs_t1 values(1);
+ERROR: cannot execute INSERT in a read-only transaction
+-1S: delete from hs_t1;
+ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress
+LINE 1: delete from hs_t1;
+ ^
+HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery.
+-1S: update hs_t1 set a = 0;
+ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress
+LINE 1: update hs_t1 set a = 0;
+ ^
+HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery.
+-1S: create table hs_t2(a int);
+ERROR: cannot execute CREATE TABLE in a read-only transaction
+-1S: create database hs_db;
+ERROR: cannot execute CREATE DATABASE in a read-only transaction
+-1S: vacuum hs_t1;
+ERROR: cannot execute VACUUM during recovery
+
+--
+-- No hintbit WAL generation in SELECT.
+--
+create table hs_nohintbit(a int) distributed by (a);
+CREATE
+insert into hs_nohintbit select generate_series (1, 10);
+INSERT 10
+-- flush the data to disk
+checkpoint;
+CHECKPOINT
+
+-1S: set gp_disable_tuple_hints=off;
+SET
+-- no WAL is being generated (otherwise an error would occur "cannot make new WAL entries during recovery")
+-1S: SELECT count(*) FROM hs_nohintbit;
+ count
+-------
+ 10
+(1 row)
+
diff --git a/src/test/isolation2/expected/hot_standby/faults.out b/src/test/isolation2/expected/hot_standby/faults.out
new file mode 100644
index 00000000000..39f3a06cca6
--- /dev/null
+++ b/src/test/isolation2/expected/hot_standby/faults.out
@@ -0,0 +1,326 @@
+-- Test system faults scenarios
+
+-- start_matchsubs
+--
+-- m/Is the server running on host.*/
+-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host and accepting/
+-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/
+-- s/(.*)/(seg IP:PORT)/
+-- m/ERROR: connection to dbid 1 .*:7000 failed .*/
+-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 :7000 failed/
+--
+-- end_matchsubs
+
+-- Let FTS detect/declare failure sooner
+!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly;
+(exited with code 0)
+!\retcode gpstop -u;
+(exited with code 0)
+
+create table hs_failover(a int);
+CREATE
+insert into hs_failover select * from generate_series(1,10);
+INSERT 10
+-1S: select * from hs_failover;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+
+----------------------------------------------------------------
+-- Mirror segment fails
+----------------------------------------------------------------
+select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm';
+ pg_ctl
+--------
+ OK
+(1 row)
+
+-- make sure mirror is detected down
+create temp table hs_tt(a int);
+CREATE
+select gp_request_fts_probe_scan();
+ gp_request_fts_probe_scan
+---------------------------
+ t
+(1 row)
+
+-- will not succeed
+-1S: select * from hs_failover;
+ERROR: Error on receive from seg1 slice1 127.0.1.1:7006 pid=26942: server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+-1Sq: ...
+
+-- recovery
+!\retcode gprecoverseg -aF;
+(exited with code 0)
+
+-- sync-up
+select wait_until_all_segments_synchronized();
+ wait_until_all_segments_synchronized
+--------------------------------------
+ OK
+(1 row)
+
+-- works now
+-1S: select * from hs_failover;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+ 1
+(10 rows)
+
+----------------------------------------------------------------
+-- Primary segment fails
+----------------------------------------------------------------
+-- inject a fault where the mirror gets out of recovery
+select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p';
+ pg_ctl
+--------
+ OK
+(1 row)
+select gp_request_fts_probe_scan();
+ gp_request_fts_probe_scan
+---------------------------
+ t
+(1 row)
+
+-- make sure failover happens
+select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
+ dbid | content | role | preferred_role | mode | status
+------+---------+------+----------------+------+--------
+ 3 | 1 | m | p | n | d
+ 6 | 1 | p | m | n | u
+(2 rows)
+select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p';
+ gp_wait_until_triggered_fault
+-------------------------------
+ Success:
+(1 row)
+select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- On an existing standby connection, query will run but it is dispatched to the previous mirror
+-- in an existing gang. That mirror is now a primary, so it will complain and the query fails.
+-1S: select * from hs_failover;
+ERROR: primary segments can only process MPP protocol messages from primary QD (seg1 slice1 127.0.1.1:7006 pid=14671)
+HINT: Exit the current session and re-connect.
+-1Sq: ...
+
+-- will fail due to downed mirror (previous primary)
+-1S: select * from hs_failover;
+ERROR: failed to acquire resources on one or more segments
+DETAIL: connection to server at "10.13.9.74", port 7003 failed: Connection refused
+ Is the server running on that host and accepting TCP/IP connections?
+ (seg1 10.13.9.74:7003)
+-1Sq: ...
+
+-- bring the downed mirror up
+!\retcode gprecoverseg -aF;
+(exited with code 0)
+select wait_until_all_segments_synchronized();
+ wait_until_all_segments_synchronized
+--------------------------------------
+ OK
+(1 row)
+
+-- mirror is up
+-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
+ dbid | content | role | preferred_role | mode | status
+------+---------+------+----------------+------+--------
+ 6 | 1 | p | m | s | u
+ 3 | 1 | m | p | s | u
+(2 rows)
+
+-- now the query will succeed
+-1S: select * from hs_failover;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+ 1
+(10 rows)
+-1Sq: ...
+
+-- re-balance, bring the segments to their preferred roles
+!\retcode gprecoverseg -ar;
+(exited with code 0)
+select wait_until_all_segments_synchronized();
+ wait_until_all_segments_synchronized
+--------------------------------------
+ OK
+(1 row)
+-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
+ dbid | content | role | preferred_role | mode | status
+------+---------+------+----------------+------+--------
+ 3 | 1 | p | p | s | u
+ 6 | 1 | m | m | s | u
+(2 rows)
+
+-- query runs fine still
+-1S: select * from hs_failover;
+ a
+----
+ 5
+ 6
+ 9
+ 10
+ 1
+ 2
+ 3
+ 4
+ 7
+ 8
+(10 rows)
+
+----------------------------------------------------------------
+-- DTX recovery
+----------------------------------------------------------------
+-- skip FTS probe to prevent unexpected mirror promotion
+1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1;
+ gp_inject_fault_infinite
+--------------------------
+ Success:
+(1 row)
+
+1: create table tt_hs_dtx(a int);
+CREATE
+
+-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process.
+select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault_infinite
+--------------------------
+ Success:
+(1 row)
+
+-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE
+1&: insert into tt_hs_dtx select * from generate_series(1,10);
+
+-- inject a panic on primary QD, essentially restarts the primary QD
+2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+2: select 1;
+PANIC: fault triggered, fault name:'before_read_command' fault type:'panic'
+server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+
+1<: <... completed>
+server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+1q: ...
+2q: ...
+
+-- standby QD can still run query
+-1S: select * from hs_failover;
+ a
+----
+ 1
+ 10
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+(10 rows)
+-- it cannot see rows from the in-doubt DTX
+-1S: select * from tt_hs_dtx;
+ a
+---
+(0 rows)
+
+-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx
+-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault_infinite
+--------------------------
+ Success:
+(1 row)
+-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_wait_until_triggered_fault
+-------------------------------
+ Success:
+(1 row)
+-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- standby should see the rows from the in-doubt DTX now
+-1S: select * from tt_hs_dtx;
+ a
+----
+ 1
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+
+-1S: select wait_until_all_segments_synchronized();
+ wait_until_all_segments_synchronized
+--------------------------------------
+ OK
+(1 row)
+1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1;
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
diff --git a/src/test/isolation2/expected/hot_standby/setup.out b/src/test/isolation2/expected/hot_standby/setup.out
new file mode 100644
index 00000000000..f8f1e02fe40
--- /dev/null
+++ b/src/test/isolation2/expected/hot_standby/setup.out
@@ -0,0 +1,14 @@
+-- setup for hot standby tests
+!\retcode gpconfig -c hot_standby -v on;
+(exited with code 0)
+-- let primary wait for standby to apply changes, make test less flaky
+!\retcode gpconfig -c synchronous_commit -v remote_apply;
+(exited with code 0)
+-- make it faster to handle query conflict
+!\retcode gpconfig -c max_standby_streaming_delay -v 1000;
+(exited with code 0)
+-- disable autovacuum, to not affect the manual VACUUM in the tests
+!\retcode gpconfig -c autovacuum -v off;
+(exited with code 0)
+!\retcode gpstop -ar;
+(exited with code 0)
diff --git a/src/test/isolation2/expected/hot_standby/teardown.out b/src/test/isolation2/expected/hot_standby/teardown.out
new file mode 100644
index 00000000000..8b4e1271610
--- /dev/null
+++ b/src/test/isolation2/expected/hot_standby/teardown.out
@@ -0,0 +1,9 @@
+-- reset the setup for hot standby tests
+!\retcode gpconfig -r hot_standby;
+(exited with code 0)
+!\retcode gpconfig -r synchronous_commit;
+(exited with code 0)
+!\retcode gpconfig -r max_standby_streaming_delay;
+(exited with code 0)
+!\retcode gpstop -ar;
+(exited with code 0)
diff --git a/src/test/isolation2/expected/hot_standby/transaction_isolation.out b/src/test/isolation2/expected/hot_standby/transaction_isolation.out
new file mode 100644
index 00000000000..3990bd7cd56
--- /dev/null
+++ b/src/test/isolation2/expected/hot_standby/transaction_isolation.out
@@ -0,0 +1,984 @@
+----------------------------------------------------------------
+-- Test transaction isolation in general, not specific to dtx
+----------------------------------------------------------------
+1: create table hs_tx(a int);
+CREATE
+1: insert into hs_tx select * from generate_series(1,10);
+INSERT 10
+
+1: begin;
+BEGIN
+1: insert into hs_tx select * from generate_series(11,20);
+INSERT 10
+2: begin;
+BEGIN
+2: insert into hs_tx select * from generate_series(21,30);
+INSERT 10
+2: abort;
+ABORT
+
+-- standby should only see completed transactions, not in-progress transactions, nor aborted transactions
+-1S: select * from hs_tx;
+ a
+----
+ 1
+ 5
+ 6
+ 9
+ 10
+ 2
+ 3
+ 4
+ 7
+ 8
+(10 rows)
+
+1: end;
+END
+-1S: select * from hs_tx;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 16
+ 18
+ 19
+ 1
+ 12
+ 15
+ 20
+ 5
+ 6
+ 9
+ 10
+ 11
+ 13
+ 14
+ 17
+(20 rows)
+
+----------------------------------------------------------------
+-- Test isolation between hot standby query and in-progress dtx
+----------------------------------------------------------------
+
+1: create table hs_dtx1(a int);
+CREATE
+1: create table hs_dtx2(a int);
+CREATE
+
+-- inject two suspend faults:
+-- 1. on seg0, suspend before PREPARE phase of 2PC
+1: select gp_inject_fault('qe_start_prepared', 'suspend',dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1&: insert into hs_dtx1 select * from generate_series(1,10);
+-- 2. on seg1, suspend before COMMIT phase of 2PC
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend',dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+2&: insert into hs_dtx2 select * from generate_series(1,10);
+
+-- standby should not see any rows from either dtx
+-1S: select * from hs_dtx1;
+ a
+---
+(0 rows)
+-1S: select * from hs_dtx2;
+ a
+---
+(0 rows)
+
+-- reset
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1<: <... completed>
+INSERT 10
+2<: <... completed>
+INSERT 10
+
+-- standby should see the results from the dtx now
+-1S: select * from hs_dtx1;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+-1S: select * from hs_dtx2;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+ 1
+(10 rows)
+
+----------------------------------------------------------------
+-- Test DTX abort that happens in different phases
+----------------------------------------------------------------
+
+1: create table hs_abort_dtx1(a int);
+CREATE
+1: create table hs_abort_dtx2(a int);
+CREATE
+
+-- inject two errors:
+-- 1. on seg0, error out before PREPARE phase of 2PC
+1: select gp_inject_fault('qe_start_prepared', 'error', dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1: insert into hs_abort_dtx1 select * from generate_series(1,10);
+ERROR: fault triggered, fault name:'qe_start_prepared' fault type:'error' (seg0 127.0.1.1:7002 pid=343)
+1: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-- 2. on seg1, error out before COMMIT phase of 2PC
+1: select gp_inject_fault('qe_start_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1: insert into hs_abort_dtx2 select * from generate_series(1,10);
+INSERT 10
+1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- standby should not see dtx1 which is aborted but should see dtx2 which is recovered
+-1S: select * from hs_abort_dtx1;
+ a
+---
+(0 rows)
+-1S: select * from hs_abort_dtx2;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+
+----------------------------------------------------------------
+-- Test isolation between hot standby query and in-progress dtx,
+-- but also run more queries in between
+----------------------------------------------------------------
+1: create table hs_dtx3(a int);
+CREATE
+
+-- inject faults to suspend segments in 2PC
+1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1&: insert into hs_dtx3 select * from generate_series(1,10);
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+2&: insert into hs_dtx3 select * from generate_series(11,20);
+
+-- standby should not see rows in the in-progress dtx
+-1S: select * from hs_dtx3;
+ a
+---
+(0 rows)
+
+-- now run some dtx and completed
+3: insert into hs_dtx3 values(99);
+INSERT 1
+3: create table hs_dtx4(a int);
+CREATE
+3: insert into hs_dtx4 select * from generate_series(1,10);
+INSERT 10
+
+-- standby should still not see rows in the in-progress DTX, but should see the completed ones
+-1S: select * from hs_dtx3;
+ a
+----
+ 99
+(1 row)
+-1S: select * from hs_dtx4;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+ 1
+(10 rows)
+
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1<: <... completed>
+INSERT 10
+2<: <... completed>
+INSERT 10
+
+-- standby should see all rows now
+-1S: select * from hs_dtx3;
+ a
+----
+ 1
+ 12
+ 15
+ 20
+ 2
+ 3
+ 4
+ 7
+ 8
+ 16
+ 18
+ 19
+ 99
+ 5
+ 6
+ 9
+ 10
+ 11
+ 13
+ 14
+ 17
+(21 rows)
+
+----------------------------------------------------------------
+-- Test isolation between standby QD and in-progress dtx,
+-- but after standby QD resets and gets running DTX from checkpoint.
+----------------------------------------------------------------
+1: create table hs_t5(a int, b text);
+CREATE
+1: create table hs_t6(a int, b text);
+CREATE
+
+-- inject fault to suspend a primary right before it conducts the commit phase of 2PC,
+-- so in the subsequent INSERT, all local transactions will be committed but the dtx is not.
+1: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1&: insert into hs_t5 select i, 'in-progress' from generate_series(1,10) i;
+
+-- now run some dtx and completed, and primary conducts a checkpoint
+2: insert into hs_t5 values(1, 'commited');
+INSERT 1
+2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i;
+INSERT 10
+2: begin;
+BEGIN
+2: insert into hs_t5 values(99, 'aborted');
+INSERT 1
+2: abort;
+ABORT
+2: checkpoint;
+CHECKPOINT
+
+-- now make the standby QD resets itself
+-1S: select gp_inject_fault('exec_simple_query_start', 'panic', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-1S: select 1;
+PANIC: fault triggered, fault name:'exec_simple_query_start' fault type:'panic'
+server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+-1Sq: ...
+
+-- standby should still not see rows in the in-progress DTX, but should see the completed ones
+-1S: select * from hs_t5;
+ a | b
+---+----------
+ 1 | commited
+(1 row)
+-1S: select * from hs_t6;
+ a | b
+----+-----------
+ 1 | committed
+ 2 | committed
+ 3 | committed
+ 4 | committed
+ 7 | committed
+ 8 | committed
+ 5 | committed
+ 6 | committed
+ 9 | committed
+ 10 | committed
+(10 rows)
+
+2: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1<: <... completed>
+INSERT 10
+
+-- standby should see all rows now
+-1S: select * from hs_t5;
+ a | b
+----+-------------
+ 1 | in-progress
+ 1 | commited
+ 5 | in-progress
+ 6 | in-progress
+ 9 | in-progress
+ 10 | in-progress
+ 2 | in-progress
+ 3 | in-progress
+ 4 | in-progress
+ 7 | in-progress
+ 8 | in-progress
+(11 rows)
+-1S: select * from hs_t6;
+ a | b
+----+-----------
+ 5 | committed
+ 6 | committed
+ 9 | committed
+ 10 | committed
+ 1 | committed
+ 2 | committed
+ 3 | committed
+ 4 | committed
+ 7 | committed
+ 8 | committed
+(10 rows)
+
+-- standby should correctly see more in-progress dtx on the primary.
+-- context: previously this would be fail because the standby updates latestCompletedGxid to the
+-- bumped nextGxid from checkpoint, which is too far (so that it thinks the new dtx already completed).
+1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1&: delete from hs_t5;
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+2&: delete from hs_t6;
+
+-- standby should not see the effect of the deletes
+-1S: select * from hs_t5;
+ a | b
+----+-------------
+ 2 | in-progress
+ 3 | in-progress
+ 4 | in-progress
+ 7 | in-progress
+ 8 | in-progress
+ 1 | in-progress
+ 1 | commited
+ 5 | in-progress
+ 6 | in-progress
+ 9 | in-progress
+ 10 | in-progress
+(11 rows)
+-1S: select * from hs_t6;
+ a | b
+----+-----------
+ 1 | committed
+ 2 | committed
+ 3 | committed
+ 4 | committed
+ 7 | committed
+ 8 | committed
+ 5 | committed
+ 6 | committed
+ 9 | committed
+ 10 | committed
+(10 rows)
+
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+1<: <... completed>
+DELETE 11
+2<: <... completed>
+DELETE 10
+
+-- standby now see those deletes
+-1S: select * from hs_t5;
+ a | b
+---+---
+(0 rows)
+-1S: select * from hs_t6;
+ a | b
+---+---
+(0 rows)
+
+----------------------------------------------------------------
+-- Read-committed isolation: query on hot standby should not see dtx that completed after it
+-- created distributed snapshot, but should see dtx that completed before that.
+----------------------------------------------------------------
+
+1: create table hs_rc(a int);
+CREATE
+1: insert into hs_rc select * from generate_series(1,10);
+INSERT 10
+
+-- case 1: suspend SELECT on the standby QD right after it created snapshot
+-1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-1S&: select * from hs_rc;
+
+-- new INSERT or DELETE won't be observed by the standby
+1: insert into hs_rc select * from generate_series(11,20);
+INSERT 10
+1: delete from hs_rc where a < 5;
+DELETE 4
+1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- should only see the rows at the time when SELECT started (1...10).
+-1S<: <... completed>
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+
+-- SELECT again, should see the effect from the INSERT and DELETE now
+-1S: select * from hs_rc;
+ a
+----
+ 12
+ 15
+ 20
+ 7
+ 8
+ 16
+ 18
+ 19
+ 5
+ 6
+ 9
+ 10
+ 11
+ 13
+ 14
+ 17
+(16 rows)
+
+-- case 2: suspend SELECT on the standby QD before creating snapshot
+-1S: select gp_inject_fault('select_before_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-1S&: select * from hs_rc;
+
+1: insert into hs_rc select * from generate_series(21,30);
+INSERT 10
+1: delete from hs_rc where a < 21;
+DELETE 16
+1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- standby should see the effect of the INSERT and DELETE
+-1S<: <... completed>
+ a
+----
+ 23
+ 26
+ 30
+ 22
+ 24
+ 27
+ 29
+ 21
+ 25
+ 28
+(10 rows)
+
+----------------------------------------------------------------
+-- Read-committed isolation in the BEGIN...END block
+----------------------------------------------------------------
+
+1: truncate hs_rc;
+TRUNCATE
+1: insert into hs_rc select * from generate_series(1,30);
+INSERT 30
+
+-1S: begin;
+BEGIN
+-1S: select count(*) from hs_rc;
+ count
+-------
+ 30
+(1 row)
+
+-- have some concurrent sessions on primary QD:
+-- 1. a completed transaction
+1: delete from hs_rc where a <= 10;
+DELETE 10
+-- 3. an aborted transaction
+2: begin;
+BEGIN
+2: delete from hs_rc where a > 10 and a <= 20;
+DELETE 10
+2: abort;
+ABORT
+-- 3. an ongoing transaction
+3: begin;
+BEGIN
+3: delete from hs_rc where a > 20 and a <= 30;
+DELETE 10
+
+-- the standby should see results accordingly
+-1S: select * from hs_rc;
+ a
+----
+ 12
+ 15
+ 20
+ 23
+ 26
+ 30
+ 11
+ 13
+ 14
+ 17
+ 21
+ 25
+ 28
+ 16
+ 18
+ 19
+ 22
+ 24
+ 27
+ 29
+(20 rows)
+-1S: end;
+END
+
+3: end;
+END
+-1S: select * from hs_rc;
+ a
+----
+ 12
+ 15
+ 20
+ 11
+ 13
+ 14
+ 17
+ 16
+ 18
+ 19
+(10 rows)
+
+----------------------------------------------------------------
+-- Repeatable-read isolation: distributed snapshot is created at time of the
+-- first query in transaction block. All queries in the transaction block
+-- should only see results committed before the distributed snapshot creation.
+----------------------------------------------------------------
+
+1: create table hs_rr(a int);
+CREATE
+1: insert into hs_rr select * from generate_series(1,10);
+INSERT 10
+
+-1S: begin isolation level repeatable read;
+BEGIN
+-- should see 10
+-1S: select count(*) from hs_rr;
+ count
+-------
+ 10
+(1 row)
+
+-- do some more INSERT, DELETE and UPDATE
+1: insert into hs_rr select * from generate_series(11,20);
+INSERT 10
+1: delete from hs_rr where a <= 10;
+DELETE 10
+1: update hs_rr set a = a + 100;
+UPDATE 10
+
+-- should still the initial rows {1...10}
+-1S: select * from hs_rr;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+-1S: end;
+END
+
+-- should see the results from the INSERT, DELETE and UPDATE
+-1S: begin isolation level repeatable read;
+BEGIN
+-1S: select * from hs_rr;
+ a
+-----
+ 115
+ 120
+ 118
+ 113
+ 114
+ 112
+ 116
+ 119
+ 111
+ 117
+(10 rows)
+
+-- standby won't see ongoing or aborted transactions either
+1: begin;
+BEGIN
+1: insert into hs_rr select * from generate_series(1,10);
+INSERT 10
+2: begin;
+BEGIN
+2: insert into hs_rr select * from generate_series(1,10);
+INSERT 10
+2: abort;
+ABORT
+
+-1S: select * from hs_rr;
+ a
+-----
+ 114
+ 115
+ 120
+ 118
+ 113
+ 112
+ 116
+ 119
+ 111
+ 117
+(10 rows)
+
+1: end;
+END
+-1S: end;
+END
+
+----------------------------------------------------------------
+-- Transaction isolation is respected in subtransactions too
+----------------------------------------------------------------
+
+1: create table hs_subtrx(a int);
+CREATE
+
+-- (1) read-committed
+-1S: begin;
+BEGIN
+-1S: select count(*) from hs_subtrx;
+ count
+-------
+ 0
+(1 row)
+-1S: savepoint s1;
+SAVEPOINT
+
+1: insert into hs_subtrx select * from generate_series(1,10);
+INSERT 10
+
+-1S: select count(*) from hs_subtrx;
+ count
+-------
+ 10
+(1 row)
+-1S: savepoint s2;
+SAVEPOINT
+-1S: select count(*) from hs_subtrx;
+ count
+-------
+ 10
+(1 row)
+-1S: rollback to savepoint s1;
+ROLLBACK
+-1S: select count(*) from hs_subtrx;
+ count
+-------
+ 10
+(1 row)
+-1S: end;
+END
+
+-- (2) repeatable-read
+-1S: begin isolation level repeatable read;
+BEGIN
+-1S: select * from hs_subtrx;
+ a
+----
+ 1
+ 2
+ 3
+ 4
+ 7
+ 8
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+-1S: savepoint s1;
+SAVEPOINT
+
+1: insert into hs_subtrx select * from generate_series(11,20);
+INSERT 10
+1: delete from hs_subtrx where a <= 10;
+DELETE 10
+1: update hs_subtrx set a = a + 100;
+UPDATE 10
+
+-1S: select * from hs_subtrx;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+-1S: savepoint s2;
+SAVEPOINT
+-1S: select * from hs_subtrx;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+-1S: rollback to savepoint s1;
+ROLLBACK
+-1S: select * from hs_subtrx;
+ a
+----
+ 2
+ 3
+ 4
+ 7
+ 8
+ 1
+ 5
+ 6
+ 9
+ 10
+(10 rows)
+-1S: end;
+END
+-1S: select * from hs_subtrx;
+ a
+-----
+ 114
+ 115
+ 120
+ 118
+ 113
+ 112
+ 116
+ 119
+ 111
+ 117
+(10 rows)
+
+----------------------------------------------------------------
+-- Various isolation tests that involve AO/CO table.
+----------------------------------------------------------------
+1: create table hs_ao(a int, id int unique) using ao_row;
+CREATE
+1: insert into hs_ao select 1,i from generate_series(1,10) i;
+INSERT 10
+1: begin;
+BEGIN
+1: insert into hs_ao select 2,i from generate_series(11,20) i;
+INSERT 10
+
+-- standby sees the same AO metadata as primary
+2: select * from gp_toolkit.__gp_aoseg('hs_ao');
+ segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state
+------------+-------+-----+----------+---------------+------------------+----------+---------------+-------
+ 0 | 1 | 128 | 5 | 1 | 128 | 1 | 3 | 1
+ 1 | 1 | 40 | 1 | 1 | 40 | 1 | 3 | 1
+ 2 | 1 | 104 | 4 | 1 | 104 | 1 | 3 | 1
+(3 rows)
+-1S: select * from gp_toolkit.__gp_aoseg('hs_ao');
+ segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state
+------------+-------+-----+----------+---------------+------------------+----------+---------------+-------
+ 0 | 1 | 128 | 5 | 1 | 128 | 1 | 3 | 1
+ 1 | 1 | 40 | 1 | 1 | 40 | 1 | 3 | 1
+ 2 | 1 | 104 | 4 | 1 | 104 | 1 | 3 | 1
+(3 rows)
+2: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id');
+ tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count
+---------+-------+----------------+----------+--------------+-------------+-----------
+ (0,2) | 1 | 0 | 0 | 1 | 0 | 4
+ (0,2) | 1 | 0 | 0 | 1 | 0 | 1
+ (0,2) | 1 | 0 | 0 | 1 | 0 | 5
+(3 rows)
+-1S: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id');
+ tupleid | segno | columngroup_no | entry_no | first_row_no | file_offset | row_count
+---------+-------+----------------+----------+--------------+-------------+-----------
+ (0,2) | 1 | 0 | 0 | 1 | 0 | 5
+ (0,2) | 1 | 0 | 0 | 1 | 0 | 1
+ (0,2) | 1 | 0 | 0 | 1 | 0 | 4
+(3 rows)
+
+-- standby sees correct table data
+-1S: select * from hs_ao;
+ a | id
+---+----
+ 1 | 2
+ 1 | 3
+ 1 | 4
+ 1 | 7
+ 1 | 8
+ 1 | 1
+ 1 | 5
+ 1 | 6
+ 1 | 9
+ 1 | 10
+(10 rows)
+
+-- standby sees the effect of vacuum
+1: end;
+END
+1: delete from hs_ao where a = 1;
+DELETE 10
+1: vacuum hs_ao;
+VACUUM
+1: select * from gp_toolkit.__gp_aoseg('hs_ao');
+ segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state
+------------+-------+-----+----------+---------------+------------------+----------+---------------+-------
+ 2 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1
+ 2 | 2 | 104 | 4 | 1 | 104 | 0 | 3 | 1
+ 0 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1
+ 0 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1
+ 1 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1
+ 1 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1
+(6 rows)
+-1S: select * from gp_toolkit.__gp_aoseg('hs_ao');
+ segment_id | segno | eof | tupcount | varblockcount | eof_uncompressed | modcount | formatversion | state
+------------+-------+-----+----------+---------------+------------------+----------+---------------+-------
+ 2 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1
+ 2 | 2 | 104 | 4 | 1 | 104 | 0 | 3 | 1
+ 0 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1
+ 0 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1
+ 1 | 1 | 0 | 0 | 0 | 0 | 3 | 3 | 1
+ 1 | 2 | 88 | 3 | 1 | 88 | 0 | 3 | 1
+(6 rows)
+-1S: select * from hs_ao;
+ a | id
+---+----
+ 2 | 11
+ 2 | 13
+ 2 | 14
+ 2 | 17
+ 2 | 12
+ 2 | 15
+ 2 | 20
+ 2 | 16
+ 2 | 18
+ 2 | 19
+(10 rows)
diff --git a/src/test/isolation2/hot_standby_schedule b/src/test/isolation2/hot_standby_schedule
new file mode 100644
index 00000000000..73e0f71a84c
--- /dev/null
+++ b/src/test/isolation2/hot_standby_schedule
@@ -0,0 +1,6 @@
+test: hot_standby/setup
+test: hot_standby/basic
+test: hot_standby/transaction_isolation
+test: hot_standby/query_conflict
+test: hot_standby/faults
+test: hot_standby/teardown
diff --git a/src/test/isolation2/input/hot_standby/query_conflict.source b/src/test/isolation2/input/hot_standby/query_conflict.source
new file mode 100644
index 00000000000..5f2aee3be53
--- /dev/null
+++ b/src/test/isolation2/input/hot_standby/query_conflict.source
@@ -0,0 +1,225 @@
+-- Tests for query conflict detection and cancellation on the hot standby.
+
+----------------------------------------------------------------
+-- Various query conflcit cases for hot standy.
+--
+-- All cases are written in this pattern:
+-- 1. Start a standby transaction that will be conflicted and cancelled;
+-- 2. Start a primary transaction that will conflict it;
+-- 3. Commit the primary transaction. Since we are using remote_apply, it will
+-- wait until the WAL is applied on the standby, which would happen only
+-- after the standby query is cancelled;
+-- 4. Run something on the standby transaction and see the conflict error, which
+-- in some cases it's ERROR, in others it's FATAL.
+-- 5. Quit, establish a new connection, and re-run
+-- 6. Check the system view gp_stat_database_conflicts to see that the conflict
+-- has been recorded. Note that we print the max count among all segments
+-- to avoid flakiness.
+-- See https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT for more details.
+----------------------------------------------------------------
+
+-- We assume we start the test with clean records
+-1S: select max(confl_tablespace), max(confl_lock), max(confl_snapshot), max(confl_bufferpin), max(confl_deadlock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+---------------------------------------------------------------------
+-- Conflict with explicit lock
+---------------------------------------------------------------------
+create table hs_qc_lock(a int);
+insert into hs_qc_lock select * from generate_series(1,5);
+-1S: begin;
+-1S: select * from hs_qc_lock;
+1: begin;
+1: lock table hs_qc_lock in access exclusive mode;
+1: end;
+-1S: select * from hs_qc_lock;
+-1Sq:
+-1S: select * from hs_qc_lock;
+-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+---------------------------------------------------------------------
+-- Conflict with implicit lock
+---------------------------------------------------------------------
+-1S: begin;
+-1S: select * from hs_qc_lock;
+1: alter table hs_qc_lock set access method ao_row;
+-1S: select * from hs_qc_lock;
+-1Sq:
+-1S: select * from hs_qc_lock;
+-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+---------------------------------------------------------------------
+-- Conflict with drop database
+---------------------------------------------------------------------
+1: create database hs_qc_dropdb;
+-1Sq:
+-1S:@db_name hs_qc_dropdb: select 1;
+1: drop database hs_qc_dropdb;
+-1S: select 1;
+-1Sq:
+-- Stats aren't counted for database conflicts. See: pgstat_recv_recoveryconflict
+
+---------------------------------------------------------------------
+-- Conflict with VACUUM (snapshot)
+---------------------------------------------------------------------
+1: create table hs_qc_vac1(a int);
+1: insert into hs_qc_vac1 select * from generate_series(1,10);
+-1S: begin transaction isolation level repeatable read;
+-1S: select count(*) from hs_qc_vac1;
+1: delete from hs_qc_vac1;
+1: vacuum hs_qc_vac1;
+-1S: select count(*) from hs_qc_vac1;
+-1Sq:
+-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+---------------------------------------------------------------------
+-- Conflict with VACUUM (buffer pin)
+-- VACUUM of page that the standby is still holding buffer pin on, the difference with
+-- the previous case is that here the deleted row is already invisible to the standby.
+---------------------------------------------------------------------
+1: create table hs_qc_vac2(a int);
+1: insert into hs_qc_vac2 values(2);
+1: delete from hs_qc_vac2;
+-- run select once on the standby, so the next select will fetch data from buffer
+-1S: select * from hs_qc_vac2;
+-- suspend the standby at where it just unlocks the buffer but still holds the pin
+1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'suspend','','','hs_qc_vac2',1,1,0,dbid) from gp_segment_configuration where content=0 and role='m';
+-- we'll also make sure the startup process has sent out the signal before we let the standby backend release the pin
+1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'skip',dbid) from gp_segment_configuration where content=0 and role='m';
+-1S&: select * from hs_qc_vac2;
+1: vacuum hs_qc_vac2;
+-- as mentioned before, make sure startup process has sent the signal, and then let the standby proceed
+1: select gp_wait_until_triggered_fault('recovery_conflict_bufferpin_signal_sent', 1,dbid) from gp_segment_configuration where content=0 and role='m';
+1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'reset',dbid) from gp_segment_configuration where content=0 and role='m';
+1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'reset',dbid) from gp_segment_configuration where content=0 and role='m';
+-- should see the conflict
+-1S<:
+-1Sq:
+-- XXX: sometimes it shows the number is 2 instead of 1. It still validates the test but it would be nice to know why.
+-1S: select max(confl_bufferpin) > 0 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+---------------------------------------------------------------------
+-- Conflict with drop (temp) tablespace
+-- Note: regular user tablespaces won't cause conflict on the standby since the standby cannot create any objects under them.
+---------------------------------------------------------------------
+-- create tablespace
+!\retcode mkdir -p @testtablespace@/hs_tablespace_directory;
+create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory';
+
+-- some prepartion on the primary
+create table hs_ts_foo (i int, j int) distributed by(i);
+insert into hs_ts_foo select i, i from generate_series(1,800000)i;
+analyze hs_ts_foo;
+
+-- make sure the standby won't run too fast and delete the temp files
+select gp_inject_fault('after_open_temp_file', 'suspend',dbid) from gp_segment_configuration where content=1 and role='m';
+
+-- on the standby, run some query that requires workfile, this example is taken
+-- from regress/temp_tablespaces test
+-1S: set temp_tablespaces = hs_ts;
+-1S: set default_tablespace = hs_ts;
+-1S: set statement_mem='2MB';
+-1S&: with a1 as (select * from hs_ts_foo), a2 as (select * from hs_ts_foo) select a1.i xx from a1 inner join a2 on a2.i = a1.i union all select count(a1.i) from a1 inner join a2 on a2.i = a1.i order by xx limit 5;
+
+-- drop tablespace, should see conflict on the hot standby
+drop tablespace hs_ts;
+select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_configuration where content=1 and role='m';
+-1S<:
+-1Sq:
+
+-- conflict has been recorded. The query has multiple slices
+-1S: select max(confl_tablespace) >= 1 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+-- cleanup
+!\retcode rm -rf @testtablespace@/hs_tablespace_directory;
+-- Do one checkpoint. Otherwise if server restarts w/o doing checkpoint (some subsequent
+-- tests might do that), the server would complain it cannot find the directory for hs_ts.
+checkpoint;
+
+----------------------------------------------------------------
+-- Additional case to show that distributed transaction is not taken into
+-- account w/o the help of restore-point-based distributed snapshot creation.
+----------------------------------------------------------------
+
+1: create table hs_qc_ds1(a int);
+1: insert into hs_qc_ds1 select * from generate_series(1,10);
+-- standby starts a repeatable read transaction, runs a local query that
+-- creates a distributed snapshot w/o creating QE.
+-1S: select count(*) from hs_qc_ds1;
+-1S: begin transaction isolation level repeatable read;
+-1S: select relname from pg_class where relname = 'hs_qc_ds1';
+-- primary runs VACUUM
+1: delete from hs_qc_ds1;
+1: vacuum hs_qc_ds1;
+-- The standby query in theory should be cancelled, because it started before
+-- the VACUUM. But in reality, it doesn't, and sees 0 rows, because the QE for the
+-- SELECT below will create more recent local snapshot that does not conflict with
+-- the VACUUM, and sees the result of DELETE+VACUUM.
+-- Note: with the help of restore point, we would be able to create local snapshot
+-- precisely corresponding to each distributed snapshot, and do conflict detection accordingly.
+-1S: select count(*) from hs_qc_ds1;
+-1S: end;
+
+----------------------------------------------------------------
+-- Test GUC hot_standby_feedback
+----------------------------------------------------------------
+!\retcode gpconfig -c hot_standby_feedback -v on;
+!\retcode gpstop -u;
+
+1: create table hs_qc_guc1(a int);
+1: insert into hs_qc_guc1 select * from generate_series(1,10);
+
+-1S: begin transaction isolation level repeatable read;
+-1S: select * from hs_qc_guc1;
+
+-- VACUUM won't cleanup this table since the standby still sees it
+1: delete from hs_qc_guc1;
+1: vacuum hs_qc_guc1;
+
+-- hot standby can still see those rows
+-1S: select * from hs_qc_guc1;
+
+-- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table
+-1S: end;
+1: vacuum hs_qc_guc1;
+-1S: select * from hs_qc_guc1;
+-1Sq:
+
+!\retcode gpconfig -r hot_standby_feedback;
+!\retcode gpstop -u;
+
+----------------------------------------------------------------
+-- Test GUC vacuum_defer_cleanup_age
+----------------------------------------------------------------
+-- Use a GUC value that's not 0, so VACUUM does not clean up
+-- recent dead rows that the hot standby might be still seeing.
+!\retcode gpconfig -c vacuum_defer_cleanup_age -v 1;
+!\retcode gpstop -u;
+
+1: create table hs_qc_guc2(a int);
+1: insert into hs_qc_guc2 select * from generate_series(1,10);
+
+-1S: begin transaction isolation level repeatable read;
+-1S: select count(*) from hs_qc_guc2;
+
+-- VACUUM won't cleanup this table since the DELETE is still within vacuum_defer_cleanup_age
+1: delete from hs_qc_guc2;
+1: vacuum hs_qc_guc2;
+
+-- showing all rows are deleted but not vacuumed
+1: select count(*) from hs_qc_guc2;
+1: set gp_select_invisible to on;
+1: select count(*) from hs_qc_guc2;
+
+-- hot standby can still query the table
+-1S: select count(*) from hs_qc_guc2;
+
+-- only if the age is reached, hot standby will see the same conflict as before
+1: create temp table tt1(a int);
+1: vacuum hs_qc_guc2;
+-1S: select count(*) from hs_qc_guc2;
+-1Sq:
+-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+
+!\retcode gpconfig -r vacuum_defer_cleanup_age;
+!\retcode gpstop -u;
+
diff --git a/src/test/isolation2/output/hot_standby/query_conflict.source b/src/test/isolation2/output/hot_standby/query_conflict.source
new file mode 100644
index 00000000000..909d2532df3
--- /dev/null
+++ b/src/test/isolation2/output/hot_standby/query_conflict.source
@@ -0,0 +1,470 @@
+-- Tests for query conflict detection and cancellation on the hot standby.
+
+----------------------------------------------------------------
+-- Various query conflcit cases for hot standy.
+--
+-- All cases are written in this pattern:
+-- 1. Start a standby transaction that will be conflicted and cancelled;
+-- 2. Start a primary transaction that will conflict it;
+-- 3. Commit the primary transaction. Since we are using remote_apply, it will
+-- wait until the WAL is applied on the standby, which would happen only
+-- after the standby query is cancelled;
+-- 4. Run something on the standby transaction and see the conflict error, which
+-- in some cases it's ERROR, in others it's FATAL.
+-- 5. Quit, establish a new connection, and re-run
+-- 6. Check the system view gp_stat_database_conflicts to see that the conflict
+-- has been recorded. Note that we print the max count among all segments
+-- to avoid flakiness.
+-- See https://www.postgresql.org/docs/12/hot-standby.html#HOT-STANDBY-CONFLICT for more details.
+----------------------------------------------------------------
+
+-- We assume we start the test with clean records
+-1S: select max(confl_tablespace), max(confl_lock), max(confl_snapshot), max(confl_bufferpin), max(confl_deadlock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ max | max | max | max | max
+-----+-----+-----+-----+-----
+ 0 | 0 | 0 | 0 | 0
+(1 row)
+
+---------------------------------------------------------------------
+-- Conflict with explicit lock
+---------------------------------------------------------------------
+create table hs_qc_lock(a int);
+CREATE
+insert into hs_qc_lock select * from generate_series(1,5);
+INSERT 5
+-1S: begin;
+BEGIN
+-1S: select * from hs_qc_lock;
+ a
+---
+ 2
+ 3
+ 4
+ 1
+ 5
+(5 rows)
+1: begin;
+BEGIN
+1: lock table hs_qc_lock in access exclusive mode;
+LOCK
+1: end;
+END
+-1S: select * from hs_qc_lock;
+FATAL: terminating connection due to conflict with recovery
+DETAIL: User was holding a relation lock for too long.
+HINT: In a moment you should be able to reconnect to the database and repeat your command.
+server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+-1Sq: ...
+-1S: select * from hs_qc_lock;
+ a
+---
+ 1
+ 5
+ 2
+ 3
+ 4
+(5 rows)
+-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ max
+-----
+ 1
+(1 row)
+
+---------------------------------------------------------------------
+-- Conflict with implicit lock
+---------------------------------------------------------------------
+-1S: begin;
+BEGIN
+-1S: select * from hs_qc_lock;
+ a
+---
+ 1
+ 5
+ 2
+ 3
+ 4
+(5 rows)
+1: alter table hs_qc_lock set access method ao_row;
+ALTER
+-1S: select * from hs_qc_lock;
+FATAL: terminating connection due to conflict with recovery
+DETAIL: User was holding a relation lock for too long.
+HINT: In a moment you should be able to reconnect to the database and repeat your command.
+server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+-1Sq: ...
+-1S: select * from hs_qc_lock;
+ a
+---
+ 1
+ 5
+ 2
+ 3
+ 4
+(5 rows)
+-1S: select max(confl_lock) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ max
+-----
+ 2
+(1 row)
+
+---------------------------------------------------------------------
+-- Conflict with drop database
+---------------------------------------------------------------------
+1: create database hs_qc_dropdb;
+CREATE
+-1Sq: ...
+-1S:@db_name hs_qc_dropdb: select 1;
+ ?column?
+----------
+ 1
+(1 row)
+1: drop database hs_qc_dropdb;
+DROP
+-1S: select 1;
+FATAL: terminating connection due to conflict with recovery
+DETAIL: User was connected to a database that must be dropped.
+server closed the connection unexpectedly
+ This probably means the server terminated abnormally
+ before or while processing the request.
+-1Sq: ...
+-- Stats aren't counted for database conflicts. See: pgstat_recv_recoveryconflict
+
+---------------------------------------------------------------------
+-- Conflict with VACUUM (snapshot)
+---------------------------------------------------------------------
+1: create table hs_qc_vac1(a int);
+CREATE
+1: insert into hs_qc_vac1 select * from generate_series(1,10);
+INSERT 10
+-1S: begin transaction isolation level repeatable read;
+BEGIN
+-1S: select count(*) from hs_qc_vac1;
+ count
+-------
+ 10
+(1 row)
+1: delete from hs_qc_vac1;
+DELETE 10
+1: vacuum hs_qc_vac1;
+VACUUM
+-1S: select count(*) from hs_qc_vac1;
+DETAIL: User query might have needed to see row versions that must be removed.
+ERROR: terminating connection due to conflict with recovery
+HINT: In a moment you should be able to reconnect to the database and repeat your command.
+-1Sq: ...
+-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ max
+-----
+ 1
+(1 row)
+
+---------------------------------------------------------------------
+-- Conflict with VACUUM (buffer pin)
+-- VACUUM of page that the standby is still holding buffer pin on, the difference with
+-- the previous case is that here the deleted row is already invisible to the standby.
+---------------------------------------------------------------------
+1: create table hs_qc_vac2(a int);
+CREATE
+1: insert into hs_qc_vac2 values(2);
+INSERT 1
+1: delete from hs_qc_vac2;
+DELETE 1
+-- run select once on the standby, so the next select will fetch data from buffer
+-1S: select * from hs_qc_vac2;
+ a
+---
+(0 rows)
+-- suspend the standby at where it just unlocks the buffer but still holds the pin
+1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'suspend','','','hs_qc_vac2',1,1,0,dbid) from gp_segment_configuration where content=0 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-- we'll also make sure the startup process has sent out the signal before we let the standby backend release the pin
+1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'skip',dbid) from gp_segment_configuration where content=0 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-1S&: select * from hs_qc_vac2;
+1: vacuum hs_qc_vac2;
+VACUUM
+-- as mentioned before, make sure startup process has sent the signal, and then let the standby proceed
+1: select gp_wait_until_triggered_fault('recovery_conflict_bufferpin_signal_sent', 1,dbid) from gp_segment_configuration where content=0 and role='m';
+ gp_wait_until_triggered_fault
+-------------------------------
+ Success:
+(1 row)
+1: select gp_inject_fault('recovery_conflict_bufferpin_signal_sent', 'reset',dbid) from gp_segment_configuration where content=0 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+1: select gp_inject_fault('heapgetpage_after_unlock_buffer', 'reset',dbid) from gp_segment_configuration where content=0 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-- should see the conflict
+-1S<: <... completed>
+ERROR: canceling statement due to conflict with recovery (seg0 slice1 127.0.1.1:7005 pid=17044)
+DETAIL: User was holding shared buffer pin for too long.
+-1Sq: ...
+-- XXX: sometimes it shows the number is 2 instead of 1. It still validates the test but it would be nice to know why.
+-1S: select max(confl_bufferpin) > 0 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ ?column?
+----------
+ t
+(1 row)
+
+---------------------------------------------------------------------
+-- Conflict with drop (temp) tablespace
+-- Note: regular user tablespaces won't cause conflict on the standby since the standby cannot create any objects under them.
+---------------------------------------------------------------------
+-- create tablespace
+!\retcode mkdir -p @testtablespace@/hs_tablespace_directory;
+(exited with code 0)
+create tablespace hs_ts location '@testtablespace@/hs_tablespace_directory';
+CREATE
+
+-- some prepartion on the primary
+create table hs_ts_foo (i int, j int) distributed by(i);
+CREATE
+insert into hs_ts_foo select i, i from generate_series(1,800000)i;
+INSERT 800000
+analyze hs_ts_foo;
+ANALYZE
+
+-- make sure the standby won't run too fast and delete the temp files
+select gp_inject_fault('after_open_temp_file', 'suspend',dbid) from gp_segment_configuration where content=1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+
+-- on the standby, run some query that requires workfile, this example is taken
+-- from regress/temp_tablespaces test
+-1S: set temp_tablespaces = hs_ts;
+SET
+-1S: set default_tablespace = hs_ts;
+SET
+-1S: set statement_mem='2MB';
+SET
+-1S&: with a1 as (select * from hs_ts_foo), a2 as (select * from hs_ts_foo) select a1.i xx from a1 inner join a2 on a2.i = a1.i union all select count(a1.i) from a1 inner join a2 on a2.i = a1.i order by xx limit 5;
+
+-- drop tablespace, should see conflict on the hot standby
+drop tablespace hs_ts;
+DROP
+select gp_inject_fault('after_open_temp_file', 'reset',dbid) from gp_segment_configuration where content=1 and role='m';
+ gp_inject_fault
+-----------------
+ Success:
+(1 row)
+-1S<: <... completed>
+ERROR: canceling statement due to conflict with recovery (seg1 slice3 127.0.1.1:7006 pid=990)
+DETAIL: User was or might have been using tablespace that must be dropped.
+-1Sq: ...
+
+-- conflict has been recorded. The query has multiple slices
+-1S: select max(confl_tablespace) >= 1 from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ ?column?
+----------
+ t
+(1 row)
+
+-- cleanup
+!\retcode rm -rf @testtablespace@/hs_tablespace_directory;
+GP_IGNORE:-- start_ignore
+GP_IGNORE:
+GP_IGNORE:-- end_ignore
+(exited with code 0)
+-- Do one checkpoint. Otherwise if server restarts w/o doing checkpoint (some subsequent
+-- tests might do that), the server would complain it cannot find the directory for hs_ts.
+checkpoint;
+CHECKPOINT
+
+----------------------------------------------------------------
+-- Additional case to show that distributed transaction is not taken into
+-- account w/o the help of restore-point-based distributed snapshot creation.
+----------------------------------------------------------------
+
+1: create table hs_qc_ds1(a int);
+CREATE
+1: insert into hs_qc_ds1 select * from generate_series(1,10);
+INSERT 10
+-- standby starts a repeatable read transaction, runs a local query that
+-- creates a distributed snapshot w/o creating QE.
+-1S: select count(*) from hs_qc_ds1;
+ count
+-------
+ 10
+(1 row)
+-1S: begin transaction isolation level repeatable read;
+BEGIN
+-1S: select relname from pg_class where relname = 'hs_qc_ds1';
+ relname
+-----------
+ hs_qc_ds1
+(1 row)
+-- primary runs VACUUM
+1: delete from hs_qc_ds1;
+DELETE 10
+1: vacuum hs_qc_ds1;
+VACUUM
+-- The standby query in theory should be cancelled, because it started before
+-- the VACUUM. But in reality, it doesn't, and sees 0 rows, because the QE for the
+-- SELECT below will create more recent local snapshot that does not conflict with
+-- the VACUUM, and sees the result of DELETE+VACUUM.
+-- Note: with the help of restore point, we would be able to create local snapshot
+-- precisely corresponding to each distributed snapshot, and do conflict detection accordingly.
+-1S: select count(*) from hs_qc_ds1;
+ count
+-------
+ 0
+(1 row)
+-1S: end;
+END
+
+----------------------------------------------------------------
+-- Test GUC hot_standby_feedback
+----------------------------------------------------------------
+!\retcode gpconfig -c hot_standby_feedback -v on;
+(exited with code 0)
+!\retcode gpstop -u;
+(exited with code 0)
+
+1: create table hs_qc_guc1(a int);
+CREATE
+1: insert into hs_qc_guc1 select * from generate_series(1,10);
+INSERT 10
+
+-1S: begin transaction isolation level repeatable read;
+BEGIN
+-1S: select * from hs_qc_guc1;
+ a
+----
+ 1
+ 10
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+(10 rows)
+
+-- VACUUM won't cleanup this table since the standby still sees it
+1: delete from hs_qc_guc1;
+DELETE 10
+1: vacuum hs_qc_guc1;
+VACUUM
+
+-- hot standby can still see those rows
+-1S: select * from hs_qc_guc1;
+ a
+----
+ 1
+ 10
+ 2
+ 3
+ 4
+ 5
+ 6
+ 7
+ 8
+ 9
+(10 rows)
+
+-- after the conflicting read transaction ends, the next VACUUM will successfully vacuum the table
+-1S: end;
+END
+1: vacuum hs_qc_guc1;
+VACUUM
+-1S: select * from hs_qc_guc1;
+ a
+---
+(0 rows)
+-1Sq: ...
+
+!\retcode gpconfig -r hot_standby_feedback;
+(exited with code 0)
+!\retcode gpstop -u;
+(exited with code 0)
+
+----------------------------------------------------------------
+-- Test GUC vacuum_defer_cleanup_age
+----------------------------------------------------------------
+-- Use a GUC value that's not 0, so VACUUM does not clean up
+-- recent dead rows that the hot standby might be still seeing.
+!\retcode gpconfig -c vacuum_defer_cleanup_age -v 1;
+(exited with code 0)
+!\retcode gpstop -u;
+(exited with code 0)
+
+1: create table hs_qc_guc2(a int);
+CREATE
+1: insert into hs_qc_guc2 select * from generate_series(1,10);
+INSERT 10
+
+-1S: begin transaction isolation level repeatable read;
+BEGIN
+-1S: select count(*) from hs_qc_guc2;
+ count
+-------
+ 10
+(1 row)
+
+-- VACUUM won't cleanup this table since the DELETE is still within vacuum_defer_cleanup_age
+1: delete from hs_qc_guc2;
+DELETE 10
+1: vacuum hs_qc_guc2;
+VACUUM
+
+-- showing all rows are deleted but not vacuumed
+1: select count(*) from hs_qc_guc2;
+ count
+-------
+ 0
+(1 row)
+1: set gp_select_invisible to on;
+SET
+1: select count(*) from hs_qc_guc2;
+ count
+-------
+ 10
+(1 row)
+
+-- hot standby can still query the table
+-1S: select count(*) from hs_qc_guc2;
+ count
+-------
+ 10
+(1 row)
+
+-- only if the age is reached, hot standby will see the same conflict as before
+1: create temp table tt1(a int);
+CREATE
+1: vacuum hs_qc_guc2;
+VACUUM
+-1S: select count(*) from hs_qc_guc2;
+ERROR: terminating connection due to conflict with recovery (seg0 slice1 127.0.1.1:7005 pid=18713)
+DETAIL: User query might have needed to see row versions that must be removed.
+HINT: In a moment you should be able to reconnect to the database and repeat your command.
+-1Sq: ...
+-1S: select max(confl_snapshot) from gp_stat_database_conflicts where datname = 'isolation2-hot-standby';
+ max
+-----
+ 2
+(1 row)
+
+!\retcode gpconfig -r vacuum_defer_cleanup_age;
+(exited with code 0)
+!\retcode gpstop -u;
+(exited with code 0)
+
diff --git a/src/test/isolation2/sql/.gitignore b/src/test/isolation2/sql/.gitignore
index 361b986e18d..bfc3709082c 100644
--- a/src/test/isolation2/sql/.gitignore
+++ b/src/test/isolation2/sql/.gitignore
@@ -7,6 +7,7 @@
/pt_io_in_progress_deadlock.sql
/distributed_snapshot.sql
/local_directory_table_mixed.sql
+/hot_standby/query_conflict.sql
# ignores including sub-directories
autovacuum-analyze.sql
diff --git a/src/test/isolation2/sql/hot_standby/basic.sql b/src/test/isolation2/sql/hot_standby/basic.sql
new file mode 100644
index 00000000000..a900b38a29c
--- /dev/null
+++ b/src/test/isolation2/sql/hot_standby/basic.sql
@@ -0,0 +1,95 @@
+-- Tests for basic query dispatch on a hot standy.
+
+-- hot standby must show on and the sync mode is remote_apply for the tests to make sense
+-1S: show hot_standby;
+-1S: show synchronous_commit;
+
+-- will be checking if QD/QE info looks good
+-1S: select id, type, content, port from gp_backend_info();
+
+----------------------------------------------------------------
+-- Test: basic query dispatch
+----------------------------------------------------------------
+create table hs_t1(a int);
+create table hs_t2(a int);
+
+-- standby should see the results for 2pc immediately.
+insert into hs_t1 select * from generate_series(1,10);
+-1S: select * from hs_t1;
+-- standby won't see results for the last 1pc immediately because the standby QD
+-- isn't aware of of it so its distributed snapshot doesn't include the 1pc, but
+-- as long as another 2pc comes it will be able to see the previous 1pc. Wee
+-- tolerate this case in the mirrored cluster setup.
+insert into hs_t2 values(1);
+-1S: select * from hs_t2;
+-- any following 2pc will make the 1pc visible
+create temp table tt(a int);
+-1S: select * from hs_t2;
+
+-- we have three QEs launched on the mirror segments.
+-- note that the first QE on a segment is still a "writer" because we
+-- need it to manage locks, same as read-only queries on a primary QD.
+-1S: select id, type, content, port from gp_backend_info();
+
+-- should have parallel readers launched
+-1S: select * from hs_t1 join (select * from hs_t2) hs_t2 on (hs_t1 = hs_t2);
+-1S: select id, type, content, port from gp_backend_info();
+
+-- now a singleton reader added too
+-1S: select * from hs_t1 join (select oid::int from pg_class) hs_t2 on (hs_t1 = hs_t2);
+-1S: select id, type, content, port from gp_backend_info();
+
+-- un-committed result should not be seen by the standby
+begin;
+insert into hs_t1 select * from generate_series(11,20);
+
+-- standby should only see 1...10
+-1S: select * from hs_t1;
+
+end;
+
+-- standby should see 1...20 now
+-1S: select * from hs_t1;
+
+----------------------------------------------------------------
+-- Test: other things that a hot standby can do.
+--
+-- More refer to regress test 'hs_standby_allowed'.
+----------------------------------------------------------------
+-- set/reset and show GUC
+-1S: set optimizer = on;
+-1S: show optimizer;
+-1S: reset optimizer;
+-- copy command
+-1S: copy hs_t1 to '/tmp/hs_copyto.csv' csv null '';
+-- query catalogs
+-1S: select count(*) from pg_class where relname = 'hs_t1';
+-1S: select dbid,content,role,preferred_role,mode,status from gp_segment_configuration where dbid = current_setting('gp_dbid')::integer;
+-- checkpoint is allowed on standby but a restart point is created instead
+-1S: checkpoint;
+
+----------------------------------------------------------------
+-- Test: things that can't be done on a hot standby:
+-- no DML, DDL or anything that generates WAL.
+--
+-- More refer to regress test 'hs_standby_disallowed'.
+----------------------------------------------------------------
+-1S: insert into hs_t1 values(1);
+-1S: delete from hs_t1;
+-1S: update hs_t1 set a = 0;
+-1S: create table hs_t2(a int);
+-1S: create database hs_db;
+-1S: vacuum hs_t1;
+
+--
+-- No hintbit WAL generation in SELECT.
+--
+create table hs_nohintbit(a int) distributed by (a);
+insert into hs_nohintbit select generate_series (1, 10);
+-- flush the data to disk
+checkpoint;
+
+-1S: set gp_disable_tuple_hints=off;
+-- no WAL is being generated (otherwise an error would occur "cannot make new WAL entries during recovery")
+-1S: SELECT count(*) FROM hs_nohintbit;
+
diff --git a/src/test/isolation2/sql/hot_standby/faults.sql b/src/test/isolation2/sql/hot_standby/faults.sql
new file mode 100644
index 00000000000..6e25bcba272
--- /dev/null
+++ b/src/test/isolation2/sql/hot_standby/faults.sql
@@ -0,0 +1,125 @@
+-- Test system fault scenarios
+
+-- start_matchsubs
+--
+-- m/Is the server running on host.*/
+-- s/Is the server running on host "\d+.\d+.\d+.\d+" and accepting/Is the server running on host and accepting/
+-- m/(seg\d+ \d+.\d+.\d+.\d+:\d+)/
+-- s/(.*)/(seg IP:PORT)/
+-- m/ERROR: connection to dbid 1 .*:7000 failed .*/
+-- s/ERROR: connection to dbid 1 .*:7000 failed .*/ERROR: connection to dbid 1 :7000 failed/
+--
+-- end_matchsubs
+
+-- Let FTS detect/declare failure sooner
+!\retcode gpconfig -c gp_fts_probe_interval -v 10 --coordinatoronly;
+!\retcode gpstop -u;
+
+create table hs_failover(a int);
+insert into hs_failover select * from generate_series(1,10);
+-1S: select * from hs_failover;
+
+----------------------------------------------------------------
+-- Mirror segment fails
+----------------------------------------------------------------
+select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'm';
+
+-- make sure mirror is detected down
+create temp table hs_tt(a int);
+select gp_request_fts_probe_scan();
+
+-- will not succeed
+-1S: select * from hs_failover;
+-1Sq:
+
+-- recovery
+!\retcode gprecoverseg -aF;
+
+-- sync-up
+select wait_until_all_segments_synchronized();
+
+-- works now
+-1S: select * from hs_failover;
+
+----------------------------------------------------------------
+-- Primary segment fails
+----------------------------------------------------------------
+-- inject a fault where the mirror gets out of recovery
+select gp_inject_fault('out_of_recovery_in_startupxlog', 'skip', dbid) from gp_segment_configuration where content = 1 and role = 'm';
+
+select pg_ctl(datadir, 'stop', 'immediate') from gp_segment_configuration where content=1 and role = 'p';
+select gp_request_fts_probe_scan();
+
+-- make sure failover happens
+select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
+select gp_wait_until_triggered_fault('out_of_recovery_in_startupxlog', 1, dbid) from gp_segment_configuration where content = 1 and role = 'p';
+select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_segment_configuration where content = 1 and role = 'p';
+
+-- On an existing standby connection, query will run but it is dispatched to the previous mirror
+-- in an existing gang. That mirror is now a primary, so it will complain and the query fails.
+-1S: select * from hs_failover;
+-1Sq:
+
+-- will fail due to downed mirror (previous primary)
+-1S: select * from hs_failover;
+-1Sq:
+
+-- bring the downed mirror up
+!\retcode gprecoverseg -aF;
+select wait_until_all_segments_synchronized();
+
+-- mirror is up
+-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
+
+-- now the query will succeed
+-1S: select * from hs_failover;
+-1Sq:
+
+-- re-balance, bring the segments to their preferred roles
+!\retcode gprecoverseg -ar;
+select wait_until_all_segments_synchronized();
+-1S: select dbid, content, role, preferred_role, mode, status from gp_segment_configuration where content = 1;
+
+-- query runs fine still
+-1S: select * from hs_failover;
+
+----------------------------------------------------------------
+-- DTX recovery
+----------------------------------------------------------------
+-- skip FTS probe to prevent unexpected mirror promotion
+1: select gp_inject_fault_infinite('fts_probe', 'skip', dbid) from gp_segment_configuration where role='p' and content=-1;
+
+1: create table tt_hs_dtx(a int);
+
+-- inject fault to repeatedly fail the COMMIT PREPARE phase of 2PC, which ensures that the dtx cannot finish even by the dtx recovery process.
+select gp_inject_fault_infinite('finish_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p';
+
+-- session 1 on primary QD tries to commit a DTX, but cannot finish due to the fault on a QE
+1&: insert into tt_hs_dtx select * from generate_series(1,10);
+
+-- inject a panic on primary QD, essentially restarts the primary QD
+2: select gp_inject_fault('before_read_command', 'panic', dbid) from gp_segment_configuration where content=-1 and role='p';
+2: select 1;
+
+1<:
+1q:
+2q:
+
+-- standby QD can still run query
+-1S: select * from hs_failover;
+-- it cannot see rows from the in-doubt DTX
+-1S: select * from tt_hs_dtx;
+
+-- let the failed dtx be recovered, also make sure the standby replays the forget record which signals the completion of the dtx
+-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'skip', dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S: select gp_inject_fault_infinite('finish_commit_prepared', 'reset', dbid) from gp_segment_configuration where content=1 and role='p';
+-1S: select gp_wait_until_triggered_fault('redoDistributedForgetCommitRecord', 1, dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S: select gp_inject_fault('redoDistributedForgetCommitRecord', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+
+-- standby should see the rows from the in-doubt DTX now
+-1S: select * from tt_hs_dtx;
+
+-1S: select wait_until_all_segments_synchronized();
+1: select gp_inject_fault('before_read_command', 'reset', dbid) from gp_segment_configuration where content=-1 and role='p';
+1: select gp_inject_fault('fts_probe', 'reset', dbid) from gp_segment_configuration where role='p' and content=-1;
+
diff --git a/src/test/isolation2/sql/hot_standby/setup.sql b/src/test/isolation2/sql/hot_standby/setup.sql
new file mode 100644
index 00000000000..aa15f468b7d
--- /dev/null
+++ b/src/test/isolation2/sql/hot_standby/setup.sql
@@ -0,0 +1,9 @@
+-- setup for hot standby tests
+!\retcode gpconfig -c hot_standby -v on;
+-- let primary wait for standby to apply changes, making the test less flaky
+!\retcode gpconfig -c synchronous_commit -v remote_apply;
+-- make it faster to handle query conflict
+!\retcode gpconfig -c max_standby_streaming_delay -v 1000;
+-- disable autovacuum, to not affect the manual VACUUM in the tests
+!\retcode gpconfig -c autovacuum -v off;
+!\retcode gpstop -ar;
diff --git a/src/test/isolation2/sql/hot_standby/teardown.sql b/src/test/isolation2/sql/hot_standby/teardown.sql
new file mode 100644
index 00000000000..af6fba50aed
--- /dev/null
+++ b/src/test/isolation2/sql/hot_standby/teardown.sql
@@ -0,0 +1,5 @@
+-- reset the setup for hot standby tests
+!\retcode gpconfig -r hot_standby;
+!\retcode gpconfig -r synchronous_commit;
+!\retcode gpconfig -r max_standby_streaming_delay;
+!\retcode gpstop -ar;
diff --git a/src/test/isolation2/sql/hot_standby/transaction_isolation.sql b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql
new file mode 100644
index 00000000000..68945228313
--- /dev/null
+++ b/src/test/isolation2/sql/hot_standby/transaction_isolation.sql
@@ -0,0 +1,319 @@
+----------------------------------------------------------------
+-- Test transaction isolation in general, not specific to dtx
+----------------------------------------------------------------
+1: create table hs_tx(a int);
+1: insert into hs_tx select * from generate_series(1,10);
+
+1: begin;
+1: insert into hs_tx select * from generate_series(11,20);
+2: begin;
+2: insert into hs_tx select * from generate_series(21,30);
+2: abort;
+
+-- standby should only see completed transactions, not in-progress or aborted ones
+-1S: select * from hs_tx;
+
+1: end;
+-1S: select * from hs_tx;
+
+----------------------------------------------------------------
+-- Test isolation between hot standby query and in-progress dtx
+----------------------------------------------------------------
+
+1: create table hs_dtx1(a int);
+1: create table hs_dtx2(a int);
+
+-- inject two suspend faults:
+-- 1. on seg0, suspend before PREPARE phase of 2PC
+1: select gp_inject_fault('qe_start_prepared', 'suspend',dbid) from gp_segment_configuration where content=0 and role='p';
+1&: insert into hs_dtx1 select * from generate_series(1,10);
+-- 2. on seg1, suspend before COMMIT phase of 2PC
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend',dbid) from gp_segment_configuration where content=1 and role='p';
+2&: insert into hs_dtx2 select * from generate_series(1,10);
+
+-- standby should not see any rows from either dtx
+-1S: select * from hs_dtx1;
+-1S: select * from hs_dtx2;
+
+-- reset
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+1<:
+2<:
+
+-- standby should see the results from the dtx now
+-1S: select * from hs_dtx1;
+-1S: select * from hs_dtx2;
+
+----------------------------------------------------------------
+-- Test DTX abort that happens in different phases
+----------------------------------------------------------------
+
+1: create table hs_abort_dtx1(a int);
+1: create table hs_abort_dtx2(a int);
+
+-- inject two errors:
+-- 1. on seg0, error out before PREPARE phase of 2PC
+1: select gp_inject_fault('qe_start_prepared', 'error', dbid) from gp_segment_configuration where content=0 and role='p';
+1: insert into hs_abort_dtx1 select * from generate_series(1,10);
+1: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+-- 2. on seg1, error out before COMMIT phase of 2PC
+1: select gp_inject_fault('qe_start_commit_prepared', 'error', dbid) from gp_segment_configuration where content=1 and role='p';
+1: insert into hs_abort_dtx2 select * from generate_series(1,10);
+1: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+
+-- standby should not see dtx1 which is aborted but should see dtx2 which is recovered
+-1S: select * from hs_abort_dtx1;
+-1S: select * from hs_abort_dtx2;
+
+----------------------------------------------------------------
+-- Test isolation between hot standby query and in-progress dtx,
+-- but also run more queries in between
+----------------------------------------------------------------
+1: create table hs_dtx3(a int);
+
+-- inject faults to suspend segments in 2PC
+1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+1&: insert into hs_dtx3 select * from generate_series(1,10);
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p';
+2&: insert into hs_dtx3 select * from generate_series(11,20);
+
+-- standby should not see rows in the in-progress dtx
+-1S: select * from hs_dtx3;
+
+-- now run some dtx and completed
+3: insert into hs_dtx3 values(99);
+3: create table hs_dtx4(a int);
+3: insert into hs_dtx4 select * from generate_series(1,10);
+
+-- standby should still not see rows in the in-progress DTX, but should see the completed ones
+-1S: select * from hs_dtx3;
+-1S: select * from hs_dtx4;
+
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+1<:
+2<:
+
+-- standby should see all rows now
+-1S: select * from hs_dtx3;
+
+----------------------------------------------------------------
+-- Test isolation between standby QD and in-progress dtx,
+-- but after standby QD resets and gets running DTX from checkpoint.
+----------------------------------------------------------------
+1: create table hs_t5(a int, b text);
+1: create table hs_t6(a int, b text);
+
+-- inject fault to suspend a primary right before it conducts the commit phase of 2PC,
+-- so in the subsequent INSERT, all local transactions will be committed but the dtx is not.
+1: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+1&: insert into hs_t5 select i, 'in-progress' from generate_series(1,10) i;
+
+-- now run some dtx and completed, and primary conducts a checkpoint
+2: insert into hs_t5 values(1, 'commited');
+2: insert into hs_t6 select i, 'committed' from generate_series(1,10) i;
+2: begin;
+2: insert into hs_t5 values(99, 'aborted');
+2: abort;
+2: checkpoint;
+
+-- now make the standby QD resets itself
+-1S: select gp_inject_fault('exec_simple_query_start', 'panic', dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S: select 1;
+-1Sq:
+
+-- standby should still not see rows in the in-progress DTX, but should see the completed ones
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+2: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+1<:
+
+-- standby should see all rows now
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+-- standby should correctly see more in-progress dtx on the primary.
+-- context: previously this would be fail because the standby updates latestCompletedGxid to the
+-- bumped nextGxid from checkpoint, which is too far (so that it thinks the new dtx already completed).
+1: select gp_inject_fault('qe_start_prepared', 'suspend', dbid) from gp_segment_configuration where content=0 and role='p';
+1&: delete from hs_t5;
+2: select gp_inject_fault('qe_start_commit_prepared', 'suspend', dbid) from gp_segment_configuration where content=1 and role='p';
+2&: delete from hs_t6;
+
+-- standby should not see the effect of the deletes
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+3: select gp_inject_fault('qe_start_prepared', 'reset',dbid) from gp_segment_configuration where content=0 and role='p';
+3: select gp_inject_fault('qe_start_commit_prepared', 'reset',dbid) from gp_segment_configuration where content=1 and role='p';
+
+1<:
+2<:
+
+-- standby now see those deletes
+-1S: select * from hs_t5;
+-1S: select * from hs_t6;
+
+----------------------------------------------------------------
+-- Read-committed isolation: query on hot standby should not see dtx that completed after it
+-- created distributed snapshot, but should see dtx that completed before that.
+----------------------------------------------------------------
+
+1: create table hs_rc(a int);
+1: insert into hs_rc select * from generate_series(1,10);
+
+-- case 1: suspend SELECT on the standby QD right after it created snapshot
+-1S: select gp_inject_fault('select_after_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S&: select * from hs_rc;
+
+-- new INSERT or DELETE won't be observed by the standby
+1: insert into hs_rc select * from generate_series(11,20);
+1: delete from hs_rc where a < 5;
+1: select gp_inject_fault('select_after_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+
+-- should only see the rows at the time when SELECT started (1...10).
+-1S<:
+
+-- SELECT again, should see the effect from the INSERT and DELETE now
+-1S: select * from hs_rc;
+
+-- case 2: suspend SELECT on the standby QD before creating snapshot
+-1S: select gp_inject_fault('select_before_qd_create_snapshot', 'suspend', dbid) from gp_segment_configuration where content=-1 and role='m';
+-1S&: select * from hs_rc;
+
+1: insert into hs_rc select * from generate_series(21,30);
+1: delete from hs_rc where a < 21;
+1: select gp_inject_fault('select_before_qd_create_snapshot', 'reset', dbid) from gp_segment_configuration where content=-1 and role='m';
+
+-- standby should see the effect of the INSERT and DELETE
+-1S<:
+
+----------------------------------------------------------------
+-- Read-committed isolation in the BEGIN...END block
+----------------------------------------------------------------
+
+1: truncate hs_rc;
+1: insert into hs_rc select * from generate_series(1,30);
+
+-1S: begin;
+-1S: select count(*) from hs_rc;
+
+-- have some concurrent sessions on primary QD:
+-- 1. a completed transaction
+1: delete from hs_rc where a <= 10;
+-- 2. an aborted transaction
+2: begin;
+2: delete from hs_rc where a > 10 and a <= 20;
+2: abort;
+-- 3. an ongoing transaction
+3: begin;
+3: delete from hs_rc where a > 20 and a <= 30;
+
+-- the standby should see results accordingly
+-1S: select * from hs_rc;
+-1S: end;
+
+3: end;
+-1S: select * from hs_rc;
+
+----------------------------------------------------------------
+-- Repeatable-read isolation: distributed snapshot is created at time of the
+-- first query in transaction block. All queries in the transaction block
+-- should only see results committed before the distributed snapshot creation.
+----------------------------------------------------------------
+
+1: create table hs_rr(a int);
+1: insert into hs_rr select * from generate_series(1,10);
+
+-1S: begin isolation level repeatable read;
+-- should see 10
+-1S: select count(*) from hs_rr;
+
+-- do some more INSERT, DELETE and UPDATE
+1: insert into hs_rr select * from generate_series(11,20);
+1: delete from hs_rr where a <= 10;
+1: update hs_rr set a = a + 100;
+
+-- should still see the initial rows {1...10}
+-1S: select * from hs_rr;
+-1S: end;
+
+-- should see the results from the INSERT, DELETE and UPDATE
+-1S: begin isolation level repeatable read;
+-1S: select * from hs_rr;
+
+-- standby won't see ongoing or aborted transactions either
+1: begin;
+1: insert into hs_rr select * from generate_series(1,10);
+2: begin;
+2: insert into hs_rr select * from generate_series(1,10);
+2: abort;
+
+-1S: select * from hs_rr;
+
+1: end;
+-1S: end;
+
+----------------------------------------------------------------
+-- Transaction isolation is respected in subtransactions too
+----------------------------------------------------------------
+
+1: create table hs_subtrx(a int);
+
+-- (1) read-committed
+-1S: begin;
+-1S: select count(*) from hs_subtrx;
+-1S: savepoint s1;
+
+1: insert into hs_subtrx select * from generate_series(1,10);
+
+-1S: select count(*) from hs_subtrx;
+-1S: savepoint s2;
+-1S: select count(*) from hs_subtrx;
+-1S: rollback to savepoint s1;
+-1S: select count(*) from hs_subtrx;
+-1S: end;
+
+-- (2) repeatable-read
+-1S: begin isolation level repeatable read;
+-1S: select * from hs_subtrx;
+-1S: savepoint s1;
+
+1: insert into hs_subtrx select * from generate_series(11,20);
+1: delete from hs_subtrx where a <= 10;
+1: update hs_subtrx set a = a + 100;
+
+-1S: select * from hs_subtrx;
+-1S: savepoint s2;
+-1S: select * from hs_subtrx;
+-1S: rollback to savepoint s1;
+-1S: select * from hs_subtrx;
+-1S: end;
+-1S: select * from hs_subtrx;
+
+----------------------------------------------------------------
+-- Various isolation tests that involve AO/CO table.
+----------------------------------------------------------------
+1: create table hs_ao(a int, id int unique) using ao_row;
+1: insert into hs_ao select 1,i from generate_series(1,10) i;
+1: begin;
+1: insert into hs_ao select 2,i from generate_series(11,20) i;
+
+-- standby sees the same AO metadata as primary
+2: select * from gp_toolkit.__gp_aoseg('hs_ao');
+-1S: select * from gp_toolkit.__gp_aoseg('hs_ao');
+2: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id');
+-1S: select (gp_toolkit.__gp_aoblkdir('hs_ao')).* from gp_dist_random('gp_id');
+
+-- standby sees correct table data
+-1S: select * from hs_ao;
+
+-- standby sees the effect of vacuum
+1: end;
+1: delete from hs_ao where a = 1;
+1: vacuum hs_ao;
+1: select * from gp_toolkit.__gp_aoseg('hs_ao');
+-1S: select * from gp_toolkit.__gp_aoseg('hs_ao');
+-1S: select * from hs_ao;
diff --git a/src/test/perl/PostgresNode.pm b/src/test/perl/PostgresNode.pm
index 262e4e74fbe..9e6d4c653b9 100644
--- a/src/test/perl/PostgresNode.pm
+++ b/src/test/perl/PostgresNode.pm
@@ -603,6 +603,50 @@ sub append_conf
=pod
+=item $node->adjust_conf(filename, setting, value, skip_equals)
+
+Modify the named config file setting with the value. If the value is undefined,
+instead delete the setting. If the setting is not present, no action is taken.
+
+This will write "$setting = $value\n" in place of the existing line,
+unless skip_equals is true, in which case it will write
+"$setting $value\n". If the value needs to be quoted it is the caller's
+responsibility to do that.
+
+=cut
+
+sub adjust_conf
+{
+ my ($self, $filename, $setting, $value, $skip_equals) = @_;
+
+ my $conffile = $self->data_dir . '/' . $filename;
+
+ my $contents = PostgreSQL::Test::Utils::slurp_file($conffile);
+ my @lines = split(/\n/, $contents);
+ my @result;
+ my $eq = $skip_equals ? '' : '= ';
+ foreach my $line (@lines)
+ {
+ if ($line !~ /^$setting\W/)
+ {
+ push(@result, "$line\n");
+ }
+ elsif (defined $value)
+ {
+ push(@result, "$setting $eq$value\n");
+ }
+ }
+ open my $fh, ">", $conffile
+ or croak "could not write \"$conffile\": $!";
+ print $fh @result;
+ close $fh;
+
+ chmod($self->group_access() ? 0640 : 0600, $conffile)
+ or die("unable to set permissions for $conffile");
+}
+
+=pod
+
=item $node->backup(backup_name)
Create a hot backup with B in subdirectory B of
diff --git a/src/test/recovery/t/101_restore_point_and_startup_pause.pl b/src/test/recovery/t/101_restore_point_and_startup_pause.pl
index cda572524c1..f59acffb7ad 100644
--- a/src/test/recovery/t/101_restore_point_and_startup_pause.pl
+++ b/src/test/recovery/t/101_restore_point_and_startup_pause.pl
@@ -1,48 +1,122 @@
-# test for pausing on startup and on a specified restore point
+# Test for pausing and resuming recovery at specific restore points,
+# both at initial startup and in a continuous fashion by advancing
+# gp_pause_on_restore_point_replay.
+
use strict;
use warnings;
use PostgresNode;
use TestLib;
-use Test::More tests => 1;
+use Test::More tests => 12;
use File::Copy;
-# Initialize primary node with WAL archiving setup
+# Initialize and start primary node
my $node_primary = get_new_node('primary');
-$node_primary->init(
- has_archiving => 1,
- allows_streaming => 1);
-$node_primary->append_conf('postgresql.conf', "wal_level = 'replica'");
-$node_primary->append_conf('postgresql.conf', "max_wal_senders = 10");
-my $backup_name = 'my_backup';
-
-# Start primary
+$node_primary->init(has_archiving => 1, allows_streaming => 1);
$node_primary->start;
-# Initialize standby node from backup, fetching WAL from archives
-$node_primary->backup($backup_name);
-my $node_standby = get_new_node('standby');
-$node_standby->init_from_backup($node_primary, $backup_name,
- has_restoring => 1);
-$node_standby->append_conf('postgresql.conf', "gp_pause_on_restore_point_replay = on");
+my $node_standby = get_new_node("standby");
+
+sub test_pause_in_recovery
+{
+ my ($restore_point, $test_lsn, $num_rows) = @_;
+
+ # Wait until standby has replayed enough data
+ my $caughtup_query = "SELECT pg_last_wal_replay_lsn() = '$test_lsn'::pg_lsn";
+ $node_standby->poll_query_until('postgres', $caughtup_query)
+ or die "Timed out while waiting for standby to catch up";
+
+ # Check data has been replayed
+ my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;");
+ is($result, $num_rows, "check standby content for $restore_point");
+ ok($node_standby->safe_psql('postgres', 'SELECT pg_is_wal_replay_paused();') eq 't',
+ "standby is paused in recovery on $restore_point");
+}
+
+# Create data before taking the backup
+$node_primary->safe_psql('postgres', "CREATE TABLE table_foo AS SELECT generate_series(1,1000);");
+# Take backup from which all operations will be run
+$node_primary->backup('my_backup');
+my $lsn0 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp0');");
+# Switching WAL guarantees that the restore point is available to the standby
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
+
+# Add more data, create restore points and switch wal to guarantee
+# that the restore point is available to the standby
+
+# rp1
+$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(1001,2000))");
+my $lsn1 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp1');");
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
+
+# rp2
+$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(2001, 3000))");
+my $lsn2 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp2');");
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
+
+# rp3
+$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(3001, 4000))");
+$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp3');");
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
-# Start standby
+# rp4
+$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(4001, 5000))");
+my $lsn4 = $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp4');");
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
+
+# rp5
+$node_primary->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(5001, 6000))");
+$node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp5');");
+$node_primary->safe_psql('postgres', "SELECT pg_switch_wal();");
+
+# Restore the backup
+$node_standby->init_from_backup($node_primary, 'my_backup', has_restoring => 1);
+# Enable `hot_standby`
+$node_standby->append_conf('postgresql.conf', qq(hot_standby = 'on'));
+
+# Set rp0 as a restore point to pause on start up
+$node_standby->append_conf('postgresql.conf', qq(gp_pause_on_restore_point_replay = 'rp0'));
+# Start the standby
$node_standby->start;
+test_pause_in_recovery('rp0', $lsn0, 1000);
+
+# Advance to rp1
+$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp1");
+$node_standby->reload;
+$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();");
+test_pause_in_recovery('rp1', $lsn1, 2000);
+
+# Advance to rp2
+$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp2");
+$node_standby->reload;
+$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();");
+test_pause_in_recovery('rp2', $lsn2, 3000);
+
+# Verify that a restart will bring us back to rp2
+$node_standby->restart;
+test_pause_in_recovery('rp2', $lsn2, 3000);
+
+# Skip rp3 and advance to rp4
+$node_standby->adjust_conf('postgresql.conf', 'gp_pause_on_restore_point_replay', "rp4");
+$node_standby->reload;
+$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();");
+test_pause_in_recovery('rp4', $lsn4, 5000);
+
+# Do not advance to rp5; signal promote and then resume recovery
+$node_standby->safe_psql('postgres', "SELECT pg_promote(false);");
+$node_standby->safe_psql('postgres', "SELECT pg_wal_replay_resume();");
-# Create a restore point on the primary
-my $restore_point_lsn =
- $node_primary->safe_psql('postgres', "SELECT pg_create_restore_point('rp')");
+# Wait for standby to promote
+$node_standby->poll_query_until('postgres', "SELECT NOT pg_is_in_recovery();")
+ or die "Timed out while waiting for standby to exit recovery";
-# Force archival of WAL file to make it present on standby
-$node_primary->safe_psql('postgres', "SELECT pg_switch_wal()");
+# Check that we promoted with rp4's table count and not rp5's
+my $result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;");
+is($result, 5000, "check standby content after promotion");
-# Wait until enough replay has been done on the standby before checking if replay
-# is paused at the restore point
-my $caughtup_query =
- "SELECT '$restore_point_lsn'::pg_lsn <= pg_last_wal_replay_lsn()";
-$node_standby->poll_query_until('postgres', $caughtup_query)
- or die "Timed out while waiting for standby to catch up";
+# Make sure the former standby is now writable
+$node_standby->safe_psql('postgres', "INSERT INTO table_foo VALUES (generate_series(6001, 7000));");
+$result = $node_standby->safe_psql('postgres', "SELECT count(*) FROM table_foo;");
+is($result, 6000, "check standby is writable after promotion");
-my $paused_at_restore_point_query =
- "SELECT pg_is_wal_replay_paused() and pg_last_wal_replay_lsn() = '$restore_point_lsn'::pg_lsn";
-my $result2 = $node_standby->safe_psql('postgres', $paused_at_restore_point_query);
-is($result2, qq(t), 'check if WAL replay is paused at restore point');
+$node_primary->teardown_node;
+$node_standby->teardown_node;
diff --git a/src/test/regress/GNUmakefile b/src/test/regress/GNUmakefile
index 91d24a8292b..9bc8b67591e 100644
--- a/src/test/regress/GNUmakefile
+++ b/src/test/regress/GNUmakefile
@@ -241,6 +241,16 @@ endif
standbycheck: all
$(pg_regress_installcheck) $(REGRESS_OPTS) --schedule=$(srcdir)/standby_schedule $(EXTRA_TESTS)
+# GPDB: installcheck for hot standby. This is essentially same as the upstream 'standbycheck'
+# above but we just make sure that we do the primary preparation and use the desired standby port.
+# If no standby port is given, just use the demo cluster's standby port 7001.
+ifeq ($(STANDBY_PGPORT),)
+ STANDBY_PGPORT = 7001
+endif
+installcheck-hot-standby: all
+ $(pg_regress_installcheck) $(REGRESS_OPTS) hs_primary_setup
+ $(pg_regress_installcheck) $(REGRESS_OPTS) --port=$(STANDBY_PGPORT) --use-existing --schedule=$(srcdir)/standby_schedule $(EXTRA_TESTS)
+
# old interfaces follow...
runcheck: check
diff --git a/src/test/regress/expected/hs_primary_setup.out b/src/test/regress/expected/hs_primary_setup.out
new file mode 100644
index 00000000000..0184b2b73e9
--- /dev/null
+++ b/src/test/regress/expected/hs_primary_setup.out
@@ -0,0 +1,19 @@
+--
+-- Hot Standby tests
+--
+-- hs_primary_setup.sql
+--
+drop table if exists hs1;
+create table hs1 (col1 integer primary key);
+insert into hs1 values (1);
+drop table if exists hs2;
+create table hs2 (col1 integer primary key);
+insert into hs2 values (12);
+insert into hs2 values (13);
+drop table if exists hs3;
+create table hs3 (col1 integer primary key);
+insert into hs3 values (113);
+insert into hs3 values (114);
+insert into hs3 values (115);
+DROP sequence if exists hsseq;
+create sequence hsseq;
diff --git a/src/test/regress/expected/hs_standby_allowed.out b/src/test/regress/expected/hs_standby_allowed.out
index 00b8faf9eb6..e6b6514642f 100644
--- a/src/test/regress/expected/hs_standby_allowed.out
+++ b/src/test/regress/expected/hs_standby_allowed.out
@@ -164,31 +164,25 @@ show synchronous_commit;
reset synchronous_commit;
discard temp;
discard all;
+NOTICE: command without clusterwide effect
+HINT: Consider alternatives as DEALLOCATE ALL, or DISCARD TEMP if a clusterwide effect is desired.
-- CURSOR commands
BEGIN;
-DECLARE hsc CURSOR FOR select * from hs3;
+DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc;
FETCH next from hsc;
col1
------
113
(1 row)
-fetch first from hsc;
- col1
-------
- 113
-(1 row)
-
-fetch last from hsc;
- col1
-------
- 115
-(1 row)
-
+-- GPDB: backward fetch isn't allowed, moved to hs_standby_disallowed
+-- fetch first from hsc;
+-- fetch last from hsc;
fetch 1 from hsc;
col1
------
-(0 rows)
+ 114
+(1 row)
CLOSE hsc;
COMMIT;
@@ -216,3 +210,5 @@ UNLISTEN *;
-- ALLOWED COMMANDS
CHECKPOINT;
discard all;
+NOTICE: command without clusterwide effect
+HINT: Consider alternatives as DEALLOCATE ALL, or DISCARD TEMP if a clusterwide effect is desired.
diff --git a/src/test/regress/expected/hs_standby_disallowed.out b/src/test/regress/expected/hs_standby_disallowed.out
index 8d3cafa5cec..0a62e40e743 100644
--- a/src/test/regress/expected/hs_standby_disallowed.out
+++ b/src/test/regress/expected/hs_standby_disallowed.out
@@ -11,9 +11,15 @@ commit;
WARNING: there is no transaction in progress
-- SELECT
select * from hs1 FOR SHARE;
-ERROR: cannot execute SELECT FOR SHARE in a read-only transaction
+ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress
+LINE 1: select * from hs1 FOR SHARE;
+ ^
+HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery.
select * from hs1 FOR UPDATE;
-ERROR: cannot execute SELECT FOR UPDATE in a read-only transaction
+ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress
+LINE 1: select * from hs1 FOR UPDATE;
+ ^
+HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery.
-- DML
BEGIN;
insert into hs1 values (37);
@@ -21,11 +27,17 @@ ERROR: cannot execute INSERT in a read-only transaction
ROLLBACK;
BEGIN;
delete from hs1 where col1 = 1;
-ERROR: cannot execute DELETE in a read-only transaction
+ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress
+LINE 1: delete from hs1 where col1 = 1;
+ ^
+HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery.
ROLLBACK;
BEGIN;
update hs1 set col1 = NULL where col1 > 0;
-ERROR: cannot execute UPDATE in a read-only transaction
+ERROR: cannot acquire lock mode ExclusiveLock on database objects while recovery is in progress
+LINE 1: update hs1 set col1 = NULL where col1 > 0;
+ ^
+HINT: Only RowExclusiveLock or less can be acquired on database objects during recovery.
ROLLBACK;
BEGIN;
truncate hs3;
@@ -131,3 +143,15 @@ REVOKE SELECT ON hs1 FROM PUBLIC;
ERROR: cannot execute REVOKE in a read-only transaction
GRANT SELECT ON hs1 TO PUBLIC;
ERROR: cannot execute GRANT in a read-only transaction
+-- GPDB: backward fetch is not supported, moved from hs_standby_allowed.
+BEGIN;
+DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc;
+fetch next from hsc;
+ col1
+------
+ 113
+(1 row)
+
+fetch first from hsc;
+ERROR: backward scan is not supported in this version of Apache Cloudberry
+COMMIT;
diff --git a/src/test/regress/expected/hs_standby_functions.out b/src/test/regress/expected/hs_standby_functions.out
index ce846b758bf..48cb480f47a 100644
--- a/src/test/regress/expected/hs_standby_functions.out
+++ b/src/test/regress/expected/hs_standby_functions.out
@@ -27,13 +27,16 @@ select * from pg_prepared_xacts;
-------------+-----+----------+-------+----------
(0 rows)
--- just the startup process
-select locktype, virtualxid, virtualtransaction, mode, granted
+-- just the startup processes of all standby coordinator and segments, since pg_locks show cluster-wide view
+select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted
from pg_locks where virtualxid = '1/1';
- locktype | virtualxid | virtualtransaction | mode | granted
-------------+------------+--------------------+---------------+---------
- virtualxid | 1/1 | 1/0 | ExclusiveLock | t
-(1 row)
+ gp_segment_id | locktype | virtualxid | virtualtransaction | mode | granted
+---------------+------------+------------+--------------------+---------------+---------
+ -1 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t
+ 0 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t
+ 1 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t
+ 2 | virtualxid | 1/1 | 1/0 | ExclusiveLock | t
+(4 rows)
-- suicide is painless
select pg_cancel_backend(pg_backend_pid());
diff --git a/src/test/regress/pg_regress.c b/src/test/regress/pg_regress.c
index 3b9e91136d4..9320cf0aeec 100644
--- a/src/test/regress/pg_regress.c
+++ b/src/test/regress/pg_regress.c
@@ -3615,9 +3615,20 @@ cluster_healthy(void)
return false;
}
+ char *p;
+	/* skip the remaining health checks if the instance is a hot standby */
+ psql_command_output("postgres", line, sizeof(line),
+ "SELECT pg_is_in_recovery();");
+ p = &line[0];
+ while (*p == ' ')
+ p++;
+ if (*p == 't')
+ {
+ return !halt_work;
+ }
+
i = 120;
do {
- char *p;
/* check for the health for standby coordinator */
psql_command_output("postgres", line, sizeof(line),
"SELECT sync_state FROM pg_stat_get_wal_senders();");
diff --git a/src/test/regress/sql/hs_primary_setup.sql b/src/test/regress/sql/hs_primary_setup.sql
index eeb4421307f..83403299fd5 100644
--- a/src/test/regress/sql/hs_primary_setup.sql
+++ b/src/test/regress/sql/hs_primary_setup.sql
@@ -22,4 +22,11 @@ insert into hs3 values (115);
DROP sequence if exists hsseq;
create sequence hsseq;
+-- start_ignore
SELECT pg_switch_wal();
+
+-- GPDB: enable hot_standby for this cluster
+\! gpconfig -c hot_standby -v on;
+\! gpstop -ari;
+
+-- end_ignore
diff --git a/src/test/regress/sql/hs_standby_allowed.sql b/src/test/regress/sql/hs_standby_allowed.sql
index 6debddc5e99..873f3ef8643 100644
--- a/src/test/regress/sql/hs_standby_allowed.sql
+++ b/src/test/regress/sql/hs_standby_allowed.sql
@@ -82,11 +82,12 @@ discard all;
BEGIN;
-DECLARE hsc CURSOR FOR select * from hs3;
+DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc;
FETCH next from hsc;
-fetch first from hsc;
-fetch last from hsc;
+-- GPDB: backward fetch isn't allowed, moved to hs_standby_disallowed
+-- fetch first from hsc;
+-- fetch last from hsc;
fetch 1 from hsc;
CLOSE hsc;
diff --git a/src/test/regress/sql/hs_standby_disallowed.sql b/src/test/regress/sql/hs_standby_disallowed.sql
index a470600eec8..72066e2d40b 100644
--- a/src/test/regress/sql/hs_standby_disallowed.sql
+++ b/src/test/regress/sql/hs_standby_disallowed.sql
@@ -101,3 +101,11 @@ REINDEX TABLE hs2;
REVOKE SELECT ON hs1 FROM PUBLIC;
GRANT SELECT ON hs1 TO PUBLIC;
+
+-- GPDB: backward fetch is not supported, moved from hs_standby_allowed.
+BEGIN;
+DECLARE hsc CURSOR FOR select * from hs3 order by col1 asc;
+fetch next from hsc;
+fetch first from hsc;
+COMMIT;
+
diff --git a/src/test/regress/sql/hs_standby_functions.sql b/src/test/regress/sql/hs_standby_functions.sql
index b57f67ff8b5..903c8f96037 100644
--- a/src/test/regress/sql/hs_standby_functions.sql
+++ b/src/test/regress/sql/hs_standby_functions.sql
@@ -16,8 +16,8 @@ select pg_stop_backup();
-- should return no rows
select * from pg_prepared_xacts;
--- just the startup process
-select locktype, virtualxid, virtualtransaction, mode, granted
+-- just the startup processes of all standby coordinator and segments, since pg_locks show cluster-wide view
+select gp_segment_id, locktype, virtualxid, virtualtransaction, mode, granted
from pg_locks where virtualxid = '1/1';
-- suicide is painless