diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml index de702b5790d..fd2b9c73949 100644 --- a/.github/workflows/build-cloudberry.yml +++ b/.github/workflows/build-cloudberry.yml @@ -310,6 +310,9 @@ jobs: {"test":"ic-isolation2", "make_configs":["src/test/isolation2:installcheck-isolation2"] }, + {"test":"ic-isolation2-hot-standby", + "make_configs":["src/test/isolation2:installcheck-hot-standby"] + }, {"test":"ic-isolation2-crash", "make_configs":["src/test/isolation2:installcheck-isolation2-crash"], "enable_core_check":false diff --git a/src/backend/access/transam/xlog.c b/src/backend/access/transam/xlog.c index ffc8714cf62..034aeb6473b 100644 --- a/src/backend/access/transam/xlog.c +++ b/src/backend/access/transam/xlog.c @@ -114,6 +114,7 @@ int XLogArchiveTimeout = 0; int XLogArchiveMode = ARCHIVE_MODE_OFF; char *XLogArchiveCommand = NULL; bool EnableHotStandby = false; +bool EnableHotDR = false; bool fullPageWrites = true; bool wal_log_hints = false; bool wal_compression = false; @@ -7967,6 +7968,12 @@ StartupXLOG(void) if (gp_pause_on_restore_point_replay) pauseRecoveryOnRestorePoint(xlogreader); + /* Exit the recovery loop if a promotion is triggered in pauseRecoveryOnRestorePoint() */ + if (reachedContinuousRecoveryTarget && recoveryTargetAction == RECOVERY_TARGET_ACTION_PROMOTE){ + reachedRecoveryTarget = true; + break; + } + /* Exit loop if we reached inclusive recovery target */ if (recoveryStopsAfter(xlogreader)) { @@ -10757,6 +10764,9 @@ XLogRestorePoint(const char *rpName) xlrec.rp_time = GetCurrentTimestamp(); strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN); + /* Log a standby snapshot before inserting the restore point record */ + LogStandbySnapshot(); + XLogBeginInsert(); XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point)); diff --git a/src/backend/cdb/cdbutil.c b/src/backend/cdb/cdbutil.c index a241549662d..fbf3f8900f2 100644 --- a/src/backend/cdb/cdbutil.c +++ b/src/backend/cdb/cdbutil.c @@ -92,6 +92,7 @@ static int 
CdbComponentDatabaseInfoCompare(const void *p1, const void *p2); static GpSegConfigEntry * readGpSegConfigFromCatalog(int *total_dbs); static GpSegConfigEntry * readGpSegConfigFromFTSFiles(int *total_dbs); +static GpSegConfigEntry * readGpSegConfigFromFiles(int *total_dbs); static void getAddressesForDBid(GpSegConfigEntry *c, int elevel); static HTAB *hostPrimaryCountHashTableInit(void); @@ -131,6 +132,15 @@ typedef struct HostPrimaryCountEntry */ static GpSegConfigEntry * readGpSegConfigFromFTSFiles(int *total_dbs) +{ + Assert(!IsTransactionState() && !IS_HOT_DR_CLUSTER()); + /* notify and wait FTS to finish a probe and update the dump file */ + FtsNotifyProber(); + return readGpSegConfigFromFiles(total_dbs); +} + +static GpSegConfigEntry * +readGpSegConfigFromFiles(int *total_dbs) { FILE *fd; int idx = 0; @@ -142,11 +152,6 @@ readGpSegConfigFromFTSFiles(int *total_dbs) char address[MAXHOSTNAMELEN]; char buf[MAXHOSTNAMELEN * 2 + 32]; - Assert(!IsTransactionState()); - - /* notify and wait FTS to finish a probe and update the dump file */ - FtsNotifyProber(); - fd = AllocateFile(GPSEGCONFIGDUMPFILE, "r"); if (!fd) @@ -188,6 +193,18 @@ readGpSegConfigFromFTSFiles(int *total_dbs) return configs; } +bool +checkGpSegConfigFtsFiles() +{ + FILE *fd = AllocateFile(GPSEGCONFIGDUMPFILE, "r"); + + if (!fd) + return false; + + FreeFile(fd); + return true; +} + /* * writeGpSegConfigToFTSFiles() dump gp_segment_configuration to the file * GPSEGCONFIGDUMPFILE, in $PGDATA, only FTS process can use this function. 
@@ -372,10 +389,17 @@ getCdbComponentInfo(void) HTAB *hostPrimaryCountHash = hostPrimaryCountHashTableInit(); - if (IsTransactionState()) - configs = readGpSegConfigFromCatalog(&total_dbs); + if (EnableHotDR) + { + configs = readGpSegConfigFromFiles(&total_dbs); + } else - configs = readGpSegConfigFromFTSFiles(&total_dbs); + { + if (IsTransactionState()) + configs = readGpSegConfigFromCatalog(&total_dbs); + else + configs = readGpSegConfigFromFTSFiles(&total_dbs); + } component_databases = palloc0(sizeof(CdbComponentDatabases)); diff --git a/src/backend/utils/misc/guc_gp.c b/src/backend/utils/misc/guc_gp.c index c7bb596cb61..2c373e5a582 100644 --- a/src/backend/utils/misc/guc_gp.c +++ b/src/backend/utils/misc/guc_gp.c @@ -86,6 +86,7 @@ static bool check_optimizer(bool *newval, void **extra, GucSource source); static bool check_verify_gpfdists_cert(bool *newval, void **extra, GucSource source); static bool check_dispatch_log_stats(bool *newval, void **extra, GucSource source); static bool check_gp_workfile_compression(bool *newval, void **extra, GucSource source); +static bool check_hot_dr(bool *newval, void **extra, GucSource source); /* Helper function for guc setter */ bool gpvars_check_gp_resqueue_priority_default_value(char **newval, @@ -3331,6 +3332,16 @@ struct config_bool ConfigureNamesBool_gp[] = NULL, NULL, NULL }, + { + {"hot_dr", PGC_POSTMASTER, REPLICATION_STANDBY, + gettext_noop("Runs as a DR cluster and also allows connections and queries."), + NULL + }, + &EnableHotDR, + false, + check_hot_dr, NULL, NULL + }, + { {"gp_enable_runtime_filter_pushdown", PGC_USERSET, DEVELOPER_OPTIONS, gettext_noop("Try to push the hash table of hash join to the seqscan or AM as bloom filter."), @@ -5455,6 +5466,22 @@ check_verify_gpfdists_cert(bool *newval, void **extra, GucSource source) return true; } +static bool +check_hot_dr(bool *newval, void **extra, GucSource source) +{ + if (*newval && !EnableHotStandby) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + 
errmsg("cannot enable \"hot_dr\" when \"hot_standby\" is false"))); + + if (*newval && IS_QUERY_DISPATCHER() && !checkGpSegConfigFtsFiles()) + ereport(ERROR, + (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("cannot enable \"hot_dr\" since DR cluster segment configuration file does not exist"))); + + return true; +} + static bool check_dispatch_log_stats(bool *newval, void **extra, GucSource source) { diff --git a/src/include/access/xlog.h b/src/include/access/xlog.h index e8a73ceb201..6d1cc151ed2 100644 --- a/src/include/access/xlog.h +++ b/src/include/access/xlog.h @@ -123,6 +123,7 @@ extern int XLogArchiveTimeout; extern int wal_retrieve_retry_interval; extern char *XLogArchiveCommand; extern bool EnableHotStandby; +extern bool EnableHotDR; extern bool fullPageWrites; extern bool wal_log_hints; diff --git a/src/include/cdb/cdbutil.h b/src/include/cdb/cdbutil.h index 22c3cc782d8..0f638bbd521 100644 --- a/src/include/cdb/cdbutil.h +++ b/src/include/cdb/cdbutil.h @@ -132,6 +132,7 @@ extern char *getDnsAddress(char *name, int port, int elevel); #ifdef USE_INTERNAL_FTS extern void writeGpSegConfigToFTSFiles(void); +extern bool checkGpSegConfigFtsFiles(void); #else GpSegConfigEntry * readGpSegConfig(char * buff, int *total_dbs); diff --git a/src/include/cdb/cdbvars.h b/src/include/cdb/cdbvars.h index 2393384ec3a..534f957978d 100644 --- a/src/include/cdb/cdbvars.h +++ b/src/include/cdb/cdbvars.h @@ -757,6 +757,7 @@ extern GpId GpIdentity; #define MAX_DBID_STRING_LENGTH 11 #define UNINITIALIZED_GP_IDENTITY_VALUE (-10000) +#define IS_HOT_DR_CLUSTER() (EnableHotDR) #define IS_QUERY_DISPATCHER() (GpIdentity.segindex == MASTER_CONTENT_ID) #define IS_HOT_STANDBY_QD() (EnableHotStandby && IS_QUERY_DISPATCHER() && RecoveryInProgress()) diff --git a/src/include/utils/unsync_guc_name.h b/src/include/utils/unsync_guc_name.h index b26c5b43c7b..37f629e6e97 100644 --- a/src/include/utils/unsync_guc_name.h +++ b/src/include/utils/unsync_guc_name.h @@ -294,6 +294,7 @@ 
"gp_workfile_limit_per_segment", "gp_workfile_max_entries", "hba_file", + "hot_dr", "hot_standby", "hot_standby_feedback", "huge_pages", diff --git a/src/test/isolation2/expected/hot_standby/faults.out b/src/test/isolation2/expected/hot_standby/faults.out index 39f3a06cca6..2eb16b37229 100644 --- a/src/test/isolation2/expected/hot_standby/faults.out +++ b/src/test/isolation2/expected/hot_standby/faults.out @@ -133,7 +133,7 @@ select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_ ERROR: primary segments can only process MPP protocol messages from primary QD (seg1 slice1 127.0.1.1:7006 pid=14671) HINT: Exit the current session and re-connect. -1Sq: ... - +-- start_ignore -- will fail due to downed mirror (previous primary) -1S: select * from hs_failover; ERROR: failed to acquire resources on one or more segments @@ -141,6 +141,7 @@ DETAIL: connection to server at "10.13.9.74", port 7003 failed: Connection refu Is the server running on that host and accepting TCP/IP connections? (seg1 10.13.9.74:7003) -1Sq: ... +-- end_ignore -- bring the downed mirror up !\retcode gprecoverseg -aF; diff --git a/src/test/isolation2/sql/hot_standby/faults.sql b/src/test/isolation2/sql/hot_standby/faults.sql index 6e25bcba272..b1be240916a 100644 --- a/src/test/isolation2/sql/hot_standby/faults.sql +++ b/src/test/isolation2/sql/hot_standby/faults.sql @@ -59,10 +59,11 @@ select gp_inject_fault('out_of_recovery_in_startupxlog', 'reset', dbid) from gp_ -- in an existing gang. That mirror is now a primary, so it will complain and the query fails. -1S: select * from hs_failover; -1Sq: - +-- start_ignore -- will fail due to downed mirror (previous primary) -1S: select * from hs_failover; -1Sq: +-- end_ignore -- bring the downed mirror up !\retcode gprecoverseg -aF;