Skip to content

Commit e6d66d1

Browse files
committed
roachtest: add variations of c2c/bulk with diff settings
We may linger in a state where we tell some customers to customize non-public cluster settings to optimize for certain characteristics (PCR lag vs. bulk-op performance) until we can develop a better set of default behaviors. While we're in this state, we should test some of the more common configs we think we might recommend, even though these will still be custom configs, recommended on a case-by-case basis. Release note: none. Epic: none.
1 parent b63288a commit e6d66d1

File tree

2 files changed

+79
-4
lines changed

2 files changed

+79
-4
lines changed

pkg/cmd/roachtest/tests/cluster_to_cluster.go

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,8 @@ type replicateBulkOps struct {
398398

399399
// debugSkipRollback skips all rollback steps during the test.
400400
debugSkipRollback bool
401+
402+
withSettings []struct{ setting, value string }
401403
}
402404

403405
func (bo replicateBulkOps) sourceInitCmd(tenantName string, nodes option.NodeListOption) string {
@@ -411,6 +413,18 @@ func (bo replicateBulkOps) sourceRunCmd(tenantName string, nodes option.NodeList
411413
func (bo replicateBulkOps) runDriver(
412414
workloadCtx context.Context, c cluster.Cluster, t test.Test, setup *c2cSetup,
413415
) error {
416+
mainTenantConn := c.Conn(workloadCtx, t.L(), 1, option.VirtualClusterName(setup.src.name))
417+
for _, pair := range bo.withSettings {
418+
settingStmt := fmt.Sprintf("SET CLUSTER SETTING %s = '%s'", pair.setting, pair.value)
419+
t.L().Printf("Setting on sys/main/standby-sys: %s", settingStmt)
420+
setup.src.sysSQL.Exec(t, settingStmt)
421+
// PCR settings are system-only; assume others are app-level.
422+
if !strings.Contains(pair.setting, "physical_replication") {
423+
if _, err := mainTenantConn.ExecContext(workloadCtx, settingStmt); err != nil {
424+
return err
425+
}
426+
}
427+
}
414428
runBackupMVCCRangeTombstones(workloadCtx, t, c, mvccRangeTombstoneConfig{
415429
skipBackupRestore: true,
416430
skipClusterSetup: true,
@@ -1519,7 +1533,7 @@ func registerClusterToCluster(r registry.Registry) {
15191533
suites: registry.Suites(registry.Nightly),
15201534
},
15211535
{
1522-
name: "c2c/BulkOps",
1536+
name: "c2c/BulkOps/settings=none",
15231537
srcNodes: 4,
15241538
dstNodes: 4,
15251539
cpus: 8,
@@ -1543,6 +1557,63 @@ func registerClusterToCluster(r registry.Registry) {
15431557
clouds: registry.OnlyGCE,
15441558
suites: registry.Suites(registry.Nightly),
15451559
},
1560+
{
1561+
name: "c2c/BulkOps/settings=ac-import",
1562+
srcNodes: 4,
1563+
dstNodes: 4,
1564+
cpus: 8,
1565+
pdSize: 100,
1566+
workload: replicateBulkOps{withSettings: []struct{ setting, value string }{
1567+
{"bulkio.import.elastic_control.enabled", "true"},
1568+
{"bulkio.elastic_cpu_control.request_duration", "3ms"},
1569+
}},
1570+
timeout: 2 * time.Hour,
1571+
additionalDuration: 0,
1572+
// Cutover currently takes around 4 minutes, perhaps because we need to
1573+
// revert 10 GB of replicated data.
1574+
//
1575+
// TODO(msbutler): investigate further if cutover can be sped up.
1576+
cutoverTimeout: 20 * time.Minute,
1577+
cutover: 5 * time.Minute,
1578+
// In a few ad hoc runs, the max latency hikes up to 27 minutes before lag
1579+
// replanning and distributed catch up scans fix the poor initial plan. If
1580+
// max accepted latency doubles, then there's likely a regression.
1581+
maxAcceptedLatency: 1 * time.Hour,
1582+
// Skipping node distribution check because there is little data on the
1583+
// source when the replication stream begins.
1584+
skipNodeDistributionCheck: true,
1585+
clouds: registry.OnlyGCE,
1586+
suites: registry.Suites(registry.Nightly),
1587+
},
1588+
{
1589+
name: "c2c/BulkOps/settings=ac-and-splits",
1590+
srcNodes: 4,
1591+
dstNodes: 4,
1592+
cpus: 8,
1593+
pdSize: 100,
1594+
workload: replicateBulkOps{withSettings: []struct{ setting, value string }{
1595+
{"bulkio.import.elastic_control.enabled", "true"},
1596+
{"bulkio.elastic_cpu_control.request_duration", "3ms"},
1597+
{"physical_replication.consumer.ingest_split_event.enabled", "true"},
1598+
}},
1599+
timeout: 2 * time.Hour,
1600+
additionalDuration: 0,
1601+
// Cutover currently takes around 4 minutes, perhaps because we need to
1602+
// revert 10 GB of replicated data.
1603+
//
1604+
// TODO(msbutler): investigate further if cutover can be sped up.
1605+
cutoverTimeout: 20 * time.Minute,
1606+
cutover: 5 * time.Minute,
1607+
// In a few ad hoc runs, the max latency hikes up to 27 minutes before lag
1608+
// replanning and distributed catch up scans fix the poor initial plan. If
1609+
// max accepted latency doubles, then there's likely a regression.
1610+
maxAcceptedLatency: 1 * time.Hour,
1611+
// Skipping node distribution check because there is little data on the
1612+
// source when the replication stream begins.
1613+
skipNodeDistributionCheck: true,
1614+
clouds: registry.OnlyGCE,
1615+
suites: registry.Suites(registry.Nightly),
1616+
},
15461617
{
15471618
name: "c2c/BulkOps/singleImport",
15481619
srcNodes: 4,

pkg/cmd/roachtest/tests/latency_verifier.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ type latencyVerifier struct {
4444
catchupScanEveryN roachtestutil.EveryN
4545

4646
maxSeenSteadyLatency time.Duration
47+
tooLargeEveryN roachtestutil.EveryN
4748
maxSeenSteadyEveryN roachtestutil.EveryN
4849
latencyBecameSteady bool
4950

@@ -74,8 +75,9 @@ func makeLatencyVerifier(
7475
setTestStatus: setTestStatus,
7576
latencyHist: hist,
7677
tolerateErrors: tolerateErrors,
77-
maxSeenSteadyEveryN: roachtestutil.Every(10 * time.Second),
78-
catchupScanEveryN: roachtestutil.Every(2 * time.Second),
78+
tooLargeEveryN: roachtestutil.Every(120 * time.Second),
79+
maxSeenSteadyEveryN: roachtestutil.Every(30 * time.Second),
80+
catchupScanEveryN: roachtestutil.Every(10 * time.Second),
7981
}
8082
}
8183

@@ -132,7 +134,9 @@ func (lv *latencyVerifier) noteHighwater(highwaterTime time.Time) {
132134
return
133135
}
134136
if err := lv.latencyHist.RecordValue(latency.Nanoseconds()); err != nil {
135-
lv.logger.Printf("%s: could not record value %s: %s\n", lv.name, latency, err)
137+
if lv.tooLargeEveryN.ShouldLog() {
138+
lv.logger.Printf("%s: could not record value %s: %s\n", lv.name, latency, err)
139+
}
136140
}
137141
if latency > lv.maxSeenSteadyLatency {
138142
lv.maxSeenSteadyLatency = latency

0 commit comments

Comments
 (0)