Skip to content

Commit e6d66d1

Browse files
committed
roachtest: add variations of c2c/bulk with diff settings
We may linger in a state where we tell some customers to customize non-public cluster settings to optimize for certain characteristics (PCR lag vs. bulk-op performance) until we can develop a better set of default behaviors. While we're in this state, we should test some of the more common configs we think we might recommend, even though these will still be custom configs, recommended on a case-by-case basis. Release note: none. Epic: none.
1 parent b63288a commit e6d66d1

File tree

2 files changed

+79
-4
lines changed

2 files changed

+79
-4
lines changed

pkg/cmd/roachtest/tests/cluster_to_cluster.go

Lines changed: 72 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -398,6 +398,8 @@ type replicateBulkOps struct {
398398

399399
// debugSkipRollback skips all rollback steps during the test.
400400
debugSkipRollback bool
401+
402+
withSettings []struct{ setting, value string }
401403
}
402404

403405
func (bo replicateBulkOps) sourceInitCmd(tenantName string, nodes option.NodeListOption) string {
@@ -411,6 +413,18 @@ func (bo replicateBulkOps) sourceRunCmd(tenantName string, nodes option.NodeList
411413
func (bo replicateBulkOps) runDriver(
412414
workloadCtx context.Context, c cluster.Cluster, t test.Test, setup *c2cSetup,
413415
) error {
416+
mainTenantConn := c.Conn(workloadCtx, t.L(), 1, option.VirtualClusterName(setup.src.name))
417+
for _, pair := range bo.withSettings {
418+
settingStmt := fmt.Sprintf("SET CLUSTER SETTING %s = '%s'", pair.setting, pair.value)
419+
t.L().Printf("Setting on sys/main/standby-sys: %s", settingStmt)
420+
setup.src.sysSQL.Exec(t, settingStmt)
421+
// PCR settings are system-only; assume others are app-level.
422+
if !strings.Contains(pair.setting, "physical_replication") {
423+
if _, err := mainTenantConn.ExecContext(workloadCtx, settingStmt); err != nil {
424+
return err
425+
}
426+
}
427+
}
414428
runBackupMVCCRangeTombstones(workloadCtx, t, c, mvccRangeTombstoneConfig{
415429
skipBackupRestore: true,
416430
skipClusterSetup: true,
@@ -1519,7 +1533,7 @@ func registerClusterToCluster(r registry.Registry) {
15191533
suites: registry.Suites(registry.Nightly),
15201534
},
15211535
{
1522-
name: "c2c/BulkOps",
1536+
name: "c2c/BulkOps/settings=none",
15231537
srcNodes: 4,
15241538
dstNodes: 4,
15251539
cpus: 8,
@@ -1543,6 +1557,63 @@ func registerClusterToCluster(r registry.Registry) {
15431557
clouds: registry.OnlyGCE,
15441558
suites: registry.Suites(registry.Nightly),
15451559
},
1560+
{
1561+
name: "c2c/BulkOps/settings=ac-import",
1562+
srcNodes: 4,
1563+
dstNodes: 4,
1564+
cpus: 8,
1565+
pdSize: 100,
1566+
workload: replicateBulkOps{withSettings: []struct{ setting, value string }{
1567+
{"bulkio.import.elastic_control.enabled", "true"},
1568+
{"bulkio.elastic_cpu_control.request_duration", "3ms"},
1569+
}},
1570+
timeout: 2 * time.Hour,
1571+
additionalDuration: 0,
1572+
// Cutover currently takes around 4 minutes, perhaps because we need to
1573+
// revert 10 GB of replicated data.
1574+
//
1575+
// TODO(msbutler): investigate further if cutover can be sped up.
1576+
cutoverTimeout: 20 * time.Minute,
1577+
cutover: 5 * time.Minute,
1578+
// In a few ad hoc runs, the max latency hikes up to 27 minutes before lag
1579+
// replanning and distributed catch up scans fix the poor initial plan. If
1580+
// max accepted latency doubles, then there's likely a regression.
1581+
maxAcceptedLatency: 1 * time.Hour,
1582+
// Skipping node distribution check because there is little data on the
1583+
// source when the replication stream begins.
1584+
skipNodeDistributionCheck: true,
1585+
clouds: registry.OnlyGCE,
1586+
suites: registry.Suites(registry.Nightly),
1587+
},
1588+
{
1589+
name: "c2c/BulkOps/settings=ac-and-splits",
1590+
srcNodes: 4,
1591+
dstNodes: 4,
1592+
cpus: 8,
1593+
pdSize: 100,
1594+
workload: replicateBulkOps{withSettings: []struct{ setting, value string }{
1595+
{"bulkio.import.elastic_control.enabled", "true"},
1596+
{"bulkio.elastic_cpu_control.request_duration", "3ms"},
1597+
{"physical_replication.consumer.ingest_split_event.enabled", "true"},
1598+
}},
1599+
timeout: 2 * time.Hour,
1600+
additionalDuration: 0,
1601+
// Cutover currently takes around 4 minutes, perhaps because we need to
1602+
// revert 10 GB of replicated data.
1603+
//
1604+
// TODO(msbutler): investigate further if cutover can be sped up.
1605+
cutoverTimeout: 20 * time.Minute,
1606+
cutover: 5 * time.Minute,
1607+
// In a few ad hoc runs, the max latency hikes up to 27 minutes before lag
1608+
// replanning and distributed catch up scans fix the poor initial plan. If
1609+
// max accepted latency doubles, then there's likely a regression.
1610+
maxAcceptedLatency: 1 * time.Hour,
1611+
// Skipping node distribution check because there is little data on the
1612+
// source when the replication stream begins.
1613+
skipNodeDistributionCheck: true,
1614+
clouds: registry.OnlyGCE,
1615+
suites: registry.Suites(registry.Nightly),
1616+
},
15461617
{
15471618
name: "c2c/BulkOps/singleImport",
15481619
srcNodes: 4,

pkg/cmd/roachtest/tests/latency_verifier.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -44,6 +44,7 @@ type latencyVerifier struct {
4444
catchupScanEveryN roachtestutil.EveryN
4545

4646
maxSeenSteadyLatency time.Duration
47+
tooLargeEveryN roachtestutil.EveryN
4748
maxSeenSteadyEveryN roachtestutil.EveryN
4849
latencyBecameSteady bool
4950

@@ -74,8 +75,9 @@ func makeLatencyVerifier(
7475
setTestStatus: setTestStatus,
7576
latencyHist: hist,
7677
tolerateErrors: tolerateErrors,
77-
maxSeenSteadyEveryN: roachtestutil.Every(10 * time.Second),
78-
catchupScanEveryN: roachtestutil.Every(2 * time.Second),
78+
tooLargeEveryN: roachtestutil.Every(120 * time.Second),
79+
maxSeenSteadyEveryN: roachtestutil.Every(30 * time.Second),
80+
catchupScanEveryN: roachtestutil.Every(10 * time.Second),
7981
}
8082
}
8183

@@ -132,7 +134,9 @@ func (lv *latencyVerifier) noteHighwater(highwaterTime time.Time) {
132134
return
133135
}
134136
if err := lv.latencyHist.RecordValue(latency.Nanoseconds()); err != nil {
135-
lv.logger.Printf("%s: could not record value %s: %s\n", lv.name, latency, err)
137+
if lv.tooLargeEveryN.ShouldLog() {
138+
lv.logger.Printf("%s: could not record value %s: %s\n", lv.name, latency, err)
139+
}
136140
}
137141
if latency > lv.maxSeenSteadyLatency {
138142
lv.maxSeenSteadyLatency = latency

0 commit comments

Comments
 (0)