
Commit 1a7fc9d

craig[bot], dt, mgartner, and wenyihu6 committed
153989: roachtest: add variations of c2c/bulk with diff settings r=dt a=dt

We may linger in a place where we tell some customers to customize non-public cluster settings to optimize for certain characteristics (PCR lag vs. bulk op perf) until we can develop a better set of default behaviors. While we're in this state, we should test some of the more common configs we think we might recommend, even though these will still be custom configs, recommended on a case-by-case basis.

Release note: none.
Epic: none.

154275: util/intsets: fix outdated comment r=mgartner a=mgartner

Release note: None

154435: allocatorimpl: remove alwaysAllowDecisionWithoutStats r=tbg a=wenyihu6

Previously, we called TransferLeaseTarget with alwaysAllowDecisionWithoutStats, but it was only relevant when the lease transfer goal was allocator.FollowTheWorkload. The only case where we passed true for alwaysAllowDecisionWithoutStats was from storeRebalancer.chooseLeaseToTransfer, where opt.Goal was set to allocator.LoadConvergence. This commit removes the flag since it is unclear whether it was useful.

Epic: none
Release note: none

Co-authored-by: David Taylor <[email protected]>
Co-authored-by: Marcus Gartner <[email protected]>
Co-authored-by: wenyihu6 <[email protected]>
4 parents a0d41da + e6d66d1 + 1aaddc3 + 702253c commit 1a7fc9d

File tree

11 files changed: +96 / -35 lines

pkg/cmd/roachtest/cluster.go

Lines changed: 15 additions & 4 deletions
@@ -325,7 +325,10 @@ func initBinariesAndLibraries() {
 	cockroachPath := roachtestflags.CockroachPath
 	cockroachEAPath := roachtestflags.CockroachEAPath
 	workloadPath := roachtestflags.WorkloadPath
-	cockroach[defaultArch], _ = resolveBinary("cockroach", cockroachPath, defaultArch, true, false)
+	// Skip cockroach binary validation if using --cockroach-stage
+	if roachtestflags.CockroachStage == "" {
+		cockroach[defaultArch], _ = resolveBinary("cockroach", cockroachPath, defaultArch, true, false)
+	}
 	// Let the test runner verify the workload binary exists if TestSpec.RequiresDeprecatedWorkload is true.
 	workload[defaultArch], _ = resolveBinary("workload", workloadPath, defaultArch, false, false)
 	cockroachEA[defaultArch], err = resolveBinary("cockroach-ea", cockroachEAPath, defaultArch, false, true)
@@ -336,7 +339,9 @@ func initBinariesAndLibraries() {
 	if roachtestflags.ARM64Probability > 0 && defaultArch != vm.ArchARM64 {
 		fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, vm.ArchARM64)
 		// We need to verify we have all the required binaries for arm64.
-		cockroach[vm.ArchARM64], _ = resolveBinary("cockroach", cockroachPath, vm.ArchARM64, true, false)
+		if roachtestflags.CockroachStage == "" {
+			cockroach[vm.ArchARM64], _ = resolveBinary("cockroach", cockroachPath, vm.ArchARM64, true, false)
+		}
 		workload[vm.ArchARM64], _ = resolveBinary("workload", workloadPath, vm.ArchARM64, true, false)
 		cockroachEA[vm.ArchARM64], err = resolveBinary("cockroach-ea", cockroachEAPath, vm.ArchARM64, false, true)
 		if err != nil {
@@ -346,7 +351,9 @@ func initBinariesAndLibraries() {
 	if roachtestflags.FIPSProbability > 0 && defaultArch != vm.ArchFIPS {
 		fmt.Printf("Locating and verifying binaries for os=%q, arch=%q\n", defaultOSName, vm.ArchFIPS)
 		// We need to verify we have all the required binaries for fips.
-		cockroach[vm.ArchFIPS], _ = resolveBinary("cockroach", cockroachPath, vm.ArchFIPS, true, false)
+		if roachtestflags.CockroachStage == "" {
+			cockroach[vm.ArchFIPS], _ = resolveBinary("cockroach", cockroachPath, vm.ArchFIPS, true, false)
+		}
 		workload[vm.ArchFIPS], _ = resolveBinary("workload", workloadPath, vm.ArchFIPS, true, false)
 		cockroachEA[vm.ArchFIPS], err = resolveBinary("cockroach-ea", cockroachEAPath, vm.ArchFIPS, false, true)
 		if err != nil {
@@ -1939,7 +1946,11 @@ func (c *clusterImpl) PutE(
 func (c *clusterImpl) PutCockroach(ctx context.Context, l *logger.Logger, t *testImpl) error {
 	if roachtestflags.CockroachStage != "" {
 		// Use staging instead of upload when --cockroach-stage is specified
-		return c.Stage(ctx, l, "cockroach", roachtestflags.CockroachStage, ".", c.All())
+		stageVersion := roachtestflags.CockroachStage
+		if stageVersion == "latest" {
+			stageVersion = "" // Stage() expects empty string for latest
+		}
+		return c.Stage(ctx, l, "cockroach", stageVersion, ".", c.All())
 	}
 	return c.PutE(ctx, l, t.Cockroach(), test.DefaultCockroachPath, c.All())
 }

pkg/cmd/roachtest/main.go

Lines changed: 0 additions & 4 deletions
@@ -581,8 +581,4 @@ func validateAndConfigure(cmd *cobra.Command, args []string) {
 		printErrAndExit(fmt.Errorf("--cockroach and --cockroach-stage are mutually exclusive. Use one or the other"))
 	}
 
-	// Normalize "latest" to empty string for staging system
-	if roachtestflags.CockroachStage == "latest" {
-		roachtestflags.CockroachStage = ""
-	}
 }
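
The removed normalization did not disappear; it moved into PutCockroach in cluster.go above. Keeping roachtestflags.CockroachStage untouched at flag-validation time means the new CockroachStage == "" checks in initBinariesAndLibraries still recognize that --cockroach-stage latest asked for staging; only the version string handed to Stage() is normalized. A minimal standalone sketch of that decision (stageVersion here is an illustrative helper, not roachtest code):

    // stageVersion mirrors the normalization now done inside PutCockroach:
    // the user-facing flag keeps "latest"; only the argument passed to
    // Stage() becomes the empty string, which Stage() treats as "latest".
    package main

    import "fmt"

    func stageVersion(cockroachStageFlag string) string {
    	if cockroachStageFlag == "latest" {
    		return ""
    	}
    	return cockroachStageFlag
    }

    func main() {
    	fmt.Println(stageVersion("latest") == "") // true: staged binary resolves to latest
    	fmt.Println(stageVersion("v25.2.0"))      // an illustrative pinned version passes through unchanged
    }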

pkg/cmd/roachtest/tests/cluster_to_cluster.go

Lines changed: 72 additions & 1 deletion
@@ -398,6 +398,8 @@ type replicateBulkOps struct {
 
 	// debugSkipRollback skips all rollback steps during the test.
 	debugSkipRollback bool
+
+	withSettings []struct{ setting, value string }
 }
 
 func (bo replicateBulkOps) sourceInitCmd(tenantName string, nodes option.NodeListOption) string {
@@ -411,6 +413,18 @@ func (bo replicateBulkOps) sourceRunCmd(tenantName string, nodes option.NodeList
 func (bo replicateBulkOps) runDriver(
 	workloadCtx context.Context, c cluster.Cluster, t test.Test, setup *c2cSetup,
 ) error {
+	mainTenantConn := c.Conn(workloadCtx, t.L(), 1, option.VirtualClusterName(setup.src.name))
+	for _, pair := range bo.withSettings {
+		settingStmt := fmt.Sprintf("SET CLUSTER SETTING %s = '%s'", pair.setting, pair.value)
+		t.L().Printf("Setting on sys/main/standby-sys: %s", settingStmt)
+		setup.src.sysSQL.Exec(t, settingStmt)
+		// PCR settings are system-only; assume others are app-level.
+		if !strings.Contains(pair.setting, "physical_replication") {
+			if _, err := mainTenantConn.ExecContext(workloadCtx, settingStmt); err != nil {
+				return err
+			}
+		}
+	}
 	runBackupMVCCRangeTombstones(workloadCtx, t, c, mvccRangeTombstoneConfig{
 		skipBackupRestore: true,
 		skipClusterSetup: true,
@@ -1529,7 +1543,7 @@ func registerClusterToCluster(r registry.Registry) {
 			suites: registry.Suites(registry.Nightly),
 		},
 		{
-			name: "c2c/BulkOps",
+			name: "c2c/BulkOps/settings=none",
 			srcNodes: 4,
 			dstNodes: 4,
 			cpus: 8,
@@ -1553,6 +1567,63 @@ func registerClusterToCluster(r registry.Registry) {
 			clouds: registry.OnlyGCE,
 			suites: registry.Suites(registry.Nightly),
 		},
+		{
+			name: "c2c/BulkOps/settings=ac-import",
+			srcNodes: 4,
+			dstNodes: 4,
+			cpus: 8,
+			pdSize: 100,
+			workload: replicateBulkOps{withSettings: []struct{ setting, value string }{
+				{"bulkio.import.elastic_control.enabled", "true"},
+				{"bulkio.elastic_cpu_control.request_duration", "3ms"},
+			}},
+			timeout: 2 * time.Hour,
+			additionalDuration: 0,
+			// Cutover currently takes around 4 minutes, perhaps because we need to
+			// revert 10 GB of replicated data.
+			//
+			// TODO(msbutler): investigate further if cutover can be sped up.
+			cutoverTimeout: 20 * time.Minute,
+			cutover: 5 * time.Minute,
+			// In a few ad hoc runs, the max latency hikes up to 27 minutes before lag
+			// replanning and distributed catch up scans fix the poor initial plan. If
+			// max accepted latency doubles, then there's likely a regression.
+			maxAcceptedLatency: 1 * time.Hour,
+			// Skipping node distribution check because there is little data on the
+			// source when the replication stream begins.
+			skipNodeDistributionCheck: true,
+			clouds: registry.OnlyGCE,
+			suites: registry.Suites(registry.Nightly),
+		},
+		{
+			name: "c2c/BulkOps/settings=ac-and-splits",
+			srcNodes: 4,
+			dstNodes: 4,
+			cpus: 8,
+			pdSize: 100,
+			workload: replicateBulkOps{withSettings: []struct{ setting, value string }{
+				{"bulkio.import.elastic_control.enabled", "true"},
+				{"bulkio.elastic_cpu_control.request_duration", "3ms"},
+				{"physical_replication.consumer.ingest_split_event.enabled", "true"},
+			}},
+			timeout: 2 * time.Hour,
+			additionalDuration: 0,
+			// Cutover currently takes around 4 minutes, perhaps because we need to
+			// revert 10 GB of replicated data.
+			//
+			// TODO(msbutler): investigate further if cutover can be sped up.
+			cutoverTimeout: 20 * time.Minute,
+			cutover: 5 * time.Minute,
+			// In a few ad hoc runs, the max latency hikes up to 27 minutes before lag
+			// replanning and distributed catch up scans fix the poor initial plan. If
+			// max accepted latency doubles, then there's likely a regression.
+			maxAcceptedLatency: 1 * time.Hour,
+			// Skipping node distribution check because there is little data on the
+			// source when the replication stream begins.
+			skipNodeDistributionCheck: true,
+			clouds: registry.OnlyGCE,
+			suites: registry.Suites(registry.Nightly),
+		},
 		{
 			name: "c2c/BulkOps/singleImport",
 			srcNodes: 4,
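
The new variants differ only in their withSettings lists; runDriver turns each pair into a SET CLUSTER SETTING statement that always runs on the source system tenant and, for settings whose name does not contain "physical_replication", on the application tenant as well. A minimal standalone sketch of that application logic (applySettings, sysDB, and appDB are illustrative stand-ins for the test-harness plumbing, not part of this commit):

    // Package c2csettings sketches how a list of (setting, value) pairs is
    // applied to the system tenant and, where appropriate, the app tenant.
    package c2csettings

    import (
    	"database/sql"
    	"fmt"
    	"strings"
    )

    func applySettings(sysDB, appDB *sql.DB, settings [][2]string) error {
    	for _, pair := range settings {
    		stmt := fmt.Sprintf("SET CLUSTER SETTING %s = '%s'", pair[0], pair[1])
    		// Always set on the system tenant.
    		if _, err := sysDB.Exec(stmt); err != nil {
    			return err
    		}
    		// PCR settings are system-only; everything else is also set on the app tenant.
    		if !strings.Contains(pair[0], "physical_replication") {
    			if _, err := appDB.Exec(stmt); err != nil {
    				return err
    			}
    		}
    	}
    	return nil
    }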

pkg/cmd/roachtest/tests/latency_verifier.go

Lines changed: 7 additions & 3 deletions
@@ -44,6 +44,7 @@ type latencyVerifier struct {
 	catchupScanEveryN roachtestutil.EveryN
 
 	maxSeenSteadyLatency time.Duration
+	tooLargeEveryN roachtestutil.EveryN
 	maxSeenSteadyEveryN roachtestutil.EveryN
 	latencyBecameSteady bool
 
@@ -74,8 +75,9 @@ func makeLatencyVerifier(
 		setTestStatus: setTestStatus,
 		latencyHist: hist,
 		tolerateErrors: tolerateErrors,
-		maxSeenSteadyEveryN: roachtestutil.Every(10 * time.Second),
-		catchupScanEveryN: roachtestutil.Every(2 * time.Second),
+		tooLargeEveryN: roachtestutil.Every(120 * time.Second),
+		maxSeenSteadyEveryN: roachtestutil.Every(30 * time.Second),
+		catchupScanEveryN: roachtestutil.Every(10 * time.Second),
 	}
 }
 
@@ -132,7 +134,9 @@ func (lv *latencyVerifier) noteHighwater(highwaterTime time.Time) {
 		return
 	}
 	if err := lv.latencyHist.RecordValue(latency.Nanoseconds()); err != nil {
-		lv.logger.Printf("%s: could not record value %s: %s\n", lv.name, latency, err)
+		if lv.tooLargeEveryN.ShouldLog() {
+			lv.logger.Printf("%s: could not record value %s: %s\n", lv.name, latency, err)
+		}
 	}
 	if latency > lv.maxSeenSteadyLatency {
 		lv.maxSeenSteadyLatency = latency
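
The tooLargeEveryN field applies the same throttling pattern as the other EveryN fields: a message is emitted at most once per interval, which matters here because RecordValue can fail on every highwater update (for example, once latency exceeds the histogram's configured range). A hand-rolled stand-in for how such a limiter behaves (illustrative only; the tests use roachtestutil.Every, whose implementation may differ):

    // everyN is a simplified stand-in for roachtestutil.EveryN: ShouldLog
    // returns true at most once per interval n.
    package main

    import (
    	"fmt"
    	"time"
    )

    type everyN struct {
    	n    time.Duration
    	last time.Time
    }

    func (e *everyN) ShouldLog() bool {
    	now := time.Now()
    	if now.Sub(e.last) >= e.n {
    		e.last = now
    		return true
    	}
    	return false
    }

    func main() {
    	tooLarge := everyN{n: 2 * time.Second}
    	for i := 0; i < 5; i++ {
    		if tooLarge.ShouldLog() {
    			fmt.Println("could not record value: out of histogram range") // logged at most once per 2s
    		}
    		time.Sleep(time.Second)
    	}
    }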

pkg/kv/kvserver/allocator/allocatorimpl/allocator.go

Lines changed: 1 addition & 5 deletions
@@ -2389,7 +2389,6 @@ func (a *Allocator) TransferLeaseTarget(
 		SendStreamStats(*rac2.RangeSendStreamStats)
 	},
 	usageInfo allocator.RangeUsageInfo,
-	forceDecisionWithoutStats bool,
 	opts allocator.TransferLeaseOptions,
 ) roachpb.ReplicaDescriptor {
 	if a.knobs != nil {
@@ -2443,10 +2442,7 @@ func (a *Allocator) TransferLeaseTarget(
 	if !excludeLeaseRepl {
 		switch transferDec {
 		case shouldNotTransfer:
-			if !forceDecisionWithoutStats {
-				return roachpb.ReplicaDescriptor{}
-			}
-			fallthrough
+			return roachpb.ReplicaDescriptor{}
 		case decideWithoutStats:
 			if !a.shouldTransferLeaseForLeaseCountConvergence(ctx, storePool, sl, source, validTargets) {
 				return roachpb.ReplicaDescriptor{}

pkg/kv/kvserver/allocator/allocatorimpl/allocator_test.go

Lines changed: 0 additions & 11 deletions
@@ -2052,7 +2052,6 @@ func TestAllocatorTransferLeaseTarget(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{}, /* stats */
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: c.excludeLeaseRepl,
 				CheckCandidateFullness: true,
@@ -2184,7 +2183,6 @@ func TestAllocatorTransferLeaseTargetIOOverloadCheck(t *testing.T) {
 				storeID: tc.leaseholder,
 			},
 			allocator.RangeUsageInfo{}, /* stats */
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				CheckCandidateFullness: true,
 			},
@@ -2297,7 +2295,6 @@ func TestAllocatorTransferLeaseToReplicasNeedingSnapshot(t *testing.T) {
 			c.existing,
 			repl,
 			allocator.RangeUsageInfo{},
-			false, /* alwaysAllowDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: c.excludeLeaseRepl,
 				CheckCandidateFullness: true,
@@ -2441,7 +2438,6 @@ func TestAllocatorTransferLeaseToReplicasNeedingCatchup(t *testing.T) {
 			c.existing,
 			repl,
 			allocator.RangeUsageInfo{},
-			false, /* alwaysAllowDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: c.excludeLeaseRepl,
 				CheckCandidateFullness: true,
@@ -2537,7 +2533,6 @@ func TestAllocatorTransferLeaseTargetConstraints(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{},
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: false,
 				CheckCandidateFullness: true,
@@ -2650,7 +2645,6 @@ func TestAllocatorTransferLeaseTargetDraining(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{},
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: c.excludeLeaseRepl,
 				CheckCandidateFullness: true,
@@ -3312,7 +3306,6 @@ func TestAllocatorLeasePreferences(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{},
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: false,
 				CheckCandidateFullness: true,
@@ -3332,7 +3325,6 @@ func TestAllocatorLeasePreferences(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{},
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: true,
 				CheckCandidateFullness: true,
@@ -3426,7 +3418,6 @@ func TestAllocatorLeasePreferencesMultipleStoresPerLocality(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{},
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: false,
 				CheckCandidateFullness: true,
@@ -3447,7 +3438,6 @@ func TestAllocatorLeasePreferencesMultipleStoresPerLocality(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			allocator.RangeUsageInfo{},
-			false, /* forceDecisionWithoutStats */
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: true,
 				CheckCandidateFullness: true,
@@ -6084,7 +6074,6 @@ func TestAllocatorTransferLeaseTargetLoadBased(t *testing.T) {
 				storeID: c.leaseholder,
 			},
 			usage,
-			false,
 			allocator.TransferLeaseOptions{
 				ExcludeLeaseRepl: c.excludeLeaseRepl,
 				CheckCandidateFullness: true,

pkg/kv/kvserver/allocator/plan/lease.go

Lines changed: 0 additions & 1 deletion
@@ -142,7 +142,6 @@ func (lp LeasePlanner) PlanOneChange(
 		existingVoters,
 		repl,
 		usage,
-		false, /* forceDecisionWithoutStats */
 		allocator.TransferLeaseOptions{
 			Goal: allocator.FollowTheWorkload,
 			ExcludeLeaseRepl: false,

pkg/kv/kvserver/allocator/plan/replicate.go

Lines changed: 0 additions & 1 deletion
@@ -900,7 +900,6 @@ func (rp ReplicaPlanner) maybeTransferLeaseAwayTarget(
 		desc.Replicas().VoterDescriptors(),
 		repl,
 		usageInfo,
-		false, /* forceDecisionWithoutStats */
 		allocator.TransferLeaseOptions{
 			Goal: allocator.LeaseCountConvergence,
 			// NB: This option means that the allocator is asked to not consider the

pkg/kv/kvserver/replicate_queue.go

Lines changed: 0 additions & 1 deletion
@@ -1051,7 +1051,6 @@ func (rq *replicateQueue) shedLease(
 		desc.Replicas().VoterDescriptors(),
 		repl,
 		rangeUsageInfo,
-		false, /* forceDecisionWithoutStats */
 		opts,
 	)
 	if targetDesc == (roachpb.ReplicaDescriptor{}) {

pkg/kv/kvserver/store_rebalancer.go

Lines changed: 0 additions & 1 deletion
@@ -814,7 +814,6 @@ func (sr *StoreRebalancer) chooseLeaseToTransfer(
 		candidates,
 		candidateReplica,
 		candidateReplica.RangeUsageInfo(),
-		true, /* forceDecisionWithoutStats */
 		allocator.TransferLeaseOptions{
 			Goal: allocator.LoadConvergence,
 			ExcludeLeaseRepl: false,
