
Commit a8bdebf

roachtest: pass monitor via makeFailer in failover tests
Previously, we propagated the cluster monitor via `Failer.Ready()`, since the cluster hadn't been started yet when we constructed the failer (it might have to e.g. mess with the disk setup first). However, if `Failer.Cleanup()` relied on the monitor, and the test failed before `Ready()` was called, it could result in a nil pointer panic.

It turns out the monitor doesn't actually monitor anything until we call `Wait()` on it, so we can safely construct it before starting the cluster. This patch therefore propagates the monitor via `makeFailer()` instead, such that it's never unset.

Epic: none

Release note: None
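In a minimal sketch, the new wiring in each failover test looks like the following (abridged; `ctx`, `t`, `c`, `failureMode`, `opts`, `settings`, and `rng` come from the surrounding test function, and the node range shown is illustrative):

	// The monitor does nothing until Wait() is called, so it is safe to
	// construct it before the cluster has been started.
	m := c.NewMonitor(ctx, c.Range(1, 6))

	// The monitor is handed over at construction time, so Cleanup() can rely
	// on it even if the test fails before Ready() is reached.
	failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
	failer.Setup(ctx)
	defer failer.Cleanup(ctx)

	c.Put(ctx, t.Cockroach(), "./cockroach")
	c.Start(ctx, t.L(), opts, settings, c.Range(1, 6))

	// Ready() no longer takes the monitor.
	failer.Ready(ctx)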
1 parent a416398 commit a8bdebf

File tree

1 file changed: +51 −44 lines

pkg/cmd/roachtest/tests/failover.go

Lines changed: 51 additions & 44 deletions
@@ -194,9 +194,11 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
 	settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
+	m := c.NewMonitor(ctx, c.Range(1, 9))
+
 	failers := []Failer{}
 	for _, failureMode := range allFailureModes {
-		failer := makeFailerWithoutLocalNoop(t, c, failureMode, opts, settings, rng)
+		failer := makeFailerWithoutLocalNoop(t, c, m, failureMode, opts, settings, rng)
 		if c.IsLocal() && !failer.CanUseLocal() {
 			t.L().Printf("skipping failure mode %q on local cluster", failureMode)
 			continue
@@ -209,7 +211,6 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
 	c.Put(ctx, t.Cockroach(), "./cockroach")
 	c.Start(ctx, t.L(), opts, settings, c.Range(1, 9))
 
-	m := c.NewMonitor(ctx, c.Range(1, 9))
 	conn := c.Conn(ctx, t.L(), 1)
 
 	// Place 5 replicas of all ranges on n3-n9, keeping n1-n2 as SQL gateways.
@@ -285,7 +286,7 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
 				d.numReplicas = 1 + rng.Intn(5)
 				d.onlyLeaseholders = rng.Float64() < 0.5
 			}
-			failer.Ready(ctx, m)
+			failer.Ready(ctx)
 			nodeFailers[node] = failer
 		}
 
@@ -367,14 +368,15 @@ func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster.
 	settings := install.MakeClusterSettings()
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
-	failer := makeFailer(t, c, failureModeBlackhole, opts, settings, rng).(PartialFailer)
+	m := c.NewMonitor(ctx, c.Range(1, 7))
+
+	failer := makeFailer(t, c, m, failureModeBlackhole, opts, settings, rng).(PartialFailer)
 	failer.Setup(ctx)
 	defer failer.Cleanup(ctx)
 
 	c.Put(ctx, t.Cockroach(), "./cockroach")
 	c.Start(ctx, t.L(), opts, settings, c.Range(1, 7))
 
-	m := c.NewMonitor(ctx, c.Range(1, 7))
 	conn := c.Conn(ctx, t.L(), 1)
 
 	// Place all ranges on n1-n3 to start with.
@@ -439,7 +441,7 @@ func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster.
 	for _, tc := range testcases {
 		sleepFor(ctx, t, time.Minute)
 
-		failer.Ready(ctx, m)
+		failer.Ready(ctx)
 
 		// Ranges and leases may occasionally escape their constraints. Move
 		// them to where they should be.
@@ -499,7 +501,9 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
 	settings.Env = append(settings.Env, "COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER=true")
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
-	failer := makeFailer(t, c, failureModeBlackhole, opts, settings, rng).(PartialFailer)
+	m := c.NewMonitor(ctx, c.Range(1, 6))
+
+	failer := makeFailer(t, c, m, failureModeBlackhole, opts, settings, rng).(PartialFailer)
 	failer.Setup(ctx)
 	defer failer.Cleanup(ctx)
 
@@ -520,7 +524,6 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
 
 	// Now that system ranges are properly placed on n1-n3, start n4-n6.
 	c.Start(ctx, t.L(), opts, settings, c.Range(4, 6))
-	m := c.NewMonitor(ctx, c.Range(1, 6))
 
 	// Create the kv database on n4-n6.
 	t.L().Printf("creating workload database")
@@ -573,7 +576,7 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
 	for _, node := range []int{4, 5, 6} {
 		sleepFor(ctx, t, time.Minute)
 
-		failer.Ready(ctx, m)
+		failer.Ready(ctx)
 
 		// Ranges may occasionally escape their constraints. Move them to where
 		// they should be.
@@ -631,14 +634,15 @@ func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster
 	settings := install.MakeClusterSettings()
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
-	failer := makeFailer(t, c, failureModeBlackhole, opts, settings, rng).(PartialFailer)
+	m := c.NewMonitor(ctx, c.Range(1, 7))
+
+	failer := makeFailer(t, c, m, failureModeBlackhole, opts, settings, rng).(PartialFailer)
 	failer.Setup(ctx)
 	defer failer.Cleanup(ctx)
 
 	c.Put(ctx, t.Cockroach(), "./cockroach")
 	c.Start(ctx, t.L(), opts, settings, c.Range(1, 7))
 
-	m := c.NewMonitor(ctx, c.Range(1, 7))
 	conn := c.Conn(ctx, t.L(), 1)
 
 	// Place all ranges on n1-n3, and an extra liveness leaseholder replica on n4.
@@ -688,7 +692,7 @@ func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster
 	for _, node := range []int{5, 6, 7} {
 		sleepFor(ctx, t, time.Minute)
 
-		failer.Ready(ctx, m)
+		failer.Ready(ctx)
 
 		// Ranges and leases may occasionally escape their constraints. Move
 		// them to where they should be.
@@ -747,14 +751,15 @@ func runFailoverNonSystem(
 	settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
-	failer := makeFailer(t, c, failureMode, opts, settings, rng)
+	m := c.NewMonitor(ctx, c.Range(1, 6))
+
+	failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
 	failer.Setup(ctx)
 	defer failer.Cleanup(ctx)
 
 	c.Put(ctx, t.Cockroach(), "./cockroach")
 	c.Start(ctx, t.L(), opts, settings, c.Range(1, 6))
 
-	m := c.NewMonitor(ctx, c.Range(1, 6))
 	conn := c.Conn(ctx, t.L(), 1)
 
 	// Constrain all existing zone configs to n1-n3.
@@ -797,7 +802,7 @@ func runFailoverNonSystem(
 	for _, node := range []int{4, 5, 6} {
 		sleepFor(ctx, t, time.Minute)
 
-		failer.Ready(ctx, m)
+		failer.Ready(ctx)
 
 		// Ranges may occasionally escape their constraints. Move them
 		// to where they should be.
@@ -856,14 +861,15 @@ func runFailoverLiveness(
 	settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
-	failer := makeFailer(t, c, failureMode, opts, settings, rng)
+	m := c.NewMonitor(ctx, c.Range(1, 4))
+
+	failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
 	failer.Setup(ctx)
 	defer failer.Cleanup(ctx)
 
 	c.Put(ctx, t.Cockroach(), "./cockroach")
 	c.Start(ctx, t.L(), opts, settings, c.Range(1, 4))
 
-	m := c.NewMonitor(ctx, c.Range(1, 4))
 	conn := c.Conn(ctx, t.L(), 1)
 
 	// Constrain all existing zone configs to n1-n3.
@@ -911,7 +917,7 @@ func runFailoverLiveness(
 	for i := 0; i < 9; i++ {
 		sleepFor(ctx, t, time.Minute)
 
-		failer.Ready(ctx, m)
+		failer.Ready(ctx)
 
 		// Ranges and leases may occasionally escape their constraints. Move them
 		// to where they should be.
@@ -969,14 +975,15 @@ func runFailoverSystemNonLiveness(
 	settings.Env = append(settings.Env, "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true")
 	settings.Env = append(settings.Env, "COCKROACH_SCAN_MAX_IDLE_TIME=100ms") // speed up replication
 
-	failer := makeFailer(t, c, failureMode, opts, settings, rng)
+	m := c.NewMonitor(ctx, c.Range(1, 6))
+
+	failer := makeFailer(t, c, m, failureMode, opts, settings, rng)
 	failer.Setup(ctx)
 	defer failer.Cleanup(ctx)
 
 	c.Put(ctx, t.Cockroach(), "./cockroach")
 	c.Start(ctx, t.L(), opts, settings, c.Range(1, 6))
 
-	m := c.NewMonitor(ctx, c.Range(1, 6))
 	conn := c.Conn(ctx, t.L(), 1)
 
 	// Constrain all existing zone configs to n4-n6, except liveness which is
@@ -1024,7 +1031,7 @@ func runFailoverSystemNonLiveness(
 	for _, node := range []int{4, 5, 6} {
 		sleepFor(ctx, t, time.Minute)
 
-		failer.Ready(ctx, m)
+		failer.Ready(ctx)
 
 		// Ranges may occasionally escape their constraints. Move them
 		// to where they should be.
@@ -1081,12 +1088,13 @@ var allFailureModes = []failureMode{
 func makeFailer(
 	t test.Test,
 	c cluster.Cluster,
+	m cluster.Monitor,
 	failureMode failureMode,
 	opts option.StartOpts,
 	settings install.ClusterSettings,
 	rng *rand.Rand,
 ) Failer {
-	f := makeFailerWithoutLocalNoop(t, c, failureMode, opts, settings, rng)
+	f := makeFailerWithoutLocalNoop(t, c, m, failureMode, opts, settings, rng)
 	if c.IsLocal() && !f.CanUseLocal() {
 		t.L().Printf(
 			`failure mode %q not supported on local clusters, using "noop" failure mode instead`,
@@ -1099,6 +1107,7 @@ func makeFailer(
 func makeFailerWithoutLocalNoop(
 	t test.Test,
 	c cluster.Cluster,
+	m cluster.Monitor,
 	failureMode failureMode,
 	opts option.StartOpts,
 	settings install.ClusterSettings,
@@ -1128,13 +1137,15 @@ func makeFailerWithoutLocalNoop(
 		return &crashFailer{
 			t:             t,
 			c:             c,
+			m:             m,
 			startOpts:     opts,
 			startSettings: settings,
 		}
 	case failureModeDeadlock:
 		return &deadlockFailer{
 			t:             t,
 			c:             c,
+			m:             m,
 			rng:           rng,
 			startOpts:     opts,
 			startSettings: settings,
@@ -1145,6 +1156,7 @@ func makeFailerWithoutLocalNoop(
 		return &diskStallFailer{
 			t:             t,
 			c:             c,
+			m:             m,
 			startOpts:     opts,
 			startSettings: settings,
 			staller:       &dmsetupDiskStaller{t: t, c: c},
@@ -1182,7 +1194,7 @@ type Failer interface {
 
 	// Ready is called some time before failing each node, when the cluster and
 	// workload is running and after recovering the previous node failure if any.
-	Ready(ctx context.Context, m cluster.Monitor)
+	Ready(ctx context.Context)
 
 	// Cleanup cleans up when the test exits. This is needed e.g. when the cluster
 	// is reused by a different test.
@@ -1211,7 +1223,7 @@ func (f *noopFailer) String() string { return string(f.
 func (f *noopFailer) CanUseLocal() bool { return true }
 func (f *noopFailer) CanRunWith(failureMode) bool { return true }
 func (f *noopFailer) Setup(context.Context) {}
-func (f *noopFailer) Ready(context.Context, cluster.Monitor) {}
+func (f *noopFailer) Ready(context.Context) {}
 func (f *noopFailer) Cleanup(context.Context) {}
 func (f *noopFailer) Fail(context.Context, int) {}
 func (f *noopFailer) FailPartial(context.Context, int, []int) {}
@@ -1239,11 +1251,11 @@ func (f *blackholeFailer) Mode() failureMode {
 	return failureModeBlackhole
 }
 
-func (f *blackholeFailer) String() string                         { return string(f.Mode()) }
-func (f *blackholeFailer) CanUseLocal() bool                      { return false } // needs iptables
-func (f *blackholeFailer) CanRunWith(failureMode) bool            { return true }
-func (f *blackholeFailer) Setup(context.Context)                  {}
-func (f *blackholeFailer) Ready(context.Context, cluster.Monitor) {}
+func (f *blackholeFailer) String() string              { return string(f.Mode()) }
+func (f *blackholeFailer) CanUseLocal() bool           { return false } // needs iptables
+func (f *blackholeFailer) CanRunWith(failureMode) bool { return true }
+func (f *blackholeFailer) Setup(context.Context)       {}
+func (f *blackholeFailer) Ready(context.Context)       {}
 
 func (f *blackholeFailer) Cleanup(ctx context.Context) {
 	f.c.Run(ctx, f.c.All(), `sudo iptables -F`)
@@ -1324,13 +1336,13 @@ type crashFailer struct {
 	startSettings install.ClusterSettings
 }
 
-func (f *crashFailer) Mode() failureMode                          { return failureModeCrash }
-func (f *crashFailer) String() string                             { return string(f.Mode()) }
-func (f *crashFailer) CanUseLocal() bool                          { return true }
-func (f *crashFailer) CanRunWith(failureMode) bool                { return true }
-func (f *crashFailer) Setup(_ context.Context)                    {}
-func (f *crashFailer) Ready(_ context.Context, m cluster.Monitor) { f.m = m }
-func (f *crashFailer) Cleanup(_ context.Context)                  {}
+func (f *crashFailer) Mode() failureMode           { return failureModeCrash }
+func (f *crashFailer) String() string              { return string(f.Mode()) }
+func (f *crashFailer) CanUseLocal() bool           { return true }
+func (f *crashFailer) CanRunWith(failureMode) bool { return true }
+func (f *crashFailer) Setup(context.Context)       {}
+func (f *crashFailer) Ready(context.Context)       {}
+func (f *crashFailer) Cleanup(context.Context)     {}
 
 func (f *crashFailer) Fail(ctx context.Context, nodeID int) {
 	f.m.ExpectDeath()
@@ -1366,9 +1378,7 @@ func (f *deadlockFailer) CanRunWith(m failureMode) bool { return true }
 func (f *deadlockFailer) Setup(context.Context)   {}
 func (f *deadlockFailer) Cleanup(context.Context) {}
 
-func (f *deadlockFailer) Ready(ctx context.Context, m cluster.Monitor) {
-	f.m = m
-
+func (f *deadlockFailer) Ready(ctx context.Context) {
 	// In chaos tests, other nodes will be failing concurrently. We therefore
 	// can't run SHOW CLUSTER RANGES WITH DETAILS in Fail(), since it needs to
 	// read from all ranges. Instead, we fetch a snapshot of replicas and leases
@@ -1483,15 +1493,12 @@ func (f *diskStallFailer) Mode() failureMode { return failureModeDiskS
 func (f *diskStallFailer) String() string { return string(f.Mode()) }
 func (f *diskStallFailer) CanUseLocal() bool { return false } // needs dmsetup
 func (f *diskStallFailer) CanRunWith(failureMode) bool { return true }
+func (f *diskStallFailer) Ready(context.Context) {}
 
 func (f *diskStallFailer) Setup(ctx context.Context) {
 	f.staller.Setup(ctx)
 }
 
-func (f *diskStallFailer) Ready(_ context.Context, m cluster.Monitor) {
-	f.m = m
-}
-
 func (f *diskStallFailer) Cleanup(ctx context.Context) {
 	f.staller.Unstall(ctx, f.c.All())
 	// We have to stop the cluster before cleaning up the staller.
@@ -1533,7 +1540,7 @@ func (f *pauseFailer) CanRunWith(other failureMode) bool {
 	return other != failureModeDiskStall
 }
 
-func (f *pauseFailer) Ready(ctx context.Context, _ cluster.Monitor) {
+func (f *pauseFailer) Ready(ctx context.Context) {
 	// The process pause can trip the disk stall detector, so we disable it. We
 	// could let it fire, but we'd like to see if the node can recover from the
 	// pause and keep working.
