@@ -194,9 +194,11 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
194194 settings .Env = append (settings .Env , "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true" )
195195 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
196196
197+ m := c .NewMonitor (ctx , c .Range (1 , 9 ))
198+
197199 failers := []Failer {}
198200 for _ , failureMode := range allFailureModes {
199- failer := makeFailerWithoutLocalNoop (t , c , failureMode , opts , settings , rng )
201+ failer := makeFailerWithoutLocalNoop (t , c , m , failureMode , opts , settings , rng )
200202 if c .IsLocal () && ! failer .CanUseLocal () {
201203 t .L ().Printf ("skipping failure mode %q on local cluster" , failureMode )
202204 continue
@@ -209,7 +211,6 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
209211 c .Put (ctx , t .Cockroach (), "./cockroach" )
210212 c .Start (ctx , t .L (), opts , settings , c .Range (1 , 9 ))
211213
212- m := c .NewMonitor (ctx , c .Range (1 , 9 ))
213214 conn := c .Conn (ctx , t .L (), 1 )
214215
215216 // Place 5 replicas of all ranges on n3-n9, keeping n1-n2 as SQL gateways.
@@ -285,7 +286,7 @@ func runFailoverChaos(ctx context.Context, t test.Test, c cluster.Cluster, readO
285286 d .numReplicas = 1 + rng .Intn (5 )
286287 d .onlyLeaseholders = rng .Float64 () < 0.5
287288 }
288- failer .Ready (ctx , m )
289+ failer .Ready (ctx )
289290 nodeFailers [node ] = failer
290291 }
291292
@@ -367,14 +368,15 @@ func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster.
367368 settings := install .MakeClusterSettings ()
368369 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
369370
370- failer := makeFailer (t , c , failureModeBlackhole , opts , settings , rng ).(PartialFailer )
371+ m := c .NewMonitor (ctx , c .Range (1 , 7 ))
372+
373+ failer := makeFailer (t , c , m , failureModeBlackhole , opts , settings , rng ).(PartialFailer )
371374 failer .Setup (ctx )
372375 defer failer .Cleanup (ctx )
373376
374377 c .Put (ctx , t .Cockroach (), "./cockroach" )
375378 c .Start (ctx , t .L (), opts , settings , c .Range (1 , 7 ))
376379
377- m := c .NewMonitor (ctx , c .Range (1 , 7 ))
378380 conn := c .Conn (ctx , t .L (), 1 )
379381
380382 // Place all ranges on n1-n3 to start with.
@@ -439,7 +441,7 @@ func runFailoverPartialLeaseGateway(ctx context.Context, t test.Test, c cluster.
439441 for _ , tc := range testcases {
440442 sleepFor (ctx , t , time .Minute )
441443
442- failer .Ready (ctx , m )
444+ failer .Ready (ctx )
443445
444446 // Ranges and leases may occasionally escape their constraints. Move
445447 // them to where they should be.
@@ -499,7 +501,9 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
499501 settings .Env = append (settings .Env , "COCKROACH_DISABLE_LEADER_FOLLOWS_LEASEHOLDER=true" )
500502 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
501503
502- failer := makeFailer (t , c , failureModeBlackhole , opts , settings , rng ).(PartialFailer )
504+ m := c .NewMonitor (ctx , c .Range (1 , 6 ))
505+
506+ failer := makeFailer (t , c , m , failureModeBlackhole , opts , settings , rng ).(PartialFailer )
503507 failer .Setup (ctx )
504508 defer failer .Cleanup (ctx )
505509
@@ -520,7 +524,6 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
520524
521525 // Now that system ranges are properly placed on n1-n3, start n4-n6.
522526 c .Start (ctx , t .L (), opts , settings , c .Range (4 , 6 ))
523- m := c .NewMonitor (ctx , c .Range (1 , 6 ))
524527
525528 // Create the kv database on n4-n6.
526529 t .L ().Printf ("creating workload database" )
@@ -573,7 +576,7 @@ func runFailoverPartialLeaseLeader(ctx context.Context, t test.Test, c cluster.C
573576 for _ , node := range []int {4 , 5 , 6 } {
574577 sleepFor (ctx , t , time .Minute )
575578
576- failer .Ready (ctx , m )
579+ failer .Ready (ctx )
577580
578581 // Ranges may occasionally escape their constraints. Move them to where
579582 // they should be.
@@ -631,14 +634,15 @@ func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster
631634 settings := install .MakeClusterSettings ()
632635 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
633636
634- failer := makeFailer (t , c , failureModeBlackhole , opts , settings , rng ).(PartialFailer )
637+ m := c .NewMonitor (ctx , c .Range (1 , 7 ))
638+
639+ failer := makeFailer (t , c , m , failureModeBlackhole , opts , settings , rng ).(PartialFailer )
635640 failer .Setup (ctx )
636641 defer failer .Cleanup (ctx )
637642
638643 c .Put (ctx , t .Cockroach (), "./cockroach" )
639644 c .Start (ctx , t .L (), opts , settings , c .Range (1 , 7 ))
640645
641- m := c .NewMonitor (ctx , c .Range (1 , 7 ))
642646 conn := c .Conn (ctx , t .L (), 1 )
643647
644648 // Place all ranges on n1-n3, and an extra liveness leaseholder replica on n4.
@@ -688,7 +692,7 @@ func runFailoverPartialLeaseLiveness(ctx context.Context, t test.Test, c cluster
688692 for _ , node := range []int {5 , 6 , 7 } {
689693 sleepFor (ctx , t , time .Minute )
690694
691- failer .Ready (ctx , m )
695+ failer .Ready (ctx )
692696
693697 // Ranges and leases may occasionally escape their constraints. Move
694698 // them to where they should be.
@@ -747,14 +751,15 @@ func runFailoverNonSystem(
747751 settings .Env = append (settings .Env , "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true" )
748752 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
749753
750- failer := makeFailer (t , c , failureMode , opts , settings , rng )
754+ m := c .NewMonitor (ctx , c .Range (1 , 6 ))
755+
756+ failer := makeFailer (t , c , m , failureMode , opts , settings , rng )
751757 failer .Setup (ctx )
752758 defer failer .Cleanup (ctx )
753759
754760 c .Put (ctx , t .Cockroach (), "./cockroach" )
755761 c .Start (ctx , t .L (), opts , settings , c .Range (1 , 6 ))
756762
757- m := c .NewMonitor (ctx , c .Range (1 , 6 ))
758763 conn := c .Conn (ctx , t .L (), 1 )
759764
760765 // Constrain all existing zone configs to n1-n3.
@@ -797,7 +802,7 @@ func runFailoverNonSystem(
797802 for _ , node := range []int {4 , 5 , 6 } {
798803 sleepFor (ctx , t , time .Minute )
799804
800- failer .Ready (ctx , m )
805+ failer .Ready (ctx )
801806
802807 // Ranges may occasionally escape their constraints. Move them
803808 // to where they should be.
@@ -856,14 +861,15 @@ func runFailoverLiveness(
856861 settings .Env = append (settings .Env , "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true" )
857862 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
858863
859- failer := makeFailer (t , c , failureMode , opts , settings , rng )
864+ m := c .NewMonitor (ctx , c .Range (1 , 4 ))
865+
866+ failer := makeFailer (t , c , m , failureMode , opts , settings , rng )
860867 failer .Setup (ctx )
861868 defer failer .Cleanup (ctx )
862869
863870 c .Put (ctx , t .Cockroach (), "./cockroach" )
864871 c .Start (ctx , t .L (), opts , settings , c .Range (1 , 4 ))
865872
866- m := c .NewMonitor (ctx , c .Range (1 , 4 ))
867873 conn := c .Conn (ctx , t .L (), 1 )
868874
869875 // Constrain all existing zone configs to n1-n3.
@@ -911,7 +917,7 @@ func runFailoverLiveness(
911917 for i := 0 ; i < 9 ; i ++ {
912918 sleepFor (ctx , t , time .Minute )
913919
914- failer .Ready (ctx , m )
920+ failer .Ready (ctx )
915921
916922 // Ranges and leases may occasionally escape their constraints. Move them
917923 // to where they should be.
@@ -969,14 +975,15 @@ func runFailoverSystemNonLiveness(
969975 settings .Env = append (settings .Env , "COCKROACH_ENABLE_UNSAFE_TEST_BUILTINS=true" )
970976 settings .Env = append (settings .Env , "COCKROACH_SCAN_MAX_IDLE_TIME=100ms" ) // speed up replication
971977
972- failer := makeFailer (t , c , failureMode , opts , settings , rng )
978+ m := c .NewMonitor (ctx , c .Range (1 , 6 ))
979+
980+ failer := makeFailer (t , c , m , failureMode , opts , settings , rng )
973981 failer .Setup (ctx )
974982 defer failer .Cleanup (ctx )
975983
976984 c .Put (ctx , t .Cockroach (), "./cockroach" )
977985 c .Start (ctx , t .L (), opts , settings , c .Range (1 , 6 ))
978986
979- m := c .NewMonitor (ctx , c .Range (1 , 6 ))
980987 conn := c .Conn (ctx , t .L (), 1 )
981988
982989 // Constrain all existing zone configs to n4-n6, except liveness which is
@@ -1024,7 +1031,7 @@ func runFailoverSystemNonLiveness(
10241031 for _ , node := range []int {4 , 5 , 6 } {
10251032 sleepFor (ctx , t , time .Minute )
10261033
1027- failer .Ready (ctx , m )
1034+ failer .Ready (ctx )
10281035
10291036 // Ranges may occasionally escape their constraints. Move them
10301037 // to where they should be.
@@ -1081,12 +1088,13 @@ var allFailureModes = []failureMode{
10811088func makeFailer (
10821089 t test.Test ,
10831090 c cluster.Cluster ,
1091+ m cluster.Monitor ,
10841092 failureMode failureMode ,
10851093 opts option.StartOpts ,
10861094 settings install.ClusterSettings ,
10871095 rng * rand.Rand ,
10881096) Failer {
1089- f := makeFailerWithoutLocalNoop (t , c , failureMode , opts , settings , rng )
1097+ f := makeFailerWithoutLocalNoop (t , c , m , failureMode , opts , settings , rng )
10901098 if c .IsLocal () && ! f .CanUseLocal () {
10911099 t .L ().Printf (
10921100 `failure mode %q not supported on local clusters, using "noop" failure mode instead` ,
@@ -1099,6 +1107,7 @@ func makeFailer(
10991107func makeFailerWithoutLocalNoop (
11001108 t test.Test ,
11011109 c cluster.Cluster ,
1110+ m cluster.Monitor ,
11021111 failureMode failureMode ,
11031112 opts option.StartOpts ,
11041113 settings install.ClusterSettings ,
@@ -1128,13 +1137,15 @@ func makeFailerWithoutLocalNoop(
11281137 return & crashFailer {
11291138 t : t ,
11301139 c : c ,
1140+ m : m ,
11311141 startOpts : opts ,
11321142 startSettings : settings ,
11331143 }
11341144 case failureModeDeadlock :
11351145 return & deadlockFailer {
11361146 t : t ,
11371147 c : c ,
1148+ m : m ,
11381149 rng : rng ,
11391150 startOpts : opts ,
11401151 startSettings : settings ,
@@ -1145,6 +1156,7 @@ func makeFailerWithoutLocalNoop(
11451156 return & diskStallFailer {
11461157 t : t ,
11471158 c : c ,
1159+ m : m ,
11481160 startOpts : opts ,
11491161 startSettings : settings ,
11501162 staller : & dmsetupDiskStaller {t : t , c : c },
@@ -1182,7 +1194,7 @@ type Failer interface {
11821194
11831195 // Ready is called some time before failing each node, when the cluster and
11841196 // workload is running and after recovering the previous node failure if any.
1185- Ready (ctx context.Context , m cluster. Monitor )
1197+ Ready (ctx context.Context )
11861198
11871199 // Cleanup cleans up when the test exits. This is needed e.g. when the cluster
11881200 // is reused by a different test.
@@ -1211,7 +1223,7 @@ func (f *noopFailer) String() string { return string(f.
12111223func (f * noopFailer ) CanUseLocal () bool { return true }
12121224func (f * noopFailer ) CanRunWith (failureMode ) bool { return true }
12131225func (f * noopFailer ) Setup (context.Context ) {}
1214- func (f * noopFailer ) Ready (context.Context , cluster. Monitor ) {}
1226+ func (f * noopFailer ) Ready (context.Context ) {}
12151227func (f * noopFailer ) Cleanup (context.Context ) {}
12161228func (f * noopFailer ) Fail (context.Context , int ) {}
12171229func (f * noopFailer ) FailPartial (context.Context , int , []int ) {}
@@ -1239,11 +1251,11 @@ func (f *blackholeFailer) Mode() failureMode {
12391251 return failureModeBlackhole
12401252}
12411253
1242- func (f * blackholeFailer ) String () string { return string (f .Mode ()) }
1243- func (f * blackholeFailer ) CanUseLocal () bool { return false } // needs iptables
1244- func (f * blackholeFailer ) CanRunWith (failureMode ) bool { return true }
1245- func (f * blackholeFailer ) Setup (context.Context ) {}
1246- func (f * blackholeFailer ) Ready (context.Context , cluster. Monitor ) {}
1254+ func (f * blackholeFailer ) String () string { return string (f .Mode ()) }
1255+ func (f * blackholeFailer ) CanUseLocal () bool { return false } // needs iptables
1256+ func (f * blackholeFailer ) CanRunWith (failureMode ) bool { return true }
1257+ func (f * blackholeFailer ) Setup (context.Context ) {}
1258+ func (f * blackholeFailer ) Ready (context.Context ) {}
12471259
12481260func (f * blackholeFailer ) Cleanup (ctx context.Context ) {
12491261 f .c .Run (ctx , f .c .All (), `sudo iptables -F` )
@@ -1324,13 +1336,13 @@ type crashFailer struct {
13241336 startSettings install.ClusterSettings
13251337}
13261338
1327- func (f * crashFailer ) Mode () failureMode { return failureModeCrash }
1328- func (f * crashFailer ) String () string { return string (f .Mode ()) }
1329- func (f * crashFailer ) CanUseLocal () bool { return true }
1330- func (f * crashFailer ) CanRunWith (failureMode ) bool { return true }
1331- func (f * crashFailer ) Setup (_ context.Context ) {}
1332- func (f * crashFailer ) Ready (_ context.Context , m cluster. Monitor ) { f . m = m }
1333- func (f * crashFailer ) Cleanup (_ context.Context ) {}
1339+ func (f * crashFailer ) Mode () failureMode { return failureModeCrash }
1340+ func (f * crashFailer ) String () string { return string (f .Mode ()) }
1341+ func (f * crashFailer ) CanUseLocal () bool { return true }
1342+ func (f * crashFailer ) CanRunWith (failureMode ) bool { return true }
1343+ func (f * crashFailer ) Setup (context.Context ) {}
1344+ func (f * crashFailer ) Ready (context.Context ) { }
1345+ func (f * crashFailer ) Cleanup (context.Context ) {}
13341346
13351347func (f * crashFailer ) Fail (ctx context.Context , nodeID int ) {
13361348 f .m .ExpectDeath ()
@@ -1366,9 +1378,7 @@ func (f *deadlockFailer) CanRunWith(m failureMode) bool { return true }
13661378func (f * deadlockFailer ) Setup (context.Context ) {}
13671379func (f * deadlockFailer ) Cleanup (context.Context ) {}
13681380
1369- func (f * deadlockFailer ) Ready (ctx context.Context , m cluster.Monitor ) {
1370- f .m = m
1371-
1381+ func (f * deadlockFailer ) Ready (ctx context.Context ) {
13721382 // In chaos tests, other nodes will be failing concurrently. We therefore
13731383 // can't run SHOW CLUSTER RANGES WITH DETAILS in Fail(), since it needs to
13741384 // read from all ranges. Instead, we fetch a snapshot of replicas and leases
@@ -1483,15 +1493,12 @@ func (f *diskStallFailer) Mode() failureMode { return failureModeDiskS
14831493func (f * diskStallFailer ) String () string { return string (f .Mode ()) }
14841494func (f * diskStallFailer ) CanUseLocal () bool { return false } // needs dmsetup
14851495func (f * diskStallFailer ) CanRunWith (failureMode ) bool { return true }
1496+ func (f * diskStallFailer ) Ready (context.Context ) {}
14861497
14871498func (f * diskStallFailer ) Setup (ctx context.Context ) {
14881499 f .staller .Setup (ctx )
14891500}
14901501
1491- func (f * diskStallFailer ) Ready (_ context.Context , m cluster.Monitor ) {
1492- f .m = m
1493- }
1494-
14951502func (f * diskStallFailer ) Cleanup (ctx context.Context ) {
14961503 f .staller .Unstall (ctx , f .c .All ())
14971504 // We have to stop the cluster before cleaning up the staller.
@@ -1533,7 +1540,7 @@ func (f *pauseFailer) CanRunWith(other failureMode) bool {
15331540 return other != failureModeDiskStall
15341541}
15351542
1536- func (f * pauseFailer ) Ready (ctx context.Context , _ cluster. Monitor ) {
1543+ func (f * pauseFailer ) Ready (ctx context.Context ) {
15371544 // The process pause can trip the disk stall detector, so we disable it. We
15381545 // could let it fire, but we'd like to see if the node can recover from the
15391546 // pause and keep working.