@@ -17,6 +17,7 @@ import (
17
17
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
18
18
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
19
19
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
20
+ "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
20
21
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
21
22
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
22
23
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
@@ -353,6 +354,76 @@ func Test_maxNumPlanSteps(t *testing.T) {
353
354
require .Nil (t , plan )
354
355
}
355
356
357
+ // TestNoConcurrentFailureInjections tests that failure injection
358
+ // steps properly manage node availability. Specifically:
359
+ // - Failure injection steps should only run if no other failure is currently injected.
360
+ // - Failure recovery steps can only occur if there is an active failure injected.
361
+ // - We can only bump the cluster version if no failures are currently injected.
362
+ func TestNoConcurrentFailureInjections (t * testing.T ) {
363
+ const numIterations = 500
364
+ rngSource := rand .NewSource (randutil .NewPseudoSeed ())
365
+ // Set all failure injection mutator probabilities to 1.
366
+ var opts []CustomOption
367
+ for _ , mutator := range failureInjectionMutators {
368
+ opts = append (opts , WithMutatorProbability (mutator .Name (), 1.0 ))
369
+ }
370
+ opts = append (opts , NumUpgrades (3 ))
371
+ getFailer := func (name string ) (* failures.Failer , error ) {
372
+ return nil , nil
373
+ }
374
+
375
+ for range numIterations {
376
+ mvt := newTest (opts ... )
377
+ mvt ._getFailer = getFailer
378
+ mvt .InMixedVersion ("test hook" , dummyHook )
379
+ // Use different seed for each iteration
380
+ mvt .prng = rand .New (rngSource )
381
+
382
+ plan , err := mvt .plan ()
383
+ require .NoError (t , err )
384
+
385
+ isFailureInjected := false
386
+
387
+ var checkSteps func (steps []testStep )
388
+ checkSteps = func (steps []testStep ) {
389
+ for _ , step := range steps {
390
+ switch s := step .(type ) {
391
+ case * singleStep :
392
+ switch s .impl .(type ) {
393
+ case panicNodeStep :
394
+ require .False (t , isFailureInjected , "there should be no active failure when panicNodeStep runs" )
395
+ isFailureInjected = true
396
+ case networkPartitionInjectStep :
397
+ require .False (t , isFailureInjected , "there should be no active failure when networkPartitionInjectStep runs" )
398
+ isFailureInjected = true
399
+ case restartNodeStep :
400
+ require .True (t , isFailureInjected , "there is no active failure to recover from" )
401
+ isFailureInjected = false
402
+ case networkPartitionRecoveryStep :
403
+ require .True (t , isFailureInjected , "there is no active failure to recover from" )
404
+ isFailureInjected = false
405
+ case waitForStableClusterVersionStep :
406
+ require .False (t , isFailureInjected , "waitForStableClusterVersionStep cannot run under failure injection" )
407
+ }
408
+ case sequentialRunStep :
409
+ checkSteps (s .steps )
410
+ case concurrentRunStep :
411
+ // Failure injection steps should never run concurrently with other steps, so treat concurrent
412
+ // steps as sequential for simplicity.
413
+ for _ , delayedStepInterface := range s .delayedSteps {
414
+ ds := delayedStepInterface .(delayedStep )
415
+ checkSteps ([]testStep {ds .step })
416
+ }
417
+ }
418
+ }
419
+ }
420
+
421
+ checkSteps (plan .Steps ())
422
+
423
+ require .False (t , isFailureInjected , "all failure injections should be cleaned up at the end of the test" )
424
+ }
425
+ }
426
+
356
427
// setDefaultVersions overrides the test's view of the current build
357
428
// as well as the oldest supported version. This allows the test
358
429
// output to remain stable as new versions are released and/or we bump
0 commit comments