roachtest: add unit test for failure injection

Dev-Kyle · Dev-Kyle · commit b4738823bcc5 · 2025-08-12T13:54:24.000-04:00
This adds a unit test to make sure that failure injections do not
overlap, and that there is always an active failure injection when
encountering a recovery step.
diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel b/pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel
@@ -66,6 +66,7 @@ go_test(
         "//pkg/cmd/roachtest/roachtestutil/task",
         "//pkg/cmd/roachtest/spec",
         "//pkg/roachpb",
+        "//pkg/roachprod/failureinjection/failures",
         "//pkg/roachprod/install",
         "//pkg/roachprod/logger",
         "//pkg/roachprod/vm",
diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go
@@ -85,6 +85,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
@@ -383,8 +384,9 @@ type (
 		// the following are test-only fields, allowing tests to simulate
 		// cluster properties without passing a cluster.Cluster
 		// implementation.
-		_arch    *vm.CPUArch
-		_isLocal *bool
+		_arch      *vm.CPUArch
+		_isLocal   *bool
+		_getFailer func(name string) (*failures.Failer, error)
 	}
 
 	shouldStop chan struct{}
@@ -964,6 +966,7 @@ func (t *Test) plan() (plan *TestPlan, retErr error) {
 			bgChans:        t.bgChans,
 			logger:         t.logger,
 			cluster:        t.cluster,
+			_getFailer:     t._getFailer,
 		}
 		// Let's generate a plan.
 		plan, err = planner.Plan()
diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/mutators.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/mutators.go
@@ -498,6 +498,14 @@ func (m panicNodeMutator) Generate(
 	return mutations, nil
 }
 
+// GetFailer resolves the failer identified by name for the given
+// planner. When the planner's unit-test-only _getFailer override is
+// set, the override is invoked and the cluster is not consulted.
+// Otherwise the request is forwarded to the planner's cluster, passing
+// along the planner's logger and the cluster's CRDB nodes.
+func GetFailer(planner *testPlanner, name string) (*failures.Failer, error) {
+	if override := planner._getFailer; override != nil {
+		return override(name)
+	}
+
+	c := planner.cluster
+	return c.GetFailer(planner.logger, c.CRDBNodes(), name)
+}
+
 type networkPartitionMutator struct{}
 
 func (m networkPartitionMutator) Name() string { return failures.IPTablesNetworkPartitionName }
@@ -516,8 +524,7 @@ func (m networkPartitionMutator) Generate(
 	idx := newStepIndex(plan)
 	nodeList := planner.currentContext.System.Descriptor.Nodes
 
-	failure := failures.GetFailureRegistry()
-	f, err := failure.GetFailer(planner.cluster.Name(), failures.IPTablesNetworkPartitionName, planner.logger, false)
+	f, err := GetFailer(planner, failures.IPTablesNetworkPartitionName)
 	if err != nil {
 		return nil, fmt.Errorf("failed to get failer for %s: %w", failures.IPTablesNetworkPartitionName, err)
 	}
diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go
@@ -16,6 +16,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
 	"github.com/cockroachdb/cockroach/pkg/util/randutil"
@@ -100,6 +101,9 @@ type (
 
 		// State variables updated as the test plan is generated.
 		usingFixtures bool
+
+		// Unit test only fields.
+		_getFailer func(name string) (*failures.Failer, error)
 	}
 
 	// UpgradeStage encodes in what part of an upgrade a test step is in
diff --git a/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go b/pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go
@@ -17,6 +17,7 @@ import (
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
+	"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
 	"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
@@ -353,6 +354,76 @@ func Test_maxNumPlanSteps(t *testing.T) {
 	require.Nil(t, plan)
 }
 
+// TestNoConcurrentFailureInjections tests that failure injection
+// steps properly manage node availability. Specifically:
+// - Failure injection steps should only run if no other failure is currently injected.
+// - Failure recovery steps can only occur if there is an active failure injected.
+// - We can only bump the cluster version if no failures are currently injected.
+//
+// Plans are generated from a randomly seeded source. The seed is
+// logged so that a failing run can be replayed by substituting it.
+func TestNoConcurrentFailureInjections(t *testing.T) {
+	const numIterations = 500
+	seed := randutil.NewPseudoSeed()
+	t.Logf("using random seed: %d", seed)
+	rngSource := rand.NewSource(seed)
+	// Set all failure injection mutator probabilities to 1.
+	var opts []CustomOption
+	for _, mutator := range failureInjectionMutators {
+		opts = append(opts, WithMutatorProbability(mutator.Name(), 1.0))
+	}
+	opts = append(opts, NumUpgrades(3))
+	// Stub the failer lookup so that planning does not go through the
+	// cluster's GetFailer.
+	getFailer := func(name string) (*failures.Failer, error) {
+		return nil, nil
+	}
+
+	for range numIterations {
+		mvt := newTest(opts...)
+		mvt._getFailer = getFailer
+		mvt.InMixedVersion("test hook", dummyHook)
+		// Every iteration wraps the same rngSource, so each plan keeps
+		// consuming one shared random stream rather than restarting it.
+		mvt.prng = rand.New(rngSource)
+
+		plan, err := mvt.plan()
+		require.NoError(t, err)
+
+		// Tracks whether a failure is active at the current point of the
+		// walk over the plan's steps.
+		isFailureInjected := false
+
+		var checkSteps func(steps []testStep)
+		checkSteps = func(steps []testStep) {
+			for _, step := range steps {
+				switch s := step.(type) {
+				case *singleStep:
+					switch s.impl.(type) {
+					case panicNodeStep:
+						require.False(t, isFailureInjected, "there should be no active failure when panicNodeStep runs")
+						isFailureInjected = true
+					case networkPartitionInjectStep:
+						require.False(t, isFailureInjected, "there should be no active failure when networkPartitionInjectStep runs")
+						isFailureInjected = true
+					case restartNodeStep:
+						require.True(t, isFailureInjected, "there is no active failure to recover from")
+						isFailureInjected = false
+					case networkPartitionRecoveryStep:
+						require.True(t, isFailureInjected, "there is no active failure to recover from")
+						isFailureInjected = false
+					case waitForStableClusterVersionStep:
+						require.False(t, isFailureInjected, "waitForStableClusterVersionStep cannot run under failure injection")
+					}
+				case sequentialRunStep:
+					checkSteps(s.steps)
+				case concurrentRunStep:
+					// Failure injection steps should never run concurrently with other steps, so treat concurrent
+					// steps as sequential for simplicity.
+					for _, delayedStepInterface := range s.delayedSteps {
+						ds := delayedStepInterface.(delayedStep)
+						checkSteps([]testStep{ds.step})
+					}
+				}
+			}
+		}
+
+		checkSteps(plan.Steps())
+
+		require.False(t, isFailureInjected, "all failure injections should be cleaned up at the end of the test")
+	}
+}
+
 // setDefaultVersions overrides the test's view of the current build
 // as well as the oldest supported version. This allows the test
 // output to remain stable as new versions are released and/or we bump