Skip to content

Commit b473882

Browse files
committed
roachtest: add unit test for failure injection
This adds a unit test to make sure that failure injections do not overlap, and that there is always an active failure injection when encountering a recovery step.
1 parent d32cad7 commit b473882

File tree

5 files changed

+90
-4
lines changed

5 files changed

+90
-4
lines changed

pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,6 +66,7 @@ go_test(
6666
"//pkg/cmd/roachtest/roachtestutil/task",
6767
"//pkg/cmd/roachtest/spec",
6868
"//pkg/roachpb",
69+
"//pkg/roachprod/failureinjection/failures",
6970
"//pkg/roachprod/install",
7071
"//pkg/roachprod/logger",
7172
"//pkg/roachprod/vm",

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -85,6 +85,7 @@ import (
8585
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
8686
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/spec"
8787
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
88+
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
8889
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
8990
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
9091
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
@@ -383,8 +384,9 @@ type (
383384
// the following are test-only fields, allowing tests to simulate
384385
// cluster properties without passing a cluster.Cluster
385386
// implementation.
386-
_arch *vm.CPUArch
387-
_isLocal *bool
387+
_arch *vm.CPUArch
388+
_isLocal *bool
389+
_getFailer func(name string) (*failures.Failer, error)
388390
}
389391

390392
shouldStop chan struct{}
@@ -964,6 +966,7 @@ func (t *Test) plan() (plan *TestPlan, retErr error) {
964966
bgChans: t.bgChans,
965967
logger: t.logger,
966968
cluster: t.cluster,
969+
_getFailer: t._getFailer,
967970
}
968971
// Let's generate a plan.
969972
plan, err = planner.Plan()

pkg/cmd/roachtest/roachtestutil/mixedversion/mutators.go

Lines changed: 9 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -498,6 +498,14 @@ func (m panicNodeMutator) Generate(
498498
return mutations, nil
499499
}
500500

501+
func GetFailer(planner *testPlanner, name string) (*failures.Failer, error) {
502+
if planner._getFailer != nil {
503+
return planner._getFailer(name)
504+
}
505+
506+
return planner.cluster.GetFailer(planner.logger, planner.cluster.CRDBNodes(), name)
507+
}
508+
501509
type networkPartitionMutator struct{}
502510

503511
func (m networkPartitionMutator) Name() string { return failures.IPTablesNetworkPartitionName }
@@ -516,8 +524,7 @@ func (m networkPartitionMutator) Generate(
516524
idx := newStepIndex(plan)
517525
nodeList := planner.currentContext.System.Descriptor.Nodes
518526

519-
failure := failures.GetFailureRegistry()
520-
f, err := failure.GetFailer(planner.cluster.Name(), failures.IPTablesNetworkPartitionName, planner.logger, false)
527+
f, err := GetFailer(planner, failures.IPTablesNetworkPartitionName)
521528
if err != nil {
522529
return nil, fmt.Errorf("failed to get failer for %s: %w", failures.IPTablesNetworkPartitionName, err)
523530
}

pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ import (
1616
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1717
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
1818
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
19+
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
1920
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
2021
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2122
"github.com/cockroachdb/cockroach/pkg/util/randutil"
@@ -100,6 +101,9 @@ type (
100101

101102
// State variables updated as the test plan is generated.
102103
usingFixtures bool
104+
105+
// Unit test only fields.
106+
_getFailer func(name string) (*failures.Failer, error)
103107
}
104108

105109
// UpgradeStage encodes in what part of an upgrade a test step is in

pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go

Lines changed: 71 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@ import (
1717
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
1818
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
1919
"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
20+
"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
2021
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
2122
"github.com/cockroachdb/cockroach/pkg/roachprod/vm"
2223
"github.com/cockroachdb/cockroach/pkg/testutils/datapathutils"
@@ -353,6 +354,76 @@ func Test_maxNumPlanSteps(t *testing.T) {
353354
require.Nil(t, plan)
354355
}
355356

357+
// TestNoConcurrentFailureInjections tests that failure injection
358+
// steps properly manage node availability. Specifically:
359+
// - Failure injection steps should only run if no other failure is currently injected.
360+
// - Failure recovery steps can only occur if there is an active failure injected.
361+
// - We can only bump the cluster version if no failures are currently injected.
362+
func TestNoConcurrentFailureInjections(t *testing.T) {
363+
const numIterations = 500
364+
rngSource := rand.NewSource(randutil.NewPseudoSeed())
365+
// Set all failure injection mutator probabilities to 1.
366+
var opts []CustomOption
367+
for _, mutator := range failureInjectionMutators {
368+
opts = append(opts, WithMutatorProbability(mutator.Name(), 1.0))
369+
}
370+
opts = append(opts, NumUpgrades(3))
371+
getFailer := func(name string) (*failures.Failer, error) {
372+
return nil, nil
373+
}
374+
375+
for range numIterations {
376+
mvt := newTest(opts...)
377+
mvt._getFailer = getFailer
378+
mvt.InMixedVersion("test hook", dummyHook)
379+
// Use different seed for each iteration
380+
mvt.prng = rand.New(rngSource)
381+
382+
plan, err := mvt.plan()
383+
require.NoError(t, err)
384+
385+
isFailureInjected := false
386+
387+
var checkSteps func(steps []testStep)
388+
checkSteps = func(steps []testStep) {
389+
for _, step := range steps {
390+
switch s := step.(type) {
391+
case *singleStep:
392+
switch s.impl.(type) {
393+
case panicNodeStep:
394+
require.False(t, isFailureInjected, "there should be no active failure when panicNodeStep runs")
395+
isFailureInjected = true
396+
case networkPartitionInjectStep:
397+
require.False(t, isFailureInjected, "there should be no active failure when networkPartitionInjectStep runs")
398+
isFailureInjected = true
399+
case restartNodeStep:
400+
require.True(t, isFailureInjected, "there is no active failure to recover from")
401+
isFailureInjected = false
402+
case networkPartitionRecoveryStep:
403+
require.True(t, isFailureInjected, "there is no active failure to recover from")
404+
isFailureInjected = false
405+
case waitForStableClusterVersionStep:
406+
require.False(t, isFailureInjected, "waitForStableClusterVersionStep cannot run under failure injection")
407+
}
408+
case sequentialRunStep:
409+
checkSteps(s.steps)
410+
case concurrentRunStep:
411+
// Failure injection steps should never run concurrently with other steps, so treat concurrent
412+
// steps as sequential for simplicity.
413+
for _, delayedStepInterface := range s.delayedSteps {
414+
ds := delayedStepInterface.(delayedStep)
415+
checkSteps([]testStep{ds.step})
416+
}
417+
}
418+
}
419+
}
420+
421+
checkSteps(plan.Steps())
422+
423+
require.False(t, isFailureInjected, "all failure injections should be cleaned up at the end of the test")
424+
}
425+
}
426+
356427
// setDefaultVersions overrides the test's view of the current build
357428
// as well as the oldest supported version. This allows the test
358429
// output to remain stable as new versions are released and/or we bump

0 commit comments

Comments
 (0)