Skip to content

Commit 395ad12

Browse files
committed
roachtest: add mutator to panic and restart a node
This change adds a new mutator that will enable the ability to randomly panic a node. Currently the node is immediately restarted upon crashing, with the end goal of being able to panic a node and waiting some number of steps before restarting it. Fixes: None Release note: None
1 parent 8a1562c commit 395ad12

File tree

7 files changed

+246
-27
lines changed

7 files changed

+246
-27
lines changed

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,11 @@ type (
295295
// Run implements the actual functionality of the step. This
296296
// signature should remain in sync with `stepFunc`.
297297
Run(context.Context, *logger.Logger, *rand.Rand, *Helper) error
298+
// ConcurrencyDisabled returns true if the step should not be run
299+
// concurrently with other steps. This is the case for any steps
300+
// that involve restarting a node, as they may attempt to connect
301+
// to an unavailable node.
302+
ConcurrencyDisabled() bool
298303
}
299304

300305
// singleStep represents steps that implement the pieces on top of

pkg/cmd/roachtest/roachtestutil/mixedversion/mutators.go

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func (m preserveDowngradeOptionRandomizerMutator) Probability() float64 {
5353
// current version is always mutated. The length of the returned
5454
// mutations is always even.
5555
func (m preserveDowngradeOptionRandomizerMutator) Generate(
56-
rng *rand.Rand, plan *TestPlan,
56+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
5757
) []mutation {
5858
var mutations []mutation
5959
for _, upgradeSelector := range randomUpgrades(rng, plan) {
@@ -221,7 +221,9 @@ func (m clusterSettingMutator) Probability() float64 {
221221
// original test plan. Up to `maxChanges` steps will be added to the
222222
// plan. Changes may be concurrent with user-provided steps and may
223223
// happen any time after cluster setup.
224-
func (m clusterSettingMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutation {
224+
func (m clusterSettingMutator) Generate(
225+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
226+
) []mutation {
225227
var mutations []mutation
226228

227229
// possiblePointsInTime is the list of steps in the plan that are
@@ -245,13 +247,9 @@ func (m clusterSettingMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutati
245247
return false
246248
}
247249
}
248-
249-
// We skip restart steps as we might insert the cluster setting
250-
// change step concurrently with the selected step.
251-
_, isRestartSystem := s.impl.(restartWithNewBinaryStep)
252-
_, isRestartTenant := s.impl.(restartVirtualClusterStep)
253-
isRestart := isRestartSystem || isRestartTenant
254-
return s.context.System.Stage >= OnStartupStage && !isRestart
250+
// Cluster setting changes can be inserted concurrently, so we want to avoid
251+
// inserting into any steps that cannot run concurrently with other steps.
252+
return s.context.System.Stage >= OnStartupStage && !s.impl.ConcurrencyDisabled()
255253
})
256254

257255
for _, changeStep := range m.changeSteps(rng, len(possiblePointsInTime)) {
@@ -383,3 +381,49 @@ func (m clusterSettingMutator) changeSteps(
383381

384382
return steps
385383
}
384+
385+
const (
386+
// PanicNode is a mutator that will randomly cause a node to crash
387+
// during the test, before restarting the node within the same step.
388+
PanicNode = "panic_node"
389+
)
390+
391+
type panicNodeMutator struct {
392+
}
393+
394+
func (m panicNodeMutator) Name() string {
395+
return PanicNode
396+
}
397+
398+
func (m panicNodeMutator) Probability() float64 {
399+
return 0.3
400+
}
401+
402+
func (m panicNodeMutator) Generate(
403+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
404+
) []mutation {
405+
var mutations []mutation
406+
maxPanics := len(plan.upgrades)
407+
numPanics := 1 + rng.Intn(maxPanics)
408+
409+
idx := newStepIndex(plan)
410+
possiblePointsInTime := plan.
411+
newStepSelector().
412+
// We don't want to panic concurrently with other steps, and inserting before a concurrent step
413+
// causes the step to run concurrently with that step, so we filter out any concurrent steps.
414+
Filter(func(s *singleStep) bool {
415+
return s.context.System.Stage >= InitUpgradeStage && !idx.IsConcurrent(s)
416+
})
417+
418+
for range numPanics {
419+
nodeList := planner.currentContext.System.Descriptor.Nodes
420+
targetNode := nodeList.SeededRandNode(rng)
421+
addRandomly := possiblePointsInTime.
422+
RandomStep(rng).
423+
InsertBefore(panicNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt})
424+
425+
mutations = append(mutations, addRandomly...)
426+
}
427+
428+
return mutations
429+
}

pkg/cmd/roachtest/roachtestutil/mixedversion/mutators_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func TestPreserveDowngradeOptionRandomizerMutator(t *testing.T) {
2929
require.NoError(t, err)
3030

3131
var mut preserveDowngradeOptionRandomizerMutator
32-
mutations := mut.Generate(newRand(), plan)
32+
mutations := mut.Generate(newRand(), plan, nil)
3333
require.NotEmpty(t, mutations)
3434
require.True(t, len(mutations)%2 == 0, "should produce even number of mutations") // one removal and one insertion per upgrade
3535

@@ -97,7 +97,7 @@ func TestClusterSettingMutator(t *testing.T) {
9797

9898
const settingName = "test_cluster_setting"
9999
mut := newClusterSettingMutator(settingName, possibleValues, options...)
100-
mutations := mut.Generate(newRand(), plan)
100+
mutations := mut.Generate(newRand(), plan, nil)
101101

102102
// Number of mutations should be 1 <= n <= maxChanges
103103
require.GreaterOrEqual(t, len(mutations), 1, "plan:\n%s", plan.PrettyPrint())

pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,11 @@ type (
127127
// this probability for specific mutator implementations as
128128
// needed.
129129
Probability() float64
130-
// Generate takes a test plan and a RNG and returns the list of
131-
// mutations that should be applied to the plan.
132-
Generate(*rand.Rand, *TestPlan) []mutation
130+
// Generate takes a test plan, the testPlanner, and an RNG and returns the list of
131+
// mutations that should be applied to the plan. The test plan is the intended output
132+
// after applying the mutations. The testPlanner is used to access specific attributes
133+
// of the test plan, such as the current context or the services.
134+
Generate(*rand.Rand, *TestPlan, *testPlanner) []mutation
133135
}
134136

135137
// mutationOp encodes the type of mutation and controls how the
@@ -211,6 +213,10 @@ const (
211213
spanConfigTenantLimit = 50000
212214
)
213215

216+
var failureInjectionMutators = []mutator{
217+
panicNodeMutator{},
218+
}
219+
214220
// clusterSettingMutators includes a list of all
215221
// cluster setting mutator implementations.
216222
var clusterSettingMutators = []mutator{
@@ -250,9 +256,12 @@ var clusterSettingMutators = []mutator{
250256
// planMutators includes a list of all known `mutator`
251257
// implementations. A subset of these mutations might be enabled in
252258
// any mixedversion test plan.
253-
var planMutators = append([]mutator{
254-
preserveDowngradeOptionRandomizerMutator{},
255-
},
259+
var planMutators = append(
260+
append([]mutator{
261+
preserveDowngradeOptionRandomizerMutator{},
262+
},
263+
failureInjectionMutators...,
264+
),
256265
clusterSettingMutators...,
257266
)
258267

@@ -418,11 +427,22 @@ func (p *testPlanner) Plan() *TestPlan {
418427
isLocal: p.isLocal,
419428
}
420429

421-
// Probabilistically enable some of of the mutators on the base test
422-
// plan generated above.
430+
failureInjections := make(map[string]struct{})
431+
for _, m := range failureInjectionMutators {
432+
failureInjections[m.Name()] = struct{}{}
433+
}
434+
435+
// Probabilistically enable some of the mutators on the base test
436+
// plan generated above. We disable any failure injections that
437+
// would occur on clusters with less than three nodes as this
438+
// can lead to uninteresting failures (e.g. a single node
439+
// panic failure).
423440
for _, mut := range planMutators {
424441
if p.mutatorEnabled(mut) {
425-
mutations := mut.Generate(p.prng, testPlan)
442+
if _, found := failureInjections[mut.Name()]; found && len(p.currentContext.System.Descriptor.Nodes) < 3 {
443+
continue
444+
}
445+
mutations := mut.Generate(p.prng, testPlan, p)
426446
testPlan.applyMutations(p.prng, mutations)
427447
testPlan.enabledMutators = append(testPlan.enabledMutators, mut)
428448
}
@@ -1396,12 +1416,16 @@ func (ss stepSelector) InsertSequential(rng *rand.Rand, impl singleStepProtocol)
13961416
})
13971417
}
13981418

1419+
// InsertBefore inserts the step before the selected step. This is not guaranteed to be a non-concurrent insert, if the
1420+
// selected step is part of a `concurrentRunStep`, the new step will be concurrent with the selected step.
13991421
func (ss stepSelector) InsertBefore(impl singleStepProtocol) []mutation {
14001422
return ss.insert(impl, func() mutationOp {
14011423
return mutationInsertBefore
14021424
})
14031425
}
14041426

1427+
// InsertAfter inserts the step after the selected step. This is not guaranteed to be a non-concurrent insert, if the
1428+
// selected step is part of a `concurrentRunStep`, the new step will be concurrent with the selected step.
14051429
func (ss stepSelector) InsertAfter(impl singleStepProtocol) []mutation {
14061430
return ss.insert(impl, func() mutationOp {
14071431
return mutationInsertAfter

pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@ func Test_stepSelectorFilter(t *testing.T) {
553553
name: "no filter",
554554
predicate: func(*singleStep) bool { return true },
555555
expectedAllSteps: true,
556-
expectedRandomStepType: runHookStep{},
556+
expectedRandomStepType: restartWithNewBinaryStep{},
557557
},
558558
{
559559
name: "filter eliminates all steps",
@@ -861,7 +861,9 @@ type concurrentUserHooksMutator struct{}
861861
func (concurrentUserHooksMutator) Name() string { return "concurrent_user_hooks_mutator" }
862862
func (concurrentUserHooksMutator) Probability() float64 { return 0.5 }
863863

864-
func (concurrentUserHooksMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutation {
864+
func (concurrentUserHooksMutator) Generate(
865+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
866+
) []mutation {
865867
// Insert our `testSingleStep` implementation concurrently with every
866868
// user-provided function.
867869
return plan.
@@ -880,7 +882,9 @@ type removeUserHooksMutator struct{}
880882
func (removeUserHooksMutator) Name() string { return "remove_user_hooks_mutator" }
881883
func (removeUserHooksMutator) Probability() float64 { return 0.5 }
882884

883-
func (removeUserHooksMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutation {
885+
func (removeUserHooksMutator) Generate(
886+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
887+
) []mutation {
884888
return plan.
885889
newStepSelector().
886890
Filter(func(s *singleStep) bool {

pkg/cmd/roachtest/roachtestutil/mixedversion/runner_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ func (*testSingleStep) Background() shouldStop { return nil }
178178
func (tss *testSingleStep) Run(_ context.Context, _ *logger.Logger, _ *rand.Rand, _ *Helper) error {
179179
return tss.runFunc()
180180
}
181+
func (s testSingleStep) ConcurrencyDisabled() bool {
182+
return false
183+
}
181184

182185
func newTestStep(f func() error) *singleStep {
183186
initialVersion := clusterupgrade.MustParseVersion(predecessorVersion)

0 commit comments

Comments
 (0)