Skip to content

Commit 988d66f

Browse files
craig[bot]Dev-Kylecthumuluru-crdb
committed
147641: roachtest: add mutator to panic and restart a node r=DarrylWong a=Dev-Kyle This change adds a new mutator that will enable the ability to randomly panic a node. Currently the node is immediately restarted upon crashing, with the end goal of being able to panic a node and waiting some number of steps before restarting it. Fixes: None Epic: None Release note: None Team City: https://teamcity.cockroachdb.com/buildConfiguration/Cockroach_Nightlies_RoachtestNightlyGceBazel?branch=147641&buildTypeTab=overview&mode=builds&hideProblemsFromDependencies=false&hideTestsFromDependencies=false 148385: rpc: add DRPC dial methods r=cthumuluru-crdb a=cthumuluru-crdb Refactor `Peer` further to make it easy to support both gRPC and DRPC peers. In this change, gRPC peer continues to dial DRPC connections to keep the changes backward compatible. As part of enabling DRPC support across the codebase, future commits will remove the DRPC-specific code and replace it with generic peer. This commit also adds DRPC dial methods to `rpc.nodedialer` and `rpc.Context`. In the future commits, helper functions for creation RPC clients are update to create gRPC or DRPC clients base on `rpc.experimental_drpc.enabled` cluster setting. Epic: CRDB-48923 Fixes: #148383 Release note: none Co-authored-by: Kyle <[email protected]> Co-authored-by: Chandra Thumuluru <[email protected]>
3 parents ee7e8bc + 395ad12 + 7c55880 commit 988d66f

File tree

15 files changed

+621
-97
lines changed

15 files changed

+621
-97
lines changed

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -295,6 +295,11 @@ type (
295295
// Run implements the actual functionality of the step. This
296296
// signature should remain in sync with `stepFunc`.
297297
Run(context.Context, *logger.Logger, *rand.Rand, *Helper) error
298+
// ConcurrencyDisabled returns true if the step should not be run
299+
// concurrently with other steps. This is the case for any steps
300+
// that involve restarting a node, as they may attempt to connect
301+
// to an unavailable node.
302+
ConcurrencyDisabled() bool
298303
}
299304

300305
// singleStep represents steps that implement the pieces on top of

pkg/cmd/roachtest/roachtestutil/mixedversion/mutators.go

Lines changed: 53 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -53,7 +53,7 @@ func (m preserveDowngradeOptionRandomizerMutator) Probability() float64 {
5353
// current version is always mutated. The length of the returned
5454
// mutations is always even.
5555
func (m preserveDowngradeOptionRandomizerMutator) Generate(
56-
rng *rand.Rand, plan *TestPlan,
56+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
5757
) []mutation {
5858
var mutations []mutation
5959
for _, upgradeSelector := range randomUpgrades(rng, plan) {
@@ -221,7 +221,9 @@ func (m clusterSettingMutator) Probability() float64 {
221221
// original test plan. Up to `maxChanges` steps will be added to the
222222
// plan. Changes may be concurrent with user-provided steps and may
223223
// happen any time after cluster setup.
224-
func (m clusterSettingMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutation {
224+
func (m clusterSettingMutator) Generate(
225+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
226+
) []mutation {
225227
var mutations []mutation
226228

227229
// possiblePointsInTime is the list of steps in the plan that are
@@ -245,13 +247,9 @@ func (m clusterSettingMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutati
245247
return false
246248
}
247249
}
248-
249-
// We skip restart steps as we might insert the cluster setting
250-
// change step concurrently with the selected step.
251-
_, isRestartSystem := s.impl.(restartWithNewBinaryStep)
252-
_, isRestartTenant := s.impl.(restartVirtualClusterStep)
253-
isRestart := isRestartSystem || isRestartTenant
254-
return s.context.System.Stage >= OnStartupStage && !isRestart
250+
// Cluster setting changes can be inserted concurrently, so we want to avoid
251+
// inserting into any steps that cannot run concurrently with other steps.
252+
return s.context.System.Stage >= OnStartupStage && !s.impl.ConcurrencyDisabled()
255253
})
256254

257255
for _, changeStep := range m.changeSteps(rng, len(possiblePointsInTime)) {
@@ -383,3 +381,49 @@ func (m clusterSettingMutator) changeSteps(
383381

384382
return steps
385383
}
384+
385+
const (
386+
// PanicNode is a mutator that will randomly cause a node to crash
387+
// during the test, before restarting the node within the same step.
388+
PanicNode = "panic_node"
389+
)
390+
391+
type panicNodeMutator struct {
392+
}
393+
394+
func (m panicNodeMutator) Name() string {
395+
return PanicNode
396+
}
397+
398+
func (m panicNodeMutator) Probability() float64 {
399+
return 0.3
400+
}
401+
402+
func (m panicNodeMutator) Generate(
403+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
404+
) []mutation {
405+
var mutations []mutation
406+
maxPanics := len(plan.upgrades)
407+
numPanics := 1 + rng.Intn(maxPanics)
408+
409+
idx := newStepIndex(plan)
410+
possiblePointsInTime := plan.
411+
newStepSelector().
412+
// We don't want to panic concurrently with other steps, and inserting before a concurrent step
413+
// causes the step to run concurrently with that step, so we filter out any concurrent steps.
414+
Filter(func(s *singleStep) bool {
415+
return s.context.System.Stage >= InitUpgradeStage && !idx.IsConcurrent(s)
416+
})
417+
418+
for range numPanics {
419+
nodeList := planner.currentContext.System.Descriptor.Nodes
420+
targetNode := nodeList.SeededRandNode(rng)
421+
addRandomly := possiblePointsInTime.
422+
RandomStep(rng).
423+
InsertBefore(panicNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt})
424+
425+
mutations = append(mutations, addRandomly...)
426+
}
427+
428+
return mutations
429+
}

pkg/cmd/roachtest/roachtestutil/mixedversion/mutators_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,7 +29,7 @@ func TestPreserveDowngradeOptionRandomizerMutator(t *testing.T) {
2929
require.NoError(t, err)
3030

3131
var mut preserveDowngradeOptionRandomizerMutator
32-
mutations := mut.Generate(newRand(), plan)
32+
mutations := mut.Generate(newRand(), plan, nil)
3333
require.NotEmpty(t, mutations)
3434
require.True(t, len(mutations)%2 == 0, "should produce even number of mutations") // one removal and one insertion per upgrade
3535

@@ -97,7 +97,7 @@ func TestClusterSettingMutator(t *testing.T) {
9797

9898
const settingName = "test_cluster_setting"
9999
mut := newClusterSettingMutator(settingName, possibleValues, options...)
100-
mutations := mut.Generate(newRand(), plan)
100+
mutations := mut.Generate(newRand(), plan, nil)
101101

102102
// Number of mutations should be 1 <= n <= maxChanges
103103
require.GreaterOrEqual(t, len(mutations), 1, "plan:\n%s", plan.PrettyPrint())

pkg/cmd/roachtest/roachtestutil/mixedversion/planner.go

Lines changed: 33 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -127,9 +127,11 @@ type (
127127
// this probability for specific mutator implementations as
128128
// needed.
129129
Probability() float64
130-
// Generate takes a test plan and a RNG and returns the list of
131-
// mutations that should be applied to the plan.
132-
Generate(*rand.Rand, *TestPlan) []mutation
130+
// Generate takes a test plan, the testPlanner, and an RNG and returns the list of
131+
// mutations that should be applied to the plan. The test plan is the intended output
132+
// after applying the mutations. The testPlanner is used to access specific attributes
133+
// of the test plan, such as the current context or the services.
134+
Generate(*rand.Rand, *TestPlan, *testPlanner) []mutation
133135
}
134136

135137
// mutationOp encodes the type of mutation and controls how the
@@ -211,6 +213,10 @@ const (
211213
spanConfigTenantLimit = 50000
212214
)
213215

216+
var failureInjectionMutators = []mutator{
217+
panicNodeMutator{},
218+
}
219+
214220
// clusterSettingMutators includes a list of all
215221
// cluster setting mutator implementations.
216222
var clusterSettingMutators = []mutator{
@@ -250,9 +256,12 @@ var clusterSettingMutators = []mutator{
250256
// planMutators includes a list of all known `mutator`
251257
// implementations. A subset of these mutations might be enabled in
252258
// any mixedversion test plan.
253-
var planMutators = append([]mutator{
254-
preserveDowngradeOptionRandomizerMutator{},
255-
},
259+
var planMutators = append(
260+
append([]mutator{
261+
preserveDowngradeOptionRandomizerMutator{},
262+
},
263+
failureInjectionMutators...,
264+
),
256265
clusterSettingMutators...,
257266
)
258267

@@ -418,11 +427,22 @@ func (p *testPlanner) Plan() *TestPlan {
418427
isLocal: p.isLocal,
419428
}
420429

421-
// Probabilistically enable some of of the mutators on the base test
422-
// plan generated above.
430+
failureInjections := make(map[string]struct{})
431+
for _, m := range failureInjectionMutators {
432+
failureInjections[m.Name()] = struct{}{}
433+
}
434+
435+
// Probabilistically enable some of the mutators on the base test
436+
// plan generated above. We disable any failure injections that
437+
// would occur on clusters with less than three nodes as this
438+
// can lead to uninteresting failures (e.g. a single node
439+
// panic failure).
423440
for _, mut := range planMutators {
424441
if p.mutatorEnabled(mut) {
425-
mutations := mut.Generate(p.prng, testPlan)
442+
if _, found := failureInjections[mut.Name()]; found && len(p.currentContext.System.Descriptor.Nodes) < 3 {
443+
continue
444+
}
445+
mutations := mut.Generate(p.prng, testPlan, p)
426446
testPlan.applyMutations(p.prng, mutations)
427447
testPlan.enabledMutators = append(testPlan.enabledMutators, mut)
428448
}
@@ -1396,12 +1416,16 @@ func (ss stepSelector) InsertSequential(rng *rand.Rand, impl singleStepProtocol)
13961416
})
13971417
}
13981418

1419+
// InsertBefore inserts the step before the selected step. This is not guaranteed to be a non-concurrent insert, if the
1420+
// selected step is part of a `concurrentRunStep`, the new step will be concurrent with the selected step.
13991421
func (ss stepSelector) InsertBefore(impl singleStepProtocol) []mutation {
14001422
return ss.insert(impl, func() mutationOp {
14011423
return mutationInsertBefore
14021424
})
14031425
}
14041426

1427+
// InsertAfter inserts the step after the selected step. This is not guaranteed to be a non-concurrent insert, if the
1428+
// selected step is part of a `concurrentRunStep`, the new step will be concurrent with the selected step.
14051429
func (ss stepSelector) InsertAfter(impl singleStepProtocol) []mutation {
14061430
return ss.insert(impl, func() mutationOp {
14071431
return mutationInsertAfter

pkg/cmd/roachtest/roachtestutil/mixedversion/planner_test.go

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -553,7 +553,7 @@ func Test_stepSelectorFilter(t *testing.T) {
553553
name: "no filter",
554554
predicate: func(*singleStep) bool { return true },
555555
expectedAllSteps: true,
556-
expectedRandomStepType: runHookStep{},
556+
expectedRandomStepType: restartWithNewBinaryStep{},
557557
},
558558
{
559559
name: "filter eliminates all steps",
@@ -861,7 +861,9 @@ type concurrentUserHooksMutator struct{}
861861
func (concurrentUserHooksMutator) Name() string { return "concurrent_user_hooks_mutator" }
862862
func (concurrentUserHooksMutator) Probability() float64 { return 0.5 }
863863

864-
func (concurrentUserHooksMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutation {
864+
func (concurrentUserHooksMutator) Generate(
865+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
866+
) []mutation {
865867
// Insert our `testSingleStep` implementation concurrently with every
866868
// user-provided function.
867869
return plan.
@@ -880,7 +882,9 @@ type removeUserHooksMutator struct{}
880882
func (removeUserHooksMutator) Name() string { return "remove_user_hooks_mutator" }
881883
func (removeUserHooksMutator) Probability() float64 { return 0.5 }
882884

883-
func (removeUserHooksMutator) Generate(rng *rand.Rand, plan *TestPlan) []mutation {
885+
func (removeUserHooksMutator) Generate(
886+
rng *rand.Rand, plan *TestPlan, planner *testPlanner,
887+
) []mutation {
884888
return plan.
885889
newStepSelector().
886890
Filter(func(s *singleStep) bool {

pkg/cmd/roachtest/roachtestutil/mixedversion/runner_test.go

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -178,6 +178,9 @@ func (*testSingleStep) Background() shouldStop { return nil }
178178
func (tss *testSingleStep) Run(_ context.Context, _ *logger.Logger, _ *rand.Rand, _ *Helper) error {
179179
return tss.runFunc()
180180
}
181+
func (s testSingleStep) ConcurrencyDisabled() bool {
182+
return false
183+
}
181184

182185
func newTestStep(f func() error) *singleStep {
183186
initialVersion := clusterupgrade.MustParseVersion(predecessorVersion)

0 commit comments

Comments
 (0)