
Commit 301f0e9

roachtest: add more expansive panic node mutator
This change adds a new mutator that is similar to the panic node mutator, but also upreplicates to an RF of 5 before panicking and drops back down to 3 after recovering. This allows the node to be left down for much longer before recovery, since the cluster can now survive 2 nodes being down. The current implementation is limited to tests that already have 5 nodes in the cluster, so this mutator remains disabled by default.

Epic: none

Release note: none
1 parent 1bedd25 commit 301f0e9
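
For illustration, the replication-factor dance this mutator performs around the panic boils down to two zone-config statements: raise the default replication factor to 5 before the node is panicked, and lower it back to 3 once the node has been restarted. The sketch below is a standalone approximation of that sequence, not the roachtest step itself; the connection string and the use of the lib/pq driver are assumptions for the example.

// Minimal standalone sketch of the mutator's replication-factor cycle.
// The panic and restart of the node, handled by the mutator between the
// two calls, are elided here.
package main

import (
	"context"
	"database/sql"
	"fmt"
	"log"

	_ "github.com/lib/pq" // CockroachDB speaks the Postgres wire protocol.
)

// setReplicationFactor issues the same zone-config statement the new
// alterReplicationFactorStep runs.
func setReplicationFactor(ctx context.Context, db *sql.DB, rf int) error {
	stmt := fmt.Sprintf("ALTER RANGE default CONFIGURE ZONE USING num_replicas = %d", rf)
	_, err := db.ExecContext(ctx, stmt)
	return err
}

func main() {
	ctx := context.Background()
	// Placeholder connection string; point it at any node of the cluster.
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()

	// Before the panic: upreplicate so the cluster tolerates two nodes down.
	if err := setReplicationFactor(ctx, db, 5); err != nil {
		log.Fatal(err)
	}

	// ... node is panicked, other steps run, node is restarted ...

	// After recovery: return to the default replication factor of 3.
	if err := setReplicationFactor(ctx, db, 3); err != nil {
		log.Fatal(err)
	}
}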

5 files changed: +136, -17 lines


pkg/cmd/roachtest/roachtestutil/mixedversion/BUILD.bazel

Lines changed: 1 addition & 0 deletions
@@ -25,6 +25,7 @@ go_library(
         "//pkg/cmd/roachtest/spec",
         "//pkg/cmd/roachtest/test",
         "//pkg/roachpb",
+        "//pkg/roachprod",
         "//pkg/roachprod/failureinjection/failures",
         "//pkg/roachprod/install",
         "//pkg/roachprod/logger",

pkg/cmd/roachtest/roachtestutil/mixedversion/mixedversion.go

Lines changed: 7 additions & 0 deletions
@@ -349,6 +349,7 @@ type (
 		enabledDeploymentModes         []DeploymentMode
 		tag                            string
 		overriddenMutatorProbabilities map[string]float64
+		hooksSupportFailureInjection   bool
 	}

 	CustomOption func(*testOptions)
@@ -400,6 +401,12 @@ type (
 	DeploymentMode string
 )

+// EnableHooksDuringFailureInjection is an option that can be passed to
+// `NewTest` to enable the use of mixed-version hooks during failure injections.
+func EnableHooksDuringFailureInjection(opts *testOptions) {
+	opts.hooksSupportFailureInjection = true
+}
+
 // NeverUseFixtures is an option that can be passed to `NewTest` to
 // disable the use of fixtures in the test. Necessary if the test
 // wants to use a number of cockroach nodes other than 4.

pkg/cmd/roachtest/roachtestutil/mixedversion/mutators.go

Lines changed: 42 additions & 9 deletions
@@ -409,6 +409,10 @@ func (m panicNodeMutator) Generate(
 	idx := newStepIndex(plan)
 	nodeList := planner.currentContext.System.Descriptor.Nodes

+	// If we have at least 5 nodes, we can safely upreplicate to 5X before panicking a node.
+	// This allows for a longer panic duration before recovery.
+	supportsUpReplication := len(nodeList) >= 5
+
 	for _, upgrade := range upgrades {
 		possiblePointsInTime := upgrade.
 			// We don't want to panic concurrently with other steps, and inserting before a concurrent step
@@ -426,13 +430,21 @@ func (m panicNodeMutator) Generate(

 		isIncompatibleStep := func(s *singleStep) bool {
 			// Restarting the system on a different node while our panicked node is still dead can
-			// cause the cluster to lose quorum, so we avoid any system restarts.
-			_, restart := s.impl.(restartWithNewBinaryStep)
+			// cause the cluster to lose quorum, so we avoid any system restarts. If
+			// the cluster has a high enough node count, however, we can upreplicate
+			// to 5X before panicking, allowing us to safely restart other nodes.
+			restartImpl, restart := s.impl.(restartWithNewBinaryStep)
+			if supportsUpReplication {
+				// We can restart other nodes, but we do not want to restart the
+				// node that is being panicked, as the panic recovery step expects
+				// the node to be down in order to restart it.
+				restart = restart && restartImpl.node == targetNode[0]
+			}
 			// Waiting for stable cluster version targets every node in
 			// the cluster, so a node cannot be dead during this step.
 			_, waitForStable := s.impl.(waitForStableClusterVersionStep)
 			// Many hook steps do not support running with a dead node,
-			// so we avoid inserting after a hook step.
+			// so we avoid inserting after an incompatible hook step.
 			_, runHook := s.impl.(runHookStep)

 			if idx.IsConcurrent(s) {
@@ -445,7 +457,7 @@ func (m panicNodeMutator) Generate(
 				firstStepInConcurrentBlock = nil
 			}

-			return restart || waitForStable || runHook || s.context.System.hasUnavailableNodes
+			return restart || waitForStable || (runHook && !planner.options.hooksSupportFailureInjection) || s.context.System.hasUnavailableNodes
 		}

 		// The node should be restarted after the panic, but before any steps that are
@@ -469,19 +481,35 @@ func (m panicNodeMutator) Generate(

 		restartDesc := fmt.Sprintf("restarting node %d after panic", targetNode[0])

+		var addUpReplicateStep []mutation
+		if supportsUpReplication {
+			addUpReplicateStep = stepToPanic.
+				InsertBefore(alterReplicationFactorStep{5, targetNode})
+		}
 		addPanicStep := stepToPanic.
 			InsertBefore(panicNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode})
 		var addRestartStep []mutation
+		var addDownReplicateStep []mutation
 		var restartStep stepSelector
 		// If validEndStep is nil, it means that there are no steps after the panic step that
 		// are compatible with a dead node, so we immediately restart the node after the panic.
 		if validEndStep == nil {
 			restartStep = cutStep
-			addRestartStep = cutStep.InsertBefore(restartNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt, restartDesc})
+			addRestartStep = restartStep.InsertBefore(restartNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt, restartDesc})
 		} else {
 			restartStep = validEndStep.RandomStep(rng)
-			addRestartStep = restartStep.
-				Insert(rng, restartNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt, restartDesc})
+			if supportsUpReplication {
+				addRestartStep = restartStep.
+					InsertBefore(restartNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt, restartDesc})
+			} else {
+				addRestartStep = restartStep.
+					Insert(rng, restartNodeStep{planner.currentContext.System.Descriptor.Nodes[0], targetNode, planner.rt, restartDesc})
+			}
+		}
+
+		if supportsUpReplication {
+			addDownReplicateStep = restartStep.
+				InsertBefore(alterReplicationFactorStep{3, targetNode})
 		}

 		failureContextSteps, _ := validStartStep.CutBefore(func(s *singleStep) bool {
@@ -493,6 +521,10 @@ func (m panicNodeMutator) Generate(

 		mutations = append(mutations, addPanicStep...)
 		mutations = append(mutations, addRestartStep...)
+		if supportsUpReplication {
+			mutations = append(addUpReplicateStep, mutations...)
+			mutations = append(mutations, addDownReplicateStep...)
+		}
 	}

 	return mutations, nil
@@ -562,7 +594,8 @@ func (m networkPartitionMutator) Generate(
 			_, restartSystem := s.impl.(restartWithNewBinaryStep)
 			_, restartTenant := s.impl.(restartVirtualClusterStep)
 			// Many hook steps require communication between specific nodes, so we
-			// should recover the network partition before running them.
+			// should recover the network partition before running any incompatible
+			// hook steps.
 			_, runHook := s.impl.(runHookStep)
 			// Waiting for stable cluster version requires communication between
 			// all nodes in the cluster, so we should recover the network partition
@@ -585,7 +618,7 @@ func (m networkPartitionMutator) Generate(
 			} else {
 				unavailableNodes = s.context.System.hasUnavailableNodes
 			}
-			return unavailableNodes || restartTenant || restartSystem || runHook || waitForStable
+			return unavailableNodes || restartTenant || restartSystem || (runHook && !planner.options.hooksSupportFailureInjection) || waitForStable
 		}

 		_, validStartStep := upgrade.CutAfter(func(s *singleStep) bool {

pkg/cmd/roachtest/roachtestutil/mixedversion/steps.go

Lines changed: 41 additions & 0 deletions
@@ -14,8 +14,10 @@ import (
 	"time"

 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/option"
+	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/roachtestutil/clusterupgrade"
 	"github.com/cockroachdb/cockroach/pkg/cmd/roachtest/test"
+	"github.com/cockroachdb/cockroach/pkg/roachprod"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/install"
 	"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
@@ -969,3 +971,42 @@ func (s networkPartitionRecoveryStep) Run(
 func (s networkPartitionRecoveryStep) ConcurrencyDisabled() bool {
 	return false
 }
+
+type alterReplicationFactorStep struct {
+	replicationFactor int
+	targetNode        option.NodeListOption
+}
+
+func (s alterReplicationFactorStep) Background() shouldStop { return nil }
+
+func (s alterReplicationFactorStep) Description() string {
+	return fmt.Sprintf("alter replication factor to %d", s.replicationFactor)
+}
+
+func (s alterReplicationFactorStep) Run(
+	ctx context.Context, l *logger.Logger, rng *rand.Rand, h *Helper,
+) error {
+	stmt := fmt.Sprintf("ALTER RANGE default CONFIGURE ZONE USING num_replicas = %d", s.replicationFactor)
+	if err := h.System.Exec(
+		rng,
+		stmt,
+	); err != nil {
+		return errors.Wrap(err, "failed to change replication factor")
+	}
+
+	replicationLogger, loggerName, err := roachtestutil.LoggerForCmd(l, s.targetNode, "range-replication")
+	if err != nil {
+		return errors.Wrapf(err, "failed to create logger %s", loggerName)
+	}
+
+	l.Printf("waiting to reach replication factor of %dX; details in %s.log", s.replicationFactor, loggerName)
+	db := h.System.Connect(s.targetNode[0])
+	if err := roachtestutil.WaitForReplication(ctx, replicationLogger, db, s.replicationFactor, roachprod.AtLeastReplicationFactor); err != nil {
+		return errors.Wrapf(err, "failed to reach replication factor of %dX", s.replicationFactor)
+	}
+	return nil
+}
+
+func (s alterReplicationFactorStep) ConcurrencyDisabled() bool {
+	return false
+}
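
The wait at the end of `alterReplicationFactorStep` delegates to `roachtestutil.WaitForReplication`. As a rough illustration of what such a wait amounts to (not the actual implementation), one can poll `crdb_internal.ranges` until no range has fewer than the target number of replicas; the query assumes the `replicas` array column exposed by recent CockroachDB versions, and the connection string is again a placeholder.

// Hand-rolled approximation of an "at least N replicas" wait. Illustration
// only; roachtestutil.WaitForReplication is the real mechanism used above.
package main

import (
	"context"
	"database/sql"
	"fmt"
	"log"
	"time"

	_ "github.com/lib/pq"
)

// waitForAtLeastReplication polls until every range reports at least rf replicas.
func waitForAtLeastReplication(ctx context.Context, db *sql.DB, rf int) error {
	const q = `SELECT count(1) FROM crdb_internal.ranges WHERE array_length(replicas, 1) < $1`
	for {
		var underReplicated int
		if err := db.QueryRowContext(ctx, q, rf).Scan(&underReplicated); err != nil {
			return err
		}
		if underReplicated == 0 {
			return nil
		}
		select {
		case <-ctx.Done():
			return ctx.Err()
		case <-time.After(5 * time.Second): // poll interval chosen arbitrarily
		}
	}
}

func main() {
	db, err := sql.Open("postgres", "postgresql://root@localhost:26257/defaultdb?sslmode=disable")
	if err != nil {
		log.Fatal(err)
	}
	defer db.Close()
	if err := waitForAtLeastReplication(context.Background(), db, 5); err != nil {
		log.Fatal(err)
	}
	fmt.Println("all ranges have at least 5 replicas")
}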

pkg/cmd/roachtest/tests/tpcc.go

Lines changed: 45 additions & 8 deletions
@@ -518,6 +518,9 @@ var tpccSupportedWarehouses = []struct {
 	{hardware: "ibm-n5cpu16", v: version.MustParse(`v19.1.0-alpha.0`), warehouses: 1300},
 	// Ditto.
 	{hardware: "gce-n5cpu16", v: version.MustParse(`v2.1.0-alpha.0`), warehouses: 1300},
+
+	{hardware: "gce-n6cpu16", v: version.MustParse(`v19.1.0-alpha.0`), warehouses: 2000},
+	{hardware: "gce-n6cpu16", v: version.MustParse(`v2.1.0-alpha.0`), warehouses: 2000},
 }

 // tpccMaxRate calculates the max rate of the workload given a number of warehouses.
@@ -560,7 +563,7 @@ func maxSupportedTPCCWarehouses(
 // workload is running. The number of database upgrades is randomized
 // by the mixed-version framework which chooses a random predecessor version
 // and upgrades until it reaches the current version.
-func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster) {
+func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster, chaos bool) {
 	maxWarehouses := maxSupportedTPCCWarehouses(*t.BuildVersion(), c.Cloud(), c.Spec())
 	headroomWarehouses := int(float64(maxWarehouses) * 0.7)

@@ -569,20 +572,39 @@ func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster) {
 	// The full 6.5m import ran into out of disk errors (on 250gb machines),
 	// hence division by two.
 	bankRows := 65104166 / 2
+
 	if c.IsLocal() {
 		bankRows = 1000
 	}
+	// If the test is a chaos test, we decrease the number of rows and warehouses
+	// in order to lower the time it takes to reach replication with a larger cluster.
+	if chaos {
+		bankRows = 1000
+		headroomWarehouses = 200
+	}

-	mvt := mixedversion.NewTest(
-		ctx, t, t.L(), c, c.CRDBNodes(),
+	customOpts := []mixedversion.CustomOption{
 		// We test only upgrades from 23.2 in this test because it uses
 		// the `workload fixtures import` command, which is only supported
 		// reliably in multi-tenant mode starting from that version.
 		mixedversion.MinimumSupportedVersion("v23.2.0"),
 		// We limit the total number of plan steps to 70, which is roughly 80% of all plan lengths.
 		// See #138014 for more details.
 		mixedversion.MaxNumPlanSteps(70),
-	)
+	}
+
+	// If the test is a chaos test, we want to opt for the more expansive panic
+	// mutator, as well as any other appropriate test opts for the unique test.
+	if chaos {
+		customOpts = append([]mixedversion.CustomOption{
+			mixedversion.NeverUseFixtures,
+			mixedversion.EnableHooksDuringFailureInjection,
+		},
+			customOpts...)
+	}
+
+	mvt := mixedversion.NewTest(
+		ctx, t, t.L(), c, c.CRDBNodes(), customOpts...)

 	tenantFeaturesEnabled := make(chan struct{})
 	enableTenantFeatures := func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
@@ -594,7 +616,7 @@ func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster) {
 		l.Printf("waiting for tenant features to be enabled")
 		<-tenantFeaturesEnabled

-		randomNode := c.Node(c.CRDBNodes().SeededRandNode(rng)[0])
+		randomNode := c.Node(h.AvailableNodes().SeededRandNode(rng)[0])
 		cmd := tpccImportCmdWithCockroachBinary(test.DefaultCockroachPath, "", "tpcc", headroomWarehouses, fmt.Sprintf("{pgurl%s}", randomNode))
 		return c.RunE(ctx, option.WithNodes(randomNode), cmd)
 	}
@@ -603,7 +625,7 @@ func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster) {
 	// upgrade machinery, in which a) all ranges are touched and b) work proportional
 	// to the amount data may be carried out.
 	importLargeBank := func(ctx context.Context, l *logger.Logger, rng *rand.Rand, h *mixedversion.Helper) error {
-		randomNode := c.Node(c.CRDBNodes().SeededRandNode(rng)[0])
+		randomNode := c.Node(h.AvailableNodes().SeededRandNode(rng)[0])
 		// Upload a versioned cockroach binary to the random node. The bank workload
 		// is no longer backwards compatible after #149374, so we need to use the same
 		// version as the cockroach cluster.
@@ -647,7 +669,7 @@ func runTPCCMixedHeadroom(ctx context.Context, t test.Test, c cluster.Cluster) {
 		labelsMap = getTpccLabels(headroomWarehouses, rampDur, workloadDur/time.Millisecond, nil)
 	}
 	cmd := roachtestutil.NewCommand("./cockroach workload run tpcc").
-		Arg("{pgurl%s}", c.CRDBNodes()).
+		Arg("{pgurl%s}", h.AvailableNodes()).
 		Flag("duration", workloadDur).
 		Flag("warehouses", headroomWarehouses).
 		Flag("histograms", histogramsPath).
@@ -728,6 +750,21 @@ func registerTPCC(r registry.Registry) {
 			})
 		},
 	})
+	mixedHeadroomChaosSpec := r.MakeClusterSpec(6, spec.CPU(16), spec.WorkloadNode(), spec.RandomlyUseZfs())
+	r.Add(registry.TestSpec{
+		Name:              "tpcc/mixed-headroom/chaos/" + mixedHeadroomChaosSpec.String(),
+		Timeout:           7 * time.Hour,
+		Owner:             registry.OwnerTestEng,
+		CompatibleClouds:  registry.AllClouds.NoAWS().NoIBM(),
+		Suites:            registry.Suites(registry.MixedVersion),
+		Cluster:           mixedHeadroomChaosSpec,
+		EncryptionSupport: registry.EncryptionMetamorphic,
+		Monitor:           true,
+		Randomized:        true,
+		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
+			runTPCCMixedHeadroom(ctx, t, c, true)
+		},
+	})

 	mixedHeadroomSpec := r.MakeClusterSpec(5, spec.CPU(16), spec.WorkloadNode(), spec.RandomlyUseZfs())
 	r.Add(registry.TestSpec{
@@ -749,7 +786,7 @@ func registerTPCC(r registry.Registry) {
 		Monitor:    true,
 		Randomized: true,
 		Run: func(ctx context.Context, t test.Test, c cluster.Cluster) {
-			runTPCCMixedHeadroom(ctx, t, c)
+			runTPCCMixedHeadroom(ctx, t, c, false)
 		},
 	})
