Skip to content

Commit 8e52c96

Browse files
craig[bot]herkolategan
andcommitted
Merge #147835
147835: roachprod: add VM reset to failure injection framework r=DarrylWong,golgeek a=herkolategan This PR adds VM reset functionality to the failure injection framework, allowing for controlled node resets during testing. 1. Updated `roachprod` reset command to operate on a subset of nodes instead of all nodes 2. Implemented a new reset-vm failure mode that: - Captures running processes before reset - Resets specified nodes - Restarts processes after reset 3. Added smoke tests to validate the reset failure mode Fixes: #147361 Epic: None Release note: None Co-authored-by: Herko Lategan <[email protected]>
2 parents 211a8ec + d44036d commit 8e52c96

File tree

11 files changed

+216
-17
lines changed

11 files changed

+216
-17
lines changed

pkg/cmd/roachprod/cli/commands.go

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -199,15 +199,17 @@ Removing nodes from the middle of the cluster is not supported yet.
199199
}
200200

201201
func (cr *commandRegistry) buildResetCmd() *cobra.Command {
202-
return &cobra.Command{
202+
resetCmd := &cobra.Command{
203203
Use: "reset <cluster>",
204-
Short: "reset *all* VMs in a cluster",
205-
Long: `Reset a cloud VM.`,
204+
Short: "reset VMs in a cluster",
205+
Long: `Reset cloud VMs in a cluster.`,
206206
Args: cobra.ExactArgs(1),
207207
Run: wrap(func(cmd *cobra.Command, args []string) (retErr error) {
208208
return roachprod.Reset(config.Logger, args[0])
209209
}),
210210
}
211+
addHelpAboutNodes(resetCmd)
212+
return resetCmd
211213
}
212214

213215
func (cr *commandRegistry) buildDestroyCmd() *cobra.Command {

pkg/cmd/roachtest/cluster.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1852,6 +1852,12 @@ func (c *clusterImpl) doDestroy(ctx context.Context, l *logger.Logger) <-chan st
18521852
return ch
18531853
}
18541854

1855+
func (c *clusterImpl) Reset(
1856+
ctx context.Context, l *logger.Logger, nodes option.NodeListOption,
1857+
) error {
1858+
return roachprod.Reset(l, c.MakeNodes(nodes))
1859+
}
1860+
18551861
func (c *clusterImpl) addLabels(labels map[string]string) error {
18561862
// N.B. we must sanitize the values; e.g., some test names can exceed the maximum length (63 chars in GCE).
18571863
// N.B. we don't sanitize the keys; unlike values, they are typically _not_ (dynamically) generated.

pkg/cmd/roachtest/cluster/cluster_interface.go

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -160,6 +160,10 @@ type Cluster interface {
160160
) error
161161
PopulateEtcHosts(ctx context.Context, l *logger.Logger) error
162162

163+
// VM management
164+
165+
Reset(ctx context.Context, l *logger.Logger, nodes option.NodeListOption) error
166+
163167
// Methods whose inclusion on this interface is purely historical.
164168
// These should be removed over time.
165169

pkg/cmd/roachtest/clusterstats/mock_cluster_generated_test.go

Lines changed: 14 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

pkg/cmd/roachtest/tests/failure_injection.go

Lines changed: 43 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -726,6 +726,48 @@ var processKillTests = func(c cluster.Cluster) []failureSmokeTest {
726726
return tests
727727
}
728728

729+
var resetVMTests = func(c cluster.Cluster) failureSmokeTest {
730+
rng, _ := randutil.NewPseudoRand()
731+
rebootedNode := c.CRDBNodes().SeededRandNode(rng)
732+
return failureSmokeTest{
733+
testName: failures.ResetVMFailureName,
734+
failureName: failures.ResetVMFailureName,
735+
args: failures.ResetVMArgs{
736+
Nodes: rebootedNode.InstallNodes(),
737+
},
738+
validateFailure: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
739+
// Check that we aren't able to establish a SQL connection to the rebooted node.
740+
// waitForFailureToPropagate already does a similar check, but we do it here
741+
// to satisfy the smoke test framework since this is a fairly simple failure
742+
// mode with less to validate.
743+
return testutils.SucceedsSoonError(func() error {
744+
if ctx.Err() != nil {
745+
return ctx.Err()
746+
}
747+
748+
killedDB, err := c.ConnE(ctx, l, rebootedNode[0])
749+
if err == nil {
750+
defer killedDB.Close()
751+
if err := killedDB.Ping(); err == nil {
752+
return errors.Errorf("expected node %d to be dead, but it is alive", rebootedNode)
753+
} else {
754+
l.Printf("failed to connect to node %d: %v", rebootedNode, err)
755+
}
756+
} else {
757+
l.Printf("unable to establish SQL connection to node %d", rebootedNode)
758+
}
759+
return nil
760+
})
761+
},
762+
validateRecover: func(ctx context.Context, l *logger.Logger, c cluster.Cluster, f *failures.Failer) error {
763+
return nil
764+
},
765+
workload: func(ctx context.Context, c cluster.Cluster, args ...string) error {
766+
return defaultFailureSmokeTestWorkload(ctx, c, "--tolerate-errors")
767+
},
768+
}
769+
}
770+
729771
func defaultFailureSmokeTestWorkload(ctx context.Context, c cluster.Cluster, args ...string) error {
730772
workloadArgs := strings.Join(args, " ")
731773
cmd := roachtestutil.NewCommand("./cockroach workload run kv %s", workloadArgs).
@@ -772,6 +814,7 @@ func runFailureSmokeTest(ctx context.Context, t test.Test, c cluster.Cluster, no
772814
asymmetricOutgoingNetworkPartitionTest(c),
773815
latencyTest(c),
774816
dmsetupDiskStallTest(c),
817+
resetVMTests(c),
775818
}
776819
failureSmokeTests = append(failureSmokeTests, cgroupsDiskStallTests(c)...)
777820
failureSmokeTests = append(failureSmokeTests, processKillTests(c)...)

pkg/roachprod/failureinjection/failures/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ go_library(
1111
"noop.go",
1212
"process_kill.go",
1313
"registry.go",
14+
"reset.go",
1415
],
1516
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures",
1617
visibility = ["//visibility:public"],

pkg/roachprod/failureinjection/failures/registry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ func (r *FailureRegistry) Register() {
3333
registerDmsetupDiskStall(r)
3434
registerIPTablesPartitionFailure(r)
3535
registerNetworkLatencyFailure(r)
36+
registerResetVM(r)
3637
registerNoopFailure(r)
3738
registerProcessKillFailure(r)
3839
}
Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package failures
7+
8+
import (
9+
"context"
10+
"time"
11+
12+
"github.com/cockroachdb/cockroach/pkg/roachprod"
13+
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
14+
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
15+
)
16+
17+
type (
18+
ResetVMArgs struct {
19+
Nodes install.Nodes
20+
}
21+
resetVMFailure struct {
22+
GenericFailure
23+
Processes map[install.Node][]install.MonitorProcessRunning
24+
}
25+
)
26+
27+
var _ FailureMode = &resetVMFailure{}
28+
29+
const ResetVMFailureName = "reset-vm"
30+
31+
func registerResetVM(r *FailureRegistry) {
32+
r.add(ResetVMFailureName, ResetVMArgs{}, MakeResetVMFailure)
33+
}
34+
35+
func MakeResetVMFailure(clusterName string, l *logger.Logger, secure bool) (FailureMode, error) {
36+
c, err := roachprod.GetClusterFromCache(l, clusterName, install.SecureOption(secure))
37+
if err != nil {
38+
return nil, err
39+
}
40+
41+
return &resetVMFailure{
42+
GenericFailure: GenericFailure{
43+
c: c,
44+
},
45+
}, nil
46+
}
47+
48+
// Description implements FailureMode.
49+
func (r *resetVMFailure) Description() string {
50+
return ResetVMFailureName
51+
}
52+
53+
// Setup implements FailureMode.
54+
func (r *resetVMFailure) Setup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
55+
return nil
56+
}
57+
58+
// Inject implements FailureMode.
59+
func (r *resetVMFailure) Inject(ctx context.Context, l *logger.Logger, args FailureArgs) error {
60+
// Capture the processes running on the nodes.
61+
nodes := args.(ResetVMArgs).Nodes
62+
monitorChan := r.c.WithNodes(nodes).Monitor(l, ctx, install.MonitorOpts{OneShot: true})
63+
r.Processes = make(map[install.Node][]install.MonitorProcessRunning, 0)
64+
for e := range monitorChan {
65+
if p, ok := e.Event.(install.MonitorProcessRunning); ok {
66+
r.Processes[e.Node] = append(r.Processes[e.Node], p)
67+
}
68+
}
69+
70+
return r.c.WithNodes(nodes).Reset(l)
71+
}
72+
73+
// Cleanup implements FailureMode.
74+
func (r *resetVMFailure) Cleanup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
75+
return nil
76+
}
77+
78+
// Recover implements FailureMode.
79+
func (r *resetVMFailure) Recover(ctx context.Context, l *logger.Logger, args FailureArgs) error {
80+
// Restart the processes.
81+
for node, processes := range r.Processes {
82+
for _, p := range processes {
83+
l.Printf("Starting process %s on node %s", p.PID, p.VirtualClusterName)
84+
err := r.c.WithNodes([]install.Node{node}).Start(ctx, l, install.StartOpts{
85+
VirtualClusterName: p.VirtualClusterName,
86+
SQLInstance: p.SQLInstance,
87+
IsRestart: true,
88+
})
89+
if err != nil {
90+
return err
91+
}
92+
}
93+
}
94+
return nil
95+
}
96+
97+
// WaitForFailureToPropagate implements FailureMode.
98+
func (r *resetVMFailure) WaitForFailureToPropagate(
99+
ctx context.Context, l *logger.Logger, args FailureArgs,
100+
) error {
101+
nodes := args.(ResetVMArgs).Nodes
102+
l.Printf("Waiting for nodes to become unavailable: %v", nodes)
103+
104+
// Some providers take a while to stop VMs (>10 minutes).
105+
return forEachNode(nodes, func(n install.Nodes) error {
106+
return r.WaitForSQLUnavailable(ctx, l, n, 15*time.Minute)
107+
})
108+
}
109+
110+
// WaitForFailureToRecover implements FailureMode.
111+
func (r *resetVMFailure) WaitForFailureToRecover(
112+
ctx context.Context, l *logger.Logger, args FailureArgs,
113+
) error {
114+
nodes := args.(ResetVMArgs).Nodes
115+
l.Printf("Waiting for nodes to become available: %v", nodes)
116+
117+
// Some providers take a while to start VMs (>10 minutes).
118+
return forEachNode(nodes, func(n install.Nodes) error {
119+
return r.WaitForSQLReady(ctx, l, n, 15*time.Minute)
120+
})
121+
}

pkg/roachprod/install/cluster_synced.go

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2725,3 +2725,20 @@ done <<< "$HOSTS_LIST"
27252725

27262726
return nil
27272727
}
2728+
2729+
// Reset resets VMs in a cluster.
2730+
func (c *SyncedCluster) Reset(l *logger.Logger) error {
2731+
if c.IsLocal() {
2732+
return nil
2733+
}
2734+
2735+
nodes := c.TargetNodes()
2736+
targetVMs := make(vm.List, len(nodes))
2737+
for idx, node := range nodes {
2738+
targetVMs[idx] = c.VMs[node-1]
2739+
}
2740+
2741+
return vm.FanOut(targetVMs, func(p vm.Provider, vms vm.List) error {
2742+
return p.Reset(l, vms)
2743+
})
2744+
}

pkg/roachprod/roachprod.go

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -626,24 +626,14 @@ func Stage(
626626
return install.StageApplication(ctx, l, c, applicationName, version, os, vm.CPUArch(arch), dir)
627627
}
628628

629-
// Reset resets all VMs in a cluster.
629+
// Reset resets VMs in a cluster.
630630
func Reset(l *logger.Logger, clusterName string) error {
631-
if err := LoadClusters(); err != nil {
632-
return err
633-
}
634-
635-
if config.IsLocalClusterName(clusterName) {
636-
return nil
637-
}
638-
639-
c, err := getClusterFromCloud(l, clusterName)
631+
c, err := GetClusterFromCache(l, clusterName)
640632
if err != nil {
641633
return err
642634
}
643635

644-
return vm.FanOut(c.VMs, func(p vm.Provider, vms vm.List) error {
645-
return p.Reset(l, vms)
646-
})
636+
return c.Reset(l)
647637
}
648638

649639
// SetupSSH sets up the keys and host keys for the vms in the cluster.

0 commit comments

Comments
 (0)