Skip to content

Commit 138b99d

Browse files
committed
roachprod: add reset failure mode
This change adds a reset failure mode to the failure injection framework. It uses the reset functionality provided by the cloud providers to reset the nodes in the cluster. Before a reset happens, roachprod Monitor will collect a list of processes running on the node and store them in a map. After the reset, the processes are restarted. Epic: None Release note: None
1 parent e81a040 commit 138b99d

File tree

3 files changed

+123
-0
lines changed

3 files changed

+123
-0
lines changed

pkg/roachprod/failureinjection/failures/BUILD.bazel

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@ go_library(
1111
"noop.go",
1212
"process_kill.go",
1313
"registry.go",
14+
"reset.go",
1415
],
1516
importpath = "github.com/cockroachdb/cockroach/pkg/roachprod/failureinjection/failures",
1617
visibility = ["//visibility:public"],

pkg/roachprod/failureinjection/failures/registry.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ func (r *FailureRegistry) Register() {
3333
registerDmsetupDiskStall(r)
3434
registerIPTablesPartitionFailure(r)
3535
registerNetworkLatencyFailure(r)
36+
registerResetVM(r)
3637
registerNoopFailure(r)
3738
registerProcessKillFailure(r)
3839
}
pkg/roachprod/failureinjection/failures/reset.go

Lines changed: 121 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,121 @@
1+
// Copyright 2025 The Cockroach Authors.
2+
//
3+
// Use of this software is governed by the CockroachDB Software License
4+
// included in the /LICENSE file.
5+
6+
package failures
7+
8+
import (
9+
"context"
10+
"time"
11+
12+
"github.com/cockroachdb/cockroach/pkg/roachprod"
13+
"github.com/cockroachdb/cockroach/pkg/roachprod/install"
14+
"github.com/cockroachdb/cockroach/pkg/roachprod/logger"
15+
)
16+
17+
type (
18+
ResetVMArgs struct {
19+
Nodes install.Nodes
20+
}
21+
resetVMFailure struct {
22+
GenericFailure
23+
Processes map[install.Node][]install.MonitorProcessRunning
24+
}
25+
)
26+
27+
var _ FailureMode = &resetVMFailure{}
28+
29+
const ResetVMFailureName = "reset-vm"
30+
31+
func registerResetVM(r *FailureRegistry) {
32+
r.add(ResetVMFailureName, ResetVMArgs{}, MakeResetVMFailure)
33+
}
34+
35+
func MakeResetVMFailure(clusterName string, l *logger.Logger, secure bool) (FailureMode, error) {
36+
c, err := roachprod.GetClusterFromCache(l, clusterName, install.SecureOption(secure))
37+
if err != nil {
38+
return nil, err
39+
}
40+
41+
return &resetVMFailure{
42+
GenericFailure: GenericFailure{
43+
c: c,
44+
},
45+
}, nil
46+
}
47+
48+
// Description implements FailureMode.
49+
func (r *resetVMFailure) Description() string {
50+
return ResetVMFailureName
51+
}
52+
53+
// Setup implements FailureMode.
54+
func (r *resetVMFailure) Setup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
55+
return nil
56+
}
57+
58+
// Inject implements FailureMode.
59+
func (r *resetVMFailure) Inject(ctx context.Context, l *logger.Logger, args FailureArgs) error {
60+
// Capture the processes running on the nodes.
61+
nodes := args.(ResetVMArgs).Nodes
62+
monitorChan := r.c.WithNodes(nodes).Monitor(l, ctx, install.MonitorOpts{OneShot: true})
63+
r.Processes = make(map[install.Node][]install.MonitorProcessRunning, 0)
64+
for e := range monitorChan {
65+
if p, ok := e.Event.(install.MonitorProcessRunning); ok {
66+
r.Processes[e.Node] = append(r.Processes[e.Node], p)
67+
}
68+
}
69+
70+
return r.c.WithNodes(nodes).Reset(l)
71+
}
72+
73+
// Cleanup implements FailureMode.
74+
func (r *resetVMFailure) Cleanup(ctx context.Context, l *logger.Logger, args FailureArgs) error {
75+
return nil
76+
}
77+
78+
// Recover implements FailureMode.
79+
func (r *resetVMFailure) Recover(ctx context.Context, l *logger.Logger, args FailureArgs) error {
80+
// Restart the processes.
81+
for node, processes := range r.Processes {
82+
for _, p := range processes {
83+
l.Printf("Starting process %s on node %s", p.PID, p.VirtualClusterName)
84+
err := r.c.WithNodes([]install.Node{node}).Start(ctx, l, install.StartOpts{
85+
VirtualClusterName: p.VirtualClusterName,
86+
SQLInstance: p.SQLInstance,
87+
IsRestart: true,
88+
})
89+
if err != nil {
90+
return err
91+
}
92+
}
93+
}
94+
return nil
95+
}
96+
97+
// WaitForFailureToPropagate implements FailureMode.
98+
func (r *resetVMFailure) WaitForFailureToPropagate(
99+
ctx context.Context, l *logger.Logger, args FailureArgs,
100+
) error {
101+
nodes := args.(ResetVMArgs).Nodes
102+
l.Printf("Waiting for nodes to become unavailable: %v", nodes)
103+
104+
// Some providers take a while to stop VMs (>10 minutes).
105+
return forEachNode(nodes, func(n install.Nodes) error {
106+
return r.WaitForSQLUnavailable(ctx, l, n, 15*time.Minute)
107+
})
108+
}
109+
110+
// WaitForFailureToRecover implements FailureMode.
111+
func (r *resetVMFailure) WaitForFailureToRecover(
112+
ctx context.Context, l *logger.Logger, args FailureArgs,
113+
) error {
114+
nodes := args.(ResetVMArgs).Nodes
115+
l.Printf("Waiting for nodes to become available: %v", nodes)
116+
117+
// Some providers take a while to start VMs (>10 minutes).
118+
return forEachNode(nodes, func(n install.Nodes) error {
119+
return r.WaitForSQLReady(ctx, l, n, 15*time.Minute)
120+
})
121+
}

0 commit comments

Comments
 (0)