Skip to content

Commit a62909d

Browse files
committed
roachprod: add stop option to reset VM failure
When the reset VM failure is injected, we now have the option to stop the processes before restarting the cluster. This change also improves process management by starting the processes in the correct order (System interface first, then tenants). Epic: None Release note: None
1 parent c9e7aad commit a62909d

File tree

1 file changed

+82
-10
lines changed
  • pkg/roachprod/failureinjection/failures

1 file changed

+82
-10
lines changed

pkg/roachprod/failureinjection/failures/reset.go

Lines changed: 82 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,7 @@ package failures
77

88
import (
99
"context"
10+
"slices"
1011
"time"
1112

1213
"github.com/cockroachdb/cockroach/pkg/roachprod"
@@ -16,12 +17,22 @@ import (
1617

1718
type (
1819
ResetVMArgs struct {
19-
Nodes install.Nodes
20+
Nodes install.Nodes
21+
StopProcesses bool
2022
}
2123
resetVMFailure struct {
2224
GenericFailure
23-
Processes map[install.Node][]install.MonitorProcessRunning
25+
processes processMap
2426
}
27+
// nodeSet is a set of nodes.
28+
nodeSet map[install.Node]struct{}
29+
// instanceMap is a map of SQL instances to a map of nodes that are running
30+
// that instance.
31+
instanceMap map[int]nodeSet
32+
// processMap is a map of virtual cluster names to instance maps. It's a
33+
// convenience type that allows grouping the processes that should be
34+
// started and stopped together.
35+
processMap map[string]instanceMap
2536
)
2637

2738
var _ FailureMode = &resetVMFailure{}
@@ -47,6 +58,43 @@ func MakeResetVMFailure(
4758
}, nil
4859
}
4960

61+
func (m *processMap) add(virtualClusterName string, instance int, node install.Node) {
62+
if virtualClusterName == "" {
63+
virtualClusterName = install.SystemInterfaceName
64+
}
65+
if _, ok := (*m)[virtualClusterName]; !ok {
66+
(*m)[virtualClusterName] = make(map[int]nodeSet, 0)
67+
}
68+
if _, ok := (*m)[virtualClusterName][instance]; !ok {
69+
(*m)[virtualClusterName][instance] = make(nodeSet, 0)
70+
}
71+
(*m)[virtualClusterName][instance][node] = struct{}{}
72+
}
73+
74+
// getStartOrder returns the order in which the processes should be started. It
75+
// ensures that the System interface is started first.
76+
func (m *processMap) getStartOrder() []string {
77+
var order []string
78+
// If the System interface is present, it should be the first to start.
79+
if _, ok := (*m)[install.SystemInterfaceName]; ok {
80+
order = append(order, install.SystemInterfaceName)
81+
}
82+
for virtualClusterName := range *m {
83+
if virtualClusterName != install.SystemInterfaceName {
84+
order = append(order, virtualClusterName)
85+
}
86+
}
87+
return order
88+
}
89+
90+
// getStopOrder returns the order in which the processes should be stopped. It
91+
// is the reverse of the start order.
92+
func (m *processMap) getStopOrder() []string {
93+
order := m.getStartOrder()
94+
slices.Reverse(order)
95+
return order
96+
}
97+
5098
// Description implements FailureMode.
5199
func (r *resetVMFailure) Description() string {
52100
return ResetVMFailureName
@@ -62,10 +110,29 @@ func (r *resetVMFailure) Inject(ctx context.Context, l *logger.Logger, args Fail
62110
// Capture the processes running on the nodes.
63111
nodes := args.(ResetVMArgs).Nodes
64112
monitorChan := r.c.WithNodes(nodes).Monitor(l, ctx, install.MonitorOpts{OneShot: true})
65-
r.Processes = make(map[install.Node][]install.MonitorProcessRunning, 0)
113+
r.processes = make(processMap, 0)
66114
for e := range monitorChan {
67115
if p, ok := e.Event.(install.MonitorProcessRunning); ok {
68-
r.Processes[e.Node] = append(r.Processes[e.Node], p)
116+
r.processes.add(p.VirtualClusterName, p.SQLInstance, e.Node)
117+
}
118+
}
119+
120+
// Optionally stop the processes.
121+
if args.(ResetVMArgs).StopProcesses {
122+
for _, virtualClusterName := range r.processes.getStopOrder() {
123+
instanceMap := r.processes[virtualClusterName]
124+
for _, nodeMap := range instanceMap {
125+
var stopNodes install.Nodes
126+
for node := range nodeMap {
127+
stopNodes = append(nodes, node)
128+
}
129+
l.Printf("Stopping process %s on nodes %v", virtualClusterName, stopNodes)
130+
stopOpts := roachprod.DefaultStopOpts()
131+
err := r.c.WithNodes(stopNodes).Stop(ctx, l, stopOpts.Sig, stopOpts.Wait, stopOpts.GracePeriod, virtualClusterName)
132+
if err != nil {
133+
return err
134+
}
135+
}
69136
}
70137
}
71138

@@ -80,12 +147,17 @@ func (r *resetVMFailure) Cleanup(ctx context.Context, l *logger.Logger, args Fai
80147
// Recover implements FailureMode.
81148
func (r *resetVMFailure) Recover(ctx context.Context, l *logger.Logger, args FailureArgs) error {
82149
// Restart the processes.
83-
for node, processes := range r.Processes {
84-
for _, p := range processes {
85-
l.Printf("Starting process %s on node %s", p.PID, p.VirtualClusterName)
86-
err := r.c.WithNodes([]install.Node{node}).Start(ctx, l, install.StartOpts{
87-
VirtualClusterName: p.VirtualClusterName,
88-
SQLInstance: p.SQLInstance,
150+
for _, virtualClusterName := range r.processes.getStartOrder() {
151+
instanceMap := r.processes[virtualClusterName]
152+
for instance, nodeMap := range instanceMap {
153+
var nodes install.Nodes
154+
for node := range nodeMap {
155+
nodes = append(nodes, node)
156+
}
157+
l.Printf("Starting process %s on nodes %v", virtualClusterName, nodes)
158+
err := r.c.WithNodes(nodes).Start(ctx, l, install.StartOpts{
159+
VirtualClusterName: virtualClusterName,
160+
SQLInstance: instance,
89161
IsRestart: true,
90162
})
91163
if err != nil {

0 commit comments

Comments
 (0)