@@ -7,6 +7,7 @@ package failures
7
7
8
8
import (
9
9
"context"
10
+ "slices"
10
11
"time"
11
12
12
13
"github.com/cockroachdb/cockroach/pkg/roachprod"
@@ -16,12 +17,22 @@ import (
16
17
17
18
type (
18
19
ResetVMArgs struct {
19
- Nodes install.Nodes
20
+ Nodes install.Nodes
21
+ StopProcesses bool
20
22
}
21
23
resetVMFailure struct {
22
24
GenericFailure
23
- Processes map [install. Node ][]install. MonitorProcessRunning
25
+ processes processMap
24
26
}
27
+ // nodeSet is a set of nodes.
28
+ nodeSet map [install.Node ]struct {}
29
+ // instanceMap is a map of SQL instances to a map of nodes that are running
30
+ // that instance.
31
+ instanceMap map [int ]nodeSet
32
+ // processMap is a map of virtual cluster names to instance maps. It's a
33
+ // convenience type that allows grouping the processes that should be
34
+ // started and stopped together.
35
+ processMap map [string ]instanceMap
25
36
)
26
37
27
38
// Compile-time check that *resetVMFailure satisfies FailureMode.
var _ FailureMode = (*resetVMFailure)(nil)
@@ -47,6 +58,43 @@ func MakeResetVMFailure(
47
58
}, nil
48
59
}
49
60
61
+ func (m * processMap ) add (virtualClusterName string , instance int , node install.Node ) {
62
+ if virtualClusterName == "" {
63
+ virtualClusterName = install .SystemInterfaceName
64
+ }
65
+ if _ , ok := (* m )[virtualClusterName ]; ! ok {
66
+ (* m )[virtualClusterName ] = make (map [int ]nodeSet , 0 )
67
+ }
68
+ if _ , ok := (* m )[virtualClusterName ][instance ]; ! ok {
69
+ (* m )[virtualClusterName ][instance ] = make (nodeSet , 0 )
70
+ }
71
+ (* m )[virtualClusterName ][instance ][node ] = struct {}{}
72
+ }
73
+
74
+ // getStartOrder returns the order in which the processes should be started. It
75
+ // ensures that the System interface is started first.
76
+ func (m * processMap ) getStartOrder () []string {
77
+ var order []string
78
+ // If the System interface is present, it should be the first to start.
79
+ if _ , ok := (* m )[install .SystemInterfaceName ]; ok {
80
+ order = append (order , install .SystemInterfaceName )
81
+ }
82
+ for virtualClusterName := range * m {
83
+ if virtualClusterName != install .SystemInterfaceName {
84
+ order = append (order , virtualClusterName )
85
+ }
86
+ }
87
+ return order
88
+ }
89
+
90
+ // getStopOrder returns the order in which the processes should be stopped. It
91
+ // is the reverse of the start order.
92
+ func (m * processMap ) getStopOrder () []string {
93
+ order := m .getStartOrder ()
94
+ slices .Reverse (order )
95
+ return order
96
+ }
97
+
50
98
// Description implements FailureMode.
51
99
func (r * resetVMFailure ) Description () string {
52
100
return ResetVMFailureName
@@ -62,10 +110,29 @@ func (r *resetVMFailure) Inject(ctx context.Context, l *logger.Logger, args Fail
62
110
// Capture the processes running on the nodes.
63
111
nodes := args .(ResetVMArgs ).Nodes
64
112
monitorChan := r .c .WithNodes (nodes ).Monitor (l , ctx , install.MonitorOpts {OneShot : true })
65
- r .Processes = make (map [install. Node ][]install. MonitorProcessRunning , 0 )
113
+ r .processes = make (processMap , 0 )
66
114
for e := range monitorChan {
67
115
if p , ok := e .Event .(install.MonitorProcessRunning ); ok {
68
- r .Processes [e .Node ] = append (r .Processes [e .Node ], p )
116
+ r .processes .add (p .VirtualClusterName , p .SQLInstance , e .Node )
117
+ }
118
+ }
119
+
120
+ // Optionally stop the processes.
121
+ if args .(ResetVMArgs ).StopProcesses {
122
+ for _ , virtualClusterName := range r .processes .getStopOrder () {
123
+ instanceMap := r .processes [virtualClusterName ]
124
+ for _ , nodeMap := range instanceMap {
125
+ var stopNodes install.Nodes
126
+ for node := range nodeMap {
127
+ stopNodes = append (nodes , node )
128
+ }
129
+ l .Printf ("Stopping process %s on nodes %v" , virtualClusterName , stopNodes )
130
+ stopOpts := roachprod .DefaultStopOpts ()
131
+ err := r .c .WithNodes (stopNodes ).Stop (ctx , l , stopOpts .Sig , stopOpts .Wait , stopOpts .GracePeriod , virtualClusterName )
132
+ if err != nil {
133
+ return err
134
+ }
135
+ }
69
136
}
70
137
}
71
138
@@ -80,12 +147,17 @@ func (r *resetVMFailure) Cleanup(ctx context.Context, l *logger.Logger, args Fai
80
147
// Recover implements FailureMode.
81
148
func (r * resetVMFailure ) Recover (ctx context.Context , l * logger.Logger , args FailureArgs ) error {
82
149
// Restart the processes.
83
- for node , processes := range r .Processes {
84
- for _ , p := range processes {
85
- l .Printf ("Starting process %s on node %s" , p .PID , p .VirtualClusterName )
86
- err := r .c .WithNodes ([]install.Node {node }).Start (ctx , l , install.StartOpts {
87
- VirtualClusterName : p .VirtualClusterName ,
88
- SQLInstance : p .SQLInstance ,
150
+ for _ , virtualClusterName := range r .processes .getStartOrder () {
151
+ instanceMap := r .processes [virtualClusterName ]
152
+ for instance , nodeMap := range instanceMap {
153
+ var nodes install.Nodes
154
+ for node := range nodeMap {
155
+ nodes = append (nodes , node )
156
+ }
157
+ l .Printf ("Starting process %s on nodes %v" , virtualClusterName , nodes )
158
+ err := r .c .WithNodes (nodes ).Start (ctx , l , install.StartOpts {
159
+ VirtualClusterName : virtualClusterName ,
160
+ SQLInstance : instance ,
89
161
IsRestart : true ,
90
162
})
91
163
if err != nil {
0 commit comments