Skip to content

Commit 2ed7f1c

Browse files
author
Vidit Bhat
committed
drt: add configurable downtime for node kill operations
Previously, node-kill operations attempted to restart the killed node as soon as the cleanup phase began. This was inadequate because it didn’t allow tests to simulate real-world scenarios in which a node remains offline for an extended period.

This patch introduces a configurable downtime parameter for node-kill operations. The `cleanupNodeKill` struct now includes a `downtime` field, and the `Cleanup` method waits for the specified duration before restarting the node. Operation variants are now registered for each combination of signal type, drain behavior, and downtime duration (1m, 10m, 1h, 5h).

Epic: none
Fixes: #138573, #138574, #138575, #138576
Release note: None
1 parent d1bfdaa commit 2ed7f1c

File tree

1 file changed

+66
-44
lines changed

1 file changed

+66
-44
lines changed

pkg/cmd/roachtest/operations/node_kill.go

Lines changed: 66 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -20,11 +20,19 @@ import (
2020
)
2121

2222
type cleanupNodeKill struct {
23-
nodes option.NodeListOption
23+
nodes option.NodeListOption
24+
downtime time.Duration
2425
}
2526

2627
func (cl *cleanupNodeKill) Cleanup(ctx context.Context, o operation.Operation, c cluster.Cluster) {
27-
// We might need to restart the node if it isn't live.
28+
o.Status(fmt.Sprintf("waiting for %s before restarting node %s", cl.downtime, cl.nodes))
29+
select {
30+
case <-time.After(cl.downtime):
31+
case <-ctx.Done():
32+
o.Status("cleanup context cancelled during wait")
33+
return
34+
}
35+
2836
db, err := c.ConnE(ctx, o.L(), cl.nodes[0])
2937
if err != nil {
3038
err = c.RunE(ctx, option.WithNodes(cl.nodes), "./cockroach.sh")
@@ -48,15 +56,20 @@ func (cl *cleanupNodeKill) Cleanup(ctx context.Context, o operation.Operation, c
4856
}
4957

5058
func nodeKillRunner(
51-
signal int, drain bool,
59+
signal int, drain bool, downtime time.Duration,
5260
) func(ctx context.Context, o operation.Operation, c cluster.Cluster) registry.OperationCleanup {
5361
return func(ctx context.Context, o operation.Operation, c cluster.Cluster) registry.OperationCleanup {
54-
return runNodeKill(ctx, o, c, signal, drain)
62+
return runNodeKill(ctx, o, c, signal, drain, downtime)
5563
}
5664
}
5765

5866
func runNodeKill(
59-
ctx context.Context, o operation.Operation, c cluster.Cluster, signal int, drain bool,
67+
ctx context.Context,
68+
o operation.Operation,
69+
c cluster.Cluster,
70+
signal int,
71+
drain bool,
72+
downtime time.Duration,
6073
) registry.OperationCleanup {
6174
rng, _ := randutil.NewPseudoRand()
6275
node := c.All().SeededRandNode(rng)
@@ -88,46 +101,55 @@ func runNodeKill(
88101
time.Sleep(1 * time.Second)
89102
}
90103

91-
return &cleanupNodeKill{
92-
nodes: node,
93-
}
104+
// Schedule the cleanup instead of returning it
105+
go func() {
106+
cleanup := cleanupNodeKill{
107+
nodes: node,
108+
downtime: downtime,
109+
}
110+
cleanup.Cleanup(ctx, o, c)
111+
}()
112+
113+
// return nil to avoid the hardcoded 5s + random [0s, 24h] wait
114+
return nil
94115
}
95116

96117
func registerNodeKill(r registry.Registry) {
97-
r.AddOperation(registry.OperationSpec{
98-
Name: "node-kill/sigkill/drain=true",
99-
Owner: registry.OwnerServer,
100-
Timeout: 15 * time.Minute,
101-
CompatibleClouds: registry.AllClouds,
102-
CanRunConcurrently: registry.OperationCannotRunConcurrently,
103-
Dependencies: []registry.OperationDependency{registry.OperationRequiresZeroUnderreplicatedRanges},
104-
Run: nodeKillRunner(9 /* signal */, true /* drain */),
105-
})
106-
r.AddOperation(registry.OperationSpec{
107-
Name: "node-kill/sigkill/drain=false",
108-
Owner: registry.OwnerServer,
109-
Timeout: 10 * time.Minute,
110-
CompatibleClouds: registry.AllClouds,
111-
CanRunConcurrently: registry.OperationCannotRunConcurrently,
112-
Dependencies: []registry.OperationDependency{registry.OperationRequiresZeroUnderreplicatedRanges},
113-
Run: nodeKillRunner(9 /* signal */, false /* drain */),
114-
})
115-
r.AddOperation(registry.OperationSpec{
116-
Name: "node-kill/sigterm/drain=true",
117-
Owner: registry.OwnerServer,
118-
Timeout: 15 * time.Minute,
119-
CompatibleClouds: registry.AllClouds,
120-
CanRunConcurrently: registry.OperationCannotRunConcurrently,
121-
Dependencies: []registry.OperationDependency{registry.OperationRequiresZeroUnderreplicatedRanges},
122-
Run: nodeKillRunner(15 /* signal */, true /* drain */),
123-
})
124-
r.AddOperation(registry.OperationSpec{
125-
Name: "node-kill/sigterm/drain=false",
126-
Owner: registry.OwnerServer,
127-
Timeout: 10 * time.Minute,
128-
CompatibleClouds: registry.AllClouds,
129-
CanRunConcurrently: registry.OperationCannotRunConcurrently,
130-
Dependencies: []registry.OperationDependency{registry.OperationRequiresZeroUnderreplicatedRanges},
131-
Run: nodeKillRunner(15 /* signal */, false /* drain */),
132-
})
118+
for _, spec := range []struct {
119+
name string
120+
signal int
121+
drain bool
122+
downtime time.Duration
123+
timeout time.Duration
124+
}{
125+
{"node-kill/sigkill/drain=true/downtime=1m", 9, true, 1 * time.Minute, 20 * time.Minute},
126+
{"node-kill/sigkill/drain=true/downtime=10m", 9, true, 10 * time.Minute, 25 * time.Minute},
127+
{"node-kill/sigkill/drain=true/downtime=1h", 9, true, 1 * time.Hour, 2 * time.Hour},
128+
{"node-kill/sigkill/drain=true/downtime=5h", 9, true, 5 * time.Hour, 6 * time.Hour},
129+
130+
{"node-kill/sigkill/drain=false/downtime=1m", 9, false, 1 * time.Minute, 20 * time.Minute},
131+
{"node-kill/sigkill/drain=false/downtime=10m", 9, false, 10 * time.Minute, 25 * time.Minute},
132+
{"node-kill/sigkill/drain=false/downtime=1h", 9, false, 1 * time.Hour, 2 * time.Hour},
133+
{"node-kill/sigkill/drain=false/downtime=5h", 9, true, 5 * time.Hour, 6 * time.Hour},
134+
135+
{"node-kill/sigterm/drain=true/downtime=1m", 15, true, 1 * time.Minute, 20 * time.Minute},
136+
{"node-kill/sigterm/drain=true/downtime=10m", 15, true, 10 * time.Minute, 25 * time.Minute},
137+
{"node-kill/sigterm/drain=true/downtime=1h", 15, true, 1 * time.Hour, 2 * time.Hour},
138+
{"node-kill/sigterm/drain=true/downtime=5h", 15, true, 5 * time.Hour, 6 * time.Hour},
139+
140+
{"node-kill/sigterm/drain=false/downtime=1m", 15, false, 1 * time.Minute, 20 * time.Minute},
141+
{"node-kill/sigterm/drain=false/downtime=10m", 15, false, 10 * time.Minute, 25 * time.Minute},
142+
{"node-kill/sigterm/drain=false/downtime=1h", 15, false, 1 * time.Hour, 2 * time.Hour},
143+
{"node-kill/sigterm/drain=false/downtime=5h", 15, true, 5 * time.Hour, 6 * time.Hour},
144+
} {
145+
r.AddOperation(registry.OperationSpec{
146+
Name: spec.name,
147+
Owner: registry.OwnerServer,
148+
Timeout: spec.timeout,
149+
CompatibleClouds: registry.AllClouds,
150+
CanRunConcurrently: registry.OperationCannotRunConcurrently,
151+
Dependencies: []registry.OperationDependency{registry.OperationRequiresZeroUnderreplicatedRanges},
152+
Run: nodeKillRunner(spec.signal, spec.drain, spec.downtime),
153+
})
154+
}
133155
}

0 commit comments

Comments
 (0)