@@ -20,11 +20,19 @@ import (
2020)
2121
2222type cleanupNodeKill struct {
23- nodes option.NodeListOption
23+ nodes option.NodeListOption
24+ downtime time.Duration
2425}
2526
2627func (cl * cleanupNodeKill ) Cleanup (ctx context.Context , o operation.Operation , c cluster.Cluster ) {
27- // We might need to restart the node if it isn't live.
28+ o .Status (fmt .Sprintf ("waiting for %s before restarting node %s" , cl .downtime , cl .nodes ))
29+ select {
30+ case <- time .After (cl .downtime ):
31+ case <- ctx .Done ():
32+ o .Status ("cleanup context cancelled during wait" )
33+ return
34+ }
35+
2836 db , err := c .ConnE (ctx , o .L (), cl .nodes [0 ])
2937 if err != nil {
3038 err = c .RunE (ctx , option .WithNodes (cl .nodes ), "./cockroach.sh" )
@@ -48,15 +56,20 @@ func (cl *cleanupNodeKill) Cleanup(ctx context.Context, o operation.Operation, c
4856}
4957
5058func nodeKillRunner (
51- signal int , drain bool ,
59+ signal int , drain bool , downtime time. Duration ,
5260) func (ctx context.Context , o operation.Operation , c cluster.Cluster ) registry.OperationCleanup {
5361 return func (ctx context.Context , o operation.Operation , c cluster.Cluster ) registry.OperationCleanup {
54- return runNodeKill (ctx , o , c , signal , drain )
62+ return runNodeKill (ctx , o , c , signal , drain , downtime )
5563 }
5664}
5765
5866func runNodeKill (
59- ctx context.Context , o operation.Operation , c cluster.Cluster , signal int , drain bool ,
67+ ctx context.Context ,
68+ o operation.Operation ,
69+ c cluster.Cluster ,
70+ signal int ,
71+ drain bool ,
72+ downtime time.Duration ,
6073) registry.OperationCleanup {
6174 rng , _ := randutil .NewPseudoRand ()
6275 node := c .All ().SeededRandNode (rng )
@@ -88,46 +101,55 @@ func runNodeKill(
88101 time .Sleep (1 * time .Second )
89102 }
90103
91- return & cleanupNodeKill {
92- nodes : node ,
93- }
104+ // Schedule the cleanup instead of returning it
105+ go func () {
106+ cleanup := cleanupNodeKill {
107+ nodes : node ,
108+ downtime : downtime ,
109+ }
110+ cleanup .Cleanup (ctx , o , c )
111+ }()
112+
113+ // return nil to avoid the hardcoded 5s + random [0s, 24h] wait
114+ return nil
94115}
95116
96117func registerNodeKill (r registry.Registry ) {
97- r .AddOperation (registry.OperationSpec {
98- Name : "node-kill/sigkill/drain=true" ,
99- Owner : registry .OwnerServer ,
100- Timeout : 15 * time .Minute ,
101- CompatibleClouds : registry .AllClouds ,
102- CanRunConcurrently : registry .OperationCannotRunConcurrently ,
103- Dependencies : []registry.OperationDependency {registry .OperationRequiresZeroUnderreplicatedRanges },
104- Run : nodeKillRunner (9 /* signal */ , true /* drain */ ),
105- })
106- r .AddOperation (registry.OperationSpec {
107- Name : "node-kill/sigkill/drain=false" ,
108- Owner : registry .OwnerServer ,
109- Timeout : 10 * time .Minute ,
110- CompatibleClouds : registry .AllClouds ,
111- CanRunConcurrently : registry .OperationCannotRunConcurrently ,
112- Dependencies : []registry.OperationDependency {registry .OperationRequiresZeroUnderreplicatedRanges },
113- Run : nodeKillRunner (9 /* signal */ , false /* drain */ ),
114- })
115- r .AddOperation (registry.OperationSpec {
116- Name : "node-kill/sigterm/drain=true" ,
117- Owner : registry .OwnerServer ,
118- Timeout : 15 * time .Minute ,
119- CompatibleClouds : registry .AllClouds ,
120- CanRunConcurrently : registry .OperationCannotRunConcurrently ,
121- Dependencies : []registry.OperationDependency {registry .OperationRequiresZeroUnderreplicatedRanges },
122- Run : nodeKillRunner (15 /* signal */ , true /* drain */ ),
123- })
124- r .AddOperation (registry.OperationSpec {
125- Name : "node-kill/sigterm/drain=false" ,
126- Owner : registry .OwnerServer ,
127- Timeout : 10 * time .Minute ,
128- CompatibleClouds : registry .AllClouds ,
129- CanRunConcurrently : registry .OperationCannotRunConcurrently ,
130- Dependencies : []registry.OperationDependency {registry .OperationRequiresZeroUnderreplicatedRanges },
131- Run : nodeKillRunner (15 /* signal */ , false /* drain */ ),
132- })
118+ for _ , spec := range []struct {
119+ name string
120+ signal int
121+ drain bool
122+ downtime time.Duration
123+ timeout time.Duration
124+ }{
125+ {"node-kill/sigkill/drain=true/downtime=1m" , 9 , true , 1 * time .Minute , 20 * time .Minute },
126+ {"node-kill/sigkill/drain=true/downtime=10m" , 9 , true , 10 * time .Minute , 25 * time .Minute },
127+ {"node-kill/sigkill/drain=true/downtime=1h" , 9 , true , 1 * time .Hour , 2 * time .Hour },
128+ {"node-kill/sigkill/drain=true/downtime=5h" , 9 , true , 5 * time .Hour , 6 * time .Hour },
129+
130+ {"node-kill/sigkill/drain=false/downtime=1m" , 9 , false , 1 * time .Minute , 20 * time .Minute },
131+ {"node-kill/sigkill/drain=false/downtime=10m" , 9 , false , 10 * time .Minute , 25 * time .Minute },
132+ {"node-kill/sigkill/drain=false/downtime=1h" , 9 , false , 1 * time .Hour , 2 * time .Hour },
133+ {"node-kill/sigkill/drain=false/downtime=5h" , 9 , true , 5 * time .Hour , 6 * time .Hour },
134+
135+ {"node-kill/sigterm/drain=true/downtime=1m" , 15 , true , 1 * time .Minute , 20 * time .Minute },
136+ {"node-kill/sigterm/drain=true/downtime=10m" , 15 , true , 10 * time .Minute , 25 * time .Minute },
137+ {"node-kill/sigterm/drain=true/downtime=1h" , 15 , true , 1 * time .Hour , 2 * time .Hour },
138+ {"node-kill/sigterm/drain=true/downtime=5h" , 15 , true , 5 * time .Hour , 6 * time .Hour },
139+
140+ {"node-kill/sigterm/drain=false/downtime=1m" , 15 , false , 1 * time .Minute , 20 * time .Minute },
141+ {"node-kill/sigterm/drain=false/downtime=10m" , 15 , false , 10 * time .Minute , 25 * time .Minute },
142+ {"node-kill/sigterm/drain=false/downtime=1h" , 15 , false , 1 * time .Hour , 2 * time .Hour },
143+ {"node-kill/sigterm/drain=false/downtime=5h" , 15 , true , 5 * time .Hour , 6 * time .Hour },
144+ } {
145+ r .AddOperation (registry.OperationSpec {
146+ Name : spec .name ,
147+ Owner : registry .OwnerServer ,
148+ Timeout : spec .timeout ,
149+ CompatibleClouds : registry .AllClouds ,
150+ CanRunConcurrently : registry .OperationCannotRunConcurrently ,
151+ Dependencies : []registry.OperationDependency {registry .OperationRequiresZeroUnderreplicatedRanges },
152+ Run : nodeKillRunner (spec .signal , spec .drain , spec .downtime ),
153+ })
154+ }
133155}
0 commit comments