@@ -250,6 +250,67 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
    return rootVolumeID, err
}

+ // firstTimeRepair handles the first-time repair for a node:
+ // 1) If the node is unhealthy for the first time, reboot and uncordon it.
+ // 2) If the node is not unhealthy for the first time, check whether its last reboot happened within
+ //    provider.Config.RebuildDelayAfterReboot. If the node was seen broken before but a long time has
+ //    passed since then, the processed return value stays false, meaning the node needs to be rebuilt;
+ //    otherwise the node is considered already processed.
+ // The bool return value indicates whether the node has been handled from the first-time repair point of view.
+ func (provider OpenStackCloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID string, firstTimeRebootNodes map[string]healthcheck.NodeInfo) (bool, error) {
+     var firstTimeUnhealthy = true
+     for id := range unHealthyNodes {
+         log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
+         if id == serverID {
+             log.Infof("it is NOT the first time that node %s is being repaired", serverID)
+             firstTimeUnhealthy = false
+             break
+         }
+     }
+
+     var processed = false
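+     // The node is unhealthy for the first time: try a soft reboot, which is a much cheaper repair than a rebuild.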
+     if firstTimeUnhealthy {
+         log.Infof("rebooting node %s to repair it", serverID)
+
+         if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
+             // Usually this means the node is already being rebooted
+             log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
+             if strings.Contains(res.Err.Error(), "reboot_started") {
+                 // A reboot is already in progress, so treat the node as handled
+                 firstTimeRebootNodes[serverID] = n
+                 processed = true
+             }
+         }
+         } else {
+             // Uncordon the worker node so it can receive workloads again once the reboot completes
+             if n.IsWorker {
+                 nodeName := n.KubeNode.Name
+                 newNode := n.KubeNode.DeepCopy()
+                 newNode.Spec.Unschedulable = false
+                 if _, err := provider.KubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
+                     log.Errorf("Failed to uncordon node %s, error: %v", nodeName, err)
+                 } else {
+                     log.Infof("Node %s is uncordoned", nodeName)
+                 }
+             }
+
+             n.RebootAt = time.Now()
+             firstTimeRebootNodes[serverID] = n
+             unHealthyNodes[serverID] = n
+             log.Infof("Node %s has been rebooted at %s", serverID, n.RebootAt)
+         }
+         processed = true
+     } else {
+         // If the node was rebooted within the last RebuildDelayAfterReboot, ignore it for now
+         log.Infof("Node %s is found unhealthy again; it was rebooted at %s", serverID, unHealthyNodes[serverID].RebootAt)
+         if unHealthyNodes[serverID].RebootAt.After(time.Now().Add(-1 * provider.Config.RebuildDelayAfterReboot)) {
+             log.Infof("Node %s is found unhealthy again, but the repair is deferred because it may still be rebooting", serverID)
+             firstTimeRebootNodes[serverID] = n
+             processed = true
+         }
+     }
+
+     return processed, nil
+ }
+
// Repair For master nodes: detach etcd and docker volumes, find the root
// volume, then shut down the VM, mark both the VM and the root
// volume (heat resource) as "unhealthy" then trigger Heat stack update
@@ -275,6 +336,8 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
        masters = nodes
    }

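+     // firstTimeRebootNodes tracks the nodes repaired by a reboot in this run, so they are skipped in the rebuild and node-removal steps below.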
+     firstTimeRebootNodes := make(map[string]healthcheck.NodeInfo)
+
    err := provider.UpdateHealthStatus(masters, workers)
    if err != nil {
        return fmt.Errorf("failed to update the health status of cluster %s, error: %v", clusterName, err)
@@ -295,21 +358,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
        }
        serverID := machineID.String()

-        var firsttimeUnhealthy = true
-        for id := range unHealthyNodes {
-            log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
-            if id == serverID {
-                firsttimeUnhealthy = false
-                break
-            }
-        }
-
-        if firsttimeUnhealthy == true {
-            unHealthyNodes[serverID] = n
-            log.Infof("rebooting node %s to repair it", serverID)
-            if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
-                log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
-            }
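+         // Give the node one cheap first-time repair attempt (soft reboot) before falling through to the rebuild path.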
+         if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+             log.Warningf("Failed to check whether node %s needs a first-time repair, error: %v", serverID, err)
+         } else if processed {
+             log.Infof("Node %s has been processed", serverID)
            continue
        }

@@ -374,21 +426,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
        }
        serverID := machineID.String()

-        var firsttimeUnhealthy = true
-        for id := range unHealthyNodes {
-            log.Infof("comparing server ID %s with known broken ID %s", serverID, id)
-            if id == serverID {
-                firsttimeUnhealthy = false
-                break
-            }
-        }
-
-        if firsttimeUnhealthy == true {
-            unHealthyNodes[serverID] = n
-            log.Infof("rebooting node %s to repair it", serverID)
-            if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
-                log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
-            }
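+         // As above, give the node one soft-reboot attempt before falling back to the rebuild path.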
+         if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+             log.Warningf("Failed to check whether node %s needs a first-time repair, error: %v", serverID, err)
+         } else if processed {
+             log.Infof("Node %s has been processed", serverID)
            continue
        }

@@ -404,8 +445,14 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
            }
        }

-        if err := provider.waitForServerPoweredOff(serverID, 30 * time.Second); err != nil {
+         if err := provider.waitForServerPoweredOff(serverID, 180 * time.Second); err != nil {
            log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
+             // If the server fails to shut down within 180s, force delete it to avoid
+             // failing the Heat stack update later.
+             res := servers.ForceDelete(provider.Nova, serverID)
+             if res.Err != nil {
+                 log.Warningf("Failed to force delete the server %s, error: %v", serverID, res.Err)
+             }
        }

        log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)
@@ -428,6 +475,11 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro

    // Remove the broken nodes from the cluster
    for _, n := range nodes {
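+         // Nodes repaired by a simple reboot keep their Kubernetes node objects and are skipped here.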
+         serverID := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID).String()
+         if _, found := firstTimeRebootNodes[serverID]; found {
+             log.Infof("Skip node delete for %s because it was repaired by reboot", serverID)
+             continue
+         }
        if err := provider.KubeClient.CoreV1().Nodes().Delete(context.TODO(), n.KubeNode.Name, metav1.DeleteOptions{}); err != nil {
            log.Errorf("Failed to remove the node %s from cluster, error: %v", n.KubeNode.Name, err)
        }