@@ -250,6 +250,67 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
250250 return rootVolumeID , err
251251}
252252
253+ // FirstTimeRepair Handle the first time repair for a node
254+ // 1) If the node is the first time in error, reboot and uncordon it
255+ // 2) If the node is not the first time in error, check if the last reboot time is in provider.Config.RebuildDelayAfterReboot
256+ // That said, if the node has been found in broken status before but has been long time since then, the processed variable
257+ // will be kept as False, which means the node need to be rebuilt to fix it, otherwise it means the has been processed.
258+ // The bool type return value means that if the node has been processed from a first time repair PoV
259+ func (provider OpenStackCloudProvider ) firstTimeRepair (n healthcheck.NodeInfo , serverID string , firstTimeRebootNodes map [string ]healthcheck.NodeInfo ) (bool , error ) {
260+ var firstTimeUnhealthy = true
261+ for id := range unHealthyNodes {
262+ log .V (5 ).Infof ("comparing server ID %s with known broken ID %s" , serverID , id )
263+ if id == serverID {
264+ log .Infof ("it is NOT the first time that node %s is being repaired" , serverID )
265+ firstTimeUnhealthy = false
266+ break
267+ }
268+ }
269+
270+ var processed = false
271+ if firstTimeUnhealthy == true {
272+ log .Infof ("rebooting node %s to repair it" , serverID )
273+
274+ if res := servers .Reboot (provider .Nova , serverID , servers.RebootOpts {Type : servers .SoftReboot }); res .Err != nil {
275+ // Usually it means the node is being rebooted
276+ log .Warningf ("failed to reboot node %s, error: %v" , serverID , res .Err )
277+ if strings .Contains (res .Err .Error (), "reboot_started" ) {
278+ // The node has been restarted
279+ firstTimeRebootNodes [serverID ] = n
280+ processed = true
281+ }
282+ } else {
283+ // Uncordon the node
284+ if n .IsWorker == true {
285+ nodeName := n .KubeNode .Name
286+ newNode := n .KubeNode .DeepCopy ()
287+ newNode .Spec .Unschedulable = false
288+ if _ , err := provider .KubeClient .CoreV1 ().Nodes ().Update (context .TODO (), newNode , metav1.UpdateOptions {}); err != nil {
289+ log .Errorf ("Failed to cordon node %s, error: %v" , nodeName , err )
290+ } else {
291+ log .Infof ("Node %s is cordoned" , nodeName )
292+ }
293+ }
294+
295+ n .RebootAt = time .Now ()
296+ firstTimeRebootNodes [serverID ] = n
297+ unHealthyNodes [serverID ] = n
298+ log .Infof ("Node %s has been rebooted at %s" , serverID , n .RebootAt )
299+ }
300+ processed = true
301+ } else {
302+ // If the node was rebooted within mins (defined by RepairDelayAfterReboot), ignore it
303+ log .Infof ("Node %s is found in unhealthy again which was rebooted at %s" , serverID , unHealthyNodes [serverID ].RebootAt )
304+ if unHealthyNodes [serverID ].RebootAt .After (time .Now ().Add (- 1 * provider .Config .RebuildDelayAfterReboot )) {
305+ log .Infof ("Node %s is found in unhealthy again, but we're going to defer the repair because it maybe in reboot process" , serverID )
306+ firstTimeRebootNodes [serverID ] = n
307+ processed = true
308+ }
309+ }
310+
311+ return processed , nil
312+ }
313+
253314// Repair For master nodes: detach etcd and docker volumes, find the root
254315// volume, then shutdown the VM, marks the both the VM and the root
255316// volume (heat resource) as "unhealthy" then trigger Heat stack update
@@ -275,6 +336,8 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
275336 masters = nodes
276337 }
277338
339+ firstTimeRebootNodes := make (map [string ]healthcheck.NodeInfo )
340+
278341 err := provider .UpdateHealthStatus (masters , workers )
279342 if err != nil {
280343 return fmt .Errorf ("failed to update the helath status of cluster %s, error: %v" , clusterName , err )
@@ -295,21 +358,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
295358 }
296359 serverID := machineID .String ()
297360
298- var firsttimeUnhealthy = true
299- for id := range unHealthyNodes {
300- log .V (5 ).Infof ("comparing server ID %s with known broken ID %s" , serverID , id )
301- if id == serverID {
302- firsttimeUnhealthy = false
303- break
304- }
305- }
306-
307- if firsttimeUnhealthy == true {
308- unHealthyNodes [serverID ] = n
309- log .Infof ("rebooting node %s to repair it" , serverID )
310- if res := servers .Reboot (provider .Nova , serverID , servers.RebootOpts {Type : servers .SoftReboot }); res .Err != nil {
311- log .Warningf ("failed to reboot node %s, error: %v" , serverID , res .Err )
312- }
361+ if processed , err := provider .firstTimeRepair (n , serverID , firstTimeRebootNodes ); err != nil {
362+ log .Warningf ("Failed to process if the node %s is in first time repair , error: %v" , serverID , err )
363+ } else if processed == true {
364+ log .Infof ("Node %s has been processed" , serverID )
313365 continue
314366 }
315367
@@ -374,21 +426,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
374426 }
375427 serverID := machineID .String ()
376428
377- var firsttimeUnhealthy = true
378- for id := range unHealthyNodes {
379- log .Infof ("comparing server ID %s with known broken ID %s" , serverID , id )
380- if id == serverID {
381- firsttimeUnhealthy = false
382- break
383- }
384- }
385-
386- if firsttimeUnhealthy == true {
387- unHealthyNodes [serverID ] = n
388- log .Infof ("rebooting node %s to repair it" , serverID )
389- if res := servers .Reboot (provider .Nova , serverID , servers.RebootOpts {Type : servers .SoftReboot }); res .Err != nil {
390- log .Warningf ("failed to reboot node %s, error: %v" , serverID , res .Err )
391- }
429+ if processed , err := provider .firstTimeRepair (n , serverID , firstTimeRebootNodes ); err != nil {
430+ log .Warningf ("Failed to process if the node %s is in first time repair , error: %v" , serverID , err )
431+ } else if processed == true {
432+ log .Infof ("Node %s has been processed" , serverID )
392433 continue
393434 }
394435
@@ -404,8 +445,14 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
404445 }
405446 }
406447
407- if err := provider .waitForServerPoweredOff (serverID , 30 * time .Second ); err != nil {
448+ if err := provider .waitForServerPoweredOff (serverID , 180 * time .Second ); err != nil {
408449 log .Warningf ("Failed to shutdown the server %s, error: %v" , serverID , err )
450+ // If the server fails to power off within 180s, force-delete it to
451+ // avoid the Heat stack update failure later.
452+ res := servers .ForceDelete (provider .Nova , serverID )
453+ if res .Err != nil {
454+ log .Warningf ("Failed to delete the server %s, error: %v" , serverID , res .Err )
455+ }
409456 }
410457
411458 log .Infof ("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s" , serverID , allMapping [serverID ].ResourceID , cluster .StackID )
@@ -428,6 +475,11 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
428475
429476 // Remove the broken nodes from the cluster
430477 for _ , n := range nodes {
478+ serverID := uuid .Parse (n .KubeNode .Status .NodeInfo .MachineID ).String ()
479+ if _ , found := firstTimeRebootNodes [serverID ]; found {
480+ log .Infof ("Skip node delete for %s because it's repaired by reboot" , serverID )
481+ continue
482+ }
431483 if err := provider .KubeClient .CoreV1 ().Nodes ().Delete (context .TODO (), n .KubeNode .Name , metav1.DeleteOptions {}); err != nil {
432484 log .Errorf ("Failed to remove the node %s from cluster, error: %v" , n .KubeNode .Name , err )
433485 }
0 commit comments