
Commit 88ad2c9

openstacker authored and lingxiankong committed
Fix duplicated repair action (kubernetes#1530)

There are several changes in this commit:
1) Fix the duplicated repair actions.
2) Change the default health check period from 30s to 60s.

(cherry picked from commit db60e00)
1 parent 00d7c9f · commit 88ad2c9

File tree: 4 files changed, +103 −38 lines

pkg/autohealing/cloudprovider/openstack/provider.go

Lines changed: 83 additions & 31 deletions
@@ -250,6 +250,67 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
     return rootVolumeID, err
 }

+// firstTimeRepair handles the first-time repair for a node:
+// 1) If the node is in error for the first time, reboot and uncordon it.
+// 2) If the node is not in error for the first time, check whether the last reboot happened within provider.Config.RebuildDelayAfterReboot.
+//    That is, if the node was found broken before but a long time has passed since then, the processed variable
+//    stays false, which means the node needs to be rebuilt; otherwise the node has already been processed.
+// The bool return value indicates whether the node has been processed from a first-time-repair point of view.
+func (provider OpenStackCloudProvider) firstTimeRepair(n healthcheck.NodeInfo, serverID string, firstTimeRebootNodes map[string]healthcheck.NodeInfo) (bool, error) {
+    var firstTimeUnhealthy = true
+    for id := range unHealthyNodes {
+        log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
+        if id == serverID {
+            log.Infof("it is NOT the first time that node %s is being repaired", serverID)
+            firstTimeUnhealthy = false
+            break
+        }
+    }
+
+    var processed = false
+    if firstTimeUnhealthy == true {
+        log.Infof("rebooting node %s to repair it", serverID)
+
+        if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
+            // Usually this means the node is already being rebooted
+            log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
+            if strings.Contains(res.Err.Error(), "reboot_started") {
+                // The node has been restarted
+                firstTimeRebootNodes[serverID] = n
+                processed = true
+            }
+        } else {
+            // Uncordon the node
+            if n.IsWorker == true {
+                nodeName := n.KubeNode.Name
+                newNode := n.KubeNode.DeepCopy()
+                newNode.Spec.Unschedulable = false
+                if _, err := provider.KubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
+                    log.Errorf("Failed to uncordon node %s, error: %v", nodeName, err)
+                } else {
+                    log.Infof("Node %s is uncordoned", nodeName)
+                }
+            }
+
+            n.RebootAt = time.Now()
+            firstTimeRebootNodes[serverID] = n
+            unHealthyNodes[serverID] = n
+            log.Infof("Node %s has been rebooted at %s", serverID, n.RebootAt)
+        }
+        processed = true
+    } else {
+        // If the node was rebooted within the last RebuildDelayAfterReboot, ignore it
+        log.Infof("Node %s is found unhealthy again; it was rebooted at %s", serverID, unHealthyNodes[serverID].RebootAt)
+        if unHealthyNodes[serverID].RebootAt.After(time.Now().Add(-1 * provider.Config.RebuildDelayAfterReboot)) {
+            log.Infof("Node %s is found unhealthy again, but the repair is deferred because it may still be rebooting", serverID)
+            firstTimeRebootNodes[serverID] = n
+            processed = true
+        }
+    }
+
+    return processed, nil
+}
+
 // Repair For master nodes: detach etcd and docker volumes, find the root
 // volume, then shutdown the VM, marks the both the VM and the root
 // volume (heat resource) as "unhealthy" then trigger Heat stack update
@@ -275,6 +336,8 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
         masters = nodes
     }

+    firstTimeRebootNodes := make(map[string]healthcheck.NodeInfo)
+
     err := provider.UpdateHealthStatus(masters, workers)
     if err != nil {
         return fmt.Errorf("failed to update the helath status of cluster %s, error: %v", clusterName, err)
@@ -295,21 +358,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
         }
         serverID := machineID.String()

-        var firsttimeUnhealthy = true
-        for id := range unHealthyNodes {
-            log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
-            if id == serverID {
-                firsttimeUnhealthy = false
-                break
-            }
-        }
-
-        if firsttimeUnhealthy == true {
-            unHealthyNodes[serverID] = n
-            log.Infof("rebooting node %s to repair it", serverID)
-            if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
-                log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
-            }
+        if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+            log.Warningf("Failed to process if the node %s is in first time repair, error: %v", serverID, err)
+        } else if processed == true {
+            log.Infof("Node %s has been processed", serverID)
             continue
         }

@@ -374,21 +426,10 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
         }
         serverID := machineID.String()

-        var firsttimeUnhealthy = true
-        for id := range unHealthyNodes {
-            log.Infof("comparing server ID %s with known broken ID %s", serverID, id)
-            if id == serverID {
-                firsttimeUnhealthy = false
-                break
-            }
-        }
-
-        if firsttimeUnhealthy == true {
-            unHealthyNodes[serverID] = n
-            log.Infof("rebooting node %s to repair it", serverID)
-            if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
-                log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
-            }
+        if processed, err := provider.firstTimeRepair(n, serverID, firstTimeRebootNodes); err != nil {
+            log.Warningf("Failed to process if the node %s is in first time repair, error: %v", serverID, err)
+        } else if processed == true {
+            log.Infof("Node %s has been processed", serverID)
             continue
         }

@@ -404,8 +445,14 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
            }
        }

-        if err := provider.waitForServerPoweredOff(serverID, 30*time.Second); err != nil {
+        if err := provider.waitForServerPoweredOff(serverID, 180*time.Second); err != nil {
            log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
+            // If the server cannot be powered off within 180s, force delete it to avoid
+            // a stack update failure later.
+            res := servers.ForceDelete(provider.Nova, serverID)
+            if res.Err != nil {
+                log.Warningf("Failed to delete the server %s, error: %v", serverID, res.Err)
+            }
        }

        log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)
@@ -428,6 +475,11 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro

    // Remove the broken nodes from the cluster
    for _, n := range nodes {
+        serverID := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID).String()
+        if _, found := firstTimeRebootNodes[serverID]; found {
+            log.Infof("Skip node delete for %s because it's repaired by reboot", serverID)
+            continue
+        }
        if err := provider.KubeClient.CoreV1().Nodes().Delete(context.TODO(), n.KubeNode.Name, metav1.DeleteOptions{}); err != nil {
            log.Errorf("Failed to remove the node %s from cluster, error: %v", n.KubeNode.Name, err)
        }
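
For readers following the fix at a glance, here is a minimal, self-contained sketch of the de-duplication idea behind firstTimeRepair: a map of already-seen broken servers decides whether a node gets a cheap soft reboot (first sighting) or falls through to the rebuild path on later sightings. The names seenBroken and repairAction are illustrative only and are not part of the repository.

package main

import "fmt"

// seenBroken plays the role of the package-level unHealthyNodes map.
// (Illustrative sketch; not the repository's actual code.)
var seenBroken = map[string]bool{}

// repairAction returns which repair a server would get on this sighting.
func repairAction(serverID string) string {
    if !seenBroken[serverID] {
        seenBroken[serverID] = true
        return "soft-reboot" // first sighting: try the cheap fix
    }
    return "rebuild" // seen before: escalate instead of repeating the reboot
}

func main() {
    // The same unhealthy node reported in two consecutive monitor periods now
    // triggers one reboot and then one rebuild, instead of duplicated actions.
    fmt.Println(repairAction("server-a")) // soft-reboot
    fmt.Println(repairAction("server-a")) // rebuild
}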

pkg/autohealing/config/config.go

Lines changed: 11 additions & 7 deletions
@@ -56,6 +56,9 @@ type Config struct {

    // (Optional) How long after new node added that the node will be checked for health status. Default: 10m
    CheckDelayAfterAdd time.Duration `mapstructure:"check-delay-after-add"`
+
+    // (Optional) How long to wait after a node has been rebooted before falling back to a rebuild. Default: 5m
+    RebuildDelayAfterReboot time.Duration `mapstructure:"rebuild-delay-after-reboot"`
 }

 type healthCheck struct {
@@ -83,12 +86,13 @@ type kubeConfig struct {
 // NewConfig defines the default values for Config
 func NewConfig() Config {
    return Config{
-        DryRun:               false,
-        CloudProvider:        "openstack",
-        MonitorInterval:      30 * time.Second,
-        MasterMonitorEnabled: true,
-        WorkerMonitorEnabled: true,
-        LeaderElect:          true,
-        CheckDelayAfterAdd:   10 * time.Minute,
+        DryRun:                  false,
+        CloudProvider:           "openstack",
+        MonitorInterval:         60 * time.Second,
+        MasterMonitorEnabled:    true,
+        WorkerMonitorEnabled:    true,
+        LeaderElect:             true,
+        CheckDelayAfterAdd:      10 * time.Minute,
+        RebuildDelayAfterReboot: 5 * time.Minute,
    }
 }
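
A quick way to see the new defaults from config.go is to print them. This sketch assumes the repository's Go module path is k8s.io/cloud-provider-openstack; it is not part of the commit.

package main

import (
    "fmt"

    // Import path assumed from the repository's module name.
    "k8s.io/cloud-provider-openstack/pkg/autohealing/config"
)

func main() {
    cfg := config.NewConfig()
    // With this commit the monitor loop runs every 60s by default, and a node
    // that was just soft-rebooted gets a 5m grace period before being rebuilt.
    fmt.Println(cfg.MonitorInterval)         // 1m0s
    fmt.Println(cfg.RebuildDelayAfterReboot) // 5m0s
}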

pkg/autohealing/controller/controller.go

Lines changed: 4 additions & 0 deletions
@@ -316,6 +316,10 @@ func (c *Controller) repairNodes(unhealthyNodes []healthcheck.NodeInfo) {
        newNode := node.KubeNode.DeepCopy()
        newNode.Spec.Unschedulable = true

+        // Skip cordon for master node
+        if node.IsWorker == false {
+            continue
+        }
        if _, err := c.kubeClient.CoreV1().Nodes().Update(context.TODO(), newNode, metav1.UpdateOptions{}); err != nil {
            log.Errorf("Failed to cordon node %s, error: %v", nodeName, err)
        } else {

pkg/autohealing/healthcheck/healthcheck.go

Lines changed: 5 additions & 0 deletions
@@ -17,6 +17,8 @@ limitations under the License.
 package healthcheck

 import (
+    "time"
+
    apiv1 "k8s.io/api/core/v1"
    log "k8s.io/klog/v2"
 )
@@ -32,6 +34,8 @@
    KubeNode    apiv1.Node
    IsWorker    bool
    FailedCheck string
+    FoundAt     time.Time
+    RebootAt    time.Time
 }

 type HealthCheck interface {
@@ -81,6 +85,7 @@ func CheckNodes(checkers []HealthCheck, nodes []NodeInfo, controller NodeControl
        for _, checker := range checkers {
            if !checker.Check(node, controller) {
                node.FailedCheck = checker.GetName()
+                node.FoundAt = time.Now()
                unhealthyNodes = append(unhealthyNodes, node)
                break
            }