Skip to content

Commit c667f16

Browse files
Fix magnum auto healer (kubernetes#1447) (kubernetes#1461)
Co-authored-by: Feilong Wang <[email protected]>
1 parent 9a5688c commit c667f16

File tree

2 files changed

+116
-20
lines changed

2 files changed

+116
-20
lines changed

pkg/autohealing/cloudprovider/openstack/provider.go

Lines changed: 108 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import (
2525
"time"
2626

2727
"github.com/gophercloud/gophercloud"
28+
"github.com/gophercloud/gophercloud/openstack/blockstorage/v2/volumes"
2829
"github.com/gophercloud/gophercloud/openstack/compute/v2/extensions/startstop"
2930
"github.com/gophercloud/gophercloud/openstack/compute/v2/extensions/volumeattach"
3031
"github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
@@ -60,12 +61,18 @@ var statusesPreventingRepair = sets.NewString(
6061
stackStatusUpdateFailed,
6162
)
6263

64+
// Cache the unhealthy nodes, if it's the first time we found this
65+
// unhealthy node, then we just reboot it and save it in this list. If it's not
66+
// the first time we found this unhealthy node, we will rebuild it.
67+
var unHealthyNodes = make(map[string]healthcheck.NodeInfo)
68+
6369
// OpenStack is an implementation of cloud provider Interface for OpenStack.
6470
type OpenStackCloudProvider struct {
6571
KubeClient kubernetes.Interface
6672
Nova *gophercloud.ServiceClient
6773
Heat *gophercloud.ServiceClient
6874
Magnum *gophercloud.ServiceClient
75+
Cinder *gophercloud.ServiceClient
6976
Config config.Config
7077
ResourceStackMapping map[string]ResourceStackRelationship
7178
}
@@ -184,23 +191,44 @@ func (provider OpenStackCloudProvider) waitForClusterComplete(clusterID string,
184191
return err
185192
}
186193

187-
func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID string, timeout time.Duration) error {
194+
// waitForServerDetachVolumes will detach all the attached volumes from the given
195+
// server with the timeout. And if there is a root volume of the server, the root
196+
// volume ID will be returned.
197+
func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID string, timeout time.Duration) (string, error) {
198+
rootVolumeID := ""
188199
err := volumeattach.List(provider.Nova, serverID).EachPage(func(page pagination.Page) (bool, error) {
189200
attachments, err := volumeattach.ExtractVolumeAttachments(page)
190201
if err != nil {
191202
return false, err
192203
}
193204
for _, attachment := range attachments {
194-
log.Infof("detaching volume %s for instance %s", attachment.VolumeID, serverID)
195-
err := volumeattach.Delete(provider.Nova, serverID, attachment.ID).ExtractErr()
205+
volume, err := volumes.Get(provider.Cinder, attachment.VolumeID).Extract()
196206
if err != nil {
197-
return false, fmt.Errorf("failed to detach volume %s from instance %s", attachment.VolumeID, serverID)
207+
return false, fmt.Errorf("failed to get volume %s, error: %s", attachment.VolumeID, err)
208+
}
209+
210+
bootable, err := strconv.ParseBool(volume.Bootable)
211+
if err != nil {
212+
log.Warningf("Unexpected value for bootable volume %s in volume %s, error %s", volume.Bootable, volume, err)
213+
}
214+
215+
log.Infof("volume %s is bootable %t", attachment.VolumeID, bootable)
216+
217+
if bootable == false {
218+
log.Infof("detaching volume %s for instance %s", attachment.VolumeID, serverID)
219+
err := volumeattach.Delete(provider.Nova, serverID, attachment.ID).ExtractErr()
220+
if err != nil {
221+
return false, fmt.Errorf("failed to detach volume %s from instance %s, error: %s", attachment.VolumeID, serverID, err)
222+
}
223+
} else {
224+
rootVolumeID = attachment.VolumeID
225+
log.Infof("the root volume for server %s is %s", serverID, attachment.VolumeID)
198226
}
199227
}
200228
return true, err
201229
})
202230
if err != nil {
203-
return err
231+
return rootVolumeID, err
204232
}
205233
err = wait.Poll(3*time.Second, timeout,
206234
func() (bool, error) {
@@ -209,20 +237,26 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
209237
return false, err
210238
}
211239

212-
if len(server.AttachedVolumes) == 0 {
240+
if len(server.AttachedVolumes) == 0 && rootVolumeID == "" {
241+
return true, nil
242+
} else if len(server.AttachedVolumes) == 1 && rootVolumeID != "" {
243+
// Root volume is left
213244
return true, nil
214245
}
215246

216247
return false, nil
217248
})
218249

219-
return err
250+
return rootVolumeID, err
220251
}
221252

222-
// For master nodes: Soft deletes the VMs, marks the heat resource "unhealthy" then trigger Heat stack update in order to rebuild
223-
// the VMs. The information this function needs:
224-
// - Nova VM IDs
225-
// - Heat stack ID and resource ID.
253+
// Repair For master nodes: detach etcd and docker volumes, find the root
254+
// volume, then shut down the VM, mark both the VM and the root
255+
// volume (heat resource) as "unhealthy" then trigger Heat stack update
256+
// in order to rebuild the node. The information this function needs:
257+
// - Nova VM ID
258+
// - Root volume ID
259+
// - Heat stack ID and resource ID.
226260
// For worker nodes: Call Magnum resize API directly.
227261
func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
228262
if len(nodes) == 0 {
@@ -234,6 +268,7 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
234268

235269
clusterName := provider.Config.ClusterName
236270
isWorkerNode := nodes[0].IsWorker
271+
log.Infof("the node type to be repaired is worker node: %t", isWorkerNode)
237272
if isWorkerNode {
238273
workers = nodes
239274
} else {
@@ -242,7 +277,7 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
242277

243278
err := provider.UpdateHealthStatus(masters, workers)
244279
if err != nil {
245-
return fmt.Errorf("Failed to update the helath status of cluster %s, error: %v", clusterName, err)
280+
return fmt.Errorf("failed to update the health status of cluster %s, error: %v", clusterName, err)
246281
}
247282

248283
cluster, err := clusters.Get(provider.Magnum, clusterName).Extract()
@@ -260,7 +295,25 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
260295
}
261296
serverID := machineID.String()
262297

263-
if err := provider.waitForServerDetachVolumes(serverID, 30*time.Second); err != nil {
298+
var firsttimeUnhealthy = true
299+
for id := range unHealthyNodes {
300+
log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
301+
if id == serverID {
302+
firsttimeUnhealthy = false
303+
break
304+
}
305+
}
306+
307+
if firsttimeUnhealthy == true {
308+
unHealthyNodes[serverID] = n
309+
log.Infof("rebooting node %s to repair it", serverID)
310+
if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
311+
log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
312+
}
313+
continue
314+
}
315+
316+
if _, err := provider.waitForServerDetachVolumes(serverID, 30*time.Second); err != nil {
264317
log.Warningf("Failed to detach volumes from server %s, error: %v", serverID, err)
265318
}
266319

@@ -292,6 +345,7 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
292345
// return fmt.Errorf("failed to resize cluster %s, error: %v", clusterName, ret.Err)
293346
//}
294347

348+
delete(unHealthyNodes, serverID)
295349
log.Infof("Cluster %s resized", clusterName)
296350
}
297351
} else {
@@ -307,28 +361,62 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
307361
return fmt.Errorf("failed to get the resource stack mapping for cluster %s, error: %v", clusterName, err)
308362
}
309363

364+
opts := stackresources.MarkUnhealthyOpts{
365+
MarkUnhealthy: true,
366+
ResourceStatusReason: "Mark resource unhealthy by autohealing service",
367+
}
368+
310369
for _, n := range nodes {
311-
id := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID)
312-
if id == nil {
370+
machineID := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID)
371+
if machineID == nil {
313372
log.Warningf("Failed to get the correct server ID for server %s", n.KubeNode.Name)
314373
continue
315374
}
316-
serverID := id.String()
375+
serverID := machineID.String()
376+
377+
var firsttimeUnhealthy = true
378+
for id := range unHealthyNodes {
379+
log.Infof("comparing server ID %s with known broken ID %s", serverID, id)
380+
if id == serverID {
381+
firsttimeUnhealthy = false
382+
break
383+
}
384+
}
385+
386+
if firsttimeUnhealthy == true {
387+
unHealthyNodes[serverID] = n
388+
log.Infof("rebooting node %s to repair it", serverID)
389+
if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
390+
log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
391+
}
392+
continue
393+
}
394+
395+
if rootVolumeID, err := provider.waitForServerDetachVolumes(serverID, 30*time.Second); err != nil {
396+
log.Warningf("Failed to detach volumes from server %s, error: %v", serverID, err)
397+
} else {
398+
// Mark root volume as unhealthy
399+
if rootVolumeID != "" {
400+
err = stackresources.MarkUnhealthy(provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, rootVolumeID, opts).ExtractErr()
401+
if err != nil {
402+
log.Errorf("failed to mark resource %s unhealthy, error: %v", rootVolumeID, err)
403+
}
404+
}
405+
}
317406

318407
if err := provider.waitForServerPoweredOff(serverID, 30*time.Second); err != nil {
319408
log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
320409
}
321410

322411
log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)
323412

324-
opts := stackresources.MarkUnhealthyOpts{
325-
MarkUnhealthy: true,
326-
ResourceStatusReason: "Mark resource unhealthy by autohealing service",
327-
}
413+
// Mark VM as unhealthy
328414
err = stackresources.MarkUnhealthy(provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, allMapping[serverID].ResourceID, opts).ExtractErr()
329415
if err != nil {
330416
log.Errorf("failed to mark resource %s unhealthy, error: %v", serverID, err)
331417
}
418+
419+
delete(unHealthyNodes, serverID)
332420
}
333421

334422
if err := stacks.UpdatePatch(provider.Heat, clusterStackName, cluster.StackID, stacks.UpdateOpts{}).ExtractErr(); err != nil {

pkg/autohealing/cloudprovider/register/register.go

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,12 +64,20 @@ func registerOpenStack(cfg config.Config, kubeClient kubernetes.Interface) (clou
6464
}
6565
magnumClient.Microversion = "latest"
6666

67+
// get cinder service client
68+
var cinderClient *gophercloud.ServiceClient
69+
cinderClient, err = gopenstack.NewBlockStorageV2(client, eoOpts)
70+
if err != nil {
71+
return nil, fmt.Errorf("failed to find Cinder service endpoint in the region %s: %v", cfg.OpenStack.Region, err)
72+
}
73+
6774
var p cloudprovider.CloudProvider
6875
p = openstack.OpenStackCloudProvider{
6976
KubeClient: kubeClient,
7077
Nova: novaClient,
7178
Heat: heatClient,
7279
Magnum: magnumClient,
80+
Cinder: cinderClient,
7381
Config: cfg,
7482
}
7583

0 commit comments

Comments
 (0)