@@ -25,6 +25,7 @@ import (
     "time"

     "github.com/gophercloud/gophercloud"
+    "github.com/gophercloud/gophercloud/openstack/blockstorage/v2/volumes"
     "github.com/gophercloud/gophercloud/openstack/compute/v2/extensions/startstop"
     "github.com/gophercloud/gophercloud/openstack/compute/v2/extensions/volumeattach"
     "github.com/gophercloud/gophercloud/openstack/compute/v2/servers"
@@ -60,12 +61,18 @@ var statusesPreventingRepair = sets.NewString(
     stackStatusUpdateFailed,
 )

+// unHealthyNodes caches the nodes already found unhealthy. The first time a
+// node is found unhealthy it is only rebooted and recorded here; if it is
+// reported unhealthy again, it is rebuilt.
+var unHealthyNodes = make(map[string]healthcheck.NodeInfo)
+
 // OpenStack is an implementation of cloud provider Interface for OpenStack.
 type OpenStackCloudProvider struct {
     KubeClient           kubernetes.Interface
     Nova                 *gophercloud.ServiceClient
     Heat                 *gophercloud.ServiceClient
     Magnum               *gophercloud.ServiceClient
+    Cinder               *gophercloud.ServiceClient
     Config               config.Config
     ResourceStackMapping map[string]ResourceStackRelationship
 }
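
Note: the new Cinder field still has to be initialized alongside the existing Nova/Heat/Magnum clients; that wiring is outside this hunk. A minimal sketch, assuming the standard gophercloud helper and reusing the same authenticated provider client and region as the other service clients (the names providerClient, region and cloudProvider are placeholders, not code from this patch):

    // Sketch only, not part of this diff: create the Cinder (block storage v2)
    // client from the same authenticated provider client used for Nova/Heat.
    cinder, err := openstack.NewBlockStorageV2(providerClient, gophercloud.EndpointOpts{
        Region: region,
    })
    if err != nil {
        return nil, fmt.Errorf("failed to create Cinder client: %v", err)
    }
    cloudProvider.Cinder = cinder
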
@@ -184,23 +191,44 @@ func (provider OpenStackCloudProvider) waitForClusterComplete(clusterID string,
     return err
 }

-func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID string, timeout time.Duration) error {
+// waitForServerDetachVolumes detaches all non-root volumes attached to the
+// given server, waiting up to the given timeout. If the server has a root
+// volume, its ID is returned.
+func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID string, timeout time.Duration) (string, error) {
+    rootVolumeID := ""
     err := volumeattach.List(provider.Nova, serverID).EachPage(func(page pagination.Page) (bool, error) {
         attachments, err := volumeattach.ExtractVolumeAttachments(page)
         if err != nil {
             return false, err
         }
         for _, attachment := range attachments {
-            log.Infof("detaching volume %s for instance %s", attachment.VolumeID, serverID)
-            err := volumeattach.Delete(provider.Nova, serverID, attachment.ID).ExtractErr()
+            volume, err := volumes.Get(provider.Cinder, attachment.VolumeID).Extract()
             if err != nil {
-                return false, fmt.Errorf("failed to detach volume %s from instance %s", attachment.VolumeID, serverID)
+                return false, fmt.Errorf("failed to get volume %s, error: %s", attachment.VolumeID, err)
+            }
+
+            bootable, err := strconv.ParseBool(volume.Bootable)
+            if err != nil {
+                log.Warningf("unexpected value %s for the bootable flag of volume %s, error: %s", volume.Bootable, attachment.VolumeID, err)
+            }
+
+            log.Infof("volume %s is bootable %t", attachment.VolumeID, bootable)
+
+            if bootable == false {
+                log.Infof("detaching volume %s for instance %s", attachment.VolumeID, serverID)
+                err := volumeattach.Delete(provider.Nova, serverID, attachment.ID).ExtractErr()
+                if err != nil {
+                    return false, fmt.Errorf("failed to detach volume %s from instance %s, error: %s", attachment.VolumeID, serverID, err)
+                }
+            } else {
+                rootVolumeID = attachment.VolumeID
+                log.Infof("the root volume for server %s is %s", serverID, attachment.VolumeID)
             }
         }
         return true, err
     })
     if err != nil {
-        return err
+        return rootVolumeID, err
     }
     err = wait.Poll(3*time.Second, timeout,
         func() (bool, error) {
@@ -209,20 +237,26 @@ func (provider OpenStackCloudProvider) waitForServerDetachVolumes(serverID strin
                 return false, err
             }

-            if len(server.AttachedVolumes) == 0 {
+            if len(server.AttachedVolumes) == 0 && rootVolumeID == "" {
+                return true, nil
+            } else if len(server.AttachedVolumes) == 1 && rootVolumeID != "" {
+                // Root volume is left
                 return true, nil
             }

             return false, nil
         })

-    return err
+    return rootVolumeID, err
 }

-// For master nodes: Soft deletes the VMs, marks the heat resource "unhealthy" then trigger Heat stack update in order to rebuild
-// the VMs. The information this function needs:
-// - Nova VM IDs
-// - Heat stack ID and resource ID.
+// Repair, for master nodes: detach the etcd and docker volumes, find the root
+// volume, shut down the VM, mark both the VM and the root volume (Heat
+// resources) as "unhealthy", then trigger a Heat stack update in order to
+// rebuild the node. The information this function needs:
+// - Nova VM ID
+// - Root volume ID
+// - Heat stack ID and resource ID.
 // For worker nodes: Call Magnum resize API directly.
 func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) error {
     if len(nodes) == 0 {
@@ -234,6 +268,7 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro

     clusterName := provider.Config.ClusterName
     isWorkerNode := nodes[0].IsWorker
+    log.Infof("the node type to be repaired is worker node: %t", isWorkerNode)
     if isWorkerNode {
         workers = nodes
     } else {
@@ -242,7 +277,7 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro

     err := provider.UpdateHealthStatus(masters, workers)
     if err != nil {
-        return fmt.Errorf("Failed to update the helath status of cluster %s, error: %v", clusterName, err)
+        return fmt.Errorf("failed to update the health status of cluster %s, error: %v", clusterName, err)
     }

     cluster, err := clusters.Get(provider.Magnum, clusterName).Extract()
@@ -260,7 +295,25 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
             }
             serverID := machineID.String()

-            if err := provider.waitForServerDetachVolumes(serverID, 30*time.Second); err != nil {
+            var firsttimeUnhealthy = true
+            for id := range unHealthyNodes {
+                log.V(5).Infof("comparing server ID %s with known broken ID %s", serverID, id)
+                if id == serverID {
+                    firsttimeUnhealthy = false
+                    break
+                }
+            }
+
+            if firsttimeUnhealthy == true {
+                unHealthyNodes[serverID] = n
+                log.Infof("rebooting node %s to repair it", serverID)
+                if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
+                    log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
+                }
+                continue
+            }
+
+            if _, err := provider.waitForServerDetachVolumes(serverID, 30*time.Second); err != nil {
                 log.Warningf("Failed to detach volumes from server %s, error: %v", serverID, err)
             }

@@ -292,6 +345,7 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
             // return fmt.Errorf("failed to resize cluster %s, error: %v", clusterName, ret.Err)
             //}

+            delete(unHealthyNodes, serverID)
             log.Infof("Cluster %s resized", clusterName)
         }
     } else {
@@ -307,28 +361,62 @@ func (provider OpenStackCloudProvider) Repair(nodes []healthcheck.NodeInfo) erro
             return fmt.Errorf("failed to get the resource stack mapping for cluster %s, error: %v", clusterName, err)
         }

+        opts := stackresources.MarkUnhealthyOpts{
+            MarkUnhealthy:        true,
+            ResourceStatusReason: "Mark resource unhealthy by autohealing service",
+        }
+
         for _, n := range nodes {
-            id := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID)
-            if id == nil {
+            machineID := uuid.Parse(n.KubeNode.Status.NodeInfo.MachineID)
+            if machineID == nil {
                 log.Warningf("Failed to get the correct server ID for server %s", n.KubeNode.Name)
                 continue
             }
-            serverID := id.String()
+            serverID := machineID.String()
+
+            var firsttimeUnhealthy = true
+            for id := range unHealthyNodes {
+                log.Infof("comparing server ID %s with known broken ID %s", serverID, id)
+                if id == serverID {
+                    firsttimeUnhealthy = false
+                    break
+                }
+            }
+
+            if firsttimeUnhealthy == true {
+                unHealthyNodes[serverID] = n
+                log.Infof("rebooting node %s to repair it", serverID)
+                if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
+                    log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
+                }
+                continue
+            }
+
+            if rootVolumeID, err := provider.waitForServerDetachVolumes(serverID, 30*time.Second); err != nil {
+                log.Warningf("Failed to detach volumes from server %s, error: %v", serverID, err)
+            } else {
+                // Mark root volume as unhealthy
+                if rootVolumeID != "" {
+                    err = stackresources.MarkUnhealthy(provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, rootVolumeID, opts).ExtractErr()
+                    if err != nil {
+                        log.Errorf("failed to mark resource %s unhealthy, error: %v", rootVolumeID, err)
+                    }
+                }
+            }

             if err := provider.waitForServerPoweredOff(serverID, 30*time.Second); err != nil {
                 log.Warningf("Failed to shutdown the server %s, error: %v", serverID, err)
             }

             log.Infof("Marking Nova VM %s(Heat resource %s) unhealthy for Heat stack %s", serverID, allMapping[serverID].ResourceID, cluster.StackID)

-            opts := stackresources.MarkUnhealthyOpts{
-                MarkUnhealthy:        true,
-                ResourceStatusReason: "Mark resource unhealthy by autohealing service",
-            }
+            // Mark VM as unhealthy
             err = stackresources.MarkUnhealthy(provider.Heat, allMapping[serverID].StackName, allMapping[serverID].StackID, allMapping[serverID].ResourceID, opts).ExtractErr()
             if err != nil {
                 log.Errorf("failed to mark resource %s unhealthy, error: %v", serverID, err)
             }
+
+            delete(unHealthyNodes, serverID)
         }

         if err := stacks.UpdatePatch(provider.Heat, clusterStackName, cluster.StackID, stacks.UpdateOpts{}).ExtractErr(); err != nil {
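
Overall, the patch turns node repair into a two-strike process. A compressed, illustrative sketch of the per-node flow (using the identifiers introduced above; not a verbatim excerpt from the diff):

    // First strike: the node has not been seen unhealthy before, so only
    // soft-reboot it and remember it.
    if _, seen := unHealthyNodes[serverID]; !seen {
        unHealthyNodes[serverID] = n
        if res := servers.Reboot(provider.Nova, serverID, servers.RebootOpts{Type: servers.SoftReboot}); res.Err != nil {
            log.Warningf("failed to reboot node %s, error: %v", serverID, res.Err)
        }
        continue
    }

    // Second strike: detach the data volumes, mark the VM (and, for masters,
    // its root volume) unhealthy in Heat, and let the stack update rebuild it.
    rootVolumeID, err := provider.waitForServerDetachVolumes(serverID, 30*time.Second)
    // ... mark resources unhealthy, then stacks.UpdatePatch(...) triggers the rebuild ...
    delete(unHealthyNodes, serverID)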