Skip to content

Commit e35da15

Browse files
authored
Waiting for volumeAttachments deletion (#1190)
* Waiting for volumeAttachments deletion Signed-off-by: Mattia Lavacca <[email protected]> * volumeAttachments check only for vSphere Signed-off-by: Mattia Lavacca <[email protected]> * ClusterRole updated Signed-off-by: Mattia Lavacca <[email protected]> * yaml linter fixed Signed-off-by: Mattia Lavacca <[email protected]> * VolumeAttachments correctly handled Signed-off-by: Mattia Lavacca <[email protected]> * Code factorized Signed-off-by: Mattia Lavacca <[email protected]> * renaming Signed-off-by: Mattia Lavacca <[email protected]> * fix yamllint Signed-off-by: Mattia Lavacca <[email protected]> * Logic applied only to vSphere Signed-off-by: Mattia Lavacca <[email protected]>
1 parent e479c57 commit e35da15

File tree

6 files changed

+458
-125
lines changed

6 files changed

+458
-125
lines changed

examples/machine-controller.yaml

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -536,6 +536,15 @@ rules:
536536
- "list"
537537
- "get"
538538
- "watch"
539+
# volumeAttachments permissions are needed by vsphere clusters
540+
- apiGroups:
541+
- "storage.k8s.io"
542+
resources:
543+
- "volumeattachments"
544+
verbs:
545+
- "list"
546+
- "get"
547+
- "watch"
539548
- apiGroups:
540549
- ""
541550
resources:

pkg/controller/machine/machine_controller.go

Lines changed: 94 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ import (
3939
"github.com/kubermatic/machine-controller/pkg/containerruntime"
4040
kuberneteshelper "github.com/kubermatic/machine-controller/pkg/kubernetes"
4141
"github.com/kubermatic/machine-controller/pkg/node/eviction"
42+
"github.com/kubermatic/machine-controller/pkg/node/poddeletion"
4243
"github.com/kubermatic/machine-controller/pkg/providerconfig"
4344
providerconfigtypes "github.com/kubermatic/machine-controller/pkg/providerconfig/types"
4445
"github.com/kubermatic/machine-controller/pkg/rhsm"
@@ -408,7 +409,7 @@ func (r *Reconciler) reconcile(ctx context.Context, machine *clusterv1alpha1.Mac
408409

409410
// step 2: check if a user requested to delete the machine
410411
if machine.DeletionTimestamp != nil {
411-
return r.deleteMachine(ctx, prov, machine)
412+
return r.deleteMachine(ctx, prov, providerConfig.CloudProvider, machine)
412413
}
413414

414415
// Step 3: Essentially creates an instance for the given machine.
@@ -462,6 +463,30 @@ func (r *Reconciler) ensureMachineHasNodeReadyCondition(machine *clusterv1alpha1
462463
})
463464
}
464465

466+
func (r *Reconciler) shouldCleanupVolumes(ctx context.Context, machine *clusterv1alpha1.Machine, providerName providerconfigtypes.CloudProvider) (bool, error) {
467+
// we need to wait for volumeAttachments cleanup only for vSphere
468+
if providerName != providerconfigtypes.CloudProviderVsphere {
469+
return false, nil
470+
}
471+
472+
// No node - No volumeAttachments to be collected
473+
if machine.Status.NodeRef == nil {
474+
klog.V(4).Infof("Skipping eviction for machine %q since it does not have a node", machine.Name)
475+
return false, nil
476+
}
477+
478+
node := &corev1.Node{}
479+
if err := r.client.Get(ctx, types.NamespacedName{Name: machine.Status.NodeRef.Name}, node); err != nil {
480+
// Node does not exist - No volumeAttachments to be collected
481+
if kerrors.IsNotFound(err) {
482+
klog.V(4).Infof("Skipping eviction for machine %q since it does not have a node", machine.Name)
483+
return false, nil
484+
}
485+
return false, fmt.Errorf("failed to get node %q", machine.Status.NodeRef.Name)
486+
}
487+
return true, nil
488+
}
489+
465490
// shouldEvict checks whether the machine has a node whose pods must be evicted before deletion
466491
func (r *Reconciler) shouldEvict(ctx context.Context, machine *clusterv1alpha1.Machine) (bool, error) {
467492
// If the deletion got triggered a few hours ago, skip eviction.
@@ -521,22 +546,35 @@ func (r *Reconciler) shouldEvict(ctx context.Context, machine *clusterv1alpha1.M
521546
}
522547

523548
// deleteMachine makes sure that an instance has gone in a series of steps.
524-
func (r *Reconciler) deleteMachine(ctx context.Context, prov cloudprovidertypes.Provider, machine *clusterv1alpha1.Machine) (*reconcile.Result, error) {
549+
func (r *Reconciler) deleteMachine(ctx context.Context, prov cloudprovidertypes.Provider, providerName providerconfigtypes.CloudProvider, machine *clusterv1alpha1.Machine) (*reconcile.Result, error) {
525550
shouldEvict, err := r.shouldEvict(ctx, machine)
526551
if err != nil {
527552
return nil, err
528553
}
554+
shouldCleanUpVolumes, err := r.shouldCleanupVolumes(ctx, machine, providerName)
555+
if err != nil {
556+
return nil, err
557+
}
529558

559+
var evictedSomething, deletedSomething bool
560+
var volumesFree = true
530561
if shouldEvict {
531-
evictedSomething, err := eviction.New(ctx, machine.Status.NodeRef.Name, r.client, r.kubeClient).Run()
562+
evictedSomething, err = eviction.New(ctx, machine.Status.NodeRef.Name, r.client, r.kubeClient).Run()
532563
if err != nil {
533564
return nil, fmt.Errorf("failed to evict node %s: %v", machine.Status.NodeRef.Name, err)
534565
}
535-
if evictedSomething {
536-
return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
566+
}
567+
if shouldCleanUpVolumes {
568+
deletedSomething, volumesFree, err = poddeletion.New(ctx, machine.Status.NodeRef.Name, r.client, r.kubeClient).Run()
569+
if err != nil {
570+
return nil, fmt.Errorf("failed to delete pods bound to volumes running on node %s: %v", machine.Status.NodeRef.Name, err)
537571
}
538572
}
539573

574+
if evictedSomething || deletedSomething || !volumesFree {
575+
return &reconcile.Result{RequeueAfter: 10 * time.Second}, nil
576+
}
577+
540578
if result, err := r.deleteCloudProviderInstance(prov, machine); result != nil || err != nil {
541579
return result, err
542580
}
@@ -550,7 +588,52 @@ func (r *Reconciler) deleteMachine(ctx context.Context, prov cloudprovidertypes.
550588
return nil, nil
551589
}
552590

553-
return nil, r.deleteNodeForMachine(ctx, machine)
591+
nodes, err := r.retrieveNodesRelatedToMachine(ctx, machine)
592+
if err != nil {
593+
return nil, err
594+
}
595+
596+
return nil, r.deleteNodeForMachine(ctx, nodes, machine)
597+
}
598+
599+
func (r *Reconciler) retrieveNodesRelatedToMachine(ctx context.Context, machine *clusterv1alpha1.Machine) ([]*corev1.Node, error) {
600+
nodes := make([]*corev1.Node, 0)
601+
602+
// If there's NodeRef on the Machine object, retrieve the node by using the
603+
// value of the NodeRef. If there's no NodeRef, try to find the Node by
604+
// listing nodes using the NodeOwner label selector.
605+
if machine.Status.NodeRef != nil {
606+
objKey := ctrlruntimeclient.ObjectKey{Name: machine.Status.NodeRef.Name}
607+
node := &corev1.Node{}
608+
if err := r.client.Get(ctx, objKey, node); err != nil {
609+
if !kerrors.IsNotFound(err) {
610+
return nil, fmt.Errorf("failed to get node %s: %v", machine.Status.NodeRef.Name, err)
611+
}
612+
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
613+
} else {
614+
nodes = append(nodes, node)
615+
}
616+
} else {
617+
selector, err := labels.Parse(NodeOwnerLabelName + "=" + string(machine.UID))
618+
if err != nil {
619+
return nil, fmt.Errorf("failed to parse label selector: %v", err)
620+
}
621+
listOpts := &ctrlruntimeclient.ListOptions{LabelSelector: selector}
622+
nodeList := &corev1.NodeList{}
623+
if err := r.client.List(ctx, nodeList, listOpts); err != nil {
624+
return nil, fmt.Errorf("failed to list nodes: %v", err)
625+
}
626+
if len(nodeList.Items) == 0 {
627+
// We just want to log that we didn't find the node.
628+
klog.V(3).Infof("No node found for the machine %s", machine.Spec.Name)
629+
}
630+
631+
for _, node := range nodeList.Items {
632+
nodes = append(nodes, &node)
633+
}
634+
}
635+
636+
return nodes, nil
554637
}
555638

556639
func (r *Reconciler) deleteCloudProviderInstance(prov cloudprovidertypes.Provider, machine *clusterv1alpha1.Machine) (*reconcile.Result, error) {
@@ -623,50 +706,14 @@ func (r *Reconciler) deleteCloudProviderInstance(prov cloudprovidertypes.Provide
623706
})
624707
}
625708

626-
func (r *Reconciler) deleteNodeForMachine(ctx context.Context, machine *clusterv1alpha1.Machine) error {
627-
// If there's NodeRef on the Machine object, remove the Node by using the
628-
// value of the NodeRef. If there's no NodeRef, try to find the Node by
629-
// listing nodes using the NodeOwner label selector.
630-
if machine.Status.NodeRef != nil {
631-
objKey := ctrlruntimeclient.ObjectKey{Name: machine.Status.NodeRef.Name}
632-
node := &corev1.Node{}
633-
nodeFound := true
634-
if err := r.client.Get(ctx, objKey, node); err != nil {
709+
func (r *Reconciler) deleteNodeForMachine(ctx context.Context, nodes []*corev1.Node, machine *clusterv1alpha1.Machine) error {
710+
// Iterates over all nodes and deletes them. Finally, removes the finalizer on the machine.
711+
for _, node := range nodes {
712+
if err := r.client.Delete(ctx, node); err != nil {
635713
if !kerrors.IsNotFound(err) {
636-
return fmt.Errorf("failed to get node %s: %v", machine.Status.NodeRef.Name, err)
637-
}
638-
nodeFound = false
639-
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
640-
}
641-
642-
if nodeFound {
643-
if err := r.client.Delete(ctx, node); err != nil {
644-
if !kerrors.IsNotFound(err) {
645-
return err
646-
}
647-
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
648-
}
649-
}
650-
} else {
651-
selector, err := labels.Parse(NodeOwnerLabelName + "=" + string(machine.UID))
652-
if err != nil {
653-
return fmt.Errorf("failed to parse label selector: %v", err)
654-
}
655-
listOpts := &ctrlruntimeclient.ListOptions{LabelSelector: selector}
656-
nodes := &corev1.NodeList{}
657-
if err := r.client.List(ctx, nodes, listOpts); err != nil {
658-
return fmt.Errorf("failed to list nodes: %v", err)
659-
}
660-
if len(nodes.Items) == 0 {
661-
// We just want log that we didn't found the node. We don't want to
662-
// return here, as we want to remove finalizers at the end.
663-
klog.V(3).Infof("No node found for the machine %s", machine.Spec.Name)
664-
}
665-
666-
for _, node := range nodes.Items {
667-
if err := r.client.Delete(ctx, &node); err != nil {
668714
return err
669715
}
716+
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
670717
}
671718
}
672719

pkg/controller/machine/machine_test.go

Lines changed: 33 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -474,7 +474,7 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
474474
tests := []struct {
475475
name string
476476
machine *clusterv1alpha1.Machine
477-
nodes []runtime.Object
477+
nodes []*corev1.Node
478478
err error
479479
shouldDeleteNode string
480480
}{
@@ -489,13 +489,17 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
489489
NodeRef: &corev1.ObjectReference{Name: "node-1"},
490490
},
491491
},
492-
nodes: []runtime.Object{&corev1.Node{
493-
ObjectMeta: metav1.ObjectMeta{
494-
Name: "node-0",
495-
}}, &corev1.Node{
496-
ObjectMeta: metav1.ObjectMeta{
497-
Name: "node-1",
498-
}},
492+
nodes: []*corev1.Node{
493+
{
494+
ObjectMeta: metav1.ObjectMeta{
495+
Name: "node-0",
496+
},
497+
},
498+
{
499+
ObjectMeta: metav1.ObjectMeta{
500+
Name: "node-1",
501+
},
502+
},
499503
},
500504
err: nil,
501505
shouldDeleteNode: "node-1",
@@ -510,16 +514,16 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
510514
},
511515
Status: clusterv1alpha1.MachineStatus{},
512516
},
513-
nodes: []runtime.Object{
514-
&corev1.Node{
517+
nodes: []*corev1.Node{
518+
{
515519
ObjectMeta: metav1.ObjectMeta{
516520
Name: "node-0",
517521
Labels: map[string]string{
518522
NodeOwnerLabelName: string(machineUID),
519523
},
520524
},
521525
},
522-
&corev1.Node{
526+
{
523527
ObjectMeta: metav1.ObjectMeta{
524528
Name: "node-1",
525529
},
@@ -538,13 +542,13 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
538542
},
539543
Status: clusterv1alpha1.MachineStatus{},
540544
},
541-
nodes: []runtime.Object{
542-
&corev1.Node{
545+
nodes: []*corev1.Node{
546+
{
543547
ObjectMeta: metav1.ObjectMeta{
544548
Name: "node-0",
545549
},
546550
},
547-
&corev1.Node{
551+
{
548552
ObjectMeta: metav1.ObjectMeta{
549553
Name: "node-1",
550554
},
@@ -564,10 +568,12 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
564568
NodeRef: &corev1.ObjectReference{Name: "node-1"},
565569
},
566570
},
567-
nodes: []runtime.Object{&corev1.Node{
568-
ObjectMeta: metav1.ObjectMeta{
569-
Name: "node-0",
570-
}},
571+
nodes: []*corev1.Node{
572+
{
573+
ObjectMeta: metav1.ObjectMeta{
574+
Name: "node-0",
575+
},
576+
},
571577
},
572578
err: nil,
573579
shouldDeleteNode: "",
@@ -579,7 +585,9 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
579585
ctx := context.Background()
580586

581587
objects := []runtime.Object{test.machine}
582-
objects = append(objects, test.nodes...)
588+
for _, n := range test.nodes {
589+
objects = append(objects, n)
590+
}
583591

584592
client := ctrlruntimefake.NewFakeClient(objects...)
585593

@@ -595,7 +603,12 @@ func TestControllerDeleteNodeForMachine(t *testing.T) {
595603
providerData: providerData,
596604
}
597605

598-
err := reconciler.deleteNodeForMachine(ctx, test.machine)
606+
nodes, err := reconciler.retrieveNodesRelatedToMachine(ctx, test.machine)
607+
if err != nil {
608+
return
609+
}
610+
611+
err = reconciler.deleteNodeForMachine(ctx, nodes, test.machine)
599612
if diff := deep.Equal(err, test.err); diff != nil {
600613
t.Errorf("expected to get %v instead got: %v", test.err, err)
601614
}

0 commit comments

Comments
 (0)