Skip to content

Commit 01f2d47

Browse files
authored
Delete Nodes by NodeRef and compare node and machine names when getting a Hetzner node (#813)
* Make deleteNodeForMachine function use NodeRef * Make getNode check for the instance name as well * Add more unit tests for getNode() * Add more unit tests for deleteNodeForMachine() * Update comment
1 parent 446ab57 commit 01f2d47

File tree

2 files changed

+228
-13
lines changed

2 files changed

+228
-13
lines changed

pkg/controller/machine/machine.go

Lines changed: 58 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -608,19 +608,42 @@ func (r *Reconciler) deleteCloudProviderInstance(prov cloudprovidertypes.Provide
608608
}
609609

610610
func (r *Reconciler) deleteNodeForMachine(machine *clusterv1alpha1.Machine) error {
611-
selector, err := labels.Parse(NodeOwnerLabelName + "=" + string(machine.UID))
612-
if err != nil {
613-
return fmt.Errorf("failed to parse label selector: %v", err)
614-
}
615-
listOpts := &ctrlruntimeclient.ListOptions{LabelSelector: selector}
616-
nodes := &corev1.NodeList{}
617-
if err := r.client.List(r.ctx, nodes, listOpts); err != nil {
618-
return fmt.Errorf("failed to list nodes: %v", err)
619-
}
611+
// If there's NodeRef on the Machine object, remove the Node by using the
612+
// value of the NodeRef. If there's no NodeRef, try to find the Node by
613+
// listing nodes using the NodeOwner label selector.
614+
if machine.Status.NodeRef != nil {
615+
objKey := ctrlruntimeclient.ObjectKey{Name: machine.Status.NodeRef.Name}
616+
node := &corev1.Node{}
617+
if err := r.client.Get(r.ctx, objKey, node); err != nil {
618+
return fmt.Errorf("failed to get node %s: %v", machine.Status.NodeRef.Name, err)
619+
}
620+
621+
if err := r.client.Delete(r.ctx, node); err != nil {
622+
if !kerrors.IsNotFound(err) {
623+
return err
624+
}
625+
klog.V(2).Infof("node %q does not longer exist for machine %q", machine.Status.NodeRef.Name, machine.Spec.Name)
626+
}
627+
} else {
628+
selector, err := labels.Parse(NodeOwnerLabelName + "=" + string(machine.UID))
629+
if err != nil {
630+
return fmt.Errorf("failed to parse label selector: %v", err)
631+
}
632+
listOpts := &ctrlruntimeclient.ListOptions{LabelSelector: selector}
633+
nodes := &corev1.NodeList{}
634+
if err := r.client.List(r.ctx, nodes, listOpts); err != nil {
635+
return fmt.Errorf("failed to list nodes: %v", err)
636+
}
637+
if len(nodes.Items) == 0 {
638+
// We just want log that we didn't found the node. We don't want to
639+
// return here, as we want to remove finalizers at the end.
640+
klog.V(3).Infof("No node found for the machine %s", machine.Spec.Name)
641+
}
620642

621-
for _, node := range nodes.Items {
622-
if err := r.client.Delete(r.ctx, &node); err != nil {
623-
return err
643+
for _, node := range nodes.Items {
644+
if err := r.client.Delete(r.ctx, &node); err != nil {
645+
return err
646+
}
624647
}
625648
}
626649

@@ -888,8 +911,31 @@ func (r *Reconciler) getNode(instance instance.Instance, provider providerconfig
888911
return node.DeepCopy(), true, nil
889912
}
890913
}
914+
// If we were unable to find Node by ProviderID, fallback to IP address matching.
915+
// This usually happens if there's no CCM deployed in the cluster.
916+
//
917+
// This mechanism is not always reliable, as providers reuse the IP addresses after
918+
// some time.
919+
//
920+
// If we rollout a Machine, it can happen that a new instance has the same
921+
// IP addresses as the instance that has just been deleted. If machine-controller
922+
// processes the new Machine before removing the old Machine and the corresponding
923+
// Node object, machine-controller could update the NodeOwner label on the old Node
924+
// object to point to the Machine that just got created, as IP addresses would match.
925+
// This causes machine-controller to fail to delete the old Node object, which could
926+
// then cause cluster stability issues in some cases.
891927
for _, nodeAddress := range node.Status.Addresses {
892928
for instanceAddress := range instance.Addresses() {
929+
// We observed that the issue described above happens often on Hetzner.
930+
// As we know that the Node and the instance name will always be same
931+
// on Hetzner, we can use it as an additional check to prevent this
932+
// issue.
933+
// TODO: We should do this for other providers, but there are providers where
934+
// the node and the instance names will not match, so it requires further
935+
// investigation (e.g. AWS).
936+
if provider == providerconfigtypes.CloudProviderHetzner && node.Name != instance.Name() {
937+
continue
938+
}
893939
if nodeAddress.Address == instanceAddress {
894940
return node.DeepCopy(), true, nil
895941
}

pkg/controller/machine/machine_test.go

Lines changed: 170 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ import (
2626

2727
clusterv1alpha1 "github.com/kubermatic/machine-controller/pkg/apis/cluster/v1alpha1"
2828
"github.com/kubermatic/machine-controller/pkg/cloudprovider/instance"
29+
cloudprovidertypes "github.com/kubermatic/machine-controller/pkg/cloudprovider/types"
2930
providerconfigtypes "github.com/kubermatic/machine-controller/pkg/providerconfig/types"
3031

3132
corev1 "k8s.io/api/core/v1"
@@ -36,6 +37,7 @@ import (
3637
"k8s.io/client-go/kubernetes/scheme"
3738
"k8s.io/client-go/tools/record"
3839
"k8s.io/klog"
40+
ctrlruntimeclient "sigs.k8s.io/controller-runtime/pkg/client"
3941
ctrlruntimefake "sigs.k8s.io/controller-runtime/pkg/client/fake"
4042
)
4143

@@ -99,7 +101,8 @@ func TestController_GetNode(t *testing.T) {
99101
node1 := getTestNode("1", "aws")
100102
node2 := getTestNode("2", "openstack")
101103
node3 := getTestNode("3", "")
102-
nodeList := []*corev1.Node{&node1, &node2, &node3}
104+
node4 := getTestNode("4", "hetzner")
105+
nodeList := []*corev1.Node{&node1, &node2, &node3, &node4}
103106

104107
tests := []struct {
105108
name string
@@ -149,6 +152,38 @@ func TestController_GetNode(t *testing.T) {
149152
err: nil,
150153
instance: &fakeInstance{id: "3", addresses: map[string]corev1.NodeAddressType{"172.16.1.3": corev1.NodeInternalIP}},
151154
},
155+
{
156+
name: "hetzner node found by internal ip",
157+
provider: "hetzner",
158+
resNode: &node3,
159+
exists: true,
160+
err: nil,
161+
instance: &fakeInstance{id: "3", name: "node3", addresses: map[string]corev1.NodeAddressType{"192.168.1.3": corev1.NodeInternalIP}},
162+
},
163+
{
164+
name: "hetzner node found by external ip",
165+
provider: "hetzner",
166+
resNode: &node3,
167+
exists: true,
168+
err: nil,
169+
instance: &fakeInstance{id: "3", name: "node3", addresses: map[string]corev1.NodeAddressType{"172.16.1.3": corev1.NodeExternalIP}},
170+
},
171+
{
172+
name: "hetzner node not found - node and instance names mismatch",
173+
provider: "hetzner",
174+
resNode: nil,
175+
exists: false,
176+
err: nil,
177+
instance: &fakeInstance{id: "3", name: "instance3", addresses: map[string]corev1.NodeAddressType{"192.168.1.3": corev1.NodeInternalIP}},
178+
},
179+
{
180+
name: "hetzner node found by provider id",
181+
provider: "hetzner",
182+
resNode: &node4,
183+
exists: true,
184+
err: nil,
185+
instance: &fakeInstance{id: "4", addresses: map[string]corev1.NodeAddressType{"": ""}},
186+
},
152187
}
153188

154189
for _, test := range tests {
@@ -427,3 +462,137 @@ func TestControllerShouldEvict(t *testing.T) {
427462
})
428463
}
429464
}
465+
466+
func TestControllerDeleteNodeForMachine(t *testing.T) {
467+
machineUID := types.UID("test-1")
468+
469+
tests := []struct {
470+
name string
471+
machine *clusterv1alpha1.Machine
472+
nodes []runtime.Object
473+
err error
474+
shouldDeleteNode string
475+
}{
476+
{
477+
name: "delete node by nodeRef",
478+
machine: &clusterv1alpha1.Machine{
479+
ObjectMeta: metav1.ObjectMeta{
480+
Name: "machine-1",
481+
Finalizers: []string{"machine-node-delete-finalizer"},
482+
},
483+
Status: clusterv1alpha1.MachineStatus{
484+
NodeRef: &corev1.ObjectReference{Name: "node-1"},
485+
},
486+
},
487+
nodes: []runtime.Object{&corev1.Node{
488+
ObjectMeta: metav1.ObjectMeta{
489+
Name: "node-0",
490+
}}, &corev1.Node{
491+
ObjectMeta: metav1.ObjectMeta{
492+
Name: "node-1",
493+
}},
494+
},
495+
err: nil,
496+
shouldDeleteNode: "node-1",
497+
},
498+
{
499+
name: "delete node by NodeOwner label",
500+
machine: &clusterv1alpha1.Machine{
501+
ObjectMeta: metav1.ObjectMeta{
502+
Name: "machine-1",
503+
Finalizers: []string{"machine-node-delete-finalizer"},
504+
UID: machineUID,
505+
},
506+
Status: clusterv1alpha1.MachineStatus{},
507+
},
508+
nodes: []runtime.Object{
509+
&corev1.Node{
510+
ObjectMeta: metav1.ObjectMeta{
511+
Name: "node-0",
512+
Labels: map[string]string{
513+
NodeOwnerLabelName: string(machineUID),
514+
},
515+
},
516+
},
517+
&corev1.Node{
518+
ObjectMeta: metav1.ObjectMeta{
519+
Name: "node-1",
520+
},
521+
},
522+
},
523+
err: nil,
524+
shouldDeleteNode: "node-0",
525+
},
526+
{
527+
name: "no node should be deleted",
528+
machine: &clusterv1alpha1.Machine{
529+
ObjectMeta: metav1.ObjectMeta{
530+
Name: "machine-1",
531+
Finalizers: []string{"machine-node-delete-finalizer"},
532+
UID: machineUID,
533+
},
534+
Status: clusterv1alpha1.MachineStatus{},
535+
},
536+
nodes: []runtime.Object{
537+
&corev1.Node{
538+
ObjectMeta: metav1.ObjectMeta{
539+
Name: "node-0",
540+
},
541+
},
542+
&corev1.Node{
543+
ObjectMeta: metav1.ObjectMeta{
544+
Name: "node-1",
545+
},
546+
},
547+
},
548+
err: nil,
549+
shouldDeleteNode: "",
550+
},
551+
}
552+
553+
for _, test := range tests {
554+
t.Run(test.name, func(t *testing.T) {
555+
objects := []runtime.Object{test.machine}
556+
objects = append(objects, test.nodes...)
557+
558+
client := ctrlruntimefake.NewFakeClient(objects...)
559+
560+
ctx := context.TODO()
561+
providerData := &cloudprovidertypes.ProviderData{
562+
Ctx: ctx,
563+
Update: cloudprovidertypes.GetMachineUpdater(ctx, client),
564+
Client: client,
565+
}
566+
567+
reconciler := &Reconciler{
568+
client: client,
569+
recorder: &record.FakeRecorder{},
570+
providerData: providerData,
571+
}
572+
573+
err := reconciler.deleteNodeForMachine(test.machine)
574+
if diff := deep.Equal(err, test.err); diff != nil {
575+
t.Errorf("expected to get %v instead got: %v", test.err, err)
576+
}
577+
if err != nil {
578+
return
579+
}
580+
581+
if test.shouldDeleteNode != "" {
582+
err = client.Get(ctx, types.NamespacedName{Name: test.shouldDeleteNode}, &corev1.Node{})
583+
if !kerrors.IsNotFound(err) {
584+
t.Errorf("expected node %q to be deleted, but got: %v", test.shouldDeleteNode, err)
585+
}
586+
} else {
587+
nodes := &corev1.NodeList{}
588+
err = client.List(ctx, nodes, &ctrlruntimeclient.ListOptions{})
589+
if err != nil {
590+
t.Errorf("error listing nodes: %v", err)
591+
}
592+
if len(test.nodes) != len(nodes.Items) {
593+
t.Errorf("expected %d nodes, but got %d", len(test.nodes), len(nodes.Items))
594+
}
595+
}
596+
})
597+
}
598+
}

0 commit comments

Comments
 (0)