Skip to content

Commit 9517bba

Browse files
committed
🐛 Machine Controller should try to retrieve node on delete
Consider this scenario: * Machine is created * InfraMachine is created * MachineHealthCheck is set to 10 minutes * [10 minutes pass] * Machine is marked as needing remediation * Machine and InfraMachine go into the deletion flow * Node finally joins the cluster, say 10 minutes + a few seconds * InfraMachine is still waiting for VM to be deleted * Machine keeps retrying to delete, but `nodeRef` was never set * Machine, InfraMachine go away (finalizer removed) * Node object sticks around in the cluster This changeset allows the Machine controller to look into the cluster during the deletion flow if the Node eventually got created before the infrastructure provider was able to fully delete the machine. Signed-off-by: Vince Prignano <[email protected]>
1 parent 8f31a86 commit 9517bba

File tree

2 files changed

+150
-1
lines changed

2 files changed

+150
-1
lines changed

internal/controllers/machine/machine_controller.go

Lines changed: 26 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -562,8 +562,33 @@ func (r *Reconciler) isDeleteNodeAllowed(ctx context.Context, cluster *clusterv1
562562
return errClusterIsBeingDeleted
563563
}
564564

565-
// Cannot delete something that doesn't exist.
565+
if machine.Status.NodeRef == nil && machine.Spec.ProviderID != nil {
566+
// If we don't have a node reference, but a provider id has been set,
567+
// try to retrieve the node one more time.
568+
//
569+
// NOTE: The following is a best-effort attempt to retrieve the node,
570+
// errors are logged but not returned to ensure machines are deleted
571+
// even if the node cannot be retrieved.
572+
remoteClient, err := r.Tracker.GetClient(ctx, util.ObjectKey(cluster))
573+
if err != nil {
574+
log.Error(err, "Failed to get cluster client while deleting Machine and checking for nodes")
575+
} else {
576+
node, err := r.getNode(ctx, remoteClient, *machine.Spec.ProviderID)
577+
if err != nil && err != ErrNodeNotFound {
578+
log.Error(err, "Failed to get node while deleting Machine")
579+
} else if err == nil {
580+
machine.Status.NodeRef = &corev1.ObjectReference{
581+
APIVersion: corev1.SchemeGroupVersion.String(),
582+
Kind: "Node",
583+
Name: node.Name,
584+
UID: node.UID,
585+
}
586+
}
587+
}
588+
}
589+
566590
if machine.Status.NodeRef == nil {
591+
// Cannot delete something that doesn't exist.
567592
return errNilNodeRef
568593
}
569594

internal/controllers/machine/machine_controller_test.go

Lines changed: 124 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2522,6 +2522,130 @@ func TestNodeDeletion(t *testing.T) {
25222522
}
25232523
}
25242524

2525+
func TestNodeDeletionWithoutNodeRefFallback(t *testing.T) {
2526+
g := NewWithT(t)
2527+
2528+
deletionTime := metav1.Now().Add(-1 * time.Second)
2529+
2530+
testCluster := clusterv1.Cluster{
2531+
ObjectMeta: metav1.ObjectMeta{
2532+
Name: "test-cluster",
2533+
Namespace: metav1.NamespaceDefault,
2534+
},
2535+
}
2536+
2537+
node := &corev1.Node{
2538+
ObjectMeta: metav1.ObjectMeta{
2539+
Name: "test",
2540+
},
2541+
Spec: corev1.NodeSpec{ProviderID: "test://id-1"},
2542+
}
2543+
2544+
testMachine := clusterv1.Machine{
2545+
ObjectMeta: metav1.ObjectMeta{
2546+
Name: "test",
2547+
Namespace: metav1.NamespaceDefault,
2548+
Labels: map[string]string{
2549+
clusterv1.MachineControlPlaneLabel: "",
2550+
},
2551+
Annotations: map[string]string{
2552+
"machine.cluster.x-k8s.io/exclude-node-draining": "",
2553+
},
2554+
Finalizers: []string{clusterv1.MachineFinalizer},
2555+
DeletionTimestamp: &metav1.Time{Time: deletionTime},
2556+
},
2557+
Spec: clusterv1.MachineSpec{
2558+
ClusterName: "test-cluster",
2559+
InfrastructureRef: corev1.ObjectReference{
2560+
APIVersion: "infrastructure.cluster.x-k8s.io/v1beta1",
2561+
Kind: "GenericInfrastructureMachine",
2562+
Name: "infra-config1",
2563+
},
2564+
Bootstrap: clusterv1.Bootstrap{DataSecretName: ptr.To("data")},
2565+
ProviderID: ptr.To("test://id-1"),
2566+
},
2567+
}
2568+
2569+
cpmachine1 := &clusterv1.Machine{
2570+
ObjectMeta: metav1.ObjectMeta{
2571+
Name: "cp1",
2572+
Namespace: metav1.NamespaceDefault,
2573+
Labels: map[string]string{
2574+
clusterv1.ClusterNameLabel: "test-cluster",
2575+
clusterv1.MachineControlPlaneLabel: "",
2576+
},
2577+
Finalizers: []string{clusterv1.MachineFinalizer},
2578+
},
2579+
Spec: clusterv1.MachineSpec{
2580+
ClusterName: "test-cluster",
2581+
InfrastructureRef: corev1.ObjectReference{},
2582+
Bootstrap: clusterv1.Bootstrap{DataSecretName: ptr.To("data")},
2583+
},
2584+
Status: clusterv1.MachineStatus{
2585+
NodeRef: &corev1.ObjectReference{
2586+
Name: "cp1",
2587+
},
2588+
},
2589+
}
2590+
2591+
testCases := []struct {
2592+
name string
2593+
deletionTimeout *metav1.Duration
2594+
resultErr bool
2595+
clusterDeleted bool
2596+
expectNodeDeletion bool
2597+
createFakeClient func(...client.Object) client.Client
2598+
}{
2599+
{
2600+
name: "should return no error when the node exists and matches the provider id",
2601+
deletionTimeout: &metav1.Duration{Duration: time.Second},
2602+
resultErr: false,
2603+
expectNodeDeletion: true,
2604+
createFakeClient: func(initObjs ...client.Object) client.Client {
2605+
return fake.NewClientBuilder().
2606+
WithObjects(initObjs...).
2607+
WithIndex(&corev1.Node{}, index.NodeProviderIDField, index.NodeByProviderID).
2608+
WithStatusSubresource(&clusterv1.Machine{}).
2609+
Build()
2610+
},
2611+
},
2612+
}
2613+
2614+
for _, tc := range testCases {
2615+
t.Run(tc.name, func(*testing.T) {
2616+
m := testMachine.DeepCopy()
2617+
m.Spec.NodeDeletionTimeout = tc.deletionTimeout
2618+
2619+
fakeClient := tc.createFakeClient(node, m, cpmachine1)
2620+
tracker := remote.NewTestClusterCacheTracker(ctrl.Log, fakeClient, fakeClient, fakeScheme, client.ObjectKeyFromObject(&testCluster))
2621+
2622+
r := &Reconciler{
2623+
Client: fakeClient,
2624+
Tracker: tracker,
2625+
recorder: record.NewFakeRecorder(10),
2626+
nodeDeletionRetryTimeout: 10 * time.Millisecond,
2627+
}
2628+
2629+
cluster := testCluster.DeepCopy()
2630+
if tc.clusterDeleted {
2631+
cluster.DeletionTimestamp = &metav1.Time{Time: deletionTime.Add(time.Hour)}
2632+
}
2633+
2634+
_, err := r.reconcileDelete(context.Background(), cluster, m)
2635+
2636+
if tc.resultErr {
2637+
g.Expect(err).To(HaveOccurred())
2638+
} else {
2639+
g.Expect(err).ToNot(HaveOccurred())
2640+
if tc.expectNodeDeletion {
2641+
n := &corev1.Node{}
2642+
g.Expect(fakeClient.Get(context.Background(), client.ObjectKeyFromObject(node), n)).NotTo(Succeed())
2643+
}
2644+
}
2645+
})
2646+
}
2647+
}
2648+
25252649
// adds a condition list to an external object.
25262650
func addConditionsToExternal(u *unstructured.Unstructured, newConditions clusterv1.Conditions) {
25272651
existingConditions := clusterv1.Conditions{}

0 commit comments

Comments
 (0)