Skip to content

Commit f4bd93f

Browse files
authored
Merge pull request #7008 from killianmuldoon/fix/kcp-nil-pointer
🐛 Fix potential nilpointer error in machine remediation
2 parents 9cebf76 + fc5f391 commit f4bd93f

File tree

3 files changed

+66
-8
lines changed

3 files changed

+66
-8
lines changed

controlplane/kubeadm/internal/controllers/fakes_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,7 @@ import (
2020
"context"
2121

2222
"github.com/blang/semver"
23+
"github.com/pkg/errors"
2324
"sigs.k8s.io/controller-runtime/pkg/client"
2425

2526
clusterv1 "sigs.k8s.io/cluster-api/api/v1beta1"
@@ -69,7 +70,10 @@ type fakeWorkloadCluster struct {
6970
EtcdMembersResult []string
7071
}
7172

72-
func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, _ *clusterv1.Machine) error {
73+
func (f fakeWorkloadCluster) ForwardEtcdLeadership(_ context.Context, _ *clusterv1.Machine, leaderCandidate *clusterv1.Machine) error {
74+
if leaderCandidate == nil {
75+
return errors.New("leaderCandidate is nil")
76+
}
7377
return nil
7478
}
7579

controlplane/kubeadm/internal/controllers/remediation.go

Lines changed: 16 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@ import (
2323
"github.com/blang/semver"
2424
"github.com/pkg/errors"
2525
kerrors "k8s.io/apimachinery/pkg/util/errors"
26+
"k8s.io/klog/v2"
2627
ctrl "sigs.k8s.io/controller-runtime"
2728
"sigs.k8s.io/controller-runtime/pkg/client"
2829

@@ -106,27 +107,27 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
106107

107108
// Before starting remediation, run preflight checks in order to verify it is safe to remediate.
108109
// If any of the following checks fails, we'll surface the reason in the MachineOwnerRemediated condition.
109-
110+
log.WithValues("Machine", klog.KObj(machineToBeRemediated))
110111
desiredReplicas := int(*controlPlane.KCP.Spec.Replicas)
111112

112113
// The cluster MUST have more than one replica, because this is the smallest cluster size that allows any etcd failure tolerance.
113114
if controlPlane.Machines.Len() <= 1 {
114-
log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", controlPlane.Machines.Len())
115+
log.Info("A control plane machine needs remediation, but the number of current replicas is less or equal to 1. Skipping remediation", "Replicas", controlPlane.Machines.Len())
115116
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate if current replicas are less or equal to 1")
116117
return ctrl.Result{}, nil
117118
}
118119

119120
// The number of replicas MUST be equal to or greater than the desired replicas. This rule ensures that when the cluster
120121
// is missing replicas, we skip remediation and instead perform regular scale up/rollout operations first.
121122
if controlPlane.Machines.Len() < desiredReplicas {
122-
log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name, "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
123+
log.Info("A control plane machine needs remediation, but the current number of replicas is lower that expected. Skipping remediation", "Replicas", desiredReplicas, "CurrentReplicas", controlPlane.Machines.Len())
123124
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for having at least %d control plane machines before triggering remediation", desiredReplicas)
124125
return ctrl.Result{}, nil
125126
}
126127

127128
// The cluster MUST have no machines with a deletion timestamp. This rule prevents KCP taking actions while the cluster is in a transitional state.
128129
if controlPlane.HasDeletingMachine() {
129-
log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
130+
log.Info("A control plane machine needs remediation, but there are other control-plane machines being deleted. Skipping remediation")
130131
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP waiting for control plane machine deletion to complete before triggering remediation")
131132
return ctrl.Result{}, nil
132133
}
@@ -140,7 +141,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
140141
return ctrl.Result{}, err
141142
}
142143
if !canSafelyRemediate {
143-
log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation", "UnhealthyMachine", machineToBeRemediated.Name)
144+
log.Info("A control plane machine needs remediation, but removing this machine could result in etcd quorum loss. Skipping remediation")
144145
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "KCP can't remediate this machine because this could result in etcd loosing quorum")
145146
return ctrl.Result{}, nil
146147
}
@@ -155,12 +156,20 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
155156
// If the machine that is about to be deleted is the etcd leader, move it to the newest member available.
156157
if controlPlane.IsEtcdManaged() {
157158
etcdLeaderCandidate := controlPlane.HealthyMachines().Newest()
159+
if etcdLeaderCandidate == nil {
160+
log.Info("A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to")
161+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
162+
"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
163+
return ctrl.Result{}, nil
164+
}
158165
if err := workloadCluster.ForwardEtcdLeadership(ctx, machineToBeRemediated, etcdLeaderCandidate); err != nil {
159-
log.Error(err, "Failed to move leadership to candidate machine", "candidate", etcdLeaderCandidate.Name)
166+
log.Error(err, "Failed to move etcd leadership to candidate machine", "candidate", klog.KObj(etcdLeaderCandidate))
167+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
160168
return ctrl.Result{}, err
161169
}
162170
if err := workloadCluster.RemoveEtcdMemberForMachine(ctx, machineToBeRemediated); err != nil {
163171
log.Error(err, "Failed to remove etcd member for machine")
172+
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityError, err.Error())
164173
return ctrl.Result{}, err
165174
}
166175
}
@@ -180,7 +189,7 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
180189
return ctrl.Result{}, errors.Wrapf(err, "failed to delete unhealthy machine %s", machineToBeRemediated.Name)
181190
}
182191

183-
log.Info("Remediating unhealthy machine", "UnhealthyMachine", machineToBeRemediated.Name)
192+
log.Info("Remediating unhealthy machine")
184193
conditions.MarkFalse(machineToBeRemediated, clusterv1.MachineOwnerRemediatedCondition, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
185194
return ctrl.Result{Requeue: true}, nil
186195
}

controlplane/kubeadm/internal/controllers/remediation_test.go

Lines changed: 45 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -389,6 +389,50 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
389389
m1.ObjectMeta.Finalizers = nil
390390
g.Expect(patchHelper.Patch(ctx, m1))
391391

392+
g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
393+
})
394+
t.Run("Remediation does not happen if no healthy Control Planes are available to become etcd leader", func(t *testing.T) {
395+
g := NewWithT(t)
396+
397+
m1 := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withMachineHealthCheckFailed())
398+
patchHelper, err := patch.NewHelper(m1, env.GetClient())
399+
g.Expect(err).ToNot(HaveOccurred())
400+
m1.ObjectMeta.Finalizers = []string{"wait-before-delete"}
401+
g.Expect(patchHelper.Patch(ctx, m1))
402+
403+
m2 := createMachine(ctx, g, ns.Name, "m2-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
404+
m3 := createMachine(ctx, g, ns.Name, "m3-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
405+
m4 := createMachine(ctx, g, ns.Name, "m4-healthy-", withMachineHealthCheckFailed(), withHealthyEtcdMember())
406+
407+
controlPlane := &internal.ControlPlane{
408+
KCP: &controlplanev1.KubeadmControlPlane{Spec: controlplanev1.KubeadmControlPlaneSpec{
409+
Replicas: utilpointer.Int32Ptr(4),
410+
Version: "v1.19.1",
411+
}},
412+
Cluster: &clusterv1.Cluster{},
413+
Machines: collections.FromMachines(m1, m2, m3, m4),
414+
}
415+
416+
r := &KubeadmControlPlaneReconciler{
417+
Client: env.GetClient(),
418+
recorder: record.NewFakeRecorder(32),
419+
managementCluster: &fakeManagementCluster{
420+
Workload: fakeWorkloadCluster{
421+
EtcdMembersResult: nodes(controlPlane.Machines),
422+
},
423+
},
424+
}
425+
_, err = r.reconcileUnhealthyMachines(context.TODO(), controlPlane)
426+
g.Expect(err).ToNot(HaveOccurred())
427+
428+
assertMachineCondition(ctx, g, m1, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationFailedReason, clusterv1.ConditionSeverityWarning,
429+
"A control plane machine needs remediation, but there is no healthy machine to forward etcd leadership to. Skipping remediation")
430+
431+
patchHelper, err = patch.NewHelper(m1, env.GetClient())
432+
g.Expect(err).ToNot(HaveOccurred())
433+
m1.ObjectMeta.Finalizers = nil
434+
g.Expect(patchHelper.Patch(ctx, m1))
435+
392436
g.Expect(env.Cleanup(ctx, m1, m2, m3, m4)).To(Succeed())
393437
})
394438
}
@@ -432,6 +476,7 @@ func TestCanSafelyRemoveEtcdMember(t *testing.T) {
432476

433477
g.Expect(env.Cleanup(ctx, m1)).To(Succeed())
434478
})
479+
435480
t.Run("Can safely remediate 2 machine CP without additional etcd member failures", func(t *testing.T) {
436481
g := NewWithT(t)
437482

0 commit comments

Comments
 (0)