Skip to content

Commit 3afa230

Browse files
authored
Merge pull request kubernetes-sigs#10559 from fabriziopandini/improve-kcp-remediation-reentrancy
🌱 Improve KCP remediation re-entrancy
2 parents f5a538d + 280b7b5 commit 3afa230

File tree

2 files changed

+64
-3
lines changed

2 files changed

+64
-3
lines changed

controlplane/kubeadm/internal/controllers/remediation.go

Lines changed: 23 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -99,9 +99,29 @@ func (r *KubeadmControlPlaneReconciler) reconcileUnhealthyMachines(ctx context.C
9999
// Returns if another remediation is in progress but the new Machine is not yet created.
100100
// Note: This condition is checked after we check for unhealthy Machines and if machineToBeRemediated
101101
// is being deleted to avoid unnecessary logs if no further remediation should be done.
102-
if _, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
103-
log.Info("Another remediation is already in progress. Skipping remediation.")
104-
return ctrl.Result{}, nil
102+
if v, ok := controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation]; ok {
103+
// Check if the annotation is stale; this might happen in case there is a crash in the controller in between
104+
// when a new Machine is created and the annotation is eventually removed from KCP via defer patch at the end
105+
// of KCP reconcile.
106+
remediationData, err := RemediationDataFromAnnotation(v)
107+
if err != nil {
108+
return ctrl.Result{}, err
109+
}
110+
111+
staleAnnotation := false
112+
for _, m := range controlPlane.Machines.UnsortedList() {
113+
if m.CreationTimestamp.After(remediationData.Timestamp.Time) {
114+
// Remove the annotation tracking that a remediation is in progress (the annotation is stale).
115+
delete(controlPlane.KCP.Annotations, controlplanev1.RemediationInProgressAnnotation)
116+
staleAnnotation = true
117+
break
118+
}
119+
}
120+
121+
if !staleAnnotation {
122+
log.Info("Another remediation is already in progress. Skipping remediation.")
123+
return ctrl.Result{}, nil
124+
}
105125
}
106126

107127
patchHelper, err := patch.NewHelper(machineToBeRemediated, r.Client)

controlplane/kubeadm/internal/controllers/remediation_test.go

Lines changed: 41 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -166,6 +166,47 @@ func TestReconcileUnhealthyMachines(t *testing.T) {
166166
g.Expect(ret.IsZero()).To(BeTrue()) // Remediation skipped
167167
g.Expect(err).ToNot(HaveOccurred())
168168
})
169+
t.Run("remediation in progress is ignored when stale", func(t *testing.T) {
170+
g := NewWithT(t)
171+
172+
m := createMachine(ctx, g, ns.Name, "m1-unhealthy-", withStuckRemediation(), withWaitBeforeDeleteFinalizer())
173+
conditions.MarkFalse(m, clusterv1.MachineHealthCheckSucceededCondition, clusterv1.MachineHasFailureReason, clusterv1.ConditionSeverityWarning, "")
174+
conditions.MarkFalse(m, clusterv1.MachineOwnerRemediatedCondition, clusterv1.WaitingForRemediationReason, clusterv1.ConditionSeverityWarning, "")
175+
controlPlane := &internal.ControlPlane{
176+
KCP: &controlplanev1.KubeadmControlPlane{
177+
ObjectMeta: metav1.ObjectMeta{
178+
Annotations: map[string]string{
179+
controlplanev1.RemediationInProgressAnnotation: MustMarshalRemediationData(&RemediationData{
180+
Machine: "foo",
181+
Timestamp: metav1.Time{Time: time.Now().Add(-1 * time.Hour).UTC()},
182+
RetryCount: 0,
183+
}),
184+
},
185+
},
186+
},
187+
Cluster: &clusterv1.Cluster{},
188+
Machines: collections.FromMachines(m),
189+
}
190+
ret, err := r.reconcileUnhealthyMachines(ctx, controlPlane)
191+
192+
g.Expect(ret.IsZero()).To(BeFalse()) // Remediation completed, requeue
193+
g.Expect(err).ToNot(HaveOccurred())
194+
195+
g.Expect(controlPlane.KCP.Annotations).To(HaveKey(controlplanev1.RemediationInProgressAnnotation))
196+
remediationData, err := RemediationDataFromAnnotation(controlPlane.KCP.Annotations[controlplanev1.RemediationInProgressAnnotation])
197+
g.Expect(err).ToNot(HaveOccurred())
198+
g.Expect(remediationData.Machine).To(Equal(m.Name))
199+
g.Expect(remediationData.RetryCount).To(Equal(0))
200+
201+
assertMachineCondition(ctx, g, m, clusterv1.MachineOwnerRemediatedCondition, corev1.ConditionFalse, clusterv1.RemediationInProgressReason, clusterv1.ConditionSeverityWarning, "")
202+
203+
err = env.Get(ctx, client.ObjectKey{Namespace: m.Namespace, Name: m.Name}, m)
204+
g.Expect(err).ToNot(HaveOccurred())
205+
g.Expect(m.ObjectMeta.DeletionTimestamp.IsZero()).To(BeFalse())
206+
207+
removeFinalizer(g, m)
208+
g.Expect(env.Cleanup(ctx, m)).To(Succeed())
209+
})
169210
t.Run("reconcileUnhealthyMachines return early if the machine to be remediated is already being deleted", func(t *testing.T) {
170211
g := NewWithT(t)
171212

0 commit comments

Comments
 (0)