Skip to content

Commit 917e2fa

Browse files
Merge pull request #1380 from damdo/fix-controllers-guard-on-empty-status-authoritativeAPI
OCPCLOUD-2986,OCPBUGS-56849: fix: controllers: guard on empty .status.authoritativeAPI
2 parents b9b1512 + 12253b5 commit 917e2fa

File tree

5 files changed

+200
-104
lines changed

5 files changed

+200
-104
lines changed

pkg/controller/machine/controller.go

Lines changed: 65 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -177,43 +177,60 @@ func (r *ReconcileMachine) Reconcile(ctx context.Context, request reconcile.Requ
177177
originalConditions := conditions.DeepCopyConditions(m.Status.Conditions)
178178

179179
if r.gate.Enabled(featuregate.Feature(openshiftfeatures.FeatureGateMachineAPIMigration)) {
180-
// Check Status.AuthoritativeAPI
181-
// If not MachineAPI. Set the paused condition true and return early.
182-
//
183-
// Once we have a webhook, we want to remove the check that the AuthoritativeAPI
184-
// field is populated.
185-
if m.Status.AuthoritativeAPI != "" &&
186-
m.Status.AuthoritativeAPI != machinev1.MachineAuthorityMachineAPI {
187-
conditions.Set(m, conditions.TrueConditionWithReason(
188-
PausedCondition,
189-
PausedConditionReason,
190-
"The AuthoritativeAPI is set to %s", string(m.Status.AuthoritativeAPI),
191-
))
192-
if patchErr := r.updateStatus(ctx, m, ptr.Deref(m.Status.Phase, ""), nil, originalConditions); patchErr != nil {
193-
klog.Errorf("%v: error patching status: %v", machineName, patchErr)
180+
switch m.Status.AuthoritativeAPI {
181+
case "":
182+
// An empty .status.authoritativeAPI normally means the resource has not yet been reconciled
183+
// by the migration controller, which has the responsibility for propagating .spec.authoritativeAPI to the status.
184+
// Pause the resource and take no further action but return until that field is populated.
185+
desiredCondition := conditions.TrueConditionWithReason(
186+
PausedCondition, PausedConditionReason,
187+
"The AuthoritativeAPI status is not yet set",
188+
)
189+
190+
if _, err := r.ensureUpdatedPausedCondition(ctx, m, desiredCondition,
191+
fmt.Sprintf("%v: machine .status.authoritativeAPI is not yet set, ensuring machine is paused", machineName)); err != nil {
192+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
194193
}
195194

196195
klog.Infof("%v: machine is paused, taking no further action", machineName)
196+
197197
return reconcile.Result{}, nil
198-
}
199198

200-
var pausedFalseReason string
201-
if m.Status.AuthoritativeAPI != "" {
202-
pausedFalseReason = fmt.Sprintf("The AuthoritativeAPI is set to %s", string(m.Status.AuthoritativeAPI))
203-
} else {
204-
pausedFalseReason = "The AuthoritativeAPI is not set"
205-
}
199+
case machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating:
200+
// In cases when .status.authoritativeAPI is set to machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating
201+
// the resource should be paused and not reconciled further.
202+
desiredCondition := conditions.TrueConditionWithReason(
203+
PausedCondition, PausedConditionReason,
204+
"The AuthoritativeAPI status is set to '%s'", string(m.Status.AuthoritativeAPI),
205+
)
206+
207+
if _, err := r.ensureUpdatedPausedCondition(ctx, m, desiredCondition,
208+
fmt.Sprintf("%v: machine .status.authoritativeAPI is set to '%s', ensuring machine is paused", machineName, m.Status.AuthoritativeAPI)); err != nil {
209+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
210+
}
206211

207-
// Set the paused condition to false, continue reconciliation
208-
conditions.Set(m, conditions.FalseCondition(
209-
PausedCondition,
210-
NotPausedConditionReason,
211-
machinev1.ConditionSeverityInfo,
212-
"%s",
213-
pausedFalseReason,
214-
))
215-
if patchErr := r.updateStatus(ctx, m, ptr.Deref(m.Status.Phase, ""), nil, originalConditions); patchErr != nil {
216-
klog.Errorf("%v: error patching status: %v", machineName, patchErr)
212+
klog.Infof("%v: machine is paused, taking no further action", machineName)
213+
214+
return reconcile.Result{}, nil
215+
216+
case machinev1.MachineAuthorityMachineAPI:
217+
// The authority is MachineAPI and the resource should not be paused.
218+
desiredCondition := conditions.FalseCondition(
219+
PausedCondition, NotPausedConditionReason, machinev1.ConditionSeverityInfo, "%s",
220+
fmt.Sprintf("The AuthoritativeAPI status is set to '%s'", string(m.Status.AuthoritativeAPI)),
221+
)
222+
223+
if updated, err := r.ensureUpdatedPausedCondition(ctx, m, desiredCondition,
224+
fmt.Sprintf("%v: machine .status.authoritativeAPI is set to '%s', unpausing machine", machineName, m.Status.AuthoritativeAPI)); err != nil {
225+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
226+
} else if updated {
227+
klog.Infof("%v: setting machine paused condition to false", machineName)
228+
}
229+
230+
// Fallthrough and continue reconcilation.
231+
default:
232+
klog.Errorf("%v: invalid .status.authoritativeAPI '%s'", machineName, m.Status.AuthoritativeAPI)
233+
return reconcile.Result{}, nil // Do not return an error to avoid immediate requeue.
217234
}
218235
}
219236

@@ -433,6 +450,23 @@ func (r *ReconcileMachine) deleteNode(ctx context.Context, name string) error {
433450
return r.Client.Delete(ctx, &node)
434451
}
435452

453+
// ensureUpdatedPausedCondition updates the paused condition if needed.
454+
func (r *ReconcileMachine) ensureUpdatedPausedCondition(ctx context.Context, m *machinev1.Machine, desiredCondition *machinev1.Condition, logMessage string) (bool, error) {
455+
oldM := m.DeepCopy()
456+
if !conditions.IsEquivalentTo(conditions.Get(m, PausedCondition), desiredCondition) {
457+
klog.Info(logMessage)
458+
conditions.Set(m, desiredCondition)
459+
if err := r.updateStatus(ctx, m, ptr.Deref(m.Status.Phase, ""), nil, oldM.Status.Conditions); err != nil {
460+
klog.Errorf("%v: error updating status: %v", oldM.Name, err)
461+
return false, fmt.Errorf("error updating status for machine %s: %w", oldM.Name, err)
462+
}
463+
464+
return true, nil
465+
}
466+
467+
return false, nil
468+
}
469+
436470
func delayIfRequeueAfterError(err error) (reconcile.Result, error) {
437471
var requeueAfterError *RequeueAfterError
438472
if errors.As(err, &requeueAfterError) {

pkg/controller/machine/machine_controller_test.go

Lines changed: 27 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -135,19 +135,20 @@ var _ = Describe("Machine Reconciler", func() {
135135

136136
It("Should set the Paused condition appropriately", func() {
137137

138-
By("Creating the Machine")
138+
By("Creating the Machine (empty status.authoritativeAPI)")
139139
Expect(k8sClient.Create(ctx, instance)).To(Succeed())
140140

141141
By("Setting the AuthoritativeAPI to ClusterAPI")
142142
Eventually(k.UpdateStatus(instance, func() {
143143
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityClusterAPI
144144
})).Should(Succeed())
145145

146-
By("Verifying the paused condition is approproately set to true")
146+
By("Verifying the paused condition is appropriately set to true")
147147
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
148148
HaveField("Status.Conditions", ContainElement(SatisfyAll(
149149
HaveField("Type", Equal(PausedCondition)),
150150
HaveField("Status", Equal(corev1.ConditionTrue)),
151+
HaveField("Message", Equal("The AuthoritativeAPI status is set to 'ClusterAPI'")),
151152
))),
152153
HaveField("Status.AuthoritativeAPI", Equal(machinev1.MachineAuthorityClusterAPI)),
153154
))
@@ -189,36 +190,50 @@ var _ = Describe("Machine Reconciler", func() {
189190
})
190191
}()
191192

192-
By("Transitioning the AuthoritativeAPI though 'Migrating' to MachineAPI")
193+
By("Changing status.authoritativeAPI from ClusterAPI to Migrating")
193194
Eventually(k.UpdateStatus(instance, func() {
194195
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityMigrating
195196
})).Should(Succeed())
196197

197-
By("Updating the AuthoritativeAPI from Migrating to MachineAPI")
198+
By("Verifying the paused condition is appropriately set to true")
199+
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
200+
HaveField("Status.Conditions", ContainElement(SatisfyAll(
201+
HaveField("Type", Equal(PausedCondition)),
202+
HaveField("Status", Equal(corev1.ConditionTrue)),
203+
HaveField("Message", Equal("The AuthoritativeAPI status is set to 'Migrating'")),
204+
))),
205+
HaveField("Status.AuthoritativeAPI", Equal(machinev1.MachineAuthorityMigrating)),
206+
))
207+
208+
By("Changing status.authoritativeAPI from Migrating to MachineAPI")
198209
Eventually(k.UpdateStatus(instance, func() {
199210
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityMachineAPI
200211
})).Should(Succeed())
201212

202-
By("Verifying the paused condition is approproately set to false")
213+
By("Verifying the paused condition is appropriately set to false")
203214
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
204215
HaveField("Status.Conditions", ContainElement(SatisfyAll(
205216
HaveField("Type", Equal(PausedCondition)),
206217
HaveField("Status", Equal(corev1.ConditionFalse)),
218+
HaveField("Message", Equal("The AuthoritativeAPI status is set to 'MachineAPI'")),
207219
))),
208220
HaveField("Status.AuthoritativeAPI", Equal(machinev1.MachineAuthorityMachineAPI)),
209221
))
210222

211-
By("Unsetting the AuthoritativeAPI field in the status")
223+
By("Changing status.authoritativeAPI from MachineAPI to empty")
212224
Eventually(k.UpdateStatus(instance, func() {
213225
instance.Status.AuthoritativeAPI = ""
214226
})).Should(Succeed())
215227

216-
By("Verifying the paused condition is still approproately set to false")
217-
Eventually(k.Object(instance), timeout).Should(HaveField("Status.Conditions", ContainElement(SatisfyAll(
218-
HaveField("Type", Equal(PausedCondition)),
219-
HaveField("Status", Equal(corev1.ConditionFalse)),
220-
HaveField("Message", Equal("The AuthoritativeAPI is not set")),
221-
))))
228+
By("Verifying the paused condition is still appropriately set to true when empty status.authoritativeAPI")
229+
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
230+
HaveField("Status.Conditions", ContainElement(SatisfyAll(
231+
HaveField("Type", Equal(PausedCondition)),
232+
HaveField("Status", Equal(corev1.ConditionTrue)),
233+
HaveField("Message", Equal("The AuthoritativeAPI status is not yet set")),
234+
))),
235+
HaveField("Status.AuthoritativeAPI", BeEquivalentTo("")),
236+
))
222237
})
223238
})
224239
})

pkg/controller/machineset/controller.go

Lines changed: 63 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -174,59 +174,60 @@ func (r *ReconcileMachineSet) Reconcile(ctx context.Context, request reconcile.R
174174
}
175175

176176
if r.gate.Enabled(featuregate.Feature(openshiftfeatures.FeatureGateMachineAPIMigration)) {
177-
machineSetName := machineSet.GetName()
178-
machineSetCopy := machineSet.DeepCopy()
179-
// Check Status.AuthoritativeAPI. If it's not set to MachineAPI. Set the
180-
// paused condition true and return early.
181-
//
182-
// Once we have a webhook, we want to remove the check that the AuthoritativeAPI
183-
// field is populated.
184-
if machineSet.Status.AuthoritativeAPI != "" &&
185-
machineSet.Status.AuthoritativeAPI != machinev1.MachineAuthorityMachineAPI {
186-
conditions.Set(machineSetCopy, conditions.TrueConditionWithReason(
187-
machine.PausedCondition,
188-
machine.PausedConditionReason,
189-
"The AuthoritativeAPI is set to %s", string(machineSet.Status.AuthoritativeAPI),
190-
))
191-
192-
_, err := updateMachineSetStatus(r.Client, machineSet, machineSetCopy.Status)
193-
if err != nil && !apierrors.IsNotFound(err) {
194-
klog.Errorf("%v: error updating status: %v", machineSetName, err)
195-
return reconcile.Result{}, fmt.Errorf("error updating status: %w", err)
196-
} else if apierrors.IsNotFound(err) {
197-
return reconcile.Result{}, nil
177+
switch machineSet.Status.AuthoritativeAPI {
178+
case "":
179+
// An empty .status.authoritativeAPI normally means the resource has not yet been reconciled
180+
// by the migration controller, which has the responsibility for propagating .spec.authoritativeAPI to the status.
181+
// Pause the resource and take no further action but return until that field is populated.
182+
desiredCondition := conditions.TrueConditionWithReason(
183+
machine.PausedCondition, machine.PausedConditionReason,
184+
"The AuthoritativeAPI status is not yet set",
185+
)
186+
187+
if _, err := r.ensureUpdatedPausedCondition(machineSet, desiredCondition,
188+
fmt.Sprintf("%v: machine set .status.authoritativeAPI is not yet set, ensuring machine set is paused", machineSet.Name)); err != nil {
189+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
198190
}
191+
klog.Infof("%v: machine set is paused, taking no further action", machineSet.Name)
199192

200-
klog.Infof("%v: machine set is paused, taking no further action", machineSetName)
201193
return reconcile.Result{}, nil
202-
}
203194

204-
var pausedFalseReason string
205-
if machineSet.Status.AuthoritativeAPI != "" {
206-
pausedFalseReason = fmt.Sprintf("The AuthoritativeAPI is set to %s", string(machineSet.Status.AuthoritativeAPI))
207-
} else {
208-
pausedFalseReason = "The AuthoritativeAPI is not set"
209-
}
195+
case machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating:
196+
// In case .status.authoritativeAPI is set to machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating
197+
// the resource should be paused and not reconciled further.
198+
desiredCondition := conditions.TrueConditionWithReason(
199+
machine.PausedCondition, machine.PausedConditionReason,
200+
"The AuthoritativeAPI status is set to '%s'", string(machineSet.Status.AuthoritativeAPI),
201+
)
202+
203+
if _, err := r.ensureUpdatedPausedCondition(machineSet, desiredCondition,
204+
fmt.Sprintf("%v: machine set .status.authoritativeAPI is set to '%s', ensuring machine set is paused", machineSet.Name, machineSet.Status.AuthoritativeAPI)); err != nil {
205+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
206+
}
210207

211-
// Set the paused condition to false, continue reconciliation
212-
conditions.Set(machineSetCopy, conditions.FalseCondition(
213-
machine.PausedCondition,
214-
machine.NotPausedConditionReason,
215-
machinev1.ConditionSeverityInfo,
216-
"%s",
217-
pausedFalseReason,
218-
))
208+
klog.Infof("%v: machine set is paused, taking no further action", machineSet.Name)
219209

220-
var err error
221-
machineSet, err = updateMachineSetStatus(r.Client, machineSet, machineSetCopy.Status)
222-
if err != nil && !apierrors.IsNotFound(err) {
223-
klog.Errorf("%v: error updating status: %v", machineSetName, err)
224-
return reconcile.Result{}, fmt.Errorf("error updating status: %w", err)
225-
} else if apierrors.IsNotFound(err) {
226210
return reconcile.Result{}, nil
227-
}
228211

229-
klog.Infof("%v: setting paused to false and continuing reconcile", machineSetName)
212+
case machinev1.MachineAuthorityMachineAPI:
213+
// The authority is MachineAPI and the resource should not be paused.
214+
desiredCondition := conditions.FalseCondition(
215+
machine.PausedCondition, machine.NotPausedConditionReason, machinev1.ConditionSeverityInfo, "%s",
216+
fmt.Sprintf("The AuthoritativeAPI status is set to '%s'", string(machineSet.Status.AuthoritativeAPI)),
217+
)
218+
219+
if updated, err := r.ensureUpdatedPausedCondition(machineSet, desiredCondition,
220+
fmt.Sprintf("%v: machine set .status.authoritativeAPI is set to '%s', unpausing machine set", machineSet.Name, machineSet.Status.AuthoritativeAPI)); err != nil {
221+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
222+
} else if updated {
223+
klog.Infof("%v: setting machine set paused condition to false", machineSet.Name)
224+
}
225+
226+
// Fallthrough and continue reconciliation.
227+
default:
228+
klog.Errorf("%v: invalid .status.authoritativeAPI '%s'", machineSet.Name, machineSet.Status.AuthoritativeAPI)
229+
return reconcile.Result{}, nil // Do not return an error to avoid immediate requeue.
230+
}
230231
}
231232

232233
result, err := r.reconcile(ctx, machineSet)
@@ -329,6 +330,23 @@ func (r *ReconcileMachineSet) reconcile(ctx context.Context, machineSet *machine
329330
return reconcile.Result{}, nil
330331
}
331332

333+
// ensureUpdatedPausedCondition updates the paused condition if needed.
334+
func (r *ReconcileMachineSet) ensureUpdatedPausedCondition(ms *machinev1.MachineSet, desiredCondition *machinev1.Condition, logMessage string) (bool, error) {
335+
newMs := ms.DeepCopy()
336+
if !conditions.IsEquivalentTo(conditions.Get(ms, machine.PausedCondition), desiredCondition) {
337+
klog.Info(logMessage)
338+
conditions.Set(newMs, desiredCondition)
339+
if _, err := updateMachineSetStatus(r.Client, ms, newMs.Status); err != nil {
340+
klog.Errorf("%v: error updating status: %v", newMs.Name, err)
341+
return false, fmt.Errorf("error updating status for machine set %s: %w", newMs.Name, err)
342+
}
343+
344+
return true, nil
345+
}
346+
347+
return false, nil
348+
}
349+
332350
// syncReplicas essentially scales machine resources up and down.
333351
func (r *ReconcileMachineSet) syncReplicas(ms *machinev1.MachineSet, machines []*machinev1.Machine) error {
334352
if ms.Spec.Replicas == nil {

0 commit comments

Comments
 (0)