Skip to content

Commit 81d86b5

Browse files
Merge pull request #1386 from damdo/fix-controllers-guard-on-empty-status-authoritativeAPI-take2
OCPCLOUD-2986,OCPCLOUD-2985,OCPBUGS-56849: Fix controllers guard on empty status authoritative api + defaulting
2 parents 41a1627 + 4421b2c commit 81d86b5

File tree

5 files changed

+208
-137
lines changed

5 files changed

+208
-137
lines changed

pkg/controller/machine/controller.go

Lines changed: 76 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -177,43 +177,60 @@ func (r *ReconcileMachine) Reconcile(ctx context.Context, request reconcile.Requ
177177
originalConditions := conditions.DeepCopyConditions(m.Status.Conditions)
178178

179179
if r.gate.Enabled(featuregate.Feature(openshiftfeatures.FeatureGateMachineAPIMigration)) {
180-
// Check Status.AuthoritativeAPI
181-
// If not MachineAPI. Set the paused condition true and return early.
182-
//
183-
// Once we have a webhook, we want to remove the check that the AuthoritativeAPI
184-
// field is populated.
185-
if m.Status.AuthoritativeAPI != "" &&
186-
m.Status.AuthoritativeAPI != machinev1.MachineAuthorityMachineAPI {
187-
conditions.Set(m, conditions.TrueConditionWithReason(
188-
PausedCondition,
189-
PausedConditionReason,
190-
"The AuthoritativeAPI is set to %s", string(m.Status.AuthoritativeAPI),
191-
))
192-
if patchErr := r.updateStatus(ctx, m, ptr.Deref(m.Status.Phase, ""), nil, originalConditions); patchErr != nil {
193-
klog.Errorf("%v: error patching status: %v", machineName, patchErr)
180+
switch m.Status.AuthoritativeAPI {
181+
case "":
182+
// An empty .status.authoritativeAPI normally means the resource has not yet been reconciled.
183+
// and that the value in .spec.authoritativeAPI has not been propagated to .status.authoritativeAPI yet.
184+
// This value can be set by two separate controllers, depending which one of them is running at that time,
185+
// or in case they are both running, which one gets to set it first (the operation is idempotent so there is no harm in racing).
186+
// - the cluster-capi-operator machine-api-migration's migration controller
187+
// - this controller
188+
189+
klog.Infof("%v: machine .status.authoritativeAPI is not yet set, setting it to .spec.authoritativeAPI", m.Name)
190+
191+
if err := r.patchStatusAuthoritativeAPI(ctx, m, m.Spec.AuthoritativeAPI); err != nil {
192+
klog.Errorf("%v: error patching status to set .status.authoritativeAPI for machine: %v", m.Name, err)
193+
return reconcile.Result{}, fmt.Errorf("error patching status to set .status.authoritativeAPI for machine %s: %w", m.Name, err)
194+
}
195+
196+
// Return to give a chance to the changes to get propagated.
197+
return reconcile.Result{}, nil
198+
199+
case machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating:
200+
// In cases when .status.authoritativeAPI is set to machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating
201+
// the resource should be paused and not reconciled further.
202+
desiredCondition := conditions.TrueConditionWithReason(
203+
PausedCondition, PausedConditionReason,
204+
"The AuthoritativeAPI status is set to '%s'", string(m.Status.AuthoritativeAPI),
205+
)
206+
207+
if _, err := r.ensureUpdatedPausedCondition(ctx, m, desiredCondition,
208+
fmt.Sprintf("%v: machine .status.authoritativeAPI is set to '%s', ensuring machine is paused", machineName, m.Status.AuthoritativeAPI)); err != nil {
209+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
194210
}
195211

196212
klog.Infof("%v: machine is paused, taking no further action", machineName)
213+
197214
return reconcile.Result{}, nil
198-
}
199215

200-
var pausedFalseReason string
201-
if m.Status.AuthoritativeAPI != "" {
202-
pausedFalseReason = fmt.Sprintf("The AuthoritativeAPI is set to %s", string(m.Status.AuthoritativeAPI))
203-
} else {
204-
pausedFalseReason = "The AuthoritativeAPI is not set"
205-
}
216+
case machinev1.MachineAuthorityMachineAPI:
217+
// The authority is MachineAPI and the resource should not be paused.
218+
desiredCondition := conditions.FalseCondition(
219+
PausedCondition, NotPausedConditionReason, machinev1.ConditionSeverityInfo, "%s",
220+
fmt.Sprintf("The AuthoritativeAPI status is set to '%s'", string(m.Status.AuthoritativeAPI)),
221+
)
222+
223+
if updated, err := r.ensureUpdatedPausedCondition(ctx, m, desiredCondition,
224+
fmt.Sprintf("%v: machine .status.authoritativeAPI is set to '%s', unpausing machine", machineName, m.Status.AuthoritativeAPI)); err != nil {
225+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
226+
} else if updated {
227+
klog.Infof("%v: setting machine paused condition to false", machineName)
228+
}
206229

207-
// Set the paused condition to false, continue reconciliation
208-
conditions.Set(m, conditions.FalseCondition(
209-
PausedCondition,
210-
NotPausedConditionReason,
211-
machinev1.ConditionSeverityInfo,
212-
"%s",
213-
pausedFalseReason,
214-
))
215-
if patchErr := r.updateStatus(ctx, m, ptr.Deref(m.Status.Phase, ""), nil, originalConditions); patchErr != nil {
216-
klog.Errorf("%v: error patching status: %v", machineName, patchErr)
230+
// Fallthrough and continue reconcilation.
231+
default:
232+
klog.Errorf("%v: invalid .status.authoritativeAPI '%s'", machineName, m.Status.AuthoritativeAPI)
233+
return reconcile.Result{}, nil // Do not return an error to avoid immediate requeue.
217234
}
218235
}
219236

@@ -433,6 +450,23 @@ func (r *ReconcileMachine) deleteNode(ctx context.Context, name string) error {
433450
return r.Client.Delete(ctx, &node)
434451
}
435452

453+
// ensureUpdatedPausedCondition updates the paused condition if needed.
454+
func (r *ReconcileMachine) ensureUpdatedPausedCondition(ctx context.Context, m *machinev1.Machine, desiredCondition *machinev1.Condition, logMessage string) (bool, error) {
455+
oldM := m.DeepCopy()
456+
if !conditions.IsEquivalentTo(conditions.Get(m, PausedCondition), desiredCondition) {
457+
klog.Info(logMessage)
458+
conditions.Set(m, desiredCondition)
459+
if err := r.updateStatus(ctx, m, ptr.Deref(m.Status.Phase, ""), nil, oldM.Status.Conditions); err != nil {
460+
klog.Errorf("%v: error updating status: %v", oldM.Name, err)
461+
return false, fmt.Errorf("error updating status for machine %s: %w", oldM.Name, err)
462+
}
463+
464+
return true, nil
465+
}
466+
467+
return false, nil
468+
}
469+
436470
func delayIfRequeueAfterError(err error) (reconcile.Result, error) {
437471
var requeueAfterError *RequeueAfterError
438472
if errors.As(err, &requeueAfterError) {
@@ -538,6 +572,17 @@ func (r *ReconcileMachine) updateStatus(ctx context.Context, machine *machinev1.
538572
return nil
539573
}
540574

575+
func (r *ReconcileMachine) patchStatusAuthoritativeAPI(ctx context.Context, machine *machinev1.Machine, authoritativeAPI machinev1.MachineAuthority) error {
576+
baseToPatch := client.MergeFrom(machine.DeepCopy())
577+
machine.Status.AuthoritativeAPI = authoritativeAPI
578+
579+
if err := r.Client.Status().Patch(ctx, machine, baseToPatch); err != nil {
580+
return fmt.Errorf("error patching machine status: %w", err)
581+
}
582+
583+
return nil
584+
}
585+
541586
func (r *ReconcileMachine) patchFailedMachineInstanceAnnotation(ctx context.Context, machine *machinev1.Machine) error {
542587
baseToPatch := client.MergeFrom(machine.DeepCopy())
543588
if machine.Annotations == nil {

pkg/controller/machine/machine_controller_test.go

Lines changed: 21 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -104,20 +104,15 @@ var _ = Describe("Machine Reconciler", func() {
104104
testutils.CleanupResources(Default, ctx, cfg, k8sClient, namespace.GetName(),
105105
&machinev1.Machine{},
106106
)
107-
108107
})
109108

110109
It("Should reconcile a Machine", func() {
111-
instance := machineBuilder.Build()
110+
instance := machineBuilder.WithAuthoritativeAPI(machinev1.MachineAuthorityMachineAPI).Build()
112111

113112
By("Creating the Machine")
114113
Expect(k8sClient.Create(ctx, instance)).To(Succeed())
115114

116-
By("Setting the AuthoritativeAPI to MachineAPI")
117-
Eventually(k.UpdateStatus(instance, func() {
118-
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityMachineAPI
119-
})).Should(Succeed())
120-
115+
By("Verying the Machine has been reconciled (status set)")
121116
Eventually(k.Object(instance), timeout).Should(HaveField("Status", Not(Equal(machinev1.MachineStatus{}))))
122117

123118
// // Expect platform status is not empty. This (check) means we've called the
@@ -127,27 +122,22 @@ var _ = Describe("Machine Reconciler", func() {
127122

128123
})
129124

130-
Describe("Paused condition", func() {
125+
Describe("Paused Condition", func() {
131126
var instance *machinev1.Machine
132127
BeforeEach(func() {
133-
instance = machineBuilder.Build()
128+
instance = machineBuilder.WithAuthoritativeAPI(machinev1.MachineAuthorityClusterAPI).Build()
134129
})
135130

136131
It("Should set the Paused condition appropriately", func() {
137-
138-
By("Creating the Machine")
132+
By("Creating the Machine (spec.authoritativeAPI=ClusterAPI)")
139133
Expect(k8sClient.Create(ctx, instance)).To(Succeed())
140134

141-
By("Setting the AuthoritativeAPI to ClusterAPI")
142-
Eventually(k.UpdateStatus(instance, func() {
143-
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityClusterAPI
144-
})).Should(Succeed())
145-
146-
By("Verifying the paused condition is approproately set to true")
135+
By("Verifying the status.authoritativeAPI has been set to ClusterAPI and the paused condition is appropriately set to true")
147136
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
148137
HaveField("Status.Conditions", ContainElement(SatisfyAll(
149138
HaveField("Type", Equal(PausedCondition)),
150139
HaveField("Status", Equal(corev1.ConditionTrue)),
140+
HaveField("Message", Equal("The AuthoritativeAPI status is set to 'ClusterAPI'")),
151141
))),
152142
HaveField("Status.AuthoritativeAPI", Equal(machinev1.MachineAuthorityClusterAPI)),
153143
))
@@ -189,36 +179,35 @@ var _ = Describe("Machine Reconciler", func() {
189179
})
190180
}()
191181

192-
By("Transitioning the AuthoritativeAPI though 'Migrating' to MachineAPI")
182+
By("Changing status.authoritativeAPI from ClusterAPI to Migrating")
193183
Eventually(k.UpdateStatus(instance, func() {
194184
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityMigrating
195185
})).Should(Succeed())
196186

197-
By("Updating the AuthoritativeAPI from Migrating to MachineAPI")
187+
By("Verifying the paused condition is appropriately set to true while in Migrating")
188+
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
189+
HaveField("Status.Conditions", ContainElement(SatisfyAll(
190+
HaveField("Type", Equal(PausedCondition)),
191+
HaveField("Status", Equal(corev1.ConditionTrue)),
192+
HaveField("Message", Equal("The AuthoritativeAPI status is set to 'Migrating'")),
193+
))),
194+
HaveField("Status.AuthoritativeAPI", Equal(machinev1.MachineAuthorityMigrating)),
195+
))
196+
197+
By("Changing status.authoritativeAPI from Migrating to MachineAPI")
198198
Eventually(k.UpdateStatus(instance, func() {
199199
instance.Status.AuthoritativeAPI = machinev1.MachineAuthorityMachineAPI
200200
})).Should(Succeed())
201201

202-
By("Verifying the paused condition is approproately set to false")
202+
By("Verifying the paused condition is appropriately set to false when in MachineAPI")
203203
Eventually(k.Object(instance), timeout).Should(SatisfyAll(
204204
HaveField("Status.Conditions", ContainElement(SatisfyAll(
205205
HaveField("Type", Equal(PausedCondition)),
206206
HaveField("Status", Equal(corev1.ConditionFalse)),
207+
HaveField("Message", Equal("The AuthoritativeAPI status is set to 'MachineAPI'")),
207208
))),
208209
HaveField("Status.AuthoritativeAPI", Equal(machinev1.MachineAuthorityMachineAPI)),
209210
))
210-
211-
By("Unsetting the AuthoritativeAPI field in the status")
212-
Eventually(k.UpdateStatus(instance, func() {
213-
instance.Status.AuthoritativeAPI = ""
214-
})).Should(Succeed())
215-
216-
By("Verifying the paused condition is still approproately set to false")
217-
Eventually(k.Object(instance), timeout).Should(HaveField("Status.Conditions", ContainElement(SatisfyAll(
218-
HaveField("Type", Equal(PausedCondition)),
219-
HaveField("Status", Equal(corev1.ConditionFalse)),
220-
HaveField("Message", Equal("The AuthoritativeAPI is not set")),
221-
))))
222211
})
223212
})
224213
})

pkg/controller/machineset/controller.go

Lines changed: 75 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -174,59 +174,61 @@ func (r *ReconcileMachineSet) Reconcile(ctx context.Context, request reconcile.R
174174
}
175175

176176
if r.gate.Enabled(featuregate.Feature(openshiftfeatures.FeatureGateMachineAPIMigration)) {
177-
machineSetName := machineSet.GetName()
178-
machineSetCopy := machineSet.DeepCopy()
179-
// Check Status.AuthoritativeAPI. If it's not set to MachineAPI. Set the
180-
// paused condition true and return early.
181-
//
182-
// Once we have a webhook, we want to remove the check that the AuthoritativeAPI
183-
// field is populated.
184-
if machineSet.Status.AuthoritativeAPI != "" &&
185-
machineSet.Status.AuthoritativeAPI != machinev1.MachineAuthorityMachineAPI {
186-
conditions.Set(machineSetCopy, conditions.TrueConditionWithReason(
187-
machine.PausedCondition,
188-
machine.PausedConditionReason,
189-
"The AuthoritativeAPI is set to %s", string(machineSet.Status.AuthoritativeAPI),
190-
))
191-
192-
_, err := updateMachineSetStatus(r.Client, machineSet, machineSetCopy.Status)
193-
if err != nil && !apierrors.IsNotFound(err) {
194-
klog.Errorf("%v: error updating status: %v", machineSetName, err)
195-
return reconcile.Result{}, fmt.Errorf("error updating status: %w", err)
196-
} else if apierrors.IsNotFound(err) {
197-
return reconcile.Result{}, nil
177+
switch machineSet.Status.AuthoritativeAPI {
178+
case "":
179+
// An empty .status.authoritativeAPI normally means the resource has not yet been reconciled.
180+
// and that the value in .spec.authoritativeAPI has not been propagated to .status.authoritativeAPI yet.
181+
// This value can be set by two separate controllers, depending which one of them is running at that time,
182+
// or in case they are both running, which one gets to set it first (the operation is idempotent so there is no harm in racing).
183+
// - the cluster-capi-operator machine-api-migration's migration controller
184+
// - this controller
185+
186+
klog.Infof("%v: machine set .status.authoritativeAPI is not yet set, setting it to .spec.authoritativeAPI", machineSet.Name)
187+
188+
if err := r.patchStatusAuthoritativeAPI(ctx, machineSet, machineSet.Spec.AuthoritativeAPI); err != nil {
189+
klog.Errorf("%v: error patching status to set .status.authoritativeAPI for machine set: %v", machineSet.Name, err)
190+
return reconcile.Result{}, fmt.Errorf("error patching status to set .status.authoritativeAPI for machine set %s: %w", machineSet.Name, err)
198191
}
199192

200-
klog.Infof("%v: machine set is paused, taking no further action", machineSetName)
193+
// Return to give a chance to the changes to get propagated.
201194
return reconcile.Result{}, nil
202-
}
203195

204-
var pausedFalseReason string
205-
if machineSet.Status.AuthoritativeAPI != "" {
206-
pausedFalseReason = fmt.Sprintf("The AuthoritativeAPI is set to %s", string(machineSet.Status.AuthoritativeAPI))
207-
} else {
208-
pausedFalseReason = "The AuthoritativeAPI is not set"
209-
}
196+
case machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating:
197+
// In case .status.authoritativeAPI is set to machinev1.MachineAuthorityClusterAPI, machinev1.MachineAuthorityMigrating
198+
// the resource should be paused and not reconciled further.
199+
desiredCondition := conditions.TrueConditionWithReason(
200+
machine.PausedCondition, machine.PausedConditionReason,
201+
"The AuthoritativeAPI status is set to '%s'", string(machineSet.Status.AuthoritativeAPI),
202+
)
203+
204+
if _, err := r.ensureUpdatedPausedCondition(machineSet, desiredCondition,
205+
fmt.Sprintf("%v: machine set .status.authoritativeAPI is set to '%s', ensuring machine set is paused", machineSet.Name, machineSet.Status.AuthoritativeAPI)); err != nil {
206+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
207+
}
210208

211-
// Set the paused condition to false, continue reconciliation
212-
conditions.Set(machineSetCopy, conditions.FalseCondition(
213-
machine.PausedCondition,
214-
machine.NotPausedConditionReason,
215-
machinev1.ConditionSeverityInfo,
216-
"%s",
217-
pausedFalseReason,
218-
))
209+
klog.Infof("%v: machine set is paused, taking no further action", machineSet.Name)
219210

220-
var err error
221-
machineSet, err = updateMachineSetStatus(r.Client, machineSet, machineSetCopy.Status)
222-
if err != nil && !apierrors.IsNotFound(err) {
223-
klog.Errorf("%v: error updating status: %v", machineSetName, err)
224-
return reconcile.Result{}, fmt.Errorf("error updating status: %w", err)
225-
} else if apierrors.IsNotFound(err) {
226211
return reconcile.Result{}, nil
227-
}
228212

229-
klog.Infof("%v: setting paused to false and continuing reconcile", machineSetName)
213+
case machinev1.MachineAuthorityMachineAPI:
214+
// The authority is MachineAPI and the resource should not be paused.
215+
desiredCondition := conditions.FalseCondition(
216+
machine.PausedCondition, machine.NotPausedConditionReason, machinev1.ConditionSeverityInfo, "%s",
217+
fmt.Sprintf("The AuthoritativeAPI status is set to '%s'", string(machineSet.Status.AuthoritativeAPI)),
218+
)
219+
220+
if updated, err := r.ensureUpdatedPausedCondition(machineSet, desiredCondition,
221+
fmt.Sprintf("%v: machine set .status.authoritativeAPI is set to '%s', unpausing machine set", machineSet.Name, machineSet.Status.AuthoritativeAPI)); err != nil {
222+
return reconcile.Result{}, fmt.Errorf("failed to ensure paused condition: %w", err)
223+
} else if updated {
224+
klog.Infof("%v: setting machine set paused condition to false", machineSet.Name)
225+
}
226+
227+
// Fallthrough and continue reconciliation.
228+
default:
229+
klog.Errorf("%v: invalid .status.authoritativeAPI '%s'", machineSet.Name, machineSet.Status.AuthoritativeAPI)
230+
return reconcile.Result{}, nil // Do not return an error to avoid immediate requeue.
231+
}
230232
}
231233

232234
result, err := r.reconcile(ctx, machineSet)
@@ -329,6 +331,23 @@ func (r *ReconcileMachineSet) reconcile(ctx context.Context, machineSet *machine
329331
return reconcile.Result{}, nil
330332
}
331333

334+
// ensureUpdatedPausedCondition updates the paused condition if needed.
335+
func (r *ReconcileMachineSet) ensureUpdatedPausedCondition(ms *machinev1.MachineSet, desiredCondition *machinev1.Condition, logMessage string) (bool, error) {
336+
newMs := ms.DeepCopy()
337+
if !conditions.IsEquivalentTo(conditions.Get(ms, machine.PausedCondition), desiredCondition) {
338+
klog.Info(logMessage)
339+
conditions.Set(newMs, desiredCondition)
340+
if _, err := updateMachineSetStatus(r.Client, ms, newMs.Status); err != nil {
341+
klog.Errorf("%v: error updating status: %v", newMs.Name, err)
342+
return false, fmt.Errorf("error updating status for machine set %s: %w", newMs.Name, err)
343+
}
344+
345+
return true, nil
346+
}
347+
348+
return false, nil
349+
}
350+
332351
// syncReplicas essentially scales machine resources up and down.
333352
func (r *ReconcileMachineSet) syncReplicas(ms *machinev1.MachineSet, machines []*machinev1.Machine) error {
334353
if ms.Spec.Replicas == nil {
@@ -500,6 +519,17 @@ func (r *ReconcileMachineSet) waitForMachineDeletion(machineList []*machinev1.Ma
500519
return nil
501520
}
502521

522+
func (r *ReconcileMachineSet) patchStatusAuthoritativeAPI(ctx context.Context, machineSet *machinev1.MachineSet, authoritativeAPI machinev1.MachineAuthority) error {
523+
baseToPatch := client.MergeFrom(machineSet.DeepCopy())
524+
machineSet.Status.AuthoritativeAPI = authoritativeAPI
525+
526+
if err := r.Client.Status().Patch(ctx, machineSet, baseToPatch); err != nil {
527+
return fmt.Errorf("error patching machine set status: %w", err)
528+
}
529+
530+
return nil
531+
}
532+
503533
func validateMachineset(m *machinev1.MachineSet) field.ErrorList {
504534
errors := field.ErrorList{}
505535

0 commit comments

Comments
 (0)