Skip to content

Commit 3c9b33f

Browse files
nojnhuhk8s-infra-cherrypick-robot
authored andcommitted
fix irrecoverable errors in async operations
Reconciliations involving asynchronous operations can fall into a loop where an eventual "Failed" result can block future reconciliations from making any further changes to that particular resource to fix the problem. This change sets `longRunningOperationStates` for agent pools on the corresponding AzureManagedMachinePool instead of the AzureManagedControlPlane, since changes to that resource were not being persisted. It also only blocks starting new operations on the status of existing operations of the same type. In-progress PUT operations will no longer block new DELETEs and in-progress DELETEs will not block in-progress PUTs. In cases where polling a future from the Azure API would eventually return both `isDone==true` and a non-nil error, a "failed checking if the operation was complete" message would be logged and the error would refer to the ultimate failure unrelated to polling the future itself. This change treats all `isDone==true` polling checks as successful and relies on the operation's error to be captured in the future's `Result`. The future will always be deleted from the status when the operation is done so it can be retried the next reconciliation if it failed.
1 parent dc0c196 commit 3c9b33f

File tree

67 files changed

+384
-449
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

67 files changed

+384
-449
lines changed

azure/interfaces.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,8 +84,8 @@ type ClusterDescriber interface {
8484
// AsyncStatusUpdater is an interface used to keep track of long running operations in Status that has Conditions and Futures.
8585
type AsyncStatusUpdater interface {
8686
SetLongRunningOperationState(*infrav1.Future)
87-
GetLongRunningOperationState(string, string) *infrav1.Future
88-
DeleteLongRunningOperationState(string, string)
87+
GetLongRunningOperationState(string, string, string) *infrav1.Future
88+
DeleteLongRunningOperationState(string, string, string)
8989
UpdatePutStatus(clusterv1.ConditionType, string, error)
9090
UpdateDeleteStatus(clusterv1.ConditionType, string, error)
9191
UpdatePatchStatus(clusterv1.ConditionType, string, error)

azure/mock_azure/azure_mock.go

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

azure/scope/cluster.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -914,13 +914,13 @@ func (s *ClusterScope) SetLongRunningOperationState(future *infrav1.Future) {
914914
}
915915

916916
// GetLongRunningOperationState will get the future on the AzureCluster status.
917-
func (s *ClusterScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
918-
return futures.Get(s.AzureCluster, name, service)
917+
func (s *ClusterScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
918+
return futures.Get(s.AzureCluster, name, service, futureType)
919919
}
920920

921921
// DeleteLongRunningOperationState will delete the future from the AzureCluster status.
922-
func (s *ClusterScope) DeleteLongRunningOperationState(name, service string) {
923-
futures.Delete(s.AzureCluster, name, service)
922+
func (s *ClusterScope) DeleteLongRunningOperationState(name, service, futureType string) {
923+
futures.Delete(s.AzureCluster, name, service, futureType)
924924
}
925925

926926
// UpdateDeleteStatus updates a condition on the AzureCluster status after a DELETE operation.

azure/scope/machine.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -359,9 +359,9 @@ func (m *MachineScope) Subnet() infrav1.SubnetSpec {
359359

360360
// AvailabilityZone returns the AzureMachine Availability Zone.
361361
// Priority for selecting the AZ is
362-
// 1) Machine.Spec.FailureDomain
363-
// 2) AzureMachine.Spec.FailureDomain (This is to support deprecated AZ)
364-
// 3) No AZ
362+
// 1. Machine.Spec.FailureDomain
363+
// 2. AzureMachine.Spec.FailureDomain (This is to support deprecated AZ)
364+
// 3. No AZ
365365
func (m *MachineScope) AvailabilityZone() string {
366366
if m.Machine.Spec.FailureDomain != nil {
367367
return *m.Machine.Spec.FailureDomain
@@ -687,13 +687,13 @@ func (m *MachineScope) SetLongRunningOperationState(future *infrav1.Future) {
687687
}
688688

689689
// GetLongRunningOperationState will get the future on the AzureMachine status.
690-
func (m *MachineScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
691-
return futures.Get(m.AzureMachine, name, service)
690+
func (m *MachineScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
691+
return futures.Get(m.AzureMachine, name, service, futureType)
692692
}
693693

694694
// DeleteLongRunningOperationState will delete the future from the AzureMachine status.
695-
func (m *MachineScope) DeleteLongRunningOperationState(name, service string) {
696-
futures.Delete(m.AzureMachine, name, service)
695+
func (m *MachineScope) DeleteLongRunningOperationState(name, service, futureType string) {
696+
futures.Delete(m.AzureMachine, name, service, futureType)
697697
}
698698

699699
// UpdateDeleteStatus updates a condition on the AzureMachine status after a DELETE operation.

azure/scope/machinepool.go

Lines changed: 7 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -297,7 +297,9 @@ func (m *MachinePoolScope) applyAzureMachinePoolMachines(ctx context.Context) er
297297
return nil
298298
}
299299

300-
if futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName) {
300+
if futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName, infrav1.PatchFuture) ||
301+
futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName, infrav1.PutFuture) ||
302+
futures.Has(m.AzureMachinePool, m.Name(), ScalesetsServiceName, infrav1.DeleteFuture) {
301303
log.V(4).Info("exiting early due an in-progress long running operation on the ScaleSet")
302304
// exit early to be less greedy about delete
303305
return nil
@@ -377,13 +379,13 @@ func (m *MachinePoolScope) SetLongRunningOperationState(future *infrav1.Future)
377379
}
378380

379381
// GetLongRunningOperationState will get the future on the AzureMachinePool status.
380-
func (m *MachinePoolScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
381-
return futures.Get(m.AzureMachinePool, name, service)
382+
func (m *MachinePoolScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
383+
return futures.Get(m.AzureMachinePool, name, service, futureType)
382384
}
383385

384386
// DeleteLongRunningOperationState will delete the future from the AzureMachinePool status.
385-
func (m *MachinePoolScope) DeleteLongRunningOperationState(name, service string) {
386-
futures.Delete(m.AzureMachinePool, name, service)
387+
func (m *MachinePoolScope) DeleteLongRunningOperationState(name, service, futureType string) {
388+
futures.Delete(m.AzureMachinePool, name, service, futureType)
387389
}
388390

389391
// setProvisioningStateAndConditions sets the AzureMachinePool provisioning state and conditions.

azure/scope/machinepoolmachine.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -170,13 +170,13 @@ func (s *MachinePoolMachineScope) SetLongRunningOperationState(future *infrav1.F
170170
}
171171

172172
// GetLongRunningOperationState will get the future on the AzureMachinePoolMachine status.
173-
func (s *MachinePoolMachineScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
174-
return futures.Get(s.AzureMachinePoolMachine, name, service)
173+
func (s *MachinePoolMachineScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
174+
return futures.Get(s.AzureMachinePoolMachine, name, service, futureType)
175175
}
176176

177177
// DeleteLongRunningOperationState will delete the future from the AzureMachinePoolMachine status.
178-
func (s *MachinePoolMachineScope) DeleteLongRunningOperationState(name, service string) {
179-
futures.Delete(s.AzureMachinePoolMachine, name, service)
178+
func (s *MachinePoolMachineScope) DeleteLongRunningOperationState(name, service, futureType string) {
179+
futures.Delete(s.AzureMachinePoolMachine, name, service, futureType)
180180
}
181181

182182
// UpdateDeleteStatus updates a condition on the AzureMachinePoolMachine status after a DELETE operation.

azure/scope/managedcontrolplane.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -561,13 +561,13 @@ func (s *ManagedControlPlaneScope) SetLongRunningOperationState(future *infrav1.
561561
}
562562

563563
// GetLongRunningOperationState will get the future on the AzureManagedControlPlane status.
564-
func (s *ManagedControlPlaneScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
565-
return futures.Get(s.ControlPlane, name, service)
564+
func (s *ManagedControlPlaneScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
565+
return futures.Get(s.ControlPlane, name, service, futureType)
566566
}
567567

568568
// DeleteLongRunningOperationState will delete the future from the AzureManagedControlPlane status.
569-
func (s *ManagedControlPlaneScope) DeleteLongRunningOperationState(name, service string) {
570-
futures.Delete(s.ControlPlane, name, service)
569+
func (s *ManagedControlPlaneScope) DeleteLongRunningOperationState(name, service, futureType string) {
570+
futures.Delete(s.ControlPlane, name, service, futureType)
571571
}
572572

573573
// UpdateDeleteStatus updates a condition on the AzureManagedControlPlane status after a DELETE operation.

azure/scope/managedmachinepool.go

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -221,20 +221,20 @@ func (s *ManagedMachinePoolScope) SetAgentPoolReady(ready bool) {
221221
s.InfraMachinePool.Status.Ready = ready
222222
}
223223

224-
// SetLongRunningOperationState will set the future on the AzureManagedControlPlane status to allow the resource to continue
224+
// SetLongRunningOperationState will set the future on the AzureManagedMachinePool status to allow the resource to continue
225225
// in the next reconciliation.
226226
func (s *ManagedMachinePoolScope) SetLongRunningOperationState(future *infrav1.Future) {
227-
futures.Set(s.ControlPlane, future)
227+
futures.Set(s.InfraMachinePool, future)
228228
}
229229

230-
// GetLongRunningOperationState will get the future on the AzureManagedControlPlane status.
231-
func (s *ManagedMachinePoolScope) GetLongRunningOperationState(name, service string) *infrav1.Future {
232-
return futures.Get(s.ControlPlane, name, service)
230+
// GetLongRunningOperationState will get the future on the AzureManagedMachinePool status.
231+
func (s *ManagedMachinePoolScope) GetLongRunningOperationState(name, service, futureType string) *infrav1.Future {
232+
return futures.Get(s.InfraMachinePool, name, service, futureType)
233233
}
234234

235-
// DeleteLongRunningOperationState will delete the future from the AzureManagedControlPlane status.
236-
func (s *ManagedMachinePoolScope) DeleteLongRunningOperationState(name, service string) {
237-
futures.Delete(s.ControlPlane, name, service)
235+
// DeleteLongRunningOperationState will delete the future from the AzureManagedMachinePool status.
236+
func (s *ManagedMachinePoolScope) DeleteLongRunningOperationState(name, service, futureType string) {
237+
futures.Delete(s.InfraMachinePool, name, service, futureType)
238238
}
239239

240240
// UpdateDeleteStatus updates a condition on the AzureManagedControlPlane status after a DELETE operation.

azure/services/agentpools/client.go

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -131,12 +131,7 @@ func (ac *azureClient) IsDone(ctx context.Context, future azureautorest.FutureAP
131131
ctx, _, done := tele.StartSpanWithLogger(ctx, "agentpools.azureClient.IsDone")
132132
defer done()
133133

134-
isDone, err = future.DoneWithContext(ctx, ac.agentpools)
135-
if err != nil {
136-
return false, errors.Wrap(err, "failed checking if the operation was complete")
137-
}
138-
139-
return isDone, nil
134+
return future.DoneWithContext(ctx, ac.agentpools)
140135
}
141136

142137
// Result fetches the result of a long-running operation future.

azure/services/agentpools/mock_agentpools/agentpools_mock.go

Lines changed: 8 additions & 8 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

0 commit comments

Comments
 (0)