Skip to content

Commit 9a0a059

Browse files
authored
Merge pull request #1032 from shysank/feature/1014
Handle terminal errors to prevent reconciliation failure loop
2 parents 1b99af8 + 37b8332 commit 9a0a059

File tree

7 files changed

+105
-4
lines changed

7 files changed

+105
-4
lines changed

cloud/errors.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@ package azure
1919
import (
2020
"errors"
2121
"fmt"
22+
"time"
2223

2324
"github.com/Azure/go-autorest/autorest"
2425
)
@@ -41,3 +42,66 @@ type VMDeletedError struct {
4142
func (vde VMDeletedError) Error() string {
4243
return fmt.Sprintf("VM with provider id %q has been deleted", vde.ProviderID)
4344
}
45+
46+
// ReconcileError represents an error that is not automatically recoverable
47+
// errorType indicates what type of action is required to recover. It can take two values
48+
// 1. `Transient` - Can be recovered through manual intervention, will be requeued after
49+
// 2. `Terminal` - Cannot be recovered, will not be requeued
50+
type ReconcileError struct {
51+
error
52+
errorType ReconcileErrorType
53+
requestAfter time.Duration
54+
}
55+
56+
// ReconcileErrorType represents the type of a ReconcileError
57+
type ReconcileErrorType string
58+
59+
const (
60+
// TransientErrorType can be recovered, will be requeued after a configured time interval
61+
TransientErrorType ReconcileErrorType = "Transient"
62+
// TerminalErrorType cannot be recovered, will not be requeued
63+
TerminalErrorType ReconcileErrorType = "Terminal"
64+
)
65+
66+
// Error returns the error message for a ReconcileError
67+
func (t ReconcileError) Error() string {
68+
var errStr string
69+
if t.error != nil {
70+
errStr = t.error.Error()
71+
}
72+
switch t.errorType {
73+
case TransientErrorType:
74+
return fmt.Sprintf("reconcile error occurred that can be recovered. Object will be requeued after %s "+
75+
"The actual error is: %s", t.requestAfter.String(), errStr)
76+
case TerminalErrorType:
77+
return fmt.Sprintf("reconcile error occurred that cannot be recovered. Object will not be requeued. "+
78+
"The actual error is: %s", errStr)
79+
default:
80+
return fmt.Sprintf("reconcile error occurred with unknown recovery type. The actual error is: %s", errStr)
81+
}
82+
}
83+
84+
// IsTransient returns if the ReconcileError is recoverable
85+
func (t ReconcileError) IsTransient() bool {
86+
return t.errorType == TransientErrorType
87+
}
88+
89+
// IsTerminal returns if the ReconcileError is recoverable
90+
func (t ReconcileError) IsTerminal() bool {
91+
return t.errorType == TerminalErrorType
92+
}
93+
94+
// RequeueAfter returns requestAfter value
95+
func (t ReconcileError) RequeueAfter() time.Duration {
96+
return t.requestAfter
97+
}
98+
99+
// WithTransientError wraps the error in a ReconcileError with errorType as `Transient`
100+
func WithTransientError(err error, requeueAfter time.Duration) ReconcileError {
101+
return ReconcileError{error: err, errorType: TransientErrorType, requestAfter: requeueAfter}
102+
}
103+
104+
// WithTerminalError wraps the error in a ReconcileError with errorType as `Terminal`
105+
func WithTerminalError(err error) ReconcileError {
106+
return ReconcileError{error: err, errorType: TerminalErrorType}
107+
}

cloud/services/scalesets/scalesets.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -407,7 +407,7 @@ func getSecurityProfile(vmssSpec azure.ScaleSetSpec, sku resourceskus.SKU) (*com
407407
}
408408

409409
if !sku.HasCapability(resourceskus.EncryptionAtHost) {
410-
return nil, errors.Errorf("encryption at host is not supported for VM type %s", vmssSpec.Size)
410+
return nil, azure.WithTerminalError(errors.Errorf("encryption at host is not supported for VM type %s", vmssSpec.Size))
411411
}
412412

413413
return &compute.SecurityProfile{

cloud/services/scalesets/scalesets_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -973,7 +973,7 @@ func TestReconcileVMSS(t *testing.T) {
973973
},
974974
{
975975
name: "creating a vmss with encryption at host enabled for unsupported VM type fails",
976-
expectedError: "encryption at host is not supported for VM type VM_SIZE",
976+
expectedError: "reconcile error occurred that cannot be recovered. Object will not be requeued. The actual error is: encryption at host is not supported for VM type VM_SIZE",
977977
expect: func(g *gomega.WithT, s *mock_scalesets.MockScaleSetScopeMockRecorder, m *mock_scalesets.MockClientMockRecorder) {
978978
s.ScaleSetSpec().Return(azure.ScaleSetSpec{
979979
Name: "my-vmss",

cloud/services/virtualmachines/virtualmachines.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -458,7 +458,7 @@ func getSecurityProfile(vmSpec azure.VMSpec, sku resourceskus.SKU) (*compute.Sec
458458
}
459459

460460
if !sku.HasCapability(resourceskus.EncryptionAtHost) {
461-
return nil, errors.Errorf("encryption at host is not supported for VM type %s", vmSpec.Size)
461+
return nil, azure.WithTerminalError(errors.Errorf("encryption at host is not supported for VM type %s", vmSpec.Size))
462462
}
463463

464464
return &compute.SecurityProfile{

cloud/services/virtualmachines/virtualmachines_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -823,7 +823,7 @@ func TestReconcileVM(t *testing.T) {
823823
m.Get(gomockinternal.AContext(), "my-rg", "my-vm").
824824
Return(compute.VirtualMachine{}, autorest.NewErrorWithResponse("", "", &http.Response{StatusCode: 404}, "Not found"))
825825
},
826-
ExpectedError: "encryption at host is not supported for VM type Standard_D2v3",
826+
ExpectedError: "reconcile error occurred that cannot be recovered. Object will not be requeued. The actual error is: encryption at host is not supported for VM type Standard_D2v3",
827827
SetupSKUs: func(svc *Service) {
828828
skus := []compute.ResourceSku{
829829
{

controllers/azuremachine_controller.go

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -279,6 +279,25 @@ func (r *AzureMachineReconciler) reconcileNormal(ctx context.Context, machineSco
279279
return reconcile.Result{}, errors.Wrapf(err, "failed to reconcile AzureMachine")
280280
}
281281

282+
// Handle transient and terminal errors
283+
var reconcileError azure.ReconcileError
284+
if errors.As(err, &reconcileError) {
285+
r.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "ReconcileError", errors.Wrapf(err, "failed to reconcile AzureMachine").Error())
286+
conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.VMProvisionFailedReason, clusterv1.ConditionSeverityError, err.Error())
287+
288+
if reconcileError.IsTerminal() {
289+
machineScope.Error(err, "failed to reconcile AzureMachine", "name", machineScope.Name())
290+
return reconcile.Result{}, nil
291+
}
292+
293+
if reconcileError.IsTransient() {
294+
machineScope.Error(err, "failed to reconcile AzureMachine", "name", machineScope.Name())
295+
return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
296+
}
297+
298+
return reconcile.Result{}, errors.Wrapf(err, "failed to reconcile AzureMachine")
299+
}
300+
282301
r.Recorder.Eventf(machineScope.AzureMachine, corev1.EventTypeWarning, "Error creating new AzureMachine", errors.Wrapf(err, "failed to reconcile AzureMachine").Error())
283302
conditions.MarkFalse(machineScope.AzureMachine, infrav1.VMRunningCondition, infrav1.VMProvisionFailedReason, clusterv1.ConditionSeverityError, err.Error())
284303
return reconcile.Result{}, errors.Wrapf(err, "failed to reconcile AzureMachine")

exp/controllers/azuremachinepool_controller.go

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ import (
4343
"sigs.k8s.io/controller-runtime/pkg/source"
4444

4545
infrav1 "sigs.k8s.io/cluster-api-provider-azure/api/v1alpha3"
46+
azure "sigs.k8s.io/cluster-api-provider-azure/cloud"
4647
"sigs.k8s.io/cluster-api-provider-azure/cloud/scope"
4748
infracontroller "sigs.k8s.io/cluster-api-provider-azure/controllers"
4849
infrav1exp "sigs.k8s.io/cluster-api-provider-azure/exp/api/v1alpha3"
@@ -258,6 +259,23 @@ func (r *AzureMachinePoolReconciler) reconcileNormal(ctx context.Context, machin
258259

259260
err := ams.Reconcile(ctx)
260261
if err != nil {
262+
263+
// Handle transient and terminal errors
264+
var reconcileError azure.ReconcileError
265+
if errors.As(err, &reconcileError) {
266+
if reconcileError.IsTerminal() {
267+
machinePoolScope.Error(err, "failed to reconcile AzureMachinePool", "name", machinePoolScope.Name())
268+
return reconcile.Result{}, nil
269+
}
270+
271+
if reconcileError.IsTransient() {
272+
machinePoolScope.Error(err, "failed to reconcile AzureMachinePool", "name", machinePoolScope.Name())
273+
return reconcile.Result{RequeueAfter: reconcileError.RequeueAfter()}, nil
274+
}
275+
276+
return reconcile.Result{}, errors.Wrapf(err, "failed to reconcile AzureMachinePool")
277+
}
278+
261279
return reconcile.Result{}, err
262280
}
263281

0 commit comments

Comments
 (0)