Skip to content

Commit 50c8f73

Browse files
authored
Merge pull request kubernetes#88017 from feiskyer/fix-409
Make Azure clients only retry on specified HTTP status codes
2 parents de9bbcc + 6a48772 commit 50c8f73

File tree

7 files changed

+86
-47
lines changed

7 files changed

+86
-47
lines changed

staging/src/k8s.io/legacy-cloud-providers/azure/azure.go

Lines changed: 5 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -504,23 +504,17 @@ func (az *Cloud) InitializeCloudFromConfig(config *Config, fromSecret bool) erro
504504
az.VirtualMachineSizesClient = vmsizeclient.New(azClientConfig.WithRateLimiter(config.VirtualMachineSizeRateLimit))
505505
az.SnapshotsClient = snapshotclient.New(azClientConfig.WithRateLimiter(config.SnapshotRateLimit))
506506
az.StorageAccountClient = storageaccountclient.New(azClientConfig.WithRateLimiter(config.StorageAccountRateLimit))
507-
507+
az.VirtualMachinesClient = vmclient.New(azClientConfig.WithRateLimiter(config.VirtualMachineRateLimit))
508+
az.DisksClient = diskclient.New(azClientConfig.WithRateLimiter(config.DiskRateLimit))
508509
// fileClient is not based on armclient, but it's still backoff retried.
509-
az.FileClient = newAzureFileClient(env, azClientConfig.Backoff.WithNonRetriableErrors([]string{}, []int{http.StatusNotFound}))
510-
511-
vmClientConfig := azClientConfig.WithRateLimiter(config.VirtualMachineRateLimit)
512-
vmClientConfig.Backoff = vmClientConfig.Backoff.WithNonRetriableErrors([]string{}, []int{http.StatusConflict})
513-
az.VirtualMachinesClient = vmclient.New(vmClientConfig)
510+
az.FileClient = newAzureFileClient(env, azClientConfig.Backoff)
514511

515512
// Error "not an active Virtual Machine Scale Set VM" is not retriable for VMSS VM.
513+
// But http.StatusNotFound is retriable because of ARM replication latency.
516514
vmssVMClientConfig := azClientConfig.WithRateLimiter(config.VirtualMachineScaleSetRateLimit)
517-
vmssVMClientConfig.Backoff = vmssVMClientConfig.Backoff.WithNonRetriableErrors([]string{vmssVMNotActiveErrorMessage}, []int{http.StatusConflict})
515+
vmssVMClientConfig.Backoff = vmssVMClientConfig.Backoff.WithNonRetriableErrors([]string{vmssVMNotActiveErrorMessage}).WithRetriableHTTPStatusCodes([]int{http.StatusNotFound})
518516
az.VirtualMachineScaleSetVMsClient = vmssvmclient.New(vmssVMClientConfig)
519517

520-
disksClientConfig := azClientConfig.WithRateLimiter(config.DiskRateLimit)
521-
disksClientConfig.Backoff = disksClientConfig.Backoff.WithNonRetriableErrors([]string{}, []int{http.StatusNotFound, http.StatusConflict})
522-
az.DisksClient = diskclient.New(disksClientConfig)
523-
524518
if az.MaximumLoadBalancerRuleCount == 0 {
525519
az.MaximumLoadBalancerRuleCount = maximumLoadBalancerRuleCount
526520
}

staging/src/k8s.io/legacy-cloud-providers/azure/azure_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -801,7 +801,7 @@ func TestReconcileSecurityGroupEtagMismatch(t *testing.T) {
801801
HTTPStatusCode: http.StatusPreconditionFailed,
802802
RawError: errPreconditionFailedEtagMismatch,
803803
}
804-
assert.Equal(t, err, expectedError.Error())
804+
assert.Equal(t, expectedError.Error(), err)
805805
}
806806

807807
func TestReconcilePublicIPWithNewService(t *testing.T) {

staging/src/k8s.io/legacy-cloud-providers/azure/clients/armclient/azure_armclient_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -154,7 +154,7 @@ func TestSendAsync(t *testing.T) {
154154
assert.Nil(t, response)
155155
assert.Equal(t, 1, count)
156156
assert.NotNil(t, rerr)
157-
assert.Equal(t, true, rerr.Retriable)
157+
assert.Equal(t, false, rerr.Retriable)
158158
}
159159

160160
func TestNormalizeAzureRegion(t *testing.T) {
@@ -236,7 +236,7 @@ func TestDeleteResourceAsync(t *testing.T) {
236236
count := 0
237237
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
238238
count++
239-
http.Error(w, "failed", http.StatusForbidden)
239+
http.Error(w, "failed", http.StatusInternalServerError)
240240
}))
241241

242242
backoff := &retry.Backoff{Steps: 3}

staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error.go

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,15 @@ const (
3838
var (
3939
// The function to get current time.
4040
now = time.Now
41+
42+
// StatusCodesForRetry are a defined group of status code for which the client will retry.
43+
StatusCodesForRetry = []int{
44+
http.StatusRequestTimeout, // 408
45+
http.StatusInternalServerError, // 500
46+
http.StatusBadGateway, // 502
47+
http.StatusServiceUnavailable, // 503
48+
http.StatusGatewayTimeout, // 504
49+
}
4150
)
4251

4352
// Error indicates an error returned by Azure APIs.
@@ -184,18 +193,21 @@ func getHTTPStatusCode(resp *http.Response) int {
184193
// shouldRetryHTTPRequest determines if the request is retriable.
185194
func shouldRetryHTTPRequest(resp *http.Response, err error) bool {
186195
if resp != nil {
187-
// HTTP 412 (StatusPreconditionFailed) means etag mismatch
188-
// HTTP 400 (BadRequest) means the request cannot be accepted, hence we shouldn't retry.
189-
if resp.StatusCode == http.StatusPreconditionFailed || resp.StatusCode == http.StatusBadRequest {
190-
return false
196+
for _, code := range StatusCodesForRetry {
197+
if resp.StatusCode == code {
198+
return true
199+
}
191200
}
192201

193-
// HTTP 4xx (except 412) or 5xx suggests we should retry.
194-
if 399 < resp.StatusCode && resp.StatusCode < 600 {
202+
// should retry on <200, error>.
203+
if isSuccessHTTPResponse(resp) && err != nil {
195204
return true
196205
}
206+
207+
return false
197208
}
198209

210+
// should retry when error is not nil and no http.Response.
199211
if err != nil {
200212
return true
201213
}
@@ -224,6 +236,24 @@ func getRetryAfter(resp *http.Response) time.Duration {
224236
return dur
225237
}
226238

239+
// GetErrorWithRetriableHTTPStatusCodes gets an error with RetriableHTTPStatusCodes.
240+
// It is used to retry on some HTTPStatusCodes.
241+
func GetErrorWithRetriableHTTPStatusCodes(resp *http.Response, err error, retriableHTTPStatusCodes []int) *Error {
242+
rerr := GetError(resp, err)
243+
if rerr == nil {
244+
return nil
245+
}
246+
247+
for _, code := range retriableHTTPStatusCodes {
248+
if rerr.HTTPStatusCode == code {
249+
rerr.Retriable = true
250+
break
251+
}
252+
}
253+
254+
return rerr
255+
}
256+
227257
// GetStatusNotFoundAndForbiddenIgnoredError gets an error with StatusNotFound and StatusForbidden ignored.
228258
// It is only used in DELETE operations.
229259
func GetStatusNotFoundAndForbiddenIgnoredError(resp *http.Response, err error) *Error {

staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_error_test.go

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ func TestGetError(t *testing.T) {
7373
code: http.StatusSeeOther,
7474
err: fmt.Errorf("some error"),
7575
expected: &Error{
76-
Retriable: true,
76+
Retriable: false,
7777
HTTPStatusCode: http.StatusSeeOther,
7878
RawError: fmt.Errorf("some error"),
7979
},
@@ -82,7 +82,7 @@ func TestGetError(t *testing.T) {
8282
code: http.StatusTooManyRequests,
8383
retryAfter: 100,
8484
expected: &Error{
85-
Retriable: true,
85+
Retriable: false,
8686
HTTPStatusCode: http.StatusTooManyRequests,
8787
RetryAfter: now().Add(100 * time.Second),
8888
RawError: fmt.Errorf("some error"),
@@ -156,7 +156,7 @@ func TestGetStatusNotFoundAndForbiddenIgnoredError(t *testing.T) {
156156
code: http.StatusSeeOther,
157157
err: fmt.Errorf("some error"),
158158
expected: &Error{
159-
Retriable: true,
159+
Retriable: false,
160160
HTTPStatusCode: http.StatusSeeOther,
161161
RawError: fmt.Errorf("some error"),
162162
},
@@ -165,7 +165,7 @@ func TestGetStatusNotFoundAndForbiddenIgnoredError(t *testing.T) {
165165
code: http.StatusTooManyRequests,
166166
retryAfter: 100,
167167
expected: &Error{
168-
Retriable: true,
168+
Retriable: false,
169169
HTTPStatusCode: http.StatusTooManyRequests,
170170
RetryAfter: now().Add(100 * time.Second),
171171
RawError: fmt.Errorf("some error"),

staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry.go

Lines changed: 12 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -58,8 +58,8 @@ type Backoff struct {
5858
Cap time.Duration
5959
// The errors indicate that the request shouldn't do more retrying.
6060
NonRetriableErrors []string
61-
// The HTTPStatusCode indicates that the request shouldn't do more retrying.
62-
NonRetriableHTTPStatusCodes []int
61+
// The RetriableHTTPStatusCodes indicates that the HTTPStatusCode should do more retrying.
62+
RetriableHTTPStatusCodes []int
6363
}
6464

6565
// NewBackoff creates a new Backoff.
@@ -73,11 +73,17 @@ func NewBackoff(duration time.Duration, factor float64, jitter float64, steps in
7373
}
7474
}
7575

76-
// WithNonRetriableErrors returns a new *Backoff with NonRetriableErrors, NonRetriableHTTPStatusCodes assigned.
77-
func (b *Backoff) WithNonRetriableErrors(errs []string, httpStatusCodes []int) *Backoff {
76+
// WithNonRetriableErrors returns a new *Backoff with NonRetriableErrors assigned.
77+
func (b *Backoff) WithNonRetriableErrors(errs []string) *Backoff {
7878
newBackoff := *b
7979
newBackoff.NonRetriableErrors = errs
80-
newBackoff.NonRetriableHTTPStatusCodes = httpStatusCodes
80+
return &newBackoff
81+
}
82+
83+
// WithRetriableHTTPStatusCodes returns a new *Backoff with RetriableHTTPStatusCode assigned.
84+
func (b *Backoff) WithRetriableHTTPStatusCodes(httpStatusCodes []int) *Backoff {
85+
newBackoff := *b
86+
newBackoff.RetriableHTTPStatusCodes = httpStatusCodes
8187
return &newBackoff
8288
}
8389

@@ -93,11 +99,6 @@ func (b *Backoff) isNonRetriableError(rerr *Error) bool {
9399
}
94100
}
95101

96-
for _, code := range b.NonRetriableHTTPStatusCodes {
97-
if rerr.HTTPStatusCode == code {
98-
return true
99-
}
100-
}
101102
return false
102103
}
103104

@@ -162,7 +163,7 @@ func doBackoffRetry(s autorest.Sender, r *http.Request, backoff *Backoff) (resp
162163
return
163164
}
164165
resp, err = s.Do(rr.Request())
165-
rerr := GetError(resp, err)
166+
rerr := GetErrorWithRetriableHTTPStatusCodes(resp, err, backoff.RetriableHTTPStatusCodes)
166167
// Abort retries in the following scenarios:
167168
// 1) request succeed
168169
// 2) request is not retriable

staging/src/k8s.io/legacy-cloud-providers/azure/retry/azure_retry_test.go

Lines changed: 26 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -75,11 +75,6 @@ func TestStep(t *testing.T) {
7575
}
7676

7777
func TestDoBackoffRetry(t *testing.T) {
78-
backoff := &Backoff{
79-
Factor: 1.0,
80-
Steps: 3,
81-
NonRetriableHTTPStatusCodes: []int{http.StatusNotFound},
82-
}
8378
fakeRequest := &http.Request{
8479
URL: &url.URL{
8580
Host: "localhost",
@@ -96,7 +91,7 @@ func TestDoBackoffRetry(t *testing.T) {
9691
HTTPStatusCode: 500,
9792
RawError: fmt.Errorf("HTTP status code (500)"),
9893
}
99-
resp, err := doBackoffRetry(client, fakeRequest, backoff)
94+
resp, err := doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3})
10095
assert.NotNil(t, resp)
10196
assert.Equal(t, 500, resp.StatusCode)
10297
assert.Equal(t, expectedErr.Error(), err)
@@ -106,7 +101,7 @@ func TestDoBackoffRetry(t *testing.T) {
106101
r = mocks.NewResponseWithStatus("200 OK", http.StatusOK)
107102
client = mocks.NewSender()
108103
client.AppendAndRepeatResponse(r, 1)
109-
resp, err = doBackoffRetry(client, fakeRequest, backoff)
104+
resp, err = doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3})
110105
assert.Nil(t, err)
111106
assert.Equal(t, 1, client.Attempts())
112107
assert.NotNil(t, resp)
@@ -117,28 +112,47 @@ func TestDoBackoffRetry(t *testing.T) {
117112
client = mocks.NewSender()
118113
client.AppendAndRepeatResponse(r, 1)
119114
expectedErr = &Error{
120-
Retriable: true,
115+
Retriable: false,
121116
HTTPStatusCode: 429,
122117
RawError: fmt.Errorf("HTTP status code (429)"),
123118
}
124-
resp, err = doBackoffRetry(client, fakeRequest, backoff)
119+
resp, err = doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3})
125120
assert.Equal(t, expectedErr.Error(), err)
126121
assert.Equal(t, 1, client.Attempts())
127122
assert.NotNil(t, resp)
128123
assert.Equal(t, 429, resp.StatusCode)
129124

130-
// retry on non retriable error
125+
// don't retry on non retriable error
131126
r = mocks.NewResponseWithStatus("404 StatusNotFound", http.StatusNotFound)
132127
client = mocks.NewSender()
133128
client.AppendAndRepeatResponse(r, 1)
134129
expectedErr = &Error{
135-
Retriable: true,
130+
Retriable: false,
136131
HTTPStatusCode: 404,
137132
RawError: fmt.Errorf("HTTP status code (404)"),
138133
}
139-
resp, err = doBackoffRetry(client, fakeRequest, backoff)
134+
resp, err = doBackoffRetry(client, fakeRequest, &Backoff{Factor: 1.0, Steps: 3})
140135
assert.NotNil(t, resp)
141136
assert.Equal(t, 404, resp.StatusCode)
142137
assert.Equal(t, expectedErr.Error(), err)
143138
assert.Equal(t, 1, client.Attempts())
139+
140+
// retry on RetriableHTTPStatusCodes
141+
r = mocks.NewResponseWithStatus("102 StatusProcessing", http.StatusProcessing)
142+
client = mocks.NewSender()
143+
client.AppendAndRepeatResponse(r, 3)
144+
expectedErr = &Error{
145+
Retriable: true,
146+
HTTPStatusCode: 102,
147+
RawError: fmt.Errorf("HTTP status code (102)"),
148+
}
149+
resp, err = doBackoffRetry(client, fakeRequest, &Backoff{
150+
Factor: 1.0,
151+
Steps: 3,
152+
RetriableHTTPStatusCodes: []int{http.StatusProcessing},
153+
})
154+
assert.NotNil(t, resp)
155+
assert.Equal(t, 102, resp.StatusCode)
156+
assert.Equal(t, expectedErr.Error(), err)
157+
assert.Equal(t, 3, client.Attempts())
144158
}

0 commit comments

Comments
 (0)