Skip to content

Commit e5c9b41

Browse files
jmealoarmruleonardoce
authored
feat: classify known Azure CSI snapshot errors as retriable (cloudnative-pg#6906)
Fixes: cloudnative-pg#6901 Signed-off-by: Jeff Mealo <jmealo@protonmail.com> Signed-off-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Signed-off-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com> Co-authored-by: Armando Ruocco <armando.ruocco@enterprisedb.com> Co-authored-by: Leonardo Cecchi <leonardo.cecchi@enterprisedb.com>
1 parent 0b97551 commit e5c9b41

File tree

4 files changed

+132
-16
lines changed

4 files changed

+132
-16
lines changed

.wordlist-en-custom.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1147,6 +1147,7 @@ rehydrated
11471147
rehydration
11481148
relabelings
11491149
relatime
1150+
retryable
11501151
replicationSecretVersion
11511152
replicationSlots
11521153
replicationTLSSecret
Lines changed: 89 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,89 @@
1+
/*
2+
Copyright The CloudNativePG Contributors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package volumesnapshot
18+
19+
import (
20+
"regexp"
21+
"strconv"
22+
"strings"
23+
)
24+
25+
var (
26+
retryableStatusCodes = []int{408, 429, 500, 502, 503, 504}
27+
httpStatusCodeRegex = regexp.MustCompile(`HTTPStatusCode:\s(\d{3})`)
28+
)
29+
30+
// isRetriableErrorMessage detects if a certain error message belongs
31+
// to a retriable error or not. This is obviously an heuristic but
32+
// unfortunately we don't have that information exposed in the
33+
// Kubernetes VolumeSnapshot API and the CSI driver haven't that too.
34+
func isRetriableErrorMessage(msg string) bool {
35+
isRetryableFuncs := []func(string) bool{
36+
isExplicitlyRetriableError,
37+
isRetryableHTTPError,
38+
isConflictError,
39+
}
40+
41+
for _, isRetryableFunc := range isRetryableFuncs {
42+
if isRetryableFunc(msg) {
43+
return true
44+
}
45+
}
46+
47+
return false
48+
}
49+
50+
// isConflictError detects optimistic locking errors
51+
func isConflictError(msg string) bool {
52+
// Obviously this is a heuristic, but unfortunately we don't have
53+
// the information we need.
54+
// We're trying to handle the cases where the external-snapshotter
55+
// controller failed on a conflict with the following error:
56+
//
57+
// > the object has been modified; please apply your changes to the
58+
// > latest version and try again
59+
60+
return strings.Contains(msg, "the object has been modified")
61+
}
62+
63+
// isExplicitlyRetriableError detects explicitly retriable errors as raised
64+
// by the Azure CSI driver. These errors contain the "Retriable: true"
65+
// string.
66+
func isExplicitlyRetriableError(msg string) bool {
67+
return strings.Contains(msg, "Retriable: true")
68+
}
69+
70+
// isRetryableHTTPError, will return a retry on the following status codes:
71+
// - 408: Request Timeout
72+
// - 429: Too Many Requests
73+
// - 500: Internal Server Error
74+
// - 502: Bad Gateway
75+
// - 503: Service Unavailable
76+
// - 504: Gateway Timeout
77+
func isRetryableHTTPError(msg string) bool {
78+
if matches := httpStatusCodeRegex.FindStringSubmatch(msg); len(matches) == 2 {
79+
if code, err := strconv.Atoi(matches[1]); err == nil {
80+
for _, retryableCode := range retryableStatusCodes {
81+
if code == retryableCode {
82+
return true
83+
}
84+
}
85+
}
86+
}
87+
88+
return false
89+
}
Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
/*
2+
Copyright The CloudNativePG Contributors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package volumesnapshot
18+
19+
import (
20+
. "github.com/onsi/ginkgo/v2"
21+
. "github.com/onsi/gomega"
22+
)
23+
24+
var _ = Describe("Retriable error messages", func() {
25+
DescribeTable(
26+
"Retriable error messages",
27+
func(msg string, isRetriable bool) {
28+
Expect(isRetriableErrorMessage(msg)).To(Equal(isRetriable))
29+
},
30+
Entry("conflict", "Hey, the object has been modified!", true),
31+
Entry("non-retriable error", "VolumeSnapshotClass not found", false),
32+
Entry("explicitly retriable error", "Retriable: true, the storage is gone away forever", true),
33+
Entry("explicitly non-retriable error", "Retriable: false because my pod is working", false),
34+
Entry("error code 502 - retriable", "RetryAfter: 0s, HTTPStatusCode: 502, RawError: Internal Server Error", true),
35+
Entry("error code 404 - non retriable", "RetryAfter: 0s, HTTPStatusCode: 404, RawError: Not found", false),
36+
)
37+
})

pkg/reconciler/backup/volumesnapshot/resources.go

Lines changed: 5 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,6 @@ package volumesnapshot
1919
import (
2020
"context"
2121
"fmt"
22-
"strings"
2322

2423
storagesnapshotv1 "github.com/kubernetes-csi/external-snapshotter/client/v8/apis/volumesnapshot/v1"
2524
"sigs.k8s.io/controller-runtime/pkg/client"
@@ -69,26 +68,16 @@ func (err volumeSnapshotError) Error() string {
6968
// IsRetryable returns true if the external snapshotter controller
7069
// will retry taking the snapshot
7170
func (err volumeSnapshotError) isRetryable() bool {
72-
if err.InternalError.Message == nil {
73-
return false
74-
}
75-
76-
// Obviously this is a heuristic, but unfortunately we don't have
77-
// the information we need.
78-
// We're trying to handle the cases where the external-snapshotter
79-
// controller failed on a conflict with the following error:
80-
//
81-
// > the object has been modified; please apply your changes to the
82-
// > latest version and try again
83-
8471
// TODO: instead of blindingly retry on matching errors, we
8572
// should enhance our CRD with a configurable deadline. After
8673
// the deadline have been met on err.InternalError.CreatedAt
8774
// the backup can be marked as failed
8875

89-
return strings.Contains(
90-
*err.InternalError.Message,
91-
"the object has been modified")
76+
if err.InternalError.Message == nil {
77+
return false
78+
}
79+
80+
return isRetriableErrorMessage(*err.InternalError.Message)
9281
}
9382

9483
// slice represents a slice of []storagesnapshotv1.VolumeSnapshot

0 commit comments

Comments
 (0)