
Commit b06160e

Merge pull request #987 from DataDog/fricounet/upstream/fix-controller-startup-time
Fix the max duration to wait for CRDs in the snapshot-controller
2 parents 612c89b + 72b51c0

File tree: 3 files changed (+30 −31 lines)

README.md
cmd/snapshot-controller/main.go
deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml

README.md

Lines changed: 4 additions & 4 deletions
@@ -131,19 +131,19 @@ Read more about how to install the example webhook [here](deploy/kubernetes/webh

 ##### Volume Snapshot Classes

-* There can only be a single default volume snapshot class for a particular driver.
+* There can only be a single default volume snapshot class for a particular driver.

 ### Distributed Snapshotting

 The distributed snapshotting feature is provided to handle snapshot operations for local volumes. To use this functionality, the snapshotter sidecar should be deployed along with the csi driver on each node so that every node manages the snapshot operations only for the volumes local to that node. This feature can be enabled by setting the following command line options to true:

 #### Snapshot controller option

-* `--enable-distributed-snapshotting`: This option lets the snapshot controller know that distributed snapshotting is enabled and the snapshotter sidecar will be running on each node. Off by default.
+* `--enable-distributed-snapshotting`: This option lets the snapshot controller know that distributed snapshotting is enabled and the snapshotter sidecar will be running on each node. Off by default.

 #### CSI external snapshotter sidecar option

-* `--node-deployment`: Enables the snapshotter sidecar to handle snapshot operations for the volumes local to the node on which it is deployed. Off by default.
+* `--node-deployment`: Enables the snapshotter sidecar to handle snapshot operations for the volumes local to the node on which it is deployed. Off by default.

 Other than this, the NODE_NAME environment variable must be set where the CSI snapshotter sidecar is deployed. The value of NODE_NAME should be the name of the node where the sidecar is running.

@@ -174,7 +174,7 @@ Other than this, the NODE_NAME environment variable must be set where the CSI sn

 * `--retry-interval-max`: Maximum retry interval of failed volume snapshot creation or deletion. Default value is 5 minutes.

-* `--retry-crd-interval-max`: Maximum retry interval for detecting the snapshot CRDs on controller startup. Default is 5 seconds.
+* `--retry-crd-interval-max`: Maximum retry duration for detecting the snapshot CRDs on controller startup. Default is 30 seconds.

 * `--enable-distributed-snapshotting` : Enables each node to handle snapshots for the volumes local to that node. Off by default. It should be set to true only if `--node-deployment` parameter for the csi external snapshotter sidecar is set to true. See https://github.com/kubernetes-csi/external-snapshotter/blob/master/README.md#distributed-snapshotting for details.
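Why the wording moved from "retry interval" to "retry duration" and the default from 5 to 30 seconds: with the previous step-based backoff (100 ms initial interval, factor 1.5), a 5-second "maximum interval" worked out to roughly ten list attempts and about 7.5 seconds of total waiting, the "around 7.5 seconds" mentioned in the old deployment manifest comment. The option now expresses an overall wait duration, so the 30-second default directly bounds how long the controller may block at startup before giving up on the CRDs.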

cmd/snapshot-controller/main.go

Lines changed: 21 additions & 23 deletions
@@ -74,46 +74,49 @@ var (
     preventVolumeModeConversion = flag.Bool("prevent-volume-mode-conversion", true, "Prevents an unauthorised user from modifying the volume mode when creating a PVC from an existing VolumeSnapshot.")
     enableVolumeGroupSnapshots = flag.Bool("enable-volume-group-snapshots", false, "Enables the volume group snapshot feature, allowing the user to create a snapshot of a group of volumes.")

-    retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 5*time.Second, "Maximum retry interval to wait for CRDs to appear. The default is 5 seconds.")
+    retryCRDIntervalMax = flag.Duration("retry-crd-interval-max", 30*time.Second, "Maximum time to wait for CRDs to appear. The default is 30 seconds.")
 )

 var version = "unknown"

-// Checks that the VolumeSnapshot v1 CRDs exist.
+// Checks that the VolumeSnapshot v1 CRDs exist. It will wait at most the duration specified by retryCRDIntervalMax
 func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVolumeGroupSnapshots bool) error {
-    condition := func() (bool, error) {
+    condition := func(ctx context.Context) (bool, error) {
         var err error
+        // List calls should return faster with a limit of 0.
+        // We do not care about what is returned and just want to make sure the CRDs exist.
+        listOptions := metav1.ListOptions{Limit: 0}

         // scoping to an empty namespace makes `List` work across all namespaces
-        _, err = client.SnapshotV1().VolumeSnapshots("").List(context.TODO(), metav1.ListOptions{})
+        _, err = client.SnapshotV1().VolumeSnapshots("").List(ctx, listOptions)
         if err != nil {
             klog.Errorf("Failed to list v1 volumesnapshots with error=%+v", err)
             return false, nil
         }

-        _, err = client.SnapshotV1().VolumeSnapshotClasses().List(context.TODO(), metav1.ListOptions{})
+        _, err = client.SnapshotV1().VolumeSnapshotClasses().List(ctx, listOptions)
         if err != nil {
             klog.Errorf("Failed to list v1 volumesnapshotclasses with error=%+v", err)
             return false, nil
         }
-        _, err = client.SnapshotV1().VolumeSnapshotContents().List(context.TODO(), metav1.ListOptions{})
+        _, err = client.SnapshotV1().VolumeSnapshotContents().List(ctx, listOptions)
        if err != nil {
             klog.Errorf("Failed to list v1 volumesnapshotcontents with error=%+v", err)
             return false, nil
         }
         if enableVolumeGroupSnapshots {
-            _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(context.TODO(), metav1.ListOptions{})
+            _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshots("").List(ctx, listOptions)
             if err != nil {
                 klog.Errorf("Failed to list v1alpha1 volumegroupsnapshots with error=%+v", err)
                 return false, nil
             }

-            _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(context.TODO(), metav1.ListOptions{})
+            _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotClasses().List(ctx, listOptions)
             if err != nil {
                 klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotclasses with error=%+v", err)
                 return false, nil
             }
-            _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(context.TODO(), metav1.ListOptions{})
+            _, err = client.GroupsnapshotV1alpha1().VolumeGroupSnapshotContents().List(ctx, listOptions)
             if err != nil {
                 klog.Errorf("Failed to list v1alpha1 volumegroupsnapshotcontents with error=%+v", err)
                 return false, nil
@@ -123,24 +126,19 @@ func ensureCustomResourceDefinitionsExist(client *clientset.Clientset, enableVol
         return true, nil
     }

-    // The maximum retry duration = initial duration * retry factor ^ # steps. Rearranging, this gives
-    // # steps = log(maximum retry / initial duration) / log(retry factor).
     const retryFactor = 1.5
-    const initialDurationMs = 100
-    maxMs := retryCRDIntervalMax.Milliseconds()
-    if maxMs < initialDurationMs {
-        maxMs = initialDurationMs
-    }
-    steps := int(math.Ceil(math.Log(float64(maxMs)/initialDurationMs) / math.Log(retryFactor)))
-    if steps < 1 {
-        steps = 1
-    }
+    const initialDuration = 100 * time.Millisecond
     backoff := wait.Backoff{
-        Duration: initialDurationMs * time.Millisecond,
+        Duration: initialDuration,
         Factor:   retryFactor,
-        Steps:    steps,
+        Steps:    math.MaxInt32, // effectively no limit until the timeout is reached
     }
-    if err := wait.ExponentialBackoff(backoff, condition); err != nil {
+
+    // Sanity check to make sure we have a minimum duration of 1 second to work with
+    maxBackoffDuration := max(*retryCRDIntervalMax, 1*time.Second)
+    ctx, cancel := context.WithTimeout(context.Background(), maxBackoffDuration)
+    defer cancel()
+    if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil {
         return err
     }
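To make the new startup behaviour concrete, here is a minimal standalone sketch (not part of this commit) of an exponential backoff bounded by a context timeout, using the k8s.io/apimachinery wait package that the controller already imports. The stub condition and names such as maxWait and attempts are illustrative stand-ins for the controller's real CRD List calls.

package main

import (
    "context"
    "fmt"
    "math"
    "time"

    "k8s.io/apimachinery/pkg/util/wait"
)

func main() {
    // maxWait plays the role of the --retry-crd-interval-max flag; 30s is the new default.
    maxWait := 30 * time.Second
    if maxWait < time.Second {
        maxWait = time.Second // same 1-second sanity floor the controller applies
    }

    backoff := wait.Backoff{
        Duration: 100 * time.Millisecond, // first retry interval
        Factor:   1.5,                    // each subsequent interval grows by 50%
        Steps:    math.MaxInt32,          // no step cap; the context timeout ends the wait
    }

    // Stub condition: pretend the CRDs become visible on the third attempt.
    attempts := 0
    condition := func(ctx context.Context) (bool, error) {
        attempts++
        return attempts >= 3, nil
    }

    ctx, cancel := context.WithTimeout(context.Background(), maxWait)
    defer cancel()

    if err := wait.ExponentialBackoffWithContext(ctx, backoff, condition); err != nil {
        fmt.Printf("CRDs did not appear within %v: %v\n", maxWait, err)
        return
    }
    fmt.Printf("CRDs found after %d attempts\n", attempts)
}

Unlike the removed code, which derived a step count from the maximum interval, the context timeout bounds the total wait directly, so the flag value corresponds one-to-one to the longest time startup can block.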

deploy/kubernetes/snapshot-controller/setup-snapshot-controller.yaml

Lines changed: 5 additions & 4 deletions
@@ -16,10 +16,11 @@ spec:
   selector:
     matchLabels:
       app.kubernetes.io/name: snapshot-controller
-  # the snapshot controller won't be marked as ready if the v1 CRDs are unavailable
-  # in #504 the snapshot-controller will exit after around 7.5 seconds if it
-  # can't find the v1 CRDs so this value should be greater than that
-  minReadySeconds: 15
+  # The snapshot controller won't be marked as ready if the v1 CRDs are unavailable.
+  # The flag --retry-crd-interval-max is used to determine how long the controller
+  # will wait for the CRDs to become available before exiting. The default is 30 seconds
+  # so minReadySeconds should be set slightly higher than the flag value.
+  minReadySeconds: 35
   strategy:
     rollingUpdate:
       maxSurge: 0
