@@ -37,6 +37,7 @@ import (
37
37
resourceinformers "k8s.io/client-go/informers/resource/v1alpha2"
38
38
"k8s.io/client-go/kubernetes"
39
39
"k8s.io/client-go/tools/cache"
40
+ "k8s.io/client-go/util/flowcontrol"
40
41
"k8s.io/client-go/util/workqueue"
41
42
"k8s.io/klog/v2"
42
43
drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
@@ -46,7 +47,10 @@ import (
46
47
const (
47
48
// resyncPeriod for informer
48
49
// TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
49
- resyncPeriod = time .Duration (10 * time .Minute )
50
+ resyncPeriod = time .Duration (10 * time .Minute )
51
+ retryPeriod = 5 * time .Second // first argument to flowcontrol.NewBackOffWithJitter in monitorPlugin (presumably the initial retry delay; confirm)
52
+ maxRetryPeriod = 180 * time .Second // second argument to flowcontrol.NewBackOffWithJitter in monitorPlugin (presumably the upper bound for the retry delay; confirm)
53
+ backoffFactor = 2.0 // third argument to flowcontrol.NewBackOffWithJitter in monitorPlugin. NOTE(review): that parameter looks like a maximum jitter factor rather than the exponential multiplier; confirm against the client-go flowcontrol docs and consider renaming.
50
54
)
51
55
52
56
// nodeResourcesController collects resource information from all registered
@@ -185,6 +189,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
185
189
logger .Info ("Stopping to monitor node resources of the plugin" , "reason" , context .Cause (ctx ), "err" , ctx .Err (), "recover" , r )
186
190
}()
187
191
192
+ backOff := flowcontrol .NewBackOffWithJitter (retryPeriod , maxRetryPeriod , backoffFactor )
193
+ backOffID := "retry"
194
+
188
195
// Keep trying until canceled.
189
196
for ctx .Err () == nil {
190
197
logger .V (5 ).Info ("Calling NodeListAndWatchResources" )
@@ -197,9 +204,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
197
204
default :
198
205
// This is a problem, report it and retry.
199
206
logger .Error (err , "Creating gRPC stream for node resources failed" )
200
- // TODO (https://github.com/kubernetes/kubernetes/issues/123689): expontential backoff?
201
207
select {
202
- case <- time .After (5 * time .Second ):
208
+ case <- time .After (backOff .Get (backOffID )):
209
+ backOff .Next (backOffID , time .Now ())
203
210
case <- ctx .Done ():
204
211
}
205
212
}
@@ -219,9 +226,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
219
226
case ctx .Err () == nil :
220
227
// This is a problem, report it and retry.
221
228
logger .Error (err , "Reading node resources from gRPC stream failed" )
222
- // TODO (https://github.com/kubernetes/kubernetes/issues/123689): expontential backoff?
223
229
select {
224
- case <- time .After (5 * time .Second ):
230
+ case <- time .After (backOff .Get (backOffID )):
231
+ backOff .Next (backOffID , time .Now ())
225
232
case <- ctx .Done ():
226
233
}
227
234
}
0 commit comments