Skip to content

Commit fad52ae

Browse files
authored
Merge pull request kubernetes#125086 from oxxenix/exponential-backoff
add exponential backoff in NodeResourceSlices controller
2 parents 1c84623 + c4ec248 commit fad52ae

File tree

1 file changed

+12
-5
lines changed

1 file changed

+12
-5
lines changed

pkg/kubelet/cm/dra/plugin/noderesources.go

Lines changed: 12 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -37,6 +37,7 @@ import (
3737
resourceinformers "k8s.io/client-go/informers/resource/v1alpha2"
3838
"k8s.io/client-go/kubernetes"
3939
"k8s.io/client-go/tools/cache"
40+
"k8s.io/client-go/util/flowcontrol"
4041
"k8s.io/client-go/util/workqueue"
4142
"k8s.io/klog/v2"
4243
drapb "k8s.io/kubelet/pkg/apis/dra/v1alpha3"
@@ -46,7 +47,10 @@ import (
4647
const (
4748
// resyncPeriod for informer
4849
// TODO (https://github.com/kubernetes/kubernetes/issues/123688): disable?
49-
resyncPeriod = time.Duration(10 * time.Minute)
50+
resyncPeriod = time.Duration(10 * time.Minute)
51+
retryPeriod = 5 * time.Second
52+
maxRetryPeriod = 180 * time.Second
53+
backoffFactor = 2.0 // maximum jitter factor passed to flowcontrol.NewBackOffWithJitter (not the backoff growth multiplier)
5054
)
5155

5256
// nodeResourcesController collects resource information from all registered
@@ -185,6 +189,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
185189
logger.Info("Stopping to monitor node resources of the plugin", "reason", context.Cause(ctx), "err", ctx.Err(), "recover", r)
186190
}()
187191

192+
backOff := flowcontrol.NewBackOffWithJitter(retryPeriod, maxRetryPeriod, backoffFactor)
193+
backOffID := "retry"
194+
188195
// Keep trying until canceled.
189196
for ctx.Err() == nil {
190197
logger.V(5).Info("Calling NodeListAndWatchResources")
@@ -197,9 +204,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
197204
default:
198205
// This is a problem, report it and retry.
199206
logger.Error(err, "Creating gRPC stream for node resources failed")
200-
// TODO (https://github.com/kubernetes/kubernetes/issues/123689): exponential backoff?
201207
select {
202-
case <-time.After(5 * time.Second):
208+
case <-time.After(backOff.Get(backOffID)):
209+
backOff.Next(backOffID, time.Now())
203210
case <-ctx.Done():
204211
}
205212
}
@@ -219,9 +226,9 @@ func (c *nodeResourcesController) monitorPlugin(ctx context.Context, active *act
219226
case ctx.Err() == nil:
220227
// This is a problem, report it and retry.
221228
logger.Error(err, "Reading node resources from gRPC stream failed")
222-
// TODO (https://github.com/kubernetes/kubernetes/issues/123689): exponential backoff?
223229
select {
224-
case <-time.After(5 * time.Second):
230+
case <-time.After(backOff.Get(backOffID)):
231+
backOff.Next(backOffID, time.Now())
225232
case <-ctx.Done():
226233
}
227234
}

0 commit comments

Comments
 (0)