Skip to content

Commit 94e5537

Browse files
committed
Lengthening initial backoff time for EndpointSlice controller
The EndpointSlice controller has the potential to manage a large number of resources that are updated frequently. Without proper backoffs in place, there is potential for it to unnecessarily overload the API Server with requests. This makes two significant changes: Increasing the base backoff from 5ms to 1s and making all syncs triggered by EndpointSlice changes delayed by at least 1 second to enable batching.
1 parent bf4cc5b commit 94e5537

File tree

2 files changed

+40
-8
lines changed

2 files changed

+40
-8
lines changed

pkg/controller/endpointslice/BUILD

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ go_library(
4141
"//staging/src/k8s.io/client-go/tools/record:go_default_library",
4242
"//staging/src/k8s.io/client-go/util/workqueue:go_default_library",
4343
"//staging/src/k8s.io/component-base/metrics/prometheus/ratelimiter:go_default_library",
44+
"//vendor/golang.org/x/time/rate:go_default_library",
4445
"//vendor/k8s.io/klog:go_default_library",
4546
"//vendor/k8s.io/utils/net:go_default_library",
4647
],

pkg/controller/endpointslice/endpointslice_controller.go

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -20,6 +20,8 @@ import (
2020
"fmt"
2121
"time"
2222

23+
"golang.org/x/time/rate"
24+
2325
v1 "k8s.io/api/core/v1"
2426
discovery "k8s.io/api/discovery/v1beta1"
2527
apierrors "k8s.io/apimachinery/pkg/api/errors"
@@ -47,13 +49,24 @@ const (
4749
// maxRetries is the number of times a service will be retried before it is
4850
// dropped out of the queue. Any sync error, such as a failure to create or
4951
// update an EndpointSlice could trigger a retry. With the current
50-
// rate-limiter in use (5ms*2^(maxRetries-1)) the following numbers
51-
// represent the sequence of delays between successive queuings of a
52-
// service.
52+
// rate-limiter in use (1s*2^(numRetries-1)) the following numbers represent
53+
// the sequence of delays between successive queuings of a service.
5354
//
54-
// 5ms, 10ms, 20ms, 40ms, 80ms, 160ms, 320ms, 640ms, 1.3s, 2.6s, 5.1s,
55-
// 10.2s, 20.4s, 41s, 82s
55+
// 1s, 2s, 4s, 8s, 16s, 32s, 64s, 100s (max)
5656
maxRetries = 15
57+
58+
// endpointSliceChangeMinSyncDelay indicates the minimum delay before
59+
// queuing a syncService call after an EndpointSlice changes. If
60+
// endpointUpdatesBatchPeriod is greater than this value, it will be used
61+
// instead. This helps batch processing of changes to multiple
62+
// EndpointSlices.
63+
endpointSliceChangeMinSyncDelay = 1 * time.Second
64+
65+
// defaultSyncBackOff is the default backoff period for syncService calls.
66+
defaultSyncBackOff = 1 * time.Second
67+
// maxSyncBackOff is the max backoff period for syncService calls.
68+
maxSyncBackOff = 100 * time.Second
69+
5770
// controllerName is a unique value used with LabelManagedBy to indicate
5871
// the component managing an EndpointSlice.
5972
controllerName = "endpointslice-controller.k8s.io"
@@ -80,8 +93,19 @@ func NewController(podInformer coreinformers.PodInformer,
8093
endpointslicemetrics.RegisterMetrics()
8194

8295
c := &Controller{
83-
client: client,
84-
queue: workqueue.NewNamedRateLimitingQueue(workqueue.DefaultControllerRateLimiter(), "endpoint_slice"),
96+
client: client,
97+
// This is similar to the DefaultControllerRateLimiter, just with a
98+
// significantly higher default backoff (1s vs 5ms). This controller
99+
// processes events that can require significant EndpointSlice changes,
100+
// such as an update to a Service or Deployment. A more significant
101+
// rate limit back off here helps ensure that the Controller does not
102+
// overwhelm the API Server.
103+
queue: workqueue.NewNamedRateLimitingQueue(workqueue.NewMaxOfRateLimiter(
104+
workqueue.NewItemExponentialFailureRateLimiter(defaultSyncBackOff, maxSyncBackOff),
105+
// 10 qps, 100 bucket size. This is only for retry speed and it's
106+
// only the overall factor (not per item).
107+
&workqueue.BucketRateLimiter{Limiter: rate.NewLimiter(rate.Limit(10), 100)},
108+
), "endpoint_slice"),
85109
workerLoopPeriod: time.Second,
86110
}
87111

@@ -409,7 +433,14 @@ func (c *Controller) queueServiceForEndpointSlice(endpointSlice *discovery.Endpo
409433
utilruntime.HandleError(fmt.Errorf("Couldn't get key for EndpointSlice %+v: %v", endpointSlice, err))
410434
return
411435
}
412-
c.queue.Add(key)
436+
437+
// queue after the max of endpointSliceChangeMinSyncDelay and
438+
// endpointUpdatesBatchPeriod.
439+
delay := endpointSliceChangeMinSyncDelay
440+
if c.endpointUpdatesBatchPeriod > delay {
441+
delay = c.endpointUpdatesBatchPeriod
442+
}
443+
c.queue.AddAfter(key, delay)
413444
}
414445

415446
func (c *Controller) addPod(obj interface{}) {

0 commit comments

Comments
 (0)