Skip to content

Commit 8b60b4a

Browse files
authored
feat: allow backoff with a rate limiter in the work applier (#113)
1 parent 1c794de commit 8b60b4a

File tree

7 files changed

+2017
-60
lines changed

7 files changed

+2017
-60
lines changed

cmd/memberagent/main.go

Lines changed: 49 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -87,14 +87,21 @@ var (
8787
propertyProvider = flag.String("property-provider", "none", "The property provider to use for the agent.")
8888
region = flag.String("region", "", "The region where the member cluster resides.")
8989
cloudConfigFile = flag.String("cloud-config", "/etc/kubernetes/provider/config.json", "The path to the cloud cloudconfig file.")
90-
availabilityCheckInterval = flag.Int("availability-check-interval", 5, "The interval in seconds between attempts to check for resource availability when resources are not yet available.")
91-
driftDetectionInterval = flag.Int("drift-detection-interval", 15, "The interval in seconds between attempts to detect configuration drifts in the cluster.")
9290
watchWorkWithPriorityQueue = flag.Bool("enable-watch-work-with-priority-queue", false, "If set, the apply_work controller will watch/reconcile work objects that are created new or have recent updates")
9391
watchWorkReconcileAgeMinutes = flag.Int("watch-work-reconcile-age", 60, "maximum age (in minutes) of work objects for apply_work controller to watch/reconcile")
9492
deletionWaitTime = flag.Int("deletion-wait-time", 5, "The time the work-applier will wait for work object to be deleted before updating the applied work owner reference")
9593
enablePprof = flag.Bool("enable-pprof", false, "enable pprof profiling")
9694
pprofPort = flag.Int("pprof-port", 6065, "port for pprof profiling")
9795
hubPprofPort = flag.Int("hub-pprof-port", 6066, "port for hub pprof profiling")
96+
// Work applier requeue rate limiter settings.
97+
workApplierRequeueRateLimiterAttemptsWithFixedDelay = flag.Int("work-applier-requeue-rate-limiter-attempts-with-fixed-delay", 1, "If set, the work applier will requeue work objects with a fixed delay for the specified number of attempts before switching to exponential backoff.")
98+
workApplierRequeueRateLimiterFixedDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-fixed-delay-seconds", 5.0, "If set, the work applier will requeue work objects with this fixed delay in seconds for the specified number of attempts before switching to exponential backoff.")
99+
workApplierRequeueRateLimiterExponentialBaseForSlowBackoff = flag.Float64("work-applier-requeue-rate-limiter-exponential-base-for-slow-backoff", 1.2, "If set, the work applier will start to back off slowly at this factor after it finished requeueing with fixed delays, until it reaches the slow backoff delay cap. Its value should be larger than 1.0 and no larger than 100.0")
100+
workApplierRequeueRateLimiterInitialSlowBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-initial-slow-backoff-delay-seconds", 2, "If set, the work applier will start to back off slowly at this delay in seconds.")
101+
workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-max-slow-backoff-delay-seconds", 15, "If set, the work applier will not back off longer than this value in seconds when it is in the slow backoff stage.")
102+
// The fast backoff base must exceed the slow backoff base (1.2 by default); with 1.5 the delay grows from 15 seconds to the 15 minute cap in about 10 requeues (~42 minutes).
workApplierRequeueRateLimiterExponentialBaseForFastBackoff = flag.Float64("work-applier-requeue-rate-limiter-exponential-base-for-fast-backoff", 1.5, "If set, the work applier will start to back off fast at this factor after it completes the slow backoff stage, until it reaches the fast backoff delay cap. Its value should be larger than the base value for the slow backoff stage.")
103+
workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds = flag.Float64("work-applier-requeue-rate-limiter-max-fast-backoff-delay-seconds", 900, "If set, the work applier will not back off longer than this value in seconds when it is in the fast backoff stage.")
104+
workApplierRequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs = flag.Bool("work-applier-requeue-rate-limiter-skip-to-fast-backoff-for-available-or-diff-reported-work-objs", true, "If set, the rate limiter will skip the slow backoff stage and start fast backoff immediately for work objects that are available or have diff reported.")
98105
)
99106

100107
func init() {
@@ -382,6 +389,45 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb
382389
return err
383390
}
384391
// create the work controller, so we can pass it to the internal member cluster reconciler
392+
393+
// Set up the requeue rate limiter for the work applier.
394+
//
395+
// With default settings, the rate limiter will:
396+
// * allow 1 attempt of fixed delay; this helps give objects a bit of headroom to become available (or have
397+
// diffs reported).
398+
// * use a fixed delay of 5 seconds for the first attempt.
399+
//
400+
// Important (chenyu1): before the introduction of the requeue rate limiter, the work
401+
// applier used static requeue intervals, specifically 5 seconds (if the work object is unavailable),
402+
// and 15 seconds (if the work object is available). There are a number of test cases that
403+
// implicitly assume this behavior (e.g., a test case might expect that the availability check completes
404+
// within 10 seconds), which is why the rate limiter uses the 5 seconds fixed requeue delay by default.
405+
// If you need to change this value and see that some test cases begin to fail, update the test
406+
// cases accordingly.
407+
// * after completing all attempts with fixed delay, switch to slow exponential backoff with a base of
408+
// 1.2 with an initial delay of 2 seconds and a cap of 15 seconds (12 requeues in total, ~90 seconds in total);
409+
// this is to allow fast checkups in cases where objects are not yet available or have not yet reported diffs.
410+
// * after completing the slow backoff stage, switch to a fast exponential backoff with a base of 1.5
411+
// with an initial delay of 15 seconds and a cap of 15 minutes (10 requeues in total, ~42 minutes in total).
412+
// * for Work objects that are available or have diffs reported, skip the slow backoff stage and
413+
// start fast backoff immediately.
414+
//
415+
// The requeue pattern is essentially:
416+
// * 1 attempt of requeue with fixed delay (5 seconds); then
417+
// * 12 attempts of requeues with slow exponential backoff (factor of 1.2, ~90 seconds in total); then
418+
// * 10 attempts of requeues with fast exponential backoff (factor of 1.5, ~42 minutes in total);
419+
// * afterwards, requeue with a delay of 15 minutes indefinitely.
420+
requeueRateLimiter := workapplier.NewRequeueMultiStageWithExponentialBackoffRateLimiter(
421+
*workApplierRequeueRateLimiterAttemptsWithFixedDelay,
422+
*workApplierRequeueRateLimiterFixedDelaySeconds,
423+
*workApplierRequeueRateLimiterExponentialBaseForSlowBackoff,
424+
*workApplierRequeueRateLimiterInitialSlowBackoffDelaySeconds,
425+
*workApplierRequeueRateLimiterMaxSlowBackoffDelaySeconds,
426+
*workApplierRequeueRateLimiterExponentialBaseForFastBackoff,
427+
*workApplierRequeueRateLimiterMaxFastBackoffDelaySeconds,
428+
*workApplierRequeueRateLimiterSkipToFastBackoffForAvailableOrDiffReportedWorkObjs,
429+
)
430+
385431
workController := workapplier.NewReconciler(
386432
hubMgr.GetClient(),
387433
targetNS,
@@ -394,11 +440,10 @@ func Start(ctx context.Context, hubCfg, memberConfig *rest.Config, hubOpts, memb
394440
5,
395441
// Use the default worker count (4) for parallelized manifest processing.
396442
parallelizer.DefaultNumOfWorkers,
397-
time.Second*time.Duration(*availabilityCheckInterval),
398-
time.Second*time.Duration(*driftDetectionInterval),
399443
time.Minute*time.Duration(*deletionWaitTime),
400444
*watchWorkWithPriorityQueue,
401445
*watchWorkReconcileAgeMinutes,
446+
requeueRateLimiter,
402447
)
403448

404449
if err = workController.SetupWithManager(hubMgr); err != nil {

pkg/controllers/internalmembercluster/v1beta1/member_suite_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -379,7 +379,7 @@ var _ = BeforeSuite(func() {
379379

380380
// This controller is created for testing purposes only; no reconciliation loop is actually
381381
// run.
382-
workApplier1 = workapplier.NewReconciler(hubClient, member1ReservedNSName, nil, nil, nil, nil, 0, 1, time.Second*5, time.Second*5, time.Minute, true, 60)
382+
workApplier1 = workapplier.NewReconciler(hubClient, member1ReservedNSName, nil, nil, nil, nil, 0, 1, time.Minute, false, 60, nil)
383383

384384
propertyProvider1 = &manuallyUpdatedProvider{}
385385
member1Reconciler, err := NewReconciler(ctx, hubClient, member1Cfg, member1Client, workApplier1, propertyProvider1)
@@ -402,7 +402,7 @@ var _ = BeforeSuite(func() {
402402

403403
// This controller is created for testing purposes only; no reconciliation loop is actually
404404
// run.
405-
workApplier2 = workapplier.NewReconciler(hubClient, member2ReservedNSName, nil, nil, nil, nil, 0, 1, time.Second*5, time.Second*5, time.Minute, true, 60)
405+
workApplier2 = workapplier.NewReconciler(hubClient, member2ReservedNSName, nil, nil, nil, nil, 0, 1, time.Minute, false, 60, nil)
406406

407407
member2Reconciler, err := NewReconciler(ctx, hubClient, member2Cfg, member2Client, workApplier2, nil)
408408
Expect(err).NotTo(HaveOccurred())

0 commit comments

Comments
 (0)