From 93b414585332499590c3e7632cac6a37b6e75a34 Mon Sep 17 00:00:00 2001 From: godwinpang Date: Wed, 5 Nov 2025 15:58:19 -0800 Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20Add=20ReconcileTimeouts=20metric=20?= =?UTF-8?q?to=20track=20ReconciliationTimeout=20timeouts.?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This PR adds a new metric controller_runtime_reconcile_timeouts_total to track when a controller's reconciliation has reached a ReconciliationTimeout. This provides visibility into when reconcile operations time out due to the controller-runtime wrapper timeout, allowing users to alert / monitor unexpectedly long running controller reconiliations. --- pkg/internal/controller/controller.go | 22 +++- pkg/internal/controller/controller_test.go | 122 +++++++++++++++++++++ pkg/internal/controller/metrics/metrics.go | 10 ++ 3 files changed, 152 insertions(+), 2 deletions(-) diff --git a/pkg/internal/controller/controller.go b/pkg/internal/controller/controller.go index 7dd06957eb..f2638b9d9b 100644 --- a/pkg/internal/controller/controller.go +++ b/pkg/internal/controller/controller.go @@ -39,6 +39,10 @@ import ( "sigs.k8s.io/controller-runtime/pkg/source" ) +// errReconciliationTimeout is the error used as the cause when the ReconciliationTimeout guardrail fires. +// This allows us to distinguish wrapper timeouts from user-initiated context cancellations. +var errReconciliationTimeout = errors.New("reconciliation timeout") + // Options are the arguments for creating a new Controller. type Options[request comparable] struct { // Reconciler is a function that can be called at any time with the Name / Namespace of an object and @@ -207,13 +211,26 @@ func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ rec } }() + var timeoutCause error if c.ReconciliationTimeout > 0 { + timeoutCause = errReconciliationTimeout var cancel context.CancelFunc - ctx, cancel = context.WithTimeout(ctx, c.ReconciliationTimeout) + ctx, cancel = context.WithTimeoutCause(ctx, c.ReconciliationTimeout, timeoutCause) defer cancel() } - return c.Do.Reconcile(ctx, req) + res, err := c.Do.Reconcile(ctx, req) + + // Check if the reconciliation timed out due to our wrapper timeout guardrail. + // We check ctx.Err() == context.DeadlineExceeded first to ensure the context was actually + // cancelled due to a deadline (not parent cancellation or other reasons), then verify it was + // our specific timeout cause. This prevents false positives from parent context cancellations + // or other timeout scenarios. + if timeoutCause != nil && ctx.Err() == context.DeadlineExceeded && errors.Is(context.Cause(ctx), timeoutCause) { + ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Inc() + } + + return res, err } // Watch implements controller.Controller. @@ -437,6 +454,7 @@ func (c *Controller[request]) initMetrics() { ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0) ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0) ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0) + ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Add(0) ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles)) ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0) } diff --git a/pkg/internal/controller/controller_test.go b/pkg/internal/controller/controller_test.go index 6d62b80e22..259f6669e2 100644 --- a/pkg/internal/controller/controller_test.go +++ b/pkg/internal/controller/controller_test.go @@ -155,6 +155,128 @@ var _ = Describe("controller", func() { Expect(err).To(Equal(context.DeadlineExceeded)) }) + Context("prometheus metric reconcile_timeouts", func() { + var reconcileTimeouts dto.Metric + + BeforeEach(func() { + ctrlmetrics.ReconcileTimeouts.Reset() + reconcileTimeouts.Reset() + ctrl.Name = testControllerName + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0)) + }) + + It("should increment when ReconciliationTimeout context timeout hits DeadlineExceeded", func(ctx SpecContext) { + ctrl.ReconciliationTimeout = time.Duration(1) + ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + <-ctx.Done() + return reconcile.Result{}, ctx.Err() + }) + _, err := ctrl.Reconcile(ctx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) + Expect(err).To(HaveOccurred()) + Expect(err).To(Equal(context.DeadlineExceeded)) + + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(1.0)) + }) + + It("should not increment when user code cancels context earlier than the ReconciliationTimeout", func(ctx SpecContext) { + ctrl.ReconciliationTimeout = 10 * time.Second + userCancelCause := errors.New("user cancellation") + ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + // User code creates its own timeout with a different cause + userCtx, cancel := context.WithTimeoutCause(ctx, time.Millisecond, userCancelCause) + defer cancel() + <-userCtx.Done() + return reconcile.Result{}, context.Cause(userCtx) + }) + _, err := ctrl.Reconcile(ctx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) + Expect(err).To(HaveOccurred()) + Expect(errors.Is(err, userCancelCause)).To(BeTrue()) + + // Metric should not be incremented because the context timeout didn't hit DeadlineExceeded + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0)) + }) + + It("should not increment when reconciliation completes before timeout", func(ctx SpecContext) { + ctrl.ReconciliationTimeout = 10 * time.Second + ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + // Reconcile completes successfully before timeout + return reconcile.Result{}, nil + }) + _, err := ctrl.Reconcile(ctx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) + Expect(err).NotTo(HaveOccurred()) + + // Metric should not be incremented because the timeout was not exceeded + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0)) + }) + + It("should increment multiple times when multiple reconciles timeout", func(ctx SpecContext) { + ctrl.ReconciliationTimeout = time.Duration(1) + ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + <-ctx.Done() + return reconcile.Result{}, ctx.Err() + }) + + const numTimeouts = 3 + // Call Reconcile multiple times, each should timeout and increment the metric + for i := range numTimeouts { + _, err := ctrl.Reconcile(ctx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: fmt.Sprintf("bar%d", i)}}) + Expect(err).To(HaveOccurred()) + Expect(err).To(Equal(context.DeadlineExceeded)) + } + + // Metric should be incremented 3 times + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(float64(numTimeouts))) + }) + + It("should not increment when parent context is cancelled", func(ctx SpecContext) { + parentCtx, cancel := context.WithCancel(ctx) + defer cancel() + + ctrl.ReconciliationTimeout = 10 * time.Second + ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + // Wait for parent cancellation + <-ctx.Done() + return reconcile.Result{}, ctx.Err() + }) + + // Cancel parent context immediately + cancel() + + _, err := ctrl.Reconcile(parentCtx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) + Expect(err).To(HaveOccurred()) + Expect(err).To(Equal(context.Canceled)) + + // Metric should not be incremented because the wrapper timeout didn't fire + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0)) + }) + + It("should not increment when ReconciliationTimeout is zero", func(ctx SpecContext) { + // Ensure ReconciliationTimeout is zero (not set) + ctrl.ReconciliationTimeout = 0 + ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { + return reconcile.Result{}, nil + }) + _, err := ctrl.Reconcile(ctx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) + Expect(err).NotTo(HaveOccurred()) + + // Metric should not be incremented because ReconciliationTimeout is not set + Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed()) + Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0)) + }) + }) + It("should not configure a timeout if ReconciliationTimeout is zero", func(ctx SpecContext) { ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) { defer GinkgoRecover() diff --git a/pkg/internal/controller/metrics/metrics.go b/pkg/internal/controller/metrics/metrics.go index 450e9ae25b..39b435c453 100644 --- a/pkg/internal/controller/metrics/metrics.go +++ b/pkg/internal/controller/metrics/metrics.go @@ -80,6 +80,15 @@ var ( Name: "controller_runtime_active_workers", Help: "Number of currently used workers per controller", }, []string{"controller"}) + + // ReconcileTimeouts is a prometheus counter metric which holds the total + // number of reconciliations that timed out due to the ReconciliationTimeout + // context timeout. This metric only increments when the wrapper timeout fires, + // not when user reconcilers cancels the context or completes before the timeout. + ReconcileTimeouts = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_timeouts_total", + Help: "Total number of reconciliation timeouts per controller", + }, []string{"controller"}) ) func init() { @@ -91,6 +100,7 @@ func init() { ReconcileTime, WorkerCount, ActiveWorkers, + ReconcileTimeouts, // expose process metrics like CPU, Memory, file descriptor usage etc. collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}), // expose all Go runtime metrics like GC stats, memory stats etc.