Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions pkg/internal/controller/controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,10 @@ import (
"sigs.k8s.io/controller-runtime/pkg/source"
)

// errReconciliationTimeout is the error used as the cause when the ReconciliationTimeout guardrail fires.
// This allows us to distinguish wrapper timeouts from user-initiated context cancellations.
var errReconciliationTimeout = errors.New("reconciliation timeout")

// Options are the arguments for creating a new Controller.
type Options[request comparable] struct {
// Reconciler is a function that can be called at any time with the Name / Namespace of an object and
Expand Down Expand Up @@ -207,13 +211,26 @@ func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ rec
}
}()

var timeoutCause error
if c.ReconciliationTimeout > 0 {
timeoutCause = errReconciliationTimeout
var cancel context.CancelFunc
ctx, cancel = context.WithTimeout(ctx, c.ReconciliationTimeout)
ctx, cancel = context.WithTimeoutCause(ctx, c.ReconciliationTimeout, timeoutCause)
defer cancel()
}

return c.Do.Reconcile(ctx, req)
res, err := c.Do.Reconcile(ctx, req)

// Check if the reconciliation timed out due to our wrapper timeout guardrail.
// We check ctx.Err() == context.DeadlineExceeded first to ensure the context was actually
// cancelled due to a deadline (not parent cancellation or other reasons), then verify it was
// our specific timeout cause. This prevents false positives from parent context cancellations
// or other timeout scenarios.
if timeoutCause != nil && ctx.Err() == context.DeadlineExceeded && errors.Is(context.Cause(ctx), timeoutCause) {
ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Inc()
}

return res, err
}

// Watch implements controller.Controller.
Expand Down Expand Up @@ -437,6 +454,7 @@ func (c *Controller[request]) initMetrics() {
ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0)
ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0)
ctrlmetrics.ReconcileTimeouts.WithLabelValues(c.Name).Add(0)
ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles))
ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0)
}
Expand Down
122 changes: 122 additions & 0 deletions pkg/internal/controller/controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -155,6 +155,128 @@ var _ = Describe("controller", func() {
Expect(err).To(Equal(context.DeadlineExceeded))
})

Context("prometheus metric reconcile_timeouts", func() {
var reconcileTimeouts dto.Metric

BeforeEach(func() {
ctrlmetrics.ReconcileTimeouts.Reset()
reconcileTimeouts.Reset()
ctrl.Name = testControllerName
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
})

It("should increment when ReconciliationTimeout context timeout hits DeadlineExceeded", func(ctx SpecContext) {
ctrl.ReconciliationTimeout = time.Duration(1)
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
<-ctx.Done()
return reconcile.Result{}, ctx.Err()
})
_, err := ctrl.Reconcile(ctx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
Expect(err).To(HaveOccurred())
Expect(err).To(Equal(context.DeadlineExceeded))

Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(1.0))
})

It("should not increment when user code cancels context earlier than the ReconciliationTimeout", func(ctx SpecContext) {
ctrl.ReconciliationTimeout = 10 * time.Second
userCancelCause := errors.New("user cancellation")
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
// User code creates its own timeout with a different cause
userCtx, cancel := context.WithTimeoutCause(ctx, time.Millisecond, userCancelCause)
defer cancel()
<-userCtx.Done()
return reconcile.Result{}, context.Cause(userCtx)
})
_, err := ctrl.Reconcile(ctx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
Expect(err).To(HaveOccurred())
Expect(errors.Is(err, userCancelCause)).To(BeTrue())

// Metric should not be incremented because the context timeout didn't hit DeadlineExceeded
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
})

It("should not increment when reconciliation completes before timeout", func(ctx SpecContext) {
ctrl.ReconciliationTimeout = 10 * time.Second
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
// Reconcile completes successfully before timeout
return reconcile.Result{}, nil
})
_, err := ctrl.Reconcile(ctx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
Expect(err).NotTo(HaveOccurred())

// Metric should not be incremented because the timeout was not exceeded
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
})

It("should increment multiple times when multiple reconciles timeout", func(ctx SpecContext) {
ctrl.ReconciliationTimeout = time.Duration(1)
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
<-ctx.Done()
return reconcile.Result{}, ctx.Err()
})

const numTimeouts = 3
// Call Reconcile multiple times, each should timeout and increment the metric
for i := range numTimeouts {
_, err := ctrl.Reconcile(ctx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: fmt.Sprintf("bar%d", i)}})
Expect(err).To(HaveOccurred())
Expect(err).To(Equal(context.DeadlineExceeded))
}

// Metric should be incremented 3 times
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(float64(numTimeouts)))
})

It("should not increment when parent context is cancelled", func(ctx SpecContext) {
parentCtx, cancel := context.WithCancel(ctx)
defer cancel()

ctrl.ReconciliationTimeout = 10 * time.Second
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
// Wait for parent cancellation
<-ctx.Done()
return reconcile.Result{}, ctx.Err()
})

// Cancel parent context immediately
cancel()

_, err := ctrl.Reconcile(parentCtx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
Expect(err).To(HaveOccurred())
Expect(err).To(Equal(context.Canceled))

// Metric should not be incremented because the wrapper timeout didn't fire
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
})

It("should not increment when ReconciliationTimeout is zero", func(ctx SpecContext) {
// Ensure ReconciliationTimeout is zero (not set)
ctrl.ReconciliationTimeout = 0
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
return reconcile.Result{}, nil
})
_, err := ctrl.Reconcile(ctx,
reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}})
Expect(err).NotTo(HaveOccurred())

// Metric should not be incremented because ReconciliationTimeout is not set
Expect(ctrlmetrics.ReconcileTimeouts.WithLabelValues(ctrl.Name).Write(&reconcileTimeouts)).To(Succeed())
Expect(reconcileTimeouts.GetCounter().GetValue()).To(Equal(0.0))
})
})

It("should not configure a timeout if ReconciliationTimeout is zero", func(ctx SpecContext) {
ctrl.Do = reconcile.Func(func(ctx context.Context, _ reconcile.Request) (reconcile.Result, error) {
defer GinkgoRecover()
Expand Down
10 changes: 10 additions & 0 deletions pkg/internal/controller/metrics/metrics.go
Original file line number Diff line number Diff line change
Expand Up @@ -80,6 +80,15 @@ var (
Name: "controller_runtime_active_workers",
Help: "Number of currently used workers per controller",
}, []string{"controller"})

// ReconcileTimeouts is a prometheus counter metric which holds the total
// number of reconciliations that timed out due to the ReconciliationTimeout
// context timeout. This metric only increments when the wrapper timeout fires,
// not when user reconcilers cancels the context or completes before the timeout.
ReconcileTimeouts = prometheus.NewCounterVec(prometheus.CounterOpts{
Name: "controller_runtime_reconcile_timeouts_total",
Help: "Total number of reconciliation timeouts per controller",
}, []string{"controller"})
)

func init() {
Expand All @@ -91,6 +100,7 @@ func init() {
ReconcileTime,
WorkerCount,
ActiveWorkers,
ReconcileTimeouts,
// expose process metrics like CPU, Memory, file descriptor usage etc.
collectors.NewProcessCollector(collectors.ProcessCollectorOpts{}),
// expose all Go runtime metrics like GC stats, memory stats etc.
Expand Down
Loading