diff --git a/pkg/builder/webhook.go b/pkg/builder/webhook.go index 6170180c74..81d8f74056 100644 --- a/pkg/builder/webhook.go +++ b/pkg/builder/webhook.go @@ -42,7 +42,7 @@ type WebhookBuilder struct { gvk schema.GroupVersionKind mgr manager.Manager config *rest.Config - recoverPanic bool + recoverPanic *bool logConstructor func(base logr.Logger, req *admission.Request) logr.Logger err error } @@ -84,8 +84,9 @@ func (blder *WebhookBuilder) WithLogConstructor(logConstructor func(base logr.Lo } // RecoverPanic indicates whether panics caused by the webhook should be recovered. -func (blder *WebhookBuilder) RecoverPanic() *WebhookBuilder { - blder.recoverPanic = true +// Defaults to true. +func (blder *WebhookBuilder) RecoverPanic(recoverPanic bool) *WebhookBuilder { + blder.recoverPanic = &recoverPanic return blder } @@ -169,10 +170,18 @@ func (blder *WebhookBuilder) registerDefaultingWebhook() { func (blder *WebhookBuilder) getDefaultingWebhook() *admission.Webhook { if defaulter := blder.customDefaulter; defaulter != nil { - return admission.WithCustomDefaulter(blder.mgr.GetScheme(), blder.apiType, defaulter).WithRecoverPanic(blder.recoverPanic) + w := admission.WithCustomDefaulter(blder.mgr.GetScheme(), blder.apiType, defaulter) + if blder.recoverPanic != nil { + w = w.WithRecoverPanic(*blder.recoverPanic) + } + return w } if defaulter, ok := blder.apiType.(admission.Defaulter); ok { - return admission.DefaultingWebhookFor(blder.mgr.GetScheme(), defaulter).WithRecoverPanic(blder.recoverPanic) + w := admission.DefaultingWebhookFor(blder.mgr.GetScheme(), defaulter) + if blder.recoverPanic != nil { + w = w.WithRecoverPanic(*blder.recoverPanic) + } + return w } log.Info( "skip registering a mutating webhook, object does not implement admission.Defaulter or WithDefaulter wasn't called", @@ -200,10 +209,18 @@ func (blder *WebhookBuilder) registerValidatingWebhook() { func (blder *WebhookBuilder) getValidatingWebhook() *admission.Webhook { if validator := blder.customValidator; validator != nil { - return admission.WithCustomValidator(blder.mgr.GetScheme(), blder.apiType, validator).WithRecoverPanic(blder.recoverPanic) + w := admission.WithCustomValidator(blder.mgr.GetScheme(), blder.apiType, validator) + if blder.recoverPanic != nil { + w = w.WithRecoverPanic(*blder.recoverPanic) + } + return w } if validator, ok := blder.apiType.(admission.Validator); ok { - return admission.ValidatingWebhookFor(blder.mgr.GetScheme(), validator).WithRecoverPanic(blder.recoverPanic) + w := admission.ValidatingWebhookFor(blder.mgr.GetScheme(), validator) + if blder.recoverPanic != nil { + w = w.WithRecoverPanic(*blder.recoverPanic) + } + return w } log.Info( "skip registering a validating webhook, object does not implement admission.Validator or WithValidator wasn't called", diff --git a/pkg/builder/webhook_test.go b/pkg/builder/webhook_test.go index abb11bf957..4574d5cc77 100644 --- a/pkg/builder/webhook_test.go +++ b/pkg/builder/webhook_test.go @@ -160,7 +160,7 @@ func runTests(admissionReviewVersion string) { err = WebhookManagedBy(m). For(&TestDefaulter{Panic: true}). - RecoverPanic(). + // RecoverPanic defaults to true. Complete() ExpectWithOffset(1, err).NotTo(HaveOccurred()) svr := m.GetWebhookServer() @@ -369,7 +369,7 @@ func runTests(admissionReviewVersion string) { err = WebhookManagedBy(m). For(&TestValidator{Panic: true}). - RecoverPanic(). + RecoverPanic(true). Complete() ExpectWithOffset(1, err).NotTo(HaveOccurred()) svr := m.GetWebhookServer() diff --git a/pkg/config/controller.go b/pkg/config/controller.go index b37dffaeea..0a64d46d36 100644 --- a/pkg/config/controller.go +++ b/pkg/config/controller.go @@ -41,6 +41,7 @@ type Controller struct { // RecoverPanic indicates whether the panic caused by reconcile should be recovered. // Defaults to the Controller.RecoverPanic setting from the Manager if unset. + // Defaults to true if Controller.RecoverPanic setting from the Manager is also unset. RecoverPanic *bool // NeedLeaderElection indicates whether the controller needs to use leader election. diff --git a/pkg/controller/controller.go b/pkg/controller/controller.go index d31199012f..c0a7c0cb85 100644 --- a/pkg/controller/controller.go +++ b/pkg/controller/controller.go @@ -45,6 +45,7 @@ type TypedOptions[request comparable] struct { // RecoverPanic indicates whether the panic caused by reconcile should be recovered. // Defaults to the Controller.RecoverPanic setting from the Manager if unset. + // Defaults to true if Controller.RecoverPanic setting from the Manager is also unset. RecoverPanic *bool // NeedLeaderElection indicates whether the controller needs to use leader election. diff --git a/pkg/internal/controller/controller.go b/pkg/internal/controller/controller.go index 9b5ba8fba9..dfe407f3b8 100644 --- a/pkg/internal/controller/controller.go +++ b/pkg/internal/controller/controller.go @@ -87,6 +87,7 @@ type Controller[request comparable] struct { LogConstructor func(request *request) logr.Logger // RecoverPanic indicates whether the panic caused by reconcile should be recovered. + // Defaults to true. RecoverPanic *bool // LeaderElected indicates whether the controller is leader elected or always running. @@ -97,7 +98,9 @@ type Controller[request comparable] struct { func (c *Controller[request]) Reconcile(ctx context.Context, req request) (_ reconcile.Result, err error) { defer func() { if r := recover(); r != nil { - if c.RecoverPanic != nil && *c.RecoverPanic { + ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Inc() + + if c.RecoverPanic == nil || *c.RecoverPanic { for _, fn := range utilruntime.PanicHandlers { fn(ctx, r) } @@ -269,13 +272,15 @@ const ( ) func (c *Controller[request]) initMetrics() { - ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0) - ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0) ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelError).Add(0) ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeueAfter).Add(0) ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelRequeue).Add(0) ctrlmetrics.ReconcileTotal.WithLabelValues(c.Name, labelSuccess).Add(0) + ctrlmetrics.ReconcileErrors.WithLabelValues(c.Name).Add(0) + ctrlmetrics.TerminalReconcileErrors.WithLabelValues(c.Name).Add(0) + ctrlmetrics.ReconcilePanics.WithLabelValues(c.Name).Add(0) ctrlmetrics.WorkerCount.WithLabelValues(c.Name).Set(float64(c.MaxConcurrentReconciles)) + ctrlmetrics.ActiveWorkers.WithLabelValues(c.Name).Set(0) } func (c *Controller[request]) reconcileHandler(ctx context.Context, req request) { diff --git a/pkg/internal/controller/controller_test.go b/pkg/internal/controller/controller_test.go index eec51ae0b9..638d21810e 100644 --- a/pkg/internal/controller/controller_test.go +++ b/pkg/internal/controller/controller_test.go @@ -90,13 +90,14 @@ var _ = Describe("controller", func() { Expect(result).To(Equal(reconcile.Result{Requeue: true})) }) - It("should not recover panic if RecoverPanic is false by default", func() { + It("should not recover panic if RecoverPanic is false", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() defer func() { Expect(recover()).ShouldNot(BeNil()) }() + ctrl.RecoverPanic = ptr.To(false) ctrl.Do = reconcile.Func(func(context.Context, reconcile.Request) (reconcile.Result, error) { var res *reconcile.Result return *res, nil @@ -105,6 +106,24 @@ var _ = Describe("controller", func() { reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) }) + It("should recover panic if RecoverPanic is true by default", func() { + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + defer func() { + Expect(recover()).To(BeNil()) + }() + // RecoverPanic defaults to true. + ctrl.Do = reconcile.Func(func(context.Context, reconcile.Request) (reconcile.Result, error) { + var res *reconcile.Result + return *res, nil + }) + _, err := ctrl.Reconcile(ctx, + reconcile.Request{NamespacedName: types.NamespacedName{Namespace: "foo", Name: "bar"}}) + Expect(err).To(HaveOccurred()) + Expect(err.Error()).To(ContainSubstring("[recovered]")) + }) + It("should recover panic if RecoverPanic is true", func() { ctx, cancel := context.WithCancel(context.Background()) defer cancel() diff --git a/pkg/internal/controller/metrics/metrics.go b/pkg/internal/controller/metrics/metrics.go index b74ce062be..fbf15669d5 100644 --- a/pkg/internal/controller/metrics/metrics.go +++ b/pkg/internal/controller/metrics/metrics.go @@ -46,6 +46,13 @@ var ( Help: "Total number of terminal reconciliation errors per controller", }, []string{"controller"}) + // ReconcilePanics is a prometheus counter metrics which holds the total + // number of panics from the Reconciler. + ReconcilePanics = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_reconcile_panics_total", + Help: "Total number of reconciliation panics per controller", + }, []string{"controller"}) + // ReconcileTime is a prometheus metric which keeps track of the duration // of reconciliations. ReconcileTime = prometheus.NewHistogramVec(prometheus.HistogramOpts{ @@ -75,6 +82,7 @@ func init() { ReconcileTotal, ReconcileErrors, TerminalReconcileErrors, + ReconcilePanics, ReconcileTime, WorkerCount, ActiveWorkers, diff --git a/pkg/webhook/admission/metrics/metrics.go b/pkg/webhook/admission/metrics/metrics.go new file mode 100644 index 0000000000..358a3a9162 --- /dev/null +++ b/pkg/webhook/admission/metrics/metrics.go @@ -0,0 +1,39 @@ +/* +Copyright 2024 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metrics + +import ( + "github.com/prometheus/client_golang/prometheus" + "sigs.k8s.io/controller-runtime/pkg/metrics" +) + +var ( + // WebhookPanics is a prometheus counter metrics which holds the total + // number of panics from webhooks. + WebhookPanics = prometheus.NewCounterVec(prometheus.CounterOpts{ + Name: "controller_runtime_webhook_panics_total", + Help: "Total number of webhook panics", + }, []string{}) +) + +func init() { + metrics.Registry.MustRegister( + WebhookPanics, + ) + // Init metric. + WebhookPanics.WithLabelValues().Add(0) +} diff --git a/pkg/webhook/admission/webhook.go b/pkg/webhook/admission/webhook.go index 0f8f54fa83..cba6da2cb0 100644 --- a/pkg/webhook/admission/webhook.go +++ b/pkg/webhook/admission/webhook.go @@ -30,6 +30,7 @@ import ( "k8s.io/apimachinery/pkg/util/json" utilruntime "k8s.io/apimachinery/pkg/util/runtime" "k8s.io/klog/v2" + admissionmetrics "sigs.k8s.io/controller-runtime/pkg/webhook/admission/metrics" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/webhook/internal/metrics" @@ -123,7 +124,8 @@ type Webhook struct { Handler Handler // RecoverPanic indicates whether the panic caused by webhook should be recovered. - RecoverPanic bool + // Defaults to true. + RecoverPanic *bool // WithContextFunc will allow you to take the http.Request.Context() and // add any additional information such as passing the request path or @@ -141,8 +143,9 @@ type Webhook struct { } // WithRecoverPanic takes a bool flag which indicates whether the panic caused by webhook should be recovered. +// Defaults to true. func (wh *Webhook) WithRecoverPanic(recoverPanic bool) *Webhook { - wh.RecoverPanic = recoverPanic + wh.RecoverPanic = &recoverPanic return wh } @@ -151,17 +154,26 @@ func (wh *Webhook) WithRecoverPanic(recoverPanic bool) *Webhook { // If the webhook is validating type, it delegates the AdmissionRequest to each handler and // deny the request if anyone denies. func (wh *Webhook) Handle(ctx context.Context, req Request) (response Response) { - if wh.RecoverPanic { - defer func() { - if r := recover(); r != nil { + defer func() { + if r := recover(); r != nil { + admissionmetrics.WebhookPanics.WithLabelValues().Inc() + + if wh.RecoverPanic == nil || *wh.RecoverPanic { for _, fn := range utilruntime.PanicHandlers { fn(ctx, r) } response = Errored(http.StatusInternalServerError, fmt.Errorf("panic: %v [recovered]", r)) + // Note: We explicitly have to set the response UID. Usually that is done via resp.Complete below, + // but if we encounter a panic in wh.Handler.Handle we are never going to reach resp.Complete. + response.UID = req.UID return } - }() - } + + log := logf.FromContext(ctx) + log.Info(fmt.Sprintf("Observed a panic in webhook: %v", r)) + panic(r) + } + }() reqLog := wh.getLogger(&req) ctx = logf.IntoContext(ctx, reqLog) @@ -169,7 +181,10 @@ func (wh *Webhook) Handle(ctx context.Context, req Request) (response Response) resp := wh.Handler.Handle(ctx, req) if err := resp.Complete(req); err != nil { reqLog.Error(err, "unable to encode response") - return Errored(http.StatusInternalServerError, errUnableToEncodeResponse) + resp := Errored(http.StatusInternalServerError, errUnableToEncodeResponse) + // Note: We explicitly have to set the response UID. Usually that is done via resp.Complete. + resp.UID = req.UID + return resp } return resp diff --git a/pkg/webhook/admission/webhook_test.go b/pkg/webhook/admission/webhook_test.go index c7fc3b09ac..102988bc6e 100644 --- a/pkg/webhook/admission/webhook_test.go +++ b/pkg/webhook/admission/webhook_test.go @@ -30,6 +30,7 @@ import ( authenticationv1 "k8s.io/api/authentication/v1" metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" machinerytypes "k8s.io/apimachinery/pkg/types" + "k8s.io/utils/ptr" logf "sigs.k8s.io/controller-runtime/pkg/log" "sigs.k8s.io/controller-runtime/pkg/log/zap" @@ -199,6 +200,33 @@ var _ = Describe("Admission Webhooks", func() { }) Describe("panic recovery", func() { + It("should recover panic if RecoverPanic is true by default", func() { + panicHandler := func() *Webhook { + handler := &fakeHandler{ + fn: func(ctx context.Context, req Request) Response { + panic("fake panic test") + }, + } + webhook := &Webhook{ + Handler: handler, + // RecoverPanic defaults to true. + } + + return webhook + } + + By("setting up a webhook with a panicking handler") + webhook := panicHandler() + + By("invoking the webhook") + resp := webhook.Handle(context.Background(), Request{}) + + By("checking that it errored the request") + Expect(resp.Allowed).To(BeFalse()) + Expect(resp.Result.Code).To(Equal(int32(http.StatusInternalServerError))) + Expect(resp.Result.Message).To(Equal("panic: fake panic test [recovered]")) + }) + It("should recover panic if RecoverPanic is true", func() { panicHandler := func() *Webhook { handler := &fakeHandler{ @@ -208,7 +236,7 @@ var _ = Describe("Admission Webhooks", func() { } webhook := &Webhook{ Handler: handler, - RecoverPanic: true, + RecoverPanic: ptr.To[bool](true), } return webhook @@ -226,7 +254,7 @@ var _ = Describe("Admission Webhooks", func() { Expect(resp.Result.Message).To(Equal("panic: fake panic test [recovered]")) }) - It("should not recover panic if RecoverPanic is false by default", func() { + It("should not recover panic if RecoverPanic is false", func() { panicHandler := func() *Webhook { handler := &fakeHandler{ fn: func(ctx context.Context, req Request) Response { @@ -234,7 +262,8 @@ var _ = Describe("Admission Webhooks", func() { }, } webhook := &Webhook{ - Handler: handler, + Handler: handler, + RecoverPanic: ptr.To[bool](false), } return webhook