From f812bf686f444b129f558faaec9947878b40425d Mon Sep 17 00:00:00 2001 From: David Eads Date: Wed, 8 Oct 2025 19:13:38 -0400 Subject: [PATCH] temporarily disable metrics auth for hypershift clusters CVO does not honor client certificates per the OCP metrics standard and HCP does not configure the secret. The combination of these two things means that on HCP, if we enable the CVO's auth handler, we lose the ability to determine if clusteroperators are functioning correctly at scale. --- pkg/cvo/metrics.go | 17 +++++++++++++---- pkg/start/start.go | 3 ++- 2 files changed, 15 insertions(+), 5 deletions(-) diff --git a/pkg/cvo/metrics.go b/pkg/cvo/metrics.go index a0f81593c..202af0894 100644 --- a/pkg/cvo/metrics.go +++ b/pkg/cvo/metrics.go @@ -132,7 +132,16 @@ type asyncResult struct { error error } -func createHttpServer(ctx context.Context, client *authenticationclientsetv1.AuthenticationV1Client) *http.Server { +func createHttpServer(ctx context.Context, client *authenticationclientsetv1.AuthenticationV1Client, disableAuth bool) *http.Server { + if disableAuth { + handler := http.NewServeMux() + handler.Handle("/metrics", promhttp.Handler()) + server := &http.Server{ + Handler: handler, + } + return server + } + auth := authHandler{downstream: promhttp.Handler(), ctx: ctx, client: client.TokenReviews()} handler := http.NewServeMux() handler.Handle("/metrics", &auth) @@ -246,7 +255,7 @@ func handleServerResult(result asyncResult, lastLoopError error) error { // Also detects changes to metrics certificate files upon which // the metrics HTTP server is shutdown and recreated with a new // TLS configuration. 
-func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress, certFile, keyFile string, restConfig *rest.Config) error { +func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress, certFile, keyFile string, restConfig *rest.Config, disableMetricsAuth bool) error { var tlsConfig *tls.Config if listenAddress != "" { var err error @@ -263,7 +272,7 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis return fmt.Errorf("failed to create config: %w", err) } - server := createHttpServer(runContext, client) + server := createHttpServer(runContext, client, disableMetricsAuth) resultChannel := make(chan asyncResult, 1) resultChannelCount := 1 @@ -317,7 +326,7 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis case result := <-resultChannel: // crashed before a shutdown was requested or metrics server recreated if restartServer { klog.Info("Creating metrics server with updated TLS configuration.") - server = createHttpServer(runContext, client) + server = createHttpServer(runContext, client, disableMetricsAuth) go startListening(server, tlsConfig, listenAddress, resultChannel) restartServer = false continue diff --git a/pkg/start/start.go b/pkg/start/start.go index 75f352acf..c0b7af8d8 100644 --- a/pkg/start/start.go +++ b/pkg/start/start.go @@ -255,7 +255,8 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock resource resultChannelCount++ go func() { defer utilruntime.HandleCrash() - err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, o.ServingCertFile, o.ServingKeyFile, restConfig) + disableMetricsAuth := o.InjectClusterIdIntoPromQL // this is wired to the "--hypershift" flag, so when hypershift is on, we disableMetricsAuth + err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, o.ServingCertFile, o.ServingKeyFile, restConfig, disableMetricsAuth) resultChannel <- asyncResult{name: "metrics 
server", error: err} }() }