Skip to content

Commit 34b5010

Browse files
committed
pkg/cvo/metrics: Add a new --serving-client-certificate-authorities-file option
In 313f8fb (CVO protects /metrics with authorization, 2025-07-22, #1215) and 833a491 (CVO protects /metrics with authorization, 2025-07-22, #1215), the /metrics endpoint began requiring client auth. The only authentication system was Bearer tokens, and the only authorization system was validating that the token belonged to system:serviceaccount:openshift-monitoring:prometheus-k8s. That worked well for standalone clusters, where the ServiceMonitor scraper is the Prometheus from the openshift-monitoring namespace. But it broke scraping on HyperShift [1], where the ServiceMonitor does not request any client authorization [2]. Getting ServiceAccount tokens (and keeping them fresh [3]) from the hosted cluster into a Prometheus scraper running on the management cluster is hard. But for other ServiceMonitors, HyperShift configures keySecret [4] asking the scraper to get its client certificate from a metrics-client Secret that the HostedControlPlane controller maintains in the management cluster namespace [5]. This commit adds a new --serving-client-certificate-authorities-file option, which HyperShift can set when it configures the CVO Deployment [6], while mounting the root CA ConfigMap into the Pod. Now that there are three files (serving key, serving cert, client CAs) that we might be watching for changes, I've taken the opportunity to refactor the checksumming and change-tracking to use a map from filename to checksum. That way we can extend more easily if further configuration files are added in the future, without having to pass around a series of paths and checksums as distinct arguments. I'm a bit sad about AppendCertsFromPEM only returning a boolean [7], leaving us unsure about whether all the certificates in the file were parsed, or, if there were parsing issues, what those issues were. But with HyperShift hopefully reliably setting up CA Secrets that do not cause parsing issues, I'm ok not coding that defensively. If the standard library grows a parser that is more transparent about parsing issues, we can pivot to that once it exists. [1]: https://issues.redhat.com/browse/OCPBUGS-62858 [2]: https://github.com/openshift/hypershift/blob/2728a4e2899ee58690e3d4e58b319eb5e78643af/control-plane-operator/controllers/hostedcontrolplane/v2/assets/cluster-version-operator/servicemonitor.yaml#L18 [3]: https://kubernetes.io/docs/concepts/configuration/secret/#serviceaccount-token-secrets [4]: https://github.com/openshift/hypershift/blob/2728a4e2899ee58690e3d4e58b319eb5e78643af/control-plane-operator/controllers/hostedcontrolplane/v2/assets/kube-apiserver/servicemonitor.yaml#L24-L26 [5]: https://github.com/openshift/hypershift/blob/2728a4e2899ee58690e3d4e58b319eb5e78643af/control-plane-operator/controllers/hostedcontrolplane/pki/sa.go#L35-L36 [6]: https://github.com/openshift/hypershift/blob/2728a4e2899ee58690e3d4e58b319eb5e78643af/control-plane-operator/controllers/hostedcontrolplane/v2/assets/cluster-version-operator/deployment.yaml#L25-L35 [7]: https://pkg.go.dev/crypto/x509#CertPool.AppendCertsFromPEM
1 parent 95685e0 commit 34b5010

File tree

3 files changed

+93
-77
lines changed

3 files changed

+93
-77
lines changed

cmd/cluster-version-operator/start.go

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,14 +34,20 @@ func init() {
3434
cmd.PersistentFlags().StringVar(&opts.NodeName, "node-name", opts.NodeName, "kubernetes node name CVO is scheduled on.")
3535
cmd.PersistentFlags().BoolVar(&opts.EnableAutoUpdate, "enable-auto-update", opts.EnableAutoUpdate, "Enables the autoupdate controller.")
3636
cmd.PersistentFlags().StringVar(&opts.ReleaseImage, "release-image", opts.ReleaseImage, "The Openshift release image url.")
37+
38+
// metrics serving options
3739
cmd.PersistentFlags().StringVar(&opts.ServingCertFile, "serving-cert-file", opts.ServingCertFile, "The X.509 certificate file for serving metrics over HTTPS. You must set both --serving-cert-file and --serving-key-file unless you set --listen empty.")
3840
cmd.PersistentFlags().StringVar(&opts.ServingKeyFile, "serving-key-file", opts.ServingKeyFile, "The X.509 key file for serving metrics over HTTPS. You must set both --serving-cert-file and --serving-key-file unless you set --listen empty.")
41+
cmd.PersistentFlags().StringVar(&opts.ServingClientCAFile, "serving-client-certificate-authorities-file", opts.ServingClientCAFile, "The X.509 certificates file containing a series of PEM encoded certificates for Certificate Authorities trusted to sign metrics client certificates. If set, only clients who provide mTLS client certs signed by those CAs will be trusted. If unset, only clients with valid tokens for the prometheus-k8s ServiceAccount in the openshift-monitoring namespace will be trusted.")
42+
43+
// metrics/PromQL client options
3944
cmd.PersistentFlags().StringVar(&opts.PromQLTarget.CABundleFile, "metrics-ca-bundle-file", opts.PromQLTarget.CABundleFile, "The service CA bundle file containing one or more X.509 certificate files for validating certificates generated from the service CA for the respective remote PromQL query service.")
4045
cmd.PersistentFlags().StringVar(&opts.PromQLTarget.BearerTokenFile, "metrics-token-file", opts.PromQLTarget.BearerTokenFile, "The bearer token file used to access the remote PromQL query service.")
4146
cmd.PersistentFlags().StringVar(&opts.PromQLTarget.KubeSvc.Namespace, "metrics-namespace", opts.PromQLTarget.KubeSvc.Namespace, "The name of the namespace where the the remote PromQL query service resides. Must be specified when --use-dns-for-services is disabled.")
4247
cmd.PersistentFlags().StringVar(&opts.PromQLTarget.KubeSvc.Name, "metrics-service", opts.PromQLTarget.KubeSvc.Name, "The name of the remote PromQL query service. Must be specified when --use-dns-for-services is disabled.")
4348
cmd.PersistentFlags().BoolVar(&opts.PromQLTarget.UseDNS, "use-dns-for-services", opts.PromQLTarget.UseDNS, "Configures the CVO to use DNS for resolution of services in the cluster.")
4449
cmd.PersistentFlags().StringVar(&opts.PrometheusURLString, "metrics-url", opts.PrometheusURLString, "The URL used to access the remote PromQL query service.")
50+
4551
cmd.PersistentFlags().BoolVar(&opts.HyperShift, "hypershift", opts.HyperShift, "This options indicates whether the CVO is running inside a hosted control plane.")
4652
cmd.PersistentFlags().StringVar(&opts.UpdateService, "update-service", opts.UpdateService, "The preferred update service. If set, this option overrides any upstream value configured in ClusterVersion spec.")
4753
cmd.PersistentFlags().StringSliceVar(&opts.AlwaysEnableCapabilities, "always-enable-capabilities", opts.AlwaysEnableCapabilities, "List of the cluster capabilities which will always be implicitly enabled.")

pkg/cvo/metrics.go

Lines changed: 79 additions & 73 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"context"
66
"crypto/sha256"
77
"crypto/tls"
8+
"crypto/x509"
89
"errors"
910
"fmt"
1011
"io"
@@ -173,6 +174,7 @@ func (a *authHandler) authorize(token string) (bool, error) {
173174
}
174175

175176
func (a *authHandler) ServeHTTP(w http.ResponseWriter, r *http.Request) {
177+
176178
authHeader := r.Header.Get("Authorization")
177179
if authHeader == "" {
178180
http.Error(w, "failed to get the Authorization header", http.StatusUnauthorized)
@@ -246,11 +248,24 @@ func handleServerResult(result asyncResult, lastLoopError error) error {
246248
// Also detects changes to metrics certificate files upon which
247249
// the metrics HTTP server is shutdown and recreated with a new
248250
// TLS configuration.
249-
func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress, certFile, keyFile string, restConfig *rest.Config) error {
251+
func RunMetrics(runContext context.Context, shutdownContext context.Context, listenAddress, certFile, keyFile, clientCAFile string, restConfig *rest.Config) error {
252+
checksums := map[string][]bytes{
253+
certFile: nil,
254+
keyFile: nil,
255+
}
256+
if clientCAFile != "" {
257+
checksums[clientCAFile] = nil
258+
}
259+
260+
checksums, err := checksumFiles(checksums)
261+
if err != nil {
262+
return err
263+
}
264+
250265
var tlsConfig *tls.Config
251266
if listenAddress != "" {
252267
var err error
253-
tlsConfig, err = makeTLSConfig(certFile, keyFile)
268+
tlsConfig, err = makeTLSConfig(certFile, keyFile, clientCAFile)
254269
if err != nil {
255270
return fmt.Errorf("Failed to create TLS config: %w", err)
256271
}
@@ -270,16 +285,9 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis
270285

271286
go startListening(server, tlsConfig, listenAddress, resultChannel)
272287

273-
certDir := filepath.Dir(certFile)
274-
keyDir := filepath.Dir(keyFile)
275-
276-
origCertChecksum, err := checksumFile(certFile)
277-
if err != nil {
278-
return fmt.Errorf("Failed to initialize certificate file checksum: %w", err)
279-
}
280-
origKeyChecksum, err := checksumFile(keyFile)
281-
if err != nil {
282-
return fmt.Errorf("Failed to initialize key file checksum: %w", err)
288+
watchDirectories := map[string]struct{}{}
289+
for fName := range checksums {
290+
watchDirectories[filepath.Dir(fName)] = struct{}{}
283291
}
284292

285293
// Set up and start the file watcher.
@@ -288,12 +296,9 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis
288296
return fmt.Errorf("Failed to create file watcher for certificate and key rotation: %w", err)
289297
} else {
290298
defer watcher.Close()
291-
if err := watcher.Add(certDir); err != nil {
292-
return fmt.Errorf("Failed to add %v to watcher: %w", certDir, err)
293-
}
294-
if certDir != keyDir {
295-
if err := watcher.Add(keyDir); err != nil {
296-
return fmt.Errorf("Failed to add %v to watcher: %w", keyDir, err)
299+
for watchDir := range watchDirectories {
300+
if err := watcher.Add(watchDir); err != nil {
301+
return fmt.Errorf("Failed to add %v to watcher: %w", watchDir, err)
297302
}
298303
}
299304
}
@@ -326,23 +331,10 @@ func RunMetrics(runContext context.Context, shutdownContext context.Context, lis
326331
loopError = handleServerResult(result, loopError)
327332
case event := <-watcher.Events:
328333
if event.Op != fsnotify.Chmod && event.Op != fsnotify.Remove {
329-
if changed, err := certsChanged(origCertChecksum, origKeyChecksum, certFile, keyFile); changed {
330-
331-
// Update file checksums with latest files.
332-
//
333-
if origCertChecksum, err = checksumFile(certFile); err != nil {
334-
klog.Errorf("Failed to update certificate file checksum: %v", err)
335-
loopError = err
336-
break
337-
}
338-
if origKeyChecksum, err = checksumFile(keyFile); err != nil {
339-
klog.Errorf("Failed to update key file checksum: %v", err)
340-
loopError = err
341-
break
342-
}
343-
344-
tlsConfig, err = makeTLSConfig(certFile, keyFile)
334+
if changed, csums, err := certsChanged(checksums); changed {
335+
tlsConfig, err = makeTLSConfig(certFile, keyFile, clientCAFile)
345336
if err == nil {
337+
checksums = csums
346338
restartServer = true
347339
shutdownHttpServer(shutdownContext, server)
348340
continue
@@ -712,41 +704,30 @@ func mostRecentTimestamp(cv *configv1.ClusterVersion) int64 {
712704
// Determine if the certificates have changed and need to be updated.
713705
// If no errors occur, returns true if both files have changed and
714706
// neither is an empty file. Otherwise returns false and any error.
715-
func certsChanged(origCertChecksum []byte, origKeyChecksum []byte, certFile, keyFile string) (bool, error) {
716-
// Check if both files exist.
717-
certNotEmpty, err := fileExistsAndNotEmpty(certFile)
718-
if err != nil {
719-
return false, fmt.Errorf("Error checking if changed TLS cert file empty/exists: %w", err)
720-
}
721-
keyNotEmpty, err := fileExistsAndNotEmpty(keyFile)
722-
if err != nil {
723-
return false, fmt.Errorf("Error checking if changed TLS key file empty/exists: %w", err)
724-
}
725-
if !certNotEmpty || !keyNotEmpty {
726-
// One of the files is missing despite some file event.
727-
return false, fmt.Errorf("Certificate or key is missing or empty, certificates will not be rotated.")
728-
}
729-
730-
currentCertChecksum, err := checksumFile(certFile)
731-
if err != nil {
732-
return false, fmt.Errorf("Error checking certificate file checksum: %w", err)
707+
func certsChanged(checksums map[string][]byte) (bool, map[string][]byte, error) {
708+
for fName, oldSum := range checksums {
709+
err := requireFileExistsAndNotEmpty(fName)
710+
if err != nil {
711+
return false, nil, fmt.Errorf("Server configuration file must exist and be non-empty. Certificates will not be rotated. %w", err)
712+
}
733713
}
734714

735-
currentKeyChecksum, err := checksumFile(keyFile)
715+
csums, err := checksumFiles(checksums)
736716
if err != nil {
737-
return false, fmt.Errorf("Error checking key file checksum: %w", err)
717+
return false, nil, err
738718
}
739719

740-
// Check if the non-empty certificate/key files have actually changed.
741-
if !bytes.Equal(origCertChecksum, currentCertChecksum) && !bytes.Equal(origKeyChecksum, currentKeyChecksum) {
742-
klog.V(2).Info("Certificate and key changed. Will recreate metrics server with updated TLS configuration.")
743-
return true, nil
720+
changed := false
721+
for fName, sum := range checksums {
722+
if !bytes.Equal(sum, csums[fName]) {
723+
changed = true
724+
klog.V(2).Infof("Server configuration file %s changed. Will recreate metrics server with updated TLS configuration.", fName)
725+
}
744726
}
745-
746-
return false, nil
727+
return changed, csums, nil
747728
}
748729

749-
func makeTLSConfig(servingCertFile, servingKeyFile string) (*tls.Config, error) {
730+
func makeTLSConfig(servingCertFile, servingKeyFile, clientCAFile string) (*tls.Config, error) {
750731
// Load the initial certificate contents.
751732
certBytes, err := os.ReadFile(servingCertFile)
752733
if err != nil {
@@ -761,11 +742,26 @@ func makeTLSConfig(servingCertFile, servingKeyFile string) (*tls.Config, error)
761742
return nil, err
762743
}
763744

764-
return crypto.SecureTLSConfig(&tls.Config{
745+
tlsConfig := tls.Config{
765746
GetCertificate: func(_ *tls.ClientHelloInfo) (*tls.Certificate, error) {
766747
return &certificate, nil
767748
},
768-
}), nil
749+
}
750+
751+
if clientCAFile != "" {
752+
clientCABytes, err := os.ReadFile(clientCAFile)
753+
if err != nil {
754+
return nil, err
755+
}
756+
757+
tlsConfig.ClientAuth = tls.RequireAndVerifyClientCert
758+
tlsConfig.ClientCAs = x509.NewCertPool()
759+
if ok := tlsConfig.ClientCAs.AppendCertsFromPEM(clientCABytes); !ok {
760+
return nil, fmt.Errorf("failed to load any client Certificate Authority certificates from %s", clientCAfile)
761+
}
762+
}
763+
764+
return crypto.SecureTLSConfig(&tlsConfig), nil
769765
}
770766

771767
// Compute the sha256 checksum for file 'fName' returning any error.
@@ -785,15 +781,25 @@ func checksumFile(fName string) ([]byte, error) {
785781
return hash.Sum(nil), nil
786782
}
787783

788-
// Check if a file exists and has file.Size() not equal to 0.
789-
// Returns any error returned by os.Stat other than os.ErrNotExist.
790-
func fileExistsAndNotEmpty(fName string) (bool, error) {
791-
if fi, err := os.Stat(fName); err == nil {
792-
return (fi.Size() != 0), nil
793-
} else if errors.Is(err, os.ErrNotExist) {
794-
return false, nil
795-
} else {
796-
// Some other error, file may not exist.
797-
return false, err
784+
// Compute the sha256 checksums for files 'fNames' returning any error.
785+
func checksumFiles(fNames map[string][]bytes) (map[string][]byte, error) {
786+
sums := make(map[string][]byte, len(fNames))
787+
for fName := range fNames {
788+
sum, err := checksumFile(fName)
789+
if err {
790+
return nil, err
791+
}
792+
sums[sName] = sum
793+
}
794+
return sums, nil
795+
}
796+
797+
// requireFileExistsAndNotEmpty errors if a file does not exist or has a file.Size() equal to zero.
798+
func requireFileExistsAndNotEmpty(fName string) error {
799+
if fi, err := os.Stat(fName); err != nil {
800+
return err
801+
} else if fi.Size() == 0 {
802+
return fmt.Errorf("file %s is empty", fName)
798803
}
804+
return nil
799805
}

pkg/start/start.go

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -57,9 +57,10 @@ const (
5757

5858
// Options are the valid inputs to starting the CVO.
5959
type Options struct {
60-
ReleaseImage string
61-
ServingCertFile string
62-
ServingKeyFile string
60+
ReleaseImage string
61+
ServingCertFile string
62+
ServingKeyFile string
63+
ServingClientCAFile string
6364

6465
Kubeconfig string
6566
NodeName string
@@ -144,6 +145,9 @@ func (o *Options) ValidateAndComplete() error {
144145
if o.ListenAddr != "" && o.ServingKeyFile == "" {
145146
return fmt.Errorf("--listen was not set empty, so --serving-key-file must be set")
146147
}
148+
if o.ListenAddr == "" && o.ServingClientCAFile != "" {
149+
return fmt.Errorf("--listen was set empty, so --serving-client-certificate-authorities-file must not be set")
150+
}
147151
if o.PrometheusURLString == "" {
148152
return fmt.Errorf("missing --metrics-url flag, it is required")
149153
}
@@ -350,7 +354,7 @@ func (o *Options) run(ctx context.Context, controllerCtx *Context, lock resource
350354
resultChannelCount++
351355
go func() {
352356
defer utilruntime.HandleCrash()
353-
err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, o.ServingCertFile, o.ServingKeyFile, restConfig)
357+
err := cvo.RunMetrics(postMainContext, shutdownContext, o.ListenAddr, o.ServingCertFile, o.ServingKeyFile, o.ServingClientCAFile, restConfig)
354358
resultChannel <- asyncResult{name: "metrics server", error: err}
355359
}()
356360
}

0 commit comments

Comments
 (0)