5 changes: 5 additions & 0 deletions pkg/client/client.go
@@ -1748,6 +1748,11 @@ func (c *Client) HasConsoleCapability(ctx context.Context) (bool, error) {
return c.HasClusterCapability(ctx, configv1.ClusterVersionCapabilityConsole)
}

func (c *Client) HasOptionalMonitoringCapability(_ context.Context) (bool, error) {
//return c.HasClusterCapability(ctx, configv1.ClusterVersionCapabilityOptionalMonitoring)
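// NOTE: the capability check above is commented out for now (see the review note below);
// until it is enabled, this method always reports the capability as absent.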
Review comment (Member Author): Waiting on this.

return false, nil
}

// CreateOrUpdateConsolePlugin function uses retries because API requests related to the ConsolePlugin resource
// may depend on the availability of a conversion container. This container is part of the console-operator Pod, which is not duplicated.
// If this pod is down (due to restarts for upgrades or other reasons), transient failures will be reported.
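For readers unfamiliar with the retry behavior the comment above describes, a minimal sketch of such a wrapper is shown below. It is illustrative only, not the repository's actual implementation: the function name, the error predicate, and the use of client-go's retry helper are assumptions.

package client

import (
	"context"

	apierrors "k8s.io/apimachinery/pkg/api/errors"
	"k8s.io/client-go/util/retry"
)

// createOrUpdateConsolePluginWithRetry is a hypothetical wrapper: it retries the
// apply function with exponential backoff when the error looks transient, e.g.
// while the console-operator's conversion container is temporarily unavailable.
func createOrUpdateConsolePluginWithRetry(ctx context.Context, apply func(context.Context) error) error {
	isTransient := func(err error) bool {
		// Conversion-webhook outages commonly surface as one of these API errors.
		return apierrors.IsServiceUnavailable(err) ||
			apierrors.IsInternalError(err) ||
			apierrors.IsTimeout(err)
	}
	return retry.OnError(retry.DefaultBackoff, isTransient, func() error {
		return apply(ctx)
	})
}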
90 changes: 56 additions & 34 deletions pkg/operator/operator.go
@@ -790,42 +790,64 @@ func (o *Operator) sync(ctx context.Context, key string) error {
consoleConfig,
)

optionalMonitoringEnabled, err := o.client.HasOptionalMonitoringCapability(ctx)
if err != nil {
o.reportFailed(ctx, newRunReportForError("OptionalMonitoringCheckError", err))
return err
}

// Update prometheus-operator before anything else because it is
// responsible for managing many other resources (e.g. Prometheus,
// Alertmanager, Thanos Ruler, ...). The metrics scraping client CA
// should also be created first because it is referenced by Prometheus.
prerequisiteTasks := []*tasks.TaskSpec{
newTaskSpec("MetricsScrapingClientCA", tasks.NewMetricsClientCATask(o.client, factory, config)),
}
if optionalMonitoringEnabled {
prerequisiteTasks = append(prerequisiteTasks,
newTaskSpec("PrometheusOperatorOptionalMonitoring", tasks.NewPrometheusOperatorOptionalMonitoringTask(o.client, factory)),
)
} else {
prerequisiteTasks = append(prerequisiteTasks,
newTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorTask(o.client, factory)),
)
}
followupTasks :=
[]*tasks.TaskSpec{
newTaskSpec("Prometheus", tasks.NewPrometheusTask(o.client, factory, config)),
newTaskSpec("NodeExporter", tasks.NewNodeExporterTask(o.client, factory)),
newTaskSpec("KubeStateMetrics", tasks.NewKubeStateMetricsTask(o.client, factory)),
newTaskSpec("OpenshiftStateMetrics", tasks.NewOpenShiftStateMetricsTask(o.client, factory)),
newTaskSpec("MetricsServer", tasks.NewMetricsServerTask(ctx, o.namespace, o.client, factory, config)),
newTaskSpec("TelemeterClient", tasks.NewTelemeterClientTask(o.client, factory, config)),
newTaskSpec("ThanosQuerier", tasks.NewThanosQuerierTask(o.client, factory, config)),
newTaskSpec("ControlPlaneComponents", tasks.NewControlPlaneTask(o.client, factory, config)),
// Tried to run the UWM prom-operator in the first group, but some e2e tests started failing.
newUWMTaskSpec("Prometheus", tasks.NewPrometheusUserWorkloadTask(o.client, factory, config)),
}
if optionalMonitoringEnabled {
followupTasks = append(followupTasks,
newTaskSpec("ClusterMonitoringOperatorOptionalMonitoringDeps", tasks.NewClusterMonitoringOperatorOptionalMonitoringTask(o.client, factory, config)),
)
} else {
followupTasks = append(followupTasks,
newTaskSpec("ClusterMonitoringOperatorDeps", tasks.NewClusterMonitoringOperatorTask(o.client, factory, config)),
newTaskSpec("Alertmanager", tasks.NewAlertmanagerTask(o.client, factory, config)),
newTaskSpec("ConsolePluginComponents", tasks.NewMonitoringPluginTask(o.client, factory, config)),
newUWMTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorUserWorkloadTask(o.client, factory, config)),
newUWMTaskSpec("Alertmanager", tasks.NewAlertmanagerUserWorkloadTask(o.client, factory, config)),
newUWMTaskSpec("ThanosRuler", tasks.NewThanosRulerUserWorkloadTask(o.client, factory, config)),
)
}
// The shared configmap depends on resources created by the previous tasks, hence it runs last.
postRequisiteTasks := []*tasks.TaskSpec{
newTaskSpec("ConfigurationSharing", tasks.NewConfigSharingTask(o.client, factory, config)),
}
tl := tasks.NewTaskRunner(
o.client,
// Update prometheus-operator before anything else because it is
// responsible for managing many other resources (e.g. Prometheus,
// Alertmanager, Thanos Ruler, ...). The metrics scraping client CA
// should also be created first because it is referenced by Prometheus.
tasks.NewTaskGroup(
[]*tasks.TaskSpec{
newTaskSpec("MetricsScrapingClientCA", tasks.NewMetricsClientCATask(o.client, factory, config)),
newTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorTask(o.client, factory)),
}),
tasks.NewTaskGroup(
[]*tasks.TaskSpec{
newTaskSpec("ClusterMonitoringOperatorDeps", tasks.NewClusterMonitoringOperatorTask(o.client, factory, config)),
newTaskSpec("Prometheus", tasks.NewPrometheusTask(o.client, factory, config)),
newTaskSpec("Alertmanager", tasks.NewAlertmanagerTask(o.client, factory, config)),
newTaskSpec("NodeExporter", tasks.NewNodeExporterTask(o.client, factory)),
newTaskSpec("KubeStateMetrics", tasks.NewKubeStateMetricsTask(o.client, factory)),
newTaskSpec("OpenshiftStateMetrics", tasks.NewOpenShiftStateMetricsTask(o.client, factory)),
newTaskSpec("MetricsServer", tasks.NewMetricsServerTask(ctx, o.namespace, o.client, factory, config)),
newTaskSpec("TelemeterClient", tasks.NewTelemeterClientTask(o.client, factory, config)),
newTaskSpec("ThanosQuerier", tasks.NewThanosQuerierTask(o.client, factory, config)),
newTaskSpec("ControlPlaneComponents", tasks.NewControlPlaneTask(o.client, factory, config)),
newTaskSpec("ConsolePluginComponents", tasks.NewMonitoringPluginTask(o.client, factory, config)),
// Tried to run the UWM prom-operator in the first group, but some e2e tests started failing.
newUWMTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorUserWorkloadTask(o.client, factory, config)),
newUWMTaskSpec("Prometheus", tasks.NewPrometheusUserWorkloadTask(o.client, factory, config)),
newUWMTaskSpec("Alertmanager", tasks.NewAlertmanagerUserWorkloadTask(o.client, factory, config)),
newUWMTaskSpec("ThanosRuler", tasks.NewThanosRulerUserWorkloadTask(o.client, factory, config)),
}),
// The shared configmap depends on resources being created by the previous tasks hence run it last.
tasks.NewTaskGroup(
[]*tasks.TaskSpec{
newTaskSpec("ConfigurationSharing", tasks.NewConfigSharingTask(o.client, factory, config)),
},
),
tasks.NewTaskGroup(prerequisiteTasks),
tasks.NewTaskGroup(followupTasks),
tasks.NewTaskGroup(postRequisiteTasks),
)
klog.Info("Updating ClusterOperator status to InProgress.")
err = o.client.StatusReporter().SetRollOutInProgress(ctx)
126 changes: 126 additions & 0 deletions pkg/tasks/clustermonitoringoperator_optionalmonitoring.go
@@ -0,0 +1,126 @@
// Copyright 2018 The Cluster Monitoring Operator Authors
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package tasks

import (
"context"
"fmt"

rbacv1 "k8s.io/api/rbac/v1"
apierrors "k8s.io/apimachinery/pkg/api/errors"
"k8s.io/klog/v2"

"github.com/openshift/cluster-monitoring-operator/pkg/client"
"github.com/openshift/cluster-monitoring-operator/pkg/manifests"
)

type ClusterMonitoringOperatorOptionalMonitoringTask struct {
client *client.Client
factory *manifests.Factory
config *manifests.Config
}

func NewClusterMonitoringOperatorOptionalMonitoringTask(
client *client.Client,
factory *manifests.Factory,
config *manifests.Config,
) *ClusterMonitoringOperatorOptionalMonitoringTask {
return &ClusterMonitoringOperatorOptionalMonitoringTask{
client: client,
factory: factory,
config: config,
}
}

func (t *ClusterMonitoringOperatorOptionalMonitoringTask) Run(ctx context.Context) error {
crfs := map[string]func() (*rbacv1.ClusterRole, error){
"cluster-monitoring-view": t.factory.ClusterMonitoringClusterRoleView,
"system:aggregated-metrics-reader": t.factory.ClusterMonitoringClusterRoleAggregatedMetricsReader,
"pod-metrics-reader": t.factory.ClusterMonitoringClusterRolePodMetricsReader,
"monitoring-rules-edit": t.factory.ClusterMonitoringRulesEditClusterRole,
"monitoring-rules-view": t.factory.ClusterMonitoringRulesViewClusterRole,
"monitoring-edit": t.factory.ClusterMonitoringEditClusterRole,
}
for name, crf := range crfs {
cr, err := crf()
if err != nil {
return fmt.Errorf("initializing %s ClusterRole failed: %w", name, err)
}

err = t.client.CreateOrUpdateClusterRole(ctx, cr)
if err != nil {
return fmt.Errorf("reconciling %s ClusterRole failed: %w", name, err)
}
}

clarr, err := t.factory.ClusterMonitoringApiReaderRole()
if err != nil {
return fmt.Errorf("initializing ClusterMonitoringApiReader Role failed: %w", err)
}

err = t.client.CreateOrUpdateRole(ctx, clarr)
if err != nil {
return fmt.Errorf("reconciling ClusterMonitoringApiReader Role failed: %w", err)
}

pr, err := t.factory.ClusterMonitoringOperatorPrometheusRule()
if err != nil {
return fmt.Errorf("initializing cluster-monitoring-operator rules PrometheusRule failed: %w", err)
}
err = t.client.CreateOrUpdatePrometheusRule(ctx, pr)
if err != nil {
return fmt.Errorf("reconciling cluster-monitoring-operator rules PrometheusRule failed: %w", err)
}

smcmo, err := t.factory.ClusterMonitoringOperatorServiceMonitor()
if err != nil {
return fmt.Errorf("initializing Cluster Monitoring Operator ServiceMonitor failed: %w", err)
}

err = t.client.CreateOrUpdateServiceMonitor(ctx, smcmo)
if err != nil {
return fmt.Errorf("reconciling Cluster Monitoring Operator ServiceMonitor failed: %w", err)
}

s, err := t.factory.GRPCSecret()
if err != nil {
return fmt.Errorf("error initializing Cluster Monitoring Operator GRPC TLS secret: %w", err)
}

loaded, err := t.client.GetSecret(ctx, s.Namespace, s.Name)
switch {
case apierrors.IsNotFound(err):
// No secret was found, proceed with the default empty secret from manifests.
klog.V(5).Info("creating new Cluster Monitoring Operator GRPC TLS secret")
case err == nil:
// Secret was found, use that.
s = loaded
klog.V(5).Info("found existing Cluster Monitoring Operator GRPC TLS secret")
default:
return fmt.Errorf("error reading Cluster Monitoring Operator GRPC TLS secret: %w", err)
}

err = manifests.RotateGRPCSecret(s)
if err != nil {
return fmt.Errorf("error rotating Cluster Monitoring Operator GRPC TLS secret: %w", err)
}

err = t.client.CreateOrUpdateSecret(ctx, s)
if err != nil {
return fmt.Errorf("error creating Cluster Monitoring Operator GRPC TLS secret: %w", err)
}

return nil
}