diff --git a/pkg/client/client.go b/pkg/client/client.go
index e9058276f3..bdc2dde297 100644
--- a/pkg/client/client.go
+++ b/pkg/client/client.go
@@ -1748,6 +1748,11 @@ func (c *Client) HasConsoleCapability(ctx context.Context) (bool, error) {
 	return c.HasClusterCapability(ctx, configv1.ClusterVersionCapabilityConsole)
 }
 
+func (c *Client) HasOptionalMonitoringCapability(_ context.Context) (bool, error) {
+	//return c.HasClusterCapability(ctx, configv1.ClusterVersionCapabilityOptionalMonitoring)
+	return false, nil
+}
+
 // CreateOrUpdateConsolePlugin function uses retries because API requests related to the ConsolePlugin resource
 // may depend on the availability of a conversion container. This container is part of the console-operator Pod, which is not duplicated.
 // If this pod is down (due to restarts for upgrades or other reasons), transient failures will be reported.
diff --git a/pkg/operator/operator.go b/pkg/operator/operator.go
index 91f8e6ecc7..1b661d7e93 100644
--- a/pkg/operator/operator.go
+++ b/pkg/operator/operator.go
@@ -790,42 +790,64 @@ func (o *Operator) sync(ctx context.Context, key string) error {
 		consoleConfig,
 	)
 
+	optionalMonitoringEnabled, err := o.client.HasOptionalMonitoringCapability(ctx)
+	if err != nil {
+		o.reportFailed(ctx, newRunReportForError("OptionalMonitoringCheckError", err))
+		return err
+	}
+
+	// Update prometheus-operator before anything else because it is
+	// responsible for managing many other resources (e.g. Prometheus,
+	// Alertmanager, Thanos Ruler, ...). The metrics scraping client CA
+	// should also be created first because it is referenced by Prometheus.
+	prerequisiteTasks := []*tasks.TaskSpec{
+		newTaskSpec("MetricsScrapingClientCA", tasks.NewMetricsClientCATask(o.client, factory, config)),
+	}
+	if optionalMonitoringEnabled {
+		prerequisiteTasks = append(prerequisiteTasks,
+			newTaskSpec("PrometheusOperatorOptionalMonitoring", tasks.NewPrometheusOperatorOptionalMonitoringTask(o.client, factory)),
+		)
+	} else {
+		prerequisiteTasks = append(prerequisiteTasks,
+			newTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorTask(o.client, factory)),
+		)
+	}
+	followupTasks :=
+		[]*tasks.TaskSpec{
+			newTaskSpec("Prometheus", tasks.NewPrometheusTask(o.client, factory, config)),
+			newTaskSpec("NodeExporter", tasks.NewNodeExporterTask(o.client, factory)),
+			newTaskSpec("KubeStateMetrics", tasks.NewKubeStateMetricsTask(o.client, factory)),
+			newTaskSpec("OpenshiftStateMetrics", tasks.NewOpenShiftStateMetricsTask(o.client, factory)),
+			newTaskSpec("MetricsServer", tasks.NewMetricsServerTask(ctx, o.namespace, o.client, factory, config)),
+			newTaskSpec("TelemeterClient", tasks.NewTelemeterClientTask(o.client, factory, config)),
+			newTaskSpec("ThanosQuerier", tasks.NewThanosQuerierTask(o.client, factory, config)),
+			newTaskSpec("ControlPlaneComponents", tasks.NewControlPlaneTask(o.client, factory, config)),
+			// Tried to run the UWM prom-operator in the first group, but some e2e tests started failing.
+ newUWMTaskSpec("Prometheus", tasks.NewPrometheusUserWorkloadTask(o.client, factory, config)), + } + if optionalMonitoringEnabled { + followupTasks = append(followupTasks, + newTaskSpec("ClusterMonitoringOperatorOptionalMonitoringDeps", tasks.NewClusterMonitoringOperatorOptionalMonitoringTask(o.client, factory, config)), + ) + } else { + followupTasks = append(followupTasks, + newTaskSpec("ClusterMonitoringOperatorDeps", tasks.NewClusterMonitoringOperatorTask(o.client, factory, config)), + newTaskSpec("Alertmanager", tasks.NewAlertmanagerTask(o.client, factory, config)), + newTaskSpec("ConsolePluginComponents", tasks.NewMonitoringPluginTask(o.client, factory, config)), + newUWMTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorUserWorkloadTask(o.client, factory, config)), + newUWMTaskSpec("Alertmanager", tasks.NewAlertmanagerUserWorkloadTask(o.client, factory, config)), + newUWMTaskSpec("ThanosRuler", tasks.NewThanosRulerUserWorkloadTask(o.client, factory, config)), + ) + } + // The shared configmap depends on resources being created by the previous tasks hence run it last. + postRequisiteTasks := []*tasks.TaskSpec{ + newTaskSpec("ConfigurationSharing", tasks.NewConfigSharingTask(o.client, factory, config)), + } tl := tasks.NewTaskRunner( o.client, - // Update prometheus-operator before anything else because it is - // responsible for managing many other resources (e.g. Prometheus, - // Alertmanager, Thanos Ruler, ...). The metrics scraping client CA - // should also be created first because it is referenced by Prometheus. - tasks.NewTaskGroup( - []*tasks.TaskSpec{ - newTaskSpec("MetricsScrapingClientCA", tasks.NewMetricsClientCATask(o.client, factory, config)), - newTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorTask(o.client, factory)), - }), - tasks.NewTaskGroup( - []*tasks.TaskSpec{ - newTaskSpec("ClusterMonitoringOperatorDeps", tasks.NewClusterMonitoringOperatorTask(o.client, factory, config)), - newTaskSpec("Prometheus", tasks.NewPrometheusTask(o.client, factory, config)), - newTaskSpec("Alertmanager", tasks.NewAlertmanagerTask(o.client, factory, config)), - newTaskSpec("NodeExporter", tasks.NewNodeExporterTask(o.client, factory)), - newTaskSpec("KubeStateMetrics", tasks.NewKubeStateMetricsTask(o.client, factory)), - newTaskSpec("OpenshiftStateMetrics", tasks.NewOpenShiftStateMetricsTask(o.client, factory)), - newTaskSpec("MetricsServer", tasks.NewMetricsServerTask(ctx, o.namespace, o.client, factory, config)), - newTaskSpec("TelemeterClient", tasks.NewTelemeterClientTask(o.client, factory, config)), - newTaskSpec("ThanosQuerier", tasks.NewThanosQuerierTask(o.client, factory, config)), - newTaskSpec("ControlPlaneComponents", tasks.NewControlPlaneTask(o.client, factory, config)), - newTaskSpec("ConsolePluginComponents", tasks.NewMonitoringPluginTask(o.client, factory, config)), - // Tried to run the UWM prom-operator in the first group, but some e2e tests started failing. - newUWMTaskSpec("PrometheusOperator", tasks.NewPrometheusOperatorUserWorkloadTask(o.client, factory, config)), - newUWMTaskSpec("Prometheus", tasks.NewPrometheusUserWorkloadTask(o.client, factory, config)), - newUWMTaskSpec("Alertmanager", tasks.NewAlertmanagerUserWorkloadTask(o.client, factory, config)), - newUWMTaskSpec("ThanosRuler", tasks.NewThanosRulerUserWorkloadTask(o.client, factory, config)), - }), - // The shared configmap depends on resources being created by the previous tasks hence run it last. 
-		tasks.NewTaskGroup(
-			[]*tasks.TaskSpec{
-				newTaskSpec("ConfigurationSharing", tasks.NewConfigSharingTask(o.client, factory, config)),
-			},
-		),
+		tasks.NewTaskGroup(prerequisiteTasks),
+		tasks.NewTaskGroup(followupTasks),
+		tasks.NewTaskGroup(postRequisiteTasks),
 	)
 	klog.Info("Updating ClusterOperator status to InProgress.")
 	err = o.client.StatusReporter().SetRollOutInProgress(ctx)
diff --git a/pkg/tasks/clustermonitoringoperator_optionalmonitoring.go b/pkg/tasks/clustermonitoringoperator_optionalmonitoring.go
new file mode 100644
index 0000000000..a00700b36c
--- /dev/null
+++ b/pkg/tasks/clustermonitoringoperator_optionalmonitoring.go
@@ -0,0 +1,126 @@
+// Copyright 2018 The Cluster Monitoring Operator Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tasks
+
+import (
+	"context"
+	"fmt"
+
+	rbacv1 "k8s.io/api/rbac/v1"
+	apierrors "k8s.io/apimachinery/pkg/api/errors"
+	"k8s.io/klog/v2"
+
+	"github.com/openshift/cluster-monitoring-operator/pkg/client"
+	"github.com/openshift/cluster-monitoring-operator/pkg/manifests"
+)
+
+type ClusterMonitoringOperatorOptionalMonitoringTask struct {
+	client  *client.Client
+	factory *manifests.Factory
+	config  *manifests.Config
+}
+
+func NewClusterMonitoringOperatorOptionalMonitoringTask(
+	client *client.Client,
+	factory *manifests.Factory,
+	config *manifests.Config,
+) *ClusterMonitoringOperatorOptionalMonitoringTask {
+	return &ClusterMonitoringOperatorOptionalMonitoringTask{
+		client:  client,
+		factory: factory,
+		config:  config,
+	}
+}
+
+func (t *ClusterMonitoringOperatorOptionalMonitoringTask) Run(ctx context.Context) error {
+	crfs := map[string]func() (*rbacv1.ClusterRole, error){
+		"cluster-monitoring-view":          t.factory.ClusterMonitoringClusterRoleView,
+		"system:aggregated-metrics-reader": t.factory.ClusterMonitoringClusterRoleAggregatedMetricsReader,
+		"pod-metrics-reader":               t.factory.ClusterMonitoringClusterRolePodMetricsReader,
+		"monitoring-rules-edit":            t.factory.ClusterMonitoringRulesEditClusterRole,
+		"monitoring-rules-view":            t.factory.ClusterMonitoringRulesViewClusterRole,
+		"monitoring-edit":                  t.factory.ClusterMonitoringEditClusterRole,
+	}
+	for name, crf := range crfs {
+		cr, err := crf()
+		if err != nil {
+			return fmt.Errorf("initializing %s ClusterRole failed: %w", name, err)
+		}
+
+		err = t.client.CreateOrUpdateClusterRole(ctx, cr)
+		if err != nil {
+			return fmt.Errorf("reconciling %s ClusterRole failed: %w", name, err)
+		}
+	}
+
+	clarr, err := t.factory.ClusterMonitoringApiReaderRole()
+	if err != nil {
+		return fmt.Errorf("initializing ClusterMonitoringApiReader Role failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateRole(ctx, clarr)
+	if err != nil {
+		return fmt.Errorf("reconciling ClusterMonitoringApiReader Role failed: %w", err)
+	}
+
+	pr, err := t.factory.ClusterMonitoringOperatorPrometheusRule()
+	if err != nil {
+		return fmt.Errorf("initializing cluster-monitoring-operator rules PrometheusRule failed: %w", err)
+	}
+	err = t.client.CreateOrUpdatePrometheusRule(ctx, pr)
+	if err != nil {
+		return fmt.Errorf("reconciling cluster-monitoring-operator rules PrometheusRule failed: %w", err)
+	}
+
+	smcmo, err := t.factory.ClusterMonitoringOperatorServiceMonitor()
+	if err != nil {
+		return fmt.Errorf("initializing Cluster Monitoring Operator ServiceMonitor failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateServiceMonitor(ctx, smcmo)
+	if err != nil {
+		return fmt.Errorf("reconciling Cluster Monitoring Operator ServiceMonitor failed: %w", err)
+	}
+
+	s, err := t.factory.GRPCSecret()
+	if err != nil {
+		return fmt.Errorf("error initializing Cluster Monitoring Operator GRPC TLS secret: %w", err)
+	}
+
+	loaded, err := t.client.GetSecret(ctx, s.Namespace, s.Name)
+	switch {
+	case apierrors.IsNotFound(err):
+		// No secret was found, proceed with the default empty secret from manifests.
+		klog.V(5).Info("creating new Cluster Monitoring Operator GRPC TLS secret")
+	case err == nil:
+		// Secret was found, use that.
+		s = loaded
+		klog.V(5).Info("found existing Cluster Monitoring Operator GRPC TLS secret")
+	default:
+		return fmt.Errorf("error reading Cluster Monitoring Operator GRPC TLS secret: %w", err)
+	}
+
+	err = manifests.RotateGRPCSecret(s)
+	if err != nil {
+		return fmt.Errorf("error rotating Cluster Monitoring Operator GRPC TLS secret: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateSecret(ctx, s)
+	if err != nil {
+		return fmt.Errorf("error creating Cluster Monitoring Operator GRPC TLS secret: %w", err)
+	}
+
+	return nil
+}
diff --git a/pkg/tasks/prometheusoperator_optionalmonitoring.go b/pkg/tasks/prometheusoperator_optionalmonitoring.go
new file mode 100644
index 0000000000..bcd093e8e8
--- /dev/null
+++ b/pkg/tasks/prometheusoperator_optionalmonitoring.go
@@ -0,0 +1,184 @@
+// Copyright 2018 The Cluster Monitoring Operator Authors
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package tasks
+
+import (
+	"context"
+	"fmt"
+
+	"github.com/openshift/cluster-monitoring-operator/pkg/client"
+	"github.com/openshift/cluster-monitoring-operator/pkg/manifests"
+)
+
+type PrometheusOperatorOptionalMonitoringTask struct {
+	client  *client.Client
+	factory *manifests.Factory
+}
+
+func NewPrometheusOperatorOptionalMonitoringTask(client *client.Client, factory *manifests.Factory) *PrometheusOperatorOptionalMonitoringTask {
+	return &PrometheusOperatorOptionalMonitoringTask{
+		client:  client,
+		factory: factory,
+	}
+}
+
+func (t *PrometheusOperatorOptionalMonitoringTask) Run(ctx context.Context) error {
+	sa, err := t.factory.PrometheusOperatorServiceAccount()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator ServiceAccount failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateServiceAccount(ctx, sa)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator ServiceAccount failed: %w", err)
+	}
+
+	cr, err := t.factory.PrometheusOperatorClusterRole()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator ClusterRole failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateClusterRole(ctx, cr)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator ClusterRole failed: %w", err)
+	}
+
+	crb, err := t.factory.PrometheusOperatorClusterRoleBinding()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator ClusterRoleBinding failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateClusterRoleBinding(ctx, crb)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator ClusterRoleBinding failed: %w", err)
+	}
+
+	err = t.runAdmissionWebhook(ctx)
+	if err != nil {
+		return err
+	}
+
+	svc, err := t.factory.PrometheusOperatorService()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator Service failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateService(ctx, svc)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator Service failed: %w", err)
+	}
+
+	rs, err := t.factory.PrometheusOperatorRBACProxySecret()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator RBAC proxy Secret failed: %w", err)
+	}
+
+	err = t.client.CreateIfNotExistSecret(ctx, rs)
+	if err != nil {
+		return fmt.Errorf("creating Prometheus Operator RBAC proxy Secret failed: %w", err)
+	}
+
+	d, err := t.factory.PrometheusOperatorDeployment()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator Deployment failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateDeployment(ctx, d)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator Deployment failed: %w", err)
+	}
+
+	err = t.client.AssurePrometheusOperatorCRsExist(ctx)
+	if err != nil {
+		return fmt.Errorf("waiting for Prometheus Operator CRs to become available failed: %w", err)
+	}
+
+	pr, err := t.factory.PrometheusOperatorPrometheusRule()
+	if err != nil {
+		return fmt.Errorf("initializing prometheus-operator rules PrometheusRule failed: %w", err)
+	}
+	err = t.client.CreateOrUpdatePrometheusRule(ctx, pr)
+	if err != nil {
+		return fmt.Errorf("reconciling prometheus-operator rules PrometheusRule failed: %w", err)
+	}
+
+	smpo, err := t.factory.PrometheusOperatorServiceMonitor()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator ServiceMonitor failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateServiceMonitor(ctx, smpo)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator ServiceMonitor failed: %w", err)
+	}
+	return nil
+}
+
+func (t *PrometheusOperatorOptionalMonitoringTask) runAdmissionWebhook(ctx context.Context) error {
+	// Deploy manifests for the admission webhook service.
+	sa, err := t.factory.PrometheusOperatorAdmissionWebhookServiceAccount()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator Admission Webhook ServiceAccount failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateServiceAccount(ctx, sa)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator Admission Webhook ServiceAccount failed: %w", err)
+	}
+
+	svc, err := t.factory.PrometheusOperatorAdmissionWebhookService()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator Admission Webhook Service failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateService(ctx, svc)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator Admission Webhook Service failed: %w", err)
+	}
+
+	pdb, err := t.factory.PrometheusOperatorAdmissionWebhookPodDisruptionBudget()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator Admission Webhook PodDisruptionBudget failed: %w", err)
+	}
+
+	if pdb != nil {
+		err = t.client.CreateOrUpdatePodDisruptionBudget(ctx, pdb)
+		if err != nil {
+			return fmt.Errorf("reconciling Prometheus Operator Admission Webhook PodDisruptionBudget failed: %w", err)
+		}
+	}
+
+	d, err := t.factory.PrometheusOperatorAdmissionWebhookDeployment()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Operator Admission Webhook Deployment failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateDeployment(ctx, d)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Operator Admission Webhook Deployment failed: %w", err)
+	}
+
+	w, err := t.factory.PrometheusRuleValidatingWebhook()
+	if err != nil {
+		return fmt.Errorf("initializing Prometheus Rule Validating Webhook failed: %w", err)
+	}
+
+	err = t.client.CreateOrUpdateValidatingWebhookConfiguration(ctx, w)
+	if err != nil {
+		return fmt.Errorf("reconciling Prometheus Rule Validating Webhook failed: %w", err)
+	}
+
+	return nil
+}