diff --git a/api/v1beta1/conditions.go b/api/v1beta1/conditions.go index f9db6290..631a5880 100644 --- a/api/v1beta1/conditions.go +++ b/api/v1beta1/conditions.go @@ -39,6 +39,9 @@ const ( // ScrapeConfigReadyCondition Status=True condition which indicates if the ScrapeConfig is configured and operational ScrapeConfigReadyCondition condition.Type = "ScrapeConfigReady" + // PodMonitorReadyCondition Status=True condition which indicates if the PodMonitor is configured and operational + PodMonitorReadyCondition condition.Type = "PodMonitorReady" + // PrometheusReadyCondition Status=True condition which indicates if the Prometheus watch is operational PrometheusReadyCondition condition.Type = "PrometheusReady" @@ -169,6 +172,15 @@ const ( // ScrapeConfigUnableToOwnMessage ScrapeConfigUnableToOwnMessage = "Error occured when trying to own %s" + // + // PodMonitorReady condition messages + // + // PodMonitorReadyInitMessage + PodMonitorReadyInitMessage = "PodMonitor not started" + + // PodMonitorUnableToOwnMessage + PodMonitorUnableToOwnMessage = "Error occured when trying to own %s" + // // PrometheusReady condition messages // diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 7ef77f99..3b71eb8d 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -151,6 +151,7 @@ rules: - monitoring.rhobs resources: - monitoringstacks + - podmonitors - prometheusrules - scrapeconfigs verbs: @@ -169,6 +170,7 @@ rules: - delete - get - list + - watch - apiGroups: - network.openstack.org resources: diff --git a/controllers/metricstorage_controller.go b/controllers/metricstorage_controller.go index 2aaa2353..2b31c98c 100644 --- a/controllers/metricstorage_controller.go +++ b/controllers/metricstorage_controller.go @@ -122,8 +122,9 @@ func (r *MetricStorageReconciler) GetLogger(ctx context.Context) logr.Logger { //+kubebuilder:rbac:groups=telemetry.openstack.org,resources=metricstorages/status,verbs=get;update;patch //+kubebuilder:rbac:groups=telemetry.openstack.org,resources=metricstorages/finalizers,verbs=update;patch //+kubebuilder:rbac:groups=monitoring.rhobs,resources=monitoringstacks,verbs=get;list;watch;create;update;patch;delete -//+kubebuilder:rbac:groups=monitoring.rhobs,resources=servicemonitors,verbs=get;list;delete +//+kubebuilder:rbac:groups=monitoring.rhobs,resources=servicemonitors,verbs=get;list;watch;delete //+kubebuilder:rbac:groups=monitoring.rhobs,resources=scrapeconfigs,verbs=get;list;watch;create;update;patch;delete +//+kubebuilder:rbac:groups=monitoring.rhobs,resources=podmonitors,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=monitoring.rhobs,resources=prometheusrules,verbs=get;list;watch;create;update;patch;delete //+kubebuilder:rbac:groups=monitoring.rhobs,resources=prometheuses,verbs=get;list;watch;update;patch;delete //+kubebuilder:rbac:groups=monitoring.rhobs,resources=alertmanagers,verbs=get;list;watch;update;patch;delete @@ -202,6 +203,7 @@ func (r *MetricStorageReconciler) Reconcile(ctx context.Context, req ctrl.Reques condition.UnknownCondition(condition.ReadyCondition, condition.InitReason, condition.ReadyInitMessage), condition.UnknownCondition(telemetryv1.MonitoringStackReadyCondition, condition.InitReason, telemetryv1.MonitoringStackReadyInitMessage), condition.UnknownCondition(telemetryv1.ScrapeConfigReadyCondition, condition.InitReason, telemetryv1.ScrapeConfigReadyInitMessage), + condition.UnknownCondition(telemetryv1.PodMonitorReadyCondition, condition.InitReason, telemetryv1.PodMonitorReadyInitMessage), condition.UnknownCondition(telemetryv1.DashboardPrometheusRuleReadyCondition, condition.InitReason, telemetryv1.DashboardPrometheusRuleReadyInitMessage), condition.UnknownCondition(telemetryv1.DashboardPluginReadyCondition, condition.InitReason, telemetryv1.DashboardPluginReadyInitMessage), condition.UnknownCondition(telemetryv1.DashboardDatasourceReadyCondition, condition.InitReason, telemetryv1.DashboardDatasourceReadyInitMessage), @@ -247,8 +249,6 @@ func (r *MetricStorageReconciler) reconcileDelete( return ctrl.Result{}, nil } -// TODO: call the function appropriately -// //nolint:all func (r *MetricStorageReconciler) reconcileUpdate( ctx context.Context, @@ -262,6 +262,10 @@ func (r *MetricStorageReconciler) reconcileUpdate( if err != nil { return ctrl.Result{}, err } + err = r.deleteRabbitMQScrapeConfig(ctx, instance) + if err != nil { + return ctrl.Result{}, err + } Log.Info(fmt.Sprintf("Reconciled Service '%s' update successfully", instance.Name)) @@ -295,6 +299,33 @@ func (r *MetricStorageReconciler) deleteOldServiceMonitors( return nil } +// Delete RabbitMQ ScrapeConfig +// A ScrapeConfig for RabbitMQ was last used at the beginning of FR4 +func (r *MetricStorageReconciler) deleteRabbitMQScrapeConfig( + ctx context.Context, + instance *telemetryv1.MetricStorage, +) error { + namespacedName := types.NamespacedName{ + Name: fmt.Sprintf("%s-rabbitmq", telemetry.ServiceName), + Namespace: instance.Namespace, + } + scrapeConfig := &monv1alpha1.ScrapeConfig{} + err := r.Get(ctx, namespacedName, scrapeConfig) + if err != nil { + if k8s_errors.IsNotFound(err) { + return nil + } + return err + } + if object.CheckOwnerRefExist(instance.UID, scrapeConfig.OwnerReferences) { + err = r.Delete(ctx, scrapeConfig) + if err != nil { + return err + } + } + return nil +} + func (r *MetricStorageReconciler) reconcileNormal( ctx context.Context, instance *telemetryv1.MetricStorage, @@ -453,6 +484,11 @@ func (r *MetricStorageReconciler) reconcileNormal( return res, err } + // Deploy PodMonitors + if res, err := r.createPodMonitors(ctx, instance, eventHandler); err != nil { + return res, err + } + if !instance.Spec.DashboardsEnabled { if res, err := metricstorage.DeleteDashboardObjects(ctx, instance, helper); err != nil { return res, err @@ -622,6 +658,14 @@ func (r *MetricStorageReconciler) reconcileNormal( // when job passed, mark NetworkAttachmentsReadyCondition ready instance.Status.Conditions.MarkTrue(condition.NetworkAttachmentsReadyCondition, condition.NetworkAttachmentsReadyMessage) + // Handle service update + ctrlResult, err := r.reconcileUpdate(ctx, instance, helper) + if err != nil { + return ctrlResult, err + } else if (ctrlResult != ctrl.Result{}) { + return ctrlResult, nil + } + if instance.Status.Conditions.AllSubConditionIsTrue() { instance.Status.Conditions.MarkTrue( condition.ReadyCondition, condition.ReadyMessage) @@ -684,6 +728,31 @@ func (r *MetricStorageReconciler) prometheusEndpointSecret( return nil } +func (r *MetricStorageReconciler) createPodMonitor( + ctx context.Context, + instance *telemetryv1.MetricStorage, + log logr.Logger, + desiredPodMonitor *monv1.PodMonitor, +) error { + podMonitor := &monv1.PodMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: desiredPodMonitor.Name, + Namespace: instance.Namespace, + }, + } + op, err := controllerutil.CreateOrPatch(ctx, r.Client, podMonitor, func() error { + desiredPodMonitor.Spec.DeepCopyInto(&podMonitor.Spec) + podMonitor.Labels = desiredPodMonitor.Labels + err := controllerutil.SetControllerReference(instance, podMonitor, r.Scheme) + return err + }) + + if err == nil && op != controllerutil.OperationResultNone { + log.Info(fmt.Sprintf("PodMonitor %s successfully changed - operation: %s", podMonitor.GetName(), string(op))) + } + return err +} + func (r *MetricStorageReconciler) createServiceScrapeConfig( ctx context.Context, instance *telemetryv1.MetricStorage, @@ -711,6 +780,50 @@ func (r *MetricStorageReconciler) createServiceScrapeConfig( return err } +func (r *MetricStorageReconciler) createPodMonitors( + ctx context.Context, + instance *telemetryv1.MetricStorage, + eventHandler handler.EventHandler, +) (ctrl.Result, error) { + Log := r.GetLogger(ctx) + err := r.ensureWatches(ctx, "podmonitors.monitoring.rhobs", &monv1.PodMonitor{}, eventHandler) + if err != nil { + instance.Status.Conditions.MarkFalse(telemetryv1.PodMonitorReadyCondition, + condition.Reason("Can't own PodMonitor resource. The Cluster Observability Operator probably isn't installed"), + condition.SeverityError, + telemetryv1.PodMonitorUnableToOwnMessage, err) + Log.Info("Can't own PodMonitor resource. The Cluster Observability Operator probably isn't installed") + return ctrl.Result{RequeueAfter: telemetryv1.PauseBetweenWatchAttempts}, nil + } + + // PodMonitors for RabbitMQ monitoring + // NOTE: We're watching Rabbits and reconciling with each of their change + // that should keep the PodMonitors always up to date. + rabbitList := &rabbitmqv1.RabbitmqClusterList{} + listOpts := []client.ListOption{ + client.InNamespace(instance.GetNamespace()), + } + err = r.List(ctx, rabbitList, listOpts...) + if err != nil && !k8s_errors.IsNotFound(err) { + return ctrl.Result{}, err + } + for _, rabbit := range rabbitList.Items { + desiredPodMonitor := metricstorage.RabbitMQPodMonitor( + instance, + serviceLabels, + rabbit.Name, + instance.Spec.PrometheusTLS.Enabled(), + ) + err = r.createPodMonitor(ctx, instance, Log, desiredPodMonitor) + if err != nil { + return ctrl.Result{}, err + } + } + + instance.Status.Conditions.MarkTrue(telemetryv1.PodMonitorReadyCondition, condition.ReadyMessage) + return ctrl.Result{}, nil +} + func (r *MetricStorageReconciler) createScrapeConfigs( ctx context.Context, instance *telemetryv1.MetricStorage, @@ -760,36 +873,6 @@ func (r *MetricStorageReconciler) createScrapeConfigs( return ctrl.Result{}, err } - // ScrapeConfigs for RabbitMQ monitoring - // NOTE: We're watching Rabbits and reconciling with each of their change - // that should keep the targets inside the ScrapeConfig always - // up to date. - rabbitList := &rabbitmqv1.RabbitmqClusterList{} - listOpts := []client.ListOption{ - client.InNamespace(instance.GetNamespace()), - } - err = r.List(ctx, rabbitList, listOpts...) - if err != nil && !k8s_errors.IsNotFound(err) { - return ctrl.Result{}, err - } - rabbitTargets := []string{} - for _, rabbit := range rabbitList.Items { - rabbitServerName := fmt.Sprintf("%s.%s.svc", rabbit.Name, rabbit.Namespace) - rabbitTargets = append(rabbitTargets, net.JoinHostPort(rabbitServerName, strconv.Itoa(metricstorage.RabbitMQPrometheusPort))) - } - rabbitCfgName := fmt.Sprintf("%s-rabbitmq", telemetry.ServiceName) - desiredScrapeConfig = metricstorage.ScrapeConfig( - instance, - serviceLabels, - rabbitTargets, - instance.Spec.PrometheusTLS.Enabled(), - ) - err = r.createServiceScrapeConfig(ctx, instance, Log, "RabbitMQ", - rabbitCfgName, desiredScrapeConfig) - if err != nil { - return ctrl.Result{}, err - } - // mysqld exporter ceilometerNamespacedName := types.NamespacedName{ Name: ceilometer.ServiceName, diff --git a/pkg/dashboards/openstack-rabbitmq.go b/pkg/dashboards/openstack-rabbitmq.go index 3cb54114..1e51d91f 100644 --- a/pkg/dashboards/openstack-rabbitmq.go +++ b/pkg/dashboards/openstack-rabbitmq.go @@ -102,7 +102,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rabbitmq_queues{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_queues * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -157,7 +157,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rabbitmq_consumers{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_consumers * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -212,7 +212,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rabbitmq_connections{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_connections * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -267,7 +267,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rabbitmq_channels{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_channels * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -322,7 +322,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_received_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_received_total[60s]) * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -377,7 +377,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_redelivered_total{instance=\"$cluster\"}[60s])) + sum(rate(rabbitmq_global_messages_delivered_consume_auto_ack_total{instance=\"$cluster\"}[60s])) + sum(rate(rabbitmq_global_messages_delivered_consume_manual_ack_total{instance=\"$cluster\"}[60s])) + sum(rate(rabbitmq_global_messages_delivered_get_auto_ack_total{instance=\"$cluster\"}[60s])) + sum(rate(rabbitmq_global_messages_delivered_get_manual_ack_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_redelivered_total[60s]) * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) +sum(rate(rabbitmq_global_messages_delivered_consume_auto_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) +sum(rate(rabbitmq_global_messages_delivered_consume_manual_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) +sum(rate(rabbitmq_global_messages_delivered_get_auto_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) +sum(rate(rabbitmq_global_messages_delivered_get_manual_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -432,7 +432,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rabbitmq_queue_messages_ready{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_queue_messages_ready * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -487,7 +487,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "pluginVersion": "6.7.6", "targets": [ { - "expr": "sum(rabbitmq_queue_messages_unacked{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_queue_messages_unacked * on(instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", "legendFormat": "", "refId": "A" @@ -562,9 +562,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "(rabbitmq_resident_memory_limit_bytes{instance=\"$cluster\"}) -\n(rabbitmq_process_resident_memory_bytes{instance=\"$cluster\"})", + "expr": "(rabbitmq_resident_memory_limit_bytes * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) -(rabbitmq_process_resident_memory_bytes * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -654,9 +654,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "rabbitmq_disk_space_available_bytes{instance=\"$cluster\"}", + "expr": "rabbitmq_disk_space_available_bytes * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -746,9 +746,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "(rabbitmq_process_max_fds{instance=\"$cluster\"}) -\n(rabbitmq_process_open_fds{instance=\"$cluster\"})", + "expr": "(rabbitmq_process_max_fds * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) -(rabbitmq_process_open_fds * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -838,9 +838,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "(rabbitmq_process_max_tcp_sockets{instance=\"$cluster\"}) -\n(rabbitmq_process_open_tcp_sockets{instance=\"$cluster\"})", + "expr": "(rabbitmq_process_max_tcp_sockets * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) -(rabbitmq_process_open_tcp_sockets * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -950,9 +950,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rabbitmq_queue_messages_ready{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_queue_messages_ready * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1043,9 +1043,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rabbitmq_queue_messages_unacked{instance=\"$cluster\"})", + "expr": "sum(rabbitmq_queue_messages_unacked * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1154,9 +1154,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_received_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_received_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1246,9 +1246,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_confirmed_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_confirmed_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1338,9 +1338,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_routed_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_routed_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1430,9 +1430,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_received_confirm_total{instance=\"$cluster\"}[60s]) - \nrate(rabbitmq_global_messages_confirmed_total{instance=\"$cluster\"}[60s])\n)", + "expr": "sum(rate(rabbitmq_global_messages_received_confirm_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"} - rate(rabbitmq_global_messages_confirmed_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1522,9 +1522,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_unroutable_dropped_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_unroutable_dropped_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1614,9 +1614,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_unroutable_returned_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_unroutable_returned_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1725,9 +1725,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(\n rate(rabbitmq_global_messages_delivered_consume_auto_ack_total{instance=\"$cluster\"}[60s])+\n rate(rabbitmq_global_messages_delivered_consume_manual_ack_total{instance=\"$cluster\"}[60s])\n)", + "expr": "sum( (rate(rabbitmq_global_messages_delivered_consume_auto_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) + (rate(rabbitmq_global_messages_delivered_consume_manual_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"})) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1817,9 +1817,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_redelivered_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_redelivered_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -1909,9 +1909,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_delivered_consume_manual_ack_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_delivered_consume_manual_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2001,9 +2001,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_delivered_consume_auto_ack_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_delivered_consume_auto_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2093,9 +2093,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_acknowledged_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_acknowledged_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2185,9 +2185,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_delivered_get_auto_ack_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_delivered_get_auto_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2277,9 +2277,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_get_empty_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_get_empty_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2369,9 +2369,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_global_messages_delivered_get_manual_ack_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_global_messages_delivered_get_manual_ack_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2480,9 +2480,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "rabbitmq_queues{instance=\"$cluster\"}", + "expr": "rabbitmq_queues * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2572,9 +2572,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_queues_declared_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_queues_declared_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2664,9 +2664,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_queues_created_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_queues_created_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2756,9 +2756,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_queues_deleted_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_queues_deleted_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2867,9 +2867,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "rabbitmq_channels{instance=\"$cluster\"}", + "expr": "rabbitmq_channels * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -2958,9 +2958,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_channels_opened_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_channels_opened_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -3050,9 +3050,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_channels_closed_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_channels_closed_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -3161,9 +3161,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "rabbitmq_connections{instance=\"$cluster\"}", + "expr": "rabbitmq_connections * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -3252,9 +3252,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_connections_opened_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_connections_opened_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -3344,9 +3344,9 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "steppedLine": false, "targets": [ { - "expr": "sum(rate(rabbitmq_connections_closed_total{instance=\"$cluster\"}[60s]))", + "expr": "sum(rate(rabbitmq_connections_closed_total[60s]) * on(instance, job) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info{rabbitmq_cluster=\"$cluster\"}) by(rabbitmq_node)", "interval": "", - "legendFormat": "", + "legendFormat": "{{rabbitmq_node}}", "refId": "A" } ], @@ -3423,7 +3423,7 @@ func OpenstackRabbitmq(dsName string) *corev1.ConfigMap { "name": "cluster", "options": [ ], - "query": "label_values(rabbitmq_identity_info, instance)", + "query": "label_values(rabbitmq_identity_info, rabbitmq_cluster)", "skipUrlSync": false, "type": "query" } diff --git a/pkg/metricstorage/const.go b/pkg/metricstorage/const.go index edfb0e8a..fbb8b604 100644 --- a/pkg/metricstorage/const.go +++ b/pkg/metricstorage/const.go @@ -16,8 +16,10 @@ limitations under the License. package metricstorage const ( - // RabbitMQPrometheusPort is the port number for RabbitMQ Prometheus metrics - RabbitMQPrometheusPort = 15691 + // RabbitMQPrometheusTLSPortName is the port name for RabbitMQ Prometheus metrics with TLS enabled + RabbitMQPrometheusTLSPortName = "prometheus-tls" + // RabbitMQPrometheusNoTLSPortName is the port name for RabbitMQ Prometheus metrics with TLS disabled + RabbitMQPrometheusNoTLSPortName = "prometheus" // PrometheusHost is the key for Prometheus host configuration PrometheusHost = "host" diff --git a/pkg/metricstorage/pod_monitor.go b/pkg/metricstorage/pod_monitor.go new file mode 100644 index 00000000..8e158b56 --- /dev/null +++ b/pkg/metricstorage/pod_monitor.go @@ -0,0 +1,131 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package metricstorage + +import ( + "fmt" + + tls "github.com/openstack-k8s-operators/lib-common/modules/common/tls" + telemetryv1 "github.com/openstack-k8s-operators/telemetry-operator/api/v1beta1" + "github.com/openstack-k8s-operators/telemetry-operator/pkg/telemetry" + monv1 "github.com/rhobs/obo-prometheus-operator/pkg/apis/monitoring/v1" + v1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// RabbitMQPodMonitor creates a PodMonitor CR for monitoring a RabbitMQ cluster +func RabbitMQPodMonitor( + instance *telemetryv1.MetricStorage, + labels map[string]string, + rabbitMQClusterName string, + tlsEnabled bool, +) *monv1.PodMonitor { + metricRelabelConfigs := []*monv1.RelabelConfig{ + { + Action: "labeldrop", + Regex: "namespace", + SourceLabels: []monv1.LabelName{}, + }, + { + Action: "labeldrop", + Regex: "job", + SourceLabels: []monv1.LabelName{}, + }, + { + Action: "labeldrop", + Regex: "publisher", + SourceLabels: []monv1.LabelName{}, + }, + } + podMonitorName := fmt.Sprintf("%s-%s", telemetry.ServiceName, rabbitMQClusterName) + selector := metav1.LabelSelector{ + MatchLabels: map[string]string{ + "app.kubernetes.io/name": rabbitMQClusterName, + }, + } + serverName := fmt.Sprintf("%s.%s.svc", rabbitMQClusterName, instance.Namespace) + port := RabbitMQPrometheusNoTLSPortName + if tlsEnabled { + port = RabbitMQPrometheusTLSPortName + } + return PodMonitor(instance, labels, podMonitorName, metricRelabelConfigs, + selector, serverName, port, tlsEnabled) +} + +// PodMonitor creates a PodMonitor CR +// NOTE: Current implementation allows single metric endpoint per pod +func PodMonitor( + instance *telemetryv1.MetricStorage, + labels map[string]string, + name string, + metricRelabelConfigs []*monv1.RelabelConfig, + selector metav1.LabelSelector, + serverName string, + port string, + tlsEnabled bool, +) *monv1.PodMonitor { + var scrapeInterval monv1.Duration + if instance.Spec.MonitoringStack != nil && instance.Spec.MonitoringStack.ScrapeInterval != "" { + scrapeInterval = monv1.Duration(instance.Spec.MonitoringStack.ScrapeInterval) + } else if instance.Spec.CustomMonitoringStack != nil && + instance.Spec.CustomMonitoringStack.PrometheusConfig != nil && + instance.Spec.CustomMonitoringStack.PrometheusConfig.ScrapeInterval != nil && + *instance.Spec.CustomMonitoringStack.PrometheusConfig.ScrapeInterval != monv1.Duration("") { + scrapeInterval = *instance.Spec.CustomMonitoringStack.PrometheusConfig.ScrapeInterval + } else { + scrapeInterval = telemetryv1.DefaultScrapeInterval + } + + podMonitor := &monv1.PodMonitor{ + ObjectMeta: metav1.ObjectMeta{ + Name: name, + Namespace: instance.Namespace, + Labels: labels, + }, + Spec: monv1.PodMonitorSpec{ + Selector: selector, + PodMetricsEndpoints: []monv1.PodMetricsEndpoint{ + { + Interval: scrapeInterval, + MetricRelabelConfigs: metricRelabelConfigs, + Scheme: "http", + Port: port, + }, + }, + }, + } + + if tlsEnabled { + tlsConfig := monv1.PodMetricsEndpointTLSConfig{ + SafeTLSConfig: monv1.SafeTLSConfig{ + CA: monv1.SecretOrConfigMap{ + Secret: &v1.SecretKeySelector{ + Key: tls.CABundleKey, + LocalObjectReference: v1.LocalObjectReference{ + Name: instance.Spec.PrometheusTLS.CaBundleSecretName, + }, + }, + }, + ServerName: serverName, + }, + } + podMonitor.Spec.PodMetricsEndpoints[0].TLSConfig = &tlsConfig + podMonitor.Spec.PodMetricsEndpoints[0].Scheme = "https" + } + + return podMonitor +} diff --git a/tests/kuttl/suites/default/tests/01-assert.yaml b/tests/kuttl/suites/default/tests/01-assert.yaml index 9c4f8b0b..a372f3a4 100644 --- a/tests/kuttl/suites/default/tests/01-assert.yaml +++ b/tests/kuttl/suites/default/tests/01-assert.yaml @@ -199,6 +199,8 @@ status: status: "True" - type: NetworkAttachmentsReady status: "True" + - type: PodMonitorReady + status: "True" - type: PrometheusReady status: "True" - type: ScrapeConfigReady diff --git a/tests/kuttl/suites/metricstorage/tests/01-assert.yaml b/tests/kuttl/suites/metricstorage/tests/01-assert.yaml index 7bdff8e4..f5976c5d 100644 --- a/tests/kuttl/suites/metricstorage/tests/01-assert.yaml +++ b/tests/kuttl/suites/metricstorage/tests/01-assert.yaml @@ -51,8 +51,8 @@ spec: - action: labeldrop regex: publisher --- -apiVersion: monitoring.rhobs/v1alpha1 -kind: ScrapeConfig +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor metadata: labels: service: metricStorage @@ -61,15 +61,20 @@ metadata: - kind: MetricStorage name: telemetry-kuttl spec: - scrapeInterval: 30s - metricRelabelings: - - action: labeldrop - regex: pod - - action: labeldrop - regex: namespace - - action: labeldrop - regex: job - - action: labeldrop + podMetricsEndpoints: + - interval: 30s + metricRelabelings: + - action: labeldrop + regex: namespace + - action: labeldrop + regex: job + - action: labeldrop + regex: publisher + port: prometheus + scheme: http + selector: + matchLabels: + app.kubernetes.io/name: rabbitmq --- apiVersion: monitoring.rhobs/v1alpha1 kind: ScrapeConfig diff --git a/tests/kuttl/suites/metricstorage/tests/04-assert.yaml b/tests/kuttl/suites/metricstorage/tests/04-assert.yaml index 291cf2e7..3674dd2e 100644 --- a/tests/kuttl/suites/metricstorage/tests/04-assert.yaml +++ b/tests/kuttl/suites/metricstorage/tests/04-assert.yaml @@ -62,8 +62,8 @@ spec: - action: labeldrop regex: publisher --- -apiVersion: monitoring.rhobs/v1alpha1 -kind: ScrapeConfig +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor metadata: labels: service: metricStorage @@ -72,16 +72,20 @@ metadata: - kind: MetricStorage name: telemetry-kuttl spec: - scrapeInterval: 40s - metricRelabelings: - - action: labeldrop - regex: pod - - action: labeldrop - regex: namespace - - action: labeldrop - regex: job - - action: labeldrop - regex: publisher + podMetricsEndpoints: + - interval: 40s + metricRelabelings: + - action: labeldrop + regex: namespace + - action: labeldrop + regex: job + - action: labeldrop + regex: publisher + port: prometheus + scheme: http + selector: + matchLabels: + app.kubernetes.io/name: rabbitmq --- apiVersion: monitoring.rhobs/v1alpha1 kind: ScrapeConfig diff --git a/tests/kuttl/suites/tls/tests/02-assert.yaml b/tests/kuttl/suites/tls/tests/02-assert.yaml index de9a26e9..742bf64b 100644 --- a/tests/kuttl/suites/tls/tests/02-assert.yaml +++ b/tests/kuttl/suites/tls/tests/02-assert.yaml @@ -316,6 +316,22 @@ metadata: - kind: MetricStorage name: telemetry-kuttl-metricstorage --- +apiVersion: monitoring.rhobs/v1 +kind: PodMonitor +metadata: + labels: + service: metricStorage + name: telemetry-rabbitmq + ownerReferences: + - kind: MetricStorage + name: telemetry-kuttl-metricstorage +spec: + podMetricsEndpoints: + - scheme: https + port: prometheus-tls + tlsConfig: + serverName: rabbitmq.telemetry-kuttl-tests.svc +--- apiVersion: v1 kind: Service metadata: