Skip to content

Commit bf095e5

Browse files
committed
[OSPRH-21020] Change rabbitmq to use podmonitors
RabbitMQ can be deployed with multiple replicas, and each replica exports different metrics. Having a ScrapeConfig with a service as a target means we're not scraping all metrics. A PodMonitor will find all RabbitMQ pods based on a label, so all pods get scraped every time.
1 parent bffdcca commit bf095e5

File tree

6 files changed

+326
-103
lines changed

6 files changed

+326
-103
lines changed

api/v1beta1/conditions.go

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,9 @@ const (
3939
// ScrapeConfigReadyCondition Status=True condition which indicates if the ScrapeConfig is configured and operational
4040
ScrapeConfigReadyCondition condition.Type = "ScrapeConfigReady"
4141

42+
// PodMonitorReadyCondition Status=True condition which indicates if the PodMonitor is configured and operational
43+
PodMonitorReadyCondition condition.Type = "PodMonitorReady"
44+
4245
// PrometheusReadyCondition Status=True condition which indicates if the Prometheus watch is operational
4346
PrometheusReadyCondition condition.Type = "PrometheusReady"
4447

@@ -169,6 +172,15 @@ const (
169172
// ScrapeConfigUnableToOwnMessage
170173
ScrapeConfigUnableToOwnMessage = "Error occured when trying to own %s"
171174

175+
//
176+
// PodMonitorReady condition messages
177+
//
178+
// PodMonitorReadyInitMessage
179+
PodMonitorReadyInitMessage = "PodMonitor not started"
180+
181+
// PodMonitorUnableToOwnMessage
182+
PodMonitorUnableToOwnMessage = "Error occured when trying to own %s"
183+
172184
//
173185
// PrometheusReady condition messages
174186
//

config/rbac/role.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,7 @@ rules:
151151
- monitoring.rhobs
152152
resources:
153153
- monitoringstacks
154+
- podmonitors
154155
- prometheusrules
155156
- scrapeconfigs
156157
verbs:

controllers/metricstorage_controller.go

Lines changed: 115 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,7 @@ func (r *MetricStorageReconciler) GetLogger(ctx context.Context) logr.Logger {
124124
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=monitoringstacks,verbs=get;list;watch;create;update;patch;delete
125125
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=servicemonitors,verbs=get;list;delete
126126
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=scrapeconfigs,verbs=get;list;watch;create;update;patch;delete
127+
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=podmonitors,verbs=get;list;watch;create;update;patch;delete
127128
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=prometheusrules,verbs=get;list;watch;create;update;patch;delete
128129
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=prometheuses,verbs=get;list;watch;update;patch;delete
129130
//+kubebuilder:rbac:groups=monitoring.rhobs,resources=alertmanagers,verbs=get;list;watch;update;patch;delete
@@ -202,6 +203,7 @@ func (r *MetricStorageReconciler) Reconcile(ctx context.Context, req ctrl.Reques
202203
condition.UnknownCondition(condition.ReadyCondition, condition.InitReason, condition.ReadyInitMessage),
203204
condition.UnknownCondition(telemetryv1.MonitoringStackReadyCondition, condition.InitReason, telemetryv1.MonitoringStackReadyInitMessage),
204205
condition.UnknownCondition(telemetryv1.ScrapeConfigReadyCondition, condition.InitReason, telemetryv1.ScrapeConfigReadyInitMessage),
206+
condition.UnknownCondition(telemetryv1.PodMonitorReadyCondition, condition.InitReason, telemetryv1.PodMonitorReadyInitMessage),
205207
condition.UnknownCondition(telemetryv1.DashboardPrometheusRuleReadyCondition, condition.InitReason, telemetryv1.DashboardPrometheusRuleReadyInitMessage),
206208
condition.UnknownCondition(telemetryv1.DashboardPluginReadyCondition, condition.InitReason, telemetryv1.DashboardPluginReadyInitMessage),
207209
condition.UnknownCondition(telemetryv1.DashboardDatasourceReadyCondition, condition.InitReason, telemetryv1.DashboardDatasourceReadyInitMessage),
@@ -247,8 +249,6 @@ func (r *MetricStorageReconciler) reconcileDelete(
247249
return ctrl.Result{}, nil
248250
}
249251

250-
// TODO: call the function appropriately
251-
//
252252
//nolint:all
253253
func (r *MetricStorageReconciler) reconcileUpdate(
254254
ctx context.Context,
@@ -262,6 +262,10 @@ func (r *MetricStorageReconciler) reconcileUpdate(
262262
if err != nil {
263263
return ctrl.Result{}, err
264264
}
265+
err = r.deleteRabbitMQScrapeConfig(ctx, instance)
266+
if err != nil {
267+
return ctrl.Result{}, err
268+
}
265269

266270
Log.Info(fmt.Sprintf("Reconciled Service '%s' update successfully", instance.Name))
267271

@@ -295,6 +299,33 @@ func (r *MetricStorageReconciler) deleteOldServiceMonitors(
295299
return nil
296300
}
297301

302+
// Delete RabbitMQ ScrapeConfig
303+
// A ScrapeConfig for RabbitMQ was last used at the beginning of FR4
304+
func (r *MetricStorageReconciler) deleteRabbitMQScrapeConfig(
305+
ctx context.Context,
306+
instance *telemetryv1.MetricStorage,
307+
) error {
308+
namespacedName := types.NamespacedName{
309+
Name: fmt.Sprintf("%s-rabbitmq", telemetry.ServiceName),
310+
Namespace: instance.Namespace,
311+
}
312+
scrapeConfig := &monv1alpha1.ScrapeConfig{}
313+
err := r.Get(ctx, namespacedName, scrapeConfig)
314+
if err != nil {
315+
if k8s_errors.IsNotFound(err) {
316+
return nil
317+
}
318+
return err
319+
}
320+
if object.CheckOwnerRefExist(instance.UID, scrapeConfig.OwnerReferences) {
321+
err = r.Delete(ctx, scrapeConfig)
322+
if err != nil {
323+
return err
324+
}
325+
}
326+
return nil
327+
}
328+
298329
func (r *MetricStorageReconciler) reconcileNormal(
299330
ctx context.Context,
300331
instance *telemetryv1.MetricStorage,
@@ -453,6 +484,11 @@ func (r *MetricStorageReconciler) reconcileNormal(
453484
return res, err
454485
}
455486

487+
// Deploy PodMonitors
488+
if res, err := r.createPodMonitors(ctx, instance, eventHandler); err != nil {
489+
return res, err
490+
}
491+
456492
if !instance.Spec.DashboardsEnabled {
457493
if res, err := metricstorage.DeleteDashboardObjects(ctx, instance, helper); err != nil {
458494
return res, err
@@ -622,6 +658,14 @@ func (r *MetricStorageReconciler) reconcileNormal(
622658
// when job passed, mark NetworkAttachmentsReadyCondition ready
623659
instance.Status.Conditions.MarkTrue(condition.NetworkAttachmentsReadyCondition, condition.NetworkAttachmentsReadyMessage)
624660

661+
// Handle service update
662+
ctrlResult, err := r.reconcileUpdate(ctx, instance, helper)
663+
if err != nil {
664+
return ctrlResult, err
665+
} else if (ctrlResult != ctrl.Result{}) {
666+
return ctrlResult, nil
667+
}
668+
625669
if instance.Status.Conditions.AllSubConditionIsTrue() {
626670
instance.Status.Conditions.MarkTrue(
627671
condition.ReadyCondition, condition.ReadyMessage)
@@ -684,6 +728,31 @@ func (r *MetricStorageReconciler) prometheusEndpointSecret(
684728
return nil
685729
}
686730

731+
func (r *MetricStorageReconciler) createPodMonitor(
732+
ctx context.Context,
733+
instance *telemetryv1.MetricStorage,
734+
log logr.Logger,
735+
desiredPodMonitor *monv1.PodMonitor,
736+
) error {
737+
podMonitor := &monv1.PodMonitor{
738+
ObjectMeta: metav1.ObjectMeta{
739+
Name: desiredPodMonitor.Name,
740+
Namespace: instance.Namespace,
741+
},
742+
}
743+
op, err := controllerutil.CreateOrPatch(ctx, r.Client, podMonitor, func() error {
744+
desiredPodMonitor.Spec.DeepCopyInto(&podMonitor.Spec)
745+
podMonitor.Labels = desiredPodMonitor.Labels
746+
err := controllerutil.SetControllerReference(instance, podMonitor, r.Scheme)
747+
return err
748+
})
749+
750+
if err == nil && op != controllerutil.OperationResultNone {
751+
log.Info(fmt.Sprintf("PodMonitor %s successfully changed - operation: %s", podMonitor.GetName(), string(op)))
752+
}
753+
return err
754+
}
755+
687756
func (r *MetricStorageReconciler) createServiceScrapeConfig(
688757
ctx context.Context,
689758
instance *telemetryv1.MetricStorage,
@@ -711,6 +780,50 @@ func (r *MetricStorageReconciler) createServiceScrapeConfig(
711780
return err
712781
}
713782

783+
func (r *MetricStorageReconciler) createPodMonitors(
784+
ctx context.Context,
785+
instance *telemetryv1.MetricStorage,
786+
eventHandler handler.EventHandler,
787+
) (ctrl.Result, error) {
788+
Log := r.GetLogger(ctx)
789+
err := r.ensureWatches(ctx, "podmonitors.monitoring.rhobs", &monv1.PodMonitor{}, eventHandler)
790+
if err != nil {
791+
instance.Status.Conditions.MarkFalse(telemetryv1.PodMonitorReadyCondition,
792+
condition.Reason("Can't own PodMonitor resource. The Cluster Observability Operator probably isn't installed"),
793+
condition.SeverityError,
794+
telemetryv1.PodMonitorUnableToOwnMessage, err)
795+
Log.Info("Can't own PodMonitor resource. The Cluster Observability Operator probably isn't installed")
796+
return ctrl.Result{RequeueAfter: telemetryv1.PauseBetweenWatchAttempts}, nil
797+
}
798+
799+
// PodMonitors for RabbitMQ monitoring
800+
// NOTE: We're watching Rabbits and reconciling with each of their change
801+
// that should keep the PodMonitors always up to date.
802+
rabbitList := &rabbitmqv1.RabbitmqClusterList{}
803+
listOpts := []client.ListOption{
804+
client.InNamespace(instance.GetNamespace()),
805+
}
806+
err = r.List(ctx, rabbitList, listOpts...)
807+
if err != nil && !k8s_errors.IsNotFound(err) {
808+
return ctrl.Result{}, err
809+
}
810+
for _, rabbit := range rabbitList.Items {
811+
desiredPodMonitor := metricstorage.RabbitMQPodMonitor(
812+
instance,
813+
serviceLabels,
814+
rabbit.Name,
815+
instance.Spec.PrometheusTLS.Enabled(),
816+
)
817+
err = r.createPodMonitor(ctx, instance, Log, desiredPodMonitor)
818+
if err != nil {
819+
return ctrl.Result{}, err
820+
}
821+
}
822+
823+
instance.Status.Conditions.MarkTrue(telemetryv1.PodMonitorReadyCondition, condition.ReadyMessage)
824+
return ctrl.Result{}, nil
825+
}
826+
714827
func (r *MetricStorageReconciler) createScrapeConfigs(
715828
ctx context.Context,
716829
instance *telemetryv1.MetricStorage,
@@ -760,36 +873,6 @@ func (r *MetricStorageReconciler) createScrapeConfigs(
760873
return ctrl.Result{}, err
761874
}
762875

763-
// ScrapeConfigs for RabbitMQ monitoring
764-
// NOTE: We're watching Rabbits and reconciling with each of their change
765-
// that should keep the targets inside the ScrapeConfig always
766-
// up to date.
767-
rabbitList := &rabbitmqv1.RabbitmqClusterList{}
768-
listOpts := []client.ListOption{
769-
client.InNamespace(instance.GetNamespace()),
770-
}
771-
err = r.List(ctx, rabbitList, listOpts...)
772-
if err != nil && !k8s_errors.IsNotFound(err) {
773-
return ctrl.Result{}, err
774-
}
775-
rabbitTargets := []string{}
776-
for _, rabbit := range rabbitList.Items {
777-
rabbitServerName := fmt.Sprintf("%s.%s.svc", rabbit.Name, rabbit.Namespace)
778-
rabbitTargets = append(rabbitTargets, net.JoinHostPort(rabbitServerName, strconv.Itoa(metricstorage.RabbitMQPrometheusPort)))
779-
}
780-
rabbitCfgName := fmt.Sprintf("%s-rabbitmq", telemetry.ServiceName)
781-
desiredScrapeConfig = metricstorage.ScrapeConfig(
782-
instance,
783-
serviceLabels,
784-
rabbitTargets,
785-
instance.Spec.PrometheusTLS.Enabled(),
786-
)
787-
err = r.createServiceScrapeConfig(ctx, instance, Log, "RabbitMQ",
788-
rabbitCfgName, desiredScrapeConfig)
789-
if err != nil {
790-
return ctrl.Result{}, err
791-
}
792-
793876
// mysqld exporter
794877
ceilometerNamespacedName := types.NamespacedName{
795878
Name: ceilometer.ServiceName,

0 commit comments

Comments
 (0)