Skip to content

Commit ab640eb

Browse files
Merge pull request #939 from rackerlabs/PUC-939-20250529
feat: Adds prometheus monitoring for rabbitmq operator and clusters
2 parents 73daa26 + e33e7bc commit ab640eb

18 files changed

+581
-0
lines changed

operators/rabbitmq-system/kustomization.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,3 +3,4 @@ kind: Kustomization
33
resources:
44
- cluster-operator/
55
- messaging-topology/
6+
- monitoring/
Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
---
# Kustomization for the rabbitmq-system monitoring stack: wires up the
# cluster ServiceMonitor, the operator PodMonitor, and all PrometheusRule
# manifests (per-object queue rules plus cluster/operator health rules).
apiVersion: kustomize.config.k8s.io/v1beta1
kind: Kustomization
namespace: rabbitmq-system

resources:
  - rabbitmq-servicemonitor.yml
  - rabbitmq-cluster-operator-podmonitor.yml
  - ./rules/rabbitmq-per-object/queue-is-growing.yml
  - ./rules/rabbitmq-per-object/queue-has-no-consumers.yml
  - ./rules/rabbitmq/container-restarts.yml
  - ./rules/rabbitmq/high-connection-churn.yml
  - ./rules/rabbitmq/file-descriptors-near-limit.yml
  - ./rules/rabbitmq/low-disk-watermark-predicted.yml
  - ./rules/rabbitmq/unroutable-messages.yml
  - ./rules/rabbitmq/recording-rules.yml
  - ./rules/rabbitmq/no-majority-of-nodes-ready.yml
  - ./rules/rabbitmq/cluster-alarms.yml
  - ./rules/rabbitmq/insufficient-established-erlang-distribution-links.yml
  - ./rules/rabbitmq/persistent-volume-missing.yml
  - ./rules/rabbitmq/tcp-sockets-near-limit.yml
  - ./rules/rabbitmq-cluster-operator/unavailable-replicas.yml
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
---
# PodMonitor: scrapes the `metrics` port of the RabbitMQ cluster operator
# pods (selected by app.kubernetes.io/component=rabbitmq-operator) in the
# rabbitmq-system namespace.
apiVersion: monitoring.coreos.com/v1
kind: PodMonitor
metadata:
  name: rabbitmq-cluster-operator
  namespace: rabbitmq-system
  # If labels are defined in spec.podMonitorSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
spec:
  podMetricsEndpoints:
    - port: metrics
  selector:
    matchLabels:
      app.kubernetes.io/component: rabbitmq-operator
  namespaceSelector:
    matchNames:
      - rabbitmq-system
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
---
# ServiceMonitor: scrapes RabbitMQ cluster metrics in any namespace from
# services labelled app.kubernetes.io/component=rabbitmq. Four endpoints:
# plain and TLS scrapes of the aggregated /metrics endpoint, plus plain and
# TLS scrapes of /metrics/detailed restricted to queue metric families
# (needed by the per-object queue alert rules).
apiVersion: monitoring.coreos.com/v1
kind: ServiceMonitor
metadata:
  name: rabbitmq
  namespace: rabbitmq-system
  # If labels are defined in spec.serviceMonitorSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
spec:
  endpoints:
    - port: prometheus
      scheme: http
      interval: 15s
      scrapeTimeout: 14s
    - port: prometheus-tls
      scheme: https
      interval: 15s
      scrapeTimeout: 14s
      tlsConfig:
        insecureSkipVerify: true  # set to false and uncomment lines below to enable tls verification
        # ca:
        #   secret:
        #     key: ca.crt
        #     name: tls-secret  # name of the secret containing the CA cert which signed the RabbitMQ Prometheus TLS cert
        # serverName: '*.RABBITMQ-INSTANCE-NAME.NAMESPACE.svc.cluster.local'
    - port: prometheus
      scheme: http
      path: /metrics/detailed
      params:
        family:
          - queue_coarse_metrics
          - queue_metrics
      interval: 15s
      scrapeTimeout: 14s
    - port: prometheus-tls
      scheme: https
      path: /metrics/detailed
      params:
        family:
          - queue_coarse_metrics
          - queue_metrics
      interval: 15s
      scrapeTimeout: 14s
      tlsConfig:
        insecureSkipVerify: true
  selector:
    matchLabels:
      app.kubernetes.io/component: rabbitmq
  namespaceSelector:
    any: true
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
---
# PrometheusRule: warns when the rabbitmq-cluster-operator Deployment has
# unavailable replicas for 5 minutes (requires kube-state-metrics for the
# kube_deployment_status_replicas_unavailable series).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-cluster-operator-unavailable-replicas
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq-cluster-operator
      rules:
        - alert: RabbitMQClusterOperatorUnavailableReplicas
          expr: |
            kube_deployment_status_replicas_unavailable{deployment="rabbitmq-cluster-operator"}
            >
            0
          for: 5m
          annotations:
            description: |
              `{{ $value }}` replicas are unavailable in Deployment `rabbitmq-cluster-operator`
              in namespace `{{ $labels.namespace }}`.
            summary: |
              There are pods that are either running but not yet available or pods that still have not been created.
              Check the status of the deployment: `kubectl -n {{ $labels.namespace }} describe deployment rabbitmq-cluster-operator`
              Check the status of the pod: `kubectl -n {{ $labels.namespace }} describe pod -l app.kubernetes.io/component=rabbitmq-cluster-operator`
          labels:
            rulesgroup: rabbitmq-operator
            severity: warning
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
---
# PrometheusRule: warns when a non-empty queue in vhost "/" has had zero
# consumers for 10 minutes. Relies on the per-object series exposed by the
# /metrics/detailed endpoint (rabbitmq_detailed_queue_* families).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-queue-has-no-consumers
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: QueueHasNoConsumers
          expr: |
            (
              ((rabbitmq_detailed_queue_consumers{vhost="/", queue=~".*"} == 0) + rabbitmq_detailed_queue_messages) > 0
            ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
          for: 10m
          annotations:
            description: |
              Over the last 10 minutes, non-empty queue `{{ $labels.queue }}` with {{ $value }} messages
              in virtual host `{{ $labels.vhost }}` didn't have any consumers in
              RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
            summary: |
              Messages are sitting idle in the queue, without any processing.
              This alert is highly application specific (and e.g. doesn't make sense for stream queues).
          labels:
            rulesgroup: rabbitmq
            severity: warning
Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
---
# PrometheusRule: warns when a queue's 10-minute moving average of messages
# has been growing (vs. the same average offset by 1m) for 10 minutes.
# Relies on the per-object /metrics/detailed series.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-queue-is-growing
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: QueueIsGrowing
          # `> 1` because of floating point rounding errors
          expr: |
            (
              avg_over_time(rabbitmq_detailed_queue_messages[10m]) - avg_over_time(rabbitmq_detailed_queue_messages[10m] offset 1m) > 1
            ) * on (instance, job) group_left(rabbitmq_cluster) rabbitmq_identity_info
          for: 10m
          annotations:
            description: |
              Over the last 10 minutes, queue `{{ $labels.queue }}` in virtual host `{{ $labels.vhost }}`
              was growing. 10 minute moving average has grown by {{ $value }}.
              This happens in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
            summary: |
              Queue size is steadily growing over time.
          labels:
            rulesgroup: rabbitmq
            severity: warning
Lines changed: 60 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,60 @@
---
# PrometheusRule: surfaces the three RabbitMQ resource alarms (memory, free
# disk, file descriptors). Each expr takes the 5m max of the alarm gauge and
# joins rabbitmq_identity_info to attach the rabbitmq_cluster label;
# keep_firing_for holds the alert through scrape gaps while pods restart.
# NOTE(review): alert names are inconsistent (MemoryAlarm vs
# RabbitmqDiskAlarm/RabbitmqFileDescriptorAlarm) — kept as-is because
# renaming would change alert routing; confirm intended convention.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-cluster-alarms
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: MemoryAlarm
          expr: |
            max by(rabbitmq_cluster) (
              max_over_time(rabbitmq_alarms_memory_used_watermark[5m])
              * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) max(rabbitmq_identity_info) by (namespace, pod, container, rabbitmq_cluster)
            ) > 0
          keep_firing_for: 5m
          annotations:
            description: |
              RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` memory alarm active. Publishers are blocked.
            summary: |
              A RabbitMQ node reached the `vm_memory_high_watermark` threshold.
              See https://www.rabbitmq.com/docs/alarms#overview, https://www.rabbitmq.com/docs/memory.
          labels:
            rulesgroup: rabbitmq
            severity: warning
        - alert: RabbitmqDiskAlarm
          expr: |
            max by(rabbitmq_cluster) (
              max_over_time(rabbitmq_alarms_free_disk_space_watermark[5m])
              * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) max(rabbitmq_identity_info) by (namespace, pod, container, rabbitmq_cluster)
            ) > 0
          keep_firing_for: 5m
          annotations:
            description: |
              RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` disk alarm active. Publishers are blocked.
            summary: |
              A RabbitMQ node reached the `disk_free_limit` threshold.
              See https://www.rabbitmq.com/docs/alarms#overview, https://www.rabbitmq.com/docs/disk-alarms.
          labels:
            rulesgroup: rabbitmq
            severity: warning
        - alert: RabbitmqFileDescriptorAlarm
          expr: |
            max by(rabbitmq_cluster) (
              max_over_time(rabbitmq_alarms_file_descriptor_limit[5m])
              * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) max(rabbitmq_identity_info) by (namespace, pod, container, rabbitmq_cluster)
            ) > 0
          keep_firing_for: 5m
          annotations:
            description: |
              RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` file descriptor alarm active. Publishers are blocked.
            summary: |
              A RabbitMQ node ran out of file descriptors.
              See https://www.rabbitmq.com/docs/alarms#file-descriptors.
          labels:
            rulesgroup: rabbitmq
            severity: warning
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
---
# PrometheusRule: warns when a container in a RabbitMQ cluster pod restarted
# at least once within 10 minutes. Joins kube-state-metrics restart counts
# with rabbitmq_identity_info to scope the alert to RabbitMQ clusters.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-container-restarts
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: ContainerRestarts
          expr: |
            increase(kube_pod_container_status_restarts_total[10m]) * on(namespace, pod, container) group_left(rabbitmq_cluster) max(rabbitmq_identity_info) by (namespace, pod, container, rabbitmq_cluster)
            >=
            1
          for: 5m
          annotations:
            description: |
              Over the last 10 minutes, container `{{ $labels.container }}`
              restarted `{{ $value | printf "%.0f" }}` times in pod `{{ $labels.pod }}` of RabbitMQ cluster
              `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
            summary: |
              Investigate why the container got restarted.
              Check the logs of the current container: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }}`
              Check the logs of the previous container: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }} --previous`
              Check the last state of the container: `kubectl -n {{ $labels.namespace }} get pod {{ $labels.pod }} -o jsonpath='{.status.containerStatuses[].lastState}'`
          labels:
            rabbitmq_cluster: '{{ $labels.rabbitmq_cluster }}'
            rulesgroup: rabbitmq
            severity: warning
Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
---
# PrometheusRule: warns when a RabbitMQ node has used more than 80% of its
# file descriptor limit for 10 minutes (5m max of open FDs divided by the
# node's max FDs, per namespace/cluster/pod/node).
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
  name: rabbitmq-file-descriptors-near-limit
  # If labels are defined in spec.ruleSelector.matchLabels of your deployed Prometheus object, make sure to include them here.
  labels:
    role: alert-rules
spec:
  groups:
    - name: rabbitmq
      rules:
        - alert: FileDescriptorsNearLimit
          expr: |
            sum by(namespace, rabbitmq_cluster, pod, rabbitmq_node) (max_over_time(rabbitmq_process_open_fds[5m]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) max(rabbitmq_identity_info) by (namespace, pod, container, rabbitmq_cluster))
            /
            sum by(namespace, rabbitmq_cluster, pod, rabbitmq_node) (rabbitmq_process_max_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) max(rabbitmq_identity_info) by (namespace, pod, container, rabbitmq_cluster))
            > 0.8
          for: 10m
          annotations:
            description: |
              `{{ $value | humanizePercentage }}` file descriptors of file
              descriptor limit are used in RabbitMQ node `{{ $labels.rabbitmq_node }}`,
              pod `{{ $labels.pod }}`, RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}`,
              namespace `{{ $labels.namespace }}`.
            summary: |
              More than 80% of file descriptors are used on the RabbitMQ node.
              When this value reaches 100%, new connections will not be accepted and disk write operations may fail.
              Client libraries, peer nodes and CLI tools will not be able to connect when the node runs out of available file descriptors.
              See https://www.rabbitmq.com/production-checklist.html#resource-limits-file-handle-limit.
          labels:
            rulesgroup: rabbitmq
            severity: warning

0 commit comments

Comments (0)