apiVersion: v1
kind: Alert
app: "rabbitmq"
version: 1.0.0
appVersion:
- '3.8'
descriptionFile: ALERTS.md
configurations:
- kind: Prometheus
  data: |
    groups:
    - name: rabbitmq-cluster-operator
      rules:
      - alert: RabbitMQClusterOperatorUnavailableReplicas
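        # Fires when the operator Deployment has reported one or more unavailable replicas for 5 minutes.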
        expr: |
          kube_deployment_status_replicas_unavailable{deployment="rabbitmq-cluster-operator"}
          >
          0
        for: 5m
        annotations:
          description: |
            `{{ $value }}` replicas are unavailable in Deployment `rabbitmq-cluster-operator`
            in namespace `{{ $labels.namespace }}`.
          summary: |
            There are pods that are either running but not yet available or pods that have not yet been created.
            Check the status of the deployment: `kubectl -n {{ $labels.namespace }} describe deployment rabbitmq-cluster-operator`
            Check the status of the pods: `kubectl -n {{ $labels.namespace }} describe pod -l app.kubernetes.io/component=rabbitmq-cluster-operator`
        labels:
          rulesgroup: rabbitmq-operator
          severity: warning
    - name: rabbitmq
      rules:
      - alert: InsufficientEstablishedErlangDistributionLinks
        # erlang_vm_dist_node_state: 1=pending, 2=up_pending, 3=up
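        # Left-hand side: distribution links in the "up" state, counted per cluster.
        # Right-hand side: the expected full-mesh link count n*(n-1), where n is the node count.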
        expr: |
          count by (namespace, rabbitmq_cluster) (erlang_vm_dist_node_state * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info == 3)
          <
          count by (namespace, rabbitmq_cluster) (rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info)
          *
          (count by (namespace, rabbitmq_cluster) (rabbitmq_build_info * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info) - 1)
        for: 10m
        annotations:
          description: |
            There are only `{{ $value }}` established Erlang distribution links
            in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
          summary: |
            RabbitMQ clusters have a full mesh topology.
            All RabbitMQ nodes connect to all other RabbitMQ nodes in both directions.
            The expected number of established Erlang distribution links is therefore `n*(n-1)` where `n` is the number of RabbitMQ nodes in the cluster.
            For example, the expected number is `0` for a 1-node cluster, `6` for a 3-node cluster, and `20` for a 5-node cluster.
            This alert reports that the number of established distribution links is less than the expected number.
            Some reasons for this alert include failed network links, network partitions, and failed clustering (i.e. nodes can't join the cluster).
            Check the panels `All distribution links`, `Established distribution links`, `Connecting distribution links`, `Waiting distribution links`, and `distribution links`
            of the Grafana dashboard `Erlang-Distribution`.
            Check the logs of the RabbitMQ nodes: `kubectl -n {{ $labels.namespace }} logs -l app.kubernetes.io/component=rabbitmq,app.kubernetes.io/name={{ $labels.rabbitmq_cluster }}`
        labels:
          rulesgroup: rabbitmq
          severity: warning
      - alert: LowDiskWatermarkPredicted
        # The 2nd condition ensures that data points existed between 22 and 24 hours ago, so that no false positive alerts are triggered for newly created RabbitMQ clusters.
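        # predict_linear() fits a linear regression over the last 24h of samples and extrapolates 60*60*24 seconds (24h) into the future.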
        expr: |
          (
            predict_linear(rabbitmq_disk_space_available_bytes[24h], 60*60*24) * on (instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info
            <
            rabbitmq_disk_space_available_limit_bytes * on (instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info
          )
          and
          (
            count_over_time(rabbitmq_disk_space_available_limit_bytes[2h] offset 22h) * on (instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info
            >
            0
          )
        for: 60m
        annotations:
          description: |
            The predicted free disk space in 24 hours from now is `{{ $value | humanize1024 }}B`
            in RabbitMQ node `{{ $labels.rabbitmq_node }}`, pod `{{ $labels.pod }}`,
            RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}`, namespace `{{ $labels.namespace }}`.
          summary: |
            Based on the trend of available disk space over the past 24 hours, a disk alarm is predicted to be triggered within the next 24 hours because free disk space will drop below the free disk space limit.
            This alert is reported for the partition where the RabbitMQ data directory is stored.
            When the disk alarm is triggered, all publishing connections across all cluster nodes will be blocked.
            See
            https://www.rabbitmq.com/alarms.html,
            https://www.rabbitmq.com/disk-alarms.html,
            https://www.rabbitmq.com/production-checklist.html#resource-limits-disk-space,
            https://www.rabbitmq.com/persistence-conf.html,
            https://www.rabbitmq.com/connection-blocked.html.
        labels:
          rulesgroup: rabbitmq
          severity: warning
      - alert: HighConnectionChurn
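        # Churn ratio: (connections closed per second + connections opened per second, averaged over 5m)
        # divided by the current number of connections.
        # The `unless` clause suppresses the alert for clusters with fewer than 100 connections.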
        expr: |
          (
            sum(rate(rabbitmq_connections_closed_total[5m]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info) by(namespace, rabbitmq_cluster)
            +
            sum(rate(rabbitmq_connections_opened_total[5m]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node) rabbitmq_identity_info) by(namespace, rabbitmq_cluster)
          )
          /
          sum (rabbitmq_connections * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info) by (namespace, rabbitmq_cluster)
          > 0.1
          unless
          sum (rabbitmq_connections * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info) by (namespace, rabbitmq_cluster)
          < 100
        for: 10m
        annotations:
          description: |
            Over the last 5 minutes, `{{ $value | humanizePercentage }}`
            of total connections were closed or opened per second in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}`
            in namespace `{{ $labels.namespace }}`.
          summary: |
            More than 10% of total connections are churning.
            This means that client application connections are short-lived instead of long-lived.
            Read https://www.rabbitmq.com/connections.html#high-connection-churn to understand why this is an anti-pattern.
        labels:
          rulesgroup: rabbitmq
          severity: warning
      - alert: NoMajorityOfNodesReady
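        # Fires when ready replicas <= desired replicas / 2, i.e. no strict majority of nodes is ready.
        # The `unless` clause skips StatefulSets scaled down to zero replicas.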
        expr: |
          kube_statefulset_status_replicas_ready * on (namespace, statefulset) group_left(label_app_kubernetes_io_name) kube_statefulset_labels{label_app_kubernetes_io_component="rabbitmq"}
          <=
          kube_statefulset_replicas * on (namespace, statefulset) group_left(label_app_kubernetes_io_name) kube_statefulset_labels{label_app_kubernetes_io_component="rabbitmq"}
          / 2
          unless
          kube_statefulset_replicas * on (namespace, statefulset) group_left(label_app_kubernetes_io_name) kube_statefulset_labels{label_app_kubernetes_io_component="rabbitmq"}
          == 0
        for: 5m
        annotations:
          description: |
            Only `{{ $value }}` replicas are ready in StatefulSet `{{ $labels.statefulset }}`
            of RabbitMQ cluster `{{ $labels.label_app_kubernetes_io_name }}` in namespace `{{ $labels.namespace }}`.
          summary: |
            Less than a majority of nodes have been ready for the last 5 minutes.
            Check the details of the pods:
            `kubectl -n {{ $labels.namespace }} describe pods -l app.kubernetes.io/component=rabbitmq,app.kubernetes.io/name={{ $labels.label_app_kubernetes_io_name }}`
        labels:
          rabbitmq_cluster: '{{ $labels.label_app_kubernetes_io_name }}'
          rulesgroup: rabbitmq
          severity: warning
      - alert: PersistentVolumeMissing
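        # kube_persistentvolumeclaim_status_phase{phase="Bound"} is 1 while the PVC is Bound and 0 otherwise,
        # so == 0 selects PVCs stuck in Pending, Lost, or any other non-Bound phase.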
        expr: |
          kube_persistentvolumeclaim_status_phase{phase="Bound"} * on (namespace, persistentvolumeclaim) group_left(label_app_kubernetes_io_name) kube_persistentvolumeclaim_labels{label_app_kubernetes_io_component="rabbitmq"}
          ==
          0
        for: 10m
        annotations:
          description: |
            PersistentVolumeClaim `{{ $labels.persistentvolumeclaim }}` of
            RabbitMQ cluster `{{ $labels.label_app_kubernetes_io_name }}` in namespace
            `{{ $labels.namespace }}` is not bound.
          summary: |
            RabbitMQ needs a PersistentVolume for its data.
            However, there is no PersistentVolume bound to the PersistentVolumeClaim.
            This means the requested storage could not be provisioned.
            Check the status of the PersistentVolumeClaim: `kubectl -n {{ $labels.namespace }} describe pvc {{ $labels.persistentvolumeclaim }}`.
        labels:
          rabbitmq_cluster: '{{ $labels.label_app_kubernetes_io_name }}'
          rulesgroup: rabbitmq
          severity: critical
      - alert: UnroutableMessages
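        # "dropped" counts unroutable messages published without the mandatory flag (silently discarded);
        # "returned" counts those published with it (sent back to the publisher).
        # Note: no `for` clause, so the alert fires as soon as either counter increases.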
        expr: |
          sum by(namespace, rabbitmq_cluster) (increase(rabbitmq_channel_messages_unroutable_dropped_total[5m]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info)
          >= 1
          or
          sum by(namespace, rabbitmq_cluster) (increase(rabbitmq_channel_messages_unroutable_returned_total[5m]) * on(instance) group_left(rabbitmq_cluster) rabbitmq_identity_info)
          >= 1
        annotations:
          description: |
            There were `{{ $value | printf "%.0f" }}` unroutable messages within the last
            5 minutes in RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}` in namespace
            `{{ $labels.namespace }}`.
          summary: |
            Messages were published into an exchange but could not be routed and were either dropped silently or returned to publishers.
            Is your routing topology set up correctly?
            Check your application code and the bindings between exchanges and queues.
            See
            https://www.rabbitmq.com/publishers.html#unroutable,
            https://www.rabbitmq.com/confirms.html#when-publishes-are-confirmed.
        labels:
          rulesgroup: rabbitmq
          severity: warning
      - alert: FileDescriptorsNearLimit
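        # Ratio of open file descriptors (5m maximum) to the node's file descriptor limit.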
        expr: |
          sum by(namespace, rabbitmq_cluster, pod, rabbitmq_node) (max_over_time(rabbitmq_process_open_fds[5m]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info)
          /
          sum by(namespace, rabbitmq_cluster, pod, rabbitmq_node) (rabbitmq_process_max_fds * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info)
          > 0.8
        for: 10m
        annotations:
          description: |
            `{{ $value | humanizePercentage }}` of the file
            descriptor limit is used in RabbitMQ node `{{ $labels.rabbitmq_node }}`,
            pod `{{ $labels.pod }}`, RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}`,
            namespace `{{ $labels.namespace }}`.
          summary: |
            More than 80% of file descriptors are used on the RabbitMQ node.
            When this value reaches 100%, new connections will not be accepted and disk write operations may fail.
            Client libraries, peer nodes, and CLI tools will not be able to connect when the node runs out of available file descriptors.
            See https://www.rabbitmq.com/production-checklist.html#resource-limits-file-handle-limit.
        labels:
          rulesgroup: rabbitmq
          severity: warning
      - alert: ContainerRestarts
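        # kube_pod_container_status_restarts_total is a cumulative counter; increase() over 10m
        # detects at least one restart within that window.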
        expr: |
          increase(kube_pod_container_status_restarts_total[10m]) * on(namespace, pod, container) group_left(rabbitmq_cluster) rabbitmq_identity_info
          >=
          1
        for: 5m
        annotations:
          description: |
            Over the last 10 minutes, container `{{ $labels.container }}`
            restarted `{{ $value | printf "%.0f" }}` times in pod `{{ $labels.pod }}` of RabbitMQ cluster
            `{{ $labels.rabbitmq_cluster }}` in namespace `{{ $labels.namespace }}`.
          summary: |
            Investigate why the container got restarted.
            Check the logs of the current container: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }}`
            Check the logs of the previous container: `kubectl -n {{ $labels.namespace }} logs {{ $labels.pod }} --previous`
            Check the last state of the container: `kubectl -n {{ $labels.namespace }} get pod {{ $labels.pod }} -o jsonpath='{.status.containerStatuses[].lastState}'`
        labels:
          rabbitmq_cluster: '{{ $labels.rabbitmq_cluster }}'
          rulesgroup: rabbitmq
          severity: warning
      - alert: TCPSocketsNearLimit
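        # Ratio of open TCP sockets (5m maximum) to the node's configured TCP socket limit.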
        expr: |
          sum by(namespace, rabbitmq_cluster, pod, rabbitmq_node) (max_over_time(rabbitmq_process_open_tcp_sockets[5m]) * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info)
          /
          sum by(namespace, rabbitmq_cluster, pod, rabbitmq_node) (rabbitmq_process_max_tcp_sockets * on(instance) group_left(rabbitmq_cluster, rabbitmq_node, pod) rabbitmq_identity_info)
          > 0.8
        for: 10m
        annotations:
          description: |
            `{{ $value | humanizePercentage }}` of the TCP socket
            limit is open in RabbitMQ node `{{ $labels.rabbitmq_node }}`, pod `{{ $labels.pod }}`,
            RabbitMQ cluster `{{ $labels.rabbitmq_cluster }}`, namespace `{{ $labels.namespace }}`.
          summary: |
            More than 80% of TCP sockets are open on the RabbitMQ node.
            When this value reaches 100%, new connections will not be accepted.
            Client libraries, peer nodes, and CLI tools will not be able to connect when the node runs out of available TCP sockets.
            See https://www.rabbitmq.com/networking.html.
        labels:
          rulesgroup: rabbitmq
          severity: warning