14 changes: 14 additions & 0 deletions .github/workflows/ci.yaml
Expand Up @@ -42,6 +42,20 @@ jobs:
- name: Upload Coverage to Codecov
uses: codecov/codecov-action@v5

alert-test:
name: Test Prometheus Alert Rules
runs-on: ubuntu-latest
Contributor

Suggested change
runs-on: ubuntu-latest
runs-on: ubuntu-latest
timeout-minutes: 5

timeout-minutes: 5
steps:
- name: Checkout repo
uses: actions/checkout@v4
- name: Install prometheus snap
run: sudo snap install prometheus
- name: Check validity of prometheus alert rules
run: promtool check rules src/prometheus_alert_rules/*.yaml
- name: Run unit tests for prometheus alert rules
run: promtool test rules tests/alerts/*.yaml

build:
name: Build charm
uses: canonical/data-platform-workflows/.github/workflows/[email protected]
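The new promtool test step expects unit-test files under tests/alerts/. A minimal sketch of what one such file might look like, exercising the MySQLDown rule changed below (the file name, rule-file path, and series values are illustrative assumptions, not part of this PR):

# tests/alerts/test_general_rules.yaml (hypothetical name)
rule_files:
  - ../../src/prometheus_alert_rules/general_rules.yaml  # assumed path to the rules under test

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # mysql_up drops to 0 at the 3-minute mark
      - series: 'mysql_up{instance="mysql-k8s-0"}'
        values: '1 1 1 0 0 0'
    alert_rule_test:
      - eval_time: 5m
        alertname: MySQLDown
        exp_alerts:
          # alert labels = series labels + rule labels
          - exp_labels:
              severity: critical
              instance: mysql-k8s-0
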
@@ -1,20 +1,30 @@
groups:
- name: MySQLExporterK8s

- name: MySQL General Alert Rules
rules:
# 2.1.1
- alert: MySQLDown
expr: "mysql_up == 0"
expr: mysql_up == 0
for: 0m
Copilot AI Aug 28, 2025

[nitpick] Setting 'for: 0m' makes the alert fire immediately without any grace period. Consider if a brief delay (like 1m or 2m) would be more appropriate to avoid false positives from temporary network issues.

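A possible adjustment, if a grace period is wanted (the 2m value is only illustrative):

- alert: MySQLDown
  expr: mysql_up == 0
  for: 2m  # tolerate brief scrape or network blips before firing
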

labels:
severity: critical
annotations:
summary: MySQL instance {{ $labels.instance }} is down.
summary: MySQL instance {{ $labels.instance }} is down.
description: |
The MySQL instance is not reachable.
Please check that the MySQL process is running and that there is network connectivity.
LABELS = {{ $labels }}.

- alert: MySQLMetricsScrapeError
expr: increase(mysql_exporter_last_scrape_error[5m]) > 1
for: 0m
labels:
severity: warning
annotations:
summary: MySQL instance {{ $labels.instance }} has a metrics scrape error.
description: |
The MySQL Exporter encountered an error while scraping metrics.
Check the MySQL Exporter logs for more details.
LABELS = {{ $labels }}.

# 2.1.2
# customized: 80% -> 90%
- alert: MySQLTooManyConnections(>90%)
expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 90
for: 2m
@@ -24,10 +34,8 @@ groups:
summary: MySQL instance {{ $labels.instance }} is using > 90% of `max_connections`.
description: |
Consider checking the client application responsible for generating those additional connections.
LABELS = {{ $labels }}.
LABELS = {{ $labels }}.

# 2.1.4
# customized: 60% -> 80%
- alert: MySQLHighThreadsRunning
expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 80
for: 2m
@@ -36,10 +44,9 @@ groups:
annotations:
summary: MySQL instance {{ $labels.instance }} is actively using > 80% of `max_connections`.
description: |
Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server.
LABELS = {{ $labels }}.
Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server.
LABELS = {{ $labels }}.

# 2.1.3
- alert: MySQLHighPreparedStatementsUtilization(>80%)
expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80
for: 2m
@@ -48,36 +55,32 @@ groups:
annotations:
summary: MySQL instance {{ $labels.instance }} is using > 80% of `max_prepared_stmt_count`.
description: |
Too many prepared statements might consume a lot of memory.
LABELS = {{ $labels }}.
Too many prepared statements might consume a lot of memory.
LABELS = {{ $labels }}.

# 2.1.8
# customized: warning -> info
- alert: MySQLSlowQueries
expr: increase(mysql_global_status_slow_queries[1m]) > 0
for: 2m
labels:
severity: info
annotations:
summary: MySQL instance {{ $labels.instance }} has a slow query.
summary: MySQL instance {{ $labels.instance }} has slow queries.
description: |
Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes.
Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes.
LABELS = {{ $labels }}.

# 2.1.9
- alert: MySQLInnoDBLogWaits
expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
for: 0m
labels:
severity: warning
annotations:
summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits.
summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits.
description: |
MySQL InnoDB log writes might be stalling.
Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema.
MySQL InnoDB log writes might be stalling.
Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema.
LABELS = {{ $labels }}.

# 2.1.10
- alert: MySQLRestarted
expr: mysql_global_status_uptime < 60
for: 0m
@@ -86,6 +89,18 @@ groups:
annotations:
summary: MySQL instance {{ $labels.instance }} restarted.
description: |
MySQL restarted less than one minute ago.
If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`).
MySQL restarted less than one minute ago.
If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`).
LABELS = {{ $labels }}.

- alert: MySQLConnectionErrors
expr: increase(mysql_global_status_connection_errors_total[5m]) > 10
for: 0m
labels:
severity: warning
annotations:
summary: MySQL instance {{ $labels.instance }} has connection errors.
description: |
Connection errors might indicate network issues, authentication problems, or resource limitations.
Check the MySQL logs for more details.
LABELS = {{ $labels }}.
91 changes: 91 additions & 0 deletions src/prometheus_alert_rules/replication_rules.yaml
@@ -0,0 +1,91 @@
groups:
- name: MySQL Replication Alert Rules
rules:
- alert: MySQLClusterUnitOffline
expr: mysql_perf_schema_replication_group_member_info{member_state="OFFLINE"} == 1
for: 5m
labels:
severity: warning
annotations:
summary: MySQL cluster member {{ $labels.instance }} is offline
description: |
The MySQL member is marked offline in the cluster, although the process might still be running.
If this is unexpected, please check the logs.
LABELS = {{ $labels }}.

- alert: MySQLClusterNoPrimary
Contributor

@Deezzir
For whatever reason, this is not firing properly when the cluster loses its quorum.
To test this on k8s, you can run kubectl delete pod -n <model> mysql-k8s-0 mysql-k8s-1 on a three-node cluster where the primary is included in the delete command.

Contributor


Also not firing when all three units are offline

expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY",member_state="ONLINE"})
for: 0m
labels:
severity: critical
annotations:
summary: MySQL cluster reports no primary
description: |
MySQL reports no primary. The cluster will likely be in a read-only state.
Please check the cluster health and the logs, and investigate.
LABELS = {{ $labels }}.
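
A promtool unit-test sketch along these lines might help reproduce the scenario described in the comments above (the test file path, labels, and use of the stale marker are assumptions; note that in a real outage the series simply stops being scraped, so absent() only returns once Prometheus's staleness window of roughly five minutes has passed):

# tests/alerts/test_no_primary.yaml (hypothetical name)
rule_files:
  - ../../src/prometheus_alert_rules/replication_rules.yaml

evaluation_interval: 1m

tests:
  - interval: 1m
    input_series:
      # The primary is ONLINE for three minutes, then every member series
      # disappears, as when all pods are deleted at once.
      - series: 'mysql_perf_schema_replication_group_member_info{member_role="PRIMARY",member_state="ONLINE",instance="mysql-k8s-0"}'
        values: '1 1 1 stale'
    alert_rule_test:
      - eval_time: 10m
        alertname: MySQLClusterNoPrimary
        exp_alerts:
          # absent() carries the equality matchers from the selector into its result,
          # so member_role and member_state show up in the alert labels.
          - exp_labels:
              severity: critical
              member_role: PRIMARY
              member_state: ONLINE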

- alert: MySQLClusterTooManyPrimaries
expr: count(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"}) > 1
for: 15m
labels:
severity: critical
annotations:
summary: MySQL cluster reports more than one primary.
description: |
MySQL reports more than one primary. This can indicate a split-brain situation.
Please refer to the troubleshooting docs.
LABELS = {{ $labels }}.

- alert: MySQLNoReplication
expr: absent(mysql_perf_schema_replication_group_member_info{member_role="SECONDARY"})
for: 15m
labels:
severity: warning
annotations:
summary: MySQL cluster has no secondaries.
description: |
The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime.
Please check the cluster health and the logs, and investigate.
LABELS = {{ $labels }}.

- alert: MySQLGroupReplicationReduced
expr: |
count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)
<
max_over_time(
count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)[6h:]
)
for: 5m
labels:
severity: warning
annotations:
summary: MySQL cluster's Group Replication size reduced
description: |
The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours.
Please check the cluster health and the logs, and investigate.
LABELS = {{ $labels }}.

- alert: MySQLGroupReplicationConflicts
expr: rate(mysql_perf_schema_conflicts_detected[5m]) > 0
for: 5m
labels:
severity: warning
annotations:
summary: MySQL cluster reports Group Replication conflicts
description: |
Conflicts indicate concurrent writes to the same rows/keys across members.
Please check the cluster health and the logs, and investigate.
LABELS = {{ $labels }}.

- alert: MySQLGroupReplicationQueueSizeHigh
expr: mysql_perf_schema_transactions_in_queue > 100
for: 5m
labels:
severity: warning
annotations:
summary: MySQL cluster reports high Group Replication queue size
description: |
A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes.
Please check the cluster health and the logs, and investigate.
LABELS = {{ $labels }}.