From 0f8417378d7506f48fde5d4fe05502b9e997a4ca Mon Sep 17 00:00:00 2001
From: Deezzir <yurii.kondrakov@canonical.com>
Date: Tue, 26 Aug 2025 22:04:40 -0400
Subject: [PATCH 1/3] feat: gp alerts

---
 .github/workflows/ci.yaml                     |  13 ++
 ...cs_alert_rules.yaml => general_rules.yaml} |  67 +++---
 .../replication_rules.yaml                    |  91 ++++++++
 tests/alerts/test_general_alerts.yaml         | 196 ++++++++++++++++++
 tests/alerts/test_replication_alerts.yaml     | 161 ++++++++++++++
 5 files changed, 502 insertions(+), 26 deletions(-)
 rename src/prometheus_alert_rules/{metrics_alert_rules.yaml => general_rules.yaml} (63%)
 create mode 100644 src/prometheus_alert_rules/replication_rules.yaml
 create mode 100644 tests/alerts/test_general_alerts.yaml
 create mode 100644 tests/alerts/test_replication_alerts.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index e9b685a6ce..f91491322b 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -42,6 +42,19 @@ jobs:
       - name: Upload Coverage to Codecov
         uses: codecov/codecov-action@v5
 
+  alert-test:
+    name: Test Prometheus Alert Rules
+    runs-on: ubuntu-latest
+    steps:
+      - name: Checkout repo
+        uses: actions/checkout@v3
+      - name: Install prometheus snap
+        run: sudo snap install prometheus
+      - name: Check validity of prometheus alert rules
+        run: promtool check rules src/prometheus_alert_rules/*.yaml
+      - name: Run unit tests for prometheus alert rules
+        run: promtool test rules tests/alerts/*.yaml
+
   build:
     name: Build charm
     uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v32.2.1
diff --git a/src/prometheus_alert_rules/metrics_alert_rules.yaml b/src/prometheus_alert_rules/general_rules.yaml
similarity index 63%
rename from src/prometheus_alert_rules/metrics_alert_rules.yaml
rename to src/prometheus_alert_rules/general_rules.yaml
index ff0a604c15..d07a273732 100644
--- a/src/prometheus_alert_rules/metrics_alert_rules.yaml
+++ b/src/prometheus_alert_rules/general_rules.yaml
@@ -1,20 +1,30 @@
 groups:
-  - name: MySQLExporterK8s
-
+  - name: MySQL General Alert Rules
     rules:
-      # 2.1.1
       - alert: MySQLDown
-        expr: "mysql_up == 0"
+        expr: mysql_up == 0
         for: 0m
         labels:
           severity: critical
         annotations:
-          summary: MySQL instance {{ $labels.instance }} is down. 
+          summary: MySQL instance {{ $labels.instance }} is down.
           description: |
+            The MySQL instance is not reachable.
+            Please check if the MySQL process is running and the network connectivity.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLMetricsScrapeError
+        expr: max_over_time(mysql_exporter_last_scrape_error[5m]) > 0  # 0/1 gauge (1 = last scrape failed), so look for any failure in the window instead of a counter increase
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL instance {{ $labels.instance }} has a metrics scrape error.
+          description: |
+            The MySQL Exporter encountered an error while scraping metrics.
+            Check the MySQL Exporter logs for more details.
             LABELS = {{ $labels }}.
 
-      # 2.1.2
-      # customized: 80% -> 90%
       - alert: MySQLTooManyConnections(>90%)
         expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 90
         for: 2m
@@ -24,10 +34,8 @@ groups:
           summary: MySQL instance {{ $labels.instance }} is using > 90% of `max_connections`.
           description: |
             Consider checking the client application responsible for generating those additional connections.
-            LABELS = {{ $labels }}. 
+            LABELS = {{ $labels }}.
 
-      # 2.1.4
-      # customized: 60% -> 80%
       - alert: MySQLHighThreadsRunning
         expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 80
         for: 2m
@@ -36,10 +44,9 @@ groups:
         annotations:
           summary: MySQL instance {{ $labels.instance }} is actively using > 80% of `max_connections`.
           description: |
-            Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. 
-            LABELS = {{ $labels }}. 
+            Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server.
+            LABELS = {{ $labels }}.
             
-      # 2.1.3
       - alert: MySQLHighPreparedStatementsUtilization(>80%)
         expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80
         for: 2m
@@ -48,36 +55,32 @@ groups:
         annotations:
           summary:  MySQL instance {{ $labels.instance }} is using > 80% of `max_prepared_stmt_count`.
           description: |
-            Too many prepared statements might consume a lot of memory. 
-            LABELS = {{ $labels }}. 
+            Too many prepared statements might consume a lot of memory.
+            LABELS = {{ $labels }}.
 
-      # 2.1.8
-      # customized: warning -> info
       - alert: MySQLSlowQueries
         expr: increase(mysql_global_status_slow_queries[1m]) > 0
         for: 2m
         labels:
           severity: info
         annotations:
-          summary: MySQL instance {{ $labels.instance }} has a slow query.
+          summary: MySQL instance {{ $labels.instance }} has slow queries.
           description: |
-            Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. 
+            Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes.
             LABELS = {{ $labels }}.
 
-      # 2.1.9
       - alert: MySQLInnoDBLogWaits
         expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10
         for: 0m
         labels:
           severity: warning
         annotations:
-          summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits. 
+          summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits.
           description: |
-            MySQL InnoDB log writes might be stalling. 
-            Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. 
+            MySQL InnoDB log writes might be stalling.
+            Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema.
             LABELS = {{ $labels }}.
 
-      # 2.1.10
       - alert: MySQLRestarted
         expr: mysql_global_status_uptime < 60
         for: 0m
@@ -86,6 +89,18 @@ groups:
         annotations:
           summary: MySQL instance {{ $labels.instance }} restarted.
           description: |
-            MySQL restarted less than one minute ago. 
-            If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). 
+            MySQL restarted less than one minute ago.
+            If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`).
+            LABELS = {{ $labels }}.
+      
+      - alert: MySQLConnectionErrors
+        expr: increase(mysql_global_status_connection_errors_total[5m]) > 10
+        for: 0m
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL instance {{ $labels.instance }} has connection errors.
+          description: |
+            Connection errors might indicate network issues, authentication problems, or resource limitations.
+            Check the MySQL logs for more details.
             LABELS = {{ $labels }}.
diff --git a/src/prometheus_alert_rules/replication_rules.yaml b/src/prometheus_alert_rules/replication_rules.yaml
new file mode 100644
index 0000000000..694dcd984e
--- /dev/null
+++ b/src/prometheus_alert_rules/replication_rules.yaml
@@ -0,0 +1,91 @@
+groups:
+  - name: MySQL Replication Alert Rules
+    rules:
+      - alert: MySQLClusterUnitOffline
+        expr: mysql_perf_schema_replication_group_member_info{member_state="OFFLINE"} == 1
+        for: 5m
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL cluster member {{ $labels.instance }} is offline
+          description: |
+            The MySQL member is marked offline in the cluster, although the process might still be running.
+            If this is unexptected, please check the logs.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLClusterNoPrimary
+        expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"})
+        for: 0m
+        labels:
+          severity: critical
+        annotations:
+          summary: MySQL cluster reports no primariy
+          description: |
+            MySQL has no primaries. The cluster will likely be in a Read-Only state.
+            Please check the cluster health, the logs and investigate.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLClusterTooManyPrimaries
+        expr: count(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"}) > 1
+        for: 15m
+        labels:
+          severity: critical
+        annotations:
+          summary: MySQL cluster reports more than one primary.
+          description: |
+            MySQL reports more than one primary. This is can indicate a split-brain situation.
+            Please refer to the troubleshooting docs.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLNoReplication
+        expr: absent(mysql_perf_schema_replication_group_member_info{member_role="SECONDARY"})  # absent() yields a value only when no series matches the selector
+        for: 15m  # the absence has to persist for 15m before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL cluster has no secondaries.
+          description: |
+            The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime.
+            Please check the cluster health, the logs and investigate.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLGroupReplicationReduced
+        expr: |
+          count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)
+          <
+          max_over_time(
+            count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)[6h:]
+          )
+        for: 15m
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL cluster's Group Replication size reduced
+          description: |
+            The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours.
+            Please check the cluster health, the logs and investigate.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLGroupReplicationConflicts
+        expr: rate(mysql_perf_schema_conflicts_detected[5m]) > 0  # per-second rate over 5m; any growth at all satisfies the condition
+        for: 5m  # the rate has to stay above zero for 5m before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL cluster reports Group Replication conflicts
+          description: |
+            Conflicts indicate concurrent writes to the same rows/keys across members.
+            Please check the cluster health, the logs and investigate.
+            LABELS = {{ $labels }}.
+
+      - alert: MySQLGroupReplicationQueueSizeHigh
+        expr: mysql_perf_schema_transactions_in_queue > 100  # compares the current value of each series, no smoothing
+        for: 5m  # the queue has to stay above 100 for 5m before the alert fires
+        labels:
+          severity: warning
+        annotations:
+          summary: MySQL cluster reports high Group Replication queue size
+          description: |
+            A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes.
+            Please check the cluster health, the logs and investigate.
+            LABELS = {{ $labels }}.
diff --git a/tests/alerts/test_general_alerts.yaml b/tests/alerts/test_general_alerts.yaml
new file mode 100644
index 0000000000..d3a4aaa4f3
--- /dev/null
+++ b/tests/alerts/test_general_alerts.yaml
@@ -0,0 +1,196 @@
+rule_files:
+  - ../../src/prometheus_alert_rules/general_rules.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - name: MySQLDown fires immediately when mysql_up=0
+    interval: 1m
+    input_series:
+      - series: 'mysql_up{instance="db1"}'
+        values: '1 1 1 1 0 0 0 0 1 1 1'  # up until 3m, down from 4m to 7m, up again from 8m
+    alert_rule_test:
+      - alertname: MySQLDown
+        eval_time: 2m  # still up, nothing may fire
+        exp_alerts: []
+      - alertname: MySQLDown
+        eval_time: 4m  # first sample with mysql_up=0; the rule has `for: 0m`, so there is no pending phase
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: critical
+            exp_annotations:
+              summary: MySQL instance db1 is down.
+              description: |
+                The MySQL instance is not reachable.
+                Please check if the MySQL process is running and the network connectivity.
+                LABELS = map[__name__:mysql_up instance:db1].
+
+  - name: MySQLMetricsScrapeError fires at 4m when exporter scrape error > 0
+    interval: 1m
+    input_series:
+      - series: 'mysql_exporter_last_scrape_error{instance="db1"}'
+        values: '0 0 1 2 2 2 2 2 2'  # NOTE(review): mysqld_exporter describes this metric as a 0/1 gauge; confirm a value of 2 is realistic input
+    alert_rule_test:
+      - alertname: MySQLMetricsScrapeError
+        eval_time: 4m  # the series has been non-zero since the 2m sample
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: warning
+            exp_annotations:
+              summary: MySQL instance db1 has a metrics scrape error.
+              description: |
+                The MySQL Exporter encountered an error while scraping metrics.
+                Check the MySQL Exporter logs for more details.
+                LABELS = map[instance:db1].
+
+  - name: MySQLTooManyConnections fires after 2m when >90% of max_connections
+    interval: 1m
+    input_series:
+      - series: 'mysql_global_variables_max_connections{instance="db1"}'
+        values: '100 100 100 100 100 100 100 100 100'
+      - series: 'mysql_global_status_threads_connected{instance="db1"}'
+        values: '50 60 70 80 95 95 95 70 60'  # above 90% of the limit from 4m to 6m only
+    alert_rule_test:
+      - alertname: "MySQLTooManyConnections(>90%)"
+        eval_time: 4m  # threshold crossed for the first time: pending, not firing
+        exp_alerts: []
+      - alertname: "MySQLTooManyConnections(>90%)"
+        eval_time: 6m  # 2m after the first breach, which satisfies the rule's `for: 2m`
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: warning
+            exp_annotations:
+              summary: MySQL instance db1 is using > 90% of `max_connections`.
+              description: |
+                Consider checking the client application responsible for generating those additional connections.
+                LABELS = map[instance:db1].
+
+  - name: MySQLHighThreadsRunning fires after 2m when >80% of max_connections
+    interval: 1m
+    input_series:
+      - series: 'mysql_global_variables_max_connections{instance="db1"}'
+        values: '100 100 100 100 100 100 100 100 100'
+      - series: 'mysql_global_status_threads_running{instance="db1"}'
+        values: '20 30 40 60 85 85 85 40 30'  # above 80% of the limit from 4m to 6m only
+    alert_rule_test:
+      - alertname: MySQLHighThreadsRunning
+        eval_time: 4m  # threshold crossed for the first time: pending, not firing
+        exp_alerts: []
+      - alertname: MySQLHighThreadsRunning
+        eval_time: 6m  # 2m after the first breach, which satisfies the rule's `for: 2m`
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: warning
+            exp_annotations:
+              summary: MySQL instance db1 is actively using > 80% of `max_connections`.
+              description: |
+                Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server.
+                LABELS = map[instance:db1].
+
+  - name: MySQLHighPreparedStatementsUtilization fires after 2m when >80%
+    interval: 1m
+    input_series:
+      - series: 'mysql_global_variables_max_prepared_stmt_count{instance="db1"}'
+        values: '100 100 100 100 100 100 100 100 100'
+      - series: 'mysql_global_status_prepared_stmt_count{instance="db1"}'
+        values: '50 60 70 75 81 81 81 70 60'  # above 80% of the limit from 4m to 6m only
+    alert_rule_test:
+      - alertname: "MySQLHighPreparedStatementsUtilization(>80%)"
+        eval_time: 4m  # threshold crossed for the first time: pending, not firing
+        exp_alerts: []
+      - alertname: "MySQLHighPreparedStatementsUtilization(>80%)"
+        eval_time: 6m  # 2m after the first breach, which satisfies the rule's `for: 2m`
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: warning
+            exp_annotations:
+              summary: MySQL instance db1 is using > 80% of `max_prepared_stmt_count`.
+              description: |
+                Too many prepared statements might consume a lot of memory.
+                LABELS = map[instance:db1].
+
+  - name: MySQLSlowQueries fires at 4m when slow_queries increases
+    interval: 1m  # NOTE(review): same length as the rule's [1m] range; confirm the promtool version used in CI still sees two samples per window
+    input_series:
+      - series: 'mysql_global_status_slow_queries{instance="db1"}'
+        values: '10 10 11 12 13'  # flat for the first two samples, then +1 per minute
+    alert_rule_test:
+      - alertname: MySQLSlowQueries
+        eval_time: 2m  # counter has only just started to grow; the rule's `for: 2m` has not elapsed
+        exp_alerts: []
+      - alertname: MySQLSlowQueries
+        eval_time: 4m
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: info
+            exp_annotations:
+              summary: MySQL instance db1 has slow queries.
+              description: |
+                Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes.
+                LABELS = map[instance:db1].
+
+  - name: MySQLInnoDBLogWaits fires at 16m when log waits keep growing
+    interval: 1m
+    input_series:
+      - series: 'mysql_global_status_innodb_log_waits{instance="db1"}'
+        values: '0 700 1400 2100 2800 3500 4200 4900 5600 6300 7000 7700 8400 9100 9800 10500 11200 11900 12600'  # +700 per minute, about 11.7/s, above the rule's 10/s threshold
+    alert_rule_test:
+      - alertname: MySQLInnoDBLogWaits
+        eval_time: 16m  # the rule's 15m rate window is fully covered by samples here
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: warning
+            exp_annotations:
+              summary: MySQL instance db1 has long InnoDB log waits.
+              description: |
+                MySQL InnoDB log writes might be stalling.
+                Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema.
+                LABELS = map[instance:db1].
+
+  - name: MySQLRestarted fires when uptime < 1m and clears by 5m
+    interval: 1m
+    input_series:
+      - series: 'mysql_global_status_uptime{instance="db1"}'
+        values: '10 20 30 40 120 180 240 300 360'  # below 60 for the first four samples, 120 or more afterwards
+    alert_rule_test:
+      - alertname: MySQLRestarted
+        eval_time: 2m  # uptime sample is 30, under the rule's 60 threshold
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: info
+            exp_annotations:
+              summary: MySQL instance db1 restarted.
+              description: |
+                MySQL restarted less than one minute ago.
+                If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`).
+                LABELS = map[__name__:mysql_global_status_uptime instance:db1].
+      - alertname: MySQLRestarted
+        eval_time: 5m  # uptime sample is 180, so the alert must have resolved
+        exp_alerts: []
+
+  - name: MySQLConnectionErrors fires at 5m when connection errors increase by more than 10
+    interval: 1m
+    input_series:
+      - series: 'mysql_global_status_connection_errors_total{instance="db1"}'
+        values: '0 0 0 0 0 12 12 12 12'  # counter jumps by 12 at the 5m sample
+    alert_rule_test:
+      - alertname: MySQLConnectionErrors
+        eval_time: 5m  # the rule has `for: 0m`, so it fires on the evaluation that sees the jump
+        exp_alerts:
+          - exp_labels:
+              instance: db1
+              severity: warning
+            exp_annotations:
+              summary: MySQL instance db1 has connection errors.
+              description: |
+                Connection errors might indicate network issues, authentication problems, or resource limitations.
+                Check the MySQL logs for more details.
+                LABELS = map[instance:db1].
diff --git a/tests/alerts/test_replication_alerts.yaml b/tests/alerts/test_replication_alerts.yaml
new file mode 100644
index 0000000000..82f5970177
--- /dev/null
+++ b/tests/alerts/test_replication_alerts.yaml
@@ -0,0 +1,161 @@
+rule_files:
+  - ../../src/prometheus_alert_rules/replication_rules.yaml
+
+evaluation_interval: 1m
+
+tests:
+  - name: MySQLClusterUnitOffline fires after 5m of OFFLINE=1
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="OFFLINE",member_role="SECONDARY"}'
+        values: '1 1 1 1 1 1'
+    alert_rule_test:
+      - alertname: MySQLClusterUnitOffline
+        eval_time: 5m
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLClusterUnitOffline
+              severity: warning
+              instance: db1
+              member_state: OFFLINE
+              member_role: SECONDARY
+            exp_annotations:
+              summary: MySQL cluster member db1 is offline
+              description: |
+                The MySQL member is marked offline in the cluster, although the process might still be running.
+                If this is unexptected, please check the logs.
+                LABELS = map[__name__:mysql_perf_schema_replication_group_member_info instance:db1 member_role:SECONDARY member_state:OFFLINE].
+
+  - name: MySQLClusterUnitOffline does not fire when OFFLINE=0
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="OFFLINE",member_role="SECONDARY"}'
+        values: '0 0 0 0 0 0'
+    alert_rule_test:
+      - alertname: MySQLClusterUnitOffline
+        eval_time: 6m
+        exp_alerts: []
+
+  - name: MySQLClusterNoPrimary fires immediately when there is no PRIMARY
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="SECONDARY"}'
+        values: '1 1'
+    alert_rule_test:
+      - alertname: MySQLClusterNoPrimary
+        eval_time: 0m
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLClusterNoPrimary
+              severity: critical
+              member_role: PRIMARY
+            exp_annotations:
+              summary: MySQL cluster reports no primariy
+              description: |
+                MySQL has no primaries. The cluster will likely be in a Read-Only state.
+                Please check the cluster health, the logs and investigate.
+                LABELS = map[member_role:PRIMARY].
+
+  - name: MySQLClusterTooManyPrimaries fires after 15m when PRIMARY count > 1
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="PRIMARY"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="ONLINE",member_role="PRIMARY"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    alert_rule_test:
+      - alertname: MySQLClusterTooManyPrimaries
+        eval_time: 15m
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLClusterTooManyPrimaries
+              severity: critical
+            exp_annotations:
+              summary: MySQL cluster reports more than one primary.
+              description: |
+                MySQL reports more than one primary. This is can indicate a split-brain situation.
+                Please refer to the troubleshooting docs.
+                LABELS = map[].
+
+  - name: MySQLNoReplication fires after 15m if there are no SECONDARY
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="PRIMARY"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    alert_rule_test:
+      - alertname: MySQLNoReplication
+        eval_time: 15m
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLNoReplication
+              severity: warning
+              member_role: SECONDARY
+            exp_annotations:
+              summary: MySQL cluster has no secondaries.
+              description: |
+                The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime.
+                Please check the cluster health, the logs and investigate.
+                LABELS = map[member_role:SECONDARY].
+
+  - name: MySQLGroupReplicationReduced fires when ONLINE count drops below prior 6h max
+    interval: 5m
+    input_series:
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="SECONDARY"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="ONLINE",member_role="SECONDARY"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0'
+      - series: 'mysql_perf_schema_replication_group_member_info{instance="db3",member_state="ONLINE",member_role="PRIMARY"}'
+        values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1'
+    alert_rule_test:
+      - alertname: MySQLGroupReplicationReduced
+        eval_time: 5h
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLGroupReplicationReduced
+              severity: warning
+            exp_annotations:
+              summary: MySQL cluster's Group Replication size reduced
+              description: |
+                The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours.
+                Please check the cluster health, the logs and investigate.
+                LABELS = map[].
+
+  - name: MySQLGroupReplicationConflicts fires when rate>0 for 5m
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_conflicts_detected{instance="db1"}'
+        values: '0 1 2 3 4 5 6 7 8 9 10'  # grows by 1 every minute, so the rate is never zero
+    alert_rule_test:
+      - alertname: MySQLGroupReplicationConflicts
+        eval_time: 10m  # last sample of the series, well past the rule's `for: 5m`
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLGroupReplicationConflicts
+              severity: warning
+              instance: db1
+            exp_annotations:
+              summary: MySQL cluster reports Group Replication conflicts
+              description: |
+                Conflicts indicate concurrent writes to the same rows/keys across members.
+                Please check the cluster health, the logs and investigate.
+                LABELS = map[instance:db1].
+
+  - name: MySQLGroupReplicationQueueSizeHigh fires when queue size >100 for 5m
+    interval: 1m
+    input_series:
+      - series: 'mysql_perf_schema_transactions_in_queue{instance="db2"}'
+        values: '0 0 0 120 120 120 120 120 120'  # above the 100 threshold from the 3m sample onwards
+    alert_rule_test:
+      - alertname: MySQLGroupReplicationQueueSizeHigh
+        eval_time: 8m  # 3m (first breach) plus the rule's `for: 5m`
+        exp_alerts:
+          - exp_labels:
+              alertname: MySQLGroupReplicationQueueSizeHigh
+              severity: warning
+              instance: db2
+            exp_annotations:
+              summary: MySQL cluster reports high Group Replication queue size
+              description: |
+                A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes.
+                Please check the cluster health, the logs and investigate.
+                LABELS = map[__name__:mysql_perf_schema_transactions_in_queue instance:db2].

From bbfd74de2512d9cfe0df2cbfa769ae8881879a7c Mon Sep 17 00:00:00 2001
From: Deezzir <yurii.kondrakov@canonical.com>
Date: Wed, 27 Aug 2025 16:40:20 -0400
Subject: [PATCH 2/3] fix: update checkout action version

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f91491322b..4ea9afbbc6 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -47,7 +47,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Install prometheus snap
         run: sudo snap install prometheus
       - name: Check validity of prometheus alert rules

From 56718d0b4fcb4db7fc16d11545a4020d34fb9074 Mon Sep 17 00:00:00 2001
From: Deezzir <yurii.kondrakov@canonical.com>
Date: Fri, 29 Aug 2025 17:20:09 -0400
Subject: [PATCH 3/3] fix: spelling & rules

---
 .github/workflows/ci.yaml                         |  1 +
 src/prometheus_alert_rules/replication_rules.yaml | 10 +++++-----
 tests/alerts/test_replication_alerts.yaml         |  6 +++---
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4ea9afbbc6..9f5b9dfea7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -45,6 +45,7 @@ jobs:
   alert-test:
     name: Test Prometheus Alert Rules
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     steps:
       - name: Checkout repo
         uses: actions/checkout@v4
diff --git a/src/prometheus_alert_rules/replication_rules.yaml b/src/prometheus_alert_rules/replication_rules.yaml
index 694dcd984e..e2e9b0d65e 100644
--- a/src/prometheus_alert_rules/replication_rules.yaml
+++ b/src/prometheus_alert_rules/replication_rules.yaml
@@ -10,16 +10,16 @@ groups:
           summary: MySQL cluster member {{ $labels.instance }} is offline
           description: |
             The MySQL member is marked offline in the cluster, although the process might still be running.
-            If this is unexptected, please check the logs.
+            If this is unexpected, please check the logs.
             LABELS = {{ $labels }}.
 
       - alert: MySQLClusterNoPrimary
-        expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"})
+        expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY",member_state="ONLINE"})
         for: 0m
         labels:
           severity: critical
         annotations:
-          summary: MySQL cluster reports no primariy
+          summary: MySQL cluster reports no primary
           description: |
             MySQL has no primaries. The cluster will likely be in a Read-Only state.
             Please check the cluster health, the logs and investigate.
@@ -33,7 +33,7 @@ groups:
         annotations:
           summary: MySQL cluster reports more than one primary.
           description: |
-            MySQL reports more than one primary. This is can indicate a split-brain situation.
+            MySQL reports more than one primary. This can indicate a split-brain situation.
             Please refer to the troubleshooting docs.
             LABELS = {{ $labels }}.
 
@@ -56,7 +56,7 @@ groups:
           max_over_time(
             count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)[6h:]
           )
-        for: 15m
+        for: 5m
         labels:
           severity: warning
         annotations:
diff --git a/tests/alerts/test_replication_alerts.yaml b/tests/alerts/test_replication_alerts.yaml
index 82f5970177..0e1c96968a 100644
--- a/tests/alerts/test_replication_alerts.yaml
+++ b/tests/alerts/test_replication_alerts.yaml
@@ -23,7 +23,7 @@ tests:
               summary: MySQL cluster member db1 is offline
               description: |
                 The MySQL member is marked offline in the cluster, although the process might still be running.
-                If this is unexptected, please check the logs.
+                If this is unexpected, please check the logs.
                 LABELS = map[__name__:mysql_perf_schema_replication_group_member_info instance:db1 member_role:SECONDARY member_state:OFFLINE].
 
   - name: MySQLClusterUnitOffline does not fire when OFFLINE=0
@@ -49,12 +49,14 @@ tests:
               alertname: MySQLClusterNoPrimary
               severity: critical
               member_role: PRIMARY
+              # absent() copies every equality matcher of its selector into the alert labels
+              member_state: ONLINE
             exp_annotations:
-              summary: MySQL cluster reports no primariy
+              summary: MySQL cluster reports no primary
               description: |
                 MySQL has no primaries. The cluster will likely be in a Read-Only state.
                 Please check the cluster health, the logs and investigate.
-                LABELS = map[member_role:PRIMARY].
+                LABELS = map[member_role:PRIMARY member_state:ONLINE].
 
   - name: MySQLClusterTooManyPrimaries fires after 15m when PRIMARY count > 1
     interval: 1m
@@ -73,7 +75,7 @@ tests:
             exp_annotations:
               summary: MySQL cluster reports more than one primary.
               description: |
-                MySQL reports more than one primary. This is can indicate a split-brain situation.
+                MySQL reports more than one primary. This can indicate a split-brain situation.
                 Please refer to the troubleshooting docs.
                 LABELS = map[].