From 0f8417378d7506f48fde5d4fe05502b9e997a4ca Mon Sep 17 00:00:00 2001 From: Deezzir Date: Tue, 26 Aug 2025 22:04:40 -0400 Subject: [PATCH 1/3] feat: gp alerts --- .github/workflows/ci.yaml | 13 ++ ...cs_alert_rules.yaml => general_rules.yaml} | 67 +++--- .../replication_rules.yaml | 91 ++++++++ tests/alerts/test_general_alerts.yaml | 196 ++++++++++++++++++ tests/alerts/test_replication_alerts.yaml | 161 ++++++++++++++ 5 files changed, 502 insertions(+), 26 deletions(-) rename src/prometheus_alert_rules/{metrics_alert_rules.yaml => general_rules.yaml} (63%) create mode 100644 src/prometheus_alert_rules/replication_rules.yaml create mode 100644 tests/alerts/test_general_alerts.yaml create mode 100644 tests/alerts/test_replication_alerts.yaml diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e9b685a6ce..f91491322b 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,6 +42,19 @@ jobs: - name: Upload Coverage to Codecov uses: codecov/codecov-action@v5 + alert-test: + name: Test Prometheus Alert Rules + runs-on: ubuntu-latest + steps: + - name: Checkout repo + uses: actions/checkout@v3 + - name: Install prometheus snap + run: sudo snap install prometheus + - name: Check validity of prometheus alert rules + run: promtool check rules src/prometheus_alert_rules/*.yaml + - name: Run unit tests for prometheus alert rules + run: promtool test rules tests/alerts/*.yaml + build: name: Build charm uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v32.2.1 diff --git a/src/prometheus_alert_rules/metrics_alert_rules.yaml b/src/prometheus_alert_rules/general_rules.yaml similarity index 63% rename from src/prometheus_alert_rules/metrics_alert_rules.yaml rename to src/prometheus_alert_rules/general_rules.yaml index ff0a604c15..d07a273732 100644 --- a/src/prometheus_alert_rules/metrics_alert_rules.yaml +++ b/src/prometheus_alert_rules/general_rules.yaml @@ -1,20 +1,30 @@ groups: - - name: MySQLExporterK8s - + - 
name: MySQL General Alert Rules rules: - # 2.1.1 - alert: MySQLDown - expr: "mysql_up == 0" + expr: mysql_up == 0 for: 0m labels: severity: critical annotations: - summary: MySQL instance {{ $labels.instance }} is down. + summary: MySQL instance {{ $labels.instance }} is down. description: | + The MySQL instance is not reachable. + Please check if the MySQL process is running and the network connectivity. + LABELS = {{ $labels }}. + + - alert: MySQLMetricsScrapeError + expr: increase(mysql_exporter_last_scrape_error[5m]) > 1 + for: 0m + labels: + severity: warning + annotations: + summary: MySQL instance {{ $labels.instance }} has a metrics scrape error. + description: | + The MySQL Exporter encountered an error while scraping metrics. + Check the MySQL Exporter logs for more details. LABELS = {{ $labels }}. - # 2.1.2 - # customized: 80% -> 90% - alert: MySQLTooManyConnections(>90%) expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 90 for: 2m @@ -24,10 +34,8 @@ groups: summary: MySQL instance {{ $labels.instance }} is using > 90% of `max_connections`. description: | Consider checking the client application responsible for generating those additional connections. - LABELS = {{ $labels }}. + LABELS = {{ $labels }}. - # 2.1.4 - # customized: 60% -> 80% - alert: MySQLHighThreadsRunning expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 80 for: 2m @@ -36,10 +44,9 @@ groups: annotations: summary: MySQL instance {{ $labels.instance }} is actively using > 80% of `max_connections`. description: | - Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. - LABELS = {{ $labels }}. + Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. + LABELS = {{ $labels }}. 
- # 2.1.3 - alert: MySQLHighPreparedStatementsUtilization(>80%) expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 for: 2m @@ -48,36 +55,32 @@ groups: annotations: summary: MySQL instance {{ $labels.instance }} is using > 80% of `max_prepared_stmt_count`. description: | - Too many prepared statements might consume a lot of memory. - LABELS = {{ $labels }}. + Too many prepared statements might consume a lot of memory. + LABELS = {{ $labels }}. - # 2.1.8 - # customized: warning -> info - alert: MySQLSlowQueries expr: increase(mysql_global_status_slow_queries[1m]) > 0 for: 2m labels: severity: info annotations: - summary: MySQL instance {{ $labels.instance }} has a slow query. + summary: MySQL instance {{ $labels.instance }} has slow queries. description: | - Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. + Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. LABELS = {{ $labels }}. - # 2.1.9 - alert: MySQLInnoDBLogWaits expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 for: 0m labels: severity: warning annotations: - summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits. + summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits. description: | - MySQL InnoDB log writes might be stalling. - Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. + MySQL InnoDB log writes might be stalling. + Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. LABELS = {{ $labels }}. - # 2.1.10 - alert: MySQLRestarted expr: mysql_global_status_uptime < 60 for: 0m @@ -86,6 +89,18 @@ groups: annotations: summary: MySQL instance {{ $labels.instance }} restarted. 
description: | - MySQL restarted less than one minute ago. - If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). + MySQL restarted less than one minute ago. + If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). + LABELS = {{ $labels }}. + + - alert: MySQLConnectionErrors + expr: increase(mysql_global_status_connection_errors_total[5m]) > 10 + for: 0m + labels: + severity: warning + annotations: + summary: MySQL instance {{ $labels.instance }} has connection errors. + description: | + Connection errors might indicate network issues, authentication problems, or resource limitations. + Check the MySQL logs for more details. LABELS = {{ $labels }}. diff --git a/src/prometheus_alert_rules/replication_rules.yaml b/src/prometheus_alert_rules/replication_rules.yaml new file mode 100644 index 0000000000..694dcd984e --- /dev/null +++ b/src/prometheus_alert_rules/replication_rules.yaml @@ -0,0 +1,91 @@ +groups: + - name: MySQL Replication Alert Rules + rules: + - alert: MySQLClusterUnitOffline + expr: mysql_perf_schema_replication_group_member_info{member_state="OFFLINE"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster member {{ $labels.instance }} is offline + description: | + The MySQL member is marked offline in the cluster, although the process might still be running. + If this is unexptected, please check the logs. + LABELS = {{ $labels }}. + + - alert: MySQLClusterNoPrimary + expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"}) + for: 0m + labels: + severity: critical + annotations: + summary: MySQL cluster reports no primariy + description: | + MySQL has no primaries. The cluster will likely be in a Read-Only state. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. 
+ + - alert: MySQLClusterTooManyPrimaries + expr: count(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"}) > 1 + for: 15m + labels: + severity: critical + annotations: + summary: MySQL cluster reports more than one primary. + description: | + MySQL reports more than one primary. This is can indicate a split-brain situation. + Please refer to the troubleshooting docs. + LABELS = {{ $labels }}. + + - alert: MySQLNoReplication + expr: absent(mysql_perf_schema_replication_group_member_info{member_role="SECONDARY"}) + for: 15m + labels: + severity: warning + annotations: + summary: MySQL cluster has no secondaries. + description: | + The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. + + - alert: MySQLGroupReplicationReduced + expr: | + count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1) + < + max_over_time( + count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)[6h:] + ) + for: 15m + labels: + severity: warning + annotations: + summary: MySQL cluster's Group Replication size reduced + description: | + The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. + + - alert: MySQLGroupReplicationConflicts + expr: rate(mysql_perf_schema_conflicts_detected[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster reports Group Replication conflicts + description: | + Conflicts indicate concurrent writes to the same rows/keys across members. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. 
+ + - alert: MySQLGroupReplicationQueueSizeHigh + expr: mysql_perf_schema_transactions_in_queue > 100 + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster reports high Group Replication queue size + description: | + A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. diff --git a/tests/alerts/test_general_alerts.yaml b/tests/alerts/test_general_alerts.yaml new file mode 100644 index 0000000000..d3a4aaa4f3 --- /dev/null +++ b/tests/alerts/test_general_alerts.yaml @@ -0,0 +1,196 @@ +rule_files: + - ../../src/prometheus_alert_rules/general_rules.yaml + +evaluation_interval: 1m + +tests: + - name: MySQLDown fires after 5m of mysql_up=0 + interval: 1m + input_series: + - series: 'mysql_up{instance="db1"}' + values: '1 1 1 1 0 0 0 0 1 1 1' + alert_rule_test: + - alertname: MySQLDown + eval_time: 2m + exp_alerts: [] + - alertname: MySQLDown + eval_time: 5m + exp_alerts: + - exp_labels: + instance: db1 + severity: critical + exp_annotations: + summary: MySQL instance db1 is down. + description: | + The MySQL instance is not reachable. + Please check if the MySQL process is running and the network connectivity. + LABELS = map[__name__:mysql_up instance:db1]. + + - name: MySQLMetricsScrapeError fires at 4m when exporter scrape error > 0 + interval: 1m + input_series: + - series: 'mysql_exporter_last_scrape_error{instance="db1"}' + values: '0 0 1 2 2 2 2 2 2' + alert_rule_test: + - alertname: MySQLMetricsScrapeError + eval_time: 4m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 has a metrics scrape error. + description: | + The MySQL Exporter encountered an error while scraping metrics. + Check the MySQL Exporter logs for more details. + LABELS = map[instance:db1]. 
+ + - name: MySQLTooManyConnections fires after 5m when >90% of max_connections + interval: 1m + input_series: + - series: 'mysql_global_variables_max_connections{instance="db1"}' + values: '100 100 100 100 100 100 100 100 100' + - series: 'mysql_global_status_threads_connected{instance="db1"}' + values: '50 60 70 80 95 95 95 70 60' + alert_rule_test: + - alertname: "MySQLTooManyConnections(>90%)" + eval_time: 4m + exp_alerts: [] + - alertname: "MySQLTooManyConnections(>90%)" + eval_time: 6m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 is using > 90% of `max_connections`. + description: | + Consider checking the client application responsible for generating those additional connections. + LABELS = map[instance:db1]. + + - name: MySQLHighThreadsRunning fires after 5m when >80% of max_connections + interval: 1m + input_series: + - series: 'mysql_global_variables_max_connections{instance="db1"}' + values: '100 100 100 100 100 100 100 100 100' + - series: 'mysql_global_status_threads_running{instance="db1"}' + values: '20 30 40 60 85 85 85 40 30' + alert_rule_test: + - alertname: MySQLHighThreadsRunning + eval_time: 4m + exp_alerts: [] + - alertname: MySQLHighThreadsRunning + eval_time: 6m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 is actively using > 80% of `max_connections`. + description: | + Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. + LABELS = map[instance:db1]. 
+ + - name: MySQLHighPreparedStatementsUtilization fires after 5m when >80% + interval: 1m + input_series: + - series: 'mysql_global_variables_max_prepared_stmt_count{instance="db1"}' + values: '100 100 100 100 100 100 100 100 100' + - series: 'mysql_global_status_prepared_stmt_count{instance="db1"}' + values: '50 60 70 75 81 81 81 70 60' + alert_rule_test: + - alertname: "MySQLHighPreparedStatementsUtilization(>80%)" + eval_time: 4m + exp_alerts: [] + - alertname: "MySQLHighPreparedStatementsUtilization(>80%)" + eval_time: 6m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 is using > 80% of `max_prepared_stmt_count`. + description: | + Too many prepared statements might consume a lot of memory. + LABELS = map[instance:db1]. + + - name: MySQLSlowQueries fires at 4m when slow_queries increases + interval: 1m + input_series: + - series: 'mysql_global_status_slow_queries{instance="db1"}' + values: '10 10 11 12 13' + alert_rule_test: + - alertname: MySQLSlowQueries + eval_time: 2m + exp_alerts: [] + - alertname: MySQLSlowQueries + eval_time: 4m + exp_alerts: + - exp_labels: + instance: db1 + severity: info + exp_annotations: + summary: MySQL instance db1 has slow queries. + description: | + Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. + LABELS = map[instance:db1]. + + - name: MySQLInnoDBLogWaits fires at 16m when log waits keep growing + interval: 1m + input_series: + - series: 'mysql_global_status_innodb_log_waits{instance="db1"}' + values: '0 700 1400 2100 2800 3500 4200 4900 5600 6300 7000 7700 8400 9100 9800 10500 11200 11900 12600' + alert_rule_test: + - alertname: MySQLInnoDBLogWaits + eval_time: 16m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 has long InnoDB log waits. + description: | + MySQL InnoDB log writes might be stalling. 
+ Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. + LABELS = map[instance:db1]. + + - name: MySQLRestarted fires when uptime < 1m and clears by 5m + interval: 1m + input_series: + - series: 'mysql_global_status_uptime{instance="db1"}' + values: '10 20 30 40 120 180 240 300 360' + alert_rule_test: + - alertname: MySQLRestarted + eval_time: 2m + exp_alerts: + - exp_labels: + instance: db1 + severity: info + exp_annotations: + summary: MySQL instance db1 restarted. + description: | + MySQL restarted less than one minute ago. + If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). + LABELS = map[__name__:mysql_global_status_uptime instance:db1]. + - alertname: MySQLRestarted + eval_time: 5m + exp_alerts: [] + + - name: MySQLConnectionErrors fire at 5m when connection_errors_total > 0 + interval: 1m + input_series: + - series: 'mysql_global_status_connection_errors_total{instance="db1"}' + values: '0 0 0 0 0 12 12 12 12' + alert_rule_test: + - alertname: MySQLConnectionErrors + eval_time: 5m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 has connection errors. + description: | + Connection errors might indicate network issues, authentication problems, or resource limitations. + Check the MySQL logs for more details. + LABELS = map[instance:db1]. 
diff --git a/tests/alerts/test_replication_alerts.yaml b/tests/alerts/test_replication_alerts.yaml new file mode 100644 index 0000000000..82f5970177 --- /dev/null +++ b/tests/alerts/test_replication_alerts.yaml @@ -0,0 +1,161 @@ +rule_files: + - ../../src/prometheus_alert_rules/replication_rules.yaml + +evaluation_interval: 1m + +tests: + - name: MySQLClusterUnitOffline fires after 5m of OFFLINE=1 + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="OFFLINE",member_role="SECONDARY"}' + values: '1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLClusterUnitOffline + eval_time: 5m + exp_alerts: + - exp_labels: + alertname: MySQLClusterUnitOffline + severity: warning + instance: db1 + member_state: OFFLINE + member_role: SECONDARY + exp_annotations: + summary: MySQL cluster member db1 is offline + description: | + The MySQL member is marked offline in the cluster, although the process might still be running. + If this is unexptected, please check the logs. + LABELS = map[__name__:mysql_perf_schema_replication_group_member_info instance:db1 member_role:SECONDARY member_state:OFFLINE]. 
+ + - name: MySQLClusterUnitOffline does not fire when OFFLINE=0 + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="OFFLINE",member_role="SECONDARY"}' + values: '0 0 0 0 0 0' + alert_rule_test: + - alertname: MySQLClusterUnitOffline + eval_time: 6m + exp_alerts: [] + + - name: MySQLClusterNoPrimary fires immediately when there is no PRIMARY + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="SECONDARY"}' + values: '1 1' + alert_rule_test: + - alertname: MySQLClusterNoPrimary + eval_time: 0m + exp_alerts: + - exp_labels: + alertname: MySQLClusterNoPrimary + severity: critical + member_role: PRIMARY + exp_annotations: + summary: MySQL cluster reports no primariy + description: | + MySQL has no primaries. The cluster will likely be in a Read-Only state. + Please check the cluster health, the logs and investigate. + LABELS = map[member_role:PRIMARY]. + + - name: MySQLClusterTooManyPrimaries fires after 15m when PRIMARY count > 1 + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLClusterTooManyPrimaries + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: MySQLClusterTooManyPrimaries + severity: critical + exp_annotations: + summary: MySQL cluster reports more than one primary. + description: | + MySQL reports more than one primary. This is can indicate a split-brain situation. + Please refer to the troubleshooting docs. + LABELS = map[]. 
+ + - name: MySQLNoReplication fires after 15m if there are no SECONDARY + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLNoReplication + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: MySQLNoReplication + severity: warning + member_role: SECONDARY + exp_annotations: + summary: MySQL cluster has no secondaries. + description: | + The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime. + Please check the cluster health, the logs and investigate. + LABELS = map[member_role:SECONDARY]. + + - name: MySQLGroupReplicationReduced fires when ONLINE count drops below prior 6h max + interval: 5m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="SECONDARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="ONLINE",member_role="SECONDARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'mysql_perf_schema_replication_group_member_info{instance="db3",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLGroupReplicationReduced + eval_time: 5h + exp_alerts: + - exp_labels: + alertname: MySQLGroupReplicationReduced + severity: warning + exp_annotations: + summary: MySQL cluster's Group Replication size reduced + description: | + 
The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours. + Please check the cluster health, the logs and investigate. + LABELS = map[]. + + - name: MySQLGroupReplicationConflicts fires when rate>0 for 5m + interval: 1m + input_series: + - series: 'mysql_perf_schema_conflicts_detected{instance="db1"}' + values: '0 1 2 3 4 5 6 7 8 9 10' + alert_rule_test: + - alertname: MySQLGroupReplicationConflicts + eval_time: 10m + exp_alerts: + - exp_labels: + alertname: MySQLGroupReplicationConflicts + severity: warning + instance: db1 + exp_annotations: + summary: MySQL cluster reports Group Replication conflicts + description: | + Conflicts indicate concurrent writes to the same rows/keys across members. + Please check the cluster health, the logs and investigate. + LABELS = map[instance:db1]. + + - name: MySQLGroupReplicationQueueSizeHigh fires when queue size >100 for 5m + interval: 1m + input_series: + - series: 'mysql_perf_schema_transactions_in_queue{instance="db2"}' + values: '0 0 0 120 120 120 120 120 120' + alert_rule_test: + - alertname: MySQLGroupReplicationQueueSizeHigh + eval_time: 8m + exp_alerts: + - exp_labels: + alertname: MySQLGroupReplicationQueueSizeHigh + severity: warning + instance: db2 + exp_annotations: + summary: MySQL cluster reports high Group Replication queue size + description: | + A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes. + Please check the cluster health, the logs and investigate. + LABELS = map[__name__:mysql_perf_schema_transactions_in_queue instance:db2]. 
From bbfd74de2512d9cfe0df2cbfa769ae8881879a7c Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Wed, 27 Aug 2025 16:40:20 -0400
Subject: [PATCH 2/3] fix: update checkout action version

---
 .github/workflows/ci.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index f91491322b..4ea9afbbc6 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -47,7 +47,7 @@ jobs:
     runs-on: ubuntu-latest
     steps:
       - name: Checkout repo
-        uses: actions/checkout@v3
+        uses: actions/checkout@v4
       - name: Install prometheus snap
         run: sudo snap install prometheus
       - name: Check validity of prometheus alert rules
From 56718d0b4fcb4db7fc16d11545a4020d34fb9074 Mon Sep 17 00:00:00 2001
From: Deezzir
Date: Fri, 29 Aug 2025 17:20:09 -0400
Subject: [PATCH 3/3] fix: spelling & rules

Require the primary to be ONLINE in MySQLClusterNoPrimary, shorten the
pending period of MySQLGroupReplicationReduced to 5m, and fix typos in the
alert annotations.

absent() copies every equality matcher of its selector into the labels of
its result, so the MySQLClusterNoPrimary unit test now also expects
member_state=ONLINE in both the alert labels and the rendered LABELS text.
---
Reviewer note: the second hunk of tests/alerts/test_replication_alerts.yaml
was extended by hand to cover the label expectations; the post-image blob id
on that file's "index" line predates the extension.

 .github/workflows/ci.yaml                         |  1 +
 src/prometheus_alert_rules/replication_rules.yaml | 10 +++++-----
 tests/alerts/test_replication_alerts.yaml         |  9 +++++----
 3 files changed, 11 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 4ea9afbbc6..9f5b9dfea7 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -45,6 +45,7 @@ jobs:
   alert-test:
     name: Test Prometheus Alert Rules
     runs-on: ubuntu-latest
+    timeout-minutes: 5
     steps:
       - name: Checkout repo
         uses: actions/checkout@v4
diff --git a/src/prometheus_alert_rules/replication_rules.yaml b/src/prometheus_alert_rules/replication_rules.yaml
index 694dcd984e..e2e9b0d65e 100644
--- a/src/prometheus_alert_rules/replication_rules.yaml
+++ b/src/prometheus_alert_rules/replication_rules.yaml
@@ -10,16 +10,16 @@ groups:
           summary: MySQL cluster member {{ $labels.instance }} is offline
           description: |
             The MySQL member is marked offline in the cluster, although the process might still be running.
-            If this is unexptected, please check the logs.
+            If this is unexpected, please check the logs.
             LABELS = {{ $labels }}.
 
       - alert: MySQLClusterNoPrimary
-        expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"})
+        expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY",member_state="ONLINE"})
         for: 0m
         labels:
           severity: critical
         annotations:
-          summary: MySQL cluster reports no primariy
+          summary: MySQL cluster reports no primary
           description: |
             MySQL has no primaries. The cluster will likely be in a Read-Only state.
             Please check the cluster health, the logs and investigate.
@@ -33,7 +33,7 @@ groups:
         annotations:
           summary: MySQL cluster reports more than one primary.
           description: |
-            MySQL reports more than one primary. This is can indicate a split-brain situation.
+            MySQL reports more than one primary. This can indicate a split-brain situation.
             Please refer to the troubleshooting docs.
             LABELS = {{ $labels }}.
 
@@ -56,7 +56,7 @@ groups:
             max_over_time(
               count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)[6h:]
             )
-        for: 15m
+        for: 5m
         labels:
           severity: warning
         annotations:
diff --git a/tests/alerts/test_replication_alerts.yaml b/tests/alerts/test_replication_alerts.yaml
index 82f5970177..0e1c96968a 100644
--- a/tests/alerts/test_replication_alerts.yaml
+++ b/tests/alerts/test_replication_alerts.yaml
@@ -23,7 +23,7 @@ tests:
               summary: MySQL cluster member db1 is offline
               description: |
                 The MySQL member is marked offline in the cluster, although the process might still be running.
-                If this is unexptected, please check the logs.
+                If this is unexpected, please check the logs.
                 LABELS = map[__name__:mysql_perf_schema_replication_group_member_info instance:db1 member_role:SECONDARY member_state:OFFLINE].
 
   - name: MySQLClusterUnitOffline does not fire when OFFLINE=0
@@ -50,8 +50,9 @@ tests:
               severity: critical
               member_role: PRIMARY
+              member_state: ONLINE
             exp_annotations:
-              summary: MySQL cluster reports no primariy
+              summary: MySQL cluster reports no primary
               description: |
                 MySQL has no primaries. The cluster will likely be in a Read-Only state.
                 Please check the cluster health, the logs and investigate.
-                LABELS = map[member_role:PRIMARY].
+                LABELS = map[member_role:PRIMARY member_state:ONLINE].
@@ -73,7 +74,7 @@ tests:
             exp_annotations:
               summary: MySQL cluster reports more than one primary.
               description: |
-                MySQL reports more than one primary. This is can indicate a split-brain situation.
+                MySQL reports more than one primary. This can indicate a split-brain situation.
                 Please refer to the troubleshooting docs.
                 LABELS = map[].
 