diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index e9b685a6ce..9f5b9dfea7 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -42,6 +42,20 @@ jobs: - name: Upload Coverage to Codecov uses: codecov/codecov-action@v5 + alert-test: + name: Test Prometheus Alert Rules + runs-on: ubuntu-latest + timeout-minutes: 5 + steps: + - name: Checkout repo + uses: actions/checkout@v4 + - name: Install prometheus snap + run: sudo snap install prometheus + - name: Check validity of prometheus alert rules + run: promtool check rules src/prometheus_alert_rules/*.yaml + - name: Run unit tests for prometheus alert rules + run: promtool test rules tests/alerts/*.yaml + build: name: Build charm uses: canonical/data-platform-workflows/.github/workflows/build_charm.yaml@v32.2.1 diff --git a/src/prometheus_alert_rules/metrics_alert_rules.yaml b/src/prometheus_alert_rules/general_rules.yaml similarity index 63% rename from src/prometheus_alert_rules/metrics_alert_rules.yaml rename to src/prometheus_alert_rules/general_rules.yaml index ff0a604c15..d07a273732 100644 --- a/src/prometheus_alert_rules/metrics_alert_rules.yaml +++ b/src/prometheus_alert_rules/general_rules.yaml @@ -1,20 +1,30 @@ groups: - - name: MySQLExporterK8s - + - name: MySQL General Alert Rules rules: - # 2.1.1 - alert: MySQLDown - expr: "mysql_up == 0" + expr: mysql_up == 0 for: 0m labels: severity: critical annotations: - summary: MySQL instance {{ $labels.instance }} is down. + summary: MySQL instance {{ $labels.instance }} is down. description: | + The MySQL instance is not reachable. + Please check if the MySQL process is running and the network connectivity. + LABELS = {{ $labels }}. + + - alert: MySQLMetricsScrapeError + expr: increase(mysql_exporter_last_scrape_error[5m]) > 1 + for: 0m + labels: + severity: warning + annotations: + summary: MySQL instance {{ $labels.instance }} has a metrics scrape error. 
+ description: | + The MySQL Exporter encountered an error while scraping metrics. + Check the MySQL Exporter logs for more details. LABELS = {{ $labels }}. - # 2.1.2 - # customized: 80% -> 90% - alert: MySQLTooManyConnections(>90%) expr: max_over_time(mysql_global_status_threads_connected[1m]) / mysql_global_variables_max_connections * 100 > 90 for: 2m @@ -24,10 +34,8 @@ groups: summary: MySQL instance {{ $labels.instance }} is using > 90% of `max_connections`. description: | Consider checking the client application responsible for generating those additional connections. - LABELS = {{ $labels }}. + LABELS = {{ $labels }}. - # 2.1.4 - # customized: 60% -> 80% - alert: MySQLHighThreadsRunning expr: max_over_time(mysql_global_status_threads_running[1m]) / mysql_global_variables_max_connections * 100 > 80 for: 2m @@ -36,10 +44,9 @@ groups: annotations: summary: MySQL instance {{ $labels.instance }} is actively using > 80% of `max_connections`. description: | - Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. - LABELS = {{ $labels }}. + Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. + LABELS = {{ $labels }}. - # 2.1.3 - alert: MySQLHighPreparedStatementsUtilization(>80%) expr: max_over_time(mysql_global_status_prepared_stmt_count[1m]) / mysql_global_variables_max_prepared_stmt_count * 100 > 80 for: 2m @@ -48,36 +55,32 @@ groups: annotations: summary: MySQL instance {{ $labels.instance }} is using > 80% of `max_prepared_stmt_count`. description: | - Too many prepared statements might consume a lot of memory. - LABELS = {{ $labels }}. + Too many prepared statements might consume a lot of memory. + LABELS = {{ $labels }}. 
- # 2.1.8 - # customized: warning -> info - alert: MySQLSlowQueries expr: increase(mysql_global_status_slow_queries[1m]) > 0 for: 2m labels: severity: info annotations: - summary: MySQL instance {{ $labels.instance }} has a slow query. + summary: MySQL instance {{ $labels.instance }} has slow queries. description: | - Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. + Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. LABELS = {{ $labels }}. - # 2.1.9 - alert: MySQLInnoDBLogWaits expr: rate(mysql_global_status_innodb_log_waits[15m]) > 10 for: 0m labels: severity: warning annotations: - summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits. + summary: MySQL instance {{ $labels.instance }} has long InnoDB log waits. description: | - MySQL InnoDB log writes might be stalling. - Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. + MySQL InnoDB log writes might be stalling. + Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. LABELS = {{ $labels }}. - # 2.1.10 - alert: MySQLRestarted expr: mysql_global_status_uptime < 60 for: 0m @@ -86,6 +89,18 @@ groups: annotations: summary: MySQL instance {{ $labels.instance }} restarted. description: | - MySQL restarted less than one minute ago. - If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). + MySQL restarted less than one minute ago. + If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). + LABELS = {{ $labels }}. + + - alert: MySQLConnectionErrors + expr: increase(mysql_global_status_connection_errors_total[5m]) > 10 + for: 0m + labels: + severity: warning + annotations: + summary: MySQL instance {{ $labels.instance }} has connection errors. 
+ description: | + Connection errors might indicate network issues, authentication problems, or resource limitations. + Check the MySQL logs for more details. LABELS = {{ $labels }}. diff --git a/src/prometheus_alert_rules/replication_rules.yaml b/src/prometheus_alert_rules/replication_rules.yaml new file mode 100644 index 0000000000..e2e9b0d65e --- /dev/null +++ b/src/prometheus_alert_rules/replication_rules.yaml @@ -0,0 +1,91 @@ +groups: + - name: MySQL Replication Alert Rules + rules: + - alert: MySQLClusterUnitOffline + expr: mysql_perf_schema_replication_group_member_info{member_state="OFFLINE"} == 1 + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster member {{ $labels.instance }} is offline + description: | + The MySQL member is marked offline in the cluster, although the process might still be running. + If this is unexpected, please check the logs. + LABELS = {{ $labels }}. + + - alert: MySQLClusterNoPrimary + expr: absent(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY",member_state="ONLINE"}) + for: 0m + labels: + severity: critical + annotations: + summary: MySQL cluster reports no primary + description: | + MySQL has no primaries. The cluster will likely be in a Read-Only state. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. + + - alert: MySQLClusterTooManyPrimaries + expr: count(mysql_perf_schema_replication_group_member_info{member_role="PRIMARY"}) > 1 + for: 15m + labels: + severity: critical + annotations: + summary: MySQL cluster reports more than one primary. + description: | + MySQL reports more than one primary. This can indicate a split-brain situation. + Please refer to the troubleshooting docs. + LABELS = {{ $labels }}. + + - alert: MySQLNoReplication + expr: absent(mysql_perf_schema_replication_group_member_info{member_role="SECONDARY"}) + for: 15m + labels: + severity: warning + annotations: + summary: MySQL cluster has no secondaries. 
+ description: | + The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. + + - alert: MySQLGroupReplicationReduced + expr: | + count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1) + < + max_over_time( + count(mysql_perf_schema_replication_group_member_info{member_state="ONLINE"} == 1)[6h:] + ) + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster's Group Replication size reduced + description: | + The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. + + - alert: MySQLGroupReplicationConflicts + expr: rate(mysql_perf_schema_conflicts_detected[5m]) > 0 + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster reports Group Replication conflicts + description: | + Conflicts indicate concurrent writes to the same rows/keys across members. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. + + - alert: MySQLGroupReplicationQueueSizeHigh + expr: mysql_perf_schema_transactions_in_queue > 100 + for: 5m + labels: + severity: warning + annotations: + summary: MySQL cluster reports high Group Replication queue size + description: | + A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes. + Please check the cluster health, the logs and investigate. + LABELS = {{ $labels }}. 
diff --git a/tests/alerts/test_general_alerts.yaml b/tests/alerts/test_general_alerts.yaml new file mode 100644 index 0000000000..d3a4aaa4f3 --- /dev/null +++ b/tests/alerts/test_general_alerts.yaml @@ -0,0 +1,196 @@ +rule_files: + - ../../src/prometheus_alert_rules/general_rules.yaml + +evaluation_interval: 1m + +tests: + - name: MySQLDown fires after 5m of mysql_up=0 + interval: 1m + input_series: + - series: 'mysql_up{instance="db1"}' + values: '1 1 1 1 0 0 0 0 1 1 1' + alert_rule_test: + - alertname: MySQLDown + eval_time: 2m + exp_alerts: [] + - alertname: MySQLDown + eval_time: 5m + exp_alerts: + - exp_labels: + instance: db1 + severity: critical + exp_annotations: + summary: MySQL instance db1 is down. + description: | + The MySQL instance is not reachable. + Please check if the MySQL process is running and the network connectivity. + LABELS = map[__name__:mysql_up instance:db1]. + + - name: MySQLMetricsScrapeError fires at 4m when exporter scrape error > 0 + interval: 1m + input_series: + - series: 'mysql_exporter_last_scrape_error{instance="db1"}' + values: '0 0 1 2 2 2 2 2 2' + alert_rule_test: + - alertname: MySQLMetricsScrapeError + eval_time: 4m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 has a metrics scrape error. + description: | + The MySQL Exporter encountered an error while scraping metrics. + Check the MySQL Exporter logs for more details. + LABELS = map[instance:db1]. 
+ + - name: MySQLTooManyConnections fires after 5m when >90% of max_connections + interval: 1m + input_series: + - series: 'mysql_global_variables_max_connections{instance="db1"}' + values: '100 100 100 100 100 100 100 100 100' + - series: 'mysql_global_status_threads_connected{instance="db1"}' + values: '50 60 70 80 95 95 95 70 60' + alert_rule_test: + - alertname: "MySQLTooManyConnections(>90%)" + eval_time: 4m + exp_alerts: [] + - alertname: "MySQLTooManyConnections(>90%)" + eval_time: 6m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 is using > 90% of `max_connections`. + description: | + Consider checking the client application responsible for generating those additional connections. + LABELS = map[instance:db1]. + + - name: MySQLHighThreadsRunning fires after 5m when >80% of max_connections + interval: 1m + input_series: + - series: 'mysql_global_variables_max_connections{instance="db1"}' + values: '100 100 100 100 100 100 100 100 100' + - series: 'mysql_global_status_threads_running{instance="db1"}' + values: '20 30 40 60 85 85 85 40 30' + alert_rule_test: + - alertname: MySQLHighThreadsRunning + eval_time: 4m + exp_alerts: [] + - alertname: MySQLHighThreadsRunning + eval_time: 6m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 is actively using > 80% of `max_connections`. + description: | + Consider reviewing the value of the `max-connections` config parameter or allocate more resources to your database server. + LABELS = map[instance:db1]. 
+ + - name: MySQLHighPreparedStatementsUtilization fires after 5m when >80% + interval: 1m + input_series: + - series: 'mysql_global_variables_max_prepared_stmt_count{instance="db1"}' + values: '100 100 100 100 100 100 100 100 100' + - series: 'mysql_global_status_prepared_stmt_count{instance="db1"}' + values: '50 60 70 75 81 81 81 70 60' + alert_rule_test: + - alertname: "MySQLHighPreparedStatementsUtilization(>80%)" + eval_time: 4m + exp_alerts: [] + - alertname: "MySQLHighPreparedStatementsUtilization(>80%)" + eval_time: 6m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 is using > 80% of `max_prepared_stmt_count`. + description: | + Too many prepared statements might consume a lot of memory. + LABELS = map[instance:db1]. + + - name: MySQLSlowQueries fires at 4m when slow_queries increases + interval: 1m + input_series: + - series: 'mysql_global_status_slow_queries{instance="db1"}' + values: '10 10 11 12 13' + alert_rule_test: + - alertname: MySQLSlowQueries + eval_time: 2m + exp_alerts: [] + - alertname: MySQLSlowQueries + eval_time: 4m + exp_alerts: + - exp_labels: + instance: db1 + severity: info + exp_annotations: + summary: MySQL instance db1 has slow queries. + description: | + Consider optimizing the query by reviewing its execution plan, then rewrite the query and add any relevant indexes. + LABELS = map[instance:db1]. + + - name: MySQLInnoDBLogWaits fires at 16m when log waits keep growing + interval: 1m + input_series: + - series: 'mysql_global_status_innodb_log_waits{instance="db1"}' + values: '0 700 1400 2100 2800 3500 4200 4900 5600 6300 7000 7700 8400 9100 9800 10500 11200 11900 12600' + alert_rule_test: + - alertname: MySQLInnoDBLogWaits + eval_time: 16m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 has long InnoDB log waits. + description: | + MySQL InnoDB log writes might be stalling. 
+ Check I/O activity on your nodes to find the responsible process or query. Consider using iotop and the performance_schema. + LABELS = map[instance:db1]. + + - name: MySQLRestarted fires when uptime < 1m and clears by 5m + interval: 1m + input_series: + - series: 'mysql_global_status_uptime{instance="db1"}' + values: '10 20 30 40 120 180 240 300 360' + alert_rule_test: + - alertname: MySQLRestarted + eval_time: 2m + exp_alerts: + - exp_labels: + instance: db1 + severity: info + exp_annotations: + summary: MySQL instance db1 restarted. + description: | + MySQL restarted less than one minute ago. + If the restart was unplanned or frequent, check Loki logs (e.g. `error.log`). + LABELS = map[__name__:mysql_global_status_uptime instance:db1]. + - alertname: MySQLRestarted + eval_time: 5m + exp_alerts: [] + + - name: MySQLConnectionErrors fire at 5m when connection_errors_total > 0 + interval: 1m + input_series: + - series: 'mysql_global_status_connection_errors_total{instance="db1"}' + values: '0 0 0 0 0 12 12 12 12' + alert_rule_test: + - alertname: MySQLConnectionErrors + eval_time: 5m + exp_alerts: + - exp_labels: + instance: db1 + severity: warning + exp_annotations: + summary: MySQL instance db1 has connection errors. + description: | + Connection errors might indicate network issues, authentication problems, or resource limitations. + Check the MySQL logs for more details. + LABELS = map[instance:db1]. 
diff --git a/tests/alerts/test_replication_alerts.yaml b/tests/alerts/test_replication_alerts.yaml new file mode 100644 index 0000000000..0e1c96968a --- /dev/null +++ b/tests/alerts/test_replication_alerts.yaml @@ -0,0 +1,162 @@ +rule_files: + - ../../src/prometheus_alert_rules/replication_rules.yaml + +evaluation_interval: 1m + +tests: + - name: MySQLClusterUnitOffline fires after 5m of OFFLINE=1 + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="OFFLINE",member_role="SECONDARY"}' + values: '1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLClusterUnitOffline + eval_time: 5m + exp_alerts: + - exp_labels: + alertname: MySQLClusterUnitOffline + severity: warning + instance: db1 + member_state: OFFLINE + member_role: SECONDARY + exp_annotations: + summary: MySQL cluster member db1 is offline + description: | + The MySQL member is marked offline in the cluster, although the process might still be running. + If this is unexpected, please check the logs. + LABELS = map[__name__:mysql_perf_schema_replication_group_member_info instance:db1 member_role:SECONDARY member_state:OFFLINE].
+ + - name: MySQLClusterUnitOffline does not fire when OFFLINE=0 + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="OFFLINE",member_role="SECONDARY"}' + values: '0 0 0 0 0 0' + alert_rule_test: + - alertname: MySQLClusterUnitOffline + eval_time: 6m + exp_alerts: [] + + - name: MySQLClusterNoPrimary fires immediately when there is no PRIMARY + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="SECONDARY"}' + values: '1 1' + alert_rule_test: + - alertname: MySQLClusterNoPrimary + eval_time: 0m + exp_alerts: + - exp_labels: + alertname: MySQLClusterNoPrimary + severity: critical + member_role: PRIMARY + member_state: ONLINE + exp_annotations: + summary: MySQL cluster reports no primary + description: | + MySQL has no primaries. The cluster will likely be in a Read-Only state. + Please check the cluster health, the logs and investigate. + LABELS = map[member_role:PRIMARY member_state:ONLINE]. + + - name: MySQLClusterTooManyPrimaries fires after 15m when PRIMARY count > 1 + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLClusterTooManyPrimaries + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: MySQLClusterTooManyPrimaries + severity: critical + exp_annotations: + summary: MySQL cluster reports more than one primary. + description: | + MySQL reports more than one primary. This can indicate a split-brain situation. + Please refer to the troubleshooting docs. + LABELS = map[].
+ + - name: MySQLNoReplication fires after 15m if there are no SECONDARY + interval: 1m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLNoReplication + eval_time: 15m + exp_alerts: + - exp_labels: + alertname: MySQLNoReplication + severity: warning + member_role: SECONDARY + exp_annotations: + summary: MySQL cluster has no secondaries. + description: | + The MySQL cluster has no secondaries. This means that the cluster is not redundant and a failure of the primary will lead to downtime. + Please check the cluster health, the logs and investigate. + LABELS = map[member_role:SECONDARY]. + + - name: MySQLGroupReplicationReduced fires when ONLINE count drops below prior 6h max + interval: 5m + input_series: + - series: 'mysql_perf_schema_replication_group_member_info{instance="db1",member_state="ONLINE",member_role="SECONDARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + - series: 'mysql_perf_schema_replication_group_member_info{instance="db2",member_state="ONLINE",member_role="SECONDARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0' + - series: 'mysql_perf_schema_replication_group_member_info{instance="db3",member_state="ONLINE",member_role="PRIMARY"}' + values: '1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1' + alert_rule_test: + - alertname: MySQLGroupReplicationReduced + eval_time: 5h + exp_alerts: + - exp_labels: + alertname: MySQLGroupReplicationReduced + severity: warning + exp_annotations: + summary: MySQL cluster's Group Replication size reduced + description: | + 
The number of ONLINE members in the MySQL Group Replication cluster has reduced compared to the maximum observed in the last 6 hours. + Please check the cluster health, the logs and investigate. + LABELS = map[]. + + - name: MySQLGroupReplicationConflicts fires when rate>0 for 5m + interval: 1m + input_series: + - series: 'mysql_perf_schema_conflicts_detected{instance="db1"}' + values: '0 1 2 3 4 5 6 7 8 9 10' + alert_rule_test: + - alertname: MySQLGroupReplicationConflicts + eval_time: 10m + exp_alerts: + - exp_labels: + alertname: MySQLGroupReplicationConflicts + severity: warning + instance: db1 + exp_annotations: + summary: MySQL cluster reports Group Replication conflicts + description: | + Conflicts indicate concurrent writes to the same rows/keys across members. + Please check the cluster health, the logs and investigate. + LABELS = map[instance:db1]. + + - name: MySQLGroupReplicationQueueSizeHigh fires when queue size >100 for 5m + interval: 1m + input_series: + - series: 'mysql_perf_schema_transactions_in_queue{instance="db2"}' + values: '0 0 0 120 120 120 120 120 120' + alert_rule_test: + - alertname: MySQLGroupReplicationQueueSizeHigh + eval_time: 8m + exp_alerts: + - exp_labels: + alertname: MySQLGroupReplicationQueueSizeHigh + severity: warning + instance: db2 + exp_annotations: + summary: MySQL cluster reports high Group Replication queue size + description: | + A high number of transactions in the Group Replication queue might indicate network issues or overloaded nodes. + Please check the cluster health, the logs and investigate. + LABELS = map[__name__:mysql_perf_schema_transactions_in_queue instance:db2].