|
1 | 1 | apiVersion: v1 |
2 | 2 | kind: Alert |
3 | | -app: OpenShift HAProxy Router |
| 3 | +app: 'OpenShift HAProxy Router' |
4 | 4 | version: 1.0.0 |
5 | 5 | appVersion: |
6 | 6 | - '3.11' |
7 | | -- '4.3' |
| 7 | +- '4.7' |
8 | 8 | descriptionFile: ALERTS.md |
9 | 9 | configurations: |
10 | 10 | - kind: Prometheus |
11 | 11 | data: | |
12 | 12 | groups: |
13 | 13 | - name: OpenShift-HAProxy-Router |
14 | 14 | rules: |
15 | | - - alert: RouterDown |
16 | | - expr: | |
17 | | - absent((count(haproxy_process_start_time_seconds) < 1)) |
18 | | - for: 10m |
19 | | - labels: |
20 | | - severity: page |
21 | | - annotations: |
22 | | - summary: Router HAProxy down. No instances running. |
23 | | - - alert: DownTimeInService |
24 | | - expr: | |
25 | | - haproxy_backend_downtime_seconds_total > 0 |
26 | | - for: 10m |
27 | | - labels: |
28 | | - severity: page |
29 | | - annotations: |
30 | | - summary: DownTime detected in service. Route {{$labels.route}}, pod {{labels.pod}} |
31 | | - - alert: RouteDown |
32 | | - expr: | |
33 | | - sum by (route) (haproxy_server_up==1) == 0 |
34 | | - for: 10m |
35 | | - labels: |
36 | | - severity: page |
37 | | - annotations: |
38 | | - summary: All servers are down in route {{$labels.route}} |
39 | | - - alert: HighLatency |
40 | | - expr: | |
41 | | - max by (route)(haproxy_server_http_average_response_latency_milliseconds{route!=""}) > 250 |
42 | | - for: 10m |
43 | | - labels: |
44 | | - severity: page |
45 | | - annotations: |
46 | | - summary: High latency in at least one server for the route {{$labels.route}} |
47 | | - - alert: PodHealthCheckFailure |
48 | | - expr: | |
49 | | - rate(haproxy_server_check_failures_total[5m]) > 0 |
50 | | - for: 10m |
51 | | - labels: |
52 | | - severity: page |
53 | | - annotations: |
54 | | - summary: Recurrent health check failure in pod {{$labels.pod}} and route {{$labels.route}} |
55 | | - - alert: QueueNotEmptyInRoute |
56 | | - expr: | |
57 | | - sum by (route)(haproxy_server_current_queue{route!=""}) > 0 |
58 | | - for: 10m |
59 | | - labels: |
60 | | - severity: page |
61 | | - annotations: |
62 | | - summary: Queue not empty in route {{$labels.route}} |
63 | | - - alert: HighErrorRateInRoute |
64 | | - expr: | |
65 | | - sum by (route) (rate(haproxy_server_http_responses_total{code!="2xx"}[5m])) / |
66 | | - sum by (route) (rate(haproxy_server_http_responses_total{}[5m])) |
67 | | - for: 10m |
68 | | - labels: |
69 | | - severity: page |
70 | | - annotations: |
71 | | - summary: High error rate in route {{$labels.route}} |
72 | | - - alert: ConnectionErrorsInRoute |
73 | | - expr: | |
74 | | - sum by (route)(rate(haproxy_server_connection_errors_total{route!=""}[5m])) > 0 |
75 | | - for: 10m |
76 | | - labels: |
77 | | - severity: page |
78 | | - annotations: |
79 | | - summary: Recurring connection errors in route {{$labels.route}} |
| 15 | + - alert: '[OpenShift-HAProxy-Router] Router Down' |
| 16 | + expr: | |
| 17 | + absent(haproxy_process_start_time_seconds) == 1 |
| 18 | + for: 10m |
| 19 | + labels: |
| 20 | + severity: critical |
| 21 | + annotations: |
| 22 | + description: Router HAProxy down. No instances running. |
| 23 | + - alert: '[OpenShift-HAProxy-Router] Percentage of routers low' |
| 24 | + expr: | |
| 25 | + count (haproxy_process_start_time_seconds)/sum (kube_workload_status_desired{kube_namespace_name=~"default|openshift-ingress",kube_workload_name=~"router.*"}) < 0.75 |
| 26 | + for: 10m |
| 27 | + labels: |
| 28 | + severity: critical |
| 29 | + annotations: |
| 30 | + description: Less than 75% Routers are up |
| 31 | + - alert: '[OpenShift-HAProxy-Router] Route Down' |
| 32 | + expr: | |
| 33 | + sum by (namespace,route)(haproxy_server_up) < 1 |
| 34 | + for: 10m |
| 35 | + labels: |
| 36 | + severity: critical |
| 37 | + annotations: |
| 38 | + description: This alert detects if all servers are down in a route |
| 39 | + - alert: '[OpenShift-HAProxy-Router] High Latency' |
| 40 | + expr: | |
| 41 | + max by (namespace,route)(haproxy_server_http_average_response_latency_milliseconds{route!=""}) > 250 |
| 42 | + for: 10m |
| 43 | + labels: |
| 44 | + severity: warning |
| 45 | + annotations: |
| 46 | + description: This alert detects high latency in at least one server of the route |
| 47 | + - alert: '[OpenShift-HAProxy-Router] Pod Health Check Failure' |
| 48 | + expr: | |
| 49 | + sum by (namespace,route,pod)(rate(haproxy_server_check_failures_total[5m])) > 0 |
| 50 | + for: 10m |
| 51 | + labels: |
| 52 | + severity: critical |
| 53 | + annotations: |
| 54 | + description: This alert triggers when there is a recurrent pod health check failure. |
| 55 | + - alert: '[OpenShift-HAProxy-Router] Queue not empty in route' |
| 56 | + expr: | |
| 57 | + sum by (namespace,route)(haproxy_server_current_queue{route!=""}) > 0 |
| 58 | + for: 10m |
| 59 | + labels: |
| 60 | + severity: warning |
| 61 | + annotations: |
| 62 | + description: This alert triggers when a queue is not empty in a route |
| 63 | + - alert: '[OpenShift-HAProxy-Router] High error rate in route' |
| 64 | + expr: | |
| 65 | + sum by (namespace,route) (rate(haproxy_server_http_responses_total{route!="",code=~"4xx|5xx"}[5m])) / sum by (namespace,route) (rate(haproxy_server_http_responses_total{route!=""}[5m])) > 0.15 |
| 66 | + for: 10m |
| 67 | + labels: |
| 68 | + severity: critical |
| 69 | + annotations: |
| 70 | + description: This alert triggers when there is a high error rate in a route. |
| 71 | + - alert: '[OpenShift-HAProxy-Router] Connection errors in route' |
| 72 | + expr: | |
| 73 | + sum by (namespace,route)(rate(haproxy_server_connection_errors_total{route!=""}[5m])) > 0 |
| 74 | + for: 10m |
| 75 | + labels: |
| 76 | + severity: warning |
| 77 | + annotations: |
| 78 | + description: This alert triggers when there are recurring connection errors in a route |
0 commit comments