Skip to content

Commit 4ad3351

Browse files
committed
added blackbox alert rules from capi helm
1 parent 613a3f0 commit 4ad3351

File tree

3 files changed

+49
-11
lines changed

3 files changed

+49
-11
lines changed

ansible/roles/kube_prometheus_stack/defaults/main/main.yml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -67,7 +67,6 @@ prometheus_external_labels:
6767
prometheus_scrape_configs: []
6868

6969
prometheus_extra_rules: []
70-
prometheus_extra_rules_default: []
7170

7271
prometheus_rules:
7372
appliance-rules:

environments/common/inventory/group_vars/all/openondemand.yml

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -196,14 +196,6 @@ openondemand_scrape_configs:
196196
target_label: target
197197
- target_label: __address__
198198
replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"
199-
openondemand_extra_rules:
200-
- alert: OnDemandProbeFailed
201-
annotations:
202-
description: '{% raw %}Could not establish secure connection to OOD server at {{ $labels.target }}{% endraw %}'
203-
summary: 'Could not establish a secure connection to an Open OnDemand server'
204-
expr: "probe_success{target='ondemand.monitoring-system'} < 1\n"
205-
labels:
206-
severity: warning
207199

208200
openondemand_dashboard:
209201
- dashboard_id: 13465

environments/common/inventory/group_vars/all/prometheus.yml

Lines changed: 49 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -20,15 +20,62 @@ prometheus_scrape_configs_default:
2020
replacement: '${1}'
2121

2222
prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}"
23-
prometheus_extra_rules: "{{ prometheus_extra_rules_default + (openondemand_extra_rules if groups['openondemand'] | count > 0 else []) }}"
24-
prometheus_extra_rules_default:
23+
prometheus_extra_rules:
2524
- alert: SlurmNodeDown
2625
annotations:
2726
description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'
2827
summary: 'At least one Slurm node is down.'
2928
expr: "slurm_nodes_down > 0\n"
3029
labels:
3130
severity: critical
31+
- alert: BlackboxProbeFailed
32+
expr: probe_success == 0  # selects every probe series whose probe_success sample equals 0
33+
for: 0m  # zero hold time configured; the condition does not have to persist
34+
labels:
35+
severity: critical
36+
annotations:
37+
summary: '{% raw %}Blackbox probe failed (target {{ $labels.target }}){% endraw %}'
38+
description: "{% raw %}Blackbox probe '{{ $labels.target }}' failed{% endraw %}"
39+
- alert: BlackboxSlowProbe
40+
expr: avg_over_time(probe_duration_seconds[1m]) > 1.2  # threshold set above 1s: around 1.14 is expected due to indirection in cluster
41+
for: 1m
42+
labels:
43+
severity: warning
44+
annotations:
45+
summary: '{% raw %}Blackbox slow probe (target {{ $labels.target }}){% endraw %}'
46+
description: "{% raw %}Blackbox probe '{{ $labels.target }}' took more than 1.2s to complete - {{ $value }}{% endraw %}"
47+
- alert: BlackboxProbeHttpFailure
48+
expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400  # any status code outside the 200-399 range
49+
for: 0m
50+
labels:
51+
severity: critical
52+
annotations:
53+
summary: '{% raw %}Blackbox probe HTTP failure (target {{ $labels.target }}){% endraw %}'
54+
description: "{% raw %}Blackbox probe '{{ $labels.target }}' returned an HTTP error status - {{ $value }}{% endraw %}"
55+
- alert: BlackboxSslCertificateWillExpireSoon
56+
expr: (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600)  # expiry minus now is at least 7 days and less than 30 days (in seconds)
57+
for: 0m
58+
labels:
59+
severity: warning
60+
annotations:
61+
summary: '{% raw %}Blackbox SSL certificate will expire soon (target {{ $labels.target }}){% endraw %}'
62+
description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
63+
- alert: BlackboxSslCertificateWillExpireVerySoon
64+
expr: 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600)  # expiry minus now is non-negative and less than 7 days (in seconds)
65+
for: 0m
66+
labels:
67+
severity: critical
68+
annotations:
69+
summary: '{% raw %}Blackbox SSL certificate will expire very soon (target {{ $labels.target }}){% endraw %}'
70+
description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
71+
- alert: BlackboxSslCertificateExpired
72+
expr: (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0  # expiry minus now is negative, i.e. the expiry timestamp is already in the past
73+
for: 0m
74+
labels:
75+
severity: critical
76+
annotations:
77+
summary: '{% raw %}Blackbox SSL certificate expired (target {{ $labels.target }}){% endraw %}'
78+
description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' has expired{% endraw %}"
3279
- record: node_cpu_system_seconds:record
3380
expr: (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
3481
- record: node_cpu_user_seconds:record

0 commit comments

Comments
 (0)