@@ -20,15 +20,62 @@ prometheus_scrape_configs_default:
20
20
replacement : ' ${1}'
21
21
22
22
prometheus_scrape_configs : " {{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}"
23
- prometheus_extra_rules : " {{ prometheus_extra_rules_default + (openondemand_extra_rules if groups['openondemand'] | count > 0 else []) }}"
24
- prometheus_extra_rules_default :
23
+ prometheus_extra_rules :
25
24
- alert : SlurmNodeDown
26
25
annotations :  # raw/endraw tags stop Jinja from expanding the alert template braces
27
26
description : ' {% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'
28
27
summary : ' At least one Slurm node is down.'
29
28
expr : " slurm_nodes_down > 0\n "  # alert condition: slurm_nodes_down above zero
30
29
labels :
31
30
severity : critical
31
+ - alert : BlackboxProbeFailed
32
+ expr : probe_success == 0  # selects every probe series whose success value is 0
33
+ for : 0m  # zero hold time configured for this rule
34
+ labels :
35
+ severity : critical
36
+ annotations :  # raw/endraw tags stop Jinja from expanding the alert template braces
37
+ summary : ' {% raw %}Blackbox probe failed (target {{ $labels.target }}){% endraw %}'
38
+ description : " {% raw %}Blackbox probe '{{ $labels.target }}' failed{% endraw %}"
39
+ - alert : BlackboxSlowProbe
40
+ expr : avg_over_time(probe_duration_seconds[1m]) > 1.2 # around 1.14 expected due to indirection in cluster
41
+ for : 1m  # the condition has to persist for a minute before the alert fires
42
+ labels :
43
+ severity : warning
44
+ annotations :
45
+ summary : ' {% raw %}Blackbox slow probe (target {{ $labels.target }}){% endraw %}'
46
+ description : " {% raw %}Blackbox probe '{{ $labels.target }}' took more than 1.2s to complete - {{ $value }}{% endraw %}"
47
+ - alert : BlackboxProbeHttpFailure
48
+ expr : probe_http_status_code <= 199 OR probe_http_status_code >= 400  # any status code outside 200-399
49
+ for : 0m  # zero hold time configured for this rule
50
+ labels :
51
+ severity : critical
52
+ annotations :
53
+ summary : ' {% raw %}Blackbox probe HTTP failure (target {{ $labels.target }}){% endraw %}'
54
+ description : " {% raw %}Blackbox probe '{{ $labels.target }}' returned an HTTP error status - {{ $value }}{% endraw %}"
55
+ - alert : BlackboxSslCertificateWillExpireSoon
56
+ expr : (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600)  # expiry minus now is at least 7 days and under 30 days (in seconds)
57
+ for : 0m  # zero hold time configured for this rule
58
+ labels :
59
+ severity : warning
60
+ annotations :
61
+ summary : ' {% raw %}Blackbox SSL certificate will expire soon (target {{ $labels.target }}){% endraw %}'
62
+ description : " {% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
63
+ - alert : BlackboxSslCertificateWillExpireVerySoon
64
+ expr : 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600)  # expiry minus now is from 0 up to, not including, 7 days (in seconds)
65
+ for : 0m  # zero hold time configured for this rule
66
+ labels :
67
+ severity : critical
68
+ annotations :
69
+ summary : ' {% raw %}Blackbox SSL certificate will expire very soon (target {{ $labels.target }}){% endraw %}'
70
+ description : " {% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' expires in {{ $value | humanizeDuration }}{% endraw %}"
71
+ - alert : BlackboxSslCertificateExpired
72
+ expr : (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0  # expiry value is already below the current time()
73
+ for : 0m  # zero hold time configured for this rule
74
+ labels :
75
+ severity : critical
76
+ annotations :
77
+ summary : ' {% raw %}Blackbox SSL certificate expired (target {{ $labels.target }}){% endraw %}'
78
+ description : " {% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' has expired{% endraw %}"
32
79
- record : node_cpu_system_seconds:record
33
80
expr : (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
34
81
- record : node_cpu_user_seconds:record
0 commit comments