@@ -20,15 +20,62 @@ prometheus_scrape_configs_default:
2020 replacement : ' ${1}'
2121
# Effective scrape configuration: the defaults, plus the Open OnDemand scrape
# configs when the `openondemand` inventory group has at least one host.
# The template must start at the first character of the string (no leading
# whitespace) so that Ansible returns the rendered list rather than text.
prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}"
23- prometheus_extra_rules : " {{ prometheus_extra_rules_default + (openondemand_extra_rules if groups['openondemand'] | count > 0 else []) }}"
24- prometheus_extra_rules_default :
23+ prometheus_extra_rules :
# Critical alert raised while the slurm_nodes_down metric is above zero.
# {% raw %} keeps Ansible from rendering the Prometheus `{{ $value }}` template.
- alert: SlurmNodeDown
  expr: |
    slurm_nodes_down > 0
  labels:
    severity: critical
  annotations:
    summary: 'At least one Slurm node is down.'
    description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'
# Critical alert for any series where probe_success is 0.
# `for: 0m` means there is no hold period: the alert fires on the first
# evaluation in which the expression matches.
- alert: BlackboxProbeFailed
  expr: probe_success == 0
  for: 0m
  labels:
    severity: critical
  annotations:
    # {% raw %} stops Ansible from rendering the Prometheus label templates.
    summary: '{% raw %}Blackbox probe failed (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}Blackbox probe '{{ $labels.target }}' failed{% endraw %}"
# Warning when the 1-minute average probe duration stays above 1.2 seconds
# for at least one minute.
- alert: BlackboxSlowProbe
  # Threshold is 1.2s: around 1.14 is expected due to indirection in cluster.
  expr: avg_over_time(probe_duration_seconds[1m]) > 1.2
  for: 1m
  labels:
    severity: warning
  annotations:
    # {% raw %} stops Ansible from rendering the Prometheus templates.
    summary: '{% raw %}Blackbox slow probe (target {{ $labels.target }}){% endraw %}'
    # The message quotes the same 1.2s threshold used in `expr`.
    description: >-
      {% raw %}Blackbox probe '{{ $labels.target }}' took more than 1.2s
      to complete - {{ $value }}{% endraw %}
# Critical alert when the probed HTTP status code is outside 200-399
# (i.e. <= 199 or >= 400). No hold period (`for: 0m`).
- alert: BlackboxProbeHttpFailure
  expr: probe_http_status_code <= 199 OR probe_http_status_code >= 400
  for: 0m
  labels:
    severity: critical
  annotations:
    # {% raw %} stops Ansible from rendering the Prometheus templates.
    summary: '{% raw %}Blackbox probe HTTP failure (target {{ $labels.target }}){% endraw %}'
    description: >-
      {% raw %}Blackbox probe '{{ $labels.target }}' returned an HTTP error
      status - {{ $value }}{% endraw %}
# Warning when the remaining certificate lifetime (expiry minus current time,
# in seconds) is at least 7 days and less than 30 days.
# last_over_time(...[10m]) uses the most recent sample from the last 10 minutes.
- alert: BlackboxSslCertificateWillExpireSoon
  expr: (7 * 24 * 3600) <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (30 * 24 * 3600)
  for: 0m
  labels:
    severity: warning
  annotations:
    # {% raw %} stops Ansible from rendering the Prometheus templates.
    summary: '{% raw %}Blackbox SSL certificate will expire soon (target {{ $labels.target }}){% endraw %}'
    description: >-
      {% raw %}SSL certificate for blackbox probe '{{ $labels.target }}'
      expires in {{ $value | humanizeDuration }}{% endraw %}
# Critical alert when the remaining certificate lifetime (expiry minus current
# time, in seconds) is non-negative and less than 7 days.
# last_over_time(...[10m]) uses the most recent sample from the last 10 minutes.
- alert: BlackboxSslCertificateWillExpireVerySoon
  expr: 0 <= (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < (7 * 24 * 3600)
  for: 0m
  labels:
    severity: critical
  annotations:
    # {% raw %} stops Ansible from rendering the Prometheus templates.
    summary: '{% raw %}Blackbox SSL certificate will expire very soon (target {{ $labels.target }}){% endraw %}'
    description: >-
      {% raw %}SSL certificate for blackbox probe '{{ $labels.target }}'
      expires in {{ $value | humanizeDuration }}{% endraw %}
# Critical alert when the certificate expiry time is already in the past
# (expiry minus current time is negative).
# last_over_time(...[10m]) uses the most recent sample from the last 10 minutes.
- alert: BlackboxSslCertificateExpired
  expr: (last_over_time(probe_ssl_earliest_cert_expiry[10m]) - time()) < 0
  for: 0m
  labels:
    severity: critical
  annotations:
    # {% raw %} stops Ansible from rendering the Prometheus templates.
    summary: '{% raw %}Blackbox SSL certificate expired (target {{ $labels.target }}){% endraw %}'
    description: "{% raw %}SSL certificate for blackbox probe '{{ $labels.target }}' has expired{% endraw %}"
# Recording rule: per-instance percentage of node-exporter CPU time spent in
# "system" mode over the last 60s (system-mode increase / all-mode increase).
# The folded scalar joins the two lines with a single space.
- record: node_cpu_system_seconds:record
  expr: >-
    (100 * sum by(instance)(increase(node_cpu_seconds_total{mode="system",job="node-exporter"}[60s])))
    / (sum by(instance)(increase(node_cpu_seconds_total{job="node-exporter"}[60s])))
3481 - record : node_cpu_user_seconds:record
0 commit comments