File tree Expand file tree Collapse file tree 5 files changed +40
-1
lines changed
ansible/roles/kube_prometheus_stack
environments/common/inventory/group_vars/all Expand file tree Collapse file tree 5 files changed +40
-1
lines changed Original file line number Diff line number Diff line change @@ -20,6 +20,8 @@ kube_prometheus_stack_wait_timeout: 5m
20
20
kube_prometheus_stack_metrics_image_tag: v2.12.0
kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6

# Helm release name given to the prometheus-blackbox-exporter chart.
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter

# ansible_host of the first host in the 'control' inventory group.
control_ip: "{{ hostvars[groups['control'].0].ansible_host }}"

grafana_auth_anonymous: false
@@ -64,6 +66,7 @@ prometheus_external_labels:
64
66
# Empty lists here mean "nothing configured"; written as [] rather than a bare
# key so the value is a list and not null.
prometheus_scrape_configs: []

prometheus_extra_rules: []
prometheus_extra_rules_default: []
67
70
68
71
prometheus_rules :
69
72
appliance-rules :
Original file line number Diff line number Diff line change 176
176
ansible.builtin.import_role :
177
177
name : grafana-dashboards
178
178
179
# Deploy the Prometheus blackbox exporter from the prometheus-community chart
# repository, using the namespace variable of the kube-prometheus-stack release.
- name: Install blackbox exporter helm chart
  kubernetes.core.helm:
    chart_ref: prometheus-blackbox-exporter
    chart_repo_url: https://prometheus-community.github.io/helm-charts
    # Quoted so the pinned chart version is always treated as a string.
    chart_version: "9.0.1"
    release_name: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}"
    release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
    release_values:
      # Value passed to the chart's nodeSelector: clusterrole=server.
      nodeSelector:
        clusterrole: "server"
    # Canonical boolean instead of the YAML 1.1 truthy `yes`.
    wait: true

179
191
- name : Install kube-prometheus-stack on target Kubernetes cluster
180
192
kubernetes.core.helm :
181
193
chart_ref : " {{ kube_prometheus_stack_chart_name }}"
Original file line number Diff line number Diff line change @@ -2,5 +2,6 @@ kube_prometheus_stack_chart_version: 59.1.0
2
2
kube_prometheus_stack_release_namespace: monitoring-system
kube_prometheus_stack_release_name: kube-prometheus-stack
kube_prometheus_stack_wait_timeout: 5m
# Release name of the blackbox exporter chart; the blackbox-probes scrape job
# builds the exporter's service address from this value.
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter
5
6
6
7
# See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services
Original file line number Diff line number Diff line change @@ -182,6 +182,28 @@ openondemand_scrape_configs:
182
182
labels :
183
183
environment : " {{ appliances_environment_name }}"
184
184
service : " openondemand"
185
# Probe the Open OnDemand HTTPS endpoint through the blackbox exporter using
# its http_2xx module.
- job_name: "blackbox-probes"
  metrics_path: /probe
  params:
    module: [http_2xx]
  static_configs:
    - targets:
        - "https://{{ openondemand_address }}"
  relabel_configs:
    # Hand the listed URL to the exporter as the `target` query parameter.
    - source_labels: [__address__]
      target_label: __param_target
    # Keep the probed URL as a `target` label on the resulting series.
    - source_labels: [__param_target]
      target_label: target
    # Scrape the blackbox exporter service instead of the probed URL itself.
    - target_label: __address__
      replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"
199
# Alert raised when the blackbox probe of the Open OnDemand URL reports failure.
openondemand_extra_rules:
  - alert: OnDemandProbeFailed
    annotations:
      # {% raw %} leaves {{ $labels.target }} untouched by Jinja so that
      # Prometheus expands it when the alert fires.
      description: '{% raw %}Could not establish secure connection to OOD server at {{ $labels.target }}{% endraw %}'
      summary: 'Could not establish a secure connection to an Open OnDemand server'
    # The `target` label is the one set by the blackbox-probes relabelling.
    expr: "probe_success{target='https://{{ openondemand_address }}'} < 1\n"
    labels:
      severity: warning
185
207
186
208
openondemand_dashboard :
187
209
- dashboard_id : 13465
Original file line number Diff line number Diff line change @@ -20,7 +20,8 @@ prometheus_scrape_configs_default:
20
20
replacement : ' ${1}'
21
21
22
22
# Append the Open OnDemand scrape jobs and alert rules to the defaults only
# when the inventory has at least one host in the 'openondemand' group.
# `default([])` keeps templating working when that group is not defined at all.
prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | default([]) | count > 0 else []) }}"
prometheus_extra_rules: "{{ prometheus_extra_rules_default + (openondemand_extra_rules if groups['openondemand'] | default([]) | count > 0 else []) }}"
24
+ prometheus_extra_rules_default :
24
25
- alert : SlurmNodeDown
25
26
annotations :
26
27
description : ' {% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'
You can’t perform that action at this time.
0 commit comments