Skip to content

Commit fc81f09

Browse files
committed
Added blackbox probe for OOD + alert
1 parent 603e818 commit fc81f09

File tree

5 files changed

+40
-1
lines changed

5 files changed

+40
-1
lines changed

ansible/roles/kube_prometheus_stack/defaults/main/main.yml

Lines changed: 3 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -20,6 +20,8 @@ kube_prometheus_stack_wait_timeout: 5m
2020
kube_prometheus_stack_metrics_image_tag: v2.12.0
2121
kube_prometheus_stack_patch_image_tag: v20221220-controller-v1.5.1-58-g787ea74b6
2222

23+
# Helm release name used for the blackbox exporter chart (passed as
# release_name by the "Install blackbox exporter helm chart" task).
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter
24+
2325
control_ip: "{{ hostvars[groups['control'].0].ansible_host }}"
2426

2527
grafana_auth_anonymous: false
@@ -64,6 +66,7 @@ prometheus_external_labels:
6466
prometheus_scrape_configs: []
6567

6668
prometheus_extra_rules: []
69+
# Baseline list of extra alert rules; empty in the role defaults.
prometheus_extra_rules_default: []
6770

6871
prometheus_rules:
6972
appliance-rules:

ansible/roles/kube_prometheus_stack/tasks/main.yml

Lines changed: 12 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -176,6 +176,18 @@
176176
ansible.builtin.import_role:
177177
name: grafana-dashboards
178178

179+
# Installs the prometheus-blackbox-exporter chart as its own Helm release in
# the namespace given by kube_prometheus_stack_release_namespace.
# NOTE(review): this task runs before the kube-prometheus-stack install that
# follows it; confirm the release namespace already exists at this point.
- name: Install blackbox exporter helm chart
180+
kubernetes.core.helm:
181+
chart_ref: prometheus-blackbox-exporter
182+
chart_repo_url: https://prometheus-community.github.io/helm-charts
183+
# Quoted so the pinned chart version is always read as a string.
chart_version: "9.0.1"
184+
release_name: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}"
185+
release_namespace: "{{ kube_prometheus_stack_release_namespace }}"
186+
release_values:
187+
nodeSelector:
188+
clusterrole: "server"
189+
# Canonical YAML boolean instead of the YAML 1.1 truthy value `yes`.
wait: true
190+
179191
- name: Install kube-prometheus-stack on target Kubernetes cluster
180192
kubernetes.core.helm:
181193
chart_ref: "{{ kube_prometheus_stack_chart_name }}"

environments/common/inventory/group_vars/all/monitoring.yml

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -2,5 +2,6 @@ kube_prometheus_stack_chart_version: 59.1.0
22
kube_prometheus_stack_release_namespace: monitoring-system
33
kube_prometheus_stack_release_name: kube-prometheus-stack
44
kube_prometheus_stack_wait_timeout: 5m
5+
# Helm release name for the blackbox exporter; it also prefixes the exporter
# address used by the "blackbox-probes" scrape job in openondemand.yml.
kube_prometheus_stack_blackbox_exporter_release_name: blackbox-exporter
56

67
# See prometheus.yml, grafana.yml and alertmanager.yml for config of individual monitoring services

environments/common/inventory/group_vars/all/openondemand.yml

Lines changed: 22 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -182,6 +182,28 @@ openondemand_scrape_configs:
182182
labels:
183183
environment: "{{ appliances_environment_name }}"
184184
service: "openondemand"
185+
# Scrape job that requests /probe with module=http_2xx for the Open OnDemand
# HTTPS URL.
- job_name: "blackbox-probes"
186+
metrics_path: /probe
187+
params:
188+
module: [http_2xx]
189+
static_configs:
190+
- targets:
191+
- "https://{{ openondemand_address }}"
192+
relabel_configs:
193+
# Copy the configured target URL into the `target` query parameter.
- source_labels: [__address__]
194+
target_label: __param_target
195+
# Keep the probed URL as a `target` label; the OnDemandProbeFailed rule
# selects on this label.
- source_labels: [__param_target]
196+
target_label: target
197+
# Redirect the actual scrape to port 9115 of the exporter
# (presumably the Service created by the blackbox exporter Helm release;
# verify the generated Service name).
- target_label: __address__
198+
replacement: "{{ kube_prometheus_stack_blackbox_exporter_release_name }}-prometheus-blackbox-exporter:9115"
199+
# Extra alert rules for Open OnDemand. prometheus.yml appends this list to
# prometheus_extra_rules_default when the openondemand group is non-empty.
openondemand_extra_rules:
200+
- alert: OnDemandProbeFailed
201+
annotations:
202+
# {% raw %} keeps `{{ $labels.target }}` out of Jinja rendering so the
# placeholder is passed through as written.
description: '{% raw %}Could not establish secure connection to OOD server at {{ $labels.target }}{% endraw %}'
203+
summary: 'Could not establish a secure connection to an Open OnDemand server'
204+
# True when probe_success for the Open OnDemand URL is below 1; this rule
# sets no `for:` duration.
expr: "probe_success{target='https://{{ openondemand_address }}'} < 1\n"
205+
labels:
206+
severity: warning
185207

186208
openondemand_dashboard:
187209
- dashboard_id: 13465

environments/common/inventory/group_vars/all/prometheus.yml

Lines changed: 2 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -20,7 +20,8 @@ prometheus_scrape_configs_default:
2020
replacement: '${1}'
2121

2222
prometheus_scrape_configs: "{{ prometheus_scrape_configs_default + (openondemand_scrape_configs if groups['openondemand'] | count > 0 else []) }}"
23-
prometheus_extra_rules:
23+
# Effective extra rules: the defaults, plus the Open OnDemand rules only when
# the openondemand inventory group has at least one host.
prometheus_extra_rules: "{{ prometheus_extra_rules_default + (openondemand_extra_rules if groups['openondemand'] | count > 0 else []) }}"
24+
prometheus_extra_rules_default:
2425
- alert: SlurmNodeDown
2526
annotations:
2627
description: '{% raw %}{{ $value }} Slurm nodes are in down status.{% endraw %}'

0 commit comments

Comments
 (0)