Skip to content

Commit 5f89be8

Browse files
committed
fixed openhpc dashboard
1 parent 9ab06a6 commit 5f89be8

File tree

2 files changed

+23
-10
lines changed

2 files changed

+23
-10
lines changed

ansible/roles/grafana-dashboards/files/openhpc-slurm.json

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -711,7 +711,7 @@
711711
"steppedLine": false,
712712
"targets": [
713713
{
714-
"expr": "100 - (100 * node_memory_MemAvailable_bytes{job=~\"node\", instance=~\"$instance\"} / node_memory_MemTotal_bytes{job=~\"node\", instance=~\"$instance\"})",
714+
"expr": "100 - (100 * node_memory_MemAvailable_bytes{job=~\"node-exporter\", instance=~\"$instance\"} / node_memory_MemTotal_bytes{job=~\"node-exporter\", instance=~\"$instance\"})",
715715
"format": "time_series",
716716
"interval": "",
717717
"intervalFactor": 1,
@@ -818,7 +818,7 @@
818818
"steppedLine": false,
819819
"targets": [
820820
{
821-
"expr": "(100 * sum by(instance)(increase(node_cpu_seconds_total{mode=\"iowait\",job=~\"node_fast\"}[1s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job=~\"node_fast\"}[1s])))",
821+
"expr": "(100 * sum by(instance)(increase(node_cpu_seconds_total{mode=\"iowait\",job=~\"node-exporter_fast\"}[1s]))) / (sum by(instance)(increase(node_cpu_seconds_total{job=~\"node-exporter_fast\"}[1s])))",
822822
"hide": true,
823823
"interval": "",
824824
"legendFormat": "{{ instance }}",
@@ -1489,7 +1489,7 @@
14891489
"steppedLine": false,
14901490
"targets": [
14911491
{
1492-
"expr": "increase(node_infiniband_port_transmit_wait_total{job=\"node\",instance=~\"$instance\"}[1m]) / increase(node_infiniband_port_packets_transmitted_total{job=\"node\",instance=~\"$instance\"}[1m])",
1492+
"expr": "increase(node_infiniband_port_transmit_wait_total{job=\"node-exporter\",instance=~\"$instance\"}[1m]) / increase(node_infiniband_port_packets_transmitted_total{job=\"node-exporter\",instance=~\"$instance\"}[1m])",
14931493
"hide": false,
14941494
"interval": "",
14951495
"legendFormat": "{{ instance }} {{ device }} {{ port }}",
@@ -1950,15 +1950,15 @@
19501950
"allValue": null,
19511951
"current": {},
19521952
"datasource": "${DS_PROMETHEUS}",
1953-
"definition": "label_values(node_cpu_seconds_total{job=~\"node\"}, env)",
1953+
"definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, cluster_env)",
19541954
"error": null,
19551955
"hide": 0,
19561956
"includeAll": true,
19571957
"label": "Environment",
19581958
"multi": true,
1959-
"name": "env",
1959+
"name": "cluster_env",
19601960
"options": [],
1961-
"query": "label_values(node_cpu_seconds_total{job=~\"node\"}, env)",
1961+
"query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\"}, cluster_env)",
19621962
"refresh": 1,
19631963
"regex": "",
19641964
"skipUrlSync": false,
@@ -1994,15 +1994,15 @@
19941994
"allValue": null,
19951995
"current": {},
19961996
"datasource": "${DS_PROMETHEUS}",
1997-
"definition": "label_values(node_cpu_seconds_total{job=~\"node\", env=~\"$env\", instance=~\"$host_filter\"}, instance)",
1997+
"definition": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", cluster_env=~\"$cluster_env\", instance=~\"$host_filter\"}, instance)",
19981998
"error": null,
19991999
"hide": 0,
20002000
"includeAll": true,
20012001
"label": null,
20022002
"multi": true,
20032003
"name": "instance",
20042004
"options": [],
2005-
"query": "label_values(node_cpu_seconds_total{job=~\"node\", env=~\"$env\", instance=~\"$host_filter\"}, instance)",
2005+
"query": "label_values(node_cpu_seconds_total{job=~\"node-exporter\", cluster_env=~\"$cluster_env\", instance=~\"$host_filter\"}, instance)",
20062006
"refresh": 1,
20072007
"regex": "",
20082008
"skipUrlSync": false,
@@ -2017,15 +2017,15 @@
20172017
"allValue": null,
20182018
"current": {},
20192019
"datasource": "${DS_PROMETHEUS}",
2020-
"definition": "label_values(node_infiniband_info{job=~\"node\"}, device)",
2020+
"definition": "label_values(node_infiniband_info{job=~\"node-exporter\"}, device)",
20212021
"error": null,
20222022
"hide": 0,
20232023
"includeAll": true,
20242024
"label": "infiniband device",
20252025
"multi": true,
20262026
"name": "device",
20272027
"options": [],
2028-
"query": "label_values(node_infiniband_info{job=~\"node\"}, device)",
2028+
"query": "label_values(node_infiniband_info{job=~\"node-exporter\"}, device)",
20292029
"refresh": 1,
20302030
"regex": "",
20312031
"skipUrlSync": false,

ansible/roles/kube_prometheus_stack/defaults/main/helm.yml

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -172,6 +172,19 @@ kube_prometheus_stack_release_defaults:
172172
prometheus-node-exporter:
173173
image:
174174
tag: "{{ node_exporter_image_tag }}"
175+
prometheus:
176+
monitor:
177+
relabelings:
178+
# Set the `instance` label to the Kubernetes node name truncated at the first dot (short hostname, domain suffix stripped)
179+
- sourceLabels: [__meta_kubernetes_pod_node_name]
180+
separator: ;
181+
regex: ([^.]+).*
182+
targetLabel: instance
183+
replacement: $1
184+
action: replace
185+
metricRelabelings:
186+
- targetLabel: cluster_env
187+
replacement: ungrouped
175188

176189
kube_prometheus_stack_release_overrides: {}
177190

0 commit comments

Comments
 (0)