Skip to content

Commit e195b31

Browse files
Merge pull request #207 from oci-hpc/2.11.0_sam_17sep
changes based on feedback.
2 parents 9d11e5e + b3b8e08 commit e195b31

File tree

4 files changed

+24
-13
lines changed

4 files changed

+24
-13
lines changed

playbooks/roles/grafana/files/jb

-7.44 MB
Binary file not shown.
-5.01 MB
Binary file not shown.

playbooks/roles/grafana/files/main.jsonnet

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,8 @@ local health_status = [
86 86
{ expr1: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Device Status', unit: 'none' },
87 87
{ expr1: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Link flapping', unit: 'none' },
88 88
{ expr1: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RTTCC Status', unit: 'none' },
89+
{ expr1: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{instance_shape}}', title: 'GPU Status', unit: 'none' },
90+
{ expr1: 'oca_version{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'oca_version{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{version}}', title: 'OCA Version', unit: 'none' },
89 91
];
90 92

91 93
local nfs_metrics = [
@@ -211,9 +213,9 @@ g.dashboard.new('Cluster Dashboard')
211 213
+ g.panel.timeSeries.queryOptions.withTargets([
212 214
g.query.prometheus.new(
213 215
'$PROMETHEUS_DS',
214-
'avg by(Hostname) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})',
216+
'avg by(Hostname, gpu) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})',
215 217
)
216-
+ g.query.prometheus.withLegendFormat('{{ Hostname }}')
218+
+ g.query.prometheus.withLegendFormat('{{ Hostname }}:{{ gpu }}')
217 219
])
218 220
+ g.panel.timeSeries.standardOptions.withUnit(metric.unit)
219 221
+ g.panel.timeSeries.gridPos.withW(24)
@@ -227,9 +229,9 @@ g.dashboard.new('Cluster Dashboard')
227 229
+ g.panel.timeSeries.queryOptions.withTargets([
228 230
g.query.prometheus.new(
229 231
'$PROMETHEUS_DS',
230-
'avg by(Hostname) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})',
232+
'avg by(Hostname, gpu) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})',
231 233
)
232-
+ g.query.prometheus.withLegendFormat('{{ Hostname }}')
234+
+ g.query.prometheus.withLegendFormat('{{ Hostname }}:{{ gpu }}')
233 235
])
234 236
+ g.panel.timeSeries.standardOptions.withUnit(metric.unit)
235 237
+ g.panel.timeSeries.gridPos.withW(24)

playbooks/roles/grafana/tasks/dashboard.yml

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,24 @@
1 1
---
2-
- name: Copy binaries
3-
copy:
4-
src: "{{ item }}"
5-
dest: /usr/local/bin
6-
owner: "{{ user }}"
7-
group: "{{ user }}"
2+
- name: Download jb
3+
become: true
4+
get_url:
5+
url: https://github.com/jsonnet-bundler/jsonnet-bundler/releases/download/v0.6.0/jb-linux-amd64
6+
dest: /usr/local/bin/jb
8 7
mode: '0755'
9-
loop:
10-
- jb
11-
- jsonnet
8+
9+
- name: Download jsonnet
10+
become: true
11+
get_url:
12+
url: https://github.com/google/go-jsonnet/releases/download/v0.20.0/go-jsonnet_0.20.0_Linux_x86_64.tar.gz
13+
dest: /tmp
14+
15+
- name: Extract jsonnet
12 16
become: true
17+
unarchive:
18+
src: /tmp/go-jsonnet_0.20.0_Linux_x86_64.tar.gz
19+
dest: /usr/local/bin
20+
mode: '0755'
21+
13 22

14 23
- name: Delete existing build directory
15 24
file:

0 commit comments

Comments
 (0)