Skip to content

Commit 119306b

Browse files
committed
minor fixes
1 parent: ac7cb43 · commit: 119306b

File tree

2 files changed: 6 additions (+6) and 2 deletions (-2)

2 files changed: 6 additions (+6) and 2 deletions (-2)

playbooks/roles/grafana/files/main.jsonnet

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,13 +12,13 @@ local critical_status_ts = [
1212

1313
local critical_status_stl = [
1414
{ expr1: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Device Status', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} },
15-
{ expr1: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'GPU Row Remap Error Check', unit: 'none', colors: {'0': { text: 'failed', color: 'red' },'1': { text: 'passed', color: 'green' },} },
15+
{ expr1: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'GPU Row Remap Error Check', unit: 'none', colors: {'0': { text: 'passed', color: 'green' },'1': { text: 'failed', color: 'red' },} },
1616
{ expr1: 'gpu_ecc_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_ecc_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'GPU ECC Error Check', unit: 'none', colors: {'0': { text: 'failed', color: 'red' },'1': { text: 'passed', color: 'green' },} },
1717
{ expr1: 'xid_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'xid_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}', title: 'Xid Error Check', unit: 'none', colors: {'0': { text: 'passed', color: 'green' },'1': { text: 'failed', color: 'red' },} },
1818
];
1919

2020
local health_status = [
21-
{ expr1: 'ib_link_state{hostname=~"$hostname", oci_name=~"$oci_name"}==1 or vector(0)', expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"} > 1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Device Status', unit: 'none', colors: {'1': { text: 'down', color: 'red' },} },
21+
{ expr1: 'ib_link_state{hostname=~"$hostname", oci_name=~"$oci_name"}==1 or vector(0)', expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"} > 1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Link State (h/w metric)', unit: 'none', colors: {'1': { text: 'down', color: 'red' },} },
2222
{ expr1: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RDMA Link flapping', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} },
2323
{ expr1: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{rdma_device}}', title: 'RTTCC Status', unit: 'none', colors: {'0': { text: 'disabled', color: 'green' },'1': { text: 'enabled', color: 'red' },} },
2424
{ expr1: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==0', expr2: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==1', legend_format: '{{hostname}}:{{instance_shape}}', title: 'GPU Count', unit: 'none', colors: {'0': { text: 'down', color: 'red' },'1': { text: 'up', color: 'green' },} },

playbooks/roles/metrics-exporter/templates/custom_metrics.py.j2

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -567,6 +567,10 @@ def check_row_remap_errors():
567567
metric_text = "gpu_row_remap_error_check 0" + "\n"
568568
print(metric_text)
569569
tmp_tf.write('{}'.format(metric_text))
570+
else:
571+
metric_text = "gpu_row_remap_error_check 0" + "\n"
572+
print(metric_text)
573+
tmp_tf.write('{}'.format(metric_text))
570574
tmp_tf.close()
571575

572576
copy_metric_file(tmp_tf_path, tf_path, node_exporter_user, node_exporter_group)

0 commit comments

Comments (0)