@@ -12,13 +12,13 @@ local critical_status_ts = [
1212
// Status checks grouped under `critical_status_stl`.
//
// Every entry has the same fields:
//   expr1 / expr2  - PromQL selectors for the same metric, filtered by the
//                    `$hostname` and `$oci_name` variables; expr1 keeps the
//                    series whose value is 0, expr2 those whose value is 1.
//   legend_format  - legend template for each returned series.
//   title          - display title of the check.
//   unit           - display unit ('none' for all of these checks).
//   colors         - map from the metric value (as a string key) to the text
//                    and color shown for that value.
//
// NOTE(review): how these entries are rendered is defined outside this
// section - confirm against the code that consumes the list.
local critical_status_stl = [
  {
    expr1: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0',
    expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1',
    legend_format: '{{hostname}}:{{rdma_device}}',
    title: 'RDMA Device Status',
    unit: 'none',
    colors: {
      '0': { text: 'down', color: 'red' },
      '1': { text: 'up', color: 'green' },
    },
  },
  {
    expr1: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0',
    expr2: 'gpu_row_remap_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1',
    legend_format: '{{hostname}}',
    title: 'GPU Row Remap Error Check',
    unit: 'none',
    // 0 maps to "passed" and 1 to "failed" for this check. The text and color
    // strings carry no trailing whitespace so they match the sibling entries
    // and resolve as plain color names.
    colors: {
      '0': { text: 'passed', color: 'green' },
      '1': { text: 'failed', color: 'red' },
    },
  },
  {
    expr1: 'gpu_ecc_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0',
    expr2: 'gpu_ecc_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1',
    legend_format: '{{hostname}}',
    title: 'GPU ECC Error Check',
    unit: 'none',
    // NOTE(review): here 0 maps to "failed", the opposite of the row-remap
    // and Xid checks - confirm this matches what the exporter emits.
    colors: {
      '0': { text: 'failed', color: 'red' },
      '1': { text: 'passed', color: 'green' },
    },
  },
  {
    expr1: 'xid_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==0',
    expr2: 'xid_error_check{hostname=~"$hostname", oci_name=~"$oci_name"}==1',
    legend_format: '{{hostname}}',
    title: 'Xid Error Check',
    unit: 'none',
    colors: {
      '0': { text: 'passed', color: 'green' },
      '1': { text: 'failed', color: 'red' },
    },
  },
];
1919
2020local health_status = [
21- { expr1: 'ib_link_state{hostname=~"$hostname", oci_name=~"$oci_name"}==1 or vector(0)' , expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"} > 1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RDMA Device Status ' , unit: 'none' , colors: {'1' : { text: 'down' , color: 'red' },} },
21+ { expr1: 'ib_link_state{hostname=~"$hostname", oci_name=~"$oci_name"}==1 or vector(0)' , expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"} > 1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RDMA Link State (h/w metric) ' , unit: 'none' , colors: {'1' : { text: 'down' , color: 'red' },} },
2222{ expr1: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RDMA Link flapping' , unit: 'none' , colors: {'0' : { text: 'down' , color: 'red' },'1' : { text: 'up' , color: 'green' },} },
2323{ expr1: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RTTCC Status' , unit: 'none' , colors: {'0' : { text: 'disabled' , color: 'green' },'1' : { text: 'enabled' , color: 'red' },} },
2424{ expr1: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{instance_shape}}' , title: 'GPU Count' , unit: 'none' , colors: {'0' : { text: 'down' , color: 'red' },'1' : { text: 'up' , color: 'green' },} },
0 commit comments