@@ -86,6 +86,8 @@ local health_status = [
8686{ expr1: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'rdma_device_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RDMA Device Status' , unit: 'none' },
8787{ expr1: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'rdma_link_noflap{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RDMA Link flapping' , unit: 'none' },
8888{ expr1: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'rttcc_status{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{rdma_device}}' , title: 'RTTCC Status' , unit: 'none' },
89+ { expr1: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'gpu_count{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{instance_shape}}' , title: 'GPU Status' , unit: 'none' },
90+ { expr1: 'oca_version{hostname=~"$hostname", oci_name=~"$oci_name"}==0' , expr2: 'oca_version{hostname=~"$hostname", oci_name=~"$oci_name"}==1' , legend_format: '{{hostname}}:{{version}}' , title: 'OCA Version' , unit: 'none' },
8991];
9092
9193local nfs_metrics = [
@@ -211,9 +213,9 @@ g.dashboard.new('Cluster Dashboard')
211213 + g.panel.timeSeries.queryOptions.withTargets([
212214 g.query.prometheus.new(
213215 '$PROMETHEUS_DS' ,
214- 'avg by(Hostname) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})' ,
216+ 'avg by(Hostname, gpu ) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})' ,
215217 )
216- + g.query.prometheus.withLegendFormat('{{ Hostname }}' )
218+ + g.query.prometheus.withLegendFormat('{{ Hostname }}:{{ gpu }} ' )
217219 ])
218220 + g.panel.timeSeries.standardOptions.withUnit(metric.unit)
219221 + g.panel.timeSeries.gridPos.withW(24 )
@@ -227,9 +229,9 @@ g.dashboard.new('Cluster Dashboard')
227229 + g.panel.timeSeries.queryOptions.withTargets([
228230 g.query.prometheus.new(
229231 '$PROMETHEUS_DS' ,
230- 'avg by(Hostname) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})' ,
232+ 'avg by(Hostname, gpu ) (' + metric.name + '{Hostname=~"$hostname", oci_name=~"$oci_name"})' ,
231233 )
232- + g.query.prometheus.withLegendFormat('{{ Hostname }}' )
234+ + g.query.prometheus.withLegendFormat('{{ Hostname }}:{{ gpu }} ' )
233235 ])
234236 + g.panel.timeSeries.standardOptions.withUnit(metric.unit)
235237 + g.panel.timeSeries.gridPos.withW(24 )
0 commit comments