@@ -25,26 +25,35 @@ import (
2525)
2626
// GPUsMetrics holds the GPU figures exported by the collector.
type GPUsMetrics struct {
	// alloc is the allocated GPU count, idle is total minus alloc, and
	// total is the overall GPU count.
	alloc, idle, total float64
	// utilization is the ratio of allocated GPUs to total GPUs.
	utilization float64
}
3333
3434func GPUsGetMetrics () * GPUsMetrics {
3535 return ParseGPUsMetrics ()
3636}
3737
3838func ParseAllocatedGPUs () float64 {
39- return 0.0 // TODO Implement
40- }
39+ var num_gpus = 0.0
4140
42- func ParseIdleGPUs () float64 {
43- return 0.0 // TOOD Implement
44- }
41+ args := []string {"-a" , "-X" , "--format=Allocgres" , "--state=RUNNING" , "--noheader" , "--parsable2" }
42+ output := string (Execute ("sacct" , args ))
43+ if len (output ) > 0 {
44+ for _ , line := range strings .Split (output , "\n " ) {
45+ if len (line ) > 0 {
46+ line = strings .Trim (line , "\" " )
47+ descriptor := strings .TrimPrefix (line , "gpu:" )
48+ job_gpus , err := strconv .ParseFloat (descriptor , 64 )
49+ if err != nil {
50+ num_gpus += job_gpus
51+ }
52+ }
53+ }
54+ }
4555
46- func ParseOtherGPUs () float64 {
47- return 0.0 // TODO Implement
56+ return num_gpus
4857}
4958
5059func ParseTotalGPUs () float64 {
@@ -72,10 +81,12 @@ func ParseTotalGPUs() float64 {
7281
7382func ParseGPUsMetrics () * GPUsMetrics {
7483 var gm GPUsMetrics
75- gm .alloc = ParseAllocatedGPUs ()
76- gm .idle = ParseIdleGPUs ()
77- gm .other = ParseOtherGPUs ()
78- gm .total = ParseTotalGPUs ()
84+ total_gpus := ParseTotalGPUs ()
85+ allocated_gpus := ParseAllocatedGPUs ()
86+ gm .alloc = allocated_gpus
87+ gm .idle = total_gpus - allocated_gpus
88+ gm .total = total_gpus
89+ gm .utilization = allocated_gpus / total_gpus
7990 return & gm
8091}
8192
@@ -106,29 +117,29 @@ func NewGPUsCollector() *GPUsCollector {
106117 return & GPUsCollector {
107118 alloc : prometheus .NewDesc ("slurm_gpus_alloc" , "Allocated GPUs" , nil , nil ),
108119 idle : prometheus .NewDesc ("slurm_gpus_idle" , "Idle GPUs" , nil , nil ),
109- other : prometheus .NewDesc ("slurm_gpus_other" , "Mix GPUs" , nil , nil ),
110120 total : prometheus .NewDesc ("slurm_gpus_total" , "Total GPUs" , nil , nil ),
121+ utilization : prometheus .NewDesc ("slurm_gpus_utilization" , "Total GPU utilization" , nil , nil ),
111122 }
112123}
113124
114125type GPUsCollector struct {
115- alloc * prometheus.Desc
116- idle * prometheus.Desc
117- other * prometheus.Desc
118- total * prometheus.Desc
126+ alloc * prometheus.Desc
127+ idle * prometheus.Desc
128+ total * prometheus.Desc
129+ utilization * prometheus.Desc
119130}
120131
121132// Send all metric descriptions
122133func (cc * GPUsCollector ) Describe (ch chan <- * prometheus.Desc ) {
123134 ch <- cc .alloc
124135 ch <- cc .idle
125- ch <- cc .other
126136 ch <- cc .total
137+ ch <- cc .utilization
127138}
128139func (cc * GPUsCollector ) Collect (ch chan <- prometheus.Metric ) {
129140 cm := GPUsGetMetrics ()
130141 ch <- prometheus .MustNewConstMetric (cc .alloc , prometheus .GaugeValue , cm .alloc )
131142 ch <- prometheus .MustNewConstMetric (cc .idle , prometheus .GaugeValue , cm .idle )
132- ch <- prometheus .MustNewConstMetric (cc .other , prometheus .GaugeValue , cm .other )
133143 ch <- prometheus .MustNewConstMetric (cc .total , prometheus .GaugeValue , cm .total )
144+ ch <- prometheus .MustNewConstMetric (cc .utilization , prometheus .GaugeValue , cm .utilization )
134145}
0 commit comments