Skip to content

Commit 3256015

Browse files
author
Joeri Hermans
committed
Add all GPU exporter features
1 parent a5d5a99 commit 3256015

File tree

1 file changed

+33
-22
lines changed

1 file changed

+33
-22
lines changed

gpus.go

Lines changed: 33 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -25,26 +25,35 @@ import (
2525
)
2626

2727
// GPUsMetrics holds the GPU counters that ParseGPUsMetrics fills in and
// that the collector exports as gauges.
type GPUsMetrics struct {
	alloc       float64 // GPUs currently allocated to jobs
	idle        float64 // GPUs not allocated (total minus alloc)
	total       float64 // GPUs known to the cluster
	utilization float64 // allocated fraction of the total
}
3333

3434
// GPUsGetMetrics returns the GPU metrics produced by ParseGPUsMetrics.
// It adds no logic of its own; it is the entry point the collector calls.
func GPUsGetMetrics() *GPUsMetrics {
	return ParseGPUsMetrics()
}
3737

3838
func ParseAllocatedGPUs() float64 {
39-
return 0.0 // TODO Implement
40-
}
39+
var num_gpus = 0.0
4140

42-
func ParseIdleGPUs() float64 {
43-
return 0.0 // TOOD Implement
44-
}
41+
args := []string{"-a", "-X", "--format=Allocgres", "--state=RUNNING", "--noheader", "--parsable2"}
42+
output := string(Execute("sacct", args))
43+
if len(output) > 0 {
44+
for _, line := range strings.Split(output, "\n") {
45+
if len(line) > 0 {
46+
line = strings.Trim(line, "\"")
47+
descriptor := strings.TrimPrefix(line, "gpu:")
48+
job_gpus, err := strconv.ParseFloat(descriptor, 64)
49+
if err != nil {
50+
num_gpus += job_gpus
51+
}
52+
}
53+
}
54+
}
4555

46-
func ParseOtherGPUs() float64 {
47-
return 0.0 // TODO Implement
56+
return num_gpus
4857
}
4958

5059
func ParseTotalGPUs() float64 {
@@ -72,10 +81,12 @@ func ParseTotalGPUs() float64 {
7281

7382
func ParseGPUsMetrics() *GPUsMetrics {
7483
var gm GPUsMetrics
75-
gm.alloc = ParseAllocatedGPUs()
76-
gm.idle = ParseIdleGPUs()
77-
gm.other = ParseOtherGPUs()
78-
gm.total = ParseTotalGPUs()
84+
total_gpus := ParseTotalGPUs()
85+
allocated_gpus := ParseAllocatedGPUs()
86+
gm.alloc = allocated_gpus
87+
gm.idle = total_gpus - allocated_gpus
88+
gm.total = total_gpus
89+
gm.utilization = allocated_gpus / total_gpus
7990
return &gm
8091
}
8192

@@ -106,29 +117,29 @@ func NewGPUsCollector() *GPUsCollector {
106117
return &GPUsCollector{
107118
alloc: prometheus.NewDesc("slurm_gpus_alloc", "Allocated GPUs", nil, nil),
108119
idle: prometheus.NewDesc("slurm_gpus_idle", "Idle GPUs", nil, nil),
109-
other: prometheus.NewDesc("slurm_gpus_other", "Mix GPUs", nil, nil),
110120
total: prometheus.NewDesc("slurm_gpus_total", "Total GPUs", nil, nil),
121+
utilization: prometheus.NewDesc("slurm_gpus_utilization", "Total GPU utilization", nil, nil),
111122
}
112123
}
113124

114125
type GPUsCollector struct {
115-
alloc *prometheus.Desc
116-
idle *prometheus.Desc
117-
other *prometheus.Desc
118-
total *prometheus.Desc
126+
alloc *prometheus.Desc
127+
idle *prometheus.Desc
128+
total *prometheus.Desc
129+
utilization *prometheus.Desc
119130
}
120131

121132
// Send all metric descriptions
122133
func (cc *GPUsCollector) Describe(ch chan<- *prometheus.Desc) {
123134
ch <- cc.alloc
124135
ch <- cc.idle
125-
ch <- cc.other
126136
ch <- cc.total
137+
ch <- cc.utilization
127138
}
128139
// Collect fetches the current GPU metrics via GPUsGetMetrics and sends each
// one to ch as a gauge built from the matching descriptor.
func (cc *GPUsCollector) Collect(ch chan<- prometheus.Metric) {
	cm := GPUsGetMetrics()
	ch <- prometheus.MustNewConstMetric(cc.alloc, prometheus.GaugeValue, cm.alloc)
	ch <- prometheus.MustNewConstMetric(cc.idle, prometheus.GaugeValue, cm.idle)
	ch <- prometheus.MustNewConstMetric(cc.total, prometheus.GaugeValue, cm.total)
	ch <- prometheus.MustNewConstMetric(cc.utilization, prometheus.GaugeValue, cm.utilization)
}

0 commit comments

Comments
 (0)