@@ -17,29 +17,31 @@ package main
17
17
18
18
import (
19
19
"github.com/prometheus/client_golang/prometheus"
20
+ "github.com/prometheus/common/log"
20
21
"io/ioutil"
21
- "log"
22
22
"os/exec"
23
23
"regexp"
24
24
"sort"
25
+ "strconv"
25
26
"strings"
26
27
)
27
28
28
29
// NodesMetrics holds node counts broken down by node state.
//
// Every field is keyed by a feature-set string (the sorted, comma-joined
// feature list of a node, as built by ParseNodesMetrics) and stores how many
// nodes with that feature set were counted for the state.
type NodesMetrics struct {
	// Allocated nodes.
	alloc map[string]float64
	// Completing nodes.
	comp map[string]float64
	// Down nodes.
	down map[string]float64
	// Drain nodes.
	drain map[string]float64
	// Error nodes.
	err map[string]float64
	// Fail nodes.
	fail map[string]float64
	// Idle nodes.
	idle map[string]float64
	// Maint nodes.
	maint map[string]float64
	// Mix nodes.
	mix map[string]float64
	// Reserved nodes.
	resv map[string]float64
	// Per-feature-set totals; InitFeatureSet creates the entries.
	total map[string]float64
}
40
42
41
- func NodesGetMetrics () * NodesMetrics {
42
- return ParseNodesMetrics (NodesData ())
43
+ func NodesGetMetrics (part string ) * NodesMetrics {
44
+ return ParseNodesMetrics (NodesData (part ))
43
45
}
44
46
45
47
func RemoveDuplicates (s []string ) []string {
@@ -57,17 +59,51 @@ func RemoveDuplicates(s []string) []string {
57
59
return t
58
60
}
59
61
62
+ func InitFeatureSet (nm * NodesMetrics , feature_set string ) {
63
+ nm .alloc [feature_set ] = nm .alloc [feature_set ]
64
+ nm .comp [feature_set ] = nm .comp [feature_set ]
65
+ nm .down [feature_set ] = nm .down [feature_set ]
66
+ nm .drain [feature_set ] = nm .drain [feature_set ]
67
+ nm .err [feature_set ] = nm .err [feature_set ]
68
+ nm .fail [feature_set ] = nm .fail [feature_set ]
69
+ nm .idle [feature_set ] = nm .idle [feature_set ]
70
+ nm .maint [feature_set ] = nm .maint [feature_set ]
71
+ nm .mix [feature_set ] = nm .mix [feature_set ]
72
+ nm .resv [feature_set ] = nm .resv [feature_set ]
73
+ nm .total [feature_set ] = nm .total [feature_set ]
74
+ }
75
+
60
76
func ParseNodesMetrics (input []byte ) * NodesMetrics {
61
77
var nm NodesMetrics
78
+ var feature_set string
62
79
lines := strings .Split (string (input ), "\n " )
63
80
64
81
// Sort and remove all the duplicates from the 'sinfo' output
65
82
sort .Strings (lines )
66
83
lines_uniq := RemoveDuplicates (lines )
67
84
85
+ nm .alloc = make (map [string ]float64 )
86
+ nm .comp = make (map [string ]float64 )
87
+ nm .down = make (map [string ]float64 )
88
+ nm .drain = make (map [string ]float64 )
89
+ nm .err = make (map [string ]float64 )
90
+ nm .fail = make (map [string ]float64 )
91
+ nm .idle = make (map [string ]float64 )
92
+ nm .maint = make (map [string ]float64 )
93
+ nm .mix = make (map [string ]float64 )
94
+ nm .resv = make (map [string ]float64 )
95
+ nm .total = make (map [string ]float64 )
96
+
68
97
for _ , line := range lines_uniq {
69
- if strings .Contains (line , "," ) {
70
- state := strings .Split (line , "," )[1 ]
98
+ if strings .Contains (line , "|" ) {
99
+ state := strings .Split (line , "|" )
100
+ features := strings .Split (state [2 ], "," )
101
+ sort .Strings (features )
102
+ feature_set = strings .Join (features [:], "," )
103
+ if feature_set == "(null)" {
104
+ feature_set = "null"
105
+ }
106
+ InitFeatureSet (& nm , feature_set )
71
107
alloc := regexp .MustCompile (`^alloc` )
72
108
comp := regexp .MustCompile (`^comp` )
73
109
down := regexp .MustCompile (`^down` )
@@ -79,35 +115,35 @@ func ParseNodesMetrics(input []byte) *NodesMetrics {
79
115
mix := regexp .MustCompile (`^mix` )
80
116
resv := regexp .MustCompile (`^res` )
81
117
switch {
82
- case alloc .MatchString (state ) == true :
83
- nm .alloc ++
84
- case comp .MatchString (state ) == true :
85
- nm .comp ++
86
- case down .MatchString (state ) == true :
87
- nm .down ++
88
- case drain .MatchString (state ) == true :
89
- nm .drain ++
90
- case fail .MatchString (state ) == true :
91
- nm .fail ++
92
- case err .MatchString (state ) == true :
93
- nm .err ++
94
- case idle .MatchString (state ) == true :
95
- nm .idle ++
96
- case maint .MatchString (state ) == true :
97
- nm .maint ++
98
- case mix .MatchString (state ) == true :
99
- nm .mix ++
100
- case resv .MatchString (state ) == true :
101
- nm .resv ++
118
+ case alloc .MatchString (state [ 1 ] ) == true :
119
+ nm .alloc [ feature_set ] ++
120
+ case comp .MatchString (state [ 1 ] ) == true :
121
+ nm .comp [ feature_set ] ++
122
+ case down .MatchString (state [ 1 ] ) == true :
123
+ nm .down [ feature_set ] ++
124
+ case drain .MatchString (state [ 1 ] ) == true :
125
+ nm .drain [ feature_set ] ++
126
+ case fail .MatchString (state [ 1 ] ) == true :
127
+ nm .fail [ feature_set ] ++
128
+ case err .MatchString (state [ 1 ] ) == true :
129
+ nm .err [ feature_set ] ++
130
+ case idle .MatchString (state [ 1 ] ) == true :
131
+ nm .idle [ feature_set ] ++
132
+ case maint .MatchString (state [ 1 ] ) == true :
133
+ nm .maint [ feature_set ] ++
134
+ case mix .MatchString (state [ 1 ] ) == true :
135
+ nm .mix [ feature_set ] ++
136
+ case resv .MatchString (state [ 1 ] ) == true :
137
+ nm .resv [ feature_set ] ++
102
138
}
103
139
}
104
140
}
105
141
return & nm
106
142
}
107
143
108
144
// Execute the sinfo command and return its output
109
- func NodesData () []byte {
110
- cmd := exec .Command ("sinfo" , "-h" , "-o %n,%T " )
145
+ func NodesData (part string ) []byte {
146
+ cmd := exec .Command ("sinfo" , "-h" , "-o %n|%T|%b" , "-p" , part , "| sort" , "| uniq " )
111
147
stdout , err := cmd .StdoutPipe ()
112
148
if err != nil {
113
149
log .Fatal (err )
@@ -122,24 +158,68 @@ func NodesData() []byte {
122
158
return out
123
159
}
124
160
161
// SlurmGetTotal returns the number of nodes reported by
// `scontrol show nodes -o`, counted by grep as the lines that contain a
// NodeName= field.
//
// A failure to run the pipeline, or output that is not a number, is fatal;
// this matches how the other command wrappers in this file report errors.
func SlurmGetTotal() float64 {
	// The grep pattern is single-quoted so the shell cannot glob-expand the
	// bracket expressions against files in the working directory.
	cmd := exec.Command("bash", "-c", "scontrol show nodes -o | grep -c 'NodeName=[a-z]*[0-9]*'")

	// Output collects stdout while the process runs. Reading a stdout pipe to
	// EOF before touching a stderr pipe can deadlock once the child fills the
	// stderr pipe buffer.
	out, err := cmd.Output()
	if err != nil {
		var errOut []byte
		// With cmd.Stderr unset, Output stores captured stderr on ExitError.
		if exitErr, ok := err.(*exec.ExitError); ok {
			errOut = exitErr.Stderr
		}
		log.Fatalf("scontrol show nodes: %v %s %s", err, out, errOut)
	}

	total, err := parseNodeTotal(out)
	if err != nil {
		log.Fatalf("parsing node count %q: %v", out, err)
	}
	return total
}

// parseNodeTotal converts the first line of the `grep -c` output into a
// float64, ignoring surrounding whitespace.
func parseNodeTotal(out []byte) (float64, error) {
	first := string(out)
	if i := strings.IndexByte(first, '\n'); i >= 0 {
		first = first[:i]
	}
	return strconv.ParseFloat(strings.TrimSpace(first), 64)
}
183
+
184
// SlurmGetPartitions runs `sinfo -h -o %R` and returns the partition names it
// prints, trimmed, de-duplicated and sorted. Blank lines are dropped.
//
// A failure to run sinfo is fatal, as in the other command wrappers here.
func SlurmGetPartitions() []string {
	// exec.Command does not start a shell, so "| sort" and "| uniq" would be
	// handed to sinfo as literal arguments instead of forming a pipeline.
	// Sorting and de-duplication are therefore done in Go.
	cmd := exec.Command("sinfo", "-h", "-o %R")
	out, err := cmd.Output()
	if err != nil {
		log.Fatal(err)
	}
	return parsePartitionNames(out)
}

// parsePartitionNames splits sinfo output into unique, sorted, non-empty
// partition names.
func parsePartitionNames(out []byte) []string {
	seen := make(map[string]bool)
	var names []string
	for _, line := range strings.Split(string(out), "\n") {
		// "-o %R" is passed as one argument, so each line may carry the
		// leading space that is part of the format string.
		name := strings.TrimSpace(line)
		if name == "" || seen[name] {
			continue
		}
		seen[name] = true
		names = append(names, name)
	}
	sort.Strings(names)
	return names
}
200
+
125
201
/*
126
202
* Implement the Prometheus Collector interface and feed the
127
203
* Slurm scheduler metrics into it.
128
204
* https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
129
205
*/
130
206
131
207
func NewNodesCollector () * NodesCollector {
208
+ labelnames := make ([]string , 0 , 1 )
209
+ labelnames = append (labelnames , "partition" )
210
+ labelnames = append (labelnames , "active_feature_set" )
132
211
return & NodesCollector {
133
- alloc : prometheus .NewDesc ("slurm_nodes_alloc" , "Allocated nodes" , nil , nil ),
134
- comp : prometheus .NewDesc ("slurm_nodes_comp" , "Completing nodes" , nil , nil ),
135
- down : prometheus .NewDesc ("slurm_nodes_down" , "Down nodes" , nil , nil ),
136
- drain : prometheus .NewDesc ("slurm_nodes_drain" , "Drain nodes" , nil , nil ),
137
- err : prometheus .NewDesc ("slurm_nodes_err" , "Error nodes" , nil , nil ),
138
- fail : prometheus .NewDesc ("slurm_nodes_fail" , "Fail nodes" , nil , nil ),
139
- idle : prometheus .NewDesc ("slurm_nodes_idle" , "Idle nodes" , nil , nil ),
140
- maint : prometheus .NewDesc ("slurm_nodes_maint" , "Maint nodes" , nil , nil ),
141
- mix : prometheus .NewDesc ("slurm_nodes_mix" , "Mix nodes" , nil , nil ),
142
- resv : prometheus .NewDesc ("slurm_nodes_resv" , "Reserved nodes" , nil , nil ),
212
+ alloc : prometheus .NewDesc ("slurm_nodes_alloc" , "Allocated nodes" , labelnames , nil ),
213
+ comp : prometheus .NewDesc ("slurm_nodes_comp" , "Completing nodes" , labelnames , nil ),
214
+ down : prometheus .NewDesc ("slurm_nodes_down" , "Down nodes" , labelnames , nil ),
215
+ drain : prometheus .NewDesc ("slurm_nodes_drain" , "Drain nodes" , labelnames , nil ),
216
+ err : prometheus .NewDesc ("slurm_nodes_err" , "Error nodes" , labelnames , nil ),
217
+ fail : prometheus .NewDesc ("slurm_nodes_fail" , "Fail nodes" , labelnames , nil ),
218
+ idle : prometheus .NewDesc ("slurm_nodes_idle" , "Idle nodes" , labelnames , nil ),
219
+ maint : prometheus .NewDesc ("slurm_nodes_maint" , "Maint nodes" , labelnames , nil ),
220
+ mix : prometheus .NewDesc ("slurm_nodes_mix" , "Mix nodes" , labelnames , nil ),
221
+ resv : prometheus .NewDesc ("slurm_nodes_resv" , "Reserved nodes" , labelnames , nil ),
222
+ total : prometheus .NewDesc ("slurm_nodes_total" , "Total number of nodes" , nil , nil ),
143
223
}
144
224
}
145
225
@@ -154,6 +234,7 @@ type NodesCollector struct {
154
234
maint * prometheus.Desc
155
235
mix * prometheus.Desc
156
236
resv * prometheus.Desc
237
+ total * prometheus.Desc
157
238
}
158
239
159
240
// Send all metric descriptions
@@ -168,17 +249,34 @@ func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) {
168
249
ch <- nc .maint
169
250
ch <- nc .mix
170
251
ch <- nc .resv
252
+ ch <- nc .total
171
253
}
254
+
255
+ func SendFeatureSetMetric (ch chan <- prometheus.Metric , desc * prometheus.Desc , valueType prometheus.ValueType , featurestate map [string ]float64 , part string ) {
256
+ for set , value := range featurestate {
257
+ ch <- prometheus .MustNewConstMetric (desc , valueType , value , part , set )
258
+ }
259
+ }
260
+
172
261
func (nc * NodesCollector ) Collect (ch chan <- prometheus.Metric ) {
173
- nm := NodesGetMetrics ()
174
- ch <- prometheus .MustNewConstMetric (nc .alloc , prometheus .GaugeValue , nm .alloc )
175
- ch <- prometheus .MustNewConstMetric (nc .comp , prometheus .GaugeValue , nm .comp )
176
- ch <- prometheus .MustNewConstMetric (nc .down , prometheus .GaugeValue , nm .down )
177
- ch <- prometheus .MustNewConstMetric (nc .drain , prometheus .GaugeValue , nm .drain )
178
- ch <- prometheus .MustNewConstMetric (nc .err , prometheus .GaugeValue , nm .err )
179
- ch <- prometheus .MustNewConstMetric (nc .fail , prometheus .GaugeValue , nm .fail )
180
- ch <- prometheus .MustNewConstMetric (nc .idle , prometheus .GaugeValue , nm .idle )
181
- ch <- prometheus .MustNewConstMetric (nc .maint , prometheus .GaugeValue , nm .maint )
182
- ch <- prometheus .MustNewConstMetric (nc .mix , prometheus .GaugeValue , nm .mix )
183
- ch <- prometheus .MustNewConstMetric (nc .resv , prometheus .GaugeValue , nm .resv )
262
+ partitions := SlurmGetPartitions ()
263
+ for _ , part := range partitions {
264
+ part = strings .TrimSpace (part )
265
+ if part == "" {
266
+ continue
267
+ }
268
+ nm := NodesGetMetrics (part )
269
+ SendFeatureSetMetric (ch , nc .alloc , prometheus .GaugeValue , nm .alloc , part )
270
+ SendFeatureSetMetric (ch , nc .comp , prometheus .GaugeValue , nm .comp , part )
271
+ SendFeatureSetMetric (ch , nc .down , prometheus .GaugeValue , nm .down , part )
272
+ SendFeatureSetMetric (ch , nc .drain , prometheus .GaugeValue , nm .drain , part )
273
+ SendFeatureSetMetric (ch , nc .err , prometheus .GaugeValue , nm .err , part )
274
+ SendFeatureSetMetric (ch , nc .fail , prometheus .GaugeValue , nm .fail , part )
275
+ SendFeatureSetMetric (ch , nc .idle , prometheus .GaugeValue , nm .idle , part )
276
+ SendFeatureSetMetric (ch , nc .maint , prometheus .GaugeValue , nm .maint , part )
277
+ SendFeatureSetMetric (ch , nc .mix , prometheus .GaugeValue , nm .mix , part )
278
+ SendFeatureSetMetric (ch , nc .resv , prometheus .GaugeValue , nm .resv , part )
279
+ }
280
+ total := SlurmGetTotal ()
281
+ ch <- prometheus .MustNewConstMetric (nc .total , prometheus .GaugeValue , total )
184
282
}
0 commit comments