@@ -17,8 +17,8 @@ package main
17
17
18
18
import (
19
19
"github.com/prometheus/client_golang/prometheus"
20
+ "github.com/prometheus/common/log"
20
21
"io/ioutil"
21
- "log"
22
22
"os/exec"
23
23
"regexp"
24
24
"sort"
@@ -27,20 +27,21 @@ import (
27
27
)
28
28
29
29
// NodesMetrics holds node counts grouped by node state. Each field is a map
// from a feature-set key to a node count; ParseNodesMetrics builds the key by
// sorting the comma-separated feature list reported by sinfo and joining it
// back with commas (the literal "(null)" is stored as "null").
type NodesMetrics struct {
	// Per-feature-set node counts, one map per node state.
	alloc map[string]float64
	comp  map[string]float64
	down  map[string]float64
	drain map[string]float64
	err   map[string]float64
	fail  map[string]float64
	idle  map[string]float64
	maint map[string]float64
	mix   map[string]float64
	resv  map[string]float64

	// total gets an entry per feature set from InitFeatureSet.
	// NOTE(review): no visible code adds counts to it - confirm intent.
	total map[string]float64
}
41
42
42
- func NodesGetMetrics () * NodesMetrics {
43
- return ParseNodesMetrics (NodesData ())
43
+ func NodesGetMetrics (part string ) * NodesMetrics {
44
+ return ParseNodesMetrics (NodesData (part ))
44
45
}
45
46
46
47
func RemoveDuplicates (s []string ) []string {
@@ -60,19 +61,53 @@ func RemoveDuplicates(s []string) []string {
60
61
return t
61
62
}
62
63
64
+ func InitFeatureSet (nm * NodesMetrics , feature_set string ) {
65
+ nm .alloc [feature_set ] = nm .alloc [feature_set ]
66
+ nm .comp [feature_set ] = nm .comp [feature_set ]
67
+ nm .down [feature_set ] = nm .down [feature_set ]
68
+ nm .drain [feature_set ] = nm .drain [feature_set ]
69
+ nm .err [feature_set ] = nm .err [feature_set ]
70
+ nm .fail [feature_set ] = nm .fail [feature_set ]
71
+ nm .idle [feature_set ] = nm .idle [feature_set ]
72
+ nm .maint [feature_set ] = nm .maint [feature_set ]
73
+ nm .mix [feature_set ] = nm .mix [feature_set ]
74
+ nm .resv [feature_set ] = nm .resv [feature_set ]
75
+ nm .total [feature_set ] = nm .total [feature_set ]
76
+ }
77
+
63
78
func ParseNodesMetrics (input []byte ) * NodesMetrics {
64
79
var nm NodesMetrics
80
+ var feature_set string
65
81
lines := strings .Split (string (input ), "\n " )
66
82
67
83
// Sort and remove all the duplicates from the 'sinfo' output
68
84
sort .Strings (lines )
69
85
lines_uniq := RemoveDuplicates (lines )
70
86
87
+ nm .alloc = make (map [string ]float64 )
88
+ nm .comp = make (map [string ]float64 )
89
+ nm .down = make (map [string ]float64 )
90
+ nm .drain = make (map [string ]float64 )
91
+ nm .err = make (map [string ]float64 )
92
+ nm .fail = make (map [string ]float64 )
93
+ nm .idle = make (map [string ]float64 )
94
+ nm .maint = make (map [string ]float64 )
95
+ nm .mix = make (map [string ]float64 )
96
+ nm .resv = make (map [string ]float64 )
97
+ nm .total = make (map [string ]float64 )
98
+
71
99
for _ , line := range lines_uniq {
72
- if strings .Contains (line , "," ) {
73
- split := strings .Split (line , "," )
74
- count , _ := strconv .ParseFloat (strings .TrimSpace (split [0 ]), 64 )
75
- state := split [1 ]
100
+ if strings .Contains (line , "|" ) {
101
+ split := strings .Split (line , "|" )
102
+ state := split [1 ]
103
+ count , _ := strconv .ParseFloat (strings .TrimSpace (split [0 ]), 64 )
104
+ features := strings .Split (split [2 ], "," )
105
+ sort .Strings (features )
106
+ feature_set = strings .Join (features [:], "," )
107
+ if feature_set == "(null)" {
108
+ feature_set = "null"
109
+ }
110
+ InitFeatureSet (& nm , feature_set )
76
111
alloc := regexp .MustCompile (`^alloc` )
77
112
comp := regexp .MustCompile (`^comp` )
78
113
down := regexp .MustCompile (`^down` )
@@ -85,34 +120,34 @@ func ParseNodesMetrics(input []byte) *NodesMetrics {
85
120
resv := regexp .MustCompile (`^res` )
86
121
switch {
87
122
case alloc .MatchString (state ) == true :
88
- nm .alloc += count
123
+ nm .alloc [ feature_set ] += count
89
124
case comp .MatchString (state ) == true :
90
- nm .comp += count
125
+ nm .comp [ feature_set ] += count
91
126
case down .MatchString (state ) == true :
92
- nm .down += count
127
+ nm .down [ feature_set ] += count
93
128
case drain .MatchString (state ) == true :
94
- nm .drain += count
129
+ nm .drain [ feature_set ] += count
95
130
case fail .MatchString (state ) == true :
96
- nm .fail += count
131
+ nm .fail [ feature_set ] += count
97
132
case err .MatchString (state ) == true :
98
- nm .err += count
133
+ nm .err [ feature_set ] += count
99
134
case idle .MatchString (state ) == true :
100
- nm .idle += count
135
+ nm .idle [ feature_set ] += count
101
136
case maint .MatchString (state ) == true :
102
- nm .maint += count
137
+ nm .maint [ feature_set ] += count
103
138
case mix .MatchString (state ) == true :
104
- nm .mix += count
139
+ nm .mix [ feature_set ] += count
105
140
case resv .MatchString (state ) == true :
106
- nm .resv += count
141
+ nm .resv [ feature_set ] += count
107
142
}
108
143
}
109
144
}
110
145
return & nm
111
146
}
112
147
113
148
// Execute the sinfo command and return its output
114
- func NodesData () []byte {
115
- cmd := exec .Command ("sinfo" , "-h" , "-o %D,%T " )
149
+ func NodesData (part string ) []byte {
150
+ cmd := exec .Command ("sinfo" , "-h" , "-o %D|%T|%b" , "-p" , part , "| sort" , "| uniq " )
116
151
stdout , err := cmd .StdoutPipe ()
117
152
if err != nil {
118
153
log .Fatal (err )
@@ -127,24 +162,68 @@ func NodesData() []byte {
127
162
return out
128
163
}
129
164
165
// SlurmGetTotal returns the number of nodes reported by
// "scontrol show nodes -o", counted as the output lines that contain a
// "NodeName=" record. The process is terminated via log.Fatalf when scontrol
// cannot be run or exits with an error.
func SlurmGetTotal() float64 {
	// scontrol is invoked directly instead of through "bash -c ... | grep -c":
	// grep -c exits with status 1 when the count is 0, which the previous
	// implementation treated as a fatal error, and the unquoted pattern was
	// subject to shell globbing. Output also drains stdout and stderr
	// concurrently, so a chatty stderr can no longer stall the command.
	out, err := exec.Command("scontrol", "show", "nodes", "-o").Output()
	if err != nil {
		var errOut []byte
		if exitErr, ok := err.(*exec.ExitError); ok {
			errOut = exitErr.Stderr
		}
		log.Fatalf("scontrol show nodes: %v %s %s", err, out, errOut)
	}
	return countNodeRecords(out)
}

// countNodeRecords counts the lines of out that contain "NodeName=".
func countNodeRecords(out []byte) float64 {
	var total float64
	for _, line := range strings.Split(string(out), "\n") {
		if strings.Contains(line, "NodeName=") {
			total++
		}
	}
	return total
}
187
+
188
// SlurmGetPartitions returns the partition names reported by
// "sinfo -h -o %R", trimmed, de-duplicated and sorted. Blank lines are
// dropped. The process is terminated via log.Fatal when sinfo cannot be run
// or exits with an error.
func SlurmGetPartitions() []string {
	// exec.Command does not start a shell, so "| sort" and "| uniq" would be
	// handed to sinfo as literal arguments rather than forming a pipeline.
	// Sorting and de-duplication are therefore done here in Go.
	out, err := exec.Command("sinfo", "-h", "-o", "%R").Output()
	if err != nil {
		log.Fatal(err)
	}
	return uniquePartitionNames(out)
}

// uniquePartitionNames splits sinfo output into lines and returns the
// distinct non-empty trimmed names in sorted order.
func uniquePartitionNames(out []byte) []string {
	seen := make(map[string]struct{})
	var partitions []string
	for _, line := range strings.Split(string(out), "\n") {
		name := strings.TrimSpace(line)
		if name == "" {
			continue
		}
		// A repeated partition name would make the collector emit the same
		// label set twice, so keep only the first occurrence.
		if _, dup := seen[name]; dup {
			continue
		}
		seen[name] = struct{}{}
		partitions = append(partitions, name)
	}
	sort.Strings(partitions)
	return partitions
}
204
+
130
205
/*
131
206
* Implement the Prometheus Collector interface and feed the
132
207
* Slurm scheduler metrics into it.
133
208
* https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
134
209
*/
135
210
136
211
func NewNodesCollector () * NodesCollector {
212
+ labelnames := make ([]string , 0 , 1 )
213
+ labelnames = append (labelnames , "partition" )
214
+ labelnames = append (labelnames , "active_feature_set" )
137
215
return & NodesCollector {
138
- alloc : prometheus .NewDesc ("slurm_nodes_alloc" , "Allocated nodes" , nil , nil ),
139
- comp : prometheus .NewDesc ("slurm_nodes_comp" , "Completing nodes" , nil , nil ),
140
- down : prometheus .NewDesc ("slurm_nodes_down" , "Down nodes" , nil , nil ),
141
- drain : prometheus .NewDesc ("slurm_nodes_drain" , "Drain nodes" , nil , nil ),
142
- err : prometheus .NewDesc ("slurm_nodes_err" , "Error nodes" , nil , nil ),
143
- fail : prometheus .NewDesc ("slurm_nodes_fail" , "Fail nodes" , nil , nil ),
144
- idle : prometheus .NewDesc ("slurm_nodes_idle" , "Idle nodes" , nil , nil ),
145
- maint : prometheus .NewDesc ("slurm_nodes_maint" , "Maint nodes" , nil , nil ),
146
- mix : prometheus .NewDesc ("slurm_nodes_mix" , "Mix nodes" , nil , nil ),
147
- resv : prometheus .NewDesc ("slurm_nodes_resv" , "Reserved nodes" , nil , nil ),
216
+ alloc : prometheus .NewDesc ("slurm_nodes_alloc" , "Allocated nodes" , labelnames , nil ),
217
+ comp : prometheus .NewDesc ("slurm_nodes_comp" , "Completing nodes" , labelnames , nil ),
218
+ down : prometheus .NewDesc ("slurm_nodes_down" , "Down nodes" , labelnames , nil ),
219
+ drain : prometheus .NewDesc ("slurm_nodes_drain" , "Drain nodes" , labelnames , nil ),
220
+ err : prometheus .NewDesc ("slurm_nodes_err" , "Error nodes" , labelnames , nil ),
221
+ fail : prometheus .NewDesc ("slurm_nodes_fail" , "Fail nodes" , labelnames , nil ),
222
+ idle : prometheus .NewDesc ("slurm_nodes_idle" , "Idle nodes" , labelnames , nil ),
223
+ maint : prometheus .NewDesc ("slurm_nodes_maint" , "Maint nodes" , labelnames , nil ),
224
+ mix : prometheus .NewDesc ("slurm_nodes_mix" , "Mix nodes" , labelnames , nil ),
225
+ resv : prometheus .NewDesc ("slurm_nodes_resv" , "Reserved nodes" , labelnames , nil ),
226
+ total : prometheus .NewDesc ("slurm_nodes_total" , "Total number of nodes" , nil , nil ),
148
227
}
149
228
}
150
229
@@ -159,6 +238,7 @@ type NodesCollector struct {
159
238
maint * prometheus.Desc
160
239
mix * prometheus.Desc
161
240
resv * prometheus.Desc
241
+ total * prometheus.Desc
162
242
}
163
243
164
244
// Send all metric descriptions
@@ -173,17 +253,34 @@ func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) {
173
253
ch <- nc .maint
174
254
ch <- nc .mix
175
255
ch <- nc .resv
256
+ ch <- nc .total
176
257
}
258
+
259
+ func SendFeatureSetMetric (ch chan <- prometheus.Metric , desc * prometheus.Desc , valueType prometheus.ValueType , featurestate map [string ]float64 , part string ) {
260
+ for set , value := range featurestate {
261
+ ch <- prometheus .MustNewConstMetric (desc , valueType , value , part , set )
262
+ }
263
+ }
264
+
177
265
func (nc * NodesCollector ) Collect (ch chan <- prometheus.Metric ) {
178
- nm := NodesGetMetrics ()
179
- ch <- prometheus .MustNewConstMetric (nc .alloc , prometheus .GaugeValue , nm .alloc )
180
- ch <- prometheus .MustNewConstMetric (nc .comp , prometheus .GaugeValue , nm .comp )
181
- ch <- prometheus .MustNewConstMetric (nc .down , prometheus .GaugeValue , nm .down )
182
- ch <- prometheus .MustNewConstMetric (nc .drain , prometheus .GaugeValue , nm .drain )
183
- ch <- prometheus .MustNewConstMetric (nc .err , prometheus .GaugeValue , nm .err )
184
- ch <- prometheus .MustNewConstMetric (nc .fail , prometheus .GaugeValue , nm .fail )
185
- ch <- prometheus .MustNewConstMetric (nc .idle , prometheus .GaugeValue , nm .idle )
186
- ch <- prometheus .MustNewConstMetric (nc .maint , prometheus .GaugeValue , nm .maint )
187
- ch <- prometheus .MustNewConstMetric (nc .mix , prometheus .GaugeValue , nm .mix )
188
- ch <- prometheus .MustNewConstMetric (nc .resv , prometheus .GaugeValue , nm .resv )
266
+ partitions := SlurmGetPartitions ()
267
+ for _ , part := range partitions {
268
+ part = strings .TrimSpace (part )
269
+ if part == "" {
270
+ continue
271
+ }
272
+ nm := NodesGetMetrics (part )
273
+ SendFeatureSetMetric (ch , nc .alloc , prometheus .GaugeValue , nm .alloc , part )
274
+ SendFeatureSetMetric (ch , nc .comp , prometheus .GaugeValue , nm .comp , part )
275
+ SendFeatureSetMetric (ch , nc .down , prometheus .GaugeValue , nm .down , part )
276
+ SendFeatureSetMetric (ch , nc .drain , prometheus .GaugeValue , nm .drain , part )
277
+ SendFeatureSetMetric (ch , nc .err , prometheus .GaugeValue , nm .err , part )
278
+ SendFeatureSetMetric (ch , nc .fail , prometheus .GaugeValue , nm .fail , part )
279
+ SendFeatureSetMetric (ch , nc .idle , prometheus .GaugeValue , nm .idle , part )
280
+ SendFeatureSetMetric (ch , nc .maint , prometheus .GaugeValue , nm .maint , part )
281
+ SendFeatureSetMetric (ch , nc .mix , prometheus .GaugeValue , nm .mix , part )
282
+ SendFeatureSetMetric (ch , nc .resv , prometheus .GaugeValue , nm .resv , part )
283
+ }
284
+ total := SlurmGetTotal ()
285
+ ch <- prometheus .MustNewConstMetric (nc .total , prometheus .GaugeValue , total )
189
286
}
0 commit comments