Skip to content

Commit 0f8f66b

Browse files
committed
nodes: Add partition and feature_set labels to nodes collector
1 parent e3cd573 commit 0f8f66b

File tree

1 file changed

+156
-58
lines changed

1 file changed

+156
-58
lines changed

nodes.go

Lines changed: 156 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -17,29 +17,31 @@ package main
1717

1818
import (
1919
"github.com/prometheus/client_golang/prometheus"
20+
"github.com/prometheus/common/log"
2021
"io/ioutil"
21-
"log"
2222
"os/exec"
2323
"regexp"
2424
"sort"
25+
"strconv"
2526
"strings"
2627
)
2728

2829
type NodesMetrics struct {
29-
alloc float64
30-
comp float64
31-
down float64
32-
drain float64
33-
err float64
34-
fail float64
35-
idle float64
36-
maint float64
37-
mix float64
38-
resv float64
30+
alloc map[string]float64
31+
comp map[string]float64
32+
down map[string]float64
33+
drain map[string]float64
34+
err map[string]float64
35+
fail map[string]float64
36+
idle map[string]float64
37+
maint map[string]float64
38+
mix map[string]float64
39+
resv map[string]float64
40+
total map[string]float64
3941
}
4042

41-
func NodesGetMetrics() *NodesMetrics {
42-
return ParseNodesMetrics(NodesData())
43+
func NodesGetMetrics(part string) *NodesMetrics {
44+
return ParseNodesMetrics(NodesData(part))
4345
}
4446

4547
func RemoveDuplicates(s []string) []string {
@@ -57,17 +59,51 @@ func RemoveDuplicates(s []string) []string {
5759
return t
5860
}
5961

62+
func InitFeatureSet(nm *NodesMetrics, feature_set string) {
63+
nm.alloc[feature_set] = nm.alloc[feature_set]
64+
nm.comp[feature_set] = nm.comp[feature_set]
65+
nm.down[feature_set] = nm.down[feature_set]
66+
nm.drain[feature_set] = nm.drain[feature_set]
67+
nm.err[feature_set] = nm.err[feature_set]
68+
nm.fail[feature_set] = nm.fail[feature_set]
69+
nm.idle[feature_set] = nm.idle[feature_set]
70+
nm.maint[feature_set] = nm.maint[feature_set]
71+
nm.mix[feature_set] = nm.mix[feature_set]
72+
nm.resv[feature_set] = nm.resv[feature_set]
73+
nm.total[feature_set] = nm.total[feature_set]
74+
}
75+
6076
func ParseNodesMetrics(input []byte) *NodesMetrics {
6177
var nm NodesMetrics
78+
var feature_set string
6279
lines := strings.Split(string(input), "\n")
6380

6481
// Sort and remove all the duplicates from the 'sinfo' output
6582
sort.Strings(lines)
6683
lines_uniq := RemoveDuplicates(lines)
6784

85+
nm.alloc = make(map[string]float64)
86+
nm.comp = make(map[string]float64)
87+
nm.down = make(map[string]float64)
88+
nm.drain = make(map[string]float64)
89+
nm.err = make(map[string]float64)
90+
nm.fail = make(map[string]float64)
91+
nm.idle = make(map[string]float64)
92+
nm.maint = make(map[string]float64)
93+
nm.mix = make(map[string]float64)
94+
nm.resv = make(map[string]float64)
95+
nm.total = make(map[string]float64)
96+
6897
for _, line := range lines_uniq {
69-
if strings.Contains(line, ",") {
70-
state := strings.Split(line, ",")[1]
98+
if strings.Contains(line, "|") {
99+
state := strings.Split(line, "|")
100+
features := strings.Split(state[2], ",")
101+
sort.Strings(features)
102+
feature_set = strings.Join(features[:], ",")
103+
if feature_set == "(null)" {
104+
feature_set = "null"
105+
}
106+
InitFeatureSet(&nm, feature_set)
71107
alloc := regexp.MustCompile(`^alloc`)
72108
comp := regexp.MustCompile(`^comp`)
73109
down := regexp.MustCompile(`^down`)
@@ -79,35 +115,35 @@ func ParseNodesMetrics(input []byte) *NodesMetrics {
79115
mix := regexp.MustCompile(`^mix`)
80116
resv := regexp.MustCompile(`^res`)
81117
switch {
82-
case alloc.MatchString(state) == true:
83-
nm.alloc++
84-
case comp.MatchString(state) == true:
85-
nm.comp++
86-
case down.MatchString(state) == true:
87-
nm.down++
88-
case drain.MatchString(state) == true:
89-
nm.drain++
90-
case fail.MatchString(state) == true:
91-
nm.fail++
92-
case err.MatchString(state) == true:
93-
nm.err++
94-
case idle.MatchString(state) == true:
95-
nm.idle++
96-
case maint.MatchString(state) == true:
97-
nm.maint++
98-
case mix.MatchString(state) == true:
99-
nm.mix++
100-
case resv.MatchString(state) == true:
101-
nm.resv++
118+
case alloc.MatchString(state[1]) == true:
119+
nm.alloc[feature_set]++
120+
case comp.MatchString(state[1]) == true:
121+
nm.comp[feature_set]++
122+
case down.MatchString(state[1]) == true:
123+
nm.down[feature_set]++
124+
case drain.MatchString(state[1]) == true:
125+
nm.drain[feature_set]++
126+
case fail.MatchString(state[1]) == true:
127+
nm.fail[feature_set]++
128+
case err.MatchString(state[1]) == true:
129+
nm.err[feature_set]++
130+
case idle.MatchString(state[1]) == true:
131+
nm.idle[feature_set]++
132+
case maint.MatchString(state[1]) == true:
133+
nm.maint[feature_set]++
134+
case mix.MatchString(state[1]) == true:
135+
nm.mix[feature_set]++
136+
case resv.MatchString(state[1]) == true:
137+
nm.resv[feature_set]++
102138
}
103139
}
104140
}
105141
return &nm
106142
}
107143

108144
// Execute the sinfo command and return its output
109-
func NodesData() []byte {
110-
cmd := exec.Command("sinfo", "-h", "-o %n,%T")
145+
func NodesData(part string) []byte {
146+
cmd := exec.Command("sinfo", "-h", "-o %n|%T|%b", "-p", part, "| sort", "| uniq")
111147
stdout, err := cmd.StdoutPipe()
112148
if err != nil {
113149
log.Fatal(err)
@@ -122,24 +158,68 @@ func NodesData() []byte {
122158
return out
123159
}
124160

161+
// SlurmGetTotal returns the number of lines of `scontrol show nodes -o`
// output that carry a NodeName= field, as counted by `grep -c` in a bash
// pipeline.
//
// Any failure to run the pipeline or to parse its output ends the process
// through log.Fatalf, the same fail-fast handling the other command helpers
// in this file use. Because `grep -c` exits non-zero when it counts zero
// matching lines, an empty node list is reported as a failure as well.
func SlurmGetTotal() float64 {
	// The grep pattern is single-quoted so bash hands it over verbatim
	// instead of attempting pathname expansion on the bracket expressions.
	cmd := exec.Command("bash", "-c", "scontrol show nodes -o | grep -c 'NodeName=[a-z]*[0-9]*'")

	// Output collects stdout while the command runs and, on a failed exit,
	// attaches captured stderr to the returned *exec.ExitError. Reading the
	// stdout pipe to EOF before the stderr pipe could stall if the child
	// filled the stderr pipe first.
	out, err := cmd.Output()
	if err != nil {
		var errOut []byte
		if exitErr, ok := err.(*exec.ExitError); ok {
			errOut = exitErr.Stderr
		}
		log.Fatalf("scontrol show nodes: %v %s %s", err, out, errOut)
	}

	// grep -c prints the count on the first line of its output.
	count := strings.TrimSpace(strings.Split(string(out), "\n")[0])
	total, err := strconv.ParseFloat(count, 64)
	if err != nil {
		log.Fatalf("parsing node count %q: %v", count, err)
	}
	return total
}
183+
184+
// SlurmGetPartitions returns the partition names printed by
// `sinfo -h -o %R`, with surrounding whitespace removed, blank lines dropped,
// duplicates collapsed and the result sorted.
//
// A failure to run sinfo ends the process through log.Fatal.
func SlurmGetPartitions() []string {
	// exec.Command does not go through a shell, so "| sort" and "| uniq"
	// would reach sinfo as literal arguments rather than forming a pipeline.
	// Sorting and de-duplication are therefore done here instead.
	out, err := exec.Command("sinfo", "-h", "-o", "%R").Output()
	if err != nil {
		log.Fatal(err)
	}

	seen := make(map[string]bool)
	var partitions []string
	for _, line := range strings.Split(string(out), "\n") {
		name := strings.TrimSpace(line)
		if name == "" || seen[name] {
			continue
		}
		seen[name] = true
		partitions = append(partitions, name)
	}
	sort.Strings(partitions)
	return partitions
}
200+
125201
/*
126202
* Implement the Prometheus Collector interface and feed the
127203
* Slurm scheduler metrics into it.
128204
* https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
129205
*/
130206

131207
func NewNodesCollector() *NodesCollector {
208+
labelnames := make([]string, 0, 1)
209+
labelnames = append(labelnames, "partition")
210+
labelnames = append(labelnames, "active_feature_set")
132211
return &NodesCollector{
133-
alloc: prometheus.NewDesc("slurm_nodes_alloc", "Allocated nodes", nil, nil),
134-
comp: prometheus.NewDesc("slurm_nodes_comp", "Completing nodes", nil, nil),
135-
down: prometheus.NewDesc("slurm_nodes_down", "Down nodes", nil, nil),
136-
drain: prometheus.NewDesc("slurm_nodes_drain", "Drain nodes", nil, nil),
137-
err: prometheus.NewDesc("slurm_nodes_err", "Error nodes", nil, nil),
138-
fail: prometheus.NewDesc("slurm_nodes_fail", "Fail nodes", nil, nil),
139-
idle: prometheus.NewDesc("slurm_nodes_idle", "Idle nodes", nil, nil),
140-
maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", nil, nil),
141-
mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", nil, nil),
142-
resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", nil, nil),
212+
alloc: prometheus.NewDesc("slurm_nodes_alloc", "Allocated nodes", labelnames, nil),
213+
comp: prometheus.NewDesc("slurm_nodes_comp", "Completing nodes", labelnames, nil),
214+
down: prometheus.NewDesc("slurm_nodes_down", "Down nodes", labelnames, nil),
215+
drain: prometheus.NewDesc("slurm_nodes_drain", "Drain nodes", labelnames, nil),
216+
err: prometheus.NewDesc("slurm_nodes_err", "Error nodes", labelnames, nil),
217+
fail: prometheus.NewDesc("slurm_nodes_fail", "Fail nodes", labelnames, nil),
218+
idle: prometheus.NewDesc("slurm_nodes_idle", "Idle nodes", labelnames, nil),
219+
maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", labelnames, nil),
220+
mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", labelnames, nil),
221+
resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", labelnames, nil),
222+
total: prometheus.NewDesc("slurm_nodes_total", "Total number of nodes", nil, nil),
143223
}
144224
}
145225

@@ -154,6 +234,7 @@ type NodesCollector struct {
154234
maint *prometheus.Desc
155235
mix *prometheus.Desc
156236
resv *prometheus.Desc
237+
total *prometheus.Desc
157238
}
158239

159240
// Send all metric descriptions
@@ -168,17 +249,34 @@ func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) {
168249
ch <- nc.maint
169250
ch <- nc.mix
170251
ch <- nc.resv
252+
ch <- nc.total
171253
}
254+
255+
func SendFeatureSetMetric(ch chan<- prometheus.Metric, desc *prometheus.Desc, valueType prometheus.ValueType, featurestate map[string]float64, part string) {
256+
for set, value := range featurestate {
257+
ch <- prometheus.MustNewConstMetric(desc, valueType, value, part, set)
258+
}
259+
}
260+
172261
func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) {
173-
nm := NodesGetMetrics()
174-
ch <- prometheus.MustNewConstMetric(nc.alloc, prometheus.GaugeValue, nm.alloc)
175-
ch <- prometheus.MustNewConstMetric(nc.comp, prometheus.GaugeValue, nm.comp)
176-
ch <- prometheus.MustNewConstMetric(nc.down, prometheus.GaugeValue, nm.down)
177-
ch <- prometheus.MustNewConstMetric(nc.drain, prometheus.GaugeValue, nm.drain)
178-
ch <- prometheus.MustNewConstMetric(nc.err, prometheus.GaugeValue, nm.err)
179-
ch <- prometheus.MustNewConstMetric(nc.fail, prometheus.GaugeValue, nm.fail)
180-
ch <- prometheus.MustNewConstMetric(nc.idle, prometheus.GaugeValue, nm.idle)
181-
ch <- prometheus.MustNewConstMetric(nc.maint, prometheus.GaugeValue, nm.maint)
182-
ch <- prometheus.MustNewConstMetric(nc.mix, prometheus.GaugeValue, nm.mix)
183-
ch <- prometheus.MustNewConstMetric(nc.resv, prometheus.GaugeValue, nm.resv)
262+
partitions := SlurmGetPartitions()
263+
for _, part := range partitions {
264+
part = strings.TrimSpace(part)
265+
if part == "" {
266+
continue
267+
}
268+
nm := NodesGetMetrics(part)
269+
SendFeatureSetMetric(ch, nc.alloc, prometheus.GaugeValue, nm.alloc, part)
270+
SendFeatureSetMetric(ch, nc.comp, prometheus.GaugeValue, nm.comp, part)
271+
SendFeatureSetMetric(ch, nc.down, prometheus.GaugeValue, nm.down, part)
272+
SendFeatureSetMetric(ch, nc.drain, prometheus.GaugeValue, nm.drain, part)
273+
SendFeatureSetMetric(ch, nc.err, prometheus.GaugeValue, nm.err, part)
274+
SendFeatureSetMetric(ch, nc.fail, prometheus.GaugeValue, nm.fail, part)
275+
SendFeatureSetMetric(ch, nc.idle, prometheus.GaugeValue, nm.idle, part)
276+
SendFeatureSetMetric(ch, nc.maint, prometheus.GaugeValue, nm.maint, part)
277+
SendFeatureSetMetric(ch, nc.mix, prometheus.GaugeValue, nm.mix, part)
278+
SendFeatureSetMetric(ch, nc.resv, prometheus.GaugeValue, nm.resv, part)
279+
}
280+
total := SlurmGetTotal()
281+
ch <- prometheus.MustNewConstMetric(nc.total, prometheus.GaugeValue, total)
184282
}

0 commit comments

Comments
 (0)