Skip to content

Commit df50ddf

Browse files
committed
nodes: Add partition and feature_set labels to nodes collector
1 parent 895c7c6 commit df50ddf

File tree

2 files changed

+152
-51
lines changed

2 files changed

+152
-51
lines changed

nodes.go

Lines changed: 147 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -17,8 +17,8 @@ package main
1717

1818
import (
1919
"github.com/prometheus/client_golang/prometheus"
20+
"github.com/prometheus/common/log"
2021
"io/ioutil"
21-
"log"
2222
"os/exec"
2323
"regexp"
2424
"sort"
@@ -27,20 +27,21 @@ import (
2727
)
2828

2929
// NodesMetrics holds the node counts gathered from sinfo for one partition.
// There is one map per node state; each map is keyed by the feature-set
// string built in ParseNodesMetrics (the node's features sorted and joined
// with ",", or "null" when sinfo reports "(null)") and stores the summed
// node count for that state and feature set.
type NodesMetrics struct {
	alloc map[string]float64
	comp  map[string]float64
	down  map[string]float64
	drain map[string]float64
	err   map[string]float64
	fail  map[string]float64
	idle  map[string]float64
	maint map[string]float64
	mix   map[string]float64
	resv  map[string]float64
	// NOTE(review): total is created and zero-initialised per feature set,
	// but nothing in the visible code adds to it or exports it — confirm
	// whether it is still needed.
	total map[string]float64
}
4142

42-
func NodesGetMetrics() *NodesMetrics {
43-
return ParseNodesMetrics(NodesData())
43+
func NodesGetMetrics(part string) *NodesMetrics {
44+
return ParseNodesMetrics(NodesData(part))
4445
}
4546

4647
func RemoveDuplicates(s []string) []string {
@@ -60,19 +61,53 @@ func RemoveDuplicates(s []string) []string {
6061
return t
6162
}
6263

64+
func InitFeatureSet(nm *NodesMetrics, feature_set string) {
65+
nm.alloc[feature_set] = nm.alloc[feature_set]
66+
nm.comp[feature_set] = nm.comp[feature_set]
67+
nm.down[feature_set] = nm.down[feature_set]
68+
nm.drain[feature_set] = nm.drain[feature_set]
69+
nm.err[feature_set] = nm.err[feature_set]
70+
nm.fail[feature_set] = nm.fail[feature_set]
71+
nm.idle[feature_set] = nm.idle[feature_set]
72+
nm.maint[feature_set] = nm.maint[feature_set]
73+
nm.mix[feature_set] = nm.mix[feature_set]
74+
nm.resv[feature_set] = nm.resv[feature_set]
75+
nm.total[feature_set] = nm.total[feature_set]
76+
}
77+
6378
func ParseNodesMetrics(input []byte) *NodesMetrics {
6479
var nm NodesMetrics
80+
var feature_set string
6581
lines := strings.Split(string(input), "\n")
6682

6783
// Sort and remove all the duplicates from the 'sinfo' output
6884
sort.Strings(lines)
6985
lines_uniq := RemoveDuplicates(lines)
7086

87+
nm.alloc = make(map[string]float64)
88+
nm.comp = make(map[string]float64)
89+
nm.down = make(map[string]float64)
90+
nm.drain = make(map[string]float64)
91+
nm.err = make(map[string]float64)
92+
nm.fail = make(map[string]float64)
93+
nm.idle = make(map[string]float64)
94+
nm.maint = make(map[string]float64)
95+
nm.mix = make(map[string]float64)
96+
nm.resv = make(map[string]float64)
97+
nm.total = make(map[string]float64)
98+
7199
for _, line := range lines_uniq {
72-
if strings.Contains(line, ",") {
73-
split := strings.Split(line, ",")
74-
count, _ := strconv.ParseFloat(strings.TrimSpace(split[0]), 64)
75-
state := split[1]
100+
if strings.Contains(line, "|") {
101+
split := strings.Split(line, "|")
102+
state := split[1]
103+
count, _ := strconv.ParseFloat(strings.TrimSpace(split[0]), 64)
104+
features := strings.Split(split[2], ",")
105+
sort.Strings(features)
106+
feature_set = strings.Join(features[:], ",")
107+
if feature_set == "(null)" {
108+
feature_set = "null"
109+
}
110+
InitFeatureSet(&nm, feature_set)
76111
alloc := regexp.MustCompile(`^alloc`)
77112
comp := regexp.MustCompile(`^comp`)
78113
down := regexp.MustCompile(`^down`)
@@ -85,34 +120,34 @@ func ParseNodesMetrics(input []byte) *NodesMetrics {
85120
resv := regexp.MustCompile(`^res`)
86121
switch {
87122
case alloc.MatchString(state) == true:
88-
nm.alloc += count
123+
nm.alloc[feature_set] += count
89124
case comp.MatchString(state) == true:
90-
nm.comp += count
125+
nm.comp[feature_set] += count
91126
case down.MatchString(state) == true:
92-
nm.down += count
127+
nm.down[feature_set] += count
93128
case drain.MatchString(state) == true:
94-
nm.drain += count
129+
nm.drain[feature_set] += count
95130
case fail.MatchString(state) == true:
96-
nm.fail += count
131+
nm.fail[feature_set] += count
97132
case err.MatchString(state) == true:
98-
nm.err += count
133+
nm.err[feature_set] += count
99134
case idle.MatchString(state) == true:
100-
nm.idle += count
135+
nm.idle[feature_set] += count
101136
case maint.MatchString(state) == true:
102-
nm.maint += count
137+
nm.maint[feature_set] += count
103138
case mix.MatchString(state) == true:
104-
nm.mix += count
139+
nm.mix[feature_set] += count
105140
case resv.MatchString(state) == true:
106-
nm.resv += count
141+
nm.resv[feature_set] += count
107142
}
108143
}
109144
}
110145
return &nm
111146
}
112147

113148
// Execute the sinfo command and return its output
114-
func NodesData() []byte {
115-
cmd := exec.Command("sinfo", "-h", "-o %D,%T")
149+
func NodesData(part string) []byte {
150+
cmd := exec.Command("sinfo", "-h", "-o %D|%T|%b", "-p", part, "| sort", "| uniq")
116151
stdout, err := cmd.StdoutPipe()
117152
if err != nil {
118153
log.Fatal(err)
@@ -127,24 +162,68 @@ func NodesData() []byte {
127162
return out
128163
}
129164

165+
// SlurmGetTotal returns the number of nodes reported by
// `scontrol show nodes -o`, counted as the output lines that contain
// "NodeName=".
//
// The count used to be delegated to `bash -c "... | grep -c ..."`. That had
// several problems: grep exits with status 1 when it counts zero matches,
// which turned an empty result into a fatal error; the unquoted pattern was
// subject to shell globbing; and stdout was drained completely before
// stderr, which can stall the child when stderr fills its pipe. Running
// scontrol directly and counting in Go avoids all three.
//
// A failure to run scontrol still terminates the process via log.Fatalf.
func SlurmGetTotal() float64 {
	out, err := exec.Command("scontrol", "show", "nodes", "-o").Output()
	if err != nil {
		// Output stores the child's stderr in ExitError when the command
		// ran but exited unsuccessfully.
		var errOut []byte
		if exitErr, ok := err.(*exec.ExitError); ok {
			errOut = exitErr.Stderr
		}
		log.Fatalf("scontrol show nodes: %v %s %s", err, out, errOut)
	}

	total := 0
	for _, line := range strings.Split(string(out), "\n") {
		// Equivalent to the old grep pattern NodeName=[a-z]*[0-9]*, whose
		// starred classes may both match the empty string.
		if strings.Contains(line, "NodeName=") {
			total++
		}
	}
	return float64(total)
}
187+
188+
// SlurmGetPartitions returns the distinct partition names printed by
// `sinfo -h -o %R`, sorted alphabetically and with blank entries removed.
//
// exec.Command does not start a shell, so the former "| sort" and "| uniq"
// arguments were handed to sinfo as literal arguments instead of forming a
// pipeline. Sorting and de-duplication are therefore done here in Go.
//
// A failure to run sinfo terminates the process via log.Fatal.
func SlurmGetPartitions() []string {
	out, err := exec.Command("sinfo", "-h", "-o", "%R").Output()
	if err != nil {
		log.Fatal(err)
	}

	seen := make(map[string]bool)
	partitions := make([]string, 0)
	for _, line := range strings.Split(string(out), "\n") {
		name := strings.TrimSpace(line)
		if name == "" || seen[name] {
			continue
		}
		seen[name] = true
		partitions = append(partitions, name)
	}
	sort.Strings(partitions)
	return partitions
}
204+
130205
/*
131206
* Implement the Prometheus Collector interface and feed the
132207
* Slurm node metrics into it.
133208
* https://godoc.org/github.com/prometheus/client_golang/prometheus#Collector
134209
*/
135210

136211
func NewNodesCollector() *NodesCollector {
212+
labelnames := make([]string, 0, 1)
213+
labelnames = append(labelnames, "partition")
214+
labelnames = append(labelnames, "active_feature_set")
137215
return &NodesCollector{
138-
alloc: prometheus.NewDesc("slurm_nodes_alloc", "Allocated nodes", nil, nil),
139-
comp: prometheus.NewDesc("slurm_nodes_comp", "Completing nodes", nil, nil),
140-
down: prometheus.NewDesc("slurm_nodes_down", "Down nodes", nil, nil),
141-
drain: prometheus.NewDesc("slurm_nodes_drain", "Drain nodes", nil, nil),
142-
err: prometheus.NewDesc("slurm_nodes_err", "Error nodes", nil, nil),
143-
fail: prometheus.NewDesc("slurm_nodes_fail", "Fail nodes", nil, nil),
144-
idle: prometheus.NewDesc("slurm_nodes_idle", "Idle nodes", nil, nil),
145-
maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", nil, nil),
146-
mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", nil, nil),
147-
resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", nil, nil),
216+
alloc: prometheus.NewDesc("slurm_nodes_alloc", "Allocated nodes", labelnames, nil),
217+
comp: prometheus.NewDesc("slurm_nodes_comp", "Completing nodes", labelnames, nil),
218+
down: prometheus.NewDesc("slurm_nodes_down", "Down nodes", labelnames, nil),
219+
drain: prometheus.NewDesc("slurm_nodes_drain", "Drain nodes", labelnames, nil),
220+
err: prometheus.NewDesc("slurm_nodes_err", "Error nodes", labelnames, nil),
221+
fail: prometheus.NewDesc("slurm_nodes_fail", "Fail nodes", labelnames, nil),
222+
idle: prometheus.NewDesc("slurm_nodes_idle", "Idle nodes", labelnames, nil),
223+
maint: prometheus.NewDesc("slurm_nodes_maint", "Maint nodes", labelnames, nil),
224+
mix: prometheus.NewDesc("slurm_nodes_mix", "Mix nodes", labelnames, nil),
225+
resv: prometheus.NewDesc("slurm_nodes_resv", "Reserved nodes", labelnames, nil),
226+
total: prometheus.NewDesc("slurm_nodes_total", "Total number of nodes", nil, nil),
148227
}
149228
}
150229

@@ -159,6 +238,7 @@ type NodesCollector struct {
159238
maint *prometheus.Desc
160239
mix *prometheus.Desc
161240
resv *prometheus.Desc
241+
total *prometheus.Desc
162242
}
163243

164244
// Send all metric descriptions
@@ -173,17 +253,34 @@ func (nc *NodesCollector) Describe(ch chan<- *prometheus.Desc) {
173253
ch <- nc.maint
174254
ch <- nc.mix
175255
ch <- nc.resv
256+
ch <- nc.total
176257
}
258+
259+
func SendFeatureSetMetric(ch chan<- prometheus.Metric, desc *prometheus.Desc, valueType prometheus.ValueType, featurestate map[string]float64, part string) {
260+
for set, value := range featurestate {
261+
ch <- prometheus.MustNewConstMetric(desc, valueType, value, part, set)
262+
}
263+
}
264+
177265
func (nc *NodesCollector) Collect(ch chan<- prometheus.Metric) {
178-
nm := NodesGetMetrics()
179-
ch <- prometheus.MustNewConstMetric(nc.alloc, prometheus.GaugeValue, nm.alloc)
180-
ch <- prometheus.MustNewConstMetric(nc.comp, prometheus.GaugeValue, nm.comp)
181-
ch <- prometheus.MustNewConstMetric(nc.down, prometheus.GaugeValue, nm.down)
182-
ch <- prometheus.MustNewConstMetric(nc.drain, prometheus.GaugeValue, nm.drain)
183-
ch <- prometheus.MustNewConstMetric(nc.err, prometheus.GaugeValue, nm.err)
184-
ch <- prometheus.MustNewConstMetric(nc.fail, prometheus.GaugeValue, nm.fail)
185-
ch <- prometheus.MustNewConstMetric(nc.idle, prometheus.GaugeValue, nm.idle)
186-
ch <- prometheus.MustNewConstMetric(nc.maint, prometheus.GaugeValue, nm.maint)
187-
ch <- prometheus.MustNewConstMetric(nc.mix, prometheus.GaugeValue, nm.mix)
188-
ch <- prometheus.MustNewConstMetric(nc.resv, prometheus.GaugeValue, nm.resv)
266+
partitions := SlurmGetPartitions()
267+
for _, part := range partitions {
268+
part = strings.TrimSpace(part)
269+
if part == "" {
270+
continue
271+
}
272+
nm := NodesGetMetrics(part)
273+
SendFeatureSetMetric(ch, nc.alloc, prometheus.GaugeValue, nm.alloc, part)
274+
SendFeatureSetMetric(ch, nc.comp, prometheus.GaugeValue, nm.comp, part)
275+
SendFeatureSetMetric(ch, nc.down, prometheus.GaugeValue, nm.down, part)
276+
SendFeatureSetMetric(ch, nc.drain, prometheus.GaugeValue, nm.drain, part)
277+
SendFeatureSetMetric(ch, nc.err, prometheus.GaugeValue, nm.err, part)
278+
SendFeatureSetMetric(ch, nc.fail, prometheus.GaugeValue, nm.fail, part)
279+
SendFeatureSetMetric(ch, nc.idle, prometheus.GaugeValue, nm.idle, part)
280+
SendFeatureSetMetric(ch, nc.maint, prometheus.GaugeValue, nm.maint, part)
281+
SendFeatureSetMetric(ch, nc.mix, prometheus.GaugeValue, nm.mix, part)
282+
SendFeatureSetMetric(ch, nc.resv, prometheus.GaugeValue, nm.resv, part)
283+
}
284+
total := SlurmGetTotal()
285+
ch <- prometheus.MustNewConstMetric(nc.total, prometheus.GaugeValue, total)
189286
}

nodes_test.go

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,10 @@ func TestNodesMetrics(t *testing.T) {
3131
t.Logf("%+v", ParseNodesMetrics(data))
3232
}
3333

34+
func TestNodesGetPartitions(t *testing.T) {
35+
t.Logf("%+v", SlurmGetPartitions())
36+
}
37+
3438
// TestNodesGetMetrics logs the metrics returned for the partition "foo";
// it makes no assertions and needs a working sinfo.
func TestNodesGetMetrics(t *testing.T) {
	metrics := NodesGetMetrics("foo")
	t.Logf("%+v", metrics)
}

0 commit comments

Comments
 (0)