Skip to content

Commit 6a34d8f

Browse files
committed
Add modified code from Chris Read (check PR#47)
1 parent 45f58f7 commit 6a34d8f

File tree

3 files changed

+215
-0
lines changed

3 files changed

+215
-0
lines changed

node.go

Lines changed: 137 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,137 @@
1+
/* Copyright 2021 Chris Read
2+
3+
This program is free software: you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License as published by
5+
the Free Software Foundation, either version 3 of the License, or
6+
(at your option) any later version.
7+
8+
This program is distributed in the hope that it will be useful,
9+
but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
GNU General Public License for more details.
12+
13+
You should have received a copy of the GNU General Public License
14+
along with this program. If not, see <http://www.gnu.org/licenses/>. */
15+
16+
package main
17+
18+
import (
19+
"log"
20+
"os/exec"
21+
"sort"
22+
"strconv"
23+
"strings"
24+
25+
"github.com/prometheus/client_golang/prometheus"
26+
)
27+
28+
// NodeMetrics holds the values parsed from one line of sinfo output for a
// single node.
type NodeMetrics struct {
	// Values of the AllocMem and Memory columns.
	memAlloc, memTotal uint64
	// The four "/"-separated counts of the CPUsState column, in column order.
	cpuAlloc, cpuIdle, cpuOther, cpuTotal uint64
	// Value of the StateLong column (mixed, allocated, etc.).
	nodeStatus string
}
38+
39+
func NodeGetMetrics() map[string]*NodeMetrics {
40+
return ParseNodeMetrics(NodeData())
41+
}
42+
43+
// ParseNodeMetrics takes the output of sinfo with node data
44+
// It returns a map of metrics per node
45+
func ParseNodeMetrics(input []byte) map[string]*NodeMetrics {
46+
nodes := make(map[string]*NodeMetrics)
47+
lines := strings.Split(string(input), "\n")
48+
49+
// Sort and remove all the duplicates from the 'sinfo' output
50+
sort.Strings(lines)
51+
linesUniq := RemoveDuplicates(lines)
52+
53+
for _, line := range linesUniq {
54+
node := strings.Fields(line)
55+
nodeName := node[0]
56+
nodeStatus := node[4] // mixed, allocated, etc.
57+
58+
nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, ""}
59+
60+
memAlloc, _ := strconv.ParseUint(node[1], 10, 64)
61+
memTotal, _ := strconv.ParseUint(node[2], 10, 64)
62+
63+
64+
cpuInfo := strings.Split(node[3], "/")
65+
cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64)
66+
cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64)
67+
cpuOther, _ := strconv.ParseUint(cpuInfo[2], 10, 64)
68+
cpuTotal, _ := strconv.ParseUint(cpuInfo[3], 10, 64)
69+
70+
nodes[nodeName].memAlloc = memAlloc
71+
nodes[nodeName].memTotal = memTotal
72+
nodes[nodeName].cpuAlloc = cpuAlloc
73+
nodes[nodeName].cpuIdle = cpuIdle
74+
nodes[nodeName].cpuOther = cpuOther
75+
nodes[nodeName].cpuTotal = cpuTotal
76+
nodes[nodeName].nodeStatus = nodeStatus
77+
}
78+
79+
return nodes
80+
}
81+
82+
// NodeData runs the sinfo command and returns whatever it wrote to standard
// output. The -O format requests the columns NodeList, AllocMem, Memory,
// CPUsState and StateLong, in that order.
// If the command cannot be run or exits with an error, the error is passed to
// log.Fatal, which terminates the program.
func NodeData() []byte {
	const format = "NodeList,AllocMem,Memory,CPUsState,StateLong"

	output, err := exec.Command("sinfo", "-h", "-N", "-O", format).Output()
	if err != nil {
		log.Fatal(err)
	}
	return output
}
92+
93+
type NodeCollector struct {
94+
cpuAlloc *prometheus.Desc
95+
cpuIdle *prometheus.Desc
96+
cpuOther *prometheus.Desc
97+
cpuTotal *prometheus.Desc
98+
memAlloc *prometheus.Desc
99+
memTotal *prometheus.Desc
100+
}
101+
102+
// NewNodeCollector creates a Prometheus collector to keep all our stats in
103+
// It returns a set of collections for consumption
104+
func NewNodeCollector() *NodeCollector {
105+
labels := []string{"node","status"}
106+
107+
return &NodeCollector{
108+
cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil),
109+
cpuIdle: prometheus.NewDesc("slurm_node_cpu_idle", "Idle CPUs per node", labels, nil),
110+
cpuOther: prometheus.NewDesc("slurm_node_cpu_other", "Other CPUs per node", labels, nil),
111+
cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil),
112+
memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil),
113+
memTotal: prometheus.NewDesc("slurm_node_mem_total", "Total memory per node", labels, nil),
114+
}
115+
}
116+
117+
// Send all metric descriptions
118+
func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) {
119+
ch <- nc.cpuAlloc
120+
ch <- nc.cpuIdle
121+
ch <- nc.cpuOther
122+
ch <- nc.cpuTotal
123+
ch <- nc.memAlloc
124+
ch <- nc.memTotal
125+
}
126+
127+
func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) {
128+
nodes := NodeGetMetrics()
129+
for node := range nodes {
130+
ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus)
131+
ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus)
132+
ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus)
133+
ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus)
134+
ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus)
135+
ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nodes[node].memTotal), node, nodes[node].nodeStatus)
136+
}
137+
}

node_test.go

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
/* Copyright 2021 Chris Read
2+
3+
This program is free software: you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License as published by
5+
the Free Software Foundation, either version 3 of the License, or
6+
(at your option) any later version.
7+
8+
This program is distributed in the hope that it will be useful,
9+
but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
GNU General Public License for more details.
12+
13+
You should have received a copy of the GNU General Public License
14+
along with this program. If not, see <http://www.gnu.org/licenses/>. */
15+
16+
package main
17+
18+
import (
19+
"io/ioutil"
20+
"testing"
21+
22+
"github.com/stretchr/testify/assert"
23+
)
24+
25+
/*
For this example data line (whitespace-separated sinfo output):

a048 79384 193000 3/13/0/16 mixed

We want output that looks like:

slurm_node_cpu_alloc{node="a048",status="mixed"} 3
slurm_node_cpu_idle{node="a048",status="mixed"} 13
slurm_node_cpu_other{node="a048",status="mixed"} 0
slurm_node_cpu_total{node="a048",status="mixed"} 16
slurm_node_mem_alloc{node="a048",status="mixed"} 79384
slurm_node_mem_total{node="a048",status="mixed"} 193000

*/
40+
41+
func TestNodeMetrics(t *testing.T) {
42+
// Read the input data from a file
43+
data, err := ioutil.ReadFile("test_data/sinfo_mem.txt")
44+
if err != nil {
45+
t.Fatalf("Can not open test data: %v", err)
46+
}
47+
metrics := ParseNodeMetrics(data)
48+
t.Logf("%+v", metrics)
49+
50+
assert.Contains(t, metrics, "b001")
51+
assert.Equal(t, uint64(327680), metrics["b001"].memAlloc)
52+
assert.Equal(t, uint64(386000), metrics["b001"].memTotal)
53+
assert.Equal(t, uint64(32), metrics["b001"].cpuAlloc)
54+
assert.Equal(t, uint64(0), metrics["b001"].cpuIdle)
55+
assert.Equal(t, uint64(0), metrics["b001"].cpuOther)
56+
assert.Equal(t, uint64(32), metrics["b001"].cpuTotal)
57+
}

test_data/sinfo_mem.txt

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
a048 163840 193000 16/0/0/16 mixed
2+
a048 163840 193000 16/0/0/16 mixed
3+
a048 163840 193000 16/0/0/16 idle
4+
a048 163840 193000 16/0/0/16 idle
5+
a049 163840 193000 16/0/0/16 idle
6+
a049 163840 193000 16/0/0/16 idle
7+
a049 163840 193000 16/0/0/16 idle
8+
a049 163840 193000 16/0/0/16 idle
9+
a050 163840 193000 16/0/0/16 idle
10+
a050 163840 193000 16/0/0/16 idle
11+
a050 163840 193000 16/0/0/16 idle
12+
a051 163840 193000 16/0/0/16 idle
13+
a051 163840 193000 16/0/0/16 idle
14+
a051 163840 193000 16/0/0/16 idle
15+
a052 0 193000 0/16/0/16 idle
16+
b001 327680 386000 32/0/0/32 down
17+
b001 327680 386000 32/0/0/32 down
18+
b002 327680 386000 32/0/0/32 down
19+
b002 327680 386000 32/0/0/32 idle
20+
b003 296960 386000 29/3/0/32 down
21+
b003 296960 386000 29/3/0/32 idle

0 commit comments

Comments
 (0)