|
| 1 | +/* Copyright 2021 Chris Read |
| 2 | +
|
| 3 | +This program is free software: you can redistribute it and/or modify |
| 4 | +it under the terms of the GNU General Public License as published by |
| 5 | +the Free Software Foundation, either version 3 of the License, or |
| 6 | +(at your option) any later version. |
| 7 | +
|
| 8 | +This program is distributed in the hope that it will be useful, |
| 9 | +but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 10 | +MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| 11 | +GNU General Public License for more details. |
| 12 | +
|
| 13 | +You should have received a copy of the GNU General Public License |
| 14 | +along with this program. If not, see <http://www.gnu.org/licenses/>. */ |
| 15 | + |
| 16 | +package main |
| 17 | + |
| 18 | +import ( |
| 19 | + "log" |
| 20 | + "os/exec" |
| 21 | + "sort" |
| 22 | + "strconv" |
| 23 | + "strings" |
| 24 | + |
| 25 | + "github.com/prometheus/client_golang/prometheus" |
| 26 | +) |
| 27 | + |
// NodeMetrics holds the values parsed from one line of sinfo output for a
// single node. The field order matters: ParseNodeMetrics-style positional
// literals rely on it.
type NodeMetrics struct {
	// Memory figures, taken from the AllocMem and Memory columns.
	memAlloc, memTotal uint64
	// CPU counts, taken from the "alloc/idle/other/total" CPUsState column.
	cpuAlloc, cpuIdle, cpuOther, cpuTotal uint64
	// Node state as reported by sinfo (mixed, allocated, etc.).
	nodeStatus string
}
| 38 | + |
| 39 | +func NodeGetMetrics() map[string]*NodeMetrics { |
| 40 | + return ParseNodeMetrics(NodeData()) |
| 41 | +} |
| 42 | + |
| 43 | +// ParseNodeMetrics takes the output of sinfo with node data |
| 44 | +// It returns a map of metrics per node |
| 45 | +func ParseNodeMetrics(input []byte) map[string]*NodeMetrics { |
| 46 | + nodes := make(map[string]*NodeMetrics) |
| 47 | + lines := strings.Split(string(input), "\n") |
| 48 | + |
| 49 | + // Sort and remove all the duplicates from the 'sinfo' output |
| 50 | + sort.Strings(lines) |
| 51 | + linesUniq := RemoveDuplicates(lines) |
| 52 | + |
| 53 | + for _, line := range linesUniq { |
| 54 | + node := strings.Fields(line) |
| 55 | + nodeName := node[0] |
| 56 | + nodeStatus := node[4] // mixed, allocated, etc. |
| 57 | + |
| 58 | + nodes[nodeName] = &NodeMetrics{0, 0, 0, 0, 0, 0, ""} |
| 59 | + |
| 60 | + memAlloc, _ := strconv.ParseUint(node[1], 10, 64) |
| 61 | + memTotal, _ := strconv.ParseUint(node[2], 10, 64) |
| 62 | + |
| 63 | + |
| 64 | + cpuInfo := strings.Split(node[3], "/") |
| 65 | + cpuAlloc, _ := strconv.ParseUint(cpuInfo[0], 10, 64) |
| 66 | + cpuIdle, _ := strconv.ParseUint(cpuInfo[1], 10, 64) |
| 67 | + cpuOther, _ := strconv.ParseUint(cpuInfo[2], 10, 64) |
| 68 | + cpuTotal, _ := strconv.ParseUint(cpuInfo[3], 10, 64) |
| 69 | + |
| 70 | + nodes[nodeName].memAlloc = memAlloc |
| 71 | + nodes[nodeName].memTotal = memTotal |
| 72 | + nodes[nodeName].cpuAlloc = cpuAlloc |
| 73 | + nodes[nodeName].cpuIdle = cpuIdle |
| 74 | + nodes[nodeName].cpuOther = cpuOther |
| 75 | + nodes[nodeName].cpuTotal = cpuTotal |
| 76 | + nodes[nodeName].nodeStatus = nodeStatus |
| 77 | + } |
| 78 | + |
| 79 | + return nodes |
| 80 | +} |
| 81 | + |
// NodeData runs the sinfo command to collect per-node data, requesting the
// NodeList, AllocMem, Memory, CPUsState and StateLong columns.
// It returns the command's standard output. If the command fails, the error
// is passed to log.Fatal, which logs it and terminates the process.
func NodeData() []byte {
	args := []string{"-h", "-N", "-O", "NodeList,AllocMem,Memory,CPUsState,StateLong"}

	out, err := exec.Command("sinfo", args...).Output()
	if err != nil {
		log.Fatal(err)
	}
	return out
}
| 92 | + |
| 93 | +type NodeCollector struct { |
| 94 | + cpuAlloc *prometheus.Desc |
| 95 | + cpuIdle *prometheus.Desc |
| 96 | + cpuOther *prometheus.Desc |
| 97 | + cpuTotal *prometheus.Desc |
| 98 | + memAlloc *prometheus.Desc |
| 99 | + memTotal *prometheus.Desc |
| 100 | +} |
| 101 | + |
| 102 | +// NewNodeCollector creates a Prometheus collector to keep all our stats in |
| 103 | +// It returns a set of collections for consumption |
| 104 | +func NewNodeCollector() *NodeCollector { |
| 105 | + labels := []string{"node","status"} |
| 106 | + |
| 107 | + return &NodeCollector{ |
| 108 | + cpuAlloc: prometheus.NewDesc("slurm_node_cpu_alloc", "Allocated CPUs per node", labels, nil), |
| 109 | + cpuIdle: prometheus.NewDesc("slurm_node_cpu_idle", "Idle CPUs per node", labels, nil), |
| 110 | + cpuOther: prometheus.NewDesc("slurm_node_cpu_other", "Other CPUs per node", labels, nil), |
| 111 | + cpuTotal: prometheus.NewDesc("slurm_node_cpu_total", "Total CPUs per node", labels, nil), |
| 112 | + memAlloc: prometheus.NewDesc("slurm_node_mem_alloc", "Allocated memory per node", labels, nil), |
| 113 | + memTotal: prometheus.NewDesc("slurm_node_mem_total", "Total memory per node", labels, nil), |
| 114 | + } |
| 115 | +} |
| 116 | + |
| 117 | +// Send all metric descriptions |
| 118 | +func (nc *NodeCollector) Describe(ch chan<- *prometheus.Desc) { |
| 119 | + ch <- nc.cpuAlloc |
| 120 | + ch <- nc.cpuIdle |
| 121 | + ch <- nc.cpuOther |
| 122 | + ch <- nc.cpuTotal |
| 123 | + ch <- nc.memAlloc |
| 124 | + ch <- nc.memTotal |
| 125 | +} |
| 126 | + |
| 127 | +func (nc *NodeCollector) Collect(ch chan<- prometheus.Metric) { |
| 128 | + nodes := NodeGetMetrics() |
| 129 | + for node := range nodes { |
| 130 | + ch <- prometheus.MustNewConstMetric(nc.cpuAlloc, prometheus.GaugeValue, float64(nodes[node].cpuAlloc), node, nodes[node].nodeStatus) |
| 131 | + ch <- prometheus.MustNewConstMetric(nc.cpuIdle, prometheus.GaugeValue, float64(nodes[node].cpuIdle), node, nodes[node].nodeStatus) |
| 132 | + ch <- prometheus.MustNewConstMetric(nc.cpuOther, prometheus.GaugeValue, float64(nodes[node].cpuOther), node, nodes[node].nodeStatus) |
| 133 | + ch <- prometheus.MustNewConstMetric(nc.cpuTotal, prometheus.GaugeValue, float64(nodes[node].cpuTotal), node, nodes[node].nodeStatus) |
| 134 | + ch <- prometheus.MustNewConstMetric(nc.memAlloc, prometheus.GaugeValue, float64(nodes[node].memAlloc), node, nodes[node].nodeStatus) |
| 135 | + ch <- prometheus.MustNewConstMetric(nc.memTotal, prometheus.GaugeValue, float64(nodes[node].memTotal), node, nodes[node].nodeStatus) |
| 136 | + } |
| 137 | +} |
0 commit comments