Skip to content

Commit a142e78

Browse files
committed
[BUG]: count all job states (issue vpenso#9)
1 parent 75908f5 commit a142e78

File tree

1 file changed

+124
-108
lines changed

1 file changed

+124
-108
lines changed

queue.go

Lines changed: 124 additions & 108 deletions
Original file line numberDiff line numberDiff line change
@@ -16,71 +16,87 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
1616
package main
1717

1818
import (
19-
"log"
20-
"strings"
21-
"os/exec"
22-
"io/ioutil"
23-
"github.com/prometheus/client_golang/prometheus"
19+
"github.com/prometheus/client_golang/prometheus"
20+
"io/ioutil"
21+
"log"
22+
"os/exec"
23+
"strings"
2424
)
2525

2626
// QueueMetrics holds the number of jobs counted per job state in the squeue
// output. All counters are float64 so they can be passed directly as
// Prometheus gauge values.
type QueueMetrics struct {
	// Jobs in state PENDING; pending_dep counts those whose reason is
	// "Dependency".
	pending, pending_dep float64

	// Jobs in states RUNNING and SUSPENDED.
	running, suspended float64

	// Jobs in states CANCELLED, COMPLETING, COMPLETED and CONFIGURING.
	cancelled, completing, completed, configuring float64

	// Jobs in states FAILED, TIMEOUT, PREEMPTED and NODE_FAIL.
	failed, timeout, preempted, node_fail float64
}
4040

4141
// Returns the scheduler metrics
4242
func QueueGetMetrics() *QueueMetrics {
43-
return ParseQueueMetrics(QueueData())
43+
return ParseQueueMetrics(QueueData())
4444
}
4545

4646
func ParseQueueMetrics(input []byte) *QueueMetrics {
47-
var qm QueueMetrics
48-
lines := strings.Split(string(input), "\n")
49-
for _, line := range lines {
50-
if strings.Contains(line,",") {
51-
splitted := strings.Split(line, ",")
52-
state := splitted[1]
53-
switch state {
54-
case "PENDING":
55-
qm.pending++
56-
if len(splitted) > 2 && splitted[2] == "Dependency" {
57-
qm.pending_dep++
58-
}
59-
case "RUNNING": qm.running++
60-
case "SUSPENDED": qm.suspended++
61-
case "CANCELLED": qm.cancelled++
62-
case "COMPLETING": qm.completing++
63-
case "COMPLETED": qm.completed++
64-
case "CONFIGURING": qm.configuring++
65-
case "FAILED": qm.failed++
66-
case "TIMEOUT": qm.timeout++
67-
case "PREEMPTED": qm.preempted++
68-
case "NODE_FAIL": qm.node_fail++
69-
}
70-
}
71-
}
72-
return &qm
47+
var qm QueueMetrics
48+
lines := strings.Split(string(input), "\n")
49+
for _, line := range lines {
50+
if strings.Contains(line, ",") {
51+
splitted := strings.Split(line, ",")
52+
state := splitted[1]
53+
switch state {
54+
case "PENDING":
55+
qm.pending++
56+
if len(splitted) > 2 && splitted[2] == "Dependency" {
57+
qm.pending_dep++
58+
}
59+
case "RUNNING":
60+
qm.running++
61+
case "SUSPENDED":
62+
qm.suspended++
63+
case "CANCELLED":
64+
qm.cancelled++
65+
case "COMPLETING":
66+
qm.completing++
67+
case "COMPLETED":
68+
qm.completed++
69+
case "CONFIGURING":
70+
qm.configuring++
71+
case "FAILED":
72+
qm.failed++
73+
case "TIMEOUT":
74+
qm.timeout++
75+
case "PREEMPTED":
76+
qm.preempted++
77+
case "NODE_FAIL":
78+
qm.node_fail++
79+
}
80+
}
81+
}
82+
return &qm
7383
}
7484

7585
// QueueData executes the squeue command and returns everything it wrote to
// standard output.
//
// squeue is invoked from /usr/bin with the header suppressed (-h), a
// comma-separated output format (%A,%T,%r) and --states=all. Any failure to
// create the pipe, start the command, read its output or wait for it to exit
// terminates the process via log.Fatal.
func QueueData() []byte {
	cmd := exec.Command("/usr/bin/squeue", "-h", "-o %A,%T,%r", "--states=all")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// The pipe must be fully read before Wait is called, because Wait closes
	// it. A read error is treated as fatal rather than returning truncated
	// output that would be parsed into misleading metrics.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}
85101

86102
/*
@@ -90,64 +106,64 @@ func QueueData() []byte {
90106
*/
91107

92108
func NewQueueCollector() *QueueCollector {
93-
return &QueueCollector {
94-
pending: prometheus.NewDesc("slurm_queue_pending", "Pending jobs in queue", nil, nil),
95-
pending_dep: prometheus.NewDesc("slurm_queue_pending_dependency", "Pending jobs because of dependency in queue", nil, nil),
96-
running: prometheus.NewDesc("slurm_queue_running", "Running jobs in the cluster", nil, nil),
97-
suspended: prometheus.NewDesc("slurm_queue_suspended", "Suspended jobs in the cluster", nil, nil),
98-
cancelled: prometheus.NewDesc("slurm_queue_cancelled", "Cancelled jobs in the cluster", nil, nil),
99-
completing: prometheus.NewDesc("slurm_queue_completing", "Completing jobs in the cluster", nil, nil),
100-
completed: prometheus.NewDesc("slurm_queue_completed", "Completed jobs in the cluster", nil, nil),
101-
configuring: prometheus.NewDesc("slurm_queue_configuring", "Configuring jobs in the cluster", nil, nil),
102-
failed: prometheus.NewDesc("slurm_queue_failed", "Number of failed jobs", nil, nil),
103-
timeout: prometheus.NewDesc("slurm_queue_timeout", "Jobs stopped by timeout", nil, nil),
104-
preempted: prometheus.NewDesc("slurm_queue_preempted", "Number of preempted jobs", nil, nil),
105-
node_fail: prometheus.NewDesc("slurm_queue_node_fail", "Number of jobs stopped due to node fail", nil, nil),
106-
}
109+
return &QueueCollector{
110+
pending: prometheus.NewDesc("slurm_queue_pending", "Pending jobs in queue", nil, nil),
111+
pending_dep: prometheus.NewDesc("slurm_queue_pending_dependency", "Pending jobs because of dependency in queue", nil, nil),
112+
running: prometheus.NewDesc("slurm_queue_running", "Running jobs in the cluster", nil, nil),
113+
suspended: prometheus.NewDesc("slurm_queue_suspended", "Suspended jobs in the cluster", nil, nil),
114+
cancelled: prometheus.NewDesc("slurm_queue_cancelled", "Cancelled jobs in the cluster", nil, nil),
115+
completing: prometheus.NewDesc("slurm_queue_completing", "Completing jobs in the cluster", nil, nil),
116+
completed: prometheus.NewDesc("slurm_queue_completed", "Completed jobs in the cluster", nil, nil),
117+
configuring: prometheus.NewDesc("slurm_queue_configuring", "Configuring jobs in the cluster", nil, nil),
118+
failed: prometheus.NewDesc("slurm_queue_failed", "Number of failed jobs", nil, nil),
119+
timeout: prometheus.NewDesc("slurm_queue_timeout", "Jobs stopped by timeout", nil, nil),
120+
preempted: prometheus.NewDesc("slurm_queue_preempted", "Number of preempted jobs", nil, nil),
121+
node_fail: prometheus.NewDesc("slurm_queue_node_fail", "Number of jobs stopped due to node fail", nil, nil),
122+
}
107123
}
108124

109-
type QueueCollector struct {
110-
pending *prometheus.Desc
111-
pending_dep *prometheus.Desc
112-
running *prometheus.Desc
113-
suspended *prometheus.Desc
114-
cancelled *prometheus.Desc
115-
completing *prometheus.Desc
116-
completed *prometheus.Desc
117-
configuring *prometheus.Desc
118-
failed *prometheus.Desc
119-
timeout *prometheus.Desc
120-
preempted *prometheus.Desc
121-
node_fail *prometheus.Desc
122-
}
125+
type QueueCollector struct {
126+
pending *prometheus.Desc
127+
pending_dep *prometheus.Desc
128+
running *prometheus.Desc
129+
suspended *prometheus.Desc
130+
cancelled *prometheus.Desc
131+
completing *prometheus.Desc
132+
completed *prometheus.Desc
133+
configuring *prometheus.Desc
134+
failed *prometheus.Desc
135+
timeout *prometheus.Desc
136+
preempted *prometheus.Desc
137+
node_fail *prometheus.Desc
138+
}
123139

124-
func (qc *QueueCollector) Describe(ch chan<- *prometheus.Desc) {
125-
ch <- qc.pending
126-
ch <- qc.pending_dep
127-
ch <- qc.running
128-
ch <- qc.suspended
129-
ch <- qc.cancelled
130-
ch <- qc.completing
131-
ch <- qc.completed
132-
ch <- qc.configuring
133-
ch <- qc.failed
134-
ch <- qc.timeout
135-
ch <- qc.preempted
136-
ch <- qc.node_fail
137-
}
140+
func (qc *QueueCollector) Describe(ch chan<- *prometheus.Desc) {
141+
ch <- qc.pending
142+
ch <- qc.pending_dep
143+
ch <- qc.running
144+
ch <- qc.suspended
145+
ch <- qc.cancelled
146+
ch <- qc.completing
147+
ch <- qc.completed
148+
ch <- qc.configuring
149+
ch <- qc.failed
150+
ch <- qc.timeout
151+
ch <- qc.preempted
152+
ch <- qc.node_fail
153+
}
138154

139-
func (qc *QueueCollector) Collect(ch chan<- prometheus.Metric) {
140-
qm := QueueGetMetrics()
141-
ch <- prometheus.MustNewConstMetric(qc.pending, prometheus.GaugeValue, qm.pending)
142-
ch <- prometheus.MustNewConstMetric(qc.pending_dep, prometheus.GaugeValue, qm.pending_dep)
143-
ch <- prometheus.MustNewConstMetric(qc.running, prometheus.GaugeValue, qm.running)
144-
ch <- prometheus.MustNewConstMetric(qc.suspended, prometheus.GaugeValue, qm.suspended)
145-
ch <- prometheus.MustNewConstMetric(qc.cancelled, prometheus.GaugeValue, qm.cancelled)
146-
ch <- prometheus.MustNewConstMetric(qc.completing, prometheus.GaugeValue, qm.completing)
147-
ch <- prometheus.MustNewConstMetric(qc.completed, prometheus.GaugeValue, qm.completed)
148-
ch <- prometheus.MustNewConstMetric(qc.configuring, prometheus.GaugeValue, qm.configuring)
149-
ch <- prometheus.MustNewConstMetric(qc.failed, prometheus.GaugeValue, qm.failed)
150-
ch <- prometheus.MustNewConstMetric(qc.timeout, prometheus.GaugeValue, qm.timeout)
151-
ch <- prometheus.MustNewConstMetric(qc.preempted, prometheus.GaugeValue, qm.preempted)
152-
ch <- prometheus.MustNewConstMetric(qc.node_fail, prometheus.GaugeValue, qm.node_fail)
153-
}
155+
func (qc *QueueCollector) Collect(ch chan<- prometheus.Metric) {
156+
qm := QueueGetMetrics()
157+
ch <- prometheus.MustNewConstMetric(qc.pending, prometheus.GaugeValue, qm.pending)
158+
ch <- prometheus.MustNewConstMetric(qc.pending_dep, prometheus.GaugeValue, qm.pending_dep)
159+
ch <- prometheus.MustNewConstMetric(qc.running, prometheus.GaugeValue, qm.running)
160+
ch <- prometheus.MustNewConstMetric(qc.suspended, prometheus.GaugeValue, qm.suspended)
161+
ch <- prometheus.MustNewConstMetric(qc.cancelled, prometheus.GaugeValue, qm.cancelled)
162+
ch <- prometheus.MustNewConstMetric(qc.completing, prometheus.GaugeValue, qm.completing)
163+
ch <- prometheus.MustNewConstMetric(qc.completed, prometheus.GaugeValue, qm.completed)
164+
ch <- prometheus.MustNewConstMetric(qc.configuring, prometheus.GaugeValue, qm.configuring)
165+
ch <- prometheus.MustNewConstMetric(qc.failed, prometheus.GaugeValue, qm.failed)
166+
ch <- prometheus.MustNewConstMetric(qc.timeout, prometheus.GaugeValue, qm.timeout)
167+
ch <- prometheus.MustNewConstMetric(qc.preempted, prometheus.GaugeValue, qm.preempted)
168+
ch <- prometheus.MustNewConstMetric(qc.node_fail, prometheus.GaugeValue, qm.node_fail)
169+
}

0 commit comments

Comments
 (0)