@@ -16,71 +16,87 @@ along with this program. If not, see <http://www.gnu.org/licenses/>. */
16
16
package main
17
17
18
18
import (
19
- "log "
20
- "strings "
21
- "os/exec "
22
- "io/ioutil "
23
- "github.com/prometheus/client_golang/prometheus "
19
+ "github.com/prometheus/client_golang/prometheus "
20
+ "io/ioutil "
21
+ "log "
22
+ "os/exec "
23
+ "strings "
24
24
)
25
25
26
26
// QueueMetrics holds the number of jobs counted in each squeue job state.
// The counters are float64 so they can be passed directly as Prometheus
// metric values.
type QueueMetrics struct {
	pending     float64 // jobs in state PENDING
	pending_dep float64 // PENDING jobs whose reason is "Dependency"
	running     float64 // jobs in state RUNNING
	suspended   float64 // jobs in state SUSPENDED
	cancelled   float64 // jobs in state CANCELLED
	completing  float64 // jobs in state COMPLETING
	completed   float64 // jobs in state COMPLETED
	configuring float64 // jobs in state CONFIGURING
	failed      float64 // jobs in state FAILED
	timeout     float64 // jobs in state TIMEOUT
	preempted   float64 // jobs in state PREEMPTED
	node_fail   float64 // jobs in state NODE_FAIL
}
40
40
41
41
// Returns the scheduler metrics
42
42
func QueueGetMetrics () * QueueMetrics {
43
- return ParseQueueMetrics (QueueData ())
43
+ return ParseQueueMetrics (QueueData ())
44
44
}
45
45
46
46
func ParseQueueMetrics (input []byte ) * QueueMetrics {
47
- var qm QueueMetrics
48
- lines := strings .Split (string (input ), "\n " )
49
- for _ , line := range lines {
50
- if strings .Contains (line ,"," ) {
51
- splitted := strings .Split (line , "," )
52
- state := splitted [1 ]
53
- switch state {
54
- case "PENDING" :
55
- qm .pending ++
56
- if len (splitted ) > 2 && splitted [2 ] == "Dependency" {
57
- qm .pending_dep ++
58
- }
59
- case "RUNNING" : qm .running ++
60
- case "SUSPENDED" : qm .suspended ++
61
- case "CANCELLED" : qm .cancelled ++
62
- case "COMPLETING" : qm .completing ++
63
- case "COMPLETED" : qm .completed ++
64
- case "CONFIGURING" : qm .configuring ++
65
- case "FAILED" : qm .failed ++
66
- case "TIMEOUT" : qm .timeout ++
67
- case "PREEMPTED" : qm .preempted ++
68
- case "NODE_FAIL" : qm .node_fail ++
69
- }
70
- }
71
- }
72
- return & qm
47
+ var qm QueueMetrics
48
+ lines := strings .Split (string (input ), "\n " )
49
+ for _ , line := range lines {
50
+ if strings .Contains (line , "," ) {
51
+ splitted := strings .Split (line , "," )
52
+ state := splitted [1 ]
53
+ switch state {
54
+ case "PENDING" :
55
+ qm .pending ++
56
+ if len (splitted ) > 2 && splitted [2 ] == "Dependency" {
57
+ qm .pending_dep ++
58
+ }
59
+ case "RUNNING" :
60
+ qm .running ++
61
+ case "SUSPENDED" :
62
+ qm .suspended ++
63
+ case "CANCELLED" :
64
+ qm .cancelled ++
65
+ case "COMPLETING" :
66
+ qm .completing ++
67
+ case "COMPLETED" :
68
+ qm .completed ++
69
+ case "CONFIGURING" :
70
+ qm .configuring ++
71
+ case "FAILED" :
72
+ qm .failed ++
73
+ case "TIMEOUT" :
74
+ qm .timeout ++
75
+ case "PREEMPTED" :
76
+ qm .preempted ++
77
+ case "NODE_FAIL" :
78
+ qm .node_fail ++
79
+ }
80
+ }
81
+ }
82
+ return & qm
73
83
}
74
84
75
85
// QueueData executes /usr/bin/squeue and returns everything it wrote to
// standard output.
//
// squeue is asked for jobs in all states ("--states=all"), without a header
// line ("-h"), using the output format "%A,%T,%r". Any failure to create the
// pipe, start the command, read its output or wait for it to exit terminates
// the process through log.Fatal.
func QueueData() []byte {
	cmd := exec.Command("/usr/bin/squeue", "-h", "-o %A,%T,%r", "--states=all")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// Read the whole pipe before calling Wait; a read error is treated like
	// every other failure here instead of being dropped.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}
85
101
86
102
/*
@@ -90,64 +106,64 @@ func QueueData() []byte {
90
106
*/
91
107
92
108
func NewQueueCollector () * QueueCollector {
93
- return & QueueCollector {
94
- pending : prometheus .NewDesc ("slurm_queue_pending" , "Pending jobs in queue" , nil , nil ),
95
- pending_dep : prometheus .NewDesc ("slurm_queue_pending_dependency" , "Pending jobs because of dependency in queue" , nil , nil ),
96
- running : prometheus .NewDesc ("slurm_queue_running" , "Running jobs in the cluster" , nil , nil ),
97
- suspended : prometheus .NewDesc ("slurm_queue_suspended" , "Suspended jobs in the cluster" , nil , nil ),
98
- cancelled : prometheus .NewDesc ("slurm_queue_cancelled" , "Cancelled jobs in the cluster" , nil , nil ),
99
- completing : prometheus .NewDesc ("slurm_queue_completing" , "Completing jobs in the cluster" , nil , nil ),
100
- completed : prometheus .NewDesc ("slurm_queue_completed" , "Completed jobs in the cluster" , nil , nil ),
101
- configuring : prometheus .NewDesc ("slurm_queue_configuring" , "Configuring jobs in the cluster" , nil , nil ),
102
- failed : prometheus .NewDesc ("slurm_queue_failed" , "Number of failed jobs" , nil , nil ),
103
- timeout : prometheus .NewDesc ("slurm_queue_timeout" , "Jobs stopped by timeout" , nil , nil ),
104
- preempted : prometheus .NewDesc ("slurm_queue_preempted" , "Number of preempted jobs" , nil , nil ),
105
- node_fail : prometheus .NewDesc ("slurm_queue_node_fail" , "Number of jobs stopped due to node fail" , nil , nil ),
106
- }
109
+ return & QueueCollector {
110
+ pending : prometheus .NewDesc ("slurm_queue_pending" , "Pending jobs in queue" , nil , nil ),
111
+ pending_dep : prometheus .NewDesc ("slurm_queue_pending_dependency" , "Pending jobs because of dependency in queue" , nil , nil ),
112
+ running : prometheus .NewDesc ("slurm_queue_running" , "Running jobs in the cluster" , nil , nil ),
113
+ suspended : prometheus .NewDesc ("slurm_queue_suspended" , "Suspended jobs in the cluster" , nil , nil ),
114
+ cancelled : prometheus .NewDesc ("slurm_queue_cancelled" , "Cancelled jobs in the cluster" , nil , nil ),
115
+ completing : prometheus .NewDesc ("slurm_queue_completing" , "Completing jobs in the cluster" , nil , nil ),
116
+ completed : prometheus .NewDesc ("slurm_queue_completed" , "Completed jobs in the cluster" , nil , nil ),
117
+ configuring : prometheus .NewDesc ("slurm_queue_configuring" , "Configuring jobs in the cluster" , nil , nil ),
118
+ failed : prometheus .NewDesc ("slurm_queue_failed" , "Number of failed jobs" , nil , nil ),
119
+ timeout : prometheus .NewDesc ("slurm_queue_timeout" , "Jobs stopped by timeout" , nil , nil ),
120
+ preempted : prometheus .NewDesc ("slurm_queue_preempted" , "Number of preempted jobs" , nil , nil ),
121
+ node_fail : prometheus .NewDesc ("slurm_queue_node_fail" , "Number of jobs stopped due to node fail" , nil , nil ),
122
+ }
107
123
}
108
124
109
- type QueueCollector struct {
110
- pending * prometheus.Desc
111
- pending_dep * prometheus.Desc
112
- running * prometheus.Desc
113
- suspended * prometheus.Desc
114
- cancelled * prometheus.Desc
115
- completing * prometheus.Desc
116
- completed * prometheus.Desc
117
- configuring * prometheus.Desc
118
- failed * prometheus.Desc
119
- timeout * prometheus.Desc
120
- preempted * prometheus.Desc
121
- node_fail * prometheus.Desc
122
- }
125
+ type QueueCollector struct {
126
+ pending * prometheus.Desc
127
+ pending_dep * prometheus.Desc
128
+ running * prometheus.Desc
129
+ suspended * prometheus.Desc
130
+ cancelled * prometheus.Desc
131
+ completing * prometheus.Desc
132
+ completed * prometheus.Desc
133
+ configuring * prometheus.Desc
134
+ failed * prometheus.Desc
135
+ timeout * prometheus.Desc
136
+ preempted * prometheus.Desc
137
+ node_fail * prometheus.Desc
138
+ }
123
139
124
- func (qc * QueueCollector ) Describe (ch chan <- * prometheus.Desc ) {
125
- ch <- qc .pending
126
- ch <- qc .pending_dep
127
- ch <- qc .running
128
- ch <- qc .suspended
129
- ch <- qc .cancelled
130
- ch <- qc .completing
131
- ch <- qc .completed
132
- ch <- qc .configuring
133
- ch <- qc .failed
134
- ch <- qc .timeout
135
- ch <- qc .preempted
136
- ch <- qc .node_fail
137
- }
140
+ func (qc * QueueCollector ) Describe (ch chan <- * prometheus.Desc ) {
141
+ ch <- qc .pending
142
+ ch <- qc .pending_dep
143
+ ch <- qc .running
144
+ ch <- qc .suspended
145
+ ch <- qc .cancelled
146
+ ch <- qc .completing
147
+ ch <- qc .completed
148
+ ch <- qc .configuring
149
+ ch <- qc .failed
150
+ ch <- qc .timeout
151
+ ch <- qc .preempted
152
+ ch <- qc .node_fail
153
+ }
138
154
139
- func (qc * QueueCollector ) Collect (ch chan <- prometheus.Metric ) {
140
- qm := QueueGetMetrics ()
141
- ch <- prometheus .MustNewConstMetric (qc .pending , prometheus .GaugeValue , qm .pending )
142
- ch <- prometheus .MustNewConstMetric (qc .pending_dep , prometheus .GaugeValue , qm .pending_dep )
143
- ch <- prometheus .MustNewConstMetric (qc .running , prometheus .GaugeValue , qm .running )
144
- ch <- prometheus .MustNewConstMetric (qc .suspended , prometheus .GaugeValue , qm .suspended )
145
- ch <- prometheus .MustNewConstMetric (qc .cancelled , prometheus .GaugeValue , qm .cancelled )
146
- ch <- prometheus .MustNewConstMetric (qc .completing , prometheus .GaugeValue , qm .completing )
147
- ch <- prometheus .MustNewConstMetric (qc .completed , prometheus .GaugeValue , qm .completed )
148
- ch <- prometheus .MustNewConstMetric (qc .configuring , prometheus .GaugeValue , qm .configuring )
149
- ch <- prometheus .MustNewConstMetric (qc .failed , prometheus .GaugeValue , qm .failed )
150
- ch <- prometheus .MustNewConstMetric (qc .timeout , prometheus .GaugeValue , qm .timeout )
151
- ch <- prometheus .MustNewConstMetric (qc .preempted , prometheus .GaugeValue , qm .preempted )
152
- ch <- prometheus .MustNewConstMetric (qc .node_fail , prometheus .GaugeValue , qm .node_fail )
153
- }
155
+ func (qc * QueueCollector ) Collect (ch chan <- prometheus.Metric ) {
156
+ qm := QueueGetMetrics ()
157
+ ch <- prometheus .MustNewConstMetric (qc .pending , prometheus .GaugeValue , qm .pending )
158
+ ch <- prometheus .MustNewConstMetric (qc .pending_dep , prometheus .GaugeValue , qm .pending_dep )
159
+ ch <- prometheus .MustNewConstMetric (qc .running , prometheus .GaugeValue , qm .running )
160
+ ch <- prometheus .MustNewConstMetric (qc .suspended , prometheus .GaugeValue , qm .suspended )
161
+ ch <- prometheus .MustNewConstMetric (qc .cancelled , prometheus .GaugeValue , qm .cancelled )
162
+ ch <- prometheus .MustNewConstMetric (qc .completing , prometheus .GaugeValue , qm .completing )
163
+ ch <- prometheus .MustNewConstMetric (qc .completed , prometheus .GaugeValue , qm .completed )
164
+ ch <- prometheus .MustNewConstMetric (qc .configuring , prometheus .GaugeValue , qm .configuring )
165
+ ch <- prometheus .MustNewConstMetric (qc .failed , prometheus .GaugeValue , qm .failed )
166
+ ch <- prometheus .MustNewConstMetric (qc .timeout , prometheus .GaugeValue , qm .timeout )
167
+ ch <- prometheus .MustNewConstMetric (qc .preempted , prometheus .GaugeValue , qm .preempted )
168
+ ch <- prometheus .MustNewConstMetric (qc .node_fail , prometheus .GaugeValue , qm .node_fail )
169
+ }
0 commit comments