@@ -40,23 +40,40 @@ func PartitionsData() []byte {
4040 return out
4141}
4242
// PartitionsPendingJobsData runs
//
//	squeue -a -r -h -o%P --states=PENDING
//
// and returns everything the command wrote to standard output.
//
// The caller in this file splits the result on newlines and treats each
// line as the partition name of one pending job.
// NOTE(review): the exact meaning of the squeue flags is defined by Slurm,
// not by this file — confirm against the squeue man page.
//
// Any failure (creating the pipe, starting the command, reading its output,
// or a non-zero exit status) terminates the process via log.Fatal.
func PartitionsPendingJobsData() []byte {
	cmd := exec.Command("squeue", "-a", "-r", "-h", "-o%P", "--states=PENDING")

	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}

	// The pipe must be fully read before Wait is called, because Wait
	// closes it once the command exits.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		// A failed read would otherwise yield truncated output and
		// therefore silently wrong pending-job counts.
		log.Fatal(err)
	}

	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}
58+
// PartitionMetrics holds the numeric values collected for a single
// partition. Field order matters: the struct is initialised with a
// positional literal elsewhere in this file.
type PartitionMetrics struct {
	// CPU figures parsed from the partition state string.
	allocated, idle, other float64

	// Number of pending jobs counted for the partition.
	pending float64

	// Total CPU figure parsed from the partition state string.
	total float64
}
4966
50- func ParsePartitionsMetrics (input [] byte ) map [string ]* PartitionMetrics {
67+ func ParsePartitionsMetrics () map [string ]* PartitionMetrics {
5168 partitions := make (map [string ]* PartitionMetrics )
52- lines := strings .Split (string (input ), "\n " )
69+ lines := strings .Split (string (PartitionsData () ), "\n " )
5370 for _ , line := range lines {
5471 if strings .Contains (line ,"," ) {
5572 // name of a partition
5673 partition := strings .Split (line ,"," )[0 ]
5774 _ ,key := partitions [partition ]
5875 if ! key {
59- partitions [partition ] = & PartitionMetrics {0 ,0 ,0 ,0 }
76+ partitions [partition ] = & PartitionMetrics {0 ,0 ,0 ,0 , 0 }
6077 }
6178 states := strings .Split (line ,"," )[1 ]
6279 allocated ,_ := strconv .ParseFloat (strings .Split (states ,"/" )[0 ],64 )
@@ -69,13 +86,25 @@ func ParsePartitionsMetrics(input []byte) map[string]*PartitionMetrics {
6986 partitions [partition ].total = total
7087 }
7188 }
89+ // get list of pending jobs by partition name
90+ list := strings .Split (string (PartitionsPendingJobsData ()),"\n " )
91+ for _ ,partition := range list {
92+ // accumulate the number of pending jobs
93+ _ ,key := partitions [partition ]
94+ if key {
95+ partitions [partition ].pending += 1
96+ }
97+ }
98+
99+
72100 return partitions
73101}
74102
75103type PartitionsCollector struct {
76104 allocated * prometheus.Desc
77105 idle * prometheus.Desc
78106 other * prometheus.Desc
107+ pending * prometheus.Desc
79108 total * prometheus.Desc
80109}
81110
@@ -85,6 +114,7 @@ func NewPartitionsCollector() *PartitionsCollector {
85114 allocated : prometheus .NewDesc ("slurm_partition_cpus_allocated" , "Allocated CPUs for partition" , labels ,nil ),
86115 idle : prometheus .NewDesc ("slurm_partition_cpus_idle" , "Idle CPUs for partition" , labels ,nil ),
87116 other : prometheus .NewDesc ("slurm_partition_cpus_other" , "Other CPUs for partition" , labels ,nil ),
117+ pending : prometheus .NewDesc ("slurm_partition_jobs_pending" , "Pending jobs for partition" , labels ,nil ),
88118 total : prometheus .NewDesc ("slurm_partition_cpus_total" , "Total CPUs for partition" , labels ,nil ),
89119 }
90120}
@@ -93,11 +123,12 @@ func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) {
93123 ch <- pc .allocated
94124 ch <- pc .idle
95125 ch <- pc .other
126+ ch <- pc .pending
96127 ch <- pc .total
97128}
98129
99130func (pc * PartitionsCollector ) Collect (ch chan <- prometheus.Metric ) {
100- pm := ParsePartitionsMetrics (PartitionsData () )
131+ pm := ParsePartitionsMetrics ()
101132 for p := range pm {
102133 if pm [p ].allocated > 0 {
103134 ch <- prometheus .MustNewConstMetric (pc .allocated , prometheus .GaugeValue , pm [p ].allocated , p )
@@ -108,6 +139,9 @@ func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) {
108139 if pm [p ].other > 0 {
109140 ch <- prometheus .MustNewConstMetric (pc .other , prometheus .GaugeValue , pm [p ].other , p )
110141 }
142+ if pm [p ].pending > 0 {
143+ ch <- prometheus .MustNewConstMetric (pc .pending , prometheus .GaugeValue , pm [p ].pending , p )
144+ }
111145 if pm [p ].total > 0 {
112146 ch <- prometheus .MustNewConstMetric (pc .total , prometheus .GaugeValue , pm [p ].total , p )
113147 }
0 commit comments