Skip to content

Commit 99d8fde

Browse files
committed
Add a new metric for pending jobs by partition
1 parent 7f340d6 commit 99d8fde

File tree

1 file changed

+38
-4
lines changed

1 file changed

+38
-4
lines changed

partitions.go

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,23 +40,40 @@ func PartitionsData() []byte {
4040
return out
4141
}
4242

43+
// PartitionsPendingJobsData runs squeue and returns its raw standard output.
//
// The command is invoked as
//
//	squeue -a -r -h -o%P --states=PENDING
//
// which, per the squeue manual, prints one line per pending job (job arrays
// expanded to one line per element by -r, header suppressed by -h) holding
// only the partition field (%P) of that job, across all partitions (-a).
//
// Any failure to create the pipe, start the command, read its output, or a
// non-successful exit terminates the process through log.Fatal.
func PartitionsPendingJobsData() []byte {
	cmd := exec.Command("squeue", "-a", "-r", "-h", "-o%P", "--states=PENDING")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// The pipe must be drained before Wait, which closes it. A read error
	// is reported instead of being dropped, so a truncated job list is
	// never counted as if it were complete.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}
58+
4359
// PartitionMetrics holds the per-partition values gathered by
// ParsePartitionsMetrics and exported by PartitionsCollector.
//
// Field order is significant: ParsePartitionsMetrics initialises entries
// with an unkeyed literal (&PartitionMetrics{0,0,0,0,0}), so reordering or
// inserting fields requires updating that literal as well.
type PartitionMetrics struct {
4460
allocated float64 // exported as "Allocated CPUs for partition"
4561
idle float64 // exported as "Idle CPUs for partition"
4662
other float64 // exported as "Other CPUs for partition"
63+
pending float64 // pending jobs; incremented once per squeue output line naming this partition
4764
total float64 // exported as "Total CPUs for partition"
4865
}
4966

50-
func ParsePartitionsMetrics(input []byte) map[string]*PartitionMetrics {
67+
func ParsePartitionsMetrics() map[string]*PartitionMetrics {
5168
partitions := make(map[string]*PartitionMetrics)
52-
lines := strings.Split(string(input), "\n")
69+
lines := strings.Split(string(PartitionsData()), "\n")
5370
for _, line := range lines {
5471
if strings.Contains(line,",") {
5572
// name of a partition
5673
partition := strings.Split(line,",")[0]
5774
_,key := partitions[partition]
5875
if !key {
59-
partitions[partition] = &PartitionMetrics{0,0,0,0}
76+
partitions[partition] = &PartitionMetrics{0,0,0,0,0}
6077
}
6178
states := strings.Split(line,",")[1]
6279
allocated,_ := strconv.ParseFloat(strings.Split(states,"/")[0],64)
@@ -69,13 +86,25 @@ func ParsePartitionsMetrics(input []byte) map[string]*PartitionMetrics {
6986
partitions[partition].total = total
7087
}
7188
}
89+
// get list of pending jobs by partition name
90+
list := strings.Split(string(PartitionsPendingJobsData()),"\n")
91+
for _,partition := range list {
92+
// accumulate the number of pending jobs
93+
_,key := partitions[partition]
94+
if key {
95+
partitions[partition].pending += 1
96+
}
97+
}
98+
99+
72100
return partitions
73101
}
74102

75103
// PartitionsCollector holds one Prometheus descriptor per partition-level
// metric. The descriptors are created in NewPartitionsCollector and sent on
// the channel passed to Describe; Collect emits a gauge for each descriptor
// whose value for a partition is greater than zero.
type PartitionsCollector struct {
76104
allocated *prometheus.Desc // slurm_partition_cpus_allocated
77105
idle *prometheus.Desc // slurm_partition_cpus_idle
78106
other *prometheus.Desc // slurm_partition_cpus_other
107+
pending *prometheus.Desc // slurm_partition_jobs_pending
79108
total *prometheus.Desc // slurm_partition_cpus_total
80109
}
81110

@@ -85,6 +114,7 @@ func NewPartitionsCollector() *PartitionsCollector {
85114
allocated: prometheus.NewDesc("slurm_partition_cpus_allocated", "Allocated CPUs for partition", labels,nil),
86115
idle: prometheus.NewDesc("slurm_partition_cpus_idle", "Idle CPUs for partition", labels,nil),
87116
other: prometheus.NewDesc("slurm_partition_cpus_other", "Other CPUs for partition", labels,nil),
117+
pending: prometheus.NewDesc("slurm_partition_jobs_pending", "Pending jobs for partition", labels,nil),
88118
total: prometheus.NewDesc("slurm_partition_cpus_total", "Total CPUs for partition", labels,nil),
89119
}
90120
}
@@ -93,11 +123,12 @@ func (pc *PartitionsCollector) Describe(ch chan<- *prometheus.Desc) {
93123
ch <- pc.allocated
94124
ch <- pc.idle
95125
ch <- pc.other
126+
ch <- pc.pending
96127
ch <- pc.total
97128
}
98129

99130
func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) {
100-
pm := ParsePartitionsMetrics(PartitionsData())
131+
pm := ParsePartitionsMetrics()
101132
for p := range pm {
102133
if pm[p].allocated > 0 {
103134
ch <- prometheus.MustNewConstMetric(pc.allocated, prometheus.GaugeValue, pm[p].allocated, p)
@@ -108,6 +139,9 @@ func (pc *PartitionsCollector) Collect(ch chan<- prometheus.Metric) {
108139
if pm[p].other > 0 {
109140
ch <- prometheus.MustNewConstMetric(pc.other, prometheus.GaugeValue, pm[p].other, p)
110141
}
142+
if pm[p].pending > 0 {
143+
ch <- prometheus.MustNewConstMetric(pc.pending, prometheus.GaugeValue, pm[p].pending, p)
144+
}
111145
if pm[p].total > 0 {
112146
ch <- prometheus.MustNewConstMetric(pc.total, prometheus.GaugeValue, pm[p].total, p)
113147
}

0 commit comments

Comments
 (0)