Skip to content

Commit 4d03c86

Browse files
add jobs collector
1 parent 895c7c6 commit 4d03c86

File tree

2 files changed

+128
-0
lines changed

2 files changed

+128
-0
lines changed

jobs.go

Lines changed: 127 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,127 @@
1+
/* Copyright 2025 Scalableminds
2+
3+
This program is free software: you can redistribute it and/or modify
4+
it under the terms of the GNU General Public License as published by
5+
the Free Software Foundation, either version 3 of the License, or
6+
(at your option) any later version.
7+
8+
This program is distributed in the hope that it will be useful,
9+
but WITHOUT ANY WARRANTY; without even the implied warranty of
10+
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
11+
GNU General Public License for more details.
12+
13+
You should have received a copy of the GNU General Public License
14+
along with this program. If not, see <http://www.gnu.org/licenses/>. */
15+
16+
17+
package main
18+
19+
import (
20+
"os/exec"
21+
"encoding/json"
22+
"log"
23+
"fmt"
24+
"io/ioutil"
25+
"github.com/prometheus/client_golang/prometheus"
26+
)
27+
28+
// JobsData runs `squeue --json` and returns everything the command wrote to
// its standard output.
//
// Any failure (creating the pipe, starting the command, reading its output,
// or the command exiting unsuccessfully) terminates the process via log.Fatal.
func JobsData() []byte {
	cmd := exec.Command("squeue", "--json")
	stdout, err := cmd.StdoutPipe()
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Start(); err != nil {
		log.Fatal(err)
	}
	// The pipe has to be read to the end before Wait is called, because Wait
	// closes it. A read error is fatal instead of being ignored, so truncated
	// output is never handed to the JSON decoder.
	out, err := ioutil.ReadAll(stdout)
	if err != nil {
		log.Fatal(err)
	}
	if err := cmd.Wait(); err != nil {
		log.Fatal(err)
	}
	return out
}
43+
44+
type SqueueResult struct {
45+
jobs []SqueueJob
46+
}
47+
48+
type SqueueJob struct {
49+
account string
50+
id int `json:"job_id"`
51+
name string
52+
resources SqueueJobResources `json:"job_resources"`
53+
state []string `json:"job_state"`
54+
nodes string
55+
partition string
56+
groupId int `json:"group_id"`
57+
groupName string `json:"group_name"`
58+
userId int `json:"user_id"`
59+
userName string `json:"user_name"`
60+
memoryPerNode SqeueueMemoryPerNode `json:"memory_per_node"`
61+
}
62+
63+
// SqueueJobResources holds the part of a job's "job_resources" object that is
// used as a metric label.
type SqueueJobResources struct {
	cpus int // "cpus"
}

// UnmarshalJSON fills the unexported cpus field from the "cpus" key.
//
// encoding/json never sets unexported struct fields, so the object is decoded
// into a shadow struct with an exported field and copied over. A JSON null
// leaves cpus at zero.
func (r *SqueueJobResources) UnmarshalJSON(data []byte) error {
	var raw struct {
		Cpus int `json:"cpus"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}
	r.cpus = raw.Cpus
	return nil
}
66+
67+
// SqeueueMemoryPerNode holds the part of a job's "memory_per_node" object
// that is used as a metric label.
type SqeueueMemoryPerNode struct {
	number int // "number"
}

// UnmarshalJSON fills the unexported number field from the "number" key.
//
// encoding/json never sets unexported struct fields, so the object is decoded
// into a shadow struct with an exported field and copied over. A JSON null
// leaves number at zero.
func (m *SqeueueMemoryPerNode) UnmarshalJSON(data []byte) error {
	var raw struct {
		Number int `json:"number"`
	}
	if err := json.Unmarshal(data, &raw); err != nil {
		return err
	}
	m.number = raw.Number
	return nil
}
70+
71+
func InstrumentJobs() SqueueResult {
72+
jobs := JobsData();
73+
var result SqueueResult
74+
if err := json.Unmarshal(jobs, &result); err != nil {
75+
log.Fatal(err)
76+
}
77+
return result
78+
}
79+
80+
type JobsCollector struct {
81+
jobs *prometheus.Desc
82+
}
83+
84+
func NewJobsCollector() *JobsCollector {
85+
labels := []string {
86+
"account",
87+
"job_id",
88+
"name",
89+
"cpus",
90+
"memory_per_node",
91+
"state",
92+
"nodes",
93+
"partition",
94+
"group_id",
95+
"group_name",
96+
"user_id",
97+
"user_name",
98+
};
99+
return &JobsCollector {
100+
jobs: prometheus.NewDesc("slurm_jobs", "Description of running Slurm jobs", labels, nil),
101+
}
102+
}
103+
104+
func (jc *JobsCollector) Describe(ch chan<- *prometheus.Desc) {
105+
ch <- jc.jobs
106+
}
107+
108+
func (jc *JobsCollector) Collect(ch chan<- prometheus.Metric) {
109+
jm := InstrumentJobs()
110+
for _, job := range jm.jobs {
111+
labels := []string{
112+
job.account,
113+
fmt.Sprintf("%s", job.id),
114+
job.name,
115+
fmt.Sprintf("%s", job.resources.cpus),
116+
fmt.Sprintf("%s", job.memoryPerNode.number),
117+
fmt.Sprintf("%s", job.state),
118+
job.nodes,
119+
job.partition,
120+
fmt.Sprintf("%d", job.groupId),
121+
job.groupName,
122+
fmt.Sprintf("%d", job.userId),
123+
job.userName,
124+
}
125+
ch <- prometheus.MustNewConstMetric(jc.jobs, prometheus.GaugeValue, 1.0, labels...)
126+
}
127+
}

main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ func init() {
3434
prometheus.MustRegister(NewSchedulerCollector()) // from scheduler.go
3535
prometheus.MustRegister(NewFairShareCollector()) // from sshare.go
3636
prometheus.MustRegister(NewUsersCollector()) // from users.go
37+
prometheus.MustRegister(NewJobsCollector()) // from jobs.go
3738
}
3839

3940
var listenAddress = flag.String(

0 commit comments

Comments
 (0)