@@ -13,16 +13,15 @@ GNU General Public License for more details.
1313You should have received a copy of the GNU General Public License
1414along with this program. If not, see <http://www.gnu.org/licenses/>. */
1515
16-
1716package main
1817
1918import (
20- "os/exec"
2119 "encoding/json"
22- "log"
2320 "fmt"
24- "io/ioutil"
2521 "github.com/prometheus/client_golang/prometheus"
22+ "github.com/prometheus/common/log"
23+ "io/ioutil"
24+ "os/exec"
2625)
2726
2827func JobsData () []byte {
@@ -42,34 +41,38 @@ func JobsData() []byte {
4241}
4342
4443type SqueueResult struct {
45- jobs []SqueueJob
44+ Jobs []SqueueJob `json:"jobs"`
4645}
4746
4847type SqueueJob struct {
49- account string
50- id int `json:"job_id"`
51- name string
52- resources SqueueJobResources `json:"job_resources"`
53- state []string `json:"job_state"`
54- nodes string
55- partition string
56- groupId int `json:"group_id"`
57- groupName string `json:"group_name"`
58- userId int `json:"user_id"`
59- userName string `json:"user_name"`
60- memoryPerNode SqeueueMemoryPerNode `json:"memory_per_node"`
48+ Account string `json:"acccount"`
49+ Id int `json:"job_id"`
50+ Name string `json:"name"`
51+ Resources SqueueJobResources `json:"job_resources"`
52+ State []string `json:"job_state"`
53+ Nodes string `json:"nodes"`
54+ Partition string `json:"partition"`
55+ GroupId int `json:"group_id"`
56+ GroupName string `json:"group_name"`
57+ UserId int `json:"user_id"`
58+ UserName string `json:"user_name"`
6159}
6260
6361type SqueueJobResources struct {
64- cpus int
62+ Cpus int `json:"cpus"`
63+ Nodes struct {
64+ Allocation []SqueueJobResourcesAllocation `json:"allocation"`
65+ } `json:"nodes"`
6566}
6667
// SqueueJobResourcesAllocation is one element of the "allocation" array
// nested under a job's resources.
type SqueueJobResourcesAllocation struct {
	// Memory wraps the "memory" object; only its "allocated" value is decoded.
	Memory struct {
		Allocated int `json:"allocated"`
	} `json:"memory"`
}
7073
7174func InstrumentJobs () SqueueResult {
72- jobs := JobsData ();
75+ jobs := JobsData ()
7376 var result SqueueResult
7477 if err := json .Unmarshal (jobs , & result ); err != nil {
7578 log .Fatal (err )
@@ -82,21 +85,21 @@ type JobsCollector struct {
8285}
8386
8487func NewJobsCollector () * JobsCollector {
85- labels := []string {
88+ labels := []string {
8689 "account" ,
8790 "job_id" ,
8891 "name" ,
8992 "cpus" ,
90- "memory_per_node " ,
93+ "memory " ,
9194 "state" ,
9295 "nodes" ,
9396 "partition" ,
9497 "group_id" ,
9598 "group_name" ,
9699 "user_id" ,
97100 "user_name" ,
98- };
99- return & JobsCollector {
101+ }
102+ return & JobsCollector {
100103 jobs : prometheus .NewDesc ("slurm_jobs" , "Description of running Slurm jobs" , labels , nil ),
101104 }
102105}
@@ -107,20 +110,24 @@ func (jc *JobsCollector) Describe(ch chan<- *prometheus.Desc) {
107110
108111func (jc * JobsCollector ) Collect (ch chan <- prometheus.Metric ) {
109112 jm := InstrumentJobs ()
110- for _ , job := range jm .jobs {
113+ for _ , job := range jm .Jobs {
114+ allocatedMemory := 0
115+ for _ , allocation := range job .Resources .Nodes .Allocation {
116+ allocatedMemory += allocation .Memory .Allocated
117+ }
111118 labels := []string {
112- job .account ,
113- fmt .Sprintf ("%s " , job .id ),
114- job .name ,
115- fmt .Sprintf ("%s " , job .resources . cpus ),
116- fmt .Sprintf ("%s " , job . memoryPerNode . number ),
117- fmt .Sprintf ("%s" , job .state ),
118- job .nodes ,
119- job .partition ,
120- fmt .Sprintf ("%d" , job .groupId ),
121- job .groupName ,
122- fmt .Sprintf ("%d" , job .userId ),
123- job .userName ,
119+ job .Account ,
120+ fmt .Sprintf ("%d " , job .Id ),
121+ job .Name ,
122+ fmt .Sprintf ("%d " , job .Resources . Cpus ),
123+ fmt .Sprintf ("%d " , allocatedMemory ),
124+ fmt .Sprintf ("%s" , job .State ),
125+ job .Nodes ,
126+ job .Partition ,
127+ fmt .Sprintf ("%d" , job .GroupId ),
128+ job .GroupName ,
129+ fmt .Sprintf ("%d" , job .UserId ),
130+ job .UserName ,
124131 }
125132 ch <- prometheus .MustNewConstMetric (jc .jobs , prometheus .GaugeValue , 1.0 , labels ... )
126133 }
0 commit comments