Skip to content

Commit 00a4137

Browse files
committed
Add monitoring healthstate support in nodestate API.
1 parent e9cd6b4 commit 00a4137

File tree

3 files changed

+611
-2
lines changed

3 files changed

+611
-2
lines changed

internal/api/node.go

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,11 +7,14 @@ package api
77

88
import (
99
"fmt"
10+
"maps"
1011
"net/http"
1112
"strings"
1213
"time"
1314

1415
"github.com/ClusterCockpit/cc-backend/internal/repository"
16+
"github.com/ClusterCockpit/cc-backend/pkg/archive"
17+
"github.com/ClusterCockpit/cc-backend/pkg/metricstore"
1518
"github.com/ClusterCockpit/cc-lib/v2/schema"
1619
)
1720

@@ -20,6 +23,15 @@ type UpdateNodeStatesRequest struct {
2023
Cluster string `json:"cluster" example:"fritz"`
2124
}
2225

26+
// metricListToNames converts a map of metric configurations to a list of metric names
27+
func metricListToNames(metricList map[string]*schema.Metric) []string {
28+
names := make([]string, 0, len(metricList))
29+
for name := range metricList {
30+
names = append(names, name)
31+
}
32+
return names
33+
}
34+
2335
// determineState assumes that at most one of the states it checks for is present per node
2436
func determineState(states []string) schema.SchedulerState {
2537
for _, state := range states {
@@ -62,18 +74,42 @@ func (api *RestAPI) updateNodeStates(rw http.ResponseWriter, r *http.Request) {
6274
http.StatusBadRequest, rw)
6375
return
6476
}
65-
repo := repository.GetNodeRepository()
6677
requestReceived := time.Now().Unix()
78+
repo := repository.GetNodeRepository()
79+
ms := metricstore.GetMemoryStore()
80+
81+
m := make(map[string][]string)
82+
healthStates := make(map[string]metricstore.NodeHealthState)
83+
84+
for _, node := range req.Nodes {
85+
if sc, err := archive.GetSubClusterByNode(req.Cluster, node.Hostname); err == nil {
86+
m[sc] = append(m[sc], node.Hostname)
87+
}
88+
}
89+
90+
for sc, nl := range m {
91+
if sc != "" {
92+
metricList := archive.GetMetricConfigSubCluster(req.Cluster, sc)
93+
metricNames := metricListToNames(metricList)
94+
if states, err := ms.HealthCheckAlt(req.Cluster, nl, metricNames); err == nil {
95+
maps.Copy(healthStates, states)
96+
}
97+
}
98+
}
6799

68100
for _, node := range req.Nodes {
69101
state := determineState(node.States)
102+
healthState := schema.MonitoringStateFull
103+
if hs, ok := healthStates[node.Hostname]; ok {
104+
healthState = hs.Status
105+
}
70106
nodeState := schema.NodeStateDB{
71107
TimeStamp: requestReceived,
72108
NodeState: state,
73109
CpusAllocated: node.CpusAllocated,
74110
MemoryAllocated: node.MemoryAllocated,
75111
GpusAllocated: node.GpusAllocated,
76-
HealthState: schema.MonitoringStateFull,
112+
HealthState: healthState,
77113
JobsRunning: node.JobsRunning,
78114
}
79115

0 commit comments

Comments
 (0)