Skip to content

Commit e6662c4

Browse files
Merge pull request #479 from ClusterCockpit/dev
Dev
2 parents 195a1ed + 1ffcc5e commit e6662c4

File tree

27 files changed

+407
-265
lines changed

27 files changed

+407
-265
lines changed

internal/graph/schema.resolvers.go

Lines changed: 11 additions & 4 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

internal/graph/util.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -57,13 +57,13 @@ func (r *queryResolver) rooflineHeatmap(
5757

5858
jobdata, err := metricdispatch.LoadData(job, []string{"flops_any", "mem_bw"}, []schema.MetricScope{schema.MetricScopeNode}, ctx, 0)
5959
if err != nil {
60-
cclog.Errorf("Error while loading roofline metrics for job %d", job.ID)
60+
cclog.Warnf("Error while loading roofline metrics for job %d", job.ID)
6161
return nil, err
6262
}
6363

6464
flops_, membw_ := jobdata["flops_any"], jobdata["mem_bw"]
6565
if flops_ == nil && membw_ == nil {
66-
cclog.Infof("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
66+
cclog.Warnf("rooflineHeatmap(): 'flops_any' or 'mem_bw' missing for job %d", job.ID)
6767
continue
6868
// return nil, fmt.Errorf("GRAPH/UTIL > 'flops_any' or 'mem_bw' missing for job %d", job.ID)
6969
}

internal/metricdispatch/dataLoader.go

Lines changed: 26 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -97,8 +97,8 @@ func LoadData(job *schema.Job,
9797

9898
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
9999
if err != nil {
100-
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
101-
job.JobID, job.User, job.Project, err.Error())
100+
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
101+
job.Cluster, job.SubCluster, err.Error())
102102
return err, 0, 0
103103
}
104104

@@ -116,11 +116,11 @@ func LoadData(job *schema.Job,
116116
jd, err = ms.LoadData(job, metrics, scopes, ctx, resolution)
117117
if err != nil {
118118
if len(jd) != 0 {
119-
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s): %s",
120-
job.JobID, job.User, job.Project, err.Error())
119+
cclog.Warnf("partial error loading metrics from store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
120+
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
121121
} else {
122-
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
123-
job.JobID, job.User, job.Project, err.Error())
122+
cclog.Warnf("failed to load job data from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
123+
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
124124
return err, 0, 0
125125
}
126126
}
@@ -129,8 +129,8 @@ func LoadData(job *schema.Job,
129129
var jdTemp schema.JobData
130130
jdTemp, err = archive.GetHandle().LoadJobData(job)
131131
if err != nil {
132-
cclog.Errorf("failed to load job data from archive for job %d (user: %s, project: %s): %s",
133-
job.JobID, job.User, job.Project, err.Error())
132+
cclog.Warnf("failed to load job data from archive for job %d (user: %s, project: %s, cluster: %s-%s): %s",
133+
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
134134
return err, 0, 0
135135
}
136136

@@ -244,15 +244,15 @@ func LoadAverages(
244244

245245
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
246246
if err != nil {
247-
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
248-
job.JobID, job.User, job.Project, err.Error())
247+
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
248+
job.Cluster, job.SubCluster, err.Error())
249249
return err
250250
}
251251

252252
stats, err := ms.LoadStats(job, metrics, ctx)
253253
if err != nil {
254-
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
255-
job.JobID, job.User, job.Project, err.Error())
254+
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
255+
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
256256
return err
257257
}
258258

@@ -288,15 +288,15 @@ func LoadScopedJobStats(
288288

289289
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
290290
if err != nil {
291-
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
292-
job.JobID, job.User, job.Project, err.Error())
291+
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
292+
job.Cluster, job.SubCluster, err.Error())
293293
return nil, err
294294
}
295295

296296
scopedStats, err := ms.LoadScopedStats(job, metrics, scopes, ctx)
297297
if err != nil {
298-
cclog.Errorf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s): %s",
299-
job.JobID, job.User, job.Project, err.Error())
298+
cclog.Warnf("failed to load scoped statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
299+
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
300300
return nil, err
301301
}
302302

@@ -320,17 +320,17 @@ func LoadJobStats(
320320

321321
ms, err := GetMetricDataRepo(job.Cluster, job.SubCluster)
322322
if err != nil {
323-
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
324-
job.JobID, job.User, job.Project, err.Error())
323+
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
324+
job.Cluster, job.SubCluster, err.Error())
325325
return nil, err
326326
}
327327

328328
data := make(map[string]schema.MetricStatistics, len(metrics))
329329

330330
stats, err := ms.LoadStats(job, metrics, ctx)
331331
if err != nil {
332-
cclog.Errorf("failed to load statistics from metric store for job %d (user: %s, project: %s): %s",
333-
job.JobID, job.User, job.Project, err.Error())
332+
cclog.Warnf("failed to load statistics from metric store for job %d (user: %s, project: %s, cluster: %s-%s): %s",
333+
job.JobID, job.User, job.Project, job.Cluster, job.SubCluster, err.Error())
334334
return data, err
335335
}
336336

@@ -379,8 +379,8 @@ func LoadNodeData(
379379

380380
ms, err := GetMetricDataRepo(cluster, "")
381381
if err != nil {
382-
cclog.Errorf("failed to load node data from metric store: %s",
383-
err.Error())
382+
cclog.Errorf("failed to access metricDataRepo for cluster %s: %s",
383+
cluster, err.Error())
384384
return nil, err
385385
}
386386

@@ -389,7 +389,7 @@ func LoadNodeData(
389389
if len(data) != 0 {
390390
cclog.Warnf("partial error loading node data from metric store for cluster %s: %s", cluster, err.Error())
391391
} else {
392-
cclog.Errorf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
392+
cclog.Warnf("failed to load node data from metric store for cluster %s: %s", cluster, err.Error())
393393
return nil, err
394394
}
395395
}
@@ -423,8 +423,8 @@ func LoadNodeListData(
423423

424424
ms, err := GetMetricDataRepo(cluster, subCluster)
425425
if err != nil {
426-
cclog.Errorf("failed to load node data from metric store: %s",
427-
err.Error())
426+
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
427+
cluster, subCluster, err.Error())
428428
return nil, err
429429
}
430430

@@ -434,7 +434,7 @@ func LoadNodeListData(
434434
cclog.Warnf("partial error loading node list data from metric store for cluster %s, subcluster %s: %s",
435435
cluster, subCluster, err.Error())
436436
} else {
437-
cclog.Errorf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
437+
cclog.Warnf("failed to load node list data from metric store for cluster %s, subcluster %s: %s",
438438
cluster, subCluster, err.Error())
439439
return nil, err
440440
}

internal/metricstoreclient/cc-metric-store.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -329,7 +329,7 @@ func (ccms *CCMetricStore) LoadStats(
329329
metric := query.Metric
330330
data := res[0]
331331
if data.Error != nil {
332-
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
332+
cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
333333
continue
334334
}
335335

@@ -556,7 +556,7 @@ func (ccms *CCMetricStore) LoadNodeListData(
556556
) (map[string]schema.JobData, error) {
557557
queries, assignedScope, err := ccms.buildNodeQueries(cluster, subCluster, nodes, metrics, scopes, resolution)
558558
if err != nil {
559-
cclog.Errorf("Error while building node queries for Cluster %s, SubCLuster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
559+
cclog.Errorf("Error while building node queries for Cluster %s, SubCluster %s, Metrics %v, Scopes %v: %s", cluster, subCluster, metrics, scopes, err.Error())
560560
return nil, err
561561
}
562562

internal/repository/stats.go

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@
3838
// - All queries use prepared statements via stmtCache
3939
// - Complex aggregations use SQL for efficiency
4040
// - Histogram pre-initialization ensures consistent bin ranges
41-
// - Metric histogram queries limited to 500 jobs for running job analysis
41+
// - Metric histogram queries limited to 5000 jobs for running job analysis
4242

4343
package repository
4444

@@ -686,7 +686,7 @@ func (r *JobRepository) AddHistograms(
686686
// - Pre-initialized with zeros for consistent visualization
687687
//
688688
// Limitations:
689-
// - Running jobs: Limited to 500 jobs for performance
689+
// - Running jobs: Limited to 5000 jobs for performance
690690
// - Requires valid cluster configuration with metric peak values
691691
// - Uses footprint statistic (avg/max/min) configured per metric
692692
func (r *JobRepository) AddMetricHistograms(
@@ -995,12 +995,12 @@ func (r *JobRepository) jobsMetricStatisticsHistogram(
995995
// Returns slice of MetricHistoPoints, one per metric.
996996
//
997997
// Limitations:
998-
// - Maximum 500 jobs (returns nil if more jobs match)
998+
// - Maximum 5000 jobs (returns nil if more jobs match)
999999
// - Requires metric backend availability
10001000
// - Bins based on metric peak values from cluster configuration
10011001
//
10021002
// Algorithm:
1003-
// 1. Query first 501 jobs to check count limit
1003+
// 1. Query first 5001 jobs to check count limit
10041004
// 2. Load metric averages for all jobs via metricdispatch
10051005
// 3. For each metric, create bins based on peak value
10061006
// 4. Iterate averages and count jobs per bin
@@ -1011,13 +1011,13 @@ func (r *JobRepository) runningJobsMetricStatisticsHistogram(
10111011
bins *int,
10121012
) []*model.MetricHistoPoints {
10131013
// Get Jobs
1014-
jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 500 + 1}, nil)
1014+
jobs, err := r.QueryJobs(ctx, filters, &model.PageRequest{Page: 1, ItemsPerPage: 5000 + 1}, nil)
10151015
if err != nil {
10161016
cclog.Errorf("Error while querying jobs for footprint: %s", err)
10171017
return nil
10181018
}
1019-
if len(jobs) > 500 {
1020-
cclog.Errorf("too many jobs matched (max: %d)", 500)
1019+
if len(jobs) > 5000 {
1020+
cclog.Errorf("too many jobs matched (max: %d)", 5000)
10211021
return nil
10221022
}
10231023

internal/taskmanager/updateFootprintService.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -68,8 +68,8 @@ func RegisterFootprintWorker() {
6868

6969
ms, err := metricdispatch.GetMetricDataRepo(job.Cluster, job.SubCluster)
7070
if err != nil {
71-
cclog.Errorf("failed to load job data from metric store for job %d (user: %s, project: %s): %s",
72-
job.JobID, job.User, job.Project, err.Error())
71+
cclog.Errorf("failed to access metricDataRepo for cluster %s-%s: %s",
72+
job.Cluster, job.SubCluster, err.Error())
7373
continue
7474
}
7575

pkg/metricstore/api.go

Lines changed: 8 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,14 @@ import (
1313
"fmt"
1414
"math"
1515

16+
cclog "github.com/ClusterCockpit/cc-lib/v2/ccLogger"
1617
"github.com/ClusterCockpit/cc-lib/v2/schema"
1718
"github.com/ClusterCockpit/cc-lib/v2/util"
1819
)
1920

2021
var (
2122
// ErrNoHostOrMetric is returned when the metric store does not find the host or the metric
22-
ErrNoHostOrMetric error = errors.New("[METRICSTORE]> [METRICSTORE]> metric or host not found")
23+
ErrNoHostOrMetric error = errors.New("[METRICSTORE]> metric or host not found")
2324
// ErrInvalidTimeRange is returned when a query has 'from' >= 'to'
2425
ErrInvalidTimeRange = errors.New("[METRICSTORE]> invalid time range: 'from' must be before 'to'")
2526
// ErrEmptyCluster is returned when a query with ForAllNodes has no cluster specified
@@ -280,20 +281,16 @@ func FetchData(req APIQueryRequest) (*APIQueryResponse, error) {
280281

281282
data.Data, data.From, data.To, data.Resolution, err = ms.Read(sel, query.Metric, req.From, req.To, query.Resolution)
282283
if err != nil {
283-
// Check a special case where only the metric or host.
284-
// Dont send errors, instead just send empty array
285-
// where frontend already renders error for empty array.
286-
if err == ErrNoHostOrMetric {
287-
data.Data = make([]schema.Float, 0)
288-
data.From = req.From
289-
data.To = req.To
290-
data.Resolution = query.Resolution
291-
} else {
284+
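// Report the error in the result unless only the host or metric is missing; in that case just log a warning.
285+
// Either way, continue with the next query; an empty result for a metric is handled gracefully by the frontend.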
286+
if err != ErrNoHostOrMetric {
292287
msg := err.Error()
293288
data.Error = &msg
294289
res = append(res, data)
295-
continue
290+
} else {
291+
cclog.Warnf("failed to fetch '%s' from host '%s' (cluster: %s): %s", query.Metric, query.Hostname, req.Cluster, err.Error())
296292
}
293+
continue
297294
}
298295

299296
if req.WithStats {

pkg/metricstore/query.go

Lines changed: 24 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -104,6 +104,11 @@ func (ccms *InternalMetricStore) LoadData(
104104
var errors []string
105105
jobData := make(schema.JobData)
106106
for i, row := range resBody.Results {
107+
if len(row) == 0 {
108+
// No data found for this metric; FetchData has already logged a warning.
109+
continue
110+
}
111+
107112
query := req.Queries[i]
108113
metric := query.Metric
109114
scope := assignedScope[i]
@@ -229,7 +234,7 @@ func buildQueries(
229234
for _, metric := range metrics {
230235
mc := archive.GetMetricConfig(job.Cluster, metric)
231236
if mc == nil {
232-
cclog.Infof("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
237+
cclog.Warnf("metric '%s' is not specified for cluster '%s'", metric, job.Cluster)
233238
continue
234239
}
235240

@@ -535,11 +540,15 @@ func (ccms *InternalMetricStore) LoadStats(
535540

536541
stats := make(map[string]map[string]schema.MetricStatistics, len(metrics))
537542
for i, res := range resBody.Results {
543+
if len(res) == 0 {
544+
// No data found for this metric; FetchData has already logged a warning.
545+
continue
546+
}
538547
query := req.Queries[i]
539548
metric := query.Metric
540549
data := res[0]
541550
if data.Error != nil {
542-
cclog.Errorf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
551+
cclog.Warnf("fetching %s for node %s failed: %s", metric, query.Hostname, *data.Error)
543552
continue
544553
}
545554

@@ -609,6 +618,10 @@ func (ccms *InternalMetricStore) LoadScopedStats(
609618
scopedJobStats := make(schema.ScopedJobStats)
610619

611620
for i, row := range resBody.Results {
621+
if len(row) == 0 {
622+
// No data found for this metric; FetchData has already logged a warning.
623+
continue
624+
}
612625
query := req.Queries[i]
613626
metric := query.Metric
614627
scope := assignedScope[i]
@@ -717,6 +730,11 @@ func (ccms *InternalMetricStore) LoadNodeData(
717730
var errors []string
718731
data := make(map[string]map[string][]*schema.JobMetric)
719732
for i, res := range resBody.Results {
733+
if len(res) == 0 {
734+
// No data found for this metric; FetchData has already logged a warning.
735+
continue
736+
}
737+
720738
var query APIQuery
721739
if resBody.Queries != nil {
722740
query = resBody.Queries[i]
@@ -816,6 +834,10 @@ func (ccms *InternalMetricStore) LoadNodeListData(
816834
var errors []string
817835
data := make(map[string]schema.JobData)
818836
for i, row := range resBody.Results {
837+
if len(row) == 0 {
838+
// No data found for this metric; FetchData has already logged a warning.
839+
continue
840+
}
819841
var query APIQuery
820842
if resBody.Queries != nil {
821843
query = resBody.Queries[i]

0 commit comments

Comments
 (0)