Skip to content

Commit c48907d

Browse files
committed
Fix starter_rescued_runs metric and rename from recovered to rescued
- Rename CountRecovered to CountRescued to match naming convention
- Rename starter_recovered_runs metric to starter_rescued_runs
- Add logic to increment counter when rescue jobs are enqueued
- Fix issue where metric was always zero
1 parent 4964293 commit c48907d

File tree

2 files changed

+17
-10
lines changed

2 files changed

+17
-10
lines changed

pkg/metric/scrape_memory.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -33,9 +33,9 @@ var (
3333
"waiting queue in starter",
3434
[]string{"starter"}, nil,
3535
)
36-
memoryStarterRecoveredRuns = prometheus.NewDesc(
37-
prometheus.BuildFQName(namespace, memoryName, "starter_recovered_runs"),
38-
"recovered runs in starter",
36+
memoryStarterRescuedRuns = prometheus.NewDesc(
37+
prometheus.BuildFQName(namespace, memoryName, "starter_rescued_runs"),
38+
"rescued runs in starter",
3939
[]string{"starter", "target"}, nil,
4040
)
4141
memoryGitHubRateLimitRemaining = prometheus.NewDesc(
@@ -125,9 +125,9 @@ func scrapeStarterValues(ch chan<- prometheus.Metric) error {
125125
ch <- prometheus.MustNewConstMetric(
126126
memoryStarterQueueWaiting, prometheus.GaugeValue, float64(countWaiting), labelStarter)
127127

128-
starter.CountRecovered.Range(func(key, value interface{}) bool {
128+
starter.CountRescued.Range(func(key, value interface{}) bool {
129129
ch <- prometheus.MustNewConstMetric(
130-
memoryStarterRecoveredRuns, prometheus.GaugeValue, float64(value.(int)), labelStarter, key.(string),
130+
memoryStarterRescuedRuns, prometheus.GaugeValue, float64(value.(int)), labelStarter, key.(string),
131131
)
132132
return true
133133
})

pkg/starter/starter.go

Lines changed: 12 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ var (
3535
// CountWaiting is count of waiting job
3636
CountWaiting atomic.Int64
3737

38-
// CountRecovered is count of recovered job per target
39-
CountRecovered = sync.Map{}
38+
// CountRescued is count of rescued job per target
39+
CountRescued = sync.Map{}
4040

4141
inProgress = sync.Map{}
4242

@@ -201,7 +201,7 @@ func (s *Starter) ProcessJob(ctx context.Context, job datastore.Job) error {
201201
return fmt.Errorf("failed to retrieve relational target: (target ID: %s, job ID: %s): %w", job.TargetID, job.UUID, err)
202202
}
203203

204-
CountRecovered.LoadOrStore(target.Scope, 0)
204+
CountRescued.LoadOrStore(target.Scope, 0)
205205

206206
cctx, cancel := context.WithTimeout(ctx, runner.MustRunningTime)
207207
defer cancel()
@@ -445,7 +445,7 @@ func enqueueRescueRun(ctx context.Context, pendingRun datastore.PendingWorkflowR
445445
// Get full installation data from cache
446446
installation, err := gh.GetInstallationByID(ctx, installationID)
447447
if err != nil {
448-
logger.Logf(false, "failed to get installation from cache (installationID: %d), using minimal data: %+v", installationID, err)
448+
logger.Logf(false, "failed to get installation from cache (installationID: %d), using minimal data: %+v", installationID, err)
449449
// Fallback to minimal installation data
450450
installation = &github.Installation{
451451
ID: &installationID,
@@ -461,7 +461,7 @@ func enqueueRescueRun(ctx context.Context, pendingRun datastore.PendingWorkflowR
461461
Name: owner.Name,
462462
}
463463
}
464-
464+
465465
event := &github.WorkflowJobEvent{
466466
WorkflowJob: job,
467467
Action: github.String("queued"),
@@ -524,5 +524,12 @@ func enqueueRescueJob(ctx context.Context, workflowJob *github.WorkflowJobEvent,
524524
return fmt.Errorf("failed to enqueue job: %w", err)
525525
}
526526

527+
// Increment rescued runs counter
528+
if count, ok := CountRescued.Load(target.Scope); ok {
529+
CountRescued.Store(target.Scope, count.(int)+1)
530+
} else {
531+
CountRescued.Store(target.Scope, 1)
532+
}
533+
527534
return nil
528535
}

0 commit comments

Comments (0)