Skip to content

Commit 08d40ff

Browse files
authored
Extend workflow job run metric (#167)
Signed-off-by: peterhalasz <[email protected]>
1 parent 6129f95 commit 08d40ff

File tree

4 files changed

+168
-103
lines changed

4 files changed

+168
-103
lines changed

internal/server/metrics.go

Lines changed: 13 additions & 12 deletions
Original file line number | Diff line number | Diff line change
@@ -8,21 +8,21 @@ var (
88
Help: "Time that a workflow job took to reach a given state.",
99
Buckets: prometheus.ExponentialBuckets(1, 1.4, 30),
1010
},
11-
[]string{"org", "repo", "state", "runner_group"},
11+
[]string{"org", "repo", "state", "runner_group", "workflow_name", "job_name"},
1212
)
1313

1414
workflowJobDurationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
1515
Name: "workflow_job_duration_seconds_total",
1616
Help: "The total duration of jobs.",
1717
},
18-
[]string{"org", "repo", "status", "conclusion", "runner_group"},
18+
[]string{"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"},
1919
)
2020

2121
workflowJobStatusCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
2222
Name: "workflow_job_status_count",
2323
Help: "Count of workflow job events.",
2424
},
25-
[]string{"org", "repo", "status", "conclusion", "runner_group"},
25+
[]string{"org", "repo", "status", "conclusion", "runner_group", "workflow_name", "job_name"},
2626
)
2727

2828
workflowRunHistogramVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@@ -83,9 +83,10 @@ func init() {
8383
}
8484

8585
type WorkflowObserver interface {
86-
ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64)
87-
CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string)
88-
CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64)
86+
ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64)
87+
CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string)
88+
CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64)
89+
8990
ObserveWorkflowRunDuration(org, repo, workflow, conclusion string, seconds float64)
9091
CountWorkflowRunStatus(org, repo, status, conclusion, workflow string)
9192
}
@@ -94,17 +95,17 @@ var _ WorkflowObserver = (*PrometheusObserver)(nil)
9495

9596
type PrometheusObserver struct{}
9697

97-
func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64) {
98-
workflowJobHistogramVec.WithLabelValues(org, repo, state, runnerGroup).
98+
func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runnerGroup, workflowName, jobName string, seconds float64) {
99+
workflowJobHistogramVec.WithLabelValues(org, repo, state, runnerGroup, workflowName, jobName).
99100
Observe(seconds)
100101
}
101102

102-
func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string) {
103-
workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Inc()
103+
func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName string) {
104+
workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup, workflowName, jobName).Inc()
104105
}
105106

106-
func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64) {
107-
workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Add(seconds)
107+
func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName string, seconds float64) {
108+
workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup, workflowName, jobName).Add(seconds)
108109
}
109110

110111
func (o *PrometheusObserver) ObserveWorkflowRunDuration(org, repo, workflowName, conclusion string, seconds float64) {

internal/server/server_test.go

Lines changed: 6 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -92,6 +92,8 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
9292
jobStartedAt := time.Unix(1650308740, 0)
9393
completedAt := jobStartedAt.Add(time.Duration(expectedDuration) * time.Second)
9494
runnerGroupName := "runner-group"
95+
workflowName := "Build and test"
96+
jobName := "Test"
9597

9698
event := github.WorkflowJobEvent{
9799
Action: github.String("completed"),
@@ -107,6 +109,8 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
107109
StartedAt: &github.Timestamp{Time: jobStartedAt},
108110
CompletedAt: &github.Timestamp{Time: completedAt},
109111
RunnerGroupName: &runnerGroupName,
112+
WorkflowName: &workflowName,
113+
Name: &jobName,
110114
},
111115
}
112116
req := testWebhookRequest(t, "http://localhost:8001/webhook", "workflow_job", event)
@@ -125,6 +129,6 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
125129

126130
payload, err := io.ReadAll(metricsRes.Body)
127131
require.NoError(t, err)
128-
assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",le="10.541350399999995"} 1`)
129-
assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",org="someone",repo="some-repo",runner_group="runner-group",status="completed"} 10`)
132+
assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{job_name="Test",org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",workflow_name="Build and test",le="10.541350399999995"} 1`)
133+
assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",job_name="Test",org="someone",repo="some-repo",runner_group="runner-group",status="completed",workflow_name="Build and test"} 10`)
130134
}

internal/server/workflow_metrics_exporter.go

Lines changed: 23 additions & 14 deletions
Original file line number | Diff line number | Diff line change
@@ -74,7 +74,13 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http
7474
return
7575
case "workflow_job":
7676
event := model.WorkflowJobEventFromJSON(io.NopCloser(bytes.NewBuffer(buf)))
77-
_ = level.Info(c.Logger).Log("msg", "got workflow_job event", "org", event.GetRepo().GetOwner().GetLogin(), "repo", event.GetRepo().GetName(), "runId", event.GetWorkflowJob().GetRunID(), "action", event.GetAction())
77+
_ = level.Info(c.Logger).Log("msg", "got workflow_job event",
78+
"org", event.GetRepo().GetOwner().GetLogin(),
79+
"repo", event.GetRepo().GetName(),
80+
"runId", event.GetWorkflowJob().GetRunID(),
81+
"action", event.GetAction(),
82+
"workflow_name", event.GetWorkflowJob().GetWorkflowName(),
83+
"job_name", event.GetWorkflowJob().GetName())
7884
go c.CollectWorkflowJobEvent(event)
7985
case "workflow_run":
8086
event := model.WorkflowRunEventFromJSON(io.NopCloser(bytes.NewBuffer(buf)))
@@ -93,37 +99,40 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http
9399
func (c *WorkflowMetricsExporter) CollectWorkflowJobEvent(event *github.WorkflowJobEvent) {
94100
repo := event.GetRepo().GetName()
95101
org := event.GetRepo().GetOwner().GetLogin()
96-
runnerGroup := event.WorkflowJob.GetRunnerGroupName()
97-
98102
action := event.GetAction()
99-
conclusion := event.GetWorkflowJob().GetConclusion()
100-
status := event.GetWorkflowJob().GetStatus()
103+
104+
workflowJob := event.GetWorkflowJob()
105+
runnerGroup := workflowJob.GetRunnerGroupName()
106+
conclusion := workflowJob.GetConclusion()
107+
status := workflowJob.GetStatus()
108+
workflowName := workflowJob.GetWorkflowName()
109+
jobName := workflowJob.GetName()
101110

102111
switch action {
103112
case "queued":
104113
// Do nothing.
105114
case "in_progress":
106115

107-
if len(event.WorkflowJob.Steps) == 0 {
116+
if len(workflowJob.Steps) == 0 {
108117
_ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of in_progress event as event has no steps")
109118
break
110119
}
111120

112-
firstStep := event.WorkflowJob.Steps[0]
113-
queuedSeconds := firstStep.StartedAt.Time.Sub(event.WorkflowJob.StartedAt.Time).Seconds()
114-
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, math.Max(0, queuedSeconds))
121+
firstStep := workflowJob.Steps[0]
122+
queuedSeconds := firstStep.StartedAt.Time.Sub(workflowJob.GetStartedAt().Time).Seconds()
123+
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, workflowName, jobName, math.Max(0, queuedSeconds))
115124
case "completed":
116-
if event.WorkflowJob.StartedAt == nil || event.WorkflowJob.CompletedAt == nil {
125+
if workflowJob.StartedAt == nil || workflowJob.CompletedAt == nil {
117126
_ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of completed event steps are missing timestamps")
118127
break
119128
}
120129

121-
jobSeconds := math.Max(0, event.WorkflowJob.GetCompletedAt().Time.Sub(event.WorkflowJob.GetStartedAt().Time).Seconds())
122-
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, jobSeconds)
123-
c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, jobSeconds)
130+
jobSeconds := math.Max(0, workflowJob.GetCompletedAt().Time.Sub(workflowJob.GetStartedAt().Time).Seconds())
131+
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, workflowName, jobName, jobSeconds)
132+
c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, workflowName, jobName, jobSeconds)
124133
}
125134

126-
c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup)
135+
c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup, workflowName, jobName)
127136
}
128137

129138
func (c *WorkflowMetricsExporter) CollectWorkflowRunEvent(event *github.WorkflowRunEvent) {

0 commit comments

Comments
 (0)