Skip to content

Commit 968faa2

Browse files
Adds workflow_job_duration_seconds and other small improvements (#38)
* adds .idea dir to .gitignore Signed-off-by: Jan Akerman <[email protected]> * handles body read error Signed-off-by: Jan Akerman <[email protected]> * adds guard for workflow job queued event handling... where no steps are present Signed-off-by: Jan Akerman <[email protected]> * adds debug logging of event payload Signed-off-by: Jan Akerman <[email protected]> * updates binary name from make build The Dockerfile expects the updated name. Aligning the two makes building local images easier. Signed-off-by: Jan Akerman <[email protected]> * calculates workflow_job duration from job timestamps Instead of using the step timestamps which are not always present. This is an improvement for two reasons * We don't need logic as to which event type is needed, just that it has the required timestamp fields. * GitHub sometimes sends strange events that are marked as completed but with partially completed steps. * Also fixes issue whereby a lack of timestamp results in dropping the count metric for that job due to incorrect use of return instead of break. Signed-off-by: Jan Akerman <[email protected]> * adds status and conclusion labels to coun metrics i.e workflow_job_status_count and workflow_status_count. Previously, for completed status webhooks, conclusion was being used as the status. However, having both of these labels allows you to query for all completed jobs (e.g status="completed") whilst also providing the granularity on the reasons in conclusion. Generally, it's making less assumptions about the webhook contents and just displaying what GitHub sends - so any future changes against current assumptions will show in the metrics labels. Signed-off-by: Jan Akerman <[email protected]> * adds workflow_job_duration_seconds_total * Metric is a counter for total job duration. Useful for more accurate usage as workflow_job_duration_seconds is a histogram that uses buckets, thus loosing this accuracy. * Updates server_test with additional assertion * Closes body correctly * Removes workflow job skipped tests as they are no longer handled differently than any other event with completed status. Signed-off-by: Jan Akerman <[email protected]> Signed-off-by: Jan Akerman <[email protected]> Co-authored-by: Carlos Tadeu Panato Junior <[email protected]>
1 parent 993dd1e commit 968faa2

File tree

7 files changed

+236
-77
lines changed

7 files changed

+236
-77
lines changed

.gitignore

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
github_actions_exporter
1+
github-actions-exporter
22
*.env
33
*.tar.gz
44

@@ -18,3 +18,4 @@ github_actions_exporter
1818
# VSCode
1919
.vscode/
2020
dist/
21+
.idea/

Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,4 +12,4 @@ test:
1212

1313
.PHONY: build
1414
build:
15-
go build .
15+
go build -o github-actions-exporter .

doc/webhooks.md

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
## GitHub Webhooks
2+
3+
## [Workflow Run](https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#workflow_run)
4+
5+
* Webhook field `action` is `requested` or `completed`.
6+
* `requested` has `status=queued` and `conclusion=null`.
7+
* `completed` can be `failure`, `success`, `cancelled`.
8+
9+
## [Workflow Job](https://docs.github.com/en/developers/webhooks-and-events/webhooks/webhook-events-and-payloads#workflow_job)
10+
11+
* Webhook field `action` and `workflow_job[status]` are equal in webhook payloads and include `queued`, `in_progress`, `completed`
12+
* `workflow_job[conclusion]` includes but is not limited to `in_progress`, `queued`, `success`, `failure`, `cancelled`, `skipped`.
13+
* `workflow_job[started_at]` and `workflow_job[completed_at]` are present in all `conclusion`s that are associated with
14+
the `completed` `status` i.e `failure`, `success`, `cancelled`, `skipped`.
15+
16+
17+

internal/server/metrics.go

Lines changed: 22 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -11,11 +11,18 @@ var (
1111
[]string{"org", "repo", "state", "runner_group"},
1212
)
1313

14+
workflowJobDurationCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
15+
Name: "workflow_job_duration_seconds_total",
16+
Help: "The total duration of jobs.",
17+
},
18+
[]string{"org", "repo", "status", "conclusion", "runner_group"},
19+
)
20+
1421
workflowJobStatusCounter = prometheus.NewCounterVec(prometheus.CounterOpts{
1522
Name: "workflow_job_status_count",
16-
Help: "Count of the occurrences of different workflow job states.",
23+
Help: "Count of workflow job events.",
1724
},
18-
[]string{"org", "repo", "status", "runner_group"},
25+
[]string{"org", "repo", "status", "conclusion", "runner_group"},
1926
)
2027

2128
workflowRunHistogramVec = prometheus.NewHistogramVec(prometheus.HistogramOpts{
@@ -30,7 +37,7 @@ var (
3037
Name: "workflow_status_count",
3138
Help: "Count of the occurrences of different workflow states.",
3239
},
33-
[]string{"org", "repo", "workflow_name", "status"},
40+
[]string{"org", "repo", "status", "conclusion", "workflow_name"},
3441
)
3542

3643
totalMinutesUsedActions = prometheus.NewGaugeVec(prometheus.GaugeOpts{
@@ -80,6 +87,7 @@ func init() {
8087
// Register metrics with prometheus
8188
prometheus.MustRegister(workflowJobHistogramVec)
8289
prometheus.MustRegister(workflowJobStatusCounter)
90+
prometheus.MustRegister(workflowJobDurationCounter)
8391
prometheus.MustRegister(workflowRunHistogramVec)
8492
prometheus.MustRegister(workflowRunStatusCounter)
8593
prometheus.MustRegister(totalMinutesUsedActions)
@@ -92,9 +100,10 @@ func init() {
92100

93101
type WorkflowObserver interface {
94102
ObserveWorkflowJobDuration(org, repo, state, runnerGroup string, seconds float64)
95-
CountWorkflowJobStatus(org, repo, status, runnerGroup string)
103+
CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string)
104+
CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64)
96105
ObserveWorkflowRunDuration(org, repo, workflow string, seconds float64)
97-
CountWorkflowRunStatus(org, repo, workflow, status string)
106+
CountWorkflowRunStatus(org, repo, status, conclusion, workflow string)
98107
}
99108

100109
var _ WorkflowObserver = (*PrometheusObserver)(nil)
@@ -106,15 +115,19 @@ func (o *PrometheusObserver) ObserveWorkflowJobDuration(org, repo, state, runner
106115
Observe(seconds)
107116
}
108117

109-
func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, runnerGroup string) {
110-
workflowJobStatusCounter.WithLabelValues(org, repo, status, runnerGroup).Inc()
118+
func (o *PrometheusObserver) CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup string) {
119+
workflowJobStatusCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Inc()
120+
}
121+
122+
func (o *PrometheusObserver) CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup string, seconds float64) {
123+
workflowJobDurationCounter.WithLabelValues(org, repo, status, conclusion, runnerGroup).Add(seconds)
111124
}
112125

113126
func (o *PrometheusObserver) ObserveWorkflowRunDuration(org, repo, workflowName string, seconds float64) {
114127
workflowRunHistogramVec.WithLabelValues(org, repo, workflowName).
115128
Observe(seconds)
116129
}
117130

118-
func (o *PrometheusObserver) CountWorkflowRunStatus(org, repo, workflowName, status string) {
119-
workflowRunStatusCounter.WithLabelValues(org, repo, workflowName, status).Inc()
131+
func (o *PrometheusObserver) CountWorkflowRunStatus(org, repo, status, conclusion, workflowName string) {
132+
workflowRunStatusCounter.WithLabelValues(org, repo, status, conclusion, workflowName).Inc()
120133
}

internal/server/server_test.go

Lines changed: 8 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -72,24 +72,22 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
7272
org := "someone"
7373
expectedDuration := 10.0
7474
jobStartedAt := time.Unix(1650308740, 0)
75-
stepStartedAt := jobStartedAt.Add(time.Duration(expectedDuration) * time.Second)
75+
completedAt := jobStartedAt.Add(time.Duration(expectedDuration) * time.Second)
7676
runnerGroupName := "runner-group"
7777

7878
event := github.WorkflowJobEvent{
79-
Action: github.String("in_progress"),
79+
Action: github.String("completed"),
8080
Repo: &github.Repository{
8181
Name: &repo,
8282
Owner: &github.User{
8383
Login: &org,
8484
},
8585
},
8686
WorkflowJob: &github.WorkflowJob{
87-
StartedAt: &github.Timestamp{Time: jobStartedAt},
88-
Steps: []*github.TaskStep{
89-
{
90-
StartedAt: &github.Timestamp{Time: stepStartedAt},
91-
},
92-
},
87+
Status: github.String("completed"),
88+
Conclusion: github.String("success"),
89+
StartedAt: &github.Timestamp{Time: jobStartedAt},
90+
CompletedAt: &github.Timestamp{Time: completedAt},
9391
RunnerGroupName: &runnerGroupName,
9492
},
9593
}
@@ -109,5 +107,6 @@ func Test_Server_MetricsRouteAfterWorkflowJob(t *testing.T) {
109107

110108
payload, err := ioutil.ReadAll(metricsRes.Body)
111109
require.NoError(t, err)
112-
assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{org="someone",repo="some-repo",runner_group="runner-group",state="queued",le="10.541350399999995"} 1`)
110+
assert.Contains(t, string(payload), `workflow_job_duration_seconds_bucket{org="someone",repo="some-repo",runner_group="runner-group",state="in_progress",le="10.541350399999995"} 1`)
111+
assert.Contains(t, string(payload), `workflow_job_duration_seconds_total{conclusion="success",org="someone",repo="some-repo",runner_group="runner-group",status="completed"} 10`)
113112
}

internal/server/workflow_metrics_exporter.go

Lines changed: 34 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,13 @@ func NewWorkflowMetricsExporter(logger log.Logger, opts Opts) *WorkflowMetricsEx
3636

3737
// handleGHWebHook responds to POST /gh_event, when receive a event from GitHub.
3838
func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http.Request) {
39-
buf, _ := ioutil.ReadAll(r.Body)
39+
buf, err := ioutil.ReadAll(r.Body)
40+
if err != nil {
41+
_ = level.Error(c.Logger).Log("msg", "error reading body: %v", err)
42+
w.WriteHeader(http.StatusInternalServerError)
43+
return
44+
}
45+
defer r.Body.Close()
4046

4147
receivedHash := strings.SplitN(r.Header.Get("X-Hub-Signature"), "=", 2)
4248
if receivedHash[0] != "sha1" {
@@ -45,13 +51,15 @@ func (c *WorkflowMetricsExporter) HandleGHWebHook(w http.ResponseWriter, r *http
4551
return
4652
}
4753

48-
err := validateSignature(c.Opts.GitHubToken, receivedHash, buf)
54+
err = validateSignature(c.Opts.GitHubToken, receivedHash, buf)
4955
if err != nil {
5056
_ = level.Error(c.Logger).Log("msg", "invalid token", "err", err)
5157
w.WriteHeader(http.StatusForbidden)
5258
return
5359
}
5460

61+
_ = level.Debug(c.Logger).Log("msg", "received webhook", "payload", string(buf))
62+
5563
eventType := r.Header.Get("X-GitHub-Event")
5664
switch eventType {
5765
case "ping":
@@ -86,45 +94,51 @@ func (c *WorkflowMetricsExporter) CollectWorkflowJobEvent(event *github.Workflow
8694
repo := event.GetRepo().GetName()
8795
org := event.GetRepo().GetOwner().GetLogin()
8896
runnerGroup := event.WorkflowJob.GetRunnerGroupName()
97+
8998
action := event.GetAction()
99+
conclusion := event.GetWorkflowJob().GetConclusion()
100+
status := event.GetWorkflowJob().GetStatus()
90101

91-
var status string
92102
switch action {
93103
case "queued":
94-
status = "queued"
104+
// Do nothing.
95105
case "in_progress":
96-
status = "in_progress"
106+
107+
if len(event.WorkflowJob.Steps) == 0 {
108+
_ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of in_progress event as event has no steps")
109+
break
110+
}
97111

98112
firstStep := event.WorkflowJob.Steps[0]
99113
queuedSeconds := firstStep.StartedAt.Time.Sub(event.WorkflowJob.StartedAt.Time).Seconds()
100114
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "queued", runnerGroup, math.Max(0, queuedSeconds))
101115
case "completed":
102-
status = event.GetWorkflowJob().GetConclusion()
103-
104-
if event.GetWorkflowJob().GetConclusion() != "skipped" {
105-
firstStepStarted := event.WorkflowJob.Steps[0].StartedAt.Time
106-
lastStepCompleted := event.WorkflowJob.Steps[len(event.WorkflowJob.Steps)-1].CompletedAt.Time
107-
jobSeconds := lastStepCompleted.Sub(firstStepStarted).Seconds()
108-
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, math.Max(0, jobSeconds))
116+
if event.WorkflowJob.StartedAt == nil || event.WorkflowJob.CompletedAt == nil {
117+
_ = level.Debug(c.Logger).Log("msg", "unable to calculate job duration of completed event steps are missing timestamps")
118+
break
109119
}
120+
121+
jobSeconds := math.Max(0, event.WorkflowJob.GetCompletedAt().Time.Sub(event.WorkflowJob.GetStartedAt().Time).Seconds())
122+
c.PrometheusObserver.ObserveWorkflowJobDuration(org, repo, "in_progress", runnerGroup, jobSeconds)
123+
c.PrometheusObserver.CountWorkflowJobDuration(org, repo, status, conclusion, runnerGroup, jobSeconds)
110124
}
111125

112-
c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, runnerGroup)
126+
c.PrometheusObserver.CountWorkflowJobStatus(org, repo, status, conclusion, runnerGroup)
113127
}
114128

115129
func (c *WorkflowMetricsExporter) CollectWorkflowRunEvent(event *github.WorkflowRunEvent) {
116-
if event.GetAction() != "completed" {
117-
return
118-
}
119-
120130
repo := event.GetRepo().GetName()
121131
org := event.GetRepo().GetOwner().GetLogin()
122132
workflowName := event.GetWorkflow().GetName()
123-
seconds := event.GetWorkflowRun().UpdatedAt.Time.Sub(event.GetWorkflowRun().RunStartedAt.Time).Seconds()
124-
c.PrometheusObserver.ObserveWorkflowRunDuration(org, repo, workflowName, seconds)
133+
134+
if event.GetAction() == "completed" {
135+
seconds := event.GetWorkflowRun().UpdatedAt.Time.Sub(event.GetWorkflowRun().RunStartedAt.Time).Seconds()
136+
c.PrometheusObserver.ObserveWorkflowRunDuration(org, repo, workflowName, seconds)
137+
}
125138

126139
status := event.GetWorkflowRun().GetStatus()
127-
c.PrometheusObserver.CountWorkflowRunStatus(org, repo, workflowName, status)
140+
conclusion := event.GetWorkflowRun().GetConclusion()
141+
c.PrometheusObserver.CountWorkflowRunStatus(org, repo, status, conclusion, workflowName)
128142
}
129143

130144
// validateSignature validate the incoming github event.

0 commit comments

Comments
 (0)