@@ -1,4 +1,5 @@
 import requests
+import collections
 import time
 import os
 from dataclasses import dataclass
@@ -12,9 +13,29 @@
 GRAFANA_URL = (
     "https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
 )
-GITHUB_PROJECT = "llvm/llvm-project"
-WORKFLOWS_TO_TRACK = ["LLVM Premerge Checks"]
-SCRAPE_INTERVAL_SECONDS = 5 * 60
+SCRAPE_INTERVAL_SECONDS = 60
+
+# Lists the Github workflows we want to track. Maps the Github workflow name
+# to the metric name prefix in grafana.
+# This metric name is also used as a key in the job->name map.
+GITHUB_WORKFLOW_TO_TRACK = {"LLVM Premerge Checks": "github_llvm_premerge_checks"}
+
+# Lists the Github jobs to track for a given workflow. The key is the stable
+# name (metric name) of the workflow (see GITHUB_WORKFLOW_TO_TRACK).
+# Each value is a map linking the github job name to the corresponding metric
+# name.
+GITHUB_JOB_TO_TRACK = {
+    "github_llvm_premerge_checks": {
+        "Linux Premerge Checks (Test Only - Please Ignore Results)": "premerge_linux",
+        "Windows Premerge Checks (Test Only - Please Ignore Results)": "premerge_windows",
+    }
+}
+
+# The number of workflows to pull when sampling queue size & running count.
+# Filtering at the query level doesn't work, and thus sampling workflow counts
+# cannot be done in a clean way.
+# If we miss running/queued workflows, we might want to bump this value.
+GITHUB_WORKFLOWS_COUNT_FOR_SAMPLING = 200
 
 
 @dataclass
@@ -34,7 +55,6 @@ class GaugeMetric:
     value: int
     time_ns: int
 
-
 def get_sampled_workflow_metrics(github_repo: github.Repository):
     """Gets global statistics about the Github workflow queue
 
@@ -45,131 +65,117 @@ def get_sampled_workflow_metrics(github_repo: github.Repository):
     Returns a list of GaugeMetric objects, containing the relevant metrics about
     the workflow
     """
-    queued_job_counts = {}
-    running_job_counts = {}
-
-    # Other states are available (pending, waiting, etc), but the meaning
-    # is not documented (See #70540).
-    # "queued" seems to be the info we want.
-    for queued_workflow in github_repo.get_workflow_runs(status="queued"):
-        if queued_workflow.name not in WORKFLOWS_TO_TRACK:
-            continue
-        for queued_workflow_job in queued_workflow.jobs():
-            job_name = queued_workflow_job.name
-            # Workflows marked as queued can potentially only have some jobs
-            # queued, so make sure to also count jobs currently in progress.
-            if queued_workflow_job.status == "queued":
-                if job_name not in queued_job_counts:
-                    queued_job_counts[job_name] = 1
-                else:
-                    queued_job_counts[job_name] += 1
-            elif queued_workflow_job.status == "in_progress":
-                if job_name not in running_job_counts:
-                    running_job_counts[job_name] = 1
-                else:
-                    running_job_counts[job_name] += 1
-
-    for running_workflow in github_repo.get_workflow_runs(status="in_progress"):
-        if running_workflow.name not in WORKFLOWS_TO_TRACK:
+    queued_count = collections.Counter()
+    running_count = collections.Counter()
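+    # collections.Counter defaults missing keys to 0, so the increments
+    # below need no key-existence checks.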
+
+    # Do not apply any filters to this query.
+    # See https://github.com/orgs/community/discussions/86766
+    # Applying filters like `status=completed` will break pagination, and
+    # return a non-sorted and incomplete list of workflows.
+    i = 0
+    for task in iter(github_repo.get_workflow_runs()):
+        if i > GITHUB_WORKFLOWS_COUNT_FOR_SAMPLING:
+            break
+        i += 1
+
+        if task.name not in GITHUB_WORKFLOW_TO_TRACK:
             continue
-        for running_workflow_job in running_workflow.jobs():
-            job_name = running_workflow_job.name
-            if running_workflow_job.status != "in_progress":
+
+        prefix_name = GITHUB_WORKFLOW_TO_TRACK[task.name]
+        for job in task.jobs():
+            if job.name not in GITHUB_JOB_TO_TRACK[prefix_name]:
                 continue
+            suffix_name = GITHUB_JOB_TO_TRACK[prefix_name][job.name]
+            metric_name = f"{prefix_name}_{suffix_name}"
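+            # e.g. "github_llvm_premerge_checks_premerge_linux".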
+
+            # Other states are available (pending, waiting, etc), but the meaning
+            # is not documented (See #70540).
+            # "queued" seems to be the info we want.
+            if job.status == "queued":
+                queued_count[metric_name] += 1
+            elif job.status == "in_progress":
+                running_count[metric_name] += 1
 
-            if job_name not in running_job_counts:
-                running_job_counts[job_name] = 1
-            else:
-                running_job_counts[job_name] += 1
 
     workflow_metrics = []
-    for queued_job in queued_job_counts:
+    for name, value in queued_count.items():
         workflow_metrics.append(
-            GaugeMetric(
-                f"workflow_queue_size_{queued_job}",
-                queued_job_counts[queued_job],
-                time.time_ns(),
-            )
+            GaugeMetric(f"workflow_queue_size_{name}", value, time.time_ns())
         )
-    for running_job in running_job_counts:
+    for name, value in running_count.items():
         workflow_metrics.append(
-            GaugeMetric(
-                f"running_workflow_count_{running_job}",
-                running_job_counts[running_job],
-                time.time_ns(),
-            )
+            GaugeMetric(f"running_workflow_count_{name}", value, time.time_ns())
         )
+
     # Always send a heartbeat metric so we can monitor whether this container
     # is still able to log to Grafana.
     workflow_metrics.append(
         GaugeMetric("metrics_container_heartbeat", 1, time.time_ns())
     )
     return workflow_metrics
 
 
-def get_per_workflow_metrics(
-    github_repo: github.Repository, workflows_to_track: dict[str, int]
-):
+def get_per_workflow_metrics(github_repo: github.Repository, last_seen_workflow: str):
     """Gets the metrics for specified Github workflows.
 
     This function takes in the workflow run ID of the last invocation that was
     processed, and grabs the relevant data from Github, returning it to the
     caller.
+    If the last_seen_workflow parameter is None, this returns no metrics, but
+    returns the id of the most recent workflow.
 
     Args:
       github_repo: A github repo object to use to query the relevant information.
-      workflows_to_track: A dictionary mapping workflow names to the last
-        invocation ID where metrics have been collected, or None to collect the
-        last five results.
+      last_seen_workflow: the last workflow this function processed.
 
     Returns:
-      Returns a list of JobMetrics objects, containing the relevant metrics about
-      the workflow.
+      Returns a tuple with 2 elements:
+        - a list of JobMetrics objects, one per processed job.
+        - the ID of the most recent processed workflow run.
     """
     workflow_metrics = []
+    most_recent_workflow_processed = None
+
+    # Do not apply any filters to this query.
+    # See https://github.com/orgs/community/discussions/86766
+    # Applying filters like `status=completed` will break pagination, and
+    # return a non-sorted and incomplete list of workflows.
+    for task in iter(github_repo.get_workflow_runs()):
+        # Ignoring non-completed workflows.
+        if task.status != "completed":
+            continue
 
-    workflows_to_include = set(workflows_to_track.keys())
+        # Record the most recent workflow we processed so this script
+        # only processes it once.
+        if most_recent_workflow_processed is None:
+            most_recent_workflow_processed = task.id
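+        # get_workflow_runs() lists runs most recent first, so the first
+        # completed run seen here is the newest one.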
 
-    for workflow_run in iter(github_repo.get_workflow_runs()):
-        if len(workflows_to_include) == 0:
+        # This condition only happens when this script starts:
+        # this is used to determine a start point. Don't return any
+        # metrics, just the most recent workflow ID.
+        if last_seen_workflow is None:
             break
 
-        if workflow_run.status != "completed":
-            continue
-
-        # This workflow was already sampled for this run, or is not tracked at
-        # all. Ignoring.
-        if workflow_run.name not in workflows_to_include:
-            continue
+        # This workflow has already been processed. We can stop now.
+        if last_seen_workflow == task.id:
+            break
 
-        # There were no new workflow invocations since the previous scrape.
-        # The API returns a sorted list with the most recent invocations first,
-        # so we can stop looking for this particular workflow. Continue to grab
-        # information on the other workflows of interest, if present.
-        if workflows_to_track[workflow_run.name] == workflow_run.id:
-            workflows_to_include.remove(workflow_run.name)
+        # This workflow is not interesting to us.
+        if task.name not in GITHUB_WORKFLOW_TO_TRACK:
            continue
 
-        workflow_jobs = workflow_run.jobs()
-        if workflow_jobs.totalCount == 0:
-            continue
+        name_prefix = GITHUB_WORKFLOW_TO_TRACK[task.name]
 
-        if (
-            workflows_to_track[workflow_run.name] is None
-            or workflows_to_track[workflow_run.name] == workflow_run.id
-        ):
-            workflows_to_include.remove(workflow_run.name)
-        if (
-            workflows_to_track[workflow_run.name] is not None
-            and len(workflows_to_include) == 0
-        ):
-            break
+        for job in task.jobs():
+            # This job is not interesting to us.
+            if job.name not in GITHUB_JOB_TO_TRACK[name_prefix]:
+                continue
 
-        for workflow_job in workflow_jobs:
-            created_at = workflow_job.created_at
-            started_at = workflow_job.started_at
-            completed_at = workflow_job.completed_at
+            name_suffix = GITHUB_JOB_TO_TRACK[name_prefix][job.name]
+            created_at = job.created_at
+            started_at = job.started_at
+            completed_at = job.completed_at
 
-            job_result = int(workflow_job.conclusion == "success")
+            job_result = int(job.conclusion == "success")
             if job_result:
                 # We still might want to mark the job as a failure if one of the steps
                 # failed. This is required due to us setting continue-on-error in
@@ -178,7 +184,7 @@ def get_per_workflow_metrics(
                 # TODO(boomanaiden154): Remove this once the premerge pipeline is no
                 # longer in a testing state and we can directly assert the workflow
                 # result.
-                for step in workflow_job.steps:
+                for step in job.steps:
                     if step.conclusion != "success" and step.conclusion != "skipped":
                         job_result = 0
                         break
@@ -191,25 +197,23 @@ def get_per_workflow_metrics(
 
             # The timestamp associated with the event is expected by Grafana to be
             # in nanoseconds.
-            created_at_ns = int(created_at.timestamp()) * 10**9
+            completed_at_ns = int(completed_at.timestamp()) * 10**9
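+            # timestamp() yields seconds; scale by 10**9 for nanoseconds.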
 
-            logging.info(
-                f"Adding a job metric for job {workflow_job.id} in workflow {workflow_run.id}"
-            )
+            logging.info(f"Adding a job metric for job {job.id} in workflow {task.id}")
 
             workflow_metrics.append(
                 JobMetrics(
-                    workflow_run.name + "-" + workflow_job.name,
+                    name_prefix + "_" + name_suffix,
                     queue_time.seconds,
                     run_time.seconds,
                     job_result,
-                    created_at_ns,
-                    workflow_run.id,
-                    workflow_run.name,
+                    completed_at_ns,
+                    task.id,
+                    task.name,
                 )
             )
 
-    return workflow_metrics
+    return workflow_metrics, most_recent_workflow_processed
 
 
 def upload_metrics(workflow_metrics, metrics_userid, api_key):
@@ -259,32 +263,27 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
 
 def main():
     # Authenticate with Github
-    auth = Auth.Token(os.environ["GITHUB_TOKEN"])
-
+    github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
     grafana_api_key = os.environ["GRAFANA_API_KEY"]
     grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
 
-    workflows_to_track = {}
-    for workflow_to_track in WORKFLOWS_TO_TRACK:
-        workflows_to_track[workflow_to_track] = None
+    # The last workflow this script processed.
+    github_last_seen_workflow = None
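+    # None means no starting point yet: the first get_per_workflow_metrics()
+    # call only records the most recent run ID (see its docstring).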
 
     # Enter the main loop. Every SCRAPE_INTERVAL_SECONDS seconds we wake up
     # and dump metrics for the relevant jobs.
     while True:
-        github_object = Github(auth=auth)
+        github_object = Github(auth=github_auth)
         github_repo = github_object.get_repo("llvm/llvm-project")
 
-        current_metrics = get_per_workflow_metrics(github_repo, workflows_to_track)
-        current_metrics += get_sampled_workflow_metrics(github_repo)
-
-        upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)
-        logging.info(f"Uploaded {len(current_metrics)} metrics")
+        github_metrics, github_last_seen_workflow = get_per_workflow_metrics(
+            github_repo, github_last_seen_workflow
+        )
+        sampled_metrics = get_sampled_workflow_metrics(github_repo)
+        metrics = github_metrics + sampled_metrics
 
-        for workflow_metric in reversed(current_metrics):
-            if isinstance(workflow_metric, JobMetrics):
-                workflows_to_track[
-                    workflow_metric.workflow_name
-                ] = workflow_metric.workflow_id
+        upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
+        logging.info(f"Uploaded {len(metrics)} metrics")
 
         time.sleep(SCRAPE_INTERVAL_SECONDS)
 