@@ -1,6 +1,8 @@
 import collections
 import datetime
+import dateutil.parser
 import github
+import json
 import logging
 import os
 import requests
@@ -41,7 +43,7 @@
 # This means we essentially have a list of workflows sorted by creation date,
 # and that's all we can deduce from it. So for each iteration, we'll blindly
 # process the last N workflows.
-GITHUB_WORKFLOWS_MAX_PROCESS_COUNT = 1000
+GITHUB_WORKFLOWS_MAX_PROCESS_COUNT = 2000
 # Second reason for the cut: reaching a workflow older than X.
 # This means we will miss long-tails (exceptional jobs running for more than
 # X hours), but that's also the case with the count cutoff above.
@@ -53,6 +55,18 @@
 # by trial and error).
 GRAFANA_METRIC_MAX_AGE_MN = 120

+# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
+# the metric name in Grafana, so that the metrics history is not lost when
+# a workflow name changes.
+BUILDKITE_WORKFLOW_TO_TRACK = {
+    ":linux: Linux x64": "buildkite_linux",
+    ":windows: Windows x64": "buildkite_windows",
+}
+
+# Number of builds to fetch per page. Since we scrape regularly, this can
+# remain small.
+BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
+
 @dataclass
 class JobMetrics:
     job_name: str
@@ -70,6 +84,180 @@ class GaugeMetric:
     time_ns: int


+def buildkite_fetch_page_build_list(
+    buildkite_token: str, after_cursor: str = None
+) -> list[dict[str, str]]:
+    """Fetches a page of the build list using the GraphQL BuildKite API.
+
+    Returns the last BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds,
+    or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds older
+    than the one pointed to by |after_cursor| if provided.
+    The |after_cursor| value is taken from the previous page returned by the
+    API.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+      after_cursor: cursor after which to start the page fetch.
+
+    Returns:
+      The most recent builds after cursor (if set) with the following format:
+      [
+        {
+          "cursor": <value>,
+          "number": <build-number>,
+        }
+      ]
+    """
+
+    BUILDKITE_GRAPHQL_QUERY = """
+    query OrganizationShowQuery {{
+      organization(slug: "llvm-project") {{
+        pipelines(search: "Github pull requests", first: 1) {{
+          edges {{
+            node {{
+              builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
+                edges {{
+                  cursor
+                  node {{
+                    number
+                  }}
+                }}
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+    """
+    query = BUILDKITE_GRAPHQL_QUERY.format(
+        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
+        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
+    )
+    query = json.dumps({"query": query})
+    url = "https://graphql.buildkite.com/v1"
+    headers = {
+        "Authorization": "Bearer " + buildkite_token,
+        "Content-Type": "application/json",
+    }
+    data = requests.post(url, data=query, headers=headers).json()
+    # De-nest the build list.
+    if "errors" in data:
+        logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
+        return []
+    builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
+        "edges"
+    ]
+    # Fold the cursor info into the node dictionary.
+    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
+
+
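# A minimal usage sketch for the pager above, assuming BUILDKITE_TOKEN is set
# in the environment as in main() below: AFTER renders to null on the first
# call and to the quoted cursor afterwards, so consecutive pages can be
# walked like this.
token = os.environ["BUILDKITE_TOKEN"]
first_page = buildkite_fetch_page_build_list(token)
if first_page:
    # The last entry's cursor resumes the listing where this page ended.
    next_page = buildkite_fetch_page_build_list(token, first_page[-1]["cursor"])
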
+def buildkite_get_build_info(build_number: str) -> dict:
+    """Returns all the info associated with the provided build number.
+
+    Note: for unknown reasons, GraphQL returns no jobs for a given build
+    while this endpoint does, hence this uses the REST API instead of
+    GraphQL.
+
+    Args:
+      build_number: which build number to fetch info for.
+
+    Returns:
+      The info for the target build, a JSON dictionary.
+    """
+
+    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
+    return requests.get(URL.format(build_number)).json()
+
+
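# A small sketch of how the returned JSON is consumed downstream; the build
# number here is hypothetical, and only the fields read by
# buildkite_get_metrics() below are assumed to exist in the payload.
info = buildkite_get_build_info("12345")
print(info["finished_at"])  # ISO-8601 end time of the whole build.
for job in info["jobs"]:
    # Each job carries a name, a pass/fail bit and scheduling timestamps.
    print(job["name"], job["passed"], job["scheduled_at"], job["started_at"])
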
+def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
+    """Returns all the running/pending BuildKite builds.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+    """
+    output = []
+    cursor = None
+    while True:
+        page = buildkite_fetch_page_build_list(buildkite_token, cursor)
+        if len(page) == 0:
+            break
+        cursor = page[-1]["cursor"]
+        output += page
+    return output
+
+
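# A worked trace of the loop above, with hypothetical numbers: for 120
# incomplete builds and a page size of 50, it issues four requests returning
# 50, 50, 20 and 0 builds; the empty page is what terminates the loop.
incomplete = buildkite_get_incomplete_tasks(os.environ["BUILDKITE_TOKEN"])
logging.info("Found {} incomplete builds".format(len(incomplete)))
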
+def buildkite_get_metrics(
+    buildkite_token: str, previously_incomplete: set[int]
+) -> tuple[list[JobMetrics], set[int]]:
+    """Returns a tuple with:
+
+    - the metrics recorded for newly completed workflow jobs.
+    - the set of workflows still running now.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+      previously_incomplete: the set of running workflows the last time this
+        function was called.
+    """
+
+    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
+    incomplete_now = set([x["number"] for x in running_builds])
+    output = []
+
+    for build_id in previously_incomplete:
+        if build_id in incomplete_now:
+            continue
+
+        info = buildkite_get_build_info(build_id)
+        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
+        for job in info["jobs"]:
+            # This workflow is not interesting to us.
+            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
+                continue
+
+            # Don't count canceled jobs.
+            if job["canceled_at"]:
+                continue
+
+            created_at = dateutil.parser.isoparse(job["created_at"])
+            scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
+            started_at = dateutil.parser.isoparse(job["started_at"])
+            finished_at = dateutil.parser.isoparse(job["finished_at"])
+
+            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
+            queue_time = (started_at - scheduled_at).seconds
+            run_time = (finished_at - started_at).seconds
+            status = bool(job["passed"])
+
+            # Grafana will refuse to ingest metrics older than ~2 hours, so we
+            # should avoid sending historical data.
+            metric_age_mn = (
+                datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
+            ).total_seconds() / 60
+            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
+                logging.warning(
+                    f"Job {job['name']} from workflow {build_id} dropped due"
+                    + f" to staleness: {metric_age_mn} mn old."
+                )
+                continue
+
+            metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
+            workflow_id = build_id
+            workflow_name = "Github pull requests"
+            output.append(
+                JobMetrics(
+                    job_name,
+                    queue_time,
+                    run_time,
+                    status,
+                    metric_timestamp_ns,
+                    workflow_id,
+                    workflow_name,
+                )
+            )
+
+    return output, incomplete_now
+
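# A worked example of the timestamp handling above, with assumed values: a
# build finished at 2025-01-01T12:00:00Z maps to
# metric_timestamp_ns = 1735732800 * 10**9, and is dropped once the scrape
# happens more than GRAFANA_METRIC_MAX_AGE_MN (120) minutes after that.
finished = dateutil.parser.isoparse("2025-01-01T12:00:00Z")
metric_ns = int(finished.timestamp()) * 10**9  # 1735732800000000000
age_mn = (
    datetime.datetime.now(datetime.timezone.utc) - finished
).total_seconds() / 60
fresh_enough = age_mn <= GRAFANA_METRIC_MAX_AGE_MN
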
 def github_get_metrics(
     github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
 ) -> tuple[list[JobMetrics], int]:
@@ -168,8 +356,24 @@ def github_get_metrics(
         created_at = job.created_at
         started_at = job.started_at
         completed_at = job.completed_at
-        queue_time = started_at - created_at
-        run_time = completed_at - started_at
+
+        # The GitHub API can return results where started_at is slightly
+        # earlier than created_at (or completed_at earlier than started_at).
+        # This would yield a -23h59mn delta, which shows up as +24h of
+        # queue/run time on Grafana.
+        if started_at < created_at:
+            logging.info(
+                "Workflow {} started before being created.".format(task.id)
+            )
+            queue_time = datetime.timedelta(seconds=0)
+        else:
+            queue_time = started_at - created_at
+        if completed_at < started_at:
+            logging.info("Workflow {} finished before starting.".format(task.id))
+            run_time = datetime.timedelta(seconds=0)
+        else:
+            run_time = completed_at - started_at
+
         if run_time.seconds == 0:
             continue

@@ -179,7 +383,7 @@ def github_get_metrics(
             datetime.datetime.now(datetime.timezone.utc) - completed_at
         ).total_seconds() / 60
         if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
-            logging.info(
+            logging.warning(
                 f"Job {job.id} from workflow {task.id} dropped due"
                 + f" to staleness: {metric_age_mn} mn old."
             )
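
# A short sketch of the pitfall the clamping above guards against:
# timedelta.seconds on a negative delta wraps into the 0..86399 range,
# which is exactly what produced the bogus ~24h readings on Grafana.
a = datetime.datetime(2025, 1, 1, 12, 0, 0)
b = a + datetime.timedelta(minutes=1)
skewed = a - b            # timedelta(days=-1, seconds=86340)
print(skewed.seconds)     # 86340, i.e. 23h59mn, not -60.
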
@@ -276,23 +480,33 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
 def main():
     # Authenticate with Github
     github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
+    buildkite_token = os.environ["BUILDKITE_TOKEN"]
     grafana_api_key = os.environ["GRAFANA_API_KEY"]
     grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]

     # The last workflow this script processed.
     # Because the Github queries are broken, we'll simply log a 'processed'
     # bit for the last COUNT_TO_PROCESS workflows.
     gh_last_workflows_seen_as_completed = set()
+    # Stores the set of pending/running BuildKite builds we need to check
+    # at the next iteration.
+    bk_incomplete = set()

     # Enter the main loop. Every five minutes we wake up and dump metrics for
     # the relevant jobs.
     while True:
         github_object = Github(auth=github_auth)
         github_repo = github_object.get_repo("llvm/llvm-project")

-        metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
+        gh_metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
             github_repo, gh_last_workflows_seen_as_completed
         )
+
+        bk_metrics, bk_incomplete = buildkite_get_metrics(
+            buildkite_token, bk_incomplete
+        )
+
+        metrics = gh_metrics + bk_metrics
         upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
         logging.info(f"Uploaded {len(metrics)} metrics")
