import collections
import datetime
- import dateutil
import github
- import json
import logging
import os
import requests
- import sys
import time

from dataclasses import dataclass
# by trial and error).
GRAFANA_METRIC_MAX_AGE_MN = 120

- # Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
- # the metric name in Grafana. This matters because keeping the metric name
- # stable preserves the metrics history if the workflow name changes.
- BUILDKITE_WORKFLOW_TO_TRACK = {
-     ":linux: Linux x64": "buildkite_linux",
-     ":windows: Windows x64": "buildkite_windows",
- }
-
- # Number of builds to fetch per page. Since we scrape regularly, this can
- # remain small.
- BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
-

@dataclass
class JobMetrics:
@@ -86,181 +71,6 @@ class GaugeMetric:
    time_ns: int


- def buildkite_fetch_page_build_list(
-     buildkite_token: str, after_cursor: str = None
- ) -> list[dict[str, str]]:
-     """Fetches a page of the build list using the GraphQL BuildKite API.
-
-     Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
-     or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds older than
-     the one pointed to by |after_cursor| if provided. The |after_cursor| value
-     is taken from the previous page returned by the API.
-
-     Args:
-       buildkite_token: the secret token to authenticate GraphQL requests.
-       after_cursor: cursor after which to start the page fetch.
-
-     Returns:
-       The most recent builds after cursor (if set) with the following format:
-       [
-         {
-           "cursor": <value>,
-           "number": <build-number>,
-         }
-       ]
-     """
-
-     BUILDKITE_GRAPHQL_QUERY = """
-     query OrganizationShowQuery {{
-       organization(slug: "llvm-project") {{
-         pipelines(search: "Github pull requests", first: 1) {{
-           edges {{
-             node {{
-               builds(state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
-                 edges {{
-                   cursor
-                   node {{
-                     number
-                   }}
-                 }}
-               }}
-             }}
-           }}
-         }}
-       }}
-     }}
-     """
-     query = BUILDKITE_GRAPHQL_QUERY.format(
-         PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
-         AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
-     )
-     query = json.dumps({"query": query})
-     url = "https://graphql.buildkite.com/v1"
-     headers = {
-         "Authorization": "Bearer " + buildkite_token,
-         "Content-Type": "application/json",
-     }
-     data = requests.post(url, data=query, headers=headers).json()
-     # De-nest the build list.
-     if "errors" in data:
-         logging.info("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
-         return []
-     builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
-         "edges"
-     ]
-     # Fold cursor info into the node dictionary.
-     return [{**x["node"], "cursor": x["cursor"]} for x in builds]
-
-
- def buildkite_get_build_info(build_number: str) -> dict:
-     """Returns all the info associated with the provided build number.
-
-     Note: for unknown reasons, GraphQL returns no jobs for a given build,
-     while this endpoint does, hence why this uses this API instead of GraphQL.
-
-     Args:
-       build_number: which build number to fetch info for.
-
-     Returns:
-       The info for the target build, a JSON dictionary.
-     """
-
-     URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
-     return requests.get(URL.format(build_number)).json()
-
-
- def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
-     """Returns all the running/pending BuildKite builds.
-
-     Args:
-       buildkite_token: the secret token to authenticate GraphQL requests.
-     """
-     output = []
-     cursor = None
-     while True:
-         page = buildkite_fetch_page_build_list(buildkite_token, cursor)
-         if len(page) == 0:
-             break
-         cursor = page[-1]["cursor"]
-         output += page
-     return output
-
-
- def buildkite_get_metrics(
-     buildkite_token: str, previously_incomplete: set[int]
- ) -> (list[JobMetrics], set[int]):
-     """Returns a tuple with:
-
-     - the metrics recorded for newly completed workflow jobs.
-     - the set of workflows still running now.
-
-     Args:
-       buildkite_token: the secret token to authenticate GraphQL requests.
-       previously_incomplete: the set of running workflows the last time this
-         function was called.
-     """
-
-     running_builds = buildkite_get_incomplete_tasks(buildkite_token)
-     incomplete_now = set([x["number"] for x in running_builds])
-     output = []
-
-     for build_id in previously_incomplete:
-         if build_id in incomplete_now:
-             continue
-
-         info = buildkite_get_build_info(build_id)
-         metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
-         for job in info["jobs"]:
-             # This workflow is not interesting to us.
-             if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
-                 continue
-
-             # Don't count canceled jobs.
-             if job["canceled_at"]:
-                 continue
-
-             created_at = dateutil.parser.isoparse(job["created_at"])
-             scheduled_at = dateutil.parser.isoparse(job["scheduled_at"])
-             started_at = dateutil.parser.isoparse(job["started_at"])
-             finished_at = dateutil.parser.isoparse(job["finished_at"])
-
-             job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
-             queue_time = (started_at - scheduled_at).seconds
-             run_time = (finished_at - started_at).seconds
-             status = bool(job["passed"])
-
-             # Grafana will refuse to ingest metrics older than ~2 hours, so we
-             # should avoid sending historical data.
-             metric_age_mn = (
-                 datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
-             ).total_seconds() / 60
-             if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
-                 logging.warning(
-                     f"Job {job['name']} from workflow {build_id} dropped due"
-                     + f" to staleness: {metric_age_mn} mn old."
-                 )
-                 continue
-
-             metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
-             workflow_id = build_id
-             workflow_name = "Github pull requests"
-             output.append(
-                 JobMetrics(
-                     job_name,
-                     queue_time,
-                     run_time,
-                     status,
-                     metric_timestamp_ns,
-                     workflow_id,
-                     workflow_name,
-                 )
-             )
-
-     return output, incomplete_now
-
-
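For reference, the approach the deleted functions implemented is a simple poll-and-diff: remember the set of build numbers that were still incomplete on the previous iteration, then report metrics only for builds that have since left that set. A self-contained sketch of that pattern follows; the function and parameter names are illustrative stand-ins, not part of the script.

from typing import Callable, Iterable


def poll_newly_finished(
    previously_incomplete: set[int],
    fetch_incomplete: Callable[[], Iterable[int]],
) -> tuple[list[int], set[int]]:
    """Returns the builds that completed since the last poll, plus the new
    incomplete set to carry into the next iteration."""
    incomplete_now = set(fetch_incomplete())
    newly_finished = [b for b in previously_incomplete if b not in incomplete_now]
    return newly_finished, incomplete_now


# Builds 1 and 2 were running at the previous poll; only build 2 (plus a new
# build 3) is still running now, so build 1 is the one to report metrics for.
done, pending = poll_newly_finished({1, 2}, lambda: [2, 3])
assert done == [1] and pending == {2, 3}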
def github_get_metrics(
    github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
) -> tuple[list[JobMetrics], int]:
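One detail from the removed code worth keeping in mind: Grafana rejects points much older than roughly two hours, so a completion time is converted to the nanosecond timestamp Grafana expects only after an age check. A minimal standalone sketch of that guard, mirroring the GRAFANA_METRIC_MAX_AGE_MN limit defined earlier (the helper name is illustrative):

import datetime

# Same limit as GRAFANA_METRIC_MAX_AGE_MN at the top of the file.
MAX_AGE_MN = 120


def to_grafana_timestamp_ns(finished_at: datetime.datetime) -> int | None:
    """Returns the nanosecond timestamp Grafana expects, or None when the
    metric is already too old to ingest (finished_at must be timezone-aware)."""
    age_mn = (
        datetime.datetime.now(datetime.timezone.utc) - finished_at
    ).total_seconds() / 60
    if age_mn > MAX_AGE_MN:
        return None  # too stale: the caller should drop this metric
    return int(finished_at.timestamp()) * 10**9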
@@ -478,17 +288,13 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
def main():
    # Authenticate with Github
    github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
-     buildkite_token = os.environ["BUILDKITE_TOKEN"]
    grafana_api_key = os.environ["GRAFANA_API_KEY"]
    grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]

    # The last workflow this script processed.
    # Because the Github queries are broken, we'll simply log a 'processed'
    # bit for the last COUNT_TO_PROCESS workflows.
    gh_last_workflows_seen_as_completed = set()
-     # Stores the list of pending/running builds in BuildKite we need to check
-     # at the next iteration.
-     bk_incomplete = set()

    # Enter the main loop. Every five minutes we wake up and dump metrics for
    # the relevant jobs.
@@ -500,13 +306,8 @@ def main():
            github_repo, gh_last_workflows_seen_as_completed
        )

-         bk_metrics, bk_incomplete = buildkite_get_metrics(
-             buildkite_token, bk_incomplete
-         )
-
-         metrics = gh_metrics + bk_metrics
-         upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
-         logging.info(f"Uploaded {len(metrics)} metrics")
+         upload_metrics(gh_metrics, grafana_metrics_userid, grafana_api_key)
+         logging.info(f"Uploaded {len(gh_metrics)} metrics")

        time.sleep(SCRAPE_INTERVAL_SECONDS)

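With this change the loop only scrapes GitHub, so main() reads just the GitHub and Grafana credentials shown above and no longer touches BUILDKITE_TOKEN. A hedged sketch of a local startup check one could run before launching the script (not part of the commit):

import os

# Environment variables main() still reads after this change.
REQUIRED_ENV = ("GITHUB_TOKEN", "GRAFANA_API_KEY", "GRAFANA_METRICS_USERID")

missing = [name for name in REQUIRED_ENV if name not in os.environ]
if missing:
    raise SystemExit(f"Missing environment variables: {', '.join(missing)}")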