1
1
import collections
2
2
import datetime
3
- import dateutil
4
3
import github
5
- import json
6
4
import logging
7
5
import os
8
6
import requests
9
- import sys
10
7
import time
11
8
12
9
from dataclasses import dataclass
55
52
# by trial and error).
56
53
GRAFANA_METRIC_MAX_AGE_MN = 120
57
54
58
- # Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
59
- # the metric name in Grafana. This is important to avoid losing metrics history
60
- # if the workflow name changes.
61
- BUILDKITE_WORKFLOW_TO_TRACK = {
62
- ":linux: Linux x64" : "buildkite_linux" ,
63
- ":windows: Windows x64" : "buildkite_windows" ,
64
- }
65
-
66
- # Number of builds to fetch per page. Since we scrape regularly, this can
67
- # remain small.
68
- BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
69
-
70
55
71
56
@dataclass
72
57
class JobMetrics :
@@ -86,181 +71,6 @@ class GaugeMetric:
86
71
time_ns : int
87
72
88
73
89
- def buildkite_fetch_page_build_list (
90
- buildkite_token : str , after_cursor : str = None
91
- ) -> list [dict [str , str ]]:
92
- """Fetches a page of the build list using the GraphQL BuildKite API.
93
-
94
- Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
95
- or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
96
- older than the one pointed to by |after_cursor| if provided.
97
- The |after_cursor| value is taken from the previous page returned by the
98
- API.
99
-
100
- Args:
101
- buildkite_token: the secret token to authenticate GraphQL requests.
102
- after_cursor: cursor after which to start the page fetch.
103
-
104
- Returns:
105
- The most recent builds after cursor (if set) with the following format:
106
- [
107
- {
108
- "cursor": <value>,
109
- "number": <build-number>,
110
- }
111
- ]
112
- """
113
-
114
- BUILDKITE_GRAPHQL_QUERY = """
115
- query OrganizationShowQuery {{
116
- organization(slug: "llvm-project") {{
117
- pipelines(search: "Github pull requests", first: 1) {{
118
- edges {{
119
- node {{
120
- builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
121
- edges {{
122
- cursor
123
- node {{
124
- number
125
- }}
126
- }}
127
- }}
128
- }}
129
- }}
130
- }}
131
- }}
132
- }}
133
- """
134
- query = BUILDKITE_GRAPHQL_QUERY .format (
135
- PAGE_SIZE = BUILDKITE_GRAPHQL_BUILDS_PER_PAGE ,
136
- AFTER = "null" if after_cursor is None else '"{}"' .format (after_cursor ),
137
- )
138
- query = json .dumps ({"query" : query })
139
- url = "https://graphql.buildkite.com/v1"
140
- headers = {
141
- "Authorization" : "Bearer " + buildkite_token ,
142
- "Content-Type" : "application/json" ,
143
- }
144
- data = requests .post (url , data = query , headers = headers ).json ()
145
- # De-nest the build list.
146
- if "errors" in data :
147
- logging .info ("Failed to fetch BuildKite jobs: {}" .format (data ["errors" ]))
148
- return []
149
- builds = data ["data" ]["organization" ]["pipelines" ]["edges" ][0 ]["node" ]["builds" ][
150
- "edges"
151
- ]
152
- # Fold cursor info into the node dictionary.
153
- return [{** x ["node" ], "cursor" : x ["cursor" ]} for x in builds ]
154
-
155
-
156
- def buildkite_get_build_info (build_number : str ) -> dict :
157
- """Returns all the info associated with the provided build number.
158
-
159
- Note: for unknown reasons, graphql returns no jobs for a given build,
160
- while this endpoint does, hence why this uses this API instead of graphql.
161
-
162
- Args:
163
- build_number: which build number to fetch info for.
164
-
165
- Returns:
166
- The info for the target build, a JSON dictionary.
167
- """
168
-
169
- URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
170
- return requests .get (URL .format (build_number )).json ()
171
-
172
-
173
- def buildkite_get_incomplete_tasks (buildkite_token : str ) -> list :
174
- """Returns all the running/pending BuildKite builds.
175
-
176
- Args:
177
- buildkite_token: the secret token to authenticate GraphQL requests.
178
- last_cursor: the cursor to stop at if set. If None, a full page is fetched.
179
- """
180
- output = []
181
- cursor = None
182
- while True :
183
- page = buildkite_fetch_page_build_list (buildkite_token , cursor )
184
- if len (page ) == 0 :
185
- break
186
- cursor = page [- 1 ]["cursor" ]
187
- output += page
188
- return output
189
-
190
-
191
- def buildkite_get_metrics (
192
- buildkite_token : str , previously_incomplete : set [int ]
193
- ) -> (list [JobMetrics ], set [int ]):
194
- """Returns a tuple with:
195
-
196
- - the metrics recorded for newly completed workflow jobs.
197
- - the set of workflows still running now.
198
-
199
- Args:
200
- buildkite_token: the secret token to authenticate GraphQL requests.
201
- previously_incomplete: the set of running workflows the last time this
202
- function was called.
203
- """
204
-
205
- running_builds = buildkite_get_incomplete_tasks (buildkite_token )
206
- incomplete_now = set ([x ["number" ] for x in running_builds ])
207
- output = []
208
-
209
- for build_id in previously_incomplete :
210
- if build_id in incomplete_now :
211
- continue
212
-
213
- info = buildkite_get_build_info (build_id )
214
- metric_timestamp = dateutil .parser .isoparse (info ["finished_at" ])
215
- for job in info ["jobs" ]:
216
- # This workflow is not interesting to us.
217
- if job ["name" ] not in BUILDKITE_WORKFLOW_TO_TRACK :
218
- continue
219
-
220
- # Don't count canceled jobs.
221
- if job ["canceled_at" ]:
222
- continue
223
-
224
- created_at = dateutil .parser .isoparse (job ["created_at" ])
225
- scheduled_at = dateutil .parser .isoparse (job ["scheduled_at" ])
226
- started_at = dateutil .parser .isoparse (job ["started_at" ])
227
- finished_at = dateutil .parser .isoparse (job ["finished_at" ])
228
-
229
- job_name = BUILDKITE_WORKFLOW_TO_TRACK [job ["name" ]]
230
- queue_time = (started_at - scheduled_at ).seconds
231
- run_time = (finished_at - started_at ).seconds
232
- status = bool (job ["passed" ])
233
-
234
- # Grafana will refuse to ingest metrics older than ~2 hours, so we
235
- # should avoid sending historical data.
236
- metric_age_mn = (
237
- datetime .datetime .now (datetime .timezone .utc ) - metric_timestamp
238
- ).total_seconds () / 60
239
- if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN :
240
- logging .warning (
241
- f"Job { job ['name' ]} from workflow { build_id } dropped due"
242
- + f" to staleness: { metric_age_mn } mn old."
243
- )
244
- continue
245
-
246
- metric_timestamp_ns = int (metric_timestamp .timestamp ()) * 10 ** 9
247
- workflow_id = build_id
248
- workflow_name = "Github pull requests"
249
- output .append (
250
- JobMetrics (
251
- job_name ,
252
- queue_time ,
253
- run_time ,
254
- status ,
255
- metric_timestamp_ns ,
256
- workflow_id ,
257
- workflow_name ,
258
- )
259
- )
260
-
261
- return output , incomplete_now
262
-
263
-
264
74
def github_get_metrics (
265
75
github_repo : github .Repository , last_workflows_seen_as_completed : set [int ]
266
76
) -> tuple [list [JobMetrics ], int ]:
@@ -478,17 +288,13 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
478
288
def main ():
479
289
# Authenticate with Github
480
290
github_auth = Auth .Token (os .environ ["GITHUB_TOKEN" ])
481
- buildkite_token = os .environ ["BUILDKITE_TOKEN" ]
482
291
grafana_api_key = os .environ ["GRAFANA_API_KEY" ]
483
292
grafana_metrics_userid = os .environ ["GRAFANA_METRICS_USERID" ]
484
293
485
294
# The last workflow this script processed.
486
295
# Because the Github queries are broken, we'll simply log a 'processed'
487
296
# bit for the last COUNT_TO_PROCESS workflows.
488
297
gh_last_workflows_seen_as_completed = set ()
489
- # Stores the list of pending/running builds in BuildKite we need to check
490
- # at the next iteration.
491
- bk_incomplete = set ()
492
298
493
299
# Enter the main loop. Every five minutes we wake up and dump metrics for
494
300
# the relevant jobs.
@@ -500,13 +306,8 @@ def main():
500
306
github_repo , gh_last_workflows_seen_as_completed
501
307
)
502
308
503
- bk_metrics , bk_incomplete = buildkite_get_metrics (
504
- buildkite_token , bk_incomplete
505
- )
506
-
507
- metrics = gh_metrics + bk_metrics
508
- upload_metrics (metrics , grafana_metrics_userid , grafana_api_key )
509
- logging .info (f"Uploaded { len (metrics )} metrics" )
309
+ upload_metrics (gh_metrics , grafana_metrics_userid , grafana_api_key )
310
+ logging .info (f"Uploaded { len (gh_metrics )} metrics" )
510
311
511
312
time .sleep (SCRAPE_INTERVAL_SECONDS )
512
313
0 commit comments