Skip to content

Commit e71f64c

Browse files
committed
[CI] Extend metrics container to log BuildKite metrics
The current container focuses on GitHub metrics. Before deprecating BuildKite, we want to make sure the new infra quality is better, or at least the same. Being able to compare BuildKite metrics with GitHub metrics on Grafana will allow us to easily present the comparison.
1 parent e9de91e commit e71f64c

File tree

1 file changed

+161
-4
lines changed

1 file changed

+161
-4
lines changed

.ci/metrics/metrics.py

Lines changed: 161 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
import requests
2+
import dateutil
3+
import json
24
import time
35
import os
46
from dataclasses import dataclass
@@ -16,6 +18,17 @@
1618
WORKFLOWS_TO_TRACK = ["LLVM Premerge Checks"]
1719
SCRAPE_INTERVAL_SECONDS = 5 * 60
1820

21+
# Number of builds to fetch per page. Since we scrape regularly, this can
22+
# remain small.
23+
BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 10
24+
25+
# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
26+
# the metric name in Grafana. This is important not to lose metrics history
27+
# if the workflow name changes.
28+
BUILDKITE_WORKFLOW_TO_TRACK = {
29+
":linux: Linux x64": "buildkite_linux",
30+
":windows: Windows x64": "buildkite_windows",
31+
}
1932

2033
@dataclass
2134
class JobMetrics:
@@ -35,6 +48,146 @@ class GaugeMetric:
3548
time_ns: int
3649

3750

51+
# Fetches a page of the build list using the GraphQL BuildKite API.
52+
# Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last **finished** builds by
53+
# default, or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE **finished** builds older
54+
# than the one pointed to by
55+
# |cursor| if provided.
56+
# The |cursor| value is taken from the previous page returned by the API.
57+
# The returned data has the following format:
58+
# [
59+
# {
60+
# "cursor": <value>,
61+
# "number": <build-number>,
62+
# }
63+
# ]
64+
def buildkite_fetch_page_build_list(buildkite_token, after_cursor=None):
    """Fetch one page of finished builds through the BuildKite GraphQL API.

    Args:
        buildkite_token: BuildKite API token, sent as a bearer token.
        after_cursor: cursor of a build taken from a previously returned
            page. When given, the page starts right after that build; when
            None, the page starts at the first build returned by the API.

    Returns:
        A list of dictionaries, one per build, each holding the build
        "number" and its pagination "cursor".

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    # Braces are doubled because the template goes through str.format().
    BUILDKITE_GRAPHQL_QUERY = """
    query OrganizationShowQuery {{
      organization(slug: "llvm-project") {{
        pipelines(search: "Github pull requests", first: 1) {{
          edges {{
            node {{
              builds (state: [FAILED, PASSED], first: {PAGE_SIZE}, after: {AFTER}) {{
                edges {{
                  cursor
                  node {{
                    number
                  }}
                }}
              }}
            }}
          }}
        }}
      }}
    }}
    """
    query = BUILDKITE_GRAPHQL_QUERY.format(
        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
    )
    url = "https://graphql.buildkite.com/v1"
    headers = {
        "Authorization": "Bearer " + buildkite_token,
        "Content-Type": "application/json",
    }
    # Let requests serialise the payload: hand-written escaping only covered
    # double quotes and produced invalid JSON for any other special character.
    r = requests.post(url, json={"query": query}, headers=headers)
    # Fail here with a clear HTTP error rather than later with a KeyError on
    # an unexpected response body.
    r.raise_for_status()
    data = r.json()
    # De-nest the build list.
    builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
        "edges"
    ]
    # Fold cursor info into the node dictionary.
    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
104+
105+
106+
# Returns all the info associated with the provided |build_number|.
107+
# Note: for unknown reasons, graphql returns no jobs for a given build, while
108+
# this endpoint does, hence why this uses this API instead of graphql.
109+
def buildkite_get_build_info(build_number):
    """Return the decoded JSON description of build |build_number|.

    The data is read from the build page's ".json" endpoint of the
    "github-pull-requests" pipeline, not from the GraphQL API.
    """
    endpoint_template = (
        "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
    )
    response = requests.get(endpoint_template.format(build_number))
    return response.json()
112+
113+
114+
# Returns the last BUILDKITE_GRAPHQL_BUILDS_PER_PAGE builds by default, or
115+
# until the build pointed to by |last_cursor| is found.
116+
def buildkite_get_builds_up_to(buildkite_token, last_cursor=None):
    """List the builds that precede |last_cursor| in the paginated build list.

    Args:
        buildkite_token: BuildKite API token forwarded to the page fetcher.
        last_cursor: cursor of the build at which to stop. When None, only
            the first page is fetched and returned as is.

    Returns:
        A list of build dictionaries ("number" and "cursor"), in the order
        returned by the API. The build matching |last_cursor| is excluded.
        If the API runs out of builds before |last_cursor| is met, every
        build collected so far is returned.
    """
    # No cursor provided, return the first page.
    if last_cursor is None:
        return buildkite_fetch_page_build_list(buildkite_token)

    output = []
    cursor = None
    while True:
        page = buildkite_fetch_page_build_list(buildkite_token, cursor)
        # An empty page means there is nothing left to paginate. Stop here:
        # indexing page[-1] below would raise an IndexError.
        if not page:
            return output

        for build in page:
            # Cursor found, keep results up to (but excluding) this build.
            if build["cursor"] == last_cursor:
                return output
            output.append(build)

        # Not present in this page, continue with the next one.
        cursor = page[-1]["cursor"]
138+
139+
140+
# Returns a (metrics, cursor) tuple.
141+
# Returns the BuildKite workflow metrics up to the build pointed to by |last_cursor|.
142+
# If |last_cursor| is None, no metrics are returned.
143+
# The returned cursor is either:
144+
# - the last processed build.
145+
# - the last build if no initial cursor was provided.
146+
def buildkite_get_metrics(buildkite_token, last_cursor=None):
    """Collect job metrics for the BuildKite builds not yet recorded.

    Args:
        buildkite_token: BuildKite API token.
        last_cursor: cursor of the last build already recorded, or None on
            the first call.

    Returns:
        A (metrics, cursor) tuple.
        - metrics: a list of JobMetrics, one per tracked job of each new
          build. Empty when |last_cursor| is None.
        - cursor: the cursor to pass to the next call. This is the most
          recent build seen, |last_cursor| itself when there was no new
          build, or None when no build could be listed at all.
    """
    # A bare `import dateutil` does not guarantee that the `parser` submodule
    # is loaded, so import it explicitly before using it.
    import dateutil.parser

    builds = buildkite_get_builds_up_to(buildkite_token, last_cursor)
    # Don't return any metrics if last_cursor is None.
    # This happens when the program starts.
    if last_cursor is None:
        # No build listed yet: keep the cursor unset so the next call retries.
        if not builds:
            return [], None
        return [], builds[0]["cursor"]

    last_recorded_build = last_cursor
    output = []
    # The build list is ordered newest first (builds[0] is the most recent
    # one). Walk it oldest first so the cursor ends on the newest build;
    # otherwise the newer builds would be reported again on the next call.
    for build in reversed(builds):
        info = buildkite_get_build_info(build["number"])
        last_recorded_build = build["cursor"]
        for job in info["jobs"]:
            # Skip the jobs we do not track.
            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
                continue

            raw_timestamps = [
                job[key]
                for key in ("created_at", "scheduled_at", "started_at", "finished_at")
            ]
            # A job with a missing timestamp has no measurable queue/run
            # time; skip it instead of crashing in isoparse().
            if any(value is None for value in raw_timestamps):
                continue
            created_at, scheduled_at, started_at, finished_at = (
                dateutil.parser.isoparse(value) for value in raw_timestamps
            )

            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
            # total_seconds() keeps the days component, which `.seconds`
            # silently drops for durations of 24 hours or more.
            queue_time = int((started_at - scheduled_at).total_seconds())
            run_time = int((finished_at - started_at).total_seconds())
            status = bool(job["passed"])
            created_at_ns = int(created_at.timestamp()) * 10**9
            workflow_id = build["number"]
            workflow_name = "Github pull requests"
            output.append(
                JobMetrics(
                    job_name,
                    queue_time,
                    run_time,
                    status,
                    created_at_ns,
                    workflow_id,
                    workflow_name,
                )
            )

    return output, last_recorded_build
189+
190+
38191
def get_sampled_workflow_metrics(github_repo: github.Repository):
39192
"""Gets global statistics about the Github workflow queue
40193
@@ -105,7 +258,6 @@ def get_sampled_workflow_metrics(github_repo: github.Repository):
105258
)
106259
return workflow_metrics
107260

108-
109261
def get_per_workflow_metrics(
110262
github_repo: github.Repository, workflows_to_track: dict[str, int]
111263
):
@@ -211,7 +363,6 @@ def get_per_workflow_metrics(
211363

212364
return workflow_metrics
213365

214-
215366
def upload_metrics(workflow_metrics, metrics_userid, api_key):
216367
"""Upload metrics to Grafana.
217368
@@ -260,9 +411,12 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
260411
def main():
261412
# Authenticate with Github
262413
auth = Auth.Token(os.environ["GITHUB_TOKEN"])
263-
264414
grafana_api_key = os.environ["GRAFANA_API_KEY"]
265415
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
416+
buildkite_token = os.environ["BUILDKITE_TOKEN"]
417+
418+
# The last buildkite build recorded.
419+
buildkite_last_cursor = None
266420

267421
workflows_to_track = {}
268422
for workflow_to_track in WORKFLOWS_TO_TRACK:
@@ -274,7 +428,10 @@ def main():
274428
github_object = Github(auth=auth)
275429
github_repo = github_object.get_repo("llvm/llvm-project")
276430

277-
current_metrics = get_per_workflow_metrics(github_repo, workflows_to_track)
431+
current_metrics, buildkite_last_cursor = buildkite_get_metrics(
432+
buildkite_token, buildkite_last_cursor
433+
)
434+
current_metrics += get_per_workflow_metrics(github_repo, workflows_to_track)
278435
current_metrics += get_sampled_workflow_metrics(github_repo)
279436

280437
upload_metrics(current_metrics, grafana_metrics_userid, grafana_api_key)

0 commit comments

Comments
 (0)