diff --git a/.ci/metrics/metrics.py b/.ci/metrics/metrics.py
index 3878dce342fb4..1463ab43a812e 100644
--- a/.ci/metrics/metrics.py
+++ b/.ci/metrics/metrics.py
@@ -1,6 +1,8 @@
 import collections
 import datetime
+import dateutil.parser
 import github
+import json
 import logging
 import os
 import requests
@@ -53,6 +55,18 @@
 # by trial and error).
 GRAFANA_METRIC_MAX_AGE_MN = 120
 
+# Lists the BuildKite jobs we want to track. Maps the BuildKite job name to
+# the metric name in Grafana. This is important not to lose metrics history
+# if the workflow name changes.
+BUILDKITE_WORKFLOW_TO_TRACK = {
+    ":linux: Linux x64": "buildkite_linux",
+    ":windows: Windows x64": "buildkite_windows",
+}
+
+# Number of builds to fetch per page. Since we scrape regularly, this can
+# remain small.
+BUILDKITE_GRAPHQL_BUILDS_PER_PAGE = 50
+
 @dataclass
 class JobMetrics:
     job_name: str
@@ -70,6 +84,191 @@ class GaugeMetric:
     time_ns: int
 
 
+def buildkite_fetch_page_build_list(
+    buildkite_token: str, after_cursor: str = None
+) -> list[dict[str, str]]:
+    """Fetches a page of the build list using the GraphQL BuildKite API.
+
+    Returns the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE last running/queued builds,
+    or the BUILDKITE_GRAPHQL_BUILDS_PER_PAGE running/queued builds
+    older than the one pointed to by |after_cursor| if provided.
+    The |after_cursor| value is taken from the previous page returned by the
+    API.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+      after_cursor: cursor after which to start the page fetch.
+
+    Returns:
+      The most recent builds after cursor (if set) with the following format:
+      [
+        {
+          "cursor": <value>,
+          "number": <value>,
+        }
+      ]
+    """
+
+    BUILDKITE_GRAPHQL_QUERY = """
+  query OrganizationShowQuery {{
+    organization(slug: "llvm-project") {{
+      pipelines(search: "Github pull requests", first: 1) {{
+        edges {{
+          node {{
+            builds (state: [CANCELING, CREATING, FAILING, RUNNING], first: {PAGE_SIZE}, after: {AFTER}) {{
+              edges {{
+                cursor
+                node {{
+                  number
+                }}
+              }}
+            }}
+          }}
+        }}
+      }}
+    }}
+  }}
+  """
+    query = BUILDKITE_GRAPHQL_QUERY.format(
+        PAGE_SIZE=BUILDKITE_GRAPHQL_BUILDS_PER_PAGE,
+        AFTER="null" if after_cursor is None else '"{}"'.format(after_cursor),
+    )
+    query = json.dumps({"query": query})
+    url = "https://graphql.buildkite.com/v1"
+    headers = {
+        "Authorization": "Bearer " + buildkite_token,
+        "Content-Type": "application/json",
+    }
+    data = requests.post(url, data=query, headers=headers).json()
+    # De-nest the build list.
+    if "errors" in data:
+        logging.warning("Failed to fetch BuildKite jobs: {}".format(data["errors"]))
+        return []
+    builds = data["data"]["organization"]["pipelines"]["edges"][0]["node"]["builds"][
+        "edges"
+    ]
+    # Fold cursor info into the node dictionary.
+    return [{**x["node"], "cursor": x["cursor"]} for x in builds]
+
+
+def buildkite_get_build_info(build_number: str) -> dict:
+    """Returns all the info associated with the provided build number.
+
+    Note: for unknown reasons, graphql returns no jobs for a given build,
+    while this endpoint does, hence why this uses this API instead of graphql.
+
+    Args:
+      build_number: which build number to fetch info for.
+
+    Returns:
+      The info for the target build, a JSON dictionary.
+    """
+
+    URL = "https://buildkite.com/llvm-project/github-pull-requests/builds/{}.json"
+    return requests.get(URL.format(build_number)).json()
+
+
+def buildkite_get_incomplete_tasks(buildkite_token: str) -> list:
+    """Returns all the running/pending BuildKite builds.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+
+    Returns: the full list of running/queued builds, paginated until exhausted.
+    """
+    output = []
+    cursor = None
+    while True:
+        page = buildkite_fetch_page_build_list(buildkite_token, cursor)
+        if len(page) == 0:
+            break
+        cursor = page[-1]["cursor"]
+        output += page
+    return output
+
+
+def buildkite_get_metrics(
+    buildkite_token: str, previously_incomplete: set[int]
+) -> tuple[list[JobMetrics], set[int]]:
+    """Returns a tuple with:
+
+    - the metrics recorded for newly completed workflow jobs.
+    - the set of workflows still running now.
+
+    Args:
+      buildkite_token: the secret token to authenticate GraphQL requests.
+      previously_incomplete: the set of running workflows the last time this
+        function was called.
+    """
+
+    running_builds = buildkite_get_incomplete_tasks(buildkite_token)
+    incomplete_now = {x["number"] for x in running_builds}
+    output = []
+
+    for build_id in previously_incomplete:
+        if build_id in incomplete_now:
+            continue
+
+        info = buildkite_get_build_info(build_id)
+        metric_timestamp = dateutil.parser.isoparse(info["finished_at"])
+        for job in info["jobs"]:
+            # This workflow is not interesting to us.
+            if job["name"] not in BUILDKITE_WORKFLOW_TO_TRACK:
+                continue
+
+            created_at = dateutil.parser.isoparse(job["created_at"])
+            scheduled_at = (
+                created_at
+                if job["scheduled_at"] is None
+                else dateutil.parser.isoparse(job["scheduled_at"])
+            )
+            started_at = (
+                scheduled_at
+                if job["started_at"] is None
+                else dateutil.parser.isoparse(job["started_at"])
+            )
+            if job["canceled_at"] is None:
+                finished_at = (
+                    started_at
+                    if job["finished_at"] is None
+                    else dateutil.parser.isoparse(job["finished_at"])
+                )
+            else:
+                finished_at = dateutil.parser.isoparse(job["canceled_at"])
+
+            job_name = BUILDKITE_WORKFLOW_TO_TRACK[job["name"]]
+            queue_time = int((started_at - scheduled_at).total_seconds())
+            run_time = int((finished_at - started_at).total_seconds())
+            status = bool(job["passed"])
+
+            # Grafana will refuse to ingest metrics older than ~2 hours, so we
+            # should avoid sending historical data.
+            metric_age_mn = (
+                datetime.datetime.now(datetime.timezone.utc) - metric_timestamp
+            ).total_seconds() / 60
+            if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
+                logging.warning(
+                    f"Job {job['name']} from workflow {build_id} dropped due"
+                    + f" to staleness: {metric_age_mn}mn old."
+                )
+                continue
+
+            metric_timestamp_ns = int(metric_timestamp.timestamp()) * 10**9
+            workflow_id = build_id
+            workflow_name = "Github pull requests"
+            output.append(
+                JobMetrics(
+                    job_name,
+                    queue_time,
+                    run_time,
+                    status,
+                    metric_timestamp_ns,
+                    workflow_id,
+                    workflow_name,
+                )
+            )
+
+    return output, incomplete_now
+
 def github_get_metrics(
     github_repo: github.Repository, last_workflows_seen_as_completed: set[int]
 ) -> tuple[list[JobMetrics], int]:
@@ -195,7 +394,7 @@ def github_get_metrics(
         datetime.datetime.now(datetime.timezone.utc) - completed_at
     ).total_seconds() / 60
     if metric_age_mn > GRAFANA_METRIC_MAX_AGE_MN:
-        logging.info(
+        logging.warning(
             f"Job {job.id} from workflow {task.id} dropped due"
             + f" to staleness: {metric_age_mn}mn old."
         )
@@ -292,6 +491,7 @@ def upload_metrics(workflow_metrics, metrics_userid, api_key):
 def main():
     # Authenticate with Github
     github_auth = Auth.Token(os.environ["GITHUB_TOKEN"])
+    buildkite_token = os.environ["BUILDKITE_TOKEN"]
     grafana_api_key = os.environ["GRAFANA_API_KEY"]
     grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
 
@@ -299,6 +499,9 @@ def main():
     # Because the Github queries are broken, we'll simply log a 'processed'
     # bit for the last COUNT_TO_PROCESS workflows.
     gh_last_workflows_seen_as_completed = set()
+    # Stores the list of pending/running builds in BuildKite we need to check
+    # at the next iteration.
+    bk_incomplete = set()
 
     # Enter the main loop. Every five minutes we wake up and dump metrics for
     # the relevant jobs.
@@ -306,9 +509,15 @@ def main():
         github_object = Github(auth=github_auth)
         github_repo = github_object.get_repo("llvm/llvm-project")
 
-        metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
+        gh_metrics, gh_last_workflows_seen_as_completed = github_get_metrics(
             github_repo, gh_last_workflows_seen_as_completed
         )
+
+        bk_metrics, bk_incomplete = buildkite_get_metrics(
+            buildkite_token, bk_incomplete
+        )
+
+        metrics = gh_metrics + bk_metrics
         upload_metrics(metrics, grafana_metrics_userid, grafana_api_key)
 
         logging.info(f"Uploaded {len(metrics)} metrics")