Skip to content

Commit ff8bf02

Browse files
authored
[CI] Export scraped commit data to a BigQuery dataset (#532)
Currently, the data we scrape and process regarding LLVM commits isn't persistent and cannot be referenced outside of each CronJob invocation. This change uploads scraped and parsed LLVM commit data to a new BigQuery dataset, so that we may access and reuse this data without having to requery and reparse the same commits to llvm-project.
1 parent bc2e038 commit ff8bf02

File tree

3 files changed

+145
-8
lines changed

3 files changed

+145
-8
lines changed

llvm-ops-metrics/ops-container/process_llvm_commits.py

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import git
6+
from google.cloud import bigquery
67
import requests
78

89
GRAFANA_URL = (
@@ -11,6 +12,10 @@
1112
GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql"
1213
REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"
1314

15+
# BigQuery dataset and tables to write metrics to.
16+
OPERATIONAL_METRICS_DATASET = "operational_metrics"
17+
LLVM_COMMITS_TABLE = "llvm_commits"
18+
1419
# How many commits to query the GitHub GraphQL API for at a time.
1520
# Querying too many commits at once often leads to the call failing.
1621
GITHUB_API_BATCH_SIZE = 50
@@ -27,11 +32,23 @@
2732
commit_{commit_sha}:
2833
object(oid:"{commit_sha}") {{
2934
... on Commit {{
35+
author {{
36+
user {{
37+
login
38+
}}
39+
}}
3040
associatedPullRequests(first: 1) {{
3141
totalCount
3242
pullRequest: nodes {{
3343
number
3444
reviewDecision
45+
reviews(first: 10) {{
46+
nodes {{
47+
reviewer: author {{
48+
login
49+
}}
50+
}}
51+
}}
3552
}}
3653
}}
3754
}}
@@ -42,12 +59,14 @@
4259
@dataclasses.dataclass
class LLVMCommitInfo:
  """Metadata for a single commit to llvm/llvm-project.

  The SHA, timestamp, and modified files come straight from the local git
  checkout; the remaining fields start at "unknown" defaults and are filled
  in after the GitHub GraphQL API is queried for author, pull-request, and
  review information.
  """

  commit_sha: str
  commit_timestamp_seconds: int
  files_modified: set[str]
  # GitHub username of author is unknown until API call
  commit_author: str = ""
  has_pull_request: bool = False
  pull_request_number: int = 0
  is_reviewed: bool = False
  is_approved: bool = False
  # GitHub usernames of everyone who reviewed the associated pull request.
  reviewers: set[str] = dataclasses.field(default_factory=set)
5170

5271

5372
def scrape_new_commits_by_date(
@@ -99,7 +118,9 @@ def query_for_reviews(
99118
# Create a map of commit sha to info
100119
new_commits = {
101120
commit.hexsha: LLVMCommitInfo(
102-
commit.hexsha, commit.committed_datetime, commit.committed_date
121+
commit_sha=commit.hexsha,
122+
commit_timestamp_seconds=commit.committed_date,
123+
files_modified=set(commit.stats.files.keys()),
103124
)
104125
for commit in new_commits
105126
}
@@ -140,29 +161,41 @@ def query_for_reviews(
140161
},
141162
json={"query": query},
142163
)
164+
165+
# Exit if API call fails
166+
# A failed API call means a large batch of data is missing and will not be
167+
# reflected in the dashboard. The dashboard will silently misrepresent
168+
# commit data if we continue execution, so it's better to fail loudly.
143169
if response.status_code < 200 or response.status_code >= 300:
144170
logging.error("Failed to query GitHub GraphQL API: %s", response.text)
171+
exit(1)
172+
145173
api_commit_data.update(response.json()["data"]["repository"])
146174

175+
# Amend commit information with GitHub data
147176
for commit_sha, data in api_commit_data.items():
148-
# Verify that push commit has no pull requests
149177
commit_sha = commit_sha.removeprefix("commit_")
178+
commit_info = new_commits[commit_sha]
179+
commit_info.commit_author = data["author"]["user"]["login"]
150180

151181
# If commit has no pull requests, skip it. No data to update.
152182
if data["associatedPullRequests"]["totalCount"] == 0:
153183
continue
154184

155185
pull_request = data["associatedPullRequests"]["pullRequest"][0]
156-
commit_info = new_commits[commit_sha]
157186
commit_info.has_pull_request = True
158-
commit_info.pr_number = pull_request["number"]
187+
commit_info.pull_request_number = pull_request["number"]
159188
commit_info.is_reviewed = pull_request["reviewDecision"] is not None
160189
commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
190+
commit_info.reviewers = set([
191+
review["reviewer"]["login"]
192+
for review in pull_request["reviews"]["nodes"]
193+
])
161194

162195
return list(new_commits.values())
163196

164197

165-
def upload_daily_metrics(
198+
def upload_daily_metrics_to_grafana(
166199
grafana_api_key: str,
167200
grafana_metrics_userid: str,
168201
new_commits: list[LLVMCommitInfo],
@@ -205,6 +238,22 @@ def upload_daily_metrics(
205238
logging.error("Failed to submit data to Grafana: %s", response.text)
206239

207240

241+
def upload_daily_metrics_to_bigquery(new_commits: list[LLVMCommitInfo]) -> None:
  """Upload processed commit metrics to a BigQuery dataset.

  Args:
    new_commits: List of commits to process & upload to BigQuery.
  """
  bq_client = bigquery.Client()
  try:
    table_ref = bq_client.dataset(OPERATIONAL_METRICS_DATASET).table(
        LLVM_COMMITS_TABLE
    )
    table = bq_client.get_table(table_ref)
    # dataclasses.asdict leaves the `files_modified` and `reviewers` fields
    # as sets, which are not JSON-serializable for the streaming-insert
    # payload. Convert them to sorted lists (sorted for determinism).
    commit_records = []
    for commit in new_commits:
      record = dataclasses.asdict(commit)
      record["files_modified"] = sorted(record["files_modified"])
      record["reviewers"] = sorted(record["reviewers"])
      commit_records.append(record)
    # insert_rows returns a list of per-row insert errors; an empty list
    # means every row was accepted. Surface failures instead of silently
    # dropping them.
    errors = bq_client.insert_rows(table, commit_records)
    if errors:
      logging.error("Failed to upload commit rows to BigQuery: %s", errors)
  finally:
    # Release the client's underlying HTTP transport even if an API call
    # above raised.
    bq_client.close()
255+
256+
208257
def main() -> None:
209258
github_token = os.environ["GITHUB_TOKEN"]
210259
grafana_api_key = os.environ["GRAFANA_API_KEY"]
@@ -227,7 +276,12 @@ def main() -> None:
227276
new_commit_info = query_for_reviews(new_commits, github_token)
228277

229278
logging.info("Uploading metrics to Grafana.")
230-
upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
279+
upload_daily_metrics_to_grafana(
280+
grafana_api_key, grafana_metrics_userid, new_commit_info
281+
)
282+
283+
logging.info("Uploading metrics to BigQuery.")
284+
upload_daily_metrics_to_bigquery(new_commit_info)
231285

232286

233287
if __name__ == "__main__":
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
[
2+
{
3+
"name": "commit_sha",
4+
"type": "STRING",
5+
"mode": "NULLABLE",
6+
"description": "Commit hexsha of a commit made to llvm/llvm-project:main"
7+
},
8+
{
9+
"name": "commit_author",
10+
"type": "STRING",
11+
"mode": "NULLABLE",
12+
"description": "GitHub username of the commit author"
13+
},
14+
{
15+
"name": "commit_timestamp_seconds",
16+
"type": "INTEGER",
17+
"mode": "NULLABLE",
18+
"description": "Time this commit was made at, as a Unix timestamp"
19+
},
20+
{
21+
"name": "has_pull_request",
22+
"type": "BOOLEAN",
23+
"mode": "NULLABLE",
24+
"description": "Whether or not this commit has an associated pull request"
25+
},
26+
{
27+
"name": "pull_request_number",
28+
"type": "INTEGER",
29+
"mode": "NULLABLE",
30+
"description": "Number of the pull request associated with this commit"
31+
},
32+
{
33+
"name": "is_reviewed",
34+
"type": "BOOLEAN",
35+
"mode": "NULLABLE",
36+
"description": "Whether or not the pull request for this commit was reviewed"
37+
},
38+
{
39+
"name": "is_approved",
40+
"type": "BOOLEAN",
41+
"mode": "NULLABLE",
42+
"description": "Whether or not the pull request for this commit was approved"
43+
},
44+
{
45+
"name": "reviewers",
46+
"type": "STRING",
47+
"mode": "REPEATED",
48+
"description": "List of GitHub users who reviewed the pull request for this commit"
49+
},
50+
{
51+
"name": "files_modified",
52+
"type": "STRING",
53+
"mode": "REPEATED",
54+
"description": "List of filepaths modified by this commit"
55+
}
56+
]

premerge/main.tf

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,3 +293,30 @@ resource "kubernetes_manifest" "operational_metrics_cronjob" {
293293
kubernetes_service_account.operational_metrics_ksa,
294294
]
295295
}
296+
297+
# BigQuery dataset and table resources
298+
# BigQuery dataset holding operational metrics tables for LLVM commit data.
resource "google_bigquery_dataset" "operational_metrics_dataset" {
  dataset_id  = "operational_metrics"
  description = "Dataset for retaining operational data regarding LLVM commit trends."
}

# Table the metrics container streams parsed commit rows into. The schema
# file must stay in sync with the fields of LLVMCommitInfo in
# process_llvm_commits.py.
resource "google_bigquery_table" "llvm_commits_table" {
  dataset_id  = google_bigquery_dataset.operational_metrics_dataset.dataset_id
  table_id    = "llvm_commits"
  description = "LLVM commit data, including pull request and review activity per commit."

  schema = file("./bigquery_schema/llvm_commits_table_schema.json")

  # No explicit depends_on: the dataset_id reference above already creates
  # an implicit dependency on the dataset resource.
}

# Grant the operational-metrics service account write access to the dataset
# so the CronJob container can stream rows into llvm_commits.
resource "google_bigquery_dataset_iam_binding" "operational_metrics_dataset_editor_binding" {
  dataset_id = google_bigquery_dataset.operational_metrics_dataset.dataset_id
  role       = "roles/bigquery.dataEditor"

  # Implicit dependencies on the dataset and the service account come from
  # the references above and in members, so no depends_on is needed.
  members = [
    "serviceAccount:${google_service_account.operational_metrics_gsa.email}",
  ]
}

0 commit comments

Comments
 (0)