Skip to content

Commit ff8bf02

Browse files
authored
[CI] Export scraped commit data to a BigQuery dataset (#532)
Currently, the data we scrape and process regarding LLVM commits isn't persistent and cannot be referenced outside of each CronJob invocation. This change uploads scraped and parsed LLVM commit data to a new BigQuery dataset, so that we may access and reuse this data without having to requery and reparse the same commits to llvm-project.
1 parent bc2e038 commit ff8bf02

File tree

3 files changed

+145
-8
lines changed

3 files changed

+145
-8
lines changed

llvm-ops-metrics/ops-container/process_llvm_commits.py

Lines changed: 62 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
import logging
44
import os
55
import git
6+
from google.cloud import bigquery
67
import requests
78

89
GRAFANA_URL = (
@@ -11,6 +12,10 @@
1112
GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql"
1213
REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"
1314

15+
# BigQuery dataset and tables to write metrics to.
16+
OPERATIONAL_METRICS_DATASET = "operational_metrics"
17+
LLVM_COMMITS_TABLE = "llvm_commits"
18+
1419
# How many commits to query the GitHub GraphQL API for at a time.
1520
# Querying too many commits at once often leads to the call failing.
1621
GITHUB_API_BATCH_SIZE = 50
@@ -27,11 +32,23 @@
2732
commit_{commit_sha}:
2833
object(oid:"{commit_sha}") {{
2934
... on Commit {{
35+
author {{
36+
user {{
37+
login
38+
}}
39+
}}
3040
associatedPullRequests(first: 1) {{
3141
totalCount
3242
pullRequest: nodes {{
3343
number
3444
reviewDecision
45+
reviews(first: 10) {{
46+
nodes {{
47+
reviewer: author {{
48+
login
49+
}}
50+
}}
51+
}}
3552
}}
3653
}}
3754
}}
@@ -42,12 +59,14 @@
4259
@dataclasses.dataclass
class LLVMCommitInfo:
  """Metadata for a single commit to llvm/llvm-project.

  The SHA, timestamp, and modified files come straight from the local git
  checkout; the remaining fields start at "unknown" defaults and are filled
  in after the GitHub GraphQL API is queried for author, pull-request, and
  review information.
  """

  commit_sha: str
  commit_timestamp_seconds: int
  files_modified: set[str]
  # GitHub username of author is unknown until API call
  commit_author: str = ""
  has_pull_request: bool = False
  pull_request_number: int = 0
  is_reviewed: bool = False
  is_approved: bool = False
  # GitHub usernames of everyone who reviewed the associated pull request.
  reviewers: set[str] = dataclasses.field(default_factory=set)
5170

5271

5372
def scrape_new_commits_by_date(
@@ -99,7 +118,9 @@ def query_for_reviews(
99118
# Create a map of commit sha to info
100119
new_commits = {
101120
commit.hexsha: LLVMCommitInfo(
102-
commit.hexsha, commit.committed_datetime, commit.committed_date
121+
commit_sha=commit.hexsha,
122+
commit_timestamp_seconds=commit.committed_date,
123+
files_modified=set(commit.stats.files.keys()),
103124
)
104125
for commit in new_commits
105126
}
@@ -140,29 +161,41 @@ def query_for_reviews(
140161
},
141162
json={"query": query},
142163
)
164+
165+
# Exit if API call fails
166+
# A failed API call means a large batch of data is missing and will not be
167+
# reflected in the dashboard. The dashboard will silently misrepresent
168+
# commit data if we continue execution, so it's better to fail loudly.
143169
if response.status_code < 200 or response.status_code >= 300:
144170
logging.error("Failed to query GitHub GraphQL API: %s", response.text)
171+
exit(1)
172+
145173
api_commit_data.update(response.json()["data"]["repository"])
146174

175+
# Amend commit information with GitHub data
147176
for commit_sha, data in api_commit_data.items():
148-
# Verify that push commit has no pull requests
149177
commit_sha = commit_sha.removeprefix("commit_")
178+
commit_info = new_commits[commit_sha]
179+
commit_info.commit_author = data["author"]["user"]["login"]
150180

151181
# If commit has no pull requests, skip it. No data to update.
152182
if data["associatedPullRequests"]["totalCount"] == 0:
153183
continue
154184

155185
pull_request = data["associatedPullRequests"]["pullRequest"][0]
156-
commit_info = new_commits[commit_sha]
157186
commit_info.has_pull_request = True
158-
commit_info.pr_number = pull_request["number"]
187+
commit_info.pull_request_number = pull_request["number"]
159188
commit_info.is_reviewed = pull_request["reviewDecision"] is not None
160189
commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
190+
commit_info.reviewers = set([
191+
review["reviewer"]["login"]
192+
for review in pull_request["reviews"]["nodes"]
193+
])
161194

162195
return list(new_commits.values())
163196

164197

165-
def upload_daily_metrics(
198+
def upload_daily_metrics_to_grafana(
166199
grafana_api_key: str,
167200
grafana_metrics_userid: str,
168201
new_commits: list[LLVMCommitInfo],
@@ -205,6 +238,22 @@ def upload_daily_metrics(
205238
logging.error("Failed to submit data to Grafana: %s", response.text)
206239

207240

241+
def upload_daily_metrics_to_bigquery(new_commits: list[LLVMCommitInfo]) -> None:
  """Upload processed commit metrics to a BigQuery dataset.

  Args:
    new_commits: List of commits to process & upload to BigQuery.
  """
  bq_client = bigquery.Client()
  try:
    table_ref = bq_client.dataset(OPERATIONAL_METRICS_DATASET).table(
        LLVM_COMMITS_TABLE
    )
    table = bq_client.get_table(table_ref)
    # dataclasses.asdict leaves the `files_modified` and `reviewers` fields
    # as sets, which are not JSON-serializable for the streaming-insert
    # payload. Convert them to sorted lists (sorted for determinism).
    commit_records = []
    for commit in new_commits:
      record = dataclasses.asdict(commit)
      record["files_modified"] = sorted(record["files_modified"])
      record["reviewers"] = sorted(record["reviewers"])
      commit_records.append(record)
    # insert_rows returns a list of per-row insert errors; an empty list
    # means every row was accepted. Surface failures instead of silently
    # dropping them.
    errors = bq_client.insert_rows(table, commit_records)
    if errors:
      logging.error("Failed to upload commit rows to BigQuery: %s", errors)
  finally:
    # Release the client's underlying HTTP transport even if an API call
    # above raised.
    bq_client.close()
255+
256+
208257
def main() -> None:
209258
github_token = os.environ["GITHUB_TOKEN"]
210259
grafana_api_key = os.environ["GRAFANA_API_KEY"]
@@ -227,7 +276,12 @@ def main() -> None:
227276
new_commit_info = query_for_reviews(new_commits, github_token)
228277

229278
logging.info("Uploading metrics to Grafana.")
230-
upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
279+
upload_daily_metrics_to_grafana(
280+
grafana_api_key, grafana_metrics_userid, new_commit_info
281+
)
282+
283+
logging.info("Uploading metrics to BigQuery.")
284+
upload_daily_metrics_to_bigquery(new_commit_info)
231285

232286

233287
if __name__ == "__main__":
Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,56 @@
1+
[
2+
{
3+
"name": "commit_sha",
4+
"type": "STRING",
5+
"mode": "NULLABLE",
6+
"description": "Commit hexsha of a commit made to llvm/llvm-project:main"
7+
},
8+
{
9+
"name": "commit_author",
10+
"type": "STRING",
11+
"mode": "NULLABLE",
12+
"description": "GitHub username of the commit author"
13+
},
14+
{
15+
"name": "commit_timestamp_seconds",
16+
"type": "INTEGER",
17+
"mode": "NULLABLE",
18+
"description": "Time this commit was made at, as a Unix timestamp"
19+
},
20+
{
21+
"name": "has_pull_request",
22+
"type": "BOOLEAN",
23+
"mode": "NULLABLE",
24+
"description": "Whether or not this commit has an associated pull request"
25+
},
26+
{
27+
"name": "pull_request_number",
28+
"type": "INTEGER",
29+
"mode": "NULLABLE",
30+
"description": "Number of the pull request associated with this commit"
31+
},
32+
{
33+
"name": "is_reviewed",
34+
"type": "BOOLEAN",
35+
"mode": "NULLABLE",
36+
"description": "Whether or not the pull request for this commit was reviewed"
37+
},
38+
{
39+
"name": "is_approved",
40+
"type": "BOOLEAN",
41+
"mode": "NULLABLE",
42+
"description": "Whether or not the pull request for this commit was approved"
43+
},
44+
{
45+
"name": "reviewers",
46+
"type": "STRING",
47+
"mode": "REPEATED",
48+
"description": "List of GitHub users who reviewed the pull request for this commit"
49+
},
50+
{
51+
"name": "files_modified",
52+
"type": "STRING",
53+
"mode": "REPEATED",
54+
"description": "List of filepaths modified by this commit"
55+
}
56+
]

premerge/main.tf

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -293,3 +293,30 @@ resource "kubernetes_manifest" "operational_metrics_cronjob" {
293293
kubernetes_service_account.operational_metrics_ksa,
294294
]
295295
}
296+
297+
# BigQuery dataset and table resources
298+
# BigQuery dataset holding operational metrics tables for LLVM commit data.
resource "google_bigquery_dataset" "operational_metrics_dataset" {
  dataset_id  = "operational_metrics"
  description = "Dataset for retaining operational data regarding LLVM commit trends."
}

# Table the metrics container streams parsed commit rows into. The schema
# file must stay in sync with the fields of LLVMCommitInfo in
# process_llvm_commits.py.
resource "google_bigquery_table" "llvm_commits_table" {
  dataset_id  = google_bigquery_dataset.operational_metrics_dataset.dataset_id
  table_id    = "llvm_commits"
  description = "LLVM commit data, including pull request and review activity per commit."

  schema = file("./bigquery_schema/llvm_commits_table_schema.json")

  # No explicit depends_on: the dataset_id reference above already creates
  # an implicit dependency on the dataset resource.
}

# Grant the operational-metrics service account write access to the dataset
# so the CronJob container can stream rows into llvm_commits.
resource "google_bigquery_dataset_iam_binding" "operational_metrics_dataset_editor_binding" {
  dataset_id = google_bigquery_dataset.operational_metrics_dataset.dataset_id
  role       = "roles/bigquery.dataEditor"

  # Implicit dependencies on the dataset and the service account come from
  # the references above and in members, so no depends_on is needed.
  members = [
    "serviceAccount:${google_service_account.operational_metrics_gsa.email}",
  ]
}

0 commit comments

Comments
 (0)