
Commit 772b264

[CI] Use GraphQL API instead of BigQuery to get review data (#525)
Since we are already calling the GitHub GraphQL API for data validation, we can drop the added complexity of using GitHub Archive BigQuery as a data source and query the API directly. BigQuery has the advantage of not being rate-limited, but we often end up querying the API for 50-70 commits anyway because GitHub Archive is missing records for some events. With more than half of the BigQuery data points needing amending via the API, it makes more sense to use the API as the primary data source.
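
For context, the direct-API approach might look like the minimal sketch below. This is illustrative, not the commit's exact code: it assumes the public api.github.com/graphql endpoint and a token with repository read access, the owner/name and function name are placeholders, and the fields mirror what the script reads (associatedPullRequests, number, reviewDecision). The actual script aliases and batches per-commit subqueries instead; see the diff below.

```python
# Minimal sketch of querying the GitHub GraphQL API directly for a commit's
# associated pull request and review decision. Illustrative only: the
# owner/name and the function name are assumptions, not the commit's code.
import requests

GITHUB_GRAPHQL_URL = "https://api.github.com/graphql"

REVIEW_QUERY = """
query {
  repository(owner: "llvm", name: "llvm-project") {
    object(oid: "%s") {
      ... on Commit {
        associatedPullRequests(first: 1) {
          totalCount
          nodes {
            number
            reviewDecision
          }
        }
      }
    }
  }
}
"""


def fetch_review_state(commit_sha: str, github_token: str) -> dict:
  """Returns one commit's associated PR data (number, reviewDecision)."""
  response = requests.post(
      GITHUB_GRAPHQL_URL,
      json={"query": REVIEW_QUERY % commit_sha},
      headers={"Authorization": f"Bearer {github_token}"},
  )
  response.raise_for_status()
  return response.json()["data"]["repository"]["object"]
```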
1 parent 56073a1 commit 772b264

2 files changed (+12, -118 lines)


llvm-ops-metrics/ops-container/process_llvm_commits.py

Lines changed: 12 additions & 107 deletions
```diff
@@ -3,7 +3,6 @@
 import logging
 import os
 import git
-from google.cloud import bigquery
 import requests
 
 GRAFANA_URL = (
@@ -14,41 +13,14 @@
 
 # How many commits to query the GitHub GraphQL API for at a time.
 # Querying too many commits at once often leads to the call failing.
-GITHUB_API_BATCH_SIZE = 75
+GITHUB_API_BATCH_SIZE = 50
 
 # Number of days to look back for new commits
 # We allow some buffer time between when a commit is made and when it is queried
 # for reviews. This is allow time for any events to propogate in the GitHub
 # Archive BigQuery tables.
 LOOKBACK_DAYS = 2
 
-# Template query to find pull requests associated with commits on a given day.
-# Searches for pull requests within a lower and upper bound of Github Archive
-# event dates.
-GITHUB_ARCHIVE_REVIEW_QUERY = """
-WITH PullRequestReviews AS (
-  SELECT DISTINCT
-    JSON_VALUE(payload, '$.pull_request.id') AS pr_id,
-    JSON_VALUE(payload, '$.review.state') as review_state,
-  FROM `githubarchive.day.20*`
-  WHERE
-    repo.id = 75821432
-    AND `type` = 'PullRequestReviewEvent'
-    AND (_TABLE_SUFFIX BETWEEN '{lower_review_bound}' AND '{upper_review_bound}')
-)
-SELECT DISTINCT
-  JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') AS merge_commit_sha,
-  JSON_VALUE(pr_event.payload, '$.pull_request.number') AS pull_request_number,
-  pr_review.review_state as review_state
-FROM `githubarchive.day.{commit_date}` AS pr_event
-LEFT JOIN PullRequestReviews as pr_review ON
-  JSON_VALUE(pr_event.payload, '$.pull_request.id') = pr_review.pr_id # PR ID should match the review events
-WHERE
-  pr_event.repo.id = 75821432
-  AND pr_event.`type` = 'PullRequestEvent'
-  AND JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') IS NOT NULL
-"""
-
 # Template GraphQL subquery to check if a commit has an associated pull request
 # and whether that pull request has been reviewed and approved.
 COMMIT_GRAPHQL_SUBQUERY_TEMPLATE = """
@@ -113,26 +85,17 @@ def scrape_new_commits_by_date(
 
 
 def query_for_reviews(
-    new_commits: list[git.Commit], commit_datetime: datetime.datetime
+    new_commits: list[git.Commit], github_token: str
 ) -> list[LLVMCommitInfo]:
-  """Query GitHub Archive BigQuery for reviews of new commits.
+  """Query GitHub GraphQL API for reviews of new commits.
 
   Args:
     new_commits: List of new commits to query for reviews.
-    commit_datetime: The date that the new commits were made on.
+    github_token: The access token to use with the GitHub GraphQL API.
 
   Returns:
     List of LLVMCommitInfo objects for each commit's review information.
   """
-
-  # Search for reviews in the last 4 weeks
-  earliest_review_date = (
-      commit_datetime - datetime.timedelta(weeks=4)
-  ).strftime("%Y%m%d")
-  latest_review_date = datetime.datetime.now(datetime.timezone.utc).strftime(
-      "%Y%m%d"
-  )
-
   # Create a map of commit sha to info
   new_commits = {
       commit.hexsha: LLVMCommitInfo(
@@ -141,67 +104,13 @@ def query_for_reviews(
       for commit in new_commits
   }
 
-  # Query each relevant daily GitHub Archive table
-  query = GITHUB_ARCHIVE_REVIEW_QUERY.format(
-      commit_date=commit_datetime.strftime("%Y%m%d"),
-      lower_review_bound=earliest_review_date.removeprefix("20"),
-      upper_review_bound=latest_review_date.removeprefix("20"),
-  )
-  bq_client = bigquery.Client()
-  query_job = bq_client.query(query)
-  results = query_job.result()
-
-  # Process each found merge commit
-  for row in results:
-    # If this commit is irrelevant, skip it
-    # Not every merge_commit_sha makes it into main, a "merge commit" can mean
-    # different things depending on the state of the pull request.
-    # docs.github.com/en/rest/pulls/pulls#get-a-pull-request for more details.
-    merge_commit_sha = row["merge_commit_sha"]
-    if merge_commit_sha not in new_commits:
-      continue
-
-    commit_info = new_commits[merge_commit_sha]
-    commit_info.has_pull_request = True
-    commit_info.pr_number = row["pull_request_number"]
-    commit_info.is_reviewed = row["review_state"] is not None
-    commit_info.is_approved = row["review_state"] == "approved"
-
-  logging.info(
-      "Total gigabytes processed: %d GB",
-      query_job.total_bytes_processed / (1024**3),
-  )
-
-  return list(new_commits.values())
-
-
-def validate_push_commits(
-    new_commits: list[LLVMCommitInfo], github_token: str
-) -> None:
-  """Validate that push commits don't have a pull request.
-
-  To address lossiness of data from GitHub Archive BigQuery, we check each
-  commit to see if it actually has an associated pull request.
-
-  Args:
-    new_commits: List of commits to validate.
-    github_token: The access token to use with the GitHub GraphQL API.
-  """
-
-  # Get all push commits from new commits and form their subqueries
+  # Create GraphQL subqueries for each commit
   commit_subqueries = []
-  potential_push_commits = {}
-  for commit in new_commits:
-    if commit.has_pull_request:
-      continue
-    potential_push_commits[commit.commit_sha] = commit
+  for commit_sha in new_commits:
     commit_subqueries.append(
-        COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit.commit_sha)
+        COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit_sha)
     )
-  logging.info("Found %d potential push commits", len(potential_push_commits))
 
-  # Query GitHub GraphQL API for pull requests associated with push commits
-  # We query in batches as large queries often fail
   api_commit_data = {}
   query_template = """
   query {
@@ -235,23 +144,22 @@ def validate_push_commits(
       logging.error("Failed to query GitHub GraphQL API: %s", response.text)
     api_commit_data.update(response.json()["data"]["repository"])
 
-  amend_count = 0
   for commit_sha, data in api_commit_data.items():
     # Verify that push commit has no pull requests
     commit_sha = commit_sha.removeprefix("commit_")
+
+    # If commit has no pull requests, skip it. No data to update.
     if data["associatedPullRequests"]["totalCount"] == 0:
       continue
 
-    # Amend fields with new data from API
     pull_request = data["associatedPullRequests"]["pullRequest"][0]
-    commit_info = potential_push_commits[commit_sha]
+    commit_info = new_commits[commit_sha]
     commit_info.has_pull_request = True
     commit_info.pr_number = pull_request["number"]
     commit_info.is_reviewed = pull_request["reviewDecision"] is not None
     commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
-    amend_count += 1
 
-  logging.info("Amended %d commits", amend_count)
+  return list(new_commits.values())
 
 
 def upload_daily_metrics(
@@ -316,10 +224,7 @@ def main() -> None:
     return
 
   logging.info("Querying for reviews of new commits.")
-  new_commit_info = query_for_reviews(new_commits, date_to_scrape)
-
-  logging.info("Validating push commits.")
-  validate_push_commits(new_commit_info, github_token)
+  new_commit_info = query_for_reviews(new_commits, github_token)
 
   logging.info("Uploading metrics to Grafana.")
   upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
```
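
Because large GraphQL queries often fail, the script sends its per-commit subqueries in batches of GITHUB_API_BATCH_SIZE and merges the per-batch results into one dict. The sketch below shows that pattern using names from the diff above; it is an illustration, not the commit's code. The body of query_template is elided in the hunks, so its wrapper shape, the `commit_subqueries` format field, and the `query_in_batches` function name are assumptions.

```python
# Illustrative sketch of the batching pattern around GITHUB_API_BATCH_SIZE.
# Assumes query_template wraps the joined subqueries roughly as
# `query { repository(owner: ..., name: ...) { <subqueries> } }`,
# which is what reading response.json()["data"]["repository"] implies.
import logging
import requests

GITHUB_API_BATCH_SIZE = 50


def query_in_batches(
    commit_subqueries: list[str], query_template: str, github_token: str
) -> dict:
  """Sends subqueries in batches and merges the per-commit results."""
  api_commit_data = {}
  for i in range(0, len(commit_subqueries), GITHUB_API_BATCH_SIZE):
    batch = commit_subqueries[i : i + GITHUB_API_BATCH_SIZE]
    query = query_template.format(commit_subqueries="\n".join(batch))
    response = requests.post(
        "https://api.github.com/graphql",
        json={"query": query},
        headers={"Authorization": f"Bearer {github_token}"},
    )
    if response.status_code != 200:
      logging.error("Failed to query GitHub GraphQL API: %s", response.text)
    # Each subquery aliases its result as `commit_<sha>`, so merging batch
    # responses keys the combined data by commit SHA.
    api_commit_data.update(response.json()["data"]["repository"])
  return api_commit_data
```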

premerge/main.tf

Lines changed: 0 additions & 11 deletions
```diff
@@ -231,17 +231,6 @@ resource "google_service_account" "operational_metrics_gsa" {
   display_name = "Operational Metrics GSA"
 }
 
-resource "google_project_iam_binding" "bigquery_jobuser_binding" {
-  project = google_service_account.operational_metrics_gsa.project
-  role    = "roles/bigquery.jobUser"
-
-  members = [
-    "serviceAccount:${google_service_account.operational_metrics_gsa.email}",
-  ]
-
-  depends_on = [google_service_account.operational_metrics_gsa]
-}
-
 resource "kubernetes_namespace" "operational_metrics" {
   metadata {
     name = "operational-metrics"
```
