Skip to content

Commit a16b133

Browse files
authored
[CI] Validate scraped push commits via GitHub API (#514)
As GitHubArchive BigQuery is known to be lossy, it is likely that we currently overestimate the number of commits made to llvm/llvm-project without an associated pull request. To remedy this, we can make calls to the [GitHub Event API](https://docs.github.com/en/rest/activity/events). While we want to avoid using the API to get information regarding every single commit made to LLVM, we can narrow our calls down to only commits that don't have any pull request data available via BigQuery. From those calls, we can determine if a "push" commit actually does have a pull request and, if it does, whether or not it has been approved.
1 parent 2bf1614 commit a16b133

File tree

1 file changed

+105
-3
lines changed

1 file changed

+105
-3
lines changed

llvm-ops-metrics/ops-container/process_llvm_commits.py

Lines changed: 105 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9,8 +9,13 @@
99
GRAFANA_URL = (
1010
"https://influx-prod-13-prod-us-east-0.grafana.net/api/v1/push/influx/write"
1111
)
12+
GITHUB_GRAPHQL_API_URL = "https://api.github.com/graphql"
1213
REPOSITORY_URL = "https://github.com/llvm/llvm-project.git"
1314

15+
# How many commits to query the GitHub GraphQL API for at a time.
16+
# Querying too many commits at once often leads to the call failing.
17+
GITHUB_API_BATCH_SIZE = 75
18+
1419
# Number of days to look back for new commits
1520
# We allow some buffer time between when a commit is made and when it is queried
1621
# for reviews. This is to allow time for any events to propagate in the GitHub
@@ -44,6 +49,23 @@
4449
AND JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') IS NOT NULL
4550
"""
4651

52+
# Template GraphQL subquery to check if a commit has an associated pull request
53+
# and whether that pull request has been reviewed and approved.
54+
COMMIT_GRAPHQL_SUBQUERY_TEMPLATE = """
55+
commit_{commit_sha}:
56+
object(oid:"{commit_sha}") {{
57+
... on Commit {{
58+
associatedPullRequests(first: 1) {{
59+
totalCount
60+
pullRequest: nodes {{
61+
number
62+
reviewDecision
63+
}}
64+
}}
65+
}}
66+
}}
67+
"""
68+
4769

4870
@dataclasses.dataclass
4971
class LLVMCommitInfo:
@@ -153,6 +175,85 @@ def query_for_reviews(
153175
return list(new_commits.values())
154176

155177

178+
def validate_push_commits(
    new_commits: list[LLVMCommitInfo], github_token: str
) -> None:
  """Validate that push commits don't have a pull request.

  To address lossiness of data from GitHub Archive BigQuery, we check each
  commit to see if it actually has an associated pull request.

  Commits whose `has_pull_request` is already set are left untouched. For
  every other commit, the GitHub GraphQL API is queried (in batches of
  GITHUB_API_BATCH_SIZE) and, if a pull request is found, the commit's
  `has_pull_request`, `pr_number`, `is_reviewed` and `is_approved` fields are
  amended in place.

  Validation is best-effort: a batch that fails (non-2xx status, or a response
  without usable data) is logged and skipped, leaving those commits unchanged.

  Args:
    new_commits: List of commits to validate. Entries are modified in place.
    github_token: The access token to use with the GitHub GraphQL API.
  """

  # Get all push commits from new commits and form their subqueries
  commit_subqueries = []
  potential_push_commits = {}
  for commit in new_commits:
    if commit.has_pull_request:
      continue
    potential_push_commits[commit.commit_sha] = commit
    commit_subqueries.append(
        COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit.commit_sha)
    )
  logging.info("Found %d potential push commits", len(potential_push_commits))

  # Query GitHub GraphQL API for pull requests associated with push commits
  # We query in batches as large queries often fail
  api_commit_data = {}
  query_template = """
    query {
      repository(owner:"llvm", name:"llvm-project"){
        %s
      }
    }
  """
  # Ceiling division: a plain `// BATCH_SIZE + 1` would add an empty trailing
  # batch whenever the count is 0 or an exact multiple of the batch size, and
  # an empty selection set is not a valid GraphQL query.
  num_batches = (
      len(commit_subqueries) + GITHUB_API_BATCH_SIZE - 1
  ) // GITHUB_API_BATCH_SIZE
  logging.info("Querying GitHub GraphQL API in %d batches", num_batches)
  for i in range(num_batches):
    subquery_batch = commit_subqueries[
        i * GITHUB_API_BATCH_SIZE : (i + 1) * GITHUB_API_BATCH_SIZE
    ]
    query = query_template % "".join(subquery_batch)

    logging.info(
        "Querying batch %d of %d (%d commits)",
        i + 1,
        num_batches,
        len(subquery_batch),
    )
    response = requests.post(
        url=GITHUB_GRAPHQL_API_URL,
        headers={
            "Authorization": f"bearer {github_token}",
        },
        json={"query": query},
    )
    if response.status_code < 200 or response.status_code >= 300:
      logging.error("Failed to query GitHub GraphQL API: %s", response.text)
      # Nothing usable in this response; keep going with the other batches.
      continue

    payload = response.json()
    # GraphQL reports query-level failures in an "errors" member, possibly
    # alongside partial (or null) "data", even when the HTTP status is 2xx.
    if payload.get("errors"):
      logging.error(
          "GitHub GraphQL API returned errors: %s", payload["errors"]
      )
    repository_data = (payload.get("data") or {}).get("repository") or {}
    api_commit_data.update(repository_data)

  amend_count = 0
  for commit_alias, data in api_commit_data.items():
    # Presumably `object(oid:)` resolves to null for an unknown commit and to
    # an empty object for a non-commit git object; skip both rather than crash.
    if not data:
      continue

    # Verify that push commit has no pull requests
    associated_pull_requests = data["associatedPullRequests"]
    if associated_pull_requests["totalCount"] == 0:
      continue

    # Amend fields with new data from API
    commit_sha = commit_alias.removeprefix("commit_")
    pull_request = associated_pull_requests["pullRequest"][0]
    commit_info = potential_push_commits[commit_sha]
    commit_info.has_pull_request = True
    commit_info.pr_number = pull_request["number"]
    commit_info.is_reviewed = pull_request["reviewDecision"] is not None
    commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
    amend_count += 1

  logging.info("Amended %d commits", amend_count)
255+
256+
156257
def upload_daily_metrics(
157258
grafana_api_key: str,
158259
grafana_metrics_userid: str,
@@ -164,9 +265,6 @@ def upload_daily_metrics(
164265
grafana_api_key: The key to make API requests with.
165266
grafana_metrics_userid: The user to make API requests with.
166267
new_commits: List of commits to process & upload to Grafana.
167-
168-
Returns:
169-
None
170268
"""
171269
# Count each type of commit made
172270
approval_count = 0
@@ -200,6 +298,7 @@ def upload_daily_metrics(
200298

201299

202300
def main() -> None:
301+
github_token = os.environ["GITHUB_TOKEN"]
203302
grafana_api_key = os.environ["GRAFANA_API_KEY"]
204303
grafana_metrics_userid = os.environ["GRAFANA_METRICS_USERID"]
205304

@@ -219,6 +318,9 @@ def main() -> None:
219318
logging.info("Querying for reviews of new commits.")
220319
new_commit_info = query_for_reviews(new_commits, date_to_scrape)
221320

321+
logging.info("Validating push commits.")
322+
validate_push_commits(new_commit_info, github_token)
323+
222324
logging.info("Uploading metrics to Grafana.")
223325
upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
224326

0 commit comments

Comments
 (0)