 import logging
 import os
 import git
-from google.cloud import bigquery
 import requests

 GRAFANA_URL = (
 # How many commits to query the GitHub GraphQL API for at a time.
 # Querying too many commits at once often leads to the call failing.
-GITHUB_API_BATCH_SIZE = 75
+GITHUB_API_BATCH_SIZE = 50

 # Number of days to look back for new commits
 # We allow some buffer time between when a commit is made and when it is queried
 # for reviews. This is to allow time for any events to propagate in the GitHub
 # Archive BigQuery tables.
 LOOKBACK_DAYS = 2

-# Template query to find pull requests associated with commits on a given day.
-# Searches for pull requests within a lower and upper bound of Github Archive
-# event dates.
-GITHUB_ARCHIVE_REVIEW_QUERY = """
-WITH PullRequestReviews AS (
-  SELECT DISTINCT
-    JSON_VALUE(payload, '$.pull_request.id') AS pr_id,
-    JSON_VALUE(payload, '$.review.state') as review_state,
-  FROM `githubarchive.day.20*`
-  WHERE
-    repo.id = 75821432
-    AND `type` = 'PullRequestReviewEvent'
-    AND (_TABLE_SUFFIX BETWEEN '{lower_review_bound}' AND '{upper_review_bound}')
-)
-SELECT DISTINCT
-  JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') AS merge_commit_sha,
-  JSON_VALUE(pr_event.payload, '$.pull_request.number') AS pull_request_number,
-  pr_review.review_state as review_state
-FROM `githubarchive.day.{commit_date}` AS pr_event
-LEFT JOIN PullRequestReviews as pr_review ON
-  JSON_VALUE(pr_event.payload, '$.pull_request.id') = pr_review.pr_id # PR ID should match the review events
-WHERE
-  pr_event.repo.id = 75821432
-  AND pr_event.`type` = 'PullRequestEvent'
-  AND JSON_VALUE(pr_event.payload, '$.pull_request.merge_commit_sha') IS NOT NULL
-"""
-
 # Template GraphQL subquery to check if a commit has an associated pull request
 # and whether that pull request has been reviewed and approved.
 COMMIT_GRAPHQL_SUBQUERY_TEMPLATE = """
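
Note: the body of COMMIT_GRAPHQL_SUBQUERY_TEMPLATE falls outside this hunk. A plausible sketch of it, inferred only from how the results are consumed later in the diff (keys prefixed with "commit_", and the associatedPullRequests, totalCount, pullRequest, number, and reviewDecision accesses), is:

# Hypothetical reconstruction, not the PR's verbatim template. Literal GraphQL
# braces are doubled so that str.format() only substitutes {commit_sha}; the
# "pullRequest: nodes" alias matches the later
# data["associatedPullRequests"]["pullRequest"][0] access.
COMMIT_GRAPHQL_SUBQUERY_TEMPLATE = """
  commit_{commit_sha}: object(oid: "{commit_sha}") {{
    ... on Commit {{
      associatedPullRequests(first: 1) {{
        totalCount
        pullRequest: nodes {{
          number
          reviewDecision
        }}
      }}
    }}
  }}
"""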
@@ -113,26 +85,17 @@ def scrape_new_commits_by_date(


 def query_for_reviews(
-    new_commits: list[git.Commit], commit_datetime: datetime.datetime
+    new_commits: list[git.Commit], github_token: str
 ) -> list[LLVMCommitInfo]:
-  """Query GitHub Archive BigQuery for reviews of new commits.
+  """Query GitHub GraphQL API for reviews of new commits.

   Args:
     new_commits: List of new commits to query for reviews.
-    commit_datetime: The date that the new commits were made on.
+    github_token: The access token to use with the GitHub GraphQL API.

   Returns:
     List of LLVMCommitInfo objects for each commit's review information.
   """
-
-  # Search for reviews in the last 4 weeks
-  earliest_review_date = (
-      commit_datetime - datetime.timedelta(weeks=4)
-  ).strftime("%Y%m%d")
-  latest_review_date = datetime.datetime.now(datetime.timezone.utc).strftime(
-      "%Y%m%d"
-  )
-
   # Create a map of commit sha to info
   new_commits = {
       commit.hexsha: LLVMCommitInfo(
@@ -141,67 +104,13 @@ def query_for_reviews(
       for commit in new_commits
   }

-  # Query each relevant daily GitHub Archive table
-  query = GITHUB_ARCHIVE_REVIEW_QUERY.format(
-      commit_date=commit_datetime.strftime("%Y%m%d"),
-      lower_review_bound=earliest_review_date.removeprefix("20"),
-      upper_review_bound=latest_review_date.removeprefix("20"),
-  )
-  bq_client = bigquery.Client()
-  query_job = bq_client.query(query)
-  results = query_job.result()
-
-  # Process each found merge commit
-  for row in results:
-    # If this commit is irrelevant, skip it
-    # Not every merge_commit_sha makes it into main, a "merge commit" can mean
-    # different things depending on the state of the pull request.
-    # docs.github.com/en/rest/pulls/pulls#get-a-pull-request for more details.
-    merge_commit_sha = row["merge_commit_sha"]
-    if merge_commit_sha not in new_commits:
-      continue
-
-    commit_info = new_commits[merge_commit_sha]
-    commit_info.has_pull_request = True
-    commit_info.pr_number = row["pull_request_number"]
-    commit_info.is_reviewed = row["review_state"] is not None
-    commit_info.is_approved = row["review_state"] == "approved"
-
-  logging.info(
-      "Total gigabytes processed: %d GB",
-      query_job.total_bytes_processed / (1024**3),
-  )
-
-  return list(new_commits.values())
-
-
-def validate_push_commits(
-    new_commits: list[LLVMCommitInfo], github_token: str
-) -> None:
-  """Validate that push commits don't have a pull request.
-
-  To address lossiness of data from GitHub Archive BigQuery, we check each
-  commit to see if it actually has an associated pull request.
-
-  Args:
-    new_commits: List of commits to validate.
-    github_token: The access token to use with the GitHub GraphQL API.
-  """
-
-  # Get all push commits from new commits and form their subqueries
+  # Create GraphQL subqueries for each commit
   commit_subqueries = []
-  potential_push_commits = {}
-  for commit in new_commits:
-    if commit.has_pull_request:
-      continue
-    potential_push_commits[commit.commit_sha] = commit
+  for commit_sha in new_commits:
     commit_subqueries.append(
-        COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit.commit_sha)
+        COMMIT_GRAPHQL_SUBQUERY_TEMPLATE.format(commit_sha=commit_sha)
     )
-  logging.info("Found %d potential push commits", len(potential_push_commits))

-  # Query GitHub GraphQL API for pull requests associated with push commits
-  # We query in batches as large queries often fail
   api_commit_data = {}
   query_template = """
    query {
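
Note: query_template is also truncated by this hunk. Since the per-commit subqueries get spliced into it and the response is unwrapped via response.json()["data"]["repository"] in the next hunk, the wrapper presumably looks something like the sketch below; the owner/name pair is an assumption based on the LLVMCommitInfo naming and the hardcoded repo.id 75821432 in the removed SQL.

# Hypothetical sketch; only the "repository" wrapper is confirmed by the
# response handling in the next hunk.
query_template = """
  query {{
    repository(owner: "llvm", name: "llvm-project") {{
      {commit_subqueries}
    }}
  }}
"""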
@@ -235,23 +144,22 @@ def validate_push_commits(
       logging.error("Failed to query GitHub GraphQL API: %s", response.text)
     api_commit_data.update(response.json()["data"]["repository"])

-  amend_count = 0
   for commit_sha, data in api_commit_data.items():
     # Verify that push commit has no pull requests
     commit_sha = commit_sha.removeprefix("commit_")
+
+    # If commit has no pull requests, skip it. No data to update.
     if data["associatedPullRequests"]["totalCount"] == 0:
       continue

-    # Amend fields with new data from API
     pull_request = data["associatedPullRequests"]["pullRequest"][0]
-    commit_info = potential_push_commits[commit_sha]
+    commit_info = new_commits[commit_sha]
     commit_info.has_pull_request = True
     commit_info.pr_number = pull_request["number"]
     commit_info.is_reviewed = pull_request["reviewDecision"] is not None
     commit_info.is_approved = pull_request["reviewDecision"] == "APPROVED"
-    amend_count += 1

-  logging.info("Amended %d commits", amend_count)
+  return list(new_commits.values())


 def upload_daily_metrics(
@@ -316,10 +224,7 @@ def main() -> None:
     return

   logging.info("Querying for reviews of new commits.")
-  new_commit_info = query_for_reviews(new_commits, date_to_scrape)
-
-  logging.info("Validating push commits.")
-  validate_push_commits(new_commit_info, github_token)
+  new_commit_info = query_for_reviews(new_commits, github_token)

   logging.info("Uploading metrics to Grafana.")
   upload_daily_metrics(grafana_api_key, grafana_metrics_userid, new_commit_info)
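
For context, the middle hunks center on a batched GraphQL call whose full body lies outside the diff. A rough sketch of how the visible pieces plausibly fit together follows. GITHUB_API_BATCH_SIZE, query_template, commit_subqueries, api_commit_data, the error log line, and the .update(...) call all appear in the diff; the chunking loop and variable names are assumptions, and the endpoint URL and Bearer header are standard GitHub GraphQL usage rather than anything shown here.

# Sketch under the assumptions above, not the PR's verbatim code.
for i in range(0, len(commit_subqueries), GITHUB_API_BATCH_SIZE):
  # Splice one batch of per-commit subqueries into a single query document.
  query = query_template.format(
      commit_subqueries="\n".join(
          commit_subqueries[i : i + GITHUB_API_BATCH_SIZE]
      )
  )
  response = requests.post(
      "https://api.github.com/graphql",
      headers={"Authorization": f"Bearer {github_token}"},
      json={"query": query},
  )
  if response.status_code != 200:
    logging.error("Failed to query GitHub GraphQL API: %s", response.text)
  api_commit_data.update(response.json()["data"]["repository"])

Capping each request at 50 subqueries matches the new GITHUB_API_BATCH_SIZE value and the removed comment noting that overly large queries often fail.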