Skip to content

Commit 11e09ca

Browse files
authored
Bug 2021787 - Rate limiting is failing on Github ETL when retrieving pull request data from mozilla-conduit/lando repository
1 parent f096724 commit 11e09ca

File tree

1 file changed

+40
-49
lines changed

1 file changed

+40
-49
lines changed

main.py

Lines changed: 40 additions & 49 deletions
Original file line numberDiff line numberDiff line change
@@ -70,17 +70,7 @@ def extract_pull_requests(
7070
pages = 0
7171

7272
while True:
73-
resp = session.get(base_url, params=params)
74-
if (
75-
resp.status_code == 403
76-
and int(resp.headers.get("X-RateLimit-Remaining", "1")) == 0
77-
):
78-
sleep_for_rate_limit(resp)
79-
# retry same URL/params after sleeping
80-
continue
81-
if resp.status_code != 200:
82-
error_text = resp.text if resp.text else "No response text"
83-
raise SystemExit(f"GitHub API error {resp.status_code}: {error_text}")
73+
resp = github_get(session, base_url, params=params)
8474

8575
batch = resp.json()
8676
pages += 1
@@ -164,26 +154,13 @@ def extract_commits(
164154

165155
logger.info(f"Commits URL: {commits_url}")
166156

167-
resp = session.get(commits_url)
168-
if (
169-
resp.status_code == 403
170-
and int(resp.headers.get("X-RateLimit-Remaining", "1")) == 0
171-
):
172-
sleep_for_rate_limit(resp)
173-
resp = session.get(commits_url)
174-
if resp.status_code != 200:
175-
raise SystemExit(f"GitHub API error {resp.status_code}: {resp.text}")
157+
resp = github_get(session, commits_url)
176158

177159
commits = resp.json()
178160
for commit in commits:
179161
commit_sha = commit.get("sha")
180162
commit_url = f"{api_base}/repos/{repo}/commits/{commit_sha}"
181-
commit_resp = session.get(commit_url)
182-
if commit_resp.status_code != 200:
183-
raise SystemExit(
184-
f"GitHub API error {commit_resp.status_code}: {commit_resp.text}"
185-
)
186-
commit_data = commit_resp.json()
163+
commit_data = github_get(session, commit_url).json()
187164
commit["files"] = commit_data.get("files", [])
188165

189166
logger.info(f"Extracted {len(commits)} commits for PR #{pr_number}")
@@ -216,17 +193,7 @@ def extract_reviewers(
216193

217194
logger.info(f"Reviewers URL: {reviewers_url}")
218195

219-
resp = session.get(reviewers_url)
220-
if (
221-
resp.status_code == 403
222-
and int(resp.headers.get("X-RateLimit-Remaining", "1")) == 0
223-
):
224-
sleep_for_rate_limit(resp)
225-
resp = session.get(reviewers_url)
226-
if resp.status_code != 200:
227-
raise SystemExit(f"GitHub API error {resp.status_code}: {resp.text}")
228-
229-
reviewers = resp.json()
196+
reviewers = github_get(session, reviewers_url).json()
230197

231198
logger.info(f"Extracted {len(reviewers)} reviewers for PR #{pr_number}")
232199
return reviewers
@@ -258,17 +225,7 @@ def extract_comments(
258225

259226
logger.info(f"Comments URL: {comments_url}")
260227

261-
resp = session.get(comments_url)
262-
if (
263-
resp.status_code == 403
264-
and int(resp.headers.get("X-RateLimit-Remaining", "1")) == 0
265-
):
266-
sleep_for_rate_limit(resp)
267-
resp = session.get(comments_url)
268-
if resp.status_code != 200:
269-
raise SystemExit(f"GitHub API error {resp.status_code}: {resp.text}")
270-
271-
comments = resp.json()
228+
comments = github_get(session, comments_url).json()
272229
logger.info(f"Extracted {len(comments)} comments for PR #{pr_number}")
273230
return comments
274231

@@ -278,13 +235,47 @@ def sleep_for_rate_limit(resp: requests.Response) -> None:
278235
remaining = int(resp.headers.get("X-RateLimit-Remaining", 1))
279236
reset = int(resp.headers.get("X-RateLimit-Reset", 0))
280237
if remaining == 0:
281-
sleep_time = max(0, reset - int(time.time()))
238+
sleep_time = max(0, reset - int(time.time())) + 5
282239
print(
283240
f"Rate limit exceeded. Sleeping for {sleep_time} seconds.", file=sys.stderr
284241
)
285242
time.sleep(sleep_time)
286243

287244

245+
def github_get(
    session: requests.Session,
    url: str,
    params: Optional[dict] = None,
) -> requests.Response:
    """
    Make a GitHub API GET request, retrying in a loop while rate limited.

    A response counts as rate limited when its status is 403 or 429 and
    either:

    * it carries a numeric ``Retry-After`` header (used by GitHub for
      secondary rate limits); the request is retried after that many
      seconds, or
    * ``X-RateLimit-Remaining`` is ``0`` (primary rate limit); the wait is
      delegated to ``sleep_for_rate_limit`` and the request is retried.

    Args:
        session: Authenticated requests session
        url: URL to fetch
        params: Optional query parameters

    Returns:
        Successful response (status 200)

    Raises:
        SystemExit: On non-200 responses that are not recognised as
            rate limiting
    """
    while True:
        resp = session.get(url, params=params)
        if resp.status_code == 200:
            return resp

        # GitHub may answer with either 403 or 429 when a limit is hit.
        if resp.status_code in (403, 429):
            # Retry-After takes precedence: when present, the request must
            # not be retried before that many seconds have elapsed.
            retry_after = str(resp.headers.get("Retry-After", "")).strip()
            if retry_after.isdecimal():
                delay = int(retry_after)
                print(
                    f"Rate limited (Retry-After). Sleeping for {delay} seconds.",
                    file=sys.stderr,
                )
                time.sleep(delay)
                continue

            # Primary limit exhausted: wait until the advertised reset time.
            if int(resp.headers.get("X-RateLimit-Remaining", "1")) == 0:
                sleep_for_rate_limit(resp)
                continue

        # Anything else (including a 403 that is a genuine permission
        # error) is fatal for the ETL run.
        raise SystemExit(
            f"GitHub API error {resp.status_code} for {url}: {resp.text or 'No response text'}"
        )
277+
278+
288279
def transform_data(raw_data: list[dict], repo: str) -> dict:
289280
"""
290281
Transform GitHub pull request data into BigQuery-compatible format.

0 commit comments

Comments
 (0)