@@ -9,16 +9,16 @@
 import logging
 import os
 import re
-import requests
 import sys
 import time
 from datetime import datetime, timezone
 from typing import Iterator, Optional
 from urllib.parse import parse_qs, urlparse
-from google.cloud import bigquery
+
+import requests
 from google.api_core.client_options import ClientOptions
 from google.auth.credentials import AnonymousCredentials
-
+from google.cloud import bigquery
 
 BUG_RE = re.compile(r"\b(?:bug|b=)\s*#?(\d+)\b", re.I)
 
@@ -29,6 +29,7 @@ def setup_logging() -> None:
         level=logging.INFO,
         format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
         handlers=[logging.StreamHandler(sys.stdout)],
+        force=True,
     )
 
 
@@ -58,7 +59,7 @@ def extract_pull_requests(
     # Support custom API URL for mocking/testing
     api_base = github_api_url or "https://api.github.com"
     base_url = f"{api_base}/repos/{repo}/pulls"
-    params = {
+    params: dict = {
         "state": "all",
         "per_page": chunk_size,
         "sort": "created",
@@ -90,7 +91,7 @@ def extract_pull_requests(
             f"Extracted page {pages} with {len(batch)} PRs (total: {total})"
         )
 
-        for idx, pr in enumerate(batch):
+        for _idx, pr in enumerate(batch):
             pr_number = pr.get("number")
             if not pr_number:
                 continue
@@ -272,7 +273,7 @@ def extract_comments(
     return comments
 
 
-def sleep_for_rate_limit(resp):
+def sleep_for_rate_limit(resp: requests.Response) -> None:
     """Sleep until rate limit resets."""
     remaining = int(resp.headers.get("X-RateLimit-Remaining", 1))
     reset = int(resp.headers.get("X-RateLimit-Reset", 0))
@@ -297,7 +298,7 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
     logger = logging.getLogger(__name__)
     logger.info(f"Starting data transformation for {len(raw_data)} PRs")
 
-    transformed_data = {
+    transformed_data: dict = {
         "pull_requests": [],
         "commits": [],
         "reviewers": [],
@@ -324,9 +325,11 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
             "bug_id": bug_id,
             "date_landed": pr.get("merged_at"),
             "date_approved": None,  # This will be filled later
-            "labels": [label.get("name") for label in pr.get("labels", [])]
-            if pr.get("labels")
-            else [],
+            "labels": (
+                [label.get("name") for label in pr.get("labels", [])]
+                if pr.get("labels")
+                else []
+            ),
         }
 
         # Extract and flatten commit data
@@ -368,7 +371,8 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
             }
             transformed_data["reviewers"].append(transformed_reviewer)
 
-            # If the request is approved then store the date in the date_approved for the pull request
+            # If the request is approved then store the date in the
+            # date_approved for the pull request
             if review.get("state") == "APPROVED":
                 approved_date = review.get("submitted_at")
                 if transformed_pr.get(
@@ -386,9 +390,9 @@ def transform_data(raw_data: list[dict], repo: str) -> dict:
386390 "date_created" : comment .get ("created_at" ),
387391 "author_email" : None , # TODO Placeholder for reviewer email extraction logic
388392 "author_username" : comment .get ("user" , {}).get ("login" ),
389- "character_count" : len ( comment . get ( "body" , "" ))
390- if comment .get ("body" )
391- else 0 ,
393+ "character_count" : (
394+ len ( comment . get ( "body" , "" )) if comment .get ("body" ) else 0
395+ ) ,
392396 "status" : None , # TODO
393397 }
394398
@@ -419,7 +423,8 @@ def load_data(
     Args:
         client: BigQuery client instance
         dataset_id: BigQuery dataset ID
-        transformed_data: Dictionary containing tables ('pull_requests', 'commits', 'reviewers', 'comments') mapped to lists of row dictionaries
+        transformed_data: Dictionary containing tables ('pull_requests',
+            'commits', 'reviewers', 'comments') mapped to lists of row dictionaries
     """
     logger = logging.getLogger(__name__)
 
@@ -454,7 +459,8 @@ def load_data(
             raise Exception(error_msg)
 
         logger.info(
-            f"Data loading completed successfully for table {table} with {len(load_table_data)} rows"
+            f"Data loading completed successfully for table {table} "
+            + f"with {len(load_table_data)} rows"
         )
 
 
@@ -476,7 +482,8 @@ def main() -> int:
     github_token = os.environ.get("GITHUB_TOKEN")
     if not github_token:
         logger.warning(
-            "Warning: No token provided. You will hit very low rate limits and private repos won't work."
+            "Warning: No token provided. You will hit very low rate "
+            + "limits and private repos won't work."
         )
 
     # Read BigQuery configuration
@@ -519,9 +526,10 @@ def main() -> int:
         bigquery_client = bigquery.Client(project=bigquery_project)
 
     # Read GitHub repository configuration
-    github_repos = os.getenv("GITHUB_REPOS")
-    if github_repos:
-        github_repos = github_repos.split(",")
+    github_repos = []
+    github_repos_str = os.getenv("GITHUB_REPOS")
+    if github_repos_str:
+        github_repos = github_repos_str.split(",")
     else:
         raise SystemExit(
             "Environment variable GITHUB_REPOS is required (format: 'owner/repo,owner/repo')"