33
44import pandas as pd
55import sqlite3
6+ import logging
67from repofinder .scraping .repo_scraping_utils import github_api_request , get_next_link
78
9+ logger = logging .getLogger (__name__ )
10+
811#TODO: Figure out how to get duplicates
912
10- def get_contributors (owner , repo_name , headers ):
13+ def get_contributors (owner , repo_name , headers , rate_limiter = None ):
1114 """
1215 Retrieves the list of contributors for a given repository.
1316
@@ -24,7 +27,7 @@ def get_contributors(owner, repo_name, headers):
2427 contributors = []
2528 while url :
2629 try :
27- contributors_data , headers_response = github_api_request (url , headers , params )
30+ contributors_data , headers_response = github_api_request (url , headers , params , rate_limiter = rate_limiter )
2831 except :
2932 break
3033 if contributors_data :
@@ -36,20 +39,28 @@ def get_contributors(owner, repo_name, headers):
3639 break
3740 return contributors if contributors else []
3841
39- def get_contributor_details (username , headers ):
42+ def get_contributor_details (username , headers , rate_limiter = None ):
4043 """
4144 Retrieves detailed information about a contributor.
4245
4346 Args:
4447 username (str): The GitHub username of the contributor.
4548 headers (dict): HTTP headers for the request.
49+ rate_limiter (Semaphore, optional): Thread-safe rate limiter passed through
50+ to github_api_request for concurrent requests. Defaults to None.
4651
4752 Returns:
48- dict: A dictionary containing contributor details.
53+ dict or None: A dictionary containing contributor details, or None if the user was not found (404), the request returned no data, or an error occurred.
4954 """
5055 url = f"https://api.github.com/users/{ username } "
5156 try :
52- contributor_data , _ = github_api_request (url , headers )
57+ contributor_data , _ = github_api_request (url , headers , rate_limiter = rate_limiter )
58+
59+ # Handle 404 or None response (user not found)
60+ if contributor_data is None :
61+ logger .debug (f"Contributor { username } not found (404) or request failed. Skipping." )
62+ return None
63+
5364 return {
5465 "login" : contributor_data .get ("login" ),
5566 "name" : contributor_data .get ("name" ),
@@ -61,19 +72,40 @@ def get_contributor_details(username, headers):
6172 "organizations" : contributor_data .get ("organizations_url" ), # This is a URL, requires additional fetch
6273 }
6374 except Exception as e :
64- print (f"Error fetching details for user { username } : { e } " )
75+ logger . debug (f"Error fetching details for user { username } : { e } " )
6576 return None
6677
6778
6879def get_contributor_data (repo_file , db_file , headers ):
69-
70- # TODO: This should probably read the database instead
71- repo_df = pd .read_json (repo_file )
72- repo_df = repo_df .drop_duplicates (subset = ['full_name' ])
73- repo_df = repo_df .reset_index (drop = True )
74- repo_df ["contributors" ] = None
80+ """
81+ Processes repositories to collect contributor data.
82+ Only processes repositories that are not archived, have size > 0, are not forks, and are not templates; a NULL value in any of these columns is treated as passing that filter.
83+
84+ Args:
85+ repo_file (str): Path to the JSON file (unused, reads from DB instead).
86+ db_file (str): Path to the SQLite database file.
87+ headers (dict): HTTP headers for authenticated GitHub API requests.
88+
89+ Returns
90+ -------
91+ pd.DataFrame
92+ A DataFrame of the repositories with contributor data.
93+ """
7594 conn = sqlite3 .connect (db_file )
7695 cursor = conn .cursor ()
96+
97+ # Read repositories from database, filtering for non-archived, size > 0, not a fork, and not a template
98+ query = """
99+ SELECT full_name, owner
100+ FROM repositories
101+ WHERE (archived = 0 OR archived = FALSE OR archived IS NULL)
102+ AND (size > 0 OR size IS NULL)
103+ AND (fork = 0 OR fork = FALSE OR fork IS NULL)
104+ AND (is_template = 0 OR is_template = FALSE OR is_template IS NULL)
105+ """
106+ repo_df = pd .read_sql_query (query , conn )
107+ repo_df = repo_df .reset_index (drop = True )
108+ repo_df ["contributors" ] = None
77109 try :
78110 cursor .execute ("ALTER TABLE repositories ADD COLUMN contributors TEXT;" ) # Adjust the column type as needed
79111 except :
@@ -102,51 +134,67 @@ def get_contributor_data(repo_file, db_file, headers):
102134 )
103135 """ )
104136
105- for i in range (len (repo_df )): #TODO: Fix API rate limits
106- full_name = repo_df ["full_name" ][i ]
137+ # List of bot usernames/patterns to skip
138+ bots_to_skip = ["copilot" , "dependabot[bot]" , "github-actions[bot]" , "dependabot" , "github-actions" ]
139+
140+ # Process sequentially (no multithreading)
141+ total_repos = len (repo_df )
142+ print (f"Processing { total_repos } repositories for contributor data..." )
143+
144+ for idx , row in repo_df .iterrows ():
145+ full_name = row ["full_name" ]
107146 owner , repo_name = full_name .split ("/" )
108- contributors = get_contributors (owner , repo_name , headers )
109- contributors_login = []
110147
111- for contributor in contributors :
112- contributor_login = contributor ['login' ]
113- details = get_contributor_details (contributor_login , headers )
148+ try :
149+ contributors = get_contributors (owner , repo_name , headers )
150+ contributors_login = []
151+ contributor_details_list = []
114152
115- if details :
116- # # Fetch organizations data if needed
117- # organizations_url = details.pop("organizations", None)
118- # if organizations_url:
119- # try:
120- # org_data, _ = github_api_request(organizations_url, headers)
121- # organizations = ", ".join(org.get("login", "") for org in org_data)
122- # details["organizations"] = organizations
123- # except Exception as e:
124- # print(f"Error fetching organizations for user {contributor_login}: {e}")
125- # details["organizations"] = None
126- # print(i + '/' + len(repo_df))
127- # return i
128-
153+ for contributor in contributors :
154+ contributor_login = contributor ['login' ]
155+
156+ # Skip bot contributors: case-insensitive substring match, so any login that contains one of the listed names is skipped
157+ contributor_lower = contributor_login .lower ()
158+ if any (bot .lower () in contributor_lower for bot in bots_to_skip ):
159+ continue
160+
161+ # Also check if login ends with [bot] pattern
162+ if contributor_login .endswith ('[bot]' ):
163+ continue
129164
165+ details = get_contributor_details (contributor_login , headers )
166+
167+ # Only add contributor if details were successfully fetched (not 404)
168+ if details :
169+ contributor_details_list .append ((details , contributor_login ))
170+ contributors_login .append (contributor_login )
171+
172+ # Insert contributor details into database
173+ for details , contributor_login in contributor_details_list :
130174 conn .execute ("""
131175 INSERT OR REPLACE INTO contributors (login, name, bio, location, company, email, twitter)
132176 VALUES (:login, :name, :bio, :location, :company, :email, :twitter)
133177 """ , details )
178+ conn .execute ("INSERT OR IGNORE INTO contributions (repository_name, contributor_login) VALUES (?, ?)" ,
179+ (full_name , contributor_login ))
180+
181+ # Update repository with contributors list
182+ repo_df .at [idx , "contributors" ] = contributors_login
183+ contributors_login_string = str (contributors_login )
184+ conn .execute ("UPDATE repositories SET contributors = ? WHERE full_name = ?;" ,
185+ (contributors_login_string , full_name ))
186+
187+ processed_count = idx + 1
188+ if processed_count % 25 == 0 or processed_count == total_repos :
189+ conn .commit ()
190+ print (f"{ processed_count } /{ total_repos } : repositories processed" )
134191
135-
136- conn .execute ("INSERT OR IGNORE INTO contributions (repository_name, contributor_login) VALUES (?, ?)" , (full_name , contributor_login ))
137- contributors_login .append (contributor_login )
138-
139-
140- # Update database with this new column
141- repo_df ["contributors" ][i ] = contributors_login
142- contributors_login_string = str (contributors_login )
143- # This is to add the contributors as a list in the repositories table
144- conn .execute (
145- "UPDATE repositories SET contributors = ? WHERE full_name = ?;" ,
146- (contributors_login_string , full_name )
147- )
148- conn .commit ()
149- print (str (i )+ "/" + str (len (repo_df )))
192+ except Exception as e :
193+ logger .error (f"Error processing repository { full_name } : { e } " )
194+ continue
195+
196+ conn .commit () # Final commit
197+ print (f"Completed: { total_repos } /{ total_repos } repositories processed" )
150198
151199 conn .close ()
152200# TODO: Should I try to build a JSON object with this too?
0 commit comments