Skip to content

Commit c3c7892

Browse files
committed
Update scraping
1 parent d0d92e2 commit c3c7892

File tree

6 files changed: +306 additions, -118 deletions (lines changed)

6 files changed: +306 additions, -118 deletions (lines changed)

repofinder/scraping/get_contributors.py

Lines changed: 96 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -3,11 +3,14 @@
33

44
import pandas as pd
55
import sqlite3
6+
import logging
67
from repofinder.scraping.repo_scraping_utils import github_api_request, get_next_link
78

9+
logger = logging.getLogger(__name__)
10+
811
#TODO: Figure out how to get duplicates
912

10-
def get_contributors(owner, repo_name, headers):
13+
def get_contributors(owner, repo_name, headers, rate_limiter=None):
1114
"""
1215
Retrieves the list of contributors for a given repository.
1316
@@ -24,7 +27,7 @@ def get_contributors(owner, repo_name, headers):
2427
contributors = []
2528
while url:
2629
try:
27-
contributors_data, headers_response = github_api_request(url, headers, params)
30+
contributors_data, headers_response = github_api_request(url, headers, params, rate_limiter=rate_limiter)
2831
except:
2932
break
3033
if contributors_data:
@@ -36,20 +39,28 @@ def get_contributors(owner, repo_name, headers):
3639
break
3740
return contributors if contributors else []
3841

39-
def get_contributor_details(username, headers):
42+
def get_contributor_details(username, headers, rate_limiter=None):
4043
"""
4144
Retrieves detailed information about a contributor.
4245
4346
Args:
4447
username (str): The GitHub username of the contributor.
4548
headers (dict): HTTP headers for the request.
49+
rate_limiter : Semaphore, optional
50+
Thread-safe rate limiter for concurrent requests (default is None).
4651
4752
Returns:
48-
dict: A dictionary containing contributor details.
53+
dict or None: A dictionary containing contributor details, or None if not found (404) or error.
4954
"""
5055
url = f"https://api.github.com/users/{username}"
5156
try:
52-
contributor_data, _ = github_api_request(url, headers)
57+
contributor_data, _ = github_api_request(url, headers, rate_limiter=rate_limiter)
58+
59+
# Handle 404 or None response (user not found)
60+
if contributor_data is None:
61+
logger.debug(f"Contributor {username} not found (404) or request failed. Skipping.")
62+
return None
63+
5364
return {
5465
"login": contributor_data.get("login"),
5566
"name": contributor_data.get("name"),
@@ -61,19 +72,40 @@ def get_contributor_details(username, headers):
6172
"organizations": contributor_data.get("organizations_url"), # This is a URL, requires additional fetch
6273
}
6374
except Exception as e:
64-
print(f"Error fetching details for user {username}: {e}")
75+
logger.debug(f"Error fetching details for user {username}: {e}")
6576
return None
6677

6778

6879
def get_contributor_data(repo_file, db_file, headers):
69-
70-
# TODO: This should probably read the database instead
71-
repo_df = pd.read_json(repo_file)
72-
repo_df = repo_df.drop_duplicates(subset=['full_name'])
73-
repo_df = repo_df.reset_index(drop=True)
74-
repo_df["contributors"] = None
80+
"""
81+
Processes repositories to collect contributor data.
82+
Only processes repositories that are not archived, have size > 0, are not forks, and are not templates.
83+
84+
Args:
85+
repo_file (str): Path to the JSON file (unused, reads from DB instead).
86+
db_file (str): Path to the SQLite database file.
87+
headers (dict): HTTP headers for authenticated GitHub API requests.
88+
89+
Returns
90+
-------
91+
pd.DataFrame
92+
A DataFrame of the repositories with contributor data.
93+
"""
7594
conn = sqlite3.connect(db_file)
7695
cursor = conn.cursor()
96+
97+
# Read repositories from database, filtering for non-archived, size > 0, not a fork, and not a template
98+
query = """
99+
SELECT full_name, owner
100+
FROM repositories
101+
WHERE (archived = 0 OR archived = FALSE OR archived IS NULL)
102+
AND (size > 0 OR size IS NULL)
103+
AND (fork = 0 OR fork = FALSE OR fork IS NULL)
104+
AND (is_template = 0 OR is_template = FALSE OR is_template IS NULL)
105+
"""
106+
repo_df = pd.read_sql_query(query, conn)
107+
repo_df = repo_df.reset_index(drop=True)
108+
repo_df["contributors"] = None
77109
try:
78110
cursor.execute("ALTER TABLE repositories ADD COLUMN contributors TEXT;") # Adjust the column type as needed
79111
except:
@@ -102,51 +134,67 @@ def get_contributor_data(repo_file, db_file, headers):
102134
)
103135
""")
104136

105-
for i in range(len(repo_df)): #TODO: Fix API rate limits
106-
full_name = repo_df["full_name"][i]
137+
# List of bot usernames/patterns to skip
138+
bots_to_skip = ["copilot", "dependabot[bot]", "github-actions[bot]", "dependabot", "github-actions"]
139+
140+
# Process sequentially (no multithreading)
141+
total_repos = len(repo_df)
142+
print(f"Processing {total_repos} repositories for contributor data...")
143+
144+
for idx, row in repo_df.iterrows():
145+
full_name = row["full_name"]
107146
owner, repo_name = full_name.split("/")
108-
contributors = get_contributors(owner, repo_name, headers)
109-
contributors_login = []
110147

111-
for contributor in contributors:
112-
contributor_login = contributor['login']
113-
details = get_contributor_details(contributor_login, headers)
148+
try:
149+
contributors = get_contributors(owner, repo_name, headers)
150+
contributors_login = []
151+
contributor_details_list = []
114152

115-
if details:
116-
# # Fetch organizations data if needed
117-
# organizations_url = details.pop("organizations", None)
118-
# if organizations_url:
119-
# try:
120-
# org_data, _ = github_api_request(organizations_url, headers)
121-
# organizations = ", ".join(org.get("login", "") for org in org_data)
122-
# details["organizations"] = organizations
123-
# except Exception as e:
124-
# print(f"Error fetching organizations for user {contributor_login}: {e}")
125-
# details["organizations"] = None
126-
# print(i + '/' + len(repo_df))
127-
# return i
128-
153+
for contributor in contributors:
154+
contributor_login = contributor['login']
155+
156+
# Skip bot contributors (case-insensitive matching)
157+
contributor_lower = contributor_login.lower()
158+
if any(bot.lower() in contributor_lower for bot in bots_to_skip):
159+
continue
160+
161+
# Also check if login ends with [bot] pattern
162+
if contributor_login.endswith('[bot]'):
163+
continue
129164

165+
details = get_contributor_details(contributor_login, headers)
166+
167+
# Only add contributor if details were successfully fetched (not 404)
168+
if details:
169+
contributor_details_list.append((details, contributor_login))
170+
contributors_login.append(contributor_login)
171+
172+
# Insert contributor details into database
173+
for details, contributor_login in contributor_details_list:
130174
conn.execute("""
131175
INSERT OR REPLACE INTO contributors (login, name, bio, location, company, email, twitter)
132176
VALUES (:login, :name, :bio, :location, :company, :email, :twitter)
133177
""", details)
178+
conn.execute("INSERT OR IGNORE INTO contributions (repository_name, contributor_login) VALUES (?, ?)",
179+
(full_name, contributor_login))
180+
181+
# Update repository with contributors list
182+
repo_df.at[idx, "contributors"] = contributors_login
183+
contributors_login_string = str(contributors_login)
184+
conn.execute("UPDATE repositories SET contributors = ? WHERE full_name = ?;",
185+
(contributors_login_string, full_name))
186+
187+
processed_count = idx + 1
188+
if processed_count % 25 == 0 or processed_count == total_repos:
189+
conn.commit()
190+
print(f"{processed_count}/{total_repos}: repositories processed")
134191

135-
136-
conn.execute("INSERT OR IGNORE INTO contributions (repository_name, contributor_login) VALUES (?, ?)", (full_name, contributor_login))
137-
contributors_login.append(contributor_login)
138-
139-
140-
# Update database with this new column
141-
repo_df["contributors"][i] = contributors_login
142-
contributors_login_string = str(contributors_login)
143-
# This is to add the contributors as a list in the repositories table
144-
conn.execute(
145-
"UPDATE repositories SET contributors = ? WHERE full_name = ?;",
146-
(contributors_login_string, full_name)
147-
)
148-
conn.commit()
149-
print(str(i)+"/"+str(len(repo_df)))
192+
except Exception as e:
193+
logger.error(f"Error processing repository {full_name}: {e}")
194+
continue
195+
196+
conn.commit() # Final commit
197+
print(f"Completed: {total_repos}/{total_repos} repositories processed")
150198

151199
conn.close()
152200
# TODO: Should I try to build a JSON object with this too?

repofinder/scraping/get_organizations.py

Lines changed: 38 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
from repofinder.scraping.repo_scraping_utils import github_api_request
77

88

9-
def get_organization_details(org_login, headers):
9+
def get_organization_details(org_login, headers, rate_limiter=None):
1010
"""
1111
Retrieves detailed information about an organization.
1212
@@ -19,7 +19,7 @@ def get_organization_details(org_login, headers):
1919
"""
2020
url = f"https://api.github.com/orgs/{org_login}"
2121
try:
22-
org_data, _ = github_api_request(url, headers)
22+
org_data, _ = github_api_request(url, headers, rate_limiter=rate_limiter)
2323
return {
2424
"login": org_data.get("login"),
2525
"name": org_data.get("name"),
@@ -41,29 +41,38 @@ def get_organization_data(repo_file, db_file, headers):
4141
"""
4242
Processes a list of repositories to identify those owned by organizations
4343
and stores organization metadata in a SQLite database.
44+
Only processes repositories that are not archived, have size > 0, are not forks, and are not templates.
4445
4546
This function:
46-
- Reads repository metadata from a JSON file.
47+
- Reads repository metadata from the database.
4748
- Identifies which repositories are owned by GitHub organizations.
4849
- Updates the 'repositories' table to mark organizational ownership.
4950
- Creates or updates an 'organizations' table with detailed organization info.
5051
5152
Args:
52-
repo_file (str): Path to the JSON file containing repository metadata.
53+
repo_file (str): Path to the JSON file (unused, reads from DB instead).
5354
db_file (str): Path to the SQLite database file.
5455
headers (dict): HTTP headers for authenticated GitHub API requests.
5556
5657
Returns:
5758
pd.DataFrame: A DataFrame of the repositories with an added 'organization' column.
5859
"""
5960

60-
# TODO: Should probably read the db instead
61-
repo_df = pd.read_json(repo_file)
62-
repo_df = repo_df.drop_duplicates(subset=['full_name'])
63-
repo_df = repo_df.reset_index(drop=True)
6461
conn = sqlite3.connect(db_file)
6562
cursor = conn.cursor()
6663

64+
# Read repositories from database, filtering for non-archived, size > 0, not a fork, and not a template
65+
query = """
66+
SELECT full_name, owner
67+
FROM repositories
68+
WHERE (archived = 0 OR archived = FALSE OR archived IS NULL)
69+
AND (size > 0 OR size IS NULL)
70+
AND (fork = 0 OR fork = FALSE OR fork IS NULL)
71+
AND (is_template = 0 OR is_template = FALSE OR is_template IS NULL)
72+
"""
73+
repo_df = pd.read_sql_query(query, conn)
74+
repo_df = repo_df.reset_index(drop=True)
75+
6776
try:
6877
# Ensure the repositories table has the organization column
6978
cursor.execute("ALTER TABLE repositories ADD COLUMN organization TEXT;") # Adjust the column type as needed
@@ -84,43 +93,45 @@ def get_organization_data(repo_file, db_file, headers):
8493
)
8594
""")
8695

96+
# Process sequentially (no multithreading)
97+
total_repos = len(repo_df)
98+
print(f"Processing {total_repos} repositories for organization data...")
8799

88-
for i in range(len(repo_df)):
89-
full_name = repo_df["full_name"][i]
90-
owner = repo_df['owner'][i]['login']
100+
processed_count = 0
101+
for idx, row in repo_df.iterrows():
102+
full_name = row["full_name"]
103+
owner = row['owner'] # owner is already a string from the database
91104
owner_url = f"https://api.github.com/users/{owner}"
92-
105+
93106
try:
94-
# Fetch owner data to check its type
95107
owner_data, _ = github_api_request(owner_url, headers)
108+
if not owner_data:
109+
processed_count += 1
110+
continue
111+
96112
owner_type = owner_data.get("type")
97113

98114
if owner_type == "Organization":
99-
# Mark the repository as owned by an organization
100-
repo_df.at[i, "organization"] = True
101-
conn.execute(
102-
"UPDATE repositories SET organization = ? WHERE full_name = ?;",
103-
(True, full_name)
104-
)
105-
106-
# Fetch organization details
107115
details = get_organization_details(owner, headers)
116+
repo_df.at[idx, "organization"] = True
117+
conn.execute("UPDATE repositories SET organization = ? WHERE full_name = ?;", (True, full_name))
108118
if details:
109-
# Insert organization details into the database
110119
conn.execute("""
111120
INSERT OR REPLACE INTO organizations
112121
(login, name, description, location, company, email, url, created_at, updated_at)
113122
VALUES
114123
(:login, :name, :description, :location, :company, :email, :url, :created_at, :updated_at)
115124
""", details)
116-
117125
except Exception as e:
118126
print(f"Error processing owner {owner}: {e}")
119-
120127

121-
# Commit changes to the database
122-
conn.commit()
123-
print(f"Processed {i + 1}/{len(repo_df)} repositories.")
128+
processed_count += 1
129+
if processed_count % 50 == 0 or processed_count == total_repos:
130+
conn.commit()
131+
print(f"Processed {processed_count}/{total_repos} repositories.")
132+
133+
conn.commit() # Final commit
134+
print(f"Completed: {processed_count}/{total_repos} repositories processed.")
124135

125136
conn.close()
126137
# TODO: Should I try to build a JSON object with this too?

0 commit comments

Comments
 (0)