UC-OSPO-Network
diff --git a/repofinder/scraping/get_contributors.py b/repofinder/scraping/get_contributors.py
Lines changed: 96 additions & 48 deletions
diff --git a/repofinder/scraping/get_organizations.py b/repofinder/scraping/get_organizations.py
Lines changed: 38 additions & 27 deletions
@@ -3,11 +3,14 @@
 
 import pandas as pd
 import sqlite3
+import logging
 from repofinder.scraping.repo_scraping_utils import github_api_request, get_next_link
 
+logger = logging.getLogger(__name__)
+
 #TODO: Figure out how to get duplicates
 
-def get_contributors(owner, repo_name, headers):
+def get_contributors(owner, repo_name, headers, rate_limiter=None):
     """
     Retrieves the list of contributors for a given repository.
 
@@ -24,7 +27,7 @@ def get_contributors(owner, repo_name, headers):
     contributors = []
     while url:
         try:
-            contributors_data, headers_response = github_api_request(url, headers, params)
+            contributors_data, headers_response = github_api_request(url, headers, params, rate_limiter=rate_limiter)
         except:
             break
         if contributors_data:
@@ -36,20 +39,28 @@ def get_contributors(owner, repo_name, headers):
             break
     return contributors if contributors else []
 
-def get_contributor_details(username, headers):
+def get_contributor_details(username, headers, rate_limiter=None):
     """
     Retrieves detailed information about a contributor.
 
     Args:
         username (str): The GitHub username of the contributor.
         headers (dict): HTTP headers for the request.
+        rate_limiter (Semaphore, optional): Thread-safe rate limiter for concurrent
+            requests, passed through to github_api_request. Defaults to None.
 
     Returns:
-        dict: A dictionary containing contributor details.
+        dict or None: A dictionary containing contributor details, or None if not found (404) or error.
     """
     url = f"https://api.github.com/users/{username}"
     try:
-        contributor_data, _ = github_api_request(url, headers)
+        contributor_data, _ = github_api_request(url, headers, rate_limiter=rate_limiter)
+        
+        # Handle 404 or None response (user not found)
+        if contributor_data is None:
+            logger.debug(f"Contributor {username} not found (404) or request failed. Skipping.")
+            return None
+        
         return {
             "login": contributor_data.get("login"),
             "name": contributor_data.get("name"),
@@ -61,19 +72,40 @@ def get_contributor_details(username, headers):
             "organizations": contributor_data.get("organizations_url"),  # This is a URL, requires additional fetch
         }
     except Exception as e:
-        print(f"Error fetching details for user {username}: {e}")
+        logger.debug(f"Error fetching details for user {username}: {e}")
         return None
 
 
 def get_contributor_data(repo_file, db_file, headers):
-
-    # TODO: This should probably read the database instead 
-    repo_df = pd.read_json(repo_file)
-    repo_df = repo_df.drop_duplicates(subset=['full_name'])
-    repo_df = repo_df.reset_index(drop=True)
-    repo_df["contributors"] = None
+    """
+    Processes repositories to collect contributor data.
+    Only processes repositories that are not archived, have size > 0, are not forks, and are not templates; rows where any of these columns is NULL are also processed.
+    
+    Args:
+        repo_file (str): Path to the repository JSON file. Unused: repositories are read from the database at db_file instead.
+        db_file (str): Path to the SQLite database file.
+        headers (dict): HTTP headers for authenticated GitHub API requests.
+    
+    Returns:
+        pd.DataFrame: A DataFrame of the repositories with contributor data
+            (a "contributors" column holding the list of contributor logins
+            for each processed repository).
+    """
     conn = sqlite3.connect(db_file)
     cursor = conn.cursor()
+    
+    # Read repositories from the database, keeping rows that are non-archived, have size > 0, are not forks, and are not templates (NULL in any of these columns also passes)
+    query = """
+        SELECT full_name, owner
+        FROM repositories 
+        WHERE (archived = 0 OR archived = FALSE OR archived IS NULL)
+          AND (size > 0 OR size IS NULL)
+          AND (fork = 0 OR fork = FALSE OR fork IS NULL)
+          AND (is_template = 0 OR is_template = FALSE OR is_template IS NULL)
+    """
+    repo_df = pd.read_sql_query(query, conn)
+    repo_df = repo_df.reset_index(drop=True)
+    repo_df["contributors"] = None
     try:
         cursor.execute("ALTER TABLE repositories ADD COLUMN contributors TEXT;")  # Adjust the column type as needed
     except:
@@ -102,51 +134,67 @@ def get_contributor_data(repo_file, db_file, headers):
     )
     """)
 
-    for i in range(len(repo_df)):  #TODO: Fix API rate limits
-        full_name = repo_df["full_name"][i]
+    # List of bot usernames/patterns to skip
+    bots_to_skip = ["copilot", "dependabot[bot]", "github-actions[bot]", "dependabot", "github-actions"]
+    
+    # Process sequentially (no multithreading)
+    total_repos = len(repo_df)
+    print(f"Processing {total_repos} repositories for contributor data...")
+    
+    for idx, row in repo_df.iterrows():
+        full_name = row["full_name"]
         owner, repo_name = full_name.split("/")
-        contributors = get_contributors(owner, repo_name, headers)
-        contributors_login = []
 
-        for contributor in contributors:
-            contributor_login = contributor['login']
-            details = get_contributor_details(contributor_login, headers)
+        try:
+            contributors = get_contributors(owner, repo_name, headers)
+            contributors_login = []
+            contributor_details_list = []
 
-            if details:
-            #     # Fetch organizations data if needed
-            #     organizations_url = details.pop("organizations", None)
-            #     if organizations_url:
-            #         try:
-            #             org_data, _ = github_api_request(organizations_url, headers)
-            #             organizations = ", ".join(org.get("login", "") for org in org_data)
-            #             details["organizations"] = organizations
-            #         except Exception as e:
-            #             print(f"Error fetching organizations for user {contributor_login}: {e}")
-            #             details["organizations"] = None
-            #             print(i + '/' + len(repo_df))
-            #             return i
-
+            for contributor in contributors:
+                contributor_login = contributor['login']
+                
+                # Skip bot contributors: case-insensitive substring match of each bots_to_skip entry against the login
+                contributor_lower = contributor_login.lower()
+                if any(bot.lower() in contributor_lower for bot in bots_to_skip):
+                    continue
+                
+                # Also skip any login ending with the "[bot]" suffix (case-sensitive check)
+                if contributor_login.endswith('[bot]'):
+                    continue
 
+                details = get_contributor_details(contributor_login, headers)
+                
+                # Only add contributor if details were successfully fetched (not 404)
+                if details:
+                    contributor_details_list.append((details, contributor_login))
+                    contributors_login.append(contributor_login)
+            
+            # Insert contributor details into database
+            for details, contributor_login in contributor_details_list:
                 conn.execute("""
                     INSERT OR REPLACE INTO contributors (login, name, bio, location, company, email, twitter)
                     VALUES (:login, :name, :bio, :location, :company, :email, :twitter)
                 """, details)
+                conn.execute("INSERT OR IGNORE INTO contributions (repository_name, contributor_login) VALUES (?, ?)", 
+                            (full_name, contributor_login))
+            
+            # Update repository with contributors list
+            repo_df.at[idx, "contributors"] = contributors_login
+            contributors_login_string = str(contributors_login)
+            conn.execute("UPDATE repositories SET contributors = ? WHERE full_name = ?;",
+                        (contributors_login_string, full_name))
+            
+            processed_count = idx + 1
+            if processed_count % 25 == 0 or processed_count == total_repos:
+                conn.commit()
+                print(f"{processed_count}/{total_repos}: repositories processed")
 
-
-            conn.execute("INSERT OR IGNORE INTO contributions (repository_name, contributor_login) VALUES (?, ?)", (full_name, contributor_login))
-            contributors_login.append(contributor_login)
-
-                           
-        # Update database with this new column
-        repo_df["contributors"][i] = contributors_login
-        contributors_login_string = str(contributors_login)
-        # This is to add the contributors as a list in the repositories table 
-        conn.execute(
-            "UPDATE repositories SET contributors = ? WHERE full_name = ?;",
-            (contributors_login_string, full_name)
-        )
-        conn.commit()
-        print(str(i)+"/"+str(len(repo_df)))
+        except Exception as e:
+            logger.error(f"Error processing repository {full_name}: {e}")
+            continue
+    
+    conn.commit()  # Final commit
+    print(f"Completed: {total_repos}/{total_repos} repositories processed")
 
     conn.close()
 # TODO: Should I try to build a JSON object with this too?
 
@@ -6,7 +6,7 @@
 from repofinder.scraping.repo_scraping_utils import github_api_request
 
 
-def get_organization_details(org_login, headers):
+def get_organization_details(org_login, headers, rate_limiter=None):
     """
     Retrieves detailed information about an organization.
 
@@ -19,7 +19,7 @@ def get_organization_details(org_login, headers):
     """
     url = f"https://api.github.com/orgs/{org_login}"
     try:
-        org_data, _ = github_api_request(url, headers)
+        org_data, _ = github_api_request(url, headers, rate_limiter=rate_limiter)
         return {
             "login": org_data.get("login"),
             "name": org_data.get("name"),
@@ -41,29 +41,38 @@ def get_organization_data(repo_file, db_file, headers):
     """
     Processes a list of repositories to identify those owned by organizations
     and stores organization metadata in a SQLite database.
+    Only processes repositories that are not archived, have size > 0, are not forks, and are not templates; rows where any of these columns is NULL are also processed.
     
     This function:
-    - Reads repository metadata from a JSON file.
+    - Reads repository metadata from the database.
     - Identifies which repositories are owned by GitHub organizations.
     - Updates the 'repositories' table to mark organizational ownership.
     - Creates or updates an 'organizations' table with detailed organization info.
     
     Args:
-        repo_file (str): Path to the JSON file containing repository metadata.
+        repo_file (str): Path to the repository JSON file. Unused: repositories are read from the database at db_file instead.
         db_file (str): Path to the SQLite database file.
         headers (dict): HTTP headers for authenticated GitHub API requests.
     
     Returns:
         pd.DataFrame: A DataFrame of the repositories with an added 'organization' column.
     """
 
-    # TODO: Should probably read the db instead
-    repo_df = pd.read_json(repo_file)
-    repo_df = repo_df.drop_duplicates(subset=['full_name'])
-    repo_df = repo_df.reset_index(drop=True)
     conn = sqlite3.connect(db_file)
     cursor = conn.cursor()
 
+    # Read repositories from the database, keeping rows that are non-archived, have size > 0, are not forks, and are not templates (NULL in any of these columns also passes)
+    query = """
+        SELECT full_name, owner
+        FROM repositories 
+        WHERE (archived = 0 OR archived = FALSE OR archived IS NULL)
+          AND (size > 0 OR size IS NULL)
+          AND (fork = 0 OR fork = FALSE OR fork IS NULL)
+          AND (is_template = 0 OR is_template = FALSE OR is_template IS NULL)
+    """
+    repo_df = pd.read_sql_query(query, conn)
+    repo_df = repo_df.reset_index(drop=True)
+    
     try:
     # Ensure the repositories table has the organization column
         cursor.execute("ALTER TABLE repositories ADD COLUMN organization TEXT;")  # Adjust the column type as needed
@@ -84,43 +93,45 @@ def get_organization_data(repo_file, db_file, headers):
         )
     """)
 
+    # Process sequentially (no multithreading)
+    total_repos = len(repo_df)
+    print(f"Processing {total_repos} repositories for organization data...")
 
-    for i in range(len(repo_df)):
-        full_name = repo_df["full_name"][i]
-        owner = repo_df['owner'][i]['login']
+    processed_count = 0
+    for idx, row in repo_df.iterrows():
+        full_name = row["full_name"]
+        owner = row['owner']  # owner is already a string from the database
         owner_url = f"https://api.github.com/users/{owner}"
-
+        
         try:
-            # Fetch owner data to check its type
             owner_data, _ = github_api_request(owner_url, headers)
+            if not owner_data:
+                processed_count += 1
+                continue
+            
             owner_type = owner_data.get("type")
 
             if owner_type == "Organization":
-                # Mark the repository as owned by an organization
-                repo_df.at[i, "organization"] = True
-                conn.execute(
-                "UPDATE repositories SET organization = ? WHERE full_name = ?;",
-                (True, full_name)
-)
-
-                # Fetch organization details
                 details = get_organization_details(owner, headers)
+                repo_df.at[idx, "organization"] = True
+                conn.execute("UPDATE repositories SET organization = ? WHERE full_name = ?;", (True, full_name))
                 if details:
-                    # Insert organization details into the database
                     conn.execute("""
                         INSERT OR REPLACE INTO organizations 
                         (login, name, description, location, company, email, url, created_at, updated_at)
                         VALUES 
                         (:login, :name, :description, :location, :company, :email, :url, :created_at, :updated_at)
                     """, details)
-                    
         except Exception as e:
             print(f"Error processing owner {owner}: {e}")
-
 
-        # Commit changes to the database
-        conn.commit()
-        print(f"Processed {i + 1}/{len(repo_df)} repositories.")
+        processed_count += 1
+        if processed_count % 50 == 0 or processed_count == total_repos:
+            conn.commit()
+            print(f"Processed {processed_count}/{total_repos} repositories.")
+    
+    conn.commit()  # Final commit
+    print(f"Completed: {processed_count}/{total_repos} repositories processed.")
 
     conn.close()
 # TODO: Should I try to build a JSON object with this too?