From 88bdd639f9803614949c0726d0160db4d7ee8ea1 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 15:35:33 -0500 Subject: [PATCH 1/9] Merge local changes --- tools/analytics/analyze_runner_usage.py | 40 +++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tools/analytics/analyze_runner_usage.py b/tools/analytics/analyze_runner_usage.py index 6fc281245e..0183184d02 100644 --- a/tools/analytics/analyze_runner_usage.py +++ b/tools/analytics/analyze_runner_usage.py @@ -91,6 +91,8 @@ "pytorch/cppdocs", "pytorch/pytorch.github.io", "pytorch/examples", + # archived but not marked as such in github repo settings + "pytorch/serve", # proposed "pytorch/builder", "pytorch/xla", @@ -101,8 +103,6 @@ # List of runner labels to exclude from "runners not in scale-config" analysis # These are typically GitHub-hosted runners or other known external runners GITHUB_RUNNER_LABELS = [ - "linux.24_04.4x", - "linux.24_04.16x", "ubuntu-latest", "ubuntu-22.04", "ubuntu-24.04", @@ -110,17 +110,33 @@ "ubuntu-18.04", "windows-latest", "windows-2022", - "windows-11-arm64", "macos-latest", "macos-14", + "macos-14-xlarge", "macos-13", "macos-12", - "macos-14-xlarge", + # Offered at Meta enterprise level + "8-core-ubuntu", + "4-core-ubuntu", + "windows-8-core", + "4-core-ubuntu-gpu-t4", + "4-core-windows-gpu-t4", + "32-core-ubuntu", + "16-core-ubuntu", + "2-core-ubuntu-arm", + "4-core-ubuntu-arm", + "8-core-ubuntu-22.04", + "4-core-ubuntu-24.04", + # needs special access + "linux.24_04.4x", + "linux.24_04.16x", + "windows-11-arm64", # Add more runner labels to exclude here as needed ] USELESS_RUNNER_LABELS = [ - "self-hosted", # really, a useless label we want to ignoreß + "self-hosted", # really, a useless label we want to ignore + "linux.g5.4xlarge.nvidia.cpu", # a nonexistent label used by a repo ] HEADERS = { @@ -681,6 +697,20 @@ def main(): if repos_by_github_runner: output_data["repos_by_github_runner"] = dict(repos_by_github_runner) + # --- SORT OUTPUT ALPHABETICALLY FOR CONSISTENCY (except top-level keys) --- + def deep_sort(obj, sort_keys=True): + if isinstance(obj, dict): + keys = sorted(obj) if sort_keys else obj.keys() + return {k: deep_sort(obj[k]) for k in keys} + elif isinstance(obj, list): + # If list of dicts with 'repo' key, sort by 'repo', else sort normally + if obj and isinstance(obj[0], dict) and 'repo' in obj[0]: + return sorted([deep_sort(x) for x in obj], key=lambda x: x['repo']) + return sorted(deep_sort(x) for x in obj) + else: + return obj + + output_data = deep_sort(output_data, sort_keys=False) save_to_yaml(output_data) # Show final cache stats From 24b9b1d0233839eed34c6b670efad0e2cac06961 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 15:42:57 -0500 Subject: [PATCH 2/9] Update analysis --- tools/analytics/analyze_runner_usage.py | 101 ++---------------------- tools/analytics/cache_manager.py | 98 +++++++++++++++++++++++ 2 files changed, 103 insertions(+), 96 deletions(-) create mode 100644 tools/analytics/cache_manager.py diff --git a/tools/analytics/analyze_runner_usage.py b/tools/analytics/analyze_runner_usage.py index 0183184d02..5d60e76ecd 100644 --- a/tools/analytics/analyze_runner_usage.py +++ b/tools/analytics/analyze_runner_usage.py @@ -67,6 +67,8 @@ import yaml from dotenv import load_dotenv +from tools.analytics.cache_manager import CACHE_DIR, CacheManager + load_dotenv() @@ -136,7 +138,7 @@ USELESS_RUNNER_LABELS = [ "self-hosted", # really, a useless label we want to ignore - "linux.g5.4xlarge.nvidia.cpu", # a 
nonexistent label used by a repo + "linux.g5.4xlarge.nvidia.cpu", # a nonexistent label used by a repo ] HEADERS = { @@ -147,99 +149,6 @@ BASE_URL = "https://api.github.com" WORKFLOW_RUN_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" -# Cache configuration -CACHE_DIR = Path("cache") -CACHE_DIR.mkdir(exist_ok=True) - - -class CacheManager: - """Manages caching of GitHub API responses using URL as cache key.""" - - def __init__(self, cache_dir: Path = CACHE_DIR): - self.cache_dir = cache_dir - self.cache_dir.mkdir(exist_ok=True) - - def _get_cache_key(self, url: str) -> str: - """Generate a human-readable cache key from URL.""" - import re - from urllib.parse import parse_qs, urlencode, urlparse - - # Parse the URL to separate path and query parameters - parsed = urlparse(url) - path = parsed.path - query_params = parse_qs(parsed.query) - - # Remove the 'created' parameter from query params to avoid cache invalidation - if "created" in query_params: - del query_params["created"] - - # Reconstruct the query string without the 'created' parameter - if query_params: - # Flatten single-item lists (parse_qs returns lists) - flat_params = {} - for key, values in query_params.items(): - flat_params[key] = values[0] if len(values) == 1 else values - query_string = urlencode(flat_params) - # Reconstruct URL without the 'created' parameter - url_without_created = ( - f"{parsed.scheme}://{parsed.netloc}{path}?{query_string}" - ) - else: - # If no query params remain, use the original URL - url_without_created = url - - # Replace forward slashes with underscores - key = url_without_created.replace("/", "_") - - # Remove protocol and domain - key = key.replace("https___api.github.com_", "") - - # Handle illegal filename characters in query parameters - # Replace characters that are problematic in filenames - key = re.sub(r'[<>:"|?*]', "_", key) - - # Replace equals signs and ampersands in query params with underscores - key = key.replace("=", "_").replace("&", "_") - - # Clean up multiple consecutive underscores - key = re.sub(r"_+", "_", key) - - # Remove trailing underscore - key = key.rstrip("_") - - return key - - def _get_cache_path(self, url: str) -> Path: - """Get the cache file path for a given URL.""" - cache_key = self._get_cache_key(url) - return self.cache_dir / f"{cache_key}.json" - - def get(self, url: str) -> Optional[Dict]: - """Retrieve cached response for a URL.""" - cache_path = self._get_cache_path(url) - if cache_path.exists(): - try: - with open(cache_path, "r") as f: - cached_data = json.load(f) - logging.debug(f"[CacheManager] Cache hit for URL: {url}") - return cached_data - except (json.JSONDecodeError, IOError) as e: - logging.warning(f"[CacheManager] Failed to read cache for {url}: {e}") - return None - logging.debug(f"[CacheManager] Cache miss for URL: {url}") - return None - - def set(self, url: str, data: Dict) -> None: - """Cache response data for a URL.""" - cache_path = self._get_cache_path(url) - try: - with open(cache_path, "w") as f: - json.dump(data, f, indent=2) - logging.debug(f"[CacheManager] Cached response for URL: {url}") - except IOError as e: - logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") - - # Global cache manager instance cache_manager = CacheManager() @@ -704,8 +613,8 @@ def deep_sort(obj, sort_keys=True): return {k: deep_sort(obj[k]) for k in keys} elif isinstance(obj, list): # If list of dicts with 'repo' key, sort by 'repo', else sort normally - if obj and isinstance(obj[0], dict) and 'repo' in obj[0]: - 
return sorted([deep_sort(x) for x in obj], key=lambda x: x['repo']) + if obj and isinstance(obj[0], dict) and "repo" in obj[0]: + return sorted([deep_sort(x) for x in obj], key=lambda x: x["repo"]) return sorted(deep_sort(x) for x in obj) else: return obj diff --git a/tools/analytics/cache_manager.py b/tools/analytics/cache_manager.py new file mode 100644 index 0000000000..f9d86e89f4 --- /dev/null +++ b/tools/analytics/cache_manager.py @@ -0,0 +1,98 @@ +import json +import logging +import re +from pathlib import Path +from typing import Dict, Optional + + +# Cache configuration +CACHE_DIR = Path("cache") + + +class CacheManager: + """Manages caching of GitHub API responses using URL as cache key.""" + + def __init__(self, cache_dir: Path = CACHE_DIR): + CACHE_DIR.mkdir(exist_ok=True) + + self.cache_dir = cache_dir + self.cache_dir.mkdir(exist_ok=True) + + def _get_cache_key(self, url: str) -> str: + """Generate a human-readable cache key from URL.""" + from urllib.parse import parse_qs, urlencode, urlparse + + # Parse the URL to separate path and query parameters + parsed = urlparse(url) + path = parsed.path + query_params = parse_qs(parsed.query) + + # Remove the 'created' parameter from query params to avoid cache invalidation + if "created" in query_params: + del query_params["created"] + + # Reconstruct the query string without the 'created' parameter + if query_params: + # Flatten single-item lists (parse_qs returns lists) + flat_params = {} + for key, values in query_params.items(): + flat_params[key] = values[0] if len(values) == 1 else values + query_string = urlencode(flat_params) + # Reconstruct URL without the 'created' parameter + url_without_created = ( + f"{parsed.scheme}://{parsed.netloc}{path}?{query_string}" + ) + else: + # If no query params remain, use the original URL + url_without_created = url + + # Replace forward slashes with underscores + key = url_without_created.replace("/", "_") + + # Remove protocol and domain + key = key.replace("https___api.github.com_", "") + + # Handle illegal filename characters in query parameters + # Replace characters that are problematic in filenames + key = re.sub(r'[<>:"|?*]', "_", key) + + # Replace equals signs and ampersands in query params with underscores + key = key.replace("=", "_").replace("&", "_") + + # Clean up multiple consecutive underscores + key = re.sub(r"_+", "_", key) + + # Remove trailing underscore + key = key.rstrip("_") + + return key + + def _get_cache_path(self, url: str) -> Path: + """Get the cache file path for a given URL.""" + cache_key = self._get_cache_key(url) + return self.cache_dir / f"{cache_key}.json" + + def get(self, url: str) -> Optional[Dict]: + """Retrieve cached response for a URL.""" + cache_path = self._get_cache_path(url) + if cache_path.exists(): + try: + with open(cache_path, "r") as f: + cached_data = json.load(f) + logging.debug(f"[CacheManager] Cache hit for URL: {url}") + return cached_data + except (json.JSONDecodeError, IOError) as e: + logging.warning(f"[CacheManager] Failed to read cache for {url}: {e}") + return None + logging.debug(f"[CacheManager] Cache miss for URL: {url}") + return None + + def set(self, url: str, data: Dict) -> None: + """Cache response data for a URL.""" + cache_path = self._get_cache_path(url) + try: + with open(cache_path, "w") as f: + json.dump(data, f, indent=2) + logging.debug(f"[CacheManager] Cached response for URL: {url}") + except IOError as e: + logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") \ No newline at end of file From 
0258597ffb80cde0dcebb1100f98fe1eb28e6bca Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 15:54:47 -0500 Subject: [PATCH 3/9] cleanup --- tools/analytics/org/.gitignore | 5 +++ tools/analytics/org/README.md | 39 +++++++++++++++++++ .../{ => org}/analyze_runner_usage.py | 2 +- tools/analytics/{ => org}/cache_manager.py | 0 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tools/analytics/org/.gitignore create mode 100644 tools/analytics/org/README.md rename tools/analytics/{ => org}/analyze_runner_usage.py (99%) rename tools/analytics/{ => org}/cache_manager.py (100%) diff --git a/tools/analytics/org/.gitignore b/tools/analytics/org/.gitignore new file mode 100644 index 0000000000..ecde95c402 --- /dev/null +++ b/tools/analytics/org/.gitignore @@ -0,0 +1,5 @@ +# Stores cached data for GitHub API responses +cache/ + +# Gets temporarily created by the script +scale-config.yml \ No newline at end of file diff --git a/tools/analytics/org/README.md b/tools/analytics/org/README.md new file mode 100644 index 0000000000..757e30670a --- /dev/null +++ b/tools/analytics/org/README.md @@ -0,0 +1,39 @@ +# Organization Analytics Tools + +This directory contains a collection of scripts designed to analyze GitHub Actions runner usage and other organizational metrics across a GitHub organization's repositories. + +## Overview + +The tools in this directory help us understand how GitHub Actions runners are being utilized across our repositories. + +## Scripts + +### `analyze_runner_usage.py` + +**Purpose**: Analyzes GitHub Actions runner label usage across all repositories in a specified GitHub organization. + +**Key Features**: +- Fetches all non-archived repositories in a GitHub organization +- Extracts runner labels used in workflow jobs from recent workflow runs +- Aggregates runner usage statistics across repositories +- Compares runner labels against those defined in `scale-config.yml` and standard GitHub-hosted runners +- Identifies unused or undefined runners +- Generates comprehensive usage reports + +**Output**: Creates `runner_labels_summary.yml` with detailed analytics including: +- Runner usage by repository +- Repository usage by runner type +- Repositories with zero workflow runs +- Runners not defined in scale-config or standard GitHub runners +- Usage patterns and trends + +### `cache_manager.py` + +**Purpose**: Helper script. Provides efficient caching functionality for GitHub API responses to optimize performance and avoid rate limiting. 
+ +**Features**: +- URL-based cache key generation +- Intelligent cache invalidation +- Rate limit optimization +- Reduces redundant API calls during analysis + diff --git a/tools/analytics/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py similarity index 99% rename from tools/analytics/analyze_runner_usage.py rename to tools/analytics/org/analyze_runner_usage.py index 5d60e76ecd..404bfabadf 100644 --- a/tools/analytics/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -67,7 +67,7 @@ import yaml from dotenv import load_dotenv -from tools.analytics.cache_manager import CACHE_DIR, CacheManager +from cache_manager import CACHE_DIR, CacheManager load_dotenv() diff --git a/tools/analytics/cache_manager.py b/tools/analytics/org/cache_manager.py similarity index 100% rename from tools/analytics/cache_manager.py rename to tools/analytics/org/cache_manager.py From 2a232b8c17bd59a8633b314d9b7365b0d92f3be9 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 16:12:04 -0500 Subject: [PATCH 4/9] Move more cache functions over --- tools/analytics/org/analyze_runner_usage.py | 79 +------------------- tools/analytics/org/cache_manager.py | 81 ++++++++++++++++++++- 2 files changed, 81 insertions(+), 79 deletions(-) diff --git a/tools/analytics/org/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py index 404bfabadf..80107700c0 100644 --- a/tools/analytics/org/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -55,7 +55,6 @@ """ import argparse -import json import logging import os from collections import defaultdict @@ -65,10 +64,9 @@ import requests import yaml +from cache_manager import get_cache_stats, make_cached_request from dotenv import load_dotenv -from cache_manager import CACHE_DIR, CacheManager - load_dotenv() @@ -141,58 +139,9 @@ "linux.g5.4xlarge.nvidia.cpu", # a nonexistent label used by a repo ] -HEADERS = { - "Authorization": f"Bearer {GITHUB_TOKEN}", - "Accept": "application/vnd.github+json", -} - BASE_URL = "https://api.github.com" WORKFLOW_RUN_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" -# Global cache manager instance -cache_manager = CacheManager() - - -def make_cached_request( - url: str, headers: Optional[Dict[str, str]] = None -) -> Optional[Dict]: - """ - Make an HTTP request with caching. Returns the JSON response if successful. 
- - Args: - url: The URL to request - headers: Optional headers for the request - - Returns: - JSON response data if successful, None if failed - """ - # Check cache first - cached_response = cache_manager.get(url) - if cached_response: - logging.info(f"[make_cached_request] Using cached response for: {url}") - return cached_response - - # Make actual HTTP request - logging.info(f"[make_cached_request] Making HTTP request to: {url}") - try: - response = requests.get(url, headers=headers or HEADERS) - response.raise_for_status() - data = response.json() - - # Cache successful response - cache_manager.set(url, data) - logging.info(f"[make_cached_request] Successfully cached response for: {url}") - return data - - except requests.exceptions.RequestException as e: - logging.error(f"[make_cached_request] HTTP request failed for {url}: {e}") - return None - except json.JSONDecodeError as e: - logging.error( - f"[make_cached_request] Failed to parse JSON response for {url}: {e}" - ) - return None - def get_repos(org: str) -> List[str]: logging.info(f"[get_repos] Start fetching repositories for org: {org}") @@ -445,32 +394,6 @@ def save_to_yaml(data: Dict, filename: str = "runner_labels_summary.yml"): logging.info(f"[save_to_yaml] Data successfully saved to {filename}") -def clear_cache(): - """Clear all cached data.""" - import shutil - - if CACHE_DIR.exists(): - shutil.rmtree(CACHE_DIR) - CACHE_DIR.mkdir(exist_ok=True) - logging.info(f"[clear_cache] Cleared cache directory: {CACHE_DIR}") - else: - logging.info(f"[clear_cache] Cache directory does not exist: {CACHE_DIR}") - - -def get_cache_stats(): - """Get statistics about the cache.""" - if not CACHE_DIR.exists(): - return {"total_files": 0, "total_size_mb": 0} - - cache_files = list(CACHE_DIR.glob("*.json")) - total_size = sum(f.stat().st_size for f in cache_files) - - return { - "total_files": len(cache_files), - "total_size_mb": round(total_size / (1024 * 1024), 2), - } - - def download_scale_config(url: str, dest: str = "scale-config.yml") -> bool: """Download scale-config.yml from the given URL if it does not exist locally.""" if os.path.exists(dest): diff --git a/tools/analytics/org/cache_manager.py b/tools/analytics/org/cache_manager.py index f9d86e89f4..69eb42015f 100644 --- a/tools/analytics/org/cache_manager.py +++ b/tools/analytics/org/cache_manager.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import Dict, Optional +import requests + # Cache configuration CACHE_DIR = Path("cache") @@ -95,4 +97,81 @@ def set(self, url: str, data: Dict) -> None: json.dump(data, f, indent=2) logging.debug(f"[CacheManager] Cached response for URL: {url}") except IOError as e: - logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") \ No newline at end of file + logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") + + +# Global cache manager instance +cache_manager = CacheManager() + + +def get_cache_stats(): + """Get statistics about the cache.""" + if not CACHE_DIR.exists(): + return {"total_files": 0, "total_size_mb": 0} + + cache_files = list(CACHE_DIR.glob("*.json")) + total_size = sum(f.stat().st_size for f in cache_files) + + return { + "total_files": len(cache_files), + "total_size_mb": round(total_size / (1024 * 1024), 2), + } + + +def clear_cache(): + """Clear all cached data.""" + import shutil + + if CACHE_DIR.exists(): + shutil.rmtree(CACHE_DIR) + CACHE_DIR.mkdir(exist_ok=True) + logging.info(f"[clear_cache] Cleared cache directory: {CACHE_DIR}") + else: + logging.info(f"[clear_cache] Cache 
directory does not exist: {CACHE_DIR}") + + +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + + +def make_cached_request( + url: str, headers: Optional[Dict[str, str]] = None +) -> Optional[Dict]: + """ + Make an HTTP request with caching. Returns the JSON response if successful. + + Args: + url: The URL to request + headers: Optional headers for the request + + Returns: + JSON response data if successful, None if failed + """ + # Check cache first + cached_response = cache_manager.get(url) + if cached_response: + logging.info(f"[make_cached_request] Using cached response for: {url}") + return cached_response + + # Make actual HTTP request + logging.info(f"[make_cached_request] Making HTTP request to: {url}") + try: + response = requests.get(url, headers=headers or HEADERS) + response.raise_for_status() + data = response.json() + + # Cache successful response + cache_manager.set(url, data) + logging.info(f"[make_cached_request] Successfully cached response for: {url}") + return data + + except requests.exceptions.RequestException as e: + logging.error(f"[make_cached_request] HTTP request failed for {url}: {e}") + return None + except json.JSONDecodeError as e: + logging.error( + f"[make_cached_request] Failed to parse JSON response for {url}: {e}" + ) + return None From 65e877e0fc97875785898572a53db29275026663 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 16:22:13 -0500 Subject: [PATCH 5/9] Fix refactoring --- tools/analytics/org/analyze_runner_usage.py | 12 +++++++++--- tools/analytics/org/cache_manager.py | 12 +++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/analytics/org/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py index 80107700c0..4ff31306a1 100644 --- a/tools/analytics/org/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -79,6 +79,12 @@ GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") ORG_NAME = None # Will be set by argparse +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + # List of repositories to exclude in the format 'org/repo' EXCLUDED_REPOS = [ "pytorch/pytorch", @@ -150,7 +156,7 @@ def get_repos(org: str) -> List[str]: while True: url = f"{BASE_URL}/orgs/{org}/repos?per_page=100&page={page}" logging.debug(f"[get_repos] Requesting URL: {url}") - data = make_cached_request(url) + data = make_cached_request(url, HEADERS) if data is None: logging.error(f"[get_repos] Failed to fetch page {page} for org: {org}") break @@ -186,7 +192,7 @@ def get_workflow_runs(org: str, repo: str) -> List[Dict]: while True: url = f"{BASE_URL}/repos/{org}/{repo}/actions/runs?per_page=100&page={page}&created=>={WORKFLOW_RUN_LOOKBACK}" logging.debug(f"[get_workflow_runs] Requesting URL: {url}") - response_data = make_cached_request(url) + response_data = make_cached_request(url, HEADERS) if response_data is None: logging.error( f"[get_workflow_runs] Failed to fetch page {page} for repo: {repo}" @@ -271,7 +277,7 @@ def get_jobs_for_run( ) url = f"{BASE_URL}/repos/{org}/{repo}/actions/runs/{run_id}/jobs" logging.debug(f"[get_jobs_for_run] Requesting URL: {url}") - response_data = make_cached_request(url) + response_data = make_cached_request(url, HEADERS) if response_data is None: logging.error( f"[get_jobs_for_run] Failed to fetch jobs for run {run_id} in repo: {repo}" diff --git a/tools/analytics/org/cache_manager.py b/tools/analytics/org/cache_manager.py index 
69eb42015f..48c05862d4 100644 --- a/tools/analytics/org/cache_manager.py +++ b/tools/analytics/org/cache_manager.py @@ -130,21 +130,15 @@ def clear_cache(): logging.info(f"[clear_cache] Cache directory does not exist: {CACHE_DIR}") -HEADERS = { - "Authorization": f"Bearer {GITHUB_TOKEN}", - "Accept": "application/vnd.github+json", -} - - def make_cached_request( - url: str, headers: Optional[Dict[str, str]] = None + url: str, headers: Dict[str, str] ) -> Optional[Dict]: """ Make an HTTP request with caching. Returns the JSON response if successful. Args: url: The URL to request - headers: Optional headers for the request + headers: Headers for the request (required) Returns: JSON response data if successful, None if failed @@ -158,7 +152,7 @@ def make_cached_request( # Make actual HTTP request logging.info(f"[make_cached_request] Making HTTP request to: {url}") try: - response = requests.get(url, headers=headers or HEADERS) + response = requests.get(url, headers=headers) response.raise_for_status() data = response.json() From 50bc9bba426e5ea8f15cff38842fb1cab9edf2c2 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 17:15:16 -0500 Subject: [PATCH 6/9] lint fixes --- tools/analytics/org/README.md | 1 - tools/analytics/org/cache_manager.py | 4 +--- tools/analytics/org/requirements.txt | 3 +++ 3 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 tools/analytics/org/requirements.txt diff --git a/tools/analytics/org/README.md b/tools/analytics/org/README.md index 757e30670a..6a22f25251 100644 --- a/tools/analytics/org/README.md +++ b/tools/analytics/org/README.md @@ -36,4 +36,3 @@ The tools in this directory help us understand how GitHub Actions runners are be - Intelligent cache invalidation - Rate limit optimization - Reduces redundant API calls during analysis - diff --git a/tools/analytics/org/cache_manager.py b/tools/analytics/org/cache_manager.py index 48c05862d4..60cf4544f1 100644 --- a/tools/analytics/org/cache_manager.py +++ b/tools/analytics/org/cache_manager.py @@ -130,9 +130,7 @@ def clear_cache(): logging.info(f"[clear_cache] Cache directory does not exist: {CACHE_DIR}") -def make_cached_request( - url: str, headers: Dict[str, str] -) -> Optional[Dict]: +def make_cached_request(url: str, headers: Dict[str, str]) -> Optional[Dict]: """ Make an HTTP request with caching. Returns the JSON response if successful. 
diff --git a/tools/analytics/org/requirements.txt b/tools/analytics/org/requirements.txt new file mode 100644 index 0000000000..dd9f3517df --- /dev/null +++ b/tools/analytics/org/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.28.0 +pyyaml>=6.0 +python-dotenv>=0.19.0 From 782b327d278de73d2ace5a51f87ca995583236b5 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 12 Aug 2025 16:48:29 -0500 Subject: [PATCH 7/9] changes so far, with pytorch-labs remover and gh code search --- tools/analytics/org/.gitignore | 5 +- tools/analytics/org/analyze_contributors.py | 664 ++++++++++++++++++++ tools/analytics/org/analyze_repo_info.py | 259 ++++++++ tools/analytics/org/analyze_runner_usage.py | 17 +- tools/analytics/org/github_code_search.py | 589 +++++++++++++++++ tools/analytics/org/remove_pytorch_labs.py | 636 +++++++++++++++++++ 6 files changed, 2164 insertions(+), 6 deletions(-) create mode 100644 tools/analytics/org/analyze_contributors.py create mode 100644 tools/analytics/org/analyze_repo_info.py create mode 100644 tools/analytics/org/github_code_search.py create mode 100644 tools/analytics/org/remove_pytorch_labs.py diff --git a/tools/analytics/org/.gitignore b/tools/analytics/org/.gitignore index ecde95c402..5a4d01b96c 100644 --- a/tools/analytics/org/.gitignore +++ b/tools/analytics/org/.gitignore @@ -2,4 +2,7 @@ cache/ # Gets temporarily created by the script -scale-config.yml \ No newline at end of file +scale-config.yml + +# Stores the output of the analysis +reports/ \ No newline at end of file diff --git a/tools/analytics/org/analyze_contributors.py b/tools/analytics/org/analyze_contributors.py new file mode 100644 index 0000000000..5b9439eeb5 --- /dev/null +++ b/tools/analytics/org/analyze_contributors.py @@ -0,0 +1,664 @@ +""" +GitHub Organization Contributor Analyzer +======================================== + +Purpose: +-------- +This script analyzes contributors across all repositories in a specified GitHub organization over the past 6 months. +It identifies frequent contributors and attempts to determine their company affiliations based on email addresses +and GitHub profile information. + +Key Features: +------------- +- Fetches all non-archived repositories in a GitHub organization (excluding a configurable list). +- For each repository, analyzes commits from the past 6 months to identify contributors. +- Extracts contributor information including email addresses and GitHub profiles. +- Attempts to identify company affiliations from email domains and GitHub profile data. +- Aggregates contributor statistics across repositories. +- Outputs a YAML summary (reports/contributors_summary.yml) with detailed contributor analysis. +- Caches GitHub API responses for efficiency and rate limit avoidance. + +How to Run: +----------- +1. Ensure you have Python 3.9+ and install dependencies (see below). +2. Set the following environment variable (can be in a .env file): + - `GITHUB_TOKEN`: A GitHub personal access token with `repo` and `user` read permissions. +3. (Optional) Edit the EXCLUDED_REPOS list in the script to customize exclusions. +4. Run the script: + + ```bash + python analyze_contributors.py [--org ORG_NAME] + ``` + - Use `--org` to specify the GitHub organization to analyze (defaults to 'pytorch'). + +Dependencies: +------------- +- requests +- pyyaml +- python-dotenv + +Output: +------- +- `reports/contributors_summary.yml`: A YAML file containing: + - `contributors_by_frequency`: Contributors sorted by commit count across all repos. 
+ - `contributors_by_repo`: For each repo, list of contributors with their stats. + - `company_analysis`: Contributors grouped by identified companies. + - `unidentified_contributors`: Contributors without identifiable company affiliation. +- Caches API responses in the `cache/` directory for faster reruns. + +Notes: +------ +- The script looks back 6 months for commits. +- Company identification is based on email domains and GitHub profile information. +- The script is safe to rerun; it uses caching to avoid redundant API calls. +- For large orgs, the script may take a while on the first run due to API rate limits. + +""" + +import argparse +import logging +import os +import re +from collections import defaultdict +from datetime import datetime, timedelta +from typing import Dict, List, Optional + +import requests +import yaml +from cache_manager import get_cache_stats, make_cached_request +from dotenv import load_dotenv + + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +ORG_NAME = None # Will be set by argparse + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +# List of repositories to exclude in the format 'org/repo' +EXCLUDED_REPOS = [ + "pytorch/pytorch", + "pytorch/executorch", + "pytorch/test-infra", + "pytorch/ci-infra", + "pytorch/pytorch-canary", + "pytorch/tutorials", + "pytorch/docs", + "pytorch/cppdocs", + "pytorch/pytorch.github.io", + "pytorch/examples", + # archived but not marked as such in github repo settings + "pytorch/serve", + # proposed + "pytorch/builder", + "pytorch/xla", + "pytorch/benchmark", + "pytorch/pytorch-integration-testing", +] + +# Company domains mapping +COMPANY_DOMAINS = { + "meta.com": "Meta", + "fb.com": "Meta", + "facebook.com": "Meta", + "google.com": "Google", + "microsoft.com": "Microsoft", + "nvidia.com": "NVIDIA", + "intel.com": "Intel", + "amd.com": "AMD", + "apple.com": "Apple", + "amazon.com": "Amazon", + "aws.com": "Amazon", + "ibm.com": "IBM", + "redhat.com": "Red Hat", + "canonical.com": "Canonical", + "huggingface.co": "Hugging Face", + "openai.com": "OpenAI", + "anthropic.com": "Anthropic", + "deepmind.com": "DeepMind", + "salesforce.com": "Salesforce", + "uber.com": "Uber", + "netflix.com": "Netflix", + "airbnb.com": "Airbnb", + "spotify.com": "Spotify", + "tesla.com": "Tesla", +} + +BASE_URL = "https://api.github.com" +COMMIT_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" # 6 months + + +def get_repos(org: str) -> List[str]: + logging.info(f"[get_repos] Start fetching repositories for org: {org}") + repos = [] + page = 1 + while True: + url = f"{BASE_URL}/orgs/{org}/repos?per_page=100&page={page}" + logging.debug(f"[get_repos] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None: + logging.error(f"[get_repos] Failed to fetch page {page} for org: {org}") + break + if not data: + logging.info( + f"[get_repos] No more repositories found on page {page} for org: {org}" + ) + break + logging.info( + f"[get_repos] Page {page}: Found {len(data)} repositories for org: {org}" + ) + # Filter out archived repositories + non_archived_repos = [ + repo["name"] for repo in data if not repo.get("archived", False) + ] + repos.extend(non_archived_repos) + logging.info( + f"[get_repos] Page {page}: Excluded {len(data) - len(non_archived_repos)} archived repositories" + ) + 
page += 1 + logging.info( + f"[get_repos] Finished fetching repositories for org: {org}. Total: {len(repos)} (excluding archived)" + ) + return repos + + +def get_commits(org: str, repo: str) -> List[Dict]: + """Get commits for a repository from the past 6 months.""" + logging.info(f"[get_commits] Start fetching commits for repo: {repo} in org: {org}") + all_commits = [] + page = 1 + + while True: + url = f"{BASE_URL}/repos/{org}/{repo}/commits?per_page=100&page={page}&since={COMMIT_LOOKBACK}" + logging.debug(f"[get_commits] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None: + logging.error(f"[get_commits] Failed to fetch page {page} for repo: {repo}") + break + if not data: + logging.info(f"[get_commits] No more commits found for repo: {repo} on page {page}") + break + logging.info(f"[get_commits] Page {page}: Found {len(data)} commits for repo: {repo}") + all_commits.extend(data) + page += 1 + + # Limit to reasonable number of commits to avoid API rate limits + if len(all_commits) >= 1000: + logging.info(f"[get_commits] Limiting to 1000 commits for repo: {repo}") + break + + logging.info(f"[get_commits] Finished fetching commits for repo: {repo}. Total: {len(all_commits)}") + return all_commits + + +def get_user_profile(username: str) -> Optional[Dict]: + """Get GitHub user profile information.""" + if not username: + return None + + url = f"{BASE_URL}/users/{username}" + logging.debug(f"[get_user_profile] Fetching profile for user: {username}") + return make_cached_request(url, HEADERS) + + +def extract_company_from_email(email: str) -> Optional[str]: + """Extract company name from email domain.""" + if not email or "@" not in email: + return None + + domain = email.split("@")[1].lower() + + # Check direct domain matches + if domain in COMPANY_DOMAINS: + return COMPANY_DOMAINS[domain] + + # Check for subdomains + for company_domain, company_name in COMPANY_DOMAINS.items(): + if domain.endswith(f".{company_domain}"): + return company_name + + # Skip generic email providers + generic_providers = { + "gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "icloud.com", + "protonmail.com", "tutanota.com", "hey.com", "fastmail.com", + "users.noreply.github.com" # GitHub's privacy-preserving email addresses + } + + if domain in generic_providers: + return None + + # For other domains, try to extract company name + # Remove common TLDs and subdomains + domain_parts = domain.replace(".com", "").replace(".org", "").replace(".net", "").split(".") + if domain_parts and len(domain_parts[-1]) > 2: + return domain_parts[-1].title() + + return None + + +def extract_company_from_profile(profile: Dict) -> Optional[str]: + """Extract company name from GitHub profile.""" + if not profile: + return None + + company = profile.get("company") or "" + company = company.strip() if company else "" + if not company: + return None + + # Clean up company name + company = re.sub(r'^@', '', company) # Remove @ prefix + company = company.strip() + + if not company: + return None + + # Map common company variations + company_mappings = { + "meta": "Meta", + "facebook": "Meta", + "google": "Google", + "microsoft": "Microsoft", + "nvidia": "NVIDIA", + "intel": "Intel", + "amd": "AMD", + "apple": "Apple", + "amazon": "Amazon", + "aws": "Amazon", + "ibm": "IBM", + "red hat": "Red Hat", + "redhat": "Red Hat", + "canonical": "Canonical", + "hugging face": "Hugging Face", + "huggingface": "Hugging Face", + "openai": "OpenAI", + "anthropic": "Anthropic", + "deepmind": "DeepMind", + 
"salesforce": "Salesforce", + "uber": "Uber", + "netflix": "Netflix", + "airbnb": "Airbnb", + "spotify": "Spotify", + "tesla": "Tesla", + } + + company_lower = company.lower() + if company_lower in company_mappings: + return company_mappings[company_lower] + + return company.title() + + +def cache_to_disk(func): + """ + A decorator that caches the result of a function to disk. + The cache key is generated from the function name, its arguments, and today's date. + Handles complex types like lists and dictionaries properly. + """ + import hashlib + import json + import os + from datetime import date + from functools import wraps + + def make_hashable(obj): + """Convert a container to a frozen/hashable form for reliable caching.""" + if isinstance(obj, dict): + return tuple(sorted((k, make_hashable(v)) for k, v in obj.items())) + elif isinstance(obj, (list, tuple)): + return tuple(make_hashable(x) for x in obj) + # For sets, convert to sorted tuples + elif isinstance(obj, set): + return tuple(sorted(make_hashable(x) for x in obj)) + # Handle string representation for other objects that might not be JSON serializable + elif not isinstance(obj, (str, int, float, bool, type(None))): + return str(obj) + return obj + + @wraps(func) + def wrapper(*args, **kwargs): + # Create cache directory if it doesn't exist + cache_dir = "cache" + os.makedirs(cache_dir, exist_ok=True) + + # Generate a cache key based on function name and args + func_name = func.__name__ + # Create a function-specific subdirectory for better organization + func_cache_dir = os.path.join(cache_dir, func_name) + os.makedirs(func_cache_dir, exist_ok=True) + + # Get today's date for cache versioning + today = date.today().isoformat() # Format: YYYY-MM-DD + + # Make args and kwargs hashable before serializing + hashable_args = tuple(make_hashable(arg) for arg in args) + hashable_kwargs = {k: make_hashable(v) for k, v in kwargs.items()} + + try: + # Try to serialize with standard JSON, including today's date + arg_representation = { + "date": today, + "args": hashable_args, + "kwargs": sorted(hashable_kwargs.items()) + } + serialized_args = json.dumps(arg_representation, sort_keys=True) + except (TypeError, ValueError): + # If serialization fails, use string representation as fallback + serialized_args = today + str(hashable_args) + str(sorted(hashable_kwargs.items())) + + arg_hash = hashlib.sha256(serialized_args.encode()).hexdigest() + key = f"{func_name}_{today}_{arg_hash}" + + # Check if cached result exists + filepath = os.path.join(func_cache_dir, f"{today}_{arg_hash}.json") + if os.path.exists(filepath): + logging.debug(f"Cache hit for function: {func_name} (cached on {today})") + with open(filepath, "r") as f: + return json.load(f) + + # If not cached, call the function + result = func(*args, **kwargs) + + # Cache the result + with open(filepath, "w") as f: + json.dump(result, f) + logging.debug(f"Cached result for function: {func_name}, saved to: {filepath} (date: {today})") + + return result + + return wrapper + + +@cache_to_disk +def analyze_contributors(org: str, repos: List[str]) -> Dict: + """Analyze contributors across all repositories.""" + logging.info(f"[analyze_contributors] Start analyzing contributors for {len(repos)} repositories in org: {org}") + + # Track contributors across all repos + global_contributors = defaultdict(lambda: { + "total_commits": 0, + "repos": set(), + "emails": set(), + "username": None, + "company": None, + "profile": None + }) + + # Track contributors by repo + repo_contributors = {} + + for 
repo in repos: + logging.info(f"[analyze_contributors] Processing repo: {repo}") + commits = get_commits(org, repo) + repo_contributor_stats = defaultdict(lambda: { + "commits": 0, + "emails": set(), + "username": None + }) + + for commit in commits: + author = commit.get("commit", {}).get("author", {}) + github_author = commit.get("author") + + author_name = author.get("name", "Unknown") + author_email = author.get("email", "") + username = github_author.get("login") if github_author else None + + # Since we can assume GitHub username info is always there, use it as the primary key + contributor_key = username + if not contributor_key: + raise ValueError(f"Commit {commit['sha']} in repo {repo} has no identifiable contributor information.") + + # Update repo-specific stats + repo_contributor_stats[contributor_key]["commits"] += 1 + if author_email: + repo_contributor_stats[contributor_key]["emails"].add(author_email) + if username: + repo_contributor_stats[contributor_key]["username"] = username + + # Update global stats + global_contributors[contributor_key]["total_commits"] += 1 + global_contributors[contributor_key]["repos"].add(repo) + if author_email: + global_contributors[contributor_key]["emails"].add(author_email) + if username: + global_contributors[contributor_key]["username"] = username + + # Convert sets to lists for YAML serialization + repo_contributors[repo] = [] + for contributor_key, stats in repo_contributor_stats.items(): + repo_contributors[repo].append({ + "contributor": contributor_key, + "commits": stats["commits"], + "emails": list(stats["emails"]), + "username": stats["username"] + }) + + # Sort by commit count + repo_contributors[repo].sort(key=lambda x: x["commits"], reverse=True) + + logging.info(f"[analyze_contributors] Found {len(repo_contributors[repo])} contributors for repo: {repo}") + + # Enhance global contributors with profile and company information + logging.info(f"[analyze_contributors] Enhancing contributor information with profiles and companies") + for contributor_key, stats in global_contributors.items(): + # First, try to extract company from email addresses (prioritize this) + if stats["emails"]: + for email in stats["emails"]: + company_from_email = extract_company_from_email(email) + if company_from_email: + stats["company"] = company_from_email + break + + # Only if email didn't provide a clear company mapping, try GitHub profile + if not stats["company"] and stats["username"]: + profile = get_user_profile(stats["username"]) + stats["profile"] = profile + + # Try to extract company from profile + company_from_profile = extract_company_from_profile(profile) + if company_from_profile: + stats["company"] = company_from_profile + + # Convert sets to lists for YAML serialization + stats["repos"] = list(stats["repos"]) + stats["emails"] = list(stats["emails"]) + + logging.info(f"[analyze_contributors] Finished analyzing contributors for org: {org}") + return global_contributors, repo_contributors + + +def save_to_yaml(data: Dict, filename: str = "contributors_summary.yml"): + """Save data to YAML file.""" + # Create reports directory if it doesn't exist + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + + # Build full path with reports directory + filepath = os.path.join(reports_dir, filename) + logging.info(f"[save_to_yaml] Saving contributor data to {filepath}") + + # Convert defaultdict to regular dict to avoid YAML serialization issues + if hasattr(data, "default_factory"): + data = dict(data) + + with open(filepath, "w") as 
f: + yaml.dump(data, f, sort_keys=False, default_flow_style=False) + + logging.info(f"[save_to_yaml] Data successfully saved to {filepath}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze GitHub org contributor patterns and company affiliations." + ) + parser.add_argument( + "--org", + type=str, + default="pytorch-labs", + help="GitHub organization to analyze (default: pytorch-labs)", + ) + args = parser.parse_args() + + global ORG_NAME + ORG_NAME = args.org + + if not GITHUB_TOKEN: + logging.error("[main] Missing GITHUB_TOKEN in environment variables.") + return + + logging.info(f"[main] Starting contributor analysis for org: {ORG_NAME}") + + # Show cache stats at start + cache_stats = get_cache_stats() + logging.info( + f"[main] Cache stats: {cache_stats['total_files']} files, {cache_stats['total_size_mb']} MB" + ) + + # Get repositories + repos = get_repos(ORG_NAME) + filtered_repos = [ + repo for repo in repos if f"{ORG_NAME}/{repo}" not in EXCLUDED_REPOS + ] + + logging.info(f"[main] Analyzing {len(filtered_repos)} repositories (excluded {len(repos) - len(filtered_repos)})") + + # Analyze contributors + global_contributors, repo_contributors = analyze_contributors(ORG_NAME, filtered_repos) + + # Sort contributors by frequency + contributors_by_frequency = [] + for contributor_key, stats in global_contributors.items(): + contributors_by_frequency.append({ + "contributor": contributor_key, + "total_commits": stats["total_commits"], + "repos_count": len(stats["repos"]), + "repos": stats["repos"], + "emails": stats["emails"], + "username": stats["username"], + "company": stats["company"] + }) + + contributors_by_frequency.sort(key=lambda x: x["total_commits"], reverse=True) + + # Group contributors by company + company_analysis = defaultdict(list) + unidentified_contributors = [] + + for contributor in contributors_by_frequency: + if contributor["company"]: + company_analysis[contributor["company"]].append({ + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"] + }) + else: + unidentified_contributors.append({ + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"], + "emails": contributor["emails"] + }) + + # Sort company contributors by commit count + for company in company_analysis: + company_analysis[company].sort(key=lambda x: x["total_commits"], reverse=True) + + # Prepare output data + output_data = { + "analysis_metadata": { + "organization": ORG_NAME, + "analysis_date": datetime.utcnow().isoformat() + "Z", + "lookback_period_days": 180, + "repositories_analyzed": len(filtered_repos), + "total_contributors": len(contributors_by_frequency), + "contributors_with_company": len(contributors_by_frequency) - len(unidentified_contributors), + "contributors_without_company": len(unidentified_contributors) + }, + "contributors_by_frequency": contributors_by_frequency[:50], # Top 50 contributors + "company_analysis": dict(company_analysis), + "unidentified_contributors": unidentified_contributors[:20], # Top 20 unidentified + "contributors_by_repo": repo_contributors + } + + # Sort output for consistency + def deep_sort(obj, sort_keys=True): + if isinstance(obj, dict): + keys = sorted(obj) if sort_keys else obj.keys() + return {k: deep_sort(obj[k]) for k in keys} + elif isinstance(obj, list): + return [deep_sort(x) for x 
in obj] + else: + return obj + + # Don't sort top-level keys to maintain logical order + for key in ["company_analysis", "contributors_by_repo"]: + if key in output_data: + output_data[key] = deep_sort(output_data[key]) + + save_to_yaml(output_data) + + # Show final cache stats + final_cache_stats = get_cache_stats() + logging.info( + f"[main] Final cache stats: {final_cache_stats['total_files']} files, {final_cache_stats['total_size_mb']} MB" + ) + + # Print summary + print(f"\nAnalysis Summary:") + print(f"- Organization: {ORG_NAME}") + print(f"- Repositories analyzed: {len(filtered_repos)}") + print(f"- Total contributors: {len(contributors_by_frequency)}") + print(f"- Contributors with identified companies: {len(contributors_by_frequency) - len(unidentified_contributors)}") + print(f"- Top companies by contributor count:") + + # Show top companies + company_contributor_count = [(company, len(contributors)) for company, contributors in company_analysis.items()] + company_contributor_count.sort(key=lambda x: x[1], reverse=True) + + for company, count in company_contributor_count[:20]: + total_commits = sum(c["total_commits"] for c in company_analysis[company]) + print(f" - {company}: {count} contributors, {total_commits} total commits") + + # Show top contributors (>7 commits) with their repository breakdown + print(f"\nTop contributors (>7 commits):") + top_contributors = [c for c in contributors_by_frequency if c["total_commits"] > 7] + + for contributor in top_contributors: + contributor_key = contributor["contributor"] + + # Get repo-specific commit counts for this contributor + repo_commits = [] + for repo in contributor["repos"]: + # Find this contributor in the repo's contributor list + for repo_contrib in repo_contributors.get(repo, []): + if repo_contrib["contributor"] == contributor_key: + repo_commits.append(f"{repo}({repo_contrib['commits']})") + break + + # Sort by commit count (descending) + repo_commits.sort(key=lambda x: int(x.split('(')[1].split(')')[0]), reverse=True) + + # Format the contributor name (use username if available, otherwise email/name) + display_name = contributor["username"] if contributor["username"] else contributor_key + + print(f"- {display_name}, {', '.join(repo_commits)}") + + logging.info("[main] Script completed successfully.") + + +if __name__ == "__main__": + main() diff --git a/tools/analytics/org/analyze_repo_info.py b/tools/analytics/org/analyze_repo_info.py new file mode 100644 index 0000000000..da2263f3c1 --- /dev/null +++ b/tools/analytics/org/analyze_repo_info.py @@ -0,0 +1,259 @@ +""" +GitHub Organization Repository Information Analyzer +================================================== + +Purpose: +-------- +This script analyzes all repositories in a specified GitHub organization and outputs a CSV file with key repository information including visibility, archived status, and last commit date. + +Key Features: +------------- +- Fetches all repositories in a GitHub organization (including archived ones). +- Collects repository metadata including visibility, archived status, and last commit date. +- Outputs a CSV file with repository information for easy analysis. +- Caches GitHub API responses for efficiency and rate limit avoidance. + +How to Run: +----------- +1. Ensure you have Python 3.9+ and install dependencies (see below). +2. Set the following environment variable (can be in a .env file): + - `GITHUB_TOKEN`: A GitHub personal access token with `repo` read permissions. +3. 
Run the script: + + ```bash + python analyze_repo_info.py [--org ORG_NAME] + ``` + - Use `--org` to specify the GitHub organization to analyze (defaults to 'pytorch'). + +Dependencies: +------------- +- requests +- python-dotenv +- csv (built-in) + +Output: +------- +- `reports/repo_info_summary.csv`: A CSV file containing: + - Repo name (in org/repo format) + - Public (True if public, False if Private) + - Archived (True if archived, else False) + - Last commit date (date repo was last committed to, in YYYY-MM-DD format) +- Caches API responses in the `cache/` directory for faster reruns. + +Notes: +------ +- The script is safe to rerun; it uses caching to avoid redundant API calls. +- For large orgs, the script may take a while on the first run due to API rate limits. +""" + +import argparse +import csv +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional + +import requests +from cache_manager import get_cache_stats, make_cached_request +from dotenv import load_dotenv + + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +ORG_NAME = None # Will be set by argparse + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +BASE_URL = "https://api.github.com" + + +def get_repos_with_info(org: str) -> List[Dict]: + """ + Fetch all repositories for an organization with their metadata. + + Args: + org: The GitHub organization name + + Returns: + List of repository dictionaries with metadata + """ + logging.info(f"[get_repos_with_info] Start fetching repositories for org: {org}") + repos = [] + page = 1 + while True: + url = f"{BASE_URL}/orgs/{org}/repos?per_page=100&page={page}" + logging.debug(f"[get_repos_with_info] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None: + logging.error(f"[get_repos_with_info] Failed to fetch page {page} for org: {org}") + break + if not data: + logging.info( + f"[get_repos_with_info] No more repositories found on page {page} for org: {org}" + ) + break + logging.info( + f"[get_repos_with_info] Page {page}: Found {len(data)} repositories for org: {org}" + ) + repos.extend(data) + page += 1 + logging.info( + f"[get_repos_with_info] Finished fetching repositories for org: {org}. Total: {len(repos)}" + ) + return repos + + +def get_last_commit_date(org: str, repo: str) -> Optional[str]: + """ + Get the date of the last commit for a repository. 
+ + Args: + org: The GitHub organization name + repo: The repository name + + Returns: + Date string in YYYY-MM-DD format of the last commit, or None if no commits found + """ + logging.info(f"[get_last_commit_date] Getting last commit date for repo: {repo}") + url = f"{BASE_URL}/repos/{org}/{repo}/commits?per_page=1" + logging.debug(f"[get_last_commit_date] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None or not data: + logging.warning(f"[get_last_commit_date] No commits found for repo: {repo}") + return None + + if len(data) > 0: + commit_date = data[0]["commit"]["author"]["date"] + # Convert ISO format to YYYY-MM-DD format + try: + from datetime import datetime + dt = datetime.fromisoformat(commit_date.replace('Z', '+00:00')) + formatted_date = dt.strftime('%Y-%m-%d') + logging.info(f"[get_last_commit_date] Last commit date for {repo}: {formatted_date}") + return formatted_date + except (ValueError, AttributeError) as e: + logging.warning(f"[get_last_commit_date] Failed to parse date for {repo}: {e}") + return None + + return None + + +def process_repo_data(org: str, repos: List[Dict]) -> List[Dict]: + """ + Process repository data and add last commit date information. + + Args: + org: The GitHub organization name + repos: List of repository dictionaries from GitHub API + + Returns: + List of processed repository data with all required fields + """ + logging.info(f"[process_repo_data] Processing {len(repos)} repositories") + processed_repos = [] + + for i, repo in enumerate(repos, 1): + repo_name = repo["name"] + logging.info(f"[process_repo_data] Processing repo {i}/{len(repos)}: {repo_name}") + + # Get last commit date + last_commit_date = get_last_commit_date(org, repo_name) + + processed_repo = { + "repo_name": f"{org}/{repo_name}", + "public": repo.get("private", True) == False, # True if public, False if private + "archived": repo.get("archived", False), + "last_commit_date": last_commit_date + } + + processed_repos.append(processed_repo) + + logging.info(f"[process_repo_data] Finished processing {len(processed_repos)} repositories") + return processed_repos + + +def save_to_csv(data: List[Dict], filename: str = "repo_info_summary.csv"): + """ + Save repository data to a CSV file. + + Args: + data: List of repository dictionaries + filename: Name of the CSV file to create + """ + # Create reports directory if it doesn't exist + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + + # Build full path with reports directory + filepath = os.path.join(reports_dir, filename) + logging.info(f"[save_to_csv] Saving repository data to {filepath}") + + # Define CSV headers + fieldnames = ["repo_name", "public", "archived", "last_commit_date"] + + with open(filepath, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(data) + + logging.info(f"[save_to_csv] Data successfully saved to {filepath}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze GitHub org repository information." 
+ ) + parser.add_argument( + "--org", + type=str, + default="pytorch", + help="GitHub organization to analyze (default: pytorch)", + ) + args = parser.parse_args() + + global ORG_NAME + ORG_NAME = args.org + + if not GITHUB_TOKEN: + logging.error("[main] Missing GITHUB_TOKEN in environment variables.") + return + + logging.info(f"[main] Starting analysis for org: {ORG_NAME}") + + # Show cache stats at start + cache_stats = get_cache_stats() + logging.info( + f"[main] Cache stats: {cache_stats['total_files']} files, {cache_stats['total_size_mb']} MB" + ) + + # Step 1: Get all repositories with their metadata + repos = get_repos_with_info(ORG_NAME) + + # Step 2: Process repository data and add last commit dates + processed_repos = process_repo_data(ORG_NAME, repos) + + # Step 3: Save to CSV + save_to_csv(processed_repos) + + # Show final cache stats + final_cache_stats = get_cache_stats() + logging.info( + f"[main] Final cache stats: {final_cache_stats['total_files']} files, {final_cache_stats['total_size_mb']} MB" + ) + logging.info("[main] Script completed successfully.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tools/analytics/org/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py index 4ff31306a1..f38af446ff 100644 --- a/tools/analytics/org/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -12,7 +12,7 @@ - For each repository, fetches recent workflow runs and extracts the runner labels used in jobs. - Aggregates runner label usage across repositories, including last usage and workflow file. - Compares runner labels against those defined in scale-config.yml and standard GitHub runners. -- Outputs a YAML summary (runner_labels_summary.yml) with detailed runner usage, repos by runner, and special groupings (e.g., runners not in scale-config, repos with zero workflow runs). +- Outputs a YAML summary (reports/runner_labels_summary.yml) with detailed runner usage, repos by runner, and special groupings (e.g., runners not in scale-config, repos with zero workflow runs). - Caches GitHub API responses for efficiency and rate limit avoidance. How to Run: @@ -38,7 +38,7 @@ Output: ------- -- `runner_labels_summary.yml`: A YAML file containing: +- `reports/runner_labels_summary.yml`: A YAML file containing: - `runners_used`: For each runner label, a list of repos, last usage, and workflow file. - `repo_runners`: For each repo, a list of runner labels it uses. - `repositories_with_zero_workflow_runs`: Repos with no workflow runs in the lookback period. 
@@ -391,13 +391,20 @@ def process_repo_runs( def save_to_yaml(data: Dict, filename: str = "runner_labels_summary.yml"): - logging.info(f"[save_to_yaml] Saving runner label data to {filename}") + # Create reports directory if it doesn't exist + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + + # Build full path with reports directory + filepath = os.path.join(reports_dir, filename) + logging.info(f"[save_to_yaml] Saving runner label data to {filepath}") + # Convert defaultdict to regular dict to avoid YAML serialization issues if hasattr(data, "default_factory"): data = dict(data) - with open(filename, "w") as f: + with open(filepath, "w") as f: yaml.dump(data, f, sort_keys=False) - logging.info(f"[save_to_yaml] Data successfully saved to {filename}") + logging.info(f"[save_to_yaml] Data successfully saved to {filepath}") def download_scale_config(url: str, dest: str = "scale-config.yml") -> bool: diff --git a/tools/analytics/org/github_code_search.py b/tools/analytics/org/github_code_search.py new file mode 100644 index 0000000000..56788873ff --- /dev/null +++ b/tools/analytics/org/github_code_search.py @@ -0,0 +1,589 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "requests>=2.31.0", +# "python-dotenv>=1.0.0", +# ] +# /// + +""" +GitHub Code Search API Script +============================ + +Purpose: +-------- +This script uses GitHub's Search API to perform code searches equivalent to the GitHub web interface. +It can search for code across organizations, repositories, and files with various filters. + +Key Features: +------------- +- Search code across GitHub organizations +- Filter by repository, language, file extension, etc. +- Handle GitHub API rate limits +- Cache results for efficiency +- Export results to various formats + +How to Run: +----------- +1. Ensure you have Python 3.10+ and install dependencies (see below). +2. Set the following environment variable (can be in a .env file): + - `GITHUB_TOKEN`: A GitHub personal access token with appropriate permissions. +3. 
Run the script: + + ```bash + python github_code_search.py --query "org:meta-pytorch pytorch-labs" [options] + ``` + +Examples: +--------- +```bash +# Search for "pytorch-labs" in meta-pytorch organization +python github_code_search.py --query "org:meta-pytorch pytorch-labs" + +# Search for specific file types +python github_code_search.py --query "org:meta-pytorch filename:README.md" + +# Search for code in specific language +python github_code_search.py --query "org:meta-pytorch language:python pytorch-labs" + +# Export results to JSON +python github_code_search.py --query "org:meta-pytorch pytorch-labs" --output results.json +``` + +Output: +------- +- Console output with search results +- Optional JSON/CSV export +- Rate limit information +- Search statistics +""" + +import argparse +import json +import logging +import os +import time +from datetime import datetime, timezone +from typing import Dict, List, Optional, Any, TypedDict, Union +from urllib.parse import quote_plus +from dataclasses import dataclass + +import requests +from dotenv import load_dotenv + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +BASE_URL = "https://api.github.com" +SEARCH_URL = f"{BASE_URL}/search/code" + + +# Type definitions for well-defined schema +class RepositoryInfo(TypedDict): + """Repository information from GitHub search results.""" + id: int + node_id: str + name: str + full_name: str + private: bool + owner: Dict[str, Any] # GitHub user/org object + html_url: str + description: Optional[str] + fork: bool + url: str + forks_url: str + keys_url: str + collaborators_url: str + teams_url: str + hooks_url: str + issue_events_url: str + events_url: str + assignees_url: str + branches_url: str + tags_url: str + blobs_url: str + git_tags_url: str + git_refs_url: str + trees_url: str + statuses_url: str + languages_url: str + stargazers_url: str + contributors_url: str + subscribers_url: str + subscription_url: str + commits_url: str + git_commits_url: str + comments_url: str + issue_comment_url: str + contents_url: str + compare_url: str + merges_url: str + archive_url: str + downloads_url: str + issues_url: str + pulls_url: str + milestones_url: str + notifications_url: str + labels_url: str + releases_url: str + deployments_url: str + created_at: str + updated_at: str + pushed_at: str + git_url: str + ssh_url: str + clone_url: str + svn_url: str + homepage: Optional[str] + size: int + stargazers_count: int + watchers_count: int + language: Optional[str] + has_issues: bool + has_projects: bool + has_downloads: bool + has_wiki: bool + has_pages: bool + has_discussions: bool + forks_count: int + mirror_url: Optional[str] + archived: bool + disabled: bool + open_issues_count: int + license: Optional[Dict[str, Any]] + allow_forking: bool + is_template: bool + web_commit_signoff_required: bool + topics: List[str] + visibility: str + forks: int + open_issues: int + watchers: int + default_branch: str + score: float + + +class SearchResultItem(TypedDict): + """Individual search result item from GitHub code search.""" + name: str + path: str + sha: str + url: str + git_url: str + html_url: str + repository: RepositoryInfo + score: float + file_size: Optional[int] + language: Optional[str] + last_modified_at: Optional[str] + line_numbers: 
Optional[List[int]] + text_matches: Optional[List[Dict[str, Any]]] + + +class GitHubSearchResults(TypedDict): + """Complete search results from GitHub Search API.""" + query: str + total_count: int + retrieved_count: int + items: List[SearchResultItem] + search_time: str + rate_limit_remaining: Optional[int] + rate_limit_reset: Optional[str] + + +@dataclass +class SearchOptions: + """Options for GitHub code search.""" + per_page: int = 100 + max_results: Optional[int] = None + verbose: bool = True + + +class GitHubCodeSearch: + def __init__(self, token: str = None): + """ + Initialize GitHub Code Search client. + + Args: + token: GitHub personal access token. If None, will try to get from GITHUB_TOKEN env var. + """ + self.token = token or GITHUB_TOKEN + if not self.token: + raise ValueError("GitHub token is required. Set GITHUB_TOKEN environment variable or pass token parameter.") + + self.headers = { + "Authorization": f"Bearer {self.token}", + "Accept": "application/vnd.github+json", + } + self.session = requests.Session() + self.session.headers.update(self.headers) + + def search_code(self, query: str, per_page: int = 100, max_results: Optional[int] = None, + verbose: bool = True) -> GitHubSearchResults: + """ + Search for code using GitHub's Search API. + + Args: + query: Search query string + per_page: Number of results per page (max 100) + max_results: Maximum number of results to return (None for all) + verbose: Whether to log progress messages + + Returns: + GitHubSearchResults: Well-defined structure containing: + - query: The search query used + - total_count: Total number of results available from GitHub + - retrieved_count: Number of results actually retrieved + - items: List of SearchResultItem objects with file details + - search_time: ISO timestamp of when search was performed + - rate_limit_remaining: Remaining API calls (if available) + - rate_limit_reset: When rate limit resets (if available) + """ + all_items = [] + page = 1 + total_count = 0 + + if verbose: + logging.info(f"Starting code search with query: {query}") + + while True: + # Check rate limits + rate_limit_info = self._check_rate_limit() + if rate_limit_info['remaining'] == 0: + reset_time = rate_limit_info['reset_time'] + wait_time = max(0, reset_time - time.time()) + if verbose: + logging.warning(f"Rate limit exceeded. 
Waiting {wait_time:.0f} seconds...") + time.sleep(wait_time + 1) + + # Prepare request parameters + params = { + 'q': query, + 'per_page': min(per_page, 100), + 'page': page + } + + try: + if verbose: + logging.info(f"Fetching page {page}...") + response = self.session.get(SEARCH_URL, params=params) + response.raise_for_status() + + data = response.json() + + # Update total count on first page + if page == 1: + total_count = data.get('total_count', 0) + if verbose: + logging.info(f"Total results found: {total_count}") + + items = data.get('items', []) + if not items: + break + + all_items.extend(items) + if verbose: + logging.info(f"Retrieved {len(items)} items from page {page} (total: {len(all_items)})") + + # Check if we've reached the maximum results + if max_results and len(all_items) >= max_results: + all_items = all_items[:max_results] + if verbose: + logging.info(f"Reached maximum results limit: {max_results}") + break + + # Check if there are more pages + if len(items) < per_page: + break + + page += 1 + + # Be respectful to the API + time.sleep(1) + + except requests.exceptions.RequestException as e: + logging.error(f"Error fetching page {page}: {e}") + break + except json.JSONDecodeError as e: + logging.error(f"Error parsing JSON response from page {page}: {e}") + break + + # Get rate limit info for the response + rate_limit_info = self._check_rate_limit() + + return GitHubSearchResults( + query=query, + total_count=total_count, + retrieved_count=len(all_items), + items=all_items, + search_time=datetime.now(timezone.utc).isoformat(), + rate_limit_remaining=rate_limit_info.get('remaining'), + rate_limit_reset=datetime.fromtimestamp(rate_limit_info.get('reset_time', 0)).isoformat() if rate_limit_info.get('reset_time') else None + ) + + def get_rate_limit(self) -> Dict[str, Any]: + """Get GitHub API rate limit status.""" + return self._check_rate_limit() + + def _check_rate_limit(self) -> Dict[str, Any]: + """Check GitHub API rate limit status.""" + try: + response = self.session.get(f"{BASE_URL}/rate_limit") + response.raise_for_status() + data = response.json() + + search_limit = data.get('resources', {}).get('search', {}) + return { + 'limit': search_limit.get('limit', 0), + 'remaining': search_limit.get('remaining', 0), + 'reset_time': search_limit.get('reset', 0) + } + except Exception as e: + logging.warning(f"Could not check rate limit: {e}") + return {'limit': 0, 'remaining': 0, 'reset_time': 0} + + def format_results(self, results: GitHubSearchResults, format_type: str = 'console') -> str: + """Format search results for different output types.""" + if format_type == 'json': + return json.dumps(results, indent=2) + + elif format_type == 'console': + output = [] + output.append(f"=== GitHub Code Search Results ===") + output.append(f"Query: {results['query']}") + output.append(f"Total results: {results['total_count']}") + output.append(f"Retrieved: {results['retrieved_count']}") + output.append(f"Search time: {results['search_time']}") + output.append("") + + for i, item in enumerate(results['items'], 1): + repo_name = item.get('repository', {}).get('full_name', 'Unknown') + file_path = item.get('path', 'Unknown') + file_url = item.get('html_url', '') + score = item.get('score', 0) + + output.append(f"{i}. 
{repo_name}/{file_path}") + output.append(f" Score: {score}") + output.append(f" URL: {file_url}") + output.append("") + + return "\n".join(output) + + elif format_type == 'csv': + import csv + import io + + output = io.StringIO() + writer = csv.writer(output) + + # Write header + writer.writerow(['Repository', 'File Path', 'Score', 'URL', 'Search Time']) + + # Write data + for item in results['items']: + repo_name = item.get('repository', {}).get('full_name', 'Unknown') + file_path = item.get('path', 'Unknown') + file_url = item.get('html_url', '') + score = item.get('score', 0) + + writer.writerow([repo_name, file_path, score, file_url, results['search_time']]) + + return output.getvalue() + + else: + raise ValueError(f"Unsupported format type: {format_type}") + + def get_file_paths(self, results: GitHubSearchResults) -> List[str]: + """Extract just the file paths from search results.""" + return [item.get('path', '') for item in results.get('items', [])] + + def get_repositories(self, results: GitHubSearchResults) -> List[str]: + """Extract just the repository names from search results.""" + return [item.get('repository', {}).get('full_name', '') for item in results.get('items', [])] + + def get_unique_repositories(self, results: GitHubSearchResults) -> List[str]: + """Extract unique repository names from search results.""" + repos = self.get_repositories(results) + return list(set(repos)) + + def filter_by_score(self, results: GitHubSearchResults, min_score: float = 0.0) -> GitHubSearchResults: + """Filter results by minimum score.""" + filtered_items = [ + item for item in results.get('items', []) + if item.get('score', 0) >= min_score + ] + + return GitHubSearchResults( + query=results['query'], + total_count=results['total_count'], + retrieved_count=len(filtered_items), + items=filtered_items, + search_time=results['search_time'], + rate_limit_remaining=results.get('rate_limit_remaining'), + rate_limit_reset=results.get('rate_limit_reset') + ) + + def filter_by_repository(self, results: GitHubSearchResults, repo_pattern: str) -> GitHubSearchResults: + """Filter results by repository name pattern.""" + import re + pattern = re.compile(repo_pattern) + + filtered_items = [ + item for item in results.get('items', []) + if pattern.search(item.get('repository', {}).get('full_name', '')) + ] + + return GitHubSearchResults( + query=results['query'], + total_count=results['total_count'], + retrieved_count=len(filtered_items), + items=filtered_items, + search_time=results['search_time'], + rate_limit_remaining=results.get('rate_limit_remaining'), + rate_limit_reset=results.get('rate_limit_reset') + ) + + +def search_github_code(query: str, token: str = None, per_page: int = 100, + max_results: Optional[int] = None, verbose: bool = True) -> GitHubSearchResults: + """ + Convenience function to search GitHub code. 
+ + Args: + query: Search query string + token: GitHub personal access token (optional, will use GITHUB_TOKEN env var if not provided) + per_page: Number of results per page (max 100) + max_results: Maximum number of results to return (None for all) + verbose: Whether to log progress messages + + Returns: + GitHubSearchResults: Well-defined structure containing search results with the following fields: + - query: The search query used + - total_count: Total number of results available from GitHub + - retrieved_count: Number of results actually retrieved + - items: List of SearchResultItem objects, each containing: + - name: File name + - path: File path in repository + - sha: Git SHA of the file + - url: API URL for the file + - html_url: Web URL for the file + - repository: RepositoryInfo object with full repo details + - score: Relevance score (0-100) + - file_size: File size in bytes (if available) + - language: Programming language (if detected) + - last_modified_at: Last modification time (if available) + - line_numbers: Line numbers where matches were found (if available) + - text_matches: Detailed text match information (if available) + - search_time: ISO timestamp of when search was performed + - rate_limit_remaining: Remaining API calls (if available) + - rate_limit_reset: When rate limit resets (if available) + """ + searcher = GitHubCodeSearch(token) + return searcher.search_code(query, per_page, max_results, verbose) + + +def main(): + parser = argparse.ArgumentParser( + description="Search GitHub code using the GitHub Search API" + ) + parser.add_argument( + "--query", + type=str, + required=True, + help="Search query (e.g., 'org:meta-pytorch pytorch-labs')", + ) + parser.add_argument( + "--per-page", + type=int, + default=100, + help="Number of results per page (max 100, default: 100)", + ) + parser.add_argument( + "--max-results", + type=int, + help="Maximum number of results to retrieve (default: all)", + ) + parser.add_argument( + "--output", + type=str, + help="Output file path (e.g., 'results.json' or 'results.csv')", + ) + parser.add_argument( + "--format", + type=str, + choices=['console', 'json', 'csv'], + default='console', + help="Output format (default: console)", + ) + parser.add_argument( + "--show-rate-limit", + action="store_true", + help="Show rate limit information before searching", + ) + + args = parser.parse_args() + + if not GITHUB_TOKEN: + logging.error("Missing GITHUB_TOKEN in environment variables.") + return + + # Create search instance + searcher = GitHubCodeSearch() + + # Show rate limit if requested + if args.show_rate_limit: + rate_limit = searcher.get_rate_limit() + print(f"Rate limit: {rate_limit['remaining']}/{rate_limit['limit']} remaining") + if rate_limit['remaining'] == 0: + reset_time = datetime.fromtimestamp(rate_limit['reset_time']) + print(f"Rate limit resets at: {reset_time}") + print() + + # Perform search + results = searcher.search_code( + query=args.query, + per_page=args.per_page, + max_results=args.max_results + ) + + # Format and output results + if args.output: + # Determine format from file extension + if args.output.endswith('.json'): + output_format = 'json' + elif args.output.endswith('.csv'): + output_format = 'csv' + else: + output_format = args.format + + formatted_output = searcher.format_results(results, output_format) + + with open(args.output, 'w', encoding='utf-8') as f: + f.write(formatted_output) + + print(f"Results saved to: {args.output}") + + # Also show console summary + console_output = 
searcher.format_results(results, 'console')
+        print(console_output)
+    else:
+        # Just show console output
+        formatted_output = searcher.format_results(results, args.format)
+        print(formatted_output)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tools/analytics/org/remove_pytorch_labs.py b/tools/analytics/org/remove_pytorch_labs.py
new file mode 100644
index 0000000000..9e246daf07
--- /dev/null
+++ b/tools/analytics/org/remove_pytorch_labs.py
@@ -0,0 +1,636 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "requests>=2.31.0",
+#     "python-dotenv>=1.0.0",
+# ]
+# ///
+
+"""
+GitHub Organization Text Replacement Script
+==========================================
+
+Purpose:
+--------
+This script replaces all instances of "pytorch-labs" with "meta-pytorch" across all repositories
+in a specified GitHub organization and creates pull requests for each repository with changes.
+
+Key Features:
+-------------
+- Uses pre-defined list of files known to contain "pytorch-labs" mentions (optimized for performance). This list was obtained by running codesea
+- Replaces all instances of "pytorch-labs" with "meta-pytorch" in target files.
+- Creates a new branch and commits changes for each repository.
+- Creates pull requests with descriptive titles and descriptions.
+- Caches GitHub API responses for efficiency and rate limit avoidance.
+
+How to Run:
+-----------
+1. Ensure you have Python 3.10+ and install dependencies (see below).
+2. Set the following environment variable (can be in a .env file):
+   - `GITHUB_TOKEN`: A GitHub personal access token with `repo` permissions.
+3. Run the script:
+
+   ```bash
+   python remove_pytorch_labs.py [--org ORG_NAME] [--repos REPO_LIST] [--dry-run]
+   ```
+   - Use `--org` to specify the GitHub organization to analyze (defaults to 'pytorch').
+   - Use `--repos` to specify a comma-separated list of repositories to process (e.g., 'pytorch,vision,tutorials').
+   - Use `--dry-run` to preview changes without making them.
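As a rough illustration of the `--repos` filtering described above (the script's real logic lives in `get_target_repos` later in this file), here is a minimal, self-contained sketch; `select_repos` and the three-entry `TARGET_FILES` stub are invented for the example and only borrow a few entries from the real mapping:

```python
from typing import Dict, List, Optional

# Tiny stand-in for the real TARGET_FILES mapping defined later in the script.
TARGET_FILES: Dict[str, List[str]] = {
    "pytorch": ["android/README.md"],
    "vision": ["torchvision/io/image.py"],
    "tutorials": ["index.rst"],
}


def select_repos(repos_arg: Optional[str]) -> List[str]:
    """Intersect the user-supplied --repos list with repos that have target files."""
    candidates = list(TARGET_FILES)
    if not repos_arg:
        return candidates
    requested = [r.strip() for r in repos_arg.split(",")]
    return [r for r in candidates if r in requested]


print(select_repos("pytorch,vision,torchaudio"))  # -> ['pytorch', 'vision']
```

Repos requested on the command line but absent from the target mapping (like `torchaudio` above) are simply dropped, which matches the warning the script logs for filtered-out repositories.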
+ + +Output: +------- +- Logs all operations to console and file +- Creates pull requests for repositories with changes +- Summary report of operations performed + +Notes: +------ +- Only processes 72 pre-identified files that contain "pytorch-labs" mentions +- Skips binary files and files larger than 1MB +- Creates one PR per repository with changes +- Handles GitHub API rate limits automatically +- Significantly faster than scanning all files in all repositories +""" + +import argparse +import base64 +import json +import logging +import os +import re +from datetime import datetime, timezone +from typing import Dict, List, Optional, Tuple + +import requests +from cache_manager import get_cache_stats, make_cached_request +from dotenv import load_dotenv + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +ORG_NAME = None # Will be set by argparse +DRY_RUN = False # Will be set by argparse + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +BASE_URL = "https://api.github.com" + +# Text to replace +OLD_TEXT = "pytorch-labs" +NEW_TEXT = "meta-pytorch" + +# Maximum file size to process (1MB) +MAX_FILE_SIZE = 1024 * 1024 + +# Pre-defined list of files that contain "pytorch-labs" mentions +# This is based on search results and will significantly improve performance +TARGET_FILES = { + "pytorch": [ + "android/README.md", + "aten/src/ATen/native/cuda/int4mm.cu", + "torch/testing/_internal/common_quantization.py" + ], + "vision": [ + "torchvision/io/image.py" + ], + "tutorials": [ + "index.rst", + "docathon-leaderboard.md", + "intermediate_source/transformer_building_blocks.py", + "unstable_source/gpu_quantization_torchao_tutorial.py" + ], + "executorch": [ + "docs/source/index.md", + "docs/source/getting-started.md", + "backends/apple/mps/setup.md", + "docs/source/backends-mps.md", + "docs/source/llm/run-with-c-plus-plus.md", + "docs/source/using-executorch-android.md", + "docs/source/using-executorch-export.md", + "docs/source/using-executorch-building-from-source.md", + "docs/source/using-executorch-cpp.md", + "examples/models/llama/experimental/generate.py", + "scripts/test_ios.sh", + ".ci/scripts/test_ios_ci.sh", + "backends/test/facto/test_facto.py" + ], + "ao": [ + "scripts/download.py", + "torchao/_models/llama/tokenizer.py", + "scripts/convert_hf_checkpoint.py", + "examples/sam2_amg_server/annotate_with_rle.py", + "torchao/prototype/mx_formats/kernels.py", + "torchao/_models/sam/README.md", + "torchao/quantization/README.md", + "test/integration/test_integration.py", + ".github/workflows/dashboard_perf_test.yml" + ], + "benchmark": [ + "torchbenchmark/models/simple_gpt/origin", + "torchbenchmark/models/sam_fast/requirements.txt" + ], + "torchtune": [ + "docs/source/tutorials/qlora_finetune.rst", + "recipes/eleuther_eval.py", + "docs/source/tutorials/e2e_flow.rst", + "torchtune/generation/_generation.py", + "docs/source/tutorials/llama3.rst", + "README.md" + ], + "torchft": [ + "docs/source/protocol.rst", + "docs/source/assumptions_and_recommendations.rst", + "docs/source/conf.py", + "docs/source/index.rst", + "README.md" + ], + "torchchat": [ + "torchchat/usages/eval.py", + "README.md" + ], + "rl": [ + "examples/rlhf/requirements.txt" + ], + "builder": [ + "CUDA_UPGRADE_GUIDE.MD" + ], + "helion": [ + "benchmarks/run.py", + "benchmarks/README.md" + ], + "torchcodec": [ + 
"src/torchcodec/_core/SingleStreamDecoder.cpp" + ], + "test-infra": [ + "aws/lambda/README.md", + "torchci/clickhouse_queries/queued_jobs_aggregate/query.sql", + "tools/torchfix/README.md", + ".github/workflows/trigger_nightly.yml" + ], + "ci-infra": [ + "arc-backup-2024/scripts/deployment.py" + ], + "oss-docathons": [ + "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.md", + "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.csv", + ".github/scripts/pytorch-docathon-h1-2024.py" + ], + "serve": [ + "examples/large_models/segment_anything_fast/install_segment_anything_fast.sh", + "examples/large_models/gpt_fast/README.md", + "examples/large_models/gpt_fast_mixtral_moe/README.md", + "examples/large_models/diffusion_fast/README.md", + "examples/large_models/segment_anything_fast/README.md", + "kubernetes/kserve/examples/gpt_fast/README.md" + ], + "xla": [ + "torchax/test/llama/llama_model.py" + ], + "pytorch-canary": [ + "torch/testing/_internal/common_quantization.py" + ], + "pytorch-integration-testing": [ + ".github/scripts/generate_vllm_benchmark_matrix.py" + ], + "torcheval": [ + ".github/PULL_REQUEST_TEMPLATE.md", + ".github/ISSUE_TEMPLATE/bug-report.yml" + ] +} + + +def get_target_repos(org: str, filter_repos: Optional[List[str]] = None) -> List[str]: + """Get only the repositories that have files with 'pytorch-labs' mentions.""" + if org not in TARGET_FILES: + logging.info(f"[get_target_repos] No target files found for org: {org}") + return [] + + all_repos = list(TARGET_FILES.keys()) + + if filter_repos: + # Filter to only include repos that are in both the target files and the filter list + repos = [repo for repo in all_repos if repo in filter_repos] + logging.info(f"[get_target_repos] Filtered to {len(repos)} repositories from {len(all_repos)} available") + + # Log which repos were filtered out + filtered_out = [repo for repo in filter_repos if repo not in all_repos] + if filtered_out: + logging.warning(f"[get_target_repos] Repositories not found in target files: {filtered_out}") + else: + repos = all_repos + logging.info(f"[get_target_repos] Found {len(repos)} repositories with target files for org: {org}") + + return repos + + +def get_default_branch(org: str, repo: str) -> Optional[str]: + """Get the default branch for a repository.""" + url = f"{BASE_URL}/repos/{org}/{repo}" + data = make_cached_request(url, HEADERS) + if data: + return data.get("default_branch", "main") + return None + + +def get_target_files_for_repo(org: str, repo: str) -> List[str]: + """Get the list of target files for a specific repository.""" + if repo not in TARGET_FILES: + logging.info(f"[get_target_files_for_repo] No target files found for {org}/{repo}") + return [] + + files = TARGET_FILES[repo] + logging.info(f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}") + return files + + +def get_file_content(org: str, repo: str, file_path: str) -> Optional[str]: + """Get the content of a file from GitHub.""" + url = f"{BASE_URL}/repos/{org}/{repo}/contents/{file_path}" + data = make_cached_request(url, HEADERS) + if not data: + return None + + # Check file size + if data.get("size", 0) > MAX_FILE_SIZE: + logging.warning(f"[get_file_content] File {file_path} too large ({data['size']} bytes), skipping") + return None + + # Decode content + try: + content = base64.b64decode(data["content"]).decode("utf-8") + return content + except (UnicodeDecodeError, Exception) as e: + logging.warning(f"[get_file_content] Failed to decode {file_path}: {e}") + return None + + +def 
find_and_replace_in_file(org: str, repo: str, file_path: str) -> Optional[Tuple[str, str]]: + """Find and replace text in a file. Returns (old_content, new_content) if changes needed.""" + content = get_file_content(org, repo, file_path) + if content is None: + return None + + # Check if file contains the target text + if OLD_TEXT not in content: + return None + + # Replace all instances + new_content = content.replace(OLD_TEXT, NEW_TEXT) + + # Check if any changes were made + if new_content == content: + return None + + logging.info(f"[find_and_replace_in_file] Found {content.count(OLD_TEXT)} instances in {file_path}") + return content, new_content + + +def create_branch(org: str, repo: str, base_branch: str, new_branch: str) -> bool: + """Create a new branch from the base branch.""" + if DRY_RUN: + logging.info(f"[create_branch] DRY RUN: Would create branch {new_branch} in {org}/{repo}") + return True + + # Get the SHA of the base branch + url = f"{BASE_URL}/repos/{org}/{repo}/branches/{base_branch}" + branch_data = make_cached_request(url, HEADERS) + if not branch_data: + logging.error(f"[create_branch] Failed to get base branch {base_branch}") + return False + + base_sha = branch_data["commit"]["sha"] + + # Create the new branch + url = f"{BASE_URL}/repos/{org}/{repo}/git/refs" + data = { + "ref": f"refs/heads/{new_branch}", + "sha": base_sha + } + + response = requests.post(url, headers=HEADERS, json=data) + if response.status_code == 201: + logging.info(f"[create_branch] Created branch {new_branch} in {org}/{repo}") + return True + elif response.status_code == 422: # Branch already exists + logging.info(f"[create_branch] Branch {new_branch} already exists in {org}/{repo}") + return True + else: + logging.error(f"[create_branch] Failed to create branch {new_branch}: {response.status_code} - {response.text}") + return False + + +def create_file_commit(org: str, repo: str, file_path: str, content: str, branch: str, message: str) -> bool: + """Create a commit to update a file.""" + if DRY_RUN: + logging.info(f"[create_file_commit] DRY RUN: Would update {file_path} in {org}/{repo}") + return True + + # First get the current file to get its SHA + url = f"{BASE_URL}/repos/{org}/{repo}/contents/{file_path}" + current_file_data = make_cached_request(url, HEADERS) + if not current_file_data: + logging.error(f"[create_file_commit] Failed to get current file data for {file_path}") + return False + + current_sha = current_file_data.get("sha") + if not current_sha: + logging.error(f"[create_file_commit] No SHA found for {file_path}") + return False + + # Update the file with the SHA + data = { + "message": message, + "content": base64.b64encode(content.encode("utf-8")).decode("utf-8"), + "sha": current_sha, + "branch": branch + } + + response = requests.put(url, headers=HEADERS, json=data) + if response.status_code in [200, 201]: + logging.info(f"[create_file_commit] Updated {file_path} in {org}/{repo}") + return True + else: + logging.error(f"[create_file_commit] Failed to update {file_path}: {response.status_code} - {response.text}") + return False + + +def check_existing_pr(org: str, repo: str, title: str) -> Optional[str]: + """Check if there's already an open PR with the same title. 
Returns PR URL if found, None otherwise.""" + url = f"{BASE_URL}/repos/{org}/{repo}/pulls?state=open&per_page=100" + + # Don't use cache for PR checks since PR status can change quickly + logging.info(f"[check_existing_pr] Making direct request to check PRs for {org}/{repo}") + try: + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + data = response.json() + + for pr in data: + if pr.get("title") == title: + pr_url = pr['html_url'] + logging.info(f"[check_existing_pr] Found existing open PR with same title in {org}/{repo}: {pr_url}") + return pr_url + + logging.info(f"[check_existing_pr] No existing PR found for {org}/{repo}") + return None + + except requests.exceptions.RequestException as e: + logging.warning(f"[check_existing_pr] Failed to get PRs for {org}/{repo}: {e}") + return None + except json.JSONDecodeError as e: + logging.warning(f"[check_existing_pr] Failed to parse JSON response for {org}/{repo}: {e}") + return None + + +def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> Optional[str]: + """Create a pull request and return the PR URL.""" + if DRY_RUN: + logging.info(f"[create_pull_request] DRY RUN: Would create PR for {org}/{repo}") + return "DRY_RUN_PR_URL" + + url = f"{BASE_URL}/repos/{org}/{repo}/pulls" + data = { + "title": f"[EZ] Replace `pytorch-labs` with `meta-pytorch`", + "body": f"""This PR replaces all instances of `pytorch-labs` with `meta-pytorch` in this repository now that the `pytorch-labs` org has been renamed to `meta-pytorch` + +## Changes Made +- Replaced all occurrences of `pytorch-labs` with `meta-pytorch` +- Only modified files with extensions: .py, .md, .sh, .rst, .cpp, .h, .txt, .yml +- Skipped binary files and files larger than 1MB due to GitHub api payload limits in the script to cover all repos in this org. Will do a more manual second pass later to cover any larger files + +## Files Modified +This PR updates files that contained the target text. 
+ +Generated by automated script on {datetime.now(timezone.utc).isoformat()}Z""", + "head": branch, + "base": base_branch + } + + response = requests.post(url, headers=HEADERS, json=data) + if response.status_code == 201: + pr_data = response.json() + pr_url = pr_data["html_url"] + logging.info(f"[create_pull_request] Created PR: {pr_url}") + return pr_url + else: + logging.error(f"[create_pull_request] Failed to create PR: {response.status_code} - {response.text}") + return None + + +def process_repository(org: str, repo: str) -> Dict: + """Process a single repository for text replacement.""" + logging.info(f"[process_repository] Processing repository: {org}/{repo}") + + result = { + "repo": repo, + "status": "skipped", + "files_changed": 0, + "pr_url": None, + "error": None + } + + try: + # Check for existing PR first (before doing any work) + pr_title = f"[EZ] Replace `pytorch-labs` with `meta-pytorch`" + existing_pr_url = check_existing_pr(org, repo, pr_title) + if existing_pr_url: + result["status"] = "skipped_existing_pr" + result["pr_url"] = existing_pr_url + result["error"] = "Existing open PR with same title found" + logging.info(f"[process_repository] Skipping {org}/{repo} - existing open PR found: {existing_pr_url}") + return result + + # Get default branch + default_branch = get_default_branch(org, repo) + if not default_branch: + result["error"] = "Failed to get default branch" + return result + + # Get target files for this repository + target_files = get_target_files_for_repo(org, repo) + if not target_files: + logging.info(f"[process_repository] No target files found for {org}/{repo}") + return result + + # Check each target file for replacements + changes = [] + for file_path in target_files: + replacement = find_and_replace_in_file(org, repo, file_path) + if replacement: + old_content, new_content = replacement + changes.append({ + "path": file_path, + "old_content": old_content, + "new_content": new_content + }) + + if not changes: + logging.info(f"[process_repository] No changes needed in {org}/{repo}") + return result + + result["files_changed"] = len(changes) + logging.info(f"[process_repository] Found {len(changes)} files to update in {org}/{repo}") + + if DRY_RUN: + result["status"] = "dry_run" + logging.info(f"[process_repository] DRY RUN: Would update {len(changes)} files in {org}/{repo}") + return result + + # Create new branch + branch_name = f"replace-pytorch-labs-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}" + if not create_branch(org, repo, default_branch, branch_name): + result["error"] = "Failed to create branch" + return result + + # Commit changes + commit_message = f"Replace 'pytorch-labs' with 'meta-pytorch' in {len(changes)} files" + all_success = True + + for change in changes: + if not create_file_commit(org, repo, change["path"], change["new_content"], branch_name, commit_message): + all_success = False + break + + if not all_success: + result["error"] = "Failed to commit some files" + return result + + # Create pull request + pr_url = create_pull_request(org, repo, branch_name, default_branch) + if pr_url: + result["pr_url"] = pr_url + result["status"] = "success" + else: + result["error"] = "Failed to create pull request" + + except Exception as e: + logging.error(f"[process_repository] Error processing {org}/{repo}: {e}") + result["error"] = str(e) + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Replace 'pytorch-labs' with 'meta-pytorch' across GitHub organization repositories." 
+ ) + parser.add_argument( + "--org", + type=str, + default="pytorch", + help="GitHub organization to process (default: pytorch)", + ) + parser.add_argument( + "--repos", + type=str, + help="Comma-separated list of repositories to process (e.g., 'pytorch,vision,tutorials'). If not specified, processes all repositories with target files.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Preview changes without making them", + ) + args = parser.parse_args() + + global ORG_NAME, DRY_RUN + ORG_NAME = args.org + DRY_RUN = args.dry_run + + # Parse repos filter if provided + filter_repos = None + if args.repos: + filter_repos = [repo.strip() for repo in args.repos.split(",")] + logging.info(f"[main] Repository filter applied: {filter_repos}") + + if not GITHUB_TOKEN: + logging.error("[main] Missing GITHUB_TOKEN in environment variables.") + return + + logging.info(f"[main] Starting text replacement for org: {ORG_NAME}") + if DRY_RUN: + logging.info("[main] DRY RUN MODE - No changes will be made") + + # Show cache stats at start + cache_stats = get_cache_stats() + logging.info( + f"[main] Cache stats: {cache_stats['total_files']} files, {cache_stats['total_size_mb']} MB" + ) + + # Get target repositories (only those with files containing "pytorch-labs") + repos = get_target_repos(ORG_NAME, filter_repos) + logging.info(f"[main] Processing {len(repos)} repositories with target files") + + # Process each repository + results = [] + for i, repo in enumerate(repos, 1): + logging.info(f"[main] Processing repository {i}/{len(repos)}: {repo}") + result = process_repository(ORG_NAME, repo) + results.append(result) + + # Add a small delay to be respectful to the API + import time + time.sleep(1) + + # Generate summary + successful = [r for r in results if r["status"] == "success"] + dry_run = [r for r in results if r["status"] == "dry_run"] + skipped = [r for r in results if r["status"] == "skipped"] + skipped_existing_pr = [r for r in results if r["status"] == "skipped_existing_pr"] + errors = [r for r in results if r["error"] and r["status"] not in ["skipped_existing_pr"]] + + print(f"\n=== SUMMARY ===") + print(f"Organization: {ORG_NAME}") + print(f"Total repositories: {len(repos)}") + print(f"Successful PRs created: {len(successful)}") + print(f"Dry run (would create): {len(dry_run)}") + print(f"Skipped (no changes): {len(skipped)}") + print(f"Skipped (existing PR): {len(skipped_existing_pr)}") + print(f"Errors: {len(errors)}") + print("\n") + + if skipped_existing_pr: + print(f"=== SKIPPED (existing PRs) ===") + for result in skipped_existing_pr: + print(f"- {result['repo']}: {result['files_changed']} files would be updated, but existing PR found: {result['pr_url']}") + print("\n") + + if successful: + print(f"=== SUCCESSFUL PRs ===") + for result in successful: + print(f"- {result['repo']}: {result['pr_url']} ({result['files_changed']} files)") + print("\n") + + if dry_run: + print(f"=== DRY RUN (would create PRs) ===") + for result in dry_run: + print(f"- {result['repo']}: {result['files_changed']} files would be updated") + print("\n") + + if errors: + print(f"=== ERRORS ===") + for result in errors: + print(f"- {result['repo']}: {result['error']}") + print("\n") + + # Show final cache stats + final_cache_stats = get_cache_stats() + logging.info( + f"[main] Final cache stats: {final_cache_stats['total_files']} files, {final_cache_stats['total_size_mb']} MB" + ) + + logging.info("[main] Script completed successfully.") + + +if __name__ == "__main__": + main() \ No newline at 
end of file From 81c59ad94c6baa1f50d059bfef347e30ad4a4d44 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 12 Aug 2025 17:36:00 -0500 Subject: [PATCH 8/9] Use CodeSearch to find files that should be updated in org --- tools/analytics/org/remove_pytorch_labs.py | 198 ++++++++------------- 1 file changed, 77 insertions(+), 121 deletions(-) diff --git a/tools/analytics/org/remove_pytorch_labs.py b/tools/analytics/org/remove_pytorch_labs.py index 9e246daf07..d40b6a6395 100644 --- a/tools/analytics/org/remove_pytorch_labs.py +++ b/tools/analytics/org/remove_pytorch_labs.py @@ -17,7 +17,7 @@ Key Features: ------------- -- Uses pre-defined list of files known to contain "pytorch-labs" mentions (optimized for performance). This list was obtained by running codesea +- Dynamically discovers files containing "pytorch-labs" mentions using GitHub's Search API - Replaces all instances of "pytorch-labs" with "meta-pytorch" in target files. - Creates a new branch and commits changes for each repository. - Creates pull requests with descriptive titles and descriptions. @@ -46,10 +46,11 @@ Notes: ------ -- Only processes 72 pre-identified files that contain "pytorch-labs" mentions +- Dynamically discovers files containing "pytorch-labs" mentions using GitHub Search API - Skips binary files and files larger than 1MB - Creates one PR per repository with changes - Handles GitHub API rate limits automatically +- Caches search results to avoid repeated API calls - Significantly faster than scanning all files in all repositories """ @@ -65,6 +66,7 @@ import requests from cache_manager import get_cache_stats, make_cached_request from dotenv import load_dotenv +from github_code_search import search_github_code, GitHubSearchResults load_dotenv() @@ -93,130 +95,81 @@ # Maximum file size to process (1MB) MAX_FILE_SIZE = 1024 * 1024 -# Pre-defined list of files that contain "pytorch-labs" mentions -# This is based on search results and will significantly improve performance -TARGET_FILES = { - "pytorch": [ - "android/README.md", - "aten/src/ATen/native/cuda/int4mm.cu", - "torch/testing/_internal/common_quantization.py" - ], - "vision": [ - "torchvision/io/image.py" - ], - "tutorials": [ - "index.rst", - "docathon-leaderboard.md", - "intermediate_source/transformer_building_blocks.py", - "unstable_source/gpu_quantization_torchao_tutorial.py" - ], - "executorch": [ - "docs/source/index.md", - "docs/source/getting-started.md", - "backends/apple/mps/setup.md", - "docs/source/backends-mps.md", - "docs/source/llm/run-with-c-plus-plus.md", - "docs/source/using-executorch-android.md", - "docs/source/using-executorch-export.md", - "docs/source/using-executorch-building-from-source.md", - "docs/source/using-executorch-cpp.md", - "examples/models/llama/experimental/generate.py", - "scripts/test_ios.sh", - ".ci/scripts/test_ios_ci.sh", - "backends/test/facto/test_facto.py" - ], - "ao": [ - "scripts/download.py", - "torchao/_models/llama/tokenizer.py", - "scripts/convert_hf_checkpoint.py", - "examples/sam2_amg_server/annotate_with_rle.py", - "torchao/prototype/mx_formats/kernels.py", - "torchao/_models/sam/README.md", - "torchao/quantization/README.md", - "test/integration/test_integration.py", - ".github/workflows/dashboard_perf_test.yml" - ], - "benchmark": [ - "torchbenchmark/models/simple_gpt/origin", - "torchbenchmark/models/sam_fast/requirements.txt" - ], - "torchtune": [ - "docs/source/tutorials/qlora_finetune.rst", - "recipes/eleuther_eval.py", - "docs/source/tutorials/e2e_flow.rst", - 
"torchtune/generation/_generation.py", - "docs/source/tutorials/llama3.rst", - "README.md" - ], - "torchft": [ - "docs/source/protocol.rst", - "docs/source/assumptions_and_recommendations.rst", - "docs/source/conf.py", - "docs/source/index.rst", - "README.md" - ], - "torchchat": [ - "torchchat/usages/eval.py", - "README.md" - ], - "rl": [ - "examples/rlhf/requirements.txt" - ], - "builder": [ - "CUDA_UPGRADE_GUIDE.MD" - ], - "helion": [ - "benchmarks/run.py", - "benchmarks/README.md" - ], - "torchcodec": [ - "src/torchcodec/_core/SingleStreamDecoder.cpp" - ], - "test-infra": [ - "aws/lambda/README.md", - "torchci/clickhouse_queries/queued_jobs_aggregate/query.sql", - "tools/torchfix/README.md", - ".github/workflows/trigger_nightly.yml" - ], - "ci-infra": [ - "arc-backup-2024/scripts/deployment.py" - ], - "oss-docathons": [ - "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.md", - "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.csv", - ".github/scripts/pytorch-docathon-h1-2024.py" - ], - "serve": [ - "examples/large_models/segment_anything_fast/install_segment_anything_fast.sh", - "examples/large_models/gpt_fast/README.md", - "examples/large_models/gpt_fast_mixtral_moe/README.md", - "examples/large_models/diffusion_fast/README.md", - "examples/large_models/segment_anything_fast/README.md", - "kubernetes/kserve/examples/gpt_fast/README.md" - ], - "xla": [ - "torchax/test/llama/llama_model.py" - ], - "pytorch-canary": [ - "torch/testing/_internal/common_quantization.py" - ], - "pytorch-integration-testing": [ - ".github/scripts/generate_vllm_benchmark_matrix.py" - ], - "torcheval": [ - ".github/PULL_REQUEST_TEMPLATE.md", - ".github/ISSUE_TEMPLATE/bug-report.yml" - ] -} +# Cache for search results to avoid repeated API calls +_SEARCH_CACHE: Dict[str, Dict[str, List[str]]] = {} + + + + +def get_target_files_from_search(org: str) -> Dict[str, List[str]]: + """ + Get target files by searching GitHub for 'pytorch-labs' mentions in the organization. 
+ + Args: + org: GitHub organization name + + Returns: + Dictionary mapping repository names to lists of file paths + """ + # Check cache first + if org in _SEARCH_CACHE: + logging.info(f"[get_target_files_from_search] Using cached results for org: {org}") + return _SEARCH_CACHE[org] + + try: + logging.info(f"[get_target_files_from_search] Searching for 'pytorch-labs' mentions in org: {org}") + + # Search for files containing "pytorch-labs" in the organization + query = f"org:{org} pytorch-labs" + results: GitHubSearchResults = search_github_code( + query=query, + verbose=False # Reduce logging noise + ) + + if results['retrieved_count'] == 0: + logging.warning(f"[get_target_files_from_search] No files found containing 'pytorch-labs' in org: {org}") + _SEARCH_CACHE[org] = {} + return {} + + # Group files by repository + target_files: Dict[str, List[str]] = {} + for item in results['items']: + repo_name = item['repository']['name'] # Just the repo name, not full_name + file_path = item['path'] + + if repo_name not in target_files: + target_files[repo_name] = [] + + if file_path not in target_files[repo_name]: + target_files[repo_name].append(file_path) + + logging.info(f"[get_target_files_from_search] Found {len(target_files)} repositories with {sum(len(files) for files in target_files.values())} total files") + + # Log summary of repositories found + for repo_name, files in target_files.items(): + logging.info(f"[get_target_files_from_search] {repo_name}: {len(files)} files") + + # Cache the results + _SEARCH_CACHE[org] = target_files + + return target_files + + except Exception as e: + logging.error(f"[get_target_files_from_search] Error searching for files: {e}") + logging.warning(f"[get_target_files_from_search] No fallback available - search failed") + return {} def get_target_repos(org: str, filter_repos: Optional[List[str]] = None) -> List[str]: """Get only the repositories that have files with 'pytorch-labs' mentions.""" - if org not in TARGET_FILES: + # Get target files from search (with fallback to hardcoded list) + target_files = get_target_files_from_search(org) + + if not target_files: logging.info(f"[get_target_repos] No target files found for org: {org}") return [] - all_repos = list(TARGET_FILES.keys()) + all_repos = list(target_files.keys()) if filter_repos: # Filter to only include repos that are in both the target files and the filter list @@ -245,11 +198,14 @@ def get_default_branch(org: str, repo: str) -> Optional[str]: def get_target_files_for_repo(org: str, repo: str) -> List[str]: """Get the list of target files for a specific repository.""" - if repo not in TARGET_FILES: + # Get target files from search (with fallback to hardcoded list) + target_files = get_target_files_from_search(org) + + if repo not in target_files: logging.info(f"[get_target_files_for_repo] No target files found for {org}/{repo}") return [] - files = TARGET_FILES[repo] + files = target_files[repo] logging.info(f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}") return files From 344bbc47cfc29e12d108cc225bedf3f942439ed1 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 12 Aug 2025 18:01:43 -0500 Subject: [PATCH 9/9] lint fixes --- tools/analytics/org/analyze_contributors.py | 243 +++++++++------ tools/analytics/org/analyze_repo_info.py | 68 +++-- tools/analytics/org/github_code_search.py | 291 ++++++++++-------- tools/analytics/org/remove_pytorch_labs.py | 309 ++++++++++++-------- 4 files changed, 554 insertions(+), 357 deletions(-) diff --git 
a/tools/analytics/org/analyze_contributors.py b/tools/analytics/org/analyze_contributors.py index 5b9439eeb5..1a8158d8a9 100644 --- a/tools/analytics/org/analyze_contributors.py +++ b/tools/analytics/org/analyze_contributors.py @@ -61,7 +61,7 @@ import re from collections import defaultdict from datetime import datetime, timedelta -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional, Set, Tuple import requests import yaml @@ -136,7 +136,9 @@ } BASE_URL = "https://api.github.com" -COMMIT_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" # 6 months +COMMIT_LOOKBACK = ( + datetime.utcnow() - timedelta(days=180) +).isoformat() + "Z" # 6 months def get_repos(org: str) -> List[str]: @@ -187,9 +189,13 @@ def get_commits(org: str, repo: str) -> List[Dict]: logging.error(f"[get_commits] Failed to fetch page {page} for repo: {repo}") break if not data: - logging.info(f"[get_commits] No more commits found for repo: {repo} on page {page}") + logging.info( + f"[get_commits] No more commits found for repo: {repo} on page {page}" + ) break - logging.info(f"[get_commits] Page {page}: Found {len(data)} commits for repo: {repo}") + logging.info( + f"[get_commits] Page {page}: Found {len(data)} commits for repo: {repo}" + ) all_commits.extend(data) page += 1 @@ -198,7 +204,9 @@ def get_commits(org: str, repo: str) -> List[Dict]: logging.info(f"[get_commits] Limiting to 1000 commits for repo: {repo}") break - logging.info(f"[get_commits] Finished fetching commits for repo: {repo}. Total: {len(all_commits)}") + logging.info( + f"[get_commits] Finished fetching commits for repo: {repo}. Total: {len(all_commits)}" + ) return all_commits @@ -230,9 +238,16 @@ def extract_company_from_email(email: str) -> Optional[str]: # Skip generic email providers generic_providers = { - "gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "icloud.com", - "protonmail.com", "tutanota.com", "hey.com", "fastmail.com", - "users.noreply.github.com" # GitHub's privacy-preserving email addresses + "gmail.com", + "yahoo.com", + "hotmail.com", + "outlook.com", + "icloud.com", + "protonmail.com", + "tutanota.com", + "hey.com", + "fastmail.com", + "users.noreply.github.com", # GitHub's privacy-preserving email addresses } if domain in generic_providers: @@ -240,7 +255,9 @@ def extract_company_from_email(email: str) -> Optional[str]: # For other domains, try to extract company name # Remove common TLDs and subdomains - domain_parts = domain.replace(".com", "").replace(".org", "").replace(".net", "").split(".") + domain_parts = ( + domain.replace(".com", "").replace(".org", "").replace(".net", "").split(".") + ) if domain_parts and len(domain_parts[-1]) > 2: return domain_parts[-1].title() @@ -258,7 +275,7 @@ def extract_company_from_profile(profile: Dict) -> Optional[str]: return None # Clean up company name - company = re.sub(r'^@', '', company) # Remove @ prefix + company = re.sub(r"^@", "", company) # Remove @ prefix company = company.strip() if not company: @@ -350,12 +367,14 @@ def wrapper(*args, **kwargs): arg_representation = { "date": today, "args": hashable_args, - "kwargs": sorted(hashable_kwargs.items()) + "kwargs": sorted(hashable_kwargs.items()), } serialized_args = json.dumps(arg_representation, sort_keys=True) except (TypeError, ValueError): # If serialization fails, use string representation as fallback - serialized_args = today + str(hashable_args) + str(sorted(hashable_kwargs.items())) + serialized_args = ( + today + str(hashable_args) + 
str(sorted(hashable_kwargs.items())) + ) arg_hash = hashlib.sha256(serialized_args.encode()).hexdigest() key = f"{func_name}_{today}_{arg_hash}" @@ -373,7 +392,9 @@ def wrapper(*args, **kwargs): # Cache the result with open(filepath, "w") as f: json.dump(result, f) - logging.debug(f"Cached result for function: {func_name}, saved to: {filepath} (date: {today})") + logging.debug( + f"Cached result for function: {func_name}, saved to: {filepath} (date: {today})" + ) return result @@ -381,31 +402,35 @@ def wrapper(*args, **kwargs): @cache_to_disk -def analyze_contributors(org: str, repos: List[str]) -> Dict: +def analyze_contributors( + org: str, repos: List[str] +) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, List[Dict[str, Any]]]]: """Analyze contributors across all repositories.""" - logging.info(f"[analyze_contributors] Start analyzing contributors for {len(repos)} repositories in org: {org}") + logging.info( + f"[analyze_contributors] Start analyzing contributors for {len(repos)} repositories in org: {org}" + ) # Track contributors across all repos - global_contributors = defaultdict(lambda: { - "total_commits": 0, - "repos": set(), - "emails": set(), - "username": None, - "company": None, - "profile": None - }) + global_contributors: Dict[str, Dict[str, Any]] = defaultdict( + lambda: { + "total_commits": 0, + "repos": set(), + "emails": set(), + "username": None, + "company": None, + "profile": None, + } + ) # Track contributors by repo - repo_contributors = {} + repo_contributors: Dict[str, List[Dict[str, Any]]] = {} for repo in repos: logging.info(f"[analyze_contributors] Processing repo: {repo}") commits = get_commits(org, repo) - repo_contributor_stats = defaultdict(lambda: { - "commits": 0, - "emails": set(), - "username": None - }) + repo_contributor_stats: Dict[str, Dict[str, Any]] = defaultdict( + lambda: {"commits": 0, "emails": set(), "username": None} + ) for commit in commits: author = commit.get("commit", {}).get("author", {}) @@ -418,64 +443,90 @@ def analyze_contributors(org: str, repos: List[str]) -> Dict: # Since we can assume GitHub username info is always there, use it as the primary key contributor_key = username if not contributor_key: - raise ValueError(f"Commit {commit['sha']} in repo {repo} has no identifiable contributor information.") + raise ValueError( + f"Commit {commit['sha']} in repo {repo} has no identifiable contributor information." 
+ ) # Update repo-specific stats repo_contributor_stats[contributor_key]["commits"] += 1 if author_email: - repo_contributor_stats[contributor_key]["emails"].add(author_email) + emails_set = repo_contributor_stats[contributor_key]["emails"] + if isinstance(emails_set, set): + emails_set.add(author_email) if username: repo_contributor_stats[contributor_key]["username"] = username # Update global stats global_contributors[contributor_key]["total_commits"] += 1 - global_contributors[contributor_key]["repos"].add(repo) + repos_set = global_contributors[contributor_key]["repos"] + if isinstance(repos_set, set): + repos_set.add(repo) if author_email: - global_contributors[contributor_key]["emails"].add(author_email) + emails_set = global_contributors[contributor_key]["emails"] + if isinstance(emails_set, set): + emails_set.add(author_email) if username: global_contributors[contributor_key]["username"] = username # Convert sets to lists for YAML serialization repo_contributors[repo] = [] for contributor_key, stats in repo_contributor_stats.items(): - repo_contributors[repo].append({ - "contributor": contributor_key, - "commits": stats["commits"], - "emails": list(stats["emails"]), - "username": stats["username"] - }) + emails_list = ( + list(stats["emails"]) if isinstance(stats["emails"], set) else [] + ) + repo_contributors[repo].append( + { + "contributor": contributor_key, + "commits": stats["commits"], + "emails": emails_list, + "username": stats["username"], + } + ) # Sort by commit count repo_contributors[repo].sort(key=lambda x: x["commits"], reverse=True) - logging.info(f"[analyze_contributors] Found {len(repo_contributors[repo])} contributors for repo: {repo}") + logging.info( + f"[analyze_contributors] Found {len(repo_contributors[repo])} contributors for repo: {repo}" + ) # Enhance global contributors with profile and company information - logging.info(f"[analyze_contributors] Enhancing contributor information with profiles and companies") + logging.info( + f"[analyze_contributors] Enhancing contributor information with profiles and companies" + ) for contributor_key, stats in global_contributors.items(): # First, try to extract company from email addresses (prioritize this) - if stats["emails"]: - for email in stats["emails"]: + emails_set = stats["emails"] + if isinstance(emails_set, set) and emails_set: + for email in emails_set: company_from_email = extract_company_from_email(email) if company_from_email: stats["company"] = company_from_email break # Only if email didn't provide a clear company mapping, try GitHub profile - if not stats["company"] and stats["username"]: - profile = get_user_profile(stats["username"]) + username = stats["username"] + if not stats["company"] and username: + profile = get_user_profile(username) stats["profile"] = profile # Try to extract company from profile - company_from_profile = extract_company_from_profile(profile) - if company_from_profile: - stats["company"] = company_from_profile + if profile: + company_from_profile = extract_company_from_profile(profile) + if company_from_profile: + stats["company"] = company_from_profile # Convert sets to lists for YAML serialization - stats["repos"] = list(stats["repos"]) - stats["emails"] = list(stats["emails"]) + repos_set = stats["repos"] + if isinstance(repos_set, set): + stats["repos"] = list(repos_set) + emails_set = stats["emails"] + if isinstance(emails_set, set): + stats["emails"] = list(emails_set) - logging.info(f"[analyze_contributors] Finished analyzing contributors for org: {org}") + 
logging.info( + f"[analyze_contributors] Finished analyzing contributors for org: {org}" + ) return global_contributors, repo_contributors @@ -532,23 +583,29 @@ def main(): repo for repo in repos if f"{ORG_NAME}/{repo}" not in EXCLUDED_REPOS ] - logging.info(f"[main] Analyzing {len(filtered_repos)} repositories (excluded {len(repos) - len(filtered_repos)})") + logging.info( + f"[main] Analyzing {len(filtered_repos)} repositories (excluded {len(repos) - len(filtered_repos)})" + ) # Analyze contributors - global_contributors, repo_contributors = analyze_contributors(ORG_NAME, filtered_repos) + global_contributors, repo_contributors = analyze_contributors( + ORG_NAME, filtered_repos + ) # Sort contributors by frequency contributors_by_frequency = [] for contributor_key, stats in global_contributors.items(): - contributors_by_frequency.append({ - "contributor": contributor_key, - "total_commits": stats["total_commits"], - "repos_count": len(stats["repos"]), - "repos": stats["repos"], - "emails": stats["emails"], - "username": stats["username"], - "company": stats["company"] - }) + contributors_by_frequency.append( + { + "contributor": contributor_key, + "total_commits": stats["total_commits"], + "repos_count": len(stats["repos"]), + "repos": stats["repos"], + "emails": stats["emails"], + "username": stats["username"], + "company": stats["company"], + } + ) contributors_by_frequency.sort(key=lambda x: x["total_commits"], reverse=True) @@ -558,20 +615,24 @@ def main(): for contributor in contributors_by_frequency: if contributor["company"]: - company_analysis[contributor["company"]].append({ - "contributor": contributor["contributor"], - "total_commits": contributor["total_commits"], - "repos_count": contributor["repos_count"], - "username": contributor["username"] - }) + company_analysis[contributor["company"]].append( + { + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"], + } + ) else: - unidentified_contributors.append({ - "contributor": contributor["contributor"], - "total_commits": contributor["total_commits"], - "repos_count": contributor["repos_count"], - "username": contributor["username"], - "emails": contributor["emails"] - }) + unidentified_contributors.append( + { + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"], + "emails": contributor["emails"], + } + ) # Sort company contributors by commit count for company in company_analysis: @@ -585,13 +646,18 @@ def main(): "lookback_period_days": 180, "repositories_analyzed": len(filtered_repos), "total_contributors": len(contributors_by_frequency), - "contributors_with_company": len(contributors_by_frequency) - len(unidentified_contributors), - "contributors_without_company": len(unidentified_contributors) + "contributors_with_company": len(contributors_by_frequency) + - len(unidentified_contributors), + "contributors_without_company": len(unidentified_contributors), }, - "contributors_by_frequency": contributors_by_frequency[:50], # Top 50 contributors + "contributors_by_frequency": contributors_by_frequency[ + :50 + ], # Top 50 contributors "company_analysis": dict(company_analysis), - "unidentified_contributors": unidentified_contributors[:20], # Top 20 unidentified - "contributors_by_repo": repo_contributors + "unidentified_contributors": unidentified_contributors[ + :20 + ], # Top 20 
unidentified + "contributors_by_repo": repo_contributors, } # Sort output for consistency @@ -622,11 +688,16 @@ def deep_sort(obj, sort_keys=True): print(f"- Organization: {ORG_NAME}") print(f"- Repositories analyzed: {len(filtered_repos)}") print(f"- Total contributors: {len(contributors_by_frequency)}") - print(f"- Contributors with identified companies: {len(contributors_by_frequency) - len(unidentified_contributors)}") + print( + f"- Contributors with identified companies: {len(contributors_by_frequency) - len(unidentified_contributors)}" + ) print(f"- Top companies by contributor count:") # Show top companies - company_contributor_count = [(company, len(contributors)) for company, contributors in company_analysis.items()] + company_contributor_count = [ + (company, len(contributors)) + for company, contributors in company_analysis.items() + ] company_contributor_count.sort(key=lambda x: x[1], reverse=True) for company, count in company_contributor_count[:20]: @@ -650,10 +721,14 @@ def deep_sort(obj, sort_keys=True): break # Sort by commit count (descending) - repo_commits.sort(key=lambda x: int(x.split('(')[1].split(')')[0]), reverse=True) + repo_commits.sort( + key=lambda x: int(x.split("(")[1].split(")")[0]), reverse=True + ) # Format the contributor name (use username if available, otherwise email/name) - display_name = contributor["username"] if contributor["username"] else contributor_key + display_name = ( + contributor["username"] if contributor["username"] else contributor_key + ) print(f"- {display_name}, {', '.join(repo_commits)}") diff --git a/tools/analytics/org/analyze_repo_info.py b/tools/analytics/org/analyze_repo_info.py index da2263f3c1..95203d3dab 100644 --- a/tools/analytics/org/analyze_repo_info.py +++ b/tools/analytics/org/analyze_repo_info.py @@ -82,10 +82,10 @@ def get_repos_with_info(org: str) -> List[Dict]: """ Fetch all repositories for an organization with their metadata. - + Args: org: The GitHub organization name - + Returns: List of repository dictionaries with metadata """ @@ -97,7 +97,9 @@ def get_repos_with_info(org: str) -> List[Dict]: logging.debug(f"[get_repos_with_info] Requesting URL: {url}") data = make_cached_request(url, HEADERS) if data is None: - logging.error(f"[get_repos_with_info] Failed to fetch page {page} for org: {org}") + logging.error( + f"[get_repos_with_info] Failed to fetch page {page} for org: {org}" + ) break if not data: logging.info( @@ -118,11 +120,11 @@ def get_repos_with_info(org: str) -> List[Dict]: def get_last_commit_date(org: str, repo: str) -> Optional[str]: """ Get the date of the last commit for a repository. 
- + Args: org: The GitHub organization name repo: The repository name - + Returns: Date string in YYYY-MM-DD format of the last commit, or None if no commits found """ @@ -133,61 +135,71 @@ def get_last_commit_date(org: str, repo: str) -> Optional[str]: if data is None or not data: logging.warning(f"[get_last_commit_date] No commits found for repo: {repo}") return None - + if len(data) > 0: commit_date = data[0]["commit"]["author"]["date"] # Convert ISO format to YYYY-MM-DD format try: from datetime import datetime - dt = datetime.fromisoformat(commit_date.replace('Z', '+00:00')) - formatted_date = dt.strftime('%Y-%m-%d') - logging.info(f"[get_last_commit_date] Last commit date for {repo}: {formatted_date}") + + dt = datetime.fromisoformat(commit_date.replace("Z", "+00:00")) + formatted_date = dt.strftime("%Y-%m-%d") + logging.info( + f"[get_last_commit_date] Last commit date for {repo}: {formatted_date}" + ) return formatted_date except (ValueError, AttributeError) as e: - logging.warning(f"[get_last_commit_date] Failed to parse date for {repo}: {e}") + logging.warning( + f"[get_last_commit_date] Failed to parse date for {repo}: {e}" + ) return None - + return None def process_repo_data(org: str, repos: List[Dict]) -> List[Dict]: """ Process repository data and add last commit date information. - + Args: org: The GitHub organization name repos: List of repository dictionaries from GitHub API - + Returns: List of processed repository data with all required fields """ logging.info(f"[process_repo_data] Processing {len(repos)} repositories") processed_repos = [] - + for i, repo in enumerate(repos, 1): repo_name = repo["name"] - logging.info(f"[process_repo_data] Processing repo {i}/{len(repos)}: {repo_name}") - + logging.info( + f"[process_repo_data] Processing repo {i}/{len(repos)}: {repo_name}" + ) + # Get last commit date last_commit_date = get_last_commit_date(org, repo_name) - + processed_repo = { "repo_name": f"{org}/{repo_name}", - "public": repo.get("private", True) == False, # True if public, False if private + "public": repo.get("private", True) + == False, # True if public, False if private "archived": repo.get("archived", False), - "last_commit_date": last_commit_date + "last_commit_date": last_commit_date, } - + processed_repos.append(processed_repo) - - logging.info(f"[process_repo_data] Finished processing {len(processed_repos)} repositories") + + logging.info( + f"[process_repo_data] Finished processing {len(processed_repos)} repositories" + ) return processed_repos def save_to_csv(data: List[Dict], filename: str = "repo_info_summary.csv"): """ Save repository data to a CSV file. 
- + Args: data: List of repository dictionaries filename: Name of the CSV file to create @@ -202,12 +214,12 @@ def save_to_csv(data: List[Dict], filename: str = "repo_info_summary.csv"): # Define CSV headers fieldnames = ["repo_name", "public", "archived", "last_commit_date"] - + with open(filepath, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(data) - + logging.info(f"[save_to_csv] Data successfully saved to {filepath}") @@ -240,10 +252,10 @@ def main(): # Step 1: Get all repositories with their metadata repos = get_repos_with_info(ORG_NAME) - + # Step 2: Process repository data and add last commit dates processed_repos = process_repo_data(ORG_NAME, repos) - + # Step 3: Save to CSV save_to_csv(processed_repos) @@ -256,4 +268,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tools/analytics/org/github_code_search.py b/tools/analytics/org/github_code_search.py index 56788873ff..0907f1a502 100644 --- a/tools/analytics/org/github_code_search.py +++ b/tools/analytics/org/github_code_search.py @@ -63,14 +63,15 @@ import logging import os import time +from dataclasses import dataclass from datetime import datetime, timezone -from typing import Dict, List, Optional, Any, TypedDict, Union +from typing import Any, Dict, List, Optional, TypedDict, Union from urllib.parse import quote_plus -from dataclasses import dataclass import requests from dotenv import load_dotenv + load_dotenv() logging.basicConfig( @@ -94,6 +95,7 @@ # Type definitions for well-defined schema class RepositoryInfo(TypedDict): """Repository information from GitHub search results.""" + id: int node_id: str name: str @@ -178,6 +180,7 @@ class RepositoryInfo(TypedDict): class SearchResultItem(TypedDict): """Individual search result item from GitHub code search.""" + name: str path: str sha: str @@ -195,6 +198,7 @@ class SearchResultItem(TypedDict): class GitHubSearchResults(TypedDict): """Complete search results from GitHub Search API.""" + query: str total_count: int retrieved_count: int @@ -207,41 +211,49 @@ class GitHubSearchResults(TypedDict): @dataclass class SearchOptions: """Options for GitHub code search.""" + per_page: int = 100 max_results: Optional[int] = None verbose: bool = True class GitHubCodeSearch: - def __init__(self, token: str = None): + def __init__(self, token: Optional[str] = None): """ Initialize GitHub Code Search client. - + Args: token: GitHub personal access token. If None, will try to get from GITHUB_TOKEN env var. """ self.token = token or GITHUB_TOKEN if not self.token: - raise ValueError("GitHub token is required. Set GITHUB_TOKEN environment variable or pass token parameter.") - + raise ValueError( + "GitHub token is required. Set GITHUB_TOKEN environment variable or pass token parameter." + ) + self.headers = { "Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json", } self.session = requests.Session() self.session.headers.update(self.headers) - - def search_code(self, query: str, per_page: int = 100, max_results: Optional[int] = None, - verbose: bool = True) -> GitHubSearchResults: + + def search_code( + self, + query: str, + per_page: int = 100, + max_results: Optional[int] = None, + verbose: bool = True, + ) -> GitHubSearchResults: """ Search for code using GitHub's Search API. 
- + Args: query: Search query string per_page: Number of results per page (max 100) max_results: Maximum number of results to return (None for all) verbose: Whether to log progress messages - + Returns: GitHubSearchResults: Well-defined structure containing: - query: The search query used @@ -255,112 +267,122 @@ def search_code(self, query: str, per_page: int = 100, max_results: Optional[int all_items = [] page = 1 total_count = 0 - + if verbose: logging.info(f"Starting code search with query: {query}") - + while True: # Check rate limits rate_limit_info = self._check_rate_limit() - if rate_limit_info['remaining'] == 0: - reset_time = rate_limit_info['reset_time'] + if rate_limit_info["remaining"] == 0: + reset_time = rate_limit_info["reset_time"] wait_time = max(0, reset_time - time.time()) if verbose: - logging.warning(f"Rate limit exceeded. Waiting {wait_time:.0f} seconds...") + logging.warning( + f"Rate limit exceeded. Waiting {wait_time:.0f} seconds..." + ) time.sleep(wait_time + 1) - + # Prepare request parameters - params = { - 'q': query, - 'per_page': min(per_page, 100), - 'page': page + params: Dict[str, Union[str, int]] = { + "q": query, + "per_page": min(per_page, 100), + "page": page, } - + try: if verbose: logging.info(f"Fetching page {page}...") response = self.session.get(SEARCH_URL, params=params) response.raise_for_status() - + data = response.json() - + # Update total count on first page if page == 1: - total_count = data.get('total_count', 0) + total_count = data.get("total_count", 0) if verbose: logging.info(f"Total results found: {total_count}") - - items = data.get('items', []) + + items = data.get("items", []) if not items: break - + all_items.extend(items) if verbose: - logging.info(f"Retrieved {len(items)} items from page {page} (total: {len(all_items)})") - + logging.info( + f"Retrieved {len(items)} items from page {page} (total: {len(all_items)})" + ) + # Check if we've reached the maximum results if max_results and len(all_items) >= max_results: all_items = all_items[:max_results] if verbose: logging.info(f"Reached maximum results limit: {max_results}") break - + # Check if there are more pages if len(items) < per_page: break - + page += 1 - + # Be respectful to the API time.sleep(1) - + except requests.exceptions.RequestException as e: logging.error(f"Error fetching page {page}: {e}") break except json.JSONDecodeError as e: logging.error(f"Error parsing JSON response from page {page}: {e}") break - + # Get rate limit info for the response rate_limit_info = self._check_rate_limit() - + return GitHubSearchResults( query=query, total_count=total_count, retrieved_count=len(all_items), items=all_items, search_time=datetime.now(timezone.utc).isoformat(), - rate_limit_remaining=rate_limit_info.get('remaining'), - rate_limit_reset=datetime.fromtimestamp(rate_limit_info.get('reset_time', 0)).isoformat() if rate_limit_info.get('reset_time') else None + rate_limit_remaining=rate_limit_info.get("remaining"), + rate_limit_reset=datetime.fromtimestamp( + rate_limit_info.get("reset_time", 0) + ).isoformat() + if rate_limit_info.get("reset_time") + else None, ) - + def get_rate_limit(self) -> Dict[str, Any]: """Get GitHub API rate limit status.""" return self._check_rate_limit() - + def _check_rate_limit(self) -> Dict[str, Any]: """Check GitHub API rate limit status.""" try: response = self.session.get(f"{BASE_URL}/rate_limit") response.raise_for_status() data = response.json() - - search_limit = data.get('resources', {}).get('search', {}) + + search_limit = 
data.get("resources", {}).get("search", {}) return { - 'limit': search_limit.get('limit', 0), - 'remaining': search_limit.get('remaining', 0), - 'reset_time': search_limit.get('reset', 0) + "limit": search_limit.get("limit", 0), + "remaining": search_limit.get("remaining", 0), + "reset_time": search_limit.get("reset", 0), } except Exception as e: logging.warning(f"Could not check rate limit: {e}") - return {'limit': 0, 'remaining': 0, 'reset_time': 0} - - def format_results(self, results: GitHubSearchResults, format_type: str = 'console') -> str: + return {"limit": 0, "remaining": 0, "reset_time": 0} + + def format_results( + self, results: GitHubSearchResults, format_type: str = "console" + ) -> str: """Format search results for different output types.""" - if format_type == 'json': + if format_type == "json": return json.dumps(results, indent=2) - - elif format_type == 'console': + + elif format_type == "console": output = [] output.append(f"=== GitHub Code Search Results ===") output.append(f"Query: {results['query']}") @@ -368,107 +390,124 @@ def format_results(self, results: GitHubSearchResults, format_type: str = 'conso output.append(f"Retrieved: {results['retrieved_count']}") output.append(f"Search time: {results['search_time']}") output.append("") - - for i, item in enumerate(results['items'], 1): - repo_name = item.get('repository', {}).get('full_name', 'Unknown') - file_path = item.get('path', 'Unknown') - file_url = item.get('html_url', '') - score = item.get('score', 0) - + + for i, item in enumerate(results["items"], 1): + repo_name = item.get("repository", {}).get("full_name", "Unknown") + file_path = item.get("path", "Unknown") + file_url = item.get("html_url", "") + score = item.get("score", 0) + output.append(f"{i}. {repo_name}/{file_path}") output.append(f" Score: {score}") output.append(f" URL: {file_url}") output.append("") - + return "\n".join(output) - - elif format_type == 'csv': + + elif format_type == "csv": import csv import io - - output = io.StringIO() - writer = csv.writer(output) - + + output_buffer = io.StringIO() + writer = csv.writer(output_buffer) + # Write header - writer.writerow(['Repository', 'File Path', 'Score', 'URL', 'Search Time']) - + writer.writerow(["Repository", "File Path", "Score", "URL", "Search Time"]) + # Write data - for item in results['items']: - repo_name = item.get('repository', {}).get('full_name', 'Unknown') - file_path = item.get('path', 'Unknown') - file_url = item.get('html_url', '') - score = item.get('score', 0) - - writer.writerow([repo_name, file_path, score, file_url, results['search_time']]) - - return output.getvalue() - + for item in results["items"]: + repo_name = item.get("repository", {}).get("full_name", "Unknown") + file_path = item.get("path", "Unknown") + file_url = item.get("html_url", "") + score = item.get("score", 0) + + writer.writerow( + [repo_name, file_path, score, file_url, results["search_time"]] + ) + + return output_buffer.getvalue() + else: raise ValueError(f"Unsupported format type: {format_type}") - + def get_file_paths(self, results: GitHubSearchResults) -> List[str]: """Extract just the file paths from search results.""" - return [item.get('path', '') for item in results.get('items', [])] - + return [item.get("path", "") for item in results.get("items", [])] + def get_repositories(self, results: GitHubSearchResults) -> List[str]: """Extract just the repository names from search results.""" - return [item.get('repository', {}).get('full_name', '') for item in results.get('items', [])] - + return [ 
+ item.get("repository", {}).get("full_name", "") + for item in results.get("items", []) + ] + def get_unique_repositories(self, results: GitHubSearchResults) -> List[str]: """Extract unique repository names from search results.""" repos = self.get_repositories(results) return list(set(repos)) - - def filter_by_score(self, results: GitHubSearchResults, min_score: float = 0.0) -> GitHubSearchResults: + + def filter_by_score( + self, results: GitHubSearchResults, min_score: float = 0.0 + ) -> GitHubSearchResults: """Filter results by minimum score.""" filtered_items = [ - item for item in results.get('items', []) - if item.get('score', 0) >= min_score + item + for item in results.get("items", []) + if item.get("score", 0) >= min_score ] - + return GitHubSearchResults( - query=results['query'], - total_count=results['total_count'], + query=results["query"], + total_count=results["total_count"], retrieved_count=len(filtered_items), items=filtered_items, - search_time=results['search_time'], - rate_limit_remaining=results.get('rate_limit_remaining'), - rate_limit_reset=results.get('rate_limit_reset') + search_time=results["search_time"], + rate_limit_remaining=results.get("rate_limit_remaining"), + rate_limit_reset=results.get("rate_limit_reset"), ) - - def filter_by_repository(self, results: GitHubSearchResults, repo_pattern: str) -> GitHubSearchResults: + + def filter_by_repository( + self, results: GitHubSearchResults, repo_pattern: str + ) -> GitHubSearchResults: """Filter results by repository name pattern.""" import re + pattern = re.compile(repo_pattern) - + filtered_items = [ - item for item in results.get('items', []) - if pattern.search(item.get('repository', {}).get('full_name', '')) + item + for item in results.get("items", []) + if pattern.search(item.get("repository", {}).get("full_name", "")) ] - + return GitHubSearchResults( - query=results['query'], - total_count=results['total_count'], + query=results["query"], + total_count=results["total_count"], retrieved_count=len(filtered_items), items=filtered_items, - search_time=results['search_time'], - rate_limit_remaining=results.get('rate_limit_remaining'), - rate_limit_reset=results.get('rate_limit_reset') + search_time=results["search_time"], + rate_limit_remaining=results.get("rate_limit_remaining"), + rate_limit_reset=results.get("rate_limit_reset"), ) -def search_github_code(query: str, token: str = None, per_page: int = 100, - max_results: Optional[int] = None, verbose: bool = True) -> GitHubSearchResults: +def search_github_code( + query: str, + token: Optional[str] = None, + per_page: int = 100, + max_results: Optional[int] = None, + verbose: bool = True, +) -> GitHubSearchResults: """ Convenience function to search GitHub code. 
- + Args: query: Search query string token: GitHub personal access token (optional, will use GITHUB_TOKEN env var if not provided) per_page: Number of results per page (max 100) max_results: Maximum number of results to return (None for all) verbose: Whether to log progress messages - + Returns: GitHubSearchResults: Well-defined structure containing search results with the following fields: - query: The search query used @@ -524,8 +563,8 @@ def main(): parser.add_argument( "--format", type=str, - choices=['console', 'json', 'csv'], - default='console', + choices=["console", "json", "csv"], + default="console", help="Output format (default: console)", ) parser.add_argument( @@ -533,7 +572,7 @@ def main(): action="store_true", help="Show rate limit information before searching", ) - + args = parser.parse_args() if not GITHUB_TOKEN: @@ -542,42 +581,40 @@ def main(): # Create search instance searcher = GitHubCodeSearch() - + # Show rate limit if requested if args.show_rate_limit: rate_limit = searcher.get_rate_limit() print(f"Rate limit: {rate_limit['remaining']}/{rate_limit['limit']} remaining") - if rate_limit['remaining'] == 0: - reset_time = datetime.fromtimestamp(rate_limit['reset_time']) + if rate_limit["remaining"] == 0: + reset_time = datetime.fromtimestamp(rate_limit["reset_time"]) print(f"Rate limit resets at: {reset_time}") print() # Perform search results = searcher.search_code( - query=args.query, - per_page=args.per_page, - max_results=args.max_results + query=args.query, per_page=args.per_page, max_results=args.max_results ) # Format and output results if args.output: # Determine format from file extension - if args.output.endswith('.json'): - output_format = 'json' - elif args.output.endswith('.csv'): - output_format = 'csv' + if args.output.endswith(".json"): + output_format = "json" + elif args.output.endswith(".csv"): + output_format = "csv" else: output_format = args.format - + formatted_output = searcher.format_results(results, output_format) - - with open(args.output, 'w', encoding='utf-8') as f: + + with open(args.output, "w", encoding="utf-8") as f: f.write(formatted_output) - + print(f"Results saved to: {args.output}") - + # Also show console summary - console_output = searcher.format_results(results, 'console') + console_output = searcher.format_results(results, "console") print(console_output) else: # Just show console output @@ -586,4 +623,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tools/analytics/org/remove_pytorch_labs.py b/tools/analytics/org/remove_pytorch_labs.py index d40b6a6395..28fb6bb034 100644 --- a/tools/analytics/org/remove_pytorch_labs.py +++ b/tools/analytics/org/remove_pytorch_labs.py @@ -66,7 +66,8 @@ import requests from cache_manager import get_cache_stats, make_cached_request from dotenv import load_dotenv -from github_code_search import search_github_code, GitHubSearchResults +from github_code_search import GitHubSearchResults, search_github_code + load_dotenv() @@ -99,64 +100,74 @@ _SEARCH_CACHE: Dict[str, Dict[str, List[str]]] = {} - - def get_target_files_from_search(org: str) -> Dict[str, List[str]]: """ Get target files by searching GitHub for 'pytorch-labs' mentions in the organization. 
- + Args: org: GitHub organization name - + Returns: Dictionary mapping repository names to lists of file paths """ # Check cache first if org in _SEARCH_CACHE: - logging.info(f"[get_target_files_from_search] Using cached results for org: {org}") + logging.info( + f"[get_target_files_from_search] Using cached results for org: {org}" + ) return _SEARCH_CACHE[org] - + try: - logging.info(f"[get_target_files_from_search] Searching for 'pytorch-labs' mentions in org: {org}") - + logging.info( + f"[get_target_files_from_search] Searching for 'pytorch-labs' mentions in org: {org}" + ) + # Search for files containing "pytorch-labs" in the organization query = f"org:{org} pytorch-labs" results: GitHubSearchResults = search_github_code( query=query, - verbose=False # Reduce logging noise + verbose=False, # Reduce logging noise ) - - if results['retrieved_count'] == 0: - logging.warning(f"[get_target_files_from_search] No files found containing 'pytorch-labs' in org: {org}") + + if results["retrieved_count"] == 0: + logging.warning( + f"[get_target_files_from_search] No files found containing 'pytorch-labs' in org: {org}" + ) _SEARCH_CACHE[org] = {} return {} - + # Group files by repository target_files: Dict[str, List[str]] = {} - for item in results['items']: - repo_name = item['repository']['name'] # Just the repo name, not full_name - file_path = item['path'] - + for item in results["items"]: + repo_name = item["repository"]["name"] # Just the repo name, not full_name + file_path = item["path"] + if repo_name not in target_files: target_files[repo_name] = [] - + if file_path not in target_files[repo_name]: target_files[repo_name].append(file_path) - - logging.info(f"[get_target_files_from_search] Found {len(target_files)} repositories with {sum(len(files) for files in target_files.values())} total files") - + + logging.info( + f"[get_target_files_from_search] Found {len(target_files)} repositories with {sum(len(files) for files in target_files.values())} total files" + ) + # Log summary of repositories found for repo_name, files in target_files.items(): - logging.info(f"[get_target_files_from_search] {repo_name}: {len(files)} files") - + logging.info( + f"[get_target_files_from_search] {repo_name}: {len(files)} files" + ) + # Cache the results _SEARCH_CACHE[org] = target_files - + return target_files - + except Exception as e: logging.error(f"[get_target_files_from_search] Error searching for files: {e}") - logging.warning(f"[get_target_files_from_search] No fallback available - search failed") + logging.warning( + f"[get_target_files_from_search] No fallback available - search failed" + ) return {} @@ -164,26 +175,32 @@ def get_target_repos(org: str, filter_repos: Optional[List[str]] = None) -> List """Get only the repositories that have files with 'pytorch-labs' mentions.""" # Get target files from search (with fallback to hardcoded list) target_files = get_target_files_from_search(org) - + if not target_files: logging.info(f"[get_target_repos] No target files found for org: {org}") return [] - + all_repos = list(target_files.keys()) - + if filter_repos: # Filter to only include repos that are in both the target files and the filter list repos = [repo for repo in all_repos if repo in filter_repos] - logging.info(f"[get_target_repos] Filtered to {len(repos)} repositories from {len(all_repos)} available") - + logging.info( + f"[get_target_repos] Filtered to {len(repos)} repositories from {len(all_repos)} available" + ) + # Log which repos were filtered out filtered_out = [repo for repo in 
filter_repos if repo not in all_repos] if filtered_out: - logging.warning(f"[get_target_repos] Repositories not found in target files: {filtered_out}") + logging.warning( + f"[get_target_repos] Repositories not found in target files: {filtered_out}" + ) else: repos = all_repos - logging.info(f"[get_target_repos] Found {len(repos)} repositories with target files for org: {org}") - + logging.info( + f"[get_target_repos] Found {len(repos)} repositories with target files for org: {org}" + ) + return repos @@ -200,13 +217,17 @@ def get_target_files_for_repo(org: str, repo: str) -> List[str]: """Get the list of target files for a specific repository.""" # Get target files from search (with fallback to hardcoded list) target_files = get_target_files_from_search(org) - + if repo not in target_files: - logging.info(f"[get_target_files_for_repo] No target files found for {org}/{repo}") + logging.info( + f"[get_target_files_for_repo] No target files found for {org}/{repo}" + ) return [] - + files = target_files[repo] - logging.info(f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}") + logging.info( + f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}" + ) return files @@ -216,12 +237,14 @@ def get_file_content(org: str, repo: str, file_path: str) -> Optional[str]: data = make_cached_request(url, HEADERS) if not data: return None - + # Check file size if data.get("size", 0) > MAX_FILE_SIZE: - logging.warning(f"[get_file_content] File {file_path} too large ({data['size']} bytes), skipping") + logging.warning( + f"[get_file_content] File {file_path} too large ({data['size']} bytes), skipping" + ) return None - + # Decode content try: content = base64.b64decode(data["content"]).decode("utf-8") @@ -231,130 +254,153 @@ def get_file_content(org: str, repo: str, file_path: str) -> Optional[str]: return None -def find_and_replace_in_file(org: str, repo: str, file_path: str) -> Optional[Tuple[str, str]]: +def find_and_replace_in_file( + org: str, repo: str, file_path: str +) -> Optional[Tuple[str, str]]: """Find and replace text in a file. 
Returns (old_content, new_content) if changes needed.""" content = get_file_content(org, repo, file_path) if content is None: return None - + # Check if file contains the target text if OLD_TEXT not in content: return None - + # Replace all instances new_content = content.replace(OLD_TEXT, NEW_TEXT) - + # Check if any changes were made if new_content == content: return None - - logging.info(f"[find_and_replace_in_file] Found {content.count(OLD_TEXT)} instances in {file_path}") + + logging.info( + f"[find_and_replace_in_file] Found {content.count(OLD_TEXT)} instances in {file_path}" + ) return content, new_content def create_branch(org: str, repo: str, base_branch: str, new_branch: str) -> bool: """Create a new branch from the base branch.""" if DRY_RUN: - logging.info(f"[create_branch] DRY RUN: Would create branch {new_branch} in {org}/{repo}") + logging.info( + f"[create_branch] DRY RUN: Would create branch {new_branch} in {org}/{repo}" + ) return True - + # Get the SHA of the base branch url = f"{BASE_URL}/repos/{org}/{repo}/branches/{base_branch}" branch_data = make_cached_request(url, HEADERS) if not branch_data: logging.error(f"[create_branch] Failed to get base branch {base_branch}") return False - + base_sha = branch_data["commit"]["sha"] - + # Create the new branch url = f"{BASE_URL}/repos/{org}/{repo}/git/refs" - data = { - "ref": f"refs/heads/{new_branch}", - "sha": base_sha - } - + data = {"ref": f"refs/heads/{new_branch}", "sha": base_sha} + response = requests.post(url, headers=HEADERS, json=data) if response.status_code == 201: logging.info(f"[create_branch] Created branch {new_branch} in {org}/{repo}") return True elif response.status_code == 422: # Branch already exists - logging.info(f"[create_branch] Branch {new_branch} already exists in {org}/{repo}") + logging.info( + f"[create_branch] Branch {new_branch} already exists in {org}/{repo}" + ) return True else: - logging.error(f"[create_branch] Failed to create branch {new_branch}: {response.status_code} - {response.text}") + logging.error( + f"[create_branch] Failed to create branch {new_branch}: {response.status_code} - {response.text}" + ) return False -def create_file_commit(org: str, repo: str, file_path: str, content: str, branch: str, message: str) -> bool: +def create_file_commit( + org: str, repo: str, file_path: str, content: str, branch: str, message: str +) -> bool: """Create a commit to update a file.""" if DRY_RUN: - logging.info(f"[create_file_commit] DRY RUN: Would update {file_path} in {org}/{repo}") + logging.info( + f"[create_file_commit] DRY RUN: Would update {file_path} in {org}/{repo}" + ) return True - + # First get the current file to get its SHA url = f"{BASE_URL}/repos/{org}/{repo}/contents/{file_path}" current_file_data = make_cached_request(url, HEADERS) if not current_file_data: - logging.error(f"[create_file_commit] Failed to get current file data for {file_path}") + logging.error( + f"[create_file_commit] Failed to get current file data for {file_path}" + ) return False - + current_sha = current_file_data.get("sha") if not current_sha: logging.error(f"[create_file_commit] No SHA found for {file_path}") return False - + # Update the file with the SHA data = { "message": message, "content": base64.b64encode(content.encode("utf-8")).decode("utf-8"), "sha": current_sha, - "branch": branch + "branch": branch, } - + response = requests.put(url, headers=HEADERS, json=data) if response.status_code in [200, 201]: logging.info(f"[create_file_commit] Updated {file_path} in {org}/{repo}") return True 
else: - logging.error(f"[create_file_commit] Failed to update {file_path}: {response.status_code} - {response.text}") + logging.error( + f"[create_file_commit] Failed to update {file_path}: {response.status_code} - {response.text}" + ) return False def check_existing_pr(org: str, repo: str, title: str) -> Optional[str]: """Check if there's already an open PR with the same title. Returns PR URL if found, None otherwise.""" url = f"{BASE_URL}/repos/{org}/{repo}/pulls?state=open&per_page=100" - + # Don't use cache for PR checks since PR status can change quickly - logging.info(f"[check_existing_pr] Making direct request to check PRs for {org}/{repo}") + logging.info( + f"[check_existing_pr] Making direct request to check PRs for {org}/{repo}" + ) try: response = requests.get(url, headers=HEADERS) response.raise_for_status() data = response.json() - + for pr in data: - if pr.get("title") == title: - pr_url = pr['html_url'] - logging.info(f"[check_existing_pr] Found existing open PR with same title in {org}/{repo}: {pr_url}") + if pr.get("title", "").startswith(title): + pr_url = pr["html_url"] + logging.info( + f"[check_existing_pr] Found existing open PR with same title in {org}/{repo}: {pr_url}" + ) return pr_url - + logging.info(f"[check_existing_pr] No existing PR found for {org}/{repo}") return None - + except requests.exceptions.RequestException as e: logging.warning(f"[check_existing_pr] Failed to get PRs for {org}/{repo}: {e}") return None except json.JSONDecodeError as e: - logging.warning(f"[check_existing_pr] Failed to parse JSON response for {org}/{repo}: {e}") + logging.warning( + f"[check_existing_pr] Failed to parse JSON response for {org}/{repo}: {e}" + ) return None -def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> Optional[str]: +def create_pull_request( + org: str, repo: str, branch: str, base_branch: str +) -> Optional[str]: """Create a pull request and return the PR URL.""" if DRY_RUN: logging.info(f"[create_pull_request] DRY RUN: Would create PR for {org}/{repo}") return "DRY_RUN_PR_URL" - + url = f"{BASE_URL}/repos/{org}/{repo}/pulls" data = { "title": f"[EZ] Replace `pytorch-labs` with `meta-pytorch`", @@ -362,7 +408,6 @@ def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> O ## Changes Made - Replaced all occurrences of `pytorch-labs` with `meta-pytorch` -- Only modified files with extensions: .py, .md, .sh, .rst, .cpp, .h, .txt, .yml - Skipped binary files and files larger than 1MB due to GitHub api payload limits in the script to cover all repos in this org. 
Will do a more manual second pass later to cover any larger files ## Files Modified @@ -370,9 +415,9 @@ def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> O Generated by automated script on {datetime.now(timezone.utc).isoformat()}Z""", "head": branch, - "base": base_branch + "base": base_branch, } - + response = requests.post(url, headers=HEADERS, json=data) if response.status_code == 201: pr_data = response.json() @@ -380,88 +425,107 @@ def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> O logging.info(f"[create_pull_request] Created PR: {pr_url}") return pr_url else: - logging.error(f"[create_pull_request] Failed to create PR: {response.status_code} - {response.text}") + logging.error( + f"[create_pull_request] Failed to create PR: {response.status_code} - {response.text}" + ) return None def process_repository(org: str, repo: str) -> Dict: """Process a single repository for text replacement.""" logging.info(f"[process_repository] Processing repository: {org}/{repo}") - + result = { "repo": repo, "status": "skipped", "files_changed": 0, "pr_url": None, - "error": None + "error": None, } - + try: # Check for existing PR first (before doing any work) - pr_title = f"[EZ] Replace `pytorch-labs` with `meta-pytorch`" - existing_pr_url = check_existing_pr(org, repo, pr_title) + pr_title_prefix = f"[EZ] Replace `pytorch-labs`" + existing_pr_url = check_existing_pr(org, repo, pr_title_prefix) if existing_pr_url: result["status"] = "skipped_existing_pr" result["pr_url"] = existing_pr_url result["error"] = "Existing open PR with same title found" - logging.info(f"[process_repository] Skipping {org}/{repo} - existing open PR found: {existing_pr_url}") + logging.info( + f"[process_repository] Skipping {org}/{repo} - existing open PR found: {existing_pr_url}" + ) return result - + # Get default branch default_branch = get_default_branch(org, repo) if not default_branch: result["error"] = "Failed to get default branch" return result - + # Get target files for this repository target_files = get_target_files_for_repo(org, repo) if not target_files: logging.info(f"[process_repository] No target files found for {org}/{repo}") return result - + # Check each target file for replacements changes = [] for file_path in target_files: replacement = find_and_replace_in_file(org, repo, file_path) if replacement: old_content, new_content = replacement - changes.append({ - "path": file_path, - "old_content": old_content, - "new_content": new_content - }) - + changes.append( + { + "path": file_path, + "old_content": old_content, + "new_content": new_content, + } + ) + if not changes: logging.info(f"[process_repository] No changes needed in {org}/{repo}") return result - + result["files_changed"] = len(changes) - logging.info(f"[process_repository] Found {len(changes)} files to update in {org}/{repo}") - + logging.info( + f"[process_repository] Found {len(changes)} files to update in {org}/{repo}" + ) + if DRY_RUN: result["status"] = "dry_run" - logging.info(f"[process_repository] DRY RUN: Would update {len(changes)} files in {org}/{repo}") + logging.info( + f"[process_repository] DRY RUN: Would update {len(changes)} files in {org}/{repo}" + ) return result - + # Create new branch branch_name = f"replace-pytorch-labs-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}" if not create_branch(org, repo, default_branch, branch_name): result["error"] = "Failed to create branch" return result - + # Commit changes - commit_message = f"Replace 'pytorch-labs' with 
'meta-pytorch' in {len(changes)} files" + commit_message = ( + f"Replace 'pytorch-labs' with 'meta-pytorch' in {len(changes)} files" + ) all_success = True - + for change in changes: - if not create_file_commit(org, repo, change["path"], change["new_content"], branch_name, commit_message): + if not create_file_commit( + org, + repo, + change["path"], + change["new_content"], + branch_name, + commit_message, + ): all_success = False break - + if not all_success: result["error"] = "Failed to commit some files" return result - + # Create pull request pr_url = create_pull_request(org, repo, branch_name, default_branch) if pr_url: @@ -469,11 +533,11 @@ def process_repository(org: str, repo: str) -> Dict: result["status"] = "success" else: result["error"] = "Failed to create pull request" - + except Exception as e: logging.error(f"[process_repository] Error processing {org}/{repo}: {e}") result["error"] = str(e) - + return result @@ -533,9 +597,10 @@ def main(): logging.info(f"[main] Processing repository {i}/{len(repos)}: {repo}") result = process_repository(ORG_NAME, repo) results.append(result) - + # Add a small delay to be respectful to the API import time + time.sleep(1) # Generate summary @@ -543,7 +608,9 @@ def main(): dry_run = [r for r in results if r["status"] == "dry_run"] skipped = [r for r in results if r["status"] == "skipped"] skipped_existing_pr = [r for r in results if r["status"] == "skipped_existing_pr"] - errors = [r for r in results if r["error"] and r["status"] not in ["skipped_existing_pr"]] + errors = [ + r for r in results if r["error"] and r["status"] not in ["skipped_existing_pr"] + ] print(f"\n=== SUMMARY ===") print(f"Organization: {ORG_NAME}") @@ -554,23 +621,29 @@ def main(): print(f"Skipped (existing PR): {len(skipped_existing_pr)}") print(f"Errors: {len(errors)}") print("\n") - + if skipped_existing_pr: print(f"=== SKIPPED (existing PRs) ===") for result in skipped_existing_pr: - print(f"- {result['repo']}: {result['files_changed']} files would be updated, but existing PR found: {result['pr_url']}") + print( + f"- {result['repo']}: {result['files_changed']} files would be updated, but existing PR found: {result['pr_url']}" + ) print("\n") if successful: print(f"=== SUCCESSFUL PRs ===") for result in successful: - print(f"- {result['repo']}: {result['pr_url']} ({result['files_changed']} files)") + print( + f"- {result['repo']}: {result['pr_url']} ({result['files_changed']} files)" + ) print("\n") if dry_run: print(f"=== DRY RUN (would create PRs) ===") for result in dry_run: - print(f"- {result['repo']}: {result['files_changed']} files would be updated") + print( + f"- {result['repo']}: {result['files_changed']} files would be updated" + ) print("\n") if errors: @@ -589,4 +662,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()
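
For reference, a minimal usage sketch of the search helper that remove_pytorch_labs.py drives through get_target_files_from_search(). This is an informal illustration, not part of the patch: it assumes GITHUB_TOKEN is set in the environment (e.g. via a .env file), that the snippet runs from tools/analytics/org/ so the module imports directly, and the query string is only an example.

    # Sketch: exercise search_github_code() as defined in github_code_search.py above.
    from github_code_search import search_github_code

    results = search_github_code(
        query="org:meta-pytorch pytorch-labs",  # example query, not from the patch
        max_results=50,
        verbose=True,
    )

    # GitHubSearchResults is a TypedDict; keys below match the schema in the diff.
    print(f"Retrieved {results['retrieved_count']} of {results['total_count']} matches")
    for item in results["items"]:
        print(item["repository"]["full_name"], item["path"])

The same results object can then be narrowed with GitHubCodeSearch.filter_by_repository() or filter_by_score() before being grouped by repository, which is what get_target_files_from_search() does when building its repo-to-file-paths map.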