From 88bdd639f9803614949c0726d0160db4d7ee8ea1 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 15:35:33 -0500 Subject: [PATCH 1/9] Merge local changes --- tools/analytics/analyze_runner_usage.py | 40 +++++++++++++++++++++---- 1 file changed, 35 insertions(+), 5 deletions(-) diff --git a/tools/analytics/analyze_runner_usage.py b/tools/analytics/analyze_runner_usage.py index 6fc281245e..0183184d02 100644 --- a/tools/analytics/analyze_runner_usage.py +++ b/tools/analytics/analyze_runner_usage.py @@ -91,6 +91,8 @@ "pytorch/cppdocs", "pytorch/pytorch.github.io", "pytorch/examples", + # archived but not marked as such in github repo settings + "pytorch/serve", # proposed "pytorch/builder", "pytorch/xla", @@ -101,8 +103,6 @@ # List of runner labels to exclude from "runners not in scale-config" analysis # These are typically GitHub-hosted runners or other known external runners GITHUB_RUNNER_LABELS = [ - "linux.24_04.4x", - "linux.24_04.16x", "ubuntu-latest", "ubuntu-22.04", "ubuntu-24.04", @@ -110,17 +110,33 @@ "ubuntu-18.04", "windows-latest", "windows-2022", - "windows-11-arm64", "macos-latest", "macos-14", + "macos-14-xlarge", "macos-13", "macos-12", - "macos-14-xlarge", + # Offered at Meta enterprise level + "8-core-ubuntu", + "4-core-ubuntu", + "windows-8-core", + "4-core-ubuntu-gpu-t4", + "4-core-windows-gpu-t4", + "32-core-ubuntu", + "16-core-ubuntu", + "2-core-ubuntu-arm", + "4-core-ubuntu-arm", + "8-core-ubuntu-22.04", + "4-core-ubuntu-24.04", + # needs special access + "linux.24_04.4x", + "linux.24_04.16x", + "windows-11-arm64", # Add more runner labels to exclude here as needed ] USELESS_RUNNER_LABELS = [ - "self-hosted", # really, a useless label we want to ignoreß + "self-hosted", # really, a useless label we want to ignore + "linux.g5.4xlarge.nvidia.cpu", # a nonexistent label used by a repo ] HEADERS = { @@ -681,6 +697,20 @@ def main(): if repos_by_github_runner: output_data["repos_by_github_runner"] = dict(repos_by_github_runner) + # --- SORT OUTPUT ALPHABETICALLY FOR CONSISTENCY (except top-level keys) --- + def deep_sort(obj, sort_keys=True): + if isinstance(obj, dict): + keys = sorted(obj) if sort_keys else obj.keys() + return {k: deep_sort(obj[k]) for k in keys} + elif isinstance(obj, list): + # If list of dicts with 'repo' key, sort by 'repo', else sort normally + if obj and isinstance(obj[0], dict) and 'repo' in obj[0]: + return sorted([deep_sort(x) for x in obj], key=lambda x: x['repo']) + return sorted(deep_sort(x) for x in obj) + else: + return obj + + output_data = deep_sort(output_data, sort_keys=False) save_to_yaml(output_data) # Show final cache stats From 24b9b1d0233839eed34c6b670efad0e2cac06961 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 15:42:57 -0500 Subject: [PATCH 2/9] Update analysis --- tools/analytics/analyze_runner_usage.py | 101 ++---------------------- tools/analytics/cache_manager.py | 98 +++++++++++++++++++++++ 2 files changed, 103 insertions(+), 96 deletions(-) create mode 100644 tools/analytics/cache_manager.py diff --git a/tools/analytics/analyze_runner_usage.py b/tools/analytics/analyze_runner_usage.py index 0183184d02..5d60e76ecd 100644 --- a/tools/analytics/analyze_runner_usage.py +++ b/tools/analytics/analyze_runner_usage.py @@ -67,6 +67,8 @@ import yaml from dotenv import load_dotenv +from tools.analytics.cache_manager import CACHE_DIR, CacheManager + load_dotenv() @@ -136,7 +138,7 @@ USELESS_RUNNER_LABELS = [ "self-hosted", # really, a useless label we want to ignore - "linux.g5.4xlarge.nvidia.cpu", # a 
nonexistent label used by a repo + "linux.g5.4xlarge.nvidia.cpu", # a nonexistent label used by a repo ] HEADERS = { @@ -147,99 +149,6 @@ BASE_URL = "https://api.github.com" WORKFLOW_RUN_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" -# Cache configuration -CACHE_DIR = Path("cache") -CACHE_DIR.mkdir(exist_ok=True) - - -class CacheManager: - """Manages caching of GitHub API responses using URL as cache key.""" - - def __init__(self, cache_dir: Path = CACHE_DIR): - self.cache_dir = cache_dir - self.cache_dir.mkdir(exist_ok=True) - - def _get_cache_key(self, url: str) -> str: - """Generate a human-readable cache key from URL.""" - import re - from urllib.parse import parse_qs, urlencode, urlparse - - # Parse the URL to separate path and query parameters - parsed = urlparse(url) - path = parsed.path - query_params = parse_qs(parsed.query) - - # Remove the 'created' parameter from query params to avoid cache invalidation - if "created" in query_params: - del query_params["created"] - - # Reconstruct the query string without the 'created' parameter - if query_params: - # Flatten single-item lists (parse_qs returns lists) - flat_params = {} - for key, values in query_params.items(): - flat_params[key] = values[0] if len(values) == 1 else values - query_string = urlencode(flat_params) - # Reconstruct URL without the 'created' parameter - url_without_created = ( - f"{parsed.scheme}://{parsed.netloc}{path}?{query_string}" - ) - else: - # If no query params remain, use the original URL - url_without_created = url - - # Replace forward slashes with underscores - key = url_without_created.replace("/", "_") - - # Remove protocol and domain - key = key.replace("https___api.github.com_", "") - - # Handle illegal filename characters in query parameters - # Replace characters that are problematic in filenames - key = re.sub(r'[<>:"|?*]', "_", key) - - # Replace equals signs and ampersands in query params with underscores - key = key.replace("=", "_").replace("&", "_") - - # Clean up multiple consecutive underscores - key = re.sub(r"_+", "_", key) - - # Remove trailing underscore - key = key.rstrip("_") - - return key - - def _get_cache_path(self, url: str) -> Path: - """Get the cache file path for a given URL.""" - cache_key = self._get_cache_key(url) - return self.cache_dir / f"{cache_key}.json" - - def get(self, url: str) -> Optional[Dict]: - """Retrieve cached response for a URL.""" - cache_path = self._get_cache_path(url) - if cache_path.exists(): - try: - with open(cache_path, "r") as f: - cached_data = json.load(f) - logging.debug(f"[CacheManager] Cache hit for URL: {url}") - return cached_data - except (json.JSONDecodeError, IOError) as e: - logging.warning(f"[CacheManager] Failed to read cache for {url}: {e}") - return None - logging.debug(f"[CacheManager] Cache miss for URL: {url}") - return None - - def set(self, url: str, data: Dict) -> None: - """Cache response data for a URL.""" - cache_path = self._get_cache_path(url) - try: - with open(cache_path, "w") as f: - json.dump(data, f, indent=2) - logging.debug(f"[CacheManager] Cached response for URL: {url}") - except IOError as e: - logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") - - # Global cache manager instance cache_manager = CacheManager() @@ -704,8 +613,8 @@ def deep_sort(obj, sort_keys=True): return {k: deep_sort(obj[k]) for k in keys} elif isinstance(obj, list): # If list of dicts with 'repo' key, sort by 'repo', else sort normally - if obj and isinstance(obj[0], dict) and 'repo' in obj[0]: - 
return sorted([deep_sort(x) for x in obj], key=lambda x: x['repo']) + if obj and isinstance(obj[0], dict) and "repo" in obj[0]: + return sorted([deep_sort(x) for x in obj], key=lambda x: x["repo"]) return sorted(deep_sort(x) for x in obj) else: return obj diff --git a/tools/analytics/cache_manager.py b/tools/analytics/cache_manager.py new file mode 100644 index 0000000000..f9d86e89f4 --- /dev/null +++ b/tools/analytics/cache_manager.py @@ -0,0 +1,98 @@ +import json +import logging +import re +from pathlib import Path +from typing import Dict, Optional + + +# Cache configuration +CACHE_DIR = Path("cache") + + +class CacheManager: + """Manages caching of GitHub API responses using URL as cache key.""" + + def __init__(self, cache_dir: Path = CACHE_DIR): + CACHE_DIR.mkdir(exist_ok=True) + + self.cache_dir = cache_dir + self.cache_dir.mkdir(exist_ok=True) + + def _get_cache_key(self, url: str) -> str: + """Generate a human-readable cache key from URL.""" + from urllib.parse import parse_qs, urlencode, urlparse + + # Parse the URL to separate path and query parameters + parsed = urlparse(url) + path = parsed.path + query_params = parse_qs(parsed.query) + + # Remove the 'created' parameter from query params to avoid cache invalidation + if "created" in query_params: + del query_params["created"] + + # Reconstruct the query string without the 'created' parameter + if query_params: + # Flatten single-item lists (parse_qs returns lists) + flat_params = {} + for key, values in query_params.items(): + flat_params[key] = values[0] if len(values) == 1 else values + query_string = urlencode(flat_params) + # Reconstruct URL without the 'created' parameter + url_without_created = ( + f"{parsed.scheme}://{parsed.netloc}{path}?{query_string}" + ) + else: + # If no query params remain, use the original URL + url_without_created = url + + # Replace forward slashes with underscores + key = url_without_created.replace("/", "_") + + # Remove protocol and domain + key = key.replace("https___api.github.com_", "") + + # Handle illegal filename characters in query parameters + # Replace characters that are problematic in filenames + key = re.sub(r'[<>:"|?*]', "_", key) + + # Replace equals signs and ampersands in query params with underscores + key = key.replace("=", "_").replace("&", "_") + + # Clean up multiple consecutive underscores + key = re.sub(r"_+", "_", key) + + # Remove trailing underscore + key = key.rstrip("_") + + return key + + def _get_cache_path(self, url: str) -> Path: + """Get the cache file path for a given URL.""" + cache_key = self._get_cache_key(url) + return self.cache_dir / f"{cache_key}.json" + + def get(self, url: str) -> Optional[Dict]: + """Retrieve cached response for a URL.""" + cache_path = self._get_cache_path(url) + if cache_path.exists(): + try: + with open(cache_path, "r") as f: + cached_data = json.load(f) + logging.debug(f"[CacheManager] Cache hit for URL: {url}") + return cached_data + except (json.JSONDecodeError, IOError) as e: + logging.warning(f"[CacheManager] Failed to read cache for {url}: {e}") + return None + logging.debug(f"[CacheManager] Cache miss for URL: {url}") + return None + + def set(self, url: str, data: Dict) -> None: + """Cache response data for a URL.""" + cache_path = self._get_cache_path(url) + try: + with open(cache_path, "w") as f: + json.dump(data, f, indent=2) + logging.debug(f"[CacheManager] Cached response for URL: {url}") + except IOError as e: + logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") \ No newline at end of file From 
0258597ffb80cde0dcebb1100f98fe1eb28e6bca Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 15:54:47 -0500 Subject: [PATCH 3/9] cleanup --- tools/analytics/org/.gitignore | 5 +++ tools/analytics/org/README.md | 39 +++++++++++++++++++ .../{ => org}/analyze_runner_usage.py | 2 +- tools/analytics/{ => org}/cache_manager.py | 0 4 files changed, 45 insertions(+), 1 deletion(-) create mode 100644 tools/analytics/org/.gitignore create mode 100644 tools/analytics/org/README.md rename tools/analytics/{ => org}/analyze_runner_usage.py (99%) rename tools/analytics/{ => org}/cache_manager.py (100%) diff --git a/tools/analytics/org/.gitignore b/tools/analytics/org/.gitignore new file mode 100644 index 0000000000..ecde95c402 --- /dev/null +++ b/tools/analytics/org/.gitignore @@ -0,0 +1,5 @@ +# Stores cached data for GitHub API responses +cache/ + +# Gets temporarily created by the script +scale-config.yml \ No newline at end of file diff --git a/tools/analytics/org/README.md b/tools/analytics/org/README.md new file mode 100644 index 0000000000..757e30670a --- /dev/null +++ b/tools/analytics/org/README.md @@ -0,0 +1,39 @@ +# Organization Analytics Tools + +This directory contains a collection of scripts designed to analyze GitHub Actions runner usage and other organizational metrics across a GitHub organization's repositories. + +## Overview + +The tools in this directory help us understand how GitHub Actions runners are being utilized across our repositories. + +## Scripts + +### `analyze_runner_usage.py` + +**Purpose**: Analyzes GitHub Actions runner label usage across all repositories in a specified GitHub organization. + +**Key Features**: +- Fetches all non-archived repositories in a GitHub organization +- Extracts runner labels used in workflow jobs from recent workflow runs +- Aggregates runner usage statistics across repositories +- Compares runner labels against those defined in `scale-config.yml` and standard GitHub-hosted runners +- Identifies unused or undefined runners +- Generates comprehensive usage reports + +**Output**: Creates `runner_labels_summary.yml` with detailed analytics including: +- Runner usage by repository +- Repository usage by runner type +- Repositories with zero workflow runs +- Runners not defined in scale-config or standard GitHub runners +- Usage patterns and trends + +### `cache_manager.py` + +**Purpose**: Helper script. Provides efficient caching functionality for GitHub API responses to optimize performance and avoid rate limiting. 
+ +**Features**: +- URL-based cache key generation +- Intelligent cache invalidation +- Rate limit optimization +- Reduces redundant API calls during analysis + diff --git a/tools/analytics/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py similarity index 99% rename from tools/analytics/analyze_runner_usage.py rename to tools/analytics/org/analyze_runner_usage.py index 5d60e76ecd..404bfabadf 100644 --- a/tools/analytics/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -67,7 +67,7 @@ import yaml from dotenv import load_dotenv -from tools.analytics.cache_manager import CACHE_DIR, CacheManager +from cache_manager import CACHE_DIR, CacheManager load_dotenv() diff --git a/tools/analytics/cache_manager.py b/tools/analytics/org/cache_manager.py similarity index 100% rename from tools/analytics/cache_manager.py rename to tools/analytics/org/cache_manager.py From 2a232b8c17bd59a8633b314d9b7365b0d92f3be9 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 16:12:04 -0500 Subject: [PATCH 4/9] Move more cache functions over --- tools/analytics/org/analyze_runner_usage.py | 79 +------------------- tools/analytics/org/cache_manager.py | 81 ++++++++++++++++++++- 2 files changed, 81 insertions(+), 79 deletions(-) diff --git a/tools/analytics/org/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py index 404bfabadf..80107700c0 100644 --- a/tools/analytics/org/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -55,7 +55,6 @@ """ import argparse -import json import logging import os from collections import defaultdict @@ -65,10 +64,9 @@ import requests import yaml +from cache_manager import get_cache_stats, make_cached_request from dotenv import load_dotenv -from cache_manager import CACHE_DIR, CacheManager - load_dotenv() @@ -141,58 +139,9 @@ "linux.g5.4xlarge.nvidia.cpu", # a nonexistent label used by a repo ] -HEADERS = { - "Authorization": f"Bearer {GITHUB_TOKEN}", - "Accept": "application/vnd.github+json", -} - BASE_URL = "https://api.github.com" WORKFLOW_RUN_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" -# Global cache manager instance -cache_manager = CacheManager() - - -def make_cached_request( - url: str, headers: Optional[Dict[str, str]] = None -) -> Optional[Dict]: - """ - Make an HTTP request with caching. Returns the JSON response if successful. 
- - Args: - url: The URL to request - headers: Optional headers for the request - - Returns: - JSON response data if successful, None if failed - """ - # Check cache first - cached_response = cache_manager.get(url) - if cached_response: - logging.info(f"[make_cached_request] Using cached response for: {url}") - return cached_response - - # Make actual HTTP request - logging.info(f"[make_cached_request] Making HTTP request to: {url}") - try: - response = requests.get(url, headers=headers or HEADERS) - response.raise_for_status() - data = response.json() - - # Cache successful response - cache_manager.set(url, data) - logging.info(f"[make_cached_request] Successfully cached response for: {url}") - return data - - except requests.exceptions.RequestException as e: - logging.error(f"[make_cached_request] HTTP request failed for {url}: {e}") - return None - except json.JSONDecodeError as e: - logging.error( - f"[make_cached_request] Failed to parse JSON response for {url}: {e}" - ) - return None - def get_repos(org: str) -> List[str]: logging.info(f"[get_repos] Start fetching repositories for org: {org}") @@ -445,32 +394,6 @@ def save_to_yaml(data: Dict, filename: str = "runner_labels_summary.yml"): logging.info(f"[save_to_yaml] Data successfully saved to {filename}") -def clear_cache(): - """Clear all cached data.""" - import shutil - - if CACHE_DIR.exists(): - shutil.rmtree(CACHE_DIR) - CACHE_DIR.mkdir(exist_ok=True) - logging.info(f"[clear_cache] Cleared cache directory: {CACHE_DIR}") - else: - logging.info(f"[clear_cache] Cache directory does not exist: {CACHE_DIR}") - - -def get_cache_stats(): - """Get statistics about the cache.""" - if not CACHE_DIR.exists(): - return {"total_files": 0, "total_size_mb": 0} - - cache_files = list(CACHE_DIR.glob("*.json")) - total_size = sum(f.stat().st_size for f in cache_files) - - return { - "total_files": len(cache_files), - "total_size_mb": round(total_size / (1024 * 1024), 2), - } - - def download_scale_config(url: str, dest: str = "scale-config.yml") -> bool: """Download scale-config.yml from the given URL if it does not exist locally.""" if os.path.exists(dest): diff --git a/tools/analytics/org/cache_manager.py b/tools/analytics/org/cache_manager.py index f9d86e89f4..69eb42015f 100644 --- a/tools/analytics/org/cache_manager.py +++ b/tools/analytics/org/cache_manager.py @@ -4,6 +4,8 @@ from pathlib import Path from typing import Dict, Optional +import requests + # Cache configuration CACHE_DIR = Path("cache") @@ -95,4 +97,81 @@ def set(self, url: str, data: Dict) -> None: json.dump(data, f, indent=2) logging.debug(f"[CacheManager] Cached response for URL: {url}") except IOError as e: - logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") \ No newline at end of file + logging.error(f"[CacheManager] Failed to write cache for {url}: {e}") + + +# Global cache manager instance +cache_manager = CacheManager() + + +def get_cache_stats(): + """Get statistics about the cache.""" + if not CACHE_DIR.exists(): + return {"total_files": 0, "total_size_mb": 0} + + cache_files = list(CACHE_DIR.glob("*.json")) + total_size = sum(f.stat().st_size for f in cache_files) + + return { + "total_files": len(cache_files), + "total_size_mb": round(total_size / (1024 * 1024), 2), + } + + +def clear_cache(): + """Clear all cached data.""" + import shutil + + if CACHE_DIR.exists(): + shutil.rmtree(CACHE_DIR) + CACHE_DIR.mkdir(exist_ok=True) + logging.info(f"[clear_cache] Cleared cache directory: {CACHE_DIR}") + else: + logging.info(f"[clear_cache] Cache 
directory does not exist: {CACHE_DIR}") + + +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + + +def make_cached_request( + url: str, headers: Optional[Dict[str, str]] = None +) -> Optional[Dict]: + """ + Make an HTTP request with caching. Returns the JSON response if successful. + + Args: + url: The URL to request + headers: Optional headers for the request + + Returns: + JSON response data if successful, None if failed + """ + # Check cache first + cached_response = cache_manager.get(url) + if cached_response: + logging.info(f"[make_cached_request] Using cached response for: {url}") + return cached_response + + # Make actual HTTP request + logging.info(f"[make_cached_request] Making HTTP request to: {url}") + try: + response = requests.get(url, headers=headers or HEADERS) + response.raise_for_status() + data = response.json() + + # Cache successful response + cache_manager.set(url, data) + logging.info(f"[make_cached_request] Successfully cached response for: {url}") + return data + + except requests.exceptions.RequestException as e: + logging.error(f"[make_cached_request] HTTP request failed for {url}: {e}") + return None + except json.JSONDecodeError as e: + logging.error( + f"[make_cached_request] Failed to parse JSON response for {url}: {e}" + ) + return None From 65e877e0fc97875785898572a53db29275026663 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 16:22:13 -0500 Subject: [PATCH 5/9] Fix refactoring --- tools/analytics/org/analyze_runner_usage.py | 12 +++++++++--- tools/analytics/org/cache_manager.py | 12 +++--------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/tools/analytics/org/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py index 80107700c0..4ff31306a1 100644 --- a/tools/analytics/org/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -79,6 +79,12 @@ GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") ORG_NAME = None # Will be set by argparse +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + # List of repositories to exclude in the format 'org/repo' EXCLUDED_REPOS = [ "pytorch/pytorch", @@ -150,7 +156,7 @@ def get_repos(org: str) -> List[str]: while True: url = f"{BASE_URL}/orgs/{org}/repos?per_page=100&page={page}" logging.debug(f"[get_repos] Requesting URL: {url}") - data = make_cached_request(url) + data = make_cached_request(url, HEADERS) if data is None: logging.error(f"[get_repos] Failed to fetch page {page} for org: {org}") break @@ -186,7 +192,7 @@ def get_workflow_runs(org: str, repo: str) -> List[Dict]: while True: url = f"{BASE_URL}/repos/{org}/{repo}/actions/runs?per_page=100&page={page}&created=>={WORKFLOW_RUN_LOOKBACK}" logging.debug(f"[get_workflow_runs] Requesting URL: {url}") - response_data = make_cached_request(url) + response_data = make_cached_request(url, HEADERS) if response_data is None: logging.error( f"[get_workflow_runs] Failed to fetch page {page} for repo: {repo}" @@ -271,7 +277,7 @@ def get_jobs_for_run( ) url = f"{BASE_URL}/repos/{org}/{repo}/actions/runs/{run_id}/jobs" logging.debug(f"[get_jobs_for_run] Requesting URL: {url}") - response_data = make_cached_request(url) + response_data = make_cached_request(url, HEADERS) if response_data is None: logging.error( f"[get_jobs_for_run] Failed to fetch jobs for run {run_id} in repo: {repo}" diff --git a/tools/analytics/org/cache_manager.py b/tools/analytics/org/cache_manager.py index 
69eb42015f..48c05862d4 100644 --- a/tools/analytics/org/cache_manager.py +++ b/tools/analytics/org/cache_manager.py @@ -130,21 +130,15 @@ def clear_cache(): logging.info(f"[clear_cache] Cache directory does not exist: {CACHE_DIR}") -HEADERS = { - "Authorization": f"Bearer {GITHUB_TOKEN}", - "Accept": "application/vnd.github+json", -} - - def make_cached_request( - url: str, headers: Optional[Dict[str, str]] = None + url: str, headers: Dict[str, str] ) -> Optional[Dict]: """ Make an HTTP request with caching. Returns the JSON response if successful. Args: url: The URL to request - headers: Optional headers for the request + headers: Headers for the request (required) Returns: JSON response data if successful, None if failed @@ -158,7 +152,7 @@ def make_cached_request( # Make actual HTTP request logging.info(f"[make_cached_request] Making HTTP request to: {url}") try: - response = requests.get(url, headers=headers or HEADERS) + response = requests.get(url, headers=headers) response.raise_for_status() data = response.json() From 50bc9bba426e5ea8f15cff38842fb1cab9edf2c2 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Thu, 31 Jul 2025 17:15:16 -0500 Subject: [PATCH 6/9] lint fixes --- tools/analytics/org/README.md | 1 - tools/analytics/org/cache_manager.py | 4 +--- tools/analytics/org/requirements.txt | 3 +++ 3 files changed, 4 insertions(+), 4 deletions(-) create mode 100644 tools/analytics/org/requirements.txt diff --git a/tools/analytics/org/README.md b/tools/analytics/org/README.md index 757e30670a..6a22f25251 100644 --- a/tools/analytics/org/README.md +++ b/tools/analytics/org/README.md @@ -36,4 +36,3 @@ The tools in this directory help us understand how GitHub Actions runners are be - Intelligent cache invalidation - Rate limit optimization - Reduces redundant API calls during analysis - diff --git a/tools/analytics/org/cache_manager.py b/tools/analytics/org/cache_manager.py index 48c05862d4..60cf4544f1 100644 --- a/tools/analytics/org/cache_manager.py +++ b/tools/analytics/org/cache_manager.py @@ -130,9 +130,7 @@ def clear_cache(): logging.info(f"[clear_cache] Cache directory does not exist: {CACHE_DIR}") -def make_cached_request( - url: str, headers: Dict[str, str] -) -> Optional[Dict]: +def make_cached_request(url: str, headers: Dict[str, str]) -> Optional[Dict]: """ Make an HTTP request with caching. Returns the JSON response if successful. 
diff --git a/tools/analytics/org/requirements.txt b/tools/analytics/org/requirements.txt new file mode 100644 index 0000000000..dd9f3517df --- /dev/null +++ b/tools/analytics/org/requirements.txt @@ -0,0 +1,3 @@ +requests>=2.28.0 +pyyaml>=6.0 +python-dotenv>=0.19.0 From 782b327d278de73d2ace5a51f87ca995583236b5 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 12 Aug 2025 16:48:29 -0500 Subject: [PATCH 7/9] changes so far, with pytorch-labs remover and gh code search --- tools/analytics/org/.gitignore | 5 +- tools/analytics/org/analyze_contributors.py | 664 ++++++++++++++++++++ tools/analytics/org/analyze_repo_info.py | 259 ++++++++ tools/analytics/org/analyze_runner_usage.py | 17 +- tools/analytics/org/github_code_search.py | 589 +++++++++++++++++ tools/analytics/org/remove_pytorch_labs.py | 636 +++++++++++++++++++ 6 files changed, 2164 insertions(+), 6 deletions(-) create mode 100644 tools/analytics/org/analyze_contributors.py create mode 100644 tools/analytics/org/analyze_repo_info.py create mode 100644 tools/analytics/org/github_code_search.py create mode 100644 tools/analytics/org/remove_pytorch_labs.py diff --git a/tools/analytics/org/.gitignore b/tools/analytics/org/.gitignore index ecde95c402..5a4d01b96c 100644 --- a/tools/analytics/org/.gitignore +++ b/tools/analytics/org/.gitignore @@ -2,4 +2,7 @@ cache/ # Gets temporarily created by the script -scale-config.yml \ No newline at end of file +scale-config.yml + +# Stores the output of the analysis +reports/ \ No newline at end of file diff --git a/tools/analytics/org/analyze_contributors.py b/tools/analytics/org/analyze_contributors.py new file mode 100644 index 0000000000..5b9439eeb5 --- /dev/null +++ b/tools/analytics/org/analyze_contributors.py @@ -0,0 +1,664 @@ +""" +GitHub Organization Contributor Analyzer +======================================== + +Purpose: +-------- +This script analyzes contributors across all repositories in a specified GitHub organization over the past 6 months. +It identifies frequent contributors and attempts to determine their company affiliations based on email addresses +and GitHub profile information. + +Key Features: +------------- +- Fetches all non-archived repositories in a GitHub organization (excluding a configurable list). +- For each repository, analyzes commits from the past 6 months to identify contributors. +- Extracts contributor information including email addresses and GitHub profiles. +- Attempts to identify company affiliations from email domains and GitHub profile data. +- Aggregates contributor statistics across repositories. +- Outputs a YAML summary (reports/contributors_summary.yml) with detailed contributor analysis. +- Caches GitHub API responses for efficiency and rate limit avoidance. + +How to Run: +----------- +1. Ensure you have Python 3.9+ and install dependencies (see below). +2. Set the following environment variable (can be in a .env file): + - `GITHUB_TOKEN`: A GitHub personal access token with `repo` and `user` read permissions. +3. (Optional) Edit the EXCLUDED_REPOS list in the script to customize exclusions. +4. Run the script: + + ```bash + python analyze_contributors.py [--org ORG_NAME] + ``` + - Use `--org` to specify the GitHub organization to analyze (defaults to 'pytorch'). + +Dependencies: +------------- +- requests +- pyyaml +- python-dotenv + +Output: +------- +- `reports/contributors_summary.yml`: A YAML file containing: + - `contributors_by_frequency`: Contributors sorted by commit count across all repos. 
+ - `contributors_by_repo`: For each repo, list of contributors with their stats. + - `company_analysis`: Contributors grouped by identified companies. + - `unidentified_contributors`: Contributors without identifiable company affiliation. +- Caches API responses in the `cache/` directory for faster reruns. + +Notes: +------ +- The script looks back 6 months for commits. +- Company identification is based on email domains and GitHub profile information. +- The script is safe to rerun; it uses caching to avoid redundant API calls. +- For large orgs, the script may take a while on the first run due to API rate limits. + +""" + +import argparse +import logging +import os +import re +from collections import defaultdict +from datetime import datetime, timedelta +from typing import Dict, List, Optional + +import requests +import yaml +from cache_manager import get_cache_stats, make_cached_request +from dotenv import load_dotenv + + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +ORG_NAME = None # Will be set by argparse + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +# List of repositories to exclude in the format 'org/repo' +EXCLUDED_REPOS = [ + "pytorch/pytorch", + "pytorch/executorch", + "pytorch/test-infra", + "pytorch/ci-infra", + "pytorch/pytorch-canary", + "pytorch/tutorials", + "pytorch/docs", + "pytorch/cppdocs", + "pytorch/pytorch.github.io", + "pytorch/examples", + # archived but not marked as such in github repo settings + "pytorch/serve", + # proposed + "pytorch/builder", + "pytorch/xla", + "pytorch/benchmark", + "pytorch/pytorch-integration-testing", +] + +# Company domains mapping +COMPANY_DOMAINS = { + "meta.com": "Meta", + "fb.com": "Meta", + "facebook.com": "Meta", + "google.com": "Google", + "microsoft.com": "Microsoft", + "nvidia.com": "NVIDIA", + "intel.com": "Intel", + "amd.com": "AMD", + "apple.com": "Apple", + "amazon.com": "Amazon", + "aws.com": "Amazon", + "ibm.com": "IBM", + "redhat.com": "Red Hat", + "canonical.com": "Canonical", + "huggingface.co": "Hugging Face", + "openai.com": "OpenAI", + "anthropic.com": "Anthropic", + "deepmind.com": "DeepMind", + "salesforce.com": "Salesforce", + "uber.com": "Uber", + "netflix.com": "Netflix", + "airbnb.com": "Airbnb", + "spotify.com": "Spotify", + "tesla.com": "Tesla", +} + +BASE_URL = "https://api.github.com" +COMMIT_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" # 6 months + + +def get_repos(org: str) -> List[str]: + logging.info(f"[get_repos] Start fetching repositories for org: {org}") + repos = [] + page = 1 + while True: + url = f"{BASE_URL}/orgs/{org}/repos?per_page=100&page={page}" + logging.debug(f"[get_repos] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None: + logging.error(f"[get_repos] Failed to fetch page {page} for org: {org}") + break + if not data: + logging.info( + f"[get_repos] No more repositories found on page {page} for org: {org}" + ) + break + logging.info( + f"[get_repos] Page {page}: Found {len(data)} repositories for org: {org}" + ) + # Filter out archived repositories + non_archived_repos = [ + repo["name"] for repo in data if not repo.get("archived", False) + ] + repos.extend(non_archived_repos) + logging.info( + f"[get_repos] Page {page}: Excluded {len(data) - len(non_archived_repos)} archived repositories" + ) + 
page += 1 + logging.info( + f"[get_repos] Finished fetching repositories for org: {org}. Total: {len(repos)} (excluding archived)" + ) + return repos + + +def get_commits(org: str, repo: str) -> List[Dict]: + """Get commits for a repository from the past 6 months.""" + logging.info(f"[get_commits] Start fetching commits for repo: {repo} in org: {org}") + all_commits = [] + page = 1 + + while True: + url = f"{BASE_URL}/repos/{org}/{repo}/commits?per_page=100&page={page}&since={COMMIT_LOOKBACK}" + logging.debug(f"[get_commits] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None: + logging.error(f"[get_commits] Failed to fetch page {page} for repo: {repo}") + break + if not data: + logging.info(f"[get_commits] No more commits found for repo: {repo} on page {page}") + break + logging.info(f"[get_commits] Page {page}: Found {len(data)} commits for repo: {repo}") + all_commits.extend(data) + page += 1 + + # Limit to reasonable number of commits to avoid API rate limits + if len(all_commits) >= 1000: + logging.info(f"[get_commits] Limiting to 1000 commits for repo: {repo}") + break + + logging.info(f"[get_commits] Finished fetching commits for repo: {repo}. Total: {len(all_commits)}") + return all_commits + + +def get_user_profile(username: str) -> Optional[Dict]: + """Get GitHub user profile information.""" + if not username: + return None + + url = f"{BASE_URL}/users/{username}" + logging.debug(f"[get_user_profile] Fetching profile for user: {username}") + return make_cached_request(url, HEADERS) + + +def extract_company_from_email(email: str) -> Optional[str]: + """Extract company name from email domain.""" + if not email or "@" not in email: + return None + + domain = email.split("@")[1].lower() + + # Check direct domain matches + if domain in COMPANY_DOMAINS: + return COMPANY_DOMAINS[domain] + + # Check for subdomains + for company_domain, company_name in COMPANY_DOMAINS.items(): + if domain.endswith(f".{company_domain}"): + return company_name + + # Skip generic email providers + generic_providers = { + "gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "icloud.com", + "protonmail.com", "tutanota.com", "hey.com", "fastmail.com", + "users.noreply.github.com" # GitHub's privacy-preserving email addresses + } + + if domain in generic_providers: + return None + + # For other domains, try to extract company name + # Remove common TLDs and subdomains + domain_parts = domain.replace(".com", "").replace(".org", "").replace(".net", "").split(".") + if domain_parts and len(domain_parts[-1]) > 2: + return domain_parts[-1].title() + + return None + + +def extract_company_from_profile(profile: Dict) -> Optional[str]: + """Extract company name from GitHub profile.""" + if not profile: + return None + + company = profile.get("company") or "" + company = company.strip() if company else "" + if not company: + return None + + # Clean up company name + company = re.sub(r'^@', '', company) # Remove @ prefix + company = company.strip() + + if not company: + return None + + # Map common company variations + company_mappings = { + "meta": "Meta", + "facebook": "Meta", + "google": "Google", + "microsoft": "Microsoft", + "nvidia": "NVIDIA", + "intel": "Intel", + "amd": "AMD", + "apple": "Apple", + "amazon": "Amazon", + "aws": "Amazon", + "ibm": "IBM", + "red hat": "Red Hat", + "redhat": "Red Hat", + "canonical": "Canonical", + "hugging face": "Hugging Face", + "huggingface": "Hugging Face", + "openai": "OpenAI", + "anthropic": "Anthropic", + "deepmind": "DeepMind", + 
"salesforce": "Salesforce", + "uber": "Uber", + "netflix": "Netflix", + "airbnb": "Airbnb", + "spotify": "Spotify", + "tesla": "Tesla", + } + + company_lower = company.lower() + if company_lower in company_mappings: + return company_mappings[company_lower] + + return company.title() + + +def cache_to_disk(func): + """ + A decorator that caches the result of a function to disk. + The cache key is generated from the function name, its arguments, and today's date. + Handles complex types like lists and dictionaries properly. + """ + import hashlib + import json + import os + from datetime import date + from functools import wraps + + def make_hashable(obj): + """Convert a container to a frozen/hashable form for reliable caching.""" + if isinstance(obj, dict): + return tuple(sorted((k, make_hashable(v)) for k, v in obj.items())) + elif isinstance(obj, (list, tuple)): + return tuple(make_hashable(x) for x in obj) + # For sets, convert to sorted tuples + elif isinstance(obj, set): + return tuple(sorted(make_hashable(x) for x in obj)) + # Handle string representation for other objects that might not be JSON serializable + elif not isinstance(obj, (str, int, float, bool, type(None))): + return str(obj) + return obj + + @wraps(func) + def wrapper(*args, **kwargs): + # Create cache directory if it doesn't exist + cache_dir = "cache" + os.makedirs(cache_dir, exist_ok=True) + + # Generate a cache key based on function name and args + func_name = func.__name__ + # Create a function-specific subdirectory for better organization + func_cache_dir = os.path.join(cache_dir, func_name) + os.makedirs(func_cache_dir, exist_ok=True) + + # Get today's date for cache versioning + today = date.today().isoformat() # Format: YYYY-MM-DD + + # Make args and kwargs hashable before serializing + hashable_args = tuple(make_hashable(arg) for arg in args) + hashable_kwargs = {k: make_hashable(v) for k, v in kwargs.items()} + + try: + # Try to serialize with standard JSON, including today's date + arg_representation = { + "date": today, + "args": hashable_args, + "kwargs": sorted(hashable_kwargs.items()) + } + serialized_args = json.dumps(arg_representation, sort_keys=True) + except (TypeError, ValueError): + # If serialization fails, use string representation as fallback + serialized_args = today + str(hashable_args) + str(sorted(hashable_kwargs.items())) + + arg_hash = hashlib.sha256(serialized_args.encode()).hexdigest() + key = f"{func_name}_{today}_{arg_hash}" + + # Check if cached result exists + filepath = os.path.join(func_cache_dir, f"{today}_{arg_hash}.json") + if os.path.exists(filepath): + logging.debug(f"Cache hit for function: {func_name} (cached on {today})") + with open(filepath, "r") as f: + return json.load(f) + + # If not cached, call the function + result = func(*args, **kwargs) + + # Cache the result + with open(filepath, "w") as f: + json.dump(result, f) + logging.debug(f"Cached result for function: {func_name}, saved to: {filepath} (date: {today})") + + return result + + return wrapper + + +@cache_to_disk +def analyze_contributors(org: str, repos: List[str]) -> Dict: + """Analyze contributors across all repositories.""" + logging.info(f"[analyze_contributors] Start analyzing contributors for {len(repos)} repositories in org: {org}") + + # Track contributors across all repos + global_contributors = defaultdict(lambda: { + "total_commits": 0, + "repos": set(), + "emails": set(), + "username": None, + "company": None, + "profile": None + }) + + # Track contributors by repo + repo_contributors = {} + + for 
repo in repos: + logging.info(f"[analyze_contributors] Processing repo: {repo}") + commits = get_commits(org, repo) + repo_contributor_stats = defaultdict(lambda: { + "commits": 0, + "emails": set(), + "username": None + }) + + for commit in commits: + author = commit.get("commit", {}).get("author", {}) + github_author = commit.get("author") + + author_name = author.get("name", "Unknown") + author_email = author.get("email", "") + username = github_author.get("login") if github_author else None + + # Since we can assume GitHub username info is always there, use it as the primary key + contributor_key = username + if not contributor_key: + raise ValueError(f"Commit {commit['sha']} in repo {repo} has no identifiable contributor information.") + + # Update repo-specific stats + repo_contributor_stats[contributor_key]["commits"] += 1 + if author_email: + repo_contributor_stats[contributor_key]["emails"].add(author_email) + if username: + repo_contributor_stats[contributor_key]["username"] = username + + # Update global stats + global_contributors[contributor_key]["total_commits"] += 1 + global_contributors[contributor_key]["repos"].add(repo) + if author_email: + global_contributors[contributor_key]["emails"].add(author_email) + if username: + global_contributors[contributor_key]["username"] = username + + # Convert sets to lists for YAML serialization + repo_contributors[repo] = [] + for contributor_key, stats in repo_contributor_stats.items(): + repo_contributors[repo].append({ + "contributor": contributor_key, + "commits": stats["commits"], + "emails": list(stats["emails"]), + "username": stats["username"] + }) + + # Sort by commit count + repo_contributors[repo].sort(key=lambda x: x["commits"], reverse=True) + + logging.info(f"[analyze_contributors] Found {len(repo_contributors[repo])} contributors for repo: {repo}") + + # Enhance global contributors with profile and company information + logging.info(f"[analyze_contributors] Enhancing contributor information with profiles and companies") + for contributor_key, stats in global_contributors.items(): + # First, try to extract company from email addresses (prioritize this) + if stats["emails"]: + for email in stats["emails"]: + company_from_email = extract_company_from_email(email) + if company_from_email: + stats["company"] = company_from_email + break + + # Only if email didn't provide a clear company mapping, try GitHub profile + if not stats["company"] and stats["username"]: + profile = get_user_profile(stats["username"]) + stats["profile"] = profile + + # Try to extract company from profile + company_from_profile = extract_company_from_profile(profile) + if company_from_profile: + stats["company"] = company_from_profile + + # Convert sets to lists for YAML serialization + stats["repos"] = list(stats["repos"]) + stats["emails"] = list(stats["emails"]) + + logging.info(f"[analyze_contributors] Finished analyzing contributors for org: {org}") + return global_contributors, repo_contributors + + +def save_to_yaml(data: Dict, filename: str = "contributors_summary.yml"): + """Save data to YAML file.""" + # Create reports directory if it doesn't exist + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + + # Build full path with reports directory + filepath = os.path.join(reports_dir, filename) + logging.info(f"[save_to_yaml] Saving contributor data to {filepath}") + + # Convert defaultdict to regular dict to avoid YAML serialization issues + if hasattr(data, "default_factory"): + data = dict(data) + + with open(filepath, "w") as 
f: + yaml.dump(data, f, sort_keys=False, default_flow_style=False) + + logging.info(f"[save_to_yaml] Data successfully saved to {filepath}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze GitHub org contributor patterns and company affiliations." + ) + parser.add_argument( + "--org", + type=str, + default="pytorch-labs", + help="GitHub organization to analyze (default: pytorch-labs)", + ) + args = parser.parse_args() + + global ORG_NAME + ORG_NAME = args.org + + if not GITHUB_TOKEN: + logging.error("[main] Missing GITHUB_TOKEN in environment variables.") + return + + logging.info(f"[main] Starting contributor analysis for org: {ORG_NAME}") + + # Show cache stats at start + cache_stats = get_cache_stats() + logging.info( + f"[main] Cache stats: {cache_stats['total_files']} files, {cache_stats['total_size_mb']} MB" + ) + + # Get repositories + repos = get_repos(ORG_NAME) + filtered_repos = [ + repo for repo in repos if f"{ORG_NAME}/{repo}" not in EXCLUDED_REPOS + ] + + logging.info(f"[main] Analyzing {len(filtered_repos)} repositories (excluded {len(repos) - len(filtered_repos)})") + + # Analyze contributors + global_contributors, repo_contributors = analyze_contributors(ORG_NAME, filtered_repos) + + # Sort contributors by frequency + contributors_by_frequency = [] + for contributor_key, stats in global_contributors.items(): + contributors_by_frequency.append({ + "contributor": contributor_key, + "total_commits": stats["total_commits"], + "repos_count": len(stats["repos"]), + "repos": stats["repos"], + "emails": stats["emails"], + "username": stats["username"], + "company": stats["company"] + }) + + contributors_by_frequency.sort(key=lambda x: x["total_commits"], reverse=True) + + # Group contributors by company + company_analysis = defaultdict(list) + unidentified_contributors = [] + + for contributor in contributors_by_frequency: + if contributor["company"]: + company_analysis[contributor["company"]].append({ + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"] + }) + else: + unidentified_contributors.append({ + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"], + "emails": contributor["emails"] + }) + + # Sort company contributors by commit count + for company in company_analysis: + company_analysis[company].sort(key=lambda x: x["total_commits"], reverse=True) + + # Prepare output data + output_data = { + "analysis_metadata": { + "organization": ORG_NAME, + "analysis_date": datetime.utcnow().isoformat() + "Z", + "lookback_period_days": 180, + "repositories_analyzed": len(filtered_repos), + "total_contributors": len(contributors_by_frequency), + "contributors_with_company": len(contributors_by_frequency) - len(unidentified_contributors), + "contributors_without_company": len(unidentified_contributors) + }, + "contributors_by_frequency": contributors_by_frequency[:50], # Top 50 contributors + "company_analysis": dict(company_analysis), + "unidentified_contributors": unidentified_contributors[:20], # Top 20 unidentified + "contributors_by_repo": repo_contributors + } + + # Sort output for consistency + def deep_sort(obj, sort_keys=True): + if isinstance(obj, dict): + keys = sorted(obj) if sort_keys else obj.keys() + return {k: deep_sort(obj[k]) for k in keys} + elif isinstance(obj, list): + return [deep_sort(x) for x 
in obj] + else: + return obj + + # Don't sort top-level keys to maintain logical order + for key in ["company_analysis", "contributors_by_repo"]: + if key in output_data: + output_data[key] = deep_sort(output_data[key]) + + save_to_yaml(output_data) + + # Show final cache stats + final_cache_stats = get_cache_stats() + logging.info( + f"[main] Final cache stats: {final_cache_stats['total_files']} files, {final_cache_stats['total_size_mb']} MB" + ) + + # Print summary + print(f"\nAnalysis Summary:") + print(f"- Organization: {ORG_NAME}") + print(f"- Repositories analyzed: {len(filtered_repos)}") + print(f"- Total contributors: {len(contributors_by_frequency)}") + print(f"- Contributors with identified companies: {len(contributors_by_frequency) - len(unidentified_contributors)}") + print(f"- Top companies by contributor count:") + + # Show top companies + company_contributor_count = [(company, len(contributors)) for company, contributors in company_analysis.items()] + company_contributor_count.sort(key=lambda x: x[1], reverse=True) + + for company, count in company_contributor_count[:20]: + total_commits = sum(c["total_commits"] for c in company_analysis[company]) + print(f" - {company}: {count} contributors, {total_commits} total commits") + + # Show top contributors (>7 commits) with their repository breakdown + print(f"\nTop contributors (>7 commits):") + top_contributors = [c for c in contributors_by_frequency if c["total_commits"] > 7] + + for contributor in top_contributors: + contributor_key = contributor["contributor"] + + # Get repo-specific commit counts for this contributor + repo_commits = [] + for repo in contributor["repos"]: + # Find this contributor in the repo's contributor list + for repo_contrib in repo_contributors.get(repo, []): + if repo_contrib["contributor"] == contributor_key: + repo_commits.append(f"{repo}({repo_contrib['commits']})") + break + + # Sort by commit count (descending) + repo_commits.sort(key=lambda x: int(x.split('(')[1].split(')')[0]), reverse=True) + + # Format the contributor name (use username if available, otherwise email/name) + display_name = contributor["username"] if contributor["username"] else contributor_key + + print(f"- {display_name}, {', '.join(repo_commits)}") + + logging.info("[main] Script completed successfully.") + + +if __name__ == "__main__": + main() diff --git a/tools/analytics/org/analyze_repo_info.py b/tools/analytics/org/analyze_repo_info.py new file mode 100644 index 0000000000..da2263f3c1 --- /dev/null +++ b/tools/analytics/org/analyze_repo_info.py @@ -0,0 +1,259 @@ +""" +GitHub Organization Repository Information Analyzer +================================================== + +Purpose: +-------- +This script analyzes all repositories in a specified GitHub organization and outputs a CSV file with key repository information including visibility, archived status, and last commit date. + +Key Features: +------------- +- Fetches all repositories in a GitHub organization (including archived ones). +- Collects repository metadata including visibility, archived status, and last commit date. +- Outputs a CSV file with repository information for easy analysis. +- Caches GitHub API responses for efficiency and rate limit avoidance. + +How to Run: +----------- +1. Ensure you have Python 3.9+ and install dependencies (see below). +2. Set the following environment variable (can be in a .env file): + - `GITHUB_TOKEN`: A GitHub personal access token with `repo` read permissions. +3. 
Run the script: + + ```bash + python analyze_repo_info.py [--org ORG_NAME] + ``` + - Use `--org` to specify the GitHub organization to analyze (defaults to 'pytorch'). + +Dependencies: +------------- +- requests +- python-dotenv +- csv (built-in) + +Output: +------- +- `reports/repo_info_summary.csv`: A CSV file containing: + - Repo name (in org/repo format) + - Public (True if public, False if Private) + - Archived (True if archived, else False) + - Last commit date (date repo was last committed to, in YYYY-MM-DD format) +- Caches API responses in the `cache/` directory for faster reruns. + +Notes: +------ +- The script is safe to rerun; it uses caching to avoid redundant API calls. +- For large orgs, the script may take a while on the first run due to API rate limits. +""" + +import argparse +import csv +import logging +import os +from datetime import datetime +from pathlib import Path +from typing import Dict, List, Optional + +import requests +from cache_manager import get_cache_stats, make_cached_request +from dotenv import load_dotenv + + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +ORG_NAME = None # Will be set by argparse + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +BASE_URL = "https://api.github.com" + + +def get_repos_with_info(org: str) -> List[Dict]: + """ + Fetch all repositories for an organization with their metadata. + + Args: + org: The GitHub organization name + + Returns: + List of repository dictionaries with metadata + """ + logging.info(f"[get_repos_with_info] Start fetching repositories for org: {org}") + repos = [] + page = 1 + while True: + url = f"{BASE_URL}/orgs/{org}/repos?per_page=100&page={page}" + logging.debug(f"[get_repos_with_info] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None: + logging.error(f"[get_repos_with_info] Failed to fetch page {page} for org: {org}") + break + if not data: + logging.info( + f"[get_repos_with_info] No more repositories found on page {page} for org: {org}" + ) + break + logging.info( + f"[get_repos_with_info] Page {page}: Found {len(data)} repositories for org: {org}" + ) + repos.extend(data) + page += 1 + logging.info( + f"[get_repos_with_info] Finished fetching repositories for org: {org}. Total: {len(repos)}" + ) + return repos + + +def get_last_commit_date(org: str, repo: str) -> Optional[str]: + """ + Get the date of the last commit for a repository. 
+ + Args: + org: The GitHub organization name + repo: The repository name + + Returns: + Date string in YYYY-MM-DD format of the last commit, or None if no commits found + """ + logging.info(f"[get_last_commit_date] Getting last commit date for repo: {repo}") + url = f"{BASE_URL}/repos/{org}/{repo}/commits?per_page=1" + logging.debug(f"[get_last_commit_date] Requesting URL: {url}") + data = make_cached_request(url, HEADERS) + if data is None or not data: + logging.warning(f"[get_last_commit_date] No commits found for repo: {repo}") + return None + + if len(data) > 0: + commit_date = data[0]["commit"]["author"]["date"] + # Convert ISO format to YYYY-MM-DD format + try: + from datetime import datetime + dt = datetime.fromisoformat(commit_date.replace('Z', '+00:00')) + formatted_date = dt.strftime('%Y-%m-%d') + logging.info(f"[get_last_commit_date] Last commit date for {repo}: {formatted_date}") + return formatted_date + except (ValueError, AttributeError) as e: + logging.warning(f"[get_last_commit_date] Failed to parse date for {repo}: {e}") + return None + + return None + + +def process_repo_data(org: str, repos: List[Dict]) -> List[Dict]: + """ + Process repository data and add last commit date information. + + Args: + org: The GitHub organization name + repos: List of repository dictionaries from GitHub API + + Returns: + List of processed repository data with all required fields + """ + logging.info(f"[process_repo_data] Processing {len(repos)} repositories") + processed_repos = [] + + for i, repo in enumerate(repos, 1): + repo_name = repo["name"] + logging.info(f"[process_repo_data] Processing repo {i}/{len(repos)}: {repo_name}") + + # Get last commit date + last_commit_date = get_last_commit_date(org, repo_name) + + processed_repo = { + "repo_name": f"{org}/{repo_name}", + "public": repo.get("private", True) == False, # True if public, False if private + "archived": repo.get("archived", False), + "last_commit_date": last_commit_date + } + + processed_repos.append(processed_repo) + + logging.info(f"[process_repo_data] Finished processing {len(processed_repos)} repositories") + return processed_repos + + +def save_to_csv(data: List[Dict], filename: str = "repo_info_summary.csv"): + """ + Save repository data to a CSV file. + + Args: + data: List of repository dictionaries + filename: Name of the CSV file to create + """ + # Create reports directory if it doesn't exist + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + + # Build full path with reports directory + filepath = os.path.join(reports_dir, filename) + logging.info(f"[save_to_csv] Saving repository data to {filepath}") + + # Define CSV headers + fieldnames = ["repo_name", "public", "archived", "last_commit_date"] + + with open(filepath, "w", newline="", encoding="utf-8") as f: + writer = csv.DictWriter(f, fieldnames=fieldnames) + writer.writeheader() + writer.writerows(data) + + logging.info(f"[save_to_csv] Data successfully saved to {filepath}") + + +def main(): + parser = argparse.ArgumentParser( + description="Analyze GitHub org repository information." 
+ ) + parser.add_argument( + "--org", + type=str, + default="pytorch", + help="GitHub organization to analyze (default: pytorch)", + ) + args = parser.parse_args() + + global ORG_NAME + ORG_NAME = args.org + + if not GITHUB_TOKEN: + logging.error("[main] Missing GITHUB_TOKEN in environment variables.") + return + + logging.info(f"[main] Starting analysis for org: {ORG_NAME}") + + # Show cache stats at start + cache_stats = get_cache_stats() + logging.info( + f"[main] Cache stats: {cache_stats['total_files']} files, {cache_stats['total_size_mb']} MB" + ) + + # Step 1: Get all repositories with their metadata + repos = get_repos_with_info(ORG_NAME) + + # Step 2: Process repository data and add last commit dates + processed_repos = process_repo_data(ORG_NAME, repos) + + # Step 3: Save to CSV + save_to_csv(processed_repos) + + # Show final cache stats + final_cache_stats = get_cache_stats() + logging.info( + f"[main] Final cache stats: {final_cache_stats['total_files']} files, {final_cache_stats['total_size_mb']} MB" + ) + logging.info("[main] Script completed successfully.") + + +if __name__ == "__main__": + main() \ No newline at end of file diff --git a/tools/analytics/org/analyze_runner_usage.py b/tools/analytics/org/analyze_runner_usage.py index 4ff31306a1..f38af446ff 100644 --- a/tools/analytics/org/analyze_runner_usage.py +++ b/tools/analytics/org/analyze_runner_usage.py @@ -12,7 +12,7 @@ - For each repository, fetches recent workflow runs and extracts the runner labels used in jobs. - Aggregates runner label usage across repositories, including last usage and workflow file. - Compares runner labels against those defined in scale-config.yml and standard GitHub runners. -- Outputs a YAML summary (runner_labels_summary.yml) with detailed runner usage, repos by runner, and special groupings (e.g., runners not in scale-config, repos with zero workflow runs). +- Outputs a YAML summary (reports/runner_labels_summary.yml) with detailed runner usage, repos by runner, and special groupings (e.g., runners not in scale-config, repos with zero workflow runs). - Caches GitHub API responses for efficiency and rate limit avoidance. How to Run: @@ -38,7 +38,7 @@ Output: ------- -- `runner_labels_summary.yml`: A YAML file containing: +- `reports/runner_labels_summary.yml`: A YAML file containing: - `runners_used`: For each runner label, a list of repos, last usage, and workflow file. - `repo_runners`: For each repo, a list of runner labels it uses. - `repositories_with_zero_workflow_runs`: Repos with no workflow runs in the lookback period. 
@@ -391,13 +391,20 @@ def process_repo_runs( def save_to_yaml(data: Dict, filename: str = "runner_labels_summary.yml"): - logging.info(f"[save_to_yaml] Saving runner label data to {filename}") + # Create reports directory if it doesn't exist + reports_dir = "reports" + os.makedirs(reports_dir, exist_ok=True) + + # Build full path with reports directory + filepath = os.path.join(reports_dir, filename) + logging.info(f"[save_to_yaml] Saving runner label data to {filepath}") + # Convert defaultdict to regular dict to avoid YAML serialization issues if hasattr(data, "default_factory"): data = dict(data) - with open(filename, "w") as f: + with open(filepath, "w") as f: yaml.dump(data, f, sort_keys=False) - logging.info(f"[save_to_yaml] Data successfully saved to {filename}") + logging.info(f"[save_to_yaml] Data successfully saved to {filepath}") def download_scale_config(url: str, dest: str = "scale-config.yml") -> bool: diff --git a/tools/analytics/org/github_code_search.py b/tools/analytics/org/github_code_search.py new file mode 100644 index 0000000000..56788873ff --- /dev/null +++ b/tools/analytics/org/github_code_search.py @@ -0,0 +1,589 @@ +# /// script +# requires-python = ">=3.10" +# dependencies = [ +# "requests>=2.31.0", +# "python-dotenv>=1.0.0", +# ] +# /// + +""" +GitHub Code Search API Script +============================ + +Purpose: +-------- +This script uses GitHub's Search API to perform code searches equivalent to the GitHub web interface. +It can search for code across organizations, repositories, and files with various filters. + +Key Features: +------------- +- Search code across GitHub organizations +- Filter by repository, language, file extension, etc. +- Handle GitHub API rate limits +- Cache results for efficiency +- Export results to various formats + +How to Run: +----------- +1. Ensure you have Python 3.10+ and install dependencies (see below). +2. Set the following environment variable (can be in a .env file): + - `GITHUB_TOKEN`: A GitHub personal access token with appropriate permissions. +3. 
Run the script: + + ```bash + python github_code_search.py --query "org:meta-pytorch pytorch-labs" [options] + ``` + +Examples: +--------- +```bash +# Search for "pytorch-labs" in meta-pytorch organization +python github_code_search.py --query "org:meta-pytorch pytorch-labs" + +# Search for specific file types +python github_code_search.py --query "org:meta-pytorch filename:README.md" + +# Search for code in specific language +python github_code_search.py --query "org:meta-pytorch language:python pytorch-labs" + +# Export results to JSON +python github_code_search.py --query "org:meta-pytorch pytorch-labs" --output results.json +``` + +Output: +------- +- Console output with search results +- Optional JSON/CSV export +- Rate limit information +- Search statistics +""" + +import argparse +import json +import logging +import os +import time +from datetime import datetime, timezone +from typing import Dict, List, Optional, Any, TypedDict, Union +from urllib.parse import quote_plus +from dataclasses import dataclass + +import requests +from dotenv import load_dotenv + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +BASE_URL = "https://api.github.com" +SEARCH_URL = f"{BASE_URL}/search/code" + + +# Type definitions for well-defined schema +class RepositoryInfo(TypedDict): + """Repository information from GitHub search results.""" + id: int + node_id: str + name: str + full_name: str + private: bool + owner: Dict[str, Any] # GitHub user/org object + html_url: str + description: Optional[str] + fork: bool + url: str + forks_url: str + keys_url: str + collaborators_url: str + teams_url: str + hooks_url: str + issue_events_url: str + events_url: str + assignees_url: str + branches_url: str + tags_url: str + blobs_url: str + git_tags_url: str + git_refs_url: str + trees_url: str + statuses_url: str + languages_url: str + stargazers_url: str + contributors_url: str + subscribers_url: str + subscription_url: str + commits_url: str + git_commits_url: str + comments_url: str + issue_comment_url: str + contents_url: str + compare_url: str + merges_url: str + archive_url: str + downloads_url: str + issues_url: str + pulls_url: str + milestones_url: str + notifications_url: str + labels_url: str + releases_url: str + deployments_url: str + created_at: str + updated_at: str + pushed_at: str + git_url: str + ssh_url: str + clone_url: str + svn_url: str + homepage: Optional[str] + size: int + stargazers_count: int + watchers_count: int + language: Optional[str] + has_issues: bool + has_projects: bool + has_downloads: bool + has_wiki: bool + has_pages: bool + has_discussions: bool + forks_count: int + mirror_url: Optional[str] + archived: bool + disabled: bool + open_issues_count: int + license: Optional[Dict[str, Any]] + allow_forking: bool + is_template: bool + web_commit_signoff_required: bool + topics: List[str] + visibility: str + forks: int + open_issues: int + watchers: int + default_branch: str + score: float + + +class SearchResultItem(TypedDict): + """Individual search result item from GitHub code search.""" + name: str + path: str + sha: str + url: str + git_url: str + html_url: str + repository: RepositoryInfo + score: float + file_size: Optional[int] + language: Optional[str] + last_modified_at: Optional[str] + line_numbers: 
Optional[List[int]] + text_matches: Optional[List[Dict[str, Any]]] + + +class GitHubSearchResults(TypedDict): + """Complete search results from GitHub Search API.""" + query: str + total_count: int + retrieved_count: int + items: List[SearchResultItem] + search_time: str + rate_limit_remaining: Optional[int] + rate_limit_reset: Optional[str] + + +@dataclass +class SearchOptions: + """Options for GitHub code search.""" + per_page: int = 100 + max_results: Optional[int] = None + verbose: bool = True + + +class GitHubCodeSearch: + def __init__(self, token: str = None): + """ + Initialize GitHub Code Search client. + + Args: + token: GitHub personal access token. If None, will try to get from GITHUB_TOKEN env var. + """ + self.token = token or GITHUB_TOKEN + if not self.token: + raise ValueError("GitHub token is required. Set GITHUB_TOKEN environment variable or pass token parameter.") + + self.headers = { + "Authorization": f"Bearer {self.token}", + "Accept": "application/vnd.github+json", + } + self.session = requests.Session() + self.session.headers.update(self.headers) + + def search_code(self, query: str, per_page: int = 100, max_results: Optional[int] = None, + verbose: bool = True) -> GitHubSearchResults: + """ + Search for code using GitHub's Search API. + + Args: + query: Search query string + per_page: Number of results per page (max 100) + max_results: Maximum number of results to return (None for all) + verbose: Whether to log progress messages + + Returns: + GitHubSearchResults: Well-defined structure containing: + - query: The search query used + - total_count: Total number of results available from GitHub + - retrieved_count: Number of results actually retrieved + - items: List of SearchResultItem objects with file details + - search_time: ISO timestamp of when search was performed + - rate_limit_remaining: Remaining API calls (if available) + - rate_limit_reset: When rate limit resets (if available) + """ + all_items = [] + page = 1 + total_count = 0 + + if verbose: + logging.info(f"Starting code search with query: {query}") + + while True: + # Check rate limits + rate_limit_info = self._check_rate_limit() + if rate_limit_info['remaining'] == 0: + reset_time = rate_limit_info['reset_time'] + wait_time = max(0, reset_time - time.time()) + if verbose: + logging.warning(f"Rate limit exceeded. 
Waiting {wait_time:.0f} seconds...") + time.sleep(wait_time + 1) + + # Prepare request parameters + params = { + 'q': query, + 'per_page': min(per_page, 100), + 'page': page + } + + try: + if verbose: + logging.info(f"Fetching page {page}...") + response = self.session.get(SEARCH_URL, params=params) + response.raise_for_status() + + data = response.json() + + # Update total count on first page + if page == 1: + total_count = data.get('total_count', 0) + if verbose: + logging.info(f"Total results found: {total_count}") + + items = data.get('items', []) + if not items: + break + + all_items.extend(items) + if verbose: + logging.info(f"Retrieved {len(items)} items from page {page} (total: {len(all_items)})") + + # Check if we've reached the maximum results + if max_results and len(all_items) >= max_results: + all_items = all_items[:max_results] + if verbose: + logging.info(f"Reached maximum results limit: {max_results}") + break + + # Check if there are more pages + if len(items) < per_page: + break + + page += 1 + + # Be respectful to the API + time.sleep(1) + + except requests.exceptions.RequestException as e: + logging.error(f"Error fetching page {page}: {e}") + break + except json.JSONDecodeError as e: + logging.error(f"Error parsing JSON response from page {page}: {e}") + break + + # Get rate limit info for the response + rate_limit_info = self._check_rate_limit() + + return GitHubSearchResults( + query=query, + total_count=total_count, + retrieved_count=len(all_items), + items=all_items, + search_time=datetime.now(timezone.utc).isoformat(), + rate_limit_remaining=rate_limit_info.get('remaining'), + rate_limit_reset=datetime.fromtimestamp(rate_limit_info.get('reset_time', 0)).isoformat() if rate_limit_info.get('reset_time') else None + ) + + def get_rate_limit(self) -> Dict[str, Any]: + """Get GitHub API rate limit status.""" + return self._check_rate_limit() + + def _check_rate_limit(self) -> Dict[str, Any]: + """Check GitHub API rate limit status.""" + try: + response = self.session.get(f"{BASE_URL}/rate_limit") + response.raise_for_status() + data = response.json() + + search_limit = data.get('resources', {}).get('search', {}) + return { + 'limit': search_limit.get('limit', 0), + 'remaining': search_limit.get('remaining', 0), + 'reset_time': search_limit.get('reset', 0) + } + except Exception as e: + logging.warning(f"Could not check rate limit: {e}") + return {'limit': 0, 'remaining': 0, 'reset_time': 0} + + def format_results(self, results: GitHubSearchResults, format_type: str = 'console') -> str: + """Format search results for different output types.""" + if format_type == 'json': + return json.dumps(results, indent=2) + + elif format_type == 'console': + output = [] + output.append(f"=== GitHub Code Search Results ===") + output.append(f"Query: {results['query']}") + output.append(f"Total results: {results['total_count']}") + output.append(f"Retrieved: {results['retrieved_count']}") + output.append(f"Search time: {results['search_time']}") + output.append("") + + for i, item in enumerate(results['items'], 1): + repo_name = item.get('repository', {}).get('full_name', 'Unknown') + file_path = item.get('path', 'Unknown') + file_url = item.get('html_url', '') + score = item.get('score', 0) + + output.append(f"{i}. 
{repo_name}/{file_path}") + output.append(f" Score: {score}") + output.append(f" URL: {file_url}") + output.append("") + + return "\n".join(output) + + elif format_type == 'csv': + import csv + import io + + output = io.StringIO() + writer = csv.writer(output) + + # Write header + writer.writerow(['Repository', 'File Path', 'Score', 'URL', 'Search Time']) + + # Write data + for item in results['items']: + repo_name = item.get('repository', {}).get('full_name', 'Unknown') + file_path = item.get('path', 'Unknown') + file_url = item.get('html_url', '') + score = item.get('score', 0) + + writer.writerow([repo_name, file_path, score, file_url, results['search_time']]) + + return output.getvalue() + + else: + raise ValueError(f"Unsupported format type: {format_type}") + + def get_file_paths(self, results: GitHubSearchResults) -> List[str]: + """Extract just the file paths from search results.""" + return [item.get('path', '') for item in results.get('items', [])] + + def get_repositories(self, results: GitHubSearchResults) -> List[str]: + """Extract just the repository names from search results.""" + return [item.get('repository', {}).get('full_name', '') for item in results.get('items', [])] + + def get_unique_repositories(self, results: GitHubSearchResults) -> List[str]: + """Extract unique repository names from search results.""" + repos = self.get_repositories(results) + return list(set(repos)) + + def filter_by_score(self, results: GitHubSearchResults, min_score: float = 0.0) -> GitHubSearchResults: + """Filter results by minimum score.""" + filtered_items = [ + item for item in results.get('items', []) + if item.get('score', 0) >= min_score + ] + + return GitHubSearchResults( + query=results['query'], + total_count=results['total_count'], + retrieved_count=len(filtered_items), + items=filtered_items, + search_time=results['search_time'], + rate_limit_remaining=results.get('rate_limit_remaining'), + rate_limit_reset=results.get('rate_limit_reset') + ) + + def filter_by_repository(self, results: GitHubSearchResults, repo_pattern: str) -> GitHubSearchResults: + """Filter results by repository name pattern.""" + import re + pattern = re.compile(repo_pattern) + + filtered_items = [ + item for item in results.get('items', []) + if pattern.search(item.get('repository', {}).get('full_name', '')) + ] + + return GitHubSearchResults( + query=results['query'], + total_count=results['total_count'], + retrieved_count=len(filtered_items), + items=filtered_items, + search_time=results['search_time'], + rate_limit_remaining=results.get('rate_limit_remaining'), + rate_limit_reset=results.get('rate_limit_reset') + ) + + +def search_github_code(query: str, token: str = None, per_page: int = 100, + max_results: Optional[int] = None, verbose: bool = True) -> GitHubSearchResults: + """ + Convenience function to search GitHub code. 
+ + Args: + query: Search query string + token: GitHub personal access token (optional, will use GITHUB_TOKEN env var if not provided) + per_page: Number of results per page (max 100) + max_results: Maximum number of results to return (None for all) + verbose: Whether to log progress messages + + Returns: + GitHubSearchResults: Well-defined structure containing search results with the following fields: + - query: The search query used + - total_count: Total number of results available from GitHub + - retrieved_count: Number of results actually retrieved + - items: List of SearchResultItem objects, each containing: + - name: File name + - path: File path in repository + - sha: Git SHA of the file + - url: API URL for the file + - html_url: Web URL for the file + - repository: RepositoryInfo object with full repo details + - score: Relevance score (0-100) + - file_size: File size in bytes (if available) + - language: Programming language (if detected) + - last_modified_at: Last modification time (if available) + - line_numbers: Line numbers where matches were found (if available) + - text_matches: Detailed text match information (if available) + - search_time: ISO timestamp of when search was performed + - rate_limit_remaining: Remaining API calls (if available) + - rate_limit_reset: When rate limit resets (if available) + """ + searcher = GitHubCodeSearch(token) + return searcher.search_code(query, per_page, max_results, verbose) + + +def main(): + parser = argparse.ArgumentParser( + description="Search GitHub code using the GitHub Search API" + ) + parser.add_argument( + "--query", + type=str, + required=True, + help="Search query (e.g., 'org:meta-pytorch pytorch-labs')", + ) + parser.add_argument( + "--per-page", + type=int, + default=100, + help="Number of results per page (max 100, default: 100)", + ) + parser.add_argument( + "--max-results", + type=int, + help="Maximum number of results to retrieve (default: all)", + ) + parser.add_argument( + "--output", + type=str, + help="Output file path (e.g., 'results.json' or 'results.csv')", + ) + parser.add_argument( + "--format", + type=str, + choices=['console', 'json', 'csv'], + default='console', + help="Output format (default: console)", + ) + parser.add_argument( + "--show-rate-limit", + action="store_true", + help="Show rate limit information before searching", + ) + + args = parser.parse_args() + + if not GITHUB_TOKEN: + logging.error("Missing GITHUB_TOKEN in environment variables.") + return + + # Create search instance + searcher = GitHubCodeSearch() + + # Show rate limit if requested + if args.show_rate_limit: + rate_limit = searcher.get_rate_limit() + print(f"Rate limit: {rate_limit['remaining']}/{rate_limit['limit']} remaining") + if rate_limit['remaining'] == 0: + reset_time = datetime.fromtimestamp(rate_limit['reset_time']) + print(f"Rate limit resets at: {reset_time}") + print() + + # Perform search + results = searcher.search_code( + query=args.query, + per_page=args.per_page, + max_results=args.max_results + ) + + # Format and output results + if args.output: + # Determine format from file extension + if args.output.endswith('.json'): + output_format = 'json' + elif args.output.endswith('.csv'): + output_format = 'csv' + else: + output_format = args.format + + formatted_output = searcher.format_results(results, output_format) + + with open(args.output, 'w', encoding='utf-8') as f: + f.write(formatted_output) + + print(f"Results saved to: {args.output}") + + # Also show console summary + console_output = 
searcher.format_results(results, 'console')
+        print(console_output)
+    else:
+        # Just show console output
+        formatted_output = searcher.format_results(results, args.format)
+        print(formatted_output)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
diff --git a/tools/analytics/org/remove_pytorch_labs.py b/tools/analytics/org/remove_pytorch_labs.py
new file mode 100644
index 0000000000..9e246daf07
--- /dev/null
+++ b/tools/analytics/org/remove_pytorch_labs.py
@@ -0,0 +1,636 @@
+# /// script
+# requires-python = ">=3.10"
+# dependencies = [
+#     "requests>=2.31.0",
+#     "python-dotenv>=1.0.0",
+# ]
+# ///
+
+"""
+GitHub Organization Text Replacement Script
+==========================================
+
+Purpose:
+--------
+This script replaces all instances of "pytorch-labs" with "meta-pytorch" across all repositories
+in a specified GitHub organization and creates pull requests for each repository with changes.
+
+Key Features:
+-------------
+- Uses pre-defined list of files known to contain "pytorch-labs" mentions (optimized for performance). This list was obtained by running codesea
+- Replaces all instances of "pytorch-labs" with "meta-pytorch" in target files.
+- Creates a new branch and commits changes for each repository.
+- Creates pull requests with descriptive titles and descriptions.
+- Caches GitHub API responses for efficiency and rate limit avoidance.
+
+How to Run:
+-----------
+1. Ensure you have Python 3.10+ and install dependencies (see below).
+2. Set the following environment variable (can be in a .env file):
+   - `GITHUB_TOKEN`: A GitHub personal access token with `repo` permissions.
+3. Run the script:
+
+   ```bash
+   python remove_pytorch_labs.py [--org ORG_NAME] [--repos REPO_LIST] [--dry-run]
+   ```
+   - Use `--org` to specify the GitHub organization to analyze (defaults to 'pytorch').
+   - Use `--repos` to specify a comma-separated list of repositories to process (e.g., 'pytorch,vision,tutorials').
+   - Use `--dry-run` to preview changes without making them.
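As a rough illustration of the `--repos` filtering described above (the script's real logic lives in `get_target_repos` later in this file), here is a minimal, self-contained sketch; `select_repos` and the three-entry `TARGET_FILES` stub are invented for the example and only borrow a few entries from the real mapping:

```python
from typing import Dict, List, Optional

# Tiny stand-in for the real TARGET_FILES mapping defined later in the script.
TARGET_FILES: Dict[str, List[str]] = {
    "pytorch": ["android/README.md"],
    "vision": ["torchvision/io/image.py"],
    "tutorials": ["index.rst"],
}


def select_repos(repos_arg: Optional[str]) -> List[str]:
    """Intersect the user-supplied --repos list with repos that have target files."""
    candidates = list(TARGET_FILES)
    if not repos_arg:
        return candidates
    requested = [r.strip() for r in repos_arg.split(",")]
    return [r for r in candidates if r in requested]


print(select_repos("pytorch,vision,torchaudio"))  # -> ['pytorch', 'vision']
```

Repos requested on the command line but absent from the target mapping (like `torchaudio` above) are simply dropped, which matches the warning the script logs for filtered-out repositories.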
+ + +Output: +------- +- Logs all operations to console and file +- Creates pull requests for repositories with changes +- Summary report of operations performed + +Notes: +------ +- Only processes 72 pre-identified files that contain "pytorch-labs" mentions +- Skips binary files and files larger than 1MB +- Creates one PR per repository with changes +- Handles GitHub API rate limits automatically +- Significantly faster than scanning all files in all repositories +""" + +import argparse +import base64 +import json +import logging +import os +import re +from datetime import datetime, timezone +from typing import Dict, List, Optional, Tuple + +import requests +from cache_manager import get_cache_stats, make_cached_request +from dotenv import load_dotenv + +load_dotenv() + +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s [%(levelname)s] %(message)s", + datefmt="%Y-%m-%d %H:%M:%S", +) + +GITHUB_TOKEN = os.getenv("GITHUB_TOKEN") +ORG_NAME = None # Will be set by argparse +DRY_RUN = False # Will be set by argparse + +# GitHub API headers +HEADERS = { + "Authorization": f"Bearer {GITHUB_TOKEN}", + "Accept": "application/vnd.github+json", +} + +BASE_URL = "https://api.github.com" + +# Text to replace +OLD_TEXT = "pytorch-labs" +NEW_TEXT = "meta-pytorch" + +# Maximum file size to process (1MB) +MAX_FILE_SIZE = 1024 * 1024 + +# Pre-defined list of files that contain "pytorch-labs" mentions +# This is based on search results and will significantly improve performance +TARGET_FILES = { + "pytorch": [ + "android/README.md", + "aten/src/ATen/native/cuda/int4mm.cu", + "torch/testing/_internal/common_quantization.py" + ], + "vision": [ + "torchvision/io/image.py" + ], + "tutorials": [ + "index.rst", + "docathon-leaderboard.md", + "intermediate_source/transformer_building_blocks.py", + "unstable_source/gpu_quantization_torchao_tutorial.py" + ], + "executorch": [ + "docs/source/index.md", + "docs/source/getting-started.md", + "backends/apple/mps/setup.md", + "docs/source/backends-mps.md", + "docs/source/llm/run-with-c-plus-plus.md", + "docs/source/using-executorch-android.md", + "docs/source/using-executorch-export.md", + "docs/source/using-executorch-building-from-source.md", + "docs/source/using-executorch-cpp.md", + "examples/models/llama/experimental/generate.py", + "scripts/test_ios.sh", + ".ci/scripts/test_ios_ci.sh", + "backends/test/facto/test_facto.py" + ], + "ao": [ + "scripts/download.py", + "torchao/_models/llama/tokenizer.py", + "scripts/convert_hf_checkpoint.py", + "examples/sam2_amg_server/annotate_with_rle.py", + "torchao/prototype/mx_formats/kernels.py", + "torchao/_models/sam/README.md", + "torchao/quantization/README.md", + "test/integration/test_integration.py", + ".github/workflows/dashboard_perf_test.yml" + ], + "benchmark": [ + "torchbenchmark/models/simple_gpt/origin", + "torchbenchmark/models/sam_fast/requirements.txt" + ], + "torchtune": [ + "docs/source/tutorials/qlora_finetune.rst", + "recipes/eleuther_eval.py", + "docs/source/tutorials/e2e_flow.rst", + "torchtune/generation/_generation.py", + "docs/source/tutorials/llama3.rst", + "README.md" + ], + "torchft": [ + "docs/source/protocol.rst", + "docs/source/assumptions_and_recommendations.rst", + "docs/source/conf.py", + "docs/source/index.rst", + "README.md" + ], + "torchchat": [ + "torchchat/usages/eval.py", + "README.md" + ], + "rl": [ + "examples/rlhf/requirements.txt" + ], + "builder": [ + "CUDA_UPGRADE_GUIDE.MD" + ], + "helion": [ + "benchmarks/run.py", + "benchmarks/README.md" + ], + "torchcodec": [ + 
"src/torchcodec/_core/SingleStreamDecoder.cpp" + ], + "test-infra": [ + "aws/lambda/README.md", + "torchci/clickhouse_queries/queued_jobs_aggregate/query.sql", + "tools/torchfix/README.md", + ".github/workflows/trigger_nightly.yml" + ], + "ci-infra": [ + "arc-backup-2024/scripts/deployment.py" + ], + "oss-docathons": [ + "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.md", + "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.csv", + ".github/scripts/pytorch-docathon-h1-2024.py" + ], + "serve": [ + "examples/large_models/segment_anything_fast/install_segment_anything_fast.sh", + "examples/large_models/gpt_fast/README.md", + "examples/large_models/gpt_fast_mixtral_moe/README.md", + "examples/large_models/diffusion_fast/README.md", + "examples/large_models/segment_anything_fast/README.md", + "kubernetes/kserve/examples/gpt_fast/README.md" + ], + "xla": [ + "torchax/test/llama/llama_model.py" + ], + "pytorch-canary": [ + "torch/testing/_internal/common_quantization.py" + ], + "pytorch-integration-testing": [ + ".github/scripts/generate_vllm_benchmark_matrix.py" + ], + "torcheval": [ + ".github/PULL_REQUEST_TEMPLATE.md", + ".github/ISSUE_TEMPLATE/bug-report.yml" + ] +} + + +def get_target_repos(org: str, filter_repos: Optional[List[str]] = None) -> List[str]: + """Get only the repositories that have files with 'pytorch-labs' mentions.""" + if org not in TARGET_FILES: + logging.info(f"[get_target_repos] No target files found for org: {org}") + return [] + + all_repos = list(TARGET_FILES.keys()) + + if filter_repos: + # Filter to only include repos that are in both the target files and the filter list + repos = [repo for repo in all_repos if repo in filter_repos] + logging.info(f"[get_target_repos] Filtered to {len(repos)} repositories from {len(all_repos)} available") + + # Log which repos were filtered out + filtered_out = [repo for repo in filter_repos if repo not in all_repos] + if filtered_out: + logging.warning(f"[get_target_repos] Repositories not found in target files: {filtered_out}") + else: + repos = all_repos + logging.info(f"[get_target_repos] Found {len(repos)} repositories with target files for org: {org}") + + return repos + + +def get_default_branch(org: str, repo: str) -> Optional[str]: + """Get the default branch for a repository.""" + url = f"{BASE_URL}/repos/{org}/{repo}" + data = make_cached_request(url, HEADERS) + if data: + return data.get("default_branch", "main") + return None + + +def get_target_files_for_repo(org: str, repo: str) -> List[str]: + """Get the list of target files for a specific repository.""" + if repo not in TARGET_FILES: + logging.info(f"[get_target_files_for_repo] No target files found for {org}/{repo}") + return [] + + files = TARGET_FILES[repo] + logging.info(f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}") + return files + + +def get_file_content(org: str, repo: str, file_path: str) -> Optional[str]: + """Get the content of a file from GitHub.""" + url = f"{BASE_URL}/repos/{org}/{repo}/contents/{file_path}" + data = make_cached_request(url, HEADERS) + if not data: + return None + + # Check file size + if data.get("size", 0) > MAX_FILE_SIZE: + logging.warning(f"[get_file_content] File {file_path} too large ({data['size']} bytes), skipping") + return None + + # Decode content + try: + content = base64.b64decode(data["content"]).decode("utf-8") + return content + except (UnicodeDecodeError, Exception) as e: + logging.warning(f"[get_file_content] Failed to decode {file_path}: {e}") + return None + + +def 
find_and_replace_in_file(org: str, repo: str, file_path: str) -> Optional[Tuple[str, str]]: + """Find and replace text in a file. Returns (old_content, new_content) if changes needed.""" + content = get_file_content(org, repo, file_path) + if content is None: + return None + + # Check if file contains the target text + if OLD_TEXT not in content: + return None + + # Replace all instances + new_content = content.replace(OLD_TEXT, NEW_TEXT) + + # Check if any changes were made + if new_content == content: + return None + + logging.info(f"[find_and_replace_in_file] Found {content.count(OLD_TEXT)} instances in {file_path}") + return content, new_content + + +def create_branch(org: str, repo: str, base_branch: str, new_branch: str) -> bool: + """Create a new branch from the base branch.""" + if DRY_RUN: + logging.info(f"[create_branch] DRY RUN: Would create branch {new_branch} in {org}/{repo}") + return True + + # Get the SHA of the base branch + url = f"{BASE_URL}/repos/{org}/{repo}/branches/{base_branch}" + branch_data = make_cached_request(url, HEADERS) + if not branch_data: + logging.error(f"[create_branch] Failed to get base branch {base_branch}") + return False + + base_sha = branch_data["commit"]["sha"] + + # Create the new branch + url = f"{BASE_URL}/repos/{org}/{repo}/git/refs" + data = { + "ref": f"refs/heads/{new_branch}", + "sha": base_sha + } + + response = requests.post(url, headers=HEADERS, json=data) + if response.status_code == 201: + logging.info(f"[create_branch] Created branch {new_branch} in {org}/{repo}") + return True + elif response.status_code == 422: # Branch already exists + logging.info(f"[create_branch] Branch {new_branch} already exists in {org}/{repo}") + return True + else: + logging.error(f"[create_branch] Failed to create branch {new_branch}: {response.status_code} - {response.text}") + return False + + +def create_file_commit(org: str, repo: str, file_path: str, content: str, branch: str, message: str) -> bool: + """Create a commit to update a file.""" + if DRY_RUN: + logging.info(f"[create_file_commit] DRY RUN: Would update {file_path} in {org}/{repo}") + return True + + # First get the current file to get its SHA + url = f"{BASE_URL}/repos/{org}/{repo}/contents/{file_path}" + current_file_data = make_cached_request(url, HEADERS) + if not current_file_data: + logging.error(f"[create_file_commit] Failed to get current file data for {file_path}") + return False + + current_sha = current_file_data.get("sha") + if not current_sha: + logging.error(f"[create_file_commit] No SHA found for {file_path}") + return False + + # Update the file with the SHA + data = { + "message": message, + "content": base64.b64encode(content.encode("utf-8")).decode("utf-8"), + "sha": current_sha, + "branch": branch + } + + response = requests.put(url, headers=HEADERS, json=data) + if response.status_code in [200, 201]: + logging.info(f"[create_file_commit] Updated {file_path} in {org}/{repo}") + return True + else: + logging.error(f"[create_file_commit] Failed to update {file_path}: {response.status_code} - {response.text}") + return False + + +def check_existing_pr(org: str, repo: str, title: str) -> Optional[str]: + """Check if there's already an open PR with the same title. 
Returns PR URL if found, None otherwise.""" + url = f"{BASE_URL}/repos/{org}/{repo}/pulls?state=open&per_page=100" + + # Don't use cache for PR checks since PR status can change quickly + logging.info(f"[check_existing_pr] Making direct request to check PRs for {org}/{repo}") + try: + response = requests.get(url, headers=HEADERS) + response.raise_for_status() + data = response.json() + + for pr in data: + if pr.get("title") == title: + pr_url = pr['html_url'] + logging.info(f"[check_existing_pr] Found existing open PR with same title in {org}/{repo}: {pr_url}") + return pr_url + + logging.info(f"[check_existing_pr] No existing PR found for {org}/{repo}") + return None + + except requests.exceptions.RequestException as e: + logging.warning(f"[check_existing_pr] Failed to get PRs for {org}/{repo}: {e}") + return None + except json.JSONDecodeError as e: + logging.warning(f"[check_existing_pr] Failed to parse JSON response for {org}/{repo}: {e}") + return None + + +def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> Optional[str]: + """Create a pull request and return the PR URL.""" + if DRY_RUN: + logging.info(f"[create_pull_request] DRY RUN: Would create PR for {org}/{repo}") + return "DRY_RUN_PR_URL" + + url = f"{BASE_URL}/repos/{org}/{repo}/pulls" + data = { + "title": f"[EZ] Replace `pytorch-labs` with `meta-pytorch`", + "body": f"""This PR replaces all instances of `pytorch-labs` with `meta-pytorch` in this repository now that the `pytorch-labs` org has been renamed to `meta-pytorch` + +## Changes Made +- Replaced all occurrences of `pytorch-labs` with `meta-pytorch` +- Only modified files with extensions: .py, .md, .sh, .rst, .cpp, .h, .txt, .yml +- Skipped binary files and files larger than 1MB due to GitHub api payload limits in the script to cover all repos in this org. Will do a more manual second pass later to cover any larger files + +## Files Modified +This PR updates files that contained the target text. 
+ +Generated by automated script on {datetime.now(timezone.utc).isoformat()}Z""", + "head": branch, + "base": base_branch + } + + response = requests.post(url, headers=HEADERS, json=data) + if response.status_code == 201: + pr_data = response.json() + pr_url = pr_data["html_url"] + logging.info(f"[create_pull_request] Created PR: {pr_url}") + return pr_url + else: + logging.error(f"[create_pull_request] Failed to create PR: {response.status_code} - {response.text}") + return None + + +def process_repository(org: str, repo: str) -> Dict: + """Process a single repository for text replacement.""" + logging.info(f"[process_repository] Processing repository: {org}/{repo}") + + result = { + "repo": repo, + "status": "skipped", + "files_changed": 0, + "pr_url": None, + "error": None + } + + try: + # Check for existing PR first (before doing any work) + pr_title = f"[EZ] Replace `pytorch-labs` with `meta-pytorch`" + existing_pr_url = check_existing_pr(org, repo, pr_title) + if existing_pr_url: + result["status"] = "skipped_existing_pr" + result["pr_url"] = existing_pr_url + result["error"] = "Existing open PR with same title found" + logging.info(f"[process_repository] Skipping {org}/{repo} - existing open PR found: {existing_pr_url}") + return result + + # Get default branch + default_branch = get_default_branch(org, repo) + if not default_branch: + result["error"] = "Failed to get default branch" + return result + + # Get target files for this repository + target_files = get_target_files_for_repo(org, repo) + if not target_files: + logging.info(f"[process_repository] No target files found for {org}/{repo}") + return result + + # Check each target file for replacements + changes = [] + for file_path in target_files: + replacement = find_and_replace_in_file(org, repo, file_path) + if replacement: + old_content, new_content = replacement + changes.append({ + "path": file_path, + "old_content": old_content, + "new_content": new_content + }) + + if not changes: + logging.info(f"[process_repository] No changes needed in {org}/{repo}") + return result + + result["files_changed"] = len(changes) + logging.info(f"[process_repository] Found {len(changes)} files to update in {org}/{repo}") + + if DRY_RUN: + result["status"] = "dry_run" + logging.info(f"[process_repository] DRY RUN: Would update {len(changes)} files in {org}/{repo}") + return result + + # Create new branch + branch_name = f"replace-pytorch-labs-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}" + if not create_branch(org, repo, default_branch, branch_name): + result["error"] = "Failed to create branch" + return result + + # Commit changes + commit_message = f"Replace 'pytorch-labs' with 'meta-pytorch' in {len(changes)} files" + all_success = True + + for change in changes: + if not create_file_commit(org, repo, change["path"], change["new_content"], branch_name, commit_message): + all_success = False + break + + if not all_success: + result["error"] = "Failed to commit some files" + return result + + # Create pull request + pr_url = create_pull_request(org, repo, branch_name, default_branch) + if pr_url: + result["pr_url"] = pr_url + result["status"] = "success" + else: + result["error"] = "Failed to create pull request" + + except Exception as e: + logging.error(f"[process_repository] Error processing {org}/{repo}: {e}") + result["error"] = str(e) + + return result + + +def main(): + parser = argparse.ArgumentParser( + description="Replace 'pytorch-labs' with 'meta-pytorch' across GitHub organization repositories." 
+ ) + parser.add_argument( + "--org", + type=str, + default="pytorch", + help="GitHub organization to process (default: pytorch)", + ) + parser.add_argument( + "--repos", + type=str, + help="Comma-separated list of repositories to process (e.g., 'pytorch,vision,tutorials'). If not specified, processes all repositories with target files.", + ) + parser.add_argument( + "--dry-run", + action="store_true", + help="Preview changes without making them", + ) + args = parser.parse_args() + + global ORG_NAME, DRY_RUN + ORG_NAME = args.org + DRY_RUN = args.dry_run + + # Parse repos filter if provided + filter_repos = None + if args.repos: + filter_repos = [repo.strip() for repo in args.repos.split(",")] + logging.info(f"[main] Repository filter applied: {filter_repos}") + + if not GITHUB_TOKEN: + logging.error("[main] Missing GITHUB_TOKEN in environment variables.") + return + + logging.info(f"[main] Starting text replacement for org: {ORG_NAME}") + if DRY_RUN: + logging.info("[main] DRY RUN MODE - No changes will be made") + + # Show cache stats at start + cache_stats = get_cache_stats() + logging.info( + f"[main] Cache stats: {cache_stats['total_files']} files, {cache_stats['total_size_mb']} MB" + ) + + # Get target repositories (only those with files containing "pytorch-labs") + repos = get_target_repos(ORG_NAME, filter_repos) + logging.info(f"[main] Processing {len(repos)} repositories with target files") + + # Process each repository + results = [] + for i, repo in enumerate(repos, 1): + logging.info(f"[main] Processing repository {i}/{len(repos)}: {repo}") + result = process_repository(ORG_NAME, repo) + results.append(result) + + # Add a small delay to be respectful to the API + import time + time.sleep(1) + + # Generate summary + successful = [r for r in results if r["status"] == "success"] + dry_run = [r for r in results if r["status"] == "dry_run"] + skipped = [r for r in results if r["status"] == "skipped"] + skipped_existing_pr = [r for r in results if r["status"] == "skipped_existing_pr"] + errors = [r for r in results if r["error"] and r["status"] not in ["skipped_existing_pr"]] + + print(f"\n=== SUMMARY ===") + print(f"Organization: {ORG_NAME}") + print(f"Total repositories: {len(repos)}") + print(f"Successful PRs created: {len(successful)}") + print(f"Dry run (would create): {len(dry_run)}") + print(f"Skipped (no changes): {len(skipped)}") + print(f"Skipped (existing PR): {len(skipped_existing_pr)}") + print(f"Errors: {len(errors)}") + print("\n") + + if skipped_existing_pr: + print(f"=== SKIPPED (existing PRs) ===") + for result in skipped_existing_pr: + print(f"- {result['repo']}: {result['files_changed']} files would be updated, but existing PR found: {result['pr_url']}") + print("\n") + + if successful: + print(f"=== SUCCESSFUL PRs ===") + for result in successful: + print(f"- {result['repo']}: {result['pr_url']} ({result['files_changed']} files)") + print("\n") + + if dry_run: + print(f"=== DRY RUN (would create PRs) ===") + for result in dry_run: + print(f"- {result['repo']}: {result['files_changed']} files would be updated") + print("\n") + + if errors: + print(f"=== ERRORS ===") + for result in errors: + print(f"- {result['repo']}: {result['error']}") + print("\n") + + # Show final cache stats + final_cache_stats = get_cache_stats() + logging.info( + f"[main] Final cache stats: {final_cache_stats['total_files']} files, {final_cache_stats['total_size_mb']} MB" + ) + + logging.info("[main] Script completed successfully.") + + +if __name__ == "__main__": + main() \ No newline at 
end of file From 81c59ad94c6baa1f50d059bfef347e30ad4a4d44 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 12 Aug 2025 17:36:00 -0500 Subject: [PATCH 8/9] Use CodeSearch to find files that should be updated in org --- tools/analytics/org/remove_pytorch_labs.py | 198 ++++++++------------- 1 file changed, 77 insertions(+), 121 deletions(-) diff --git a/tools/analytics/org/remove_pytorch_labs.py b/tools/analytics/org/remove_pytorch_labs.py index 9e246daf07..d40b6a6395 100644 --- a/tools/analytics/org/remove_pytorch_labs.py +++ b/tools/analytics/org/remove_pytorch_labs.py @@ -17,7 +17,7 @@ Key Features: ------------- -- Uses pre-defined list of files known to contain "pytorch-labs" mentions (optimized for performance). This list was obtained by running codesea +- Dynamically discovers files containing "pytorch-labs" mentions using GitHub's Search API - Replaces all instances of "pytorch-labs" with "meta-pytorch" in target files. - Creates a new branch and commits changes for each repository. - Creates pull requests with descriptive titles and descriptions. @@ -46,10 +46,11 @@ Notes: ------ -- Only processes 72 pre-identified files that contain "pytorch-labs" mentions +- Dynamically discovers files containing "pytorch-labs" mentions using GitHub Search API - Skips binary files and files larger than 1MB - Creates one PR per repository with changes - Handles GitHub API rate limits automatically +- Caches search results to avoid repeated API calls - Significantly faster than scanning all files in all repositories """ @@ -65,6 +66,7 @@ import requests from cache_manager import get_cache_stats, make_cached_request from dotenv import load_dotenv +from github_code_search import search_github_code, GitHubSearchResults load_dotenv() @@ -93,130 +95,81 @@ # Maximum file size to process (1MB) MAX_FILE_SIZE = 1024 * 1024 -# Pre-defined list of files that contain "pytorch-labs" mentions -# This is based on search results and will significantly improve performance -TARGET_FILES = { - "pytorch": [ - "android/README.md", - "aten/src/ATen/native/cuda/int4mm.cu", - "torch/testing/_internal/common_quantization.py" - ], - "vision": [ - "torchvision/io/image.py" - ], - "tutorials": [ - "index.rst", - "docathon-leaderboard.md", - "intermediate_source/transformer_building_blocks.py", - "unstable_source/gpu_quantization_torchao_tutorial.py" - ], - "executorch": [ - "docs/source/index.md", - "docs/source/getting-started.md", - "backends/apple/mps/setup.md", - "docs/source/backends-mps.md", - "docs/source/llm/run-with-c-plus-plus.md", - "docs/source/using-executorch-android.md", - "docs/source/using-executorch-export.md", - "docs/source/using-executorch-building-from-source.md", - "docs/source/using-executorch-cpp.md", - "examples/models/llama/experimental/generate.py", - "scripts/test_ios.sh", - ".ci/scripts/test_ios_ci.sh", - "backends/test/facto/test_facto.py" - ], - "ao": [ - "scripts/download.py", - "torchao/_models/llama/tokenizer.py", - "scripts/convert_hf_checkpoint.py", - "examples/sam2_amg_server/annotate_with_rle.py", - "torchao/prototype/mx_formats/kernels.py", - "torchao/_models/sam/README.md", - "torchao/quantization/README.md", - "test/integration/test_integration.py", - ".github/workflows/dashboard_perf_test.yml" - ], - "benchmark": [ - "torchbenchmark/models/simple_gpt/origin", - "torchbenchmark/models/sam_fast/requirements.txt" - ], - "torchtune": [ - "docs/source/tutorials/qlora_finetune.rst", - "recipes/eleuther_eval.py", - "docs/source/tutorials/e2e_flow.rst", - 
"torchtune/generation/_generation.py", - "docs/source/tutorials/llama3.rst", - "README.md" - ], - "torchft": [ - "docs/source/protocol.rst", - "docs/source/assumptions_and_recommendations.rst", - "docs/source/conf.py", - "docs/source/index.rst", - "README.md" - ], - "torchchat": [ - "torchchat/usages/eval.py", - "README.md" - ], - "rl": [ - "examples/rlhf/requirements.txt" - ], - "builder": [ - "CUDA_UPGRADE_GUIDE.MD" - ], - "helion": [ - "benchmarks/run.py", - "benchmarks/README.md" - ], - "torchcodec": [ - "src/torchcodec/_core/SingleStreamDecoder.cpp" - ], - "test-infra": [ - "aws/lambda/README.md", - "torchci/clickhouse_queries/queued_jobs_aggregate/query.sql", - "tools/torchfix/README.md", - ".github/workflows/trigger_nightly.yml" - ], - "ci-infra": [ - "arc-backup-2024/scripts/deployment.py" - ], - "oss-docathons": [ - "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.md", - "pytorch/h1-2024/leaderboard-pytorch-docathon-h1-2024.csv", - ".github/scripts/pytorch-docathon-h1-2024.py" - ], - "serve": [ - "examples/large_models/segment_anything_fast/install_segment_anything_fast.sh", - "examples/large_models/gpt_fast/README.md", - "examples/large_models/gpt_fast_mixtral_moe/README.md", - "examples/large_models/diffusion_fast/README.md", - "examples/large_models/segment_anything_fast/README.md", - "kubernetes/kserve/examples/gpt_fast/README.md" - ], - "xla": [ - "torchax/test/llama/llama_model.py" - ], - "pytorch-canary": [ - "torch/testing/_internal/common_quantization.py" - ], - "pytorch-integration-testing": [ - ".github/scripts/generate_vllm_benchmark_matrix.py" - ], - "torcheval": [ - ".github/PULL_REQUEST_TEMPLATE.md", - ".github/ISSUE_TEMPLATE/bug-report.yml" - ] -} +# Cache for search results to avoid repeated API calls +_SEARCH_CACHE: Dict[str, Dict[str, List[str]]] = {} + + + + +def get_target_files_from_search(org: str) -> Dict[str, List[str]]: + """ + Get target files by searching GitHub for 'pytorch-labs' mentions in the organization. 
+ + Args: + org: GitHub organization name + + Returns: + Dictionary mapping repository names to lists of file paths + """ + # Check cache first + if org in _SEARCH_CACHE: + logging.info(f"[get_target_files_from_search] Using cached results for org: {org}") + return _SEARCH_CACHE[org] + + try: + logging.info(f"[get_target_files_from_search] Searching for 'pytorch-labs' mentions in org: {org}") + + # Search for files containing "pytorch-labs" in the organization + query = f"org:{org} pytorch-labs" + results: GitHubSearchResults = search_github_code( + query=query, + verbose=False # Reduce logging noise + ) + + if results['retrieved_count'] == 0: + logging.warning(f"[get_target_files_from_search] No files found containing 'pytorch-labs' in org: {org}") + _SEARCH_CACHE[org] = {} + return {} + + # Group files by repository + target_files: Dict[str, List[str]] = {} + for item in results['items']: + repo_name = item['repository']['name'] # Just the repo name, not full_name + file_path = item['path'] + + if repo_name not in target_files: + target_files[repo_name] = [] + + if file_path not in target_files[repo_name]: + target_files[repo_name].append(file_path) + + logging.info(f"[get_target_files_from_search] Found {len(target_files)} repositories with {sum(len(files) for files in target_files.values())} total files") + + # Log summary of repositories found + for repo_name, files in target_files.items(): + logging.info(f"[get_target_files_from_search] {repo_name}: {len(files)} files") + + # Cache the results + _SEARCH_CACHE[org] = target_files + + return target_files + + except Exception as e: + logging.error(f"[get_target_files_from_search] Error searching for files: {e}") + logging.warning(f"[get_target_files_from_search] No fallback available - search failed") + return {} def get_target_repos(org: str, filter_repos: Optional[List[str]] = None) -> List[str]: """Get only the repositories that have files with 'pytorch-labs' mentions.""" - if org not in TARGET_FILES: + # Get target files from search (with fallback to hardcoded list) + target_files = get_target_files_from_search(org) + + if not target_files: logging.info(f"[get_target_repos] No target files found for org: {org}") return [] - all_repos = list(TARGET_FILES.keys()) + all_repos = list(target_files.keys()) if filter_repos: # Filter to only include repos that are in both the target files and the filter list @@ -245,11 +198,14 @@ def get_default_branch(org: str, repo: str) -> Optional[str]: def get_target_files_for_repo(org: str, repo: str) -> List[str]: """Get the list of target files for a specific repository.""" - if repo not in TARGET_FILES: + # Get target files from search (with fallback to hardcoded list) + target_files = get_target_files_from_search(org) + + if repo not in target_files: logging.info(f"[get_target_files_for_repo] No target files found for {org}/{repo}") return [] - files = TARGET_FILES[repo] + files = target_files[repo] logging.info(f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}") return files From 344bbc47cfc29e12d108cc225bedf3f942439ed1 Mon Sep 17 00:00:00 2001 From: Zain Rizvi Date: Tue, 12 Aug 2025 18:01:43 -0500 Subject: [PATCH 9/9] lint fixes --- tools/analytics/org/analyze_contributors.py | 243 +++++++++------ tools/analytics/org/analyze_repo_info.py | 68 +++-- tools/analytics/org/github_code_search.py | 291 ++++++++++-------- tools/analytics/org/remove_pytorch_labs.py | 309 ++++++++++++-------- 4 files changed, 554 insertions(+), 357 deletions(-) diff --git 
a/tools/analytics/org/analyze_contributors.py b/tools/analytics/org/analyze_contributors.py index 5b9439eeb5..1a8158d8a9 100644 --- a/tools/analytics/org/analyze_contributors.py +++ b/tools/analytics/org/analyze_contributors.py @@ -61,7 +61,7 @@ import re from collections import defaultdict from datetime import datetime, timedelta -from typing import Dict, List, Optional +from typing import Any, Dict, List, Optional, Set, Tuple import requests import yaml @@ -136,7 +136,9 @@ } BASE_URL = "https://api.github.com" -COMMIT_LOOKBACK = (datetime.utcnow() - timedelta(days=180)).isoformat() + "Z" # 6 months +COMMIT_LOOKBACK = ( + datetime.utcnow() - timedelta(days=180) +).isoformat() + "Z" # 6 months def get_repos(org: str) -> List[str]: @@ -187,9 +189,13 @@ def get_commits(org: str, repo: str) -> List[Dict]: logging.error(f"[get_commits] Failed to fetch page {page} for repo: {repo}") break if not data: - logging.info(f"[get_commits] No more commits found for repo: {repo} on page {page}") + logging.info( + f"[get_commits] No more commits found for repo: {repo} on page {page}" + ) break - logging.info(f"[get_commits] Page {page}: Found {len(data)} commits for repo: {repo}") + logging.info( + f"[get_commits] Page {page}: Found {len(data)} commits for repo: {repo}" + ) all_commits.extend(data) page += 1 @@ -198,7 +204,9 @@ def get_commits(org: str, repo: str) -> List[Dict]: logging.info(f"[get_commits] Limiting to 1000 commits for repo: {repo}") break - logging.info(f"[get_commits] Finished fetching commits for repo: {repo}. Total: {len(all_commits)}") + logging.info( + f"[get_commits] Finished fetching commits for repo: {repo}. Total: {len(all_commits)}" + ) return all_commits @@ -230,9 +238,16 @@ def extract_company_from_email(email: str) -> Optional[str]: # Skip generic email providers generic_providers = { - "gmail.com", "yahoo.com", "hotmail.com", "outlook.com", "icloud.com", - "protonmail.com", "tutanota.com", "hey.com", "fastmail.com", - "users.noreply.github.com" # GitHub's privacy-preserving email addresses + "gmail.com", + "yahoo.com", + "hotmail.com", + "outlook.com", + "icloud.com", + "protonmail.com", + "tutanota.com", + "hey.com", + "fastmail.com", + "users.noreply.github.com", # GitHub's privacy-preserving email addresses } if domain in generic_providers: @@ -240,7 +255,9 @@ def extract_company_from_email(email: str) -> Optional[str]: # For other domains, try to extract company name # Remove common TLDs and subdomains - domain_parts = domain.replace(".com", "").replace(".org", "").replace(".net", "").split(".") + domain_parts = ( + domain.replace(".com", "").replace(".org", "").replace(".net", "").split(".") + ) if domain_parts and len(domain_parts[-1]) > 2: return domain_parts[-1].title() @@ -258,7 +275,7 @@ def extract_company_from_profile(profile: Dict) -> Optional[str]: return None # Clean up company name - company = re.sub(r'^@', '', company) # Remove @ prefix + company = re.sub(r"^@", "", company) # Remove @ prefix company = company.strip() if not company: @@ -350,12 +367,14 @@ def wrapper(*args, **kwargs): arg_representation = { "date": today, "args": hashable_args, - "kwargs": sorted(hashable_kwargs.items()) + "kwargs": sorted(hashable_kwargs.items()), } serialized_args = json.dumps(arg_representation, sort_keys=True) except (TypeError, ValueError): # If serialization fails, use string representation as fallback - serialized_args = today + str(hashable_args) + str(sorted(hashable_kwargs.items())) + serialized_args = ( + today + str(hashable_args) + 
str(sorted(hashable_kwargs.items())) + ) arg_hash = hashlib.sha256(serialized_args.encode()).hexdigest() key = f"{func_name}_{today}_{arg_hash}" @@ -373,7 +392,9 @@ def wrapper(*args, **kwargs): # Cache the result with open(filepath, "w") as f: json.dump(result, f) - logging.debug(f"Cached result for function: {func_name}, saved to: {filepath} (date: {today})") + logging.debug( + f"Cached result for function: {func_name}, saved to: {filepath} (date: {today})" + ) return result @@ -381,31 +402,35 @@ def wrapper(*args, **kwargs): @cache_to_disk -def analyze_contributors(org: str, repos: List[str]) -> Dict: +def analyze_contributors( + org: str, repos: List[str] +) -> Tuple[Dict[str, Dict[str, Any]], Dict[str, List[Dict[str, Any]]]]: """Analyze contributors across all repositories.""" - logging.info(f"[analyze_contributors] Start analyzing contributors for {len(repos)} repositories in org: {org}") + logging.info( + f"[analyze_contributors] Start analyzing contributors for {len(repos)} repositories in org: {org}" + ) # Track contributors across all repos - global_contributors = defaultdict(lambda: { - "total_commits": 0, - "repos": set(), - "emails": set(), - "username": None, - "company": None, - "profile": None - }) + global_contributors: Dict[str, Dict[str, Any]] = defaultdict( + lambda: { + "total_commits": 0, + "repos": set(), + "emails": set(), + "username": None, + "company": None, + "profile": None, + } + ) # Track contributors by repo - repo_contributors = {} + repo_contributors: Dict[str, List[Dict[str, Any]]] = {} for repo in repos: logging.info(f"[analyze_contributors] Processing repo: {repo}") commits = get_commits(org, repo) - repo_contributor_stats = defaultdict(lambda: { - "commits": 0, - "emails": set(), - "username": None - }) + repo_contributor_stats: Dict[str, Dict[str, Any]] = defaultdict( + lambda: {"commits": 0, "emails": set(), "username": None} + ) for commit in commits: author = commit.get("commit", {}).get("author", {}) @@ -418,64 +443,90 @@ def analyze_contributors(org: str, repos: List[str]) -> Dict: # Since we can assume GitHub username info is always there, use it as the primary key contributor_key = username if not contributor_key: - raise ValueError(f"Commit {commit['sha']} in repo {repo} has no identifiable contributor information.") + raise ValueError( + f"Commit {commit['sha']} in repo {repo} has no identifiable contributor information." 
+ ) # Update repo-specific stats repo_contributor_stats[contributor_key]["commits"] += 1 if author_email: - repo_contributor_stats[contributor_key]["emails"].add(author_email) + emails_set = repo_contributor_stats[contributor_key]["emails"] + if isinstance(emails_set, set): + emails_set.add(author_email) if username: repo_contributor_stats[contributor_key]["username"] = username # Update global stats global_contributors[contributor_key]["total_commits"] += 1 - global_contributors[contributor_key]["repos"].add(repo) + repos_set = global_contributors[contributor_key]["repos"] + if isinstance(repos_set, set): + repos_set.add(repo) if author_email: - global_contributors[contributor_key]["emails"].add(author_email) + emails_set = global_contributors[contributor_key]["emails"] + if isinstance(emails_set, set): + emails_set.add(author_email) if username: global_contributors[contributor_key]["username"] = username # Convert sets to lists for YAML serialization repo_contributors[repo] = [] for contributor_key, stats in repo_contributor_stats.items(): - repo_contributors[repo].append({ - "contributor": contributor_key, - "commits": stats["commits"], - "emails": list(stats["emails"]), - "username": stats["username"] - }) + emails_list = ( + list(stats["emails"]) if isinstance(stats["emails"], set) else [] + ) + repo_contributors[repo].append( + { + "contributor": contributor_key, + "commits": stats["commits"], + "emails": emails_list, + "username": stats["username"], + } + ) # Sort by commit count repo_contributors[repo].sort(key=lambda x: x["commits"], reverse=True) - logging.info(f"[analyze_contributors] Found {len(repo_contributors[repo])} contributors for repo: {repo}") + logging.info( + f"[analyze_contributors] Found {len(repo_contributors[repo])} contributors for repo: {repo}" + ) # Enhance global contributors with profile and company information - logging.info(f"[analyze_contributors] Enhancing contributor information with profiles and companies") + logging.info( + f"[analyze_contributors] Enhancing contributor information with profiles and companies" + ) for contributor_key, stats in global_contributors.items(): # First, try to extract company from email addresses (prioritize this) - if stats["emails"]: - for email in stats["emails"]: + emails_set = stats["emails"] + if isinstance(emails_set, set) and emails_set: + for email in emails_set: company_from_email = extract_company_from_email(email) if company_from_email: stats["company"] = company_from_email break # Only if email didn't provide a clear company mapping, try GitHub profile - if not stats["company"] and stats["username"]: - profile = get_user_profile(stats["username"]) + username = stats["username"] + if not stats["company"] and username: + profile = get_user_profile(username) stats["profile"] = profile # Try to extract company from profile - company_from_profile = extract_company_from_profile(profile) - if company_from_profile: - stats["company"] = company_from_profile + if profile: + company_from_profile = extract_company_from_profile(profile) + if company_from_profile: + stats["company"] = company_from_profile # Convert sets to lists for YAML serialization - stats["repos"] = list(stats["repos"]) - stats["emails"] = list(stats["emails"]) + repos_set = stats["repos"] + if isinstance(repos_set, set): + stats["repos"] = list(repos_set) + emails_set = stats["emails"] + if isinstance(emails_set, set): + stats["emails"] = list(emails_set) - logging.info(f"[analyze_contributors] Finished analyzing contributors for org: {org}") + 
logging.info( + f"[analyze_contributors] Finished analyzing contributors for org: {org}" + ) return global_contributors, repo_contributors @@ -532,23 +583,29 @@ def main(): repo for repo in repos if f"{ORG_NAME}/{repo}" not in EXCLUDED_REPOS ] - logging.info(f"[main] Analyzing {len(filtered_repos)} repositories (excluded {len(repos) - len(filtered_repos)})") + logging.info( + f"[main] Analyzing {len(filtered_repos)} repositories (excluded {len(repos) - len(filtered_repos)})" + ) # Analyze contributors - global_contributors, repo_contributors = analyze_contributors(ORG_NAME, filtered_repos) + global_contributors, repo_contributors = analyze_contributors( + ORG_NAME, filtered_repos + ) # Sort contributors by frequency contributors_by_frequency = [] for contributor_key, stats in global_contributors.items(): - contributors_by_frequency.append({ - "contributor": contributor_key, - "total_commits": stats["total_commits"], - "repos_count": len(stats["repos"]), - "repos": stats["repos"], - "emails": stats["emails"], - "username": stats["username"], - "company": stats["company"] - }) + contributors_by_frequency.append( + { + "contributor": contributor_key, + "total_commits": stats["total_commits"], + "repos_count": len(stats["repos"]), + "repos": stats["repos"], + "emails": stats["emails"], + "username": stats["username"], + "company": stats["company"], + } + ) contributors_by_frequency.sort(key=lambda x: x["total_commits"], reverse=True) @@ -558,20 +615,24 @@ def main(): for contributor in contributors_by_frequency: if contributor["company"]: - company_analysis[contributor["company"]].append({ - "contributor": contributor["contributor"], - "total_commits": contributor["total_commits"], - "repos_count": contributor["repos_count"], - "username": contributor["username"] - }) + company_analysis[contributor["company"]].append( + { + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"], + } + ) else: - unidentified_contributors.append({ - "contributor": contributor["contributor"], - "total_commits": contributor["total_commits"], - "repos_count": contributor["repos_count"], - "username": contributor["username"], - "emails": contributor["emails"] - }) + unidentified_contributors.append( + { + "contributor": contributor["contributor"], + "total_commits": contributor["total_commits"], + "repos_count": contributor["repos_count"], + "username": contributor["username"], + "emails": contributor["emails"], + } + ) # Sort company contributors by commit count for company in company_analysis: @@ -585,13 +646,18 @@ def main(): "lookback_period_days": 180, "repositories_analyzed": len(filtered_repos), "total_contributors": len(contributors_by_frequency), - "contributors_with_company": len(contributors_by_frequency) - len(unidentified_contributors), - "contributors_without_company": len(unidentified_contributors) + "contributors_with_company": len(contributors_by_frequency) + - len(unidentified_contributors), + "contributors_without_company": len(unidentified_contributors), }, - "contributors_by_frequency": contributors_by_frequency[:50], # Top 50 contributors + "contributors_by_frequency": contributors_by_frequency[ + :50 + ], # Top 50 contributors "company_analysis": dict(company_analysis), - "unidentified_contributors": unidentified_contributors[:20], # Top 20 unidentified - "contributors_by_repo": repo_contributors + "unidentified_contributors": unidentified_contributors[ + :20 + ], # Top 20 
unidentified + "contributors_by_repo": repo_contributors, } # Sort output for consistency @@ -622,11 +688,16 @@ def deep_sort(obj, sort_keys=True): print(f"- Organization: {ORG_NAME}") print(f"- Repositories analyzed: {len(filtered_repos)}") print(f"- Total contributors: {len(contributors_by_frequency)}") - print(f"- Contributors with identified companies: {len(contributors_by_frequency) - len(unidentified_contributors)}") + print( + f"- Contributors with identified companies: {len(contributors_by_frequency) - len(unidentified_contributors)}" + ) print(f"- Top companies by contributor count:") # Show top companies - company_contributor_count = [(company, len(contributors)) for company, contributors in company_analysis.items()] + company_contributor_count = [ + (company, len(contributors)) + for company, contributors in company_analysis.items() + ] company_contributor_count.sort(key=lambda x: x[1], reverse=True) for company, count in company_contributor_count[:20]: @@ -650,10 +721,14 @@ def deep_sort(obj, sort_keys=True): break # Sort by commit count (descending) - repo_commits.sort(key=lambda x: int(x.split('(')[1].split(')')[0]), reverse=True) + repo_commits.sort( + key=lambda x: int(x.split("(")[1].split(")")[0]), reverse=True + ) # Format the contributor name (use username if available, otherwise email/name) - display_name = contributor["username"] if contributor["username"] else contributor_key + display_name = ( + contributor["username"] if contributor["username"] else contributor_key + ) print(f"- {display_name}, {', '.join(repo_commits)}") diff --git a/tools/analytics/org/analyze_repo_info.py b/tools/analytics/org/analyze_repo_info.py index da2263f3c1..95203d3dab 100644 --- a/tools/analytics/org/analyze_repo_info.py +++ b/tools/analytics/org/analyze_repo_info.py @@ -82,10 +82,10 @@ def get_repos_with_info(org: str) -> List[Dict]: """ Fetch all repositories for an organization with their metadata. - + Args: org: The GitHub organization name - + Returns: List of repository dictionaries with metadata """ @@ -97,7 +97,9 @@ def get_repos_with_info(org: str) -> List[Dict]: logging.debug(f"[get_repos_with_info] Requesting URL: {url}") data = make_cached_request(url, HEADERS) if data is None: - logging.error(f"[get_repos_with_info] Failed to fetch page {page} for org: {org}") + logging.error( + f"[get_repos_with_info] Failed to fetch page {page} for org: {org}" + ) break if not data: logging.info( @@ -118,11 +120,11 @@ def get_repos_with_info(org: str) -> List[Dict]: def get_last_commit_date(org: str, repo: str) -> Optional[str]: """ Get the date of the last commit for a repository. 
- + Args: org: The GitHub organization name repo: The repository name - + Returns: Date string in YYYY-MM-DD format of the last commit, or None if no commits found """ @@ -133,61 +135,71 @@ def get_last_commit_date(org: str, repo: str) -> Optional[str]: if data is None or not data: logging.warning(f"[get_last_commit_date] No commits found for repo: {repo}") return None - + if len(data) > 0: commit_date = data[0]["commit"]["author"]["date"] # Convert ISO format to YYYY-MM-DD format try: from datetime import datetime - dt = datetime.fromisoformat(commit_date.replace('Z', '+00:00')) - formatted_date = dt.strftime('%Y-%m-%d') - logging.info(f"[get_last_commit_date] Last commit date for {repo}: {formatted_date}") + + dt = datetime.fromisoformat(commit_date.replace("Z", "+00:00")) + formatted_date = dt.strftime("%Y-%m-%d") + logging.info( + f"[get_last_commit_date] Last commit date for {repo}: {formatted_date}" + ) return formatted_date except (ValueError, AttributeError) as e: - logging.warning(f"[get_last_commit_date] Failed to parse date for {repo}: {e}") + logging.warning( + f"[get_last_commit_date] Failed to parse date for {repo}: {e}" + ) return None - + return None def process_repo_data(org: str, repos: List[Dict]) -> List[Dict]: """ Process repository data and add last commit date information. - + Args: org: The GitHub organization name repos: List of repository dictionaries from GitHub API - + Returns: List of processed repository data with all required fields """ logging.info(f"[process_repo_data] Processing {len(repos)} repositories") processed_repos = [] - + for i, repo in enumerate(repos, 1): repo_name = repo["name"] - logging.info(f"[process_repo_data] Processing repo {i}/{len(repos)}: {repo_name}") - + logging.info( + f"[process_repo_data] Processing repo {i}/{len(repos)}: {repo_name}" + ) + # Get last commit date last_commit_date = get_last_commit_date(org, repo_name) - + processed_repo = { "repo_name": f"{org}/{repo_name}", - "public": repo.get("private", True) == False, # True if public, False if private + "public": repo.get("private", True) + == False, # True if public, False if private "archived": repo.get("archived", False), - "last_commit_date": last_commit_date + "last_commit_date": last_commit_date, } - + processed_repos.append(processed_repo) - - logging.info(f"[process_repo_data] Finished processing {len(processed_repos)} repositories") + + logging.info( + f"[process_repo_data] Finished processing {len(processed_repos)} repositories" + ) return processed_repos def save_to_csv(data: List[Dict], filename: str = "repo_info_summary.csv"): """ Save repository data to a CSV file. 
- + Args: data: List of repository dictionaries filename: Name of the CSV file to create @@ -202,12 +214,12 @@ def save_to_csv(data: List[Dict], filename: str = "repo_info_summary.csv"): # Define CSV headers fieldnames = ["repo_name", "public", "archived", "last_commit_date"] - + with open(filepath, "w", newline="", encoding="utf-8") as f: writer = csv.DictWriter(f, fieldnames=fieldnames) writer.writeheader() writer.writerows(data) - + logging.info(f"[save_to_csv] Data successfully saved to {filepath}") @@ -240,10 +252,10 @@ def main(): # Step 1: Get all repositories with their metadata repos = get_repos_with_info(ORG_NAME) - + # Step 2: Process repository data and add last commit dates processed_repos = process_repo_data(ORG_NAME, repos) - + # Step 3: Save to CSV save_to_csv(processed_repos) @@ -256,4 +268,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tools/analytics/org/github_code_search.py b/tools/analytics/org/github_code_search.py index 56788873ff..0907f1a502 100644 --- a/tools/analytics/org/github_code_search.py +++ b/tools/analytics/org/github_code_search.py @@ -63,14 +63,15 @@ import logging import os import time +from dataclasses import dataclass from datetime import datetime, timezone -from typing import Dict, List, Optional, Any, TypedDict, Union +from typing import Any, Dict, List, Optional, TypedDict, Union from urllib.parse import quote_plus -from dataclasses import dataclass import requests from dotenv import load_dotenv + load_dotenv() logging.basicConfig( @@ -94,6 +95,7 @@ # Type definitions for well-defined schema class RepositoryInfo(TypedDict): """Repository information from GitHub search results.""" + id: int node_id: str name: str @@ -178,6 +180,7 @@ class RepositoryInfo(TypedDict): class SearchResultItem(TypedDict): """Individual search result item from GitHub code search.""" + name: str path: str sha: str @@ -195,6 +198,7 @@ class SearchResultItem(TypedDict): class GitHubSearchResults(TypedDict): """Complete search results from GitHub Search API.""" + query: str total_count: int retrieved_count: int @@ -207,41 +211,49 @@ class GitHubSearchResults(TypedDict): @dataclass class SearchOptions: """Options for GitHub code search.""" + per_page: int = 100 max_results: Optional[int] = None verbose: bool = True class GitHubCodeSearch: - def __init__(self, token: str = None): + def __init__(self, token: Optional[str] = None): """ Initialize GitHub Code Search client. - + Args: token: GitHub personal access token. If None, will try to get from GITHUB_TOKEN env var. """ self.token = token or GITHUB_TOKEN if not self.token: - raise ValueError("GitHub token is required. Set GITHUB_TOKEN environment variable or pass token parameter.") - + raise ValueError( + "GitHub token is required. Set GITHUB_TOKEN environment variable or pass token parameter." + ) + self.headers = { "Authorization": f"Bearer {self.token}", "Accept": "application/vnd.github+json", } self.session = requests.Session() self.session.headers.update(self.headers) - - def search_code(self, query: str, per_page: int = 100, max_results: Optional[int] = None, - verbose: bool = True) -> GitHubSearchResults: + + def search_code( + self, + query: str, + per_page: int = 100, + max_results: Optional[int] = None, + verbose: bool = True, + ) -> GitHubSearchResults: """ Search for code using GitHub's Search API. 
- + Args: query: Search query string per_page: Number of results per page (max 100) max_results: Maximum number of results to return (None for all) verbose: Whether to log progress messages - + Returns: GitHubSearchResults: Well-defined structure containing: - query: The search query used @@ -255,112 +267,122 @@ def search_code(self, query: str, per_page: int = 100, max_results: Optional[int all_items = [] page = 1 total_count = 0 - + if verbose: logging.info(f"Starting code search with query: {query}") - + while True: # Check rate limits rate_limit_info = self._check_rate_limit() - if rate_limit_info['remaining'] == 0: - reset_time = rate_limit_info['reset_time'] + if rate_limit_info["remaining"] == 0: + reset_time = rate_limit_info["reset_time"] wait_time = max(0, reset_time - time.time()) if verbose: - logging.warning(f"Rate limit exceeded. Waiting {wait_time:.0f} seconds...") + logging.warning( + f"Rate limit exceeded. Waiting {wait_time:.0f} seconds..." + ) time.sleep(wait_time + 1) - + # Prepare request parameters - params = { - 'q': query, - 'per_page': min(per_page, 100), - 'page': page + params: Dict[str, Union[str, int]] = { + "q": query, + "per_page": min(per_page, 100), + "page": page, } - + try: if verbose: logging.info(f"Fetching page {page}...") response = self.session.get(SEARCH_URL, params=params) response.raise_for_status() - + data = response.json() - + # Update total count on first page if page == 1: - total_count = data.get('total_count', 0) + total_count = data.get("total_count", 0) if verbose: logging.info(f"Total results found: {total_count}") - - items = data.get('items', []) + + items = data.get("items", []) if not items: break - + all_items.extend(items) if verbose: - logging.info(f"Retrieved {len(items)} items from page {page} (total: {len(all_items)})") - + logging.info( + f"Retrieved {len(items)} items from page {page} (total: {len(all_items)})" + ) + # Check if we've reached the maximum results if max_results and len(all_items) >= max_results: all_items = all_items[:max_results] if verbose: logging.info(f"Reached maximum results limit: {max_results}") break - + # Check if there are more pages if len(items) < per_page: break - + page += 1 - + # Be respectful to the API time.sleep(1) - + except requests.exceptions.RequestException as e: logging.error(f"Error fetching page {page}: {e}") break except json.JSONDecodeError as e: logging.error(f"Error parsing JSON response from page {page}: {e}") break - + # Get rate limit info for the response rate_limit_info = self._check_rate_limit() - + return GitHubSearchResults( query=query, total_count=total_count, retrieved_count=len(all_items), items=all_items, search_time=datetime.now(timezone.utc).isoformat(), - rate_limit_remaining=rate_limit_info.get('remaining'), - rate_limit_reset=datetime.fromtimestamp(rate_limit_info.get('reset_time', 0)).isoformat() if rate_limit_info.get('reset_time') else None + rate_limit_remaining=rate_limit_info.get("remaining"), + rate_limit_reset=datetime.fromtimestamp( + rate_limit_info.get("reset_time", 0) + ).isoformat() + if rate_limit_info.get("reset_time") + else None, ) - + def get_rate_limit(self) -> Dict[str, Any]: """Get GitHub API rate limit status.""" return self._check_rate_limit() - + def _check_rate_limit(self) -> Dict[str, Any]: """Check GitHub API rate limit status.""" try: response = self.session.get(f"{BASE_URL}/rate_limit") response.raise_for_status() data = response.json() - - search_limit = data.get('resources', {}).get('search', {}) + + search_limit = 
data.get("resources", {}).get("search", {}) return { - 'limit': search_limit.get('limit', 0), - 'remaining': search_limit.get('remaining', 0), - 'reset_time': search_limit.get('reset', 0) + "limit": search_limit.get("limit", 0), + "remaining": search_limit.get("remaining", 0), + "reset_time": search_limit.get("reset", 0), } except Exception as e: logging.warning(f"Could not check rate limit: {e}") - return {'limit': 0, 'remaining': 0, 'reset_time': 0} - - def format_results(self, results: GitHubSearchResults, format_type: str = 'console') -> str: + return {"limit": 0, "remaining": 0, "reset_time": 0} + + def format_results( + self, results: GitHubSearchResults, format_type: str = "console" + ) -> str: """Format search results for different output types.""" - if format_type == 'json': + if format_type == "json": return json.dumps(results, indent=2) - - elif format_type == 'console': + + elif format_type == "console": output = [] output.append(f"=== GitHub Code Search Results ===") output.append(f"Query: {results['query']}") @@ -368,107 +390,124 @@ def format_results(self, results: GitHubSearchResults, format_type: str = 'conso output.append(f"Retrieved: {results['retrieved_count']}") output.append(f"Search time: {results['search_time']}") output.append("") - - for i, item in enumerate(results['items'], 1): - repo_name = item.get('repository', {}).get('full_name', 'Unknown') - file_path = item.get('path', 'Unknown') - file_url = item.get('html_url', '') - score = item.get('score', 0) - + + for i, item in enumerate(results["items"], 1): + repo_name = item.get("repository", {}).get("full_name", "Unknown") + file_path = item.get("path", "Unknown") + file_url = item.get("html_url", "") + score = item.get("score", 0) + output.append(f"{i}. {repo_name}/{file_path}") output.append(f" Score: {score}") output.append(f" URL: {file_url}") output.append("") - + return "\n".join(output) - - elif format_type == 'csv': + + elif format_type == "csv": import csv import io - - output = io.StringIO() - writer = csv.writer(output) - + + output_buffer = io.StringIO() + writer = csv.writer(output_buffer) + # Write header - writer.writerow(['Repository', 'File Path', 'Score', 'URL', 'Search Time']) - + writer.writerow(["Repository", "File Path", "Score", "URL", "Search Time"]) + # Write data - for item in results['items']: - repo_name = item.get('repository', {}).get('full_name', 'Unknown') - file_path = item.get('path', 'Unknown') - file_url = item.get('html_url', '') - score = item.get('score', 0) - - writer.writerow([repo_name, file_path, score, file_url, results['search_time']]) - - return output.getvalue() - + for item in results["items"]: + repo_name = item.get("repository", {}).get("full_name", "Unknown") + file_path = item.get("path", "Unknown") + file_url = item.get("html_url", "") + score = item.get("score", 0) + + writer.writerow( + [repo_name, file_path, score, file_url, results["search_time"]] + ) + + return output_buffer.getvalue() + else: raise ValueError(f"Unsupported format type: {format_type}") - + def get_file_paths(self, results: GitHubSearchResults) -> List[str]: """Extract just the file paths from search results.""" - return [item.get('path', '') for item in results.get('items', [])] - + return [item.get("path", "") for item in results.get("items", [])] + def get_repositories(self, results: GitHubSearchResults) -> List[str]: """Extract just the repository names from search results.""" - return [item.get('repository', {}).get('full_name', '') for item in results.get('items', [])] - + return [ 
+ item.get("repository", {}).get("full_name", "") + for item in results.get("items", []) + ] + def get_unique_repositories(self, results: GitHubSearchResults) -> List[str]: """Extract unique repository names from search results.""" repos = self.get_repositories(results) return list(set(repos)) - - def filter_by_score(self, results: GitHubSearchResults, min_score: float = 0.0) -> GitHubSearchResults: + + def filter_by_score( + self, results: GitHubSearchResults, min_score: float = 0.0 + ) -> GitHubSearchResults: """Filter results by minimum score.""" filtered_items = [ - item for item in results.get('items', []) - if item.get('score', 0) >= min_score + item + for item in results.get("items", []) + if item.get("score", 0) >= min_score ] - + return GitHubSearchResults( - query=results['query'], - total_count=results['total_count'], + query=results["query"], + total_count=results["total_count"], retrieved_count=len(filtered_items), items=filtered_items, - search_time=results['search_time'], - rate_limit_remaining=results.get('rate_limit_remaining'), - rate_limit_reset=results.get('rate_limit_reset') + search_time=results["search_time"], + rate_limit_remaining=results.get("rate_limit_remaining"), + rate_limit_reset=results.get("rate_limit_reset"), ) - - def filter_by_repository(self, results: GitHubSearchResults, repo_pattern: str) -> GitHubSearchResults: + + def filter_by_repository( + self, results: GitHubSearchResults, repo_pattern: str + ) -> GitHubSearchResults: """Filter results by repository name pattern.""" import re + pattern = re.compile(repo_pattern) - + filtered_items = [ - item for item in results.get('items', []) - if pattern.search(item.get('repository', {}).get('full_name', '')) + item + for item in results.get("items", []) + if pattern.search(item.get("repository", {}).get("full_name", "")) ] - + return GitHubSearchResults( - query=results['query'], - total_count=results['total_count'], + query=results["query"], + total_count=results["total_count"], retrieved_count=len(filtered_items), items=filtered_items, - search_time=results['search_time'], - rate_limit_remaining=results.get('rate_limit_remaining'), - rate_limit_reset=results.get('rate_limit_reset') + search_time=results["search_time"], + rate_limit_remaining=results.get("rate_limit_remaining"), + rate_limit_reset=results.get("rate_limit_reset"), ) -def search_github_code(query: str, token: str = None, per_page: int = 100, - max_results: Optional[int] = None, verbose: bool = True) -> GitHubSearchResults: +def search_github_code( + query: str, + token: Optional[str] = None, + per_page: int = 100, + max_results: Optional[int] = None, + verbose: bool = True, +) -> GitHubSearchResults: """ Convenience function to search GitHub code. 
- + Args: query: Search query string token: GitHub personal access token (optional, will use GITHUB_TOKEN env var if not provided) per_page: Number of results per page (max 100) max_results: Maximum number of results to return (None for all) verbose: Whether to log progress messages - + Returns: GitHubSearchResults: Well-defined structure containing search results with the following fields: - query: The search query used @@ -524,8 +563,8 @@ def main(): parser.add_argument( "--format", type=str, - choices=['console', 'json', 'csv'], - default='console', + choices=["console", "json", "csv"], + default="console", help="Output format (default: console)", ) parser.add_argument( @@ -533,7 +572,7 @@ def main(): action="store_true", help="Show rate limit information before searching", ) - + args = parser.parse_args() if not GITHUB_TOKEN: @@ -542,42 +581,40 @@ def main(): # Create search instance searcher = GitHubCodeSearch() - + # Show rate limit if requested if args.show_rate_limit: rate_limit = searcher.get_rate_limit() print(f"Rate limit: {rate_limit['remaining']}/{rate_limit['limit']} remaining") - if rate_limit['remaining'] == 0: - reset_time = datetime.fromtimestamp(rate_limit['reset_time']) + if rate_limit["remaining"] == 0: + reset_time = datetime.fromtimestamp(rate_limit["reset_time"]) print(f"Rate limit resets at: {reset_time}") print() # Perform search results = searcher.search_code( - query=args.query, - per_page=args.per_page, - max_results=args.max_results + query=args.query, per_page=args.per_page, max_results=args.max_results ) # Format and output results if args.output: # Determine format from file extension - if args.output.endswith('.json'): - output_format = 'json' - elif args.output.endswith('.csv'): - output_format = 'csv' + if args.output.endswith(".json"): + output_format = "json" + elif args.output.endswith(".csv"): + output_format = "csv" else: output_format = args.format - + formatted_output = searcher.format_results(results, output_format) - - with open(args.output, 'w', encoding='utf-8') as f: + + with open(args.output, "w", encoding="utf-8") as f: f.write(formatted_output) - + print(f"Results saved to: {args.output}") - + # Also show console summary - console_output = searcher.format_results(results, 'console') + console_output = searcher.format_results(results, "console") print(console_output) else: # Just show console output @@ -586,4 +623,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main() diff --git a/tools/analytics/org/remove_pytorch_labs.py b/tools/analytics/org/remove_pytorch_labs.py index d40b6a6395..28fb6bb034 100644 --- a/tools/analytics/org/remove_pytorch_labs.py +++ b/tools/analytics/org/remove_pytorch_labs.py @@ -66,7 +66,8 @@ import requests from cache_manager import get_cache_stats, make_cached_request from dotenv import load_dotenv -from github_code_search import search_github_code, GitHubSearchResults +from github_code_search import GitHubSearchResults, search_github_code + load_dotenv() @@ -99,64 +100,74 @@ _SEARCH_CACHE: Dict[str, Dict[str, List[str]]] = {} - - def get_target_files_from_search(org: str) -> Dict[str, List[str]]: """ Get target files by searching GitHub for 'pytorch-labs' mentions in the organization. 
- + Args: org: GitHub organization name - + Returns: Dictionary mapping repository names to lists of file paths """ # Check cache first if org in _SEARCH_CACHE: - logging.info(f"[get_target_files_from_search] Using cached results for org: {org}") + logging.info( + f"[get_target_files_from_search] Using cached results for org: {org}" + ) return _SEARCH_CACHE[org] - + try: - logging.info(f"[get_target_files_from_search] Searching for 'pytorch-labs' mentions in org: {org}") - + logging.info( + f"[get_target_files_from_search] Searching for 'pytorch-labs' mentions in org: {org}" + ) + # Search for files containing "pytorch-labs" in the organization query = f"org:{org} pytorch-labs" results: GitHubSearchResults = search_github_code( query=query, - verbose=False # Reduce logging noise + verbose=False, # Reduce logging noise ) - - if results['retrieved_count'] == 0: - logging.warning(f"[get_target_files_from_search] No files found containing 'pytorch-labs' in org: {org}") + + if results["retrieved_count"] == 0: + logging.warning( + f"[get_target_files_from_search] No files found containing 'pytorch-labs' in org: {org}" + ) _SEARCH_CACHE[org] = {} return {} - + # Group files by repository target_files: Dict[str, List[str]] = {} - for item in results['items']: - repo_name = item['repository']['name'] # Just the repo name, not full_name - file_path = item['path'] - + for item in results["items"]: + repo_name = item["repository"]["name"] # Just the repo name, not full_name + file_path = item["path"] + if repo_name not in target_files: target_files[repo_name] = [] - + if file_path not in target_files[repo_name]: target_files[repo_name].append(file_path) - - logging.info(f"[get_target_files_from_search] Found {len(target_files)} repositories with {sum(len(files) for files in target_files.values())} total files") - + + logging.info( + f"[get_target_files_from_search] Found {len(target_files)} repositories with {sum(len(files) for files in target_files.values())} total files" + ) + # Log summary of repositories found for repo_name, files in target_files.items(): - logging.info(f"[get_target_files_from_search] {repo_name}: {len(files)} files") - + logging.info( + f"[get_target_files_from_search] {repo_name}: {len(files)} files" + ) + # Cache the results _SEARCH_CACHE[org] = target_files - + return target_files - + except Exception as e: logging.error(f"[get_target_files_from_search] Error searching for files: {e}") - logging.warning(f"[get_target_files_from_search] No fallback available - search failed") + logging.warning( + f"[get_target_files_from_search] No fallback available - search failed" + ) return {} @@ -164,26 +175,32 @@ def get_target_repos(org: str, filter_repos: Optional[List[str]] = None) -> List """Get only the repositories that have files with 'pytorch-labs' mentions.""" # Get target files from search (with fallback to hardcoded list) target_files = get_target_files_from_search(org) - + if not target_files: logging.info(f"[get_target_repos] No target files found for org: {org}") return [] - + all_repos = list(target_files.keys()) - + if filter_repos: # Filter to only include repos that are in both the target files and the filter list repos = [repo for repo in all_repos if repo in filter_repos] - logging.info(f"[get_target_repos] Filtered to {len(repos)} repositories from {len(all_repos)} available") - + logging.info( + f"[get_target_repos] Filtered to {len(repos)} repositories from {len(all_repos)} available" + ) + # Log which repos were filtered out filtered_out = [repo for repo in 
filter_repos if repo not in all_repos] if filtered_out: - logging.warning(f"[get_target_repos] Repositories not found in target files: {filtered_out}") + logging.warning( + f"[get_target_repos] Repositories not found in target files: {filtered_out}" + ) else: repos = all_repos - logging.info(f"[get_target_repos] Found {len(repos)} repositories with target files for org: {org}") - + logging.info( + f"[get_target_repos] Found {len(repos)} repositories with target files for org: {org}" + ) + return repos @@ -200,13 +217,17 @@ def get_target_files_for_repo(org: str, repo: str) -> List[str]: """Get the list of target files for a specific repository.""" # Get target files from search (with fallback to hardcoded list) target_files = get_target_files_from_search(org) - + if repo not in target_files: - logging.info(f"[get_target_files_for_repo] No target files found for {org}/{repo}") + logging.info( + f"[get_target_files_for_repo] No target files found for {org}/{repo}" + ) return [] - + files = target_files[repo] - logging.info(f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}") + logging.info( + f"[get_target_files_for_repo] Found {len(files)} target files for {org}/{repo}" + ) return files @@ -216,12 +237,14 @@ def get_file_content(org: str, repo: str, file_path: str) -> Optional[str]: data = make_cached_request(url, HEADERS) if not data: return None - + # Check file size if data.get("size", 0) > MAX_FILE_SIZE: - logging.warning(f"[get_file_content] File {file_path} too large ({data['size']} bytes), skipping") + logging.warning( + f"[get_file_content] File {file_path} too large ({data['size']} bytes), skipping" + ) return None - + # Decode content try: content = base64.b64decode(data["content"]).decode("utf-8") @@ -231,130 +254,153 @@ def get_file_content(org: str, repo: str, file_path: str) -> Optional[str]: return None -def find_and_replace_in_file(org: str, repo: str, file_path: str) -> Optional[Tuple[str, str]]: +def find_and_replace_in_file( + org: str, repo: str, file_path: str +) -> Optional[Tuple[str, str]]: """Find and replace text in a file. 
Returns (old_content, new_content) if changes needed.""" content = get_file_content(org, repo, file_path) if content is None: return None - + # Check if file contains the target text if OLD_TEXT not in content: return None - + # Replace all instances new_content = content.replace(OLD_TEXT, NEW_TEXT) - + # Check if any changes were made if new_content == content: return None - - logging.info(f"[find_and_replace_in_file] Found {content.count(OLD_TEXT)} instances in {file_path}") + + logging.info( + f"[find_and_replace_in_file] Found {content.count(OLD_TEXT)} instances in {file_path}" + ) return content, new_content def create_branch(org: str, repo: str, base_branch: str, new_branch: str) -> bool: """Create a new branch from the base branch.""" if DRY_RUN: - logging.info(f"[create_branch] DRY RUN: Would create branch {new_branch} in {org}/{repo}") + logging.info( + f"[create_branch] DRY RUN: Would create branch {new_branch} in {org}/{repo}" + ) return True - + # Get the SHA of the base branch url = f"{BASE_URL}/repos/{org}/{repo}/branches/{base_branch}" branch_data = make_cached_request(url, HEADERS) if not branch_data: logging.error(f"[create_branch] Failed to get base branch {base_branch}") return False - + base_sha = branch_data["commit"]["sha"] - + # Create the new branch url = f"{BASE_URL}/repos/{org}/{repo}/git/refs" - data = { - "ref": f"refs/heads/{new_branch}", - "sha": base_sha - } - + data = {"ref": f"refs/heads/{new_branch}", "sha": base_sha} + response = requests.post(url, headers=HEADERS, json=data) if response.status_code == 201: logging.info(f"[create_branch] Created branch {new_branch} in {org}/{repo}") return True elif response.status_code == 422: # Branch already exists - logging.info(f"[create_branch] Branch {new_branch} already exists in {org}/{repo}") + logging.info( + f"[create_branch] Branch {new_branch} already exists in {org}/{repo}" + ) return True else: - logging.error(f"[create_branch] Failed to create branch {new_branch}: {response.status_code} - {response.text}") + logging.error( + f"[create_branch] Failed to create branch {new_branch}: {response.status_code} - {response.text}" + ) return False -def create_file_commit(org: str, repo: str, file_path: str, content: str, branch: str, message: str) -> bool: +def create_file_commit( + org: str, repo: str, file_path: str, content: str, branch: str, message: str +) -> bool: """Create a commit to update a file.""" if DRY_RUN: - logging.info(f"[create_file_commit] DRY RUN: Would update {file_path} in {org}/{repo}") + logging.info( + f"[create_file_commit] DRY RUN: Would update {file_path} in {org}/{repo}" + ) return True - + # First get the current file to get its SHA url = f"{BASE_URL}/repos/{org}/{repo}/contents/{file_path}" current_file_data = make_cached_request(url, HEADERS) if not current_file_data: - logging.error(f"[create_file_commit] Failed to get current file data for {file_path}") + logging.error( + f"[create_file_commit] Failed to get current file data for {file_path}" + ) return False - + current_sha = current_file_data.get("sha") if not current_sha: logging.error(f"[create_file_commit] No SHA found for {file_path}") return False - + # Update the file with the SHA data = { "message": message, "content": base64.b64encode(content.encode("utf-8")).decode("utf-8"), "sha": current_sha, - "branch": branch + "branch": branch, } - + response = requests.put(url, headers=HEADERS, json=data) if response.status_code in [200, 201]: logging.info(f"[create_file_commit] Updated {file_path} in {org}/{repo}") return True 
else: - logging.error(f"[create_file_commit] Failed to update {file_path}: {response.status_code} - {response.text}") + logging.error( + f"[create_file_commit] Failed to update {file_path}: {response.status_code} - {response.text}" + ) return False def check_existing_pr(org: str, repo: str, title: str) -> Optional[str]: """Check if there's already an open PR with the same title. Returns PR URL if found, None otherwise.""" url = f"{BASE_URL}/repos/{org}/{repo}/pulls?state=open&per_page=100" - + # Don't use cache for PR checks since PR status can change quickly - logging.info(f"[check_existing_pr] Making direct request to check PRs for {org}/{repo}") + logging.info( + f"[check_existing_pr] Making direct request to check PRs for {org}/{repo}" + ) try: response = requests.get(url, headers=HEADERS) response.raise_for_status() data = response.json() - + for pr in data: - if pr.get("title") == title: - pr_url = pr['html_url'] - logging.info(f"[check_existing_pr] Found existing open PR with same title in {org}/{repo}: {pr_url}") + if pr.get("title", "").startswith(title): + pr_url = pr["html_url"] + logging.info( + f"[check_existing_pr] Found existing open PR with same title in {org}/{repo}: {pr_url}" + ) return pr_url - + logging.info(f"[check_existing_pr] No existing PR found for {org}/{repo}") return None - + except requests.exceptions.RequestException as e: logging.warning(f"[check_existing_pr] Failed to get PRs for {org}/{repo}: {e}") return None except json.JSONDecodeError as e: - logging.warning(f"[check_existing_pr] Failed to parse JSON response for {org}/{repo}: {e}") + logging.warning( + f"[check_existing_pr] Failed to parse JSON response for {org}/{repo}: {e}" + ) return None -def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> Optional[str]: +def create_pull_request( + org: str, repo: str, branch: str, base_branch: str +) -> Optional[str]: """Create a pull request and return the PR URL.""" if DRY_RUN: logging.info(f"[create_pull_request] DRY RUN: Would create PR for {org}/{repo}") return "DRY_RUN_PR_URL" - + url = f"{BASE_URL}/repos/{org}/{repo}/pulls" data = { "title": f"[EZ] Replace `pytorch-labs` with `meta-pytorch`", @@ -362,7 +408,6 @@ def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> O ## Changes Made - Replaced all occurrences of `pytorch-labs` with `meta-pytorch` -- Only modified files with extensions: .py, .md, .sh, .rst, .cpp, .h, .txt, .yml - Skipped binary files and files larger than 1MB due to GitHub api payload limits in the script to cover all repos in this org. 
Will do a more manual second pass later to cover any larger files ## Files Modified @@ -370,9 +415,9 @@ def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> O Generated by automated script on {datetime.now(timezone.utc).isoformat()}Z""", "head": branch, - "base": base_branch + "base": base_branch, } - + response = requests.post(url, headers=HEADERS, json=data) if response.status_code == 201: pr_data = response.json() @@ -380,88 +425,107 @@ def create_pull_request(org: str, repo: str, branch: str, base_branch: str) -> O logging.info(f"[create_pull_request] Created PR: {pr_url}") return pr_url else: - logging.error(f"[create_pull_request] Failed to create PR: {response.status_code} - {response.text}") + logging.error( + f"[create_pull_request] Failed to create PR: {response.status_code} - {response.text}" + ) return None def process_repository(org: str, repo: str) -> Dict: """Process a single repository for text replacement.""" logging.info(f"[process_repository] Processing repository: {org}/{repo}") - + result = { "repo": repo, "status": "skipped", "files_changed": 0, "pr_url": None, - "error": None + "error": None, } - + try: # Check for existing PR first (before doing any work) - pr_title = f"[EZ] Replace `pytorch-labs` with `meta-pytorch`" - existing_pr_url = check_existing_pr(org, repo, pr_title) + pr_title_prefix = f"[EZ] Replace `pytorch-labs`" + existing_pr_url = check_existing_pr(org, repo, pr_title_prefix) if existing_pr_url: result["status"] = "skipped_existing_pr" result["pr_url"] = existing_pr_url result["error"] = "Existing open PR with same title found" - logging.info(f"[process_repository] Skipping {org}/{repo} - existing open PR found: {existing_pr_url}") + logging.info( + f"[process_repository] Skipping {org}/{repo} - existing open PR found: {existing_pr_url}" + ) return result - + # Get default branch default_branch = get_default_branch(org, repo) if not default_branch: result["error"] = "Failed to get default branch" return result - + # Get target files for this repository target_files = get_target_files_for_repo(org, repo) if not target_files: logging.info(f"[process_repository] No target files found for {org}/{repo}") return result - + # Check each target file for replacements changes = [] for file_path in target_files: replacement = find_and_replace_in_file(org, repo, file_path) if replacement: old_content, new_content = replacement - changes.append({ - "path": file_path, - "old_content": old_content, - "new_content": new_content - }) - + changes.append( + { + "path": file_path, + "old_content": old_content, + "new_content": new_content, + } + ) + if not changes: logging.info(f"[process_repository] No changes needed in {org}/{repo}") return result - + result["files_changed"] = len(changes) - logging.info(f"[process_repository] Found {len(changes)} files to update in {org}/{repo}") - + logging.info( + f"[process_repository] Found {len(changes)} files to update in {org}/{repo}" + ) + if DRY_RUN: result["status"] = "dry_run" - logging.info(f"[process_repository] DRY RUN: Would update {len(changes)} files in {org}/{repo}") + logging.info( + f"[process_repository] DRY RUN: Would update {len(changes)} files in {org}/{repo}" + ) return result - + # Create new branch branch_name = f"replace-pytorch-labs-{datetime.now(timezone.utc).strftime('%Y%m%d-%H%M%S')}" if not create_branch(org, repo, default_branch, branch_name): result["error"] = "Failed to create branch" return result - + # Commit changes - commit_message = f"Replace 'pytorch-labs' with 
'meta-pytorch' in {len(changes)} files" + commit_message = ( + f"Replace 'pytorch-labs' with 'meta-pytorch' in {len(changes)} files" + ) all_success = True - + for change in changes: - if not create_file_commit(org, repo, change["path"], change["new_content"], branch_name, commit_message): + if not create_file_commit( + org, + repo, + change["path"], + change["new_content"], + branch_name, + commit_message, + ): all_success = False break - + if not all_success: result["error"] = "Failed to commit some files" return result - + # Create pull request pr_url = create_pull_request(org, repo, branch_name, default_branch) if pr_url: @@ -469,11 +533,11 @@ def process_repository(org: str, repo: str) -> Dict: result["status"] = "success" else: result["error"] = "Failed to create pull request" - + except Exception as e: logging.error(f"[process_repository] Error processing {org}/{repo}: {e}") result["error"] = str(e) - + return result @@ -533,9 +597,10 @@ def main(): logging.info(f"[main] Processing repository {i}/{len(repos)}: {repo}") result = process_repository(ORG_NAME, repo) results.append(result) - + # Add a small delay to be respectful to the API import time + time.sleep(1) # Generate summary @@ -543,7 +608,9 @@ def main(): dry_run = [r for r in results if r["status"] == "dry_run"] skipped = [r for r in results if r["status"] == "skipped"] skipped_existing_pr = [r for r in results if r["status"] == "skipped_existing_pr"] - errors = [r for r in results if r["error"] and r["status"] not in ["skipped_existing_pr"]] + errors = [ + r for r in results if r["error"] and r["status"] not in ["skipped_existing_pr"] + ] print(f"\n=== SUMMARY ===") print(f"Organization: {ORG_NAME}") @@ -554,23 +621,29 @@ def main(): print(f"Skipped (existing PR): {len(skipped_existing_pr)}") print(f"Errors: {len(errors)}") print("\n") - + if skipped_existing_pr: print(f"=== SKIPPED (existing PRs) ===") for result in skipped_existing_pr: - print(f"- {result['repo']}: {result['files_changed']} files would be updated, but existing PR found: {result['pr_url']}") + print( + f"- {result['repo']}: {result['files_changed']} files would be updated, but existing PR found: {result['pr_url']}" + ) print("\n") if successful: print(f"=== SUCCESSFUL PRs ===") for result in successful: - print(f"- {result['repo']}: {result['pr_url']} ({result['files_changed']} files)") + print( + f"- {result['repo']}: {result['pr_url']} ({result['files_changed']} files)" + ) print("\n") if dry_run: print(f"=== DRY RUN (would create PRs) ===") for result in dry_run: - print(f"- {result['repo']}: {result['files_changed']} files would be updated") + print( + f"- {result['repo']}: {result['files_changed']} files would be updated" + ) print("\n") if errors: @@ -589,4 +662,4 @@ def main(): if __name__ == "__main__": - main() \ No newline at end of file + main()
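
For reference, a minimal usage sketch of the search helper that remove_pytorch_labs.py drives through get_target_files_from_search(). This is an informal illustration, not part of the patch: it assumes GITHUB_TOKEN is set in the environment (e.g. via a .env file), that the snippet runs from tools/analytics/org/ so the module imports directly, and the query string is only an example.

    # Sketch: exercise search_github_code() as defined in github_code_search.py above.
    from github_code_search import search_github_code

    results = search_github_code(
        query="org:meta-pytorch pytorch-labs",  # example query, not from the patch
        max_results=50,
        verbose=True,
    )

    # GitHubSearchResults is a TypedDict; keys below match the schema in the diff.
    print(f"Retrieved {results['retrieved_count']} of {results['total_count']} matches")
    for item in results["items"]:
        print(item["repository"]["full_name"], item["path"])

The same results object can then be narrowed with GitHubCodeSearch.filter_by_repository() or filter_by_score() before being grouped by repository, which is what get_target_files_from_search() does when building its repo-to-file-paths map.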