From 0d77bc7ef52b58f3a5b0d0a4a6a0f21e7adb2961 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 08:56:26 -0800
Subject: [PATCH 01/32] Support --gather in bug_gen_modal.py

---
 scripts/bug_gen_modal.py   | 252 ++++++++++++++++++++++++++++++++++---
 swesmith/harness/gather.py |  14 ++-
 swesmith/profiles/base.py  |   2 +-
 3 files changed, 242 insertions(+), 26 deletions(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index e9c54f2a..9ac529e7 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -1018,7 +1018,7 @@ async def check_with_sem(repo_tuple):
         completed = 0
         total_bugs = 0
 
-        for result_or_exc in generate_bugs_remote.map(
+        async for result_or_exc in generate_bugs_remote.map.aio(
             repo_names,
             kwargs={
                 "max_bugs": args.max_bugs,
@@ -1475,6 +1475,11 @@ async def process_single_task(task: dict) -> dict:
         report_volume_path = (
             f"{lang}/run_validation/{repo_id}/{instance_id}/report.json"
         )
+        # Write patch file to volume (required for gather step)
+        await volume_write_text(
+            f"{lang}/run_validation/{repo_id}/{instance_id}/patch.diff",
+            task["patch"],
+        )
 
         postgold_config = {
             "test_cmd": task["profile"].test_cmd,
@@ -1625,6 +1630,199 @@ def print_summary(results: list[dict], repos_count: int):
     )
 
 
+# ============================================================================
+# Gather Phase (Create Task Instances & Push Branches)
+# ============================================================================
+
+
+@app.function(
+    image=generator_image,
+    secrets=[modal.Secret.from_name("GITHUB_TOKEN")],
+    timeout=MODAL_TIMEOUT,
+    volumes={LOGS_MOUNT_PATH: logs_volume},
+)
+def gather_remote(
+    repo_name: str,
+    language: str,
+    repush_image: bool = False,
+    override_branch: bool = False,
+) -> dict:
+    """Run gather.py for a repository to create task instances and push branches."""
+    import os
+    import sys
+    import subprocess
+    import traceback
+    from pathlib import Path
+
+    # Ensure swesmith is in path
+    if "/root" not in sys.path:
+        sys.path.append("/root")
+
+    from swesmith.profiles import registry
+
+    # Resolve repo ID
+    def resolve_repo_id():
+        try:
+            return registry.get_from_inst(
+                {"repo": repo_name, "instance_id": "dummy"}
+            ).repo_name
+        except Exception:
+            target = repo_name.replace("/", "__")
+            candidates = [key for key in registry.keys() if target in key]
+            return candidates[0] if candidates else repo_name
+
+    repo_id = resolve_repo_id()
+    print(f"Gathering for {repo_name} (ID: {repo_id})")
+
+    # Setup environment to satisfy gather.py expectations
+    # 1. gather.py expects logs/run_validation to contain the repo logs
+    # 2. gather.py writes to logs/task_insts
+
+    work_dir = Path("/root")
+    logs_link_dir = work_dir / "logs"
+    logs_link_dir.mkdir(exist_ok=True)
+    
+    # Configure git authentication
+    if "GITHUB_TOKEN" in os.environ:
+        token = os.environ["GITHUB_TOKEN"]
+        print(f"DEBUG: Found GITHUB_TOKEN (len={len(token)}). Configuring git auth...")
+        
+        # Use simpler authenticated URL format for PATs
+        subprocess.run(
+            ["git", "config", "--global", f"url.https://{token}@github.com/.insteadOf", "https://github.com/"],
+            check=True
+        )
+        # Also configure user info
+        subprocess.run(["git", "config", "--global", "user.email", "swesmith@swesmith.ai"], check=False)
+        subprocess.run(["git", "config", "--global", "user.name", "swesmith"], check=False)
+    else:
+        print("Warning: GITHUB_TOKEN not found in environment. Git push may fail.")
+
+    # Link run_validation: logs/run_validation -> /logs/{language}/run_validation
+    # We use the mounted volume path directly as the target
+    validation_source = Path(LOGS_MOUNT_PATH) / language / "run_validation"
+    validation_link = logs_link_dir / "run_validation"
+
+    # Link task_insts: logs/task_insts -> /logs/task_insts (volume root)
+    task_insts_source = Path(LOGS_MOUNT_PATH) / "task_insts"
+    task_insts_link = logs_link_dir / "task_insts"
+
+    try:
+        # Ensure sources exist on volume
+        task_insts_source.mkdir(parents=True, exist_ok=True)
+        if not validation_source.exists():
+             return {"repo": repo_name, "status": "skipped", "reason": "No validation logs"}
+
+        # Create symlinks
+        if not validation_link.exists():
+            os.symlink(str(validation_source), str(validation_link))
+        
+        if not task_insts_link.exists():
+            os.symlink(str(task_insts_source), str(task_insts_link))
+
+        # Check if there are actually validation logs for this repo
+        repo_vals = validation_link / repo_id
+        if not repo_vals.exists():
+            return {"repo": repo_name, "status": "skipped", "reason": "No logs for repo"}
+        
+        # Build command
+        # python -m swesmith.harness.gather logs/run_validation/<repo_id>
+        cmd = [
+            sys.executable,
+            "-m", "swesmith.harness.gather",
+            str(Path("logs/run_validation") / repo_id),
+            "-v",
+            "-d",
+        ]
+        
+        if repush_image:
+            cmd.append("--repush_image")
+        if override_branch:
+            cmd.append("--override_branch")
+            
+        print(f"Running: {' '.join(cmd)}")
+            
+        # execution
+        result = subprocess.run(
+            cmd,
+            cwd=str(work_dir),
+            capture_output=True,
+            text=True,
+            env=os.environ
+        )
+        
+        if result.returncode != 0:
+            print("Gather failed:")
+            print(result.stdout)
+            print(result.stderr)
+            return {
+                "repo": repo_name, 
+                "status": "failed", 
+                "stdout": result.stdout, 
+                "stderr": result.stderr
+            }
+        else:
+            print("Gather succeeded:")
+            print(result.stdout)
+            print(result.stderr)
+        
+        return {
+            "repo": repo_name,
+            "status": "success",
+            "stdout": result.stdout,
+            "stderr": result.stderr
+        }
+
+    except Exception as e:
+        traceback.print_exc()
+        return {"repo": repo_name, "status": "error", "error": str(e)}
+
+
+async def run_gather_phase_async(repos: list[str], language: str, args) -> None:
+    """Run gather phase for all repos in parallel."""
+    print(f"\n{'#' * 60}")
+    print(f"# PHASE 3: GATHER ({len(repos)} repos)")
+    print(f"{'#' * 60}\n")
+    
+    # We can pass repush_image and override_branch via args if they existed, 
+    # but for now we'll assume defaults or add them to args class if needed.
+    repush = getattr(args, "repush_image", False)
+    override = getattr(args, "override_branch", False)
+    
+    completed = 0
+    success = 0
+    
+    print(f"Starting gather for {len(repos)} repos...")
+    
+    async for result in gather_remote.map.aio(
+        repos,
+        kwargs={
+            "language": language,
+            "repush_image": repush,
+            "override_branch": override,
+        }
+    ):
+        completed += 1
+        repo = result.get("repo", "unknown")
+        status = result.get("status", "unknown")
+        
+        if status == "success":
+            success += 1
+            print(f"  [{completed}/{len(repos)}] {repo}: Success")
+            # Print last few lines of stdout to see "Wrote X instances"
+            if "stdout" in result:
+                lines = result["stdout"].splitlines()
+                for line in lines[-5:]:
+                    print(f"    | {line}")
+        elif status == "skipped":
+            print(f"  [{completed}/{len(repos)}] {repo}: Skipped ({result.get('reason')})")
+        else:
+            err = result.get("error") or "Non-zero exit code"
+            print(f"  [{completed}/{len(repos)}] {repo}: Failed - {err}")
+            
+    print(f"\nGather complete: {success}/{len(repos)} repos processed successfully.\n")
+
+
 # ============================================================================
 # Stats Display
 # ============================================================================
@@ -1941,6 +2139,7 @@ async def main(
     max_candidates: int = 2000,
     max_concurrent_tests: int = 900,
     show_stats: bool = False,
+    gather: bool = False,
 ):
     """
     Modal Bug Generation & Validation script.
@@ -1948,6 +2147,7 @@ async def main(
     Runs two phases:
     1. Generation: Creates bugs for repos (skips repos that are already done/failed)
     2. Validation: Validates all patches from the volume
+    3. Gather: Creates task instances and pushes branches
 
     Run with: modal run scripts/bug_gen.py [OPTIONS]
 
@@ -1960,6 +2160,7 @@ async def main(
         max_candidates: Max candidates to process, -1 for all (default: 2000)
         max_concurrent_tests: Max concurrent tests (default: 900)
         show_stats: If True, show bug breakdown stats and exit without running generation/validation
+        gather: If True, only run the gather phase (skip generation and validation)
     """
     # Handle --show-stats early exit
     if show_stats:
@@ -2001,27 +2202,38 @@ class Args:
     args.max_candidates = max_candidates
 
     # Phase 1: Generation (skips repos that are already done/failed)
-    generation_results = await run_generation_phase(target_repos, args, language)
+    if not gather:
+        generation_results = await run_generation_phase(target_repos, args, language)
 
-    # Phase 2: Validation - collect ALL patches from volume (not just from this run)
-    print(f"\n{'#' * 60}")
-    print("# PHASE 2: VALIDATION")
-    print(f"{'#' * 60}\n")
+        # Phase 2: Validation - collect ALL patches from volume (not just from this run)
+        print(f"\n{'#' * 60}")
+        print("# PHASE 2: VALIDATION")
+        print(f"{'#' * 60}\n")
 
-    print("Collecting patches from volume...")
-    all_patches = await collect_patches_from_files(target_repos, language)
-    print(f"Total: {len(all_patches)} patches\n")
+        print("Collecting patches from volume...")
+        all_patches = await collect_patches_from_files(target_repos, language)
+        print(f"Total: {len(all_patches)} patches\n")
 
-    results = await run_validation_phase_async(
-        all_patches, max_concurrent_tests, ENV_NAME
-    )
+        results = await run_validation_phase_async(
+            all_patches, max_concurrent_tests, ENV_NAME
+        )
+
+        if results:
+            print_summary(results, len(build_repos_with_patches(all_patches)))
+
+        # Report generation errors from this run
+        errors = [r for r in generation_results if "error" in r]
+        if errors:
+            print(f"\nGeneration Errors ({len(errors)}):")
+            for err in errors:
+                print(f"  - {err['repo']}: {err.get('error', 'Unknown')}")
+    else:
+        results = []
+
+    # Phase 3: Gather (Create task instances & Push branches)
+    if not results and not gather:
+        print("No validation results found. Skipping gather phase.")
+        return
 
-    if results:
-        print_summary(results, len(build_repos_with_patches(all_patches)))
+    await run_gather_phase_async(target_repos, language, args)
 
-    # Report generation errors from this run
-    errors = [r for r in generation_results if "error" in r]
-    if errors:
-        print(f"\nGeneration Errors ({len(errors)}):")
-        for err in errors:
-            print(f"  - {err['repo']}: {err.get('error', 'Unknown')}")
diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 5b294594..a01a093f 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -8,7 +8,7 @@
     "patch":
     "test_patch":
     "problem_statement":
-    "FAIL_TO_PASS":
+    "PASS_TO_FAIL":
     "PASS_TO_PASS":
     "version":
 }
@@ -34,7 +34,7 @@
 
 from pathlib import Path
 from swebench.harness.constants import (
-    FAIL_TO_PASS,
+    PASS_TO_FAIL,
     PASS_TO_PASS,
     KEY_INSTANCE_ID,
     LOG_REPORT,
@@ -187,16 +187,20 @@ def _main(
         if not os.path.exists(path_results):
             stats = skip_print(f"{subfolder}: No results", pbar, stats, verbose)
             continue
+            
+        if not os.path.exists(path_patch):
+            stats = skip_print(f"{subfolder}: No patch.diff", pbar, stats, verbose)
+            continue
 
         with open(path_results) as f:
             results = json.load(f)
-        if FAIL_TO_PASS not in results or PASS_TO_PASS not in results:
+        if PASS_TO_FAIL not in results or PASS_TO_PASS not in results:
             stats = skip_print(
                 f"{subfolder}: No validatable bugs", pbar, stats, verbose
             )
             continue
 
-        n_f2p = len(results[FAIL_TO_PASS])
+        n_f2p = len(results[PASS_TO_FAIL])
         n_p2p = len(results[PASS_TO_PASS])
         pr_exception = (
             ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0
@@ -216,7 +220,7 @@ def _main(
         task_instance = {
             KEY_INSTANCE_ID: subfolder,
             KEY_PATCH: patch_content,
-            FAIL_TO_PASS: results[FAIL_TO_PASS],
+            PASS_TO_FAIL: results[PASS_TO_FAIL],
             PASS_TO_PASS: results[PASS_TO_PASS],
         }
         rp = registry.get_from_inst(task_instance)
diff --git a/swesmith/profiles/base.py b/swesmith/profiles/base.py
index f1c4e068..af23febb 100644
--- a/swesmith/profiles/base.py
+++ b/swesmith/profiles/base.py
@@ -290,7 +290,7 @@ def clone(self, dest: str | None = None) -> tuple[str, bool]:
             token = os.getenv("GITHUB_TOKEN")
             if token:
                 base_url = (
-                    f"https://x-access-token:{token}@github.com/{self.mirror_name}.git"
+                    f"https://{token}@github.com/{self.mirror_name}.git"
                 )
             else:
                 base_url = f"git@github.com:{self.mirror_name}.git"

From 14659088e29c5ebdff47378e6ab775d1a7f0fe45 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 11:56:21 -0800
Subject: [PATCH 02/32] Update --gather to store to /logs/{language}/task_insts

---
 scripts/bug_gen_modal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 9ac529e7..ad403238 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -1704,7 +1704,7 @@ def resolve_repo_id():
     validation_link = logs_link_dir / "run_validation"
 
     # Link task_insts: logs/task_insts -> /logs/task_insts (volume root)
-    task_insts_source = Path(LOGS_MOUNT_PATH) / "task_insts"
+    task_insts_source = Path(LOGS_MOUNT_PATH) / language / "task_insts"
     task_insts_link = logs_link_dir / "task_insts"
 
     try:

From 5d0cf3d5c9bbcec711865f80d7089b9e7753d260 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 12:08:15 -0800
Subject: [PATCH 03/32] Only write out JSON if the task instance list is not empty

---
 swesmith/harness/gather.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index a01a093f..73da2631 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -347,10 +347,12 @@ def _main(
                 print(f"[{repo}] Rebuilding + pushing image")
                 registry.get(repo).push_image(rebuild_image=True)
 
-    task_instances_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(task_instances_path, "w") as f:
-        json.dump(task_instances, f, indent=4)
-    print(f"Wrote {len(task_instances)} instances to {task_instances_path}")
+    if len(task_instances) > 0:
+        task_instances_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(task_instances_path, "w") as f:
+            json.dump(task_instances, f, indent=4)
+        print(f"Wrote {len(task_instances)} instances to {task_instances_path}")
+    
     print(f"- {stats['skipped']} skipped")
     print(f"- {stats['new_tasks']} new instances")
 

From 96efde45436b767158104070715c61b8bd0f5a4d Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 12:11:36 -0800
Subject: [PATCH 04/32] Doubled Modal sandbox timeout to 20 minutes to account
 for repos that take longer to gather

---
 scripts/bug_gen_modal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index ad403238..f553c297 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -124,7 +124,7 @@ def custom_exception_handler(loop, context):
 APP_NAME = "swesmith-bug-gen"
 VOLUME_NAME = "swesmith-bug-gen"
 MINUTES = 60
-MODAL_TIMEOUT = 10 * MINUTES
+MODAL_TIMEOUT = 20 * MINUTES
 SANDBOX_RATE_LIMIT = 4  # Modal limits to 5/s, use 4 to be safe
 
 LANGUAGE_TO_BASE_CLASS = {

From d065420889410c21b8dc8dc36e50c7e39eb158c1 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 12:53:40 -0800
Subject: [PATCH 05/32] feat: parallelize gather.py and fix thread safety

---
 swesmith/harness/gather.py | 275 +++++++++++++++++++++++--------------
 swesmith/profiles/base.py  |   9 +-
 2 files changed, 176 insertions(+), 108 deletions(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 73da2631..c169515e 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -31,6 +31,8 @@
 import os
 import shutil
 import subprocess
+import concurrent.futures
+import functools
 
 from pathlib import Path
 from swebench.harness.constants import (
@@ -98,24 +100,25 @@ def check_if_branch_exists(
     main_branch: str,
     override_branch: bool,
     verbose: bool,
+    subprocess_args: dict,
 ):
     branch_exists = None
     try:
-        subprocess.run(f"git checkout {subfolder}", cwd=repo_name, **SUBPROCESS_ARGS)
+        subprocess.run(f"git checkout {subfolder}", cwd=repo_name, **subprocess_args)
         if override_branch:
             # Delete the branch remotely
             subprocess.run(
                 f"git push --delete origin {subfolder}",
                 cwd=repo_name,
-                **SUBPROCESS_ARGS,
+                **subprocess_args,
             )
             if verbose:
                 print(f"[{subfolder}] Overriding existing branch")
             branch_exists = False
         else:
             branch_exists = True
-        subprocess.run(f"git checkout {main_branch}", cwd=repo_name, **SUBPROCESS_ARGS)
-        subprocess.run(f"git branch -D {subfolder}", cwd=repo_name, **SUBPROCESS_ARGS)
+        subprocess.run(f"git checkout {main_branch}", cwd=repo_name, **subprocess_args)
+        subprocess.run(f"git branch -D {subfolder}", cwd=repo_name, **subprocess_args)
     except Exception:
         branch_exists = False
         pass
@@ -172,69 +175,139 @@ def _main(
         print(f"Found {len(task_instances)} existing task instances")
         subfolders = [x for x in subfolders if x not in completed_ids]
 
+    completed_ids = set(completed_ids)  # Optimize lookup
+    subfolders_to_process = [x for x in subfolders if x not in completed_ids]
+
+    print(f"Will process {len(subfolders_to_process)} instances")
+    
+    # Determine number of workers
+    n_workers = int(os.environ.get("MAX_WORKERS", os.cpu_count() or 1))
+    print(f"Using {n_workers} workers")
+
+    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
+        # Create a partial function with fixed arguments
+        func = functools.partial(
+            process_instance,
+            validation_logs_path=validation_logs_path,
+            override_branch=override_branch,
+            debug_subprocess=debug_subprocess,
+            verbose=verbose,
+        )
+
+        results = list(tqdm(
+            executor.map(func, sorted(subfolders_to_process)),
+            total=len(subfolders_to_process),
+            desc="Conversion"
+        ))
+
+    # Aggregate results
     stats = {"new_tasks": 0, "skipped": 0}
-    print(f"Will process {len(subfolders)} instances")
-    pbar = tqdm(subfolders, desc="Conversion", disable=verbose)
-    for subfolder in sorted(subfolders):
-        if subfolder.endswith(REF_SUFFIX) or subfolder in completed_ids:
-            # Skip reference run or instances that have been completed
-            stats = skip_print(f"{subfolder}: Reference", pbar, stats, verbose)
-            continue
-
-        path_results = os.path.join(validation_logs_path, subfolder, LOG_REPORT)
-        path_patch = os.path.join(validation_logs_path, subfolder, "patch.diff")
-
-        if not os.path.exists(path_results):
-            stats = skip_print(f"{subfolder}: No results", pbar, stats, verbose)
-            continue
-            
-        if not os.path.exists(path_patch):
-            stats = skip_print(f"{subfolder}: No patch.diff", pbar, stats, verbose)
-            continue
-
-        with open(path_results) as f:
-            results = json.load(f)
-        if PASS_TO_FAIL not in results or PASS_TO_PASS not in results:
-            stats = skip_print(
-                f"{subfolder}: No validatable bugs", pbar, stats, verbose
-            )
-            continue
-
-        n_f2p = len(results[PASS_TO_FAIL])
-        n_p2p = len(results[PASS_TO_PASS])
-        pr_exception = (
-            ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0
-        )  # TODO: Better way to determine if it's a PR miror?
-        if not pr_exception and (KEY_TIMED_OUT in results or n_f2p == 0 or n_p2p == 0):
-            # Skip instances that timed out OR don't have F2P or P2P
-            stats = skip_print(
-                f"{subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}",
-                pbar,
-                stats,
-                verbose,
-            )
-            continue
-
-        with open(path_patch) as f:
-            patch_content = f.read()
-        task_instance = {
-            KEY_INSTANCE_ID: subfolder,
-            KEY_PATCH: patch_content,
-            PASS_TO_FAIL: results[PASS_TO_FAIL],
-            PASS_TO_PASS: results[PASS_TO_PASS],
-        }
-        rp = registry.get_from_inst(task_instance)
-        task_instance[KEY_IMAGE_NAME] = rp.image_name
-        task_instance["repo"] = rp.mirror_name
-
-        # Clone repository
-        _, cloned = rp.clone()
+    for res_tasks, res_repos, res_stats in results:
+        task_instances.extend(res_tasks)
+        created_repos.update(res_repos)
+        for k, v in res_stats.items():
+            stats[k] += v
+
+    if len(created_repos) > 0:
+        if repush_image:
+            print("Rebuilding + pushing images...")
+            for repo in created_repos:
+                 print(f"[{repo}] Rebuilding + pushing image")
+                 registry.get(repo).push_image(rebuild_image=True)
+
+    if len(task_instances) > 0:
+        task_instances_path.parent.mkdir(parents=True, exist_ok=True)
+        with open(task_instances_path, "w") as f:
+            json.dump(task_instances, f, indent=4)
+        print(f"Wrote {len(task_instances)} instances to {task_instances_path}")
+    
+    print(f"- {stats['skipped']} skipped")
+    print(f"- {stats['new_tasks']} new instances")
+
+
+def process_instance(
+    subfolder: str,
+    validation_logs_path: Path,
+    override_branch: bool,
+    debug_subprocess: bool,
+    verbose: bool,
+) -> tuple[list[dict], set[str], dict]:
+    """
+    Process a single task instance.
+    Returns:
+        task_instances: list of created task instances
+        created_repos: set of repository names that were cloned
+        stats: dictionary of statistics
+    """
+    stats = {"new_tasks": 0, "skipped": 0}
+    task_instances = []
+    created_repos = set()
+    
+    # Use a unique temporary directory for this process/task to avoid collision
+    # We append process ID or random string to repo path
+    import multiprocessing
+    pid = multiprocessing.current_process().pid
+    
+    # Define subprocess args locally to avoid global state issues with multiprocessing
+    subprocess_args = SUBPROCESS_ARGS.copy()
+    if not debug_subprocess:
+        subprocess_args["stdout"] = subprocess.DEVNULL
+        subprocess_args["stderr"] = subprocess.DEVNULL
+
+    if subfolder.endswith(REF_SUFFIX):
+        return [], set(), {"new_tasks": 0, "skipped": 1}
+
+    path_results = os.path.join(validation_logs_path, subfolder, LOG_REPORT)
+    path_patch = os.path.join(validation_logs_path, subfolder, "patch.diff")
+
+    if not os.path.exists(path_results):
+        if verbose: print(f"[SKIP] {subfolder}: No results")
+        return [], set(), {"new_tasks": 0, "skipped": 1}
+        
+    if not os.path.exists(path_patch):
+        if verbose: print(f"[SKIP] {subfolder}: No patch.diff")
+        return [], set(), {"new_tasks": 0, "skipped": 1}
+
+    with open(path_results) as f:
+        results = json.load(f)
+    if PASS_TO_FAIL not in results or PASS_TO_PASS not in results:
+        if verbose: print(f"[SKIP] {subfolder}: No validatable bugs")
+        return [], set(), {"new_tasks": 0, "skipped": 1}
+
+    n_f2p = len(results[PASS_TO_FAIL])
+    n_p2p = len(results[PASS_TO_PASS])
+    pr_exception = (
+        ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0
+    )
+    if not pr_exception and (KEY_TIMED_OUT in results or n_f2p == 0 or n_p2p == 0):
+        if verbose: print(f"[SKIP] {subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}")
+        return [], set(), {"new_tasks": 0, "skipped": 1}
+
+    with open(path_patch) as f:
+        patch_content = f.read()
+    task_instance = {
+        KEY_INSTANCE_ID: subfolder,
+        KEY_PATCH: patch_content,
+        PASS_TO_FAIL: results[PASS_TO_FAIL],
+        PASS_TO_PASS: results[PASS_TO_PASS],
+    }
+    rp = registry.get_from_inst(task_instance)
+    task_instance[KEY_IMAGE_NAME] = rp.image_name
+    task_instance["repo"] = rp.mirror_name
+
+    # Unique clone path for this worker
+    repo_path = f"{rp.repo_name}_{pid}_{subfolder}"
+    
+    # Clone repository
+    try:
+        _, cloned = rp.clone(dest=repo_path)
         if cloned:
             created_repos.add(rp.repo_name)
+
         main_branch = (
             subprocess.run(
                 "git rev-parse --abbrev-ref HEAD",
-                cwd=rp.repo_name,
+                cwd=repo_path,
                 capture_output=True,
                 shell=True,
                 check=True,
@@ -244,18 +317,20 @@ def _main(
         )
 
         # Check if branch already created for this problem
+        # We pass the repo_path as cwd for the git operations inside the helper
+        
         branch_exists = check_if_branch_exists(
-            rp.repo_name, subfolder, main_branch, override_branch, verbose
+            repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args
         )
         if branch_exists:
             task_instances.append(task_instance)
-            stats = skip_print(
-                f"{subfolder}: Branch `{subfolder}` exists",
-                pbar,
-                stats,
-                verbose,
-            )
-            continue
+            if verbose: print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists")
+            stats["skipped"] += 1
+            # Cleanup
+            if cloned and os.path.exists(repo_path):
+                shutil.rmtree(repo_path)
+            return task_instances, created_repos, stats
+            
         elif verbose:
             print(f"[{subfolder}] Does not exist yet")
 
@@ -264,7 +339,7 @@ def _main(
         for git_apply in GIT_APPLY_CMDS:
             output = subprocess.run(
                 f"{git_apply} ../{path_patch}",
-                cwd=rp.repo_name,
+                cwd=repo_path,
                 capture_output=True,
                 shell=True,
             )
@@ -272,14 +347,21 @@ def _main(
                 applied = True
                 break
             else:
-                # Remove any artifacts
-                subprocess.run("git reset --hard", cwd=rp.repo_name, **SUBPROCESS_ARGS)
+                subprocess.run("git reset --hard", cwd=repo_path, **subprocess_args)
+        
         if not applied:
-            raise Exception(f"[{subfolder}] Failed to apply patch to {rp.repo_name}")
+            # We can't raise Exception here as it stops the worker? 
+            # Or we let it bubble up and fail the future?
+            # Better to catch and print/skip
+            print(f"[{subfolder}] Failed to apply patch to {rp.repo_name}")
+            if cloned and os.path.exists(repo_path):
+                shutil.rmtree(repo_path)
+            return [], set(), stats # Don't record this one
+            
         if verbose:
             print(f"[{subfolder}] Bug patch applied successfully")
 
-        # Create a branch, check it out, commit, push the branch, and cleanup
+        # Create branch etc
         cmds = [
             "git config user.email 'swesmith@swesmith.ai'",
             "git config user.name 'swesmith'",
@@ -291,20 +373,18 @@ def _main(
         for cmd in cmds:
             if debug_subprocess:
                 print(f"[{subfolder}] {cmd}")
-            subprocess.run(cmd, cwd=rp.repo_name, **SUBPROCESS_ARGS)
+            subprocess.run(cmd, cwd=repo_path, **subprocess_args)
 
-        # Create test patch by removing F2P test files
+        # F2P patch
         f2p_test_files, _ = rp.get_test_files(task_instance)
         if f2p_test_files:
-            # Remove the test files
             for test_file in f2p_test_files:
-                test_file_path = os.path.join(rp.repo_name, test_file)
+                test_file_path = os.path.join(repo_path, test_file)
                 if os.path.exists(test_file_path):
                     os.remove(test_file_path)
                     if verbose:
                         print(f"[{subfolder}] Removed F2P test file: {test_file}")
 
-            # Add and commit removal
             cmds = [
                 "git add .",
                 "git commit --no-gpg-sign -m 'Remove F2P Tests'",
@@ -312,12 +392,10 @@ def _main(
             for cmd in cmds:
                 if debug_subprocess:
                     print(f"[{subfolder}] {cmd}")
-                subprocess.run(cmd, cwd=rp.repo_name, **SUBPROCESS_ARGS)
+                subprocess.run(cmd, cwd=repo_path, **subprocess_args)
             if verbose:
                 print(f"[{subfolder}] Commit F2P test file(s) removal")
-        elif verbose:
-            print(f"[{subfolder}] No test files to remove")
-
+        
         cmds = [
             f"git push origin {subfolder}",
             f"git checkout {main_branch}",
@@ -327,7 +405,8 @@ def _main(
         for cmd in cmds:
             if debug_subprocess:
                 print(f"[{subfolder}] {cmd}")
-            subprocess.run(cmd, cwd=rp.repo_name, **SUBPROCESS_ARGS)
+            subprocess.run(cmd, cwd=repo_path, **subprocess_args)
+        
         if verbose:
             print(f"[{subfolder}] Bug @ branch `{subfolder}`")
 
@@ -335,27 +414,13 @@ def _main(
         if verbose:
             print(f"[{subfolder}] Created task instance")
         stats["new_tasks"] += 1
-        pbar.update()
-
-    pbar.close()
-    if len(created_repos) > 0:
-        print("Cleaning up...")
-        for repo in created_repos:
-            shutil.rmtree(repo)
-            print(f"[{repo}] Removed local clone")
-            if repush_image:
-                print(f"[{repo}] Rebuilding + pushing image")
-                registry.get(repo).push_image(rebuild_image=True)
-
-    if len(task_instances) > 0:
-        task_instances_path.parent.mkdir(parents=True, exist_ok=True)
-        with open(task_instances_path, "w") as f:
-            json.dump(task_instances, f, indent=4)
-        print(f"Wrote {len(task_instances)} instances to {task_instances_path}")
-    
-    print(f"- {stats['skipped']} skipped")
-    print(f"- {stats['new_tasks']} new instances")
-
+        
+    finally:
+        # Cleanup unique clone
+        if os.path.exists(repo_path):
+            shutil.rmtree(repo_path)
+            
+    return task_instances, created_repos, stats
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
diff --git a/swesmith/profiles/base.py b/swesmith/profiles/base.py
index af23febb..c5c9c83c 100644
--- a/swesmith/profiles/base.py
+++ b/swesmith/profiles/base.py
@@ -179,10 +179,13 @@ def _get_cached_test_paths(self) -> list[Path]:
         """Clone the repo, get all testing file paths relative to the repo directory, then clean up."""
         if self._cache_test_paths is None:
             with self._lock:  # Only one process enters this block at a time
-                dir_path, cloned = self.clone()
+                # Use unique temp dir to avoid race conditions in multiprocessing
+                import uuid
+                temp_dest = f"{self.repo_name}_{uuid.uuid4().hex[:8]}"
+                dir_path, cloned = self.clone(dest=temp_dest)
                 self._cache_test_paths = [
-                    Path(os.path.relpath(os.path.join(root, file), self.repo_name))
-                    for root, _, files in os.walk(Path(self.repo_name).resolve())
+                    Path(os.path.relpath(os.path.join(root, file), dir_path))
+                    for root, _, files in os.walk(Path(dir_path).resolve())
                     for file in files
                     if self._is_test_path(root, file)
                 ]

From cd3adc848b4fd0b4ab5c17429f108bb43a8e0d7c Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 14:28:20 -0800
Subject: [PATCH 06/32] Fix gather.py to skip empty commits

Previously, the script would fail if `git commit` was attempted with no changes. This was observed in cases like `Automattic__mongoose.5f57a5bb` where the applied patch resulted in no tracked changes. Now, we check `git status --porcelain` before committing and skip the instance if no changes are detected.
---
 swesmith/harness/gather.py | 24 ++++++++++++++++++++++++
 1 file changed, 24 insertions(+)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index c169515e..aca7664f 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -368,6 +368,30 @@ def process_instance(
             "git config commit.gpgsign false",
             f"git checkout -b {subfolder}",
             "git add .",
+        ]
+        for cmd in cmds:
+            if debug_subprocess:
+                print(f"[{subfolder}] {cmd}")
+            subprocess.run(cmd, cwd=repo_path, **subprocess_args)
+
+        # Check for changes
+        status_output = subprocess.run(
+            "git status --porcelain",
+            cwd=repo_path,
+            capture_output=True,
+            shell=True,
+            check=True,
+        ).stdout.decode().strip()
+
+        if not status_output:
+            if verbose:
+                print(f"[{subfolder}] No changes to commit, skipping")
+            stats["skipped"] += 1
+            if cloned and os.path.exists(repo_path):
+                shutil.rmtree(repo_path)
+            return task_instances, created_repos, stats
+
+        cmds = [
             "git commit --no-gpg-sign -m 'Bug Patch'",
         ]
         for cmd in cmds:

From 84f8587a428356b7fe6e2a71cf7448638354010e Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 14:41:37 -0800
Subject: [PATCH 07/32] Reset MODAL_TIMEOUT back down to 10 minutes now that
 gather is parallelized

---
 scripts/bug_gen_modal.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index f553c297..ad403238 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -124,7 +124,7 @@ def custom_exception_handler(loop, context):
 APP_NAME = "swesmith-bug-gen"
 VOLUME_NAME = "swesmith-bug-gen"
 MINUTES = 60
-MODAL_TIMEOUT = 20 * MINUTES
+MODAL_TIMEOUT = 10 * MINUTES
 SANDBOX_RATE_LIMIT = 4  # Modal limits to 5/s, use 4 to be safe
 
 LANGUAGE_TO_BASE_CLASS = {

From 67a8529784c55ae958da35ada2e986dd6220587b Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Fri, 16 Jan 2026 22:50:27 +0000
Subject: [PATCH 08/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/bug_gen_modal.py   | 89 +++++++++++++++++++++++---------------
 swesmith/harness/gather.py | 87 +++++++++++++++++++++----------------
 swesmith/profiles/base.py  |  5 +--
 3 files changed, 104 insertions(+), 77 deletions(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index ad403238..35d2142a 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -1681,20 +1681,31 @@ def resolve_repo_id():
     work_dir = Path("/root")
     logs_link_dir = work_dir / "logs"
     logs_link_dir.mkdir(exist_ok=True)
-    
+
     # Configure git authentication
     if "GITHUB_TOKEN" in os.environ:
         token = os.environ["GITHUB_TOKEN"]
         print(f"DEBUG: Found GITHUB_TOKEN (len={len(token)}). Configuring git auth...")
-        
+
         # Use simpler authenticated URL format for PATs
         subprocess.run(
-            ["git", "config", "--global", f"url.https://{token}@github.com/.insteadOf", "https://github.com/"],
-            check=True
+            [
+                "git",
+                "config",
+                "--global",
+                f"url.https://{token}@github.com/.insteadOf",
+                "https://github.com/",
+            ],
+            check=True,
         )
         # Also configure user info
-        subprocess.run(["git", "config", "--global", "user.email", "swesmith@swesmith.ai"], check=False)
-        subprocess.run(["git", "config", "--global", "user.name", "swesmith"], check=False)
+        subprocess.run(
+            ["git", "config", "--global", "user.email", "swesmith@swesmith.ai"],
+            check=False,
+        )
+        subprocess.run(
+            ["git", "config", "--global", "user.name", "swesmith"], check=False
+        )
     else:
         print("Warning: GITHUB_TOKEN not found in environment. Git push may fail.")
 
@@ -1711,66 +1722,71 @@ def resolve_repo_id():
         # Ensure sources exist on volume
         task_insts_source.mkdir(parents=True, exist_ok=True)
         if not validation_source.exists():
-             return {"repo": repo_name, "status": "skipped", "reason": "No validation logs"}
+            return {
+                "repo": repo_name,
+                "status": "skipped",
+                "reason": "No validation logs",
+            }
 
         # Create symlinks
         if not validation_link.exists():
             os.symlink(str(validation_source), str(validation_link))
-        
+
         if not task_insts_link.exists():
             os.symlink(str(task_insts_source), str(task_insts_link))
 
         # Check if there are actually validation logs for this repo
         repo_vals = validation_link / repo_id
         if not repo_vals.exists():
-            return {"repo": repo_name, "status": "skipped", "reason": "No logs for repo"}
-        
+            return {
+                "repo": repo_name,
+                "status": "skipped",
+                "reason": "No logs for repo",
+            }
+
         # Build command
         # python -m swesmith.harness.gather logs/run_validation/<repo_id>
         cmd = [
             sys.executable,
-            "-m", "swesmith.harness.gather",
+            "-m",
+            "swesmith.harness.gather",
             str(Path("logs/run_validation") / repo_id),
             "-v",
             "-d",
         ]
-        
+
         if repush_image:
             cmd.append("--repush_image")
         if override_branch:
             cmd.append("--override_branch")
-            
+
         print(f"Running: {' '.join(cmd)}")
-            
+
         # execution
         result = subprocess.run(
-            cmd,
-            cwd=str(work_dir),
-            capture_output=True,
-            text=True,
-            env=os.environ
+            cmd, cwd=str(work_dir), capture_output=True, text=True, env=os.environ
         )
-        
+
         if result.returncode != 0:
             print("Gather failed:")
             print(result.stdout)
             print(result.stderr)
             return {
-                "repo": repo_name, 
-                "status": "failed", 
-                "stdout": result.stdout, 
-                "stderr": result.stderr
+                "repo": repo_name,
+                "status": "failed",
+                "stdout": result.stdout,
+                "stderr": result.stderr,
             }
         else:
             print("Gather succeeded:")
             print(result.stdout)
             print(result.stderr)
-        
+
         return {
             "repo": repo_name,
             "status": "success",
             "stdout": result.stdout,
-            "stderr": result.stderr
+            "stderr": result.stderr,
         }
 
     except Exception as e:
@@ -1783,29 +1799,29 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None:
     print(f"\n{'#' * 60}")
     print(f"# PHASE 3: GATHER ({len(repos)} repos)")
     print(f"{'#' * 60}\n")
-    
-    # We can pass repush_image and override_branch via args if they existed, 
+
+    # We can pass repush_image and override_branch via args if they existed,
     # but for now we'll assume defaults or add them to args class if needed.
     repush = getattr(args, "repush_image", False)
     override = getattr(args, "override_branch", False)
-    
+
     completed = 0
     success = 0
-    
+
     print(f"Starting gather for {len(repos)} repos...")
-    
+
     async for result in gather_remote.map.aio(
         repos,
         kwargs={
             "language": language,
             "repush_image": repush,
             "override_branch": override,
-        }
+        },
     ):
         completed += 1
         repo = result.get("repo", "unknown")
         status = result.get("status", "unknown")
-        
+
         if status == "success":
             success += 1
             print(f"  [{completed}/{len(repos)}] {repo}: Success")
@@ -1815,11 +1831,13 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None:
                 for line in lines[-5:]:
                     print(f"    | {line}")
         elif status == "skipped":
-            print(f"  [{completed}/{len(repos)}] {repo}: Skipped ({result.get('reason')})")
+            print(
+                f"  [{completed}/{len(repos)}] {repo}: Skipped ({result.get('reason')})"
+            )
         else:
             err = result.get("error") or "Non-zero exit code"
             print(f"  [{completed}/{len(repos)}] {repo}: Failed - {err}")
-            
+
     print(f"\nGather complete: {success}/{len(repos)} repos processed successfully.\n")
 
 
@@ -2236,4 +2254,3 @@ class Args:
         return
 
     await run_gather_phase_async(target_repos, language, args)
-
diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index aca7664f..f8d3d83b 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -179,7 +179,7 @@ def _main(
     subfolders_to_process = [x for x in subfolders if x not in completed_ids]
 
     print(f"Will process {len(subfolders_to_process)} instances")
-    
+
     # Determine number of workers
     n_workers = int(os.environ.get("MAX_WORKERS", os.cpu_count() or 1))
     print(f"Using {n_workers} workers")
@@ -194,11 +194,13 @@ def _main(
             verbose=verbose,
         )
 
-        results = list(tqdm(
-            executor.map(func, sorted(subfolders_to_process)),
-            total=len(subfolders_to_process),
-            desc="Conversion"
-        ))
+        results = list(
+            tqdm(
+                executor.map(func, sorted(subfolders_to_process)),
+                total=len(subfolders_to_process),
+                desc="Conversion",
+            )
+        )
 
     # Aggregate results
     stats = {"new_tasks": 0, "skipped": 0}
@@ -212,15 +214,15 @@ def _main(
         if repush_image:
             print("Rebuilding + pushing images...")
             for repo in created_repos:
-                 print(f"[{repo}] Rebuilding + pushing image")
-                 registry.get(repo).push_image(rebuild_image=True)
+                print(f"[{repo}] Rebuilding + pushing image")
+                registry.get(repo).push_image(rebuild_image=True)
 
     if len(task_instances) > 0:
         task_instances_path.parent.mkdir(parents=True, exist_ok=True)
         with open(task_instances_path, "w") as f:
             json.dump(task_instances, f, indent=4)
         print(f"Wrote {len(task_instances)} instances to {task_instances_path}")
-    
+
     print(f"- {stats['skipped']} skipped")
     print(f"- {stats['new_tasks']} new instances")
 
@@ -242,12 +244,13 @@ def process_instance(
     stats = {"new_tasks": 0, "skipped": 0}
     task_instances = []
     created_repos = set()
-    
+
     # Use a unique temporary directory for this process/task to avoid collision
     # We append process ID or random string to repo path
     import multiprocessing
+
     pid = multiprocessing.current_process().pid
-    
+
     # Define subprocess args locally to avoid global state issues with multiprocessing
     subprocess_args = SUBPROCESS_ARGS.copy()
     if not debug_subprocess:
@@ -261,26 +264,28 @@ def process_instance(
     path_patch = os.path.join(validation_logs_path, subfolder, "patch.diff")
 
     if not os.path.exists(path_results):
-        if verbose: print(f"[SKIP] {subfolder}: No results")
+        if verbose:
+            print(f"[SKIP] {subfolder}: No results")
         return [], set(), {"new_tasks": 0, "skipped": 1}
-        
+
     if not os.path.exists(path_patch):
-        if verbose: print(f"[SKIP] {subfolder}: No patch.diff")
+        if verbose:
+            print(f"[SKIP] {subfolder}: No patch.diff")
         return [], set(), {"new_tasks": 0, "skipped": 1}
 
     with open(path_results) as f:
         results = json.load(f)
     if PASS_TO_FAIL not in results or PASS_TO_PASS not in results:
-        if verbose: print(f"[SKIP] {subfolder}: No validatable bugs")
+        if verbose:
+            print(f"[SKIP] {subfolder}: No validatable bugs")
         return [], set(), {"new_tasks": 0, "skipped": 1}
 
     n_f2p = len(results[PASS_TO_FAIL])
     n_p2p = len(results[PASS_TO_PASS])
-    pr_exception = (
-        ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0
-    )
+    pr_exception = ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0
     if not pr_exception and (KEY_TIMED_OUT in results or n_f2p == 0 or n_p2p == 0):
-        if verbose: print(f"[SKIP] {subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}")
+        if verbose:
+            print(f"[SKIP] {subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}")
         return [], set(), {"new_tasks": 0, "skipped": 1}
 
     with open(path_patch) as f:
@@ -297,7 +302,7 @@ def process_instance(
 
     # Unique clone path for this worker
     repo_path = f"{rp.repo_name}_{pid}_{subfolder}"
-    
+
     # Clone repository
     try:
         _, cloned = rp.clone(dest=repo_path)
@@ -318,19 +323,20 @@ def process_instance(
 
         # Check if branch already created for this problem
         # We pass the repo_path as cwd for the git operations inside the helper
-        
+
         branch_exists = check_if_branch_exists(
             repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args
         )
         if branch_exists:
             task_instances.append(task_instance)
-            if verbose: print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists")
+            if verbose:
+                print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists")
             stats["skipped"] += 1
             # Cleanup
             if cloned and os.path.exists(repo_path):
                 shutil.rmtree(repo_path)
             return task_instances, created_repos, stats
-            
+
         elif verbose:
             print(f"[{subfolder}] Does not exist yet")
 
@@ -348,16 +354,16 @@ def process_instance(
                 break
             else:
                 subprocess.run("git reset --hard", cwd=repo_path, **subprocess_args)
-        
+
         if not applied:
-            # We can't raise Exception here as it stops the worker? 
+            # We can't raise Exception here as it stops the worker?
             # Or we let it bubble up and fail the future?
             # Better to catch and print/skip
             print(f"[{subfolder}] Failed to apply patch to {rp.repo_name}")
             if cloned and os.path.exists(repo_path):
                 shutil.rmtree(repo_path)
-            return [], set(), stats # Don't record this one
-            
+            return [], set(), stats  # Don't record this one
+
         if verbose:
             print(f"[{subfolder}] Bug patch applied successfully")
 
@@ -375,13 +381,17 @@ def process_instance(
             subprocess.run(cmd, cwd=repo_path, **subprocess_args)
 
         # Check for changes
-        status_output = subprocess.run(
-            "git status --porcelain",
-            cwd=repo_path,
-            capture_output=True,
-            shell=True,
-            check=True,
-        ).stdout.decode().strip()
+        status_output = (
+            subprocess.run(
+                "git status --porcelain",
+                cwd=repo_path,
+                capture_output=True,
+                shell=True,
+                check=True,
+            )
+            .stdout.decode()
+            .strip()
+        )
 
         if not status_output:
             if verbose:
@@ -419,7 +429,7 @@ def process_instance(
                 subprocess.run(cmd, cwd=repo_path, **subprocess_args)
             if verbose:
                 print(f"[{subfolder}] Commit F2P test file(s) removal")
-        
+
         cmds = [
             f"git push origin {subfolder}",
             f"git checkout {main_branch}",
@@ -430,7 +440,7 @@ def process_instance(
             if debug_subprocess:
                 print(f"[{subfolder}] {cmd}")
             subprocess.run(cmd, cwd=repo_path, **subprocess_args)
-        
+
         if verbose:
             print(f"[{subfolder}] Bug @ branch `{subfolder}`")
 
@@ -438,14 +448,15 @@ def process_instance(
         if verbose:
             print(f"[{subfolder}] Created task instance")
         stats["new_tasks"] += 1
-        
+
     finally:
         # Cleanup unique clone
         if os.path.exists(repo_path):
             shutil.rmtree(repo_path)
-            
+
     return task_instances, created_repos, stats
 
+
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
         description="Convert validation logs to SWE-bench style dataset"
diff --git a/swesmith/profiles/base.py b/swesmith/profiles/base.py
index c5c9c83c..4d68d105 100644
--- a/swesmith/profiles/base.py
+++ b/swesmith/profiles/base.py
@@ -181,6 +181,7 @@ def _get_cached_test_paths(self) -> list[Path]:
             with self._lock:  # Only one process enters this block at a time
                 # Use unique temp dir to avoid race conditions in multiprocessing
                 import uuid
+
                 temp_dest = f"{self.repo_name}_{uuid.uuid4().hex[:8]}"
                 dir_path, cloned = self.clone(dest=temp_dest)
                 self._cache_test_paths = [
@@ -292,9 +293,7 @@ def clone(self, dest: str | None = None) -> tuple[str, bool]:
         if not os.path.exists(dest):
             token = os.getenv("GITHUB_TOKEN")
             if token:
-                base_url = (
-                    f"https://{token}@github.com/{self.mirror_name}.git"
-                )
+                base_url = f"https://{token}@github.com/{self.mirror_name}.git"
             else:
                 base_url = f"git@github.com:{self.mirror_name}.git"
 

From 842650aa46be21c4c0d749a3eb58faab6fbec997 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 19:46:09 -0800
Subject: [PATCH 09/32] Replace slow and stateful git checkout and branch -D
 with a single stateless git ls-remote

---
 swesmith/harness/gather.py | 40 ++++++++++++++++++++++++--------------
 1 file changed, 25 insertions(+), 15 deletions(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index f8d3d83b..7ac1fb90 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -102,23 +102,33 @@ def check_if_branch_exists(
     verbose: bool,
     subprocess_args: dict,
 ):
-    branch_exists = None
+    branch_exists = False
     try:
-        subprocess.run(f"git checkout {subfolder}", cwd=repo_name, **subprocess_args)
-        if override_branch:
-            # Delete the branch remotely
-            subprocess.run(
-                f"git push --delete origin {subfolder}",
-                cwd=repo_name,
-                **subprocess_args,
-            )
-            if verbose:
-                print(f"[{subfolder}] Overriding existing branch")
-            branch_exists = False
-        else:
+        # Check remote for branch existence directly
+        # This is more robust than checkout/fetch for cached repos
+        result = subprocess.run(
+            f"git ls-remote --heads origin {subfolder}",
+            cwd=repo_name,
+            capture_output=True,
+            shell=True,
+            text=True
+        )
+        
+        # If there is output, the branch exists on remote
+        if result.returncode == 0 and subfolder in result.stdout:
             branch_exists = True
-        subprocess.run(f"git checkout {main_branch}", cwd=repo_name, **subprocess_args)
-        subprocess.run(f"git branch -D {subfolder}", cwd=repo_name, **subprocess_args)
+            if override_branch:
+                # Delete the branch remotely
+                subprocess.run(
+                    f"git push --delete origin {subfolder}",
+                    cwd=repo_name,
+                    **subprocess_args,
+                )
+                if verbose:
+                    print(f"[{subfolder}] Overriding existing branch")
+                branch_exists = False
+        
+        
     except Exception:
         branch_exists = False
         pass

From af01d5cf4c66e4d3c533665520286fa8ebc07f99 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 19:46:38 -0800
Subject: [PATCH 10/32] Cache repo locally to avoid rate limits and speed up
 cloning

---
 swesmith/harness/gather.py | 98 +++++++++++++++++++++++++++++++-------
 1 file changed, 81 insertions(+), 17 deletions(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 7ac1fb90..518f6021 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -194,23 +194,60 @@ def _main(
     n_workers = int(os.environ.get("MAX_WORKERS", os.cpu_count() or 1))
     print(f"Using {n_workers} workers")
 
-    with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
-        # Create a partial function with fixed arguments
-        func = functools.partial(
-            process_instance,
-            validation_logs_path=validation_logs_path,
-            override_branch=override_branch,
-            debug_subprocess=debug_subprocess,
-            verbose=verbose,
-        )
+    # Optimization: Cache repo locally to avoid rate limits and speed up cloning
+    import tempfile
+
+    with tempfile.TemporaryDirectory() as cache_root:
+        # cache_root exists, so rp.clone(dest=cache_root) would skip cloning.
+        # We must clone into a subdirectory which doesn't exist yet.
+        cache_dir = os.path.join(cache_root, "repo")
+        print(f"Pre-cloning repository to cache: {cache_dir}...")
+        
+        rp_cache = None
+        # Try resolving profile from run_id (directory name) first
+        try:
+            rp_cache = registry.get(run_id)
+        except Exception:
+            pass
+
+        if not rp_cache:
+            sample_id = next((s for s in subfolders if "." in s), None)
+            if sample_id:
+                try:
+                    rp_cache = registry.get_from_inst({KEY_INSTANCE_ID: sample_id})
+                except Exception as e:
+                    print(f"Warning: Could not resolve profile from {sample_id}: {e}")
+        
+        path_to_cache = None
+        if rp_cache:
+            try:
+                print(f"Cloning {rp_cache.repo_name} to cache...")
+                rp_cache.clone(dest=cache_dir)
+                path_to_cache = cache_dir
+                print("Pre-clone successful.")
+            except Exception as e:
+                print(f"Pre-clone failed: {e}. Will fall back to per-instance cloning.")
+        else:
+            print("Could not resolve profile for pre-cloning. Will iterate per instance.")
+
+        with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
+            # Create a partial function with fixed arguments
+            func = functools.partial(
+                process_instance,
+                validation_logs_path=validation_logs_path,
+                override_branch=override_branch,
+                debug_subprocess=debug_subprocess,
+                verbose=verbose,
+                cache_dir=path_to_cache,
+            )
 
-        results = list(
-            tqdm(
-                executor.map(func, sorted(subfolders_to_process)),
-                total=len(subfolders_to_process),
-                desc="Conversion",
+            results = list(
+                tqdm(
+                    executor.map(func, sorted(subfolders_to_process)),
+                    total=len(subfolders_to_process),
+                    desc="Conversion",
+                )
             )
-        )
 
     # Aggregate results
     stats = {"new_tasks": 0, "skipped": 0}
@@ -243,6 +280,7 @@ def process_instance(
     override_branch: bool,
     debug_subprocess: bool,
     verbose: bool,
+    cache_dir: str | None = None,
 ) -> tuple[list[dict], set[str], dict]:
     """
     Process a single task instance.
@@ -315,10 +353,36 @@ def process_instance(
 
     # Clone repository
     try:
-        _, cloned = rp.clone(dest=repo_path)
-        if cloned:
+        if cache_dir and os.path.exists(cache_dir):
+            if verbose:
+                print(f"[{subfolder}] Cloning from cache {cache_dir}...")
+            
+            subprocess.run(
+                f"git clone {cache_dir} {repo_path}",
+                check=True,
+                shell=True,
+                stdout=subprocess.DEVNULL if not debug_subprocess else None,
+                stderr=subprocess.DEVNULL if not debug_subprocess else None,
+            )
+            cloned = True
             created_repos.add(rp.repo_name)
 
+            # Fix origin remote to point to actual GitHub repo so push works
+            remote_url = f"https://github.com/{rp.mirror_name}.git"
+            
+            subprocess.run(
+                f"git remote set-url origin {remote_url}",
+                cwd=repo_path,
+                check=True,
+                shell=True,
+                stdout=subprocess.DEVNULL if not debug_subprocess else None,
+                stderr=subprocess.DEVNULL if not debug_subprocess else None,
+            )
+        else:
+            _, cloned = rp.clone(dest=repo_path)
+            if cloned:
+                created_repos.add(rp.repo_name)
+
         main_branch = (
             subprocess.run(
                 "git rev-parse --abbrev-ref HEAD",

From 1d6bcd448ab2615052e3b57cbb59833739052c8b Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 23:09:58 -0800
Subject: [PATCH 11/32] Optimize gather with persistent worker repos (~5min
 total)

- Switch from per-task clones to per-worker persistent repositories.
- Reduces clone operations from O(tasks) to O(workers) (e.g. 1400 -> 17).
- Eliminates file locking race conditions.
- Total gather time for JavaScript is now ~5 minutes (bottlenecked by math.js).
---
 swesmith/harness/gather.py | 148 ++++++++++++++++++++++++-------------
 1 file changed, 96 insertions(+), 52 deletions(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 518f6021..119d758d 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -348,56 +348,103 @@ def process_instance(
     task_instance[KEY_IMAGE_NAME] = rp.image_name
     task_instance["repo"] = rp.mirror_name
 
-    # Unique clone path for this worker
-    repo_path = f"{rp.repo_name}_{pid}_{subfolder}"
+    # Persistent worker path - reused across tasks for this process
+    # We place it in the same temporary directory as the cache to ensure automatic cleanup.
+    if cache_dir:
+        # cache_dir is .../temp/repo, so dirname is .../temp
+        repo_path = os.path.join(os.path.dirname(cache_dir), f"{rp.repo_name}_worker_{pid}")
+    else:
+        # Fallback if no cache used (e.g. debugging), though likely not cleaned up automatically
+        repo_path = os.path.abspath(f"{rp.repo_name}_worker_{pid}")
+
+    # Helper to reset repo state
+    def reset_repo(path):
+        subprocess.run(
+            "git reset --hard", cwd=path, **subprocess_args
+        )
+        subprocess.run(
+            "git clean -fdx", cwd=path, **subprocess_args
+        )
+        # remove potential lock files if previous run crashed hard
+        lock_file = os.path.join(path, ".git", "index.lock")
+        if os.path.exists(lock_file):
+            try:
+                os.remove(lock_file)
+            except OSError:
+                pass
 
-    # Clone repository
+    cloned = False
     try:
-        if cache_dir and os.path.exists(cache_dir):
+        if os.path.exists(repo_path):
+            # Reuse existing repo for this worker
             if verbose:
-                print(f"[{subfolder}] Cloning from cache {cache_dir}...")
+                print(f"[{subfolder}] Reusing worker repo {repo_path}")
+            reset_repo(repo_path)
             
-            subprocess.run(
-                f"git clone {cache_dir} {repo_path}",
-                check=True,
-                shell=True,
-                stdout=subprocess.DEVNULL if not debug_subprocess else None,
-                stderr=subprocess.DEVNULL if not debug_subprocess else None,
+            # We need to know main branch name. We can get it from local repo now.
+            # Assuming main branch hasn't changed name/ref significantly.
+            # We avoid 'git pull' to save rate limits and time. 
+            main_branch = (
+                subprocess.run(
+                    "git rev-parse --abbrev-ref HEAD",
+                    cwd=repo_path,
+                    capture_output=True,
+                    shell=True,
+                    check=True,
+                )
+                .stdout.decode()
+                .strip()
             )
-            cloned = True
-            created_repos.add(rp.repo_name)
+            # Ensure we are on main branch
+            subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args)
 
-            # Fix origin remote to point to actual GitHub repo so push works
-            remote_url = f"https://github.com/{rp.mirror_name}.git"
-            
-            subprocess.run(
-                f"git remote set-url origin {remote_url}",
-                cwd=repo_path,
-                check=True,
-                shell=True,
-                stdout=subprocess.DEVNULL if not debug_subprocess else None,
-                stderr=subprocess.DEVNULL if not debug_subprocess else None,
-            )
         else:
-            _, cloned = rp.clone(dest=repo_path)
-            if cloned:
+            # First time setup for this worker
+            if cache_dir and os.path.exists(cache_dir):
+                if verbose:
+                    print(f"[{subfolder}] First-time clone from cache {cache_dir}...")
+                
+                subprocess.run(
+                    f"git clone {cache_dir} {repo_path}",
+                    check=True,
+                    shell=True,
+                    stdout=subprocess.DEVNULL if not debug_subprocess else None,
+                    stderr=subprocess.DEVNULL if not debug_subprocess else None,
+                )
+                cloned = True
                 created_repos.add(rp.repo_name)
 
-        main_branch = (
-            subprocess.run(
-                "git rev-parse --abbrev-ref HEAD",
-                cwd=repo_path,
-                capture_output=True,
-                shell=True,
-                check=True,
+                # Fix origin remote
+                remote_url = f"https://github.com/{rp.mirror_name}.git"
+                subprocess.run(
+                    f"git remote set-url origin {remote_url}",
+                    cwd=repo_path,
+                    check=True,
+                    shell=True,
+                    stdout=subprocess.DEVNULL if not debug_subprocess else None,
+                    stderr=subprocess.DEVNULL if not debug_subprocess else None,
+                )
+            else:
+                _, cloned = rp.clone(dest=repo_path)
+                created_repos.add(rp.repo_name)
+
+            main_branch = (
+                subprocess.run(
+                    "git rev-parse --abbrev-ref HEAD",
+                    cwd=repo_path,
+                    capture_output=True,
+                    shell=True,
+                    check=True,
+                )
+                .stdout.decode()
+                .strip()
             )
-            .stdout.decode()
-            .strip()
-        )
 
-        # Check if branch already created for this problem
-        # We pass the repo_path as cwd for the git operations inside the helper
+        # Ensure we are clean on main branch before starting
+        subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args)
+
 
+        # Check if branch already created for this problem
         branch_exists = check_if_branch_exists(
             repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args
         )
@@ -406,9 +453,8 @@ def process_instance(
             if verbose:
                 print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists")
             stats["skipped"] += 1
-            # Cleanup
-            if cloned and os.path.exists(repo_path):
-                shutil.rmtree(repo_path)
+            # Do NOT remove repo, just return. 
+            # We might want to checkout main to be polite to next run but reset_repo handles it.
             return task_instances, created_repos, stats
 
         elif verbose:
@@ -430,12 +476,9 @@ def process_instance(
                 subprocess.run("git reset --hard", cwd=repo_path, **subprocess_args)
 
         if not applied:
-            # We can't raise Exception here as it stops the worker?
-            # Or we let it bubble up and fail the future?
-            # Better to catch and print/skip
             print(f"[{subfolder}] Failed to apply patch to {rp.repo_name}")
-            if cloned and os.path.exists(repo_path):
-                shutil.rmtree(repo_path)
+            # Reset for next usage
+            reset_repo(repo_path)
             return [], set(), stats  # Don't record this one
 
         if verbose:
@@ -471,8 +514,10 @@ def process_instance(
             if verbose:
                 print(f"[{subfolder}] No changes to commit, skipping")
             stats["skipped"] += 1
-            if cloned and os.path.exists(repo_path):
-                shutil.rmtree(repo_path)
+            # Reset logic happens at start of next or via finally... 
+            # actually better to cleanup branch now
+            subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args)
+            subprocess.run(f"git branch -D {subfolder}", cwd=repo_path, **subprocess_args)
             return task_instances, created_repos, stats
 
         cmds = [
@@ -524,10 +569,9 @@ def process_instance(
         stats["new_tasks"] += 1
 
     finally:
-        # Cleanup unique clone
-        if os.path.exists(repo_path):
-            shutil.rmtree(repo_path)
-
+        # DO NOT remove repo_path. We persist it for this worker logic.
+        pass
+    
     return task_instances, created_repos, stats
 
 

From e42a5e2a539ea9a9116265afb82668cd40a70e73 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 23:35:06 -0800
Subject: [PATCH 12/32] Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench
 naming convention

---
 swesmith/harness/gather.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 119d758d..2eb0a12b 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -38,6 +38,7 @@
 from swebench.harness.constants import (
     PASS_TO_FAIL,
     PASS_TO_PASS,
+    FAIL_TO_PASS,
     KEY_INSTANCE_ID,
     LOG_REPORT,
 )
@@ -341,7 +342,7 @@ def process_instance(
     task_instance = {
         KEY_INSTANCE_ID: subfolder,
         KEY_PATCH: patch_content,
-        PASS_TO_FAIL: results[PASS_TO_FAIL],
+        FAIL_TO_PASS: results[PASS_TO_FAIL], # Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention
         PASS_TO_PASS: results[PASS_TO_PASS],
     }
     rp = registry.get_from_inst(task_instance)

From 302b73e61b46d0580c640ff6f434191388c2e348 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 16 Jan 2026 23:38:56 -0800
Subject: [PATCH 13/32] Remove unused shutil import in gather.py

---
 swesmith/harness/gather.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 2eb0a12b..170cddba 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -29,7 +29,6 @@
 import argparse
 import json
 import os
-import shutil
 import subprocess
 import concurrent.futures
 import functools

From 393aba43b5f699b81c0f43f46a798ebd8deb3d94 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sat, 17 Jan 2026 07:39:12 +0000
Subject: [PATCH 14/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 swesmith/harness/gather.py | 56 +++++++++++++++++++++-----------------
 1 file changed, 31 insertions(+), 25 deletions(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 170cddba..55243628 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -111,9 +111,9 @@ def check_if_branch_exists(
             cwd=repo_name,
             capture_output=True,
             shell=True,
-            text=True
+            text=True,
         )
-        
+
         # If there is output, the branch exists on remote
         if result.returncode == 0 and subfolder in result.stdout:
             branch_exists = True
@@ -127,8 +127,7 @@ def check_if_branch_exists(
                 if verbose:
                     print(f"[{subfolder}] Overriding existing branch")
                 branch_exists = False
-        
-        
+
     except Exception:
         branch_exists = False
         pass
@@ -202,7 +201,7 @@ def _main(
         # We must clone into a subdirectory which doesn't exist yet.
         cache_dir = os.path.join(cache_root, "repo")
         print(f"Pre-cloning repository to cache: {cache_dir}...")
-        
+
         rp_cache = None
         # Try resolving profile from run_id (directory name) first
         try:
@@ -217,7 +216,7 @@ def _main(
                     rp_cache = registry.get_from_inst({KEY_INSTANCE_ID: sample_id})
                 except Exception as e:
                     print(f"Warning: Could not resolve profile from {sample_id}: {e}")
-        
+
         path_to_cache = None
         if rp_cache:
             try:
@@ -228,7 +227,9 @@ def _main(
             except Exception as e:
                 print(f"Pre-clone failed: {e}. Will fall back to per-instance cloning.")
         else:
-            print("Could not resolve profile for pre-cloning. Will iterate per instance.")
+            print(
+                "Could not resolve profile for pre-cloning. Will iterate per instance."
+            )
 
         with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor:
             # Create a partial function with fixed arguments
@@ -341,7 +342,9 @@ def process_instance(
     task_instance = {
         KEY_INSTANCE_ID: subfolder,
         KEY_PATCH: patch_content,
-        FAIL_TO_PASS: results[PASS_TO_FAIL], # Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention
+        FAIL_TO_PASS: results[
+            PASS_TO_FAIL
+        ],  # Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention
         PASS_TO_PASS: results[PASS_TO_PASS],
     }
     rp = registry.get_from_inst(task_instance)
@@ -352,19 +355,17 @@ def process_instance(
     # We place it in the same temporary directory as the cache to ensure automatic cleanup.
     if cache_dir:
         # cache_dir is .../temp/repo, so dirname is .../temp
-        repo_path = os.path.join(os.path.dirname(cache_dir), f"{rp.repo_name}_worker_{pid}")
+        repo_path = os.path.join(
+            os.path.dirname(cache_dir), f"{rp.repo_name}_worker_{pid}"
+        )
     else:
         # Fallback if no cache used (e.g. debugging), though likely not cleaned up automatically
         repo_path = os.path.abspath(f"{rp.repo_name}_worker_{pid}")
 
     # Helper to reset repo state
     def reset_repo(path):
-        subprocess.run(
-            "git reset --hard", cwd=path, **subprocess_args
-        )
-        subprocess.run(
-            "git clean -fdx", cwd=path, **subprocess_args
-        )
+        subprocess.run("git reset --hard", cwd=path, **subprocess_args)
+        subprocess.run("git clean -fdx", cwd=path, **subprocess_args)
         # remove potential lock files if previous run crashed hard
         lock_file = os.path.join(path, ".git", "index.lock")
         if os.path.exists(lock_file):
@@ -380,10 +381,10 @@ def reset_repo(path):
             if verbose:
                 print(f"[{subfolder}] Reusing worker repo {repo_path}")
             reset_repo(repo_path)
-            
+
             # We need to know main branch name. We can get it from local repo now.
             # Assuming main branch hasn't changed name/ref significantly.
-            # We avoid 'git pull' to save rate limits and time. 
+            # We avoid 'git pull' to save rate limits and time.
             main_branch = (
                 subprocess.run(
                     "git rev-parse --abbrev-ref HEAD",
@@ -396,14 +397,16 @@ def reset_repo(path):
                 .strip()
             )
             # Ensure we are on main branch
-            subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args)
+            subprocess.run(
+                f"git checkout {main_branch}", cwd=repo_path, **subprocess_args
+            )
 
         else:
             # First time setup for this worker
             if cache_dir and os.path.exists(cache_dir):
                 if verbose:
                     print(f"[{subfolder}] First-time clone from cache {cache_dir}...")
-                
+
                 subprocess.run(
                     f"git clone {cache_dir} {repo_path}",
                     check=True,
@@ -443,7 +446,6 @@ def reset_repo(path):
         # Ensure we are clean on main branch before starting
         subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args)
 
-
         # Check if branch already created for this problem
         branch_exists = check_if_branch_exists(
             repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args
@@ -453,7 +455,7 @@ def reset_repo(path):
             if verbose:
                 print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists")
             stats["skipped"] += 1
-            # Do NOT remove repo, just return. 
+            # Do NOT remove repo, just return.
             # We might want to checkout main to be polite to next run but reset_repo handles it.
             return task_instances, created_repos, stats
 
@@ -514,10 +516,14 @@ def reset_repo(path):
             if verbose:
                 print(f"[{subfolder}] No changes to commit, skipping")
             stats["skipped"] += 1
-            # Reset logic happens at start of next or via finally... 
+            # Reset logic happens at start of next or via finally...
             # actually better to cleanup branch now
-            subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args)
-            subprocess.run(f"git branch -D {subfolder}", cwd=repo_path, **subprocess_args)
+            subprocess.run(
+                f"git checkout {main_branch}", cwd=repo_path, **subprocess_args
+            )
+            subprocess.run(
+                f"git branch -D {subfolder}", cwd=repo_path, **subprocess_args
+            )
             return task_instances, created_repos, stats
 
         cmds = [
@@ -571,7 +577,7 @@ def reset_repo(path):
     finally:
         # DO NOT remove repo_path. We persist it for this worker logic.
         pass
-    
+
     return task_instances, created_repos, stats
 
 

From 1f0601ed79c3d8470e60779c06b223b181584959 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 19 Jan 2026 20:33:53 -0800
Subject: [PATCH 15/32] First draft for issue gen

---
 configs/issue_gen/ig_v2.yaml   |   2 +-
 scripts/bug_gen_modal.py       | 220 ++++++++++++++++++++++++++++++++-
 swesmith/issue_gen/generate.py |   1 +
 3 files changed, 216 insertions(+), 7 deletions(-)

diff --git a/configs/issue_gen/ig_v2.yaml b/configs/issue_gen/ig_v2.yaml
index 8bc9e1a0..133d51d8 100644
--- a/configs/issue_gen/ig_v2.yaml
+++ b/configs/issue_gen/ig_v2.yaml
@@ -1,4 +1,4 @@
-model: anthropic/claude-sonnet-4-20250514
+model: anthropic/claude-haiku-4-5-20251001
 system: |-
   You are a software engineer helping to create a realistic dataset of synthetic GitHub issues.
   
diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 35d2142a..4707036c 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -295,8 +295,10 @@ def resolve_profile(repo_name: str):
     modal.Image.from_registry("ubuntu:22.04", add_python="3.11")
     .apt_install("git")
     .pip_install_from_pyproject("pyproject.toml", optional_dependencies=["generate"])
+    .pip_install("jinja2", "litellm", "datasets", "pyyaml")
     .env({"PYTHONPATH": "/root"})
     .add_local_dir("swesmith", remote_path="/root/swesmith")
+    .add_local_dir("configs", remote_path="/root/configs")
     .add_local_file(".env", remote_path="/root/.env")
 )
 
@@ -1841,6 +1843,197 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None:
     print(f"\nGather complete: {success}/{len(repos)} repos processed successfully.\n")
 
 
+# ============================================================================
+# Issue Generation
+# ============================================================================
+
+
+@app.function(
+    image=generator_image,
+    volumes={LOGS_MOUNT_PATH: logs_volume},
+    timeout=3600,
+    secrets=[
+        modal.Secret.from_name("ANTHROPIC_API_KEY"),
+        modal.Secret.from_name("GITHUB_TOKEN"),
+    ],
+)
+def issue_gen_remote(
+    repo: str,
+    language: str,
+    config: str,
+    workers: int,
+) -> dict:
+    """Generate issue descriptions for a single repo's task instances.
+
+    Calls the existing swesmith.issue_gen.generate.IssueGen class to generate
+    issue descriptions for all task instances in a repo. Uses symlinks to redirect
+    local paths to Modal volume paths.
+
+    Args:
+        repo: Repository name (e.g., "astropy__astropy")
+        language: Programming language filter
+        config: Path to config file
+        workers: Number of workers per repo
+    """
+    import os
+    import sys
+    from pathlib import Path
+
+    # Set up paths
+    volume_root = Path(LOGS_MOUNT_PATH) / language
+    task_insts_dir = volume_root / "task_insts"
+    
+    # Resolve task instances file (it may have a hash suffix like repo__name.abcdef.json)
+    task_insts_file = None
+    repo_sanitized = repo.replace("/", "__")
+    
+    if task_insts_dir.exists():
+        for filename in os.listdir(task_insts_dir):
+            # Check for exact match or match with suffix
+            if filename == f"{repo_sanitized}.json" or (
+                filename.startswith(f"{repo_sanitized}.") and filename.endswith(".json")
+            ):
+                task_insts_file = task_insts_dir / filename
+                break
+
+    # Check if task instances file exists
+    if not task_insts_file or not task_insts_file.exists():
+        return {
+            "success": True,  # Not an error, just nothing to do
+            "repo": repo,
+            "instances_processed": 0,
+            "status": "skipped",
+            "reason": "No task insts file",
+        }
+
+    # Create symlinks to redirect local paths to volume paths
+    # This allows IssueGen to work with its expected local paths
+    local_logs = Path("/root/logs")
+    local_logs.mkdir(parents=True, exist_ok=True)
+
+    # Symlink the entire logs directory structure
+    for subdir in ["task_insts", "run_validation", "issue_gen"]:
+        local_subdir = local_logs / subdir
+        volume_subdir = volume_root / subdir
+
+        # Create the volume directory if it doesn't exist
+        volume_subdir.mkdir(parents=True, exist_ok=True)
+
+        # Create symlink (thread-safe with FileExistsError handling)
+        try:
+            if local_subdir.exists():
+                local_subdir.unlink()
+            local_subdir.symlink_to(volume_subdir)
+        except FileExistsError:
+            # Another concurrent task already created the symlink
+            pass
+
+    try:
+        # Import IssueGen after symlinks are set up
+        from swesmith.issue_gen.generate import IssueGen
+
+        # Verify config file exists
+        config_path = Path(config)
+        if not config_path.exists():
+            return {
+                "success": False,
+                "repo": repo,
+                "error": f"Config file not found: {config}",
+            }
+
+        # Set up IssueGen instance
+        issue_gen = IssueGen(
+            dataset_path=str(task_insts_file),
+            config_file=Path(config),
+            workers=workers,
+            redo_existing=False,
+        )
+
+        # Run issue generation
+        # This processes all instances in the repo using ThreadPoolExecutor
+        issue_gen.run()
+
+        # Count how many instances were processed
+        issue_gen_file = volume_root / "issue_gen" / f"{repo}.json"
+        instances_processed = 0
+        if issue_gen_file.exists():
+            import json
+            with open(issue_gen_file) as f:
+                data = json.load(f)
+                instances_processed = len(data)
+
+        return {
+            "success": True,
+            "repo": repo,
+            "instances_processed": instances_processed,
+        }
+
+    except Exception as e:
+        import traceback
+        return {
+            "success": False,
+            "repo": repo,
+            "error": f"{type(e).__name__}: {str(e)}",
+            "traceback": traceback.format_exc(),
+        }
+
+
+async def run_issue_gen_phase_async(
+    repos: list[str],
+    language: str,
+    issue_gen_config: str,
+    issue_gen_workers: int,
+) -> None:
+    """Run issue generation phase for all repos in parallel.
+
+    Args:
+        repos: List of repository names to process
+        language: Programming language filter
+        issue_gen_config: Path to config file
+        issue_gen_workers: Number of workers per repo
+        issue_gen_redo: Whether to regenerate existing issues
+    """
+    print(f"\n{'='*80}")
+    print(f"ISSUE GENERATION PHASE")
+    print(f"{'='*80}")
+    print(f"Processing {len(repos)} repositories...")
+    print(f"Config: {issue_gen_config}")
+    print(f"Workers per repo: {issue_gen_workers}")
+    print()
+
+    # Run issue generation in parallel across all repos
+    results = []
+    async for result in issue_gen_remote.map.aio(
+        repos,
+        kwargs={
+            "language": language,
+            "config": issue_gen_config,
+            "workers": issue_gen_workers,
+        },
+        order_outputs=False,
+    ):
+
+        results.append(result)
+
+        # Print progress
+        completed = len(results)
+        if result["success"]:
+            instances = result.get("instances_processed", 0)
+            print(f"  [{completed}/{len(repos)}] {result['repo']}: ✓ ({instances} instances)")
+        else:
+            error = result.get("error", "Unknown error")
+            print(f"  [{completed}/{len(repos)}] {result['repo']}: ✗ {error}")
+            if "traceback" in result:
+                print(f"    Traceback: {result['traceback'][:200]}...")
+
+    # Summary
+    success = sum(1 for r in results if r["success"])
+    total_instances = sum(r.get("instances_processed", 0) for r in results if r["success"])
+
+    print(f"\nIssue generation complete: {success}/{len(repos)} repos processed successfully.")
+    print(f"Total instances with issues: {total_instances}\n")
+
+
 # ============================================================================
 # Stats Display
 # ============================================================================
@@ -2158,6 +2351,8 @@ async def main(
     max_concurrent_tests: int = 900,
     show_stats: bool = False,
     gather: bool = False,
+    issue_gen_config: str = "configs/issue_gen/ig_v2.yaml",
+    issue_gen_workers: int = 8,
 ):
     """
     Modal Bug Generation & Validation script.
@@ -2166,6 +2361,7 @@ async def main(
     1. Generation: Creates bugs for repos (skips repos that are already done/failed)
     2. Validation: Validates all patches from the volume
     3. Gather: Creates task instances and pushes branches
+    4. Issue Generation: Generates issue descriptions for valid bugs
 
     Run with: modal run scripts/bug_gen.py [OPTIONS]
 
@@ -2179,6 +2375,8 @@ async def main(
         max_concurrent_tests: Max concurrent tests (default: 900)
         show_stats: If True, show bug breakdown stats and exit without running generation/validation
         gather: If True, only run the gather phase (skip generation and validation)
+        issue_gen_config: Path to issue generation config (default: configs/issue_gen/ig_v2.yaml)
+        issue_gen_workers: Number of workers per repo for issue generation (default: 8)
     """
     # Handle --show-stats early exit
     if show_stats:
@@ -2206,7 +2404,6 @@ async def main(
 
     print(f"\n{'=' * 60}")
     print(f"BUG GEN - {len(target_repos)} repos, {max_concurrent_tests} max concurrent")
-    print(f"Volume: {VOLUME_NAME}/{language}/")
     print(f"{'=' * 60}\n")
 
     # Create a simple args-like object for compatibility
@@ -2218,8 +2415,11 @@ class Args:
     args.interleave = interleave
     args.max_entities = max_entities
     args.max_candidates = max_candidates
+    args.timeout_buffer = 60
+    args.max_concurrent_tests = max_concurrent_tests
 
     # Phase 1: Generation (skips repos that are already done/failed)
+    generation_results = []
     if not gather:
         generation_results = await run_generation_phase(target_repos, args, language)
 
@@ -2248,9 +2448,17 @@ class Args:
     else:
         results = []
 
-    # Phase 3: Gather (Create task instances & Push branches)
-    if not results and not gather:
-        print("No validation results found. Skipping gather phase.")
-        return
+    # # Phase 3: Gather (Create task instances & Push branches)
+    # if not results and not gather:
+    #     print("No validation results found. Skipping gather phase.")
+    #     return
+
+    # await run_gather_phase_async(target_repos, language, args)
 
-    await run_gather_phase_async(target_repos, language, args)
+    # Phase 4: Issue Generation
+    await run_issue_gen_phase_async(
+        target_repos,
+        language,
+        issue_gen_config,
+        issue_gen_workers,
+    )
diff --git a/swesmith/issue_gen/generate.py b/swesmith/issue_gen/generate.py
index efba7a26..b2bb29a8 100644
--- a/swesmith/issue_gen/generate.py
+++ b/swesmith/issue_gen/generate.py
@@ -277,6 +277,7 @@ def jinja_shuffle(seq):
         else:
             # If messages already exist, get repos_to_remove from existing metadata
             _, repos_to_remove = self.get_test_functions(instance_curr)
+            messages = metadata["messages"]
 
         # Generate n_instructions completions containing problem statements
         response = completion(

From a50e614c8506f6bc6f43657b5af51b37c8ba642b Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 19 Jan 2026 22:54:44 -0800
Subject: [PATCH 16/32] Support Portkey for issue gen and switch to gpt-5-mini

---
 configs/issue_gen/ig_v2.yaml   |   4 +-
 pyproject.toml                 |   2 +
 scripts/bug_gen_modal.py       |   7 +-
 swesmith/issue_gen/generate.py | 120 +++++++++++++++++++++++++++++++--
 4 files changed, 123 insertions(+), 10 deletions(-)

diff --git a/configs/issue_gen/ig_v2.yaml b/configs/issue_gen/ig_v2.yaml
index 133d51d8..50a93ff4 100644
--- a/configs/issue_gen/ig_v2.yaml
+++ b/configs/issue_gen/ig_v2.yaml
@@ -1,4 +1,6 @@
-model: anthropic/claude-haiku-4-5-20251001
+model: portkey/gpt-5-mini
+litellm_model_name_override: openai/gpt-5-mini
+provider: "@openai"
 system: |-
   You are a software engineer helping to create a realistic dataset of synthetic GitHub issues.
   
diff --git a/pyproject.toml b/pyproject.toml
index a9e830ef..fe2bdb4f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -35,6 +35,7 @@ all = [
     "matplotlib",
     "modal",
     "openai",
+    "portkey-ai",
     "pre-commit",
     "python-dotenv",
     "rich",
@@ -83,6 +84,7 @@ generate = [
     "docker",
     "ghapi",
     "libcst",
+    "portkey-ai",
     "python-dotenv",
     "rich",
     "swebench",
diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 4707036c..63f3189c 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -521,7 +521,10 @@ async def acquire(self):
 
 @app.function(
     image=generator_image,
-    secrets=[modal.Secret.from_name("GITHUB_TOKEN")],
+    secrets=[
+        modal.Secret.from_name("GITHUB_TOKEN"),
+        modal.Secret.from_name("PORTKEY_API_KEY")
+    ],
     timeout=MODAL_TIMEOUT,
     volumes={LOGS_MOUNT_PATH: logs_volume},  # Mount volume for direct writes
 )
@@ -1853,8 +1856,8 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None:
     volumes={LOGS_MOUNT_PATH: logs_volume},
     timeout=3600,
     secrets=[
-        modal.Secret.from_name("ANTHROPIC_API_KEY"),
         modal.Secret.from_name("GITHUB_TOKEN"),
+        modal.Secret.from_name("PORTKEY_API_KEY"),
     ],
 )
 def issue_gen_remote(
diff --git a/swesmith/issue_gen/generate.py b/swesmith/issue_gen/generate.py
index b2bb29a8..765e553b 100644
--- a/swesmith/issue_gen/generate.py
+++ b/swesmith/issue_gen/generate.py
@@ -46,12 +46,88 @@
 )
 from swesmith.issue_gen.utils import get_test_function
 from swesmith.profiles import registry
+from typing import Any, Literal
+from tenacity import (
+    retry,
+    retry_if_not_exception_type,
+    stop_after_attempt,
+    wait_exponential,
+    before_sleep_log,
+)
+from pydantic import BaseModel
+
+try:
+    from portkey_ai import Portkey
+except ImportError:
+    Portkey = None
 
 logging.getLogger("LiteLLM").setLevel(logging.WARNING)
 litellm.drop_params = True
 litellm.suppress_debug_info = True
 
 
+logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
+logger = logging.getLogger(__name__)
+
+class PortkeyModelConfig(BaseModel):
+    model_name: str
+    model_kwargs: dict[str, Any] = {}
+    provider: str = ""
+    litellm_model_name_override: str = ""
+    cost_tracking: Literal["default", "ignore_errors"] = "default"
+
+
+class PortkeyModel:
+    def __init__(self, *, config_class: type = PortkeyModelConfig, **kwargs):
+        if Portkey is None:
+            raise ImportError(
+                "The portkey-ai package is required to use PortkeyModel. Please install it with: pip install portkey-ai"
+            )
+        
+        self.config = config_class(**kwargs)
+        self.cost = 0.0
+        self.n_calls = 0
+
+        # Get API key from environment or raise error
+        self._api_key = os.getenv("PORTKEY_API_KEY")
+        if not self._api_key:
+            raise ValueError(
+                "Portkey API key is required. Set it via the "
+                "PORTKEY_API_KEY environment variable."
+            )
+
+        # Get virtual key from environment
+        virtual_key = os.getenv("PORTKEY_VIRTUAL_KEY")
+
+        # Initialize Portkey client
+        client_kwargs = {"api_key": self._api_key}
+        if virtual_key:
+            client_kwargs["virtual_key"] = virtual_key
+        elif self.config.provider:
+            client_kwargs["provider"] = self.config.provider
+
+        self.client = Portkey(**client_kwargs)
+
+    @retry(
+        reraise=True,
+        stop=stop_after_attempt(10),
+        wait=wait_exponential(multiplier=1, min=4, max=60),
+        retry=retry_if_not_exception_type((KeyboardInterrupt, TypeError, ValueError)),
+        before_sleep=before_sleep_log(logger, logging.WARNING),
+    )
+    def _query(self, messages: list[dict[str, str]], **kwargs):
+        return self.client.chat.completions.create(
+            model=self.config.model_name,
+            messages=messages,
+            **(self.config.model_kwargs | kwargs),
+        )
+
+    def query(self, messages: list[dict[str, str]], **kwargs) -> Any:
+        # Simple adapter to match what generate.py expects (return an object with choices and usage for cost)
+        response = self._query([{"role": msg["role"], "content": msg["content"]} for msg in messages], **kwargs)
+        return response
+
+
 TEST_SRC_CODE_PROMPT = r"""
 **Test Source Code:**
 Use the following test source code to help you write reasonable, effective reproduction code.
@@ -61,9 +137,6 @@
 
 load_dotenv()
 
-logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
-logger = logging.getLogger(__name__)
-
 
 def maybe_shorten(text_str: str, max_tokens: int, model: str) -> str:
     """Shorten text if it exceeds the max_tokens limit.
@@ -92,6 +165,16 @@ def __init__(
         self.n_instructions = settings.get("n_instructions", 1)
         self.max_var_tokens = settings.get("max_var_tokens", 10_000)
 
+        # Initialize Portkey model if needed
+        self.portkey_model = None
+        if self.model.startswith("portkey/") or self.config.get("provider") == "portkey":
+            self.portkey_model = PortkeyModel(
+                model_name=self.model.replace("portkey/", ""),
+                provider=self.config.get("provider", "openai"),
+                litellm_model_name_override=self.config.get("litellm_model_name_override", ""),
+                **settings.get("portkey_kwargs", {})
+            )
+
         data_smith = [x for x in load_dataset(HF_DATASET, split="train")]
         self.dataset = (
             data_smith
@@ -138,6 +221,7 @@ def _should_do_instance(
         self, instance: dict, instance_ids: list | None, redo_existing: bool, model: str
     ) -> bool:
         repo = instance["repo"].split("/")[-1]
+
         output_file = LOG_DIR_ISSUE_GEN / repo / f"{instance[KEY_INSTANCE_ID]}.json"
         if not matches_instance_filter(instance[KEY_INSTANCE_ID], instance_ids):
             return False
@@ -280,11 +364,19 @@ def jinja_shuffle(seq):
             messages = metadata["messages"]
 
         # Generate n_instructions completions containing problem statements
-        response = completion(
-            model=self.model, messages=messages, n=self.n_instructions, temperature=0
-        )
+        if self.portkey_model:
+            response = self.portkey_model.query(messages, n=self.n_instructions, stream=False)
+        else:
+            response = completion(
+                model=self.model, messages=messages, n=self.n_instructions, temperature=0
+            )
 
-        cost = completion_cost(response)
+        model_for_cost = self.model
+        if self.portkey_model and self.portkey_model.config.litellm_model_name_override:
+            model_for_cost = self.portkey_model.config.litellm_model_name_override
+        
+        cost = completion_cost(response, model=model_for_cost)
+            
         metadata["cost"] = (0 if "cost" not in metadata else metadata["cost"]) + cost
 
         # Extract problem statements from response
@@ -343,6 +435,20 @@ def run(self):
         # Track repos to remove for cleanup
         all_repos_to_remove = set()
 
+        # Pre-clone all required repositories to avoid race conditions in parallel execution
+        # (RepoProfile.clone is not thread-safe)
+        unique_repos = {instance["repo"].split("/")[-1] for instance in self.dataset}
+        for repo_name in unique_repos:
+            try:
+                # registry.get(repo_name).clone() returns (dest, cloned)
+                # cloned is True if it actually cloned, False if it already existed
+                _, cloned = registry.get(repo_name).clone()
+                if cloned:
+                    all_repos_to_remove.add(repo_name)
+            except Exception as e:
+                logger.error(f"Failed to pre-clone {repo_name}: {e}")
+                # We continue, assuming it might work later or will fail properly in the thread
+
         # Create a thread pool and call generate_issue for each instance
         with ThreadPoolExecutor(max_workers=self.workers) as executor:
             futures = []

From 9461c1f03c980190116fe4083caf0d36727fcd37 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 19 Jan 2026 23:41:33 -0800
Subject: [PATCH 17/32] Uncomment gather part in bug_gen_modal.py

---
 scripts/bug_gen_modal.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 63f3189c..0884ab29 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -2451,12 +2451,12 @@ class Args:
     else:
         results = []
 
-    # # Phase 3: Gather (Create task instances & Push branches)
-    # if not results and not gather:
-    #     print("No validation results found. Skipping gather phase.")
-    #     return
+    # Phase 3: Gather (Create task instances & Push branches)
+    if not results and not gather:
+        print("No validation results found. Skipping gather phase.")
+        return
 
-    # await run_gather_phase_async(target_repos, language, args)
+    await run_gather_phase_async(target_repos, language, args)
 
     # Phase 4: Issue Generation
     await run_issue_gen_phase_async(

From f7f68cb85ad589c31f4423adda0dc239227ec1a3 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Tue, 20 Jan 2026 00:18:31 -0800
Subject: [PATCH 18/32] Add script to upload task instances to Hugging Face

---
 scripts/upload_tasks_to_hf_modal.py | 169 ++++++++++++++++++++++++++++
 1 file changed, 169 insertions(+)
 create mode 100644 scripts/upload_tasks_to_hf_modal.py

diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py
new file mode 100644
index 00000000..0271bade
--- /dev/null
+++ b/scripts/upload_tasks_to_hf_modal.py
@@ -0,0 +1,169 @@
+import modal
+import json
+import asyncio
+from pathlib import Path
+import sys
+from concurrent.futures import ThreadPoolExecutor
+
+# Define Modal App
+app = modal.App("swesmith-upload-hf")
+vol = modal.Volume.from_name("swesmith-bug-gen")
+
+# Define an image with necessary dependencies
+# We need datasets and huggingface_hub for the remote push
+image = modal.Image.debian_slim().pip_install("tqdm", "datasets", "huggingface_hub")
+
+def _process_single_task(task, issue_gen_dir, repo_id):
+    """Helper to process a single task instance"""
+    instance_id = task.get("instance_id")
+    if not instance_id:
+        return task
+        
+    if "image_name" in task and ".architecture." in task["image_name"]:
+        task["image_name"] = task["image_name"].replace(".architecture", "")
+    
+    task["problem_statement"] = ""
+    issue_file = issue_gen_dir / repo_id / f"{instance_id}.json"
+    
+    if issue_file.exists():
+        try:
+            with open(issue_file, "r") as f_issue:
+                issue_data = json.load(f_issue)
+                resp = issue_data.get("responses", {})
+                if "portkey/gpt-5-mini" in resp:
+                    content = resp["portkey/gpt-5-mini"]
+                    if isinstance(content, list) and len(content) > 0:
+                        task["problem_statement"] = content[0]
+        except Exception:
+            pass
+    return task
+
+@app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10)
+def process_repo(task_filename: str):
+    """(Same as before)"""
+    import concurrent.futures
+    # Assume language is javascript for now or pass it in path
+    language = "javascript"
+    task_file_path = Path(f"/data/{language}/task_insts/{task_filename}")
+    issue_gen_dir = Path(f"/data/{language}/issue_gen")
+    
+    tasks_out = []
+    
+    if not task_file_path.exists():
+        print(f"File not found: {task_file_path}")
+        return []
+        
+    repo_id = task_file_path.stem
+    try:
+        with open(task_file_path, "r") as f:
+            tasks = json.load(f)
+        
+        print(f"[{repo_id}] Processing {len(tasks)} tasks...")
+        
+        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
+            futures = [
+                executor.submit(_process_single_task, task, issue_gen_dir, repo_id)
+                for task in tasks
+            ]
+            
+            for future in concurrent.futures.as_completed(futures):
+                tasks_out.append(future.result())
+                
+    except Exception as e:
+        print(f"[{repo_id}] Error: {e}")
+        
+    return tasks_out
+
+@app.function(
+    image=image,
+    secrets=[modal.Secret.from_name("john-hf-secret")],
+    timeout=1800
+)
+def push_to_hf_remote(all_tasks: list, target_dataset: str):
+    import os
+    from datasets import load_dataset, Dataset, concatenate_datasets
+    from huggingface_hub import create_repo, HfApi
+    
+    print(f"Starting remote upload to {target_dataset}")
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        print("WARNING: HF_TOKEN not found in environment variables!")
+    else:
+        print("HF_TOKEN found in environment variables.")
+
+    # Validation
+    required_keys = ["instance_id", "patch", "FAIL_TO_PASS", "PASS_TO_PASS", "image_name", "repo"]
+    print("Validating keys...")
+    cleaned_tasks = []
+    for task in all_tasks:
+        valid = True
+        for k in required_keys:
+            if k not in task:
+                 print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.")
+                 valid = False
+                 break
+        if not valid:
+            continue
+
+        if "problem_statement" not in task:
+            task["problem_statement"] = ""
+        cleaned_tasks.append(task)
+        
+    print(f"Valid tasks: {len(cleaned_tasks)}")
+    local_dataset = Dataset.from_list(cleaned_tasks)
+    local_ids = set(local_dataset["instance_id"])
+    
+    final_dataset = local_dataset
+    
+    # Try to ensure repo exists
+    print(f"Ensuring repository {target_dataset} exists...")
+    try:
+        create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True)
+    except Exception as e:
+        print(f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong).")
+
+    # print(f"Loading target dataset: {target_dataset}")
+    # try:
+    #     sweb = load_dataset(target_dataset, split="train", token=token)
+    #     print(f"Existing HF dataset size: {len(sweb)}")
+        
+    #     sweb_filtered = sweb.filter(lambda x: x["instance_id"] not in local_ids)
+    #     print(f"Would override {len(sweb) - len(sweb_filtered)} instances")
+        
+    #     final_dataset = concatenate_datasets([sweb_filtered, local_dataset])
+    # except Exception as e:
+    #     print(f"Note: Could not load existing dataset '{target_dataset}' (it might be new or empty). Error: {e}")
+    #     print("Proceeding with creating a new dataset from local tasks.")
+
+    print(f"Pushing {len(final_dataset)} instances to {target_dataset}...")
+    final_dataset.push_to_hub(target_dataset, token=token)
+    print("Remote push finished successfully.")
+
+
+@app.local_entrypoint()
+def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False):
+    print("Listing task files from Modal volume...")
+    try:
+        entries = vol.listdir("javascript/task_insts")
+        filenames = [e.path.split("/")[-1] for e in entries if e.path.endswith(".json")]
+    except Exception as e:
+        print(f"Error listing volume: {e}")
+        return
+
+    print(f"Found {len(filenames)} files. Starting parallel processing...")
+    
+    all_tasks = []
+    for repo_tasks in process_repo.map(filenames):
+        all_tasks.extend(repo_tasks)
+        
+    print(f"Fetched total {len(all_tasks)} task instances.")
+    
+    if not push:
+        confirm = input(f"Ready to push to HF. Proceed? (y/n) ").lower()
+        if confirm != "y":
+            print("Aborting.")
+            return
+
+    print("Launching remote push job...")
+    push_to_hf_remote.remote(all_tasks, target_dataset)
+    print("Done!")

From 4976d857e36d79436451d1f6d3d95c34b6d49129 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 20 Jan 2026 08:20:19 +0000
Subject: [PATCH 19/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/bug_gen_modal.py            | 25 +++++++----
 scripts/upload_tasks_to_hf_modal.py | 67 +++++++++++++++++------------
 swesmith/issue_gen/generate.py      | 32 ++++++++++----
 3 files changed, 78 insertions(+), 46 deletions(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 0884ab29..3f1a69e8 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -523,7 +523,7 @@ async def acquire(self):
     image=generator_image,
     secrets=[
         modal.Secret.from_name("GITHUB_TOKEN"),
-        modal.Secret.from_name("PORTKEY_API_KEY")
+        modal.Secret.from_name("PORTKEY_API_KEY"),
     ],
     timeout=MODAL_TIMEOUT,
     volumes={LOGS_MOUNT_PATH: logs_volume},  # Mount volume for direct writes
@@ -1885,11 +1885,11 @@ def issue_gen_remote(
     # Set up paths
     volume_root = Path(LOGS_MOUNT_PATH) / language
     task_insts_dir = volume_root / "task_insts"
-    
+
     # Resolve task instances file (it may have a hash suffix like repo__name.abcdef.json)
     task_insts_file = None
     repo_sanitized = repo.replace("/", "__")
-    
+
     if task_insts_dir.exists():
         for filename in os.listdir(task_insts_dir):
             # Check for exact match or match with suffix
@@ -1961,6 +1961,7 @@ def issue_gen_remote(
         instances_processed = 0
         if issue_gen_file.exists():
             import json
+
             with open(issue_gen_file) as f:
                 data = json.load(f)
                 instances_processed = len(data)
@@ -1973,6 +1974,7 @@ def issue_gen_remote(
 
     except Exception as e:
         import traceback
+
         return {
             "success": False,
             "repo": repo,
@@ -1996,9 +1998,9 @@ async def run_issue_gen_phase_async(
         issue_gen_workers: Number of workers per repo
         issue_gen_redo: Whether to regenerate existing issues
     """
-    print(f"\n{'='*80}")
+    print(f"\n{'=' * 80}")
     print(f"ISSUE GENERATION PHASE")
-    print(f"{'='*80}")
+    print(f"{'=' * 80}")
     print(f"Processing {len(repos)} repositories...")
     print(f"Config: {issue_gen_config}")
     print(f"Workers per repo: {issue_gen_workers}")
@@ -2015,14 +2017,15 @@ async def run_issue_gen_phase_async(
         },
         order_outputs=False,
     ):
-
         results.append(result)
 
         # Print progress
         completed = len(results)
         if result["success"]:
             instances = result.get("instances_processed", 0)
-            print(f"  [{completed}/{len(repos)}] {result['repo']}: ✓ ({instances} instances)")
+            print(
+                f"  [{completed}/{len(repos)}] {result['repo']}: ✓ ({instances} instances)"
+            )
         else:
             error = result.get("error", "Unknown error")
             print(f"  [{completed}/{len(repos)}] {result['repo']}: ✗ {error}")
@@ -2031,9 +2034,13 @@ async def run_issue_gen_phase_async(
 
     # Summary
     success = sum(1 for r in results if r["success"])
-    total_instances = sum(r.get("instances_processed", 0) for r in results if r["success"])
+    total_instances = sum(
+        r.get("instances_processed", 0) for r in results if r["success"]
+    )
 
-    print(f"\nIssue generation complete: {success}/{len(repos)} repos processed successfully.")
+    print(
+        f"\nIssue generation complete: {success}/{len(repos)} repos processed successfully."
+    )
     print(f"Total instances with issues: {total_instances}\n")
 
 
diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py
index 0271bade..a866584a 100644
--- a/scripts/upload_tasks_to_hf_modal.py
+++ b/scripts/upload_tasks_to_hf_modal.py
@@ -13,18 +13,19 @@
 # We need datasets and huggingface_hub for the remote push
 image = modal.Image.debian_slim().pip_install("tqdm", "datasets", "huggingface_hub")
 
+
 def _process_single_task(task, issue_gen_dir, repo_id):
     """Helper to process a single task instance"""
     instance_id = task.get("instance_id")
     if not instance_id:
         return task
-        
+
     if "image_name" in task and ".architecture." in task["image_name"]:
         task["image_name"] = task["image_name"].replace(".architecture", "")
-    
+
     task["problem_statement"] = ""
     issue_file = issue_gen_dir / repo_id / f"{instance_id}.json"
-    
+
     if issue_file.exists():
         try:
             with open(issue_file, "r") as f_issue:
@@ -38,52 +39,53 @@ def _process_single_task(task, issue_gen_dir, repo_id):
             pass
     return task
 
+
 @app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10)
 def process_repo(task_filename: str):
     """(Same as before)"""
     import concurrent.futures
+
     # Assume language is javascript for now or pass it in path
     language = "javascript"
     task_file_path = Path(f"/data/{language}/task_insts/{task_filename}")
     issue_gen_dir = Path(f"/data/{language}/issue_gen")
-    
+
     tasks_out = []
-    
+
     if not task_file_path.exists():
         print(f"File not found: {task_file_path}")
         return []
-        
+
     repo_id = task_file_path.stem
     try:
         with open(task_file_path, "r") as f:
             tasks = json.load(f)
-        
+
         print(f"[{repo_id}] Processing {len(tasks)} tasks...")
-        
+
         with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
             futures = [
                 executor.submit(_process_single_task, task, issue_gen_dir, repo_id)
                 for task in tasks
             ]
-            
+
             for future in concurrent.futures.as_completed(futures):
                 tasks_out.append(future.result())
-                
+
     except Exception as e:
         print(f"[{repo_id}] Error: {e}")
-        
+
     return tasks_out
 
+
 @app.function(
-    image=image,
-    secrets=[modal.Secret.from_name("john-hf-secret")],
-    timeout=1800
+    image=image, secrets=[modal.Secret.from_name("john-hf-secret")], timeout=1800
 )
 def push_to_hf_remote(all_tasks: list, target_dataset: str):
     import os
     from datasets import load_dataset, Dataset, concatenate_datasets
     from huggingface_hub import create_repo, HfApi
-    
+
     print(f"Starting remote upload to {target_dataset}")
     token = os.environ.get("HF_TOKEN")
     if not token:
@@ -92,44 +94,53 @@ def push_to_hf_remote(all_tasks: list, target_dataset: str):
         print("HF_TOKEN found in environment variables.")
 
     # Validation
-    required_keys = ["instance_id", "patch", "FAIL_TO_PASS", "PASS_TO_PASS", "image_name", "repo"]
+    required_keys = [
+        "instance_id",
+        "patch",
+        "FAIL_TO_PASS",
+        "PASS_TO_PASS",
+        "image_name",
+        "repo",
+    ]
     print("Validating keys...")
     cleaned_tasks = []
     for task in all_tasks:
         valid = True
         for k in required_keys:
             if k not in task:
-                 print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.")
-                 valid = False
-                 break
+                print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.")
+                valid = False
+                break
         if not valid:
             continue
 
         if "problem_statement" not in task:
             task["problem_statement"] = ""
         cleaned_tasks.append(task)
-        
+
     print(f"Valid tasks: {len(cleaned_tasks)}")
     local_dataset = Dataset.from_list(cleaned_tasks)
     local_ids = set(local_dataset["instance_id"])
-    
+
     final_dataset = local_dataset
-    
+
     # Try to ensure repo exists
     print(f"Ensuring repository {target_dataset} exists...")
     try:
         create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True)
     except Exception as e:
-        print(f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong).")
+        print(
+            f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong)."
+        )
 
     # print(f"Loading target dataset: {target_dataset}")
     # try:
     #     sweb = load_dataset(target_dataset, split="train", token=token)
     #     print(f"Existing HF dataset size: {len(sweb)}")
-        
+
     #     sweb_filtered = sweb.filter(lambda x: x["instance_id"] not in local_ids)
     #     print(f"Would override {len(sweb) - len(sweb_filtered)} instances")
-        
+
     #     final_dataset = concatenate_datasets([sweb_filtered, local_dataset])
     # except Exception as e:
     #     print(f"Note: Could not load existing dataset '{target_dataset}' (it might be new or empty). Error: {e}")
@@ -151,13 +162,13 @@ def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False):
         return
 
     print(f"Found {len(filenames)} files. Starting parallel processing...")
-    
+
     all_tasks = []
     for repo_tasks in process_repo.map(filenames):
         all_tasks.extend(repo_tasks)
-        
+
     print(f"Fetched total {len(all_tasks)} task instances.")
-    
+
     if not push:
         confirm = input(f"Ready to push to HF. Proceed? (y/n) ").lower()
         if confirm != "y":
diff --git a/swesmith/issue_gen/generate.py b/swesmith/issue_gen/generate.py
index 765e553b..2042b6fb 100644
--- a/swesmith/issue_gen/generate.py
+++ b/swesmith/issue_gen/generate.py
@@ -69,6 +69,7 @@
 logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s")
 logger = logging.getLogger(__name__)
 
+
 class PortkeyModelConfig(BaseModel):
     model_name: str
     model_kwargs: dict[str, Any] = {}
@@ -83,7 +84,7 @@ def __init__(self, *, config_class: type = PortkeyModelConfig, **kwargs):
             raise ImportError(
                 "The portkey-ai package is required to use PortkeyModel. Please install it with: pip install portkey-ai"
             )
-        
+
         self.config = config_class(**kwargs)
         self.cost = 0.0
         self.n_calls = 0
@@ -124,7 +125,10 @@ def _query(self, messages: list[dict[str, str]], **kwargs):
 
     def query(self, messages: list[dict[str, str]], **kwargs) -> Any:
         # Simple adapter to match what generate.py expects (return an object with choices and usage for cost)
-        response = self._query([{"role": msg["role"], "content": msg["content"]} for msg in messages], **kwargs)
+        response = self._query(
+            [{"role": msg["role"], "content": msg["content"]} for msg in messages],
+            **kwargs,
+        )
         return response
 
 
@@ -167,12 +171,17 @@ def __init__(
 
         # Initialize Portkey model if needed
         self.portkey_model = None
-        if self.model.startswith("portkey/") or self.config.get("provider") == "portkey":
+        if (
+            self.model.startswith("portkey/")
+            or self.config.get("provider") == "portkey"
+        ):
             self.portkey_model = PortkeyModel(
                 model_name=self.model.replace("portkey/", ""),
                 provider=self.config.get("provider", "openai"),
-                litellm_model_name_override=self.config.get("litellm_model_name_override", ""),
-                **settings.get("portkey_kwargs", {})
+                litellm_model_name_override=self.config.get(
+                    "litellm_model_name_override", ""
+                ),
+                **settings.get("portkey_kwargs", {}),
             )
 
         data_smith = [x for x in load_dataset(HF_DATASET, split="train")]
@@ -365,18 +374,23 @@ def jinja_shuffle(seq):
 
         # Generate n_instructions completions containing problem statements
         if self.portkey_model:
-            response = self.portkey_model.query(messages, n=self.n_instructions, stream=False)
+            response = self.portkey_model.query(
+                messages, n=self.n_instructions, stream=False
+            )
         else:
             response = completion(
-                model=self.model, messages=messages, n=self.n_instructions, temperature=0
+                model=self.model,
+                messages=messages,
+                n=self.n_instructions,
+                temperature=0,
             )
 
         model_for_cost = self.model
         if self.portkey_model and self.portkey_model.config.litellm_model_name_override:
             model_for_cost = self.portkey_model.config.litellm_model_name_override
-        
+
         cost = completion_cost(response, model=model_for_cost)
-            
+
         metadata["cost"] = (0 if "cost" not in metadata else metadata["cost"]) + cost
 
         # Extract problem statements from response

From 2ab21d806aaa4a7fe49931200a2b39c2ba66bde2 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Thu, 29 Jan 2026 17:50:07 -0800
Subject: [PATCH 20/32] Refactor bug generation phases to use --phases argument
 instead of --gather flag

---
 scripts/bug_gen_modal.py | 67 +++++++++++++++++++++++-----------------
 1 file changed, 39 insertions(+), 28 deletions(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 3f1a69e8..78900784 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -2360,18 +2360,18 @@ async def main(
     max_candidates: int = 2000,
     max_concurrent_tests: int = 900,
     show_stats: bool = False,
-    gather: bool = False,
+    phases: str = "gen,val,gather,issue",
     issue_gen_config: str = "configs/issue_gen/ig_v2.yaml",
     issue_gen_workers: int = 8,
 ):
     """
     Modal Bug Generation & Validation script.
 
-    Runs two phases:
-    1. Generation: Creates bugs for repos (skips repos that are already done/failed)
-    2. Validation: Validates all patches from the volume
-    3. Gather: Creates task instances and pushes branches
-    4. Issue Generation: Generates issue descriptions for valid bugs
+    Runs phases specified by --phases (comma-separated):
+    - gen: Generation (creates bugs for repos)
+    - val: Validation (validates patches from volume)
+    - gather: Gather (creates task instances & pushes branches)
+    - issue: Issue Generation (generates issue descriptions)
 
     Run with: modal run scripts/bug_gen.py [OPTIONS]
 
@@ -2384,7 +2384,7 @@ async def main(
         max_candidates: Max candidates to process, -1 for all (default: 2000)
         max_concurrent_tests: Max concurrent tests (default: 900)
         show_stats: If True, show bug breakdown stats and exit without running generation/validation
-        gather: If True, only run the gather phase (skip generation and validation)
+        phases: Comma-separated list of phases to run (default: "gen,val,gather,issue")
         issue_gen_config: Path to issue generation config (default: configs/issue_gen/ig_v2.yaml)
         issue_gen_workers: Number of workers per repo for issue generation (default: 4)
     """
@@ -2393,6 +2393,19 @@ async def main(
         await show_volume_stats(language)
         return
 
+    # Parse and validate phases
+    valid_phases = {"gen", "val", "gather", "issue"}
+    phase_list = [p.strip() for p in phases.split(",") if p.strip()]
+    active_phases = set(phase_list)
+
+    invalid_phases = active_phases - valid_phases
+    if invalid_phases:
+        print(f"Error: Invalid phases: {invalid_phases}")
+        print(f"Valid phases are: {valid_phases}")
+        return
+
+    print(f"Running phases: {', '.join(sorted(active_phases))}")
+
     from swesmith.constants import ENV_NAME
 
     # Parse repos (comma-separated string to list)
@@ -2430,10 +2443,12 @@ class Args:
 
     # Phase 1: Generation (skips repos that are already done/failed)
     generation_results = []
-    if not gather:
+    if "gen" in active_phases:
         generation_results = await run_generation_phase(target_repos, args, language)
 
-        # Phase 2: Validation - collect ALL patches from volume (not just from this run)
+    # Phase 2: Validation - collect ALL patches from volume (not just from this run)
+    results = []
+    if "val" in active_phases:
         print(f"\n{'#' * 60}")
         print("# PHASE 2: VALIDATION")
         print(f"{'#' * 60}\n")
@@ -2449,26 +2464,22 @@ class Args:
         if results:
             print_summary(results, len(build_repos_with_patches(all_patches)))
 
-        # Report generation errors from this run
-        errors = [r for r in generation_results if "error" in r]
-        if errors:
-            print(f"\nGeneration Errors ({len(errors)}):")
-            for err in errors:
-                print(f"  - {err['repo']}: {err.get('error', 'Unknown')}")
-    else:
-        results = []
+    # Report generation errors from this run (if any)
+    errors = [r for r in generation_results if "error" in r]
+    if errors:
+        print(f"\nGeneration Errors ({len(errors)}):")
+        for err in errors:
+            print(f"  - {err['repo']}: {err.get('error', 'Unknown')}")
 
     # Phase 3: Gather (Create task instances & Push branches)
-    if not results and not gather:
-        print("No validation results found. Skipping gather phase.")
-        return
-
-    await run_gather_phase_async(target_repos, language, args)
+    if "gather" in active_phases:
+        await run_gather_phase_async(target_repos, language, args)
 
     # Phase 4: Issue Generation
-    await run_issue_gen_phase_async(
-        target_repos,
-        language,
-        issue_gen_config,
-        issue_gen_workers,
-    )
+    if "issue" in active_phases:
+        await run_issue_gen_phase_async(
+            target_repos,
+            language,
+            issue_gen_config,
+            issue_gen_workers,
+        )

From 8979df46c73c9a4ca4682c9e95e2f1a21e42ffc5 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Thu, 5 Feb 2026 23:01:37 -0800
Subject: [PATCH 21/32] Fix gather patch apply failures in worker repos

Cause:
- gather invoked apply commands with a relative patch path (`../logs/run_validation/.../patch.diff`).
- During the Modal gather phase, each worker runs from a temporary repo directory under `/tmp/...`, so that relative path did not resolve to the mounted logs directory.
- `git apply` and fallback `patch` both failed with "can't open patch ... No such file or directory", resulting in dropped instances and empty/underfilled task outputs.

Fix:
- Resolve `patch.diff` to an absolute path before apply.
- Shell-quote that absolute path and pass it to every command in `GIT_APPLY_CMDS`.

Result:
- Patch application no longer depends on the worker's cwd; gather can apply valid Rust patches and produce task instances consistently.
---
 swesmith/harness/gather.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py
index 55243628..e639bd93 100644
--- a/swesmith/harness/gather.py
+++ b/swesmith/harness/gather.py
@@ -29,6 +29,7 @@
 import argparse
 import json
 import os
+import shlex
 import subprocess
 import concurrent.futures
 import functools
@@ -464,9 +465,10 @@ def reset_repo(path):
 
         # Apply patch
         applied = False
+        abs_patch_path = shlex.quote(os.path.abspath(path_patch))
         for git_apply in GIT_APPLY_CMDS:
             output = subprocess.run(
-                f"{git_apply} ../{path_patch}",
+                f"{git_apply} {abs_patch_path}",
                 cwd=repo_path,
                 capture_output=True,
                 shell=True,

From 17fe59ba53388696143ed5404b3cb88ad7a087e8 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 6 Feb 2026 01:02:29 -0800
Subject: [PATCH 22/32] Make HF upload script language-aware for non-JS
 datasets

Root cause: upload_tasks_to_hf_modal.py was hardcoded to javascript paths in both task discovery and per-repo processing. Running with --language rust still read /data/javascript/... and javascript/task_insts, which breaks Rust upload workflows and can surface as missing/empty problem statements for non-JS datasets.

Fix: thread a language argument through the local entrypoint and worker function, list files from {language}/task_insts, and pass language explicitly through process_repo.map so each worker reads /data/{language}/task_insts and /data/{language}/issue_gen.
---
 scripts/upload_tasks_to_hf_modal.py | 14 ++++++++------
 1 file changed, 8 insertions(+), 6 deletions(-)

diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py
index a866584a..eb08d5a3 100644
--- a/scripts/upload_tasks_to_hf_modal.py
+++ b/scripts/upload_tasks_to_hf_modal.py
@@ -41,12 +41,10 @@ def _process_single_task(task, issue_gen_dir, repo_id):
 
 
 @app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10)
-def process_repo(task_filename: str):
+def process_repo(task_filename: str, language: str = "javascript"):
     """(Same as before)"""
     import concurrent.futures
 
-    # Assume language is javascript for now or pass it in path
-    language = "javascript"
     task_file_path = Path(f"/data/{language}/task_insts/{task_filename}")
     issue_gen_dir = Path(f"/data/{language}/issue_gen")
 
@@ -152,10 +150,14 @@ def push_to_hf_remote(all_tasks: list, target_dataset: str):
 
 
 @app.local_entrypoint()
-def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False):
+def main(
+    target_dataset: str = "SWE-bench/SWE-smith-js",
+    language: str = "javascript",
+    push: bool = False,
+):
     print("Listing task files from Modal volume...")
     try:
-        entries = vol.listdir("javascript/task_insts")
+        entries = vol.listdir(f"{language}/task_insts")
         filenames = [e.path.split("/")[-1] for e in entries if e.path.endswith(".json")]
     except Exception as e:
         print(f"Error listing volume: {e}")
@@ -164,7 +166,7 @@ def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False):
     print(f"Found {len(filenames)} files. Starting parallel processing...")
 
     all_tasks = []
-    for repo_tasks in process_repo.map(filenames):
+    for repo_tasks in process_repo.map(filenames, [language] * len(filenames)):
         all_tasks.extend(repo_tasks)
 
     print(f"Fetched total {len(all_tasks)} task instances.")

From 9c58d47b25c744d487902c361a5d5774cb4b0991 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Wed, 11 Feb 2026 22:30:03 -0800
Subject: [PATCH 23/32] Make Modal HF upload robust with single remote path

---
 scripts/upload_tasks_to_hf_modal.py | 233 ++++++++++++----------------
 1 file changed, 101 insertions(+), 132 deletions(-)

diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py
index eb08d5a3..0953903f 100644
--- a/scripts/upload_tasks_to_hf_modal.py
+++ b/scripts/upload_tasks_to_hf_modal.py
@@ -1,21 +1,25 @@
-import modal
 import json
-import asyncio
 from pathlib import Path
-import sys
-from concurrent.futures import ThreadPoolExecutor
 
-# Define Modal App
+import modal
+
 app = modal.App("swesmith-upload-hf")
 vol = modal.Volume.from_name("swesmith-bug-gen")
+image = modal.Image.debian_slim().pip_install("datasets", "huggingface_hub")
 
-# Define an image with necessary dependencies
-# We need datasets and huggingface_hub for the remote push
-image = modal.Image.debian_slim().pip_install("tqdm", "datasets", "huggingface_hub")
+REQUIRED_KEYS = [
+    "instance_id",
+    "patch",
+    "FAIL_TO_PASS",
+    "PASS_TO_PASS",
+    "image_name",
+    "repo",
+]
+ISSUE_MODEL_KEY = "portkey/gpt-5-mini"
 
 
-def _process_single_task(task, issue_gen_dir, repo_id):
-    """Helper to process a single task instance"""
+def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> dict:
+    """Attach issue text and normalize fields for a task instance."""
     instance_id = task.get("instance_id")
     if not instance_id:
         return task
@@ -25,129 +29,105 @@ def _process_single_task(task, issue_gen_dir, repo_id):
 
     task["problem_statement"] = ""
     issue_file = issue_gen_dir / repo_id / f"{instance_id}.json"
+    if not issue_file.exists():
+        return task
 
-    if issue_file.exists():
-        try:
-            with open(issue_file, "r") as f_issue:
-                issue_data = json.load(f_issue)
-                resp = issue_data.get("responses", {})
-                if "portkey/gpt-5-mini" in resp:
-                    content = resp["portkey/gpt-5-mini"]
-                    if isinstance(content, list) and len(content) > 0:
-                        task["problem_statement"] = content[0]
-        except Exception:
-            pass
-    return task
-
-
-@app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10)
-def process_repo(task_filename: str, language: str = "javascript"):
-    """(Same as before)"""
-    import concurrent.futures
-
-    task_file_path = Path(f"/data/{language}/task_insts/{task_filename}")
-    issue_gen_dir = Path(f"/data/{language}/issue_gen")
-
-    tasks_out = []
-
-    if not task_file_path.exists():
-        print(f"File not found: {task_file_path}")
-        return []
-
-    repo_id = task_file_path.stem
     try:
-        with open(task_file_path, "r") as f:
-            tasks = json.load(f)
-
-        print(f"[{repo_id}] Processing {len(tasks)} tasks...")
-
-        with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor:
-            futures = [
-                executor.submit(_process_single_task, task, issue_gen_dir, repo_id)
-                for task in tasks
-            ]
-
-            for future in concurrent.futures.as_completed(futures):
-                tasks_out.append(future.result())
+        issue_data = json.loads(issue_file.read_text())
+    except Exception:
+        return task
 
-    except Exception as e:
-        print(f"[{repo_id}] Error: {e}")
+    responses = issue_data.get("responses", {})
+    content = responses.get(ISSUE_MODEL_KEY)
+    if isinstance(content, list) and content:
+        task["problem_statement"] = content[0]
 
-    return tasks_out
+    return task
 
 
 @app.function(
-    image=image, secrets=[modal.Secret.from_name("john-hf-secret")], timeout=1800
+    image=image,
+    volumes={"/data": vol},
+    secrets=[modal.Secret.from_name("john-hf-secret")],
+    timeout=10800,
 )
-def push_to_hf_remote(all_tasks: list, target_dataset: str):
+def upload_from_volume_remote(target_dataset: str, language: str = "javascript") -> dict:
+    """Robust end-to-end upload: volume -> issue merge -> validate -> HF push."""
     import os
-    from datasets import load_dataset, Dataset, concatenate_datasets
-    from huggingface_hub import create_repo, HfApi
+    from datasets import Dataset
+    from huggingface_hub import create_repo
 
-    print(f"Starting remote upload to {target_dataset}")
     token = os.environ.get("HF_TOKEN")
     if not token:
-        print("WARNING: HF_TOKEN not found in environment variables!")
-    else:
-        print("HF_TOKEN found in environment variables.")
-
-    # Validation
-    required_keys = [
-        "instance_id",
-        "patch",
-        "FAIL_TO_PASS",
-        "PASS_TO_PASS",
-        "image_name",
-        "repo",
-    ]
-    print("Validating keys...")
+        return {"success": False, "error": "HF_TOKEN not found in environment"}
+
+    task_insts_dir = Path(f"/data/{language}/task_insts")
+    issue_gen_dir = Path(f"/data/{language}/issue_gen")
+    if not task_insts_dir.exists():
+        return {"success": False, "error": f"Missing task_insts dir: {task_insts_dir}"}
+
+    task_files = sorted(task_insts_dir.glob("*.json"))
+    if not task_files:
+        return {"success": False, "error": f"No task files in {task_insts_dir}"}
+
+    print(f"Found {len(task_files)} task files in volume.")
+
     cleaned_tasks = []
-    for task in all_tasks:
-        valid = True
-        for k in required_keys:
-            if k not in task:
-                print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.")
-                valid = False
-                break
-        if not valid:
+    skipped_missing_keys = 0
+    repos_processed = 0
+    repos_failed = 0
+
+    for task_file in task_files:
+        repo_id = task_file.stem
+        try:
+            tasks = json.loads(task_file.read_text())
+        except Exception as e:
+            repos_failed += 1
+            print(f"[{repo_id}] Failed to read tasks: {e}")
             continue
 
-        if "problem_statement" not in task:
-            task["problem_statement"] = ""
-        cleaned_tasks.append(task)
+        repos_processed += 1
+        print(f"[{repo_id}] Processing {len(tasks)} tasks...")
+
+        for task in tasks:
+            task = _attach_issue_statement(task, issue_gen_dir, repo_id)
+            if all(k in task for k in REQUIRED_KEYS):
+                if "problem_statement" not in task:
+                    task["problem_statement"] = ""
+                cleaned_tasks.append(task)
+            else:
+                skipped_missing_keys += 1
+
+        print(f"[{repo_id}] Done")
+
+    if not cleaned_tasks:
+        return {
+            "success": False,
+            "error": "No valid tasks to upload",
+            "repos_processed": repos_processed,
+            "repos_failed": repos_failed,
+            "skipped_missing_keys": skipped_missing_keys,
+        }
 
     print(f"Valid tasks: {len(cleaned_tasks)}")
-    local_dataset = Dataset.from_list(cleaned_tasks)
-    local_ids = set(local_dataset["instance_id"])
+    dataset = Dataset.from_list(cleaned_tasks)
 
-    final_dataset = local_dataset
+    print(f"Ensuring dataset repo exists: {target_dataset}")
+    create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True)
 
-    # Try to ensure repo exists
-    print(f"Ensuring repository {target_dataset} exists...")
-    try:
-        create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True)
-    except Exception as e:
-        print(
-            f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong)."
-        )
-
-    # print(f"Loading target dataset: {target_dataset}")
-    # try:
-    #     sweb = load_dataset(target_dataset, split="train", token=token)
-    #     print(f"Existing HF dataset size: {len(sweb)}")
-
-    #     sweb_filtered = sweb.filter(lambda x: x["instance_id"] not in local_ids)
-    #     print(f"Would override {len(sweb) - len(sweb_filtered)} instances")
-
-    #     final_dataset = concatenate_datasets([sweb_filtered, local_dataset])
-    # except Exception as e:
-    #     print(f"Note: Could not load existing dataset '{target_dataset}' (it might be new or empty). Error: {e}")
-    #     print("Proceeding with creating a new dataset from local tasks.")
-
-    print(f"Pushing {len(final_dataset)} instances to {target_dataset}...")
-    final_dataset.push_to_hub(target_dataset, token=token)
+    print(f"Pushing {len(dataset)} instances to {target_dataset}...")
+    dataset.push_to_hub(target_dataset, token=token)
     print("Remote push finished successfully.")
 
+    return {
+        "success": True,
+        "target_dataset": target_dataset,
+        "repos_processed": repos_processed,
+        "repos_failed": repos_failed,
+        "instances_uploaded": len(cleaned_tasks),
+        "skipped_missing_keys": skipped_missing_keys,
+    }
+
 
 @app.local_entrypoint()
 def main(
@@ -155,28 +135,17 @@ def main(
     language: str = "javascript",
     push: bool = False,
 ):
-    print("Listing task files from Modal volume...")
-    try:
-        entries = vol.listdir(f"{language}/task_insts")
-        filenames = [e.path.split("/")[-1] for e in entries if e.path.endswith(".json")]
-    except Exception as e:
-        print(f"Error listing volume: {e}")
-        return
-
-    print(f"Found {len(filenames)} files. Starting parallel processing...")
-
-    all_tasks = []
-    for repo_tasks in process_repo.map(filenames, [language] * len(filenames)):
-        all_tasks.extend(repo_tasks)
-
-    print(f"Fetched total {len(all_tasks)} task instances.")
-
     if not push:
-        confirm = input(f"Ready to push to HF. Proceed? (y/n) ").lower()
+        confirm = input(
+            f"Run remote upload to '{target_dataset}' for language '{language}'? (y/n) "
+        ).lower()
         if confirm != "y":
             print("Aborting.")
             return
 
-    print("Launching remote push job...")
-    push_to_hf_remote.remote(all_tasks, target_dataset)
-    print("Done!")
+    print("Starting robust remote upload...")
+    result = upload_from_volume_remote.remote(target_dataset, language)
+    print(json.dumps(result, indent=2))
+
+    if not result.get("success"):
+        raise RuntimeError(result.get("error", "Upload failed"))

From 3b9fe1defd158204f8255451cee152c896924655 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Thu, 12 Feb 2026 06:30:21 +0000
Subject: [PATCH 24/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/upload_tasks_to_hf_modal.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py
index 0953903f..d23cfc0f 100644
--- a/scripts/upload_tasks_to_hf_modal.py
+++ b/scripts/upload_tasks_to_hf_modal.py
@@ -51,7 +51,9 @@ def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> di
     secrets=[modal.Secret.from_name("john-hf-secret")],
     timeout=10800,
 )
-def upload_from_volume_remote(target_dataset: str, language: str = "javascript") -> dict:
+def upload_from_volume_remote(
+    target_dataset: str, language: str = "javascript"
+) -> dict:
     """Robust end-to-end upload: volume -> issue merge -> validate -> HF push."""
     import os
     from datasets import Dataset

From 0a61157b65cd0cf0a35353189d4ca0dbae9fc9d8 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 27 Feb 2026 18:52:17 -0800
Subject: [PATCH 25/32] Support backfilling patch diffs

---
 scripts/backfill_patchdiff_modal.py | 111 ++++++++++++++++++++++++++++
 1 file changed, 111 insertions(+)
 create mode 100644 scripts/backfill_patchdiff_modal.py

diff --git a/scripts/backfill_patchdiff_modal.py b/scripts/backfill_patchdiff_modal.py
new file mode 100644
index 00000000..06ca5fc8
--- /dev/null
+++ b/scripts/backfill_patchdiff_modal.py
@@ -0,0 +1,111 @@
+import json
+from pathlib import Path
+
+import modal
+
+APP_NAME = "swesmith-backfill-patchdiff"
+VOLUME_NAME = "swesmith-bug-gen"
+LOGS_MOUNT_PATH = "/logs"
+
+app = modal.App(APP_NAME)
+logs_volume = modal.Volume.from_name(VOLUME_NAME)
+
+
+@app.function(
+    timeout=3600,
+    volumes={LOGS_MOUNT_PATH: logs_volume},
+    max_containers=20,
+)
+def backfill_repo(repo_patch_file: str, language: str = "java") -> dict:
+    bug_file = Path(LOGS_MOUNT_PATH) / language / "bug_gen" / repo_patch_file
+    repo_id = repo_patch_file.replace("_all_patches.json", "")
+    run_val_repo_dir = Path(LOGS_MOUNT_PATH) / language / "run_validation" / repo_id
+
+    if not bug_file.exists():
+        return {"repo_id": repo_id, "status": "skipped", "reason": "missing bug_gen file"}
+
+    if not run_val_repo_dir.exists():
+        return {"repo_id": repo_id, "status": "skipped", "reason": "missing run_validation repo dir"}
+
+    try:
+        patches = json.loads(bug_file.read_text())
+    except Exception as e:
+        return {"repo_id": repo_id, "status": "error", "error": f"parse patches: {e}"}
+
+    eligible = 0
+    written = 0
+    missing_instance = 0
+    missing_patch = 0
+
+    for patch in patches:
+        instance_id = patch.get("instance_id")
+        if not instance_id:
+            continue
+
+        instance_dir = run_val_repo_dir / instance_id
+        if not instance_dir.exists():
+            missing_instance += 1
+            continue
+
+        eligible += 1
+        patch_text = patch.get("patch")
+        if not patch_text:
+            missing_patch += 1
+            continue
+
+        (instance_dir / "patch.diff").write_text(patch_text)
+        written += 1
+
+    return {
+        "repo_id": repo_id,
+        "status": "ok",
+        "eligible": eligible,
+        "written": written,
+        "missing_instance": missing_instance,
+        "missing_patch": missing_patch,
+    }
+
+
+@app.local_entrypoint()
+def main(language: str = "java"):
+    entries = logs_volume.listdir(f"{language}/bug_gen")
+    patch_files = [e.path.split("/")[-1] for e in entries if e.path.endswith("_all_patches.json")]
+
+    print(f"Found {len(patch_files)} patch files in {language}/bug_gen")
+
+    total_eligible = 0
+    total_written = 0
+    total_missing_instance = 0
+    total_missing_patch = 0
+    ok = 0
+    skipped = 0
+    failed = 0
+
+    for i, result in enumerate(
+        backfill_repo.map(patch_files, [language] * len(patch_files), order_outputs=False),
+        start=1,
+    ):
+        status = result.get("status")
+        repo_id = result.get("repo_id", "unknown")
+        if status == "ok":
+            ok += 1
+            total_eligible += result.get("eligible", 0)
+            total_written += result.get("written", 0)
+            total_missing_instance += result.get("missing_instance", 0)
+            total_missing_patch += result.get("missing_patch", 0)
+            if i % 10 == 0 or result.get("written", 0) > 0:
+                print(f"[{i}/{len(patch_files)}] {repo_id}: wrote {result.get('written', 0)}")
+        elif status == "skipped":
+            skipped += 1
+        else:
+            failed += 1
+            print(f"[{i}/{len(patch_files)}] {repo_id}: ERROR {result.get('error')}")
+
+    print("\nBackfill summary")
+    print(f"  repos_ok:           {ok}")
+    print(f"  repos_skipped:      {skipped}")
+    print(f"  repos_failed:       {failed}")
+    print(f"  eligible_instances: {total_eligible}")
+    print(f"  patchdiff_written:  {total_written}")
+    print(f"  missing_instance:   {total_missing_instance}")
+    print(f"  missing_patch:      {total_missing_patch}")

From b2893eeb78c9309422637ef6bbaa7c957f7cb24f Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Fri, 27 Feb 2026 18:52:45 -0800
Subject: [PATCH 26/32] Relax gather timeout to 1 hour

---
 scripts/bug_gen_modal.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 6a0a132f..9a2bf102 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -1643,7 +1643,8 @@ def print_summary(results: list[dict], repos_count: int):
 @app.function(
     image=generator_image,
     secrets=[modal.Secret.from_name("GITHUB_TOKEN")],
-    timeout=MODAL_TIMEOUT,
+    # Gather can push hundreds of branches for large repos; 10 minutes is too low.
+    timeout=60 * MINUTES,
     volumes={LOGS_MOUNT_PATH: logs_volume},
 )
 def gather_remote(

From e36f583aec03f0ea62a12abb0b199197481811f4 Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Sun, 8 Mar 2026 22:11:06 +0000
Subject: [PATCH 27/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/backfill_patchdiff_modal.py | 24 +++++++++++++++++++-----
 1 file changed, 19 insertions(+), 5 deletions(-)

diff --git a/scripts/backfill_patchdiff_modal.py b/scripts/backfill_patchdiff_modal.py
index 06ca5fc8..114d5040 100644
--- a/scripts/backfill_patchdiff_modal.py
+++ b/scripts/backfill_patchdiff_modal.py
@@ -22,10 +22,18 @@ def backfill_repo(repo_patch_file: str, language: str = "java") -> dict:
     run_val_repo_dir = Path(LOGS_MOUNT_PATH) / language / "run_validation" / repo_id
 
     if not bug_file.exists():
-        return {"repo_id": repo_id, "status": "skipped", "reason": "missing bug_gen file"}
+        return {
+            "repo_id": repo_id,
+            "status": "skipped",
+            "reason": "missing bug_gen file",
+        }
 
     if not run_val_repo_dir.exists():
-        return {"repo_id": repo_id, "status": "skipped", "reason": "missing run_validation repo dir"}
+        return {
+            "repo_id": repo_id,
+            "status": "skipped",
+            "reason": "missing run_validation repo dir",
+        }
 
     try:
         patches = json.loads(bug_file.read_text())
@@ -69,7 +77,9 @@ def backfill_repo(repo_patch_file: str, language: str = "java") -> dict:
 @app.local_entrypoint()
 def main(language: str = "java"):
     entries = logs_volume.listdir(f"{language}/bug_gen")
-    patch_files = [e.path.split("/")[-1] for e in entries if e.path.endswith("_all_patches.json")]
+    patch_files = [
+        e.path.split("/")[-1] for e in entries if e.path.endswith("_all_patches.json")
+    ]
 
     print(f"Found {len(patch_files)} patch files in {language}/bug_gen")
 
@@ -82,7 +92,9 @@ def main(language: str = "java"):
     failed = 0
 
     for i, result in enumerate(
-        backfill_repo.map(patch_files, [language] * len(patch_files), order_outputs=False),
+        backfill_repo.map(
+            patch_files, [language] * len(patch_files), order_outputs=False
+        ),
         start=1,
     ):
         status = result.get("status")
@@ -94,7 +106,9 @@ def main(language: str = "java"):
             total_missing_instance += result.get("missing_instance", 0)
             total_missing_patch += result.get("missing_patch", 0)
             if i % 10 == 0 or result.get("written", 0) > 0:
-                print(f"[{i}/{len(patch_files)}] {repo_id}: wrote {result.get('written', 0)}")
+                print(
+                    f"[{i}/{len(patch_files)}] {repo_id}: wrote {result.get('written', 0)}"
+                )
         elif status == "skipped":
             skipped += 1
         else:

From 0ea5927c36cab069eb86cab87c276bce53502cd6 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 9 Mar 2026 11:13:42 -0700
Subject: [PATCH 28/32] Add modal helpers for issue-task uploads

---
 scripts/issue_gen_redo_existing_modal.py | 97 ++++++++++++++++++++++++
 scripts/overwrite_and_dedup_tasks.py     | 84 ++++++++++++++++++++
 scripts/upload_tasks_to_hf_modal.py      | 40 +++-------
 3 files changed, 193 insertions(+), 28 deletions(-)
 create mode 100644 scripts/issue_gen_redo_existing_modal.py
 create mode 100644 scripts/overwrite_and_dedup_tasks.py

diff --git a/scripts/issue_gen_redo_existing_modal.py b/scripts/issue_gen_redo_existing_modal.py
new file mode 100644
index 00000000..2d7445eb
--- /dev/null
+++ b/scripts/issue_gen_redo_existing_modal.py
@@ -0,0 +1,97 @@
+import json
+import os
+from pathlib import Path
+
+import modal
+
+from scripts.bug_gen_modal import generator_image
+
+VOLUME_NAME = "swesmith-bug-gen"
+LOGS_MOUNT_PATH = "/logs"
+
+app = modal.App("issue-gen-redo-existing")
+logs_volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True, version=2)
+
+
+@app.function(
+    image=generator_image,
+    volumes={LOGS_MOUNT_PATH: logs_volume},
+    timeout=3600,
+    secrets=[
+        modal.Secret.from_name("GITHUB_TOKEN"),
+        modal.Secret.from_name("PORTKEY_API_KEY"),
+    ],
+)
+def redo_issue_gen_remote(
+    repo: str,
+    language: str = "cpp",
+    config: str = "configs/issue_gen/ig_v2.yaml",
+    workers: int = 8,
+) -> dict:
+    from swesmith.issue_gen.generate import IssueGen
+
+    volume_root = Path(LOGS_MOUNT_PATH) / language
+    task_insts_dir = volume_root / "task_insts"
+
+    task_insts_file = None
+    repo_sanitized = repo.replace("/", "__")
+    if task_insts_dir.exists():
+        for filename in os.listdir(task_insts_dir):
+            if filename == f"{repo_sanitized}.json" or (
+                filename.startswith(f"{repo_sanitized}.") and filename.endswith(".json")
+            ):
+                task_insts_file = task_insts_dir / filename
+                break
+
+    if not task_insts_file or not task_insts_file.exists():
+        return {
+            "success": False,
+            "repo": repo,
+            "error": "No task instances file found",
+        }
+
+    local_logs = Path("/root/logs")
+    local_logs.mkdir(parents=True, exist_ok=True)
+    for subdir in ["task_insts", "run_validation", "issue_gen"]:
+        local_subdir = local_logs / subdir
+        volume_subdir = volume_root / subdir
+        volume_subdir.mkdir(parents=True, exist_ok=True)
+        try:
+            if local_subdir.exists() or local_subdir.is_symlink():
+                local_subdir.unlink()
+            local_subdir.symlink_to(volume_subdir)
+        except FileExistsError:
+            pass
+
+    issue_gen = IssueGen(
+        dataset_path=str(task_insts_file),
+        config_file=Path(config),
+        workers=workers,
+        redo_existing=True,
+    )
+    issue_gen.run()
+
+    ig_file = task_insts_file.parent / f"{task_insts_file.stem}__ig_llm.json"
+    issue_count = 0
+    if ig_file.exists():
+        data = json.loads(ig_file.read_text())
+        issue_count = sum(1 for row in data if (row.get("problem_statement") or "").strip())
+
+    return {
+        "success": True,
+        "repo": repo,
+        "task_insts_file": str(task_insts_file),
+        "ig_file": str(ig_file),
+        "issue_count": issue_count,
+    }
+
+
+@app.local_entrypoint()
+def main(
+    repo: str,
+    language: str = "cpp",
+    config: str = "configs/issue_gen/ig_v2.yaml",
+    workers: int = 8,
+):
+    result = redo_issue_gen_remote.remote(repo, language, config, workers)
+    print(json.dumps(result, indent=2))
diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py
new file mode 100644
index 00000000..96b991c9
--- /dev/null
+++ b/scripts/overwrite_and_dedup_tasks.py
@@ -0,0 +1,84 @@
+"""
+Modal script to filter and overwrite SWE-smith datasets on Hugging Face.
+
+This variant intentionally does NOT do task aggregation or issue generation.
+It only:
+- Loads a source HF dataset.
+- Filters out rows with empty `problem_statement`.
+- Pushes the filtered dataset to a target HF dataset, overwriting existing contents.
+"""
+
+import os
+
+import modal
+from datasets import Dataset, DatasetDict, load_dataset
+from huggingface_hub import create_repo
+
+app = modal.App("swesmith-overwrite-hf")
+image = modal.Image.debian_slim().pip_install("datasets", "huggingface_hub")
+
+
+@app.function(
+    image=image,
+    secrets=[modal.Secret.from_name("john-hf-secret")],
+    timeout=10800,
+)
+def filter_and_overwrite_remote(
+    source_dataset: str = "SWE-bench/SWE-smith-ts",
+    target_dataset: str = "SWE-bench/SWE-smith-ts",
+    source_split: str = "train",
+) -> dict:
+    """Filter source_dataset for non-empty problem_statement and push to target_dataset."""
+    token = os.environ.get("HF_TOKEN")
+    if not token:
+        return {"success": False, "error": "HF_TOKEN not found in environment"}
+
+    print(f"Loading source dataset: {source_dataset} split={source_split}")
+    ds = load_dataset(source_dataset, split=source_split)
+    print(f"Source rows: {len(ds)}")
+
+    filtered = ds.filter(lambda row: bool(str(row.get("problem_statement") or "").strip()))
+
+    print(f"Filtered rows (non-empty problem_statement): {len(filtered)}")
+    print(f"Dropped rows: {len(ds) - len(filtered)}")
+
+    create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True)
+
+    DatasetDict({"train": filtered}).push_to_hub(target_dataset, token=token)
+
+    return {
+        "success": True,
+        "source_dataset": source_dataset,
+        "target_dataset": target_dataset,
+        "source_split": source_split,
+        "source_rows": len(ds),
+        "kept_rows": len(filtered),
+        "dropped_rows": len(ds) - len(filtered),
+    }
+
+
+@app.local_entrypoint()
+def main(
+    source_dataset: str = "SWE-bench/SWE-smith-ts",
+    target_dataset: str = "SWE-bench/SWE-smith-ts",
+    source_split: str = "train",
+    push: bool = False,
+):
+    if not push:
+        confirm = input(
+            f"Overwrite '{target_dataset}' from '{source_dataset}' ({source_split}) with non-empty problem_statement? (y/n) "
+        ).lower()
+        if confirm != "y":
+            print("Aborting.")
+            return
+
+    print("Starting remote filter-and-overwrite...")
+    result = filter_and_overwrite_remote.remote(
+        source_dataset=source_dataset,
+        target_dataset=target_dataset,
+        source_split=source_split,
+    )
+    print(result)
+
+    if not result.get("success"):
+        raise RuntimeError(result.get("error", "Upload failed"))
diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py
index d23cfc0f..3f90cb3d 100644
--- a/scripts/upload_tasks_to_hf_modal.py
+++ b/scripts/upload_tasks_to_hf_modal.py
@@ -18,29 +18,13 @@
 ISSUE_MODEL_KEY = "portkey/gpt-5-mini"
 
 
-def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> dict:
-    """Attach issue text and normalize fields for a task instance."""
-    instance_id = task.get("instance_id")
-    if not instance_id:
-        return task
-
+def _normalize_task(task: dict) -> dict:
+    """Normalize fields for a task instance before upload."""
     if "image_name" in task and ".architecture." in task["image_name"]:
         task["image_name"] = task["image_name"].replace(".architecture", "")
 
-    task["problem_statement"] = ""
-    issue_file = issue_gen_dir / repo_id / f"{instance_id}.json"
-    if not issue_file.exists():
-        return task
-
-    try:
-        issue_data = json.loads(issue_file.read_text())
-    except Exception:
-        return task
-
-    responses = issue_data.get("responses", {})
-    content = responses.get(ISSUE_MODEL_KEY)
-    if isinstance(content, list) and content:
-        task["problem_statement"] = content[0]
+    if "problem_statement" not in task:
+        task["problem_statement"] = ""
 
     return task
 
@@ -54,7 +38,7 @@ def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> di
 def upload_from_volume_remote(
     target_dataset: str, language: str = "javascript"
 ) -> dict:
-    """Robust end-to-end upload: volume -> issue merge -> validate -> HF push."""
+    """Upload issue-generated task instances from the Modal volume to HF."""
     import os
     from datasets import Dataset
     from huggingface_hub import create_repo
@@ -64,15 +48,17 @@ def upload_from_volume_remote(
         return {"success": False, "error": "HF_TOKEN not found in environment"}
 
     task_insts_dir = Path(f"/data/{language}/task_insts")
-    issue_gen_dir = Path(f"/data/{language}/issue_gen")
     if not task_insts_dir.exists():
         return {"success": False, "error": f"Missing task_insts dir: {task_insts_dir}"}
 
-    task_files = sorted(task_insts_dir.glob("*.json"))
+    task_files = sorted(task_insts_dir.glob("*__ig_llm.json"))
     if not task_files:
-        return {"success": False, "error": f"No task files in {task_insts_dir}"}
+        return {
+            "success": False,
+            "error": f"No __ig_llm task files in {task_insts_dir}",
+        }
 
-    print(f"Found {len(task_files)} task files in volume.")
+    print(f"Found {len(task_files)} __ig_llm task files in volume.")
 
     cleaned_tasks = []
     skipped_missing_keys = 0
@@ -92,10 +78,8 @@ def upload_from_volume_remote(
         print(f"[{repo_id}] Processing {len(tasks)} tasks...")
 
         for task in tasks:
-            task = _attach_issue_statement(task, issue_gen_dir, repo_id)
+            task = _normalize_task(task)
             if all(k in task for k in REQUIRED_KEYS):
-                if "problem_statement" not in task:
-                    task["problem_statement"] = ""
                 cleaned_tasks.append(task)
             else:
                 skipped_missing_keys += 1

From 3ce0bd2260c7fe08fe1cf87d52e1e8b939ce14be Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]"
 <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Mon, 9 Mar 2026 18:17:10 +0000
Subject: [PATCH 29/32] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 scripts/issue_gen_redo_existing_modal.py | 4 +++-
 scripts/overwrite_and_dedup_tasks.py     | 4 +++-
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/scripts/issue_gen_redo_existing_modal.py b/scripts/issue_gen_redo_existing_modal.py
index 2d7445eb..be015bdd 100644
--- a/scripts/issue_gen_redo_existing_modal.py
+++ b/scripts/issue_gen_redo_existing_modal.py
@@ -75,7 +75,9 @@ def redo_issue_gen_remote(
     issue_count = 0
     if ig_file.exists():
         data = json.loads(ig_file.read_text())
-        issue_count = sum(1 for row in data if (row.get("problem_statement") or "").strip())
+        issue_count = sum(
+            1 for row in data if (row.get("problem_statement") or "").strip()
+        )
 
     return {
         "success": True,
diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py
index 96b991c9..51a250e8 100644
--- a/scripts/overwrite_and_dedup_tasks.py
+++ b/scripts/overwrite_and_dedup_tasks.py
@@ -37,7 +37,9 @@ def filter_and_overwrite_remote(
     ds = load_dataset(source_dataset, split=source_split)
     print(f"Source rows: {len(ds)}")
 
-    filtered = ds.filter(lambda row: bool(str(row.get("problem_statement") or "").strip()))
+    filtered = ds.filter(
+        lambda row: bool(str(row.get("problem_statement") or "").strip())
+    )
 
     print(f"Filtered rows (non-empty problem_statement): {len(filtered)}")
     print(f"Dropped rows: {len(ds) - len(filtered)}")

From 172ca185f251708afa60b0e0b8b49d3dd443edae Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 9 Mar 2026 11:20:36 -0700
Subject: [PATCH 30/32] Fix Ruff lint errors (unused imports, placeholder-less f-string) in modal scripts

---
 scripts/bug_gen_modal.py             | 3 +--
 scripts/overwrite_and_dedup_tasks.py | 2 +-
 2 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 3c3bd37d..4467390b 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -1710,7 +1710,6 @@ def gather_remote(
 ) -> dict:
     """Run gather.py for a repository to create task instances and push branches."""
     import os
-    import sys
     import subprocess
     import traceback
     from pathlib import Path
@@ -2055,7 +2054,7 @@ async def run_issue_gen_phase_async(
         issue_gen_redo: Whether to regenerate existing issues
     """
     print(f"\n{'=' * 80}")
-    print(f"ISSUE GENERATION PHASE")
+    print("ISSUE GENERATION PHASE")
     print(f"{'=' * 80}")
     print(f"Processing {len(repos)} repositories...")
     print(f"Config: {issue_gen_config}")
diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py
index 96b991c9..200be6c1 100644
--- a/scripts/overwrite_and_dedup_tasks.py
+++ b/scripts/overwrite_and_dedup_tasks.py
@@ -11,7 +11,7 @@
 import os
 
 import modal
-from datasets import Dataset, DatasetDict, load_dataset
+from datasets import DatasetDict, load_dataset
 from huggingface_hub import create_repo
 
 app = modal.App("swesmith-overwrite-hf")

From 5424d12919f3f3980b271de2ea7adb9ecbcc4f75 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 9 Mar 2026 11:23:04 -0700
Subject: [PATCH 31/32] Remove unused `sys` import from issue_gen_remote in bug_gen_modal.py

---
 scripts/bug_gen_modal.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py
index 4467390b..b10a2aff 100644
--- a/scripts/bug_gen_modal.py
+++ b/scripts/bug_gen_modal.py
@@ -1934,7 +1934,6 @@ def issue_gen_remote(
         workers: Number of workers per repo
     """
     import os
-    import sys
     from pathlib import Path
 
     # Set up paths

From 2de1c697f43ad57888ab7714fa68d00cfa8be107 Mon Sep 17 00:00:00 2001
From: Kevin Li <kevinli020508@gmail.com>
Date: Mon, 9 Mar 2026 11:29:55 -0700
Subject: [PATCH 32/32] Remove unused modal helper scripts

---
 scripts/issue_gen_redo_existing_modal.py | 99 ------------------------
 scripts/overwrite_and_dedup_tasks.py     | 86 --------------------
 2 files changed, 185 deletions(-)
 delete mode 100644 scripts/issue_gen_redo_existing_modal.py
 delete mode 100644 scripts/overwrite_and_dedup_tasks.py

diff --git a/scripts/issue_gen_redo_existing_modal.py b/scripts/issue_gen_redo_existing_modal.py
deleted file mode 100644
index be015bdd..00000000
--- a/scripts/issue_gen_redo_existing_modal.py
+++ /dev/null
@@ -1,99 +0,0 @@
-import json
-import os
-from pathlib import Path
-
-import modal
-
-from scripts.bug_gen_modal import generator_image
-
-VOLUME_NAME = "swesmith-bug-gen"
-LOGS_MOUNT_PATH = "/logs"
-
-app = modal.App("issue-gen-redo-existing")
-logs_volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True, version=2)
-
-
-@app.function(
-    image=generator_image,
-    volumes={LOGS_MOUNT_PATH: logs_volume},
-    timeout=3600,
-    secrets=[
-        modal.Secret.from_name("GITHUB_TOKEN"),
-        modal.Secret.from_name("PORTKEY_API_KEY"),
-    ],
-)
-def redo_issue_gen_remote(
-    repo: str,
-    language: str = "cpp",
-    config: str = "configs/issue_gen/ig_v2.yaml",
-    workers: int = 8,
-) -> dict:
-    from swesmith.issue_gen.generate import IssueGen
-
-    volume_root = Path(LOGS_MOUNT_PATH) / language
-    task_insts_dir = volume_root / "task_insts"
-
-    task_insts_file = None
-    repo_sanitized = repo.replace("/", "__")
-    if task_insts_dir.exists():
-        for filename in os.listdir(task_insts_dir):
-            if filename == f"{repo_sanitized}.json" or (
-                filename.startswith(f"{repo_sanitized}.") and filename.endswith(".json")
-            ):
-                task_insts_file = task_insts_dir / filename
-                break
-
-    if not task_insts_file or not task_insts_file.exists():
-        return {
-            "success": False,
-            "repo": repo,
-            "error": "No task instances file found",
-        }
-
-    local_logs = Path("/root/logs")
-    local_logs.mkdir(parents=True, exist_ok=True)
-    for subdir in ["task_insts", "run_validation", "issue_gen"]:
-        local_subdir = local_logs / subdir
-        volume_subdir = volume_root / subdir
-        volume_subdir.mkdir(parents=True, exist_ok=True)
-        try:
-            if local_subdir.exists() or local_subdir.is_symlink():
-                local_subdir.unlink()
-            local_subdir.symlink_to(volume_subdir)
-        except FileExistsError:
-            pass
-
-    issue_gen = IssueGen(
-        dataset_path=str(task_insts_file),
-        config_file=Path(config),
-        workers=workers,
-        redo_existing=True,
-    )
-    issue_gen.run()
-
-    ig_file = task_insts_file.parent / f"{task_insts_file.stem}__ig_llm.json"
-    issue_count = 0
-    if ig_file.exists():
-        data = json.loads(ig_file.read_text())
-        issue_count = sum(
-            1 for row in data if (row.get("problem_statement") or "").strip()
-        )
-
-    return {
-        "success": True,
-        "repo": repo,
-        "task_insts_file": str(task_insts_file),
-        "ig_file": str(ig_file),
-        "issue_count": issue_count,
-    }
-
-
-@app.local_entrypoint()
-def main(
-    repo: str,
-    language: str = "cpp",
-    config: str = "configs/issue_gen/ig_v2.yaml",
-    workers: int = 8,
-):
-    result = redo_issue_gen_remote.remote(repo, language, config, workers)
-    print(json.dumps(result, indent=2))
diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py
deleted file mode 100644
index c46ba929..00000000
--- a/scripts/overwrite_and_dedup_tasks.py
+++ /dev/null
@@ -1,86 +0,0 @@
-"""
-Modal script to filter and overwrite SWE-smith datasets on Hugging Face.
-
-This variant intentionally does NOT do task aggregation or issue generation.
-It only:
-- Loads a source HF dataset.
-- Filters out rows with empty `problem_statement`.
-- Pushes the filtered dataset to a target HF dataset, overwriting existing contents.
-"""
-
-import os
-
-import modal
-from datasets import DatasetDict, load_dataset
-from huggingface_hub import create_repo
-
-app = modal.App("swesmith-overwrite-hf")
-image = modal.Image.debian_slim().pip_install("datasets", "huggingface_hub")
-
-
-@app.function(
-    image=image,
-    secrets=[modal.Secret.from_name("john-hf-secret")],
-    timeout=10800,
-)
-def filter_and_overwrite_remote(
-    source_dataset: str = "SWE-bench/SWE-smith-ts",
-    target_dataset: str = "SWE-bench/SWE-smith-ts",
-    source_split: str = "train",
-) -> dict:
-    """Filter source_dataset for non-empty problem_statement and push to target_dataset."""
-    token = os.environ.get("HF_TOKEN")
-    if not token:
-        return {"success": False, "error": "HF_TOKEN not found in environment"}
-
-    print(f"Loading source dataset: {source_dataset} split={source_split}")
-    ds = load_dataset(source_dataset, split=source_split)
-    print(f"Source rows: {len(ds)}")
-
-    filtered = ds.filter(
-        lambda row: bool(str(row.get("problem_statement") or "").strip())
-    )
-
-    print(f"Filtered rows (non-empty problem_statement): {len(filtered)}")
-    print(f"Dropped rows: {len(ds) - len(filtered)}")
-
-    create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True)
-
-    DatasetDict({"train": filtered}).push_to_hub(target_dataset, token=token)
-
-    return {
-        "success": True,
-        "source_dataset": source_dataset,
-        "target_dataset": target_dataset,
-        "source_split": source_split,
-        "source_rows": len(ds),
-        "kept_rows": len(filtered),
-        "dropped_rows": len(ds) - len(filtered),
-    }
-
-
-@app.local_entrypoint()
-def main(
-    source_dataset: str = "SWE-bench/SWE-smith-ts",
-    target_dataset: str = "SWE-bench/SWE-smith-ts",
-    source_split: str = "train",
-    push: bool = False,
-):
-    if not push:
-        confirm = input(
-            f"Overwrite '{target_dataset}' from '{source_dataset}' ({source_split}) with non-empty problem_statement? (y/n) "
-        ).lower()
-        if confirm != "y":
-            print("Aborting.")
-            return
-
-    print("Starting remote filter-and-overwrite...")
-    result = filter_and_overwrite_remote.remote(
-        source_dataset=source_dataset,
-        target_dataset=target_dataset,
-        source_split=source_split,
-    )
-    print(result)
-
-    if not result.get("success"):
-        raise RuntimeError(result.get("error", "Upload failed"))