From 0d77bc7ef52b58f3a5b0d0a4a6a0f21e7adb2961 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 08:56:26 -0800 Subject: [PATCH 01/32] Support --gather in bug_gen_modal.py --- scripts/bug_gen_modal.py | 252 ++++++++++++++++++++++++++++++++++--- swesmith/harness/gather.py | 14 ++- swesmith/profiles/base.py | 2 +- 3 files changed, 242 insertions(+), 26 deletions(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index e9c54f2a..9ac529e7 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -1018,7 +1018,7 @@ async def check_with_sem(repo_tuple): completed = 0 total_bugs = 0 - for result_or_exc in generate_bugs_remote.map( + async for result_or_exc in generate_bugs_remote.map.aio( repo_names, kwargs={ "max_bugs": args.max_bugs, @@ -1475,6 +1475,11 @@ async def process_single_task(task: dict) -> dict: report_volume_path = ( f"{lang}/run_validation/{repo_id}/{instance_id}/report.json" ) + # Write patch file to volume (required for gather step) + await volume_write_text( + f"{lang}/run_validation/{repo_id}/{instance_id}/patch.diff", + task["patch"], + ) postgold_config = { "test_cmd": task["profile"].test_cmd, @@ -1625,6 +1630,199 @@ def print_summary(results: list[dict], repos_count: int): ) +# ============================================================================ +# Gather Phase (Create Task Instances & Push Branches) +# ============================================================================ + + +@app.function( + image=generator_image, + secrets=[modal.Secret.from_name("GITHUB_TOKEN")], + timeout=MODAL_TIMEOUT, + volumes={LOGS_MOUNT_PATH: logs_volume}, +) +def gather_remote( + repo_name: str, + language: str, + repush_image: bool = False, + override_branch: bool = False, +) -> dict: + """Run gather.py for a repository to create task instances and push branches.""" + import os + import sys + import subprocess + import traceback + from pathlib import Path + + # Ensure swesmith is in path + if "/root" not in sys.path: + sys.path.append("/root") + + from swesmith.profiles import registry + + # Resolve repo ID + def resolve_repo_id(): + try: + return registry.get_from_inst( + {"repo": repo_name, "instance_id": "dummy"} + ).repo_name + except Exception: + target = repo_name.replace("/", "__") + candidates = [key for key in registry.keys() if target in key] + return candidates[0] if candidates else repo_name + + repo_id = resolve_repo_id() + print(f"Gathering for {repo_name} (ID: {repo_id})") + + # Setup environment to satisfy gather.py expectations + # 1. gather.py expects logs/run_validation to contain the repo logs + # 2. gather.py writes to logs/task_insts + + work_dir = Path("/root") + logs_link_dir = work_dir / "logs" + logs_link_dir.mkdir(exist_ok=True) + + # Configure git authentication + if "GITHUB_TOKEN" in os.environ: + token = os.environ["GITHUB_TOKEN"] + print(f"DEBUG: Found GITHUB_TOKEN (len={len(token)}). Configuring git auth...") + + # Use simpler authenticated URL format for PATs + subprocess.run( + ["git", "config", "--global", f"url.https://{token}@github.com/.insteadOf", "https://github.com/"], + check=True + ) + # Also configure user info + subprocess.run(["git", "config", "--global", "user.email", "swesmith@swesmith.ai"], check=False) + subprocess.run(["git", "config", "--global", "user.name", "swesmith"], check=False) + else: + print("Warning: GITHUB_TOKEN not found in environment. Git push may fail.") + + # Link run_validation: logs/run_validation -> /logs/{language}/run_validation + # We use the mounted volume path directly as the target + validation_source = Path(LOGS_MOUNT_PATH) / language / "run_validation" + validation_link = logs_link_dir / "run_validation" + + # Link task_insts: logs/task_insts -> /logs/task_insts (volume root) + task_insts_source = Path(LOGS_MOUNT_PATH) / "task_insts" + task_insts_link = logs_link_dir / "task_insts" + + try: + # Ensure sources exist on volume + task_insts_source.mkdir(parents=True, exist_ok=True) + if not validation_source.exists(): + return {"repo": repo_name, "status": "skipped", "reason": "No validation logs"} + + # Create symlinks + if not validation_link.exists(): + os.symlink(str(validation_source), str(validation_link)) + + if not task_insts_link.exists(): + os.symlink(str(task_insts_source), str(task_insts_link)) + + # Check if there are actually validation logs for this repo + repo_vals = validation_link / repo_id + if not repo_vals.exists(): + return {"repo": repo_name, "status": "skipped", "reason": "No logs for repo"} + + # Build command + # python -m swesmith.harness.gather logs/run_validation/ + cmd = [ + sys.executable, + "-m", "swesmith.harness.gather", + str(Path("logs/run_validation") / repo_id), + "-v", + "-d", + ] + + if repush_image: + cmd.append("--repush_image") + if override_branch: + cmd.append("--override_branch") + + print(f"Running: {' '.join(cmd)}") + + # execution + result = subprocess.run( + cmd, + cwd=str(work_dir), + capture_output=True, + text=True, + env=os.environ + ) + + if result.returncode != 0: + print("Gather failed:") + print(result.stdout) + print(result.stderr) + return { + "repo": repo_name, + "status": "failed", + "stdout": result.stdout, + "stderr": result.stderr + } + else: + print("Gather succeeded:") + print(result.stdout) + print(result.stderr) + + return { + "repo": repo_name, + "status": "success", + "stdout": result.stdout, + "stderr": result.stderr + } + + except Exception as e: + traceback.print_exc() + return {"repo": repo_name, "status": "error", "error": str(e)} + + +async def run_gather_phase_async(repos: list[str], language: str, args) -> None: + """Run gather phase for all repos in parallel.""" + print(f"\n{'#' * 60}") + print(f"# PHASE 3: GATHER ({len(repos)} repos)") + print(f"{'#' * 60}\n") + + # We can pass repush_image and override_branch via args if they existed, + # but for now we'll assume defaults or add them to args class if needed. + repush = getattr(args, "repush_image", False) + override = getattr(args, "override_branch", False) + + completed = 0 + success = 0 + + print(f"Starting gather for {len(repos)} repos...") + + async for result in gather_remote.map.aio( + repos, + kwargs={ + "language": language, + "repush_image": repush, + "override_branch": override, + } + ): + completed += 1 + repo = result.get("repo", "unknown") + status = result.get("status", "unknown") + + if status == "success": + success += 1 + print(f" [{completed}/{len(repos)}] {repo}: Success") + # Print last few lines of stdout to see "Wrote X instances" + if "stdout" in result: + lines = result["stdout"].splitlines() + for line in lines[-5:]: + print(f" | {line}") + elif status == "skipped": + print(f" [{completed}/{len(repos)}] {repo}: Skipped ({result.get('reason')})") + else: + err = result.get("error") or "Non-zero exit code" + print(f" [{completed}/{len(repos)}] {repo}: Failed - {err}") + + print(f"\nGather complete: {success}/{len(repos)} repos processed successfully.\n") + + # ============================================================================ # Stats Display # ============================================================================ @@ -1941,6 +2139,7 @@ async def main( max_candidates: int = 2000, max_concurrent_tests: int = 900, show_stats: bool = False, + gather: bool = False, ): """ Modal Bug Generation & Validation script. @@ -1948,6 +2147,7 @@ async def main( Runs two phases: 1. Generation: Creates bugs for repos (skips repos that are already done/failed) 2. Validation: Validates all patches from the volume + 3. Gather: Creates task instances and pushes branches Run with: modal run scripts/bug_gen.py [OPTIONS] @@ -1960,6 +2160,7 @@ async def main( max_candidates: Max candidates to process, -1 for all (default: 2000) max_concurrent_tests: Max concurrent tests (default: 900) show_stats: If True, show bug breakdown stats and exit without running generation/validation + gather: If True, only run the gather phase (skip generation and validation) """ # Handle --show-stats early exit if show_stats: @@ -2001,27 +2202,38 @@ class Args: args.max_candidates = max_candidates # Phase 1: Generation (skips repos that are already done/failed) - generation_results = await run_generation_phase(target_repos, args, language) + if not gather: + generation_results = await run_generation_phase(target_repos, args, language) - # Phase 2: Validation - collect ALL patches from volume (not just from this run) - print(f"\n{'#' * 60}") - print("# PHASE 2: VALIDATION") - print(f"{'#' * 60}\n") + # Phase 2: Validation - collect ALL patches from volume (not just from this run) + print(f"\n{'#' * 60}") + print("# PHASE 2: VALIDATION") + print(f"{'#' * 60}\n") - print("Collecting patches from volume...") - all_patches = await collect_patches_from_files(target_repos, language) - print(f"Total: {len(all_patches)} patches\n") + print("Collecting patches from volume...") + all_patches = await collect_patches_from_files(target_repos, language) + print(f"Total: {len(all_patches)} patches\n") - results = await run_validation_phase_async( - all_patches, max_concurrent_tests, ENV_NAME - ) + results = await run_validation_phase_async( + all_patches, max_concurrent_tests, ENV_NAME + ) + + if results: + print_summary(results, len(build_repos_with_patches(all_patches))) + + # Report generation errors from this run + errors = [r for r in generation_results if "error" in r] + if errors: + print(f"\nGeneration Errors ({len(errors)}):") + for err in errors: + print(f" - {err['repo']}: {err.get('error', 'Unknown')}") + else: + results = [] + + # Phase 3: Gather (Create task instances & Push branches) + if not results and not gather: + print("No validation results found. Skipping gather phase.") + return - if results: - print_summary(results, len(build_repos_with_patches(all_patches))) + await run_gather_phase_async(target_repos, language, args) - # Report generation errors from this run - errors = [r for r in generation_results if "error" in r] - if errors: - print(f"\nGeneration Errors ({len(errors)}):") - for err in errors: - print(f" - {err['repo']}: {err.get('error', 'Unknown')}") diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 5b294594..a01a093f 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -8,7 +8,7 @@ "patch": "test_patch": "problem_statement": - "FAIL_TO_PASS": + "PASS_TO_FAIL": "PASS_TO_PASS": "version": } @@ -34,7 +34,7 @@ from pathlib import Path from swebench.harness.constants import ( - FAIL_TO_PASS, + PASS_TO_FAIL, PASS_TO_PASS, KEY_INSTANCE_ID, LOG_REPORT, @@ -187,16 +187,20 @@ def _main( if not os.path.exists(path_results): stats = skip_print(f"{subfolder}: No results", pbar, stats, verbose) continue + + if not os.path.exists(path_patch): + stats = skip_print(f"{subfolder}: No patch.diff", pbar, stats, verbose) + continue with open(path_results) as f: results = json.load(f) - if FAIL_TO_PASS not in results or PASS_TO_PASS not in results: + if PASS_TO_FAIL not in results or PASS_TO_PASS not in results: stats = skip_print( f"{subfolder}: No validatable bugs", pbar, stats, verbose ) continue - n_f2p = len(results[FAIL_TO_PASS]) + n_f2p = len(results[PASS_TO_FAIL]) n_p2p = len(results[PASS_TO_PASS]) pr_exception = ( ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0 @@ -216,7 +220,7 @@ def _main( task_instance = { KEY_INSTANCE_ID: subfolder, KEY_PATCH: patch_content, - FAIL_TO_PASS: results[FAIL_TO_PASS], + PASS_TO_FAIL: results[PASS_TO_FAIL], PASS_TO_PASS: results[PASS_TO_PASS], } rp = registry.get_from_inst(task_instance) diff --git a/swesmith/profiles/base.py b/swesmith/profiles/base.py index f1c4e068..af23febb 100644 --- a/swesmith/profiles/base.py +++ b/swesmith/profiles/base.py @@ -290,7 +290,7 @@ def clone(self, dest: str | None = None) -> tuple[str, bool]: token = os.getenv("GITHUB_TOKEN") if token: base_url = ( - f"https://x-access-token:{token}@github.com/{self.mirror_name}.git" + f"https://{token}@github.com/{self.mirror_name}.git" ) else: base_url = f"git@github.com:{self.mirror_name}.git" From 14659088e29c5ebdff47378e6ab775d1a7f0fe45 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 11:56:21 -0800 Subject: [PATCH 02/32] Update --gather to store to /logs/{language}/task_insts --- scripts/bug_gen_modal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 9ac529e7..ad403238 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -1704,7 +1704,7 @@ def resolve_repo_id(): validation_link = logs_link_dir / "run_validation" # Link task_insts: logs/task_insts -> /logs/task_insts (volume root) - task_insts_source = Path(LOGS_MOUNT_PATH) / "task_insts" + task_insts_source = Path(LOGS_MOUNT_PATH) / language / "task_insts" task_insts_link = logs_link_dir / "task_insts" try: From 5d0cf3d5c9bbcec711865f80d7089b9e7753d260 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 12:08:15 -0800 Subject: [PATCH 03/32] Only write out json if task instances is not empty --- swesmith/harness/gather.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index a01a093f..73da2631 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -347,10 +347,12 @@ def _main( print(f"[{repo}] Rebuilding + pushing image") registry.get(repo).push_image(rebuild_image=True) - task_instances_path.parent.mkdir(parents=True, exist_ok=True) - with open(task_instances_path, "w") as f: - json.dump(task_instances, f, indent=4) - print(f"Wrote {len(task_instances)} instances to {task_instances_path}") + if len(task_instances) > 0: + task_instances_path.parent.mkdir(parents=True, exist_ok=True) + with open(task_instances_path, "w") as f: + json.dump(task_instances, f, indent=4) + print(f"Wrote {len(task_instances)} instances to {task_instances_path}") + print(f"- {stats['skipped']} skipped") print(f"- {stats['new_tasks']} new instances") From 96efde45436b767158104070715c61b8bd0f5a4d Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 12:11:36 -0800 Subject: [PATCH 04/32] Doubled modal sandbox time out to 20 minutes to account for repos that take longer to gather --- scripts/bug_gen_modal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index ad403238..f553c297 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -124,7 +124,7 @@ def custom_exception_handler(loop, context): APP_NAME = "swesmith-bug-gen" VOLUME_NAME = "swesmith-bug-gen" MINUTES = 60 -MODAL_TIMEOUT = 10 * MINUTES +MODAL_TIMEOUT = 20 * MINUTES SANDBOX_RATE_LIMIT = 4 # Modal limits to 5/s, use 4 to be safe LANGUAGE_TO_BASE_CLASS = { From d065420889410c21b8dc8dc36e50c7e39eb158c1 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 12:53:40 -0800 Subject: [PATCH 05/32] feat: parallelize gather.py and fix thread safety --- swesmith/harness/gather.py | 275 +++++++++++++++++++++++-------------- swesmith/profiles/base.py | 9 +- 2 files changed, 176 insertions(+), 108 deletions(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 73da2631..c169515e 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -31,6 +31,8 @@ import os import shutil import subprocess +import concurrent.futures +import functools from pathlib import Path from swebench.harness.constants import ( @@ -98,24 +100,25 @@ def check_if_branch_exists( main_branch: str, override_branch: bool, verbose: bool, + subprocess_args: dict, ): branch_exists = None try: - subprocess.run(f"git checkout {subfolder}", cwd=repo_name, **SUBPROCESS_ARGS) + subprocess.run(f"git checkout {subfolder}", cwd=repo_name, **subprocess_args) if override_branch: # Delete the branch remotely subprocess.run( f"git push --delete origin {subfolder}", cwd=repo_name, - **SUBPROCESS_ARGS, + **subprocess_args, ) if verbose: print(f"[{subfolder}] Overriding existing branch") branch_exists = False else: branch_exists = True - subprocess.run(f"git checkout {main_branch}", cwd=repo_name, **SUBPROCESS_ARGS) - subprocess.run(f"git branch -D {subfolder}", cwd=repo_name, **SUBPROCESS_ARGS) + subprocess.run(f"git checkout {main_branch}", cwd=repo_name, **subprocess_args) + subprocess.run(f"git branch -D {subfolder}", cwd=repo_name, **subprocess_args) except Exception: branch_exists = False pass @@ -172,69 +175,139 @@ def _main( print(f"Found {len(task_instances)} existing task instances") subfolders = [x for x in subfolders if x not in completed_ids] + completed_ids = set(completed_ids) # Optimize lookup + subfolders_to_process = [x for x in subfolders if x not in completed_ids] + + print(f"Will process {len(subfolders_to_process)} instances") + + # Determine number of workers + n_workers = int(os.environ.get("MAX_WORKERS", os.cpu_count() or 1)) + print(f"Using {n_workers} workers") + + with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor: + # Create a partial function with fixed arguments + func = functools.partial( + process_instance, + validation_logs_path=validation_logs_path, + override_branch=override_branch, + debug_subprocess=debug_subprocess, + verbose=verbose, + ) + + results = list(tqdm( + executor.map(func, sorted(subfolders_to_process)), + total=len(subfolders_to_process), + desc="Conversion" + )) + + # Aggregate results stats = {"new_tasks": 0, "skipped": 0} - print(f"Will process {len(subfolders)} instances") - pbar = tqdm(subfolders, desc="Conversion", disable=verbose) - for subfolder in sorted(subfolders): - if subfolder.endswith(REF_SUFFIX) or subfolder in completed_ids: - # Skip reference run or instances that have been completed - stats = skip_print(f"{subfolder}: Reference", pbar, stats, verbose) - continue - - path_results = os.path.join(validation_logs_path, subfolder, LOG_REPORT) - path_patch = os.path.join(validation_logs_path, subfolder, "patch.diff") - - if not os.path.exists(path_results): - stats = skip_print(f"{subfolder}: No results", pbar, stats, verbose) - continue - - if not os.path.exists(path_patch): - stats = skip_print(f"{subfolder}: No patch.diff", pbar, stats, verbose) - continue - - with open(path_results) as f: - results = json.load(f) - if PASS_TO_FAIL not in results or PASS_TO_PASS not in results: - stats = skip_print( - f"{subfolder}: No validatable bugs", pbar, stats, verbose - ) - continue - - n_f2p = len(results[PASS_TO_FAIL]) - n_p2p = len(results[PASS_TO_PASS]) - pr_exception = ( - ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0 - ) # TODO: Better way to determine if it's a PR miror? - if not pr_exception and (KEY_TIMED_OUT in results or n_f2p == 0 or n_p2p == 0): - # Skip instances that timed out OR don't have F2P or P2P - stats = skip_print( - f"{subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}", - pbar, - stats, - verbose, - ) - continue - - with open(path_patch) as f: - patch_content = f.read() - task_instance = { - KEY_INSTANCE_ID: subfolder, - KEY_PATCH: patch_content, - PASS_TO_FAIL: results[PASS_TO_FAIL], - PASS_TO_PASS: results[PASS_TO_PASS], - } - rp = registry.get_from_inst(task_instance) - task_instance[KEY_IMAGE_NAME] = rp.image_name - task_instance["repo"] = rp.mirror_name - - # Clone repository - _, cloned = rp.clone() + for res_tasks, res_repos, res_stats in results: + task_instances.extend(res_tasks) + created_repos.update(res_repos) + for k, v in res_stats.items(): + stats[k] += v + + if len(created_repos) > 0: + if repush_image: + print("Rebuilding + pushing images...") + for repo in created_repos: + print(f"[{repo}] Rebuilding + pushing image") + registry.get(repo).push_image(rebuild_image=True) + + if len(task_instances) > 0: + task_instances_path.parent.mkdir(parents=True, exist_ok=True) + with open(task_instances_path, "w") as f: + json.dump(task_instances, f, indent=4) + print(f"Wrote {len(task_instances)} instances to {task_instances_path}") + + print(f"- {stats['skipped']} skipped") + print(f"- {stats['new_tasks']} new instances") + + +def process_instance( + subfolder: str, + validation_logs_path: Path, + override_branch: bool, + debug_subprocess: bool, + verbose: bool, +) -> tuple[list[dict], set[str], dict]: + """ + Process a single task instance. + Returns: + task_instances: list of created task instances + created_repos: set of repository names that were cloned + stats: dictionary of statistics + """ + stats = {"new_tasks": 0, "skipped": 0} + task_instances = [] + created_repos = set() + + # Use a unique temporary directory for this process/task to avoid collision + # We append process ID or random string to repo path + import multiprocessing + pid = multiprocessing.current_process().pid + + # Define subprocess args locally to avoid global state issues with multiprocessing + subprocess_args = SUBPROCESS_ARGS.copy() + if not debug_subprocess: + subprocess_args["stdout"] = subprocess.DEVNULL + subprocess_args["stderr"] = subprocess.DEVNULL + + if subfolder.endswith(REF_SUFFIX): + return [], set(), {"new_tasks": 0, "skipped": 1} + + path_results = os.path.join(validation_logs_path, subfolder, LOG_REPORT) + path_patch = os.path.join(validation_logs_path, subfolder, "patch.diff") + + if not os.path.exists(path_results): + if verbose: print(f"[SKIP] {subfolder}: No results") + return [], set(), {"new_tasks": 0, "skipped": 1} + + if not os.path.exists(path_patch): + if verbose: print(f"[SKIP] {subfolder}: No patch.diff") + return [], set(), {"new_tasks": 0, "skipped": 1} + + with open(path_results) as f: + results = json.load(f) + if PASS_TO_FAIL not in results or PASS_TO_PASS not in results: + if verbose: print(f"[SKIP] {subfolder}: No validatable bugs") + return [], set(), {"new_tasks": 0, "skipped": 1} + + n_f2p = len(results[PASS_TO_FAIL]) + n_p2p = len(results[PASS_TO_PASS]) + pr_exception = ( + ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0 + ) + if not pr_exception and (KEY_TIMED_OUT in results or n_f2p == 0 or n_p2p == 0): + if verbose: print(f"[SKIP] {subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}") + return [], set(), {"new_tasks": 0, "skipped": 1} + + with open(path_patch) as f: + patch_content = f.read() + task_instance = { + KEY_INSTANCE_ID: subfolder, + KEY_PATCH: patch_content, + PASS_TO_FAIL: results[PASS_TO_FAIL], + PASS_TO_PASS: results[PASS_TO_PASS], + } + rp = registry.get_from_inst(task_instance) + task_instance[KEY_IMAGE_NAME] = rp.image_name + task_instance["repo"] = rp.mirror_name + + # Unique clone path for this worker + repo_path = f"{rp.repo_name}_{pid}_{subfolder}" + + # Clone repository + try: + _, cloned = rp.clone(dest=repo_path) if cloned: created_repos.add(rp.repo_name) + main_branch = ( subprocess.run( "git rev-parse --abbrev-ref HEAD", - cwd=rp.repo_name, + cwd=repo_path, capture_output=True, shell=True, check=True, @@ -244,18 +317,20 @@ def _main( ) # Check if branch already created for this problem + # We pass the repo_path as cwd for the git operations inside the helper + branch_exists = check_if_branch_exists( - rp.repo_name, subfolder, main_branch, override_branch, verbose + repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args ) if branch_exists: task_instances.append(task_instance) - stats = skip_print( - f"{subfolder}: Branch `{subfolder}` exists", - pbar, - stats, - verbose, - ) - continue + if verbose: print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists") + stats["skipped"] += 1 + # Cleanup + if cloned and os.path.exists(repo_path): + shutil.rmtree(repo_path) + return task_instances, created_repos, stats + elif verbose: print(f"[{subfolder}] Does not exist yet") @@ -264,7 +339,7 @@ def _main( for git_apply in GIT_APPLY_CMDS: output = subprocess.run( f"{git_apply} ../{path_patch}", - cwd=rp.repo_name, + cwd=repo_path, capture_output=True, shell=True, ) @@ -272,14 +347,21 @@ def _main( applied = True break else: - # Remove any artifacts - subprocess.run("git reset --hard", cwd=rp.repo_name, **SUBPROCESS_ARGS) + subprocess.run("git reset --hard", cwd=repo_path, **subprocess_args) + if not applied: - raise Exception(f"[{subfolder}] Failed to apply patch to {rp.repo_name}") + # We can't raise Exception here as it stops the worker? + # Or we let it bubble up and fail the future? + # Better to catch and print/skip + print(f"[{subfolder}] Failed to apply patch to {rp.repo_name}") + if cloned and os.path.exists(repo_path): + shutil.rmtree(repo_path) + return [], set(), stats # Don't record this one + if verbose: print(f"[{subfolder}] Bug patch applied successfully") - # Create a branch, check it out, commit, push the branch, and cleanup + # Create branch etc cmds = [ "git config user.email 'swesmith@swesmith.ai'", "git config user.name 'swesmith'", @@ -291,20 +373,18 @@ def _main( for cmd in cmds: if debug_subprocess: print(f"[{subfolder}] {cmd}") - subprocess.run(cmd, cwd=rp.repo_name, **SUBPROCESS_ARGS) + subprocess.run(cmd, cwd=repo_path, **subprocess_args) - # Create test patch by removing F2P test files + # F2P patch f2p_test_files, _ = rp.get_test_files(task_instance) if f2p_test_files: - # Remove the test files for test_file in f2p_test_files: - test_file_path = os.path.join(rp.repo_name, test_file) + test_file_path = os.path.join(repo_path, test_file) if os.path.exists(test_file_path): os.remove(test_file_path) if verbose: print(f"[{subfolder}] Removed F2P test file: {test_file}") - # Add and commit removal cmds = [ "git add .", "git commit --no-gpg-sign -m 'Remove F2P Tests'", @@ -312,12 +392,10 @@ def _main( for cmd in cmds: if debug_subprocess: print(f"[{subfolder}] {cmd}") - subprocess.run(cmd, cwd=rp.repo_name, **SUBPROCESS_ARGS) + subprocess.run(cmd, cwd=repo_path, **subprocess_args) if verbose: print(f"[{subfolder}] Commit F2P test file(s) removal") - elif verbose: - print(f"[{subfolder}] No test files to remove") - + cmds = [ f"git push origin {subfolder}", f"git checkout {main_branch}", @@ -327,7 +405,8 @@ def _main( for cmd in cmds: if debug_subprocess: print(f"[{subfolder}] {cmd}") - subprocess.run(cmd, cwd=rp.repo_name, **SUBPROCESS_ARGS) + subprocess.run(cmd, cwd=repo_path, **subprocess_args) + if verbose: print(f"[{subfolder}] Bug @ branch `{subfolder}`") @@ -335,27 +414,13 @@ def _main( if verbose: print(f"[{subfolder}] Created task instance") stats["new_tasks"] += 1 - pbar.update() - - pbar.close() - if len(created_repos) > 0: - print("Cleaning up...") - for repo in created_repos: - shutil.rmtree(repo) - print(f"[{repo}] Removed local clone") - if repush_image: - print(f"[{repo}] Rebuilding + pushing image") - registry.get(repo).push_image(rebuild_image=True) - - if len(task_instances) > 0: - task_instances_path.parent.mkdir(parents=True, exist_ok=True) - with open(task_instances_path, "w") as f: - json.dump(task_instances, f, indent=4) - print(f"Wrote {len(task_instances)} instances to {task_instances_path}") - - print(f"- {stats['skipped']} skipped") - print(f"- {stats['new_tasks']} new instances") - + + finally: + # Cleanup unique clone + if os.path.exists(repo_path): + shutil.rmtree(repo_path) + + return task_instances, created_repos, stats if __name__ == "__main__": parser = argparse.ArgumentParser( diff --git a/swesmith/profiles/base.py b/swesmith/profiles/base.py index af23febb..c5c9c83c 100644 --- a/swesmith/profiles/base.py +++ b/swesmith/profiles/base.py @@ -179,10 +179,13 @@ def _get_cached_test_paths(self) -> list[Path]: """Clone the repo, get all testing file paths relative to the repo directory, then clean up.""" if self._cache_test_paths is None: with self._lock: # Only one process enters this block at a time - dir_path, cloned = self.clone() + # Use unique temp dir to avoid race conditions in multiprocessing + import uuid + temp_dest = f"{self.repo_name}_{uuid.uuid4().hex[:8]}" + dir_path, cloned = self.clone(dest=temp_dest) self._cache_test_paths = [ - Path(os.path.relpath(os.path.join(root, file), self.repo_name)) - for root, _, files in os.walk(Path(self.repo_name).resolve()) + Path(os.path.relpath(os.path.join(root, file), dir_path)) + for root, _, files in os.walk(Path(dir_path).resolve()) for file in files if self._is_test_path(root, file) ] From cd3adc848b4fd0b4ab5c17429f108bb43a8e0d7c Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 14:28:20 -0800 Subject: [PATCH 06/32] Fix gather.py to skip empty commits Previously, the script would fail if `git commit` was attempted with no changes. This was observed in cases like `Automattic__mongoose.5f57a5bb` where the applied patch resulted in no tracked changes. Now, we check `git status --porcelain` before committing and skip the instance if no changes are detected. --- swesmith/harness/gather.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index c169515e..aca7664f 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -368,6 +368,30 @@ def process_instance( "git config commit.gpgsign false", f"git checkout -b {subfolder}", "git add .", + ] + for cmd in cmds: + if debug_subprocess: + print(f"[{subfolder}] {cmd}") + subprocess.run(cmd, cwd=repo_path, **subprocess_args) + + # Check for changes + status_output = subprocess.run( + "git status --porcelain", + cwd=repo_path, + capture_output=True, + shell=True, + check=True, + ).stdout.decode().strip() + + if not status_output: + if verbose: + print(f"[{subfolder}] No changes to commit, skipping") + stats["skipped"] += 1 + if cloned and os.path.exists(repo_path): + shutil.rmtree(repo_path) + return task_instances, created_repos, stats + + cmds = [ "git commit --no-gpg-sign -m 'Bug Patch'", ] for cmd in cmds: From 84f8587a428356b7fe6e2a71cf7448638354010e Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 14:41:37 -0800 Subject: [PATCH 07/32] Reset MODAL_TIMEOUT back down to 10 minutes now that gather is parallelized --- scripts/bug_gen_modal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index f553c297..ad403238 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -124,7 +124,7 @@ def custom_exception_handler(loop, context): APP_NAME = "swesmith-bug-gen" VOLUME_NAME = "swesmith-bug-gen" MINUTES = 60 -MODAL_TIMEOUT = 20 * MINUTES +MODAL_TIMEOUT = 10 * MINUTES SANDBOX_RATE_LIMIT = 4 # Modal limits to 5/s, use 4 to be safe LANGUAGE_TO_BASE_CLASS = { From 67a8529784c55ae958da35ada2e986dd6220587b Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Fri, 16 Jan 2026 22:50:27 +0000 Subject: [PATCH 08/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/bug_gen_modal.py | 89 +++++++++++++++++++++++--------------- swesmith/harness/gather.py | 87 +++++++++++++++++++++---------------- swesmith/profiles/base.py | 5 +-- 3 files changed, 104 insertions(+), 77 deletions(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index ad403238..35d2142a 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -1681,20 +1681,31 @@ def resolve_repo_id(): work_dir = Path("/root") logs_link_dir = work_dir / "logs" logs_link_dir.mkdir(exist_ok=True) - + # Configure git authentication if "GITHUB_TOKEN" in os.environ: token = os.environ["GITHUB_TOKEN"] print(f"DEBUG: Found GITHUB_TOKEN (len={len(token)}). Configuring git auth...") - + # Use simpler authenticated URL format for PATs subprocess.run( - ["git", "config", "--global", f"url.https://{token}@github.com/.insteadOf", "https://github.com/"], - check=True + [ + "git", + "config", + "--global", + f"url.https://{token}@github.com/.insteadOf", + "https://github.com/", + ], + check=True, ) # Also configure user info - subprocess.run(["git", "config", "--global", "user.email", "swesmith@swesmith.ai"], check=False) - subprocess.run(["git", "config", "--global", "user.name", "swesmith"], check=False) + subprocess.run( + ["git", "config", "--global", "user.email", "swesmith@swesmith.ai"], + check=False, + ) + subprocess.run( + ["git", "config", "--global", "user.name", "swesmith"], check=False + ) else: print("Warning: GITHUB_TOKEN not found in environment. Git push may fail.") @@ -1711,66 +1722,71 @@ def resolve_repo_id(): # Ensure sources exist on volume task_insts_source.mkdir(parents=True, exist_ok=True) if not validation_source.exists(): - return {"repo": repo_name, "status": "skipped", "reason": "No validation logs"} + return { + "repo": repo_name, + "status": "skipped", + "reason": "No validation logs", + } # Create symlinks if not validation_link.exists(): os.symlink(str(validation_source), str(validation_link)) - + if not task_insts_link.exists(): os.symlink(str(task_insts_source), str(task_insts_link)) # Check if there are actually validation logs for this repo repo_vals = validation_link / repo_id if not repo_vals.exists(): - return {"repo": repo_name, "status": "skipped", "reason": "No logs for repo"} - + return { + "repo": repo_name, + "status": "skipped", + "reason": "No logs for repo", + } + # Build command # python -m swesmith.harness.gather logs/run_validation/ cmd = [ sys.executable, - "-m", "swesmith.harness.gather", + "-m", + "swesmith.harness.gather", str(Path("logs/run_validation") / repo_id), "-v", "-d", ] - + if repush_image: cmd.append("--repush_image") if override_branch: cmd.append("--override_branch") - + print(f"Running: {' '.join(cmd)}") - + # execution result = subprocess.run( - cmd, - cwd=str(work_dir), - capture_output=True, - text=True, - env=os.environ + cmd, cwd=str(work_dir), capture_output=True, text=True, env=os.environ ) - + if result.returncode != 0: print("Gather failed:") print(result.stdout) print(result.stderr) return { - "repo": repo_name, - "status": "failed", - "stdout": result.stdout, - "stderr": result.stderr + "repo": repo_name, + "status": "failed", + "stdout": result.stdout, + "stderr": result.stderr, } else: print("Gather succeeded:") print(result.stdout) print(result.stderr) - + return { "repo": repo_name, "status": "success", "stdout": result.stdout, - "stderr": result.stderr + "stderr": result.stderr, } except Exception as e: @@ -1783,29 +1799,29 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None: print(f"\n{'#' * 60}") print(f"# PHASE 3: GATHER ({len(repos)} repos)") print(f"{'#' * 60}\n") - - # We can pass repush_image and override_branch via args if they existed, + + # We can pass repush_image and override_branch via args if they existed, # but for now we'll assume defaults or add them to args class if needed. repush = getattr(args, "repush_image", False) override = getattr(args, "override_branch", False) - + completed = 0 success = 0 - + print(f"Starting gather for {len(repos)} repos...") - + async for result in gather_remote.map.aio( repos, kwargs={ "language": language, "repush_image": repush, "override_branch": override, - } + }, ): completed += 1 repo = result.get("repo", "unknown") status = result.get("status", "unknown") - + if status == "success": success += 1 print(f" [{completed}/{len(repos)}] {repo}: Success") @@ -1815,11 +1831,13 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None: for line in lines[-5:]: print(f" | {line}") elif status == "skipped": - print(f" [{completed}/{len(repos)}] {repo}: Skipped ({result.get('reason')})") + print( + f" [{completed}/{len(repos)}] {repo}: Skipped ({result.get('reason')})" + ) else: err = result.get("error") or "Non-zero exit code" print(f" [{completed}/{len(repos)}] {repo}: Failed - {err}") - + print(f"\nGather complete: {success}/{len(repos)} repos processed successfully.\n") @@ -2236,4 +2254,3 @@ class Args: return await run_gather_phase_async(target_repos, language, args) - diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index aca7664f..f8d3d83b 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -179,7 +179,7 @@ def _main( subfolders_to_process = [x for x in subfolders if x not in completed_ids] print(f"Will process {len(subfolders_to_process)} instances") - + # Determine number of workers n_workers = int(os.environ.get("MAX_WORKERS", os.cpu_count() or 1)) print(f"Using {n_workers} workers") @@ -194,11 +194,13 @@ def _main( verbose=verbose, ) - results = list(tqdm( - executor.map(func, sorted(subfolders_to_process)), - total=len(subfolders_to_process), - desc="Conversion" - )) + results = list( + tqdm( + executor.map(func, sorted(subfolders_to_process)), + total=len(subfolders_to_process), + desc="Conversion", + ) + ) # Aggregate results stats = {"new_tasks": 0, "skipped": 0} @@ -212,15 +214,15 @@ def _main( if repush_image: print("Rebuilding + pushing images...") for repo in created_repos: - print(f"[{repo}] Rebuilding + pushing image") - registry.get(repo).push_image(rebuild_image=True) + print(f"[{repo}] Rebuilding + pushing image") + registry.get(repo).push_image(rebuild_image=True) if len(task_instances) > 0: task_instances_path.parent.mkdir(parents=True, exist_ok=True) with open(task_instances_path, "w") as f: json.dump(task_instances, f, indent=4) print(f"Wrote {len(task_instances)} instances to {task_instances_path}") - + print(f"- {stats['skipped']} skipped") print(f"- {stats['new_tasks']} new instances") @@ -242,12 +244,13 @@ def process_instance( stats = {"new_tasks": 0, "skipped": 0} task_instances = [] created_repos = set() - + # Use a unique temporary directory for this process/task to avoid collision # We append process ID or random string to repo path import multiprocessing + pid = multiprocessing.current_process().pid - + # Define subprocess args locally to avoid global state issues with multiprocessing subprocess_args = SUBPROCESS_ARGS.copy() if not debug_subprocess: @@ -261,26 +264,28 @@ def process_instance( path_patch = os.path.join(validation_logs_path, subfolder, "patch.diff") if not os.path.exists(path_results): - if verbose: print(f"[SKIP] {subfolder}: No results") + if verbose: + print(f"[SKIP] {subfolder}: No results") return [], set(), {"new_tasks": 0, "skipped": 1} - + if not os.path.exists(path_patch): - if verbose: print(f"[SKIP] {subfolder}: No patch.diff") + if verbose: + print(f"[SKIP] {subfolder}: No patch.diff") return [], set(), {"new_tasks": 0, "skipped": 1} with open(path_results) as f: results = json.load(f) if PASS_TO_FAIL not in results or PASS_TO_PASS not in results: - if verbose: print(f"[SKIP] {subfolder}: No validatable bugs") + if verbose: + print(f"[SKIP] {subfolder}: No validatable bugs") return [], set(), {"new_tasks": 0, "skipped": 1} n_f2p = len(results[PASS_TO_FAIL]) n_p2p = len(results[PASS_TO_PASS]) - pr_exception = ( - ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0 - ) + pr_exception = ".pr_" in subfolder and n_p2p == 0 and n_f2p > 0 if not pr_exception and (KEY_TIMED_OUT in results or n_f2p == 0 or n_p2p == 0): - if verbose: print(f"[SKIP] {subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}") + if verbose: + print(f"[SKIP] {subfolder}: No validatable bugs: {n_f2p=}, {n_p2p=}") return [], set(), {"new_tasks": 0, "skipped": 1} with open(path_patch) as f: @@ -297,7 +302,7 @@ def process_instance( # Unique clone path for this worker repo_path = f"{rp.repo_name}_{pid}_{subfolder}" - + # Clone repository try: _, cloned = rp.clone(dest=repo_path) @@ -318,19 +323,20 @@ def process_instance( # Check if branch already created for this problem # We pass the repo_path as cwd for the git operations inside the helper - + branch_exists = check_if_branch_exists( repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args ) if branch_exists: task_instances.append(task_instance) - if verbose: print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists") + if verbose: + print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists") stats["skipped"] += 1 # Cleanup if cloned and os.path.exists(repo_path): shutil.rmtree(repo_path) return task_instances, created_repos, stats - + elif verbose: print(f"[{subfolder}] Does not exist yet") @@ -348,16 +354,16 @@ def process_instance( break else: subprocess.run("git reset --hard", cwd=repo_path, **subprocess_args) - + if not applied: - # We can't raise Exception here as it stops the worker? + # We can't raise Exception here as it stops the worker? # Or we let it bubble up and fail the future? # Better to catch and print/skip print(f"[{subfolder}] Failed to apply patch to {rp.repo_name}") if cloned and os.path.exists(repo_path): shutil.rmtree(repo_path) - return [], set(), stats # Don't record this one - + return [], set(), stats # Don't record this one + if verbose: print(f"[{subfolder}] Bug patch applied successfully") @@ -375,13 +381,17 @@ def process_instance( subprocess.run(cmd, cwd=repo_path, **subprocess_args) # Check for changes - status_output = subprocess.run( - "git status --porcelain", - cwd=repo_path, - capture_output=True, - shell=True, - check=True, - ).stdout.decode().strip() + status_output = ( + subprocess.run( + "git status --porcelain", + cwd=repo_path, + capture_output=True, + shell=True, + check=True, + ) + .stdout.decode() + .strip() + ) if not status_output: if verbose: @@ -419,7 +429,7 @@ def process_instance( subprocess.run(cmd, cwd=repo_path, **subprocess_args) if verbose: print(f"[{subfolder}] Commit F2P test file(s) removal") - + cmds = [ f"git push origin {subfolder}", f"git checkout {main_branch}", @@ -430,7 +440,7 @@ def process_instance( if debug_subprocess: print(f"[{subfolder}] {cmd}") subprocess.run(cmd, cwd=repo_path, **subprocess_args) - + if verbose: print(f"[{subfolder}] Bug @ branch `{subfolder}`") @@ -438,14 +448,15 @@ def process_instance( if verbose: print(f"[{subfolder}] Created task instance") stats["new_tasks"] += 1 - + finally: # Cleanup unique clone if os.path.exists(repo_path): shutil.rmtree(repo_path) - + return task_instances, created_repos, stats + if __name__ == "__main__": parser = argparse.ArgumentParser( description="Convert validation logs to SWE-bench style dataset" diff --git a/swesmith/profiles/base.py b/swesmith/profiles/base.py index c5c9c83c..4d68d105 100644 --- a/swesmith/profiles/base.py +++ b/swesmith/profiles/base.py @@ -181,6 +181,7 @@ def _get_cached_test_paths(self) -> list[Path]: with self._lock: # Only one process enters this block at a time # Use unique temp dir to avoid race conditions in multiprocessing import uuid + temp_dest = f"{self.repo_name}_{uuid.uuid4().hex[:8]}" dir_path, cloned = self.clone(dest=temp_dest) self._cache_test_paths = [ @@ -292,9 +293,7 @@ def clone(self, dest: str | None = None) -> tuple[str, bool]: if not os.path.exists(dest): token = os.getenv("GITHUB_TOKEN") if token: - base_url = ( - f"https://{token}@github.com/{self.mirror_name}.git" - ) + base_url = f"https://{token}@github.com/{self.mirror_name}.git" else: base_url = f"git@github.com:{self.mirror_name}.git" From 842650aa46be21c4c0d749a3eb58faab6fbec997 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 19:46:09 -0800 Subject: [PATCH 09/32] Replace slow and stateful git checkout and branch -D with a single stateless git ls-remote --- swesmith/harness/gather.py | 40 ++++++++++++++++++++++++-------------- 1 file changed, 25 insertions(+), 15 deletions(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index f8d3d83b..7ac1fb90 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -102,23 +102,33 @@ def check_if_branch_exists( verbose: bool, subprocess_args: dict, ): - branch_exists = None + branch_exists = False try: - subprocess.run(f"git checkout {subfolder}", cwd=repo_name, **subprocess_args) - if override_branch: - # Delete the branch remotely - subprocess.run( - f"git push --delete origin {subfolder}", - cwd=repo_name, - **subprocess_args, - ) - if verbose: - print(f"[{subfolder}] Overriding existing branch") - branch_exists = False - else: + # Check remote for branch existence directly + # This is more robust than checkout/fetch for cached repos + result = subprocess.run( + f"git ls-remote --heads origin {subfolder}", + cwd=repo_name, + capture_output=True, + shell=True, + text=True + ) + + # If there is output, the branch exists on remote + if result.returncode == 0 and subfolder in result.stdout: branch_exists = True - subprocess.run(f"git checkout {main_branch}", cwd=repo_name, **subprocess_args) - subprocess.run(f"git branch -D {subfolder}", cwd=repo_name, **subprocess_args) + if override_branch: + # Delete the branch remotely + subprocess.run( + f"git push --delete origin {subfolder}", + cwd=repo_name, + **subprocess_args, + ) + if verbose: + print(f"[{subfolder}] Overriding existing branch") + branch_exists = False + + except Exception: branch_exists = False pass From af01d5cf4c66e4d3c533665520286fa8ebc07f99 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 19:46:38 -0800 Subject: [PATCH 10/32] Cache repo locally to avoid rate limits and speed up cloning --- swesmith/harness/gather.py | 98 +++++++++++++++++++++++++++++++------- 1 file changed, 81 insertions(+), 17 deletions(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 7ac1fb90..518f6021 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -194,23 +194,60 @@ def _main( n_workers = int(os.environ.get("MAX_WORKERS", os.cpu_count() or 1)) print(f"Using {n_workers} workers") - with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor: - # Create a partial function with fixed arguments - func = functools.partial( - process_instance, - validation_logs_path=validation_logs_path, - override_branch=override_branch, - debug_subprocess=debug_subprocess, - verbose=verbose, - ) + # Optimization: Cache repo locally to avoid rate limits and speed up cloning + import tempfile + + with tempfile.TemporaryDirectory() as cache_root: + # cache_root exists, so rp.clone(dest=cache_root) would skip cloning. + # We must clone into a subdirectory which doesn't exist yet. + cache_dir = os.path.join(cache_root, "repo") + print(f"Pre-cloning repository to cache: {cache_dir}...") + + rp_cache = None + # Try resolving profile from run_id (directory name) first + try: + rp_cache = registry.get(run_id) + except Exception: + pass + + if not rp_cache: + sample_id = next((s for s in subfolders if "." in s), None) + if sample_id: + try: + rp_cache = registry.get_from_inst({KEY_INSTANCE_ID: sample_id}) + except Exception as e: + print(f"Warning: Could not resolve profile from {sample_id}: {e}") + + path_to_cache = None + if rp_cache: + try: + print(f"Cloning {rp_cache.repo_name} to cache...") + rp_cache.clone(dest=cache_dir) + path_to_cache = cache_dir + print("Pre-clone successful.") + except Exception as e: + print(f"Pre-clone failed: {e}. Will fall back to per-instance cloning.") + else: + print("Could not resolve profile for pre-cloning. Will iterate per instance.") + + with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor: + # Create a partial function with fixed arguments + func = functools.partial( + process_instance, + validation_logs_path=validation_logs_path, + override_branch=override_branch, + debug_subprocess=debug_subprocess, + verbose=verbose, + cache_dir=path_to_cache, + ) - results = list( - tqdm( - executor.map(func, sorted(subfolders_to_process)), - total=len(subfolders_to_process), - desc="Conversion", + results = list( + tqdm( + executor.map(func, sorted(subfolders_to_process)), + total=len(subfolders_to_process), + desc="Conversion", + ) ) - ) # Aggregate results stats = {"new_tasks": 0, "skipped": 0} @@ -243,6 +280,7 @@ def process_instance( override_branch: bool, debug_subprocess: bool, verbose: bool, + cache_dir: str | None = None, ) -> tuple[list[dict], set[str], dict]: """ Process a single task instance. @@ -315,10 +353,36 @@ def process_instance( # Clone repository try: - _, cloned = rp.clone(dest=repo_path) - if cloned: + if cache_dir and os.path.exists(cache_dir): + if verbose: + print(f"[{subfolder}] Cloning from cache {cache_dir}...") + + subprocess.run( + f"git clone {cache_dir} {repo_path}", + check=True, + shell=True, + stdout=subprocess.DEVNULL if not debug_subprocess else None, + stderr=subprocess.DEVNULL if not debug_subprocess else None, + ) + cloned = True created_repos.add(rp.repo_name) + # Fix origin remote to point to actual GitHub repo so push works + remote_url = f"https://github.com/{rp.mirror_name}.git" + + subprocess.run( + f"git remote set-url origin {remote_url}", + cwd=repo_path, + check=True, + shell=True, + stdout=subprocess.DEVNULL if not debug_subprocess else None, + stderr=subprocess.DEVNULL if not debug_subprocess else None, + ) + else: + _, cloned = rp.clone(dest=repo_path) + if cloned: + created_repos.add(rp.repo_name) + main_branch = ( subprocess.run( "git rev-parse --abbrev-ref HEAD", From 1d6bcd448ab2615052e3b57cbb59833739052c8b Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 23:09:58 -0800 Subject: [PATCH 11/32] Optimize gather with persistent worker repos (~5min total) - Switch from per-task clones to per-worker persistent repositories. - Reduces clone operations from O(tasks) to O(workers) (e.g. 1400 -> 17). - Eliminates file locking race conditions. - Total gather time for Javascript is now ~5 minutes (bottlenecked by math.js). --- swesmith/harness/gather.py | 148 ++++++++++++++++++++++++------------- 1 file changed, 96 insertions(+), 52 deletions(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 518f6021..119d758d 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -348,56 +348,103 @@ def process_instance( task_instance[KEY_IMAGE_NAME] = rp.image_name task_instance["repo"] = rp.mirror_name - # Unique clone path for this worker - repo_path = f"{rp.repo_name}_{pid}_{subfolder}" + # Persistent worker path - reused across tasks for this process + # We place it in the same temporary directory as the cache to ensure automatic cleanup. + if cache_dir: + # cache_dir is .../temp/repo, so dirname is .../temp + repo_path = os.path.join(os.path.dirname(cache_dir), f"{rp.repo_name}_worker_{pid}") + else: + # Fallback if no cache used (e.g. debugging), though likely not cleaned up automatically + repo_path = os.path.abspath(f"{rp.repo_name}_worker_{pid}") + + # Helper to reset repo state + def reset_repo(path): + subprocess.run( + "git reset --hard", cwd=path, **subprocess_args + ) + subprocess.run( + "git clean -fdx", cwd=path, **subprocess_args + ) + # remove potential lock files if previous run crashed hard + lock_file = os.path.join(path, ".git", "index.lock") + if os.path.exists(lock_file): + try: + os.remove(lock_file) + except OSError: + pass - # Clone repository + cloned = False try: - if cache_dir and os.path.exists(cache_dir): + if os.path.exists(repo_path): + # Reuse existing repo for this worker if verbose: - print(f"[{subfolder}] Cloning from cache {cache_dir}...") + print(f"[{subfolder}] Reusing worker repo {repo_path}") + reset_repo(repo_path) - subprocess.run( - f"git clone {cache_dir} {repo_path}", - check=True, - shell=True, - stdout=subprocess.DEVNULL if not debug_subprocess else None, - stderr=subprocess.DEVNULL if not debug_subprocess else None, + # We need to know main branch name. We can get it from local repo now. + # Assuming main branch hasn't changed name/ref significantly. + # We avoid 'git pull' to save rate limits and time. + main_branch = ( + subprocess.run( + "git rev-parse --abbrev-ref HEAD", + cwd=repo_path, + capture_output=True, + shell=True, + check=True, + ) + .stdout.decode() + .strip() ) - cloned = True - created_repos.add(rp.repo_name) + # Ensure we are on main branch + subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args) - # Fix origin remote to point to actual GitHub repo so push works - remote_url = f"https://github.com/{rp.mirror_name}.git" - - subprocess.run( - f"git remote set-url origin {remote_url}", - cwd=repo_path, - check=True, - shell=True, - stdout=subprocess.DEVNULL if not debug_subprocess else None, - stderr=subprocess.DEVNULL if not debug_subprocess else None, - ) else: - _, cloned = rp.clone(dest=repo_path) - if cloned: + # First time setup for this worker + if cache_dir and os.path.exists(cache_dir): + if verbose: + print(f"[{subfolder}] First-time clone from cache {cache_dir}...") + + subprocess.run( + f"git clone {cache_dir} {repo_path}", + check=True, + shell=True, + stdout=subprocess.DEVNULL if not debug_subprocess else None, + stderr=subprocess.DEVNULL if not debug_subprocess else None, + ) + cloned = True created_repos.add(rp.repo_name) - main_branch = ( - subprocess.run( - "git rev-parse --abbrev-ref HEAD", - cwd=repo_path, - capture_output=True, - shell=True, - check=True, + # Fix origin remote + remote_url = f"https://github.com/{rp.mirror_name}.git" + subprocess.run( + f"git remote set-url origin {remote_url}", + cwd=repo_path, + check=True, + shell=True, + stdout=subprocess.DEVNULL if not debug_subprocess else None, + stderr=subprocess.DEVNULL if not debug_subprocess else None, + ) + else: + _, cloned = rp.clone(dest=repo_path) + created_repos.add(rp.repo_name) + + main_branch = ( + subprocess.run( + "git rev-parse --abbrev-ref HEAD", + cwd=repo_path, + capture_output=True, + shell=True, + check=True, + ) + .stdout.decode() + .strip() ) - .stdout.decode() - .strip() - ) - # Check if branch already created for this problem - # We pass the repo_path as cwd for the git operations inside the helper + # Ensure we are clean on main branch before starting + subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args) + + # Check if branch already created for this problem branch_exists = check_if_branch_exists( repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args ) @@ -406,9 +453,8 @@ def process_instance( if verbose: print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists") stats["skipped"] += 1 - # Cleanup - if cloned and os.path.exists(repo_path): - shutil.rmtree(repo_path) + # Do NOT remove repo, just return. + # We might want to checkout main to be polite to next run but reset_repo handles it. return task_instances, created_repos, stats elif verbose: @@ -430,12 +476,9 @@ def process_instance( subprocess.run("git reset --hard", cwd=repo_path, **subprocess_args) if not applied: - # We can't raise Exception here as it stops the worker? - # Or we let it bubble up and fail the future? - # Better to catch and print/skip print(f"[{subfolder}] Failed to apply patch to {rp.repo_name}") - if cloned and os.path.exists(repo_path): - shutil.rmtree(repo_path) + # Reset for next usage + reset_repo(repo_path) return [], set(), stats # Don't record this one if verbose: @@ -471,8 +514,10 @@ def process_instance( if verbose: print(f"[{subfolder}] No changes to commit, skipping") stats["skipped"] += 1 - if cloned and os.path.exists(repo_path): - shutil.rmtree(repo_path) + # Reset logic happens at start of next or via finally... + # actually better to cleanup branch now + subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args) + subprocess.run(f"git branch -D {subfolder}", cwd=repo_path, **subprocess_args) return task_instances, created_repos, stats cmds = [ @@ -524,10 +569,9 @@ def process_instance( stats["new_tasks"] += 1 finally: - # Cleanup unique clone - if os.path.exists(repo_path): - shutil.rmtree(repo_path) - + # DO NOT remove repo_path. We persist it for this worker logic. + pass + return task_instances, created_repos, stats From e42a5e2a539ea9a9116265afb82668cd40a70e73 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 23:35:06 -0800 Subject: [PATCH 12/32] Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention --- swesmith/harness/gather.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 119d758d..2eb0a12b 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -38,6 +38,7 @@ from swebench.harness.constants import ( PASS_TO_FAIL, PASS_TO_PASS, + FAIL_TO_PASS, KEY_INSTANCE_ID, LOG_REPORT, ) @@ -341,7 +342,7 @@ def process_instance( task_instance = { KEY_INSTANCE_ID: subfolder, KEY_PATCH: patch_content, - PASS_TO_FAIL: results[PASS_TO_FAIL], + FAIL_TO_PASS: results[PASS_TO_FAIL], # Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention PASS_TO_PASS: results[PASS_TO_PASS], } rp = registry.get_from_inst(task_instance) From 302b73e61b46d0580c640ff6f434191388c2e348 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 16 Jan 2026 23:38:56 -0800 Subject: [PATCH 13/32] Remove unused shutil import in gather.py --- swesmith/harness/gather.py | 1 - 1 file changed, 1 deletion(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 2eb0a12b..170cddba 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -29,7 +29,6 @@ import argparse import json import os -import shutil import subprocess import concurrent.futures import functools From 393aba43b5f699b81c0f43f46a798ebd8deb3d94 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sat, 17 Jan 2026 07:39:12 +0000 Subject: [PATCH 14/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- swesmith/harness/gather.py | 56 +++++++++++++++++++++----------------- 1 file changed, 31 insertions(+), 25 deletions(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 170cddba..55243628 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -111,9 +111,9 @@ def check_if_branch_exists( cwd=repo_name, capture_output=True, shell=True, - text=True + text=True, ) - + # If there is output, the branch exists on remote if result.returncode == 0 and subfolder in result.stdout: branch_exists = True @@ -127,8 +127,7 @@ def check_if_branch_exists( if verbose: print(f"[{subfolder}] Overriding existing branch") branch_exists = False - - + except Exception: branch_exists = False pass @@ -202,7 +201,7 @@ def _main( # We must clone into a subdirectory which doesn't exist yet. cache_dir = os.path.join(cache_root, "repo") print(f"Pre-cloning repository to cache: {cache_dir}...") - + rp_cache = None # Try resolving profile from run_id (directory name) first try: @@ -217,7 +216,7 @@ def _main( rp_cache = registry.get_from_inst({KEY_INSTANCE_ID: sample_id}) except Exception as e: print(f"Warning: Could not resolve profile from {sample_id}: {e}") - + path_to_cache = None if rp_cache: try: @@ -228,7 +227,9 @@ def _main( except Exception as e: print(f"Pre-clone failed: {e}. Will fall back to per-instance cloning.") else: - print("Could not resolve profile for pre-cloning. Will iterate per instance.") + print( + "Could not resolve profile for pre-cloning. Will iterate per instance." + ) with concurrent.futures.ProcessPoolExecutor(max_workers=n_workers) as executor: # Create a partial function with fixed arguments @@ -341,7 +342,9 @@ def process_instance( task_instance = { KEY_INSTANCE_ID: subfolder, KEY_PATCH: patch_content, - FAIL_TO_PASS: results[PASS_TO_FAIL], # Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention + FAIL_TO_PASS: results[ + PASS_TO_FAIL + ], # Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention PASS_TO_PASS: results[PASS_TO_PASS], } rp = registry.get_from_inst(task_instance) @@ -352,19 +355,17 @@ def process_instance( # We place it in the same temporary directory as the cache to ensure automatic cleanup. if cache_dir: # cache_dir is .../temp/repo, so dirname is .../temp - repo_path = os.path.join(os.path.dirname(cache_dir), f"{rp.repo_name}_worker_{pid}") + repo_path = os.path.join( + os.path.dirname(cache_dir), f"{rp.repo_name}_worker_{pid}" + ) else: # Fallback if no cache used (e.g. debugging), though likely not cleaned up automatically repo_path = os.path.abspath(f"{rp.repo_name}_worker_{pid}") # Helper to reset repo state def reset_repo(path): - subprocess.run( - "git reset --hard", cwd=path, **subprocess_args - ) - subprocess.run( - "git clean -fdx", cwd=path, **subprocess_args - ) + subprocess.run("git reset --hard", cwd=path, **subprocess_args) + subprocess.run("git clean -fdx", cwd=path, **subprocess_args) # remove potential lock files if previous run crashed hard lock_file = os.path.join(path, ".git", "index.lock") if os.path.exists(lock_file): @@ -380,10 +381,10 @@ def reset_repo(path): if verbose: print(f"[{subfolder}] Reusing worker repo {repo_path}") reset_repo(repo_path) - + # We need to know main branch name. We can get it from local repo now. # Assuming main branch hasn't changed name/ref significantly. - # We avoid 'git pull' to save rate limits and time. + # We avoid 'git pull' to save rate limits and time. main_branch = ( subprocess.run( "git rev-parse --abbrev-ref HEAD", @@ -396,14 +397,16 @@ def reset_repo(path): .strip() ) # Ensure we are on main branch - subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args) + subprocess.run( + f"git checkout {main_branch}", cwd=repo_path, **subprocess_args + ) else: # First time setup for this worker if cache_dir and os.path.exists(cache_dir): if verbose: print(f"[{subfolder}] First-time clone from cache {cache_dir}...") - + subprocess.run( f"git clone {cache_dir} {repo_path}", check=True, @@ -443,7 +446,6 @@ def reset_repo(path): # Ensure we are clean on main branch before starting subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args) - # Check if branch already created for this problem branch_exists = check_if_branch_exists( repo_path, subfolder, main_branch, override_branch, verbose, subprocess_args @@ -453,7 +455,7 @@ def reset_repo(path): if verbose: print(f"[SKIP] {subfolder}: Branch `{subfolder}` exists") stats["skipped"] += 1 - # Do NOT remove repo, just return. + # Do NOT remove repo, just return. # We might want to checkout main to be polite to next run but reset_repo handles it. return task_instances, created_repos, stats @@ -514,10 +516,14 @@ def reset_repo(path): if verbose: print(f"[{subfolder}] No changes to commit, skipping") stats["skipped"] += 1 - # Reset logic happens at start of next or via finally... + # Reset logic happens at start of next or via finally... # actually better to cleanup branch now - subprocess.run(f"git checkout {main_branch}", cwd=repo_path, **subprocess_args) - subprocess.run(f"git branch -D {subfolder}", cwd=repo_path, **subprocess_args) + subprocess.run( + f"git checkout {main_branch}", cwd=repo_path, **subprocess_args + ) + subprocess.run( + f"git branch -D {subfolder}", cwd=repo_path, **subprocess_args + ) return task_instances, created_repos, stats cmds = [ @@ -571,7 +577,7 @@ def reset_repo(path): finally: # DO NOT remove repo_path. We persist it for this worker logic. pass - + return task_instances, created_repos, stats From 1f0601ed79c3d8470e60779c06b223b181584959 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 19 Jan 2026 20:33:53 -0800 Subject: [PATCH 15/32] First draft for issue gen --- configs/issue_gen/ig_v2.yaml | 2 +- scripts/bug_gen_modal.py | 220 ++++++++++++++++++++++++++++++++- swesmith/issue_gen/generate.py | 1 + 3 files changed, 216 insertions(+), 7 deletions(-) diff --git a/configs/issue_gen/ig_v2.yaml b/configs/issue_gen/ig_v2.yaml index 8bc9e1a0..133d51d8 100644 --- a/configs/issue_gen/ig_v2.yaml +++ b/configs/issue_gen/ig_v2.yaml @@ -1,4 +1,4 @@ -model: anthropic/claude-sonnet-4-20250514 +model: anthropic/claude-haiku-4-5-20251001 system: |- You are a software engineer helping to create a realistic dataset of synthetic GitHub issues. diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 35d2142a..4707036c 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -295,8 +295,10 @@ def resolve_profile(repo_name: str): modal.Image.from_registry("ubuntu:22.04", add_python="3.11") .apt_install("git") .pip_install_from_pyproject("pyproject.toml", optional_dependencies=["generate"]) + .pip_install("jinja2", "litellm", "datasets", "pyyaml") .env({"PYTHONPATH": "/root"}) .add_local_dir("swesmith", remote_path="/root/swesmith") + .add_local_dir("configs", remote_path="/root/configs") .add_local_file(".env", remote_path="/root/.env") ) @@ -1841,6 +1843,197 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None: print(f"\nGather complete: {success}/{len(repos)} repos processed successfully.\n") +# ============================================================================ +# Issue Generation +# ============================================================================ + + +@app.function( + image=generator_image, + volumes={LOGS_MOUNT_PATH: logs_volume}, + timeout=3600, + secrets=[ + modal.Secret.from_name("ANTHROPIC_API_KEY"), + modal.Secret.from_name("GITHUB_TOKEN"), + ], +) +def issue_gen_remote( + repo: str, + language: str, + config: str, + workers: int, +) -> dict: + """Generate issue descriptions for a single repo's task instances. + + Calls the existing swesmith.issue_gen.generate.IssueGen class to generate + issue descriptions for all task instances in a repo. Uses symlinks to redirect + local paths to Modal volume paths. + + Args: + repo: Repository name (e.g., "astropy__astropy") + language: Programming language filter + config: Path to config file + workers: Number of workers per repo + """ + import os + import sys + from pathlib import Path + + # Set up paths + volume_root = Path(LOGS_MOUNT_PATH) / language + task_insts_dir = volume_root / "task_insts" + + # Resolve task instances file (it may have a hash suffix like repo__name.abcdef.json) + task_insts_file = None + repo_sanitized = repo.replace("/", "__") + + if task_insts_dir.exists(): + for filename in os.listdir(task_insts_dir): + # Check for exact match or match with suffix + if filename == f"{repo_sanitized}.json" or ( + filename.startswith(f"{repo_sanitized}.") and filename.endswith(".json") + ): + task_insts_file = task_insts_dir / filename + break + + # Check if task instances file exists + if not task_insts_file or not task_insts_file.exists(): + return { + "success": True, # Not an error, just nothing to do + "repo": repo, + "instances_processed": 0, + "status": "skipped", + "reason": "No task insts file", + } + + # Create symlinks to redirect local paths to volume paths + # This allows IssueGen to work with its expected local paths + local_logs = Path("/root/logs") + local_logs.mkdir(parents=True, exist_ok=True) + + # Symlink the entire logs directory structure + for subdir in ["task_insts", "run_validation", "issue_gen"]: + local_subdir = local_logs / subdir + volume_subdir = volume_root / subdir + + # Create the volume directory if it doesn't exist + volume_subdir.mkdir(parents=True, exist_ok=True) + + # Create symlink (thread-safe with FileExistsError handling) + try: + if local_subdir.exists(): + local_subdir.unlink() + local_subdir.symlink_to(volume_subdir) + except FileExistsError: + # Another concurrent task already created the symlink + pass + + try: + # Import IssueGen after symlinks are set up + from swesmith.issue_gen.generate import IssueGen + + # Verify config file exists + config_path = Path(config) + if not config_path.exists(): + return { + "success": False, + "repo": repo, + "error": f"Config file not found: {config}", + } + + # Set up IssueGen instance + issue_gen = IssueGen( + dataset_path=str(task_insts_file), + config_file=Path(config), + workers=workers, + redo_existing=False, + ) + + # Run issue generation + # This processes all instances in the repo using ThreadPoolExecutor + issue_gen.run() + + # Count how many instances were processed + issue_gen_file = volume_root / "issue_gen" / f"{repo}.json" + instances_processed = 0 + if issue_gen_file.exists(): + import json + with open(issue_gen_file) as f: + data = json.load(f) + instances_processed = len(data) + + return { + "success": True, + "repo": repo, + "instances_processed": instances_processed, + } + + except Exception as e: + import traceback + return { + "success": False, + "repo": repo, + "error": f"{type(e).__name__}: {str(e)}", + "traceback": traceback.format_exc(), + } + + +async def run_issue_gen_phase_async( + repos: list[str], + language: str, + issue_gen_config: str, + issue_gen_workers: int, +) -> None: + """Run issue generation phase for all repos in parallel. + + Args: + repos: List of repository names to process + language: Programming language filter + issue_gen_config: Path to config file + issue_gen_workers: Number of workers per repo + issue_gen_redo: Whether to regenerate existing issues + """ + print(f"\n{'='*80}") + print(f"ISSUE GENERATION PHASE") + print(f"{'='*80}") + print(f"Processing {len(repos)} repositories...") + print(f"Config: {issue_gen_config}") + print(f"Workers per repo: {issue_gen_workers}") + print() + + # Run issue generation in parallel across all repos + results = [] + async for result in issue_gen_remote.map.aio( + repos, + kwargs={ + "language": language, + "config": issue_gen_config, + "workers": issue_gen_workers, + }, + order_outputs=False, + ): + + results.append(result) + + # Print progress + completed = len(results) + if result["success"]: + instances = result.get("instances_processed", 0) + print(f" [{completed}/{len(repos)}] {result['repo']}: ✓ ({instances} instances)") + else: + error = result.get("error", "Unknown error") + print(f" [{completed}/{len(repos)}] {result['repo']}: ✗ {error}") + if "traceback" in result: + print(f" Traceback: {result['traceback'][:200]}...") + + # Summary + success = sum(1 for r in results if r["success"]) + total_instances = sum(r.get("instances_processed", 0) for r in results if r["success"]) + + print(f"\nIssue generation complete: {success}/{len(repos)} repos processed successfully.") + print(f"Total instances with issues: {total_instances}\n") + + # ============================================================================ # Stats Display # ============================================================================ @@ -2158,6 +2351,8 @@ async def main( max_concurrent_tests: int = 900, show_stats: bool = False, gather: bool = False, + issue_gen_config: str = "configs/issue_gen/ig_v2.yaml", + issue_gen_workers: int = 8, ): """ Modal Bug Generation & Validation script. @@ -2166,6 +2361,7 @@ async def main( 1. Generation: Creates bugs for repos (skips repos that are already done/failed) 2. Validation: Validates all patches from the volume 3. Gather: Creates task instances and pushes branches + 4. Issue Generation: Generates issue descriptions for valid bugs Run with: modal run scripts/bug_gen.py [OPTIONS] @@ -2179,6 +2375,8 @@ async def main( max_concurrent_tests: Max concurrent tests (default: 900) show_stats: If True, show bug breakdown stats and exit without running generation/validation gather: If True, only run the gather phase (skip generation and validation) + issue_gen_config: Path to issue generation config (default: configs/issue_gen/ig_v2.yaml) + issue_gen_workers: Number of workers per repo for issue generation (default: 4) """ # Handle --show-stats early exit if show_stats: @@ -2206,7 +2404,6 @@ async def main( print(f"\n{'=' * 60}") print(f"BUG GEN - {len(target_repos)} repos, {max_concurrent_tests} max concurrent") - print(f"Volume: {VOLUME_NAME}/{language}/") print(f"{'=' * 60}\n") # Create a simple args-like object for compatibility @@ -2218,8 +2415,11 @@ class Args: args.interleave = interleave args.max_entities = max_entities args.max_candidates = max_candidates + args.timeout_buffer = 60 + args.max_concurrent_tests = max_concurrent_tests # Phase 1: Generation (skips repos that are already done/failed) + generation_results = [] if not gather: generation_results = await run_generation_phase(target_repos, args, language) @@ -2248,9 +2448,17 @@ class Args: else: results = [] - # Phase 3: Gather (Create task instances & Push branches) - if not results and not gather: - print("No validation results found. Skipping gather phase.") - return + # # Phase 3: Gather (Create task instances & Push branches) + # if not results and not gather: + # print("No validation results found. Skipping gather phase.") + # return + + # await run_gather_phase_async(target_repos, language, args) - await run_gather_phase_async(target_repos, language, args) + # Phase 4: Issue Generation + await run_issue_gen_phase_async( + target_repos, + language, + issue_gen_config, + issue_gen_workers, + ) diff --git a/swesmith/issue_gen/generate.py b/swesmith/issue_gen/generate.py index efba7a26..b2bb29a8 100644 --- a/swesmith/issue_gen/generate.py +++ b/swesmith/issue_gen/generate.py @@ -277,6 +277,7 @@ def jinja_shuffle(seq): else: # If messages already exist, get repos_to_remove from existing metadata _, repos_to_remove = self.get_test_functions(instance_curr) + messages = metadata["messages"] # Generate n_instructions completions containing problem statements response = completion( From a50e614c8506f6bc6f43657b5af51b37c8ba642b Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 19 Jan 2026 22:54:44 -0800 Subject: [PATCH 16/32] Support PortKey for issue gen and switch to gpt-5-mini --- configs/issue_gen/ig_v2.yaml | 4 +- pyproject.toml | 2 + scripts/bug_gen_modal.py | 7 +- swesmith/issue_gen/generate.py | 120 +++++++++++++++++++++++++++++++-- 4 files changed, 123 insertions(+), 10 deletions(-) diff --git a/configs/issue_gen/ig_v2.yaml b/configs/issue_gen/ig_v2.yaml index 133d51d8..50a93ff4 100644 --- a/configs/issue_gen/ig_v2.yaml +++ b/configs/issue_gen/ig_v2.yaml @@ -1,4 +1,6 @@ -model: anthropic/claude-haiku-4-5-20251001 +model: portkey/gpt-5-mini +litellm_model_name_override: openai/gpt-5-mini +provider: "@openai" system: |- You are a software engineer helping to create a realistic dataset of synthetic GitHub issues. diff --git a/pyproject.toml b/pyproject.toml index a9e830ef..fe2bdb4f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ all = [ "matplotlib", "modal", "openai", + "portkey-ai", "pre-commit", "python-dotenv", "rich", @@ -83,6 +84,7 @@ generate = [ "docker", "ghapi", "libcst", + "portkey-ai", "python-dotenv", "rich", "swebench", diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 4707036c..63f3189c 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -521,7 +521,10 @@ async def acquire(self): @app.function( image=generator_image, - secrets=[modal.Secret.from_name("GITHUB_TOKEN")], + secrets=[ + modal.Secret.from_name("GITHUB_TOKEN"), + modal.Secret.from_name("PORTKEY_API_KEY") + ], timeout=MODAL_TIMEOUT, volumes={LOGS_MOUNT_PATH: logs_volume}, # Mount volume for direct writes ) @@ -1853,8 +1856,8 @@ async def run_gather_phase_async(repos: list[str], language: str, args) -> None: volumes={LOGS_MOUNT_PATH: logs_volume}, timeout=3600, secrets=[ - modal.Secret.from_name("ANTHROPIC_API_KEY"), modal.Secret.from_name("GITHUB_TOKEN"), + modal.Secret.from_name("PORTKEY_API_KEY"), ], ) def issue_gen_remote( diff --git a/swesmith/issue_gen/generate.py b/swesmith/issue_gen/generate.py index b2bb29a8..765e553b 100644 --- a/swesmith/issue_gen/generate.py +++ b/swesmith/issue_gen/generate.py @@ -46,12 +46,88 @@ ) from swesmith.issue_gen.utils import get_test_function from swesmith.profiles import registry +from typing import Any, Literal +from tenacity import ( + retry, + retry_if_not_exception_type, + stop_after_attempt, + wait_exponential, + before_sleep_log, +) +from pydantic import BaseModel + +try: + from portkey_ai import Portkey +except ImportError: + Portkey = None logging.getLogger("LiteLLM").setLevel(logging.WARNING) litellm.drop_params = True litellm.suppress_debug_info = True +logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") +logger = logging.getLogger(__name__) + +class PortkeyModelConfig(BaseModel): + model_name: str + model_kwargs: dict[str, Any] = {} + provider: str = "" + litellm_model_name_override: str = "" + cost_tracking: Literal["default", "ignore_errors"] = "default" + + +class PortkeyModel: + def __init__(self, *, config_class: type = PortkeyModelConfig, **kwargs): + if Portkey is None: + raise ImportError( + "The portkey-ai package is required to use PortkeyModel. Please install it with: pip install portkey-ai" + ) + + self.config = config_class(**kwargs) + self.cost = 0.0 + self.n_calls = 0 + + # Get API key from environment or raise error + self._api_key = os.getenv("PORTKEY_API_KEY") + if not self._api_key: + raise ValueError( + "Portkey API key is required. Set it via the " + "PORTKEY_API_KEY environment variable." + ) + + # Get virtual key from environment + virtual_key = os.getenv("PORTKEY_VIRTUAL_KEY") + + # Initialize Portkey client + client_kwargs = {"api_key": self._api_key} + if virtual_key: + client_kwargs["virtual_key"] = virtual_key + elif self.config.provider: + client_kwargs["provider"] = self.config.provider + + self.client = Portkey(**client_kwargs) + + @retry( + reraise=True, + stop=stop_after_attempt(10), + wait=wait_exponential(multiplier=1, min=4, max=60), + retry=retry_if_not_exception_type((KeyboardInterrupt, TypeError, ValueError)), + before_sleep=before_sleep_log(logger, logging.WARNING), + ) + def _query(self, messages: list[dict[str, str]], **kwargs): + return self.client.chat.completions.create( + model=self.config.model_name, + messages=messages, + **(self.config.model_kwargs | kwargs), + ) + + def query(self, messages: list[dict[str, str]], **kwargs) -> Any: + # Simple adapter to match what generate.py expects (return an object with choices and usage for cost) + response = self._query([{"role": msg["role"], "content": msg["content"]} for msg in messages], **kwargs) + return response + + TEST_SRC_CODE_PROMPT = r""" **Test Source Code:** Use the following test source code to help you write reasonable, effective reproduction code. @@ -61,9 +137,6 @@ load_dotenv() -logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") -logger = logging.getLogger(__name__) - def maybe_shorten(text_str: str, max_tokens: int, model: str) -> str: """Shorten text if it exceeds the max_tokens limit. @@ -92,6 +165,16 @@ def __init__( self.n_instructions = settings.get("n_instructions", 1) self.max_var_tokens = settings.get("max_var_tokens", 10_000) + # Initialize Portkey model if needed + self.portkey_model = None + if self.model.startswith("portkey/") or self.config.get("provider") == "portkey": + self.portkey_model = PortkeyModel( + model_name=self.model.replace("portkey/", ""), + provider=self.config.get("provider", "openai"), + litellm_model_name_override=self.config.get("litellm_model_name_override", ""), + **settings.get("portkey_kwargs", {}) + ) + data_smith = [x for x in load_dataset(HF_DATASET, split="train")] self.dataset = ( data_smith @@ -138,6 +221,7 @@ def _should_do_instance( self, instance: dict, instance_ids: list | None, redo_existing: bool, model: str ) -> bool: repo = instance["repo"].split("/")[-1] + output_file = LOG_DIR_ISSUE_GEN / repo / f"{instance[KEY_INSTANCE_ID]}.json" if not matches_instance_filter(instance[KEY_INSTANCE_ID], instance_ids): return False @@ -280,11 +364,19 @@ def jinja_shuffle(seq): messages = metadata["messages"] # Generate n_instructions completions containing problem statements - response = completion( - model=self.model, messages=messages, n=self.n_instructions, temperature=0 - ) + if self.portkey_model: + response = self.portkey_model.query(messages, n=self.n_instructions, stream=False) + else: + response = completion( + model=self.model, messages=messages, n=self.n_instructions, temperature=0 + ) - cost = completion_cost(response) + model_for_cost = self.model + if self.portkey_model and self.portkey_model.config.litellm_model_name_override: + model_for_cost = self.portkey_model.config.litellm_model_name_override + + cost = completion_cost(response, model=model_for_cost) + metadata["cost"] = (0 if "cost" not in metadata else metadata["cost"]) + cost # Extract problem statements from response @@ -343,6 +435,20 @@ def run(self): # Track repos to remove for cleanup all_repos_to_remove = set() + # Pre-clone all required repositories to avoid race conditions in parallel execution + # (RepoProfile.clone is not thread-safe) + unique_repos = {instance["repo"].split("/")[-1] for instance in self.dataset} + for repo_name in unique_repos: + try: + # registry.get(repo_name).clone() returns (dest, cloned) + # cloned is True if it actually cloned, False if it already existed + _, cloned = registry.get(repo_name).clone() + if cloned: + all_repos_to_remove.add(repo_name) + except Exception as e: + logger.error(f"Failed to pre-clone {repo_name}: {e}") + # We continue, assuming it might work later or will fail properly in the thread + # Create a thread pool and call generate_issue for each instance with ThreadPoolExecutor(max_workers=self.workers) as executor: futures = [] From 9461c1f03c980190116fe4083caf0d36727fcd37 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 19 Jan 2026 23:41:33 -0800 Subject: [PATCH 17/32] Uncomment gather part in bug_gen_modal.py --- scripts/bug_gen_modal.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 63f3189c..0884ab29 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -2451,12 +2451,12 @@ class Args: else: results = [] - # # Phase 3: Gather (Create task instances & Push branches) - # if not results and not gather: - # print("No validation results found. Skipping gather phase.") - # return + # Phase 3: Gather (Create task instances & Push branches) + if not results and not gather: + print("No validation results found. Skipping gather phase.") + return - # await run_gather_phase_async(target_repos, language, args) + await run_gather_phase_async(target_repos, language, args) # Phase 4: Issue Generation await run_issue_gen_phase_async( From f7f68cb85ad589c31f4423adda0dc239227ec1a3 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Tue, 20 Jan 2026 00:18:31 -0800 Subject: [PATCH 18/32] Add script to upload task instances to Hugging Face --- scripts/upload_tasks_to_hf_modal.py | 169 ++++++++++++++++++++++++++++ 1 file changed, 169 insertions(+) create mode 100644 scripts/upload_tasks_to_hf_modal.py diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py new file mode 100644 index 00000000..0271bade --- /dev/null +++ b/scripts/upload_tasks_to_hf_modal.py @@ -0,0 +1,169 @@ +import modal +import json +import asyncio +from pathlib import Path +import sys +from concurrent.futures import ThreadPoolExecutor + +# Define Modal App +app = modal.App("swesmith-upload-hf") +vol = modal.Volume.from_name("swesmith-bug-gen") + +# Define an image with necessary dependencies +# We need datasets and huggingface_hub for the remote push +image = modal.Image.debian_slim().pip_install("tqdm", "datasets", "huggingface_hub") + +def _process_single_task(task, issue_gen_dir, repo_id): + """Helper to process a single task instance""" + instance_id = task.get("instance_id") + if not instance_id: + return task + + if "image_name" in task and ".architecture." in task["image_name"]: + task["image_name"] = task["image_name"].replace(".architecture", "") + + task["problem_statement"] = "" + issue_file = issue_gen_dir / repo_id / f"{instance_id}.json" + + if issue_file.exists(): + try: + with open(issue_file, "r") as f_issue: + issue_data = json.load(f_issue) + resp = issue_data.get("responses", {}) + if "portkey/gpt-5-mini" in resp: + content = resp["portkey/gpt-5-mini"] + if isinstance(content, list) and len(content) > 0: + task["problem_statement"] = content[0] + except Exception: + pass + return task + +@app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10) +def process_repo(task_filename: str): + """(Same as before)""" + import concurrent.futures + # Assume language is javascript for now or pass it in path + language = "javascript" + task_file_path = Path(f"/data/{language}/task_insts/{task_filename}") + issue_gen_dir = Path(f"/data/{language}/issue_gen") + + tasks_out = [] + + if not task_file_path.exists(): + print(f"File not found: {task_file_path}") + return [] + + repo_id = task_file_path.stem + try: + with open(task_file_path, "r") as f: + tasks = json.load(f) + + print(f"[{repo_id}] Processing {len(tasks)} tasks...") + + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: + futures = [ + executor.submit(_process_single_task, task, issue_gen_dir, repo_id) + for task in tasks + ] + + for future in concurrent.futures.as_completed(futures): + tasks_out.append(future.result()) + + except Exception as e: + print(f"[{repo_id}] Error: {e}") + + return tasks_out + +@app.function( + image=image, + secrets=[modal.Secret.from_name("john-hf-secret")], + timeout=1800 +) +def push_to_hf_remote(all_tasks: list, target_dataset: str): + import os + from datasets import load_dataset, Dataset, concatenate_datasets + from huggingface_hub import create_repo, HfApi + + print(f"Starting remote upload to {target_dataset}") + token = os.environ.get("HF_TOKEN") + if not token: + print("WARNING: HF_TOKEN not found in environment variables!") + else: + print("HF_TOKEN found in environment variables.") + + # Validation + required_keys = ["instance_id", "patch", "FAIL_TO_PASS", "PASS_TO_PASS", "image_name", "repo"] + print("Validating keys...") + cleaned_tasks = [] + for task in all_tasks: + valid = True + for k in required_keys: + if k not in task: + print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.") + valid = False + break + if not valid: + continue + + if "problem_statement" not in task: + task["problem_statement"] = "" + cleaned_tasks.append(task) + + print(f"Valid tasks: {len(cleaned_tasks)}") + local_dataset = Dataset.from_list(cleaned_tasks) + local_ids = set(local_dataset["instance_id"]) + + final_dataset = local_dataset + + # Try to ensure repo exists + print(f"Ensuring repository {target_dataset} exists...") + try: + create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True) + except Exception as e: + print(f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong).") + + # print(f"Loading target dataset: {target_dataset}") + # try: + # sweb = load_dataset(target_dataset, split="train", token=token) + # print(f"Existing HF dataset size: {len(sweb)}") + + # sweb_filtered = sweb.filter(lambda x: x["instance_id"] not in local_ids) + # print(f"Would override {len(sweb) - len(sweb_filtered)} instances") + + # final_dataset = concatenate_datasets([sweb_filtered, local_dataset]) + # except Exception as e: + # print(f"Note: Could not load existing dataset '{target_dataset}' (it might be new or empty). Error: {e}") + # print("Proceeding with creating a new dataset from local tasks.") + + print(f"Pushing {len(final_dataset)} instances to {target_dataset}...") + final_dataset.push_to_hub(target_dataset, token=token) + print("Remote push finished successfully.") + + +@app.local_entrypoint() +def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False): + print("Listing task files from Modal volume...") + try: + entries = vol.listdir("javascript/task_insts") + filenames = [e.path.split("/")[-1] for e in entries if e.path.endswith(".json")] + except Exception as e: + print(f"Error listing volume: {e}") + return + + print(f"Found {len(filenames)} files. Starting parallel processing...") + + all_tasks = [] + for repo_tasks in process_repo.map(filenames): + all_tasks.extend(repo_tasks) + + print(f"Fetched total {len(all_tasks)} task instances.") + + if not push: + confirm = input(f"Ready to push to HF. Proceed? (y/n) ").lower() + if confirm != "y": + print("Aborting.") + return + + print("Launching remote push job...") + push_to_hf_remote.remote(all_tasks, target_dataset) + print("Done!") From 4976d857e36d79436451d1f6d3d95c34b6d49129 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 20 Jan 2026 08:20:19 +0000 Subject: [PATCH 19/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/bug_gen_modal.py | 25 +++++++---- scripts/upload_tasks_to_hf_modal.py | 67 +++++++++++++++++------------ swesmith/issue_gen/generate.py | 32 ++++++++++---- 3 files changed, 78 insertions(+), 46 deletions(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 0884ab29..3f1a69e8 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -523,7 +523,7 @@ async def acquire(self): image=generator_image, secrets=[ modal.Secret.from_name("GITHUB_TOKEN"), - modal.Secret.from_name("PORTKEY_API_KEY") + modal.Secret.from_name("PORTKEY_API_KEY"), ], timeout=MODAL_TIMEOUT, volumes={LOGS_MOUNT_PATH: logs_volume}, # Mount volume for direct writes @@ -1885,11 +1885,11 @@ def issue_gen_remote( # Set up paths volume_root = Path(LOGS_MOUNT_PATH) / language task_insts_dir = volume_root / "task_insts" - + # Resolve task instances file (it may have a hash suffix like repo__name.abcdef.json) task_insts_file = None repo_sanitized = repo.replace("/", "__") - + if task_insts_dir.exists(): for filename in os.listdir(task_insts_dir): # Check for exact match or match with suffix @@ -1961,6 +1961,7 @@ def issue_gen_remote( instances_processed = 0 if issue_gen_file.exists(): import json + with open(issue_gen_file) as f: data = json.load(f) instances_processed = len(data) @@ -1973,6 +1974,7 @@ def issue_gen_remote( except Exception as e: import traceback + return { "success": False, "repo": repo, @@ -1996,9 +1998,9 @@ async def run_issue_gen_phase_async( issue_gen_workers: Number of workers per repo issue_gen_redo: Whether to regenerate existing issues """ - print(f"\n{'='*80}") + print(f"\n{'=' * 80}") print(f"ISSUE GENERATION PHASE") - print(f"{'='*80}") + print(f"{'=' * 80}") print(f"Processing {len(repos)} repositories...") print(f"Config: {issue_gen_config}") print(f"Workers per repo: {issue_gen_workers}") @@ -2015,14 +2017,15 @@ async def run_issue_gen_phase_async( }, order_outputs=False, ): - results.append(result) # Print progress completed = len(results) if result["success"]: instances = result.get("instances_processed", 0) - print(f" [{completed}/{len(repos)}] {result['repo']}: ✓ ({instances} instances)") + print( + f" [{completed}/{len(repos)}] {result['repo']}: ✓ ({instances} instances)" + ) else: error = result.get("error", "Unknown error") print(f" [{completed}/{len(repos)}] {result['repo']}: ✗ {error}") @@ -2031,9 +2034,13 @@ async def run_issue_gen_phase_async( # Summary success = sum(1 for r in results if r["success"]) - total_instances = sum(r.get("instances_processed", 0) for r in results if r["success"]) + total_instances = sum( + r.get("instances_processed", 0) for r in results if r["success"] + ) - print(f"\nIssue generation complete: {success}/{len(repos)} repos processed successfully.") + print( + f"\nIssue generation complete: {success}/{len(repos)} repos processed successfully." + ) print(f"Total instances with issues: {total_instances}\n") diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py index 0271bade..a866584a 100644 --- a/scripts/upload_tasks_to_hf_modal.py +++ b/scripts/upload_tasks_to_hf_modal.py @@ -13,18 +13,19 @@ # We need datasets and huggingface_hub for the remote push image = modal.Image.debian_slim().pip_install("tqdm", "datasets", "huggingface_hub") + def _process_single_task(task, issue_gen_dir, repo_id): """Helper to process a single task instance""" instance_id = task.get("instance_id") if not instance_id: return task - + if "image_name" in task and ".architecture." in task["image_name"]: task["image_name"] = task["image_name"].replace(".architecture", "") - + task["problem_statement"] = "" issue_file = issue_gen_dir / repo_id / f"{instance_id}.json" - + if issue_file.exists(): try: with open(issue_file, "r") as f_issue: @@ -38,52 +39,53 @@ def _process_single_task(task, issue_gen_dir, repo_id): pass return task + @app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10) def process_repo(task_filename: str): """(Same as before)""" import concurrent.futures + # Assume language is javascript for now or pass it in path language = "javascript" task_file_path = Path(f"/data/{language}/task_insts/{task_filename}") issue_gen_dir = Path(f"/data/{language}/issue_gen") - + tasks_out = [] - + if not task_file_path.exists(): print(f"File not found: {task_file_path}") return [] - + repo_id = task_file_path.stem try: with open(task_file_path, "r") as f: tasks = json.load(f) - + print(f"[{repo_id}] Processing {len(tasks)} tasks...") - + with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: futures = [ executor.submit(_process_single_task, task, issue_gen_dir, repo_id) for task in tasks ] - + for future in concurrent.futures.as_completed(futures): tasks_out.append(future.result()) - + except Exception as e: print(f"[{repo_id}] Error: {e}") - + return tasks_out + @app.function( - image=image, - secrets=[modal.Secret.from_name("john-hf-secret")], - timeout=1800 + image=image, secrets=[modal.Secret.from_name("john-hf-secret")], timeout=1800 ) def push_to_hf_remote(all_tasks: list, target_dataset: str): import os from datasets import load_dataset, Dataset, concatenate_datasets from huggingface_hub import create_repo, HfApi - + print(f"Starting remote upload to {target_dataset}") token = os.environ.get("HF_TOKEN") if not token: @@ -92,44 +94,53 @@ def push_to_hf_remote(all_tasks: list, target_dataset: str): print("HF_TOKEN found in environment variables.") # Validation - required_keys = ["instance_id", "patch", "FAIL_TO_PASS", "PASS_TO_PASS", "image_name", "repo"] + required_keys = [ + "instance_id", + "patch", + "FAIL_TO_PASS", + "PASS_TO_PASS", + "image_name", + "repo", + ] print("Validating keys...") cleaned_tasks = [] for task in all_tasks: valid = True for k in required_keys: if k not in task: - print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.") - valid = False - break + print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.") + valid = False + break if not valid: continue if "problem_statement" not in task: task["problem_statement"] = "" cleaned_tasks.append(task) - + print(f"Valid tasks: {len(cleaned_tasks)}") local_dataset = Dataset.from_list(cleaned_tasks) local_ids = set(local_dataset["instance_id"]) - + final_dataset = local_dataset - + # Try to ensure repo exists print(f"Ensuring repository {target_dataset} exists...") try: create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True) except Exception as e: - print(f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong).") + print( + f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong)." + ) # print(f"Loading target dataset: {target_dataset}") # try: # sweb = load_dataset(target_dataset, split="train", token=token) # print(f"Existing HF dataset size: {len(sweb)}") - + # sweb_filtered = sweb.filter(lambda x: x["instance_id"] not in local_ids) # print(f"Would override {len(sweb) - len(sweb_filtered)} instances") - + # final_dataset = concatenate_datasets([sweb_filtered, local_dataset]) # except Exception as e: # print(f"Note: Could not load existing dataset '{target_dataset}' (it might be new or empty). Error: {e}") @@ -151,13 +162,13 @@ def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False): return print(f"Found {len(filenames)} files. Starting parallel processing...") - + all_tasks = [] for repo_tasks in process_repo.map(filenames): all_tasks.extend(repo_tasks) - + print(f"Fetched total {len(all_tasks)} task instances.") - + if not push: confirm = input(f"Ready to push to HF. Proceed? (y/n) ").lower() if confirm != "y": diff --git a/swesmith/issue_gen/generate.py b/swesmith/issue_gen/generate.py index 765e553b..2042b6fb 100644 --- a/swesmith/issue_gen/generate.py +++ b/swesmith/issue_gen/generate.py @@ -69,6 +69,7 @@ logging.basicConfig(level=logging.INFO, format="%(asctime)s %(levelname)s %(message)s") logger = logging.getLogger(__name__) + class PortkeyModelConfig(BaseModel): model_name: str model_kwargs: dict[str, Any] = {} @@ -83,7 +84,7 @@ def __init__(self, *, config_class: type = PortkeyModelConfig, **kwargs): raise ImportError( "The portkey-ai package is required to use PortkeyModel. Please install it with: pip install portkey-ai" ) - + self.config = config_class(**kwargs) self.cost = 0.0 self.n_calls = 0 @@ -124,7 +125,10 @@ def _query(self, messages: list[dict[str, str]], **kwargs): def query(self, messages: list[dict[str, str]], **kwargs) -> Any: # Simple adapter to match what generate.py expects (return an object with choices and usage for cost) - response = self._query([{"role": msg["role"], "content": msg["content"]} for msg in messages], **kwargs) + response = self._query( + [{"role": msg["role"], "content": msg["content"]} for msg in messages], + **kwargs, + ) return response @@ -167,12 +171,17 @@ def __init__( # Initialize Portkey model if needed self.portkey_model = None - if self.model.startswith("portkey/") or self.config.get("provider") == "portkey": + if ( + self.model.startswith("portkey/") + or self.config.get("provider") == "portkey" + ): self.portkey_model = PortkeyModel( model_name=self.model.replace("portkey/", ""), provider=self.config.get("provider", "openai"), - litellm_model_name_override=self.config.get("litellm_model_name_override", ""), - **settings.get("portkey_kwargs", {}) + litellm_model_name_override=self.config.get( + "litellm_model_name_override", "" + ), + **settings.get("portkey_kwargs", {}), ) data_smith = [x for x in load_dataset(HF_DATASET, split="train")] @@ -365,18 +374,23 @@ def jinja_shuffle(seq): # Generate n_instructions completions containing problem statements if self.portkey_model: - response = self.portkey_model.query(messages, n=self.n_instructions, stream=False) + response = self.portkey_model.query( + messages, n=self.n_instructions, stream=False + ) else: response = completion( - model=self.model, messages=messages, n=self.n_instructions, temperature=0 + model=self.model, + messages=messages, + n=self.n_instructions, + temperature=0, ) model_for_cost = self.model if self.portkey_model and self.portkey_model.config.litellm_model_name_override: model_for_cost = self.portkey_model.config.litellm_model_name_override - + cost = completion_cost(response, model=model_for_cost) - + metadata["cost"] = (0 if "cost" not in metadata else metadata["cost"]) + cost # Extract problem statements from response From 2ab21d806aaa4a7fe49931200a2b39c2ba66bde2 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Thu, 29 Jan 2026 17:50:07 -0800 Subject: [PATCH 20/32] Refactor bug generation phases to use --phases argument instead of --gather flag --- scripts/bug_gen_modal.py | 67 +++++++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 28 deletions(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 3f1a69e8..78900784 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -2360,18 +2360,18 @@ async def main( max_candidates: int = 2000, max_concurrent_tests: int = 900, show_stats: bool = False, - gather: bool = False, + phases: str = "gen,val,gather,issue", issue_gen_config: str = "configs/issue_gen/ig_v2.yaml", issue_gen_workers: int = 8, ): """ Modal Bug Generation & Validation script. - Runs two phases: - 1. Generation: Creates bugs for repos (skips repos that are already done/failed) - 2. Validation: Validates all patches from the volume - 3. Gather: Creates task instances and pushes branches - 4. Issue Generation: Generates issue descriptions for valid bugs + Runs phases specified by --phases (comma-separated): + - gen: Generation (creates bugs for repos) + - val: Validation (validates patches from volume) + - gather: Gather (creates task instances & pushes branches) + - issue: Issue Generation (generates issue descriptions) Run with: modal run scripts/bug_gen.py [OPTIONS] @@ -2384,7 +2384,7 @@ async def main( max_candidates: Max candidates to process, -1 for all (default: 2000) max_concurrent_tests: Max concurrent tests (default: 900) show_stats: If True, show bug breakdown stats and exit without running generation/validation - gather: If True, only run the gather phase (skip generation and validation) + phases: Comma-separated list of phases to run (default: "gen,val,gather,issue") issue_gen_config: Path to issue generation config (default: configs/issue_gen/ig_v2.yaml) issue_gen_workers: Number of workers per repo for issue generation (default: 4) """ @@ -2393,6 +2393,19 @@ async def main( await show_volume_stats(language) return + # Parse and validate phases + valid_phases = {"gen", "val", "gather", "issue"} + phase_list = [p.strip() for p in phases.split(",") if p.strip()] + active_phases = set(phase_list) + + invalid_phases = active_phases - valid_phases + if invalid_phases: + print(f"Error: Invalid phases: {invalid_phases}") + print(f"Valid phases are: {valid_phases}") + return + + print(f"Running phases: {', '.join(sorted(active_phases))}") + from swesmith.constants import ENV_NAME # Parse repos (comma-separated string to list) @@ -2430,10 +2443,12 @@ class Args: # Phase 1: Generation (skips repos that are already done/failed) generation_results = [] - if not gather: + if "gen" in active_phases: generation_results = await run_generation_phase(target_repos, args, language) - # Phase 2: Validation - collect ALL patches from volume (not just from this run) + # Phase 2: Validation - collect ALL patches from volume (not just from this run) + results = [] + if "val" in active_phases: print(f"\n{'#' * 60}") print("# PHASE 2: VALIDATION") print(f"{'#' * 60}\n") @@ -2449,26 +2464,22 @@ class Args: if results: print_summary(results, len(build_repos_with_patches(all_patches))) - # Report generation errors from this run - errors = [r for r in generation_results if "error" in r] - if errors: - print(f"\nGeneration Errors ({len(errors)}):") - for err in errors: - print(f" - {err['repo']}: {err.get('error', 'Unknown')}") - else: - results = [] + # Report generation errors from this run (if any) + errors = [r for r in generation_results if "error" in r] + if errors: + print(f"\nGeneration Errors ({len(errors)}):") + for err in errors: + print(f" - {err['repo']}: {err.get('error', 'Unknown')}") # Phase 3: Gather (Create task instances & Push branches) - if not results and not gather: - print("No validation results found. Skipping gather phase.") - return - - await run_gather_phase_async(target_repos, language, args) + if "gather" in active_phases: + await run_gather_phase_async(target_repos, language, args) # Phase 4: Issue Generation - await run_issue_gen_phase_async( - target_repos, - language, - issue_gen_config, - issue_gen_workers, - ) + if "issue" in active_phases: + await run_issue_gen_phase_async( + target_repos, + language, + issue_gen_config, + issue_gen_workers, + ) From 8979df46c73c9a4ca4682c9e95e2f1a21e42ffc5 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Thu, 5 Feb 2026 23:01:37 -0800 Subject: [PATCH 21/32] Fix gather patch apply failures in worker repos Cause: - gather invoked apply commands with a relative patch path (`../logs/run_validation/.../patch.diff`). - During modal gather, each worker runs from a temporary repo directory under `/tmp/...`, so that relative path did not resolve to the mounted logs directory. - `git apply` and fallback `patch` both failed with "can't open patch ... No such file or directory", resulting in dropped instances and empty/underfilled task outputs. Fix: - Resolve `patch.diff` to an absolute path before apply. - Shell-quote that absolute path and pass it to every command in `GIT_APPLY_CMDS`. Result: - Patch application no longer depends on worker cwd; gather can apply valid rust patches and produce task instances consistently. --- swesmith/harness/gather.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/swesmith/harness/gather.py b/swesmith/harness/gather.py index 55243628..e639bd93 100644 --- a/swesmith/harness/gather.py +++ b/swesmith/harness/gather.py @@ -29,6 +29,7 @@ import argparse import json import os +import shlex import subprocess import concurrent.futures import functools @@ -464,9 +465,10 @@ def reset_repo(path): # Apply patch applied = False + abs_patch_path = shlex.quote(os.path.abspath(path_patch)) for git_apply in GIT_APPLY_CMDS: output = subprocess.run( - f"{git_apply} ../{path_patch}", + f"{git_apply} {abs_patch_path}", cwd=repo_path, capture_output=True, shell=True, From 17fe59ba53388696143ed5404b3cb88ad7a087e8 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 6 Feb 2026 01:02:29 -0800 Subject: [PATCH 22/32] Make HF upload script language-aware for non-JS datasets Root cause: upload_tasks_to_hf_modal.py was hardcoded to javascript paths in both task discovery and per-repo processing. Running with --language rust still read /data/javascript/... and javascript/task_insts, which breaks Rust upload workflows and can surface as missing/empty problem statements for non-JS datasets. Fix: thread a language argument through the local entrypoint and worker function, list files from {language}/task_insts, and pass language explicitly through process_repo.map so each worker reads /data/{language}/task_insts and /data/{language}/issue_gen. --- scripts/upload_tasks_to_hf_modal.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py index a866584a..eb08d5a3 100644 --- a/scripts/upload_tasks_to_hf_modal.py +++ b/scripts/upload_tasks_to_hf_modal.py @@ -41,12 +41,10 @@ def _process_single_task(task, issue_gen_dir, repo_id): @app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10) -def process_repo(task_filename: str): +def process_repo(task_filename: str, language: str = "javascript"): """(Same as before)""" import concurrent.futures - # Assume language is javascript for now or pass it in path - language = "javascript" task_file_path = Path(f"/data/{language}/task_insts/{task_filename}") issue_gen_dir = Path(f"/data/{language}/issue_gen") @@ -152,10 +150,14 @@ def push_to_hf_remote(all_tasks: list, target_dataset: str): @app.local_entrypoint() -def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False): +def main( + target_dataset: str = "SWE-bench/SWE-smith-js", + language: str = "javascript", + push: bool = False, +): print("Listing task files from Modal volume...") try: - entries = vol.listdir("javascript/task_insts") + entries = vol.listdir(f"{language}/task_insts") filenames = [e.path.split("/")[-1] for e in entries if e.path.endswith(".json")] except Exception as e: print(f"Error listing volume: {e}") @@ -164,7 +166,7 @@ def main(target_dataset: str = "SWE-bench/SWE-smith-js", push: bool = False): print(f"Found {len(filenames)} files. Starting parallel processing...") all_tasks = [] - for repo_tasks in process_repo.map(filenames): + for repo_tasks in process_repo.map(filenames, [language] * len(filenames)): all_tasks.extend(repo_tasks) print(f"Fetched total {len(all_tasks)} task instances.") From 9c58d47b25c744d487902c361a5d5774cb4b0991 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Wed, 11 Feb 2026 22:30:03 -0800 Subject: [PATCH 23/32] Make Modal HF upload robust with single remote path --- scripts/upload_tasks_to_hf_modal.py | 233 ++++++++++++---------------- 1 file changed, 101 insertions(+), 132 deletions(-) diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py index eb08d5a3..0953903f 100644 --- a/scripts/upload_tasks_to_hf_modal.py +++ b/scripts/upload_tasks_to_hf_modal.py @@ -1,21 +1,25 @@ -import modal import json -import asyncio from pathlib import Path -import sys -from concurrent.futures import ThreadPoolExecutor -# Define Modal App +import modal + app = modal.App("swesmith-upload-hf") vol = modal.Volume.from_name("swesmith-bug-gen") +image = modal.Image.debian_slim().pip_install("datasets", "huggingface_hub") -# Define an image with necessary dependencies -# We need datasets and huggingface_hub for the remote push -image = modal.Image.debian_slim().pip_install("tqdm", "datasets", "huggingface_hub") +REQUIRED_KEYS = [ + "instance_id", + "patch", + "FAIL_TO_PASS", + "PASS_TO_PASS", + "image_name", + "repo", +] +ISSUE_MODEL_KEY = "portkey/gpt-5-mini" -def _process_single_task(task, issue_gen_dir, repo_id): - """Helper to process a single task instance""" +def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> dict: + """Attach issue text and normalize fields for a task instance.""" instance_id = task.get("instance_id") if not instance_id: return task @@ -25,129 +29,105 @@ def _process_single_task(task, issue_gen_dir, repo_id): task["problem_statement"] = "" issue_file = issue_gen_dir / repo_id / f"{instance_id}.json" + if not issue_file.exists(): + return task - if issue_file.exists(): - try: - with open(issue_file, "r") as f_issue: - issue_data = json.load(f_issue) - resp = issue_data.get("responses", {}) - if "portkey/gpt-5-mini" in resp: - content = resp["portkey/gpt-5-mini"] - if isinstance(content, list) and len(content) > 0: - task["problem_statement"] = content[0] - except Exception: - pass - return task - - -@app.function(image=image, volumes={"/data": vol}, timeout=1200, max_containers=10) -def process_repo(task_filename: str, language: str = "javascript"): - """(Same as before)""" - import concurrent.futures - - task_file_path = Path(f"/data/{language}/task_insts/{task_filename}") - issue_gen_dir = Path(f"/data/{language}/issue_gen") - - tasks_out = [] - - if not task_file_path.exists(): - print(f"File not found: {task_file_path}") - return [] - - repo_id = task_file_path.stem try: - with open(task_file_path, "r") as f: - tasks = json.load(f) - - print(f"[{repo_id}] Processing {len(tasks)} tasks...") - - with concurrent.futures.ThreadPoolExecutor(max_workers=10) as executor: - futures = [ - executor.submit(_process_single_task, task, issue_gen_dir, repo_id) - for task in tasks - ] - - for future in concurrent.futures.as_completed(futures): - tasks_out.append(future.result()) + issue_data = json.loads(issue_file.read_text()) + except Exception: + return task - except Exception as e: - print(f"[{repo_id}] Error: {e}") + responses = issue_data.get("responses", {}) + content = responses.get(ISSUE_MODEL_KEY) + if isinstance(content, list) and content: + task["problem_statement"] = content[0] - return tasks_out + return task @app.function( - image=image, secrets=[modal.Secret.from_name("john-hf-secret")], timeout=1800 + image=image, + volumes={"/data": vol}, + secrets=[modal.Secret.from_name("john-hf-secret")], + timeout=10800, ) -def push_to_hf_remote(all_tasks: list, target_dataset: str): +def upload_from_volume_remote(target_dataset: str, language: str = "javascript") -> dict: + """Robust end-to-end upload: volume -> issue merge -> validate -> HF push.""" import os - from datasets import load_dataset, Dataset, concatenate_datasets - from huggingface_hub import create_repo, HfApi + from datasets import Dataset + from huggingface_hub import create_repo - print(f"Starting remote upload to {target_dataset}") token = os.environ.get("HF_TOKEN") if not token: - print("WARNING: HF_TOKEN not found in environment variables!") - else: - print("HF_TOKEN found in environment variables.") - - # Validation - required_keys = [ - "instance_id", - "patch", - "FAIL_TO_PASS", - "PASS_TO_PASS", - "image_name", - "repo", - ] - print("Validating keys...") + return {"success": False, "error": "HF_TOKEN not found in environment"} + + task_insts_dir = Path(f"/data/{language}/task_insts") + issue_gen_dir = Path(f"/data/{language}/issue_gen") + if not task_insts_dir.exists(): + return {"success": False, "error": f"Missing task_insts dir: {task_insts_dir}"} + + task_files = sorted(task_insts_dir.glob("*.json")) + if not task_files: + return {"success": False, "error": f"No task files in {task_insts_dir}"} + + print(f"Found {len(task_files)} task files in volume.") + cleaned_tasks = [] - for task in all_tasks: - valid = True - for k in required_keys: - if k not in task: - print(f"Missing key {k} in task {task.get('instance_id')}. Skipping.") - valid = False - break - if not valid: + skipped_missing_keys = 0 + repos_processed = 0 + repos_failed = 0 + + for task_file in task_files: + repo_id = task_file.stem + try: + tasks = json.loads(task_file.read_text()) + except Exception as e: + repos_failed += 1 + print(f"[{repo_id}] Failed to read tasks: {e}") continue - if "problem_statement" not in task: - task["problem_statement"] = "" - cleaned_tasks.append(task) + repos_processed += 1 + print(f"[{repo_id}] Processing {len(tasks)} tasks...") + + for task in tasks: + task = _attach_issue_statement(task, issue_gen_dir, repo_id) + if all(k in task for k in REQUIRED_KEYS): + if "problem_statement" not in task: + task["problem_statement"] = "" + cleaned_tasks.append(task) + else: + skipped_missing_keys += 1 + + print(f"[{repo_id}] Done") + + if not cleaned_tasks: + return { + "success": False, + "error": "No valid tasks to upload", + "repos_processed": repos_processed, + "repos_failed": repos_failed, + "skipped_missing_keys": skipped_missing_keys, + } print(f"Valid tasks: {len(cleaned_tasks)}") - local_dataset = Dataset.from_list(cleaned_tasks) - local_ids = set(local_dataset["instance_id"]) + dataset = Dataset.from_list(cleaned_tasks) - final_dataset = local_dataset + print(f"Ensuring dataset repo exists: {target_dataset}") + create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True) - # Try to ensure repo exists - print(f"Ensuring repository {target_dataset} exists...") - try: - create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True) - except Exception as e: - print( - f"Warning: create_repo failed: {e}. Attempting upload anyway (might fail if permissions wrong)." - ) - - # print(f"Loading target dataset: {target_dataset}") - # try: - # sweb = load_dataset(target_dataset, split="train", token=token) - # print(f"Existing HF dataset size: {len(sweb)}") - - # sweb_filtered = sweb.filter(lambda x: x["instance_id"] not in local_ids) - # print(f"Would override {len(sweb) - len(sweb_filtered)} instances") - - # final_dataset = concatenate_datasets([sweb_filtered, local_dataset]) - # except Exception as e: - # print(f"Note: Could not load existing dataset '{target_dataset}' (it might be new or empty). Error: {e}") - # print("Proceeding with creating a new dataset from local tasks.") - - print(f"Pushing {len(final_dataset)} instances to {target_dataset}...") - final_dataset.push_to_hub(target_dataset, token=token) + print(f"Pushing {len(dataset)} instances to {target_dataset}...") + dataset.push_to_hub(target_dataset, token=token) print("Remote push finished successfully.") + return { + "success": True, + "target_dataset": target_dataset, + "repos_processed": repos_processed, + "repos_failed": repos_failed, + "instances_uploaded": len(cleaned_tasks), + "skipped_missing_keys": skipped_missing_keys, + } + @app.local_entrypoint() def main( @@ -155,28 +135,17 @@ def main( language: str = "javascript", push: bool = False, ): - print("Listing task files from Modal volume...") - try: - entries = vol.listdir(f"{language}/task_insts") - filenames = [e.path.split("/")[-1] for e in entries if e.path.endswith(".json")] - except Exception as e: - print(f"Error listing volume: {e}") - return - - print(f"Found {len(filenames)} files. Starting parallel processing...") - - all_tasks = [] - for repo_tasks in process_repo.map(filenames, [language] * len(filenames)): - all_tasks.extend(repo_tasks) - - print(f"Fetched total {len(all_tasks)} task instances.") - if not push: - confirm = input(f"Ready to push to HF. Proceed? (y/n) ").lower() + confirm = input( + f"Run remote upload to '{target_dataset}' for language '{language}'? (y/n) " + ).lower() if confirm != "y": print("Aborting.") return - print("Launching remote push job...") - push_to_hf_remote.remote(all_tasks, target_dataset) - print("Done!") + print("Starting robust remote upload...") + result = upload_from_volume_remote.remote(target_dataset, language) + print(json.dumps(result, indent=2)) + + if not result.get("success"): + raise RuntimeError(result.get("error", "Upload failed")) From 3b9fe1defd158204f8255451cee152c896924655 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Thu, 12 Feb 2026 06:30:21 +0000 Subject: [PATCH 24/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/upload_tasks_to_hf_modal.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py index 0953903f..d23cfc0f 100644 --- a/scripts/upload_tasks_to_hf_modal.py +++ b/scripts/upload_tasks_to_hf_modal.py @@ -51,7 +51,9 @@ def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> di secrets=[modal.Secret.from_name("john-hf-secret")], timeout=10800, ) -def upload_from_volume_remote(target_dataset: str, language: str = "javascript") -> dict: +def upload_from_volume_remote( + target_dataset: str, language: str = "javascript" +) -> dict: """Robust end-to-end upload: volume -> issue merge -> validate -> HF push.""" import os from datasets import Dataset From 0a61157b65cd0cf0a35353189d4ca0dbae9fc9d8 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 27 Feb 2026 18:52:17 -0800 Subject: [PATCH 25/32] Support backfilling patch diffs --- scripts/backfill_patchdiff_modal.py | 111 ++++++++++++++++++++++++++++ 1 file changed, 111 insertions(+) create mode 100644 scripts/backfill_patchdiff_modal.py diff --git a/scripts/backfill_patchdiff_modal.py b/scripts/backfill_patchdiff_modal.py new file mode 100644 index 00000000..06ca5fc8 --- /dev/null +++ b/scripts/backfill_patchdiff_modal.py @@ -0,0 +1,111 @@ +import json +from pathlib import Path + +import modal + +APP_NAME = "swesmith-backfill-patchdiff" +VOLUME_NAME = "swesmith-bug-gen" +LOGS_MOUNT_PATH = "/logs" + +app = modal.App(APP_NAME) +logs_volume = modal.Volume.from_name(VOLUME_NAME) + + +@app.function( + timeout=3600, + volumes={LOGS_MOUNT_PATH: logs_volume}, + max_containers=20, +) +def backfill_repo(repo_patch_file: str, language: str = "java") -> dict: + bug_file = Path(LOGS_MOUNT_PATH) / language / "bug_gen" / repo_patch_file + repo_id = repo_patch_file.replace("_all_patches.json", "") + run_val_repo_dir = Path(LOGS_MOUNT_PATH) / language / "run_validation" / repo_id + + if not bug_file.exists(): + return {"repo_id": repo_id, "status": "skipped", "reason": "missing bug_gen file"} + + if not run_val_repo_dir.exists(): + return {"repo_id": repo_id, "status": "skipped", "reason": "missing run_validation repo dir"} + + try: + patches = json.loads(bug_file.read_text()) + except Exception as e: + return {"repo_id": repo_id, "status": "error", "error": f"parse patches: {e}"} + + eligible = 0 + written = 0 + missing_instance = 0 + missing_patch = 0 + + for patch in patches: + instance_id = patch.get("instance_id") + if not instance_id: + continue + + instance_dir = run_val_repo_dir / instance_id + if not instance_dir.exists(): + missing_instance += 1 + continue + + eligible += 1 + patch_text = patch.get("patch") + if not patch_text: + missing_patch += 1 + continue + + (instance_dir / "patch.diff").write_text(patch_text) + written += 1 + + return { + "repo_id": repo_id, + "status": "ok", + "eligible": eligible, + "written": written, + "missing_instance": missing_instance, + "missing_patch": missing_patch, + } + + +@app.local_entrypoint() +def main(language: str = "java"): + entries = logs_volume.listdir(f"{language}/bug_gen") + patch_files = [e.path.split("/")[-1] for e in entries if e.path.endswith("_all_patches.json")] + + print(f"Found {len(patch_files)} patch files in {language}/bug_gen") + + total_eligible = 0 + total_written = 0 + total_missing_instance = 0 + total_missing_patch = 0 + ok = 0 + skipped = 0 + failed = 0 + + for i, result in enumerate( + backfill_repo.map(patch_files, [language] * len(patch_files), order_outputs=False), + start=1, + ): + status = result.get("status") + repo_id = result.get("repo_id", "unknown") + if status == "ok": + ok += 1 + total_eligible += result.get("eligible", 0) + total_written += result.get("written", 0) + total_missing_instance += result.get("missing_instance", 0) + total_missing_patch += result.get("missing_patch", 0) + if i % 10 == 0 or result.get("written", 0) > 0: + print(f"[{i}/{len(patch_files)}] {repo_id}: wrote {result.get('written', 0)}") + elif status == "skipped": + skipped += 1 + else: + failed += 1 + print(f"[{i}/{len(patch_files)}] {repo_id}: ERROR {result.get('error')}") + + print("\nBackfill summary") + print(f" repos_ok: {ok}") + print(f" repos_skipped: {skipped}") + print(f" repos_failed: {failed}") + print(f" eligible_instances: {total_eligible}") + print(f" patchdiff_written: {total_written}") + print(f" missing_instance: {total_missing_instance}") + print(f" missing_patch: {total_missing_patch}") From b2893eeb78c9309422637ef6bbaa7c957f7cb24f Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Fri, 27 Feb 2026 18:52:45 -0800 Subject: [PATCH 26/32] Relax gather timeout to 1 hour --- scripts/bug_gen_modal.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 6a0a132f..9a2bf102 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -1643,7 +1643,8 @@ def print_summary(results: list[dict], repos_count: int): @app.function( image=generator_image, secrets=[modal.Secret.from_name("GITHUB_TOKEN")], - timeout=MODAL_TIMEOUT, + # Gather can push hundreds of branches for large repos; 10 minutes is too low. + timeout=60 * MINUTES, volumes={LOGS_MOUNT_PATH: logs_volume}, ) def gather_remote( From e36f583aec03f0ea62a12abb0b199197481811f4 Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Sun, 8 Mar 2026 22:11:06 +0000 Subject: [PATCH 27/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/backfill_patchdiff_modal.py | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/scripts/backfill_patchdiff_modal.py b/scripts/backfill_patchdiff_modal.py index 06ca5fc8..114d5040 100644 --- a/scripts/backfill_patchdiff_modal.py +++ b/scripts/backfill_patchdiff_modal.py @@ -22,10 +22,18 @@ def backfill_repo(repo_patch_file: str, language: str = "java") -> dict: run_val_repo_dir = Path(LOGS_MOUNT_PATH) / language / "run_validation" / repo_id if not bug_file.exists(): - return {"repo_id": repo_id, "status": "skipped", "reason": "missing bug_gen file"} + return { + "repo_id": repo_id, + "status": "skipped", + "reason": "missing bug_gen file", + } if not run_val_repo_dir.exists(): - return {"repo_id": repo_id, "status": "skipped", "reason": "missing run_validation repo dir"} + return { + "repo_id": repo_id, + "status": "skipped", + "reason": "missing run_validation repo dir", + } try: patches = json.loads(bug_file.read_text()) @@ -69,7 +77,9 @@ def backfill_repo(repo_patch_file: str, language: str = "java") -> dict: @app.local_entrypoint() def main(language: str = "java"): entries = logs_volume.listdir(f"{language}/bug_gen") - patch_files = [e.path.split("/")[-1] for e in entries if e.path.endswith("_all_patches.json")] + patch_files = [ + e.path.split("/")[-1] for e in entries if e.path.endswith("_all_patches.json") + ] print(f"Found {len(patch_files)} patch files in {language}/bug_gen") @@ -82,7 +92,9 @@ def main(language: str = "java"): failed = 0 for i, result in enumerate( - backfill_repo.map(patch_files, [language] * len(patch_files), order_outputs=False), + backfill_repo.map( + patch_files, [language] * len(patch_files), order_outputs=False + ), start=1, ): status = result.get("status") @@ -94,7 +106,9 @@ def main(language: str = "java"): total_missing_instance += result.get("missing_instance", 0) total_missing_patch += result.get("missing_patch", 0) if i % 10 == 0 or result.get("written", 0) > 0: - print(f"[{i}/{len(patch_files)}] {repo_id}: wrote {result.get('written', 0)}") + print( + f"[{i}/{len(patch_files)}] {repo_id}: wrote {result.get('written', 0)}" + ) elif status == "skipped": skipped += 1 else: From 0ea5927c36cab069eb86cab87c276bce53502cd6 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 9 Mar 2026 11:13:42 -0700 Subject: [PATCH 28/32] Add modal helpers for issue-task uploads --- scripts/issue_gen_redo_existing_modal.py | 97 ++++++++++++++++++++++++ scripts/overwrite_and_dedup_tasks.py | 84 ++++++++++++++++++++ scripts/upload_tasks_to_hf_modal.py | 40 +++------- 3 files changed, 193 insertions(+), 28 deletions(-) create mode 100644 scripts/issue_gen_redo_existing_modal.py create mode 100644 scripts/overwrite_and_dedup_tasks.py diff --git a/scripts/issue_gen_redo_existing_modal.py b/scripts/issue_gen_redo_existing_modal.py new file mode 100644 index 00000000..2d7445eb --- /dev/null +++ b/scripts/issue_gen_redo_existing_modal.py @@ -0,0 +1,97 @@ +import json +import os +from pathlib import Path + +import modal + +from scripts.bug_gen_modal import generator_image + +VOLUME_NAME = "swesmith-bug-gen" +LOGS_MOUNT_PATH = "/logs" + +app = modal.App("issue-gen-redo-existing") +logs_volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True, version=2) + + +@app.function( + image=generator_image, + volumes={LOGS_MOUNT_PATH: logs_volume}, + timeout=3600, + secrets=[ + modal.Secret.from_name("GITHUB_TOKEN"), + modal.Secret.from_name("PORTKEY_API_KEY"), + ], +) +def redo_issue_gen_remote( + repo: str, + language: str = "cpp", + config: str = "configs/issue_gen/ig_v2.yaml", + workers: int = 8, +) -> dict: + from swesmith.issue_gen.generate import IssueGen + + volume_root = Path(LOGS_MOUNT_PATH) / language + task_insts_dir = volume_root / "task_insts" + + task_insts_file = None + repo_sanitized = repo.replace("/", "__") + if task_insts_dir.exists(): + for filename in os.listdir(task_insts_dir): + if filename == f"{repo_sanitized}.json" or ( + filename.startswith(f"{repo_sanitized}.") and filename.endswith(".json") + ): + task_insts_file = task_insts_dir / filename + break + + if not task_insts_file or not task_insts_file.exists(): + return { + "success": False, + "repo": repo, + "error": "No task instances file found", + } + + local_logs = Path("/root/logs") + local_logs.mkdir(parents=True, exist_ok=True) + for subdir in ["task_insts", "run_validation", "issue_gen"]: + local_subdir = local_logs / subdir + volume_subdir = volume_root / subdir + volume_subdir.mkdir(parents=True, exist_ok=True) + try: + if local_subdir.exists() or local_subdir.is_symlink(): + local_subdir.unlink() + local_subdir.symlink_to(volume_subdir) + except FileExistsError: + pass + + issue_gen = IssueGen( + dataset_path=str(task_insts_file), + config_file=Path(config), + workers=workers, + redo_existing=True, + ) + issue_gen.run() + + ig_file = task_insts_file.parent / f"{task_insts_file.stem}__ig_llm.json" + issue_count = 0 + if ig_file.exists(): + data = json.loads(ig_file.read_text()) + issue_count = sum(1 for row in data if (row.get("problem_statement") or "").strip()) + + return { + "success": True, + "repo": repo, + "task_insts_file": str(task_insts_file), + "ig_file": str(ig_file), + "issue_count": issue_count, + } + + +@app.local_entrypoint() +def main( + repo: str, + language: str = "cpp", + config: str = "configs/issue_gen/ig_v2.yaml", + workers: int = 8, +): + result = redo_issue_gen_remote.remote(repo, language, config, workers) + print(json.dumps(result, indent=2)) diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py new file mode 100644 index 00000000..96b991c9 --- /dev/null +++ b/scripts/overwrite_and_dedup_tasks.py @@ -0,0 +1,84 @@ +""" +Modal script to filter and overwrite SWE-smith datasets on Hugging Face. + +This variant intentionally does NOT do task aggregation or issue generation. +It only: +- Loads a source HF dataset. +- Filters out rows with empty `problem_statement`. +- Pushes the filtered dataset to a target HF dataset, overwriting existing contents. +""" + +import os + +import modal +from datasets import Dataset, DatasetDict, load_dataset +from huggingface_hub import create_repo + +app = modal.App("swesmith-overwrite-hf") +image = modal.Image.debian_slim().pip_install("datasets", "huggingface_hub") + + +@app.function( + image=image, + secrets=[modal.Secret.from_name("john-hf-secret")], + timeout=10800, +) +def filter_and_overwrite_remote( + source_dataset: str = "SWE-bench/SWE-smith-ts", + target_dataset: str = "SWE-bench/SWE-smith-ts", + source_split: str = "train", +) -> dict: + """Filter source_dataset for non-empty problem_statement and push to target_dataset.""" + token = os.environ.get("HF_TOKEN") + if not token: + return {"success": False, "error": "HF_TOKEN not found in environment"} + + print(f"Loading source dataset: {source_dataset} split={source_split}") + ds = load_dataset(source_dataset, split=source_split) + print(f"Source rows: {len(ds)}") + + filtered = ds.filter(lambda row: bool(str(row.get("problem_statement") or "").strip())) + + print(f"Filtered rows (non-empty problem_statement): {len(filtered)}") + print(f"Dropped rows: {len(ds) - len(filtered)}") + + create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True) + + DatasetDict({"train": filtered}).push_to_hub(target_dataset, token=token) + + return { + "success": True, + "source_dataset": source_dataset, + "target_dataset": target_dataset, + "source_split": source_split, + "source_rows": len(ds), + "kept_rows": len(filtered), + "dropped_rows": len(ds) - len(filtered), + } + + +@app.local_entrypoint() +def main( + source_dataset: str = "SWE-bench/SWE-smith-ts", + target_dataset: str = "SWE-bench/SWE-smith-ts", + source_split: str = "train", + push: bool = False, +): + if not push: + confirm = input( + f"Overwrite '{target_dataset}' from '{source_dataset}' ({source_split}) with non-empty problem_statement? (y/n) " + ).lower() + if confirm != "y": + print("Aborting.") + return + + print("Starting remote filter-and-overwrite...") + result = filter_and_overwrite_remote.remote( + source_dataset=source_dataset, + target_dataset=target_dataset, + source_split=source_split, + ) + print(result) + + if not result.get("success"): + raise RuntimeError(result.get("error", "Upload failed")) diff --git a/scripts/upload_tasks_to_hf_modal.py b/scripts/upload_tasks_to_hf_modal.py index d23cfc0f..3f90cb3d 100644 --- a/scripts/upload_tasks_to_hf_modal.py +++ b/scripts/upload_tasks_to_hf_modal.py @@ -18,29 +18,13 @@ ISSUE_MODEL_KEY = "portkey/gpt-5-mini" -def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> dict: - """Attach issue text and normalize fields for a task instance.""" - instance_id = task.get("instance_id") - if not instance_id: - return task - +def _normalize_task(task: dict) -> dict: + """Normalize fields for a task instance before upload.""" if "image_name" in task and ".architecture." in task["image_name"]: task["image_name"] = task["image_name"].replace(".architecture", "") - task["problem_statement"] = "" - issue_file = issue_gen_dir / repo_id / f"{instance_id}.json" - if not issue_file.exists(): - return task - - try: - issue_data = json.loads(issue_file.read_text()) - except Exception: - return task - - responses = issue_data.get("responses", {}) - content = responses.get(ISSUE_MODEL_KEY) - if isinstance(content, list) and content: - task["problem_statement"] = content[0] + if "problem_statement" not in task: + task["problem_statement"] = "" return task @@ -54,7 +38,7 @@ def _attach_issue_statement(task: dict, issue_gen_dir: Path, repo_id: str) -> di def upload_from_volume_remote( target_dataset: str, language: str = "javascript" ) -> dict: - """Robust end-to-end upload: volume -> issue merge -> validate -> HF push.""" + """Upload issue-generated task instances from the Modal volume to HF.""" import os from datasets import Dataset from huggingface_hub import create_repo @@ -64,15 +48,17 @@ def upload_from_volume_remote( return {"success": False, "error": "HF_TOKEN not found in environment"} task_insts_dir = Path(f"/data/{language}/task_insts") - issue_gen_dir = Path(f"/data/{language}/issue_gen") if not task_insts_dir.exists(): return {"success": False, "error": f"Missing task_insts dir: {task_insts_dir}"} - task_files = sorted(task_insts_dir.glob("*.json")) + task_files = sorted(task_insts_dir.glob("*__ig_llm.json")) if not task_files: - return {"success": False, "error": f"No task files in {task_insts_dir}"} + return { + "success": False, + "error": f"No __ig_llm task files in {task_insts_dir}", + } - print(f"Found {len(task_files)} task files in volume.") + print(f"Found {len(task_files)} __ig_llm task files in volume.") cleaned_tasks = [] skipped_missing_keys = 0 @@ -92,10 +78,8 @@ def upload_from_volume_remote( print(f"[{repo_id}] Processing {len(tasks)} tasks...") for task in tasks: - task = _attach_issue_statement(task, issue_gen_dir, repo_id) + task = _normalize_task(task) if all(k in task for k in REQUIRED_KEYS): - if "problem_statement" not in task: - task["problem_statement"] = "" cleaned_tasks.append(task) else: skipped_missing_keys += 1 From 3ce0bd2260c7fe08fe1cf87d52e1e8b939ce14be Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Mon, 9 Mar 2026 18:17:10 +0000 Subject: [PATCH 29/32] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- scripts/issue_gen_redo_existing_modal.py | 4 +++- scripts/overwrite_and_dedup_tasks.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/scripts/issue_gen_redo_existing_modal.py b/scripts/issue_gen_redo_existing_modal.py index 2d7445eb..be015bdd 100644 --- a/scripts/issue_gen_redo_existing_modal.py +++ b/scripts/issue_gen_redo_existing_modal.py @@ -75,7 +75,9 @@ def redo_issue_gen_remote( issue_count = 0 if ig_file.exists(): data = json.loads(ig_file.read_text()) - issue_count = sum(1 for row in data if (row.get("problem_statement") or "").strip()) + issue_count = sum( + 1 for row in data if (row.get("problem_statement") or "").strip() + ) return { "success": True, diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py index 96b991c9..51a250e8 100644 --- a/scripts/overwrite_and_dedup_tasks.py +++ b/scripts/overwrite_and_dedup_tasks.py @@ -37,7 +37,9 @@ def filter_and_overwrite_remote( ds = load_dataset(source_dataset, split=source_split) print(f"Source rows: {len(ds)}") - filtered = ds.filter(lambda row: bool(str(row.get("problem_statement") or "").strip())) + filtered = ds.filter( + lambda row: bool(str(row.get("problem_statement") or "").strip()) + ) print(f"Filtered rows (non-empty problem_statement): {len(filtered)}") print(f"Dropped rows: {len(ds) - len(filtered)}") From 172ca185f251708afa60b0e0b8b49d3dd443edae Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 9 Mar 2026 11:20:36 -0700 Subject: [PATCH 30/32] Fix Ruff issues in modal scripts --- scripts/bug_gen_modal.py | 3 +-- scripts/overwrite_and_dedup_tasks.py | 2 +- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 3c3bd37d..4467390b 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -1710,7 +1710,6 @@ def gather_remote( ) -> dict: """Run gather.py for a repository to create task instances and push branches.""" import os - import sys import subprocess import traceback from pathlib import Path @@ -2055,7 +2054,7 @@ async def run_issue_gen_phase_async( issue_gen_redo: Whether to regenerate existing issues """ print(f"\n{'=' * 80}") - print(f"ISSUE GENERATION PHASE") + print("ISSUE GENERATION PHASE") print(f"{'=' * 80}") print(f"Processing {len(repos)} repositories...") print(f"Config: {issue_gen_config}") diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py index 96b991c9..200be6c1 100644 --- a/scripts/overwrite_and_dedup_tasks.py +++ b/scripts/overwrite_and_dedup_tasks.py @@ -11,7 +11,7 @@ import os import modal -from datasets import Dataset, DatasetDict, load_dataset +from datasets import DatasetDict, load_dataset from huggingface_hub import create_repo app = modal.App("swesmith-overwrite-hf") From 5424d12919f3f3980b271de2ea7adb9ecbcc4f75 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 9 Mar 2026 11:23:04 -0700 Subject: [PATCH 31/32] Remove unused import in bug gen modal --- scripts/bug_gen_modal.py | 1 - 1 file changed, 1 deletion(-) diff --git a/scripts/bug_gen_modal.py b/scripts/bug_gen_modal.py index 4467390b..b10a2aff 100644 --- a/scripts/bug_gen_modal.py +++ b/scripts/bug_gen_modal.py @@ -1934,7 +1934,6 @@ def issue_gen_remote( workers: Number of workers per repo """ import os - import sys from pathlib import Path # Set up paths From 2de1c697f43ad57888ab7714fa68d00cfa8be107 Mon Sep 17 00:00:00 2001 From: Kevin Li Date: Mon, 9 Mar 2026 11:29:55 -0700 Subject: [PATCH 32/32] Remove unused modal helper scripts --- scripts/issue_gen_redo_existing_modal.py | 99 ------------------------ scripts/overwrite_and_dedup_tasks.py | 86 -------------------- 2 files changed, 185 deletions(-) delete mode 100644 scripts/issue_gen_redo_existing_modal.py delete mode 100644 scripts/overwrite_and_dedup_tasks.py diff --git a/scripts/issue_gen_redo_existing_modal.py b/scripts/issue_gen_redo_existing_modal.py deleted file mode 100644 index be015bdd..00000000 --- a/scripts/issue_gen_redo_existing_modal.py +++ /dev/null @@ -1,99 +0,0 @@ -import json -import os -from pathlib import Path - -import modal - -from scripts.bug_gen_modal import generator_image - -VOLUME_NAME = "swesmith-bug-gen" -LOGS_MOUNT_PATH = "/logs" - -app = modal.App("issue-gen-redo-existing") -logs_volume = modal.Volume.from_name(VOLUME_NAME, create_if_missing=True, version=2) - - -@app.function( - image=generator_image, - volumes={LOGS_MOUNT_PATH: logs_volume}, - timeout=3600, - secrets=[ - modal.Secret.from_name("GITHUB_TOKEN"), - modal.Secret.from_name("PORTKEY_API_KEY"), - ], -) -def redo_issue_gen_remote( - repo: str, - language: str = "cpp", - config: str = "configs/issue_gen/ig_v2.yaml", - workers: int = 8, -) -> dict: - from swesmith.issue_gen.generate import IssueGen - - volume_root = Path(LOGS_MOUNT_PATH) / language - task_insts_dir = volume_root / "task_insts" - - task_insts_file = None - repo_sanitized = repo.replace("/", "__") - if task_insts_dir.exists(): - for filename in os.listdir(task_insts_dir): - if filename == f"{repo_sanitized}.json" or ( - filename.startswith(f"{repo_sanitized}.") and filename.endswith(".json") - ): - task_insts_file = task_insts_dir / filename - break - - if not task_insts_file or not task_insts_file.exists(): - return { - "success": False, - "repo": repo, - "error": "No task instances file found", - } - - local_logs = Path("/root/logs") - local_logs.mkdir(parents=True, exist_ok=True) - for subdir in ["task_insts", "run_validation", "issue_gen"]: - local_subdir = local_logs / subdir - volume_subdir = volume_root / subdir - volume_subdir.mkdir(parents=True, exist_ok=True) - try: - if local_subdir.exists() or local_subdir.is_symlink(): - local_subdir.unlink() - local_subdir.symlink_to(volume_subdir) - except FileExistsError: - pass - - issue_gen = IssueGen( - dataset_path=str(task_insts_file), - config_file=Path(config), - workers=workers, - redo_existing=True, - ) - issue_gen.run() - - ig_file = task_insts_file.parent / f"{task_insts_file.stem}__ig_llm.json" - issue_count = 0 - if ig_file.exists(): - data = json.loads(ig_file.read_text()) - issue_count = sum( - 1 for row in data if (row.get("problem_statement") or "").strip() - ) - - return { - "success": True, - "repo": repo, - "task_insts_file": str(task_insts_file), - "ig_file": str(ig_file), - "issue_count": issue_count, - } - - -@app.local_entrypoint() -def main( - repo: str, - language: str = "cpp", - config: str = "configs/issue_gen/ig_v2.yaml", - workers: int = 8, -): - result = redo_issue_gen_remote.remote(repo, language, config, workers) - print(json.dumps(result, indent=2)) diff --git a/scripts/overwrite_and_dedup_tasks.py b/scripts/overwrite_and_dedup_tasks.py deleted file mode 100644 index c46ba929..00000000 --- a/scripts/overwrite_and_dedup_tasks.py +++ /dev/null @@ -1,86 +0,0 @@ -""" -Modal script to filter and overwrite SWE-smith datasets on Hugging Face. - -This variant intentionally does NOT do task aggregation or issue generation. -It only: -- Loads a source HF dataset. -- Filters out rows with empty `problem_statement`. -- Pushes the filtered dataset to a target HF dataset, overwriting existing contents. -""" - -import os - -import modal -from datasets import DatasetDict, load_dataset -from huggingface_hub import create_repo - -app = modal.App("swesmith-overwrite-hf") -image = modal.Image.debian_slim().pip_install("datasets", "huggingface_hub") - - -@app.function( - image=image, - secrets=[modal.Secret.from_name("john-hf-secret")], - timeout=10800, -) -def filter_and_overwrite_remote( - source_dataset: str = "SWE-bench/SWE-smith-ts", - target_dataset: str = "SWE-bench/SWE-smith-ts", - source_split: str = "train", -) -> dict: - """Filter source_dataset for non-empty problem_statement and push to target_dataset.""" - token = os.environ.get("HF_TOKEN") - if not token: - return {"success": False, "error": "HF_TOKEN not found in environment"} - - print(f"Loading source dataset: {source_dataset} split={source_split}") - ds = load_dataset(source_dataset, split=source_split) - print(f"Source rows: {len(ds)}") - - filtered = ds.filter( - lambda row: bool(str(row.get("problem_statement") or "").strip()) - ) - - print(f"Filtered rows (non-empty problem_statement): {len(filtered)}") - print(f"Dropped rows: {len(ds) - len(filtered)}") - - create_repo(target_dataset, repo_type="dataset", token=token, exist_ok=True) - - DatasetDict({"train": filtered}).push_to_hub(target_dataset, token=token) - - return { - "success": True, - "source_dataset": source_dataset, - "target_dataset": target_dataset, - "source_split": source_split, - "source_rows": len(ds), - "kept_rows": len(filtered), - "dropped_rows": len(ds) - len(filtered), - } - - -@app.local_entrypoint() -def main( - source_dataset: str = "SWE-bench/SWE-smith-ts", - target_dataset: str = "SWE-bench/SWE-smith-ts", - source_split: str = "train", - push: bool = False, -): - if not push: - confirm = input( - f"Overwrite '{target_dataset}' from '{source_dataset}' ({source_split}) with non-empty problem_statement? (y/n) " - ).lower() - if confirm != "y": - print("Aborting.") - return - - print("Starting remote filter-and-overwrite...") - result = filter_and_overwrite_remote.remote( - source_dataset=source_dataset, - target_dataset=target_dataset, - source_split=source_split, - ) - print(result) - - if not result.get("success"): - raise RuntimeError(result.get("error", "Upload failed"))