Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit Hold shift + click to select a range
0d77bc7
Support --gather in bug_gen_modal.py
AlienKevin Jan 16, 2026
1465908
Update --gather to store to /logs/{language}/task_insts
AlienKevin Jan 16, 2026
5d0cf3d
Only write out json if task instances is not empty
AlienKevin Jan 16, 2026
96efde4
Doubled modal sandbox time out to 20 minutes to account for repos tha…
AlienKevin Jan 16, 2026
d065420
feat: parallelize gather.py and fix thread safety
AlienKevin Jan 16, 2026
cd3adc8
Fix gather.py to skip empty commits
AlienKevin Jan 16, 2026
84f8587
Reset MODAL_TIMEOUT back down to 10 minutes now that gather is parall…
AlienKevin Jan 16, 2026
67a8529
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 16, 2026
842650a
Replace slow and stateful git checkout and branch -D with a single st…
AlienKevin Jan 17, 2026
af01d5c
Cache repo locally to avoid rate limits and speed up cloning
AlienKevin Jan 17, 2026
1d6bcd4
Optimize gather with persistent worker repos (~5min total)
AlienKevin Jan 17, 2026
e42a5e2
Flip PASS_TO_FAIL to FAIL_TO_PASS following SWE-bench naming convention
AlienKevin Jan 17, 2026
302b73e
Remove unused shutil import in gather.py
AlienKevin Jan 17, 2026
393aba4
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 17, 2026
1f0601e
First draft for issue gen
AlienKevin Jan 20, 2026
a50e614
Support PortKey for issue gen and switch to gpt-5-mini
AlienKevin Jan 20, 2026
9461c1f
Uncomment gather part in bug_gen_modal.py
AlienKevin Jan 20, 2026
f7f68cb
Add script to upload task instances to Hugging Face
AlienKevin Jan 20, 2026
4976d85
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Jan 20, 2026
2ab21d8
Refactor bug generation phases to use --phases argument instead of --…
AlienKevin Jan 30, 2026
6f514a3
Merge remote-tracking branch 'upstream/main' into kevin/bug-gen-gather
AlienKevin Feb 6, 2026
8979df4
Fix gather patch apply failures in worker repos
AlienKevin Feb 6, 2026
17fe59b
Make HF upload script language-aware for non-JS datasets
AlienKevin Feb 6, 2026
9c58d47
Make Modal HF upload robust with single remote path
AlienKevin Feb 12, 2026
3b9fe1d
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Feb 12, 2026
b13ca66
Merge branch 'main' into kevin/bug-gen-gather
AlienKevin Feb 28, 2026
0a61157
Support backfilling patch diffs
AlienKevin Feb 28, 2026
b2893ee
Relax gather timeout to 1 hour
AlienKevin Feb 28, 2026
6c6469b
Merge upstream main
AlienKevin Feb 28, 2026
f8e9455
Merge remote-tracking branch 'upstream/main' into kevin/bug-gen-gather
AlienKevin Mar 8, 2026
17f5365
Merge remote-tracking branch 'origin/kevin/bug-gen-gather' into kevin…
AlienKevin Mar 8, 2026
e36f583
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 8, 2026
0ea5927
Add modal helpers for issue-task uploads
AlienKevin Mar 9, 2026
12c37d4
Merge remote-tracking branch 'upstream/main' into kevin/bug-gen-gather
AlienKevin Mar 9, 2026
f337083
Merge remote-tracking branch 'origin/kevin/bug-gen-gather' into kevin…
AlienKevin Mar 9, 2026
3ce0bd2
[pre-commit.ci] auto fixes from pre-commit.com hooks
pre-commit-ci[bot] Mar 9, 2026
172ca18
Fix Ruff issues in modal scripts
AlienKevin Mar 9, 2026
2ba2306
Merge remote-tracking branch 'origin/kevin/bug-gen-gather' into kevin…
AlienKevin Mar 9, 2026
5424d12
Remove unused import in bug gen modal
AlienKevin Mar 9, 2026
2de1c69
Remove unused modal helper scripts
AlienKevin Mar 9, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 3 additions & 1 deletion configs/issue_gen/ig_v2.yaml
Original file line number Diff line number Diff line change
@@ -1,4 +1,6 @@
model: anthropic/claude-sonnet-4-20250514
model: portkey/gpt-5-mini
litellm_model_name_override: openai/gpt-5-mini
provider: "@openai"
system: |-
You are a software engineer helping to create a realistic dataset of synthetic GitHub issues.

Expand Down
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ all = [
"matplotlib",
"modal",
"openai",
"portkey-ai",
"pre-commit",
"python-dotenv",
"rich",
Expand Down Expand Up @@ -86,6 +87,7 @@ generate = [
"ghapi",
"jinja2",
"libcst",
"portkey-ai",
"python-dotenv",
"rich",
"swebench",
Expand Down
125 changes: 125 additions & 0 deletions scripts/backfill_patchdiff_modal.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import json
from pathlib import Path

import modal

# Name passed to modal.App below.
APP_NAME = "swesmith-backfill-patchdiff"
# Name of the Modal volume looked up below.
# NOTE(review): presumably the volume the bug-gen pipeline writes to — confirm.
VOLUME_NAME = "swesmith-bug-gen"
# Path at which the volume is mounted for functions that request it
# (see the `volumes=` mapping on backfill_repo).
LOGS_MOUNT_PATH = "/logs"

# Modal app object that the function and local entrypoint in this file attach to.
app = modal.App(APP_NAME)
# Handle to the named volume; used both as a mount and for listing files.
logs_volume = modal.Volume.from_name(VOLUME_NAME)


@app.function(
    timeout=3600,
    volumes={LOGS_MOUNT_PATH: logs_volume},
    max_containers=20,
)
def backfill_repo(repo_patch_file: str, language: str = "java") -> dict:
    """Write ``patch.diff`` files for one repo's already-validated instances.

    Reads ``<LOGS_MOUNT_PATH>/<language>/bug_gen/<repo_patch_file>`` and, for
    every entry whose ``instance_id`` has a directory under
    ``<LOGS_MOUNT_PATH>/<language>/run_validation/<repo_id>/``, writes the
    entry's ``patch`` text to ``<instance_dir>/patch.diff``.

    Args:
        repo_patch_file: File name (not a path) of the patches JSON; the repo
            id is this name with ``_all_patches.json`` removed.
        language: Sub-directory of the logs mount to operate in.

    Returns:
        A dict that always contains ``repo_id`` and ``status``:

        * ``status == "skipped"`` plus ``reason`` when the patches file or the
          run_validation directory for the repo does not exist.
        * ``status == "error"`` plus ``error`` when the patches file cannot be
          read/parsed or does not contain a JSON list.
        * ``status == "ok"`` plus the counters ``eligible``, ``written``,
          ``missing_instance`` and ``missing_patch``.
    """
    logs_root = Path(LOGS_MOUNT_PATH) / language
    bug_file = logs_root / "bug_gen" / repo_patch_file
    repo_id = repo_patch_file.replace("_all_patches.json", "")
    run_val_repo_dir = logs_root / "run_validation" / repo_id

    if not bug_file.exists():
        return {
            "repo_id": repo_id,
            "status": "skipped",
            "reason": "missing bug_gen file",
        }

    if not run_val_repo_dir.exists():
        return {
            "repo_id": repo_id,
            "status": "skipped",
            "reason": "missing run_validation repo dir",
        }

    # Broad catch is deliberate: this is the worker boundary, and any
    # read/decode/parse failure should be reported to the caller as an
    # "error" result rather than raised out of the remote call.
    try:
        patches = json.loads(bug_file.read_text(encoding="utf-8"))
    except Exception as e:
        return {"repo_id": repo_id, "status": "error", "error": f"parse patches: {e}"}

    # Valid JSON of the wrong shape (e.g. an object) would otherwise blow up
    # on `.get` below and abort the caller's whole map; report it instead.
    if not isinstance(patches, list):
        return {
            "repo_id": repo_id,
            "status": "error",
            "error": (
                f"parse patches: expected a JSON list, got {type(patches).__name__}"
            ),
        }

    eligible = 0
    written = 0
    missing_instance = 0
    missing_patch = 0

    for patch in patches:
        # Entries that are not objects, or that carry no instance_id, cannot
        # be matched to a run_validation directory and are ignored.
        instance_id = patch.get("instance_id") if isinstance(patch, dict) else None
        if not instance_id:
            continue

        instance_dir = run_val_repo_dir / instance_id
        if not instance_dir.exists():
            missing_instance += 1
            continue

        eligible += 1
        patch_text = patch.get("patch")
        # A non-string value cannot be written as text; count it together
        # with absent/empty patches instead of raising from write_text.
        if not patch_text or not isinstance(patch_text, str):
            missing_patch += 1
            continue

        # Explicit encoding so the output does not depend on the container's
        # locale settings.
        (instance_dir / "patch.diff").write_text(patch_text, encoding="utf-8")
        written += 1

    return {
        "repo_id": repo_id,
        "status": "ok",
        "eligible": eligible,
        "written": written,
        "missing_instance": missing_instance,
        "missing_patch": missing_patch,
    }


@app.local_entrypoint()
def main(language: str = "java"):
    """Fan ``backfill_repo`` out over every patches file and print a summary.

    Lists ``<language>/bug_gen`` on the logs volume, keeps the entries whose
    path ends in ``_all_patches.json``, maps ``backfill_repo`` over their file
    names, and aggregates the per-repo result dicts as they arrive.

    Args:
        language: Sub-directory of the volume to process.
    """
    suffix = "_all_patches.json"
    patch_files = []
    for entry in logs_volume.listdir(f"{language}/bug_gen"):
        if entry.path.endswith(suffix):
            # Keep only the final path component; backfill_repo expects a
            # bare file name.
            patch_files.append(entry.path.rsplit("/", 1)[-1])
    n_files = len(patch_files)

    print(f"Found {n_files} patch files in {language}/bug_gen")

    repo_counts = {"ok": 0, "skipped": 0, "failed": 0}
    totals = {
        "eligible": 0,
        "written": 0,
        "missing_instance": 0,
        "missing_patch": 0,
    }

    # order_outputs=False: results are consumed in whatever order the map
    # yields them, so `idx` counts results received, not input positions.
    outcomes = backfill_repo.map(
        patch_files, [language] * n_files, order_outputs=False
    )
    for idx, outcome in enumerate(outcomes, start=1):
        status = outcome.get("status")
        repo_id = outcome.get("repo_id", "unknown")

        if status == "skipped":
            repo_counts["skipped"] += 1
            continue

        if status != "ok":
            # Anything that is neither "ok" nor "skipped" counts as a failure.
            repo_counts["failed"] += 1
            print(f"[{idx}/{n_files}] {repo_id}: ERROR {outcome.get('error')}")
            continue

        repo_counts["ok"] += 1
        for key in totals:
            totals[key] += outcome.get(key, 0)

        # Report every tenth result, plus any repo that actually wrote files.
        if idx % 10 == 0 or outcome.get("written", 0) > 0:
            print(f"[{idx}/{n_files}] {repo_id}: wrote {outcome.get('written', 0)}")

    print("\nBackfill summary")
    print(f" repos_ok: {repo_counts['ok']}")
    print(f" repos_skipped: {repo_counts['skipped']}")
    print(f" repos_failed: {repo_counts['failed']}")
    print(f" eligible_instances: {totals['eligible']}")
    print(f" patchdiff_written: {totals['written']}")
    print(f" missing_instance: {totals['missing_instance']}")
    print(f" missing_patch: {totals['missing_patch']}")
Loading