Skip to content

Commit b5f1828

Browse files
committed
fix: implement modified swebench harness evaluation
1 parent 4177e08 commit b5f1828

File tree

4 files changed

+325 additions, -144 deletions (lines changed)

4 files changed

+325 additions, -144 deletions (lines changed)

codegen-examples/examples/swebench_agent_run/run_eval.py

Lines changed: 11 additions & 56 deletions
Original file line number | Diff line number | Diff line change
@@ -1,18 +1,13 @@
11
import asyncio
22
import json
33
import traceback
4-
import uuid
5-
from datetime import datetime
64
from pathlib import Path
7-
8-
import click
5+
import uuid
96
import modal
7+
import click
8+
from datetime import datetime
9+
from codegen.extensions.swebench.utils import SWEBenchDataset, SweBenchExample, get_swe_bench_examples
1010
from codegen.extensions.swebench.report import generate_report
11-
from codegen.extensions.swebench.utils import (
12-
SWEBenchDataset,
13-
SweBenchExample,
14-
get_swe_bench_examples,
15-
)
1611

1712
PREDS_DNAME = Path(__file__).parent / "predictions"
1813
LOG_DIR = Path(__file__).parent / "logs"
@@ -66,26 +61,11 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10):
6661
print("Traceback:")
6762
print("".join(error_info["traceback"]))
6863

69-
results.append(
70-
{
71-
"instance_id": example.instance_id,
72-
"status": "error",
73-
"error_info": error_info,
74-
}
75-
)
64+
results.append({"instance_id": example.instance_id, "status": "error", "error_info": error_info})
7665
else:
7766
if result is None:
7867
print(f"Warning: Null result for {example.instance_id}")
79-
results.append(
80-
{
81-
"instance_id": example.instance_id,
82-
"status": "error",
83-
"error_info": {
84-
"error_type": "NullResult",
85-
"error_message": "Process returned None",
86-
},
87-
}
88-
)
68+
results.append({"instance_id": example.instance_id, "status": "error", "error_info": {"error_type": "NullResult", "error_message": "Process returned None"}})
8969
else:
9070
results.append(result)
9171

@@ -101,24 +81,14 @@ async def process_batch(examples: list[SweBenchExample], batch_size=10):
10181
{
10282
"instance_id": example.instance_id,
10383
"status": "error",
104-
"error_info": {
105-
"error_type": type(e).__name__,
106-
"error_message": str(e),
107-
"traceback": traceback.format_exc(),
108-
"batch_failure": True,
109-
},
84+
"error_info": {"error_type": type(e).__name__, "error_message": str(e), "traceback": traceback.format_exc(), "batch_failure": True},
11085
}
11186
)
11287

11388
return results
11489

11590

116-
async def run_eval(
117-
use_existing_preds: str | None,
118-
dataset: str,
119-
length: int,
120-
instance_id: str | None = None,
121-
):
91+
async def run_eval(use_existing_preds: str | None, dataset: str, length: int, instance_id: str | None = None):
12292
run_id = use_existing_preds or str(uuid.uuid4())
12393
predictions_dir = PREDS_DNAME / f"results_{run_id}"
12494
dataset = SWEBenchDataset(dataset)
@@ -185,25 +155,10 @@ async def run_eval(
185155

186156

187157
@click.command()
188-
@click.option(
189-
"--use-existing-preds",
190-
help="The run ID of the existing predictions to use.",
191-
type=str,
192-
default=None,
193-
)
194-
@click.option(
195-
"--dataset",
196-
help="The dataset to use.",
197-
type=click.Choice([dataset.value for dataset in SWEBenchDataset]),
198-
default=SWEBenchDataset.LITE.value,
199-
)
158+
@click.option("--use-existing-preds", help="The run ID of the existing predictions to use.", type=str, default=None)
159+
@click.option("--dataset", help="The dataset to use.", type=click.Choice([dataset.value for dataset in SWEBenchDataset]), default=SWEBenchDataset.LITE.value)
200160
@click.option("--length", help="The number of examples to process.", type=int, default=10)
201-
@click.option(
202-
"--instance-id",
203-
help="The instance ID of the example to process.",
204-
type=str,
205-
default=None,
206-
)
161+
@click.option("--instance-id", help="The instance ID of the example to process.", type=str, default=None)
207162
def run_eval_command(use_existing_preds, dataset, length, instance_id):
208163
asyncio.run(run_eval(use_existing_preds, dataset, length, instance_id))
209164

0 commit comments

Comments (0)