diff --git a/examples/evaluation/swe_bench/common.py b/examples/evaluation/swe_bench/common.py
new file mode 100644
index 000000000..ef8c0cf46
--- /dev/null
+++ b/examples/evaluation/swe_bench/common.py
@@ -0,0 +1,150 @@
+import re
+from pathlib import Path
+
+import yaml
+
+from rock.logger import init_logger
+from rock.sdk.sandbox.client import RunMode, Sandbox
+from rock.sdk.sandbox.config import SandboxConfig
+
+UV_VERSION = "0.10.5"
+UV_ARCH = "x86_64-unknown-linux-gnu"
+
+SWEBENCH_RESULT_START_MARKER = "SWEBench results starts here"
+SWEBENCH_RESULT_END_MARKER = "SWEBench results ends here"
+SWEBENCH_PASSED = "PASSED"
+
+logger = init_logger(__name__)
+
+
+def load_task_config(task_dir: Path) -> dict:
+    """Load the task configuration from ``task.yaml`` inside ``task_dir``.
+
+    Args:
+        task_dir: Directory expected to contain a ``task.yaml`` file.
+
+    Returns:
+        The parsed YAML mapping. An empty ``task.yaml`` yields an empty dict
+        so callers can always call ``.get()`` on the result.
+
+    Raises:
+        FileNotFoundError: If ``task.yaml`` does not exist in ``task_dir``.
+        ValueError: If the top-level YAML value is not a mapping.
+    """
+    task_yaml_path = task_dir / "task.yaml"
+    if not task_yaml_path.exists():
+        raise FileNotFoundError(f"task.yaml not found in {task_dir}")
+
+    with open(task_yaml_path, encoding="utf-8") as f:
+        config = yaml.safe_load(f)
+
+    # yaml.safe_load returns None for an empty document.
+    if config is None:
+        return {}
+    if not isinstance(config, dict):
+        raise ValueError(f"task.yaml in {task_dir} must contain a mapping, got {type(config).__name__}")
+
+    return config
+
+
+async def _install_uv(sandbox: Sandbox, session: str) -> None:
+    """Download the pinned uv release into the sandbox and verify it runs.
+
+    Args:
+        sandbox: Sandbox in which the commands are executed.
+        session: Session name passed to every command.
+
+    Raises:
+        RuntimeError: If any installation command exits with a non-zero code.
+    """
+    uv_install_script_commands = [
+        f"wget https://github.com/astral-sh/uv/releases/download/{UV_VERSION}/uv-{UV_ARCH}.tar.gz",
+        f"tar -xzf uv-{UV_ARCH}.tar.gz --strip-components=1 -C /usr/local/bin",
+        "uv --version",  # verify installation
+    ]
+    for cmd in uv_install_script_commands:
+        result = await sandbox.arun(cmd, session=session, mode=RunMode.NOHUP)
+        if result.exit_code != 0:
+            raise RuntimeError(f"Failed to install uv: {cmd}, output: {result.output}")
+
+
+async def setup_test_env(
+    sandbox: Sandbox, test_folder: Path, test_dir: str, run_tests_scripts: Path, session: str
+) -> bool:
+    """Install uv and upload the tests and the test runner into the sandbox.
+
+    Args:
+        sandbox: Sandbox to prepare.
+        test_folder: Local directory that is uploaded to ``test_dir``.
+        test_dir: Destination directory inside the sandbox.
+        run_tests_scripts: Local path of the test runner script; it is uploaded
+            into ``test_dir`` under its own file name.
+        session: Session used to run the uv installation commands.
+
+    Returns:
+        True if both uploads succeeded, False if either upload reported failure.
+
+    Raises:
+        Exception: Any error raised while installing uv or uploading is logged
+            and re-raised unchanged.
+    """
+    try:
+        await _install_uv(sandbox, session)
+
+        res = await sandbox.fs.upload_dir(test_folder, test_dir)
+        if res.exit_code != 0:
+            logger.error(f"Failed to upload test folder {test_folder} to {test_dir}")
+            return False
+
+        res = await sandbox.upload_by_path(run_tests_scripts, f"{test_dir}/{run_tests_scripts.name}")
+        if not res.success:
+            logger.error(f"Failed to upload test script {run_tests_scripts} to {test_dir}")
+            return False
+
+        return True
+    except Exception as e:
+        logger.error(f"Failed to setup test environment: {e}")
+        raise
+
+
+def parse_swebench_result(output: str) -> bool:
+    """Parse SWEBench test output to determine if the task is resolved.
+
+    Finds the first block between 'SWEBench results starts here' and
+    'SWEBench results ends here' and checks whether its content, with
+    surrounding whitespace removed, is exactly 'PASSED'.
+
+    Args:
+        output: Raw output of the test run.
+
+    Returns:
+        True only when the result block exists and equals 'PASSED'. False when
+        the output is empty, a marker is missing, or the block holds anything
+        else.
+    """
+    if not output:
+        return False
+
+    # Escape the markers so they are always matched literally.
+    start = re.escape(SWEBENCH_RESULT_START_MARKER)
+    end = re.escape(SWEBENCH_RESULT_END_MARKER)
+    match = re.search(rf"{start}\s*(.*?)\s*{end}", output, re.DOTALL)
+    if not match:
+        return False
+    return match.group(1).strip() == SWEBENCH_PASSED
+
+
+async def start_sandbox(swe_task_name: str) -> Sandbox:
+    """Start a sandbox instance for evaluation.
+
+    Args:
+        swe_task_name: Task name used as the suffix of the SWE-bench image tag.
+
+    Returns:
+        The started sandbox.
+    """
+    image = f"slimshetty/swebench-verified:sweb.eval.x86_64.{swe_task_name}"
+    config = SandboxConfig(image=image)
+    sandbox = Sandbox(config)
+    await sandbox.start()
+    return sandbox
diff --git a/examples/evaluation/swe_bench/iflow_config.yaml b/examples/evaluation/swe_bench/iflow_config.yaml
new file mode 100644
index 000000000..dfbe8037f
--- /dev/null
+++ b/examples/evaluation/swe_bench/iflow_config.yaml
@@ -0,0 +1,15 @@
+run_cmd: "iflow -p ${prompt} --yolo"
+
+project_path: "/testbed"
+
+agent_run_timeout: 3600
+
+runtime_env_config:
+  type: node
+  custom_install_cmd: "npm i -g @iflow-ai/iflow-cli@latest"
+
+env:
+  IFLOW_API_KEY: ""
+  IFLOW_SEARCH_KEY: ""
+  IFLOW_BASE_URL: ""
+  IFLOW_MODEL_NAME: ""
diff --git a/examples/evaluation/swe_bench/swe_bench_verified_demo.py b/examples/evaluation/swe_bench/swe_bench_verified_demo.py
new file mode 100644
index 000000000..1c0befa21
--- /dev/null
+++ b/examples/evaluation/swe_bench/swe_bench_verified_demo.py
@@ -0,0 +1,158 @@
+"""SWE-bench Verified evaluation demo.
+
+This script runs a single SWE-bench Verified task end-to-end:
+  1. Loads the task configuration (instruction & tests) from a local task directory.
+  2. Starts a sandbox with the corresponding SWE-bench Docker image.
+  3. Installs and runs an agent inside the sandbox to resolve the task.
+  4. Executes the task's test suite and reports whether the fix is correct.
+
+Prerequisites:
+  - The ROCK admin server must be running (`rock admin start`).
+  - Fill in the agent configuration in iflow_config.yaml.
+  - Clone the SWE-bench Verified tasks repository:
+    * git clone https://github.com/laude-institute/terminal-bench-datasets.git
+
+Usage:
+  1. Set `tasks_folder` and `task_name` in the __main__ block below.
+  2. Run:
+     python -m examples.evaluation.swe_bench.swe_bench_verified_demo
+"""
+
+import asyncio
+import sys
+from pathlib import Path
+
+from examples.evaluation.swe_bench.common import load_task_config, parse_swebench_result, setup_test_env, start_sandbox
+from rock.actions.sandbox.request import CreateBashSessionRequest
+from rock.actions.sandbox.response import Observation
+from rock.logger import init_logger
+from rock.sdk.sandbox.client import RunMode, Sandbox
+
+test_timeout_sec = 3600
+logger = init_logger(__name__)
+
+
+async def run_swe_evaluation(sandbox: Sandbox, task_dir: Path, instruction: str, agent_config_path: str) -> bool:
+    """Run the agent on a task inside the sandbox, then run and grade the tests.
+
+    Args:
+        sandbox: Started sandbox for the task.
+        task_dir: Local task directory containing ``tests/`` and ``run-tests.sh``.
+        instruction: Prompt handed to the agent.
+        agent_config_path: Path of the agent configuration file.
+
+    Returns:
+        True if the test output reports the task as resolved; False if the
+        test environment could not be set up or the tests did not pass.
+    """
+    task_name = task_dir.name
+    # 1. Install agent
+    await sandbox.agent.install(config=agent_config_path)
+
+    # 2. Run agent to resolve the task
+    result = await sandbox.agent.run(instruction)
+    logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, Agent run result: {result}")
+
+    # 3. Setup test env
+    session = "swe-evaluation"
+    await sandbox.create_session(
+        CreateBashSessionRequest(
+            session=session,
+            env_enable=True,
+            env={"UV_PYTHON_INSTALL_MIRROR": "https://registry.npmmirror.com/-/binary/python-build-standalone"},
+        )
+    )
+
+    test_file_dir = task_dir / "tests"
+    sandbox_test_dir = "/tests"
+    is_success = await setup_test_env(
+        sandbox, test_file_dir, sandbox_test_dir, task_dir / "run-tests.sh", session=session
+    )
+    if not is_success:
+        logger.error("Failed to setup test environment")
+        return False
+
+    # 4. Run tests
+    logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, Start to run tests")
+    run_tests_command = f"sh -c 'bash {sandbox_test_dir}/run-tests.sh'"
+    resp: Observation = await sandbox.arun(
+        run_tests_command, session=session, mode=RunMode.NOHUP, wait_timeout=test_timeout_sec
+    )
+    logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, Run tests result: {resp}")
+
+    # 5. Parse results
+    resolve_result = parse_swebench_result(resp.output)
+    logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, is_resolved: {resolve_result}")
+    return resolve_result
+
+
+async def run_task(task_dir: Path, agent_config_path: str) -> dict:
+    """Run evaluation for a single task.
+
+    Args:
+        task_dir: Local task directory; its name is used as the task name.
+        agent_config_path: Path of the agent configuration file.
+
+    Returns:
+        A dict with ``task_name`` and ``status``. On success it also holds
+        ``sandbox_id`` and ``resolved``; on failure it holds ``error`` (and
+        ``sandbox_id`` when the sandbox had been started).
+    """
+    task_name = task_dir.name
+
+    # Load task configuration
+    try:
+        task_config = load_task_config(task_dir)
+    except Exception as e:
+        logger.error(f"Error loading task config for {task_name}: {e}")
+        return {"task_name": task_name, "status": "failed", "error": str(e)}
+
+    instruction = task_config.get("instruction", "")
+    if not instruction:
+        logger.error(f"No instruction found in task.yaml for {task_name}")
+        return {"task_name": task_name, "status": "failed", "error": "No instruction in task.yaml"}
+
+    # Start sandbox; handled separately so the log names the step that failed
+    try:
+        sandbox = await start_sandbox(task_name)
+    except Exception as e:
+        logger.error(f"Error starting sandbox for {task_name}: {e}")
+        return {"task_name": task_name, "status": "failed", "error": str(e)}
+
+    try:
+        # Run evaluation
+        resolve_result = await run_swe_evaluation(sandbox, task_dir, instruction, agent_config_path)
+        logger.info(f"Completed evaluation for task: {task_name}")
+        return {
+            "task_name": task_name,
+            "sandbox_id": sandbox.sandbox_id,
+            "status": "success",
+            "resolved": resolve_result,
+        }
+    except Exception as e:
+        logger.error(f"Error running evaluation for {task_name}: {e}")
+        return {"task_name": task_name, "sandbox_id": sandbox.sandbox_id, "status": "failed", "error": str(e)}
+    finally:
+        try:
+            await sandbox.stop()
+        except Exception as e:
+            # A cleanup failure must not replace the evaluation result.
+            logger.error(f"Error stopping sandbox for {task_name}: {e}")
+
+
+if __name__ == "__main__":
+    cur_dir = Path(__file__).resolve().parent
+    agent_config_path = f"{cur_dir}/iflow_config.yaml"
+
+    # directory containing tasks, from https://github.com/laude-institute/terminal-bench-datasets/tree/main/datasets/swebench-verified
+    tasks_folder = "path_to_swebench_verified_tasks"  # e.g., "/path/to/swebench-verified-tasks"
+    task_name = "task_name"  # task name to run, e.g., "astropy__astropy-12907"
+
+    task_dir = Path(tasks_folder) / task_name
+    if not task_dir.exists():
+        logger.error(f"Task {task_name} not found")
+        sys.exit(1)
+
+    # Ensure admin server is running before executing
+    task_result = asyncio.run(run_task(task_dir, agent_config_path))
+    logger.info(f"Evaluation result: {task_result}")