81 changes: 81 additions & 0 deletions examples/evaluation/swe_bench/common.py
import re
from pathlib import Path

import yaml

from rock.logger import init_logger
from rock.sdk.sandbox.client import RunMode, Sandbox
from rock.sdk.sandbox.config import SandboxConfig

UV_VERSION = "0.10.5"
UV_ARCH = "x86_64-unknown-linux-gnu"

SWEBENCH_RESULT_START_MARKER = "SWEBench results starts here"
SWEBENCH_RESULT_END_MARKER = "SWEBench results ends here"
SWEBENCH_PASSED = "PASSED"

logger = init_logger(__name__)

def load_task_config(task_dir: Path) -> dict:
"""Load task configuration from task.yaml."""
task_yaml_path = task_dir / "task.yaml"
if not task_yaml_path.exists():
raise FileNotFoundError(f"task.yaml not found in {task_dir}")

with open(task_yaml_path, encoding="utf-8") as f:
config = yaml.safe_load(f)

return config
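
# For reference: the task.yaml consumed here is expected to provide at least an
# "instruction" key (read by swe_bench_verified_demo.py). A minimal, illustrative
# sketch; other keys depend on the dataset:
#
#   instruction: |
#     <task description handed to the agent>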


async def _install_uv(sandbox: Sandbox, session: str) -> None:
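    # With the pinned defaults above this downloads
    # https://github.com/astral-sh/uv/releases/download/0.10.5/uv-x86_64-unknown-linux-gnu.tar.gz
    # and unpacks the uv/uvx binaries straight into /usr/local/bin.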
uv_install_script_commands = [
f"wget https://github.com/astral-sh/uv/releases/download/{UV_VERSION}/uv-{UV_ARCH}.tar.gz",
f"tar -xzf uv-{UV_ARCH}.tar.gz --strip-components=1 -C /usr/local/bin",
"uv --version", # verify installation
]
for cmd in uv_install_script_commands:
result = await sandbox.arun(cmd, session=session, mode=RunMode.NOHUP)
if result.exit_code != 0:
raise RuntimeError(f"Failed to install uv: {cmd}, output: {result.output}")


async def setup_test_env(
    sandbox: Sandbox, test_folder: Path, test_dir: str, run_tests_script: Path, session: str
) -> bool:
try:
await _install_uv(sandbox, session)

res = await sandbox.fs.upload_dir(test_folder, test_dir)
if res.exit_code != 0:
return False

        res = await sandbox.upload_by_path(run_tests_script, f"{test_dir}/{run_tests_script.name}")
if not res.success:
return False

return True
except Exception as e:
logger.error(f"Failed to setup test environment: {e}")
        raise


def parse_swebench_result(output: str) -> bool:
"""Parse SWEBench test output to determine if the task is resolved.

    Matches the block between 'SWEBench results starts here' and
    'SWEBench results ends here', then checks whether the captured block
    is exactly 'PASSED'.
"""
pattern = rf"{SWEBENCH_RESULT_START_MARKER}\s*(.*?)\s*{SWEBENCH_RESULT_END_MARKER}"
match = re.search(pattern, output, re.DOTALL)
if not match:
return False
return match.group(1).strip() == SWEBENCH_PASSED
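
# Illustrative output fragment that parse_swebench_result treats as resolved
# (the surrounding log text is produced by run-tests.sh):
#
#   SWEBench results starts here
#   PASSED
#   SWEBench results ends here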


async def start_sandbox(swe_task_name: str) -> Sandbox:
"""Start a sandbox instance for evaluation."""
image = f"slimshetty/swebench-verified:sweb.eval.x86_64.{swe_task_name}"
config = SandboxConfig(image=image)
sandbox = Sandbox(config)
await sandbox.start()
return sandbox
16 changes: 16 additions & 0 deletions examples/evaluation/swe_bench/iflow_config.yaml
run_cmd: "iflow -p ${prompt} --yolo"
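# ${prompt} is substituted with the instruction handed to sandbox.agent.run
# (an assumption based on how the demo invokes the agent); --yolo is assumed
# to auto-approve tool actions so the run is non-interactive.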

project_path: "/testbed"

agent_run_timeout: 3600

runtime_env_config:
type: node
custom_install_cmd: "npm i -g @iflow-ai/iflow-cli@latest"
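  # iflow ships via npm, so the node runtime provides the npm used by the
  # install command above.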

env:
IFLOW_API_KEY: "<API_KEY>"
IFLOW_SEARCH_KEY: "<SEARCH_KEY>"
IFLOW_BASE_URL: "<BASE_URL>"
IFLOW_MODEL_NAME: "<MODEL_NAME>"

127 changes: 127 additions & 0 deletions examples/evaluation/swe_bench/swe_bench_verified_demo.py
"""SWE-bench Verified evaluation demo.

This script runs a single SWE-bench Verified task end-to-end:
1. Loads the task configuration (instruction & tests) from a local task directory.
2. Starts a sandbox with the corresponding SWE-bench Docker image.
3. Installs and runs an agent inside the sandbox to resolve the task.
4. Executes the task's test suite and reports whether the fix is correct.

Prerequisites:
- The ROCK admin server must be running (`rock admin start`).
- Fill in the agent configuration in iflow_config.yaml.
- Clone the SWE-bench Verified tasks repository:
* git clone https://github.com/laude-institute/terminal-bench-datasets.git

Usage:
1. Set `tasks_folder` and `task_name` in the __main__ block below.
2. Run:
python -m examples.evaluation.swe_bench.swe_bench_verified_demo
"""

import asyncio
import sys
from pathlib import Path

from examples.evaluation.swe_bench.common import (
    load_task_config,
    parse_swebench_result,
    setup_test_env,
    start_sandbox,
)
from rock.actions.sandbox.request import CreateBashSessionRequest
from rock.actions.sandbox.response import Observation
from rock.logger import init_logger
from rock.sdk.sandbox.client import RunMode, Sandbox

TEST_TIMEOUT_SEC = 3600  # overall timeout (seconds) for running the task's test suite
logger = init_logger(__name__)

async def run_swe_evaluation(sandbox: Sandbox, task_dir: Path, instruction: str, agent_config_path: str) -> bool:
"""Run SWE evaluation on the sandbox."""
task_name = task_dir.name
# 1. Install agent
await sandbox.agent.install(config=agent_config_path)

# 2. Run agent to resolve the task
result = await sandbox.agent.run(instruction)
logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, Agent run result: {result}")

# 3. Setup test env
session = "swe-evaluation"
await sandbox.create_session(
CreateBashSessionRequest(
session=session,
env_enable=True,
env={"UV_PYTHON_INSTALL_MIRROR": "https://registry.npmmirror.com/-/binary/python-build-standalone"},
)
)

test_file_dir = task_dir / "tests"
sandbox_test_dir = "/tests"
is_success = await setup_test_env(
sandbox, test_file_dir, sandbox_test_dir, task_dir / "run-tests.sh", session=session
)
if not is_success:
logger.error("Failed to setup test environment")
return False

# 4. Run tests
logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, Start to run tests")
run_tests_command = f"sh -c 'bash {sandbox_test_dir}/run-tests.sh'"
resp: Observation = await sandbox.arun(
        run_tests_command, session=session, mode=RunMode.NOHUP, wait_timeout=TEST_TIMEOUT_SEC
)
logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, Run tests result: {resp}")

# 5. Parse results
resolve_result = parse_swebench_result(resp.output)
logger.info(f"Task name: {task_name}, sandbox id: {sandbox.sandbox_id}, is_resolved: {resolve_result}")
return resolve_result


async def run_task(task_dir: Path, agent_config_path: str) -> dict:
"""Run evaluation for a single task."""
task_name = task_dir.name

try:
# Load task configuration
task_config = load_task_config(task_dir)
instruction = task_config.get("instruction", "")

if not instruction:
logger.error(f"No instruction found in task.yaml for {task_name}")
return {"task_name": task_name, "status": "failed", "error": "No instruction in task.yaml"}

# Start sandbox
sandbox = await start_sandbox(task_name)

try:
# Run evaluation
resolve_result = await run_swe_evaluation(sandbox, task_dir, instruction, agent_config_path)
logger.info(f"Completed evaluation for task: {task_name}")
return {
"task_name": task_name,
"sandbox_id": sandbox.sandbox_id,
"status": "success",
"resolved": resolve_result,
}
except Exception as e:
logger.error(f"Error running evaluation for {task_name}: {e}")
return {"task_name": task_name, "sandbox_id": sandbox.sandbox_id, "status": "failed", "error": str(e)}
finally:
await sandbox.stop()
except Exception as e:
logger.error(f"Error loading task config for {task_name}: {e}")
return {"task_name": task_name, "status": "failed", "error": str(e)}


if __name__ == "__main__":
cur_dir = Path(__file__).resolve().parent
    agent_config_path = str(cur_dir / "iflow_config.yaml")

# directory containing tasks, from https://github.com/laude-institute/terminal-bench-datasets/tree/main/datasets/swebench-verified
tasks_folder = "path_to_swebench_verified_tasks" # e.g., "/path/to/swebench-verified-tasks"
task_name = "task_name" # task name to run, e.g., "astropy__astropy-12907"

task_dir = Path(tasks_folder) / task_name
if not task_dir.exists():
logger.error(f"Task {task_name} not found")
sys.exit(1)

# Ensure admin server is running before executing
asyncio.run(run_task(task_dir, agent_config_path))