Skip to content
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 11 additions & 10 deletions benchmarks/commit0/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@
construct_eval_output_dir,
get_default_on_result_writer,
)
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
Expand All @@ -36,7 +36,7 @@
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
from openhands.workspace import APIRemoteWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -188,15 +188,16 @@ def prepare_workspace(
logger.info(f"Using base docker image: {base_docker_image}")

if self.metadata.workspace_type == "docker":
# Build agent-server image from base commit0 image
workspace = DockerDevWorkspace(
base_image=base_docker_image,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
custom_tag = extract_custom_tag(base_docker_image)
suffix = f"-{build_target}" if build_target != "binary" else ""
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
logger.info(
f"Building workspace from {base_docker_image}. This may take a while..."
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
base_image=base_docker_image,
build_target=build_target,
forward_env=forward_env,
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
Expand Down
15 changes: 9 additions & 6 deletions benchmarks/gaia/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
get_default_on_result_writer,
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
from benchmarks.utils.version import SDK_SHORT_SHA
Expand All @@ -47,7 +47,7 @@
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
from openhands.workspace import APIRemoteWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -156,11 +156,14 @@ def prepare_workspace(
logger.info(f"Preparing workspace for instance {instance.id}")

if self.metadata.workspace_type == "docker":
# Use DockerDevWorkspace with base image (same as main branch)
workspace = DockerDevWorkspace(
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-gaia-binary"
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
working_dir="/workspace",
forward_env=forward_env or [],
build_target="binary",
forward_env=forward_env,
)
elif self.metadata.workspace_type == "remote":
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
Expand Down
36 changes: 6 additions & 30 deletions benchmarks/multiswebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.build_utils import ensure_local_image
from benchmarks.utils.console_logging import summarize_instance
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
Expand Down Expand Up @@ -212,36 +212,12 @@ def prepare_workspace(
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
"1",
"true",
"yes",
ensure_local_image(
agent_server_image=agent_server_image,
base_image=official_docker_image,
custom_tag=custom_tag,
target=build_target,
)
logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
if not SKIP_BUILD:
logger.info(
f"Building workspace from {official_docker_image} "
f"for instance {instance.id}. "
"This may take a while...\n"
"You can run benchmarks/multiswebench/build_images.py and set "
"MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
"agent-server image."
)
output = build_image(
base_image=official_docker_image,
target_image=EVAL_AGENT_SERVER_IMAGE,
custom_tag=custom_tag,
target=build_target,
push=False,
)
logger.info(f"Image build output: {output}")
assert output.error is None, f"Image build failed: {output.error}"
if agent_server_image not in output.tags:
raise RuntimeError(
f"Built image tags {output.tags} do not include expected tag "
f"{agent_server_image}"
)

workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
Expand Down
48 changes: 17 additions & 31 deletions benchmarks/swebench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
)
from benchmarks.swebench.config import INFER_DEFAULTS
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.build_utils import ensure_local_image
from benchmarks.utils.console_logging import summarize_instance
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
Expand Down Expand Up @@ -160,38 +160,24 @@ def prepare_workspace(
agent_server_image = base_agent_image

if self.metadata.workspace_type == "docker":
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
if not SKIP_BUILD:
logger.info(
f"Building workspace from {official_docker_image} "
f"for instance {instance.id}. "
"This may take a while...\n"
"You can run benchmarks/swebench/build_images.py and set "
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
"agent-server image."
)
output = build_image(
base_image=official_docker_image,
target_image=EVAL_AGENT_SERVER_IMAGE,
custom_tag=custom_tag,
target=build_target,
push=False,
)
logger.info(f"Image build output: {output}")
assert output.error is None, f"Image build failed: {output.error}"
if base_agent_image not in output.tags:
built = ensure_local_image(
agent_server_image=base_agent_image,
base_image=official_docker_image,
custom_tag=custom_tag,
target=build_target,
)
if built and wrap_needed:
wrapped_result = wrap_image(base_agent_image, push=False)
if wrapped_result.error:
raise RuntimeError(
f"Built image tags {output.tags} do not include expected tag "
f"{base_agent_image}"
"Wrapped image build failed: "
f"{wrapped_result.error}; log={wrapped_result.log_path}"
)
if wrap_needed:
wrapped_result = wrap_image(base_agent_image, push=False)
if wrapped_result.error:
raise RuntimeError(
"Wrapped image build failed: "
f"{wrapped_result.error}; log={wrapped_result.log_path}"
)
elif not built and wrap_needed:
logger.info(
f"Using pre-built image {base_agent_image} "
"(assumed already wrapped)"
)

workspace = DockerWorkspace(
server_image=agent_server_image,
Expand Down
35 changes: 7 additions & 28 deletions benchmarks/swebenchmultimodal/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
)
from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.build_utils import ensure_local_image
from benchmarks.utils.console_logging import summarize_instance
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.conversation import build_event_persistence_callback
Expand Down Expand Up @@ -165,33 +165,12 @@ def prepare_workspace(
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
if not SKIP_BUILD:
logger.info(
f"Building workspace from {official_docker_image} "
f"for instance {instance.id}. "
"This may take a while...\n"
"You can run benchmarks/swebenchmultimodal/build_images.py and set "
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
"agent-server image."
)

output = build_image(
base_image=official_docker_image,
target_image=EVAL_AGENT_SERVER_IMAGE,
custom_tag=custom_tag,
target=build_target,
push=False,
)
logger.info(f"Image build output: {output}")
assert output.error is None, f"Image build failed: {output.error}"
if agent_server_image not in output.tags:
raise RuntimeError(
f"Built image tags {output.tags} do not include expected tag "
f"{agent_server_image}"
)

ensure_local_image(
agent_server_image=agent_server_image,
base_image=official_docker_image,
custom_tag=custom_tag,
target=build_target,
)
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
Expand Down
32 changes: 7 additions & 25 deletions benchmarks/swefficiency/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from benchmarks.swefficiency.config import DOCKER_DEFAULTS, INFER_DEFAULTS
from benchmarks.swefficiency.workspace import ResourceLimitedDockerWorkspace
from benchmarks.utils.args_parser import get_parser
from benchmarks.utils.build_utils import build_image
from benchmarks.utils.build_utils import ensure_local_image
from benchmarks.utils.conversation import build_event_persistence_callback
from benchmarks.utils.critics import create_critic
from benchmarks.utils.dataset import get_dataset
Expand Down Expand Up @@ -208,30 +208,12 @@ def prepare_workspace(
logger.info(f"Agent server image: {agent_server_image}")

if self.metadata.workspace_type == "docker":
# Build agent-server image from base swefficiency image
SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes")
logger.info(f"SKIP_BUILD={SKIP_BUILD}")

if not SKIP_BUILD:
logger.info(
f"Building workspace from {base_docker_image} "
f"for instance {instance.id}. "
"This may take a while..."
)
output = build_image(
base_image=base_docker_image,
target_image=EVAL_AGENT_SERVER_IMAGE,
custom_tag=custom_tag,
target=build_target,
push=False,
)
logger.info(f"Image build output: {output}")
assert output.error is None, f"Image build failed: {output.error}"
if agent_server_image not in output.tags:
raise RuntimeError(
f"Built image tags {output.tags} do not include expected tag "
f"{agent_server_image}"
)
ensure_local_image(
agent_server_image=agent_server_image,
base_image=base_docker_image,
custom_tag=custom_tag,
target=build_target,
)

# Get CPU group for resource limiting
cpu_group = self._acquire_cpu_group()
Expand Down
34 changes: 8 additions & 26 deletions benchmarks/swtbench/run_infer.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
get_default_on_result_writer,
)
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
from benchmarks.utils.llm_config import load_llm_config
from benchmarks.utils.models import (
EvalInstance,
Expand All @@ -31,7 +31,7 @@
from openhands.sdk.workspace import RemoteWorkspace
from openhands.tools.delegate import DelegateTool
from openhands.tools.preset.default import get_default_tools
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
from openhands.workspace import APIRemoteWorkspace


logger = get_logger(__name__)
Expand Down Expand Up @@ -171,30 +171,12 @@ def prepare_workspace(
agent_server_image = (
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
)
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
if not SKIP_BUILD:
logger.info(
f"Building workspace from {official_docker_image} "
f"for instance {instance.id}. "
"This may take a while...\n"
"You can run benchmarks/swtbench/build_images.py and set "
"SKIP_BUILD=1 to skip building and use pre-built "
"agent-server image."
)
# For SWT-bench, we use DockerDevWorkspace with base_image
workspace = DockerDevWorkspace(
base_image=official_docker_image,
working_dir="/workspace",
target=build_target,
forward_env=forward_env or [],
)
else:
workspace = DockerWorkspace(
server_image=agent_server_image,
working_dir="/workspace",
forward_env=forward_env or [],
)
workspace = create_docker_workspace(
agent_server_image=agent_server_image,
base_image=official_docker_image,
build_target=build_target,
forward_env=forward_env,
)
elif self.metadata.workspace_type == "remote":
runtime_api_key = os.getenv("RUNTIME_API_KEY")
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
Expand Down
40 changes: 39 additions & 1 deletion benchmarks/utils/build_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@
maybe_reset_buildkit,
)
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
from benchmarks.utils.image_utils import image_exists
from benchmarks.utils.image_utils import image_exists, local_image_exists
from openhands.agent_server.docker.build import BuildOptions, TargetType, build
from openhands.sdk import get_logger

Expand Down Expand Up @@ -307,6 +307,44 @@ def build_image(
return BuildOutput(base_image=base_image, tags=tags, error=None)


def ensure_local_image(
    agent_server_image: str,
    base_image: str,
    custom_tag: str,
    target: TargetType = "source-minimal",
) -> bool:
    """Make sure ``agent_server_image`` is available, building it when needed.

    The image is built from ``base_image`` (without pushing) unless
    ``local_image_exists`` already reports it. Setting the ``FORCE_BUILD``
    environment variable to ``1``, ``true`` or ``yes`` (case-insensitive)
    bypasses that lookup and always rebuilds.

    Args:
        agent_server_image: Full tag the caller expects to run afterwards.
        base_image: Image the agent-server image is built from.
        custom_tag: Custom tag forwarded to ``build_image``.
        target: Build target forwarded to ``build_image``.

    Returns:
        True when a build was performed, False when the existing image
        was reused.

    Raises:
        RuntimeError: If the build reports an error, or if the tags it
            produced do not include ``agent_server_image``.
    """
    truthy_values = ("1", "true", "yes")
    rebuild_requested = os.getenv("FORCE_BUILD", "0").lower() in truthy_values

    # The existence check is only consulted when no rebuild was forced, so a
    # forced rebuild never queries the image store.
    if rebuild_requested:
        logger.info(f"FORCE_BUILD set, building image from {base_image}...")
    elif local_image_exists(agent_server_image):
        logger.info(f"Using pre-built image {agent_server_image}")
        return False
    else:
        logger.info(f"Building image from {base_image}...")

    result = build_image(
        base_image=base_image,
        target_image=EVAL_AGENT_SERVER_IMAGE,
        custom_tag=custom_tag,
        target=target,
        push=False,
    )
    logger.info(f"Image build output: {result}")

    if result.error is not None:
        raise RuntimeError(f"Image build failed: {result.error}")

    # A successful build is only useful if it produced the exact tag the
    # caller is about to run.
    if agent_server_image in result.tags:
        return True
    raise RuntimeError(
        f"Built image tags {result.tags} do not include expected tag "
        f"{agent_server_image}"
    )


def _build_with_logging(
log_dir: Path,
base_image: str,
Expand Down
Loading
Loading