diff --git a/benchmarks/commit0/run_infer.py b/benchmarks/commit0/run_infer.py index 03ddecb85..36af180a5 100644 --- a/benchmarks/commit0/run_infer.py +++ b/benchmarks/commit0/run_infer.py @@ -24,7 +24,7 @@ construct_eval_output_dir, get_default_on_result_writer, ) -from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.image_utils import create_docker_workspace, image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -36,7 +36,7 @@ from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace +from openhands.workspace import APIRemoteWorkspace logger = get_logger(__name__) @@ -188,15 +188,16 @@ def prepare_workspace( logger.info(f"Using base docker image: {base_docker_image}") if self.metadata.workspace_type == "docker": - # Build agent-server image from base commit0 image - workspace = DockerDevWorkspace( - base_image=base_docker_image, - working_dir="/workspace", - target=build_target, - forward_env=forward_env or [], + custom_tag = extract_custom_tag(base_docker_image) + suffix = f"-{build_target}" if build_target != "binary" else "" + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - logger.info( - f"Building workspace from {base_docker_image}. This may take a while..." 
+ workspace = create_docker_workspace( + agent_server_image=agent_server_image, + base_image=base_docker_image, + build_target=build_target, + forward_env=forward_env, ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") diff --git a/benchmarks/gaia/run_infer.py b/benchmarks/gaia/run_infer.py index cfbf0682c..fa0317a8e 100644 --- a/benchmarks/gaia/run_infer.py +++ b/benchmarks/gaia/run_infer.py @@ -27,7 +27,7 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.image_utils import create_docker_workspace, image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput from benchmarks.utils.version import SDK_SHORT_SHA @@ -47,7 +47,7 @@ from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace +from openhands.workspace import APIRemoteWorkspace logger = get_logger(__name__) @@ -156,11 +156,14 @@ def prepare_workspace( logger.info(f"Preparing workspace for instance {instance.id}") if self.metadata.workspace_type == "docker": - # Use DockerDevWorkspace with base image (same as main branch) - workspace = DockerDevWorkspace( + agent_server_image = ( + f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-gaia-binary" + ) + workspace = create_docker_workspace( + agent_server_image=agent_server_image, base_image="nikolaik/python-nodejs:python3.12-nodejs22", - working_dir="/workspace", - forward_env=forward_env or [], + build_target="binary", + forward_env=forward_env, ) elif self.metadata.workspace_type == "remote": # For workflow, use APIRemoteWorkspace with pre-built GAIA image diff --git a/benchmarks/multiswebench/run_infer.py 
b/benchmarks/multiswebench/run_infer.py index 20b926a61..ec5137a2d 100644 --- a/benchmarks/multiswebench/run_infer.py +++ b/benchmarks/multiswebench/run_infer.py @@ -14,7 +14,7 @@ from benchmarks.multiswebench.download_dataset import download_and_concat_dataset from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference from benchmarks.utils.args_parser import get_parser -from benchmarks.utils.build_utils import build_image +from benchmarks.utils.build_utils import ensure_local_image from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -212,36 +212,12 @@ def prepare_workspace( agent_server_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in ( - "1", - "true", - "yes", + ensure_local_image( + agent_server_image=agent_server_image, + base_image=official_docker_image, + custom_tag=custom_tag, + target=build_target, ) - logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}") - if not SKIP_BUILD: - logger.info( - f"Building workspace from {official_docker_image} " - f"for instance {instance.id}. " - "This may take a while...\n" - "You can run benchmarks/multiswebench/build_images.py and set " - "MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " - "agent-server image." 
- ) - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, - ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" - ) - workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", diff --git a/benchmarks/swebench/run_infer.py b/benchmarks/swebench/run_infer.py index e4e6af9f4..231de93e9 100644 --- a/benchmarks/swebench/run_infer.py +++ b/benchmarks/swebench/run_infer.py @@ -14,7 +14,7 @@ ) from benchmarks.swebench.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser -from benchmarks.utils.build_utils import build_image +from benchmarks.utils.build_utils import ensure_local_image from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -160,38 +160,24 @@ def prepare_workspace( agent_server_image = base_agent_image if self.metadata.workspace_type == "docker": - SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") - if not SKIP_BUILD: - logger.info( - f"Building workspace from {official_docker_image} " - f"for instance {instance.id}. " - "This may take a while...\n" - "You can run benchmarks/swebench/build_images.py and set " - "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " - "agent-server image." 
- ) - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, - ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if base_agent_image not in output.tags: + built = ensure_local_image( + agent_server_image=base_agent_image, + base_image=official_docker_image, + custom_tag=custom_tag, + target=build_target, + ) + if built and wrap_needed: + wrapped_result = wrap_image(base_agent_image, push=False) + if wrapped_result.error: raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{base_agent_image}" + "Wrapped image build failed: " + f"{wrapped_result.error}; log={wrapped_result.log_path}" ) - if wrap_needed: - wrapped_result = wrap_image(base_agent_image, push=False) - if wrapped_result.error: - raise RuntimeError( - "Wrapped image build failed: " - f"{wrapped_result.error}; log={wrapped_result.log_path}" - ) + elif not built and wrap_needed: + logger.info( + f"Using pre-built image {base_agent_image} " + "(assumed already wrapped)" + ) workspace = DockerWorkspace( server_image=agent_server_image, diff --git a/benchmarks/swebenchmultimodal/run_infer.py b/benchmarks/swebenchmultimodal/run_infer.py index 295b3fd13..6a1b533a7 100644 --- a/benchmarks/swebenchmultimodal/run_infer.py +++ b/benchmarks/swebenchmultimodal/run_infer.py @@ -12,7 +12,7 @@ ) from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS from benchmarks.utils.args_parser import get_parser -from benchmarks.utils.build_utils import build_image +from benchmarks.utils.build_utils import ensure_local_image from benchmarks.utils.console_logging import summarize_instance from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE from benchmarks.utils.conversation import build_event_persistence_callback @@ -165,33 +165,12 @@ def prepare_workspace( agent_server_image = ( 
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") - if not SKIP_BUILD: - logger.info( - f"Building workspace from {official_docker_image} " - f"for instance {instance.id}. " - "This may take a while...\n" - "You can run benchmarks/swebenchmultimodal/build_images.py and set " - "SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built " - "agent-server image." - ) - - output = build_image( - base_image=official_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, - ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" - ) - + ensure_local_image( + agent_server_image=agent_server_image, + base_image=official_docker_image, + custom_tag=custom_tag, + target=build_target, + ) workspace = DockerWorkspace( server_image=agent_server_image, working_dir="/workspace", diff --git a/benchmarks/swefficiency/run_infer.py b/benchmarks/swefficiency/run_infer.py index 1f418a323..cf5a902d2 100644 --- a/benchmarks/swefficiency/run_infer.py +++ b/benchmarks/swefficiency/run_infer.py @@ -11,7 +11,7 @@ from benchmarks.swefficiency.config import DOCKER_DEFAULTS, INFER_DEFAULTS from benchmarks.swefficiency.workspace import ResourceLimitedDockerWorkspace from benchmarks.utils.args_parser import get_parser -from benchmarks.utils.build_utils import build_image +from benchmarks.utils.build_utils import ensure_local_image from benchmarks.utils.conversation import build_event_persistence_callback from benchmarks.utils.critics import create_critic from benchmarks.utils.dataset import get_dataset @@ -208,30 +208,12 @@ def prepare_workspace( logger.info(f"Agent server image: 
{agent_server_image}") if self.metadata.workspace_type == "docker": - # Build agent-server image from base swefficiency image - SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") - - if not SKIP_BUILD: - logger.info( - f"Building workspace from {base_docker_image} " - f"for instance {instance.id}. " - "This may take a while..." - ) - output = build_image( - base_image=base_docker_image, - target_image=EVAL_AGENT_SERVER_IMAGE, - custom_tag=custom_tag, - target=build_target, - push=False, - ) - logger.info(f"Image build output: {output}") - assert output.error is None, f"Image build failed: {output.error}" - if agent_server_image not in output.tags: - raise RuntimeError( - f"Built image tags {output.tags} do not include expected tag " - f"{agent_server_image}" - ) + ensure_local_image( + agent_server_image=agent_server_image, + base_image=base_docker_image, + custom_tag=custom_tag, + target=build_target, + ) # Get CPU group for resource limiting cpu_group = self._acquire_cpu_group() diff --git a/benchmarks/swtbench/run_infer.py b/benchmarks/swtbench/run_infer.py index f795a1a18..36b78f265 100644 --- a/benchmarks/swtbench/run_infer.py +++ b/benchmarks/swtbench/run_infer.py @@ -18,7 +18,7 @@ get_default_on_result_writer, ) from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response -from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.image_utils import create_docker_workspace, image_exists from benchmarks.utils.llm_config import load_llm_config from benchmarks.utils.models import ( EvalInstance, @@ -31,7 +31,7 @@ from openhands.sdk.workspace import RemoteWorkspace from openhands.tools.delegate import DelegateTool from openhands.tools.preset.default import get_default_tools -from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace +from openhands.workspace import APIRemoteWorkspace logger = get_logger(__name__) @@ -171,30 
+171,12 @@ def prepare_workspace( agent_server_image = ( f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}" ) - SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes") - logger.info(f"SKIP_BUILD={SKIP_BUILD}") - if not SKIP_BUILD: - logger.info( - f"Building workspace from {official_docker_image} " - f"for instance {instance.id}. " - "This may take a while...\n" - "You can run benchmarks/swtbench/build_images.py and set " - "SKIP_BUILD=1 to skip building and use pre-built " - "agent-server image." - ) - # For SWT-bench, we use DockerDevWorkspace with base_image - workspace = DockerDevWorkspace( - base_image=official_docker_image, - working_dir="/workspace", - target=build_target, - forward_env=forward_env or [], - ) - else: - workspace = DockerWorkspace( - server_image=agent_server_image, - working_dir="/workspace", - forward_env=forward_env or [], - ) + workspace = create_docker_workspace( + agent_server_image=agent_server_image, + base_image=official_docker_image, + build_target=build_target, + forward_env=forward_env, + ) elif self.metadata.workspace_type == "remote": runtime_api_key = os.getenv("RUNTIME_API_KEY") sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA) diff --git a/benchmarks/utils/build_utils.py b/benchmarks/utils/build_utils.py index 9c700f1d8..df5c10fa7 100644 --- a/benchmarks/utils/build_utils.py +++ b/benchmarks/utils/build_utils.py @@ -27,7 +27,7 @@ maybe_reset_buildkit, ) from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE -from benchmarks.utils.image_utils import image_exists +from benchmarks.utils.image_utils import image_exists, local_image_exists from openhands.agent_server.docker.build import BuildOptions, TargetType, build from openhands.sdk import get_logger @@ -307,6 +307,44 @@ def build_image( return BuildOutput(base_image=base_image, tags=tags, error=None) +def ensure_local_image( + agent_server_image: str, + base_image: str, + custom_tag: str, + target: TargetType = 
"source-minimal", +) -> bool: + """Build an agent-server image locally if it doesn't already exist. + + Returns True if a build occurred, False if the image already existed. + Set FORCE_BUILD=1 to skip auto-detection and always rebuild. + """ + force_build = os.getenv("FORCE_BUILD", "0").lower() in ("1", "true", "yes") + if not force_build and local_image_exists(agent_server_image): + logger.info(f"Using pre-built image {agent_server_image}") + return False + + if force_build: + logger.info(f"FORCE_BUILD set, building image from {base_image}...") + else: + logger.info(f"Building image from {base_image}...") + output = build_image( + base_image=base_image, + target_image=EVAL_AGENT_SERVER_IMAGE, + custom_tag=custom_tag, + target=target, + push=False, + ) + logger.info(f"Image build output: {output}") + if output.error is not None: + raise RuntimeError(f"Image build failed: {output.error}") + if agent_server_image not in output.tags: + raise RuntimeError( + f"Built image tags {output.tags} do not include expected tag " + f"{agent_server_image}" + ) + return True + + def _build_with_logging( log_dir: Path, base_image: str, diff --git a/benchmarks/utils/image_utils.py b/benchmarks/utils/image_utils.py index a463f3b4f..bbbecb74f 100644 --- a/benchmarks/utils/image_utils.py +++ b/benchmarks/utils/image_utils.py @@ -1,9 +1,24 @@ #!/usr/bin/env python3 +from __future__ import annotations + import base64 +import os +import subprocess import sys +from typing import TYPE_CHECKING + + +if TYPE_CHECKING: + from openhands.sdk.workspace import TargetType + from openhands.workspace import DockerDevWorkspace, DockerWorkspace import requests +from openhands.sdk import get_logger + + +logger = get_logger(__name__) + ACCEPT = ",".join( [ @@ -54,6 +69,57 @@ def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None: return None +def local_image_exists(image: str) -> bool: + """Check if a Docker image exists in the local Docker daemon.""" + try: + result = 
subprocess.run( + ["docker", "image", "inspect", image], + capture_output=True, + check=False, + timeout=5, + ) + return result.returncode == 0 + except (subprocess.TimeoutExpired, FileNotFoundError) as e: + logger.warning(f"Failed to check if image {image} exists: {e}") + return False + + +def create_docker_workspace( + agent_server_image: str, + base_image: str, + build_target: TargetType, + working_dir: str = "/workspace", + forward_env: list[str] | None = None, +) -> DockerWorkspace | DockerDevWorkspace: + """Create a Docker workspace, building the image only if not already available. + + Returns DockerWorkspace when a pre-built image is found locally, + DockerDevWorkspace otherwise (which builds on-the-fly). + Set FORCE_BUILD=1 to skip auto-detection and always build. + """ + from openhands.workspace import DockerDevWorkspace, DockerWorkspace + + force_build = os.getenv("FORCE_BUILD", "0").lower() in ("1", "true", "yes") + if not force_build and local_image_exists(agent_server_image): + logger.info(f"Using pre-built image {agent_server_image}") + return DockerWorkspace( + server_image=agent_server_image, + working_dir=working_dir, + forward_env=forward_env or [], + ) + else: + if force_build: + logger.info(f"FORCE_BUILD set, building workspace from {base_image}...") + else: + logger.info(f"Building workspace from {base_image}...") + return DockerDevWorkspace( + base_image=base_image, + working_dir=working_dir, + target=build_target, + forward_env=forward_env or [], + ) + + def image_exists( image_ref: str, gh_username: str | None = None, diff --git a/tests/test_image_utils.py b/tests/test_image_utils.py new file mode 100644 index 000000000..c46830cb6 --- /dev/null +++ b/tests/test_image_utils.py @@ -0,0 +1,257 @@ +"""Tests for image_utils and build_utils helper functions. + +Tests cover local_image_exists(), create_docker_workspace(), and ensure_local_image() +which centralize Docker image detection and build logic across all benchmarks. 
"""Tests for image_utils and build_utils helper functions.

Tests cover local_image_exists(), create_docker_workspace(), and ensure_local_image()
which centralize Docker image detection and build logic across all benchmarks.
"""

import os
import subprocess
from unittest.mock import MagicMock, patch

import pytest

from benchmarks.utils.build_utils import BuildOutput


@pytest.fixture(autouse=True)
def _unset_force_build(monkeypatch):
    """Start every test with FORCE_BUILD unset.

    The helpers under test read FORCE_BUILD from the process environment, so a
    value exported in the developer's or CI shell would otherwise flip the
    branch taken by the "image exists" tests. Tests that need it set opt in
    with ``patch.dict`` on the test itself.
    """
    monkeypatch.delenv("FORCE_BUILD", raising=False)


class TestLocalImageExists:
    """Tests for local_image_exists()."""

    @patch("benchmarks.utils.image_utils.subprocess.run")
    def test_image_exists(self, mock_run):
        from benchmarks.utils.image_utils import local_image_exists

        mock_run.return_value = MagicMock(returncode=0)
        assert local_image_exists("myimage:latest") is True
        mock_run.assert_called_once_with(
            ["docker", "image", "inspect", "myimage:latest"],
            capture_output=True,
            check=False,
            timeout=5,
        )

    @patch("benchmarks.utils.image_utils.subprocess.run")
    def test_image_not_found(self, mock_run):
        from benchmarks.utils.image_utils import local_image_exists

        mock_run.return_value = MagicMock(returncode=1)
        assert local_image_exists("noimage:latest") is False

    @patch("benchmarks.utils.image_utils.subprocess.run")
    def test_timeout_returns_false(self, mock_run):
        from benchmarks.utils.image_utils import local_image_exists

        mock_run.side_effect = subprocess.TimeoutExpired(cmd="docker", timeout=5)
        assert local_image_exists("myimage:latest") is False

    @patch("benchmarks.utils.image_utils.subprocess.run")
    def test_docker_not_installed_returns_false(self, mock_run):
        from benchmarks.utils.image_utils import local_image_exists

        mock_run.side_effect = FileNotFoundError("docker not found")
        assert local_image_exists("myimage:latest") is False


class TestCreateDockerWorkspace:
    """Tests for create_docker_workspace().

    These tests mock the Docker daemon interaction (local_image_exists) and
    workspace constructors (which connect to Docker), but verify the actual
    branching logic and argument forwarding.
    """

    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True)
    def test_returns_docker_workspace_when_image_exists(self, _mock_exists):
        from benchmarks.utils.image_utils import create_docker_workspace
        from openhands.workspace import DockerWorkspace

        # Replace the constructor outright: the real one needs Docker. The
        # sentinel lets us check that the helper returns what it constructed.
        sentinel = MagicMock(spec=DockerWorkspace)
        with patch(
            "openhands.workspace.DockerWorkspace", return_value=sentinel
        ) as spy:
            ws = create_docker_workspace(
                agent_server_image="server:v1",
                base_image="base:latest",
                build_target="binary",
            )
        spy.assert_called_once_with(
            server_image="server:v1",
            working_dir="/workspace",
            forward_env=[],
        )
        assert ws is sentinel

    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=False)
    def test_returns_docker_dev_workspace_when_image_missing(self, _mock_exists):
        from benchmarks.utils.image_utils import create_docker_workspace
        from openhands.workspace import DockerDevWorkspace

        sentinel = MagicMock(spec=DockerDevWorkspace)
        with patch(
            "openhands.workspace.DockerDevWorkspace", return_value=sentinel
        ) as spy:
            ws = create_docker_workspace(
                agent_server_image="server:v1",
                base_image="base:latest",
                build_target="source-minimal",
                forward_env=["FOO"],
            )
        spy.assert_called_once_with(
            base_image="base:latest",
            working_dir="/workspace",
            target="source-minimal",
            forward_env=["FOO"],
        )
        assert ws is sentinel

    @patch.dict(os.environ, {"FORCE_BUILD": "1"})
    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True)
    def test_force_build_skips_detection(self, mock_exists):
        from benchmarks.utils.image_utils import create_docker_workspace
        from openhands.workspace import DockerDevWorkspace

        sentinel = MagicMock(spec=DockerDevWorkspace)
        with patch("openhands.workspace.DockerDevWorkspace", return_value=sentinel):
            ws = create_docker_workspace(
                agent_server_image="server:v1",
                base_image="base:latest",
                build_target="binary",
            )
        # Should build even though image exists locally
        assert ws is sentinel
        # local_image_exists should NOT have been called when FORCE_BUILD=1
        mock_exists.assert_not_called()

    @patch("benchmarks.utils.image_utils.local_image_exists", return_value=True)
    def test_custom_working_dir_and_forward_env(self, _mock_exists):
        """Verify custom parameters are forwarded correctly."""
        from benchmarks.utils.image_utils import create_docker_workspace

        with patch("openhands.workspace.DockerWorkspace") as MockDW:
            create_docker_workspace(
                agent_server_image="server:v1",
                base_image="base:latest",
                build_target="binary",
                working_dir="/custom",
                forward_env=["API_KEY", "TOKEN"],
            )
        MockDW.assert_called_once_with(
            server_image="server:v1",
            working_dir="/custom",
            forward_env=["API_KEY", "TOKEN"],
        )


class TestEnsureLocalImage:
    """Tests for ensure_local_image().

    Uses real BuildOutput objects (not mocks) so validation logic in
    ensure_local_image is exercised against actual data structures.
    """

    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=True)
    @patch("benchmarks.utils.build_utils.build_image")
    def test_returns_false_when_image_exists(self, mock_build, _mock_exists):
        from benchmarks.utils.build_utils import ensure_local_image

        result = ensure_local_image(
            agent_server_image="server:v1",
            base_image="base:latest",
            custom_tag="mytag",
        )
        assert result is False
        mock_build.assert_not_called()

    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
    @patch("benchmarks.utils.build_utils.build_image")
    def test_returns_true_when_build_occurs(self, mock_build, _mock_exists):
        from benchmarks.utils.build_utils import ensure_local_image

        mock_build.return_value = BuildOutput(
            base_image="base:latest",
            tags=["server:v1"],
            error=None,
        )
        result = ensure_local_image(
            agent_server_image="server:v1",
            base_image="base:latest",
            custom_tag="mytag",
        )
        assert result is True
        mock_build.assert_called_once()

    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
    @patch("benchmarks.utils.build_utils.build_image")
    def test_raises_on_build_failure(self, mock_build, _mock_exists):
        from benchmarks.utils.build_utils import ensure_local_image

        mock_build.return_value = BuildOutput(
            base_image="base:latest",
            tags=[],
            error="build exploded",
        )
        with pytest.raises(RuntimeError, match="Image build failed"):
            ensure_local_image(
                agent_server_image="server:v1",
                base_image="base:latest",
                custom_tag="mytag",
            )

    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
    @patch("benchmarks.utils.build_utils.build_image")
    def test_raises_on_tag_mismatch(self, mock_build, _mock_exists):
        from benchmarks.utils.build_utils import ensure_local_image

        mock_build.return_value = BuildOutput(
            base_image="base:latest",
            tags=["server:wrong-tag"],
            error=None,
        )
        with pytest.raises(RuntimeError, match="do not include expected tag"):
            ensure_local_image(
                agent_server_image="server:v1",
                base_image="base:latest",
                custom_tag="mytag",
            )

    @patch.dict(os.environ, {"FORCE_BUILD": "1"})
    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=True)
    @patch("benchmarks.utils.build_utils.build_image")
    def test_force_build_skips_detection(self, mock_build, mock_exists):
        from benchmarks.utils.build_utils import ensure_local_image

        mock_build.return_value = BuildOutput(
            base_image="base:latest",
            tags=["server:v1"],
            error=None,
        )
        result = ensure_local_image(
            agent_server_image="server:v1",
            base_image="base:latest",
            custom_tag="mytag",
        )
        assert result is True
        mock_build.assert_called_once()
        # local_image_exists should NOT have been called when FORCE_BUILD=1
        mock_exists.assert_not_called()

    @patch("benchmarks.utils.build_utils.local_image_exists", return_value=False)
    @patch("benchmarks.utils.build_utils.build_image")
    def test_passes_target_to_build_image(self, mock_build, _mock_exists):
        """Verify the target parameter flows through to build_image."""
        from benchmarks.utils.build_utils import ensure_local_image

        mock_build.return_value = BuildOutput(
            base_image="base:latest",
            tags=["server:v1"],
            error=None,
        )
        ensure_local_image(
            agent_server_image="server:v1",
            base_image="base:latest",
            custom_tag="mytag",
            target="binary",
        )
        _, kwargs = mock_build.call_args
        assert kwargs["target"] == "binary"
        assert kwargs["push"] is False
config_path.write_text(json.dumps(config)) llm = load_llm_config(config_path) assert isinstance(llm, LLM) - assert llm.model == "gpt-4" + assert llm.model == "gpt-4o" def test_full_valid_config(self, tmp_path: Path) -> None: """Config with all common fields loads correctly.""" @@ -211,7 +211,7 @@ def test_unreadable_file_raises_permission_error(self, tmp_path: Path) -> None: def test_config_with_extra_fields_loads(self, tmp_path: Path) -> None: """Config with unknown extra fields should still load (pydantic default).""" config = { - "model": "gpt-4", + "model": "gpt-4o", "unknown_field": "value", "another_unknown": 123, } @@ -220,11 +220,11 @@ def test_config_with_extra_fields_loads(self, tmp_path: Path) -> None: # Should not raise - pydantic by default ignores extra fields llm = load_llm_config(config_path) - assert llm.model == "gpt-4" + assert llm.model == "gpt-4o" def test_unicode_in_config(self, tmp_path: Path) -> None: """Config with unicode characters loads correctly.""" - config = {"model": "gpt-4", "api_key": "key-with-émojis-🔑"} + config = {"model": "gpt-4o", "api_key": "key-with-émojis-🔑"} config_path = tmp_path / "config.json" config_path.write_text(json.dumps(config, ensure_ascii=False)) diff --git a/vendor/software-agent-sdk b/vendor/software-agent-sdk index b498a6990..bde715c12 160000 --- a/vendor/software-agent-sdk +++ b/vendor/software-agent-sdk @@ -1 +1 @@ -Subproject commit b498a69908f7d06feb3921ffe05ff7e781a6f108 +Subproject commit bde715c12bce8fb112980529d5ad162f6b81a7f1