Skip to content

Commit 0a13934

Browse files
feat: auto-detect pre-built Docker images across all benchmarks
Add local_image_exists() and create_docker_workspace() to image_utils, and ensure_local_image() to build_utils. These centralize the pattern of checking for a pre-built image before building on-the-fly.

Replace all SKIP_BUILD env var checks across every benchmark:
- commit0, gaia, swtbench: use create_docker_workspace()
- swebench, swebenchmultimodal, multiswebench, swefficiency: use ensure_local_image()

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d3f24e4 commit 0a13934

File tree

9 files changed

+151
-156
lines changed

9 files changed

+151
-156
lines changed

benchmarks/commit0/run_infer.py

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
construct_eval_output_dir,
2525
get_default_on_result_writer,
2626
)
27-
from benchmarks.utils.image_utils import image_exists
27+
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
2828
from benchmarks.utils.llm_config import load_llm_config
2929
from benchmarks.utils.models import (
3030
EvalInstance,
@@ -36,7 +36,7 @@
3636
from openhands.sdk.workspace import RemoteWorkspace
3737
from openhands.tools.delegate import DelegateTool
3838
from openhands.tools.preset.default import get_default_tools
39-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
39+
from openhands.workspace import APIRemoteWorkspace
4040

4141

4242
logger = get_logger(__name__)
@@ -188,15 +188,16 @@ def prepare_workspace(
188188
logger.info(f"Using base docker image: {base_docker_image}")
189189

190190
if self.metadata.workspace_type == "docker":
191-
# Build agent-server image from base commit0 image
192-
workspace = DockerDevWorkspace(
193-
base_image=base_docker_image,
194-
working_dir="/workspace",
195-
target=build_target,
196-
forward_env=forward_env or [],
191+
custom_tag = extract_custom_tag(base_docker_image)
192+
suffix = f"-{build_target}" if build_target != "binary" else ""
193+
agent_server_image = (
194+
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
197195
)
198-
logger.info(
199-
f"Building workspace from {base_docker_image}. This may take a while..."
196+
workspace = create_docker_workspace(
197+
agent_server_image=agent_server_image,
198+
base_image=base_docker_image,
199+
build_target=build_target,
200+
forward_env=forward_env,
200201
)
201202
elif self.metadata.workspace_type == "remote":
202203
runtime_api_key = os.getenv("RUNTIME_API_KEY")

benchmarks/gaia/run_infer.py

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
get_default_on_result_writer,
2828
)
2929
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
30-
from benchmarks.utils.image_utils import image_exists
30+
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
3131
from benchmarks.utils.llm_config import load_llm_config
3232
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
3333
from benchmarks.utils.version import SDK_SHORT_SHA
@@ -47,7 +47,7 @@
4747
from openhands.sdk.workspace import RemoteWorkspace
4848
from openhands.tools.delegate import DelegateTool
4949
from openhands.tools.preset.default import get_default_tools
50-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
50+
from openhands.workspace import APIRemoteWorkspace
5151

5252

5353
logger = get_logger(__name__)
@@ -156,11 +156,14 @@ def prepare_workspace(
156156
logger.info(f"Preparing workspace for instance {instance.id}")
157157

158158
if self.metadata.workspace_type == "docker":
159-
# Use DockerDevWorkspace with base image (same as main branch)
160-
workspace = DockerDevWorkspace(
159+
agent_server_image = (
160+
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-gaia-binary"
161+
)
162+
workspace = create_docker_workspace(
163+
agent_server_image=agent_server_image,
161164
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
162-
working_dir="/workspace",
163-
forward_env=forward_env or [],
165+
build_target="binary",
166+
forward_env=forward_env,
164167
)
165168
elif self.metadata.workspace_type == "remote":
166169
# For workflow, use APIRemoteWorkspace with pre-built GAIA image

benchmarks/multiswebench/run_infer.py

Lines changed: 6 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
1515
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
1616
from benchmarks.utils.args_parser import get_parser
17-
from benchmarks.utils.build_utils import build_image
17+
from benchmarks.utils.build_utils import ensure_local_image
1818
from benchmarks.utils.console_logging import summarize_instance
1919
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
2020
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -212,36 +212,12 @@ def prepare_workspace(
212212
agent_server_image = (
213213
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
214214
)
215-
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
216-
"1",
217-
"true",
218-
"yes",
215+
ensure_local_image(
216+
agent_server_image=agent_server_image,
217+
base_image=official_docker_image,
218+
custom_tag=custom_tag,
219+
target=build_target,
219220
)
220-
logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
221-
if not SKIP_BUILD:
222-
logger.info(
223-
f"Building workspace from {official_docker_image} "
224-
f"for instance {instance.id}. "
225-
"This may take a while...\n"
226-
"You can run benchmarks/multiswebench/build_images.py and set "
227-
"MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
228-
"agent-server image."
229-
)
230-
output = build_image(
231-
base_image=official_docker_image,
232-
target_image=EVAL_AGENT_SERVER_IMAGE,
233-
custom_tag=custom_tag,
234-
target=build_target,
235-
push=False,
236-
)
237-
logger.info(f"Image build output: {output}")
238-
assert output.error is None, f"Image build failed: {output.error}"
239-
if agent_server_image not in output.tags:
240-
raise RuntimeError(
241-
f"Built image tags {output.tags} do not include expected tag "
242-
f"{agent_server_image}"
243-
)
244-
245221
workspace = DockerWorkspace(
246222
server_image=agent_server_image,
247223
working_dir="/workspace",

benchmarks/swebench/run_infer.py

Lines changed: 12 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
)
1515
from benchmarks.swebench.config import INFER_DEFAULTS
1616
from benchmarks.utils.args_parser import get_parser
17-
from benchmarks.utils.build_utils import build_image
17+
from benchmarks.utils.build_utils import ensure_local_image
1818
from benchmarks.utils.console_logging import summarize_instance
1919
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
2020
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -160,38 +160,19 @@ def prepare_workspace(
160160
agent_server_image = base_agent_image
161161

162162
if self.metadata.workspace_type == "docker":
163-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
164-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
165-
if not SKIP_BUILD:
166-
logger.info(
167-
f"Building workspace from {official_docker_image} "
168-
f"for instance {instance.id}. "
169-
"This may take a while...\n"
170-
"You can run benchmarks/swebench/build_images.py and set "
171-
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
172-
"agent-server image."
173-
)
174-
output = build_image(
175-
base_image=official_docker_image,
176-
target_image=EVAL_AGENT_SERVER_IMAGE,
177-
custom_tag=custom_tag,
178-
target=build_target,
179-
push=False,
180-
)
181-
logger.info(f"Image build output: {output}")
182-
assert output.error is None, f"Image build failed: {output.error}"
183-
if base_agent_image not in output.tags:
163+
ensure_local_image(
164+
agent_server_image=base_agent_image,
165+
base_image=official_docker_image,
166+
custom_tag=custom_tag,
167+
target=build_target,
168+
)
169+
if wrap_needed:
170+
wrapped_result = wrap_image(base_agent_image, push=False)
171+
if wrapped_result.error:
184172
raise RuntimeError(
185-
f"Built image tags {output.tags} do not include expected tag "
186-
f"{base_agent_image}"
173+
"Wrapped image build failed: "
174+
f"{wrapped_result.error}; log={wrapped_result.log_path}"
187175
)
188-
if wrap_needed:
189-
wrapped_result = wrap_image(base_agent_image, push=False)
190-
if wrapped_result.error:
191-
raise RuntimeError(
192-
"Wrapped image build failed: "
193-
f"{wrapped_result.error}; log={wrapped_result.log_path}"
194-
)
195176

196177
workspace = DockerWorkspace(
197178
server_image=agent_server_image,

benchmarks/swebenchmultimodal/run_infer.py

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
)
1313
from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS
1414
from benchmarks.utils.args_parser import get_parser
15-
from benchmarks.utils.build_utils import build_image
15+
from benchmarks.utils.build_utils import ensure_local_image
1616
from benchmarks.utils.console_logging import summarize_instance
1717
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
1818
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -165,33 +165,12 @@ def prepare_workspace(
165165
agent_server_image = (
166166
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
167167
)
168-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
169-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
170-
if not SKIP_BUILD:
171-
logger.info(
172-
f"Building workspace from {official_docker_image} "
173-
f"for instance {instance.id}. "
174-
"This may take a while...\n"
175-
"You can run benchmarks/swebenchmultimodal/build_images.py and set "
176-
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
177-
"agent-server image."
178-
)
179-
180-
output = build_image(
181-
base_image=official_docker_image,
182-
target_image=EVAL_AGENT_SERVER_IMAGE,
183-
custom_tag=custom_tag,
184-
target=build_target,
185-
push=False,
186-
)
187-
logger.info(f"Image build output: {output}")
188-
assert output.error is None, f"Image build failed: {output.error}"
189-
if agent_server_image not in output.tags:
190-
raise RuntimeError(
191-
f"Built image tags {output.tags} do not include expected tag "
192-
f"{agent_server_image}"
193-
)
194-
168+
ensure_local_image(
169+
agent_server_image=agent_server_image,
170+
base_image=official_docker_image,
171+
custom_tag=custom_tag,
172+
target=build_target,
173+
)
195174
workspace = DockerWorkspace(
196175
server_image=agent_server_image,
197176
working_dir="/workspace",

benchmarks/swefficiency/run_infer.py

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from benchmarks.swefficiency.config import DOCKER_DEFAULTS, INFER_DEFAULTS
1212
from benchmarks.swefficiency.workspace import ResourceLimitedDockerWorkspace
1313
from benchmarks.utils.args_parser import get_parser
14-
from benchmarks.utils.build_utils import build_image
14+
from benchmarks.utils.build_utils import ensure_local_image
1515
from benchmarks.utils.conversation import build_event_persistence_callback
1616
from benchmarks.utils.critics import create_critic
1717
from benchmarks.utils.dataset import get_dataset
@@ -208,30 +208,12 @@ def prepare_workspace(
208208
logger.info(f"Agent server image: {agent_server_image}")
209209

210210
if self.metadata.workspace_type == "docker":
211-
# Build agent-server image from base swefficiency image
212-
SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes")
213-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
214-
215-
if not SKIP_BUILD:
216-
logger.info(
217-
f"Building workspace from {base_docker_image} "
218-
f"for instance {instance.id}. "
219-
"This may take a while..."
220-
)
221-
output = build_image(
222-
base_image=base_docker_image,
223-
target_image=EVAL_AGENT_SERVER_IMAGE,
224-
custom_tag=custom_tag,
225-
target=build_target,
226-
push=False,
227-
)
228-
logger.info(f"Image build output: {output}")
229-
assert output.error is None, f"Image build failed: {output.error}"
230-
if agent_server_image not in output.tags:
231-
raise RuntimeError(
232-
f"Built image tags {output.tags} do not include expected tag "
233-
f"{agent_server_image}"
234-
)
211+
ensure_local_image(
212+
agent_server_image=agent_server_image,
213+
base_image=base_docker_image,
214+
custom_tag=custom_tag,
215+
target=build_target,
216+
)
235217

236218
# Get CPU group for resource limiting
237219
cpu_group = self._acquire_cpu_group()

benchmarks/swtbench/run_infer.py

Lines changed: 8 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
get_default_on_result_writer,
1919
)
2020
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
21-
from benchmarks.utils.image_utils import image_exists
21+
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
2222
from benchmarks.utils.llm_config import load_llm_config
2323
from benchmarks.utils.models import (
2424
EvalInstance,
@@ -31,7 +31,7 @@
3131
from openhands.sdk.workspace import RemoteWorkspace
3232
from openhands.tools.delegate import DelegateTool
3333
from openhands.tools.preset.default import get_default_tools
34-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
34+
from openhands.workspace import APIRemoteWorkspace
3535

3636

3737
logger = get_logger(__name__)
@@ -171,30 +171,12 @@ def prepare_workspace(
171171
agent_server_image = (
172172
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
173173
)
174-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
175-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
176-
if not SKIP_BUILD:
177-
logger.info(
178-
f"Building workspace from {official_docker_image} "
179-
f"for instance {instance.id}. "
180-
"This may take a while...\n"
181-
"You can run benchmarks/swtbench/build_images.py and set "
182-
"SKIP_BUILD=1 to skip building and use pre-built "
183-
"agent-server image."
184-
)
185-
# For SWT-bench, we use DockerDevWorkspace with base_image
186-
workspace = DockerDevWorkspace(
187-
base_image=official_docker_image,
188-
working_dir="/workspace",
189-
target=build_target,
190-
forward_env=forward_env or [],
191-
)
192-
else:
193-
workspace = DockerWorkspace(
194-
server_image=agent_server_image,
195-
working_dir="/workspace",
196-
forward_env=forward_env or [],
197-
)
174+
workspace = create_docker_workspace(
175+
agent_server_image=agent_server_image,
176+
base_image=official_docker_image,
177+
build_target=build_target,
178+
forward_env=forward_env,
179+
)
198180
elif self.metadata.workspace_type == "remote":
199181
runtime_api_key = os.getenv("RUNTIME_API_KEY")
200182
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)

benchmarks/utils/build_utils.py

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -307,6 +307,41 @@ def build_image(
307307
return BuildOutput(base_image=base_image, tags=tags, error=None)
308308

309309

310+
def ensure_local_image(
311+
agent_server_image: str,
312+
base_image: str,
313+
custom_tag: str,
314+
target: TargetType = "source-minimal",
315+
) -> bool:
316+
"""Build an agent-server image locally if it doesn't already exist.
317+
318+
Returns True if a build occurred, False if the image already existed.
319+
"""
320+
from benchmarks.utils.image_utils import local_image_exists
321+
322+
if local_image_exists(agent_server_image):
323+
logger.info(f"Using pre-built image {agent_server_image}")
324+
return False
325+
326+
logger.info(f"Building image from {base_image}...")
327+
output = build_image(
328+
base_image=base_image,
329+
target_image=EVAL_AGENT_SERVER_IMAGE,
330+
custom_tag=custom_tag,
331+
target=target,
332+
push=False,
333+
)
334+
logger.info(f"Image build output: {output}")
335+
if output.error is not None:
336+
raise RuntimeError(f"Image build failed: {output.error}")
337+
if agent_server_image not in output.tags:
338+
raise RuntimeError(
339+
f"Built image tags {output.tags} do not include expected tag "
340+
f"{agent_server_image}"
341+
)
342+
return True
343+
344+
310345
def _build_with_logging(
311346
log_dir: Path,
312347
base_image: str,

0 commit comments

Comments
 (0)