Skip to content

Commit 7cdda09

Browse files
feat: auto-detect pre-built images across all benchmarks
Add local_image_exists() to image_utils — checks if a Docker image exists in the local daemon. Replace all SKIP_BUILD env var checks across every benchmark with auto-detection: if the expected agent-server image exists locally, skip building; otherwise build on-the-fly as before. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent d3f24e4 commit 7cdda09

File tree

8 files changed

+69
-73
lines changed

8 files changed

+69
-73
lines changed

benchmarks/commit0/run_infer.py

Lines changed: 21 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
construct_eval_output_dir,
2525
get_default_on_result_writer,
2626
)
27-
from benchmarks.utils.image_utils import image_exists
27+
from benchmarks.utils.image_utils import image_exists, local_image_exists
2828
from benchmarks.utils.llm_config import load_llm_config
2929
from benchmarks.utils.models import (
3030
EvalInstance,
@@ -36,7 +36,7 @@
3636
from openhands.sdk.workspace import RemoteWorkspace
3737
from openhands.tools.delegate import DelegateTool
3838
from openhands.tools.preset.default import get_default_tools
39-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
39+
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
4040

4141

4242
logger = get_logger(__name__)
@@ -188,16 +188,26 @@ def prepare_workspace(
188188
logger.info(f"Using base docker image: {base_docker_image}")
189189

190190
if self.metadata.workspace_type == "docker":
191-
# Build agent-server image from base commit0 image
192-
workspace = DockerDevWorkspace(
193-
base_image=base_docker_image,
194-
working_dir="/workspace",
195-
target=build_target,
196-
forward_env=forward_env or [],
197-
)
198-
logger.info(
199-
f"Building workspace from {base_docker_image}. This may take a while..."
191+
custom_tag = extract_custom_tag(base_docker_image)
192+
suffix = f"-{build_target}" if build_target != "binary" else ""
193+
agent_server_image = (
194+
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
200195
)
196+
if local_image_exists(agent_server_image):
197+
logger.info(f"Using pre-built image {agent_server_image}")
198+
workspace = DockerWorkspace(
199+
server_image=agent_server_image,
200+
working_dir="/workspace",
201+
forward_env=forward_env or [],
202+
)
203+
else:
204+
logger.info(f"Building workspace from {base_docker_image}...")
205+
workspace = DockerDevWorkspace(
206+
base_image=base_docker_image,
207+
working_dir="/workspace",
208+
target=build_target,
209+
forward_env=forward_env or [],
210+
)
201211
elif self.metadata.workspace_type == "remote":
202212
runtime_api_key = os.getenv("RUNTIME_API_KEY")
203213
if not runtime_api_key:

benchmarks/gaia/run_infer.py

Lines changed: 18 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@
2727
get_default_on_result_writer,
2828
)
2929
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
30-
from benchmarks.utils.image_utils import image_exists
30+
from benchmarks.utils.image_utils import image_exists, local_image_exists
3131
from benchmarks.utils.llm_config import load_llm_config
3232
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
3333
from benchmarks.utils.version import SDK_SHORT_SHA
@@ -47,7 +47,7 @@
4747
from openhands.sdk.workspace import RemoteWorkspace
4848
from openhands.tools.delegate import DelegateTool
4949
from openhands.tools.preset.default import get_default_tools
50-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
50+
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
5151

5252

5353
logger = get_logger(__name__)
@@ -156,12 +156,23 @@ def prepare_workspace(
156156
logger.info(f"Preparing workspace for instance {instance.id}")
157157

158158
if self.metadata.workspace_type == "docker":
159-
# Use DockerDevWorkspace with base image (same as main branch)
160-
workspace = DockerDevWorkspace(
161-
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
162-
working_dir="/workspace",
163-
forward_env=forward_env or [],
159+
agent_server_image = (
160+
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-gaia-binary"
164161
)
162+
if local_image_exists(agent_server_image):
163+
logger.info(f"Using pre-built image {agent_server_image}")
164+
workspace = DockerWorkspace(
165+
server_image=agent_server_image,
166+
working_dir="/workspace",
167+
forward_env=forward_env or [],
168+
)
169+
else:
170+
logger.info("Building workspace from nikolaik/python-nodejs:python3.12-nodejs22...")
171+
workspace = DockerDevWorkspace(
172+
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
173+
working_dir="/workspace",
174+
forward_env=forward_env or [],
175+
)
165176
elif self.metadata.workspace_type == "remote":
166177
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
167178
# GAIA uses a universal agent server image (one image for all instances)

benchmarks/multiswebench/run_infer.py

Lines changed: 3 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
get_default_on_result_writer,
2727
)
2828
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
29-
from benchmarks.utils.image_utils import image_exists
29+
from benchmarks.utils.image_utils import image_exists, local_image_exists
3030
from benchmarks.utils.llm_config import load_llm_config
3131
from benchmarks.utils.models import (
3232
EvalInstance,
@@ -212,20 +212,10 @@ def prepare_workspace(
212212
agent_server_image = (
213213
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
214214
)
215-
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
216-
"1",
217-
"true",
218-
"yes",
219-
)
220-
logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
221-
if not SKIP_BUILD:
215+
if not local_image_exists(agent_server_image):
222216
logger.info(
223217
f"Building workspace from {official_docker_image} "
224-
f"for instance {instance.id}. "
225-
"This may take a while...\n"
226-
"You can run benchmarks/multiswebench/build_images.py and set "
227-
"MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
228-
"agent-server image."
218+
f"for instance {instance.id}. This may take a while..."
229219
)
230220
output = build_image(
231221
base_image=official_docker_image,

benchmarks/swebench/run_infer.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
get_default_on_result_writer,
2727
)
2828
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
29-
from benchmarks.utils.image_utils import image_exists
29+
from benchmarks.utils.image_utils import image_exists, local_image_exists
3030
from benchmarks.utils.llm_config import load_llm_config
3131
from benchmarks.utils.models import (
3232
EvalInstance,
@@ -160,16 +160,10 @@ def prepare_workspace(
160160
agent_server_image = base_agent_image
161161

162162
if self.metadata.workspace_type == "docker":
163-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
164-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
165-
if not SKIP_BUILD:
163+
if not local_image_exists(agent_server_image):
166164
logger.info(
167165
f"Building workspace from {official_docker_image} "
168-
f"for instance {instance.id}. "
169-
"This may take a while...\n"
170-
"You can run benchmarks/swebench/build_images.py and set "
171-
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
172-
"agent-server image."
166+
f"for instance {instance.id}. This may take a while..."
173167
)
174168
output = build_image(
175169
base_image=official_docker_image,

benchmarks/swebenchmultimodal/run_infer.py

Lines changed: 3 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@
2424
get_default_on_result_writer,
2525
)
2626
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
27-
from benchmarks.utils.image_utils import image_exists
27+
from benchmarks.utils.image_utils import image_exists, local_image_exists
2828
from benchmarks.utils.llm_config import load_llm_config
2929
from benchmarks.utils.models import (
3030
EvalInstance,
@@ -165,16 +165,10 @@ def prepare_workspace(
165165
agent_server_image = (
166166
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
167167
)
168-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
169-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
170-
if not SKIP_BUILD:
168+
if not local_image_exists(agent_server_image):
171169
logger.info(
172170
f"Building workspace from {official_docker_image} "
173-
f"for instance {instance.id}. "
174-
"This may take a while...\n"
175-
"You can run benchmarks/swebenchmultimodal/build_images.py and set "
176-
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
177-
"agent-server image."
171+
f"for instance {instance.id}. This may take a while..."
178172
)
179173

180174
output = build_image(

benchmarks/swefficiency/run_infer.py

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@
2121
get_default_on_result_writer,
2222
)
2323
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
24-
from benchmarks.utils.image_utils import image_exists
24+
from benchmarks.utils.image_utils import image_exists, local_image_exists
2525
from benchmarks.utils.models import (
2626
EvalInstance,
2727
EvalMetadata,
@@ -208,11 +208,7 @@ def prepare_workspace(
208208
logger.info(f"Agent server image: {agent_server_image}")
209209

210210
if self.metadata.workspace_type == "docker":
211-
# Build agent-server image from base swefficiency image
212-
SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes")
213-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
214-
215-
if not SKIP_BUILD:
211+
if not local_image_exists(agent_server_image):
216212
logger.info(
217213
f"Building workspace from {base_docker_image} "
218214
f"for instance {instance.id}. "

benchmarks/swtbench/run_infer.py

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,7 +18,7 @@
1818
get_default_on_result_writer,
1919
)
2020
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
21-
from benchmarks.utils.image_utils import image_exists
21+
from benchmarks.utils.image_utils import image_exists, local_image_exists
2222
from benchmarks.utils.llm_config import load_llm_config
2323
from benchmarks.utils.models import (
2424
EvalInstance,
@@ -171,28 +171,19 @@ def prepare_workspace(
171171
agent_server_image = (
172172
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
173173
)
174-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
175-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
176-
if not SKIP_BUILD:
177-
logger.info(
178-
f"Building workspace from {official_docker_image} "
179-
f"for instance {instance.id}. "
180-
"This may take a while...\n"
181-
"You can run benchmarks/swtbench/build_images.py and set "
182-
"SKIP_BUILD=1 to skip building and use pre-built "
183-
"agent-server image."
184-
)
185-
# For SWT-bench, we use DockerDevWorkspace with base_image
186-
workspace = DockerDevWorkspace(
187-
base_image=official_docker_image,
174+
if local_image_exists(agent_server_image):
175+
logger.info(f"Using pre-built image {agent_server_image}")
176+
workspace = DockerWorkspace(
177+
server_image=agent_server_image,
188178
working_dir="/workspace",
189-
target=build_target,
190179
forward_env=forward_env or [],
191180
)
192181
else:
193-
workspace = DockerWorkspace(
194-
server_image=agent_server_image,
182+
logger.info(f"Building workspace from {official_docker_image}...")
183+
workspace = DockerDevWorkspace(
184+
base_image=official_docker_image,
195185
working_dir="/workspace",
186+
target=build_target,
196187
forward_env=forward_env or [],
197188
)
198189
elif self.metadata.workspace_type == "remote":

benchmarks/utils/image_utils.py

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
#!/usr/bin/env python3
22
import base64
3+
import subprocess
34
import sys
45

56
import requests
@@ -54,6 +55,15 @@ def _ghcr_token(repo: str, username: str | None, pat: str | None) -> str | None:
5455
return None
5556

5657

58+
def local_image_exists(image: str) -> bool:
    """Return whether *image* is present in the local Docker daemon.

    Runs ``docker image inspect <image>`` and decides based on the exit
    code; the command's stdout/stderr are captured and discarded.

    Args:
        image: Image reference (for example ``repo/name:tag``) to look up.

    Returns:
        ``True`` if the inspect command exits with status 0. ``False`` if it
        exits non-zero, if *image* is empty, or if the ``docker`` executable
        cannot be launched at all.
    """
    if not image:
        # An empty reference can never name an image; skip spawning a process.
        return False
    try:
        result = subprocess.run(
            ["docker", "image", "inspect", image],
            capture_output=True,
            check=False,
        )
    except OSError:
        # Raised when the docker CLI is missing or not executable. This
        # function is a yes/no probe, so report "absent" instead of letting
        # the probe itself crash the caller.
        return False
    return result.returncode == 0
65+
66+
5767
def image_exists(
5868
image_ref: str,
5969
gh_username: str | None = None,

0 commit comments

Comments
 (0)