Skip to content

Commit d6b07a8

Browse files
Revert "feat: auto-detect pre-built Docker images across all benchmarks (#456)"
This reverts commit 2bfcc6c. The auto-detect feature introduced in #456 is causing slow image builds and timeouts; reverting it restores the previous image-building behavior and, with it, benchmark build performance. Fixes #502. Co-authored-by: openhands <openhands@all-hands.dev>
1 parent e6f6da4 commit d6b07a8

File tree

12 files changed

+315
-499
lines changed

12 files changed

+315
-499
lines changed

benchmarks/commit0/run_infer.py

Lines changed: 31 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
import json
22
import os
33
from collections import Counter
4+
from pathlib import Path
45
from typing import Any, List
56

67
from commit0.harness.constants import SPLIT
@@ -12,7 +13,7 @@
1213
get_base_docker_image,
1314
)
1415
from benchmarks.commit0.config import INFER_DEFAULTS
15-
from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
16+
from benchmarks.utils.args_parser import get_parser
1617
from benchmarks.utils.console_logging import summarize_instance
1718
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
1819
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -23,19 +24,19 @@
2324
construct_eval_output_dir,
2425
get_default_on_result_writer,
2526
)
26-
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
27+
from benchmarks.utils.image_utils import image_exists
2728
from benchmarks.utils.llm_config import load_llm_config
2829
from benchmarks.utils.models import (
2930
EvalInstance,
3031
EvalMetadata,
3132
EvalOutput,
3233
)
33-
from benchmarks.utils.version import IMAGE_TAG_PREFIX
34+
from benchmarks.utils.version import SDK_SHORT_SHA
3435
from openhands.sdk import Agent, Conversation, Tool, get_logger
3536
from openhands.sdk.workspace import RemoteWorkspace
3637
from openhands.tools.delegate import DelegateTool
3738
from openhands.tools.preset.default import get_default_tools
38-
from openhands.workspace import APIRemoteWorkspace
39+
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
3940

4041

4142
logger = get_logger(__name__)
@@ -187,16 +188,15 @@ def prepare_workspace(
187188
logger.info(f"Using base docker image: {base_docker_image}")
188189

189190
if self.metadata.workspace_type == "docker":
190-
custom_tag = extract_custom_tag(base_docker_image)
191-
suffix = f"-{build_target}" if build_target != "binary" else ""
192-
agent_server_image = (
193-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
194-
)
195-
workspace = create_docker_workspace(
196-
agent_server_image=agent_server_image,
191+
# Build agent-server image from base commit0 image
192+
workspace = DockerDevWorkspace(
197193
base_image=base_docker_image,
198-
build_target=build_target,
199-
forward_env=forward_env,
194+
working_dir="/workspace",
195+
target=build_target,
196+
forward_env=forward_env or [],
197+
)
198+
logger.info(
199+
f"Building workspace from {base_docker_image}. This may take a while..."
200200
)
201201
elif self.metadata.workspace_type == "remote":
202202
runtime_api_key = os.getenv("RUNTIME_API_KEY")
@@ -205,21 +205,22 @@ def prepare_workspace(
205205
"RUNTIME_API_KEY environment variable is not set for remote workspace"
206206
)
207207

208+
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
208209
custom_tag = extract_custom_tag(base_docker_image)
209210
suffix = f"-{build_target}" if build_target != "binary" else ""
210211
agent_server_image = (
211-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
212+
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
212213
)
213214

214-
if not remote_image_exists(agent_server_image):
215+
if not image_exists(agent_server_image):
215216
raise RuntimeError(
216217
f"Agent server image {agent_server_image} does not exist in container registry. "
217218
"Run 'benchmarks/commit0/build_images.py --push' to build and push it first."
218219
)
219220

220221
logger.info(
221222
f"Using remote workspace with image {agent_server_image} "
222-
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
223+
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
223224
)
224225
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
225226
workspace = APIRemoteWorkspace(
@@ -591,8 +592,21 @@ def evaluate_instance(
591592

592593

593594
def main() -> None:
595+
prompt_dir = (Path(__file__).parent / "prompts").resolve()
596+
choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
597+
default_prompt_path = prompt_dir / "default.j2"
598+
assert default_prompt_path.exists(), (
599+
f"Default prompt {default_prompt_path} not found"
600+
)
601+
594602
parser = get_parser()
595-
add_prompt_path_argument(parser, __file__)
603+
parser.add_argument(
604+
"--prompt-path",
605+
type=str,
606+
default=str(default_prompt_path),
607+
choices=choices,
608+
help="Path to prompt template file",
609+
)
596610
parser.add_argument(
597611
"--repo-split",
598612
type=str,

benchmarks/gaia/run_infer.py

Lines changed: 11 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -27,10 +27,10 @@
2727
get_default_on_result_writer,
2828
)
2929
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
30-
from benchmarks.utils.image_utils import create_docker_workspace, remote_image_exists
30+
from benchmarks.utils.image_utils import image_exists
3131
from benchmarks.utils.llm_config import load_llm_config
3232
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
33-
from benchmarks.utils.version import IMAGE_TAG_PREFIX
33+
from benchmarks.utils.version import SDK_SHORT_SHA
3434
from openhands.sdk import (
3535
Agent,
3636
Conversation,
@@ -47,7 +47,7 @@
4747
from openhands.sdk.workspace import RemoteWorkspace
4848
from openhands.tools.delegate import DelegateTool
4949
from openhands.tools.preset.default import get_default_tools
50-
from openhands.workspace import APIRemoteWorkspace
50+
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace
5151

5252

5353
logger = get_logger(__name__)
@@ -156,14 +156,11 @@ def prepare_workspace(
156156
logger.info(f"Preparing workspace for instance {instance.id}")
157157

158158
if self.metadata.workspace_type == "docker":
159-
agent_server_image = (
160-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
161-
)
162-
workspace = create_docker_workspace(
163-
agent_server_image=agent_server_image,
159+
# Use DockerDevWorkspace with base image (same as main branch)
160+
workspace = DockerDevWorkspace(
164161
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
165-
build_target="binary",
166-
forward_env=forward_env,
162+
working_dir="/workspace",
163+
forward_env=forward_env or [],
167164
)
168165
elif self.metadata.workspace_type == "remote":
169166
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
@@ -177,19 +174,20 @@ def prepare_workspace(
177174
"RUNTIME_API_KEY environment variable is not set for remote workspace"
178175
)
179176

177+
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
180178
agent_server_image = (
181-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
179+
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-gaia-binary"
182180
)
183181

184-
if not remote_image_exists(agent_server_image):
182+
if not image_exists(agent_server_image):
185183
raise RuntimeError(
186184
f"Agent server image {agent_server_image} does not exist in container registry. "
187185
f"Run 'benchmarks/gaia/build_images.py --push' to build and push it first."
188186
)
189187

190188
logger.info(
191189
f"Using remote workspace with GAIA image {agent_server_image} "
192-
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
190+
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
193191
)
194192
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
195193
workspace = APIRemoteWorkspace(
@@ -592,7 +590,6 @@ def main() -> None:
592590
max_attempts=args.max_attempts,
593591
critic=critic,
594592
selected_instances_file=args.select,
595-
max_retries=args.max_retries,
596593
workspace_type=args.workspace,
597594
enable_delegation=args.enable_delegation,
598595
)

benchmarks/multiswebench/run_infer.py

Lines changed: 53 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
import json
22
import os
3+
from pathlib import Path
34
from typing import List, cast
45

56
import pandas as pd
@@ -12,8 +13,8 @@
1213
)
1314
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
1415
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
15-
from benchmarks.utils.args_parser import add_prompt_path_argument, get_parser
16-
from benchmarks.utils.build_utils import ensure_local_image
16+
from benchmarks.utils.args_parser import get_parser
17+
from benchmarks.utils.build_utils import build_image
1718
from benchmarks.utils.console_logging import summarize_instance
1819
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
1920
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -25,14 +26,14 @@
2526
get_default_on_result_writer,
2627
)
2728
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
28-
from benchmarks.utils.image_utils import remote_image_exists
29+
from benchmarks.utils.image_utils import image_exists
2930
from benchmarks.utils.llm_config import load_llm_config
3031
from benchmarks.utils.models import (
3132
EvalInstance,
3233
EvalMetadata,
3334
EvalOutput,
3435
)
35-
from benchmarks.utils.version import IMAGE_TAG_PREFIX
36+
from benchmarks.utils.version import SDK_SHORT_SHA
3637
from openhands.sdk import Agent, Conversation, Tool, get_logger
3738
from openhands.sdk.workspace import RemoteWorkspace
3839
from openhands.tools.delegate import DelegateTool
@@ -209,37 +210,62 @@ def prepare_workspace(
209210

210211
if self.metadata.workspace_type == "docker":
211212
agent_server_image = (
212-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
213+
f"{EVAL_AGENT_SERVER_IMAGE}:{SDK_SHORT_SHA}-{custom_tag}{suffix}"
213214
)
214-
ensure_local_image(
215-
agent_server_image=agent_server_image,
216-
base_image=official_docker_image,
217-
custom_tag=custom_tag,
218-
target=build_target,
215+
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
216+
"1",
217+
"true",
218+
"yes",
219219
)
220+
logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
221+
if not SKIP_BUILD:
222+
logger.info(
223+
f"Building workspace from {official_docker_image} "
224+
f"for instance {instance.id}. "
225+
"This may take a while...\n"
226+
"You can run benchmarks/multiswebench/build_images.py and set "
227+
"MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
228+
"agent-server image."
229+
)
230+
output = build_image(
231+
base_image=official_docker_image,
232+
target_image=EVAL_AGENT_SERVER_IMAGE,
233+
custom_tag=custom_tag,
234+
target=build_target,
235+
push=False,
236+
)
237+
logger.info(f"Image build output: {output}")
238+
assert output.error is None, f"Image build failed: {output.error}"
239+
if agent_server_image not in output.tags:
240+
raise RuntimeError(
241+
f"Built image tags {output.tags} do not include expected tag "
242+
f"{agent_server_image}"
243+
)
244+
220245
workspace = DockerWorkspace(
221246
server_image=agent_server_image,
222247
working_dir="/workspace",
223248
forward_env=forward_env or [],
224249
)
225250
elif self.metadata.workspace_type == "remote":
226251
runtime_api_key = os.getenv("RUNTIME_API_KEY")
252+
sdk_short_sha = os.getenv("SDK_SHORT_SHA", SDK_SHORT_SHA)
227253
if not runtime_api_key:
228254
raise ValueError(
229255
"RUNTIME_API_KEY environment variable is not set for remote workspace"
230256
)
231257

232258
agent_server_image = (
233-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
259+
f"{EVAL_AGENT_SERVER_IMAGE}:{sdk_short_sha}-{custom_tag}{suffix}"
234260
)
235-
if not remote_image_exists(agent_server_image):
261+
if not image_exists(agent_server_image):
236262
raise RuntimeError(
237263
f"Agent server image {agent_server_image} does not exist in container registry, "
238264
"make sure to build, push it, and make it public accessible before using remote workspace."
239265
)
240266
logger.info(
241267
f"Using remote workspace with image {agent_server_image} "
242-
f"(tag prefix: {IMAGE_TAG_PREFIX}, resource_factor: {resource_factor})"
268+
f"(sdk sha: {sdk_short_sha}, resource_factor: {resource_factor})"
243269
)
244270
startup_timeout = float(os.getenv("REMOTE_RUNTIME_STARTUP_TIMEOUT", "600"))
245271
workspace = APIRemoteWorkspace(
@@ -402,8 +428,21 @@ def evaluate_instance(
402428

403429

404430
def main() -> None:
431+
prompt_dir = (Path(__file__).parent / "prompts").resolve()
432+
choices = [str(p.relative_to(Path.cwd())) for p in prompt_dir.glob("*.j2")]
433+
default_prompt_path = prompt_dir / "default.j2"
434+
assert default_prompt_path.exists(), (
435+
f"Default prompt {default_prompt_path} not found"
436+
)
437+
405438
parser = get_parser()
406-
add_prompt_path_argument(parser, __file__)
439+
parser.add_argument(
440+
"--prompt-path",
441+
type=str,
442+
default=str(default_prompt_path),
443+
choices=choices,
444+
help="Path to prompt template file",
445+
)
407446
parser.add_argument(
408447
"--lang",
409448
type=str,

0 commit comments

Comments
 (0)