Skip to content

Commit e445fa9

Browse files
Merge main into nemo-evaluator, resolve conflicts with PR #456
Adopt centralized Docker image auto-detection helpers (create_docker_workspace, ensure_local_image, local_image_exists) from PR #456 while keeping IMAGE_TAG_PREFIX naming from this branch.

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2 parents 0ff388e + 2bfcc6c commit e445fa9

File tree

14 files changed

+460
-234
lines changed

14 files changed

+460
-234
lines changed

Makefile

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@ YELLOW := \033[33m
88
RED := \033[31m
99
CYAN := \033[36m
1010
RESET := \033[0m
11+
UNDERLINE := \033[4m
1112

1213
# Required uv version
1314
REQUIRED_UV_VERSION := 0.8.13
@@ -51,9 +52,27 @@ lint:
5152
@uv run ruff check --fix
5253
@$(ECHO) "$(GREEN)Linting completed.$(RESET)"
5354

55+
pre-commit:
56+
@$(ECHO) "$(YELLOW)Run pre-commit...$(RESET)"
57+
@uv run pre-commit run --all-files
58+
@$(ECHO) "$(GREEN)Pre-commit run successfully.$(RESET)"
59+
5460
clean:
5561
@$(ECHO) "$(YELLOW)Cleaning up cache files...$(RESET)"
5662
@find . -type d -name "__pycache__" -exec rm -rf {} + 2>/dev/null || true
5763
@find . -type f -name "*.pyc" -delete 2>/dev/null || true
5864
@rm -rf .pytest_cache .ruff_cache .mypy_cache 2>/dev/null || true
5965
@$(ECHO) "$(GREEN)Cache files cleaned.$(RESET)"
66+
67+
help:
68+
@$(ECHO) "$(CYAN)OpenHands Benchmarks Makefile$(RESET)"
69+
@$(ECHO) ""
70+
@$(ECHO) "$(UNDERLINE)Usage:$(RESET) make <COMMAND>"
71+
@$(ECHO) ""
72+
@$(ECHO) "$(UNDERLINE)Commands:$(RESET)"
73+
@$(ECHO) " $(GREEN)build$(RESET) Set up development environment"
74+
@$(ECHO) " $(GREEN)format$(RESET) Format code with ruff"
75+
@$(ECHO) " $(GREEN)lint$(RESET) Lint code with ruff"
76+
@$(ECHO) " $(GREEN)pre-commit$(RESET) Run pre-commit hooks"
77+
@$(ECHO) " $(GREEN)clean$(RESET) Clean up cache files"
78+
@$(ECHO) " $(GREEN)help$(RESET) Show this help message"

benchmarks/commit0/run_infer.py

Lines changed: 13 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,7 @@
2323
construct_eval_output_dir,
2424
get_default_on_result_writer,
2525
)
26-
from benchmarks.utils.image_utils import image_exists
26+
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
2727
from benchmarks.utils.llm_config import load_llm_config
2828
from benchmarks.utils.models import (
2929
EvalInstance,
@@ -34,7 +34,7 @@
3434
from openhands.sdk import Agent, Conversation, Tool, get_logger
3535
from openhands.sdk.workspace import RemoteWorkspace
3636
from openhands.tools.preset.default import get_default_tools
37-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
37+
from openhands.workspace import APIRemoteWorkspace
3838

3939

4040
logger = get_logger(__name__)
@@ -186,35 +186,17 @@ def prepare_workspace(
186186
logger.info(f"Using base docker image: {base_docker_image}")
187187

188188
if self.metadata.workspace_type == "docker":
189-
# Try to build agent-server image from base commit0 image
190-
# Fall back to pre-built image if build fails
191-
try:
192-
workspace = DockerDevWorkspace(
193-
base_image=base_docker_image,
194-
working_dir="/workspace",
195-
target=build_target,
196-
forward_env=forward_env or [],
197-
)
198-
logger.info(
199-
f"Building workspace from {base_docker_image}. This may take a while..."
200-
)
201-
except Exception:
202-
custom_tag = extract_custom_tag(base_docker_image)
203-
suffix = f"-{build_target}" if build_target != "binary" else ""
204-
agent_server_image = (
205-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
206-
)
207-
if not image_exists(agent_server_image):
208-
raise RuntimeError(
209-
f"On-the-fly build failed and pre-built image {agent_server_image} does not exist"
210-
)
211-
212-
workspace = DockerWorkspace(
213-
server_image=agent_server_image,
214-
working_dir="/workspace",
215-
forward_env=forward_env or [],
216-
)
217-
logger.info(f"Using pre-built image {agent_server_image}")
189+
custom_tag = extract_custom_tag(base_docker_image)
190+
suffix = f"-{build_target}" if build_target != "binary" else ""
191+
agent_server_image = (
192+
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
193+
)
194+
workspace = create_docker_workspace(
195+
agent_server_image=agent_server_image,
196+
base_image=base_docker_image,
197+
build_target=build_target,
198+
forward_env=forward_env,
199+
)
218200
elif self.metadata.workspace_type == "remote":
219201
runtime_api_key = os.getenv("RUNTIME_API_KEY")
220202
if not runtime_api_key:

benchmarks/gaia/run_infer.py

Lines changed: 11 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
get_default_on_result_writer,
2727
)
2828
from benchmarks.utils.fake_user_response import run_conversation_with_fake_user_response
29-
from benchmarks.utils.image_utils import image_exists
29+
from benchmarks.utils.image_utils import create_docker_workspace, image_exists
3030
from benchmarks.utils.llm_config import load_llm_config
3131
from benchmarks.utils.models import EvalInstance, EvalMetadata, EvalOutput
3232
from benchmarks.utils.version import IMAGE_TAG_PREFIX
@@ -42,7 +42,7 @@
4242
)
4343
from openhands.sdk.workspace import RemoteWorkspace
4444
from openhands.tools.preset.default import get_default_tools
45-
from openhands.workspace import APIRemoteWorkspace, DockerDevWorkspace, DockerWorkspace
45+
from openhands.workspace import APIRemoteWorkspace
4646

4747

4848
logger = get_logger(__name__)
@@ -151,29 +151,15 @@ def prepare_workspace(
151151
logger.info(f"Preparing workspace for instance {instance.id}")
152152

153153
if self.metadata.workspace_type == "docker":
154-
# Use DockerDevWorkspace with base image
155-
# Fall back to pre-built image if build fails
156-
try:
157-
workspace = DockerDevWorkspace(
158-
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
159-
working_dir="/workspace",
160-
forward_env=forward_env or [],
161-
)
162-
except Exception:
163-
build_target = os.getenv("GAIA_BUILD_TARGET", "binary-minimal")
164-
agent_server_image = (
165-
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-{build_target}"
166-
)
167-
if not image_exists(agent_server_image):
168-
raise RuntimeError(
169-
f"On-the-fly build failed and pre-built image {agent_server_image} does not exist"
170-
)
171-
workspace = DockerWorkspace(
172-
server_image=agent_server_image,
173-
working_dir="/workspace",
174-
forward_env=forward_env or [],
175-
)
176-
logger.info(f"Using pre-built image {agent_server_image}")
154+
agent_server_image = (
155+
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-gaia-binary"
156+
)
157+
workspace = create_docker_workspace(
158+
agent_server_image=agent_server_image,
159+
base_image="nikolaik/python-nodejs:python3.12-nodejs22",
160+
build_target="binary",
161+
forward_env=forward_env,
162+
)
177163
elif self.metadata.workspace_type == "remote":
178164
# For workflow, use APIRemoteWorkspace with pre-built GAIA image
179165
# GAIA uses a universal agent server image (one image for all instances)

benchmarks/multiswebench/run_infer.py

Lines changed: 6 additions & 38 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
from benchmarks.multiswebench.download_dataset import download_and_concat_dataset
1515
from benchmarks.multiswebench.scripts.data.data_change import format_data_for_inference
1616
from benchmarks.utils.args_parser import get_parser
17-
from benchmarks.utils.build_utils import build_image
17+
from benchmarks.utils.build_utils import ensure_local_image
1818
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
1919
from benchmarks.utils.conversation import build_event_persistence_callback
2020
from benchmarks.utils.critics import create_critic
@@ -210,44 +210,12 @@ def prepare_workspace(
210210
agent_server_image = (
211211
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
212212
)
213-
SKIP_BUILD = os.getenv("MULTI_SWE_BENCH_SKIP_BUILD", "0").lower() in (
214-
"1",
215-
"true",
216-
"yes",
213+
ensure_local_image(
214+
agent_server_image=agent_server_image,
215+
base_image=official_docker_image,
216+
custom_tag=custom_tag,
217+
target=build_target,
217218
)
218-
logger.info(f"MULTI_SWE_BENCH_SKIP_BUILD={SKIP_BUILD}")
219-
if not SKIP_BUILD:
220-
logger.info(
221-
f"Building workspace from {official_docker_image} "
222-
f"for instance {instance.id}. "
223-
"This may take a while...\n"
224-
"You can run benchmarks/multiswebench/build_images.py and set "
225-
"MULTI_SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
226-
"agent-server image."
227-
)
228-
try:
229-
output = build_image(
230-
base_image=official_docker_image,
231-
target_image=EVAL_AGENT_SERVER_IMAGE,
232-
custom_tag=custom_tag,
233-
target=build_target,
234-
push=False,
235-
)
236-
logger.info(f"Image build output: {output}")
237-
if output.error is not None:
238-
raise RuntimeError(f"Image build failed: {output.error}")
239-
if agent_server_image not in output.tags:
240-
raise RuntimeError(
241-
f"Built image tags {output.tags} do not include expected tag "
242-
f"{agent_server_image}"
243-
)
244-
except Exception:
245-
if not image_exists(agent_server_image):
246-
raise RuntimeError(
247-
f"On-the-fly build failed and pre-built image {agent_server_image} does not exist"
248-
)
249-
logger.info(f"Using pre-built image {agent_server_image}")
250-
251219
workspace = DockerWorkspace(
252220
server_image=agent_server_image,
253221
working_dir="/workspace",

benchmarks/swebench/run_infer.py

Lines changed: 17 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@
1414
)
1515
from benchmarks.swebench.config import INFER_DEFAULTS
1616
from benchmarks.utils.args_parser import get_parser
17-
from benchmarks.utils.build_utils import build_image
17+
from benchmarks.utils.build_utils import ensure_local_image
1818
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
1919
from benchmarks.utils.conversation import build_event_persistence_callback
2020
from benchmarks.utils.critics import create_critic
@@ -131,38 +131,24 @@ def prepare_workspace(
131131
agent_server_image = base_agent_image
132132

133133
if self.metadata.workspace_type == "docker":
134-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
135-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
136-
if not SKIP_BUILD:
137-
logger.info(
138-
f"Building workspace from {official_docker_image} "
139-
f"for instance {instance.id}. "
140-
"This may take a while...\n"
141-
"You can run benchmarks/swebench/build_images.py and set "
142-
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
143-
"agent-server image."
144-
)
145-
output = build_image(
146-
base_image=official_docker_image,
147-
target_image=EVAL_AGENT_SERVER_IMAGE,
148-
custom_tag=custom_tag,
149-
target=build_target,
150-
push=False,
151-
)
152-
logger.info(f"Image build output: {output}")
153-
assert output.error is None, f"Image build failed: {output.error}"
154-
if base_agent_image not in output.tags:
134+
built = ensure_local_image(
135+
agent_server_image=base_agent_image,
136+
base_image=official_docker_image,
137+
custom_tag=custom_tag,
138+
target=build_target,
139+
)
140+
if built and wrap_needed:
141+
wrapped_result = wrap_image(base_agent_image, push=False)
142+
if wrapped_result.error:
155143
raise RuntimeError(
156-
f"Built image tags {output.tags} do not include expected tag "
157-
f"{base_agent_image}"
144+
"Wrapped image build failed: "
145+
f"{wrapped_result.error}; log={wrapped_result.log_path}"
158146
)
159-
if wrap_needed:
160-
wrapped_result = wrap_image(base_agent_image, push=False)
161-
if wrapped_result.error:
162-
raise RuntimeError(
163-
"Wrapped image build failed: "
164-
f"{wrapped_result.error}; log={wrapped_result.log_path}"
165-
)
147+
elif not built and wrap_needed:
148+
logger.info(
149+
f"Using pre-built image {base_agent_image} "
150+
"(assumed already wrapped)"
151+
)
166152

167153
workspace = DockerWorkspace(
168154
server_image=agent_server_image,

benchmarks/swebenchmultimodal/run_infer.py

Lines changed: 7 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
)
1313
from benchmarks.swebenchmultimodal.config import INFER_DEFAULTS
1414
from benchmarks.utils.args_parser import get_parser
15-
from benchmarks.utils.build_utils import build_image
15+
from benchmarks.utils.build_utils import ensure_local_image
1616
from benchmarks.utils.console_logging import summarize_instance
1717
from benchmarks.utils.constants import EVAL_AGENT_SERVER_IMAGE
1818
from benchmarks.utils.conversation import build_event_persistence_callback
@@ -165,33 +165,12 @@ def prepare_workspace(
165165
agent_server_image = (
166166
f"{EVAL_AGENT_SERVER_IMAGE}:{IMAGE_TAG_PREFIX}-{custom_tag}{suffix}"
167167
)
168-
SKIP_BUILD = os.getenv("SKIP_BUILD", "1").lower() in ("1", "true", "yes")
169-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
170-
if not SKIP_BUILD:
171-
logger.info(
172-
f"Building workspace from {official_docker_image} "
173-
f"for instance {instance.id}. "
174-
"This may take a while...\n"
175-
"You can run benchmarks/swebenchmultimodal/build_images.py and set "
176-
"SWE_BENCH_SKIP_BUILD=1 to skip building and use pre-built "
177-
"agent-server image."
178-
)
179-
180-
output = build_image(
181-
base_image=official_docker_image,
182-
target_image=EVAL_AGENT_SERVER_IMAGE,
183-
custom_tag=custom_tag,
184-
target=build_target,
185-
push=False,
186-
)
187-
logger.info(f"Image build output: {output}")
188-
assert output.error is None, f"Image build failed: {output.error}"
189-
if agent_server_image not in output.tags:
190-
raise RuntimeError(
191-
f"Built image tags {output.tags} do not include expected tag "
192-
f"{agent_server_image}"
193-
)
194-
168+
ensure_local_image(
169+
agent_server_image=agent_server_image,
170+
base_image=official_docker_image,
171+
custom_tag=custom_tag,
172+
target=build_target,
173+
)
195174
workspace = DockerWorkspace(
196175
server_image=agent_server_image,
197176
working_dir="/workspace",

benchmarks/swefficiency/run_infer.py

Lines changed: 7 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@
1111
from benchmarks.swefficiency.config import DOCKER_DEFAULTS, INFER_DEFAULTS
1212
from benchmarks.swefficiency.workspace import ResourceLimitedDockerWorkspace
1313
from benchmarks.utils.args_parser import get_parser
14-
from benchmarks.utils.build_utils import build_image
14+
from benchmarks.utils.build_utils import ensure_local_image
1515
from benchmarks.utils.conversation import build_event_persistence_callback
1616
from benchmarks.utils.critics import create_critic
1717
from benchmarks.utils.dataset import get_dataset
@@ -208,30 +208,12 @@ def prepare_workspace(
208208
logger.info(f"Agent server image: {agent_server_image}")
209209

210210
if self.metadata.workspace_type == "docker":
211-
# Build agent-server image from base swefficiency image
212-
SKIP_BUILD = os.getenv("SKIP_BUILD", "0").lower() in ("1", "true", "yes")
213-
logger.info(f"SKIP_BUILD={SKIP_BUILD}")
214-
215-
if not SKIP_BUILD:
216-
logger.info(
217-
f"Building workspace from {base_docker_image} "
218-
f"for instance {instance.id}. "
219-
"This may take a while..."
220-
)
221-
output = build_image(
222-
base_image=base_docker_image,
223-
target_image=EVAL_AGENT_SERVER_IMAGE,
224-
custom_tag=custom_tag,
225-
target=build_target,
226-
push=False,
227-
)
228-
logger.info(f"Image build output: {output}")
229-
assert output.error is None, f"Image build failed: {output.error}"
230-
if agent_server_image not in output.tags:
231-
raise RuntimeError(
232-
f"Built image tags {output.tags} do not include expected tag "
233-
f"{agent_server_image}"
234-
)
211+
ensure_local_image(
212+
agent_server_image=agent_server_image,
213+
base_image=base_docker_image,
214+
custom_tag=custom_tag,
215+
target=build_target,
216+
)
235217

236218
# Get CPU group for resource limiting
237219
cpu_group = self._acquire_cpu_group()

0 commit comments

Comments
 (0)