Skip to content

Commit 7a43031

Browse files
authored
Fix Problems for Online-Mind2Web (#207)
* Fix problems * Fix debug string format * fix typos * fix: remove self-defined _ensure_browser
1 parent 6c19533 commit 7a43031

File tree

11 files changed

+707
-59
lines changed

11 files changed

+707
-59
lines changed

environments/online_mind2web/Dockerfile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -15,8 +15,8 @@ RUN uv pip install --python /opt/venv -e .
1515
# Create directories for logs and data
1616
RUN mkdir -p /app/logs /app/data
1717

18-
ENV DISPLAY_WIDTH=1448
19-
ENV DISPLAY_HEIGHT=944
18+
ENV DISPLAY_WIDTH=1400
19+
ENV DISPLAY_HEIGHT=850
2020

2121
ENV PYTHONUNBUFFERED=1 \
2222
PYTHONDONTWRITEBYTECODE=1

environments/online_mind2web/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@ name = "hud-om2w"
33
version = "0.1.0"
44
description = "HUD Remote Browser Controller with MCP tools for cloud browser providers"
55
requires-python = ">=3.11,<3.13"
6-
dependencies = [ "hud-python==0.4.61", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]
6+
dependencies = [ "hud-python>=0.4.67", "anthropic>=0.74.0", "pyautogui", "playwright", "httpx", "typer", "google-api-python-client", "google-auth",]
77

88
[build-system]
99
requires = [ "hatchling",]

environments/online_mind2web/src/hud_controller/evaluate/webjudge.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -393,7 +393,7 @@ async def webjudge_eval(ctx: Context, task_description: dict | str, score_thresh
393393
main_score = int(scores[0]) if scores else 3
394394
except:
395395
main_score = 3 # Default score if parsing fails
396-
logger.info("Score: ", main_score)
396+
logger.info(f"Score: {main_score}")
397397

398398
# Extract reasoning
399399
try:

environments/online_mind2web/src/hud_controller/providers/anchorbrowser.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -40,8 +40,8 @@ async def launch(self, **kwargs) -> str:
4040
4141
Args:
4242
**kwargs: Launch options including:
43-
- max_duration: Maximum session duration in seconds (default: 120)
44-
- idle_timeout: Idle timeout in seconds (default: 30)
43+
- max_duration: Maximum session duration in seconds (default: 300)
44+
- idle_timeout: Idle timeout in seconds (default: 120)
4545
- proxy: Proxy configuration dict with:
4646
- type: "custom" or "anchor_residential"
4747
- server: Proxy server address (for custom)
@@ -61,8 +61,8 @@ async def launch(self, **kwargs) -> str:
6161
request_data = {
6262
"session": {
6363
"timeout": {
64-
"max_duration": kwargs.get("max_duration", 120),
65-
"idle_timeout": kwargs.get("idle_timeout", 30),
64+
"max_duration": kwargs.get("max_duration", 300),
65+
"idle_timeout": kwargs.get("idle_timeout", 120),
6666
},
6767
},
6868
"browser": {
@@ -72,6 +72,19 @@ async def launch(self, **kwargs) -> str:
7272
},
7373
}
7474

75+
# Add viewport configuration
76+
if "viewport" in kwargs:
77+
request_data["browser"]["viewport"] = kwargs["viewport"]
78+
else:
79+
# Use environment variables or AnchorBrowser's recommended default (1440x900)
80+
request_data["browser"]["viewport"] = {
81+
"width": int(os.getenv("DISPLAY_WIDTH", "1440")),
82+
"height": int(os.getenv("DISPLAY_HEIGHT", "900")),
83+
}
84+
logger.info(
85+
f"Setting viewport to {request_data['browser']['viewport']['width']}x{request_data['browser']['viewport']['height']}"
86+
)
87+
7588
proxy_config = await get_proxy_config()
7689

7790
# Default to residential proxy if nothing configured

environments/online_mind2web/src/hud_controller/server.py

Lines changed: 9 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -26,7 +26,7 @@
2626
OpenAIComputerTool,
2727
HudComputerTool,
2828
)
29-
from hud.tools import PlaywrightTool
29+
from .tools import OnlineMind2Web_PlaywrightTool as PlaywrightTool
3030

3131
# Import setup and evaluate hubs
3232
from .setup import setup as setup_hub
@@ -283,10 +283,14 @@ async def send_progress(progress: int, message: str):
283283

284284
# Create and register computer tools with default dimensions
285285
mcp.add_tool(HudComputerTool(executor=browser_executor))
286-
# mcp.add_tool(AnthropicComputerTool(executor=browser_executor))
287-
# mcp.add_tool(OpenAIComputerTool(executor=browser_executor))
288-
mcp.add_tool(AnthropicComputerToolWithRecord(executor=browser_executor))
289-
mcp.add_tool(OpenAIComputerToolWithRecord(executor=browser_executor))
286+
mcp.add_tool(
287+
AnthropicComputerToolWithRecord(
288+
executor=browser_executor, name="anthropic_computer_tool"
289+
)
290+
)
291+
mcp.add_tool(
292+
OpenAIComputerToolWithRecord(executor=browser_executor, name="openai_computer_tool")
293+
)
290294

291295
await send_progress(80, "Registered hud computer tools")
292296

environments/online_mind2web/src/hud_controller/setup/navigate.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99

1010

1111
@setup.tool("navigate_to_url")
12-
async def navigate_to_url(ctx: Context, url: str, wait_for_load_state: str = "networkidle"):
12+
async def navigate_to_url(ctx: Context, url: str, wait_for_load_state: str = "load"):
1313
"""Navigate browser to a specific URL.
1414
1515
Args:

environments/online_mind2web/src/hud_controller/tools/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,11 @@
33
from .executor import BrowserExecutor
44
from .anthropic import AnthropicComputerToolWithRecord
55
from .openai import OpenAIComputerToolWithRecord
6+
from .playwright import OnlineMind2Web_PlaywrightTool
67

78
__all__ = [
89
"BrowserExecutor",
910
"AnthropicComputerToolWithRecord",
1011
"OpenAIComputerToolWithRecord",
12+
"OnlineMind2Web_PlaywrightTool",
1113
]

environments/online_mind2web/src/hud_controller/tools/anthropic.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -118,6 +118,15 @@ async def _on_recorded_action(
118118
except Exception as e:
119119
logger.warning(f"Failed to record action: {e}")
120120

121+
def _scale_coordinates(self, x: int | None, y: int | None) -> tuple[int | None, int | None]:
122+
"""Scale coordinates from target space to screen space."""
123+
if x is not None and self.scale_x != 1.0:
124+
x = round(x / self.scale_x)
125+
if y is not None and self.scale_y != 1.0:
126+
y = round(y / self.scale_y)
127+
128+
return x, y
129+
121130
async def __call__(
122131
self,
123132
action: str = Field(..., description="The action to perform on the computer"),
@@ -171,6 +180,12 @@ async def __call__(
171180
}
172181
if action in screenshot_actions and action != "screenshot" and take_screenshot_on_click:
173182
await self._trigger_callbacks("on_screenshot_action")
183+
logger.debug(
184+
"Env display size %s x %s",
185+
self.environment_width,
186+
self.environment_height,
187+
)
188+
174189
recorded_actions = {
175190
"left_click",
176191
"click",

environments/online_mind2web/src/hud_controller/tools/executor.py

Lines changed: 10 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -112,11 +112,16 @@ async def _ensure_page(self):
112112
async def screenshot(self) -> str | None:
113113
"""Take a screenshot and return base64 encoded image."""
114114
try:
115-
page = await self._ensure_page()
116-
screenshot_bytes = await page.screenshot(full_page=False)
117-
screenshot_b64 = base64.b64encode(screenshot_bytes).decode()
118-
logger.debug("Browser screenshot captured")
119-
return screenshot_b64
115+
result = await self.playwright_tool.screenshot()
116+
if result.base64_image:
117+
logger.debug("Browser screenshot captured via playwright tool")
118+
return result.base64_image
119+
elif result.error:
120+
logger.error(f"Screenshot failed: {result.error}")
121+
return None
122+
else:
123+
logger.error("Screenshot returned no image or error")
124+
return None
120125
except Exception as e:
121126
logger.error(f"Screenshot failed: {e}")
122127
return None

0 commit comments

Comments
 (0)