Skip to content

Commit f278c0f

Browse files
committed
Fix done-state parsing: compare the flag text to "true" instead of calling bool() on a non-empty string
1 parent cfc85c6 commit f278c0f

File tree

2 files changed

+31
-19
lines changed

2 files changed

+31
-19
lines changed

src/agentlab/backends/browser/mcp_playwright.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -26,8 +26,10 @@ def run_js(self, js: str):
2626
def step(self, action: ToolCallAction) -> str:
2727
tool_result = self._call_mcp(action)
2828
logger.info(f"Tool result: {tool_result}")
29-
snapshot = self.call_tool("browser_snapshot", {})
30-
return snapshot
29+
return tool_result
30+
31+
def page_snapshot(self) -> str:
32+
return self.call_tool("browser_snapshot", {})
3133

3234
def goto(self, url: str) -> str:
3335
tool_result = self.call_tool("browser_navigate", {"url": url})

src/agentlab/benchmarks/miniwob/task.py

Lines changed: 27 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -1,11 +1,13 @@
1-
1+
import logging
22
import os
33
from typing import Any, ClassVar
44

55
from browsergym.miniwob import ALL_MINIWOB_TASKS
66

77
from agentlab.benchmarks.web_task import AbstractWebTask
88

9+
logger = logging.getLogger(__name__)
10+
911

1012
class MiniWobTask(AbstractWebTask):
1113
dataset: str = "miniwob"
@@ -17,10 +19,10 @@ class MiniWobTask(AbstractWebTask):
1719
remove_human_display: bool = True
1820
episode_max_time: int = 1000000
1921
max_turns: int = 10
22+
validate_per_step: bool = True
2023
actions_whitelist: ClassVar[list[str]] = [
2124
"browser_press_key",
2225
"browser_type",
23-
"browser_navigate",
2426
"browser_click",
2527
"browser_drag",
2628
"browser_hover",
@@ -29,9 +31,10 @@ class MiniWobTask(AbstractWebTask):
2931

3032
def model_post_init(self, __context: Any):
3133
self.url = f"{self.base_url}/{self.subdomain}.html"
32-
34+
3335
def get_setup_js(self) -> str:
3436
if self.remove_human_display:
37+
logger.info("Remove human display")
3538
js = r"""
3639
let __display_ids = ['reward-display', 'click-canvas', 'sync-task-cover'];
3740
let __display_divs = {};
@@ -93,10 +96,12 @@ def get_setup_js(self) -> str:
9396
Math.seedrandom(42);
9497
core.EPISODE_MAX_TIME = {self.episode_max_time};
9598
core.startEpisodeReal();
99+
start_time = Date.now();
96100
while (!WOB_TASK_READY) {{
97101
await new Promise(resolve => setTimeout(resolve, 100));
98102
}}
99-
return core.getUtterance();
103+
ready_time = Date.now();
104+
return {{'goal': core.getUtterance(), 'done': WOB_DONE_GLOBAL, 'task_start_time': ready_time - start_time}};
100105
"""
101106
return f"async () => {{{js}}}"
102107

@@ -113,29 +118,34 @@ def get_task_validate_js(self) -> str:
113118
return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY];
114119
}"""
115120

116-
117121
def parse_validation_result(self, validation_result: str) -> tuple[float, dict]:
122+
logger.info(f"Validation result: {validation_result}")
118123
chunks = [c.strip() for c in validation_result.split(",")]
119124
raw_reward = float(chunks[1])
120-
done = bool(chunks[3])
125+
done = chunks[3].strip().lower() == "true"
121126
reward = float(raw_reward > 0)
122127
return reward, {
123-
"raw_reward": raw_reward,
124-
"reward_reason": chunks[2],
125-
"done": done,
128+
"raw_reward": raw_reward,
129+
"reward_reason": chunks[2],
130+
"done": done,
126131
}
127132

128-
def get_miniwob_tasks(base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000) -> list[MiniWobTask]:
133+
134+
def get_miniwob_tasks(
135+
base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000
136+
) -> list[MiniWobTask]:
129137
if base_url is None:
130138
base_url = os.environ.get("MINIWOB_URL")
131139
if base_url is None:
132140
raise ValueError("MINIWOB_URL environment variable is not set")
133141
return [
134142
MiniWobTask(
135-
task_id=task.subdomain,
136-
desc=task.desc,
137-
subdomain=task.subdomain,
138-
base_url=base_url,
139-
remove_human_display=remove_human_display,
140-
episode_max_time=episode_max_time) for task in ALL_MINIWOB_TASKS
141-
]
143+
task_id=task.subdomain,
144+
desc=task.desc,
145+
subdomain=task.subdomain,
146+
base_url=base_url,
147+
remove_human_display=remove_human_display,
148+
episode_max_time=episode_max_time,
149+
)
150+
for task in ALL_MINIWOB_TASKS
151+
]

0 commit comments

Comments
 (0)