Skip to content

Commit 5ceeb60

Browse files
committed
miniwob with mcp browser backend, first draft
1 parent 0da2063 commit 5ceeb60

File tree

10 files changed

+427
-0
lines changed

10 files changed

+427
-0
lines changed

experiments/run_miniwob.py

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
import logging
2+
import os
3+
4+
from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config
5+
from agentlab.backends.browser.mcp_playwright import MCPPlaywright
6+
from agentlab.benchmarks.miniwob import MiniWobBenchmark
7+
from agentlab.experiments.study import make_study
8+
9+
# Log record layout: timestamp, level, logger name and line number, function name, message.
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
# force=True drops any handlers already attached to the root logger before configuring it.
logging.basicConfig(
    level=logging.INFO,
    force=True,
    format=fmt,
    handlers=[logging.StreamHandler()],
)


if __name__ == "__main__":
    config = load_config("gaia_l1")
    study = make_study(
        benchmark=MiniWobBenchmark(backend=MCPPlaywright()),
        agent_args=TapeAgentArgs(agent_name=config.name, config=config),
        comment=config.comment,
        logging_level=logging.INFO,
        logging_level_stdout=logging.INFO,
    )

    if os.environ.get("AGENTLAB_DEBUG"):
        # Debug mode: keep only the first three experiments and run them one after another.
        study.exp_args_list = study.exp_args_list[:3]
        n_jobs, parallel_backend = 1, "sequential"
    else:
        # Normal mode: parallelism comes from the loaded config.
        n_jobs, parallel_backend = config.n_jobs, config.parallel_backend

    study.run(n_jobs=n_jobs, n_relaunch=1, parallel_backend=parallel_backend)

experiments/test_mcp.py

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
from tapeagents.environment import FunctionCall
2+
from tapeagents.mcp import ToolCallAction
3+
4+
from agentlab.backends.browser.mcp_playwright import MCPPlaywright
5+
from agentlab.benchmarks.miniwob.task import get_miniwob_tasks
6+
7+
8+
def main():
    """Exercise the MCP Playwright backend by hand on the first MiniWob task.

    Navigates to the task URL, evaluates the task's setup JavaScript, then
    evaluates its task-validation JavaScript, printing the backend's response
    after each stage.
    """
    first_task = get_miniwob_tasks()[0]
    setup_js = first_task.get_setup_js()
    backend = MCPPlaywright()

    def show(response):
        # Print one backend response framed by the two separator lines.
        print("------")
        print(response)
        print("-" * 100)

    print("=" * 100)
    # 1. goto task url
    print("URL: ", first_task.url)
    show(backend.call_tool("browser_navigate", {"url": first_task.url}))

    # 2. eval js
    show(backend.run_js(setup_js))

    # 3. validate
    print("\n\nVALIDATE")
    validate_js = first_task.get_task_validate_js()
    print(validate_js)
    show(backend.run_js(validate_js))


if __name__ == "__main__":
    main()
39+
40+
41+
42+
src/agentlab/backends/browser/base.py

Lines changed: 40 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,40 @@
1+
from tapeagents.environment import FunctionCall
2+
from tapeagents.mcp import MCPEnvironment, ToolCallAction
3+
from tapeagents.tool_calling import as_openai_tool
4+
5+
6+
class BrowserBackend:
    """Interface for browser backends.

    Every method raises ``NotImplementedError`` here; concrete backends are
    expected to override the ones they support.
    """

    def run_js(self, js: str):
        """Evaluate the JavaScript source ``js``; must be overridden."""
        raise NotImplementedError()

    def call_tool(self, tool_name: str, arguments: dict) -> str:
        """Call the tool ``tool_name`` with ``arguments``; must be overridden."""
        raise NotImplementedError()

    def tools_description(self) -> str:
        """Return a textual description of the available tools; must be overridden."""
        raise NotImplementedError()

    def tools(self) -> list[dict]:
        """Return the available tools as a list of dicts; must be overridden."""
        raise NotImplementedError()
18+
19+
20+
class MCPBrowserBackend(BrowserBackend):
    """Browser backend that forwards tool calls to an ``MCPEnvironment``.

    ``run_js`` is not overridden here, so it still raises ``NotImplementedError``
    unless a subclass provides it.
    """

    def __init__(self, config_path: str):
        """Build the MCP environment from ``config_path`` and initialize it."""
        self.config_path = config_path
        self.mcp = MCPEnvironment(config_path=config_path)
        self.mcp.initialize()

    def call_tool(self, tool_name: str, arguments: dict) -> str:
        """Run the MCP tool ``tool_name`` and return the text of its first content item."""
        call = FunctionCall(name=tool_name, arguments=arguments)
        result = self.mcp.step(ToolCallAction(function=call))
        # Only the first content item of the tool result is returned.
        first_item = result.content.content[0]
        return first_item.text

    def tools_description(self) -> str:
        """Return the MCP environment's own description of its tools."""
        return self.mcp.tools_description()

    def tools(self) -> list[dict]:
        """Return every MCP action converted to an OpenAI tool and dumped to a dict."""
        return [as_openai_tool(action).model_dump() for action in self.mcp.actions()]
src/agentlab/backends/browser/env.py

Lines changed: 82 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,82 @@
1+
import logging
2+
import time
3+
from typing import Any, Literal
4+
5+
from tapeagents.core import Action, Observation, StopStep
6+
7+
from agentlab.backends.browser.base import BrowserBackend
8+
from agentlab.benchmarks.abstract_env import AbstractEnv
9+
from agentlab.benchmarks.miniwob.task import AbstractWebTask
10+
11+
logger = logging.getLogger(__name__)
12+
13+
14+
class PageObservation(Observation):
    # Observation carrying the text returned by a browser tool call.
    # `kind` is the fixed tag identifying this observation type.
    kind: Literal["page_observation"] = "page_observation"
    # Tool output, stored verbatim.
    content: str
17+
18+
class BrowserAction(Action):
    # Action asking the backend to call one tool.
    # `kind` is the fixed tag identifying this action type.
    kind: Literal["browser_action"] = "browser_action"
    # Name of the backend tool to call.
    name: str
    # Arguments passed to that tool.
    arguments: dict[str, Any]
22+
23+
24+
class BrowserEnv(AbstractEnv):
    """Environment that runs a single web task through a ``BrowserBackend``.

    Actions are forwarded to the backend as tool calls; task setup, validation
    and teardown are done by evaluating JavaScript supplied by the task.
    """

    def __init__(
        self,
        task_name: str,
        task: AbstractWebTask,
        backend: BrowserBackend,
        seed: int = 0,
        max_turns: int | None = None,
    ):
        """Store the task, backend and seed.

        Args:
            task_name: Name of the task this environment runs.
            task: Task object providing setup/validation/teardown JavaScript.
            backend: Backend used to call tools and evaluate JavaScript.
            seed: Initial seed; overwritten by ``reset``.
            max_turns: Optional cap on the number of steps before the episode
                is reported as truncated. ``None`` leaves any ``max_turns``
                attribute defined elsewhere untouched.
        """
        self.task_name = task_name
        self.task = task
        self.seed = seed
        self.backend = backend
        self._turns = 0
        # Assign only when a limit is given, so a ``max_turns`` provided by a
        # base class or set by the caller afterwards is not shadowed.
        if max_turns is not None:
            self.max_turns = max_turns

    def reset(self, seed: int):
        """Start a new episode: store ``seed``, zero the turn counter, run the setup JS.

        NOTE(review): nothing here navigates to the task page or returns an
        observation; confirm the caller takes care of both.
        """
        self.seed = seed
        # A reused environment must not inherit the previous episode's turn count.
        self._turns = 0
        setup_js = self.task.get_setup_js()
        if setup_js:
            js_result_str = self.backend.run_js(setup_js)
            logger.info("Task reset result: %s", js_result_str)

    def step(
        self, action: BrowserAction | StopStep
    ) -> tuple[Observation, float | None, bool, bool, dict]:
        """Execute one action and report the outcome.

        A ``StopStep`` ends the episode without touching the backend; any other
        action is executed as a backend tool call.

        Returns:
            ``(observation, reward, finished, truncated, env_info)``. ``reward``
            is ``None`` unless the task validates on every step or the episode
            has finished or been truncated.
        """
        logger.info("BrowserEnv.step() called with action %s", type(action))

        action_exec_start = time.time()
        finished = isinstance(action, StopStep)
        # A stop action yields an empty observation; nothing is sent to the backend.
        observation = Observation() if finished else self._step(action)
        action_exec_stop = time.time()
        self._turns += 1

        # ``max_turns`` may be absent; without a limit the episode is never truncated.
        max_turns = getattr(self, "max_turns", None)
        truncated = max_turns is not None and self._turns >= max_turns

        if self.task.validate_per_step or finished or truncated:
            reward = self.calculate_reward(action, observation)
        else:
            reward = None

        env_info = {
            "step_metadata": observation.metadata,
            "action_exec_start": action_exec_start,
            "action_exec_stop": action_exec_stop,
            "action_exec_timeout": 0.0,
        }
        obs_view = observation.short_view() if isinstance(observation, Observation) else observation
        logger.info("Action result in observation: %s", obs_view)
        return observation, reward, finished, truncated, env_info

    def _step(self, action: Action) -> PageObservation:
        """Call the backend tool named by ``action`` and wrap its output."""
        tool_result = self.backend.call_tool(action.name, action.arguments)
        return PageObservation(content=tool_result)

    def calculate_reward(self, action: Action, observation: PageObservation) -> float:
        """Evaluate the task's step-validation JS and return the parsed reward.

        ``action`` and ``observation`` are accepted but not used here.
        """
        validate_js = self.task.get_step_validate_js()
        validate_result = self.backend.run_js(validate_js)
        # The second element of the parsed result is not needed here.
        reward, _ = self.task.parse_validation_result(validate_result)
        return reward

    def close(self):
        """Run the task's teardown JavaScript, if it provides any."""
        teardown_js = self.task.get_teardown_js()
        if teardown_js:
            js_result_str = self.backend.run_js(teardown_js)
            logger.info("Task teardown result: %s", js_result_str)
src/agentlab/backends/browser/mcp_playwright.json

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,17 @@
1+
{
2+
"mcpServers": {
3+
"playwright": {
4+
"command": "npx",
5+
"args": [
6+
"@playwright/mcp@latest",
7+
"--browser",
8+
"chromium",
9+
"--headless",
10+
"--isolated"
11+
],
12+
"env": {
13+
"PLAYWRIGHT_BROWSERS_PATH": ""
14+
}
15+
}
16+
}
17+
}
src/agentlab/backends/browser/mcp_playwright.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,14 @@
1+
from agentlab.backends.browser.base import MCPBrowserBackend
2+
3+
# Config used when no explicit path is given.
# NOTE(review): this is a relative path, so it presumably only resolves when the
# process runs from the repository root; confirm how MCPEnvironment resolves it.
DEFAULT_CONFIG_PATH = "src/agentlab/backends/browser/mcp_playwright.json"


class MCPPlaywright(MCPBrowserBackend):
    """MCP browser backend that evaluates JavaScript via the ``browser_evaluate`` tool."""

    def __init__(self, config_path: str | None = None):
        """Initialize the backend, using ``DEFAULT_CONFIG_PATH`` when no path is given."""
        super().__init__(config_path or DEFAULT_CONFIG_PATH)

    def run_js(self, js: str):
        """Evaluate ``js`` with ``browser_evaluate`` and return the result text.

        The tool response is expected to contain a ``### Result`` section,
        normally followed by a section starting with ``### Ran``. The text
        between the two is returned with surrounding whitespace stripped. If
        the ``### Ran`` section is missing, everything after ``### Result`` is
        returned.

        Raises:
            ValueError: If the response contains no ``### Result`` section.
        """
        raw_response = self.call_tool("browser_evaluate", {"function": js})
        _, marker, after_marker = raw_response.partition("### Result")
        if not marker:
            # Fail with the offending response instead of an opaque unpacking error.
            raise ValueError(
                f"browser_evaluate response has no '### Result' section: {raw_response[:200]!r}"
            )
        # partition() leaves the text unchanged when the trailer is absent.
        result_str, _, _ = after_marker.partition("\n### Ran")
        return result_str.strip()
src/agentlab/benchmarks/miniwob/__init__.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
from .benchmark import MiniWobBenchmark
2+
from .task import MiniWobTask
3+
4+
__all__ = ["MiniWobBenchmark", "MiniWobTask"]
src/agentlab/benchmarks/miniwob/benchmark.py

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
import logging
2+
from dataclasses import dataclass
3+
from pathlib import Path
4+
from typing import Any
5+
6+
from pydantic import ConfigDict
7+
8+
from agentlab.backends.browser.base import BrowserBackend
9+
from agentlab.backends.browser.env import BrowserEnv
10+
from agentlab.benchmarks.abstract_env import AbstractBenchmark, AbstractEnvArgs
11+
from agentlab.benchmarks.miniwob.task import MiniWobTask, get_miniwob_tasks
12+
13+
logger = logging.getLogger(__name__)
14+
15+
16+
@dataclass
class MiniwobArgs(AbstractEnvArgs):
    """Everything needed to build a ``BrowserEnv`` for one MiniWob task."""

    task: MiniWobTask
    task_seed: int
    task_name: str
    backend: BrowserBackend

    # ``@dataclass`` does not replace an ``__init__`` defined in the class body,
    # so this constructor (with ``task_seed`` defaulting to 0) is the one used.
    def __init__(
        self,
        task_name: str,
        task: MiniWobTask,
        backend: BrowserBackend,
        task_seed: int = 0,
    ):
        self.backend = backend
        self.task_seed = task_seed
        self.task = task
        self.task_name = task_name

    def make_env(self, exp_dir: Path, action_mapping=None) -> BrowserEnv:
        """Create the environment for this task.

        ``exp_dir`` and ``action_mapping`` are accepted but not used.
        """
        return BrowserEnv(
            seed=self.task_seed,
            backend=self.backend,
            task=self.task,
            task_name=self.task_name,
        )
32+
33+
34+
class MiniWobBenchmark(AbstractBenchmark):
    # Benchmark that builds one MiniwobArgs entry per MiniWob task, all sharing `backend`.
    # The backend is not a pydantic type, hence arbitrary_types_allowed.
    model_config = ConfigDict(arbitrary_types_allowed=True)

    backend: BrowserBackend
    name: str = "miniwob"
    # Both lists are filled in by model_post_init.
    env_args_list: list[MiniwobArgs] = None  # type: ignore
    dataset: list[MiniWobTask] = None  # type: ignore

    def model_post_init(self, __context: Any) -> None:
        """Load the default task list if none was given and build ``env_args_list``."""
        if self.dataset is None:
            self.dataset = get_miniwob_tasks()
        self.env_args_list = [
            MiniwobArgs(task_name=f"miniwob.{task.task_id}", task=task, backend=self.backend)
            for task in self.dataset
        ]
        logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks")

0 commit comments

Comments
 (0)