diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py
new file mode 100644
index 00000000..f860ae9f
--- /dev/null
+++ b/experiments/run_miniwob.py
@@ -0,0 +1,26 @@
+import logging
+import os
+
+from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config
+from agentlab.backends.browser.mcp_playwright import MCPPlaywright
+from agentlab.benchmarks.miniwob import MiniWobBenchmark
+from agentlab.experiments.study import make_study
+
+fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
+logging.basicConfig(level=logging.INFO, force=True, format=fmt, handlers=[logging.StreamHandler()])
+logger = logging.getLogger(__name__)
+
+if __name__ == "__main__":
+    config = load_config("miniwob")
+    study = make_study(
+        benchmark=MiniWobBenchmark(backend=MCPPlaywright()),
+        agent_args=TapeAgentArgs(agent_name=config.name, config=config),
+        comment=config.comment,
+        logging_level=logging.INFO,
+        logging_level_stdout=logging.INFO,
+    )
+    if os.environ.get("AGENTLAB_DEBUG"):
+        study.exp_args_list = study.exp_args_list[:3]
+        study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential")
+    else:
+        study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend)
diff --git a/experiments/test_mcp.py b/experiments/test_mcp.py
new file mode 100644
index 00000000..34b9734a
--- /dev/null
+++ b/experiments/test_mcp.py
@@ -0,0 +1,37 @@
+from agentlab.backends.browser.mcp_playwright import MCPPlaywright
+from agentlab.benchmarks.miniwob.task import get_miniwob_tasks
+
+
+def main():
+    tasks = get_miniwob_tasks()
+    task = tasks[0]
+    setup_js = task.get_setup_js()
+
+    backend = MCPPlaywright()
+    backend.initialize()  # start the MCP Playwright server before issuing tool calls
+    print("="*100)
+    # 1. goto task url
+    print("URL: ", task.url)
+    obs = backend.call_tool("browser_navigate", {"url": task.url})
+    print("------")
+    print(obs)
+    print("-"*100)
+
+    # 2. eval js
+    obs = backend.run_js(setup_js)
+    print("------")
+    print(obs)
+    print("-"*100)
+
+    # 3. validate
+    print("\n\nVALIDATE")
+    js = task.get_task_validate_js()
+    print(js)
+    obs = backend.run_js(js)
+    print("------")
+    print(obs)
+    print("-"*100)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py
index eefda1d1..4844d46b 100644
--- a/src/agentlab/agents/tapeagent/agent.py
+++ b/src/agentlab/agents/tapeagent/agent.py
@@ -9,6 +9,7 @@
 from tapeagents.agent import Agent
 from tapeagents.core import Action, Observation, StopStep, TapeMetadata, Thought
 from tapeagents.core import Tape as BaseTape
+from tapeagents.tool_calling import ToolSpec
 
 from agentlab.agents.agent_args import AgentArgs
 
@@ -40,8 +41,14 @@ def load_config(config_name: str) -> DictConfig:
 
 class TapeAgentArgs(AgentArgs):
     config: DictConfig = None  # type: ignore
 
-    def make_agent(self) -> bgym.Agent:
-        agent: Agent = hydra.utils.instantiate(self.config.agent)
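+    # Optional tool specs provided by the environment (e.g. MCP browser tools); when
+    # given, they are passed to the agent together with a plain-text description of each tool.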
+    def make_agent(self, known_actions: tuple[ToolSpec, ...] | None = None) -> bgym.Agent:
+        if known_actions is None:
+            agent = hydra.utils.instantiate(self.config.agent)
+        else:
+            tools_description = "\n".join([action.description() for action in known_actions])
+            agent = hydra.utils.instantiate(self.config.agent, known_actions=known_actions, tools_description=tools_description)
         return TapeAgent(agent=agent)
 
diff --git a/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml
new file mode 100644
index 00000000..84dbe3b3
--- /dev/null
+++ b/src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml
@@ -0,0 +1,6 @@
+_target_: tapeagents.llms.LiteLLM
+model_name: gpt-5-mini-2025-08-07
+use_cache: true
+context_size: 128000
+parameters:
+  temperature: 1.0
\ No newline at end of file
diff --git a/src/agentlab/agents/tapeagent/conf/miniwob.yaml b/src/agentlab/agents/tapeagent/conf/miniwob.yaml
new file mode 100644
index 00000000..acc2c655
--- /dev/null
+++ b/src/agentlab/agents/tapeagent/conf/miniwob.yaml
@@ -0,0 +1,9 @@
+defaults:
+  - llm: gpt5-mini
+  - agent: plan_act
+  - _self_
+
+name: miniwob_tapeagent
+comment: MiniWob TapeAgent
+parallel_backend: ray
+n_jobs: 32
\ No newline at end of file
diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py
new file mode 100644
index 00000000..636eb3fe
--- /dev/null
+++ b/src/agentlab/backends/browser/base.py
@@ -0,0 +1,44 @@
+from pydantic import BaseModel
+from tapeagents.mcp import MCPEnvironment
+from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec
+
+
+class BrowserBackend(BaseModel):
+    def initialize(self) -> None:
+        raise NotImplementedError
+
+    def run_js(self, js: str):
+        raise NotImplementedError
+
+    def goto(self, url: str) -> str:
+        raise NotImplementedError
+
+    def step(self, action: ToolCallAction) -> str:
+        raise NotImplementedError
+
+    def actions(self) -> tuple[ToolSpec, ...]:
+        raise NotImplementedError
+
+
+
+class MCPBrowserBackend(BrowserBackend):
+    config_path: str
+    _mcp = None
+
+    def initialize(self) -> None:
+        self._mcp = MCPEnvironment(config_path=self.config_path)
+        self._mcp.initialize()
+
+    def step(self, action: ToolCallAction) -> str:
+        return self._call_mcp(action)
+
+    def call_tool(self, tool_name: str, arguments: dict) -> str:
+        return self._call_mcp(ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments)))
+
+    def _call_mcp(self, action: ToolCallAction) -> str:
+        tool_result = self._mcp.step(action)
+        texts = [c.text for c in tool_result.content.content]
+        return "\n\n".join(texts)
+
+    def actions(self) -> tuple[ToolSpec, ...]:
+        return self._mcp.actions()
\ No newline at end of file
diff --git a/src/agentlab/backends/browser/env.py b/src/agentlab/backends/browser/env.py
new file mode 100644
index 00000000..9661fde5
--- /dev/null
+++ b/src/agentlab/backends/browser/env.py
@@ -0,0 +1,113 @@
+import logging
+import time
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Literal
+
+from tapeagents.core import Action, Observation, StopStep
+from tapeagents.tool_calling import ToolCallAction, ToolSpec
+
+from agentlab.backends.browser.base import BrowserBackend
+from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs
+from agentlab.benchmarks.web_task import AbstractWebTask
+
+logger = logging.getLogger(__name__)
+
+class GoalObservation(Observation):
+    kind: Literal["goal_observation"] = "goal_observation"
+    goal: str
+
+class PageObservation(Observation):
+    kind: Literal["page_observation"] = "page_observation"
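+    # text snapshot of the current page as returned by the browser backend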
+    content: str
+
+
+class BrowserEnv(AbstractEnv):
+    def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, seed: int = 0, max_turns: int = 20):
+        self.task_name = task_name
+        self.task = task
+        self.seed = seed
+        self.max_turns = max_turns  # episode step budget; the default of 20 is an assumed value
+        self._turns = 0
+        self.backend = backend
+        self.backend.initialize()
+
+    def reset(self, seed: int):
+        self.seed = seed
+        logger.info(f"Open task URL: {self.task.url}")
+        page_content = self.backend.goto(self.task.url)
+        setup_js = self.task.get_setup_js()
+        js_result_str = self.backend.run_js(setup_js) if setup_js else ""
+        logger.info(f"Task reset result: {js_result_str}")
+        return [GoalObservation(goal=js_result_str), PageObservation(content=page_content)], {}
+
+    def step(self, action: ToolCallAction) -> tuple[Observation, float, bool, bool, dict]:
+        logger.info(f"BrowserEnv.step() called with action {action.function.name}")
+
+        action_exec_start = time.time()
+        finished = isinstance(action, StopStep)
+        if finished:
+            observation = Observation()  # empty observation
+        else:
+            observation = self._step(action)
+        action_exec_stop = time.time()
+        self._turns += 1
+
+        truncated = self._turns >= self.max_turns
+
+        if self.task.validate_per_step or finished or truncated:
+            reward = self.calculate_reward(action, observation)
+        else:
+            reward = None
+
+        env_info = {
+            "step_metadata": observation.metadata,
+            "action_exec_start": action_exec_start,
+            "action_exec_stop": action_exec_stop,
+            "action_exec_timeout": 0.0,
+        }
+        obs_view = observation.short_view() if isinstance(observation, Observation) else observation
+        logger.info(f"Action result in observation: {obs_view}")
+        return observation, reward, finished, truncated, env_info
+
+    def _step(self, action: ToolCallAction) -> PageObservation:
+        tool_result = self.backend.step(action)
+        return PageObservation(content=tool_result)
+
+    def calculate_reward(self, action: Action, observation: PageObservation) -> float:
+        validate_js = self.task.get_step_validate_js()
+        validate_result = self.backend.run_js(validate_js)
+        reward, other = self.task.parse_validation_result(validate_result)
+        return reward
+
+    def close(self):
+        teardown_js = self.task.get_teardown_js()
+        if teardown_js:
+            js_result_str = self.backend.run_js(teardown_js)
+            logger.info(f"Task teardown result: {js_result_str}")
+
+    def actions(self) -> list[ToolSpec]:
+        all_actions = self.backend.actions()
+        filtered_actions = self.task.filter_actions(all_actions)
+        logger.info(f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}")
+        return filtered_actions
+
+
+@dataclass
+class BrowserEnvArgs(AbstractEnvArgs):
+    task: AbstractWebTask
+    task_seed: int
+    task_name: str
+    backend: BrowserBackend
+
+    def __init__(self, task_name: str, task: AbstractWebTask, backend: BrowserBackend, task_seed: int = 0):
+        self.task_name = task_name
+        self.task = task
+        self.task_seed = task_seed
+        self.backend = backend
+
+    def make_env(self, exp_dir: Path) -> BrowserEnv:
+        env = BrowserEnv(task_name=self.task_name, task=self.task, backend=self.backend, seed=self.task_seed)
+        return env
+
diff --git a/src/agentlab/backends/browser/mcp_playwright.json b/src/agentlab/backends/browser/mcp_playwright.json
new file mode 100644
index 00000000..f184712b
--- /dev/null
+++ b/src/agentlab/backends/browser/mcp_playwright.json
@@ -0,0 +1,17 @@
+{
+    "mcpServers": {
+        "playwright": {
+            "command": "npx",
+            "args": [
+                "@playwright/mcp@latest",
+                "--browser",
+                "chromium",
+                "--headless",
+                "--isolated"
+            ],
+            "env": {
+                "PLAYWRIGHT_BROWSERS_PATH": ""
+            }
+        }
+    }
+}
diff --git a/src/agentlab/backends/browser/mcp_playwright.py b/src/agentlab/backends/browser/mcp_playwright.py
new file mode 100644
index 00000000..bb4371b4
--- /dev/null
+++ b/src/agentlab/backends/browser/mcp_playwright.py
@@ -0,0 +1,32 @@
+import logging
+
+from tapeagents.tool_calling import ToolCallAction
+
+from agentlab.backends.browser.base import MCPBrowserBackend
+
+logger = logging.getLogger(__name__)
+
+DEFAULT_CONFIG_PATH = "src/agentlab/backends/browser/mcp_playwright.json"
+
+
+class MCPPlaywright(MCPBrowserBackend):
+    config_path: str = DEFAULT_CONFIG_PATH
+
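+    # browser_evaluate returns a text report; the JS return value sits between the
+    # "### Result" and "### Ran" sections, so slice it out and return it as a string.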
+    def run_js(self, js: str):
+        raw_response = self.call_tool("browser_evaluate", {"function": js})
+        _, half_response = raw_response.split("### Result", maxsplit=1)
+        result_str, _ = half_response.split("\n### Ran", maxsplit=1)
+        result_str = result_str.strip()
+        return result_str
+
+    def step(self, action: ToolCallAction) -> str:
+        tool_result = self._call_mcp(action)
+        logger.info(f"Tool result: {tool_result}")
+        snapshot = self.call_tool("browser_snapshot", {})
+        return snapshot
+
+    def goto(self, url: str) -> str:
+        tool_result = self.call_tool("browser_navigate", {"url": url})
+        return tool_result
diff --git a/src/agentlab/benchmarks/miniwob/__init__.py b/src/agentlab/benchmarks/miniwob/__init__.py
new file mode 100644
index 00000000..558ed21e
--- /dev/null
+++ b/src/agentlab/benchmarks/miniwob/__init__.py
@@ -0,0 +1,4 @@
+from .benchmark import MiniWobBenchmark
+from .task import MiniWobTask
+
+__all__ = ["MiniWobBenchmark", "MiniWobTask"]
\ No newline at end of file
diff --git a/src/agentlab/benchmarks/miniwob/benchmark.py b/src/agentlab/benchmarks/miniwob/benchmark.py
new file mode 100644
index 00000000..2851ef29
--- /dev/null
+++ b/src/agentlab/benchmarks/miniwob/benchmark.py
@@ -0,0 +1,26 @@
+import logging
+from typing import Any
+
+from agentlab.backends.browser.base import BrowserBackend
+from agentlab.backends.browser.env import BrowserEnvArgs
+from agentlab.benchmarks.abstract_env import AbstractBenchmark
+from agentlab.benchmarks.miniwob.task import MiniWobTask, get_miniwob_tasks
+
+logger = logging.getLogger(__name__)
+
+
+class MiniWobBenchmark(AbstractBenchmark):
+    backend: BrowserBackend
+    name: str = "miniwob"
+    env_args_list: list[BrowserEnvArgs] = None  # type: ignore
+    dataset: list[MiniWobTask] = None  # type: ignore
+
+    def model_post_init(self, __context: Any) -> None:
+        self.env_args_list = []
+        if self.dataset is None:
+            self.dataset = get_miniwob_tasks()
+        for task in self.dataset:
+            name = f"miniwob.{task.task_id}"
+            env_args = BrowserEnvArgs(task_name=name, task=task, backend=self.backend)
+            self.env_args_list.append(env_args)
+        logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks")
diff --git a/src/agentlab/benchmarks/miniwob/task.py b/src/agentlab/benchmarks/miniwob/task.py
new file mode 100644
index 00000000..e0d0b3e2
--- /dev/null
+++ b/src/agentlab/benchmarks/miniwob/task.py
@@ -0,0 +1,141 @@
+import os
+from typing import Any, ClassVar
+
+from browsergym.miniwob import ALL_MINIWOB_TASKS
+
+from agentlab.benchmarks.web_task import AbstractWebTask
+
+
+class MiniWobTask(AbstractWebTask):
+    dataset: str = "miniwob"
+    task_id: str
+    desc: str
+    subdomain: str
+    base_url: str | None = None
+    url: str | None = None
+    remove_human_display: bool = True
+    episode_max_time: int = 1000000
+    actions_whitelist: ClassVar[list[str]] = [
+        "browser_press_key",
+        "browser_type",
+        "browser_navigate",
+        "browser_click",
+        "browser_drag",
+        "browser_hover",
+        "browser_select_option",
+    ]
+
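+    # The concrete task URL is derived from the MiniWoB server base URL and the task
+    # subdomain, e.g. "<base_url>/click-button.html".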
+    def model_post_init(self, __context: Any):
+        self.url = f"{self.base_url}/{self.subdomain}.html"
+
+    def get_setup_js(self) -> str:
+        if self.remove_human_display:
+            js = r"""
+let __display_ids = ['reward-display', 'click-canvas', 'sync-task-cover'];
+let __display_divs = {};
+let __query_div_hidden_copy = null;
+
+removeDisplay = function() {
+    core.clearTimer();
+    document.body.removeEventListener('click', core.canvasDrawClick);
+
+    __query_div_hidden_copy = document.getElementById('query').cloneNode(true);
+    document.getElementById('query').innerHTML = '';
+
+    for (i in __display_ids) {
+        elem_id = __display_ids[i];
+        elem = document.getElementById(elem_id);
+        // remove elem from the document
+        elem.remove();
+        // but keep it stored somewhere to bring back later
+        __display_divs[elem_id] = elem;
+    }
+};
+
+bringBackDisplay = function() {
+    document.getElementById('query').innerHTML = __query_div_hidden_copy.innerHTML;
+    for (var elem_id in __display_divs){
+        document.body.appendChild(__display_divs[elem_id]);
+    }
+    core.createDisplay();
+};
+
+core.endEpisode_legacy = core.endEpisode;
+core.startEpisodeReal_legacy = core.startEpisodeReal;
+core.getUtterance_legacy = core.getUtterance;
+
+core.getUtterance = function () {
+    bringBackDisplay();
+    utterance = core.getUtterance_legacy();
+    removeDisplay();
+    return utterance;
+};
+
+core.endEpisode = function(reward, time_proportional, reason){
+    bringBackDisplay();
+    core.endEpisode_legacy(reward, time_proportional, reason);
+    removeDisplay();
+};
+
+core.startEpisodeReal = function() {
+    bringBackDisplay();
+    core.startEpisodeReal_legacy();
+    removeDisplay();
+};
+
+removeDisplay();
+"""
+        else:
+            js = ""
+        js += f"""
+Math.seedrandom(42);
+core.EPISODE_MAX_TIME = {self.episode_max_time};
+core.startEpisodeReal();
+while (!WOB_TASK_READY) {{
+    await new Promise(resolve => setTimeout(resolve, 100));
+}}
+return core.getUtterance();
+    """
+        return f"async () => {{{js}}}"
+
+    def get_teardown_js(self) -> str:
+        return ""
+
+    def get_step_validate_js(self) -> str:
+        return """() => {
+return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY];
+}"""
+
+    def get_task_validate_js(self) -> str:
+        return """() => {
+return [WOB_REWARD_GLOBAL, WOB_RAW_REWARD_GLOBAL, WOB_REWARD_REASON, WOB_DONE_GLOBAL, WOB_EPISODE_ID, WOB_TASK_READY];
+}"""
+
+
+    def parse_validation_result(self, validation_result: str) -> tuple[float, dict]:
+        chunks = [c.strip() for c in validation_result.split(",")]
+        raw_reward = float(chunks[1])
+        done = chunks[3].strip().lower() == "true"  # parse the serialized JS boolean; bool("false") would wrongly be True
+        reward = float(raw_reward > 0)
+        return reward, {
+            "raw_reward": raw_reward,
+            "reward_reason": chunks[2],
+            "done": done,
+        }
+
+def get_miniwob_tasks(base_url: str | None = None, remove_human_display: bool = True, episode_max_time: int = 1000000) -> list[MiniWobTask]:
+    if base_url is None:
+        base_url = os.environ.get("MINIWOB_URL")
+    if base_url is None:
+        raise ValueError("MINIWOB_URL environment variable is not set")
+    return [
+        MiniWobTask(
+            task_id=task.subdomain,
+            desc=task.desc,
+            subdomain=task.subdomain,
+            base_url=base_url,
+            remove_human_display=remove_human_display,
+            episode_max_time=episode_max_time) for task in ALL_MINIWOB_TASKS
+    ]
\ No newline at end of file
diff --git a/src/agentlab/benchmarks/web_task.py b/src/agentlab/benchmarks/web_task.py
new file mode 100644
index 00000000..e8588d54
--- /dev/null
+++ b/src/agentlab/benchmarks/web_task.py
@@ -0,0 +1,30 @@
+from typing import ClassVar
+
+from pydantic import BaseModel
+from tapeagents.tool_calling import ToolSpec
+
+
+class AbstractWebTask(BaseModel):
+    dataset: str
+    url: str
+    validate_per_step: bool = False
+    actions_whitelist: ClassVar[list[str]] = []
+
+    @classmethod
+    def filter_actions(cls, actions: list[ToolSpec]) -> list[ToolSpec]:
+        return [action for action in actions if action.function.name in cls.actions_whitelist]
+
+    def get_setup_js(self) -> str:
+        raise NotImplementedError
+
+    def get_teardown_js(self) -> str:
+        raise NotImplementedError
+
+    def get_task_validate_js(self) -> str:
+        raise NotImplementedError
+
+    def get_step_validate_js(self) -> str:
+        raise NotImplementedError
+
+    def parse_validation_result(self, validate_result: str) -> tuple[float, dict]:
+        raise NotImplementedError
diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py
index de4b976a..82bbc8ab 100644
--- a/src/agentlab/experiments/loop.py
+++ b/src/agentlab/experiments/loop.py
@@ -25,6 +25,8 @@
 from PIL import Image
 from tqdm import tqdm
 
+from agentlab.backends.browser.env import BrowserEnvArgs
+
 try:
     from agentlab.agents.tapeagent import TapeAgent, save_tape
 except ImportError:
@@ -414,19 +416,23 @@ def run(self):
         env, step_info, err_msg, stack_trace = None, None, None, None
         try:
             logger.info(f"Running experiment {self.exp_name} in:\n {self.exp_dir}")
-            agent = self.agent_args.make_agent()
-            if hasattr(agent, "set_task_name"):
-                agent.set_task_name(self.env_args.task_name)
-
-            logger.debug("Agent created.")
-
-            env = self.env_args.make_env(
-                action_mapping=agent.action_set.to_python_code,
-                exp_dir=self.exp_dir,
-                use_raw_page_output=getattr(self.agent_args, "use_raw_page_output", False),
-            )
+            if isinstance(self.env_args, BrowserEnvArgs):
+                env = self.env_args.make_env(exp_dir=self.exp_dir)
+                logger.debug("Environment created.")
+                agent = self.agent_args.make_agent(known_actions=env.actions())
+                logger.debug(f"Agent created with actions: {env.actions()}")
+            else:
+                agent = self.agent_args.make_agent()
+                if hasattr(agent, "set_task_name"):
+                    agent.set_task_name(self.env_args.task_name)
+                logger.debug("Agent created.")
+                env = self.env_args.make_env(
+                    action_mapping=agent.action_set.to_python_code,
+                    exp_dir=self.exp_dir,
+                    use_raw_page_output=getattr(self.agent_args, "use_raw_page_output", False),
+                )
+                logger.debug("Environment created.")
 
-            logger.debug("Environment created.")
             step_info = StepInfo(step=0)
             episode_info = [step_info]
             step_info.from_reset(