Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions experiments/run_miniwob.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import logging
import os

from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config
from agentlab.backends.browser.mcp_playwright import MCPPlaywright
from agentlab.benchmarks.miniwob import MiniWobBenchmark
from agentlab.experiments.study import make_study

# Log to the console with timestamp, level, logger name, line number and function name.
fmt = "%(asctime)s - %(levelname)s - %(name)s:%(lineno)d - %(funcName)s() - %(message)s"
logging.basicConfig(
    level=logging.INFO,
    force=True,
    format=fmt,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

if __name__ == "__main__":
    config = load_config("miniwob")
    study = make_study(
        benchmark=MiniWobBenchmark(backend=MCPPlaywright()),
        agent_args=TapeAgentArgs(agent_name=config.name, config=config),
        comment=config.comment,
        logging_level=logging.INFO,
        logging_level_stdout=logging.INFO,
    )

    debug_mode = bool(os.environ.get("AGENTLAB_DEBUG"))
    if debug_mode:
        # Debug runs keep only the first three experiments and run them one by one.
        study.exp_args_list = study.exp_args_list[:3]
        run_kwargs = {"n_jobs": 1, "parallel_backend": "sequential"}
    else:
        # Regular runs take the parallelism settings from the loaded config.
        run_kwargs = {"n_jobs": config.n_jobs, "parallel_backend": config.parallel_backend}
    study.run(n_relaunch=1, **run_kwargs)
42 changes: 42 additions & 0 deletions experiments/test_mcp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from tapeagents.environment import FunctionCall
from tapeagents.mcp import ToolCallAction

from agentlab.backends.browser.mcp_playwright import MCPPlaywright
from agentlab.benchmarks.miniwob.task import get_miniwob_tasks


def main():
    """Smoke-test the Playwright MCP backend on the first MiniWob task.

    Navigates to the task URL, runs the task's setup JavaScript, then runs the
    task's validation JavaScript, printing the backend response after each
    step.
    """
    tasks = get_miniwob_tasks()
    task = tasks[0]
    setup_js = task.get_setup_js()

    backend = MCPPlaywright()
    # The MCP environment is only created inside initialize(); without this
    # call the backend has no MCP connection and the first tool call fails.
    backend.initialize()
    print("=" * 100)

    # 1. goto task url
    print("URL: ", task.url)
    obs = backend.call_tool("browser_navigate", {"url": task.url})
    print("------")
    print(obs)
    print("-" * 100)

    # 2. eval js
    obs = backend.run_js(setup_js)
    print("------")
    print(obs)
    print("-" * 100)

    # 3. validate
    print("\n\nVALIDATE")
    js = task.get_task_validate_js()
    print(js)
    obs = backend.run_js(js)
    print("------")
    print(obs)
    print("-" * 100)


if __name__ == "__main__":
    main()




9 changes: 7 additions & 2 deletions src/agentlab/agents/tapeagent/agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from tapeagents.agent import Agent
from tapeagents.core import Action, Observation, StopStep, TapeMetadata, Thought
from tapeagents.core import Tape as BaseTape
from tapeagents.tool_calling import ToolSpec

from agentlab.agents.agent_args import AgentArgs

Expand Down Expand Up @@ -40,8 +41,12 @@ def load_config(config_name: str) -> DictConfig:
class TapeAgentArgs(AgentArgs):
    """Agent arguments that build a TapeAgent from a Hydra config."""

    config: DictConfig = None  # type: ignore

    def make_agent(self, known_actions: tuple[ToolSpec, ...] | None = None) -> bgym.Agent:
        """Instantiate the configured agent and wrap it in a TapeAgent.

        Args:
            known_actions: Tool specs the agent may call. When None (the
                default, so existing zero-argument callers keep working) the
                agent is instantiated from the config alone. Otherwise the
                specs and a newline-joined description of them are passed to
                the agent constructor as overrides.

        Returns:
            A TapeAgent wrapping the instantiated agent.
        """
        if known_actions is None:
            agent: Agent = hydra.utils.instantiate(self.config.agent)
        else:
            tools_description = "\n".join(action.description() for action in known_actions)
            agent = hydra.utils.instantiate(
                self.config.agent,
                known_actions=known_actions,
                tools_description=tools_description,
            )
        return TapeAgent(agent=agent)


Expand Down
6 changes: 6 additions & 0 deletions src/agentlab/agents/tapeagent/conf/llm/gpt5-mini.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
_target_: tapeagents.llms.LiteLLM
model_name: gpt-5-mini-2025-08-07
use_cache: true
context_size: 128000
parameters:
temperature: 1.0
9 changes: 9 additions & 0 deletions src/agentlab/agents/tapeagent/conf/miniwob.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
defaults:
- llm: gpt5-mini
- agent: plan_act
- _self_

name: miniwob_tapeagent
comment: MiniWob TapeAgent
parallel_backend: ray
n_jobs: 32
44 changes: 44 additions & 0 deletions src/agentlab/backends/browser/base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
from pydantic import BaseModel
from tapeagents.mcp import MCPEnvironment
from tapeagents.tool_calling import FunctionCall, ToolCallAction, ToolSpec


class BrowserBackend(BaseModel):
    """Interface that browser backends implement.

    Every method raises NotImplementedError here; subclasses override the
    ones they support.
    """

    def initialize(self) -> None:
        """Prepare the backend for use."""
        raise NotImplementedError

    def run_js(self, js: str):
        """Run the given JavaScript source and return its result."""
        raise NotImplementedError

    def goto(self, url: str) -> str:
        """Navigate to ``url`` and return the resulting page content as text."""
        raise NotImplementedError

    def step(self, action: ToolCallAction) -> str:
        """Execute one tool-call action and return the resulting text."""
        raise NotImplementedError

    def actions(self) -> tuple[ToolSpec, ...]:
        """Return the tool specs this backend exposes."""
        raise NotImplementedError



class MCPBrowserBackend(BrowserBackend):
    """Browser backend that forwards tool calls to an MCP environment.

    The MCP environment is created lazily by ``initialize()``; until then
    ``_mcp`` is None and tool calls are rejected with a RuntimeError.
    """

    # Path to the MCP server configuration handed to MCPEnvironment.
    config_path: str
    _mcp = None

    def initialize(self) -> None:
        """Create and initialize the MCP environment from ``config_path``."""
        mcp_env = MCPEnvironment(config_path=self.config_path)
        mcp_env.initialize()
        # Publish the environment only after it initialized successfully.
        self._mcp = mcp_env

    def step(self, action: ToolCallAction) -> str:
        """Execute ``action`` on the MCP environment and return its text output."""
        return self._call_mcp(action)

    def call_tool(self, tool_name: str, arguments: dict) -> str:
        """Call the MCP tool ``tool_name`` with ``arguments`` and return its text output."""
        return self._call_mcp(ToolCallAction(function=FunctionCall(name=tool_name, arguments=arguments)))

    def _require_mcp(self):
        """Return the MCP environment, raising if ``initialize()`` was not called."""
        if self._mcp is None:
            raise RuntimeError("MCP backend is not initialized; call initialize() first")
        return self._mcp

    def _call_mcp(self, action: ToolCallAction) -> str:
        """Run ``action`` and join the text parts of the result with blank lines."""
        tool_result = self._require_mcp().step(action)
        # Results may carry non-text items (for example images) that have no
        # ``text`` attribute; only the text parts are returned.
        texts = [
            item.text
            for item in tool_result.content.content
            if getattr(item, "text", None) is not None
        ]
        return "\n\n".join(texts)

    def actions(self) -> tuple[ToolSpec, ...]:
        """Return the tool specs reported by the MCP environment."""
        return self._require_mcp().actions()
112 changes: 112 additions & 0 deletions src/agentlab/backends/browser/env.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import logging
import time
from dataclasses import dataclass
from pathlib import Path
from typing import Literal

from tapeagents.core import Action, Observation, StopStep
from tapeagents.tool_calling import ToolCallAction, ToolSpec

from agentlab.backends.browser.base import BrowserBackend
from agentlab.benchmarks.abstract_env import AbstractEnv, AbstractEnvArgs
from agentlab.benchmarks.miniwob.task import AbstractWebTask

logger = logging.getLogger(__name__)

class GoalObservation(Observation):
    # Observation carrying the goal text of a task.
    kind: Literal["goal_observation"] = "goal_observation"
    # Goal text; BrowserEnv.reset fills it with the result of the task setup script.
    goal: str

class PageObservation(Observation):
    # Observation carrying the textual page content returned by the browser backend.
    kind: Literal["page_observation"] = "page_observation"
    # Text returned by the backend after navigating or executing an action.
    content: str


class BrowserEnv(AbstractEnv):
    """Environment that runs one web task through a browser backend.

    The backend navigates and executes tool calls; the task supplies the
    JavaScript used for setup, validation and teardown.
    """

    def __init__(
        self,
        task_name: str,
        task: AbstractWebTask,
        backend: BrowserBackend,
        seed: int = 0,
        max_turns: int | None = None,
    ):
        """Store the task and backend, then initialize the backend.

        Args:
            task_name: Name of the task.
            task: Task providing the URL and the setup/validate/teardown scripts.
            backend: Browser backend; ``initialize()`` is called on it here.
            seed: Task seed.
            max_turns: Optional cap on the number of steps before the episode
                is reported as truncated.
        """
        self.task_name = task_name
        self.task = task
        self.seed = seed
        self._turns = 0
        # NOTE(review): ``max_turns`` was read in step() but never assigned in
        # this class. It is only set here when given explicitly, so a value
        # that may be provided by AbstractEnv is not overwritten - confirm.
        if max_turns is not None:
            self.max_turns = max_turns
        self.backend = backend
        self.backend.initialize()

    def reset(self, seed: int):
        """Open the task URL, run the task setup script and return the first observations.

        Returns:
            A pair of ``[GoalObservation, PageObservation]`` and an empty info dict.
        """
        self.seed = seed
        # A reset starts a new episode, so the turn counter starts over.
        self._turns = 0
        logger.info(f"Open task URL: {self.task.url}")
        page_content = self.backend.goto(self.task.url)
        # Default to an empty goal so a task without setup script does not
        # leave the goal text undefined.
        js_result_str = ""
        setup_js = self.task.get_setup_js()
        if setup_js:
            js_result_str = self.backend.run_js(setup_js)
            logger.info(f"Task reset result: {js_result_str}")
        return [GoalObservation(goal=js_result_str), PageObservation(content=page_content)], {}

    def step(self, action: ToolCallAction) -> tuple[Observation, float | None, bool, bool, dict]:
        """Execute one action and report observation, reward and episode status.

        Returns:
            ``(observation, reward, finished, truncated, env_info)``. ``reward``
            is None on steps where no validation was run.
        """
        finished = isinstance(action, StopStep)
        # A stop step is not a tool call and has no ``function``; fall back to
        # the class name instead of failing before the stop is handled.
        function = getattr(action, "function", None)
        action_name = getattr(function, "name", None) or type(action).__name__
        logger.info(f"BrowserEnv.step() called with action {action_name}")

        action_exec_start = time.time()
        if finished:
            observation = Observation()  # empty observation
        else:
            observation = self._step(action)
        action_exec_stop = time.time()
        self._turns += 1

        # Without a configured turn limit the episode is never truncated.
        max_turns = getattr(self, "max_turns", None)
        truncated = max_turns is not None and self._turns >= max_turns

        if self.task.validate_per_step or finished or truncated:
            reward = self.calculate_reward(action, observation)
        else:
            reward = None

        env_info = {
            "step_metadata": observation.metadata,
            "action_exec_start": action_exec_start,
            "action_exec_stop": action_exec_stop,
            "action_exec_timeout": 0.0,
        }
        obs_view = observation.short_view() if isinstance(observation, Observation) else observation
        logger.info(f"Action result in observation: {obs_view}")
        return observation, reward, finished, truncated, env_info

    def _step(self, action: ToolCallAction) -> PageObservation:
        """Run ``action`` on the backend and wrap the returned text in a PageObservation."""
        tool_result = self.backend.step(action)
        return PageObservation(content=tool_result)

    def calculate_reward(self, action: Action, observation: PageObservation) -> float:
        """Run the task's step-validation script in the browser and parse the reward."""
        validate_js = self.task.get_step_validate_js()
        validate_result = self.backend.run_js(validate_js)
        # parse_validation_result returns a pair; only the reward is used here.
        reward, _ = self.task.parse_validation_result(validate_result)
        return reward

    def close(self):
        """Run the task's teardown script, if it has one."""
        teardown_js = self.task.get_teardown_js()
        if teardown_js:
            js_result_str = self.backend.run_js(teardown_js)
            logger.info(f"Task teardown result: {js_result_str}")

    def actions(self) -> list[ToolSpec]:
        """Return the backend's tool specs after filtering by the task."""
        all_actions = self.backend.actions()
        filtered_actions = self.task.filter_actions(all_actions)
        logger.info(f"Filtered {len(filtered_actions)} actions out of {len(all_actions)} for task {self.task.dataset}")
        return filtered_actions


@dataclass
class BrowserEnvArgs(AbstractEnvArgs):
    """Arguments needed to build a BrowserEnv for one task."""

    task: AbstractWebTask
    task_seed: int
    task_name: str
    backend: BrowserBackend

    def __init__(
        self,
        task_name: str,
        task: AbstractWebTask,
        backend: BrowserBackend,
        task_seed: int = 0,
    ):
        """Store the task, its name, the seed and the backend to use."""
        self.task_name, self.task = task_name, task
        self.task_seed, self.backend = task_seed, backend

    def make_env(self, exp_dir: Path) -> BrowserEnv:
        """Build the BrowserEnv for this task; ``exp_dir`` is not used."""
        return BrowserEnv(
            task_name=self.task_name,
            task=self.task,
            backend=self.backend,
            seed=self.task_seed,
        )

17 changes: 17 additions & 0 deletions src/agentlab/backends/browser/mcp_playwright.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
{
"mcpServers": {
"playwright": {
"command": "npx",
"args": [
"@playwright/mcp@latest",
"--browser",
"chromium",
"--headless",
"--isolated"
],
"env": {
"PLAYWRIGHT_BROWSERS_PATH": ""
}
}
}
}
30 changes: 30 additions & 0 deletions src/agentlab/backends/browser/mcp_playwright.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import logging
from pathlib import Path

from tapeagents.tool_calling import ToolCallAction

from agentlab.backends.browser.base import MCPBrowserBackend

logger = logging.getLogger(__name__)

# Resolve the bundled MCP config next to this module so the default works
# regardless of the process working directory (the previous value was
# relative to the repository root).
DEFAULT_CONFIG_PATH = str(Path(__file__).resolve().with_name("mcp_playwright.json"))


class MCPPlaywright(MCPBrowserBackend):
    """Browser backend backed by the Playwright MCP server."""

    config_path: str = DEFAULT_CONFIG_PATH

    def run_js(self, js: str):
        """Evaluate ``js`` via the ``browser_evaluate`` tool and return the result text.

        The tool response is expected to contain a ``### Result`` section,
        optionally followed by a ``### Ran`` section; the stripped text
        between the two is returned.

        Raises:
            ValueError: If the response has no ``### Result`` section.
        """
        raw_response = self.call_tool("browser_evaluate", {"function": js})
        _, result_header, after_header = raw_response.partition("### Result")
        if not result_header:
            # Same exception type as the failed unpacking it replaces, but
            # with the offending response included for diagnosis.
            raise ValueError(
                f"browser_evaluate response has no '### Result' section: {raw_response[:500]!r}"
            )
        # If the trailing section is absent, everything after the header is the result.
        result_str, _, _ = after_header.partition("\n### Ran")
        return result_str.strip()

    def step(self, action: ToolCallAction) -> str:
        """Execute ``action``, log its output, and return a fresh page snapshot."""
        tool_result = self._call_mcp(action)
        logger.info(f"Tool result: {tool_result}")
        snapshot = self.call_tool("browser_snapshot", {})
        return snapshot

    def goto(self, url: str) -> str:
        """Navigate to ``url`` via ``browser_navigate`` and return the tool output."""
        tool_result = self.call_tool("browser_navigate", {"url": url})
        return tool_result
4 changes: 4 additions & 0 deletions src/agentlab/benchmarks/miniwob/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from .benchmark import MiniWobBenchmark
from .task import MiniWobTask

__all__ = ["MiniWobBenchmark", "MiniWobTask"]
26 changes: 26 additions & 0 deletions src/agentlab/benchmarks/miniwob/benchmark.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import logging
from typing import Any

from agentlab.backends.browser.base import BrowserBackend
from agentlab.backends.browser.env import BrowserEnvArgs
from agentlab.benchmarks.abstract_env import AbstractBenchmark
from agentlab.benchmarks.miniwob.task import MiniWobTask, get_miniwob_tasks

logger = logging.getLogger(__name__)


class MiniWobBenchmark(AbstractBenchmark):
    """MiniWob benchmark that creates one BrowserEnvArgs per task on a shared backend."""

    backend: BrowserBackend
    name: str = "miniwob"
    env_args_list: list[BrowserEnvArgs] = None  # type: ignore
    dataset: list[MiniWobTask] = None  # type: ignore

    def model_post_init(self, __context: Any) -> None:
        """Load the tasks if no dataset was given and build the env args list."""
        self.env_args_list = []
        if self.dataset is None:
            self.dataset = get_miniwob_tasks()
        # Each task is named "miniwob.<task_id>" and uses the benchmark's backend.
        self.env_args_list.extend(
            BrowserEnvArgs(task_name=f"miniwob.{task.task_id}", task=task, backend=self.backend)
            for task in self.dataset
        )
        logger.info(f"Loaded {len(self.env_args_list)} miniwob tasks")
Loading
Loading