
Commit 01e0719

simplest react agent with markdown observations, images and tool calls
1 parent f591f36 commit 01e0719

File tree

5 files changed: +192 −17 lines


experiments/run_miniwob.py

Lines changed: 7 additions & 7 deletions
@@ -22,16 +22,16 @@
 config = load_config("miniwob")

 # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1)
-benchmark = MiniWobBenchmark(backend=PlaywrightSyncBackend())
+benchmark = MiniWobBenchmark(backend=MCPPlaywright())

-agent_args = GenericAgentArgs(
-    chat_model_args=CHAT_MODEL_ARGS_DICT["anthropic/claude-sonnet-4-20250514"],
-    flags=FLAGS_GPT_4o,
-)
+# agent_args = GenericAgentArgs(
+#     chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/gpt-5-mini"],
+#     flags=FLAGS_GPT_4o,
+# )
 # agent_args.flags.obs.use_ax_tree = False
 # agent_args.flags.obs.use_html = True
 # agent_args.flags.obs.use_focused_element = False
-# agent_args =TapeAgentArgs(agent_name=config.name, config=config)
+agent_args = TapeAgentArgs(agent_name=config.name, config=config)


 study = make_study(
@@ -42,7 +42,7 @@
     logging_level_stdout=logging.INFO,
 )
 if os.environ.get("AGENTLAB_DEBUG"):
-    study.exp_args_list = study.exp_args_list[23:24]
+    study.exp_args_list = study.exp_args_list[23:27]
     study.run(n_jobs=1, n_relaunch=1, parallel_backend="sequential")
 else:
     study.run(n_jobs=config.n_jobs, n_relaunch=1, parallel_backend=config.parallel_backend)
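A quick way to exercise the debug branch above from Python; the environment-variable name and the sequential settings come from this diff, while launching the script via subprocess is only an assumption about how it is normally run:

import os
import subprocess

# Run the MiniWob study in debug mode: experiments 23..26, one job, sequential backend.
env = dict(os.environ, AGENTLAB_DEBUG="1")
subprocess.run(["python", "experiments/run_miniwob.py"], env=env, check=True)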

src/agentlab/agents/tapeagent/agent.py

Lines changed: 155 additions & 4 deletions
@@ -1,17 +1,33 @@
 import logging
+import tempfile
 from dataclasses import dataclass
 from typing import Literal

 import bgym
 import hydra
+from litellm import ChatCompletionThinkingBlock
 from omegaconf import DictConfig
+from PIL import Image
 from pydantic import Field
 from tapeagents.agent import Agent
-from tapeagents.core import Action, Observation, StopStep, TapeMetadata, Thought
+from tapeagents.core import (
+    Action,
+    LLMOutputParsingFailureAction,
+    Observation,
+    SetNextNode,
+    StopStep,
+    TapeMetadata,
+    Thought,
+)
 from tapeagents.core import Tape as BaseTape
+from tapeagents.llms import LLMStream
+from tapeagents.nodes import FatalError, StandardNode
+from tapeagents.steps import ImageObservation
 from tapeagents.tool_calling import ToolSpec
+from termcolor import colored

 from agentlab.agents.agent_args import AgentArgs
+from agentlab.backends.browser.base import ToolSpec as AgentlabToolSpec

 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
@@ -27,10 +43,59 @@ class ExtendedMetadata(TapeMetadata):
     other: dict = {}


+class AgentResponse(Thought):
+    kind: Literal["agent_response"] = "agent_response"
+    response: str
+
+
+class AgentThinking(Thought):
+    kind: Literal["agent_thinking"] = "agent_thinking"
+    thinking: str
+
+
 class Tape(BaseTape):
     metadata: ExtendedMetadata = Field(default_factory=ExtendedMetadata) # type: ignore


+class ToolCallNode(StandardNode):
+    use_known_actions: bool = True
+    use_function_calls: bool = True
+
+    def generate_steps(self, agent: Agent, tape: Tape, llm_stream: LLMStream):
+        new_steps = []
+        for event in llm_stream:
+            if event.output.get("reasoning_content"):
+                logger.info(colored(f"LLM reasoning:\n{event.output.reasoning_content}", "yellow"))
+                new_steps.append(AgentThinking(thinking=event.output.reasoning_content))
+            if event.output.get("thinking_blocks"):
+                for block in event.output.thinking_blocks:
+                    if isinstance(block, ChatCompletionThinkingBlock):
+                        logger.info(colored(f"LLM thinking block:\n{block}", "yellow"))
+                        new_steps.append(AgentThinking(thinking=block.content))
+            if event.output.content:
+                logger.info(colored(f"LLM output:\n{event.output.content}", "cyan"))
+                new_steps.append(AgentResponse(response=event.output.content))
+            if event.output.tool_calls:
+                logger.info(colored(f"LLM tool calls:\n{event.output.tool_calls}", "magenta"))
+                new_steps += [
+                    self.tool_call_to_step(agent, tool_call)
+                    for tool_call in event.output.tool_calls
+                ]
+        for step in new_steps:
+            yield step
+            if isinstance(step, LLMOutputParsingFailureAction):
+                yield SetNextNode(next_node=self.name)  # loop to the same node to retry
+                break
+        if not new_steps:
+            raise FatalError("No completions!")
+        if (
+            self.next_node
+            and not isinstance(new_steps[-1], StopStep)
+            and not any(isinstance(step, SetNextNode) for step in new_steps)
+        ):
+            yield SetNextNode(next_node=self.next_node)
+
+
 def load_config(config_name: str) -> DictConfig:
     with hydra.initialize(config_path="conf", version_base="1.1"):
         config = hydra.compose(config_name=config_name)
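For intuition, here is a minimal sketch of the mapping generate_steps performs on a single completion. The FakeOutput object is a hypothetical stand-in for event.output (it is not a tapeagents type); AgentThinking and AgentResponse are the classes added in this hunk, and tool calls are omitted for brevity:

# Hypothetical stand-in for one completion message; real events come from an LLMStream.
class FakeOutput(dict):
    def __getattr__(self, name):
        return self[name]

output = FakeOutput(
    reasoning_content="The search box has bid 12; typing the query there satisfies the goal.",
    content="I will type the query into the search box.",
)

# Mirrors the per-event branches of ToolCallNode.generate_steps.
steps = []
if output.get("reasoning_content"):
    steps.append(AgentThinking(thinking=output.reasoning_content))
if output.content:
    steps.append(AgentResponse(response=output.content))
# steps -> [AgentThinking(...), AgentResponse(...)]; any tool_calls would be
# converted via self.tool_call_to_step and appended as action steps.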
@@ -45,8 +110,16 @@ def make_agent(self, actions: tuple[ToolSpec, ...] | None) -> bgym.Agent:
         if actions is None:
             agent = hydra.utils.instantiate(self.config.agent)
         else:
+            tapeagents_actions = [
+                ToolSpec(**tool.model_dump()) if isinstance(tool, AgentlabToolSpec) else tool
+                for tool in actions
+            ]
             tools_description = "\n".join([action.description() for action in actions])
-            agent = hydra.utils.instantiate(self.config.agent, known_actions=actions, tools_description=tools_description)
+            agent = hydra.utils.instantiate(
+                self.config.agent,
+                known_actions=tapeagents_actions,
+                tools_description=tools_description,
+            )
         return TapeAgent(agent=agent)


@@ -64,6 +137,62 @@ class DictObservation(Observation):
     content: str


+class MarkdownObservation(Observation):
+    def llm_view(self, **kwargs) -> str:
+        return f"## Markdown:\n{self.content}"
+
+    def short_view(self, max_chars: int = 100) -> str:
+        return self.llm_view()[:max_chars]
+
+
+class GoalObservation(MarkdownObservation):
+    """
+    Contains task goal
+    """
+
+    kind: Literal["goal_observation"] = "goal_observation" # type: ignore
+    goal: str
+
+    def llm_view(self, **kwargs) -> str:
+        return f"## Goal:\n{self.goal}"
+
+
+class HTMLPage(MarkdownObservation):
+    """
+    Contains page content
+    """
+
+    kind: Literal["html_page"] = "html_page"
+    html: str
+
+    def llm_view(self, **kwargs) -> str:
+        return f"## Page Content:\n{self.html}"
+
+
+class AXTreePage(MarkdownObservation):
+    """
+    Contains accessibility tree
+    """
+
+    kind: Literal["ax_tree_page"] = "ax_tree_page"
+    axtree: str
+
+    def llm_view(self, **kwargs) -> str:
+        return f"## Accessibility Tree:\n{self.axtree}"
+
+
+class ActionResult(MarkdownObservation):
+    """
+    Contains action result
+    """
+
+    kind: Literal["action_result"] = "action_result"
+    result: str
+
+    def llm_view(self, **kwargs) -> str:
+        return f"## Action Result:\n{self.result}"
+
+
 class TapeAgent(bgym.Agent):
     agent: Agent
     tape: Tape
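The markdown observations above exist so that each piece of browser state renders as a small headed section in the prompt. A short sketch of what llm_view and short_view produce (the goal text and tree snippet are made up):

goal = GoalObservation(goal="Click the button labelled 'Submit'.")
tree = AXTreePage(axtree="button 'Submit' [12]\ntextbox 'Name' [7]")

print(goal.llm_view())
# ## Goal:
# Click the button labelled 'Submit'.

print(tree.llm_view())
# ## Accessibility Tree:
# button 'Submit' [12]
# textbox 'Name' [7]

print(tree.short_view())  # same markdown view, truncated to 100 characters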
@@ -73,11 +202,33 @@ def __init__(self, agent: Agent):
         self.agent = agent
         self.tape = Tape(steps=[])

-    def obs_preprocessor(self, obs: Observation | list[Observation]) -> list[Observation]:
+    def obs_preprocessor(self, obs: Observation | list[Observation] | dict) -> list[Observation]:
         if isinstance(obs, Observation):
             obs = [obs]
+        if isinstance(obs, dict):
+            obs_steps = []
+            if obs.get("goal_object"):
+                obs_steps.append(GoalObservation(goal=obs["goal_object"][0]["text"]))
+            if obs.get("action_result"):
+                obs_steps.append(ActionResult(result=obs["action_result"]))
+            if obs.get("pruned_html"):
+                obs_steps.append(HTMLPage(html=obs["pruned_html"]))
+            if obs.get("axtree_txt"):
+                obs_steps.append(AXTreePage(axtree=obs["axtree_txt"]))
+            if obs.get("screenshot"):
+                if isinstance(obs["screenshot"], Image.Image):
+                    tmp_image_path = tempfile.mktemp(suffix=".png")
+                    obs["screenshot"].save(tmp_image_path)
+                    obs_steps.append(ImageObservation(image_path=tmp_image_path))
+                else:
+                    raise ValueError(f"Expected Image.Image, got {type(obs['screenshot'])}")
+            if obs.get("last_action_error"):
+                obs_steps.append(ActionResult(result=f"Action error:\n{obs['last_action_error']}"))
+            assert len(obs_steps) > 0, f"Unknown dict observation, keys: {obs.keys()}"
+            obs = obs_steps
         assert isinstance(obs, list), f"Expected list of Observations, got {type(obs)}"
-        logger.info(f"Observations: {[type(o).__name__ for o in obs]}")
+        obs_view = "\n".join([o.short_view() for o in obs])
+        logger.info(colored(f"Observations:\n{obs_view}", "green"))
         return obs

     def get_action(self, obs: Observation | list[Observation]) -> tuple[Action, TapeAgentInfo]:
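For reference, a sketch of how obs_preprocessor now maps a raw browsergym-style dict observation onto tape steps. The dict below is a made-up minimal example whose keys mirror the ones checked in this hunk, and tape_agent stands for an already constructed TapeAgent:

raw_obs = {
    "goal_object": [{"text": "Click the 'Submit' button."}],
    "axtree_txt": "button 'Submit' [12]",
    "last_action_error": "",  # falsy, so no error ActionResult step is emitted
}

steps = tape_agent.obs_preprocessor(raw_obs)
# -> [GoalObservation(goal="Click the 'Submit' button."),
#     AXTreePage(axtree="button 'Submit' [12]")]
# A PIL screenshot, if present, would be saved to a temporary PNG and wrapped
# in an ImageObservation.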
Lines changed: 24 additions & 0 deletions
@@ -0,0 +1,24 @@
+_target_: tapeagents.agent.Agent
+name: react_agent
+max_iterations: 10
+llms:
+  default: ${llm}
+templates: {}
+nodes:
+  - _target_: agentlab.agents.tapeagent.agent.ToolCallNode
+    name: react
+    system_prompt: |
+      You are an expert AI Agent trained to assist users with complex web tasks.
+      Your role is to understand the goal, perform actions until the goal is accomplished, and respond in a helpful and accurate manner.
+      Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
+      Do not express emotions or opinions.
+    guidance: |
+      Think along the following lines:
+      1. Summarize the last observation and describe the visible changes in the state.
+      2. Evaluate action success and explain the impact on the task/plan.
+      3. If there are any errors, describe them and propose an alternative.
+      4. List the next steps to move towards the goal and propose the next immediate action.
+      Then produce the function call that performs the proposed step. If the task is complete, produce the final step.
+    steps:
+      - tapeagents.core.FinalStep
+    next_node: react
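A hedged sketch of how this react config becomes a live agent through the code added in agent.py. load_config and the hydra instantiation come from this diff; the assumptions are that make_agent is the TapeAgentArgs method shown there and that actions=None selects the no-tools path:

config = load_config("miniwob")             # composes defaults: llm=gpt5-mini, agent=react
args = TapeAgentArgs(agent_name=config.name, config=config)
tape_agent = args.make_agent(actions=None)  # hydra builds an Agent with a single ToolCallNode named "react"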
Lines changed: 2 additions & 2 deletions
@@ -1,6 +1,6 @@
 _target_: tapeagents.llms.LiteLLM
-model_name: gpt-5-mini-2025-08-07
-use_cache: true
+model_name: azure/gpt-5-mini
+use_cache: false
 context_size: 128000
 parameters:
   temperature: 1.0
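The same LLM can be constructed directly rather than through hydra; the keyword names below simply mirror the YAML keys, which hydra.utils.instantiate forwards to the tapeagents.llms.LiteLLM constructor (a sketch, assuming no additional required arguments):

from tapeagents.llms import LiteLLM

llm = LiteLLM(
    model_name="azure/gpt-5-mini",    # litellm routes the azure/ prefix to an Azure deployment
    use_cache=False,
    context_size=128000,
    parameters={"temperature": 1.0},
)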
Lines changed: 4 additions & 4 deletions
@@ -1,9 +1,9 @@
 defaults:
-  - llm: sonnet
-  - agent: plan_react_fcall
+  - llm: gpt5-mini
+  - agent: react
   - _self_

-name: miniwob_tapeagent
-comment: MiniWob TapeAgent
+name: miniwob
+comment: MiniWob Agent
 parallel_backend: ray
 n_jobs: 32
