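fixes: use AsyncPlaywright backend in the MiniWob run script, exclude ControlFlow steps from collected thoughts, tighten the ReAct prompt wording, add ToolCallAction.llm_view, and record the serialized action in step_info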

ollmer · ollmer · commit 8be928a27312 · 2025-11-18T19:49:02.000Z
diff --git a/experiments/run_miniwob.py b/experiments/run_miniwob.py
@@ -4,11 +4,11 @@
 from bgym import DEFAULT_BENCHMARKS
 from dotenv import load_dotenv
 
-from agentlab.agents.generic_agent.agent_configs import FLAGS_GPT_4o
+from agentlab.agents.generic_agent.agent_configs import GPT5_MINI_FLAGS
 from agentlab.agents.generic_agent.generic_agent import GenericAgentArgs
 from agentlab.agents.tapeagent.agent import TapeAgentArgs, load_config
 from agentlab.backends.browser.mcp_playwright import MCPPlaywright
-from agentlab.backends.browser.playwright import PlaywrightSyncBackend
+from agentlab.backends.browser.playwright import AsyncPlaywright
 from agentlab.benchmarks.miniwob import MiniWobBenchmark
 from agentlab.experiments.study import make_study
 from agentlab.llm.llm_configs import CHAT_MODEL_ARGS_DICT
@@ -22,11 +22,12 @@
     config = load_config("miniwob")
 
     # benchmark = DEFAULT_BENCHMARKS["miniwob"](n_repeats=1)
-    benchmark = MiniWobBenchmark(backend=MCPPlaywright())
+    # benchmark = MiniWobBenchmark(backend=MCPPlaywright())
+    benchmark = MiniWobBenchmark(backend=AsyncPlaywright())
 
     # agent_args = GenericAgentArgs(
-    #     chat_model_args=CHAT_MODEL_ARGS_DICT["openrouter/openai/gpt-5-mini"],
-    #     flags=FLAGS_GPT_4o,
+    #     chat_model_args=CHAT_MODEL_ARGS_DICT["azure/gpt-5-mini-2025-08-07"],
+    #     flags=GPT5_MINI_FLAGS,
     # )
     # agent_args.flags.obs.use_ax_tree = False
     # agent_args.flags.obs.use_html = True
diff --git a/src/agentlab/agents/tapeagent/agent.py b/src/agentlab/agents/tapeagent/agent.py
@@ -12,6 +12,7 @@
 from tapeagents.agent import Agent
 from tapeagents.core import (
     Action,
+    ControlFlow,
     LLMOutputParsingFailureAction,
     Observation,
     SetNextNode,
@@ -251,7 +252,7 @@ def get_action(
                 if not event.step:
                     continue
                 self.tape = self.tape.append(event.step)
-                if isinstance(event.step, Thought):
+                if isinstance(event.step, Thought) and not isinstance(event.step, ControlFlow):
                     thoughts.append(event.step)
                     logger.info(f"Thought: {event.step.llm_view()}")
                 elif isinstance(event.step, Action) and not action:  # we use first action only
diff --git a/src/agentlab/agents/tapeagent/conf/agent/react.yaml b/src/agentlab/agents/tapeagent/conf/agent/react.yaml
@@ -10,15 +10,15 @@ nodes:
     system_prompt: |
       You are an expert AI Agent trained to assist users with complex web tasks.
       Your role is to understand the goal, perform actions until the goal is accomplished and respond in a helpful and accurate manner.
-      Keep your replies concise and direct. Prioritize clarity and avoid over-elaboration.
+      Keep your replies brief, concise, direct and on topic. Prioritize clarity and avoid over-elaboration.
       Do not express emotions or opinions.
     guidance: |
       Think along the following lines:
         1. Summarize the last observation and describe the visible changes in the state.
-        2. Evaluate action success, explain impact on task/plan. 
-        3. If there are any errors, describe them and propose alternative.
-        4. List next steps to move towards the goaland propose next immediate action.
-      The produce the function call that performs the proposed step. If the task is complete, produce the final step.
+        2. Evaluate action success, explain impact on task and next steps.
+        3. If you see any errors in the last observation, think about it. If there is no error, just move on.
+        4. List next steps to move towards the goal and propose next immediate action.
+      Then produce the function call that performs the proposed action. If the task is complete, produce the final step.
     steps:
       - tapeagents.core.FinalStep
     next_node: react
diff --git a/src/agentlab/backends/browser/base.py b/src/agentlab/backends/browser/base.py
@@ -40,6 +40,9 @@ class ToolCallAction(BaseModel):
     id: str = ""
     function: FunctionCall
 
+    def llm_view(self, **kwargs) -> str:
+        return self.model_dump_json(indent=2)
+
 
 class ToolSpec(BaseModel):
     """
diff --git a/src/agentlab/experiments/loop.py b/src/agentlab/experiments/loop.py
@@ -23,6 +23,7 @@
 from browsergym.experiments.utils import count_tokens
 from dataclasses_json import DataClassJsonMixin
 from PIL import Image
+from pydantic import BaseModel
 from tqdm import tqdm
 
 from agentlab.backends.browser.env import BrowserEnvArgs
@@ -411,6 +412,7 @@ def run(self):
                 logger.debug(f"Starting step {step_info.step}.")
                 step_info.profiling.agent_start = time.time()
                 action, step_info.agent_info = agent.get_action(step_info.obs.copy())
+                step_info.action = action.model_dump_json(indent=2) if isinstance(action, BaseModel) else str(action)
                 step_info.profiling.agent_stop = time.time()
                 if step_info.agent_info.get("think", None):
                     logger.info(f"Agent thought: {step_info.agent_info['think']}")