Fix CUA agent example, add LOCAL agent example, and remove the RootModel wrapper from AgentActionType

derekmeegan · derekmeegan · commit 2c2580bce636 · 2025-10-23T13:31:31.000-07:00
diff --git a/examples/agent_example.py b/examples/agent_example.py
@@ -36,11 +36,8 @@ async def main():
     # Build a unified configuration object for Stagehand
     config = StagehandConfig(
         env="BROWSERBASE",
-        # env="LOCAL",
         api_key=os.getenv("BROWSERBASE_API_KEY"),
         project_id=os.getenv("BROWSERBASE_PROJECT_ID"),
-        model_name="gpt-4o",
-        self_heal=True,
         system_prompt="You are a browser automation assistant that helps users navigate websites effectively.",
         model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
         verbose=2,
@@ -51,12 +48,11 @@ async def main():
 
     # Initialize - this creates a new session automatically.
     console.print("\n🚀 [info]Initializing Stagehand...[/]")
-    await stagehand.init()
-    if stagehand.env == "BROWSERBASE":    
-        console.print(f"\n[yellow]Created new session:[/] {stagehand.session_id}")
-        console.print(
-            f"🌐 [white]View your live browser:[/] [url]https://www.browserbase.com/sessions/{stagehand.session_id}[/]"
-        )
+    await stagehand.init() 
+    console.print(f"\n[yellow]Created new session:[/] {stagehand.session_id}")
+    console.print(
+        f"🌐 [white]View your live browser:[/] [url]https://www.browserbase.com/sessions/{stagehand.session_id}[/]"
+    )
 
     console.print("\n▶️ [highlight] Navigating[/] to Google")
     await stagehand.page.goto("https://google.com/")
diff --git a/examples/agent_example_local.py b/examples/agent_example_local.py
@@ -0,0 +1,102 @@
+import asyncio
+import logging
+import os
+
+from dotenv import load_dotenv
+from rich.console import Console
+from rich.panel import Panel
+from rich.theme import Theme
+
+from stagehand import Stagehand, StagehandConfig, configure_logging
+
+# Named Rich styles referenced by the console markup in this example
+custom_theme = Theme(
+    dict(
+        info="cyan",
+        success="green",
+        warning="yellow",
+        error="red bold",
+        highlight="magenta",
+        url="blue underline",
+    )
+)
+
+# Shared console; markup tags such as [info] resolve through custom_theme
+console = Console(theme=custom_theme)
+
+# Pull settings from a local .env file into the process environment
+# (main() reads its API keys with os.getenv)
+load_dotenv()
+
+# Configure logging with the utility function: INFO for regular logs
+# (DEBUG for detailed) and reduced noise from dependencies
+configure_logging(level=logging.INFO, quiet_dependencies=True)
+
+async def main():
+    """Run a Gemini computer-use agent in a LOCAL Stagehand browser and print its result."""
+    # Build a unified configuration object for Stagehand
+    config = StagehandConfig(
+        env="LOCAL",
+        system_prompt="You are a browser automation assistant that helps users navigate websites effectively.",
+        model_client_options={"apiKey": os.getenv("MODEL_API_KEY")},
+        verbose=2,
+    )
+
+    # Create a Stagehand client using the configuration object.
+    stagehand = Stagehand(config)
+
+    # Initialize - this creates a new session automatically.
+    console.print("\n🚀 [info]Initializing Stagehand...[/]")
+    await stagehand.init()
+
+    # Run the rest under try/finally so the session is closed even if a step raises.
+    try:
+        console.print("\n▶️ [highlight] Navigating[/] to Google")
+        await stagehand.page.goto("https://google.com/")
+        console.print("✅ [success]Navigated to Google[/]")
+
+        console.print("\n▶️ [highlight] Using Agent to perform a task[/]: playing a game of 2048")
+        agent = stagehand.agent(
+            model="gemini-2.5-computer-use-preview-10-2025",
+            instructions="You are a helpful web navigation assistant that helps users find information. You are currently on the following page: google.com. Do not ask follow up questions, the user will trust your judgement.",
+            options={"apiKey": os.getenv("GEMINI_API_KEY")},
+        )
+        agent_result = await agent.execute(
+            instruction="Play a game of 2048",
+            max_steps=20,
+            auto_screenshot=True,
+        )
+
+        console.print(agent_result)
+        console.print("📊 [info]Agent execution result:[/]")
+        console.print(f"🎯 Completed: [bold]{'Yes' if agent_result.completed else 'No'}[/]")
+        if agent_result.message:
+            console.print(f"💬 Message: [italic]{agent_result.message}[/]")
+
+        if agent_result.actions:
+            console.print(f"🔄 Actions performed: [bold]{len(agent_result.actions)}[/]")
+            for i, action in enumerate(agent_result.actions, start=1):
+                console.print(f"  Action {i}: {getattr(action, 'type', None) or 'Unknown'}")
+
+        # For debugging, you can also print the full JSON
+        console.print("[dim]Full response JSON:[/]")
+        console.print_json(agent_result.model_dump_json())
+    finally:
+        # Close the session
+        console.print("\n⏹️  [warning]Closing session...[/]")
+        await stagehand.close()
+        console.print("✅ [success]Session closed successfully![/]")
+    console.rule("[bold]End of Example[/]")
+
+
+if __name__ == "__main__":
+    # Render a banner panel first, then drive the async example to completion
+    banner = Panel(
+        "[light_gray]Stagehand 🤘 Agent Example[/]",
+        border_style="green",
+        padding=(1, 10),
+    )
+    console.print("\n", banner)
+
+    # asyncio.run starts an event loop, runs main(), and closes the loop afterwards
+    asyncio.run(main())
diff --git a/stagehand/agent/agent.py b/stagehand/agent/agent.py
@@ -170,13 +170,10 @@ async def execute(
                 f"Agent execution finished. Success: {agent_result.completed}. Message: {agent_result.message}",
                 category="agent",
             )
-            # To clean up pydantic model output
-            actions_repr = [action.root for action in agent_result.actions]
             self.logger.debug(
-                f"Agent actions: {actions_repr}",
+                f"Agent actions: {agent_result.actions}",
                 category="agent",
             )
-            agent_result.actions = actions_repr
             return agent_result
         else:
             agent_config_payload = self.config.model_dump(
diff --git a/stagehand/agent/google_cua.py b/stagehand/agent/google_cua.py
@@ -25,6 +25,7 @@
     AgentResult,
 )
 from .client import AgentClient
+from pydantic import TypeAdapter
 
 load_dotenv()
 
@@ -176,7 +177,7 @@ def _process_provider_response(
                 and candidate.safety_ratings
             ):
                 error_message += f" - Safety Ratings: {candidate.safety_ratings}"
-            self.logger.warning(error_message, category="agent")
+            self.logger.error(error_message, category="agent")
             return [], reasoning_text, True, error_message, []
 
         if not function_call_parts:
@@ -260,7 +261,7 @@ def _process_provider_response(
                         "keys": [self.key_to_playwright("PageDown")],
                     }
                 else:
-                    self.logger.warning(
+                    self.logger.error(
                         f"Unsupported scroll direction: {direction}", category="agent"
                     )
                     return (
@@ -282,7 +283,7 @@ def _process_provider_response(
                 elif direction in ("left", "right"):
                     magnitude = self._normalize_coordinates(magnitude, 0)[0]
                 else:
-                    self.logger.warning(
+                    self.logger.error(
                         f"Unsupported scroll direction: {direction}", category="agent"
                     )
                     return (
@@ -352,7 +353,7 @@ def _process_provider_response(
                     "arguments": {"url": "https://www.google.com"},
                 }
             else:
-                self.logger.warning(
+                self.logger.error(
                     f"Unsupported Gemini CUA function: {action_name}", category="agent"
                 )
                 return (
@@ -367,13 +368,11 @@ def _process_provider_response(
                 try:
                     # Directly construct the AgentActionType using the payload.
                     # Pydantic will use the 'type' field in action_payload_dict to discriminate the Union.
-                    action_payload_for_agent_action_type = AgentActionType(
-                        **action_payload_dict
-                    )
+                    action_payload_for_agent_action_type = TypeAdapter(AgentActionType).validate_python(action_payload_dict)
 
                     agent_action = AgentAction(
                         action_type=action_type_str,  # This should match the 'type' in action_payload_dict
-                        action=action_payload_for_agent_action_type,  # No RootModel wrapping if AgentActionType is the RootModel itself
+                        action=action_payload_for_agent_action_type,
                         reasoning=reasoning_text,
                         status="tool_code",
                     )
@@ -598,7 +597,7 @@ async def run_task(
                 )
 
             if not agent_action and not task_completed:
-                self.logger.warning(
+                self.logger.debug(
                     "Model did not request an action and task not marked complete. Ending task.",
                     category="agent",
                 )
@@ -614,7 +613,7 @@ async def run_task(
                     usage=usage_obj,
                 )
 
-        self.logger.warning("Max steps reached for Gemini CUA task.", category="agent")
+        self.logger.debug("Max steps reached for Gemini CUA task.", category="agent")
         usage_obj = {
             "input_tokens": total_input_tokens,
             "output_tokens": total_output_tokens,
diff --git a/stagehand/agent/openai_cua.py b/stagehand/agent/openai_cua.py
@@ -7,7 +7,7 @@
 from openai import (
     OpenAI as OpenAISDK,  # Renamed to avoid conflict with a potential class name
 )
-from pydantic import BaseModel  # Ensure BaseModel is imported for isinstance check
+from pydantic import BaseModel, TypeAdapter  # Ensure BaseModel is imported for isinstance check
 
 from ..handlers.cua_handler import CUAHandler
 from ..types.agent import (
@@ -175,8 +175,8 @@ def _process_provider_response(
                 )
 
             try:
-                action_payload = AgentActionType(
-                    **computer_call_item.action.model_dump()
+                action_payload = TypeAdapter(AgentActionType).validate_python(
+                    computer_call_item.action.model_dump()
                 )
                 agent_action = AgentAction(
                     action_type=computer_call_item.action.type,
@@ -225,7 +225,7 @@ def _process_provider_response(
                 function_action_payload = FunctionAction(type="function", name=function_call_item.name, arguments=arguments)  # type: ignore
                 agent_action = AgentAction(
                     action_type="function",  # Literal 'function'
-                    action=AgentActionType(root=function_action_payload),
+                    action=function_action_payload,
                     reasoning=reasoning_text,  # Reasoning applies to this action
                     status=(
                         function_call_item.status
diff --git a/stagehand/handlers/cua_handler.py b/stagehand/handlers/cua_handler.py
@@ -35,13 +35,12 @@ async def get_screenshot_base64(self) -> str:
 
     async def perform_action(self, action: AgentAction) -> ActionExecutionResult:
         """Execute a single action on the page."""
+        specific_action_model = action.action
         self.logger.info(
-            f"Performing action: {action.action.root if action.action else ''}",
+            f"Performing action: {specific_action_model or ''}",
             category=StagehandFunctionName.AGENT,
         )
         action_type = action.action_type
-        # action.action is the RootModel, action.action.root is the specific action model (e.g., ClickAction)
-        specific_action_model = action.action.root if action.action else None
 
         if not specific_action_model:
             self.logger.error(
diff --git a/stagehand/types/agent.py b/stagehand/types/agent.py
@@ -1,6 +1,6 @@
 from typing import Any, Literal, Optional, Union
 
-from pydantic import BaseModel, RootModel
+from pydantic import BaseModel
 
 
 class AgentConfig(BaseModel):
@@ -96,20 +96,18 @@ class KeyAction(BaseModel):  # From Anthropic
     text: str
 
 
-AgentActionType = RootModel[
-    Union[
-        ClickAction,
-        DoubleClickAction,
-        TypeAction,
-        KeyPressAction,
-        ScrollAction,
-        DragAction,
-        MoveAction,
-        WaitAction,
-        ScreenshotAction,
-        FunctionAction,
-        KeyAction,
-    ]
+AgentActionType = Union[
+    ClickAction,
+    DoubleClickAction,
+    TypeAction,
+    KeyPressAction,
+    ScrollAction,
+    DragAction,
+    MoveAction,
+    WaitAction,
+    ScreenshotAction,
+    FunctionAction,
+    KeyAction,
 ]