diff --git a/examples/agent_example.py b/examples/agent_example.py index c8cd56f2..8b214496 100644 --- a/examples/agent_example.py +++ b/examples/agent_example.py @@ -33,71 +33,48 @@ quiet_dependencies=True, # Reduce noise from dependencies ) -console.print( - Panel.fit( - "[yellow]Logging Levels:[/]\n" - "[white]- Set [bold]verbose=0[/] for errors (ERROR)[/]\n" - "[white]- Set [bold]verbose=1[/] for minimal logs (INFO)[/]\n" - "[white]- Set [bold]verbose=2[/] for medium logs (WARNING)[/]\n" - "[white]- Set [bold]verbose=3[/] for detailed logs (DEBUG)[/]", - title="Verbosity Options", - border_style="blue", - ) -) - async def main(): # Build a unified configuration object for Stagehand config = StagehandConfig( env="BROWSERBASE", + # env="LOCAL", api_key=os.getenv("BROWSERBASE_API_KEY"), project_id=os.getenv("BROWSERBASE_PROJECT_ID"), - headless=False, - dom_settle_timeout_ms=3000, model_name="gpt-4o", self_heal=True, - wait_for_captcha_solves=True, system_prompt="You are a browser automation assistant that helps users navigate websites effectively.", model_client_options={"apiKey": os.getenv("MODEL_API_KEY")}, - verbose=2, + verbose=1, ) # Create a Stagehand client using the configuration object. - stagehand = Stagehand( - config=config, - api_url=os.getenv("STAGEHAND_API_URL"), - ) + stagehand = Stagehand(config) # Initialize - this creates a new session automatically. console.print("\nšŸš€ [info]Initializing Stagehand...[/]") await stagehand.init() - console.print(f"\n[yellow]Created new session:[/] {stagehand.session_id}") - console.print( - f"🌐 [white]View your live browser:[/] [url]https://www.browserbase.com/sessions/{stagehand.session_id}[/]" - ) + if stagehand.env == "BROWSERBASE": + console.print(f"\n[yellow]Created new session:[/] {stagehand.session_id}") + console.print( + f"🌐 [white]View your live browser:[/] [url]https://www.browserbase.com/sessions/{stagehand.session_id}[/]" + ) + + console.print("\nā–¶ļø [highlight] Navigating[/] to Google") + await stagehand.page.goto("https://google.com/") + console.print("āœ… [success]Navigated to Google[/]") - # Configure the agent - agent_config = AgentConfig( - provider=AgentProvider.OPENAI, + console.print("\nā–¶ļø [highlight] Using Agent to perform a task[/]: playing a game of 2048") + agent = stagehand.agent( model="computer-use-preview", instructions="You are a helpful web navigation assistant that helps users find information. You are currently on the following page: google.com. Do not ask follow up questions, the user will trust your judgement.", options={"apiKey": os.getenv("MODEL_API_KEY")} ) - - # Define the task for the agent - execute_options = AgentExecuteOptions( + agent_result = await agent.execute( instruction="Play a game of 2048", max_steps=20, auto_screenshot=True, ) - console.print("\nā–¶ļø [highlight] Navigating[/] to Google") - await stagehand.page.goto("https://google.com/") - console.print("āœ… [success]Navigated to Google[/]") - - console.print("\nā–¶ļø [highlight] Using Agent to perform a task[/]: playing a game of 2048") - agent = stagehand.agent(agent_config) - agent_result = await agent.execute(execute_options) - console.print("šŸ“Š [info]Agent execution result:[/]") console.print(f"āœ… Success: [bold]{'Yes' if agent_result.success else 'No'}[/]") console.print(f"šŸŽÆ Completed: [bold]{'Yes' if agent_result.completed else 'No'}[/]") @@ -125,7 +102,7 @@ async def main(): console.print( "\n", Panel.fit( - "[light_gray]Stagehand 🤘 Async Agent Example[/]", + "[light_gray]Stagehand 🤘 Agent Example[/]", border_style="green", padding=(1, 10), ), diff --git a/stagehand/agent.py b/stagehand/agent.py deleted file mode 100644 index 32e689d2..00000000 --- a/stagehand/agent.py +++ /dev/null @@ -1,110 +0,0 @@ -from .schemas import ( - AgentConfig, - AgentExecuteOptions, - AgentExecuteResult, - AgentProvider, -) - -# Model to provider mapping -MODEL_TO_PROVIDER_MAP: dict[str, AgentProvider] = { - "computer-use-preview": AgentProvider.OPENAI, - "claude-3-5-sonnet-20240620": AgentProvider.ANTHROPIC, - "claude-3-7-sonnet-20250219": AgentProvider.ANTHROPIC, - # Add more mappings as needed -} - - -class Agent: - """ - Class to handle agent functionality in Stagehand - """ - - def __init__(self, stagehand_client, agent_config: AgentConfig): - """ - Initialize an Agent instance. - - Args: - stagehand_client: The client used to interface with the Stagehand server. - agent_config (AgentConfig): Configuration for the agent, - including provider, model, options, instructions. - """ - self._stagehand = stagehand_client - self._config = agent_config # Store the required config - - if not self._stagehand._initialized: - self._stagehand.logger.error( - "Stagehand must be initialized before creating an agent. Call await stagehand.init() first." - ) - raise RuntimeError( - "Stagehand must be initialized before creating an agent. Call await stagehand.init() first." - ) - - # Perform provider inference and validation - if self._config.model and not self._config.provider: - if self._config.model in MODEL_TO_PROVIDER_MAP: - self._config.provider = MODEL_TO_PROVIDER_MAP[self._config.model] - else: - self._stagehand.logger.error( - f"Could not infer provider for model: {self._config.model}" - ) - - # Ensure provider is correctly set as an enum if provided as a string - if self._config.provider and isinstance(self._config.provider, str): - try: - self._config.provider = AgentProvider(self._config.provider.lower()) - except ValueError as e: - raise ValueError( - f"Invalid provider: {self._config.provider}. Must be one of: {', '.join([p.value for p in AgentProvider])}" - ) from e - elif not self._config.provider: - raise ValueError( - "Agent provider is required and could not be determined from the provided config." - ) - - async def execute(self, execute_options: AgentExecuteOptions) -> AgentExecuteResult: - """ - Execute a task using the configured autonomous agent via the Stagehand server. - - Args: - execute_options (AgentExecuteOptions): Options for execution, including the instruction. - - Returns: - AgentExecuteResult: The result of the agent execution. - """ - - payload = { - # Use the stored config - "agentConfig": self._config.model_dump(exclude_none=True, by_alias=True), - "executeOptions": execute_options.model_dump( - exclude_none=True, by_alias=True - ), - } - - lock = self._stagehand._get_lock_for_session() - async with lock: - result = await self._stagehand._execute("agentExecute", payload) - - if isinstance(result, dict): - # Ensure all expected fields are present - # If not present in result, use defaults from AgentExecuteResult schema - if "success" not in result: - raise ValueError("Response missing required field 'success'") - - # Ensure completed is set with default if not present - if "completed" not in result: - result["completed"] = False - - # Add default for message if missing - if "message" not in result: - result["message"] = None - - return AgentExecuteResult(**result) - elif result is None: - # Handle cases where the server might return None or an empty response - # Return a default failure result or raise an error - return AgentExecuteResult( - success=False, completed=False, message="No result received from server" - ) - else: - # If the result is not a dict and not None, it's unexpected - raise TypeError(f"Unexpected result type from server: {type(result)}") diff --git a/stagehand/agent/agent.py b/stagehand/agent/agent.py index 09f849d9..09fe0c29 100644 --- a/stagehand/agent/agent.py +++ b/stagehand/agent/agent.py @@ -1,6 +1,10 @@ from typing import Optional, Union from ..handlers.cua_handler import CUAHandler +from ..schemas import ( + AgentExecuteResult, + AgentProvider, +) from ..types.agent import ( AgentConfig, AgentExecuteOptions, @@ -16,6 +20,12 @@ "claude-3-5-sonnet-latest": AnthropicCUAClient, "claude-3-7-sonnet-latest": AnthropicCUAClient, } +MODEL_TO_PROVIDER_MAP: dict[str, AgentProvider] = { + "computer-use-preview": AgentProvider.OPENAI, + "claude-3-5-sonnet-20240620": AgentProvider.ANTHROPIC, + "claude-3-7-sonnet-20250219": AgentProvider.ANTHROPIC, + # Add more mappings as needed +} AGENT_METRIC_FUNCTION_NAME = "AGENT_EXECUTE_TASK" @@ -26,22 +36,32 @@ def __init__(self, stagehand_client, **kwargs): self.stagehand = stagehand_client self.config = AgentConfig(**kwargs) if kwargs else AgentConfig() self.logger = self.stagehand.logger - - if not hasattr(self.stagehand, "page") or not hasattr( - self.stagehand.page, "_page" - ): - self.logger.error( - "Stagehand page object not available for CUAHandler initialization." + if self.stagehand.env == "BROWSERBASE": + if self.config.model in MODEL_TO_PROVIDER_MAP: + self.provider = MODEL_TO_PROVIDER_MAP[self.config.model] + else: + self.provider = None + self.logger.error( + f"Could not infer provider for model: {self.config.model}" + ) + else: + if not hasattr(self.stagehand, "page") or not hasattr( + self.stagehand.page, "_page" + ): + self.logger.error( + "Stagehand page object not available for CUAHandler initialization." + ) + raise ValueError("Stagehand page not initialized. Cannot create Agent.") + + self.cua_handler = CUAHandler( + stagehand=self.stagehand, + page=self.stagehand.page._page, + logger=self.logger, ) - raise ValueError("Stagehand page not initialized. Cannot create Agent.") - - self.cua_handler = CUAHandler( - stagehand=self.stagehand, page=self.stagehand.page._page, logger=self.logger - ) - self.viewport = self.stagehand.page._page.viewport_size - # self.viewport = {"width": 1024, "height": 768} - self.client: AgentClient = self._get_client() + self.viewport = self.stagehand.page._page.viewport_size + # self.viewport = {"width": 1024, "height": 768} + self.client: AgentClient = self._get_client() def _get_client(self) -> AgentClient: ClientClass = MODEL_TO_CLIENT_CLASS_MAP.get(self.config.model) # noqa: N806 @@ -67,72 +87,125 @@ def _get_client(self) -> AgentClient: ) async def execute( - self, options_or_instruction: Union[AgentExecuteOptions, str] + self, + options_or_instruction: Union[AgentExecuteOptions, str, dict, None] = None, + **kwargs, ) -> AgentResult: - options: Optional[AgentExecuteOptions] = None - instruction: str + options_dict = {} - if isinstance(options_or_instruction, str): - instruction = options_or_instruction - options = AgentExecuteOptions(instruction=instruction) + if isinstance(options_or_instruction, AgentExecuteOptions): + options_dict = options_or_instruction.model_dump() elif isinstance(options_or_instruction, dict): - options = AgentExecuteOptions(**options_or_instruction) - instruction = options.instruction - else: - options = options_or_instruction - instruction = options.instruction + options_dict = options_or_instruction.copy() + elif isinstance(options_or_instruction, str): + options_dict["instruction"] = options_or_instruction - if not instruction: + options_dict.update(kwargs) + + try: + options = AgentExecuteOptions(**options_dict) + except Exception as e: + self.logger.error(f"Invalid agent execute options: {e}") + raise + + if not options.instruction: self.logger.error("No instruction provided for agent execution.") return AgentResult( - message="No instruction provided.", completed=True, actions=[], usage={} + message="No instruction provided.", + completed=True, + actions=[], + usage={}, ) - self.logger.info( - f"Agent starting execution for instruction: '{instruction}'", - category="agent", - ) + instruction = options.instruction - try: - agent_result = await self.client.run_task( - instruction=instruction, - max_steps=self.config.max_steps, - options=options, + if self.stagehand.env == "LOCAL": + self.logger.info( + f"Agent starting execution for instruction: '{instruction}'", + category="agent", ) - except Exception as e: - self.logger.error( - f"Exception during client.run_task: {e}", category="agent" + + try: + agent_result = await self.client.run_task( + instruction=instruction, + max_steps=self.config.max_steps, + options=options, + ) + except Exception as e: + self.logger.error( + f"Exception during client.run_task: {e}", category="agent" + ) + empty_usage = AgentUsage( + input_tokens=0, output_tokens=0, inference_time_ms=0 + ) + return AgentResult( + message=f"Error: {str(e)}", + completed=True, + actions=[], + usage=empty_usage, + ) + + # Update metrics if usage data is available in the result + if agent_result.usage: + # self.stagehand.update_metrics( + # AGENT_METRIC_FUNCTION_NAME, + # agent_result.usage.get("input_tokens", 0), + # agent_result.usage.get("output_tokens", 0), + # agent_result.usage.get("inference_time_ms", 0), + # ) + pass # Placeholder if metrics are to be handled differently or not at all + + self.logger.info( + f"Agent execution finished. Success: {agent_result.completed}. Message: {agent_result.message}", + category="agent", ) - empty_usage = AgentUsage( - input_tokens=0, output_tokens=0, inference_time_ms=0 + # To clean up pydantic model output + actions_repr = [action.root for action in agent_result.actions] + self.logger.debug( + f"Agent actions: {actions_repr}", + category="agent", ) - return AgentResult( - message=f"Error: {str(e)}", - completed=True, - actions=[], - usage=empty_usage, + agent_result.actions = actions_repr + return agent_result + else: + agent_config_payload = self.config.model_dump( + exclude_none=True, by_alias=True ) - - # Update metrics if usage data is available in the result - if agent_result.usage: - # self.stagehand.update_metrics( - # AGENT_METRIC_FUNCTION_NAME, - # agent_result.usage.get("input_tokens", 0), - # agent_result.usage.get("output_tokens", 0), - # agent_result.usage.get("inference_time_ms", 0), - # ) - pass # Placeholder if metrics are to be handled differently or not at all - - self.logger.info( - f"Agent execution finished. Success: {agent_result.completed}. Message: {agent_result.message}", - category="agent", - ) - # To clean up pydantic model output - actions_repr = [action.root for action in agent_result.actions] - self.logger.debug( - f"Agent actions: {actions_repr}", - category="agent", - ) - agent_result.actions = actions_repr - return agent_result + agent_config_payload["provider"] = self.provider + payload = { + # Use the stored config + "agentConfig": agent_config_payload, + "executeOptions": options.model_dump(exclude_none=True, by_alias=True), + } + + lock = self.stagehand._get_lock_for_session() + async with lock: + result = await self.stagehand._execute("agentExecute", payload) + + if isinstance(result, dict): + # Ensure all expected fields are present + # If not present in result, use defaults from AgentExecuteResult schema + if "success" not in result: + raise ValueError("Response missing required field 'success'") + + # Ensure completed is set with default if not present + if "completed" not in result: + result["completed"] = False + + # Add default for message if missing + if "message" not in result: + result["message"] = None + + return AgentExecuteResult(**result) + elif result is None: + # Handle cases where the server might return None or an empty response + # Return a default failure result or raise an error + return AgentExecuteResult( + success=False, + completed=False, + message="No result received from server", + ) + else: + # If the result is not a dict and not None, it's unexpected + raise TypeError(f"Unexpected result type from server: {type(result)}") diff --git a/stagehand/agent/anthropic_cua.py b/stagehand/agent/anthropic_cua.py index 273b5795..edcdd39e 100644 --- a/stagehand/agent/anthropic_cua.py +++ b/stagehand/agent/anthropic_cua.py @@ -54,7 +54,9 @@ def __init__( **kwargs, ): super().__init__(model, instructions, config, logger, handler) - self.anthropic_sdk_client = Anthropic(api_key=os.getenv("ANTHROPIC_API_KEY")) + self.anthropic_sdk_client = Anthropic( + api_key=config.options.get("apiKey") or os.getenv("ANTHROPIC_API_KEY") + ) dimensions = ( (viewport["width"], viewport["height"]) if viewport else (1024, 768) diff --git a/stagehand/agent/openai_cua.py b/stagehand/agent/openai_cua.py index 8a86569f..a1f2caaf 100644 --- a/stagehand/agent/openai_cua.py +++ b/stagehand/agent/openai_cua.py @@ -37,7 +37,9 @@ def __init__( ): super().__init__(model, instructions, config, logger, handler) # TODO pass api key - self.openai_sdk_client = OpenAISDK(api_key=os.getenv("OPENAI_API_KEY")) + self.openai_sdk_client = OpenAISDK( + api_key=config.options.get("apiKey") or os.getenv("OPENAI_API_KEY") + ) dimensions = ( (viewport["width"], viewport["height"]) if viewport else (1024, 768) diff --git a/stagehand/main.py b/stagehand/main.py index e8662c2a..eb453fa7 100644 --- a/stagehand/main.py +++ b/stagehand/main.py @@ -28,7 +28,6 @@ from .logging import StagehandLogger, default_log_handler from .metrics import StagehandFunctionName, StagehandMetrics from .page import StagehandPage -from .schemas import AgentConfig from .utils import make_serializable load_dotenv() @@ -168,7 +167,6 @@ def __init__( self._context: Optional[BrowserContext] = None self._playwright_page: Optional[PlaywrightPage] = None self.page: Optional[StagehandPage] = None - self.agent = None self.context: Optional[StagehandContext] = None self._initialized = False # Flag to track if init() has run @@ -441,7 +439,7 @@ async def init(self): self._initialized = True - def agent(self, agent_config: AgentConfig) -> Agent: + def agent(self, **kwargs) -> Agent: """ Create an agent instance configured with the provided options. @@ -457,9 +455,9 @@ def agent(self, agent_config: AgentConfig) -> Agent: "Stagehand must be initialized with await init() before creating an agent." ) - self.logger.debug(f"Creating Agent instance with config: {agent_config}") + self.logger.debug(f"Creating Agent instance with config: {kwargs}") # Pass the required config directly to the Agent constructor - return Agent(self, agent_config=agent_config) + return Agent(self, **kwargs) async def close(self): """ diff --git a/stagehand/types/agent.py b/stagehand/types/agent.py index b5335380..8f29d3c2 100644 --- a/stagehand/types/agent.py +++ b/stagehand/types/agent.py @@ -163,12 +163,16 @@ class AgentExecuteOptions(BaseModel): Attributes: instruction (str): The instruction to execute. max_steps (Optional[int]): Maximum number of steps the agent can take. Defaults to 15. - auto_screenshot (Optional[bool]): Whether to automatically capture screenshots after each action. False will let the agent choose when to capture screenshots. Defaults to False. + auto_screenshot (Optional[bool]): Whether to automatically capture screenshots after each action. False will let the agent choose when to capture screenshots. Defaults to True. + wait_between_actions (Optional[int]): Milliseconds to wait between actions. + context (Optional[str]): Additional context for the agent. """ instruction: str max_steps: Optional[int] = 15 - auto_screenshot: Optional[bool] = False + auto_screenshot: Optional[bool] = True + wait_between_actions: Optional[int] = 1000 + context: Optional[str] = None class EnvState(BaseModel):