diff --git a/.github/pull_request_template.md b/.github/pull_request_template.md new file mode 100644 index 000000000..4450b1d33 --- /dev/null +++ b/.github/pull_request_template.md @@ -0,0 +1,27 @@ +## Description + + +## Quickstart +- [ ] Computer Use Demo +- [ ] Customer Support Agent +- [ ] Financial Data Analyst +- [ ] N/A + +## Type of Change +- [ ] Bug fix +- [ ] New feature +- [ ] Documentation update +- [ ] Code refactoring +- [ ] Other (please describe): + +## Testing + +- [ ] Added/updated unit tests +- [ ] Tested manually +- [ ] Verified in development environment + +## Screenshots + + +## Additional Notes + \ No newline at end of file diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml index ade2d45a6..88f258f8f 100644 --- a/.github/workflows/tests.yaml +++ b/.github/workflows/tests.yaml @@ -19,6 +19,8 @@ jobs: steps: - uses: actions/checkout@v4 - uses: astral-sh/ruff-action@v1 + with: + src: "computer-use-demo" pyright: runs-on: ubuntu-latest defaults: @@ -36,6 +38,8 @@ jobs: pip install -r dev-requirements.txt - run: echo "$PWD/.venv/bin" >> $GITHUB_PATH - uses: jakebailey/pyright-action@v1 + with: + working-directory: computer-use-demo pytest: runs-on: ubuntu-latest defaults: diff --git a/CLAUDE.md b/CLAUDE.md new file mode 100644 index 000000000..87b090fad --- /dev/null +++ b/CLAUDE.md @@ -0,0 +1,62 @@ +# Claude Quickstarts Development Guide + +## Legal + +- When changes are made to files that have a copyright notice add them to that subdirectory's CHANGELOG.md file. + +## Computer-Use Demo + +### Setup & Development + +- **Setup environment**: `./setup.sh` +- **Build Docker**: `docker build . 
-t computer-use-demo:local` +- **Run container**: `docker run -e ANTHROPIC_API_KEY=$ANTHROPIC_API_KEY -v $(pwd)/computer_use_demo:/home/computeruse/computer_use_demo/ -v $HOME/.anthropic:/home/computeruse/.anthropic -p 5900:5900 -p 8501:8501 -p 6080:6080 -p 8080:8080 -it computer-use-demo:local` + +### Testing & Code Quality + +- **Lint**: `ruff check .` +- **Format**: `ruff format .` +- **Typecheck**: `pyright` +- **Run tests**: `pytest` +- **Run single test**: `pytest tests/path_to_test.py::test_name -v` + +### Code Style + +- **Python**: snake_case for functions/variables, PascalCase for classes +- **Imports**: Use isort with combine-as-imports +- **Error handling**: Use custom ToolError for tool errors +- **Types**: Add type annotations for all parameters and returns +- **Classes**: Use dataclasses and abstract base classes + +## Customer Support Agent + +### Setup & Development + +- **Install dependencies**: `npm install` +- **Run dev server**: `npm run dev` (full UI) +- **UI variants**: `npm run dev:left` (left sidebar), `npm run dev:right` (right sidebar), `npm run dev:chat` (chat only) +- **Lint**: `npm run lint` +- **Build**: `npm run build` (full UI), see package.json for variants + +### Code Style + +- **TypeScript**: Strict mode with proper interfaces +- **Components**: Function components with React hooks +- **Formatting**: Follow ESLint Next.js configuration +- **UI components**: Use shadcn/ui components library + +## Financial Data Analyst + +### Setup & Development + +- **Install dependencies**: `npm install` +- **Run dev server**: `npm run dev` +- **Lint**: `npm run lint` +- **Build**: `npm run build` + +### Code Style + +- **TypeScript**: Strict mode with proper type definitions +- **Components**: Function components with type annotations +- **Visualization**: Use Recharts library for data visualization +- **State management**: React hooks for state \ No newline at end of file diff --git a/README.md b/README.md index 1e8c7fdb6..42c5183cd 100644 
--- a/README.md +++ b/README.md @@ -1,10 +1,10 @@ -# Anthropic Quickstarts +# Claude Quickstarts -Anthropic Quickstarts is a collection of projects designed to help developers quickly get started with building applications using the Anthropic API. Each quickstart provides a foundation that you can easily build upon and customize for your specific needs. +Claude Quickstarts is a collection of projects designed to help developers quickly get started with building applications using the Claude API. Each quickstart provides a foundation that you can easily build upon and customize for your specific needs. ## Getting Started -To use these quickstarts, you'll need an Anthropic API key. If you don't have one yet, you can sign up for free at [console.anthropic.com](https://console.anthropic.com). +To use these quickstarts, you'll need a Claude API key. If you don't have one yet, you can sign up for free at [console.anthropic.com](https://console.anthropic.com). ## Available Quickstarts @@ -22,10 +22,22 @@ A financial data analyst powered by Claude. This project demonstrates how to lev ### Computer Use Demo -An environment and tools that Claude can use to control a desktop computer. This project demonstrates how to leverage the computer use capabilities of the the new Claude 3.5 Sonnet model. +An environment and tools that Claude can use to control a desktop computer. This project demonstrates how to leverage the computer use capabilities of Claude, including support for the latest `computer_use_20251124` tool version with zoom actions. [Go to Computer Use Demo Quickstart](./computer-use-demo) +### Browser Tools API Demo + +A complete reference implementation for browser automation powered by Claude. This project demonstrates how to leverage Claude's browser tools API for web interaction, including navigation, DOM inspection, and form manipulation using Playwright. 
+ +[Go to Browser Tools API Demo Quickstart](./browser-tools-api-demo) + +### Autonomous Coding Agent + +An autonomous coding agent powered by the Claude Agent SDK. This project demonstrates a two-agent pattern (initializer + coding agent) that can build complete applications over multiple sessions, with progress persisted via git and a feature list that the agent works through incrementally. + +[Go to Autonomous Coding Agent Quickstart](./autonomous-coding) + ## General Usage Each quickstart project comes with its own README and setup instructions. Generally, you'll follow these steps: @@ -33,20 +45,20 @@ Each quickstart project comes with its own README and setup instructions. Genera 1. Clone this repository 2. Navigate to the specific quickstart directory 3. Install the required dependencies -4. Set up your Anthropic API key as an environment variable +4. Set up your Claude API key as an environment variable 5. Run the quickstart application ## Explore Further -To deepen your understanding of working with Claude and the Anthropic API, check out these resources: +To deepen your understanding of working with Claude and the Claude API, check out these resources: -- [Anthropic API Documentation](https://docs.anthropic.com) -- [Anthropic Cookbook](https://github.com/anthropics/anthropic-cookbook) - A collection of code snippets and guides for common tasks -- [Anthropic API Fundamentals Course](https://github.com/anthropics/courses/tree/master/anthropic_api_fundamentals) +- [Claude API Documentation](https://docs.claude.com) +- [Claude Cookbooks](https://github.com/anthropics/claude-cookbooks) - A collection of code snippets and guides for common tasks +- [Claude API Fundamentals Course](https://github.com/anthropics/courses/tree/master/anthropic_api_fundamentals) ## Contributing -We welcome contributions to the Anthropic Quickstarts repository! 
If you have ideas for new quickstart projects or improvements to existing ones, please open an issue or submit a pull request. +We welcome contributions to the Claude Quickstarts repository! If you have ideas for new quickstart projects or improvements to existing ones, please open an issue or submit a pull request. ## Community and Support diff --git a/agents/.gitignore b/agents/.gitignore new file mode 100644 index 000000000..f353c875e --- /dev/null +++ b/agents/.gitignore @@ -0,0 +1,27 @@ +# Python bytecode +__pycache__/ +*.py[cod] +*$py.class + +# Jupyter Notebook +.ipynb_checkpoints +*/.ipynb_checkpoints/* + +# MacOS +.DS_Store + +# Environment variables +.env + +# Virtual environments +venv/ +env/ +.venv/ + +# MCP logs +*.log + +# Distribution / packaging +dist/ +build/ +*.egg-info/ \ No newline at end of file diff --git a/agents/README.md b/agents/README.md new file mode 100644 index 000000000..0b28c4c87 --- /dev/null +++ b/agents/README.md @@ -0,0 +1,48 @@ +# Agents + +A minimal educational implementation of LLM agents using the Claude API. + +> **Note:** This is NOT an SDK, but a reference implementation of key concepts + +## Overview & Core Components + +This repo demonstrates how to [build effective agents](https://www.anthropic.com/engineering/building-effective-agents) with the Claude API. It shows how sophisticated AI behaviors can emerge from a simple foundation: LLMs using tools in a loop. This implementation is not prescriptive - the core logic is <300 lines of code and deliberately lacks production features. Feel free to translate these patterns to your language and production stack ([Claude Code](https://docs.claude.com/en/docs/agents-and-tools/claude-code/overview) can help!) 
+ +It contains three components: + +- `agent.py`: Manages Claude API interactions and tool execution +- `tools/`: Tool implementations (both native and MCP tools) +- `utils/`: Utilities for message history and MCP server connections + +## Usage + +```python +from agents.agent import Agent +from agents.tools.think import ThinkTool + +# Create an agent with both local tools and MCP server tools +agent = Agent( + name="MyAgent", + system="You are a helpful assistant.", + tools=[ThinkTool()], # Local tools + mcp_servers=[ + { + "type": "stdio", + "command": "python", + "args": ["-m", "mcp_server"], + }, + ] +) + +# Run the agent +response = agent.run("What should I consider when buying a new laptop?") +``` + +From this foundation, you can add domain-specific tools, optimize performance, or implement custom response handling. We remain deliberately unopinionated - this backbone simply gets you started with fundamentals. + +## Requirements + +- Python 3.10+ +- Claude API key (set as `ANTHROPIC_API_KEY` environment variable) +- `anthropic` Python library +- `mcp` Python library \ No newline at end of file diff --git a/agents/__init__.py b/agents/__init__.py new file mode 100644 index 000000000..db8f43dec --- /dev/null +++ b/agents/__init__.py @@ -0,0 +1,6 @@ +"""Core agent implementations.""" + +from .agent import Agent, ModelConfig +from .tools.base import Tool + +__all__ = ["Agent", "ModelConfig", "Tool"] diff --git a/agents/agent.py b/agents/agent.py new file mode 100644 index 000000000..35365eea2 --- /dev/null +++ b/agents/agent.py @@ -0,0 +1,173 @@ +"""Agent implementation with Claude API and tools.""" + +import asyncio +import os +from contextlib import AsyncExitStack +from dataclasses import dataclass +from typing import Any + +from anthropic import Anthropic + +from .tools.base import Tool +from .utils.connections import setup_mcp_connections +from .utils.history_util import MessageHistory +from .utils.tool_util import execute_tools + + +@dataclass +class 
ModelConfig: + """Configuration settings for Claude model parameters.""" + + # Available models include: + # - claude-sonnet-4-20250514 (default) + # - claude-opus-4-20250514 + # - claude-haiku-4-5-20251001 + # - claude-3-5-sonnet-20240620 + # - claude-3-haiku-20240307 + model: str = "claude-sonnet-4-20250514" + max_tokens: int = 4096 + temperature: float = 1.0 + context_window_tokens: int = 180000 + + +class Agent: + """Claude-powered agent with tool use capabilities.""" + + def __init__( + self, + name: str, + system: str, + tools: list[Tool] | None = None, + mcp_servers: list[dict[str, Any]] | None = None, + config: ModelConfig | None = None, + verbose: bool = False, + client: Anthropic | None = None, + message_params: dict[str, Any] | None = None, + ): + """Initialize an Agent. + + Args: + name: Agent identifier for logging + system: System prompt for the agent + tools: List of tools available to the agent + mcp_servers: MCP server configurations + config: Model configuration with defaults + verbose: Enable detailed logging + client: Anthropic client instance + message_params: Additional parameters for client.messages.create(). + These override any conflicting parameters from config. + """ + self.name = name + self.system = system + self.verbose = verbose + self.tools = list(tools or []) + self.config = config or ModelConfig() + self.mcp_servers = mcp_servers or [] + self.message_params = message_params or {} + self.client = client or Anthropic( + api_key=os.environ.get("ANTHROPIC_API_KEY", "") + ) + self.history = MessageHistory( + model=self.config.model, + system=self.system, + context_window_tokens=self.config.context_window_tokens, + client=self.client, + ) + + if self.verbose: + print(f"\n[{self.name}] Agent initialized") + + def _prepare_message_params(self) -> dict[str, Any]: + """Prepare parameters for client.messages.create() call. + + Returns a dict with base parameters from config, with any + message_params overriding conflicting keys. 
+ """ + return { + "model": self.config.model, + "max_tokens": self.config.max_tokens, + "temperature": self.config.temperature, + "system": self.system, + "messages": self.history.format_for_api(), + "tools": [tool.to_dict() for tool in self.tools], + **self.message_params, + } + + async def _agent_loop(self, user_input: str) -> list[dict[str, Any]]: + """Process user input and handle tool calls in a loop""" + if self.verbose: + print(f"\n[{self.name}] Received: {user_input}") + await self.history.add_message("user", user_input, None) + + tool_dict = {tool.name: tool for tool in self.tools} + + while True: + self.history.truncate() + params = self._prepare_message_params() + + # Merge headers properly - default beta header can be overridden by message_params + default_headers = {"anthropic-beta": "code-execution-2025-05-22"} + if "extra_headers" in params: + # Pop extra_headers from params and merge with defaults + custom_headers = params.pop("extra_headers") + merged_headers = {**default_headers, **custom_headers} + else: + merged_headers = default_headers + + response = self.client.messages.create( + **params, + extra_headers=merged_headers + ) + tool_calls = [ + block for block in response.content if block.type == "tool_use" + ] + + if self.verbose: + for block in response.content: + if block.type == "text": + print(f"\n[{self.name}] Output: {block.text}") + elif block.type == "tool_use": + params_str = ", ".join( + [f"{k}={v}" for k, v in block.input.items()] + ) + print( + f"\n[{self.name}] Tool call: " + f"{block.name}({params_str})" + ) + + await self.history.add_message( + "assistant", response.content, response.usage + ) + + if tool_calls: + tool_results = await execute_tools( + tool_calls, + tool_dict, + ) + if self.verbose: + for block in tool_results: + print( + f"\n[{self.name}] Tool result: " + f"{block.get('content')}" + ) + await self.history.add_message("user", tool_results) + else: + return response + + async def run_async(self, user_input: str) 
-> list[dict[str, Any]]: + """Run agent with MCP tools asynchronously.""" + async with AsyncExitStack() as stack: + original_tools = list(self.tools) + + try: + mcp_tools = await setup_mcp_connections( + self.mcp_servers, stack + ) + self.tools.extend(mcp_tools) + return await self._agent_loop(user_input) + finally: + self.tools = original_tools + + def run(self, user_input: str) -> list[dict[str, Any]]: + """Run agent synchronously""" + return asyncio.run(self.run_async(user_input)) diff --git a/agents/agent_demo.ipynb b/agents/agent_demo.ipynb new file mode 100644 index 000000000..2fc69640b --- /dev/null +++ b/agents/agent_demo.ipynb @@ -0,0 +1,662 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Agent Demo with Simplified Connection Management\n", + "\n", + "This notebook demonstrates an agent using multiple tool types with simplified connection management:\n", + "1. **Local Calculator**: Mathematical expressions via MCP\n", + "2. **Brave Search**: Web search via MCP (if API key available)\n", + "3. **Think Tool**: Internal reasoning tool" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import os\n", + "import sys\n", + "import nest_asyncio\n", + "nest_asyncio.apply()\n", + "\n", + "parent_dir = os.path.dirname(os.getcwd())\n", + "sys.path.insert(0, parent_dir)\n", + "from agents.agent import Agent, ModelConfig\n", + "from agents.tools.think import ThinkTool\n", + "from agents.tools.web_search import WebSearchServerTool\n", + "from agents.tools.code_execution import CodeExecutionServerTool" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 1. 
Configure tools and MCP servers" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Calculator server configured: Yes\n", + "Brave API key available: Yes\n", + "Brave search server configured: Yes\n" + ] + } + ], + "source": [ + "# Standard Python tool\n", + "think_tool = ThinkTool()\n", + "\n", + "# Python MCP server\n", + "calculator_server_path = os.path.abspath(os.path.join(os.getcwd(), \"tools/calculator_mcp.py\"))\n", + "calculator_server = {\n", + " \"type\": \"stdio\",\n", + " \"command\": \"python\",\n", + " \"args\": [calculator_server_path]\n", + "}\n", + "print(f\"Calculator server configured: {'Yes' if calculator_server else 'No'}\")\n", + "\n", + "\n", + "# Brave MCP server written in TypeScript\n", + "\n", + "brave_api_key = os.environ.get(\"BRAVE_API_KEY_BASE_DATA\", \"\")\n", + "print(f\"Brave API key available: {'Yes' if brave_api_key else 'No'}\")\n", + "brave_search_server = {\n", + " \"type\": \"stdio\",\n", + " \"command\": \"npx\",\n", + " \"args\": [\"-y\", \"@modelcontextprotocol/server-brave-search\"],\n", + " \"env\": {\n", + " \"BRAVE_API_KEY\": brave_api_key, \n", + " \"PATH\": f\"{os.path.dirname('npx')}:\" + os.environ.get(\"PATH\", \"\")\n", + " }\n", + "}\n", + "print(f\"Brave search server configured: {'Yes' if brave_search_server else 'No'}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 2. Initialize agent with combined tools" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Multi-Tool Agent] Agent initialized\n" + ] + } + ], + "source": [ + "# Create agent config \n", + "system_prompt = \"\"\"\n", + "You are a helpful assistant with access to:\n", + "1. Web search (brave_web_search, brave_local_search)\n", + "2. Mathematical calculator (calculate) \n", + "3. 
A tool to think and reason (think)\n", + "\n", + "Always use the most appropriate tool for each task.\n", + "\"\"\"\n", + "\n", + "\n", + "# Initialize agent with standard tools and MCP servers\n", + "agent = Agent(\n", + " name=\"Multi-Tool Agent\",\n", + " system=system_prompt,\n", + " tools=[think_tool], \n", + " mcp_servers=[brave_search_server, calculator_server], \n", + " config=ModelConfig(\n", + " model=\"claude-3-7-sonnet-20250219\", \n", + " max_tokens=4096,\n", + " temperature=1.0\n", + " ),\n", + " verbose=True\n", + ")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 3. Run queries" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 3 MCP tools from 2 servers.\n", + "\n", + "[Multi-Tool Agent] Received: What's the square root of the OKC population in 2022\n", + "\n", + "[Multi-Tool Agent] Output: I'll help you find the square root of Oklahoma City's population in 2022. First, I need to look up the population data, then I can calculate the square root.\n", + "\n", + "[Multi-Tool Agent] Tool call: brave_web_search(query=Oklahoma City population 2022 census data)\n", + "\n", + "[Multi-Tool Agent] Tool result: Title: Oklahoma Cities by Population (2025)\n", + "Description: The most populous cities in Oklahoma are Oklahoma City with 702,767, Tulsa at 411,894, Norman with 130,046, Broken Arrow at 119,194, and Edmond with 98,103. You can get the most current Census population data for all 834 Oklahoma cities, sorted from the most populous cities in Oklahoma to the ...\n", + "URL: https://www.oklahoma-demographics.com/cities_by_population\n", + "\n", + "Title: U.S. Census Bureau QuickFacts: Oklahoma City city, Oklahoma\n", + "Description: QuickFacts provides statistics for all states and counties. Also for cities and towns with a population of 5,000 or more. ... 
Methodology differences may exist between data sources, and so estimates from different sources are not comparable.\n", + "URL: https://www.census.gov/quickfacts/fact/table/oklahomacitycityoklahoma/PST045224\n", + "\n", + "Title: Oklahoma City lands in top 20 of largest US cities by population: New census data\n", + "Description: More: 10 Oklahoma cities where population is growing the fastest · Oklahoma City grew by about 1% between 2022 and 2023, adding 7,589 people. And between the 2010 and 2020 Censuses, Oklahoma City was one of 14 American cities that added more than 100,000 people.\n", + "URL: https://www.oklahoman.com/story/news/2024/05/17/oklahoma-city-population-top-20-us-census-data-growth/73729737007/\n", + "\n", + "Title: Oklahoma City - Wikipedia\n", + "Description: Oklahoma City (/ˌoʊkləˈhoʊmə ... it ranks 20th in population among U.S. cities and 8th in the Southern United States, with a population of 681,054 at the 2020 census....\n", + "URL: https://en.wikipedia.org/wiki/Oklahoma_City\n", + "\n", + "Title: Oklahoma City city, Oklahoma - Census Bureau Profile\n", + "Description: We're sorry but this website doesn't work properly without JavaScript enabled. Please enable it to continue\n", + "URL: https://data.census.gov/profile/Oklahoma_City_city,_Oklahoma?g=160XX00US4055000\n", + "\n", + "Title: Oklahoma City, Oklahoma Population 2025\n", + "Description: Oklahoma City is a city located in Cleveland County, Oklahoma. Oklahoma City has a 2025 population of 715,893. Oklahoma City is currently growing at a rate of 0.93% annually and its population has increased by 4.8% since the most recent census, which recorded a population of 683,078 in 2020.\n", + "URL: https://worldpopulationreview.com/us-cities/oklahoma/oklahoma-city\n", + "\n", + "Title: Oklahoma Demographics | Current Census Data\n", + "Description: Get the Oklahoma demographic data that you need. 
You can get the most current Oklahoma data from the US Census Bureau without spending 40 high-value hours searching the internet. The Oklahoma population is 4,053,824 people according to the US Census Bureau's 2023 Population Estimates Program.\n", + "URL: https://www.oklahoma-demographics.com/\n", + "\n", + "Title: Oklahoma City, OK - Profile data - Census Reporter\n", + "Description: a little less than the figure in the Oklahoma City, OK Metro Area: 23.5 14,717,310 (±0.2 / ±145,724)\n", + "URL: https://censusreporter.org/profiles/16000US4055000-oklahoma-city-ok/\n", + "\n", + "Title: Oklahoma population by year, county, race, & more | USAFacts\n", + "Description: The ages, races, and population density of Oklahoma tell a story. Understand the shifts in demographic trends with these charts visualizing decades of population data. ... Florida K-12 public teachers are paid the lowest average salary when adjusted for cost of living.Published on November 8, 2022 ... The Census ...\n", + "URL: https://usafacts.org/data/topics/people-society/population-and-demographics/our-changing-population/state/oklahoma/\n", + "\n", + "Title: Resident Population in Oklahoma City, OK (MSA) (OKCPOP) | FRED | St. Louis Fed\n", + "Description: Graph and download economic data for Resident Population in Oklahoma City, OK (MSA) (OKCPOP) from 2000 to 2024 about Oklahoma City, OK, residents, population, and USA.\n", + "URL: https://fred.stlouisfed.org/series/OKCPOP\n", + "\n", + "[Multi-Tool Agent] Output: Based on the search results, I can see multiple population figures for Oklahoma City in different years. 
The most relevant information for 2022 specifically isn't directly provided, but I can see:\n", + "\n", + "- 2020 census: 681,054 people\n", + "- 2023 estimate showing growth of 7,589 people from 2022\n", + "- 2025 projection: 715,893 people\n", + "\n", + "Let me search more specifically for the 2022 population:\n", + "\n", + "[Multi-Tool Agent] Tool call: brave_web_search(query=Oklahoma City population 2022 exact figure)\n", + "\n", + "[Multi-Tool Agent] Tool result: Title: Oklahoma City, Oklahoma Population 2025\n", + "Description: Oklahoma City is a city located in Cleveland County, Oklahoma. Oklahoma City has a 2025 population of 715,893.\n", + "URL: https://worldpopulationreview.com/us-cities/oklahoma/oklahoma-city\n", + "\n", + "Title: Oklahoma City Metro Area Population 1950-2025\n", + "Description: The metro area population of Oklahoma City in 2022 was 1,008,000, a 1% increase from 2021.\n", + "URL: https://www.macrotrends.net/cities/23088/oklahoma-city/population\n", + "\n", + "Title: Oklahoma City city, Oklahoma - Census Bureau Profile\n", + "Description: We're sorry but this website doesn't work properly without JavaScript enabled. Please enable it to continue\n", + "URL: https://data.census.gov/profile/Oklahoma_City_city,_Oklahoma?g=160XX00US4055000\n", + "\n", + "Title: Oklahoma City - Wikipedia\n", + "Description: The Oklahoma City metropolitan area has an estimated 1.49 million residents, the largest metropolitan area in the state and 42nd-most populous in the nation. The city limits extend into Canadian, Cleveland, and Pottawatomie counties; however, areas beyond Oklahoma County primarily consist of ...\n", + "URL: https://en.wikipedia.org/wiki/Oklahoma_City\n", + "\n", + "Title: U.S. Census Bureau QuickFacts: Oklahoma City city, Oklahoma\n", + "Description: QuickFacts provides statistics for all states and counties. 
Also for cities and towns with a population of 5,000 or more.\n", + "URL: https://www.census.gov/quickfacts/oklahomacitycityoklahoma\n", + "\n", + "Title: Oklahoma City, OK | Data USA\n", + "Description: As of 2023, 12% of Oklahoma City, OK residents (82.6k people) were born outside of the United States, which is lower than the national average of 13.8%. In 2022, the percentage of foreign-born citizens in Oklahoma City, OK was 11.8%, meaning that the rate has been increasing.\n", + "URL: https://datausa.io/profile/geo/oklahoma-city-ok\n", + "\n", + "Title: Oklahoma City lands in top 20 of largest US cities by population: New census data\n", + "Description: More: 10 Oklahoma cities where population is growing the fastest · Oklahoma City grew by about 1% between 2022 and 2023, adding 7,589 people.\n", + "URL: https://www.oklahoman.com/story/news/2024/05/17/oklahoma-city-population-top-20-us-census-data-growth/73729737007/\n", + "\n", + "Title: Oklahoma Cities by Population (2025)\n", + "Description: The most populous cities in Oklahoma are Oklahoma City with 702,767, Tulsa at 411,894, Norman with 130,046, Broken Arrow at 119,194, and Edmond with 98,103.\n", + "URL: https://www.oklahoma-demographics.com/cities_by_population\n", + "\n", + "Title: Oklahoma City, OK\n", + "Description: Oklahoma City, Oklahoma has a population of 688,700. The median household income is $66,700 and 15% of residents live below the federal poverty level. 
The racial and ethnic composition is 22% Hispanic, 51% White, 13% Black, 4% Asian and 2% American Indian or Alaska Native.\n", + "URL: https://bigcitieshealthdata.org/city/oklahoma-city-ok/\n", + "\n", + "Title: Oklahoma Cities by Population 2025\n", + "Description: Oklahoma City is the biggest city in Oklahoma, with a population of 715,893 in 2025.\n", + "URL: https://worldpopulationreview.com/us-cities/oklahoma\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Multi-Tool Agent] Output: I still don't see a specific figure for exactly 2022, but I can gather enough information to make a reasonable estimate.\n", + "\n", + "[Multi-Tool Agent] Tool call: think(thought=From the search results, I can see:\n", + "1. The 2020 census population was 681,054\n", + "2. Oklahoma City grew by about 1% between 2022 and 2023, adding 7,589 people\n", + "3. This implies the 2022 population was approximately 7,589/0.01 = 758,900 people\n", + "4. But this seems too high compared to other figures like 702,767 and 688,700 mentioned\n", + "\n", + "Let me reconsider. If Oklahoma City added 7,589 people between 2022-2023 and that represented a 1% growth, then the 2022 population would be:\n", + "7,589 ÷ 0.01 = 758,900\n", + "\n", + "However, this contradicts other data points. Perhaps the clearest estimate is from the second search result which states: \"The metro area population of Oklahoma City in 2022 was 1,008,000\". But this refers to the metro area (larger than just the city proper).\n", + "\n", + "Another data point from the first search states \"Oklahoma City has a 2025 population of 715,893\" and mentions it \"has increased by 4.8% since the most recent census, which recorded a population of 683,078 in 2020.\"\n", + "\n", + "If the growth was relatively steady from 2020 to 2025, the annual growth rate would be about 0.96% per year. 
\n", + "Using that rate, the 2022 population would be approximately:\n", + "683,078 × (1.0096)² = 683,078 × 1.0193 = 696,272\n", + "\n", + "This seems most consistent with the other data points. I'll use 696,272 as the best estimate for Oklahoma City's 2022 population.)\n", + "\n", + "[Multi-Tool Agent] Tool result: Thinking complete!\n", + "\n", + "[Multi-Tool Agent] Output: Based on the available data, I can estimate that Oklahoma City's population in 2022 was approximately 696,272 people. Now I'll calculate the square root of this number.\n", + "\n", + "[Multi-Tool Agent] Tool call: calculator(number1=696272, number2=0, operator=sqrt)\n", + "\n", + "[Multi-Tool Agent] Tool result: Result: 834.4291461831855\n", + "\n", + "[Multi-Tool Agent] Output: The square root of Oklahoma City's population in 2022 (estimated at 696,272 people) is approximately 834.43.\n" + ] + }, + { + "data": { + "text/plain": [ + "Message(id='msg_016wLY8JCd5baUgaWEHcrage', content=[TextBlock(text=\"The square root of Oklahoma City's population in 2022 (estimated at 696,272 people) is approximately 834.43.\", type='text')], model='claude-3-7-sonnet-20250219', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=5, output_tokens=34, cache_creation_input_tokens=150, cache_read_input_tokens=3881, service_tier='standard'))" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example query\n", + "agent.run(\"What's the square root of the OKC population in 2022\")" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Loaded 3 MCP tools from 2 servers.\n", + "\n", + "[Multi-Tool Agent] Received: How many bananas will fit in an Toyota GR86?\n", + "\n", + "[Multi-Tool Agent] Output: That's an interesting question! 
To answer how many bananas will fit in a Toyota GR86, I'll need to find information about the car's cargo capacity and then estimate how many bananas could fit in that space.\n", + "\n", + "[Multi-Tool Agent] Tool call: brave_web_search(query=Toyota GR86 cargo capacity trunk volume)\n", + "\n", + "[Multi-Tool Agent] Tool result: Title: Cargo space. What can it fit? : r/GR86\n", + "Description: 11 votes, 36 comments. 42K subscribers in the GR86 community. For anyone interested in the Toyota GR86 & Subaru BRZ ('22+).\n", + "URL: https://www.reddit.com/r/GR86/comments/16xh6ii/cargo_space_what_can_it_fit/\n", + "\n", + "Title: 2024 Toyota GR86 Review, Pricing, and Specs\n", + "Description: The 2024 Toyota GR86 turns every day into an autocross day thanks to its playful chassis and easy-to-slide rear-wheel drive antics.\n", + "URL: https://www.caranddriver.com/toyota/gr-86/specs\n", + "\n", + "Title: 2023 Toyota GR86 Interior, Cargo Space & Seating | U.S. News\n", + "Description: "You might not think practicality was a consideration with the GR86, but it is surprisingly useful for weekly errands. Its relatively small 6.3 cubic feet of trunk space is hampered by its short height. But when the seats are folded, Toyota says the GR86 can hold a set of its own replacement ...\n", + "URL: https://cars.usnews.com/cars-trucks/toyota/gr86/2023/interior\n", + "\n", + "Title: 2025 Toyota GR86 Review, Pricing, and Specs\n", + "Description: DIMENSIONS Wheelbase: 101.4 in Length: 167.9 in Width: 69.9 in Height: 51.6 in Passenger Volume: 78 ft3 Trunk Volume: 6 ft3 Curb Weight: 2838 lb · C/D TEST RESULTS 60 mph: 5.4 sec 100 mph: 13.8 sec 1/4-Mile: 14.0 sec @ 101 mph 130 mph: 27.2 sec Results above omit 1-ft rollout of 0.3 sec.\n", + "URL: https://www.caranddriver.com/toyota/gr-86\n", + "\n", + "Title: Toyota GR86 dimensions, boot space and similars\n", + "Description: Toyota GR86 dimensions with interior photos, boot space and measurements of length, width and height. 
Previous cars and similars in size.\n", + "URL: https://www.automobiledimension.com/model/toyota/gr86\n", + "\n", + "Title: Trunk Size | Toyota GR86 Forum - GT86 Forum, Subaru BRZ Forum, Scion ...\n", + "Description: It still surprises me that I can fit so much in the trunk. I am able to fit my entire paintball gear bag, pods, cleats, 2 cases of paint, mask case, small ice chest, and my extra clothes and smaller items. Gotta enjoy the little things with these beautiful cars!\n", + "URL: https://www.gr86.org/threads/trunk-size.8574/\n", + "\n", + "Title: Toyota GR86 Cargo Space | Toyota GR86 Forum - GT86 Forum, Subaru BRZ Forum, Scion ...\n", + "Description: Not sure if serious or market differences, but it is mandated in all cars in the U.S. if the trunk is much bigger than a shoebox lol. ... This is an older thread, you may not receive a response, and could be reviving an old thread. Please consider creating a new thread. ... Toyota GR86 Forum ...\n", + "URL: https://www.gr86.org/threads/toyota-gr86-cargo-space.372/\n", + "\n", + "Title: GR86 - GR86 Boot Dimensions | Toyota GR Forum for owners of GR Yaris, GR86, GR Supra and GR Corolla\n", + "Description: During my time with the GR86, I tried to take accurate measurements of the boot dimensions (in centimetres), as shown below. Sorry the pictures are not exactly to scale - my MS Paint skills are rather limited! = The boot opening is 60cm at its narrowest (at the bottom), 80cm at the widest...\n", + "URL: https://www.gr-zoo.com/threads/gr86-boot-dimensions.2567/\n", + "\n", + "Title: 2025 Toyota GR86 Interior, Cargo Space & Seating | U.S. News\n", + "Description: Few sports cars have large trunks, and the GR86 is no exception. It has just 6.3 cubic feet of trunk space, which is not much even for this class. 
You can fit several grocery bags or a couple of carry-on bags with the rear seats upright.\n", + "URL: https://cars.usnews.com/cars-trucks/toyota/gr86/interior\n", + "\n", + "Title: Toyota GR86 (2025) boot space & practicality\n", + "Description: Read what our experts think about the Toyota GR86's practicality, boot size and overall dimensions, how it compares to its rivals and more.\n", + "URL: https://www.parkers.co.uk/toyota/gr86/review/practicality/\n", + "\n", + "[Multi-Tool Agent] Output: Now I need to get information about the average size of a banana to make this calculation:\n", + "\n", + "[Multi-Tool Agent] Tool call: brave_web_search(query=average banana size dimensions volume)\n", + "\n", + "[Multi-Tool Agent] Tool result: Title: Banana for scale - Universal converter - Unit conversion for all!\n", + "Description: Introducing the banana as the universal measurement unit, all conversion-related confusion can be avoided. Use this handy app to convert imperial and metric measurements to universal bananas.\n", + "URL: http://bananaforscale.info/\n", + "\n", + "Title: How big is a banana? - Quora\n", + "Description: Answer (1 of 10): How big is a banana? Is that your banana or my banana? Musa Ingens, the world's largest banana tree growing in the forest of Papua New Guinea and Indonesia. It is the tallest species of bananas and the world's largest herbaceous plant. The size of the Musa Ingens banana is imp...\n", + "URL: https://www.quora.com/How-big-is-a-banana\n", + "\n", + "Title: How Many Calories and Carbs Are in a Banana?\n", + "Description: Bananas are extremely healthy and nutritious. Most people know this, but many wonder how many calories and carbs bananas contain.\n", + "URL: https://www.healthline.com/nutrition/bananas-calories-carbs\n", + "\n", + "Title: Banana - Wikipedia\n", + "Description: As of 2018, bananas are exported in larger volume and to a larger value than any other fruit. 
In 2022, world production of bananas and plantains combined was 179 million tonnes, led by India and China with a combined total of 26% of global production. Other major producers were Uganda, Indonesia, ...\n", + "URL: https://en.wikipedia.org/wiki/Banana\n", + "\n", + "Title: On the Usefulness of Bananas: A New System of Measurement | by Liam Foley | Medium\n", + "Description: According to a study conducted by Mahmoud Soltani of the University of Tehran, on average, a fully ripe banana fruit of the Cavendish variety measures 200.25 millimetres in length, with a mass of 180.56 grams, and an ellipsoid volume of 156.10 cubic centimetres.\n", + "URL: https://medium.com/@liamfoley2430/on-the-usefulness-of-bananas-a-new-system-of-measurement-76c50d8bb89a\n", + "\n", + "Title: Banana-based Length & Distance Unit Converter | Convert To Bananas\n", + "Description: Convert any length to bananas (and to useful units as well)\n", + "URL: https://www.converttobananas.com/\n", + "\n", + "Title: r/atlanticdiscussions on Reddit: HOW LONG IS THE, UM, AVERAGE BANANA? And why do you all want to know so bad?\n", + "Description: As a writer on the internet, I often feel as though I’m getting a peek into the American mind. Recently, while looking at search data, I discovered…\n", + "URL: https://www.reddit.com/r/atlanticdiscussions/comments/pbenqf/how_long_is_the_um_average_banana_and_why_do_you/\n", + "\n", + "Title: How Long Is the, Um, Average Banana?\n", + "Description: Bananas come in a total of five sizes — extra small, small, medium, large and extra large. But how long is the average banana?\n", + "URL: https://melmagazine.com/en-us/story/how-long-is-a-banana\n", + "\n", + "Title: r/theydidthemath on Reddit: [Request] Assuming the bananas are normal size how many rooms would 102,769 bananas fill?\n", + "Description: The average volume of a banana is 156.1 cm³. 
Assuming you mean whole, uncrushed bananas, google search results peg the packing effeciency around 65%.\n", + "URL: https://www.reddit.com/r/theydidthemath/comments/rdiyis/request_assuming_the_bananas_are_normal_size_how/\n", + "\n", + "Title: How big is your banana?\n", + "Description: BMI Calculator Enter your details: Height (cm): Weight (kg): BMI (kg/m2) is Category is\n", + "URL: https://healthieryou.net.au/wp-content/uploads/2015/11/How-big-is-your-banana-Article.pdf\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Multi-Tool Agent] Output: Now I have the information I need to calculate how many bananas will fit in a Toyota GR86. Let me think through this:\n", + "\n", + "[Multi-Tool Agent] Tool call: think(thought=From the search results, I found:\n", + "\n", + "1. Toyota GR86 trunk capacity:\n", + " - 6.3 cubic feet of trunk space (according to multiple sources)\n", + " - This equals approximately 178,385 cubic centimeters (6.3 ft³ × 28,316.85 cm³/ft³)\n", + "\n", + "2. Average banana volume:\n", + " - A fully ripe Cavendish banana has a volume of approximately 156.10 cubic centimeters\n", + "\n", + "3. Packing efficiency:\n", + " - When packing irregular objects like bananas, we need to account for the space between them\n", + " - A Reddit comment mentioned packing efficiency of around 65% for bananas\n", + " - This means only about 65% of the total volume can be filled with bananas due to gaps\n", + "\n", + "So to calculate how many bananas will fit in the Toyota GR86 trunk:\n", + "- Available volume with packing efficiency: 178,385 cm³ × 0.65 = 115,950 cm³\n", + "- Number of bananas: 115,950 cm³ ÷ 156.10 cm³ = 743 bananas\n", + "\n", + "However, the Toyota GR86 also has an interior cabin with a passenger volume of approximately 78 ft³ (according to one of the search results). 
If we were to fill the entire car with bananas (which is not practical but answers the hypothetical question):\n", + "\n", + "Total car volume = 78 ft³ (passenger) + 6.3 ft³ (trunk) = 84.3 ft³ = 2,387,110 cm³\n", + "\n", + "With 65% packing efficiency:\n", + "- Available volume: 2,387,110 cm³ × 0.65 = 1,551,622 cm³\n", + "- Number of bananas: 1,551,622 cm³ ÷ 156.10 cm³ = 9,940 bananas\n", + "\n", + "So, approximately 9,940 bananas could theoretically fit in an entire Toyota GR86 if completely filled, or about 743 bananas in just the trunk.)\n", + "\n", + "[Multi-Tool Agent] Tool result: Thinking complete!\n", + "\n", + "[Multi-Tool Agent] Output: # How Many Bananas Will Fit in a Toyota GR86\n", + "\n", + "Based on my research, I can provide you with an estimate of how many bananas would fit in a Toyota GR86.\n", + "\n", + "## The Data:\n", + "\n", + "- **Toyota GR86 trunk capacity**: 6.3 cubic feet (approximately 178,385 cubic centimeters)\n", + "- **Toyota GR86 passenger volume**: 78 cubic feet\n", + "- **Average banana volume**: 156.10 cubic centimeters per banana\n", + "- **Packing efficiency**: Approximately 65% (accounting for inevitable gaps between irregularly shaped bananas)\n", + "\n", + "## The Calculation:\n", + "\n", + "If we're talking about filling just the trunk:\n", + "- Available volume with packing efficiency: 178,385 cm³ × 0.65 = 115,950 cm³\n", + "- Number of bananas in trunk: 115,950 cm³ ÷ 156.10 cm³ = approximately **743 bananas**\n", + "\n", + "If we're considering the entire car (including passenger compartment):\n", + "- Total car volume: 84.3 cubic feet (2,387,110 cm³)\n", + "- Available volume with packing efficiency: 2,387,110 cm³ × 0.65 = 1,551,622 cm³\n", + "- Total number of bananas: 1,551,622 cm³ ÷ 156.10 cm³ = approximately **9,940 bananas**\n", + "\n", + "So, you could fit approximately 743 bananas in the trunk alone, or nearly 10,000 bananas if you filled the entire Toyota GR86 (though that would leave no room for a driver!).\n" + 
] + }, + { + "data": { + "text/plain": [ + "Message(id='msg_01NC46TLNZSG97ZH5v7b7YcN', content=[TextBlock(text=\"# How Many Bananas Will Fit in a Toyota GR86\\n\\nBased on my research, I can provide you with an estimate of how many bananas would fit in a Toyota GR86.\\n\\n## The Data:\\n\\n- **Toyota GR86 trunk capacity**: 6.3 cubic feet (approximately 178,385 cubic centimeters)\\n- **Toyota GR86 passenger volume**: 78 cubic feet\\n- **Average banana volume**: 156.10 cubic centimeters per banana\\n- **Packing efficiency**: Approximately 65% (accounting for inevitable gaps between irregularly shaped bananas)\\n\\n## The Calculation:\\n\\nIf we're talking about filling just the trunk:\\n- Available volume with packing efficiency: 178,385 cm³ × 0.65 = 115,950 cm³\\n- Number of bananas in trunk: 115,950 cm³ ÷ 156.10 cm³ = approximately **743 bananas**\\n\\nIf we're considering the entire car (including passenger compartment):\\n- Total car volume: 84.3 cubic feet (2,387,110 cm³)\\n- Available volume with packing efficiency: 2,387,110 cm³ × 0.65 = 1,551,622 cm³\\n- Total number of bananas: 1,551,622 cm³ ÷ 156.10 cm³ = approximately **9,940 bananas**\\n\\nSo, you could fit approximately 743 bananas in the trunk alone, or nearly 10,000 bananas if you filled the entire Toyota GR86 (though that would leave no room for a driver!).\", type='text')], model='claude-3-7-sonnet-20250219', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=4, output_tokens=385, cache_creation_input_tokens=574, cache_read_input_tokens=6630, service_tier='standard'))" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "await agent.run_async(\"How many bananas will fit in an Toyota GR86?\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# 4. 
Example with Anthropic Server Tools\n", + "\n", + "This example demonstrates using Anthropic's native server tools for web search and code execution." + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Server Tools Agent] Agent initialized\n" + ] + } + ], + "source": [ + "# Create Anthropic server tools\n", + "web_search_tool = WebSearchServerTool(\n", + " name=\"web_search\",\n", + " max_uses=5, # Limit to 5 searches per request\n", + " blocked_domains=[\"example.com\"] # Example of blocking specific domains\n", + ")\n", + "\n", + "code_execution_tool = CodeExecutionServerTool()\n", + "\n", + "# Initialize agent with server tools\n", + "server_agent = Agent(\n", + " name=\"Server Tools Agent\",\n", + " system=\"\"\"\n", + "You are a helpful assistant with access to:\n", + "1. Web search for finding current information\n", + "2. Code execution for running Python code\n", + "3. Think tool for complex reasoning\n", + "\n", + "Use these tools effectively to answer questions that require current data or calculations.\n", + "\"\"\",\n", + " tools=[think_tool, web_search_tool, code_execution_tool],\n", + " config=ModelConfig(\n", + " model=\"claude-sonnet-4-20250514\",\n", + " max_tokens=4096,\n", + " temperature=0.7\n", + " ),\n", + " verbose=True\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "[Server Tools Agent] Received: \n", + "Search for the current population of Tokyo, Japan. \n", + "Then write and execute Python code to calculate how many people that would be per square kilometer, \n", + "given that Tokyo's area is approximately 2,194 square kilometers.\n", + "\n", + "\n", + "[Server Tools Agent] Output: I'll help you find Tokyo's current population and then calculate the population density. 
Let me start by searching for the current population data.\n", + "\n", + "[Server Tools Agent] Output: Now let me search for more specific information about Tokyo's current population, as the results show different figures for different definitions of \"Tokyo\" (city proper vs metropolitan area).\n", + "\n", + "[Server Tools Agent] Tool call: think(thought=Now I have a clearer picture of Tokyo's population. From the search results, I can see there are different definitions of \"Tokyo\":\n", + "\n", + "1. Tokyo Metropolis (Tokyo-to) - the prefecture/administrative area: Around 14 million people as of 2024\n", + "2. The 23 Special Wards (central Tokyo): Around 9.2-9.7 million people\n", + "3. Greater Tokyo Area (metropolitan area): Around 37 million people\n", + "\n", + "For the calculation with the given area of 2,194 square kilometers, this seems to refer to the Tokyo Metropolis (Tokyo-to) area, as the search results mention:\n", + "- \"metropolis population is 13.9 Million as on 2019 and has area of 2,194 Sq km\" (from source 17)\n", + "- The surface area of the city metropolis comes to approximately 2,187.66 km² (from source 2)\n", + "\n", + "So I should use approximately 14 million as the current population for Tokyo Metropolis for the calculation with the 2,194 sq km area.\n", + "\n", + "Let me write Python code to calculate the population density.)\n", + "\n", + "[Server Tools Agent] Tool result: Thinking complete!\n", + "\n", + "[Server Tools Agent] Output: Now I'll write and execute Python code to calculate Tokyo's population density based on the information I found.\n", + "\n", + "[Server Tools Agent] Output: Based on my search and calculations, here are the key findings:\n", + "\n", + "## Tokyo's Current Population\n", + "\n", + "\n", + "\n", + "[Server Tools Agent] Output: Tokyo Metropolis (Tokyo-to) has a population of over 14 million people as of 2023-2024\n", + "\n", + "[Server Tools Agent] Output: . 
The search results show different figures depending on the specific definition of \"Tokyo\":\n", + "\n", + "- \n", + "\n", + "[Server Tools Agent] Output: The 23 wards have a population of 9.2 million\n", + "\n", + "[Server Tools Agent] Output: \n", + "- \n", + "\n", + "[Server Tools Agent] Output: The metropolis has a population that exceeds 13 million\n", + "\n", + "[Server Tools Agent] Output: \n", + "- \n", + "\n", + "[Server Tools Agent] Output: The Greater Tokyo Area has 41 million residents as of 2024\n", + "\n", + "[Server Tools Agent] Output: \n", + "\n", + "## Population Density Calculation\n", + "\n", + "Using the current population of approximately **14 million people** for Tokyo Metropolis and the given area of **2,194 square kilometers**, the calculation shows:\n", + "\n", + "**Tokyo's population density is approximately 6,381 people per square kilometer**\n", + "\n", + "This aligns well with existing data, as \n", + "\n", + "[Server Tools Agent] Output: one source indicates Tokyo metropolis has a density of 6,349 persons per square kilometer\n", + "\n", + "[Server Tools Agent] Output: , confirming our calculation is accurate.\n", + "\n", + "## Context\n", + "\n", + "This makes Tokyo one of the most densely populated metropolitan areas in the world. 
\n", + "\n", + "[Server Tools Agent] Output: The greater Tokyo metropolitan area is estimated to be over 36 million people and is home to 25% of Japan's population, making it the most populous metropolitan area in the world\n", + "\n", + "[Server Tools Agent] Output: .\n" + ] + }, + { + "data": { + "text/plain": [ + "Message(id='msg_018nAgEXuVMSJCTSrSjcZtrj', content=[TextBlock(text=\"Now I'll write and execute Python code to calculate Tokyo's population density based on the information I found.\", type='text'), TextBlock(text=None, type='server_tool_use', id='srvtoolu_01JAxi9fcajoScWQWEG1bJ2S', name='code_execution', input={'code': '# Tokyo population and area data\\n# Based on search results, Tokyo Metropolis (Tokyo-to) has approximately 14 million people\\n# and covers 2,194 square kilometers\\n\\ntokyo_population = 14_000_000 # Current population of Tokyo Metropolis (2024)\\ntokyo_area_sq_km = 2194 # Area in square kilometers\\n\\n# Calculate population density\\npopulation_density = tokyo_population / tokyo_area_sq_km\\n\\nprint(\"Tokyo Population Density Calculation\")\\nprint(\"=\" * 40)\\nprint(f\"Tokyo Metropolis Population: {tokyo_population:,} people\")\\nprint(f\"Tokyo Metropolis Area: {tokyo_area_sq_km:,} square kilometers\")\\nprint(f\"Population Density: {population_density:,.2f} people per square kilometer\")\\n\\n# Let\\'s also show some context with rounded numbers\\nprint(f\"\\\\nRounded: Approximately {population_density:,.0f} people per square kilometer\")\\n\\n# For comparison, let\\'s also calculate for different population estimates\\nprint(\"\\\\n\" + \"=\"*50)\\nprint(\"COMPARISON WITH DIFFERENT POPULATION ESTIMATES:\")\\nprint(\"=\"*50)\\n\\n# Different population estimates from search results\\nestimates = [\\n (\"Tokyo Metropolis (conservative estimate)\", 13_500_000),\\n (\"Tokyo Metropolis (current estimate)\", 14_000_000),\\n (\"Tokyo Metropolis (high estimate)\", 14_300_000),\\n]\\n\\nfor description, population in estimates:\\n 
density = population / tokyo_area_sq_km\\n print(f\"{description}: {density:,.0f} people/km²\")'}), TextBlock(text=None, type='code_execution_tool_result', tool_use_id='srvtoolu_01JAxi9fcajoScWQWEG1bJ2S', content={'type': 'code_execution_result', 'stdout': 'Tokyo Population Density Calculation\\n========================================\\nTokyo Metropolis Population: 14,000,000 people\\nTokyo Metropolis Area: 2,194 square kilometers\\nPopulation Density: 6,381.04 people per square kilometer\\n\\nRounded: Approximately 6,381 people per square kilometer\\n\\n==================================================\\nCOMPARISON WITH DIFFERENT POPULATION ESTIMATES:\\n==================================================\\nTokyo Metropolis (conservative estimate): 6,153 people/km²\\nTokyo Metropolis (current estimate): 6,381 people/km²\\nTokyo Metropolis (high estimate): 6,518 people/km²\\n', 'stderr': '', 'return_code': 0, 'content': []}), TextBlock(text=\"Based on my search and calculations, here are the key findings:\\n\\n## Tokyo's Current Population\\n\\n\", type='text'), TextBlock(text='Tokyo Metropolis (Tokyo-to) has a population of over 14 million people as of 2023-2024', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'With a population of over 14 million in the city proper in 2023, it is one of the most populous urban areas in the world. ', 'url': 'https://en.wikipedia.org/wiki/Tokyo', 'title': 'Tokyo - Wikipedia', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDNkmb67meDKwvfNSWBoMey/C2mf4C76VGRm/IjB/p9OlOe00CdTjisZ+Soyr8DL2IDH8J+LioeVOOHK7L/3HfUnClNRaZvtHREGHnFQqE0Pko8rhETBI/USi53r6+Vq2i1sYBA=='}, {'type': 'web_search_result_location', 'cited_text': 'The area within the dashed white line is home to 14 million people, as of 2020. 
', 'url': 'http://www.statsmapsnpix.com/2022/09/how-big-is-tokyo.html', 'title': 'Stats, Maps n Pix: How big is Tokyo?', 'encrypted_index': 'EpABCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDN3Dl6f/w5+R2DpZzRoMZiaU1oKXtaWciBkzIjCeqzTw6VNVH9T8pJcQGylS/KdoN8tamtX70LsJ7ygZeDUBnbVxfCebu8Xfpv9tdJAqFN2KZA2z6Q/22MX5gewO0+OfwUtrGAQ='}, {'type': 'web_search_result_location', 'cited_text': 'The area within the dashed white line is home to 14 million people, as of 2020. ', 'url': 'http://www.statsmapsnpix.com/2022/09/how-big-is-tokyo.html', 'title': 'Stats, Maps n Pix: How big is Tokyo?', 'encrypted_index': 'EpABCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDPgOVKuta9qNyug8dRoMzPFdKW3fR2mvZIK7IjCUZybt1VVAMoVOA3VEOCprNID+ZQWavx0Z9ug53tjoeanpJvyGUIqn65McFNAKksEqFDn6oY/apm+AjLJk908w1NyD4w0OGAQ='}]), TextBlock(text='. The search results show different figures depending on the specific definition of \"Tokyo\":\\n\\n- ', type='text'), TextBlock(text='The 23 wards have a population of 9.2 million', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The 23 wards claim a population of 9.2 million, but the metropolis has a population that exceeds 13 million. ', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDGHmgL6AaaF2lV1RIBoMst2h5K/VIu2RGDTwIjBLqvkNQPtkPP1V49Wf9IxwZBOyM+HgBvi7rd6yR/Wkjr9LpT3HnIQOT5FetsLzqUoqE5k8jG1Xsz7hyicRcL/9UCJHKU8YBA=='}]), TextBlock(text='\\n- ', type='text'), TextBlock(text='The metropolis has a population that exceeds 13 million', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The 23 wards claim a population of 9.2 million, but the metropolis has a population that exceeds 13 million. 
', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDHsQQzoI+Hz+TiOD7xoMgVi2a9XbrLmJlkNKIjCXGiOzb7fzGY/ueYkCuE280j3RfDhLCJ3YWi6gVboWe2BidSHKRwJBoPbhHAaWwVYqEwZ9kwdNZiabocE1Kbh3m6GW/SMYBA=='}, {'type': 'web_search_result_location', 'cited_text': 'The 23 wards claim a population of 9.2 million, but the metropolis has a population that exceeds 13 million. ', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'EpEBCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDHolkkUxMX/ORAlhZBoMoDhnLJx+SR+8R8aTIjDBKIO2DJG07HcLucjTH1y1LCGBRw1mv/t061+rSVpTbV7iY5DXREoEUGCcMFVasUgqFT9KXif7+m2jD/0yN42wWd02HG+jbRgE'}]), TextBlock(text='\\n- ', type='text'), TextBlock(text='The Greater Tokyo Area has 41 million residents as of 2024', type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The Greater Tokyo Area, which includes Tokyo and parts of six neighboring prefectures, is the most populous metropolitan area in the world, with 41 mi...', 'url': 'https://en.wikipedia.org/wiki/Tokyo', 'title': 'Tokyo - Wikipedia', 'encrypted_index': 'Eo8BCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDPu/83J4h+//6WejfhoMabArnZYopPgIgBadIjABzh8N5tuANZ8z7yijY+HpuAP+iw6WDSwqolJU6364du78SvzlqUXbWLujKQdIhMcqE1fHzX0qTHJmNVRZHIt3AcmEBLwYBA=='}]), TextBlock(text=\"\\n\\n## Population Density Calculation\\n\\nUsing the current population of approximately **14 million people** for Tokyo Metropolis and the given area of **2,194 square kilometers**, the calculation shows:\\n\\n**Tokyo's population density is approximately 6,381 people per square kilometer**\\n\\nThis aligns well with existing data, as \", type='text'), TextBlock(text='one source indicates Tokyo metropolis has a density of 6,349 persons per square kilometer', type='text', citations=[{'type': 
'web_search_result_location', 'cited_text': 'Tokyo population in 2023 is estimated to be 35.8 million, population in 2022 is estimated to be 35.6 million, in 2021 is estimated to be 37.5 million,...', 'url': 'https://www.populationu.com/cities/tokyo-population', 'title': 'Tokyo Population 2025', 'encrypted_index': 'EpEBCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDH+CHv/68jATiDYFzRoMKun6Si5c+OHOiG9WIjAq6lZhNchC2RxsNLZMgheItJe0yOT53PlDChrsyxJztTkx8aSELTlrH+2V953cZhYqFUMI2gojH59prn81fC8mSgHNmvaKKBgE'}]), TextBlock(text=', confirming our calculation is accurate.\\n\\n## Context\\n\\nThis makes Tokyo one of the most densely populated metropolitan areas in the world. ', type='text'), TextBlock(text=\"The greater Tokyo metropolitan area is estimated to be over 36 million people and is home to 25% of Japan's population, making it the most populous metropolitan area in the world\", type='text', citations=[{'type': 'web_search_result_location', 'cited_text': 'The greater Tokyo metropolitan area, which is spread over 3 prefectures, is much larger and has a population that is estimated to be over 36 million. 
...', 'url': 'https://worldpopulationreview.com/cities/japan/tokyo', 'title': 'Tokyo Population 2025', 'encrypted_index': 'EpEBCioIBBgCIiQ3NmU4ZmI4OC1kOGFhLTQwMzMtYjQ1Mi1lMmZlY2M3MGFiZjkSDC3eHB8MpV+6PvSJ9hoMGBBbLOOC80uQmSO4IjCujon0QmWYTtDWe+Po+0YK+bXYbwcEwKmurDHUK1IuU6w2rvi9Rq2XeSecqpnOhNgqFU8JodAjBnotfaYFd1dee+V5IBAlIxgE'}]), TextBlock(text='.', type='text')], model='claude-sonnet-4-20250514', role='assistant', stop_reason='end_turn', stop_sequence=None, type='message', usage=Usage(input_tokens=9, output_tokens=841, cache_creation_input_tokens=928, cache_read_input_tokens=48328, service_tier='standard', server_tool_use={'web_search_requests': 0}), container={'id': 'container_011CQTnr9bhVu3fPTnaT85oA', 'expires_at': '2025-06-24T22:55:24.157374+00:00'})" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Example 1: Use web search to find current information and code execution for analysis\n", + "server_agent.run(\"\"\"\n", + "Search for the current population of Tokyo, Japan. \n", + "Then write and execute Python code to calculate how many people that would be per square kilometer, \n", + "given that Tokyo's area is approximately 2,194 square kilometers.\n", + "\"\"\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.11.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/agents/test_message_params.py b/agents/test_message_params.py new file mode 100644 index 000000000..461538ba3 --- /dev/null +++ b/agents/test_message_params.py @@ -0,0 +1,272 @@ +#!/usr/bin/env python3 +"""Test suite for Agent message_params functionality. 
from typing import Callable  # used only for the _run_test annotation below


class TestMessageParams:
    """Self-contained test runner for the Agent ``message_params`` argument.

    Each ``test_*`` method builds an ``Agent`` with a particular
    ``message_params`` payload and asserts on what comes back. The methods
    are executed through :meth:`run_all_tests`, which tallies the outcome
    in ``self.passed`` / ``self.failed`` instead of stopping at the first
    failure.
    """

    # This class has its own runner and an __init__, so pytest must not try
    # to collect it just because its name starts with "Test".
    __test__ = False

    def __init__(self, verbose: bool = True):
        """Initialize test suite.

        Args:
            verbose: Whether to print detailed output
        """
        self.verbose = verbose
        self.passed = 0
        self.failed = 0

    def _print(self, message: str) -> None:
        """Print message if verbose mode is on."""
        if self.verbose:
            print(message)

    @staticmethod
    def _first_text(response) -> str:
        """Return the text of the first ``text`` block in *response*.

        NOTE(review): assumes ``Agent.run`` yields an iterable of dict-like
        content blocks with ``type``/``text`` keys -- confirm against
        ``Agent.run``.

        Args:
            response: Iterable of content blocks.

        Returns:
            The first text block's ``text`` value, or ``""`` if none exists.
        """
        return next(
            (block["text"] for block in response if block.get("type") == "text"),
            "",
        )

    def _run_test(self, test_name: str, test_func: Callable[[], None]) -> None:
        """Run a single test and track results.

        Any exception raised by *test_func* (including ``AssertionError``)
        is recorded as a failure rather than propagated, so one failing
        test never aborts the rest of the suite.

        Args:
            test_name: Name of the test
            test_func: Test function to execute
        """
        self._print(f"\n{'='*60}")
        self._print(f"Running: {test_name}")
        self._print('='*60)

        try:
            test_func()
        except Exception as e:  # runner boundary: report and keep going
            self.failed += 1
            self._print(f"✗ {test_name} FAILED: {str(e)}")
            if self.verbose:
                import traceback
                traceback.print_exc()
        else:
            self.passed += 1
            self._print(f"✓ {test_name} PASSED")

    def test_basic_agent(self) -> None:
        """Test agent without message_params to ensure backward compatibility."""
        agent = Agent(
            name="BasicAgent",
            system="You are a helpful assistant. Be very brief.",
            verbose=False
        )

        response = agent.run("What is 2+2?")
        # Any text block mentioning "4" counts as a correct answer.
        assert any(
            "4" in str(block.get("text", ""))
            for block in response
            if block.get("type") == "text"
        )
        response_text = self._first_text(response)
        self._print(f"Response: {response_text}")

    def test_custom_headers(self) -> None:
        """Test passing custom headers through message_params."""
        agent = Agent(
            name="HeaderAgent",
            system="You are a helpful assistant. Be very brief.",
            verbose=False,
            message_params={
                "extra_headers": {
                    "X-Custom-Header": "test-value",
                    "X-Request-ID": "test-12345"
                }
            }
        )

        # Verify headers are stored
        assert "extra_headers" in agent.message_params
        assert agent.message_params["extra_headers"]["X-Custom-Header"] == "test-value"

        response = agent.run("What is 3+3?")
        response_text = self._first_text(response)
        assert "6" in response_text
        self._print(f"Response with custom headers: {response_text}")

    def test_beta_headers(self) -> None:
        """Test passing beta feature headers."""
        agent = Agent(
            name="BetaAgent",
            system="You are a helpful assistant. Be very brief.",
            verbose=False,
            message_params={
                "extra_headers": {
                    "anthropic-beta": "files-api-2025-04-14"
                }
            }
        )

        # The API call should succeed even with beta headers
        response = agent.run("What is 5*5?")
        response_text = self._first_text(response)
        assert "25" in response_text
        self._print(f"Response with beta headers: {response_text}")

    def test_metadata(self) -> None:
        """Test passing valid metadata fields."""
        agent = Agent(
            name="MetadataAgent",
            system="You are a helpful assistant. Be very brief.",
            verbose=False,
            message_params={
                "metadata": {
                    "user_id": "test-user-123"
                }
            }
        )

        response = agent.run("What is 10/2?")
        response_text = self._first_text(response)
        assert "5" in response_text
        self._print(f"Response with metadata: {response_text}")

    def test_api_parameters(self) -> None:
        """Test passing various API parameters."""
        agent = Agent(
            name="ParamsAgent",
            system="You are a helpful assistant.",
            verbose=False,
            message_params={
                "top_k": 10,
                "top_p": 0.95,
                "temperature": 0.7
            }
        )

        # Verify parameters are passed through
        params = agent._prepare_message_params()
        assert params["top_k"] == 10
        assert params["top_p"] == 0.95
        assert params["temperature"] == 0.7

        response = agent.run("Say 'test'")
        response_text = self._first_text(response)
        assert response_text
        self._print(f"Response with custom params: {response_text}")

    def test_parameter_override(self) -> None:
        """Test that message_params override config defaults."""
        config = ModelConfig(
            temperature=1.0,
            max_tokens=100
        )

        agent = Agent(
            name="OverrideAgent",
            system="You are a helpful assistant.",
            config=config,
            verbose=False,
            message_params={
                "temperature": 0.5,  # Should override config
                "max_tokens": 200    # Should override config
            }
        )

        params = agent._prepare_message_params()
        assert params["temperature"] == 0.5
        assert params["max_tokens"] == 200
        self._print("Parameter override successful")

    def test_invalid_metadata_field(self) -> None:
        """Test that invalid metadata fields are properly rejected by the API.

        Raises:
            AssertionError: If the request succeeds, or if it fails with an
                error that does not look like a metadata validation error.
        """
        agent = Agent(
            name="InvalidAgent",
            system="You are a helpful assistant.",
            verbose=False,
            message_params={
                "metadata": {
                    "user_id": "valid",
                    "invalid_field": "should-fail"
                }
            }
        )

        try:
            agent.run("Test")
        except Exception as e:
            # The concrete error class lives in the API client, which this
            # module does not import, so match on the error text instead.
            error_text = str(e)
            assert "invalid_request_error" in error_text or "metadata" in error_text.lower()
            self._print(f"Correctly rejected invalid metadata: {type(e).__name__}")
        else:
            # Raised outside the try on purpose: inside it, the handler above
            # would swallow this error (its message contains "metadata") and
            # the test would pass even though nothing was rejected.
            raise AssertionError("Expected API error for invalid metadata field")

    def test_combined_parameters(self) -> None:
        """Test combining multiple parameter types."""
        agent = Agent(
            name="CombinedAgent",
            system="You are a helpful assistant. Be very brief.",
            verbose=False,
            message_params={
                "extra_headers": {
                    "X-Test": "combined",
                    "anthropic-beta": "files-api-2025-04-14"
                },
                "metadata": {
                    "user_id": "combined-test"
                },
                "temperature": 0.8,
                "top_k": 5
            }
        )

        params = agent._prepare_message_params()
        assert params["extra_headers"]["X-Test"] == "combined"
        assert params["metadata"]["user_id"] == "combined-test"
        assert params["temperature"] == 0.8
        assert params["top_k"] == 5

        response = agent.run("What is 1+1?")
        response_text = self._first_text(response)
        assert "2" in response_text
        self._print(f"Response with combined params: {response_text}")

    def run_all_tests(self) -> bool:
        """Run all test cases.

        Returns:
            True if every test passed, False if at least one failed.
        """
        self._print("\nAgent message_params Test Suite")
        self._print("="*60)

        tests = [
            ("Basic Agent (No message_params)", self.test_basic_agent),
            ("Custom Headers", self.test_custom_headers),
            ("Beta Feature Headers", self.test_beta_headers),
            ("Valid Metadata", self.test_metadata),
            ("API Parameters", self.test_api_parameters),
            ("Parameter Override", self.test_parameter_override),
            ("Invalid Metadata Field", self.test_invalid_metadata_field),
            ("Combined Parameters", self.test_combined_parameters),
        ]

        for test_name, test_func in tests:
            self._run_test(test_name, test_func)

        self._print(f"\n{'='*60}")
        self._print(f"Test Results: {self.passed} passed, {self.failed} failed")
        self._print("="*60)

        return self.failed == 0
@dataclass
class Tool:
    """Common base for agent tools: a name, a description and an input schema.

    Subclasses are expected to override :meth:`execute`; the base
    implementation only raises ``NotImplementedError``.
    """

    # Identifier of the tool as emitted by ``to_dict``.
    name: str
    # Human-readable explanation of what the tool does.
    description: str
    # Schema mapping describing the tool's parameters.
    input_schema: dict[str, Any]

    def to_dict(self) -> dict[str, Any]:
        """Return the tool as a mapping with name, description and input_schema keys."""
        exported_fields = ("name", "description", "input_schema")
        return {key: getattr(self, key) for key in exported_fields}

    async def execute(self, **kwargs) -> str:
        """Run the tool with the given keyword arguments.

        Raises:
            NotImplementedError: Always, in this base class.
        """
        reason = "Tool subclasses must implement execute method"
        raise NotImplementedError(reason)
@mcp.tool(name="calculator")
def calculator(number1: float, number2: float, operator: str) -> str:
    """Performs basic calculations with two numbers.

    Args:
        number1: First number in the calculation
        number2: Second number in the calculation
        operator: Operation symbol to perform (+, -, *, /, ^, sqrt)
            Note: Only these exact symbols are supported, not words

    Returns:
        Result of the calculation
    """
    # NOTE(review): the docstring above is kept as written; FastMCP presumably
    # exposes it as the tool description shown to the model — confirm before
    # rewording it.
    #
    # Every outcome, including failures, is reported as a string: either
    # "Result: <value>" or "Error: <reason>". Nothing is raised to the caller.
    try:
        if operator == "+":
            result = number1 + number2
        elif operator == "-":
            result = number1 - number2
        elif operator == "*":
            result = number1 * number2
        elif operator == "/":
            if number2 == 0:
                return "Error: Division by zero"
            result = number1 / number2
        elif operator == "^":
            result = number1**number2
            # Python returns a complex number for a negative base raised to a
            # non-integer power. Report that explicitly instead of printing
            # something like "Result: (1.7e-16+2.8j)".
            if isinstance(result, complex):
                return "Error: Result is not a real number"
        elif operator == "sqrt":
            # Unary operation: only number1 is used, number2 is ignored.
            if number1 < 0:
                return "Error: Cannot take square root of negative number"
            result = math.sqrt(number1)
        else:
            return f"Error: Unsupported operator '{operator}'"

        # Show whole-valued floats without a trailing ".0".
        if isinstance(result, float) and result.is_integer():
            result = int(result)

        return f"Result: {result}"
    except Exception as e:
        # Covers arithmetic failures such as overflow or 0 raised to a
        # negative power.
        return f"Error: {str(e)}"
async def execute(
    self,
    operation: str,
    path: str,
    max_lines: int = 0,
    pattern: str = "*",
) -> str:
    """Dispatch a read-side file operation to the matching helper.

    Args:
        operation: Either "read" or "list".
        path: File path for "read", directory path for "list".
        max_lines: Passed to the read helper; only used for "read".
        pattern: Passed to the list helper; only used for "list".

    Returns:
        Whatever the selected helper returns, or an error string when
        ``operation`` is neither "read" nor "list".
    """
    if operation == "list":
        return await self._list_files(path, pattern)
    if operation != "read":
        return f"Error: Unsupported operation '{operation}'"
    return await self._read_file(path, max_lines)
+ + Args: + path: Path to the file to read + max_lines: Maximum number of lines to read (0 means read entire file) + """ + try: + file_path = Path(path) + + if not file_path.exists(): + return f"Error: File not found at {path}" + if not file_path.is_file(): + return f"Error: {path} is not a file" + + def read_sync(): + with open(file_path, encoding="utf-8", errors="replace") as f: + if max_lines > 0: + lines = [] + for i, line in enumerate(f): + if i >= max_lines: + break + lines.append(line) + return "".join(lines) + return f.read() + + return await asyncio.to_thread(read_sync) + except Exception as e: + return f"Error reading {path}: {str(e)}" + + async def _list_files(self, directory: str, pattern: str = "*") -> str: + """List files in a directory.""" + try: + dir_path = Path(directory) + + if not dir_path.exists(): + return f"Error: Directory not found at {directory}" + if not dir_path.is_dir(): + return f"Error: {directory} is not a directory" + + def list_sync(): + search_pattern = f"{directory}/{pattern}" + files = glob.glob(search_pattern) + + if not files: + return f"No files found matching {directory}/{pattern}" + + file_list = [] + for file_path in sorted(files): + path_obj = Path(file_path) + rel_path = str(file_path).replace(str(dir_path) + "/", "") + + if path_obj.is_dir(): + file_list.append(f"📁 {rel_path}/") + else: + file_list.append(f"📄 {rel_path}") + + return "\n".join(file_list) + + return await asyncio.to_thread(list_sync) + except Exception as e: + return f"Error listing files in {directory}: {str(e)}" + + +class FileWriteTool(Tool): + """Tool for writing and editing files.""" + + def __init__(self): + super().__init__( + name="file_write", + description=""" + Write or edit files. 
async def execute(
    self,
    operation: str,
    path: str,
    content: "str | None" = None,
    old_text: str = "",
    new_text: "str | None" = None,
) -> str:
    """Execute a file write operation.

    Args:
        operation: The operation to perform ("write" or "edit").
        path: The file path.
        content: Content to write (for write operation). An empty string is
            allowed and produces an empty file; omitting it is an error.
        old_text: Text to replace (for edit operation). Must be non-empty.
        new_text: Replacement text (for edit operation). An empty string is
            allowed and deletes ``old_text``; omitting it is an error.

    Returns:
        Result of the operation as string. Missing parameters and unknown
        operations are reported as "Error: ..." strings, not raised.
    """
    # ``content`` and ``new_text`` default to None rather than "" so that
    # "not supplied" can be told apart from an intentional empty string.
    # The annotations are quoted so they are never evaluated at definition
    # time.
    if operation == "write":
        if content is None:
            return "Error: content parameter is required"
        return await self._write_file(path, content)
    elif operation == "edit":
        # An empty old_text would match at every position, so it stays invalid.
        if not old_text or new_text is None:
            return (
                "Error: both old_text and new_text parameters "
                "are required for edit operation"
            )
        return await self._edit_file(path, old_text, new_text)
    else:
        return f"Error: Unsupported operation '{operation}'"
except Exception as e: + return f"Error writing to {path}: {str(e)}" + + async def _edit_file(self, path: str, old_text: str, new_text: str) -> str: + """Make targeted changes to a file.""" + try: + file_path = Path(path) + + if not file_path.exists(): + return f"Error: File not found at {path}" + if not file_path.is_file(): + return f"Error: {path} is not a file" + + def edit_sync(): + try: + with open( + file_path, encoding="utf-8", errors="replace" + ) as f: + content = f.read() + + if old_text not in content: + return ( + f"Error: The specified text was not " + f"found in {path}" + ) + + # Count occurrences to warn about multiple matches + count = content.count(old_text) + if count > 1: + # Edit with warning about multiple occurrences + new_content = content.replace(old_text, new_text) + with open(file_path, "w", encoding="utf-8") as f: + f.write(new_content) + return ( + f"Warning: Found {count} occurrences. " + f"All were replaced in {path}" + ) + else: + # One occurrence, straightforward replacement + new_content = content.replace(old_text, new_text) + with open(file_path, "w", encoding="utf-8") as f: + f.write(new_content) + return f"Successfully edited {path}" + except UnicodeDecodeError: + return f"Error: {path} appears to be a binary file" + + return await asyncio.to_thread(edit_sync) + except Exception as e: + return f"Error editing {path}: {str(e)}" diff --git a/agents/tools/mcp_tool.py b/agents/tools/mcp_tool.py new file mode 100644 index 000000000..dc3146299 --- /dev/null +++ b/agents/tools/mcp_tool.py @@ -0,0 +1,36 @@ +"""Tools that interface with MCP servers.""" + +from typing import Any +from .base import Tool +from ..utils.connections import MCPConnection + + +class MCPTool(Tool): + def __init__( + self, + name: str, + description: str, + input_schema: dict[str, Any], + connection: "MCPConnection", + ): + super().__init__( + name=name, description=description, input_schema=input_schema + ) + self.connection = connection + + async def 
class ThinkTool(Tool):
    """Scratchpad tool: takes a thought and performs no external action."""

    def __init__(self):
        tool_description = (
            "Use the tool to think about something. It will not obtain "
            "new information or change the database, but just append the "
            "thought to the log. Use it when complex reasoning or some "
            "cache memory is needed."
        )
        thought_property = {
            "type": "string",
            "description": "A thought to think about.",
        }
        super().__init__(
            name="think",
            description=tool_description,
            input_schema={
                "type": "object",
                "properties": {"thought": thought_property},
                "required": ["thought"],
            },
        )

    async def execute(self, thought: str) -> str:
        """Acknowledge a thought.

        The ``thought`` argument is accepted but not used here; a fixed
        confirmation string is returned instead of echoing it back.
        """
        acknowledgement = "Thinking complete!"
        return acknowledgement
@dataclass
class WebSearchServerTool:
    """Declarative description of a web search server tool.

    ``to_dict`` always emits ``type`` and ``name``; every other field is
    emitted only when it is not ``None``.
    """

    name: str = "web_search"
    type: str = "web_search_20250305"
    # Optional settings; left out of the serialized form while unset.
    max_uses: Optional[int] = None
    allowed_domains: Optional[list[str]] = None
    blocked_domains: Optional[list[str]] = None
    user_location: Optional[dict[str, Any]] = None

    def to_dict(self) -> dict[str, Any]:
        """Serialize the tool, skipping optional fields that are ``None``."""
        payload: dict[str, Any] = {"type": self.type, "name": self.name}

        optional_keys = (
            "max_uses",
            "allowed_domains",
            "blocked_domains",
            "user_location",
        )
        for key in optional_keys:
            value = getattr(self, key)
            # Compare against None so that 0 and empty lists are still emitted.
            if value is not None:
                payload[key] = value

        return payload
class MCPConnection(ABC):
    """Base class for MCP server connections.

    Intended to be used as an async context manager. Entering opens a
    transport (supplied by the subclass via ``_create_rw_context``), wraps it
    in a ``ClientSession`` and initializes that session. Exiting closes the
    session and then the transport.
    """

    def __init__(self):
        # All three are None while the connection is closed.
        self.session = None
        self._rw_ctx = None
        self._session_ctx = None

    @abstractmethod
    async def _create_rw_context(self):
        """Create the read/write context based on connection type."""

    async def __aenter__(self):
        """Initialize MCP server connection.

        Returns:
            This connection, with ``session`` ready for use.

        Raises:
            Exception: Whatever the transport or session raises while
                opening. Anything already opened is closed first.
        """
        rw_ctx = await self._create_rw_context()
        streams = await rw_ctx.__aenter__()
        # Record the transport only once it has actually been entered, so
        # cleanup never exits a context that was not opened.
        self._rw_ctx = rw_ctx
        try:
            read, write = streams
            session_ctx = ClientSession(read, write)
            self.session = await session_ctx.__aenter__()
            self._session_ctx = session_ctx
            await self.session.initialize()
        except Exception as exc:
            # When __aenter__ raises, ``async with`` does not call __aexit__,
            # so release what was opened here before propagating.
            await self.__aexit__(type(exc), exc, exc.__traceback__)
            raise
        return self

    async def __aexit__(self, exc_type, exc_val, exc_tb):
        """Clean up MCP server connection resources.

        Cleanup is best-effort: an error while closing one resource is
        printed and does not prevent closing the other. Returns ``None``,
        so an in-flight exception is never suppressed.
        """
        try:
            # Close in reverse order of opening: session first, then transport.
            for ctx in (self._session_ctx, self._rw_ctx):
                if ctx is None:
                    continue
                try:
                    await ctx.__aexit__(exc_type, exc_val, exc_tb)
                except Exception as e:
                    print(f"Error during cleanup: {e}")
        finally:
            self.session = None
            self._session_ctx = None
            self._rw_ctx = None

    def _require_session(self):
        """Return the open session, or raise if the connection is not open."""
        if self.session is None:
            raise RuntimeError(
                "MCP connection is not open; use it inside 'async with'"
            )
        return self.session

    async def list_tools(self) -> Any:
        """Retrieve available tools from the MCP server.

        Raises:
            RuntimeError: If the connection has not been entered.
        """
        response = await self._require_session().list_tools()
        return response.tools

    async def call_tool(
        self, tool_name: str, arguments: dict[str, Any]
    ) -> Any:
        """Call a tool on the MCP server with provided arguments.

        Raises:
            RuntimeError: If the connection has not been entered.
        """
        return await self._require_session().call_tool(
            tool_name, arguments=arguments
        )
def create_mcp_connection(config: dict[str, Any]) -> "MCPConnection":
    """Factory function to create the appropriate MCP connection.

    Args:
        config: Connection settings. ``type`` selects the transport
            ("stdio" or "sse", case-insensitive, default "stdio"). STDIO
            needs ``command`` and accepts ``args`` and ``env``; SSE needs
            ``url`` and accepts ``headers``.

    Returns:
        An unopened connection object of the matching class.

    Raises:
        ValueError: If a required setting is missing or ``type`` is not
            supported.
    """
    # ``or`` also covers an explicit ``"type": None``, which would otherwise
    # fail on ``.lower()``.
    conn_type = (config.get("type") or "stdio").lower()

    if conn_type == "stdio":
        if not config.get("command"):
            raise ValueError("Command is required for STDIO connections")
        return MCPConnectionStdio(
            command=config["command"],
            # Always pass a list: a missing "args" key must not reach the
            # connection as None.
            args=config.get("args") or [],
            env=config.get("env"),
        )

    elif conn_type == "sse":
        if not config.get("url"):
            raise ValueError("URL is required for SSE connections")
        return MCPConnectionSSE(
            url=config["url"], headers=config.get("headers")
        )

    else:
        raise ValueError(f"Unsupported connection type: {conn_type}")
+ ) + return mcp_tools diff --git a/agents/utils/history_util.py b/agents/utils/history_util.py new file mode 100644 index 000000000..4b8d383a8 --- /dev/null +++ b/agents/utils/history_util.py @@ -0,0 +1,124 @@ +"""Message history with token tracking and prompt caching.""" + +from typing import Any + + +class MessageHistory: + """Manages chat history with token tracking and context management.""" + + def __init__( + self, + model: str, + system: str, + context_window_tokens: int, + client: Any, + enable_caching: bool = True, + ): + self.model = model + self.system = system + self.context_window_tokens = context_window_tokens + self.messages: list[dict[str, Any]] = [] + self.total_tokens = 0 + self.enable_caching = enable_caching + self.message_tokens: list[tuple[int, int]] = ( + [] + ) # List of (input_tokens, output_tokens) tuples + self.client = client + + # set initial total tokens to system prompt + try: + system_token = ( + self.client.messages.count_tokens( + model=self.model, + system=self.system, + messages=[{"role": "user", "content": "test"}], + ).input_tokens + - 1 + ) + + except Exception: + system_token = len(self.system) / 4 + + self.total_tokens = system_token + + async def add_message( + self, + role: str, + content: str | list[dict[str, Any]], + usage: Any | None = None, + ): + """Add a message to the history and track token usage.""" + if isinstance(content, str): + content = [{"type": "text", "text": content}] + + message = {"role": role, "content": content} + self.messages.append(message) + + if role == "assistant" and usage: + total_input = ( + usage.input_tokens + + getattr(usage, "cache_read_input_tokens", 0) + + getattr(usage, "cache_creation_input_tokens", 0) + ) + output_tokens = usage.output_tokens + + current_turn_input = total_input - self.total_tokens + self.message_tokens.append((current_turn_input, output_tokens)) + self.total_tokens += current_turn_input + output_tokens + + def truncate(self) -> None: + """Remove oldest messages 
when context window limit is exceeded.""" + if self.total_tokens <= self.context_window_tokens: + return + + TRUNCATION_NOTICE_TOKENS = 25 + TRUNCATION_MESSAGE = { + "role": "user", + "content": [ + { + "type": "text", + "text": "[Earlier history has been truncated.]", + } + ], + } + + def remove_message_pair(): + self.messages.pop(0) + self.messages.pop(0) + + if self.message_tokens: + input_tokens, output_tokens = self.message_tokens.pop(0) + self.total_tokens -= input_tokens + output_tokens + + while ( + self.message_tokens + and len(self.messages) >= 2 + and self.total_tokens > self.context_window_tokens + ): + remove_message_pair() + + if self.messages and self.message_tokens: + original_input_tokens, original_output_tokens = ( + self.message_tokens[0] + ) + self.messages[0] = TRUNCATION_MESSAGE + self.message_tokens[0] = ( + TRUNCATION_NOTICE_TOKENS, + original_output_tokens, + ) + self.total_tokens += ( + TRUNCATION_NOTICE_TOKENS - original_input_tokens + ) + + def format_for_api(self) -> list[dict[str, Any]]: + """Format messages for Claude API with optional caching.""" + result = [ + {"role": m["role"], "content": m["content"]} for m in self.messages + ] + + if self.enable_caching and self.messages: + result[-1]["content"] = [ + {**block, "cache_control": {"type": "ephemeral"}} + for block in self.messages[-1]["content"] + ] + return result diff --git a/agents/utils/tool_util.py b/agents/utils/tool_util.py new file mode 100644 index 000000000..763fb749e --- /dev/null +++ b/agents/utils/tool_util.py @@ -0,0 +1,39 @@ +"""Tool execution utility with parallel execution support.""" + +import asyncio +from typing import Any + + +async def _execute_single_tool( + call: Any, tool_dict: dict[str, Any] +) -> dict[str, Any]: + """Execute a single tool and handle errors.""" + response = {"type": "tool_result", "tool_use_id": call.id} + + try: + # Execute the tool directly + result = await tool_dict[call.name].execute(**call.input) + response["content"] = str(result) + 
except KeyError: + response["content"] = f"Tool '{call.name}' not found" + response["is_error"] = True + except Exception as e: + response["content"] = f"Error executing tool: {str(e)}" + response["is_error"] = True + + return response + + +async def execute_tools( + tool_calls: list[Any], tool_dict: dict[str, Any], parallel: bool = True +) -> list[dict[str, Any]]: + """Execute multiple tools sequentially or in parallel.""" + + if parallel: + return await asyncio.gather( + *[_execute_single_tool(call, tool_dict) for call in tool_calls] + ) + else: + return [ + await _execute_single_tool(call, tool_dict) for call in tool_calls + ] diff --git a/autonomous-coding/.gitignore b/autonomous-coding/.gitignore new file mode 100644 index 000000000..b4da783c2 --- /dev/null +++ b/autonomous-coding/.gitignore @@ -0,0 +1,5 @@ +# Agent-generated output directories +generations/ + +# Log files +logs/ diff --git a/autonomous-coding/README.md b/autonomous-coding/README.md new file mode 100644 index 000000000..6a3ac47f5 --- /dev/null +++ b/autonomous-coding/README.md @@ -0,0 +1,163 @@ +# Autonomous Coding Agent Demo + +A minimal harness demonstrating long-running autonomous coding with the Claude Agent SDK. This demo implements a two-agent pattern (initializer + coding agent) that can build complete applications over multiple sessions. 
+ +## Prerequisites + +**Required:** Install the latest versions of both Claude Code and the Claude Agent SDK: + +```bash +# Install Claude Code CLI (latest version required) +npm install -g @anthropic-ai/claude-code + +# Install Python dependencies +pip install -r requirements.txt +``` + +Verify your installations: +```bash +claude --version # Should be latest version +pip show claude-code-sdk # Check SDK is installed +``` + +**API Key:** Set your Anthropic API key: +```bash +export ANTHROPIC_API_KEY='your-api-key-here' +``` + +## Quick Start + +```bash +python autonomous_agent_demo.py --project-dir ./my_project +``` + +For testing with limited iterations: +```bash +python autonomous_agent_demo.py --project-dir ./my_project --max-iterations 3 +``` + +## Important Timing Expectations + +> **Warning: This demo takes a long time to run!** + +- **First session (initialization):** The agent generates a `feature_list.json` with 200 test cases. This takes several minutes and may appear to hang - this is normal. The agent is writing out all the features. + +- **Subsequent sessions:** Each coding iteration can take **5-15 minutes** depending on complexity. + +- **Full app:** Building all 200 features typically requires **many hours** of total runtime across multiple sessions. + +**Tip:** The 200 features parameter in the prompts is designed for comprehensive coverage. If you want faster demos, you can modify `prompts/initializer_prompt.md` to reduce the feature count (e.g., 20-50 features for a quicker demo). + +## How It Works + +### Two-Agent Pattern + +1. **Initializer Agent (Session 1):** Reads `app_spec.txt`, creates `feature_list.json` with 200 test cases, sets up project structure, and initializes git. + +2. **Coding Agent (Sessions 2+):** Picks up where the previous session left off, implements features one by one, and marks them as passing in `feature_list.json`. 
+ +### Session Management + +- Each session runs with a fresh context window +- Progress is persisted via `feature_list.json` and git commits +- The agent auto-continues between sessions (3 second delay) +- Press `Ctrl+C` to pause; run the same command to resume + +## Security Model + +This demo uses a defense-in-depth security approach (see `security.py` and `client.py`): + +1. **OS-level Sandbox:** Bash commands run in an isolated environment +2. **Filesystem Restrictions:** File operations restricted to the project directory only +3. **Bash Allowlist:** Only specific commands are permitted: + - File inspection: `ls`, `cat`, `head`, `tail`, `wc`, `grep` + - Node.js: `npm`, `node` + - Version control: `git` + - Process management: `ps`, `lsof`, `sleep`, `pkill` (dev processes only) + +Commands not in the allowlist are blocked by the security hook. + +## Project Structure + +``` +autonomous-coding/ +├── autonomous_agent_demo.py # Main entry point +├── agent.py # Agent session logic +├── client.py # Claude SDK client configuration +├── security.py # Bash command allowlist and validation +├── progress.py # Progress tracking utilities +├── prompts.py # Prompt loading utilities +├── prompts/ +│ ├── app_spec.txt # Application specification +│ ├── initializer_prompt.md # First session prompt +│ └── coding_prompt.md # Continuation session prompt +└── requirements.txt # Python dependencies +``` + +## Generated Project Structure + +After running, your project directory will contain: + +``` +my_project/ +├── feature_list.json # Test cases (source of truth) +├── app_spec.txt # Copied specification +├── init.sh # Environment setup script +├── claude-progress.txt # Session progress notes +├── .claude_settings.json # Security settings +└── [application files] # Generated application code +``` + +## Running the Generated Application + +After the agent completes (or pauses), you can run the generated application: + +```bash +cd generations/my_project + +# Run the setup script 
created by the agent +./init.sh + +# Or manually (typical for Node.js apps): +npm install +npm run dev +``` + +The application will typically be available at `http://localhost:3000` or similar (check the agent's output or `init.sh` for the exact URL). + +## Command Line Options + +| Option | Description | Default | +|--------|-------------|---------| +| `--project-dir` | Directory for the project | `./autonomous_demo_project` | +| `--max-iterations` | Max agent iterations | Unlimited | +| `--model` | Claude model to use | `claude-sonnet-4-5-20250929` | + +## Customization + +### Changing the Application + +Edit `prompts/app_spec.txt` to specify a different application to build. + +### Adjusting Feature Count + +Edit `prompts/initializer_prompt.md` and change the "200 features" requirement to a smaller number for faster demos. + +### Modifying Allowed Commands + +Edit `security.py` to add or remove commands from `ALLOWED_COMMANDS`. + +## Troubleshooting + +**"Appears to hang on first run"** +This is normal. The initializer agent is generating 200 detailed test cases, which takes significant time. Watch for `[Tool: ...]` output to confirm the agent is working. + +**"Command blocked by security hook"** +The agent tried to run a command not in the allowlist. This is the security system working as intended. If needed, add the command to `ALLOWED_COMMANDS` in `security.py`. + +**"API key not set"** +Ensure `ANTHROPIC_API_KEY` is exported in your shell environment. + +## License + +Internal Anthropic use. diff --git a/autonomous-coding/agent.py b/autonomous-coding/agent.py new file mode 100644 index 000000000..8856d4023 --- /dev/null +++ b/autonomous-coding/agent.py @@ -0,0 +1,206 @@ +""" +Agent Session Logic +=================== + +Core agent interaction functions for running autonomous coding sessions. 
# ---- autonomous-coding/agent.py ----

# Pause, in seconds, before the loop starts the next session on its own.
AUTO_CONTINUE_DELAY_SECONDS = 3


def _echo_assistant_message(msg) -> str:
    """Print an assistant message's text and tool calls; return its text."""
    pieces = []
    for block in msg.content:
        block_type = type(block).__name__

        if block_type == "TextBlock" and hasattr(block, "text"):
            pieces.append(block.text)
            print(block.text, end="", flush=True)
        elif block_type == "ToolUseBlock" and hasattr(block, "name"):
            print(f"\n[Tool: {block.name}]", flush=True)
            if hasattr(block, "input"):
                input_str = str(block.input)
                # Tool inputs can be whole files; show only a short preview.
                if len(input_str) > 200:
                    print(f" Input: {input_str[:200]}...", flush=True)
                else:
                    print(f" Input: {input_str}", flush=True)
    return "".join(pieces)


def _echo_tool_results(msg) -> None:
    """Print a one-line outcome for every tool result in a user message."""
    for block in msg.content:
        if type(block).__name__ != "ToolResultBlock":
            continue

        result_content = getattr(block, "content", "")
        is_error = getattr(block, "is_error", False)

        # NOTE(review): substring heuristic - any tool output that contains
        # the word "blocked" is reported as a security-hook rejection.
        if "blocked" in str(result_content).lower():
            print(f" [BLOCKED] {result_content}", flush=True)
        elif is_error:
            # Show errors (truncated)
            error_str = str(result_content)[:500]
            print(f" [Error] {error_str}", flush=True)
        else:
            # Tool succeeded - just show brief confirmation
            print(" [Done]", flush=True)


async def run_agent_session(
    client: "ClaudeSDKClient",
    message: str,
    project_dir: Path,
) -> tuple[str, str]:
    """
    Run a single agent session and echo its activity to stdout.

    Messages and content blocks are recognised by class name
    (``type(x).__name__``), so any object with the expected name and
    attributes is handled.

    Args:
        client: Claude SDK client; must provide ``query()`` and
            ``receive_response()``
        message: The prompt to send
        project_dir: Project directory path (not used by this function;
            kept so existing callers keep working)

    Returns:
        (status, response_text) where status is:
        - "continue" if the session finished and the loop may go on
          (response_text is the concatenated assistant text)
        - "error" if an exception was raised (response_text is its message)
    """
    print("Sending prompt to Claude Agent SDK...\n")

    try:
        await client.query(message)

        response_text = ""
        async for msg in client.receive_response():
            msg_type = type(msg).__name__

            if msg_type == "AssistantMessage" and hasattr(msg, "content"):
                response_text += _echo_assistant_message(msg)
            elif msg_type == "UserMessage" and hasattr(msg, "content"):
                _echo_tool_results(msg)

        print("\n" + "-" * 70 + "\n")
        return "continue", response_text

    except Exception as e:
        # Session boundary: report the failure and let the caller retry with
        # a fresh session instead of aborting the whole run.
        print(f"Error during agent session: {e}")
        return "error", str(e)


async def run_autonomous_agent(
    project_dir: Path,
    model: str,
    max_iterations: Optional[int] = None,
) -> None:
    """
    Run the autonomous agent loop.

    Every iteration creates a fresh client and runs one session. The
    initializer prompt is used for as long as ``feature_list.json`` is
    missing from the project directory, so a first session that fails before
    writing the list is retried as an initializer rather than being followed
    by a coding session that has nothing to work from.

    Args:
        project_dir: Directory for the project (created if missing)
        model: Claude model to use
        max_iterations: Maximum number of iterations. ``None`` means
            unlimited; ``0`` runs no session at all.
    """
    has_limit = max_iterations is not None

    print("\n" + "=" * 70)
    print(" AUTONOMOUS CODING AGENT DEMO")
    print("=" * 70)
    print(f"\nProject directory: {project_dir}")
    print(f"Model: {model}")
    if has_limit:
        print(f"Max iterations: {max_iterations}")
    else:
        print("Max iterations: Unlimited (will run until completion)")
    print()

    # Create project directory
    project_dir.mkdir(parents=True, exist_ok=True)

    # The feature list is written by the initializer session; its absence
    # means no initializer has completed yet.
    tests_file = project_dir / "feature_list.json"

    if not tests_file.exists():
        print("Fresh start - will use initializer agent")
        print()
        print("=" * 70)
        print(" NOTE: First session takes 10-20+ minutes!")
        print(" The agent is generating 200 detailed test cases.")
        print(" This may appear to hang - it's working. Watch for [Tool: ...] output.")
        print("=" * 70)
        print()
        # Copy the app spec into the project directory for the agent to read
        copy_spec_to_project(project_dir)
    else:
        print("Continuing existing project")
        print_progress_summary(project_dir)

    iteration = 0

    while True:
        iteration += 1

        # Explicit None check so that a limit of 0 is honoured instead of
        # being mistaken for "unlimited".
        if has_limit and iteration > max_iterations:
            print(f"\nReached max iterations ({max_iterations})")
            print("To continue, run the script again without --max-iterations")
            break

        # Re-checked every iteration: see the docstring.
        needs_initializer = not tests_file.exists()

        print_session_header(iteration, needs_initializer)

        # Create client (fresh context)
        client = create_client(project_dir, model)

        if needs_initializer:
            prompt = get_initializer_prompt()
        else:
            prompt = get_coding_prompt()

        # Run session with async context manager
        async with client:
            status, _response = await run_agent_session(client, prompt, project_dir)

        if status == "continue":
            print(f"\nAgent will auto-continue in {AUTO_CONTINUE_DELAY_SECONDS}s...")
            print_progress_summary(project_dir)
            await asyncio.sleep(AUTO_CONTINUE_DELAY_SECONDS)

        elif status == "error":
            print("\nSession encountered an error")
            print("Will retry with a fresh session...")
            await asyncio.sleep(AUTO_CONTINUE_DELAY_SECONDS)

        # Small delay between sessions
        if not has_limit or iteration < max_iterations:
            print("\nPreparing next session...\n")
            await asyncio.sleep(1)

    # Final summary
    print("\n" + "=" * 70)
    print(" SESSION COMPLETE")
    print("=" * 70)
    print(f"\nProject directory: {project_dir}")
    print_progress_summary(project_dir)

    # Print instructions for running the generated application
    print("\n" + "-" * 70)
    print(" TO RUN THE GENERATED APPLICATION:")
    print("-" * 70)
    print(f"\n cd {project_dir.resolve()}")
    print(" ./init.sh # Run the setup script")
    print(" # Or manually:")
    print(" npm install && npm run dev")
    print("\n Then open http://localhost:3000 (or check init.sh for the URL)")
    print("-" * 70)

    print("\nDone!")


# ---- autonomous-coding/autonomous_agent_demo.py ----

# Model used when --model is not given.
DEFAULT_MODEL = "claude-sonnet-4-5-20250929"


def parse_args() -> argparse.Namespace:
    """Parse command line arguments (read from ``sys.argv``)."""
    parser = argparse.ArgumentParser(
        description="Autonomous Coding Agent Demo - Long-running agent harness",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  # Start fresh project
  python autonomous_agent_demo.py --project-dir ./claude_clone

  # Use a specific model
  python autonomous_agent_demo.py --project-dir ./claude_clone --model claude-sonnet-4-5-20250929

  # Limit iterations for testing
  python autonomous_agent_demo.py --project-dir ./claude_clone --max-iterations 5

  # Continue existing project
  python autonomous_agent_demo.py --project-dir ./claude_clone

Environment Variables:
  ANTHROPIC_API_KEY    Your Anthropic API key (required)
        """,
    )

    parser.add_argument(
        "--project-dir",
        type=Path,
        default=Path("./autonomous_demo_project"),
        help="Directory for the project (default: generations/autonomous_demo_project). Relative paths automatically placed in generations/ directory.",
    )

    parser.add_argument(
        "--max-iterations",
        type=int,
        default=None,
        help="Maximum number of agent iterations (default: unlimited)",
    )

    parser.add_argument(
        "--model",
        type=str,
        default=DEFAULT_MODEL,
        help=f"Claude model to use (default: {DEFAULT_MODEL})",
    )

    return parser.parse_args()


def _resolve_project_dir(project_dir: Path) -> Path:
    """Return the directory to build in: relative paths go under generations/.

    Absolute paths and relative paths already inside ``generations/`` are
    returned unchanged. The comparison is done on path components, so
    ``./generations/app`` and paths written with the platform's own separator
    are recognised as already placed instead of being prefixed a second time.
    """
    if project_dir.is_absolute():
        return project_dir

    parts = project_dir.parts
    # A bare "generations" is still treated as a project *name*, as before.
    if len(parts) > 1 and parts[0] == "generations":
        return project_dir

    return Path("generations") / project_dir


def main() -> None:
    """Main entry point: validate the environment and run the agent loop."""
    args = parse_args()

    # Check for API key
    if not os.environ.get("ANTHROPIC_API_KEY"):
        print("Error: ANTHROPIC_API_KEY environment variable not set")
        print("\nGet your API key from: https://console.anthropic.com/")
        print("\nThen set it:")
        print(" export ANTHROPIC_API_KEY='your-api-key-here'")
        return

    # Automatically place projects in generations/ directory unless already specified
    project_dir = _resolve_project_dir(args.project_dir)

    # Run the agent
    try:
        asyncio.run(
            run_autonomous_agent(
                project_dir=project_dir,
                model=args.model,
                max_iterations=args.max_iterations,
            )
        )
    except KeyboardInterrupt:
        # Progress lives in the project directory, so the same command resumes.
        print("\n\nInterrupted by user")
        print("To resume, run the same command again")
    except Exception as e:
        print(f"\nFatal error: {e}")
        raise


if __name__ == "__main__":
    main()
# ---- autonomous-coding/client.py ----

# Puppeteer MCP tools for browser automation
PUPPETEER_TOOLS = [
    "mcp__puppeteer__puppeteer_navigate",
    "mcp__puppeteer__puppeteer_screenshot",
    "mcp__puppeteer__puppeteer_click",
    "mcp__puppeteer__puppeteer_fill",
    "mcp__puppeteer__puppeteer_select",
    "mcp__puppeteer__puppeteer_hover",
    "mcp__puppeteer__puppeteer_evaluate",
]

# Built-in tools
BUILTIN_TOOLS = [
    "Read",
    "Write",
    "Edit",
    "Glob",
    "Grep",
    "Bash",
]


def create_client(project_dir: Path, model: str) -> "ClaudeSDKClient":
    """
    Create a Claude Agent SDK client with multi-layered security.

    Side effects: creates ``project_dir`` if needed, writes
    ``.claude_settings.json`` into it and prints a summary of the settings.

    Args:
        project_dir: Directory for the project
        model: Claude model to use

    Returns:
        Configured ClaudeSDKClient

    Raises:
        ValueError: If ANTHROPIC_API_KEY is not set in the environment.

    Security layers (defense in depth):
    1. Sandbox - OS-level bash command isolation prevents filesystem escape
    2. Permissions - File operations restricted to project_dir only
    3. Security hooks - Bash commands validated against an allowlist
       (see security.py for ALLOWED_COMMANDS)
    """
    # The key itself is not passed on from here; this only fails fast with a
    # helpful message when it is missing.
    if not os.environ.get("ANTHROPIC_API_KEY"):
        raise ValueError(
            "ANTHROPIC_API_KEY environment variable not set.\n"
            "Get your API key from: https://console.anthropic.com/"
        )

    # Create comprehensive security settings
    # Note: Using relative paths ("./**") restricts access to project directory
    # since cwd is set to project_dir
    security_settings = {
        "sandbox": {"enabled": True, "autoAllowBashIfSandboxed": True},
        "permissions": {
            "defaultMode": "acceptEdits",  # Auto-approve edits within allowed directories
            "allow": [
                # Allow all file operations within the project directory
                "Read(./**)",
                "Write(./**)",
                "Edit(./**)",
                "Glob(./**)",
                "Grep(./**)",
                # Bash permission granted here, but actual commands are validated
                # by the bash_security_hook (see security.py for allowed commands)
                "Bash(*)",
                # Allow Puppeteer MCP tools for browser automation
                *PUPPETEER_TOOLS,
            ],
        },
    }

    # Ensure project directory exists before creating settings file
    project_dir.mkdir(parents=True, exist_ok=True)

    # Write settings to a file in the project directory. The encoding is
    # pinned so the file does not depend on the platform's locale.
    settings_file = project_dir / ".claude_settings.json"
    with open(settings_file, "w", encoding="utf-8") as f:
        json.dump(security_settings, f, indent=2)

    print(f"Created security settings at {settings_file}")
    print(" - Sandbox enabled (OS-level bash isolation)")
    print(f" - Filesystem restricted to: {project_dir.resolve()}")
    print(" - Bash commands restricted to allowlist (see security.py)")
    print(" - MCP servers: puppeteer (browser automation)")
    print()

    return ClaudeSDKClient(
        options=ClaudeCodeOptions(
            model=model,
            system_prompt="You are an expert full-stack developer building a production-quality web application.",
            allowed_tools=[
                *BUILTIN_TOOLS,
                *PUPPETEER_TOOLS,
            ],
            mcp_servers={
                "puppeteer": {"command": "npx", "args": ["puppeteer-mcp-server"]}
            },
            hooks={
                "PreToolUse": [
                    HookMatcher(matcher="Bash", hooks=[bash_security_hook]),
                ],
            },
            max_turns=1000,
            cwd=str(project_dir.resolve()),
            settings=str(settings_file.resolve()),  # Use absolute path
        )
    )


# ---- autonomous-coding/progress.py ----


def count_passing_tests(project_dir: Path) -> tuple[int, int]:
    """
    Count passing and total tests in feature_list.json.

    The file is written by the agent, so its content is not trusted: a
    missing, unreadable or malformed file, or JSON whose top level is not a
    list, yields ``(0, 0)`` instead of raising. List entries that are not
    objects count towards the total but never as passing.

    Args:
        project_dir: Directory containing feature_list.json

    Returns:
        (passing_count, total_count)
    """
    tests_file = project_dir / "feature_list.json"

    if not tests_file.exists():
        return 0, 0

    try:
        with open(tests_file, "r", encoding="utf-8") as f:
            tests = json.load(f)
    except (OSError, ValueError):
        # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
        return 0, 0

    if not isinstance(tests, list):
        return 0, 0

    passing = sum(
        1 for test in tests if isinstance(test, dict) and test.get("passes", False)
    )
    return passing, len(tests)


def print_session_header(session_num: int, is_initializer: bool) -> None:
    """Print a formatted header for the session."""
    session_type = "INITIALIZER" if is_initializer else "CODING AGENT"

    print("\n" + "=" * 70)
    print(f" SESSION {session_num}: {session_type}")
    print("=" * 70)
    print()


def print_progress_summary(project_dir: Path) -> None:
    """Print a summary of current progress."""
    passing, total = count_passing_tests(project_dir)

    if total > 0:
        percentage = (passing / total) * 100
        print(f"\nProgress: {passing}/{total} tests passing ({percentage:.1f}%)")
    else:
        # Also reached when the file exists but is empty or unreadable.
        print("\nProgress: feature_list.json not yet created")
# ---- autonomous-coding/prompts.py ----

# Directory that holds the prompt templates and app spec, next to this module.
PROMPTS_DIR = Path(__file__).parent / "prompts"


def load_prompt(name: str) -> str:
    """Load a prompt template from the prompts directory.

    Args:
        name: Template name without the ``.md`` extension.

    Returns:
        The template text, decoded as UTF-8 regardless of the platform's
        default encoding.

    Raises:
        OSError: If the template cannot be read (e.g. it does not exist).
    """
    prompt_path = PROMPTS_DIR / f"{name}.md"
    return prompt_path.read_text(encoding="utf-8")


def get_initializer_prompt() -> str:
    """Load the initializer prompt."""
    return load_prompt("initializer_prompt")


def get_coding_prompt() -> str:
    """Load the coding agent prompt."""
    return load_prompt("coding_prompt")


def copy_spec_to_project(project_dir: Path) -> None:
    """Copy the app spec file into the project directory for the agent to read.

    An ``app_spec.txt`` that already exists in ``project_dir`` is left
    untouched, so the call is safe to repeat on resumed projects.
    """
    spec_source = PROMPTS_DIR / "app_spec.txt"
    spec_dest = project_dir / "app_spec.txt"
    if spec_dest.exists():
        return

    shutil.copy(spec_source, spec_dest)
    print("Copied app_spec.txt to project directory")
+ + + React with Vite + Tailwind CSS (via CDN) + React hooks and context + React Router for navigation + React Markdown for message rendering + Syntax highlighting for code blocks + Only launch on port {frontend_port} + + + Node.js with Express + SQLite with better-sqlite3 + Claude API for chat completions + Server-Sent Events for streaming responses + + + RESTful endpoints + SSE for real-time message streaming + Integration with Claude API using Anthropic SDK + + + + + + - Repository includes .env with VITE_ANTHROPIC_API_KEY configured + - Frontend dependencies pre-installed via pnpm + - Backend code goes in /server directory + - Install backend dependencies as needed + + + + + + - Clean, centered chat layout with message bubbles + - Streaming message responses with typing indicator + - Markdown rendering with proper formatting + - Code blocks with syntax highlighting and copy button + - LaTeX/math equation rendering + - Image upload and display in messages + - Multi-turn conversations with context + - Message editing and regeneration + - Stop generation button during streaming + - Input field with auto-resize textarea + - Character count and token estimation + - Keyboard shortcuts (Enter to send, Shift+Enter for newline) + + + + - Artifact detection and rendering in side panel + - Code artifact viewer with syntax highlighting + - HTML/SVG preview with live rendering + - React component preview + - Mermaid diagram rendering + - Text document artifacts + - Artifact editing and re-prompting + - Full-screen artifact view + - Download artifact content + - Artifact versioning and history + + + + - Create new conversations + - Conversation list in sidebar + - Rename conversations + - Delete conversations + - Search conversations by title/content + - Pin important conversations + - Archive conversations + - Conversation folders/organization + - Duplicate conversation + - Export conversation (JSON, Markdown, PDF) + - Conversation timestamps (created, last updated) + - 
Unread message indicators + + + + - Create projects to group related conversations + - Project knowledge base (upload documents) + - Project-specific custom instructions + - Share projects with team (mock feature) + - Project settings and configuration + - Move conversations between projects + - Project templates + - Project analytics (usage stats) + + + + - Model selector dropdown with the following models: + - Claude Sonnet 4.5 (claude-sonnet-4-5-20250929) - default + - Claude Haiku 4.5 (claude-haiku-4-5-20251001) + - Claude Opus 4.1 (claude-opus-4-1-20250805) + - Model capabilities display + - Context window indicator + - Model-specific pricing info (display only) + - Switch models mid-conversation + - Model comparison view + + + + - Global custom instructions + - Project-specific custom instructions + - Conversation-specific system prompts + - Custom instruction templates + - Preview how instructions affect responses + + + + - Theme selection (Light, Dark, Auto) + - Font size adjustment + - Message density (compact, comfortable, spacious) + - Code theme selection + - Language preferences + - Accessibility options + - Keyboard shortcuts reference + - Data export options + - Privacy settings + - API key management + + + + - Temperature control slider + - Max tokens adjustment + - Top-p (nucleus sampling) control + - System prompt override + - Thinking/reasoning mode toggle + - Multi-modal input (text + images) + - Voice input (optional, mock UI) + - Response suggestions + - Related prompts + - Conversation branching + + + + - Share conversation via link (read-only) + - Export conversation formats + - Conversation templates + - Prompt library + - Share artifacts + - Team workspaces (mock UI) + + + + - Search across all conversations + - Filter by project, date, model + - Prompt library with categories + - Example conversations + - Quick actions menu + - Command palette (Cmd/Ctrl+K) + + + + - Token usage display per message + - Conversation cost estimation + - 
Daily/monthly usage dashboard + - Usage limits and warnings + - API quota tracking + + + + - Welcome screen for new users + - Feature tour highlights + - Example prompts to get started + - Quick tips and best practices + - Keyboard shortcuts tutorial + + + + - Full keyboard navigation + - Screen reader support + - ARIA labels and roles + - High contrast mode + - Focus management + - Reduced motion support + + + + - Mobile-first responsive layout + - Touch-optimized interface + - Collapsible sidebar on mobile + - Swipe gestures for navigation + - Adaptive artifact display + - Progressive Web App (PWA) support + + + + + + + - id, email, name, avatar_url + - created_at, last_login + - preferences (JSON: theme, font_size, etc.) + - custom_instructions + + + + - id, user_id, name, description, color + - custom_instructions, knowledge_base_path + - created_at, updated_at + - is_archived, is_pinned + + + + - id, user_id, project_id, title + - model, created_at, updated_at, last_message_at + - is_archived, is_pinned, is_deleted + - settings (JSON: temperature, max_tokens, etc.) 
+ - token_count, message_count + + + + - id, conversation_id, role (user/assistant/system) + - content, created_at, edited_at + - tokens, finish_reason + - images (JSON array of image data) + - parent_message_id (for branching) + + + + - id, message_id, conversation_id + - type (code/html/svg/react/mermaid/text) + - title, identifier, language + - content, version + - created_at, updated_at + + + + - id, conversation_id, share_token + - created_at, expires_at, view_count + - is_public + + + + - id, user_id, title, description + - prompt_template, category, tags (JSON) + - is_public, usage_count + - created_at, updated_at + + + + - id, user_id, project_id, name, parent_folder_id + - created_at, position + + + + - id, folder_id, conversation_id + + + + - id, user_id, conversation_id, message_id + - model, input_tokens, output_tokens + - cost_estimate, created_at + + + + - id, user_id, key_name, api_key_hash + - created_at, last_used_at + - is_active + + + + + + + - POST /api/auth/login + - POST /api/auth/logout + - GET /api/auth/me + - PUT /api/auth/profile + + + + - GET /api/conversations + - POST /api/conversations + - GET /api/conversations/:id + - PUT /api/conversations/:id + - DELETE /api/conversations/:id + - POST /api/conversations/:id/duplicate + - POST /api/conversations/:id/export + - PUT /api/conversations/:id/archive + - PUT /api/conversations/:id/pin + - POST /api/conversations/:id/branch + + + + - GET /api/conversations/:id/messages + - POST /api/conversations/:id/messages + - PUT /api/messages/:id + - DELETE /api/messages/:id + - POST /api/messages/:id/regenerate + - GET /api/messages/stream (SSE endpoint) + + + + - GET /api/conversations/:id/artifacts + - GET /api/artifacts/:id + - PUT /api/artifacts/:id + - DELETE /api/artifacts/:id + - POST /api/artifacts/:id/fork + - GET /api/artifacts/:id/versions + + + + - GET /api/projects + - POST /api/projects + - GET /api/projects/:id + - PUT /api/projects/:id + - DELETE /api/projects/:id + - POST 
/api/projects/:id/knowledge + - GET /api/projects/:id/conversations + - PUT /api/projects/:id/settings + + + + - POST /api/conversations/:id/share + - GET /api/share/:token + - DELETE /api/share/:token + - PUT /api/share/:token/settings + + + + - GET /api/prompts/library + - POST /api/prompts/library + - GET /api/prompts/:id + - PUT /api/prompts/:id + - DELETE /api/prompts/:id + - GET /api/prompts/categories + - GET /api/prompts/examples + + + + - GET /api/search/conversations?q=query + - GET /api/search/messages?q=query + - GET /api/search/artifacts?q=query + - GET /api/search/prompts?q=query + + + + - GET /api/folders + - POST /api/folders + - PUT /api/folders/:id + - DELETE /api/folders/:id + - POST /api/folders/:id/items + - DELETE /api/folders/:id/items/:conversationId + + + + - GET /api/usage/daily + - GET /api/usage/monthly + - GET /api/usage/by-model + - GET /api/usage/conversations/:id + + + + - GET /api/settings + - PUT /api/settings + - GET /api/settings/custom-instructions + - PUT /api/settings/custom-instructions + + + + - POST /api/claude/chat (proxy to Claude API) + - POST /api/claude/chat/stream (streaming proxy) + - GET /api/claude/models + - POST /api/claude/images/upload + + + + + + - Three-column layout: sidebar (conversations), main (chat), panel (artifacts) + - Collapsible sidebar with resize handle + - Responsive breakpoints: mobile (single column), tablet (two column), desktop (three column) + - Persistent header with project/model selector + - Bottom input area with send button and options + + + + - New chat button (prominent) + - Project selector dropdown + - Search conversations input + - Conversations list (grouped by date: Today, Yesterday, Previous 7 days, etc.) 
+ - Folder tree view (collapsible) + - Settings gear icon at bottom + - User profile at bottom + + + + - Conversation title (editable inline) + - Model selector badge + - Message history (scrollable) + - Welcome screen for new conversations + - Suggested prompts (empty state) + - Input area with formatting toolbar + - Attachment button for images + - Send button with loading state + - Stop generation button + + + + - Artifact header with title and type badge + - Code editor or preview pane + - Tabs for multiple artifacts + - Full-screen toggle + - Download button + - Edit/Re-prompt button + - Version selector + - Close panel button + + + + - Settings modal (tabbed interface) + - Share conversation modal + - Export options modal + - Project settings modal + - Prompt library modal + - Command palette overlay + - Keyboard shortcuts reference + + + + + + - Primary: Orange/amber accent (#CC785C claude-style) + - Background: White (light mode), Dark gray (#1A1A1A dark mode) + - Surface: Light gray (#F5F5F5 light), Darker gray (#2A2A2A dark) + - Text: Near black (#1A1A1A light), Off-white (#E5E5E5 dark) + - Borders: Light gray (#E5E5E5 light), Dark gray (#404040 dark) + - Code blocks: Monaco editor theme + + + + - Sans-serif system font stack (Inter, SF Pro, Roboto, system-ui) + - Headings: font-semibold + - Body: font-normal, leading-relaxed + - Code: Monospace (JetBrains Mono, Consolas, Monaco) + - Message text: text-base (16px), comfortable line-height + + + + + - User messages: Right-aligned, subtle background + - Assistant messages: Left-aligned, no background + - Markdown formatting with proper spacing + - Inline code with bg-gray-100 background + - Code blocks with syntax highlighting + - Copy button on code blocks + + + + - Primary: Orange/amber background, white text, rounded + - Secondary: Border style with hover fill + - Icon buttons: Square with hover background + - Disabled state: Reduced opacity, no pointer events + + + + - Rounded borders with focus ring + 
- Textarea auto-resize + - Placeholder text in gray + - Error states in red + - Character counter + + + + - Subtle border or shadow + - Rounded corners (8px) + - Padding: p-4 to p-6 + - Hover state: slight shadow increase + + + + + - Smooth transitions (150-300ms) + - Fade in for new messages + - Slide in for sidebar + - Typing indicator animation + - Loading spinner for generation + - Skeleton loaders for content + + + + + + 1. User types message in input field + 2. Optional: Attach images via button + 3. Click send or press Enter + 4. Message appears in chat immediately + 5. Typing indicator shows while waiting + 6. Response streams in word by word + 7. Code blocks render with syntax highlighting + 8. Artifacts detected and rendered in side panel + 9. Message complete, enable regenerate option + + + + 1. Assistant generates artifact in response + 2. Artifact panel slides in from right + 3. Content renders (code with highlighting or live preview) + 4. User can edit artifact inline + 5. "Re-prompt" button to iterate with Claude + 6. Download or copy artifact content + 7. Full-screen mode for detailed work + 8. Close panel to return to chat focus + + + + 1. Click "New Chat" to start fresh conversation + 2. Conversations auto-save with first message + 3. Auto-generate title from first exchange + 4. Click title to rename inline + 5. Drag conversations into folders + 6. Right-click for context menu (pin, archive, delete, export) + 7. Search filters conversations in real-time + 8. 
Click conversation to switch context + + + + + + Setup Project Foundation and Database + + - Initialize Express server with SQLite database + - Set up Claude API client with streaming support + - Create database schema with migrations + - Implement authentication endpoints + - Set up basic CORS and middleware + - Create health check endpoint + + + + + Build Core Chat Interface + + - Create main layout with sidebar and chat area + - Implement message display with markdown rendering + - Add streaming message support with SSE + - Build input area with auto-resize textarea + - Add code block syntax highlighting + - Implement stop generation functionality + - Add typing indicators and loading states + + + + + Conversation Management + + - Create conversation list in sidebar + - Implement new conversation creation + - Add conversation switching + - Build conversation rename functionality + - Implement delete with confirmation + - Add conversation search + - Create conversation grouping by date + + + + + Artifacts System + + - Build artifact detection from Claude responses + - Create artifact rendering panel + - Implement code artifact viewer + - Add HTML/SVG live preview + - Build artifact editing interface + - Add artifact versioning + - Implement full-screen artifact view + + + + + Projects and Organization + + - Create projects CRUD endpoints + - Build project selector UI + - Implement project-specific custom instructions + - Add folder system for conversations + - Create drag-and-drop organization + - Build project settings panel + + + + + Advanced Features + + - Add model selection dropdown + - Implement temperature and parameter controls + - Build image upload functionality + - Create message editing and regeneration + - Add conversation branching + - Implement export functionality + + + + + Settings and Customization + + - Build settings modal with tabs + - Implement theme switching (light/dark) + - Add custom instructions management + - Create keyboard shortcuts 
+ - Build prompt library + - Add usage tracking dashboard + + + + + Sharing and Collaboration + + - Implement conversation sharing with tokens + - Create public share view + - Add export to multiple formats + - Build prompt templates + - Create example conversations + + + + + Polish and Optimization + + - Optimize for mobile responsiveness + - Add command palette (Cmd+K) + - Implement comprehensive keyboard navigation + - Add onboarding flow + - Create accessibility improvements + - Performance optimization and caching + + + + + + + - Streaming chat responses work smoothly + - Artifact detection and rendering accurate + - Conversation management intuitive and reliable + - Project organization clear and useful + - Image upload and display working + - All CRUD operations functional + + + + - Interface matches claude.ai design language + - Responsive on all device sizes + - Smooth animations and transitions + - Fast response times and minimal lag + - Intuitive navigation and workflows + - Clear feedback for all actions + + + + - Clean, maintainable code structure + - Proper error handling throughout + - Secure API key management + - Optimized database queries + - Efficient streaming implementation + - Comprehensive testing coverage + + + + - Consistent with claude.ai visual design + - Beautiful typography and spacing + - Smooth animations and micro-interactions + - Excellent contrast and accessibility + - Professional, polished appearance + - Dark mode fully implemented + + + diff --git a/autonomous-coding/prompts/coding_prompt.md b/autonomous-coding/prompts/coding_prompt.md new file mode 100644 index 000000000..2af09ad1a --- /dev/null +++ b/autonomous-coding/prompts/coding_prompt.md @@ -0,0 +1,197 @@ +## YOUR ROLE - CODING AGENT + +You are continuing work on a long-running autonomous development task. +This is a FRESH context window - you have no memory of previous sessions. + +### STEP 1: GET YOUR BEARINGS (MANDATORY) + +Start by orienting yourself: + +```bash +# 1. 
See your working directory +pwd + +# 2. List files to understand project structure +ls -la + +# 3. Read the project specification to understand what you're building +cat app_spec.txt + +# 4. Read the feature list to see all work +cat feature_list.json | head -50 + +# 5. Read progress notes from previous sessions +cat claude-progress.txt + +# 6. Check recent git history +git log --oneline -20 + +# 7. Count remaining tests +cat feature_list.json | grep '"passes": false' | wc -l +``` + +Understanding the `app_spec.txt` is critical - it contains the full requirements +for the application you're building. + +### STEP 2: START SERVERS (IF NOT RUNNING) + +If `init.sh` exists, run it: +```bash +chmod +x init.sh +./init.sh +``` + +Otherwise, start servers manually and document the process. + +### STEP 3: VERIFICATION TEST (CRITICAL!) + +**MANDATORY BEFORE NEW WORK:** + +The previous session may have introduced bugs. Before implementing anything +new, you MUST run verification tests. + +Run 1-2 of the feature tests marked as `"passes": true` that are most core to the app's functionality to verify they still work. +For example, if this were a chat app, you should perform a test that logs into the app, sends a message, and gets a response. + +**If you find ANY issues (functional or visual):** +- Mark that feature as "passes": false immediately +- Add issues to a list +- Fix all issues BEFORE moving to new features +- This includes UI bugs like: + * White-on-white text or poor contrast + * Random characters displayed + * Incorrect timestamps + * Layout issues or overflow + * Buttons too close together + * Missing hover states + * Console errors + +### STEP 4: CHOOSE ONE FEATURE TO IMPLEMENT + +Look at feature_list.json and find the highest-priority feature with "passes": false. + +Focus on completing one feature perfectly and completing its testing steps in this session before moving on to other features. 
+It's ok if you only complete one feature in this session, as there will be more sessions later that continue to make progress. + +### STEP 5: IMPLEMENT THE FEATURE + +Implement the chosen feature thoroughly: +1. Write the code (frontend and/or backend as needed) +2. Test manually using browser automation (see Step 6) +3. Fix any issues discovered +4. Verify the feature works end-to-end + +### STEP 6: VERIFY WITH BROWSER AUTOMATION + +**CRITICAL:** You MUST verify features through the actual UI. + +Use browser automation tools: +- Navigate to the app in a real browser +- Interact like a human user (click, type, scroll) +- Take screenshots at each step +- Verify both functionality AND visual appearance + +**DO:** +- Test through the UI with clicks and keyboard input +- Take screenshots to verify visual appearance +- Check for console errors in browser +- Verify complete user workflows end-to-end + +**DON'T:** +- Only test with curl commands (backend testing alone is insufficient) +- Use JavaScript evaluation to bypass UI (no shortcuts) +- Skip visual verification +- Mark tests passing without thorough verification + +### STEP 7: UPDATE feature_list.json (CAREFULLY!) + +**YOU CAN ONLY MODIFY ONE FIELD: "passes"** + +After thorough verification, change: +```json +"passes": false +``` +to: +```json +"passes": true +``` + +**NEVER:** +- Remove tests +- Edit test descriptions +- Modify test steps +- Combine or consolidate tests +- Reorder tests + +**ONLY CHANGE "passes" FIELD AFTER VERIFICATION WITH SCREENSHOTS.** + +### STEP 8: COMMIT YOUR PROGRESS + +Make a descriptive git commit: +```bash +git add . 
+git commit -m "Implement [feature name] - verified end-to-end + +- Added [specific changes] +- Tested with browser automation +- Updated feature_list.json: marked test #X as passing +- Screenshots in verification/ directory +" +``` + +### STEP 9: UPDATE PROGRESS NOTES + +Update `claude-progress.txt` with: +- What you accomplished this session +- Which test(s) you completed +- Any issues discovered or fixed +- What should be worked on next +- Current completion status (e.g., "45/200 tests passing") + +### STEP 10: END SESSION CLEANLY + +Before context fills up: +1. Commit all working code +2. Update claude-progress.txt +3. Update feature_list.json if tests verified +4. Ensure no uncommitted changes +5. Leave app in working state (no broken features) + +--- + +## TESTING REQUIREMENTS + +**ALL testing must use browser automation tools.** + +Available tools: +- puppeteer_navigate - Start browser and go to URL +- puppeteer_screenshot - Capture screenshot +- puppeteer_click - Click elements +- puppeteer_fill - Fill form inputs +- puppeteer_evaluate - Execute JavaScript (use sparingly, only for debugging) + +Test like a human user with mouse and keyboard. Don't take shortcuts by using JavaScript evaluation. +Don't use the puppeteer "active tab" tool. + +--- + +## IMPORTANT REMINDERS + +**Your Goal:** Production-quality application with all 200+ tests passing + +**This Session's Goal:** Complete at least one feature perfectly + +**Priority:** Fix broken tests before implementing new features + +**Quality Bar:** +- Zero console errors +- Polished UI matching the design specified in app_spec.txt +- All features work end-to-end through the UI +- Fast, responsive, professional + +**You have unlimited time.** Take as long as needed to get it right. The most important thing is that you +leave the code base in a clean state before terminating the session (Step 10). + +--- + +Begin by running Step 1 (Get Your Bearings). 
diff --git a/autonomous-coding/prompts/initializer_prompt.md b/autonomous-coding/prompts/initializer_prompt.md new file mode 100644 index 000000000..41a7928e5 --- /dev/null +++ b/autonomous-coding/prompts/initializer_prompt.md @@ -0,0 +1,106 @@ +## YOUR ROLE - INITIALIZER AGENT (Session 1 of Many) + +You are the FIRST agent in a long-running autonomous development process. +Your job is to set up the foundation for all future coding agents. + +### FIRST: Read the Project Specification + +Start by reading `app_spec.txt` in your working directory. This file contains +the complete specification for what you need to build. Read it carefully +before proceeding. + +### CRITICAL FIRST TASK: Create feature_list.json + +Based on `app_spec.txt`, create a file called `feature_list.json` with 200 detailed +end-to-end test cases. This file is the single source of truth for what +needs to be built. + +**Format:** +```json +[ + { + "category": "functional", + "description": "Brief description of the feature and what this test verifies", + "steps": [ + "Step 1: Navigate to relevant page", + "Step 2: Perform action", + "Step 3: Verify expected result" + ], + "passes": false + }, + { + "category": "style", + "description": "Brief description of UI/UX requirement", + "steps": [ + "Step 1: Navigate to page", + "Step 2: Take screenshot", + "Step 3: Verify visual requirements" + ], + "passes": false + } +] +``` + +**Requirements for feature_list.json:** +- Minimum 200 features total with testing steps for each +- Both "functional" and "style" categories +- Mix of narrow tests (2-5 steps) and comprehensive tests (10+ steps) +- At least 25 tests MUST have 10+ steps each +- Order features by priority: fundamental features first +- ALL tests start with "passes": false +- Cover every feature in the spec exhaustively + +**CRITICAL INSTRUCTION:** +IT IS CATASTROPHIC TO REMOVE OR EDIT FEATURES IN FUTURE SESSIONS. +Features can ONLY be marked as passing (change "passes": false to "passes": true). 
+Never remove features, never edit descriptions, never modify testing steps. +This ensures no functionality is missed. + +### SECOND TASK: Create init.sh + +Create a script called `init.sh` that future agents can use to quickly +set up and run the development environment. The script should: + +1. Install any required dependencies +2. Start any necessary servers or services +3. Print helpful information about how to access the running application + +Base the script on the technology stack specified in `app_spec.txt`. + +### THIRD TASK: Initialize Git + +Create a git repository and make your first commit with: +- feature_list.json (complete with all 200+ features) +- init.sh (environment setup script) +- README.md (project overview and setup instructions) + +Commit message: "Initial setup: feature_list.json, init.sh, and project structure" + +### FOURTH TASK: Create Project Structure + +Set up the basic project structure based on what's specified in `app_spec.txt`. +This typically includes directories for frontend, backend, and any other +components mentioned in the spec. + +### OPTIONAL: Start Implementation + +If you have time remaining in this session, you may begin implementing +the highest-priority features from feature_list.json. Remember: +- Work on ONE feature at a time +- Test thoroughly before marking "passes": true +- Commit your progress before session ends + +### ENDING THIS SESSION + +Before your context fills up: +1. Commit all work with descriptive messages +2. Create `claude-progress.txt` with a summary of what you accomplished +3. Ensure feature_list.json is complete and saved +4. Leave the environment in a clean, working state + +The next agent will continue from here with a fresh context window. + +--- + +**Remember:** You have unlimited time across many sessions. Focus on +quality over speed. Production-ready is the goal. 
diff --git a/autonomous-coding/requirements.txt b/autonomous-coding/requirements.txt new file mode 100644 index 000000000..0c981f616 --- /dev/null +++ b/autonomous-coding/requirements.txt @@ -0,0 +1 @@ +claude-code-sdk>=0.0.25 diff --git a/autonomous-coding/security.py b/autonomous-coding/security.py new file mode 100644 index 000000000..8605bcffc --- /dev/null +++ b/autonomous-coding/security.py @@ -0,0 +1,359 @@ +""" +Security Hooks for Autonomous Coding Agent +========================================== + +Pre-tool-use hooks that validate bash commands for security. +Uses an allowlist approach - only explicitly permitted commands can run. +""" + +import os +import shlex + + +# Allowed commands for development tasks +# Minimal set needed for the autonomous coding demo +ALLOWED_COMMANDS = { + # File inspection + "ls", + "cat", + "head", + "tail", + "wc", + "grep", + # File operations (agent uses SDK tools for most file ops, but cp/mkdir needed occasionally) + "cp", + "mkdir", + "chmod", # For making scripts executable; validated separately + # Directory + "pwd", + # Node.js development + "npm", + "node", + # Version control + "git", + # Process management + "ps", + "lsof", + "sleep", + "pkill", # For killing dev servers; validated separately + # Script execution + "init.sh", # Init scripts; validated separately +} + +# Commands that need additional validation even when in the allowlist +COMMANDS_NEEDING_EXTRA_VALIDATION = {"pkill", "chmod", "init.sh"} + + +def split_command_segments(command_string: str) -> list[str]: + """ + Split a compound command into individual command segments. + + Handles command chaining (&&, ||, ;) but not pipes (those are single commands). 
+ + Args: + command_string: The full shell command + + Returns: + List of individual command segments + """ + import re + + # Split on && and || while preserving the ability to handle each segment + # This regex splits on && or || that aren't inside quotes + segments = re.split(r"\s*(?:&&|\|\|)\s*", command_string) + + # Further split on semicolons + result = [] + for segment in segments: + sub_segments = re.split(r'(? list[str]: + """ + Extract command names from a shell command string. + + Handles pipes, command chaining (&&, ||, ;), and subshells. + Returns the base command names (without paths). + + Args: + command_string: The full shell command + + Returns: + List of command names found in the string + """ + commands = [] + + # shlex doesn't treat ; as a separator, so we need to pre-process + import re + + # Split on semicolons that aren't inside quotes (simple heuristic) + # This handles common cases like "echo hello; ls" + segments = re.split(r'(? tuple[bool, str]: + """ + Validate pkill commands - only allow killing dev-related processes. + + Uses shlex to parse the command, avoiding regex bypass vulnerabilities. 
+ + Returns: + Tuple of (is_allowed, reason_if_blocked) + """ + # Allowed process names for pkill + allowed_process_names = { + "node", + "npm", + "npx", + "vite", + "next", + } + + try: + tokens = shlex.split(command_string) + except ValueError: + return False, "Could not parse pkill command" + + if not tokens: + return False, "Empty pkill command" + + # Separate flags from arguments + args = [] + for token in tokens[1:]: + if not token.startswith("-"): + args.append(token) + + if not args: + return False, "pkill requires a process name" + + # The target is typically the last non-flag argument + target = args[-1] + + # For -f flag (full command line match), extract the first word as process name + # e.g., "pkill -f 'node server.js'" -> target is "node server.js", process is "node" + if " " in target: + target = target.split()[0] + + if target in allowed_process_names: + return True, "" + return False, f"pkill only allowed for dev processes: {allowed_process_names}" + + +def validate_chmod_command(command_string: str) -> tuple[bool, str]: + """ + Validate chmod commands - only allow making files executable with +x. + + Returns: + Tuple of (is_allowed, reason_if_blocked) + """ + try: + tokens = shlex.split(command_string) + except ValueError: + return False, "Could not parse chmod command" + + if not tokens or tokens[0] != "chmod": + return False, "Not a chmod command" + + # Look for the mode argument + # Valid modes: +x, u+x, a+x, etc. 
(anything ending with +x for execute permission) + mode = None + files = [] + + for token in tokens[1:]: + if token.startswith("-"): + # Skip flags like -R (we don't allow recursive chmod anyway) + return False, "chmod flags are not allowed" + elif mode is None: + mode = token + else: + files.append(token) + + if mode is None: + return False, "chmod requires a mode" + + if not files: + return False, "chmod requires at least one file" + + # Only allow +x variants (making files executable) + # This matches: +x, u+x, g+x, o+x, a+x, ug+x, etc. + import re + + if not re.match(r"^[ugoa]*\+x$", mode): + return False, f"chmod only allowed with +x mode, got: {mode}" + + return True, "" + + +def validate_init_script(command_string: str) -> tuple[bool, str]: + """ + Validate init.sh script execution - only allow ./init.sh. + + Returns: + Tuple of (is_allowed, reason_if_blocked) + """ + try: + tokens = shlex.split(command_string) + except ValueError: + return False, "Could not parse init script command" + + if not tokens: + return False, "Empty command" + + # The command should be exactly ./init.sh (possibly with arguments) + script = tokens[0] + + # Allow ./init.sh or paths ending in /init.sh + if script == "./init.sh" or script.endswith("/init.sh"): + return True, "" + + return False, f"Only ./init.sh is allowed, got: {script}" + + +def get_command_for_validation(cmd: str, segments: list[str]) -> str: + """ + Find the specific command segment that contains the given command. + + Args: + cmd: The command name to find + segments: List of command segments + + Returns: + The segment containing the command, or empty string if not found + """ + for segment in segments: + segment_commands = extract_commands(segment) + if cmd in segment_commands: + return segment + return "" + + +async def bash_security_hook(input_data, tool_use_id=None, context=None): + """ + Pre-tool-use hook that validates bash commands using an allowlist. + + Only commands in ALLOWED_COMMANDS are permitted. 
+ + Args: + input_data: Dict containing tool_name and tool_input + tool_use_id: Optional tool use ID + context: Optional context + + Returns: + Empty dict to allow, or {"decision": "block", "reason": "..."} to block + """ + if input_data.get("tool_name") != "Bash": + return {} + + command = input_data.get("tool_input", {}).get("command", "") + if not command: + return {} + + # Extract all commands from the command string + commands = extract_commands(command) + + if not commands: + # Could not parse - fail safe by blocking + return { + "decision": "block", + "reason": f"Could not parse command for security validation: {command}", + } + + # Split into segments for per-command validation + segments = split_command_segments(command) + + # Check each command against the allowlist + for cmd in commands: + if cmd not in ALLOWED_COMMANDS: + return { + "decision": "block", + "reason": f"Command '{cmd}' is not in the allowed commands list", + } + + # Additional validation for sensitive commands + if cmd in COMMANDS_NEEDING_EXTRA_VALIDATION: + # Find the specific segment containing this command + cmd_segment = get_command_for_validation(cmd, segments) + if not cmd_segment: + cmd_segment = command # Fallback to full command + + if cmd == "pkill": + allowed, reason = validate_pkill_command(cmd_segment) + if not allowed: + return {"decision": "block", "reason": reason} + elif cmd == "chmod": + allowed, reason = validate_chmod_command(cmd_segment) + if not allowed: + return {"decision": "block", "reason": reason} + elif cmd == "init.sh": + allowed, reason = validate_init_script(cmd_segment) + if not allowed: + return {"decision": "block", "reason": reason} + + return {} diff --git a/autonomous-coding/test_security.py b/autonomous-coding/test_security.py new file mode 100644 index 000000000..82d8a7aa9 --- /dev/null +++ b/autonomous-coding/test_security.py @@ -0,0 +1,290 @@ +#!/usr/bin/env python3 +""" +Security Hook Tests +=================== + +Tests for the bash command 
security validation logic. +Run with: python test_security.py +""" + +import asyncio +import sys + +from security import ( + bash_security_hook, + extract_commands, + validate_chmod_command, + validate_init_script, +) + + +def test_hook(command: str, should_block: bool) -> bool: + """Test a single command against the security hook.""" + input_data = {"tool_name": "Bash", "tool_input": {"command": command}} + result = asyncio.run(bash_security_hook(input_data)) + was_blocked = result.get("decision") == "block" + + if was_blocked == should_block: + status = "PASS" + else: + status = "FAIL" + expected = "blocked" if should_block else "allowed" + actual = "blocked" if was_blocked else "allowed" + reason = result.get("reason", "") + print(f" {status}: {command!r}") + print(f" Expected: {expected}, Got: {actual}") + if reason: + print(f" Reason: {reason}") + return False + + print(f" {status}: {command!r}") + return True + + +def test_extract_commands(): + """Test the command extraction logic.""" + print("\nTesting command extraction:\n") + passed = 0 + failed = 0 + + test_cases = [ + ("ls -la", ["ls"]), + ("npm install && npm run build", ["npm", "npm"]), + ("cat file.txt | grep pattern", ["cat", "grep"]), + ("/usr/bin/node script.js", ["node"]), + ("VAR=value ls", ["ls"]), + ("git status || git init", ["git", "git"]), + ] + + for cmd, expected in test_cases: + result = extract_commands(cmd) + if result == expected: + print(f" PASS: {cmd!r} -> {result}") + passed += 1 + else: + print(f" FAIL: {cmd!r}") + print(f" Expected: {expected}, Got: {result}") + failed += 1 + + return passed, failed + + +def test_validate_chmod(): + """Test chmod command validation.""" + print("\nTesting chmod validation:\n") + passed = 0 + failed = 0 + + # Test cases: (command, should_be_allowed, description) + test_cases = [ + # Allowed cases + ("chmod +x init.sh", True, "basic +x"), + ("chmod +x script.sh", True, "+x on any script"), + ("chmod u+x init.sh", True, "user +x"), + ("chmod a+x 
init.sh", True, "all +x"), + ("chmod ug+x init.sh", True, "user+group +x"), + ("chmod +x file1.sh file2.sh", True, "multiple files"), + # Blocked cases + ("chmod 777 init.sh", False, "numeric mode"), + ("chmod 755 init.sh", False, "numeric mode 755"), + ("chmod +w init.sh", False, "write permission"), + ("chmod +r init.sh", False, "read permission"), + ("chmod -x init.sh", False, "remove execute"), + ("chmod -R +x dir/", False, "recursive flag"), + ("chmod --recursive +x dir/", False, "long recursive flag"), + ("chmod +x", False, "missing file"), + ] + + for cmd, should_allow, description in test_cases: + allowed, reason = validate_chmod_command(cmd) + if allowed == should_allow: + print(f" PASS: {cmd!r} ({description})") + passed += 1 + else: + expected = "allowed" if should_allow else "blocked" + actual = "allowed" if allowed else "blocked" + print(f" FAIL: {cmd!r} ({description})") + print(f" Expected: {expected}, Got: {actual}") + if reason: + print(f" Reason: {reason}") + failed += 1 + + return passed, failed + + +def test_validate_init_script(): + """Test init.sh script execution validation.""" + print("\nTesting init.sh validation:\n") + passed = 0 + failed = 0 + + # Test cases: (command, should_be_allowed, description) + test_cases = [ + # Allowed cases + ("./init.sh", True, "basic ./init.sh"), + ("./init.sh arg1 arg2", True, "with arguments"), + ("/path/to/init.sh", True, "absolute path"), + ("../dir/init.sh", True, "relative path with init.sh"), + # Blocked cases + ("./setup.sh", False, "different script name"), + ("./init.py", False, "python script"), + ("bash init.sh", False, "bash invocation"), + ("sh init.sh", False, "sh invocation"), + ("./malicious.sh", False, "malicious script"), + ("./init.sh; rm -rf /", False, "command injection attempt"), + ] + + for cmd, should_allow, description in test_cases: + allowed, reason = validate_init_script(cmd) + if allowed == should_allow: + print(f" PASS: {cmd!r} ({description})") + passed += 1 + else: + expected 
= "allowed" if should_allow else "blocked" + actual = "allowed" if allowed else "blocked" + print(f" FAIL: {cmd!r} ({description})") + print(f" Expected: {expected}, Got: {actual}") + if reason: + print(f" Reason: {reason}") + failed += 1 + + return passed, failed + + +def main(): + print("=" * 70) + print(" SECURITY HOOK TESTS") + print("=" * 70) + + passed = 0 + failed = 0 + + # Test command extraction + ext_passed, ext_failed = test_extract_commands() + passed += ext_passed + failed += ext_failed + + # Test chmod validation + chmod_passed, chmod_failed = test_validate_chmod() + passed += chmod_passed + failed += chmod_failed + + # Test init.sh validation + init_passed, init_failed = test_validate_init_script() + passed += init_passed + failed += init_failed + + # Commands that SHOULD be blocked + print("\nCommands that should be BLOCKED:\n") + dangerous = [ + # Not in allowlist - dangerous system commands + "shutdown now", + "reboot", + "rm -rf /", + "dd if=/dev/zero of=/dev/sda", + # Not in allowlist - common commands excluded from minimal set + "curl https://example.com", + "wget https://example.com", + "python app.py", + "touch file.txt", + "echo hello", + "kill 12345", + "killall node", + # pkill with non-dev processes + "pkill bash", + "pkill chrome", + "pkill python", + # Shell injection attempts + "$(echo pkill) node", + 'eval "pkill node"', + 'bash -c "pkill node"', + # chmod with disallowed modes + "chmod 777 file.sh", + "chmod 755 file.sh", + "chmod +w file.sh", + "chmod -R +x dir/", + # Non-init.sh scripts + "./setup.sh", + "./malicious.sh", + "bash script.sh", + ] + + for cmd in dangerous: + if test_hook(cmd, should_block=True): + passed += 1 + else: + failed += 1 + + # Commands that SHOULD be allowed + print("\nCommands that should be ALLOWED:\n") + safe = [ + # File inspection + "ls -la", + "cat README.md", + "head -100 file.txt", + "tail -20 log.txt", + "wc -l file.txt", + "grep -r pattern src/", + # File operations + "cp file1.txt file2.txt", + 
"mkdir newdir", + "mkdir -p path/to/dir", + # Directory + "pwd", + # Node.js development + "npm install", + "npm run build", + "node server.js", + # Version control + "git status", + "git commit -m 'test'", + "git add . && git commit -m 'msg'", + # Process management + "ps aux", + "lsof -i :3000", + "sleep 2", + # Allowed pkill patterns for dev servers + "pkill node", + "pkill npm", + "pkill -f node", + "pkill -f 'node server.js'", + "pkill vite", + # Chained commands + "npm install && npm run build", + "ls | grep test", + # Full paths + "/usr/local/bin/node app.js", + # chmod +x (allowed) + "chmod +x init.sh", + "chmod +x script.sh", + "chmod u+x init.sh", + "chmod a+x init.sh", + # init.sh execution (allowed) + "./init.sh", + "./init.sh --production", + "/path/to/init.sh", + # Combined chmod and init.sh + "chmod +x init.sh && ./init.sh", + ] + + for cmd in safe: + if test_hook(cmd, should_block=False): + passed += 1 + else: + failed += 1 + + # Summary + print("\n" + "-" * 70) + print(f" Results: {passed} passed, {failed} failed") + print("-" * 70) + + if failed == 0: + print("\n ALL TESTS PASSED") + return 0 + else: + print(f"\n {failed} TEST(S) FAILED") + return 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/browser-use-demo/.dockerignore b/browser-use-demo/.dockerignore new file mode 100644 index 000000000..5c92e71ea --- /dev/null +++ b/browser-use-demo/.dockerignore @@ -0,0 +1,17 @@ +**/__pycache__ +**/*.pyc +**/*.pyo +**/*.pyd +.Python +*.egg-info/ +.git/ +.gitignore +*.md +.DS_Store +tests/ +*.log +.vscode/ +.idea/ +*.swp +*.swo +*~ \ No newline at end of file diff --git a/browser-use-demo/.env.example b/browser-use-demo/.env.example new file mode 100644 index 000000000..6e2cf111e --- /dev/null +++ b/browser-use-demo/.env.example @@ -0,0 +1,8 @@ +# Anthropic API Configuration +ANTHROPIC_API_KEY=your_anthropic_api_key_here + +# Port Configuration (optional - defaults shown) +VNC_PORT=5900 +STREAMLIT_PORT=8501 +NOVNC_PORT=6080 
+HTTP_PORT=8080 \ No newline at end of file diff --git a/browser-use-demo/.gitignore b/browser-use-demo/.gitignore new file mode 100644 index 000000000..767c532bc --- /dev/null +++ b/browser-use-demo/.gitignore @@ -0,0 +1,51 @@ +# Python +__pycache__/ +*.py[cod] +*$py.class +*.so +.Python +env/ +venv/ +ENV/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +*.egg-info/ +.installed.cfg +*.egg + +# IDE +.vscode/ +.idea/ +*.swp +*.swo +*~ + +# OS +.DS_Store +Thumbs.db + +# Logs +*.log +/tmp/ + +# Environment +.env +.anthropic/ + +# Test +.pytest_cache/ +.coverage +htmlcov/ + +# Streamlit +.streamlit/cache/ \ No newline at end of file diff --git a/browser-use-demo/CHANGELOG.md b/browser-use-demo/CHANGELOG.md new file mode 100644 index 000000000..c0680d59c --- /dev/null +++ b/browser-use-demo/CHANGELOG.md @@ -0,0 +1,36 @@ +# Modifications to Microsoft Playwright Source + +This file tracks all modifications made to files derived from or inspired by Microsoft Playwright source code. + +## Modified Files + +### browser_use_demo/browser_tool_utils/browser_dom_script.js +- **Date Modified**: 9/23/25 +- **Original Source**: https://github.com/microsoft/playwright/blob/main/packages/injected/src/ariaSnapshot.ts +- **Nature of Changes**: Adapted Playwright's accessibility tree generation for use with browser tools API. Implemented accessibility tree extraction with element reference tracking, visibility filtering, and YAML-formatted output. + +### browser_use_demo/browser_tool_utils/browser_element_script.js +- **Date Modified**: 9/23/25 +- **Original Source**: Microsoft Playwright element interaction patterns +- **Nature of Changes**: Implemented element finding and interaction logic inspired by Playwright's approach to reliable element targeting and coordinate calculation. 
+ +### browser_use_demo/tools/browser.py +- **Date Modified**: 9/23/25 +- **Original Source**: Microsoft Playwright click emulation implementation +- **Nature of Changes**: Click emulation methods developed with reference to Playwright source code during debugging to ensure reliable mouse interactions. +- **Date Modified**: 10/6/25 +- **Nature of Changes**: + - Fixed incorrect path to browser_tool_utils directory. Changed from `Path(__file__).parent / "browser_tool_utils"` to `Path(__file__).parent.parent / "browser_tool_utils"` to correctly locate JavaScript utility files. + - Fixed missing `cdp_url` attribute initialization in `__init__` method to prevent AttributeError in cleanup method. + - Fixed incorrect import path for browser_key_map. Changed from `.browser_tool_utils.browser_key_map` to `..browser_tool_utils.browser_key_map` to correctly import the KEY_MAP. +- **Date Modified**: 10/14/25 +- **Nature of Changes**: + - Enhanced `_scroll` and `_scroll_to` methods to return screenshots after scrolling actions. Added 0.5s stabilization delay before taking screenshots to show the new viewport content. This provides visual feedback to the model after scroll actions, consistent with the navigate action behavior. +- **Date Modified**: 12/19/25 +- **Nature of Changes**: + - Added `hover` action to move mouse cursor without clicking using Playwright's `mouse.move()` API. Useful for revealing tooltips, dropdown menus, or triggering hover states. + - Added `execute_js` action to execute JavaScript code in page context using Playwright's `page.evaluate()` API. Returns the result of the last expression. +- **Date Modified**: 1/18/26 +- **Nature of Changes**: + - Added clarifying comment in the `options` property explaining that this implementation uses fixed 1920x1080 dimensions with empirical coordinate correction, and directing users to the "Handle coordinate scaling" section in the computer use documentation for the recommended client-side downscaling approach. 
+ diff --git a/browser-use-demo/Dockerfile b/browser-use-demo/Dockerfile new file mode 100644 index 000000000..d4659e1c4 --- /dev/null +++ b/browser-use-demo/Dockerfile @@ -0,0 +1,112 @@ +FROM docker.io/ubuntu:22.04 + +ENV DEBIAN_FRONTEND=noninteractive +ENV DEBIAN_PRIORITY=high + +# Install system dependencies +RUN apt-get update && \ + apt-get -y upgrade && \ + apt-get -y install \ + # UI Requirements + xvfb \ + xterm \ + xdotool \ + scrot \ + imagemagick \ + sudo \ + mutter \ + x11vnc \ + # Python/pyenv reqs + build-essential \ + libssl-dev \ + zlib1g-dev \ + libbz2-dev \ + libreadline-dev \ + libsqlite3-dev \ + curl \ + git \ + libncursesw5-dev \ + xz-utils \ + tk-dev \ + libxml2-dev \ + libxmlsec1-dev \ + libffi-dev \ + liblzma-dev \ + # Network tools + net-tools \ + netcat \ + # PPA req + software-properties-common && \ + # Browser and desktop apps + sudo apt-get install -y --no-install-recommends \ + x11-apps \ + tint2 \ + pcmanfm \ + unzip \ + # Playwright Chromium dependencies + libnss3 \ + libnspr4 \ + libatk1.0-0 \ + libatk-bridge2.0-0 \ + libcups2 \ + libatspi2.0-0 \ + libxcomposite1 \ + libxdamage1 \ + libxrandr2 \ + libgbm1 \ + libxkbcommon0 \ + libpango-1.0-0 \ + libcairo2 \ + libasound2 && \ + apt-get clean + +# Install noVNC +RUN git clone --branch v1.5.0 https://github.com/novnc/noVNC.git /opt/noVNC && \ + git clone --branch v0.12.0 https://github.com/novnc/websockify /opt/noVNC/utils/websockify && \ + ln -s /opt/noVNC/vnc.html /opt/noVNC/index.html + +# Setup user +ENV USERNAME=browseruse +ENV HOME=/home/$USERNAME +RUN useradd -m -s /bin/bash -d $HOME $USERNAME +RUN echo "${USERNAME} ALL=(ALL) NOPASSWD: ALL" >> /etc/sudoers +USER browseruse +WORKDIR $HOME + +# Setup Python +RUN git clone https://github.com/pyenv/pyenv.git ~/.pyenv && \ + cd ~/.pyenv && src/configure && make -C src && cd .. 
&& \ + echo 'export PYENV_ROOT="$HOME/.pyenv"' >> ~/.bashrc && \ + echo 'command -v pyenv >/dev/null || export PATH="$PYENV_ROOT/bin:$PATH"' >> ~/.bashrc && \ + echo 'eval "$(pyenv init -)"' >> ~/.bashrc +ENV PYENV_ROOT="$HOME/.pyenv" +ENV PATH="$PYENV_ROOT/bin:$PATH" +ENV PYENV_VERSION=3.11.6 +RUN eval "$(pyenv init -)" && \ + pyenv install $PYENV_VERSION && \ + pyenv global $PYENV_VERSION && \ + pyenv rehash + +ENV PATH="$HOME/.pyenv/shims:$HOME/.pyenv/bin:$PATH" + +RUN python -m pip install --upgrade pip==23.1.2 setuptools==58.0.4 wheel==0.40.0 && \ + python -m pip config set global.disable-pip-version-check true + +# Copy requirements and install dependencies +COPY --chown=$USERNAME:$USERNAME browser_use_demo/requirements.txt $HOME/browser_use_demo/requirements.txt +RUN python -m pip install -r $HOME/browser_use_demo/requirements.txt + +# Install Playwright and Chromium +RUN python -m playwright install chromium && \ + python -m playwright install-deps chromium + +# Setup desktop environment & app +COPY --chown=$USERNAME:$USERNAME image/ $HOME +COPY --chown=$USERNAME:$USERNAME browser_use_demo/ $HOME/browser_use_demo/ + +# Display configuration - hardcoded values +ENV DISPLAY_NUM=1 +ENV HEIGHT=1080 +ENV WIDTH=1920 + +ENTRYPOINT [ "./entrypoint.sh" ] \ No newline at end of file diff --git a/browser-use-demo/NOTICE b/browser-use-demo/NOTICE new file mode 100644 index 000000000..dc81557e7 --- /dev/null +++ b/browser-use-demo/NOTICE @@ -0,0 +1,7 @@ +This software includes components from the following open source projects: + +Microsoft Playwright +Source: https://github.com/microsoft/playwright +License: Apache License 2.0 +Copyright (c) Microsoft Corporation +Modified files are marked with modification notices. See CHANGELOG.md for details. 
\ No newline at end of file diff --git a/browser-use-demo/README.md b/browser-use-demo/README.md new file mode 100644 index 000000000..f364fa2b2 --- /dev/null +++ b/browser-use-demo/README.md @@ -0,0 +1,248 @@ +# Browser Automation Demo with Claude + +A complete reference implementation for building browser automation with Claude using Playwright. This demo provides a containerized Streamlit interface showcasing how to give Claude the ability to navigate websites, interact with DOM elements, extract content, and fill forms. + +## Overview + +This demo implements a custom browser tool that enables Claude to interact with web browsers. It provides: + +- **DOM access**: Read page structure with element references +- **Navigation control**: Browse URLs and manage browser history +- **Form manipulation**: Directly set form input values +- **Text extraction**: Get all text content from pages +- **Element targeting**: Interact with elements via ref or coordinate parameters +- **Smart scrolling**: Scroll to specific elements or in specific directions +- **Page search**: Find and highlight text on pages +- **Visual capture**: Take screenshots and capture zoomed regions + +### Advantages Over Coordinate-Based Automation + +- **Reliability**: Element-based targeting via the `ref` parameter works across different screen sizes and layouts, unlike pixel coordinates that break when windows resize +- **Direct DOM manipulation**: Provides structured visibility into page elements and their properties, enabling precise interactions with dynamic content, hidden elements, and complex web applications +- **Web-specific actions**: Built-in support for navigation, text extraction, and form completion + +## Quick Start + +### Prerequisites + +- Docker and Docker Compose installed on your system +- Anthropic API key + +### Setup + +1. **Clone the repository**: + ```bash + git clone https://github.com/anthropics/claude-quickstarts.git + cd claude-quickstarts/browser-use-demo + ``` + +2. 
**Configure environment**: + ```bash + cp .env.example .env + # Edit .env file and add your ANTHROPIC_API_KEY + ``` + + The display resolution is set to 1920x1080 (16:9) for optimal coordinate accuracy. + - See `.env.example` for more options and coordinate scaling details + +### Running with Docker Compose + +```bash +# For a standard run (no file watching): +docker-compose up --build + +# For development with file watching (auto-sync changes): +docker-compose up --build --watch +``` + +### Access the interfaces: + - **Main UI**: http://localhost:8080 (Streamlit interface) + - **NoVNC Browser View**: http://localhost:6080 (see the browser) + - **VNC**: Connect any VNC client to localhost:5900 + +https://github.com/user-attachments/assets/4fb72078-6902-4b63-bcd1-5f2c4cd60582 + +## Usage Examples + +Once the demo is running, try these prompts in the Streamlit interface: + +- "Navigate to news.ycombinator.com and tell me the top 3 stories" +- "Go to google.com and search for 'Anthropic Claude'" +- "Visit wikipedia.org and find information about artificial intelligence" +- "Navigate to github.com and search for 'playwright'" + +Note that the current Playwright implementation hits CAPTCHAs when searching Google.com. To avoid this, we recommend that you specify the website in the prompt (e.g., navigate to Anthropic.com and search for x). + +## Safety Considerations + +Browser automation poses unique risks that are distinct from standard API features or chat interfaces. These risks are heightened when using the tool to interact with the internet. To minimize risks, consider taking precautions such as: + +1. Run the browser in an isolated virtual machine or container environment with minimal privileges to prevent direct system attacks or accidents. +2. Avoid giving the model access to sensitive data, such as account login information, to prevent information theft. +3. Consider an allowlist of domains to reduce exposure to malicious content. +4.
Ask a human to confirm decisions that may result in meaningful real-world consequences as well as any tasks requiring affirmative consent, such as accepting cookies, executing financial transactions, or agreeing to terms of service. + +In some circumstances, Claude will follow commands found in content even if they conflict with the user's instructions. For example, instructions on webpages or contained in images may override user instructions or cause Claude to make mistakes. We suggest taking precautions to isolate Claude from sensitive data and actions to avoid risks related to prompt injection. + +Finally, please inform end users of relevant risks and obtain their consent prior to enabling browser automation in your own products. + +This demo runs a browser in a containerized environment. While isolated, please note: + +- **Don't enter personal credentials or sensitive information** - This is a demonstration tool +- **Be cautious about the websites you visit** - Some sites may have anti-automation measures +- **Not for production use** - This demo is for learning and development purposes only + +## Implementation Reference + +This demo shows how to build browser automation with Claude using Playwright. All browser actions (navigate, click, type, scroll, form_input, etc.) are implemented as methods in [browser.py](browser_use_demo/tools/browser.py) using Playwright's async API.
+ +### Key Files + +- **[browser.py](browser_use_demo/tools/browser.py)** - Main tool with all browser actions +- **[loop.py](browser_use_demo/loop.py)** - Sampling loop for API calls and response handling +- **[streamlit.py](browser_use_demo/streamlit.py)** - Chat UI +- **[browser_tool_utils/](browser_use_demo/browser_tool_utils/)** - JavaScript utilities for DOM extraction, element finding, and form manipulation + +### Core Patterns + +**Element references:** JavaScript utilities generate `ref` identifiers for reliable element targeting across screen sizes (replacing brittle pixel coordinates). + +**Tool setup:** +```python +browser_tool = BrowserTool() + +def to_params(self): + return { + "name": "browser", + "description": BROWSER_TOOL_DESCRIPTION, + "input_schema": BROWSER_TOOL_INPUT_SCHEMA, + } +``` + +### Coordinate Scaling + +The browser tools implementation includes automatic coordinate scaling to ensure accurate interactions: + +**How it works:** +- The browser viewport is fixed at 1920x1080 (16:9 aspect ratio) +- Claude processes screenshots at 1456x819 pixels for 16:9 aspect ratio (see [documentation](https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size)) +- The browser tool automatically scales coordinates from Claude's processed resolution (1456x819) to the actual viewport (1920x1080) +- This ensures clicks and interactions happen at the correct locations + +See `browser_use_demo/tools/coordinate_scaling.py` for the implementation. + +This demo uses a custom tool definition with an explicit input schema, giving you full control over the tool interface. The `BROWSER_TOOL_DESCRIPTION` and `BROWSER_TOOL_INPUT_SCHEMA` constants in [browser.py](browser_use_demo/tools/browser.py) provide a complete example you can use as a starting point for your own browser automation tools. + + +### Modifying & Using as a Template + +**To modify this demo:** +1. Edit `browser_use_demo/tools/browser.py` to add features or change behavior +2. 
Rebuild the Docker image (volume mount allows live Python code updates) + +**To use as a template for your own project:** +1. Copy [browser.py](browser_use_demo/tools/browser.py) and [browser_tool_utils/](browser_use_demo/browser_tool_utils/) +2. Adapt [loop.py](browser_use_demo/loop.py) for your API integration +3. Build your UI or use [streamlit.py](browser_use_demo/streamlit.py) as a starting point + +## Architecture + +``` +┌──────────────────────────────────┐ +│ Docker Container │ +│ │ +│ ┌─────────────────────────────┐ │ +│ │ Streamlit Interface │ │ ← User interacts here +│ └──────────┬──────────────────┘ │ +│ │ │ +│ ┌──────────▼──────────────────┐ │ +│ │ Claude API + Browser Tool │ │ ← Claude controls browser +│ └──────────┬──────────────────┘ │ +│ │ │ +│ ┌──────────▼──────────────────┐ │ +│ │ Playwright + Chromium │ │ ← Browser automation +│ └──────────┬──────────────────┘ │ +│ │ │ +│ ┌──────────▼──────────────────┐ │ +│ │ XVFB Virtual Display │ │ ← Virtual display +│ └──────────┬──────────────────┘ │ +│ │ │ +│ ┌──────────▼──────────────────┐ │ +│ │ VNC/NoVNC Server │ │ ← Visual access +│ └─────────────────────────────┘ │ +└──────────────────────────────────┘ +``` + +## How Browser Automation Differs from Computer Use + +This browser automation demo is specifically optimized for web automation with DOM-aware features like element targeting, page reading, and form manipulation. While it shares many capabilities with the [computer use demo](../computer-use-demo), browser automation adds web-specific actions and the ability to target elements by reference instead of just coordinates. Computer use provides general desktop control for any application, while browser automation focuses exclusively on browser-based tasks. 
+ +### Actions Unique to Browser Automation + +These web-specific actions are not available in computer use: + +- **navigate**: Navigate to URL or use "back"/"forward" for history (requires text) +- **read_page**: Get DOM tree with element refs; use text="interactive" to filter +- **get_page_text**: Extract all text content from the page +- **find**: Search for text and highlight matches (requires text) +- **form_input**: Set form element value directly (requires ref and value) +- **scroll_to**: Scroll element into view (requires ref) +- **execute_js**: Run JavaScript code in page context (requires text with JS code) + +### Actions Shared with Computer Use + +These actions work similarly to their computer use counterparts. The key difference is that browser automation allows targeting by element reference (`ref`) as an alternative to coordinates: + +**Mouse Actions** (accept either `ref` or `coordinate`): +- **left_click**, **right_click**, **middle_click**, **double_click**, **triple_click** +- **hover**: Move cursor without clicking (for tooltips, dropdowns) +- **left_click_drag**: Drag from start_coordinate to coordinate +- **left_mouse_down**, **left_mouse_up**: Fine-grained mouse control + +**Keyboard Actions**: +- **type**: Type text at cursor (requires text) +- **key**: Press key or combination (requires text) +- **hold_key**: Hold key for duration (requires text and duration) + +**Other**: +- **screenshot**: Capture current viewport +- **scroll**: Scroll in direction (requires scroll_direction, scroll_amount, coordinate) +- **zoom**: Zoomed screenshot of region (requires region: x1, y1, x2, y2) +- **wait**: Wait for duration in seconds (requires duration, 0-100) + +### Computer Use Actions Not Included + +These desktop-level actions from computer use are not in this browser demo: + +- **cursor_position**: Get current (x, y) pixel coordinate of cursor + +This is less relevant for browser automation since the `ref` parameter provides reliable element-based 
targeting, replacing the need for cursor tracking. Note that `hover` provides similar functionality to `mouse_move` for triggering hover states. + + +## Troubleshooting + +**Browser not visible?** +- Check that port 6080 is accessible +- Try refreshing the NoVNC page +- Ensure Docker has sufficient resources allocated + +**API errors?** +- Verify your Anthropic API key is set correctly +- Check you're using a compatible model (Claude 4.5 models: claude-sonnet-4-5-20250929, claude-opus-4-5-20251101, or claude-haiku-4-5-20251001) + +**Browser actions failing?** +- Some websites may have anti-automation measures +- Try simpler websites first to test functionality +- Check the browser view to see what's happening + +## Attribution + +This software includes components from Microsoft Playwright. See the [NOTICE](NOTICE) file for details. + +## Credits + +Built with: +- [Anthropic Claude API](https://www.anthropic.com) +- [Playwright](https://playwright.dev) +- [Streamlit](https://streamlit.io) +- [NoVNC](https://novnc.com) diff --git a/browser-use-demo/browser_use_demo/__init__.py b/browser-use-demo/browser_use_demo/__init__.py new file mode 100644 index 000000000..2bd94af8e --- /dev/null +++ b/browser-use-demo/browser_use_demo/__init__.py @@ -0,0 +1,3 @@ +"""Browser Use Demo - Web automation with Claude.""" + +__version__ = "0.1.0" \ No newline at end of file diff --git a/browser-use-demo/browser_use_demo/browser_tool_utils/__init__.py b/browser-use-demo/browser_use_demo/browser_tool_utils/__init__.py new file mode 100644 index 000000000..829b39a64 --- /dev/null +++ b/browser-use-demo/browser_use_demo/browser_tool_utils/__init__.py @@ -0,0 +1 @@ +# Browser tool utility files diff --git a/browser-use-demo/browser_use_demo/browser_tool_utils/browser_dom_script.js b/browser-use-demo/browser_use_demo/browser_tool_utils/browser_dom_script.js new file mode 100644 index 000000000..cc260853c --- /dev/null +++ 
b/browser-use-demo/browser_use_demo/browser_tool_utils/browser_dom_script.js @@ -0,0 +1,447 @@ +/* + * Modifications Copyright (c) 2025 Anthropic, PBC + * Modified from original Microsoft Playwright source + * Original Microsoft Playwright source licensed under Apache License 2.0 + * See CHANGELOG.md for details + */ + +// Content script that defines the accessibility tree generation function in the MAIN context + +(function () { + // Initialize global element map and ref counter if not already present + if (!window.__claudeElementMap) { + window.__claudeElementMap = {}; + } + if (!window.__claudeRefCounter) { + window.__claudeRefCounter = 0; + } + + // Define the accessibility tree generation function on the window (in content script context) + window.__generateAccessibilityTree = function (filterType) { + try { + var result = []; + + function getRole(element) { + var role = element.getAttribute("role"); + if (role) return role; + + var tag = element.tagName.toLowerCase(); + var type = element.getAttribute("type"); + + var roleMap = { + a: "link", + button: "button", + input: + type === "submit" || type === "button" + ? "button" + : type === "checkbox" + ? "checkbox" + : type === "radio" + ? "radio" + : type === "file" + ? 
"button" + : "textbox", + select: "combobox", + textarea: "textbox", + h1: "heading", + h2: "heading", + h3: "heading", + h4: "heading", + h5: "heading", + h6: "heading", + img: "image", + nav: "navigation", + main: "main", + header: "banner", + footer: "contentinfo", + section: "region", + article: "article", + aside: "complementary", + form: "form", + table: "table", + ul: "list", + ol: "list", + li: "listitem", + label: "label", + }; + + return roleMap[tag] || "generic"; + } + + function getCleanName(element) { + var tag = element.tagName.toLowerCase(); + + // For selects, get the selected option text + if (tag === "select") { + var selectElement = element; + var selectedOption = + selectElement.querySelector("option[selected]") || + selectElement.options[selectElement.selectedIndex]; + if (selectedOption && selectedOption.textContent) { + return selectedOption.textContent.trim(); + } + } + + // Priority order for getting meaningful names + var ariaLabel = element.getAttribute("aria-label"); + if (ariaLabel && ariaLabel.trim()) return ariaLabel.trim(); + + var placeholder = element.getAttribute("placeholder"); + if (placeholder && placeholder.trim()) return placeholder.trim(); + + var title = element.getAttribute("title"); + if (title && title.trim()) return title.trim(); + + var alt = element.getAttribute("alt"); + if (alt && alt.trim()) return alt.trim(); + + // For form labels + if (element.id) { + var label = document.querySelector('label[for="' + element.id + '"]'); + if (label && label.textContent && label.textContent.trim()) { + return label.textContent.trim(); + } + } + + // For inputs with values + if (tag === "input") { + var inputElement = element; + var type = element.getAttribute("type") || ""; + var value = element.getAttribute("value"); + + if (type === "submit" && value && value.trim()) { + return value.trim(); + } + + if ( + inputElement.value && + inputElement.value.length < 50 && + inputElement.value.trim() + ) { + return 
inputElement.value.trim(); + } + } + + // For buttons, links, and other interactive elements, get direct text + if (["button", "a", "summary"].includes(tag)) { + var directText = ""; + for (var i = 0; i < element.childNodes.length; i++) { + var node = element.childNodes[i]; + if (node.nodeType === Node.TEXT_NODE) { + directText += node.textContent; + } + } + if (directText.trim()) return directText.trim(); + } + + // For headings, get text content but limit it + if (tag.match(/^h[1-6]$/)) { + var headingText = element.textContent; + if (headingText && headingText.trim()) { + return headingText.trim().substring(0, 100); + } + } + + // For images without alt, try to get surrounding context + if (tag === "img") { + var src = element.getAttribute("src"); + if (src) { + var filename = src.split("/").pop()?.split("?")[0]; + return "Image: " + filename; + } + } + + // For generic elements, get direct text content (not including child elements) + // This helps capture important text in spans, divs, etc. 
+ var directTextContent = ""; + for (var j = 0; j < element.childNodes.length; j++) { + var childNode = element.childNodes[j]; + if (childNode.nodeType === Node.TEXT_NODE) { + directTextContent += childNode.textContent; + } + } + + if ( + directTextContent && + directTextContent.trim() && + directTextContent.trim().length >= 3 + ) { + // Only return if it's meaningful text (at least 3 characters) + var trimmedText = directTextContent.trim(); + if (trimmedText.length > 50) { + return trimmedText.substring(0, 50) + "..."; + } + return trimmedText; + } + + return ""; + } + + function isVisible(element) { + var style = window.getComputedStyle(element); + return ( + style.display !== "none" && + style.visibility !== "hidden" && + style.opacity !== "0" && + element.offsetWidth > 0 && + element.offsetHeight > 0 + ); + } + + function isInteractive(element) { + var tag = element.tagName.toLowerCase(); + var interactiveTags = [ + "a", + "button", + "input", + "select", + "textarea", + "details", + "summary", + ]; + + return ( + interactiveTags.includes(tag) || + element.getAttribute("onclick") !== null || + element.getAttribute("tabindex") !== null || + element.getAttribute("role") === "button" || + element.getAttribute("role") === "link" || + element.getAttribute("contenteditable") === "true" + ); + } + + function isSemantic(element) { + var tag = element.tagName.toLowerCase(); + var semanticTags = [ + "h1", + "h2", + "h3", + "h4", + "h5", + "h6", + "nav", + "main", + "header", + "footer", + "section", + "article", + "aside", + ]; + return ( + semanticTags.includes(tag) || element.getAttribute("role") !== null + ); + } + + function shouldIncludeElement(element, options) { + var tag = element.tagName.toLowerCase(); + + // Always skip these + if ( + ["script", "style", "meta", "link", "title", "noscript"].includes(tag) + ) + return false; + if (element.getAttribute("aria-hidden") === "true") return false; + + // Always check visibility - this is now mandatory + if 
(!isVisible(element)) return false; + + // Check viewport visibility for all elements (unless using 'all' filter for find tool) + if (options.filter !== "all") { + var rect = element.getBoundingClientRect(); + var inViewport = + rect.top < window.innerHeight && + rect.bottom > 0 && + rect.left < window.innerWidth && + rect.right > 0; + if (!inViewport) return false; + } + + // Apply interactive filter if specified + if (options.filter === "interactive") { + return isInteractive(element); + } + + // Default behavior when no filter is specified (all visible elements) + // Always include interactive elements + if (isInteractive(element)) return true; + + // Always include semantic elements (headings, nav, etc.) + if (isSemantic(element)) return true; + + // Include elements with meaningful text content + if (getCleanName(element).length > 0) return true; + + // For generic divs and spans, be more selective but still include text-containing ones + var role = getRole(element); + if (role === "generic" && (tag === "div" || tag === "span")) { + var id = element.id || ""; + var className = element.className || ""; + var cleanName = getCleanName(element); + + // Include if it has meaningful text content (now that we extract text better) + if (cleanName && cleanName.length >= 3) { + return true; + } + + // Only keep divs/spans that are clearly functional containers (not layout) + var functionalKeywords = [ + "search", + "dropdown", + "menu", + "modal", + "dialog", + "popup", + "toolbar", + "sidebar", + "content", + "text", + ]; + var isFunctionalContainer = functionalKeywords.some( + function (keyword) { + return id.includes(keyword) || className.includes(keyword); + }, + ); + + if (isFunctionalContainer) { + return true; + } + + // Skip empty generic containers - they're just layout noise + return false; + } + + // Include other container elements that might have interactive children + if (isContainerElement(element)) return true; + + return false; + } + + function 
isContainerElement(element) { + var role = element.getAttribute("role"); + var tag = element.tagName.toLowerCase(); + var className = element.className || ""; + var id = element.id || ""; + + // These are containers that should be traversed deeper + return ( + role === "search" || + role === "form" || + role === "group" || + role === "toolbar" || + role === "navigation" || + tag === "form" || + tag === "fieldset" || + tag === "nav" || + // Generic functional containers + id.includes("search") || + className.includes("search") || + id.includes("form") || + className.includes("form") || + id.includes("menu") || + className.includes("menu") || + id.includes("nav") || + className.includes("nav") + ); + } + + function processElement(element, depth, options) { + if (depth > 15) return; // Generous depth limit for very complex pages + if (!element || !element.tagName) return; + + var shouldInclude = shouldIncludeElement(element, options); + var actuallyInclude = shouldInclude || depth === 0; // Always include root (body) + + if (actuallyInclude) { + var role = getRole(element); + var name = getCleanName(element); + var ref = null; + + // Check if this element already has a ref in the global map + for (var existingRef in window.__claudeElementMap) { + var weakRef = window.__claudeElementMap[existingRef]; + var existingElement = weakRef.deref(); + if (existingElement === element) { + ref = existingRef; + break; + } + } + + // If not found, create a new ref + if (!ref) { + ref = "ref_" + ++window.__claudeRefCounter; + window.__claudeElementMap[ref] = new WeakRef(element); + } + + var indent = " ".repeat(depth); + var yaml = indent + "- " + role; + + if (name) { + // Clean up the name - remove newlines, limit length + name = name.replace(/\s+/g, " ").substring(0, 100); + yaml += ' "' + name.replace(/"/g, '\\"') + '"'; + } + + yaml += " [ref=" + ref + "]"; + + // Add useful attributes + if (element.id) yaml += ' id="' + element.id + '"'; + if (element.getAttribute("href")) + 
yaml += ' href="' + element.getAttribute("href") + '"'; + if (element.getAttribute("type")) + yaml += ' type="' + element.getAttribute("type") + '"'; + if (element.getAttribute("placeholder")) + yaml += + ' placeholder="' + element.getAttribute("placeholder") + '"'; + + result.push(yaml); + } + + // Always traverse children - we need to go deep to find interactive elements + if (element.children && depth < 15) { + for (var i = 0; i < element.children.length; i++) { + processElement( + element.children[i], + actuallyInclude ? depth + 1 : depth, + options, + ); + } + } + } + + var options = { + filter: filterType, + }; + + if (document.body) { + processElement(document.body, 0, options); + } + + // Clean up stale references (elements that have been garbage collected) + for (var ref in window.__claudeElementMap) { + var weakRef = window.__claudeElementMap[ref]; + if (!weakRef.deref()) { + delete window.__claudeElementMap[ref]; + } + } + + // Filter out empty generic elements + var filteredResult = result.filter(function (line) { + return !/^\s*- generic \[ref=ref_\d+\]$/.test(line); + }); + + return { + pageContent: filteredResult.join("\n"), + viewport: { + width: window.innerWidth, + height: window.innerHeight, + }, + }; + } catch (error) { + console.error("Error in accessibility tree generation:", error); + throw new Error( + "Error generating accessibility tree: " + + (error.message || "Unknown error"), + ); + } + }; +})(); \ No newline at end of file diff --git a/browser-use-demo/browser_use_demo/browser_tool_utils/browser_element_script.js b/browser-use-demo/browser_use_demo/browser_tool_utils/browser_element_script.js new file mode 100644 index 000000000..431d24697 --- /dev/null +++ b/browser-use-demo/browser_use_demo/browser_tool_utils/browser_element_script.js @@ -0,0 +1,87 @@ +/* + * Modifications Copyright (c) 2025 Anthropic, PBC + * Modified from original Microsoft Playwright source + * Original Microsoft Playwright source licensed under Apache License 2.0 
+ * See CHANGELOG.md for details + */ + +// Script for interacting with elements by their reference IDs + +(function(elementRef) { + try { + // Get element from reference map + let targetElement = null; + + if (window.__claudeElementMap && window.__claudeElementMap[elementRef]) { + const weakRef = window.__claudeElementMap[elementRef]; + targetElement = weakRef.deref() || null; + + if (!targetElement || !document.contains(targetElement)) { + // Element has been removed from DOM + delete window.__claudeElementMap[elementRef]; + targetElement = null; + } + } + + if (!targetElement) { + return { + success: false, + action: 'get_element', + message: `No element found with reference: "${elementRef}". The element may have been removed from the page.` + }; + } + + // Scroll element into view if needed + targetElement.scrollIntoView({ behavior: 'instant', block: 'center', inline: 'center' }); + + // Force a layout/paint to ensure the element is properly positioned after scroll + targetElement.offsetHeight; + + // Get element coordinates + const rect = targetElement.getBoundingClientRect(); + const clickX = rect.left + rect.width / 2; + const clickY = rect.top + rect.height / 2; + + // Build element info string + const elementInfo = targetElement.tagName.toLowerCase() + + (targetElement.id ? '#' + targetElement.id : '') + + (targetElement.className ? '.' + targetElement.className.split(' ').filter(c => c).join('.') : ''); + + // Get additional element properties + const elementType = targetElement.getAttribute('type') || ''; + const elementRole = targetElement.getAttribute('role') || ''; + const elementAriaLabel = targetElement.getAttribute('aria-label') || ''; + const elementText = targetElement.textContent ? 
targetElement.textContent.substring(0, 100) : ''; + + return { + success: true, + coordinates: [clickX, clickY], + elementInfo: elementInfo, + elementRef: elementRef, + rect: { + left: rect.left, + top: rect.top, + right: rect.right, + bottom: rect.bottom, + width: rect.width, + height: rect.height + }, + attributes: { + type: elementType, + role: elementRole, + ariaLabel: elementAriaLabel, + text: elementText + }, + isVisible: rect.width > 0 && rect.height > 0, + isInteractable: !targetElement.disabled && + targetElement.style.display !== 'none' && + targetElement.style.visibility !== 'hidden' + }; + } catch (error) { + return { + success: false, + action: 'get_element', + message: 'Error finding element by reference: ' + (error.message || 'Unknown error') + }; + } +}) \ No newline at end of file diff --git a/browser-use-demo/browser_use_demo/browser_tool_utils/browser_form_input_script.js b/browser-use-demo/browser_use_demo/browser_tool_utils/browser_form_input_script.js new file mode 100644 index 000000000..1201aaf24 --- /dev/null +++ b/browser-use-demo/browser_use_demo/browser_tool_utils/browser_form_input_script.js @@ -0,0 +1,220 @@ +// Script for setting values in form elements by their reference IDs + +(function(elementRef, inputValue) { + try { + // Get element from reference map + let element = null; + + if (window.__claudeElementMap && window.__claudeElementMap[elementRef]) { + const weakRef = window.__claudeElementMap[elementRef]; + element = weakRef.deref() || null; + + if (!element || !document.contains(element)) { + // Element has been removed from DOM + delete window.__claudeElementMap[elementRef]; + element = null; + } + } + + if (!element) { + return { + success: false, + action: 'form_input', + message: `No element found with reference: "${elementRef}". 
The element may have been removed from the page.` + }; + } + + // Scroll element into view + element.scrollIntoView({ behavior: 'smooth', block: 'center' }); + + // Handle different element types + if (element instanceof HTMLSelectElement) { + const previousValue = element.value; + const options = Array.from(element.options); + + // Try to find option by value or text + let optionFound = false; + const valueStr = String(inputValue); + + for (let i = 0; i < options.length; i++) { + if (options[i].value === valueStr || options[i].text === valueStr) { + element.selectedIndex = i; + optionFound = true; + break; + } + } + + if (!optionFound) { + return { + success: false, + action: 'form_input', + message: `Option "${valueStr}" not found. Available options: ${options.map(o => `"${o.text}" (value: "${o.value}")`).join(', ')}` + }; + } + + // Focus and dispatch events + element.focus(); + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + return { + success: true, + action: 'form_input', + ref: elementRef, + element_type: 'select', + previous_value: previousValue, + new_value: element.value, + message: `Selected option "${valueStr}" in dropdown` + }; + } else if (element instanceof HTMLInputElement && element.type === 'checkbox') { + const previousValue = element.checked; + + if (typeof inputValue !== 'boolean') { + return { + success: false, + action: 'form_input', + message: 'Checkbox requires a boolean value (true/false)' + }; + } + + element.checked = inputValue; + element.focus(); + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + return { + success: true, + action: 'form_input', + ref: elementRef, + element_type: 'checkbox', + previous_value: previousValue, + new_value: element.checked, + message: `Checkbox ${element.checked ? 
'checked' : 'unchecked'}` + }; + } else if (element instanceof HTMLInputElement && element.type === 'radio') { + const previousValue = element.checked; + const radioGroup = element.name; + + // For radio buttons, we always set to true (can't uncheck a radio by clicking) + element.checked = true; + element.focus(); + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + return { + success: true, + action: 'form_input', + ref: elementRef, + element_type: 'radio', + previous_value: previousValue, + new_value: element.checked, + message: `Radio button selected${radioGroup ? ` in group "${radioGroup}"` : ''}` + }; + } else if (element instanceof HTMLInputElement && + (element.type === 'date' || element.type === 'time' || + element.type === 'datetime-local' || element.type === 'month' || + element.type === 'week')) { + const previousValue = element.value; + element.value = String(inputValue); + element.focus(); + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + return { + success: true, + action: 'form_input', + ref: elementRef, + element_type: element.type, + previous_value: previousValue, + new_value: element.value, + message: `Set ${element.type} to "${element.value}"` + }; + } else if (element instanceof HTMLInputElement && element.type === 'range') { + const previousValue = element.value; + const numValue = Number(inputValue); + + if (isNaN(numValue)) { + return { + success: false, + action: 'form_input', + message: 'Range input requires a numeric value' + }; + } + + element.value = String(numValue); + element.focus(); + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + return { + success: true, + action: 'form_input', + ref: elementRef, + element_type: 'range', + previous_value: previousValue, + new_value: element.value, + 
message: `Set range to ${element.value} (min: ${element.min}, max: ${element.max})` + }; + } else if (element instanceof HTMLInputElement && element.type === 'number') { + const previousValue = element.value; + const numValue = Number(inputValue); + + if (isNaN(numValue) && inputValue !== '') { + return { + success: false, + action: 'form_input', + message: 'Number input requires a numeric value' + }; + } + + element.value = String(inputValue); + element.focus(); + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + return { + success: true, + action: 'form_input', + ref: elementRef, + element_type: 'number', + previous_value: previousValue, + new_value: element.value, + message: `Set number input to ${element.value}` + }; + } else if (element instanceof HTMLInputElement || element instanceof HTMLTextAreaElement) { + const previousValue = element.value; + element.value = String(inputValue); + element.focus(); + + // Set cursor position to end + element.setSelectionRange(element.value.length, element.value.length); + + element.dispatchEvent(new Event('change', { bubbles: true })); + element.dispatchEvent(new Event('input', { bubbles: true })); + + const elementType = element instanceof HTMLTextAreaElement ? 
class KeyInfo(TypedDict, total=False):
    """Keyboard event fields for a single key.

    The TypedDict is declared with ``total=False``, so any field may be
    omitted; entries in ``KEY_MAP`` only carry the fields that apply to them.
    """

    key: str  # logical key value, e.g. "Enter" or "a"
    code: str  # physical key identifier, e.g. "Enter" vs "NumpadEnter"
    keyCode: int  # legacy numeric virtual-key code
    text: str  # text generated by the key, where the table defines one
    isKeypad: bool  # set to True on entries that live on the numeric keypad


# Lookup table from a lowercase key name (or a literal character) to the
# event fields used when dispatching that key.
# NOTE(review): every lookup key here is lowercase, so callers presumably
# lowercase user-supplied names before indexing -- confirm at the call site.
KEY_MAP: dict[str, KeyInfo] = {
    # Modifier keys (for key combinations like ctrl+a, cmd+c)
    "ctrl": {"key": "Control", "code": "ControlLeft", "keyCode": 17},
    "control": {"key": "Control", "code": "ControlLeft", "keyCode": 17},
    "cmd": {"key": "Meta", "code": "MetaLeft", "keyCode": 91},
    "command": {"key": "Meta", "code": "MetaLeft", "keyCode": 91},
    "meta": {"key": "Meta", "code": "MetaLeft", "keyCode": 91},
    "alt": {"key": "Alt", "code": "AltLeft", "keyCode": 18},
    "option": {"key": "Alt", "code": "AltLeft", "keyCode": 18},
    "shift": {"key": "Shift", "code": "ShiftLeft", "keyCode": 16},
    # Enter keys
    "enter": {"key": "Enter", "code": "Enter", "keyCode": 13, "text": "\r"},
    "return": {"key": "Enter", "code": "Enter", "keyCode": 13, "text": "\r"},
    # The keypad Enter key has its own physical code; using "Enter" here would
    # contradict the isKeypad flag and the other Numpad* entries below.
    "kp_enter": {
        "key": "Enter",
        "code": "NumpadEnter",
        "keyCode": 13,
        "text": "\r",
        "isKeypad": True,
    },
    # Navigation keys
    "tab": {"key": "Tab", "code": "Tab", "keyCode": 9},
    "delete": {"key": "Delete", "code": "Delete", "keyCode": 46},
    "backspace": {"key": "Backspace", "code": "Backspace", "keyCode": 8},
    "escape": {"key": "Escape", "code": "Escape", "keyCode": 27},
    "esc": {"key": "Escape", "code": "Escape", "keyCode": 27},
    "space": {"key": " ", "code": "Space", "keyCode": 32, "text": " "},
    " ": {"key": " ", "code": "Space", "keyCode": 32, "text": " "},
    # Arrow keys
    "arrowup": {"key": "ArrowUp", "code": "ArrowUp", "keyCode": 38},
    "arrowdown": {"key": "ArrowDown", "code": "ArrowDown", "keyCode": 40},
    "arrowleft": {"key": "ArrowLeft", "code": "ArrowLeft", "keyCode": 37},
    "arrowright": {"key": "ArrowRight", "code": "ArrowRight", "keyCode": 39},
    "up": {"key": "ArrowUp", "code": "ArrowUp", "keyCode": 38},
    "down": {"key": "ArrowDown", "code": "ArrowDown", "keyCode": 40},
    "left": {"key": "ArrowLeft", "code": "ArrowLeft", "keyCode": 37},
    "right": {"key": "ArrowRight", "code": "ArrowRight", "keyCode": 39},
    # Page navigation
    "home": {"key": "Home", "code": "Home", "keyCode": 36},
    "end": {"key": "End", "code": "End", "keyCode": 35},
    "pageup": {"key": "PageUp", "code": "PageUp", "keyCode": 33},
    "pagedown": {"key": "PageDown", "code": "PageDown", "keyCode": 34},
    # Function keys
    "f1": {"key": "F1", "code": "F1", "keyCode": 112},
    "f2": {"key": "F2", "code": "F2", "keyCode": 113},
    "f3": {"key": "F3", "code": "F3", "keyCode": 114},
    "f4": {"key": "F4", "code": "F4", "keyCode": 115},
    "f5": {"key": "F5", "code": "F5", "keyCode": 116},
    "f6": {"key": "F6", "code": "F6", "keyCode": 117},
    "f7": {"key": "F7", "code": "F7", "keyCode": 118},
    "f8": {"key": "F8", "code": "F8", "keyCode": 119},
    "f9": {"key": "F9", "code": "F9", "keyCode": 120},
    "f10": {"key": "F10", "code": "F10", "keyCode": 121},
    "f11": {"key": "F11", "code": "F11", "keyCode": 122},
    "f12": {"key": "F12", "code": "F12", "keyCode": 123},
    # Special characters (unshifted)
    ";": {"key": ";", "code": "Semicolon", "keyCode": 186, "text": ";"},
    "=": {"key": "=", "code": "Equal", "keyCode": 187, "text": "="},
    ",": {"key": ",", "code": "Comma", "keyCode": 188, "text": ","},
    "-": {"key": "-", "code": "Minus", "keyCode": 189, "text": "-"},
    ".": {"key": ".", "code": "Period", "keyCode": 190, "text": "."},
    "/": {"key": "/", "code": "Slash", "keyCode": 191, "text": "/"},
    "`": {"key": "`", "code": "Backquote", "keyCode": 192, "text": "`"},
    "[": {"key": "[", "code": "BracketLeft", "keyCode": 219, "text": "["},
    "\\": {"key": "\\", "code": "Backslash", "keyCode": 220, "text": "\\"},
    "]": {"key": "]", "code": "BracketRight", "keyCode": 221, "text": "]"},
    "'": {"key": "'", "code": "Quote", "keyCode": 222, "text": "'"},
    # Special characters (shifted): same physical code/keyCode as the base key
    "!": {"key": "!", "code": "Digit1", "keyCode": 49, "text": "!"},
    "@": {"key": "@", "code": "Digit2", "keyCode": 50, "text": "@"},
    "#": {"key": "#", "code": "Digit3", "keyCode": 51, "text": "#"},
    "$": {"key": "$", "code": "Digit4", "keyCode": 52, "text": "$"},
    "%": {"key": "%", "code": "Digit5", "keyCode": 53, "text": "%"},
    "^": {"key": "^", "code": "Digit6", "keyCode": 54, "text": "^"},
    "&": {"key": "&", "code": "Digit7", "keyCode": 55, "text": "&"},
    "*": {"key": "*", "code": "Digit8", "keyCode": 56, "text": "*"},
    "(": {"key": "(", "code": "Digit9", "keyCode": 57, "text": "("},
    ")": {"key": ")", "code": "Digit0", "keyCode": 48, "text": ")"},
    "_": {"key": "_", "code": "Minus", "keyCode": 189, "text": "_"},
    "+": {"key": "+", "code": "Equal", "keyCode": 187, "text": "+"},
    "{": {"key": "{", "code": "BracketLeft", "keyCode": 219, "text": "{"},
    "}": {"key": "}", "code": "BracketRight", "keyCode": 221, "text": "}"},
    "|": {"key": "|", "code": "Backslash", "keyCode": 220, "text": "|"},
    ":": {"key": ":", "code": "Semicolon", "keyCode": 186, "text": ":"},
    '"': {"key": '"', "code": "Quote", "keyCode": 222, "text": '"'},
    "<": {"key": "<", "code": "Comma", "keyCode": 188, "text": "<"},
    ">": {"key": ">", "code": "Period", "keyCode": 190, "text": ">"},
    "?": {"key": "?", "code": "Slash", "keyCode": 191, "text": "?"},
    "~": {"key": "~", "code": "Backquote", "keyCode": 192, "text": "~"},
    # Lock keys
    "capslock": {"key": "CapsLock", "code": "CapsLock", "keyCode": 20},
    "numlock": {"key": "NumLock", "code": "NumLock", "keyCode": 144},
    "scrolllock": {"key": "ScrollLock", "code": "ScrollLock", "keyCode": 145},
    # System / editing keys
    "pause": {"key": "Pause", "code": "Pause", "keyCode": 19},
    "insert": {"key": "Insert", "code": "Insert", "keyCode": 45},
    "printscreen": {"key": "PrintScreen", "code": "PrintScreen", "keyCode": 44},
    # Numpad
    "numpad0": {"key": "0", "code": "Numpad0", "keyCode": 96, "isKeypad": True},
    "numpad1": {"key": "1", "code": "Numpad1", "keyCode": 97, "isKeypad": True},
    "numpad2": {"key": "2", "code": "Numpad2", "keyCode": 98, "isKeypad": True},
    "numpad3": {"key": "3", "code": "Numpad3", "keyCode": 99, "isKeypad": True},
    "numpad4": {"key": "4", "code": "Numpad4", "keyCode": 100, "isKeypad": True},
    "numpad5": {"key": "5", "code": "Numpad5", "keyCode": 101, "isKeypad": True},
    "numpad6": {"key": "6", "code": "Numpad6", "keyCode": 102, "isKeypad": True},
    "numpad7": {"key": "7", "code": "Numpad7", "keyCode": 103, "isKeypad": True},
    "numpad8": {"key": "8", "code": "Numpad8", "keyCode": 104, "isKeypad": True},
    "numpad9": {"key": "9", "code": "Numpad9", "keyCode": 105, "isKeypad": True},
    "numpadmultiply": {
        "key": "*",
        "code": "NumpadMultiply",
        "keyCode": 106,
        "isKeypad": True,
    },
    "numpadadd": {
        "key": "+",
        "code": "NumpadAdd",
        "keyCode": 107,
        "isKeypad": True,
    },
    "numpadsubtract": {
        "key": "-",
        "code": "NumpadSubtract",
        "keyCode": 109,
        "isKeypad": True,
    },
    "numpaddecimal": {
        "key": ".",
        "code": "NumpadDecimal",
        "keyCode": 110,
        "isKeypad": True,
    },
    "numpaddivide": {
        "key": "/",
        "code": "NumpadDivide",
        "keyCode": 111,
        "isKeypad": True,
    },
}
class APIProvider(StrEnum):
    """Identifiers for the API backends a client can be created for.

    Being a ``StrEnum``, each member compares equal to its string value.
    """

    ANTHROPIC = "anthropic"
    BEDROCK = "bedrock"
    VERTEX = "vertex"
automation. +* The current date is {datetime.today().strftime("%A, %B %-d, %Y")}. + + + +You receive a screenshot at the start of each turn. Look at it to see the current page - if you're already where you need to be, don't re-navigate. + +After navigating to a new page, always call read_page to get element references (ref_1, ref_2, etc.) before interacting with the page. Use these refs with your interaction tools (click, type, hover, form_input, etc.). Refs are more reliable than coordinates. + +When you need to extract or read text content from a page, always use get_page_text - don't try to read text from screenshots. + +If DOM-based actions (refs) aren't working, fall back to screenshot + coordinate-based actions. + + + +* Prefer get_page_text over scrolling when looking for information - it's faster and more reliable +* Use execute_js to extract data from JavaScript variables, localStorage, or trigger behaviors not accessible through clicks +* Use full URLs with https:// +* Use wait for slow-loading pages +* Use scroll_to with a ref to reveal elements +* Use form_input with refs for form fields +* Use key for shortcuts (e.g., "ctrl+a") +* Close popups when they appear +* Verify actions succeeded before moving on +""" + + +async def sampling_loop( + *, + model: str, + provider: APIProvider, + system_prompt_suffix: str, + messages: list[BetaMessageParam], + output_callback: Callable[[BetaContentBlockParam], None], + tool_output_callback: Callable[[ToolResult, str], None], + api_response_callback: Callable[ + [httpx.Request | None, httpx.Response | object | None, Exception | None], None + ], + api_key: str, + only_n_most_recent_images: int | None = None, + max_tokens: int = 4096, + browser_tool: Optional[BrowserTool] = None, +): + """ + Sampling loop for browser automation. + + Args: + browser_tool: Optional persistent browser tool instance. If not provided, creates a new one. 
+ """ + # Reuse existing browser tool or create a new one + if browser_tool is None: + # Create browser tool with standard dimensions + browser_tool = BrowserTool() + + tool_collection = ToolCollection(browser_tool) + + # Build system prompt + system = BetaTextBlockParam( + type="text", + text=f"{BROWSER_SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}", + ) + + while True: + # Configure client and betas + betas = [] + enable_prompt_caching = False + + if provider == APIProvider.ANTHROPIC: + client = Anthropic(api_key=api_key, max_retries=4) + enable_prompt_caching = True + elif provider == APIProvider.VERTEX: + client = AnthropicVertex() + elif provider == APIProvider.BEDROCK: + client = AnthropicBedrock() + else: + raise ValueError(f"Unsupported provider: {provider}") + + if enable_prompt_caching: + betas.append(PROMPT_CACHING_BETA_FLAG) + # Add cache control to system prompt + system = BetaTextBlockParam( + type="text", + text=system["text"], + cache_control=BetaCacheControlEphemeralParam(type="ephemeral"), + ) + + # Make API call + try: + api_kwargs = { + "max_tokens": max_tokens, + "messages": messages, + "model": model, + "system": [system], + "tools": tool_collection.to_params(), + } + # Only include betas if there are any (e.g., prompt caching) + if betas: + api_kwargs["betas"] = betas + response = client.beta.messages.create(**api_kwargs) + else: + # Use regular messages API when no beta features are needed + response = client.messages.create(**api_kwargs) + except Exception as e: + api_response_callback(None, None, e) + raise e + + api_response_callback(None, response, None) + + # Process response using our new abstractions + processor = ResponseProcessor() + processed = processor.process_response(response) + + # Output all content blocks to callbacks + for content_block in processed.assistant_content: + output_callback(content_block) + + # Build and append the complete assistant message (preserves text + tools) + builder = 
MessageBuilder() + builder.add_assistant_message(messages, processed.assistant_content) + + # Execute tools and collect results if there are any tool uses + if processed.tool_uses: + tool_results = await processor.execute_tools( + processed.tool_uses, + tool_collection, + tool_output_callback + ) + + # Add all tool results as a single user message + builder.add_tool_results(messages, tool_results) + + # Continue the loop to process any follow-up + else: + # No tools used, conversation can end here + return messages + + +def _maybe_filter_to_n_most_recent_images( + messages: list[BetaMessageParam], + images_to_keep: int, + min_removal_threshold: int = 10, +): + """ + Filter messages to keep only the N most recent images. + """ + if images_to_keep <= 0: + raise ValueError("images_to_keep must be > 0") + + total_images = sum( + 1 + for message in messages + if message["role"] == "user" + for block in message.get("content", []) + if isinstance(block, dict) and block.get("type") == "image" + ) + + images_to_remove = total_images - images_to_keep + if images_to_remove < min_removal_threshold: + return + + images_removed = 0 + for message in messages: + if message["role"] == "user" and isinstance(message.get("content"), list): + new_content = [] + for block in message["content"]: + if isinstance(block, dict) and block.get("type") == "image": + if images_removed < images_to_remove: + images_removed += 1 + continue + new_content.append(block) + message["content"] = new_content diff --git a/browser-use-demo/browser_use_demo/message_handler.py b/browser-use-demo/browser_use_demo/message_handler.py new file mode 100644 index 000000000..1c30dbbcd --- /dev/null +++ b/browser-use-demo/browser_use_demo/message_handler.py @@ -0,0 +1,288 @@ +""" +Message handling abstractions for proper API response processing. 
@dataclass
class ProcessedResponse:
    """Result of converting one API response into plain content blocks."""

    # Every text and tool_use block of the reply, in response order.
    assistant_content: list[BetaContentBlockParam]
    # The tool_use blocks only (the same dict objects as in assistant_content).
    tool_uses: list[dict[str, Any]]
    # Whether the reply contained at least one text block.
    has_text: bool
    # Whether the reply contained at least one tool_use block.
    has_tools: bool


class ResponseProcessor:
    """
    Converts API responses to message blocks and runs the requested tools.

    Text blocks are kept next to tool_use blocks so that the assistant's
    explanation is not lost when a reply also calls tools.
    """

    def process_response(self, response) -> ProcessedResponse:
        """
        Turn an API response into a ProcessedResponse.

        Only ``text`` and ``tool_use`` blocks are carried over; blocks of any
        other type are ignored.

        Args:
            response: The API response; its ``content`` is iterated.

        Returns:
            ProcessedResponse with the converted blocks and summary flags
        """
        blocks: list[dict[str, Any]] = []

        for item in response.content:
            kind = item.type
            if kind == "text":
                blocks.append({"type": "text", "text": item.text})
            elif kind == "tool_use":
                blocks.append(
                    {
                        "type": "tool_use",
                        "id": item.id,
                        "name": item.name,
                        "input": item.input,
                    }
                )

        # Same dict objects as in ``blocks`` so both views stay in sync.
        calls = [block for block in blocks if block["type"] == "tool_use"]

        return ProcessedResponse(
            assistant_content=blocks,
            tool_uses=calls,
            has_text=any(block["type"] == "text" for block in blocks),
            has_tools=bool(calls),
        )

    async def execute_tools(
        self,
        tool_uses: list[dict[str, Any]],
        tool_collection: ToolCollection,
        tool_output_callback: Optional[Callable[[ToolResult, str], None]] = None
    ) -> list[BetaToolResultBlockParam]:
        """
        Run each requested tool in order and gather the result blocks.

        A failure while handling one tool (unknown name, exception from the
        tool, ...) is converted into an error result for that tool use, and
        the remaining tools still run.

        Args:
            tool_uses: Tool use blocks with ``id``, ``name`` and ``input``
            tool_collection: Collection whose ``tool_map`` resolves tool names
            tool_output_callback: Optional callback invoked with each
                ToolResult and its tool use id

        Returns:
            One tool result block per tool use, in the same order
        """
        collected = []

        for call in tool_uses:
            call_id = call["id"]
            name = call["name"]
            arguments = call["input"]

            try:
                tool = tool_collection.tool_map.get(name)
                if not tool:
                    raise ValueError(f"Unknown tool: {name}")

                outcome = await tool(**arguments)

                if tool_output_callback:
                    tool_output_callback(outcome, call_id)

                collected.append(self._build_tool_result(outcome, call_id))

            except Exception as exc:
                # Report the failure to the model as an error result ...
                collected.append(
                    BetaToolResultBlockParam(
                        type="tool_result",
                        tool_use_id=call_id,
                        is_error=True,
                        content=[{"type": "text", "text": str(exc)}]
                    )
                )

                # ... and to the caller through the callback, if any.
                if tool_output_callback:
                    tool_output_callback(ToolResult(error=str(exc)), call_id)

        return collected

    def _build_tool_result(
        self,
        result: ToolResult,
        tool_use_id: str
    ) -> BetaToolResultBlockParam:
        """
        Convert a ToolResult into a tool result block.

        Content is emitted in the order: output text, screenshot image, error
        text. ``is_error`` is set only when the result carries an error.

        Args:
            result: The tool execution result
            tool_use_id: The ID of the tool use being answered

        Returns:
            The tool result block for the API
        """
        parts: list[BetaTextBlockParam | BetaImageBlockParam] = []

        if result.output:
            text = result.output
            is_extraction = "__PAGE_EXTRACTED__" in text or "__TEXT_EXTRACTED__" in text
            if is_extraction and "__FULL_CONTENT__" in text:
                # Keep only what follows the marker, skipping the single
                # character directly after it.
                _, _, tail = text.partition("__FULL_CONTENT__")
                text = tail[1:]

            parts.append({"type": "text", "text": text})

        if result.base64_image:
            parts.append(
                {
                    "type": "image",
                    "source": {
                        "type": "base64",
                        "media_type": "image/png",
                        "data": result.base64_image,
                    }
                }
            )

        block = BetaToolResultBlockParam(
            type="tool_result",
            tool_use_id=tool_use_id,
            content=parts
        )

        if result.error:
            block["is_error"] = True
            parts.append({"type": "text", "text": f"Error: {result.error}"})

        return block


class MessageBuilder:
    """
    Appends well-formed assistant and tool-result messages to a conversation.

    Empty content is never appended, so no message with an empty content list
    is created by this class.
    """

    def add_assistant_message(
        self,
        messages: list[BetaMessageParam],
        content: list[BetaContentBlockParam]
    ) -> None:
        """
        Append an assistant message holding all given content blocks.

        Args:
            messages: The message list to append to
            content: The assistant's content blocks; nothing is appended when
                this is empty
        """
        if not content:
            return
        messages.append({"role": "assistant", "content": content})

    def add_tool_results(
        self,
        messages: list[BetaMessageParam],
        tool_results: list[BetaToolResultBlockParam]
    ) -> None:
        """
        Append all tool results as one user message.

        Args:
            messages: The message list to append to
            tool_results: The tool result blocks; nothing is appended when
                this is empty
        """
        if not tool_results:
            return
        messages.append({"role": "user", "content": tool_results})

    def ensure_message_integrity(
        self,
        messages: list[BetaMessageParam]
    ) -> bool:
        """
        Check that every message has a role and non-empty list content.

        Args:
            messages: The message list to validate

        Returns:
            True if every message has a truthy role, a ``content`` key, and
            that content is not an empty list (an empty message list is
            valid); False otherwise
        """

        def is_well_formed(message) -> bool:
            if not message.get("role"):
                return False
            if "content" not in message:
                return False
            body = message["content"]
            return not (isinstance(body, list) and len(body) == 0)

        return all(is_well_formed(message) for message in messages)

    def extract_text_from_message(
        self,
        message: BetaMessageParam
    ) -> Optional[str]:
        """
        Return the text of an assistant message.

        Args:
            message: The message to read

        Returns:
            The string content as-is, or the text blocks joined by single
            spaces; None for non-assistant messages or when there is no text
            block
        """
        if message.get("role") != "assistant":
            return None

        body = message.get("content", [])
        if isinstance(body, str):
            return body

        texts = [
            block.get("text", "")
            for block in body
            if isinstance(block, dict) and block.get("type") == "text"
        ]
        if not texts:
            return None
        return " ".join(texts)
+""" + +import base64 +from typing import cast + +import streamlit as st +from anthropic.types.beta import BetaContentBlockParam + +from browser_use_demo.tools import ToolResult +from browser_use_demo.tools.coordinate_scaling import CoordinateScaler + + +class Sender: + """Message sender types.""" + + USER = "user" + BOT = "assistant" + TOOL = "tool" + + +class MessageRenderer: + """Handles rendering of messages in the Streamlit chat interface.""" + + def __init__(self, session_state): + """Initialize the MessageRenderer with session state access. + + Args: + session_state: Streamlit session state object for accessing configuration + """ + self.session_state = session_state + + def _scale_browser_coordinates(self, input_dict: dict) -> dict: + """Apply coordinate scaling to browser tool inputs for display. + + This ensures the displayed coordinates match what the browser tool will actually use. + Uses the CoordinateScaler for consistent scaling across the codebase. + + Args: + input_dict: The tool input dictionary + + Returns: + Modified input dict with scaled coordinates + """ + # Only process browser tool inputs with coordinates + if not isinstance(input_dict, dict): + return input_dict + + # Get browser tool dimensions if available + browser_tool = getattr(self.session_state, 'browser_tool', None) + if not browser_tool: + return input_dict + + # Clone the input to avoid modifying the original + import copy + scaled_input = copy.deepcopy(input_dict) + + # Get viewport dimensions + width = browser_tool.width + height = browser_tool.height + + # Scale various coordinate fields using CoordinateScaler + if 'coordinate' in scaled_input: + scaled_input['coordinate'] = CoordinateScaler.scale_coordinate_list( + scaled_input['coordinate'], width, height + ) + + if 'start_coordinate' in scaled_input: + scaled_input['start_coordinate'] = CoordinateScaler.scale_coordinate_list( + scaled_input['start_coordinate'], width, height + ) + + return scaled_input + + def render(self, 
sender: str, message: str | BetaContentBlockParam | ToolResult): + """Render a message in the chat interface. + + Args: + sender: The sender type (USER, BOT, or TOOL) + message: The message content to render + """ + # Early return for empty messages or hidden screenshots without content + if self._should_skip_message(message): + return + + with st.chat_message(sender): + self._render_message_content(message) + + def _should_skip_message(self, message) -> bool: + """Check if message should be skipped from rendering. + + Args: + message: The message to check + + Returns: + True if the message should be skipped, False otherwise + """ + if not message: + return True + + # Skip tool results that only have screenshots when screenshots are hidden + is_tool_result = not isinstance(message, str | dict) + if is_tool_result and self.session_state.hide_screenshots: + return not hasattr(message, "error") and not hasattr(message, "output") + + return False + + def _render_message_content(self, message): + """Render the actual message content based on its type. + + Args: + message: The message content to render + """ + # Define rendering strategies for different message types + renderers = { + "tool_result": self._render_tool_result, + "dict": self._render_dict_message, + "string": lambda msg: st.markdown(msg), + } + + # Determine message type and render accordingly + if not isinstance(message, str | dict): + # It's a ToolResult object + renderers["tool_result"](cast(ToolResult, message)) + elif isinstance(message, dict): + renderers["dict"](message) + else: + renderers["string"](message) + + def _render_tool_result(self, tool_result: ToolResult): + """Render a tool result with output, error, and optional image. 
+ + Args: + tool_result: The ToolResult object to render + """ + if tool_result.output: + # Check if this is a text extraction result with special markers + if "__PAGE_EXTRACTED__" in tool_result.output or "__TEXT_EXTRACTED__" in tool_result.output: + # Extract just the summary for display + lines = tool_result.output.split("\n") + summary_lines = [] + in_summary = False + + for line in lines: + if "__PAGE_EXTRACTED__" in line or "__TEXT_EXTRACTED__" in line: + in_summary = True + continue + if "__FULL_CONTENT__" in line: + break + if in_summary: + summary_lines.append(line) + + # Display only the summary + if summary_lines: + st.markdown("\n".join(summary_lines)) + else: + # Regular tool output + st.markdown(tool_result.output) + + if tool_result.error: + st.error(tool_result.error) + if tool_result.base64_image and not self.session_state.hide_screenshots: + st.image(base64.b64decode(tool_result.base64_image)) + + def _render_dict_message(self, message: dict): + """Render dictionary-based messages based on their type field. + + Args: + message: Dictionary containing the message to render + """ + message_type = message.get("type", "") + + # Dispatch table for different message types + type_handlers = { + "text": lambda: st.write(message["text"]), + "tool_use": lambda: self._render_tool_use(message), + "tool_result": lambda: self._render_stored_tool_result(message), + } + + # Execute the appropriate handler or fall back to generic display + handler = type_handlers.get(message_type, lambda: st.write(message)) + handler() + + def _render_tool_use(self, message: dict): + """Render a tool use message with coordinate scaling for browser tools. 
+ + Args: + message: Dictionary containing tool use information + """ + tool_name = message.get('name', 'unknown') + tool_input = message.get('input', {}) + + # Apply coordinate scaling for browser tool + if tool_name == 'browser': + tool_input = self._scale_browser_coordinates(tool_input) + + st.code(f"Tool Use: {tool_name}\nInput: {tool_input}") + + def _render_stored_tool_result(self, message: dict): + """Render a tool result that was stored in session state. + + Args: + message: Dictionary containing the tool_use_id reference + """ + tool_id = message.get("tool_use_id") + if tool_id and tool_id in self.session_state.tools: + self._render_tool_result(self.session_state.tools[tool_id]) + + def render_conversation_history(self, messages: list): + """Render all messages in conversation history. + + This method processes a list of messages and renders each one + according to its role and content type, eliminating deep nesting. + + Args: + messages: List of message dictionaries from session state + """ + for message in messages: + self._render_message_by_role(message) + + def _render_message_by_role(self, message: dict): + """Route message rendering based on role. + + Args: + message: Message dictionary containing role and content + """ + role_handlers = { + "user": lambda m: self._render_user_content(m["content"]), + "assistant": lambda m: self._render_assistant_content(m["content"]), + } + + handler = role_handlers.get(message["role"]) + if handler: + handler(message) + + def _render_user_content(self, content): + """Render user message content. + + Handles both single items and lists of content blocks, + skipping image blocks in conversation history. 
+ + Args: + content: User message content (string, dict, or list) + """ + for item in self._normalize_content(content): + # Skip image blocks in history + if isinstance(item, dict) and item.get("type") == "image": + continue + + # Extract text from dict blocks or use item directly + if isinstance(item, dict): + if item.get("type") == "text": + text_content = item.get("text", "") + self.render(Sender.USER, text_content) + else: + # For other dict types, cast as BetaContentBlockParam + self.render(Sender.USER, cast(BetaContentBlockParam, item)) + else: + self.render(Sender.USER, item) + + def _render_assistant_content(self, content): + """Render assistant message content. + + Handles both single items and lists of content blocks, + properly routing tool results to the TOOL sender. + + Args: + content: Assistant message content (string, dict, or list) + """ + for item in self._normalize_content(content): + if isinstance(item, dict) and item.get("type") == "tool_result": + # Handle tool results by fetching from session state + tool_id = item.get("tool_use_id") + if tool_id and tool_id in self.session_state.tools: + self.render(Sender.TOOL, self.session_state.tools[tool_id]) + elif isinstance(item, dict): + # Cast dict items as BetaContentBlockParam + self.render(Sender.BOT, cast(BetaContentBlockParam, item)) + else: + # String or other types + self.render(Sender.BOT, item) + + def _normalize_content(self, content): + """Convert content to list for uniform processing. + + This eliminates duplicate code for handling list vs non-list content. 
+ + Args: + content: Content that may be a single item or list + + Returns: + List of content items for processing + """ + return content if isinstance(content, list) else [content] diff --git a/browser-use-demo/browser_use_demo/requirements.txt b/browser-use-demo/browser_use_demo/requirements.txt new file mode 100644 index 000000000..bf27c399e --- /dev/null +++ b/browser-use-demo/browser_use_demo/requirements.txt @@ -0,0 +1,6 @@ +streamlit==1.41.0 +anthropic[bedrock,vertex]>=0.39.0 +jsonschema==4.22.0 +boto3>=1.28.57 +google-auth<3,>=2 +playwright>=1.40.0 \ No newline at end of file diff --git a/browser-use-demo/browser_use_demo/streamlit.py b/browser-use-demo/browser_use_demo/streamlit.py new file mode 100644 index 000000000..28471595c --- /dev/null +++ b/browser-use-demo/browser_use_demo/streamlit.py @@ -0,0 +1,740 @@ +""" +Browser Use Demo - Streamlit interface for browser automation with Claude +""" + +import asyncio +import base64 +import io +import json +import os +import time +import traceback +import zipfile +from datetime import datetime +from pathlib import PosixPath + +import streamlit as st +from anthropic.types.beta import BetaContentBlockParam + +from anthropic import RateLimitError +from browser_use_demo.loop import APIProvider, sampling_loop +from browser_use_demo.message_renderer import MessageRenderer, Sender +from browser_use_demo.tools import ToolResult + +PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { + APIProvider.ANTHROPIC: "claude-sonnet-4-5-20250929", + APIProvider.BEDROCK: "anthropic.claude-sonnet-4-5-20250929-v1:0", + APIProvider.VERTEX: "claude-sonnet-4-5@20250929", +} + +CONFIG_DIR = PosixPath("~/.anthropic").expanduser() +API_KEY_FILE = CONFIG_DIR / "api_key" + +STREAMLIT_STYLE = """ + +""" + +# Claude 4.5 models for browser automation +BROWSER_COMPATIBLE_MODELS = [ + "claude-sonnet-4-5-20250929", + "claude-opus-4-5-20251101", + "claude-haiku-4-5-20251001", +] + + +def setup_state(): + """Initialize session state 
variables.""" + # Import here to avoid circular imports when browser_tool lambda is evaluated + from browser_use_demo.tools import BrowserTool + + # Define all defaults in one place - use lambdas for lazy evaluation of complex values + defaults = { + # UI State + "messages": [], + "system_prompt": "", + "hide_screenshots": False, + "rendered_message_count": 0, # Track rendered messages to avoid re-rendering + "last_error": None, # Store last error message to display persistently + # API Configuration + "api_key": os.environ.get("ANTHROPIC_API_KEY", ""), + "provider": APIProvider.ANTHROPIC, + "max_tokens": 8192, + "model": lambda: PROVIDER_TO_DEFAULT_MODEL_NAME[st.session_state.provider], + # Runtime State + "tools": {}, + "event_loop": None, # Persistent event loop for async operations + "chat_disabled": False, # Simple flag to disable chat input + "active_messages": [], # Store messages for current interaction + "active_response_container": None, # Container reference for streaming responses + # Complex initialization - browser tool (inline lambda) + "browser_tool": lambda: BrowserTool(), + } + + # Apply all defaults - evaluate lambdas when needed + for key, default_value in defaults.items(): + if key not in st.session_state: + # If it's a callable (lambda), call it to get the actual value + if callable(default_value): + st.session_state[key] = default_value() + else: + st.session_state[key] = default_value + + +def _clean_text_extraction_markers(text: str) -> str: + """Remove text extraction markers and return a summary.""" + if "__PAGE_EXTRACTED__" not in text and "__TEXT_EXTRACTED__" not in text: + return text + + lines = text.split("\n") + summary = [] + for line in lines: + if "__FULL_CONTENT__" in line: + break + if "__PAGE_EXTRACTED__" not in line and "__TEXT_EXTRACTED__" not in line: + summary.append(line) + return "\n".join(summary) + "\n[Full content extracted but truncated for readability]" + + +def create_transcript_zip(messages: list, include_images: 
bool = False) -> bytes: + """Create a ZIP archive containing the transcript and optionally images. + + Args: + messages: List of message dictionaries from session state + include_images: Whether to include images as separate files + + Returns: + Bytes of the ZIP archive + """ + # Create an in-memory ZIP file + zip_buffer = io.BytesIO() + + with zipfile.ZipFile(zip_buffer, 'w', zipfile.ZIP_DEFLATED) as zip_file: + if include_images: + # Extract images and create transcript with file references + transcript_json, image_files = extract_images_from_messages(messages) + + # Add images to ZIP + for idx, img_data in enumerate(image_files): + filename = f"images/screenshot_{idx+1:04d}.png" + try: + img_bytes = base64.b64decode(img_data) + zip_file.writestr(filename, img_bytes) + except Exception as e: + print(f"Error adding image to ZIP: {e}") + + # Add README + readme_content = f"""Browser Use Demo - Conversation Transcript +Generated: {datetime.now().isoformat()} + +This archive contains: +- transcript.json: The conversation transcript +- images/: {len(image_files)} screenshot images referenced in the transcript + +The transcript is in JSON format with images stored as separate PNG files. +Image references in the transcript point to files in the images/ directory. +""" + zip_file.writestr("README.txt", readme_content) + else: + # Just create transcript without images + transcript_json = format_transcript_for_download(messages, False) + + readme_content = f"""Browser Use Demo - Conversation Transcript +Generated: {datetime.now().isoformat()} + +This archive contains: +- transcript.json: The conversation transcript (text only) + +The transcript is in JSON format and includes all text messages from the conversation. 
class ImageExtractor:
    """Collect base64 images from message content and replace them with file references.

    ``image_files`` accumulates the base64 payloads in the order they were
    encountered; ``image_counter`` is the running number used to build each
    ``images/screenshot_NNNN.png`` reference.
    """

    def __init__(self):
        self.image_files = []
        self.image_counter = 0

    def extract_image(self, source: dict) -> dict:
        """Record a base64 image source and return the block that references its file.

        Sources whose ``"type"`` is not ``"base64"`` are not recorded and
        yield a placeholder block instead.
        """
        if source.get("type") != "base64":
            return {"type": "image", "note": "No image data"}

        self.image_counter += 1
        self.image_files.append(source.get("data", ""))
        reference = f"images/screenshot_{self.image_counter:04d}.png"
        return {"type": "image", "file": reference}

    def process_image_content(self, item: dict) -> dict:
        """Convert an image block by extracting its ``"source"``."""
        return self.extract_image(item.get("source", {}))

    def process_text_content(self, item: dict) -> dict:
        """Convert a text block, cleaning extraction markers from its text."""
        cleaned = _clean_text_extraction_markers(item.get("text", ""))
        return {"type": "text", "text": cleaned}

    def process_tool_use_content(self, item: dict) -> dict:
        """Convert a tool-use block, keeping only its name and input."""
        tool_name = item.get("name", "")
        tool_input = item.get("input", {})
        return {"type": "tool_use", "name": tool_name, "input": tool_input}

    def process_tool_result_content(self, item: dict) -> dict:
        """Convert a tool-result block, extracting any images nested in its content.

        Non-dict entries of ``"content"`` are dropped; dict entries of an
        unrecognised type are passed through unchanged.
        """
        converted = []
        for block in item.get("content", []):
            if not isinstance(block, dict):
                continue

            kind = block.get("type")
            if kind == "image":
                converted.append(self.extract_image(block.get("source", {})))
            elif kind == "text":
                converted.append(self.process_text_content(block))
            else:
                converted.append(block)

        return {
            "type": "tool_result",
            "tool_use_id": item.get("tool_use_id", ""),
            "content": converted,
        }

    def process_default_content(self, item: dict) -> dict:
        """Fallback for unknown block types: delegate to the text-only formatter."""
        return _format_content_item(item, False)
+ + Args: + messages: List of message dictionaries from session state + include_images: Whether to include base64 image data in the transcript + + Returns: + Formatted JSON string of the conversation + """ + transcript = { + "timestamp": datetime.now().isoformat(), + "format_version": "1.0", + "includes_images": include_images, + "conversation": [] + } + + for message in messages: + cleaned_message = { + "role": message.get("role"), + "timestamp": datetime.now().isoformat(), + "content": _format_message_content(message.get("content", ""), include_images) + } + transcript["conversation"].append(cleaned_message) + + return json.dumps(transcript, indent=2, ensure_ascii=False) + + +def _format_text_content(item: dict, include_images: bool = False) -> dict: + """Format a text content block.""" + return { + "type": "text", + "text": _clean_text_extraction_markers(item.get("text", "")) + } + + +def _format_tool_use_content(item: dict, include_images: bool = False) -> dict: + """Format a tool use content block.""" + return { + "type": "tool_use", + "name": item.get("name", ""), + "input": item.get("input", {}) + } + + +def _format_tool_result_content(item: dict, include_images: bool = False) -> dict: + """Format a tool result content block.""" + tool_content = [] + for content_item in item.get("content", []): + if isinstance(content_item, dict): + content_type = content_item.get("type") + if content_type == "text": + text = _clean_text_extraction_markers(content_item.get("text", "")) + tool_content.append({"type": "text", "text": text}) + elif content_type == "image": + if include_images: + source = content_item.get("source", {}) + if source.get("type") == "base64": + tool_content.append({ + "type": "image", + "media_type": source.get("media_type", "image/png"), + "base64_data": source.get("data", "") + }) + else: + tool_content.append({"type": "image", "note": "Screenshot taken"}) + + return { + "type": "tool_result", + "tool_use_id": item.get("tool_use_id", ""), + 
"content": tool_content + } + + +def _format_image_content(item: dict, include_images: bool = False) -> dict: + """Format an image content block.""" + if include_images: + source = item.get("source", {}) + if source.get("type") == "base64": + return { + "type": "image", + "media_type": source.get("media_type", "image/png"), + "base64_data": source.get("data", "") + } + return {"type": "image", "note": "Image/Screenshot included"} + + +def _format_default_content(item: dict, include_images: bool = False) -> dict: + """Format unknown content types - fallback handler.""" + return item + + +# Strategy pattern: Map content types to their formatting functions +CONTENT_FORMATTERS = { + "text": _format_text_content, + "tool_use": _format_tool_use_content, + "tool_result": _format_tool_result_content, + "image": _format_image_content, +} + + +def _format_content_item(item, include_images: bool = False): + """Format a single content item using the appropriate formatter. + + Uses the Strategy pattern to dispatch to the correct formatter based on content type. + """ + if not isinstance(item, dict): + return str(item) + + content_type = item.get("type") + formatter = CONTENT_FORMATTERS.get(content_type, _format_default_content) + return formatter(item, include_images) + + +def _format_message_content(content, include_images: bool = False): + """Format message content based on its type. + + This is the main entry point that handles different content structures. 
def get_or_create_event_loop():
    """Return the session's asyncio event loop, creating one when necessary.

    The loop is kept in ``st.session_state.event_loop`` so the same loop is
    reused across calls; per the original note this avoids Playwright issues
    with ``asyncio.run()``. A new loop is created only when none is stored or
    the stored one has been closed. The loop is also installed as the current
    event loop before being returned.

    Returns:
        The active asyncio event loop.
    """
    state = st.session_state
    loop = state.event_loop

    if loop is None or loop.is_closed():
        loop = asyncio.new_event_loop()
        state.event_loop = loop

    asyncio.set_event_loop(loop)
    return loop
content_block) + # Store for later persistence + st.session_state.active_messages.append(("assistant", content_block)) + + def tool_output_callback(result: ToolResult, tool_id: str): + """Handle tool execution results.""" + st.session_state.tools[tool_id] = result + # Stream to active container in real-time + with st.session_state.active_response_container: + renderer.render(Sender.TOOL, result) + # Store for later persistence + st.session_state.active_messages.append(("tool", result, tool_id)) + + def api_response_callback(request, response, error): + """Handle API responses.""" + if error: + with st.session_state.active_response_container: + st.error(f"API Error: {error}") + + # Run the agent with persistent browser tool + updated_messages = await sampling_loop( + model=st.session_state.model, + provider=st.session_state.provider, + system_prompt_suffix=st.session_state.system_prompt, + messages=api_messages, + output_callback=output_callback, + tool_output_callback=tool_output_callback, + api_response_callback=api_response_callback, + api_key=st.session_state.api_key, + max_tokens=st.session_state.max_tokens, + browser_tool=st.session_state.browser_tool, # Pass persistent browser instance + only_n_most_recent_images=3, # Keep only 3 most recent screenshots for context + ) + + # Update session state with the complete message history + if updated_messages: + st.session_state.messages = updated_messages + + # Re-enable chat input + st.session_state.chat_disabled = False + + # Trigger a rerun to update the history display + st.rerun() + + except RateLimitError: + error_msg = "Rate limit exceeded. Please wait before sending another message." 
+ st.session_state.last_error = {"message": error_msg, "traceback": None} + with st.session_state.active_response_container: + st.error(error_msg) + st.session_state.chat_disabled = False + st.rerun() + except Exception as e: + error_msg = f"Error: {str(e)}" + error_traceback = traceback.format_exc() + st.session_state.last_error = {"message": error_msg, "traceback": error_traceback} + with st.session_state.active_response_container: + st.error(error_msg) + st.code(error_traceback) + st.session_state.chat_disabled = False + st.rerun() + + +def main(): + """Main application entry point.""" + st.set_page_config( + page_title="Claude Browser Use Demo", + page_icon="🌐", + layout="wide" + ) + + st.markdown(STREAMLIT_STYLE, unsafe_allow_html=True) + + setup_state() + + + # Sidebar configuration + with st.sidebar: + st.header("⚙️ Configuration") + + # API Provider (fixed to Anthropic for browser use) + st.selectbox( + "API Provider", + options=[APIProvider.ANTHROPIC], + index=0, + key="provider", + disabled=True, + help="Browser Use requires Anthropic API", + ) + + # Model selection (only browser-compatible models) + st.selectbox("Model", options=BROWSER_COMPATIBLE_MODELS, index=0, key="model") + + # API Key + st.text_input( + "Anthropic API Key", + type="password", + value=st.session_state.api_key, + key="api_key", + help="Get your API key from https://console.anthropic.com", + ) + + # Max tokens + st.number_input( + "Max Output Tokens", + min_value=1024, + max_value=32768, + value=st.session_state.max_tokens, + step=1024, + key="max_tokens", + ) + + # System prompt + st.text_area( + "Additional System Prompt", + value=st.session_state.system_prompt, + key="system_prompt", + help="Add custom instructions for the browser agent", + ) + + # Hide screenshots + st.checkbox( + "Hide Screenshots", + value=st.session_state.hide_screenshots, + key="hide_screenshots", + help="Hide screenshot outputs in the chat", + ) + + # Conversation Management Section + st.divider() + 
st.subheader("💬 Conversation") + + # Download transcript options and button + if st.session_state.messages: + # Checkbox to include images + include_images = st.checkbox( + "Include images in transcript", + value=False, + help="Include screenshots as separate PNG files in a ZIP archive" + ) + + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + + if include_images: + # Generate ZIP with images + zip_data = create_transcript_zip( + st.session_state.messages, + include_images=True + ) + + # Show file size + file_size_kb = len(zip_data) / 1024 + if file_size_kb > 1024: + size_str = f"{file_size_kb / 1024:.1f} MB" + else: + size_str = f"{file_size_kb:.1f} KB" + + st.download_button( + label=f"📦 Download Transcript ZIP ({size_str})", + data=zip_data, + file_name=f"browser_demo_transcript_{timestamp}.zip", + mime="application/zip", + help=f"Download conversation with images as ZIP archive ({size_str})", + type="primary", + use_container_width=True, + ) + else: + # Generate JSON only + transcript_json = format_transcript_for_download( + st.session_state.messages, + include_images=False + ) + + # Show file size + file_size_kb = len(transcript_json.encode('utf-8')) / 1024 + if file_size_kb > 1024: + size_str = f"{file_size_kb / 1024:.1f} MB" + else: + size_str = f"{file_size_kb:.1f} KB" + + st.download_button( + label=f"📄 Download Transcript JSON ({size_str})", + data=transcript_json, + file_name=f"browser_demo_transcript_{timestamp}.json", + mime="application/json", + help=f"Download conversation transcript as JSON ({size_str})", + type="primary", + use_container_width=True, + ) + else: + st.info("No messages to download yet", icon="💬") + + # Clear conversation + if st.button("🗑️ Clear Conversation", type="secondary", use_container_width=True): + st.session_state.messages = [] + st.session_state.tools = {} + st.session_state.rendered_message_count = 0 + st.session_state.active_messages = [] + st.session_state.chat_disabled = False + st.rerun() + + # Reset browser to 
blank page + if st.button("Reset Browser", type="secondary"): + async def reset_browser(): + if st.session_state.browser_tool._page: + await st.session_state.browser_tool._page.goto("about:blank") + + if st.session_state.event_loop is None or st.session_state.event_loop.is_closed(): + st.session_state.event_loop = asyncio.new_event_loop() + asyncio.set_event_loop(st.session_state.event_loop) + st.session_state.event_loop.run_until_complete(reset_browser()) + st.rerun() + + # Main chat interface + st.title("🌐 Claude Browser Use Demo") + st.markdown( + "This demo showcases Claude's ability to interact with web browsers using " + "Playwright automation. Ask Claude to navigate websites, fill forms, " + "extract information, and more!" + ) + + # Authenticate + if not authenticate(): + return + + + # Create container for conversation history + history_container = st.container() + + # Display conversation history in the history container + renderer = MessageRenderer(st.session_state) + with history_container: + renderer.render_conversation_history(st.session_state.messages) + + # Create container for active/streaming responses + active_container = st.container() + st.session_state.active_response_container = active_container + + # Simple callback to disable chat input on submit + def disable_chat_callback(): + st.session_state.chat_disabled = True + + # Show persistent error message if there is one + if st.session_state.last_error: + st.error(st.session_state.last_error["message"]) + if st.session_state.last_error["traceback"]: + with st.expander("Show full traceback"): + st.code(st.session_state.last_error["traceback"]) + if st.button("Clear Error"): + st.session_state.last_error = None + st.rerun() + + # Show status when chat is disabled + if st.session_state.chat_disabled: + st.info("🤖 Claude is currently processing your request. 
Please wait...") + + # Simple chat input with disabled state + prompt = st.chat_input( + "Ask Claude to browse the web...", + disabled=st.session_state.chat_disabled, + on_submit=disable_chat_callback + ) + + if prompt: + # Clear any previous error when starting a new request + st.session_state.last_error = None + # Process the prompt + loop = get_or_create_event_loop() + loop.run_until_complete(run_agent(prompt)) + + +if __name__ == "__main__": + main() diff --git a/browser-use-demo/browser_use_demo/tools/__init__.py b/browser-use-demo/browser_use_demo/tools/__init__.py new file mode 100644 index 000000000..adf45b2ba --- /dev/null +++ b/browser-use-demo/browser_use_demo/tools/__init__.py @@ -0,0 +1,11 @@ +from .base import BaseAnthropicTool, ToolError, ToolResult +from .browser import BrowserTool +from .collection import ToolCollection + +__all__ = [ + "BaseAnthropicTool", + "ToolError", + "ToolResult", + "BrowserTool", + "ToolCollection", +] diff --git a/browser-use-demo/browser_use_demo/tools/base.py b/browser-use-demo/browser_use_demo/tools/base.py new file mode 100644 index 000000000..d6f13712d --- /dev/null +++ b/browser-use-demo/browser_use_demo/tools/base.py @@ -0,0 +1,69 @@ +from abc import ABCMeta, abstractmethod +from dataclasses import dataclass, fields, replace +from typing import Any + +from anthropic.types.beta import BetaToolUnionParam + + +class BaseAnthropicTool(metaclass=ABCMeta): + """Abstract base class for Anthropic-defined tools.""" + + @abstractmethod + def __call__(self, **kwargs) -> Any: + """Executes the tool with the given arguments.""" + ... 
+ + @abstractmethod + def to_params( + self, + ) -> BetaToolUnionParam: + raise NotImplementedError + + +@dataclass(kw_only=True, frozen=True) +class ToolResult: + """Represents the result of a tool execution.""" + + output: str | None = None + error: str | None = None + base64_image: str | None = None + system: str | None = None + + def __bool__(self): + return any(getattr(self, field.name) for field in fields(self)) + + def __add__(self, other: "ToolResult"): + def combine_fields( + field: str | None, other_field: str | None, concatenate: bool = True + ): + if field and other_field: + if concatenate: + return field + other_field + raise ValueError("Cannot combine tool results") + return field or other_field + + return ToolResult( + output=combine_fields(self.output, other.output), + error=combine_fields(self.error, other.error), + base64_image=combine_fields(self.base64_image, other.base64_image, False), + system=combine_fields(self.system, other.system), + ) + + def replace(self, **kwargs): + """Returns a new ToolResult with the given fields replaced.""" + return replace(self, **kwargs) + + +class CLIResult(ToolResult): + """A ToolResult that can be rendered as a CLI output.""" + + +class ToolFailure(ToolResult): + """A ToolResult that represents a failure.""" + + +class ToolError(Exception): + """Raised when a tool encounters an error.""" + + def __init__(self, message): + self.message = message diff --git a/browser-use-demo/browser_use_demo/tools/browser.py b/browser-use-demo/browser_use_demo/tools/browser.py new file mode 100644 index 000000000..3fb96688c --- /dev/null +++ b/browser-use-demo/browser_use_demo/tools/browser.py @@ -0,0 +1,1276 @@ +# Modifications Copyright (c) 2025 Anthropic, PBC +# Modified from original Microsoft Playwright source +# Original Microsoft Playwright source licensed under Apache License 2.0 +# See CHANGELOG.md for details + +"""Browser automation tool using Playwright for web interaction.""" + +import asyncio +import base64 
+import json +import os +import sys +from pathlib import Path +from typing import Any, Literal, Optional, TypedDict, cast +from uuid import uuid4 + +from anthropic.types.beta import BetaToolUnionParam +from playwright.async_api import Browser, BrowserContext, Page + +from ..display_constants import BROWSER_HEIGHT, BROWSER_WIDTH, DISPLAY_NUM +from .base import BaseAnthropicTool, ToolError, ToolResult +from .coordinate_scaling import CoordinateScaler + +# Simple logging for debugging - removed, using print directly + + +# Custom browser tool input schema +BROWSER_TOOL_INPUT_SCHEMA: dict[str, Any] = { + "properties": { + "action": { + "description": 'The action to perform. The available actions are:\n* `navigate`: Navigate to a URL or use "back"/"forward" for browser history navigation. Automatically includes a screenshot of the loaded page.\n* `screenshot`: Take a screenshot of the current browser viewport.\n* `left_click`: Click the left mouse button at the specified coordinate or element reference.\n* `right_click`: Click the right mouse button at the specified coordinate or element reference.\n* `middle_click`: Click the middle mouse button at the specified coordinate or element reference.\n* `double_click`: Double-click the left mouse button at the specified coordinate or element reference.\n* `triple_click`: Triple-click the left mouse button at the specified coordinate or element reference.\n* `hover`: Move the mouse cursor to the specified coordinate or element reference without clicking. 
Useful for revealing tooltips, dropdown menus, or triggering hover states.\n* `left_click_drag`: Click and drag from start_coordinate to coordinate.\n* `left_mouse_down`: Press and hold the left mouse button at the specified coordinate.\n* `left_mouse_up`: Release the left mouse button at the specified coordinate.\n* `scroll`: Scroll the page in a specified direction.\n* `scroll_to`: Scroll to bring an element into view.\n* `type`: Type text at the current cursor position.\n* `key`: Press a key or key combination (supports standard keys and modifiers).\n* `hold_key`: Hold down a key or key combination for a specified duration.\n* `read_page`: Get the DOM tree structure, optionally filtered for interactive elements.\n* `find`: Find text on the page and highlight matches.\n* `get_page_text`: Get all text content from the page.\n* `wait`: Wait for a specified duration in seconds.\n* `form_input`: Set the value of a form input element.\n* `zoom`: Take a zoomed screenshot of a specific region.\n* `execute_js`: Execute JavaScript code in the page context. Returns the result of the last expression.', + "enum": [ + "navigate", + "screenshot", + "left_click", + "right_click", + "middle_click", + "double_click", + "triple_click", + "hover", + "left_click_drag", + "left_mouse_down", + "left_mouse_up", + "scroll", + "scroll_to", + "type", + "key", + "hold_key", + "read_page", + "find", + "get_page_text", + "wait", + "form_input", + "zoom", + "execute_js", + ], + "type": "string", + }, + "text": { + "description": 'Required for: `navigate` (URL or "back"/"forward"), `type` (text to type), `key` (key combination), `hold_key` (key to hold), `find` (text to search), `execute_js` (valid JavaScript code ONLY - no explanatory text, just the code). Optional for `read_page` (filter type: "interactive"), click actions (modifier keys to hold during click).', + "type": "string", + }, + "ref": { + "description": "Element reference string for targeting specific DOM elements. 
Required for `scroll_to` and `form_input`. Optional for click actions and `hover` as an alternative to coordinates.", + "type": "string", + }, + "coordinate": { + "description": "(x, y): The x (pixels from the left edge) and y (pixels from the top edge) coordinates. Required for mouse actions when `ref` is not provided: `left_click`, `right_click`, `middle_click`, `double_click`, `triple_click`, `hover`, `left_mouse_down`, `left_mouse_up`, `scroll`. Also serves as the end coordinate for `left_click_drag`.", + "type": "array", + "items": {"type": "integer"}, + }, + "start_coordinate": { + "description": "(x, y): The starting x and y coordinates for drag operations. Required only for `left_click_drag`.", + "type": "array", + "items": {"type": "integer"}, + }, + "scroll_direction": { + "description": "The direction to scroll. Required for `scroll` action.", + "enum": ["up", "down", "left", "right"], + "type": "string", + }, + "scroll_amount": { + "description": "The number of scroll units (similar to mouse wheel clicks). Required for `scroll` action.", + "type": "integer", + }, + "duration": { + "description": "Duration in seconds. Required for `hold_key` and `wait` actions. For `wait`, must be between 0 and 100 seconds.", + "type": "number", + }, + "value": { + "description": "The value to set for a form input element. Required for `form_input` action. Can be string, number, or boolean depending on the input type.", + "type": ["string", "number", "boolean"], + }, + "region": { + "description": "(x1, y1, x2, y2): Defines a rectangular region for the `zoom` action. Coordinates specify top-left (x1, y1) and bottom-right (x2, y2) corners.", + "type": "array", + "items": {"type": "integer"}, + }, + }, + "required": ["action"], + "type": "object", +} + +BROWSER_TOOL_DESCRIPTION = """A browser automation tool for web interaction. Use this tool to navigate websites, interact with elements, and extract content. 
class BrowserOptions(TypedDict):
    """Typed dictionary of browser display options.

    This is the return type of ``BrowserTool.options``, which fills both
    keys from the tool's configured ``width`` and ``height``.
    """

    # Display width; pixels, going by the ``_px`` suffix.
    display_width_px: int
    # Display height; pixels, going by the ``_px`` suffix.
    display_height_px: int
def __init__(self):
    """Record viewport dimensions and reset lifecycle flags.

    No browser is started here; the flags below only mark the tool as
    not yet initialized.
    """
    super().__init__()

    # Lifecycle bookkeeping: not initialized, and no event loop recorded yet.
    self._event_loop = None
    self._initialized = False

    # cleanup() reads this attribute, so it must always exist.
    self.cdp_url = None

    # Viewport size comes from the module-level display constants.
    self.width, self.height = BROWSER_WIDTH, BROWSER_HEIGHT
def to_params(self) -> BetaToolUnionParam:
    """Build the custom tool definition (name, description, input schema) for the API."""
    definition = {
        "name": self.name,
        "description": BROWSER_TOOL_DESCRIPTION,
        "input_schema": BROWSER_TOOL_INPUT_SCHEMA,
    }
    # cast() only informs the type checker; the dict is returned unchanged.
    return cast(BetaToolUnionParam, definition)
async def _execute_js_from_file(self, filename: str, *args) -> Any:
    """Load a JavaScript helper from the utils directory and evaluate it in the page.

    Args:
        filename: Name of a script inside ``BROWSER_TOOL_UTILS_DIR``.
        *args: Values forwarded to the script. Every value is JSON-encoded
            before being spliced into the evaluated expression, so quotes or
            backslashes in an argument cannot alter the generated JavaScript.

    Returns:
        Whatever ``page.evaluate`` returns for the assembled expression.

    Raises:
        ToolError: If no page exists or the script file is missing.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    script_path = BROWSER_TOOL_UTILS_DIR / filename
    if not script_path.exists():
        raise ToolError(f"Script file not found: {filename}")

    # Read explicitly as UTF-8 so the result does not depend on the host locale.
    script = script_path.read_text(encoding="utf-8")

    if filename == "browser_dom_script.js":
        # This script defines window.__generateAccessibilityTree rather than
        # being a callable expression, so inject it and then call the function.
        filter_type = args[0] if args else ""
        # json.dumps yields a correctly escaped JS string literal; the previous
        # bare '{filter_type}' interpolation broke on quotes and backslashes.
        filter_literal = json.dumps(filter_type)
        combined_expression = f"""
        (function() {{
            {script}
            return window.__generateAccessibilityTree({filter_literal});
        }})()
        """
        return await self._page.evaluate(combined_expression)

    # Every other script is a function expression: wrap it and call it with
    # the JSON-encoded arguments.
    escaped_args = ", ".join(json.dumps(arg) for arg in args)
    js_expression = f"({script})({escaped_args})"
    return await self._page.evaluate(js_expression)
async def _click(
    self,
    action: str,
    coordinate: Optional[tuple[int, int]] = None,
    ref: Optional[str] = None,
    text: Optional[str] = None,
) -> ToolResult:
    """Perform a click at a coordinate, on a referenced element, or on visible text.

    Targets are tried in this order: ``coordinate``, then ``ref``, then ``text``.

    Args:
        action: One of ``left_click``, ``right_click``, ``middle_click``,
            ``double_click`` or ``triple_click``. Any other value is treated
            as a single left click.
        coordinate: (x, y) as supplied by the caller; passed through
            ``_scale_coordinates`` before use.
        ref: Element reference resolved by ``browser_element_script.js``.
        text: Text used to build a ``text=...`` selector.

    Returns:
        A ToolResult describing what was clicked.

    Raises:
        ToolError: If no page exists, no target is given, the element cannot
            be resolved, or the underlying click fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    # Map the action name onto the (button, click_count) pair for the click call.
    button, click_count = {
        "right_click": ("right", 1),
        "middle_click": ("middle", 1),
        "double_click": ("left", 2),
        "triple_click": ("left", 3),
    }.get(action, ("left", 1))

    try:
        if coordinate:
            x, y = coordinate
            x, y = self._scale_coordinates(x, y)

            viewport = self._page.viewport_size
            if viewport:
                width, height = viewport["width"], viewport["height"]
                # Valid pixels are 0..width-1 and 0..height-1, so a coordinate
                # equal to the width/height is already outside the viewport.
                if not (0 <= x < width and 0 <= y < height):
                    print(
                        f"[Click] WARNING: Coordinates ({x}, {y}) are outside viewport "
                        f"({width}x{height})",
                        file=sys.stderr,
                        flush=True,
                    )
                    # The click is still attempted; these lines only aid debugging.
                    if x >= width:
                        print(
                            f"[Click] X coordinate {x} exceeds viewport width {width}",
                            file=sys.stderr,
                            flush=True,
                        )
                    if y >= height:
                        print(
                            f"[Click] Y coordinate {y} exceeds viewport height {height}",
                            file=sys.stderr,
                            flush=True,
                        )

            await self._page.bring_to_front()
            await self._page.mouse.move(x, y)
            await asyncio.sleep(0.01)  # Give the pointer a moment to settle
            await self._page.mouse.click(
                x, y, button=button, click_count=click_count
            )
            return ToolResult(output=f"Clicked at ({x}, {y})", error=None)

        if ref:
            element_info = await self._execute_js_from_file(
                "browser_element_script.js", ref
            )
            # Guard against a non-dict result so the failure is reported as a
            # missing element instead of an AttributeError.
            if not isinstance(element_info, dict) or not element_info.get(
                "success", False
            ):
                message = (
                    element_info.get("message", "Failed to find element")
                    if isinstance(element_info, dict)
                    else "Failed to find element"
                )
                raise ToolError(message)

            click_x, click_y = element_info["coordinates"]

            # Focus the page first, as the coordinate path and _hover do.
            await self._page.bring_to_front()
            await self._page.mouse.move(click_x, click_y)
            await asyncio.sleep(0.1)
            await self._page.mouse.click(
                click_x, click_y, button=button, click_count=click_count
            )
            return ToolResult(output=f"Clicked element with ref: {ref}", error=None)

        if text:
            await self._page.click(
                f"text={text}", button=button, click_count=click_count
            )
            return ToolResult(output=f"Clicked on text: {text}", error=None)

        raise ToolError(
            "Either coordinate, ref, or text is required for click action"
        )

    except Exception as e:
        raise ToolError(f"Failed to perform {action}: {str(e)}") from e
async def _scroll(
    self,
    coordinate: Optional[tuple[int, int]] = None,
    direction: Optional[str] = None,
    amount: Optional[int] = None,
) -> ToolResult:
    """Scroll the page and return a screenshot of the new viewport.

    Args:
        coordinate: Optional (x, y) position to scroll at. When given, it is
            scaled like every other mouse coordinate and the pointer is moved
            there before the wheel event is sent. When omitted, the window
            itself is scrolled with ``window.scrollBy``.
        direction: ``"up"``, ``"down"``, ``"left"`` or ``"right"``; defaults
            to ``"down"``. Any other value results in a zero delta.
        amount: Number of scroll units (100 pixels each); defaults to 3.

    Returns:
        A ToolResult with a summary message and a screenshot.

    Raises:
        ToolError: If no page exists or scrolling fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        direction = direction or "down"
        amount = amount or 3  # Default scroll amount

        step = amount * 100
        delta_x, delta_y = {
            "up": (0, -step),
            "down": (0, step),
            "left": (-step, 0),
            "right": (step, 0),
        }.get(direction, (0, 0))

        if coordinate:
            x, y = coordinate
            # Previously the coordinate was unpacked but never used, so the
            # wheel event was sent without positioning the pointer. Scale it
            # like the other mouse actions and move the pointer there first.
            x, y = self._scale_coordinates(x, y)
            await self._page.mouse.move(x, y)
            await self._page.mouse.wheel(delta_x, delta_y)
        else:
            # No position given: scroll the main window.
            await self._page.evaluate(f"window.scrollBy({delta_x}, {delta_y})")

        # Let content settle before capturing the new viewport.
        await asyncio.sleep(0.5)

        screenshot_result = await self._take_screenshot()
        return ToolResult(
            output=f"Scrolled {direction} by {amount} units",
            error=None,
            base64_image=screenshot_result.base64_image,
        )

    except Exception as e:
        raise ToolError(f"Failed to scroll: {str(e)}") from e
async def _drag(
    self, start_x: int, start_y: int, end_x: int, end_y: int
) -> ToolResult:
    """Press the mouse at the start point, move to the end point, and release.

    Both points are passed through ``_scale_coordinates`` before any mouse
    event is sent; the result message reports the scaled values.

    Raises:
        ToolError: If no page exists or any mouse operation fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        # Scale both endpoints up front, start first.
        origin = self._scale_coordinates(start_x, start_y)
        target = self._scale_coordinates(end_x, end_y)
        from_x, from_y = origin
        to_x, to_y = target

        mouse = self._page.mouse
        await mouse.move(from_x, from_y)
        await mouse.down()
        await mouse.move(to_x, to_y)
        await mouse.up()

        summary = f"Dragged from ({from_x}, {from_y}) to ({to_x}, {to_y})"
        return ToolResult(output=summary, error=None)

    except Exception as e:
        raise ToolError(f"Failed to perform drag: {str(e)}") from e
async def _read_page(self, filter_type: str = "") -> ToolResult:
    """Return the page's DOM tree as text, with element references.

    Runs ``browser_dom_script.js`` with ``filter_type`` and formats the
    result. The output starts with the ``__PAGE_EXTRACTED__`` marker and a
    one-line size summary, followed by ``__FULL_CONTENT__`` and the content.

    Raises:
        ToolError: If no page exists or extraction fails.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        tree = await self._execute_js_from_file(
            "browser_dom_script.js", filter_type
        )

        # Normalise the script result to a single text blob.
        if not isinstance(tree, dict):
            content = str(tree)
        elif "pageContent" in tree:
            content = tree["pageContent"]
        else:
            content = json.dumps(tree, indent=2)

        char_count = len(content)
        # Rough estimate at ~3.5 characters per token; the real ratio varies
        # with content. Use the count_tokens API when an exact figure matters.
        token_estimate = int(char_count / 3.5)

        summary = f"Extracted page DOM tree (~{token_estimate:,} tokens, {char_count:,} characters)"

        # The markers separate the short summary from the full content.
        return ToolResult(
            output=f"__PAGE_EXTRACTED__\n{summary}\n__FULL_CONTENT__\n{content}",
            error=None,
        )

    except Exception as e:
        raise ToolError(f"Failed to read page: {str(e)}") from e
async def _find(self, search_query: str) -> ToolResult:
    """Find elements on the page that match a natural-language query.

    When ``ANTHROPIC_API_KEY`` is set, the page's accessibility tree is sent
    to the model and its line-oriented answer is parsed into element refs.
    If that path is unavailable or fails for any reason, the failure is
    logged to stderr and a plain ``:has-text()`` selector search is used
    instead, which reports only a match count.

    Args:
        search_query: Description of the element(s) to look for.

    Returns:
        A ToolResult listing matching refs, a match count, or a
        "no matching elements" message.

    Raises:
        ToolError: If no page exists or both search strategies fail.
    """
    if self._page is None:
        raise ToolError("Browser not initialized")

    try:
        # Get the DOM tree that the model will analyse.
        dom_tree = await self._execute_js_from_file("browser_dom_script.js", "all")

        if isinstance(dom_tree, dict) and "pageContent" in dom_tree:
            dom_tree_json = dom_tree["pageContent"]
        else:
            dom_tree_json = json.dumps(dom_tree, indent=2)

        api_key = os.environ.get("ANTHROPIC_API_KEY")
        if api_key:
            try:
                from anthropic import AsyncAnthropic

                client = AsyncAnthropic(api_key=api_key)

                # FOUND/SHOWING carry explicit placeholders because the parser
                # below expects an integer after "FOUND:".
                prompt = f"""You are helping find elements on a web page. The user wants to find: "{search_query}"

Here is the accessibility tree of the page:
{dom_tree_json}

Find ALL elements that match the user's query. Return up to 20 most relevant matches, ordered by relevance.

Return your findings in this exact format (one line per matching element):

FOUND: <total number of matching elements>
SHOWING: <number of elements listed below>
---
ref_X | role | name | type | reason why this matches
ref_Y | role | name | type | reason why this matches
...

If there are more than 20 matches, add this line at the end:
MORE: Use a more specific query to see additional results

If no matching elements are found, return only:
FOUND: 0
ERROR: explanation of why no elements were found"""

                # NOTE(review): the model id is hard-coded; confirm it is
                # still the intended model for this lookup.
                response = await client.messages.create(
                    model="claude-3-5-sonnet-20241022",
                    max_tokens=800,
                    temperature=1.0,
                    messages=[{"role": "user", "content": prompt}],
                )

                first_content = response.content[0]
                if hasattr(first_content, "text"):
                    response_text = first_content.text.strip()
                else:
                    # Other content block types: fall back to their string form.
                    response_text = str(first_content)

                lines = [
                    line.strip()
                    for line in response_text.split("\n")
                    if line.strip()
                ]

                total_found = 0
                elements = []
                has_more = False
                error_message = None

                for line in lines:
                    if line.startswith("FOUND:"):
                        try:
                            total_found = int(line.split(":")[1].strip())
                        except (ValueError, IndexError):
                            total_found = 0
                    elif line.startswith("ERROR:"):
                        error_message = line[len("ERROR:"):].strip()
                    elif line.startswith("MORE:"):
                        has_more = True
                    elif line.startswith("ref_") and "|" in line:
                        parts = [p.strip() for p in line.split("|")]
                        # ref, role, name and type are mandatory; the reason is optional.
                        if len(parts) >= 4:
                            elements.append(
                                {
                                    "ref": parts[0],
                                    "role": parts[1],
                                    "name": parts[2],
                                    "type": parts[3],
                                    "description": parts[4]
                                    if len(parts) > 4
                                    else "",
                                }
                            )

                if not elements:
                    return ToolResult(
                        output=error_message or "No matching elements found",
                        error=None,
                    )

                # If the FOUND line was missing or unparseable, do not discard
                # rows that were parsed successfully; count them instead.
                if total_found == 0:
                    total_found = len(elements)

                message = f"Found {total_found} matching element{'s' if total_found != 1 else ''}"
                if has_more:
                    message += f" (showing first {len(elements)}, use a more specific query to narrow results)"

                elements_output = []
                for el in elements:
                    element_str = f"- {el['ref']}: {el['role']}"
                    if el.get("name"):
                        element_str += f" {el['name']}"
                    if el.get("type"):
                        element_str += f" {el['type']}"
                    if el.get("description"):
                        element_str += f" - {el['description']}"
                    elements_output.append(element_str)

                elements_str = "\n".join(elements_output)
                return ToolResult(output=f"{message}\n\n{elements_str}", error=None)

            except Exception as ai_error:
                # Best effort: keep the fallback, but leave a trace of why
                # the AI path was abandoned.
                print(
                    f"[Find] AI-based search failed, falling back to text search: {ai_error}",
                    file=sys.stderr,
                    flush=True,
                )

        # Fallback: plain text search. Escape backslashes and single quotes
        # so the query cannot terminate the quoted selector string early.
        escaped_query = search_query.replace("\\", "\\\\").replace("'", "\\'")
        elements = await self._page.query_selector_all(
            f"*:has-text('{escaped_query}')"
        )

        if not elements:
            return ToolResult(
                output=f"No matching elements found for: {search_query}", error=None
            )

        # Without AI analysis there are no ref ids to report, only a count.
        return ToolResult(
            output=f"Found {len(elements)} matching element{'s' if len(elements) != 1 else ''} (Note: AI-based search with ref_ids requires ANTHROPIC_API_KEY)",
            error=None,
        )

    except Exception as e:
        raise ToolError(f"Failed to find elements: {str(e)}") from e
async def __call__(
    self,
    *,
    action: Actions,
    text: Optional[str] = None,
    ref: Optional[str] = None,
    coordinate: Optional[tuple[int, int]] = None,
    start_coordinate: Optional[tuple[int, int]] = None,
    scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None,
    scroll_amount: Optional[int] = None,
    duration: Optional[float] = None,
    value: Optional[Any] = None,
    region: Optional[tuple[int, int, int, int]] = None,
    **kwargs,
) -> ToolResult:
    """
    Validate the arguments for ``action`` and dispatch to its handler.

    Parameters:
    - action: The action to perform
    - text: Text input for type, key, hold_key, navigate, find, execute_js
      (and the optional filter for read_page)
    - ref: Element reference for element-based actions
    - coordinate: (x, y) coordinates for mouse actions; end point for drags
    - start_coordinate: Starting point for drag actions
    - scroll_direction: Direction for scroll action
    - scroll_amount: Amount to scroll
    - duration: Seconds for wait (0-100) or hold_key (>= 0); defaults to 1.0
      when omitted. An explicit 0 is honoured rather than replaced.
    - value: Value for form_input action
    - region: (x1, y1, x2, y2) opposite corners of the zoom rectangle
    - **kwargs: Accepted and ignored

    Raises:
        ToolError: When a required argument is missing or malformed, the
            action is unknown, or the handler fails.
    """

    def as_point(pair, label: str) -> tuple[int, int]:
        # Reject malformed arrays here so callers get a ToolError instead of
        # a bare ValueError from tuple unpacking.
        if len(pair) != 2:
            raise ToolError(f"{label} must contain exactly two values (x, y)")
        return pair[0], pair[1]

    # Ensure browser is running for all actions
    await self._ensure_browser()

    if action == "navigate":
        if not text:
            raise ToolError("URL is required for navigate action")
        return await self._navigate(text)

    if action == "screenshot":
        return await self._take_screenshot()

    if action == "zoom":
        if not region:
            raise ToolError(
                "Region (x1, y1, x2, y2) is required for zoom action"
            )
        if len(region) != 4:
            raise ToolError(
                "Region must contain exactly four values (x1, y1, x2, y2)"
            )
        # NOTE(review): unlike click/hover/drag coordinates, the region is not
        # passed through _scale_coordinates; confirm this is intended.
        x1, y1, x2, y2 = region
        # Convert the two corners to x, y, width, height in either order.
        width = abs(x2 - x1)
        height = abs(y2 - y1)
        if width == 0 or height == 0:
            raise ToolError(
                "Region for zoom action must have a non-zero width and height"
            )
        return await self._zoom_screenshot(min(x1, x2), min(y1, y2), width, height)

    if action in (
        "left_click",
        "right_click",
        "middle_click",
        "double_click",
        "triple_click",
    ):
        return await self._click(action, coordinate, ref, text)

    if action == "hover":
        return await self._hover(coordinate, ref)

    if action == "type":
        if not text:
            raise ToolError("Text is required for type action")
        return await self._type_text(text)

    if action == "key":
        if not text:
            raise ToolError("Key is required for key action")
        return await self._press_key(text)

    if action == "hold_key":
        if not text:
            raise ToolError("Key is required for hold_key action")
        # Only a missing duration gets the default; 0 is a legitimate value.
        if duration is None:
            duration = 1.0
        if duration < 0:
            raise ToolError("Duration for hold_key action must not be negative")
        return await self._press_key(text, hold=True, duration=duration)

    if action == "scroll":
        return await self._scroll(coordinate, scroll_direction, scroll_amount)

    if action == "scroll_to":
        if not ref:
            raise ToolError("Element reference is required for scroll_to action")
        return await self._scroll_to(ref)

    if action == "left_click_drag":
        if not start_coordinate or not coordinate:
            raise ToolError(
                "Both start_coordinate and coordinate are required for drag action"
            )
        start_x, start_y = as_point(start_coordinate, "start_coordinate")
        end_x, end_y = as_point(coordinate, "coordinate")
        return await self._drag(start_x, start_y, end_x, end_y)

    if action == "left_mouse_down":
        if not coordinate:
            raise ToolError("Coordinate is required for mouse_down action")
        x, y = as_point(coordinate, "coordinate")
        return await self._mouse_down(x, y)

    if action == "left_mouse_up":
        if not coordinate:
            raise ToolError("Coordinate is required for mouse_up action")
        x, y = as_point(coordinate, "coordinate")
        return await self._mouse_up(x, y)

    if action == "read_page":
        # Only the "interactive" filter is forwarded; anything else means no filter.
        filter_type = text if text in ["interactive", ""] else ""
        return await self._read_page(filter_type)

    if action == "get_page_text":
        return await self._get_page_text()

    if action == "find":
        if not text:
            raise ToolError("Text is required for find action")
        return await self._find(text)

    if action == "form_input":
        if not ref:
            raise ToolError("Element reference is required for form_input action")
        if value is None:
            raise ToolError("Value is required for form_input action")
        return await self._form_input(ref, value)

    if action == "wait":
        if duration is None:
            duration = 1.0
        # Enforce the range stated in the tool's input schema.
        if not 0 <= duration <= 100:
            raise ToolError(
                "Duration for wait action must be between 0 and 100 seconds"
            )
        return await self._wait(duration)

    if action == "execute_js":
        if not text:
            raise ToolError("JavaScript code is required for execute_js action")
        return await self._execute_js(text)

    raise ToolError(f"Unknown action: {action}")
self._page: + await self._page.close() + self._page = None + + if self._context: + await self._context.close() + self._context = None + + if self._browser: + await self._browser.close() + self._browser = None + + if self._playwright: + await self._playwright.stop() + self._playwright = None + + self._initialized = False diff --git a/browser-use-demo/browser_use_demo/tools/collection.py b/browser-use-demo/browser_use_demo/tools/collection.py new file mode 100644 index 000000000..2b4b94da1 --- /dev/null +++ b/browser-use-demo/browser_use_demo/tools/collection.py @@ -0,0 +1,17 @@ +from typing import Any + +from anthropic.types.beta import BetaToolUnionParam + +from .base import BaseAnthropicTool + + +class ToolCollection: + """Collection of tools for browser automation.""" + + def __init__(self, *tools: BaseAnthropicTool): + self.tools = tools + self.tool_map = {tool.name: tool for tool in tools} + + def to_params(self) -> list[BetaToolUnionParam]: + """Convert all tools to API parameters.""" + return [tool.to_params() for tool in self.tools] diff --git a/browser-use-demo/browser_use_demo/tools/coordinate_scaling.py b/browser-use-demo/browser_use_demo/tools/coordinate_scaling.py new file mode 100644 index 000000000..91d956790 --- /dev/null +++ b/browser-use-demo/browser_use_demo/tools/coordinate_scaling.py @@ -0,0 +1,178 @@ +""" +Coordinate scaling utilities for browser tool. + +This module handles the scaling of coordinates from Claude's vision model +resolution to the actual browser viewport resolution. +""" + + +class CoordinateScaler: + """Handles coordinate scaling between Claude's vision and actual viewport.""" + + # Claude's image processing resolution for 16:9 aspect ratio + # According to the official documentation: + # https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size + # When images exceed size thresholds, they are resized while preserving aspect ratio. 
+ # For 16:9 aspect ratio (like 1920x1080 screenshots), images are resized to these exact dimensions: + CLAUDE_ACTUAL_WIDTH = 1456 # Width for 16:9 landscape (from documentation) + CLAUDE_ACTUAL_HEIGHT = 819 # Height for 16:9 landscape (from documentation) + + # Documented maximum image sizes for different aspect ratios + # Source: https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size + DOCUMENTED_SIZES = { + # aspect_ratio: (width, height) + (1, 1): (1092, 1092), # Square + (3, 4): (951, 1268), # Portrait + (4, 3): (1268, 951), # Landscape + (2, 3): (896, 1344), # Portrait + (3, 2): (1344, 896), # Landscape + (9, 16): (819, 1456), # Portrait (phone) + (16, 9): (1456, 819), # Landscape (widescreen) + (1, 2): (784, 1568), # Portrait (tall) + (2, 1): (1568, 784), # Landscape (wide) + } + + @classmethod + def get_documented_size_for_aspect_ratio(cls, viewport_width: int, viewport_height: int) -> tuple[int, int]: + """ + Get the documented size for the given viewport's aspect ratio. 
+ + Args: + viewport_width: Actual viewport width + viewport_height: Actual viewport height + + Returns: + Tuple of (width, height) from documented sizes that matches the aspect ratio + + Raises: + ValueError: If the aspect ratio doesn't match any documented sizes + """ + viewport_ratio = viewport_width / viewport_height + + # Tolerance for aspect ratio matching (to handle minor differences) + ASPECT_RATIO_TOLERANCE = 0.02 + + # Try to find an exact or very close aspect ratio match + for (ratio_w, ratio_h), (doc_width, doc_height) in cls.DOCUMENTED_SIZES.items(): + doc_ratio = ratio_w / ratio_h + if abs(viewport_ratio - doc_ratio) < ASPECT_RATIO_TOLERANCE: + return (doc_width, doc_height) + + # If no match found, raise an error with helpful information + supported_ratios = [f"{w}:{h} ({w/h:.3f})" for (w, h) in cls.DOCUMENTED_SIZES.keys()] + raise ValueError( + f"Viewport aspect ratio {viewport_ratio:.3f} ({viewport_width}x{viewport_height}) " + f"does not match any documented aspect ratios. " + f"Supported ratios: {', '.join(supported_ratios)}. " + f"See https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size" + ) + + @classmethod + def get_scale_factors( + cls, + viewport_width: int, + viewport_height: int, + match_aspect_ratio: bool = False + ) -> tuple[float, float]: + """ + Calculate scale factors for converting Claude coordinates to viewport coordinates. + + Args: + viewport_width: Actual browser viewport width + viewport_height: Actual browser viewport height + match_aspect_ratio: If True, match the viewport's aspect ratio to documented sizes. + If False (default), use 16:9 for browser scenarios. 
+ + Returns: + Tuple of (scale_x, scale_y) factors + + Raises: + ValueError: If match_aspect_ratio is True and aspect ratio doesn't match documented sizes + """ + if match_aspect_ratio: + # Match the documented size based on aspect ratio (will raise if no match) + base_width, base_height = cls.get_documented_size_for_aspect_ratio(viewport_width, viewport_height) + else: + # Use default 16:9 dimensions (typical for browser screenshots) + base_width = cls.CLAUDE_ACTUAL_WIDTH + base_height = cls.CLAUDE_ACTUAL_HEIGHT + + # Calculate scale factors + scale_x = viewport_width / base_width + scale_y = viewport_height / base_height + + return scale_x, scale_y + + @classmethod + def scale_coordinates( + cls, + x: int, + y: int, + viewport_width: int, + viewport_height: int, + apply_threshold: bool = True + ) -> tuple[int, int]: + """ + Scale coordinates from Claude's vision to actual viewport. + + Args: + x: X coordinate from Claude + y: Y coordinate from Claude + viewport_width: Actual browser viewport width + viewport_height: Actual browser viewport height + apply_threshold: Whether to check if coordinates need scaling + + Returns: + Tuple of (scaled_x, scaled_y) + """ + scale_x, scale_y = cls.get_scale_factors(viewport_width, viewport_height) + + # If scaling factors are close to 1.0, no scaling needed + if abs(scale_x - 1.0) < 0.05 and abs(scale_y - 1.0) < 0.05: + return x, y + + if apply_threshold: + # Check if coordinates appear to be in Claude's resolution + # (with 20% margin for edge cases) + max_expected_x = cls.CLAUDE_ACTUAL_WIDTH * 1.2 + max_expected_y = cls.CLAUDE_ACTUAL_HEIGHT * 1.2 + + # If coordinates are larger than expected Claude resolution, + # they might already be in viewport coordinates + if x > max_expected_x or y > max_expected_y: + return x, y + + # Apply scaling + scaled_x = int(x * scale_x) + scaled_y = int(y * scale_y) + + # Ensure we don't exceed viewport bounds + scaled_x = min(scaled_x, viewport_width - 1) + scaled_y = min(scaled_y, 
viewport_height - 1) + + return scaled_x, scaled_y + + @classmethod + def scale_coordinate_list( + cls, + coords: list | tuple, + viewport_width: int, + viewport_height: int + ) -> list: + """ + Scale a coordinate pair list/tuple. + + Args: + coords: [x, y] coordinate pair + viewport_width: Actual browser viewport width + viewport_height: Actual browser viewport height + + Returns: + Scaled [x, y] coordinate pair + """ + if not isinstance(coords, (list, tuple)) or len(coords) != 2: + return list(coords) if isinstance(coords, tuple) else coords + + x, y = coords[0], coords[1] + scaled_x, scaled_y = cls.scale_coordinates(x, y, viewport_width, viewport_height) + return [scaled_x, scaled_y] \ No newline at end of file diff --git a/browser-use-demo/build.sh b/browser-use-demo/build.sh new file mode 100755 index 000000000..f28d0b1e3 --- /dev/null +++ b/browser-use-demo/build.sh @@ -0,0 +1,17 @@ +#!/bin/bash +set -e + +echo "🏗️ Building Browser Use Demo Docker image..." +docker build . -t browser-use-demo:latest + +echo "✅ Build complete!" +echo "" +echo "To run the demo:" +echo " docker run -e ANTHROPIC_API_KEY=\$ANTHROPIC_API_KEY \\" +echo " -v \$(pwd)/browser_use_demo:/home/browseruse/browser_use_demo/ \\" +echo " -p 5900:5900 -p 8501:8501 -p 6080:6080 -p 8080:8080 \\" +echo " -it browser-use-demo:latest" +echo "" +echo "Then open:" +echo " - http://localhost:8501 for the Streamlit interface" +echo " - http://localhost:8080 to see the browser" \ No newline at end of file diff --git a/browser-use-demo/docker-compose.yml b/browser-use-demo/docker-compose.yml new file mode 100644 index 000000000..976439ef2 --- /dev/null +++ b/browser-use-demo/docker-compose.yml @@ -0,0 +1,27 @@ +services: + browser-use: + build: + context: . 
+ image: browser-use-demo:latest + environment: + - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY} + volumes: + - ./validate_env.py:/home/browseruse/validate_env.py + - ${HOME}/.anthropic:/home/browseruse/.anthropic + ports: + - "${VNC_PORT:-5900}:5900" # VNC + - "${STREAMLIT_PORT:-8501}:8501" # Streamlit + - "${NOVNC_PORT:-6080}:6080" # noVNC + - "${HTTP_PORT:-8080}:8080" # HTTP server + stdin_open: true + tty: true + env_file: + - .env + develop: + watch: + - path: ./browser_use_demo + action: sync+restart + target: /home/browseruse/browser_use_demo + - path: ./validate_env.py + action: rebuild + target: /home/browseruse/validate_env.py \ No newline at end of file diff --git a/browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop b/browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop new file mode 100755 index 000000000..948021262 --- /dev/null +++ b/browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Name=Firefox Custom +Comment=Open Firefox with custom URL +Exec=firefox-esr -new-window +Icon=firefox-esr +Terminal=false +Type=Application +Categories=Network;WebBrowser; diff --git a/browser-use-demo/image/.config/tint2/applications/gedit.desktop b/browser-use-demo/image/.config/tint2/applications/gedit.desktop new file mode 100755 index 000000000..d5af03f40 --- /dev/null +++ b/browser-use-demo/image/.config/tint2/applications/gedit.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Name=Gedit +Comment=Open gedit +Exec=gedit +Icon=text-editor-symbolic +Terminal=false +Type=Application +Categories=TextEditor; diff --git a/browser-use-demo/image/.config/tint2/applications/terminal.desktop b/browser-use-demo/image/.config/tint2/applications/terminal.desktop new file mode 100644 index 000000000..0c2d45d4d --- /dev/null +++ b/browser-use-demo/image/.config/tint2/applications/terminal.desktop @@ -0,0 +1,8 @@ +[Desktop Entry] +Name=Terminal +Comment=Open Terminal +Exec=xterm 
#-------------------------------------
# Panel
panel_items = TL
panel_size = 100% 60
panel_margin = 0 0
panel_padding = 2 0 2
panel_background_id = 1
wm_menu = 0
panel_dock = 0
panel_position = bottom center horizontal
panel_layer = top
panel_monitor = all
panel_shrink = 0
autohide = 0
autohide_show_timeout = 0
autohide_hide_timeout = 0.5
autohide_height = 2
strut_policy = follow_size
panel_window_name = tint2
disable_transparency = 1
mouse_effects = 1
font_shadow = 0
mouse_hover_icon_asb = 100 0 10
mouse_pressed_icon_asb = 100 0 0
scale_relative_to_dpi = 0
scale_relative_to_screen_height = 0

#-------------------------------------
# Taskbar
taskbar_mode = single_desktop
taskbar_hide_if_empty = 0
taskbar_padding = 0 0 2
taskbar_background_id = 0
taskbar_active_background_id = 0
taskbar_name = 1
taskbar_hide_inactive_tasks = 0
taskbar_hide_different_monitor = 0
taskbar_hide_different_desktop = 0
taskbar_always_show_all_desktop_tasks = 0
taskbar_name_padding = 4 2
taskbar_name_background_id = 0
taskbar_name_active_background_id = 0
taskbar_name_font_color = #e3e3e3 100
taskbar_name_active_font_color = #ffffff 100
taskbar_distribute_size = 0
taskbar_sort_order = none
task_align = left

#-------------------------------------
# Launcher
launcher_padding = 4 8 4
launcher_background_id = 0
launcher_icon_background_id = 0
launcher_icon_size = 48
launcher_icon_asb = 100 0 0
launcher_icon_theme_override = 0
startup_notifications = 1
launcher_tooltip = 1

#-------------------------------------
# Launcher icon
# NOTE(review): the custom .desktop entries are addressed under
# /home/browseruse, the home directory used by build.sh and
# docker-compose.yml for this demo (previously /home/computeruse).
# Confirm against the Dockerfile's user name.
launcher_item_app = /usr/share/applications/libreoffice-calc.desktop
launcher_item_app = /home/browseruse/.config/tint2/applications/terminal.desktop
launcher_item_app = /home/browseruse/.config/tint2/applications/firefox-custom.desktop
launcher_item_app = /usr/share/applications/xpaint.desktop
launcher_item_app = /usr/share/applications/xpdf.desktop
launcher_item_app = /home/browseruse/.config/tint2/applications/gedit.desktop
launcher_item_app = /usr/share/applications/galculator.desktop

#-------------------------------------
# Background definitions
# ID 1
rounded = 0
border_width = 0
background_color = #000000 60
border_color = #000000 30

# ID 2
rounded = 4
border_width = 1
background_color = #777777 20
border_color = #777777 30

# ID 3
rounded = 4
border_width = 1
background_color = #777777 20
border_color = #ffffff 40

# ID 4
rounded = 4
border_width = 1
background_color = #aa4400 100
border_color = #aa7733 100

# ID 5
rounded = 4
border_width = 1
background_color = #aaaa00 100
border_color = #aaaa00 100
import os
import socket
from http.server import HTTPServer, SimpleHTTPRequestHandler


class HTTPServerV6(HTTPServer):
    """HTTPServer variant that opens its listening socket as AF_INET6."""

    address_family = socket.AF_INET6


def run_server():
    """Serve the ``static_content`` directory next to this file on port 8080.

    Changes the process working directory to that folder (the request handler
    serves files relative to the current directory), binds to the IPv6
    wildcard address ``::`` and blocks in ``serve_forever()``.
    """
    # Resolve to an absolute path before taking the directory: if __file__
    # is relative, dirname() is "" and plain string concatenation would
    # produce "/static_content" at the filesystem root.
    script_dir = os.path.dirname(os.path.abspath(__file__))
    os.chdir(os.path.join(script_dir, "static_content"))

    server_address = ("::", 8080)
    httpd = HTTPServerV6(server_address, SimpleHTTPRequestHandler)
    print("Starting HTTP server on port 8080...")  # noqa: T201
    httpd.serve_forever()


if __name__ == "__main__":
    run_server()
+ + +
#!/bin/bash
# Launch the noVNC websocket proxy in the background and wait until it is
# listening on port 6080.
echo "starting noVNC"

# Start noVNC with explicit websocket settings
/opt/noVNC/utils/novnc_proxy \
    --vnc localhost:5900 \
    --listen 6080 \
    --web /opt/noVNC \
    > /tmp/novnc.log 2>&1 &

# Wait (up to 10 seconds) for noVNC to start listening
timeout=10
while [ $timeout -gt 0 ]; do
    if netstat -tuln | grep -q ":6080 "; then
        break
    fi
    sleep 1
    ((timeout--))
done

# The loop only reaches 0 when the port never appeared. Report the failure
# instead of claiming success, mirroring the x11vnc/tint2/mutter scripts.
if [ $timeout -eq 0 ]; then
    echo "noVNC failed to start, log output:" >&2
    cat /tmp/novnc.log >&2
    exit 1
fi

echo "noVNC started successfully"
+ + + + +
#!/bin/bash
# Start the tint2 panel in the background and wait until its window exists.
echo "starting tint2 on display :$DISPLAY_NUM ..."

# Start tint2 and capture its stderr.
# The config path is quoted so a $HOME containing spaces or glob characters
# is passed to tint2 as a single argument.
tint2 -c "$HOME/.config/tint2/tint2rc" 2>/tmp/tint2_stderr.log &

# Wait (up to 30 seconds) for tint2 window properties to appear
timeout=30
while [ $timeout -gt 0 ]; do
    if xdotool search --class "tint2" >/dev/null 2>&1; then
        break
    fi
    sleep 1
    ((timeout--))
done

# timeout only reaches 0 if the window never showed up
if [ $timeout -eq 0 ]; then
    echo "tint2 stderr output:" >&2
    cat /tmp/tint2_stderr.log >&2
    exit 1
fi

# Remove the temporary stderr log file
rm /tmp/tint2_stderr.log
>&2 + if [ -f /tmp/x11vnc_stderr.log ]; then + echo "x11vnc stderr output:" >&2 + cat /tmp/x11vnc_stderr.log >&2 + rm /tmp/x11vnc_stderr.log + fi + exec "$0" + fi + sleep 5 + done +) & diff --git a/browser-use-demo/image/xvfb_startup.sh b/browser-use-demo/image/xvfb_startup.sh new file mode 100755 index 000000000..205da2933 --- /dev/null +++ b/browser-use-demo/image/xvfb_startup.sh @@ -0,0 +1,57 @@ +#!/bin/bash +set -e # Exit on error + +DPI=96 +RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24 + +# Function to check if Xvfb is already running +check_xvfb_running() { + if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then + return 0 # Xvfb is already running + else + return 1 # Xvfb is not running + fi +} + +# Function to check if Xvfb is ready +wait_for_xvfb() { + local timeout=10 + local start_time=$(date +%s) + while ! xdpyinfo >/dev/null 2>&1; do + if [ $(($(date +%s) - start_time)) -gt $timeout ]; then + echo "Xvfb failed to start within $timeout seconds" >&2 + return 1 + fi + sleep 0.1 + done + return 0 +} + +# Check if Xvfb is already running +if check_xvfb_running; then + echo "Lock file found for display ${DISPLAY}, checking if display is accessible..." + # Verify that the display is actually accessible + if xdpyinfo >/dev/null 2>&1; then + echo "Xvfb is already running and accessible on display ${DISPLAY}" + exit 0 + else + echo "Display ${DISPLAY} is not accessible, cleaning up stale lock file..." + rm -f /tmp/.X${DISPLAY_NUM}-lock + # Also clean up the socket if it exists + rm -f /tmp/.X11-unix/X${DISPLAY_NUM} + fi +fi + +# Start Xvfb +Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix & +XVFB_PID=$! 
+ +# Wait for Xvfb to start +if wait_for_xvfb; then + echo "Xvfb started successfully on display ${DISPLAY}" + echo "Xvfb PID: $XVFB_PID" +else + echo "Xvfb failed to start" + kill $XVFB_PID + exit 1 +fi diff --git a/browser-use-demo/pytest.ini b/browser-use-demo/pytest.ini new file mode 100644 index 000000000..eb62803d4 --- /dev/null +++ b/browser-use-demo/pytest.ini @@ -0,0 +1,34 @@ +[pytest] +# Test discovery patterns +python_files = test_*.py +python_classes = Test* +python_functions = test_* + +# Test directories +testpaths = tests + +# Markers for test categorization +markers = + integration: Integration tests that test multiple components + slow: Tests that take longer than usual to run + asyncio: Tests that use asyncio + +# Coverage settings +addopts = + -v + --strict-markers + --tb=short + --disable-warnings + -p pytest_asyncio + +# Asyncio configuration +asyncio_mode = auto +asyncio_default_fixture_loop_scope = function + +# Minimum pytest version (minversion refers to pytest itself, not Python) +minversion = 3.11 + +# Ignore specific warnings +filterwarnings = + ignore::DeprecationWarning + ignore::PendingDeprecationWarning \ No newline at end of file diff --git a/browser-use-demo/setup.py b/browser-use-demo/setup.py new file mode 100644 index 000000000..300a31630 --- /dev/null +++ b/browser-use-demo/setup.py @@ -0,0 +1,29 @@ +from setuptools import setup, find_packages + +setup( + name="browser-use-demo", + version="0.1.0", + packages=find_packages(), + python_requires=">=3.11", + install_requires=[ + "streamlit==1.41.0", + "anthropic[bedrock,vertex]>=0.39.0", + "jsonschema==4.22.0", + "boto3>=1.28.57", + "google-auth<3,>=2", + "playwright>=1.40.0", + ], + extras_require={ + "test": [ + "pytest==8.3.3", + "pytest-cov==4.1.0", + "pytest-mock==3.11.1", + "pytest-asyncio==0.23.6", + ], + "dev": [ + "ruff==0.6.7", + "pyright>=1.1.300", + "pre-commit==3.8.0", + ], + }, +) diff --git a/browser-use-demo/test-requirements.txt b/browser-use-demo/test-requirements.txt new file mode 100644 index
000000000..6efd10ea4 --- /dev/null +++ b/browser-use-demo/test-requirements.txt @@ -0,0 +1,4 @@ +pytest==8.3.3 +pytest-cov==4.1.0 +pytest-mock==3.11.1 +pytest-asyncio==0.23.6 \ No newline at end of file diff --git a/browser-use-demo/tests/README.md b/browser-use-demo/tests/README.md new file mode 100644 index 000000000..1ddeee764 --- /dev/null +++ b/browser-use-demo/tests/README.md @@ -0,0 +1,174 @@ +# Browser Use Demo - Test Suite + +Comprehensive test suite for the refactored Browser Use Demo with extensive edge case coverage. + +## Installation + +```bash +# Install test dependencies +pip install -r test-requirements.txt + +# Or install with extras +pip install -e ".[test]" +``` + +## Running Tests + +### Run all tests +```bash +pytest tests/ +``` + +### Run with coverage report +```bash +pytest tests/ --cov=browser_use_demo --cov-report=html +# Open htmlcov/index.html to view coverage report +``` + +### Run specific test file +```bash +pytest tests/test_message_renderer.py -v +``` + +### Run specific test class or method +```bash +pytest tests/test_message_renderer.py::TestMessageRenderer -v +pytest tests/test_message_renderer.py::TestRenderMethod::test_render_string_message -v +``` + +### Run tests by marker +```bash +# Run only integration tests +pytest -m integration + +# Run tests excluding integration +pytest -m "not integration" + +# Run async tests +pytest -m asyncio +``` + +## Test Structure + +``` +tests/ +├── conftest.py # Shared fixtures and mocks +├── test_message_renderer.py # MessageRenderer class tests (~300 test cases) +├── test_streamlit_helpers.py # Helper function tests (~150 test cases) +└── test_integration.py # End-to-end integration tests (~50 test cases) +``` + +## Test Coverage + +The test suite covers: + +### MessageRenderer (`test_message_renderer.py`) +- Initialization with various state configurations +- Rendering all message types (string, dict, ToolResult) +- Conversation history rendering with complex structures +- Edge
cases: empty messages, None values, circular references +- Error handling: malformed data, missing fields, exceptions +- Unicode and special character handling +- Performance with large messages + +### Streamlit Helpers (`test_streamlit_helpers.py`) +- `setup_state()` with fresh and partial initialization +- Environment variable handling (present/missing/invalid) +- Lambda evaluation in state initialization +- `get_or_create_event_loop()` with various loop states +- `authenticate()` with different providers and key states +- Concurrent access and thread safety +- Error recovery scenarios + +### Integration Tests (`test_integration.py`) +- Complete message rendering pipeline +- State initialization and persistence +- Event loop management with async operations +- Error propagation across components +- Full user interaction workflow +- Performance with large datasets (1000+ messages) +- Deeply nested content structures + +## Edge Cases Covered + +1. **Boundary Conditions** + - Empty strings, lists, dictionaries + - Single item collections + - Maximum size inputs (100k+ character messages) + - Null/None values + +2. **Type Mismatches** + - Wrong types for expected fields + - Missing required fields + - Extra unexpected fields + - Invalid message structures + +3. **State Inconsistencies** + - Tools referenced but not in session_state + - Partially initialized state + - Concurrent modifications + - Corrupted state + +4. **Error Conditions** + - Import errors + - Asyncio exceptions + - Environment variable errors + - Lambda evaluation failures + - Base64 decode errors + +5. 
**Performance Edge Cases** + - Very large message histories (1000+ messages) + - Deeply nested content (100+ levels) + - Circular references + - Unicode and special characters + +## Mocking Strategy + +### Streamlit Components +All Streamlit components are mocked to enable testing without a running Streamlit server: +- `st.session_state` +- `st.chat_message` +- `st.markdown`, `st.write`, `st.error`, `st.code`, `st.image` +- `st.chat_input`, `st.stop` + +### External Dependencies +- `BrowserTool` - Mocked to avoid Playwright dependencies +- `asyncio` event loops - Mocked for controlled testing +- Environment variables - Mocked via `monkeypatch` + +## Fixtures + +Key fixtures provided in `conftest.py`: + +- `mock_streamlit` - Complete Streamlit mocking setup +- `mock_browser_tool` - BrowserTool mock +- `sample_tool_result` - Various ToolResult configurations +- `sample_messages` - Diverse message structures for testing +- `edge_case_messages` - Messages designed to trigger edge cases +- `mock_asyncio_loop` - Controlled event loop for testing +- `mock_environment` - Environment variable setup +- `clean_environment` - Remove environment variables + +## Continuous Integration + +To run tests in CI: + +```bash +# Install dependencies +pip install -e ".[test]" + +# Run tests with coverage +pytest tests/ --cov=browser_use_demo --cov-report=xml --cov-report=term + +# Generate coverage badge +coverage-badge -o coverage.svg +``` + +## Contributing + +When adding new features or refactoring: +1. Add corresponding tests for new functionality +2. Ensure all edge cases are covered +3. Run the full test suite before committing +4. Maintain >95% code coverage +5.
@pytest.fixture
def mock_streamlit():
    """Patch Streamlit's session_state and rendering calls for a test.

    Yields a dict mapping each patched name ("session_state", "chat_message",
    "markdown", "write", "error", "code", "image") to its mock. All patches
    are undone when the fixture is torn down.
    """
    # One flat `with` enters the patches in the same order the nested
    # version did, without the seven-level indentation pyramid.
    with (
        patch("streamlit.session_state") as mock_state,
        patch("streamlit.chat_message") as mock_chat,
        patch("streamlit.markdown") as mock_markdown,
        patch("streamlit.write") as mock_write,
        patch("streamlit.error") as mock_error,
        patch("streamlit.code") as mock_code,
        patch("streamlit.image") as mock_image,
    ):
        # Initialize with default values
        mock_state.hide_screenshots = False
        mock_state.tools = {}
        mock_state.messages = []
        mock_state.api_key = "test-key"
        mock_state.provider = MagicMock()
        mock_state.event_loop = None

        # Make `with st.chat_message(...)` usable as a context manager.
        # __exit__ must return a falsy value: a bare Mock() returns a truthy
        # Mock object, which tells Python to swallow any exception raised
        # inside the `with` body and would hide failures in code under test.
        mock_chat.return_value.__enter__ = Mock()
        mock_chat.return_value.__exit__ = Mock(return_value=False)

        yield {
            "session_state": mock_state,
            "chat_message": mock_chat,
            "markdown": mock_markdown,
            "write": mock_write,
            "error": mock_error,
            "code": mock_code,
            "image": mock_image,
        }
mock_instance = MagicMock() + mock_tool.return_value = mock_instance + yield mock_instance + + +@pytest.fixture +def sample_tool_result(): + """Create sample ToolResult objects for testing.""" + return { + "success": ToolResult(output="Success message"), + "error": ToolResult(error="Error message"), + "with_image": ToolResult( + output="With screenshot", + base64_image="iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==", + ), + "empty": ToolResult(), + "all_fields": ToolResult( + output="Output text", + error="Error text", + base64_image="base64data", + system="System message", + ), + } + + +@pytest.fixture +def sample_messages(): + """Provide various message structures for testing edge cases.""" + return [ + # Normal messages + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi there!"}, + # Complex content structures + { + "role": "user", + "content": [ + {"type": "text", "text": "Multiple items"}, + {"type": "image", "source": "data:image/png;base64,abc123"}, + ], + }, + # Assistant with tool results + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Let me help with that"}, + { + "type": "tool_use", + "name": "browser", + "input": {"url": "example.com"}, + }, + {"type": "tool_result", "tool_use_id": "tool_123"}, + ], + }, + # Edge cases + {"role": "user", "content": ""}, # Empty content + {"role": "assistant", "content": None}, # None content + {"role": "unknown", "content": "Unknown role"}, # Unknown role + {"role": "user"}, # Missing content field + {"content": "No role"}, # Missing role field + # Unicode and special characters + {"role": "user", "content": "Hello 世界 🌍 \n\t\r"}, + # Very long content + {"role": "user", "content": "x" * 10000}, + # Nested structures + { + "role": "assistant", + "content": [ + { + "type": "text", + "text": "Nested", + "metadata": {"nested": {"deeply": {"value": 123}}}, + } + ], + }, + ] + + +@pytest.fixture +def edge_case_messages(): + 
"""Messages specifically designed to test edge cases and error conditions.""" + return { + "empty_list": [], + "none": None, + "malformed_dict": {"not": "valid", "message": "structure"}, + "circular_ref": _create_circular_reference(), + "missing_tool": { + "role": "assistant", + "content": [{"type": "tool_result", "tool_use_id": "nonexistent_tool"}], + }, + "invalid_types": [ + {"role": 123, "content": "Invalid role type"}, + {"role": "user", "content": {"invalid": "content structure"}}, + ], + "huge_message": { + "role": "user", + "content": [{"type": "text", "text": "x" * 1000000}], + }, + } + + +def _create_circular_reference(): + """Helper to create a message with circular reference.""" + msg = {"role": "user", "content": []} + msg["content"].append(msg) # Circular reference + return msg + + +@pytest.fixture +def mock_asyncio_loop(): + """Mock asyncio event loop for testing.""" + loop = Mock(spec=asyncio.AbstractEventLoop) + loop.is_closed.return_value = False + loop.run_until_complete = Mock(side_effect=lambda coro: asyncio.run(coro)) + return loop + + +@pytest.fixture +def mock_environment(monkeypatch): + """Mock environment variables for testing.""" + env_vars = { + "ANTHROPIC_API_KEY": "test-api-key", + } + + for key, value in env_vars.items(): + monkeypatch.setenv(key, value) + + return env_vars + + +@pytest.fixture +def clean_environment(monkeypatch): + """Remove environment variables for testing missing env scenarios.""" + keys_to_remove = [ + "ANTHROPIC_API_KEY", + ] + + for key in keys_to_remove: + monkeypatch.delenv(key, raising=False) + + return keys_to_remove + + +@pytest.fixture +def mock_provider(): + """Mock APIProvider enum.""" + with patch("browser_use_demo.loop.APIProvider") as mock: + mock.ANTHROPIC = "anthropic" + mock.BEDROCK = "bedrock" + mock.VERTEX = "vertex" + yield mock + + +@pytest.fixture +def mock_api_response_with_text_and_tools(): + """Mock API response containing both text and tool uses.""" + response = Mock() + response.content 
= [ + Mock(type="text", text="I'll help you with that task"), + Mock( + type="tool_use", + id="tool_001", + name="browser", + input={"action": "screenshot"} + ), + Mock(type="text", text="Here's what I found"), + Mock( + type="tool_use", + id="tool_002", + name="browser", + input={"action": "navigate", "url": "example.com"} + ) + ] + return response + + +@pytest.fixture +def mock_tool_collection(): + """Mock ToolCollection for testing.""" + from unittest.mock import AsyncMock + + mock_browser = AsyncMock() + mock_browser.return_value = ToolResult(output="Tool executed successfully") + + collection = Mock() + collection.tool_map = {"browser": mock_browser} + collection.to_params = Mock(return_value=[ + { + "name": "browser", + "description": "Browser automation tool", + "input_schema": {} + } + ]) + + return collection + + +@pytest.fixture +def sample_mixed_content_messages(): + """Sample messages with mixed text and tool content.""" + return [ + { + "role": "user", + "content": "Can you help me navigate and take a screenshot?" 
+ }, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Sure, I'll navigate to the page first"}, + { + "type": "tool_use", + "id": "tool_nav", + "name": "browser", + "input": {"action": "navigate", "url": "example.com"} + }, + {"type": "text", "text": "Now taking a screenshot"}, + { + "type": "tool_use", + "id": "tool_shot", + "name": "browser", + "input": {"action": "screenshot"} + } + ] + }, + { + "role": "user", + "content": [ + { + "type": "tool_result", + "tool_use_id": "tool_nav", + "content": [{"type": "text", "text": "Navigation successful"}] + }, + { + "type": "tool_result", + "tool_use_id": "tool_shot", + "content": [ + {"type": "text", "text": "Screenshot taken"}, + { + "type": "image", + "source": { + "type": "base64", + "media_type": "image/png", + "data": "base64imagedata" + } + } + ] + } + ] + } + ] diff --git a/browser-use-demo/tests/test_integration.py b/browser-use-demo/tests/test_integration.py new file mode 100644 index 000000000..4e0920f50 --- /dev/null +++ b/browser-use-demo/tests/test_integration.py @@ -0,0 +1,360 @@ +"""Integration tests for the refactored Browser Use Demo.""" + +from unittest.mock import AsyncMock, MagicMock, Mock, patch + +import pytest +from browser_use_demo.loop import APIProvider +from browser_use_demo.message_renderer import MessageRenderer +from browser_use_demo.streamlit import ( + get_or_create_event_loop, + setup_state, +) +from browser_use_demo.tools import ToolResult + + +@pytest.mark.integration +class TestFullMessageRenderingPipeline: + """Test complete message rendering pipeline.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.chat_message") + @patch("streamlit.markdown") + @patch("streamlit.write") + @patch("streamlit.error") + def test_full_conversation_rendering( + self, mock_error, mock_write, mock_markdown, mock_chat, mock_state + ): + """Test rendering a complete conversation with various message types.""" + + # Setup mock state + 
mock_state.hide_screenshots = False + mock_state.tools = { + "tool_1": ToolResult(output="Tool output 1"), + "tool_2": ToolResult(error="Tool error 2"), + } + + # Create complex conversation + messages = [ + {"role": "user", "content": "Hello, can you help me?"}, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "Sure, let me help you."}, + { + "type": "tool_use", + "name": "browser", + "input": {"url": "example.com"}, + }, + {"type": "tool_result", "tool_use_id": "tool_1"}, + ], + }, + {"role": "user", "content": [{"type": "text", "text": "Thank you!"}]}, + { + "role": "assistant", + "content": [ + {"type": "text", "text": "You're welcome!"}, + {"type": "tool_result", "tool_use_id": "tool_2"}, + ], + }, + ] + + # Mock chat_message context manager + mock_chat.return_value.__enter__ = Mock() + mock_chat.return_value.__exit__ = Mock() + + # Render full conversation + renderer = MessageRenderer(mock_state) + renderer.render_conversation_history(messages) + + # Verify all message types were rendered + assert mock_markdown.call_count >= 3 # Text messages + assert mock_write.call_count >= 2 # Tool use and text blocks + assert mock_error.call_count == 1 # Tool error + + +@pytest.mark.integration +class TestStateInitializationAndPersistence: + """Test state initialization and persistence across operations.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("browser_use_demo.tools.BrowserTool") + def test_complete_state_initialization(self, mock_browser_tool, mock_state): + """Test complete state initialization flow.""" + + # Simulate fresh state + initialized_keys = set() + + def setitem_side_effect(key, value): + initialized_keys.add(key) + setattr(mock_state, key, value) + + mock_state.__contains__.return_value = False + mock_state.__setitem__.side_effect = setitem_side_effect + + # Initialize state + setup_state() + + # Verify all required keys were initialized + expected_keys = { + "messages", + "api_key", + "provider", + 
"model", + "max_tokens", + "system_prompt", + "hide_screenshots", + "tools", + "browser_tool", + "event_loop", + "rendered_message_count", + "is_agent_running", + "active_messages", + "active_response_container", + } + + assert expected_keys.issubset(initialized_keys) + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_state_persistence_across_renders(self, mock_state): + """Test that state persists across multiple render calls.""" + + # Initialize state + mock_state.tools = {"tool_1": ToolResult(output="Persistent tool")} + mock_state.messages = [{"role": "user", "content": "Initial message"}] + + # Create renderer and render + renderer1 = MessageRenderer(mock_state) + with patch("streamlit.chat_message"), patch("streamlit.markdown"): + renderer1.render_conversation_history(mock_state.messages) + + # Add more messages + mock_state.messages.append({"role": "assistant", "content": "Response"}) + + # Create new renderer instance and render again + renderer2 = MessageRenderer(mock_state) + with patch("streamlit.chat_message"), patch("streamlit.markdown") as mock_md: + renderer2.render_conversation_history(mock_state.messages) + + # Should render both messages + assert mock_md.call_count >= 2 + + +@pytest.mark.integration +class TestEventLoopManagementWithAsync: + """Test event loop management with async operations.""" + + def test_async_agent_execution(self): + """Test running async agent with event loop management.""" + + with patch("streamlit.session_state", new_callable=MagicMock) as mock_state: + mock_state.event_loop = None + + with patch("asyncio.set_event_loop"): + with patch("asyncio.new_event_loop") as mock_new_loop: + mock_loop = MagicMock() + mock_loop.is_closed.return_value = False + mock_new_loop.return_value = mock_loop + + loop = get_or_create_event_loop() + + # Verify loop was created and set + assert loop == mock_loop + assert mock_state.event_loop == mock_loop + + # Test that the loop can handle async operations (mocked) + async 
def mock_agent(input_text): + return f"Processed: {input_text}" + + # Mock running the async function + mock_loop.run_until_complete = MagicMock( + return_value="Processed: Test input" + ) + result = mock_loop.run_until_complete(mock_agent("Test input")) + assert result == "Processed: Test input" + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_concurrent_async_operations(self, mock_state): + """Test handling concurrent async operations.""" + + mock_state.event_loop = None + + with patch("asyncio.set_event_loop"): + with patch("asyncio.new_event_loop") as mock_new_loop: + mock_loop = MagicMock() + mock_loop.is_closed.return_value = False + mock_new_loop.return_value = mock_loop + + loop = get_or_create_event_loop() + + # Simulate concurrent operations + async def async_task(task_id): + return f"Task {task_id} complete" + + # Mock gather to simulate concurrent execution + expected_results = [f"Task {i} complete" for i in range(5)] + + with patch("asyncio.gather") as mock_gather: + mock_gather.return_value = expected_results + + # Test that multiple tasks can be handled + tasks = [async_task(i) for i in range(5)] + results = mock_gather(*tasks) + + assert len(results) == 5 + assert all("complete" in r for r in results) + mock_gather.assert_called_once() + + +@pytest.mark.integration +class TestErrorPropagationAndHandling: + """Test error propagation and handling across the system.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.error") + def test_rendering_error_propagation(self, mock_error, mock_state): + """Test that rendering errors are properly propagated.""" + + mock_state.tools = {} + + # Create message that will cause error + messages = [ + { + "role": "assistant", + "content": [{"type": "tool_result", "tool_use_id": "nonexistent"}], + } + ] + + renderer = MessageRenderer(mock_state) + with patch("streamlit.chat_message"): + # Should handle missing tool gracefully + 
renderer.render_conversation_history(messages) + + # Error should not be called for missing tool (handled gracefully) + mock_error.assert_not_called() + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_initialization_error_recovery(self, mock_state): + """Test recovery from initialization errors.""" + + mock_state.__contains__.return_value = False + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + + # First call fails + with patch("browser_use_demo.tools.BrowserTool") as mock_browser: + mock_browser.side_effect = [Exception("Init failed"), MagicMock()] + + # First attempt should fail + with pytest.raises(Exception, match="Init failed"): + setup_state() + + # Second attempt should succeed + setup_state() + assert mock_browser.call_count == 2 + + +@pytest.mark.integration +class TestCompleteWorkflow: + """Test complete workflow from initialization to rendering.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.chat_input") + @patch("streamlit.chat_message") + @patch("streamlit.markdown") + @patch("browser_use_demo.tools.BrowserTool") + @patch("browser_use_demo.streamlit.run_agent", new_callable=AsyncMock) + def test_complete_user_interaction_flow( + self, + mock_run_agent, + mock_browser_tool, + mock_markdown, + mock_chat_message, + mock_chat_input, + mock_state, + ): + """Test complete flow from user input to message rendering.""" + + # Setup initial state + mock_state.__contains__.return_value = False + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + setup_state() + + # Simulate user input + user_input = "Browse to example.com" + mock_chat_input.return_value = user_input + + # Setup event loop + with patch("asyncio.new_event_loop") as mock_new_loop: + mock_loop = MagicMock() + mock_loop.is_closed.return_value = False + mock_loop.run_until_complete = MagicMock() + mock_new_loop.return_value = 
mock_loop + + with patch("asyncio.set_event_loop"): + loop = get_or_create_event_loop() + + # Simulate agent response + mock_run_agent.return_value = None + + # Mock chat message context + mock_chat_message.return_value.__enter__ = Mock() + mock_chat_message.return_value.__exit__ = Mock() + + # Simulate the workflow + # User provides input + if user_input: + loop.run_until_complete(mock_run_agent(user_input)) + + mock_loop.run_until_complete.assert_called_once() + mock_run_agent.assert_called_once_with(user_input) + + +@pytest.mark.integration +class TestPerformanceAndScalability: + """Test performance with large datasets and edge cases.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.chat_message") + @patch("streamlit.markdown") + def test_large_conversation_history( + self, mock_markdown, mock_chat_message, mock_state + ): + """Test rendering very large conversation history.""" + + # Create large conversation (1000 messages) + large_conversation = [] + for i in range(1000): + role = "user" if i % 2 == 0 else "assistant" + large_conversation.append({"role": role, "content": f"Message {i}"}) + + mock_state.tools = {} + mock_chat_message.return_value.__enter__ = Mock() + mock_chat_message.return_value.__exit__ = Mock() + + renderer = MessageRenderer(mock_state) + + # Should handle large conversation without issues + renderer.render_conversation_history(large_conversation) + + # Verify all messages were processed + assert mock_markdown.call_count == 1000 + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_deeply_nested_content_performance(self, mock_state): + """Test performance with deeply nested content structures.""" + + # Create deeply nested structure + content = {"type": "text", "text": "Base"} + for i in range(100): + content = {"type": "wrapper", "content": [content], "depth": i} + + messages = [{"role": "user", "content": [content]}] + + mock_state.tools = {} + + with patch("streamlit.chat_message"), 
patch("streamlit.write"): + renderer = MessageRenderer(mock_state) + + # Should handle deep nesting without stack overflow + renderer.render_conversation_history(messages) diff --git a/browser-use-demo/tests/test_message_renderer.py b/browser-use-demo/tests/test_message_renderer.py new file mode 100644 index 000000000..a52489084 --- /dev/null +++ b/browser-use-demo/tests/test_message_renderer.py @@ -0,0 +1,385 @@ +"""Tests for MessageRenderer class with comprehensive edge case coverage.""" + +from unittest.mock import MagicMock, Mock, patch + +import pytest +from browser_use_demo.message_renderer import MessageRenderer, Sender +from browser_use_demo.tools import ToolResult + + +class TestMessageRenderer: + """Test suite for MessageRenderer class.""" + + def test_initialization(self, mock_streamlit): + """Test MessageRenderer initialization.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + assert renderer.session_state == mock_streamlit["session_state"] + + def test_initialization_with_none_state(self): + """Test initialization with None session state.""" + renderer = MessageRenderer(None) + assert renderer.session_state is None + + def test_initialization_with_empty_state(self): + """Test initialization with empty session state.""" + empty_state = MagicMock() + renderer = MessageRenderer(empty_state) + assert renderer.session_state == empty_state + + +class TestRenderMethod: + """Test the main render method with various inputs.""" + + def test_render_string_message(self, mock_streamlit): + """Test rendering a simple string message.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render(Sender.USER, "Hello world") + + mock_streamlit["chat_message"].assert_called_with(Sender.USER) + mock_streamlit["markdown"].assert_called_with("Hello world") + + def test_render_empty_string(self, mock_streamlit): + """Test rendering an empty string (should skip).""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + 
renderer.render(Sender.USER, "") + + mock_streamlit["chat_message"].assert_not_called() + + def test_render_none_message(self, mock_streamlit): + """Test rendering None message (should skip).""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render(Sender.BOT, None) + + mock_streamlit["chat_message"].assert_not_called() + + def test_render_tool_result_with_output(self, mock_streamlit, sample_tool_result): + """Test rendering ToolResult with output.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render(Sender.TOOL, sample_tool_result["success"]) + + mock_streamlit["markdown"].assert_called_with("Success message") + + def test_render_tool_result_with_error(self, mock_streamlit, sample_tool_result): + """Test rendering ToolResult with error.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render(Sender.TOOL, sample_tool_result["error"]) + + mock_streamlit["error"].assert_called_with("Error message") + + def test_render_tool_result_with_image(self, mock_streamlit, sample_tool_result): + """Test rendering ToolResult with image.""" + mock_streamlit["session_state"].hide_screenshots = False + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render(Sender.TOOL, sample_tool_result["with_image"]) + + mock_streamlit["markdown"].assert_called_with("With screenshot") + # Image should be decoded and displayed + assert mock_streamlit["image"].called + + def test_render_tool_result_with_hidden_screenshots( + self, mock_streamlit, sample_tool_result + ): + """Test that images are hidden when hide_screenshots is True.""" + mock_streamlit["session_state"].hide_screenshots = True + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render(Sender.TOOL, sample_tool_result["with_image"]) + + # Should render text but not image + mock_streamlit["markdown"].assert_called_with("With screenshot") + mock_streamlit["image"].assert_not_called() + + def 
test_render_dict_message_text_type(self, mock_streamlit): + """Test rendering dictionary message with text type.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + message = {"type": "text", "text": "Hello from dict"} + renderer.render(Sender.USER, message) + + mock_streamlit["write"].assert_called_with("Hello from dict") + + def test_render_dict_message_tool_use_type(self, mock_streamlit): + """Test rendering dictionary message with tool_use type.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + message = { + "type": "tool_use", + "name": "browser_tool", + "input": {"url": "example.com"}, + } + renderer.render(Sender.BOT, message) + + expected_code = "Tool Use: browser_tool\nInput: {'url': 'example.com'}" + mock_streamlit["code"].assert_called_with(expected_code) + + def test_render_dict_message_unknown_type(self, mock_streamlit): + """Test rendering dictionary message with unknown type.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + message = {"type": "unknown", "data": "some data"} + renderer.render(Sender.BOT, message) + + # Should fall back to generic write + mock_streamlit["write"].assert_called_with(message) + + def test_render_very_long_message(self, mock_streamlit): + """Test rendering extremely long messages.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + long_message = "x" * 100000 # 100k characters + renderer.render(Sender.USER, long_message) + + mock_streamlit["markdown"].assert_called_with(long_message) + + def test_render_unicode_special_chars(self, mock_streamlit): + """Test rendering messages with unicode and special characters.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + special_message = "Hello 世界 🌍 \n\t\r ñáéíóú" + renderer.render(Sender.USER, special_message) + + mock_streamlit["markdown"].assert_called_with(special_message) + + +class TestConversationHistory: + """Test render_conversation_history method with various scenarios.""" + + def 
test_render_empty_history(self, mock_streamlit): + """Test rendering empty conversation history.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render_conversation_history([]) + + # No rendering should occur + mock_streamlit["chat_message"].assert_not_called() + + def test_render_single_message(self, mock_streamlit): + """Test rendering single message in history.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [{"role": "user", "content": "Hello"}] + renderer.render_conversation_history(messages) + + mock_streamlit["markdown"].assert_called_with("Hello") + + def test_render_multiple_messages(self, mock_streamlit, sample_messages): + """Test rendering multiple messages with different roles.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + renderer.render_conversation_history(sample_messages[:2]) + + # Should render both messages + assert mock_streamlit["markdown"].call_count >= 2 + + def test_render_unknown_role(self, mock_streamlit): + """Test handling messages with unknown roles.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [{"role": "unknown_role", "content": "Test"}] + renderer.render_conversation_history(messages) + + # Should not crash, but won't render + mock_streamlit["markdown"].assert_not_called() + + def test_render_missing_content_field(self, mock_streamlit): + """Test handling messages missing content field.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [{"role": "user"}] # Missing content + + # Should not crash - will get KeyError but handler should manage it gracefully + try: + renderer.render_conversation_history(messages) + except KeyError: + pass # Expected when content field is missing + + def test_render_none_content(self, mock_streamlit): + """Test handling messages with None content.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [{"role": "user", "content": None}] + 
renderer.render_conversation_history(messages) + + # Should handle gracefully without rendering + mock_streamlit["markdown"].assert_not_called() + + def test_render_list_content(self, mock_streamlit): + """Test rendering messages with list content.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "First"}, + {"type": "text", "text": "Second"}, + ], + } + ] + renderer.render_conversation_history(messages) + + # Should render both text blocks + calls = mock_streamlit["markdown"].call_args_list + assert any("First" in str(call) for call in calls) + assert any("Second" in str(call) for call in calls) + + def test_skip_image_blocks_in_history(self, mock_streamlit): + """Test that image blocks are skipped in conversation history.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Text message"}, + {"type": "image", "source": "data:image/png;base64,abc"}, + ], + } + ] + renderer.render_conversation_history(messages) + + # Should only render text, not image + mock_streamlit["markdown"].assert_called_with("Text message") + mock_streamlit["image"].assert_not_called() + + def test_tool_result_in_assistant_message(self, mock_streamlit, sample_tool_result): + """Test rendering tool results from assistant messages.""" + mock_streamlit["session_state"].tools = { + "tool_123": sample_tool_result["success"] + } + renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [ + { + "role": "assistant", + "content": [{"type": "tool_result", "tool_use_id": "tool_123"}], + } + ] + renderer.render_conversation_history(messages) + + # Should render the tool result from session state + mock_streamlit["markdown"].assert_called_with("Success message") + + def test_missing_tool_in_session_state(self, mock_streamlit): + """Test handling tool_use_id that doesn't exist in session state.""" + 
renderer = MessageRenderer(mock_streamlit["session_state"]) + messages = [ + { + "role": "assistant", + "content": [{"type": "tool_result", "tool_use_id": "nonexistent"}], + } + ] + renderer.render_conversation_history(messages) + + # Should handle gracefully without crashing + mock_streamlit["markdown"].assert_not_called() + + +class TestEdgeCases: + """Test edge cases and error conditions.""" + + def test_circular_reference_handling(self, mock_streamlit): + """Test handling circular references in messages.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # Create circular reference + content = [] + content.append({"type": "text", "text": "Normal", "ref": content}) + messages = [{"role": "user", "content": content}] + + # Should not crash or infinite loop + renderer.render_conversation_history(messages) + + def test_malformed_tool_result(self, mock_streamlit): + """Test handling malformed ToolResult objects.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # Create a mock that doesn't have expected attributes + malformed = Mock(spec=[]) # No attributes + renderer.render(Sender.TOOL, malformed) + + # Should handle gracefully + mock_streamlit["markdown"].assert_not_called() + + def test_exception_in_rendering(self, mock_streamlit): + """Test that exceptions during rendering are propagated.""" + # Setup the chat_message context manager properly + mock_chat_cm = MagicMock() + mock_chat_cm.__enter__ = Mock(return_value=None) + mock_chat_cm.__exit__ = Mock(return_value=None) + mock_streamlit["chat_message"].return_value = mock_chat_cm + + # Set markdown to raise an exception + mock_streamlit["markdown"].side_effect = Exception("Render error") + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # Should let the exception propagate for markdown rendering + with pytest.raises(Exception, match="Render error"): + renderer.render(Sender.USER, "Test message") + + def test_normalize_content_with_various_inputs(self, 
mock_streamlit): + """Test _normalize_content with various input types.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # String input + assert renderer._normalize_content("test") == ["test"] + + # List input + assert renderer._normalize_content([1, 2, 3]) == [1, 2, 3] + + # None input + assert renderer._normalize_content(None) == [None] + + # Dict input + assert renderer._normalize_content({"key": "value"}) == [{"key": "value"}] + + def test_deeply_nested_content(self, mock_streamlit): + """Test handling deeply nested content structures.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # Create deeply nested structure + nested = {"type": "text", "text": "Deep"} + for _ in range(100): + nested = {"type": "wrapper", "content": nested} + + messages = [{"role": "user", "content": [nested]}] + renderer.render_conversation_history(messages) + + # Should handle without stack overflow + + def test_concurrent_modification(self, mock_streamlit): + """Test behavior when session state is modified during rendering.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # Setup mock properly for context manager + mock_cm = MagicMock() + mock_cm.__enter__ = Mock(return_value=None) + mock_cm.__exit__ = Mock(return_value=None) + + # Simulate modification during rendering + def modify_state(*args, **kwargs): + mock_streamlit["session_state"].tools = {} + return mock_cm + + mock_streamlit["chat_message"].side_effect = modify_state + + # Should complete rendering despite modifications + renderer.render(Sender.USER, "Test") + + def test_invalid_sender_type(self, mock_streamlit): + """Test handling invalid sender types.""" + renderer = MessageRenderer(mock_streamlit["session_state"]) + + # Use invalid sender + renderer.render("invalid_sender", "Message") + + # Should still render with the provided sender + mock_streamlit["chat_message"].assert_called_with("invalid_sender") + + def test_base64_decode_error(self, mock_streamlit): + 
"""Test handling invalid base64 image data.""" + # Setup the chat_message context manager properly + mock_chat_cm = MagicMock() + mock_chat_cm.__enter__ = Mock(return_value=None) + mock_chat_cm.__exit__ = Mock(return_value=None) + mock_streamlit["chat_message"].return_value = mock_chat_cm + + # Setup session state to not hide screenshots + mock_streamlit["session_state"].hide_screenshots = False + + renderer = MessageRenderer(mock_streamlit["session_state"]) + tool_result = ToolResult( + output="With bad image", base64_image="invalid_base64_!@#$" + ) + + with patch("base64.b64decode") as mock_decode: + mock_decode.side_effect = Exception("Invalid base64") + + # Should handle the error gracefully - the exception should propagate + with pytest.raises(Exception, match="Invalid base64"): + renderer.render(Sender.TOOL, tool_result) diff --git a/browser-use-demo/tests/test_sampling_loop.py b/browser-use-demo/tests/test_sampling_loop.py new file mode 100644 index 000000000..5f82e81f6 --- /dev/null +++ b/browser-use-demo/tests/test_sampling_loop.py @@ -0,0 +1,512 @@ +""" +Comprehensive tests for the sampling loop and message handling. 
+""" + +import asyncio +from unittest.mock import AsyncMock, Mock, patch + +import pytest +from browser_use_demo.loop import APIProvider, sampling_loop +from browser_use_demo.message_handler import ( + MessageBuilder, + ResponseProcessor, +) +from browser_use_demo.tools import ToolResult + + +class TestResponseProcessor: + """Test the ResponseProcessor class.""" + + def test_process_response_text_only(self): + """Test processing a response with only text content.""" + mock_response = Mock() + mock_response.content = [ + Mock(type="text", text="This is a text response") + ] + + processor = ResponseProcessor() + result = processor.process_response(mock_response) + + assert result.has_text is True + assert result.has_tools is False + assert len(result.assistant_content) == 1 + assert result.assistant_content[0]["type"] == "text" + assert result.assistant_content[0]["text"] == "This is a text response" + assert len(result.tool_uses) == 0 + + def test_process_response_tool_only(self): + """Test processing a response with only tool use.""" + mock_response = Mock() + mock_tool_use = Mock( + type="tool_use", + id="tool_123", + name="browser", + input={"action": "screenshot"} + ) + mock_response.content = [mock_tool_use] + + processor = ResponseProcessor() + result = processor.process_response(mock_response) + + assert result.has_text is False + assert result.has_tools is True + assert len(result.assistant_content) == 1 + assert result.assistant_content[0]["type"] == "tool_use" + assert len(result.tool_uses) == 1 + + def test_process_response_mixed_content(self): + """Test processing a response with both text and tool use.""" + mock_response = Mock() + mock_response.content = [ + Mock(type="text", text="Let me take a screenshot"), + Mock( + type="tool_use", + id="tool_456", + name="browser", + input={"action": "screenshot"} + ) + ] + + processor = ResponseProcessor() + result = processor.process_response(mock_response) + + assert result.has_text is True + assert 
result.has_tools is True + assert len(result.assistant_content) == 2 + assert result.assistant_content[0]["type"] == "text" + assert result.assistant_content[0]["text"] == "Let me take a screenshot" + assert result.assistant_content[1]["type"] == "tool_use" + assert len(result.tool_uses) == 1 + + def test_execute_tools_success(self): + """Test successful tool execution.""" + + async def run_test(): + mock_tool = AsyncMock(return_value=ToolResult(output="Tool executed")) + mock_collection = Mock() + mock_collection.tool_map = {"browser": mock_tool} + + tool_uses = [ + { + "type": "tool_use", + "id": "tool_789", + "name": "browser", + "input": {"action": "screenshot"} + } + ] + + processor = ResponseProcessor() + results = await processor.execute_tools(tool_uses, mock_collection) + + assert len(results) == 1 + assert results[0]["type"] == "tool_result" + assert results[0]["tool_use_id"] == "tool_789" + assert any( + block.get("text") == "Tool executed" + for block in results[0]["content"] + ) + + asyncio.run(run_test()) + + def test_execute_tools_with_error(self): + """Test tool execution with error.""" + + async def run_test(): + mock_tool = AsyncMock(side_effect=Exception("Tool failed")) + mock_collection = Mock() + mock_collection.tool_map = {"browser": mock_tool} + + tool_uses = [ + { + "type": "tool_use", + "id": "tool_error", + "name": "browser", + "input": {"action": "invalid"} + } + ] + + processor = ResponseProcessor() + results = await processor.execute_tools(tool_uses, mock_collection) + + assert len(results) == 1 + assert results[0]["type"] == "tool_result" + assert results[0]["is_error"] is True + assert any( + "Tool failed" in block.get("text", "") + for block in results[0]["content"] + ) + + asyncio.run(run_test()) + + def test_build_tool_result_with_image(self): + """Test building tool result with base64 image.""" + result = ToolResult(base64_image="base64_data_here") + processor = ResponseProcessor() + + tool_result = 
processor._build_tool_result(result, "tool_img") + + assert tool_result["type"] == "tool_result" + assert tool_result["tool_use_id"] == "tool_img" + assert any( + block.get("type") == "image" + for block in tool_result["content"] + ) + + def test_build_tool_result_with_text_extraction_markers(self): + """Test handling of text extraction markers in tool results.""" + result = ToolResult( + output="__PAGE_EXTRACTED__\nSome content\n__FULL_CONTENT__\nThe actual content" + ) + processor = ResponseProcessor() + + tool_result = processor._build_tool_result(result, "tool_extract") + + assert any( + block.get("text") == "The actual content" + for block in tool_result["content"] + ) + + +class TestMessageBuilder: + """Test the MessageBuilder class.""" + + def test_add_assistant_message(self): + """Test adding an assistant message.""" + messages = [] + content = [ + {"type": "text", "text": "Hello"}, + {"type": "tool_use", "id": "123", "name": "test", "input": {}} + ] + + builder = MessageBuilder() + builder.add_assistant_message(messages, content) + + assert len(messages) == 1 + assert messages[0]["role"] == "assistant" + assert messages[0]["content"] == content + + def test_add_assistant_message_empty_content(self): + """Test that empty content is not added.""" + messages = [] + content = [] + + builder = MessageBuilder() + builder.add_assistant_message(messages, content) + + assert len(messages) == 0 + + def test_add_tool_results(self): + """Test adding tool results.""" + messages = [] + tool_results = [ + { + "type": "tool_result", + "tool_use_id": "123", + "content": [{"type": "text", "text": "Result"}] + } + ] + + builder = MessageBuilder() + builder.add_tool_results(messages, tool_results) + + assert len(messages) == 1 + assert messages[0]["role"] == "user" + assert messages[0]["content"] == tool_results + + def test_add_tool_results_empty(self): + """Test that empty tool results are not added.""" + messages = [] + tool_results = [] + + builder = MessageBuilder() + 
builder.add_tool_results(messages, tool_results) + + assert len(messages) == 0 + + def test_ensure_message_integrity_valid(self): + """Test message integrity validation with valid messages.""" + messages = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": [{"type": "text", "text": "Hi"}]}, + ] + + builder = MessageBuilder() + assert builder.ensure_message_integrity(messages) is True + + def test_ensure_message_integrity_missing_role(self): + """Test message integrity with missing role.""" + messages = [ + {"content": "Hello"}, + ] + + builder = MessageBuilder() + assert builder.ensure_message_integrity(messages) is False + + def test_ensure_message_integrity_missing_content(self): + """Test message integrity with missing content.""" + messages = [ + {"role": "user"}, + ] + + builder = MessageBuilder() + assert builder.ensure_message_integrity(messages) is False + + def test_ensure_message_integrity_empty_list_content(self): + """Test message integrity with empty content list.""" + messages = [ + {"role": "user", "content": []}, + ] + + builder = MessageBuilder() + assert builder.ensure_message_integrity(messages) is False + + def test_extract_text_from_message(self): + """Test extracting text from assistant message.""" + message = { + "role": "assistant", + "content": [ + {"type": "text", "text": "First part"}, + {"type": "tool_use", "id": "123", "name": "test", "input": {}}, + {"type": "text", "text": "Second part"} + ] + } + + builder = MessageBuilder() + text = builder.extract_text_from_message(message) + + assert text == "First part Second part" + + def test_extract_text_from_message_no_text(self): + """Test extracting text when there's no text content.""" + message = { + "role": "assistant", + "content": [ + {"type": "tool_use", "id": "123", "name": "test", "input": {}} + ] + } + + builder = MessageBuilder() + text = builder.extract_text_from_message(message) + + assert text is None + + def test_extract_text_from_user_message(self): 
+ """Test that text extraction returns None for non-assistant messages.""" + message = { + "role": "user", + "content": "User message" + } + + builder = MessageBuilder() + text = builder.extract_text_from_message(message) + + assert text is None + + +@pytest.mark.integration +class TestSamplingLoopIntegration: + """Integration tests for the sampling loop.""" + + @patch("browser_use_demo.loop.Anthropic") + def test_sampling_loop_preserves_text_with_tools(self, mock_anthropic): + """Test that text is preserved when tools are used.""" + + async def run_test(): + mock_client = Mock() + mock_anthropic.return_value = mock_client + + mock_response = Mock() + mock_response.content = [ + Mock(type="text", text="I'll help you with that"), + Mock( + type="tool_use", + id="tool_001", + name="browser", + input={"action": "screenshot"} + ) + ] + + mock_client.beta.messages.create = Mock(return_value=mock_response) + + mock_browser = AsyncMock() + mock_browser.return_value = ToolResult(output="Screenshot taken") + + messages = [{"role": "user", "content": "Take a screenshot"}] + output_messages = [] + tool_outputs = {} + + def output_callback(content): + output_messages.append(content) + + def tool_output_callback(result, tool_id): + tool_outputs[tool_id] = result + + updated_messages = await sampling_loop( + model="claude-sonnet-4-5", + provider=APIProvider.ANTHROPIC, + system_prompt_suffix="", + messages=messages, + output_callback=output_callback, + tool_output_callback=tool_output_callback, + api_response_callback=lambda *args: None, + api_key="test_key", + browser_tool=mock_browser + ) + + api_call_args = mock_client.beta.messages.create.call_args[1] + assert api_call_args["tool_choice"] == {"type": "auto"} + + assert len(output_messages) >= 2 + assert any( + msg.get("type") == "text" and "help you" in msg.get("text", "") + for msg in output_messages + ) + assert any(msg.get("type") == "tool_use" for msg in output_messages) + + assistant_msgs = [m for m in updated_messages 
if m["role"] == "assistant"] + assert len(assistant_msgs) > 0 + + last_assistant = assistant_msgs[-1] + assert isinstance(last_assistant["content"], list) + + has_text = any( + block.get("type") == "text" + for block in last_assistant["content"] + ) + has_tool = any( + block.get("type") == "tool_use" + for block in last_assistant["content"] + ) + + assert has_text and has_tool, "Assistant message should contain both text and tool use" + + asyncio.run(run_test()) + + @patch("browser_use_demo.loop.Anthropic") + def test_sampling_loop_text_only_response(self, mock_anthropic): + """Test handling of text-only responses.""" + + async def run_test(): + mock_client = Mock() + mock_anthropic.return_value = mock_client + + mock_response = Mock() + mock_response.content = [ + Mock(type="text", text="This is just a text response") + ] + + mock_client.beta.messages.create = Mock(return_value=mock_response) + + messages = [{"role": "user", "content": "Hello"}] + + updated_messages = await sampling_loop( + model="claude-sonnet-4-5", + provider=APIProvider.ANTHROPIC, + system_prompt_suffix="", + messages=messages, + output_callback=lambda x: None, + tool_output_callback=lambda r, i: None, + api_response_callback=lambda *args: None, + api_key="test_key" + ) + + assert len(updated_messages) == 2 + assert updated_messages[-1]["role"] == "assistant" + assert any( + block.get("text") == "This is just a text response" + for block in updated_messages[-1]["content"] + ) + + asyncio.run(run_test()) + + @patch("browser_use_demo.loop.Anthropic") + def test_sampling_loop_multiple_tools_with_text(self, mock_anthropic): + """Test handling of multiple tool uses with text.""" + + async def run_test(): + mock_client = Mock() + mock_anthropic.return_value = mock_client + + mock_response = Mock() + mock_response.content = [ + Mock(type="text", text="I'll perform multiple actions"), + Mock( + type="tool_use", + id="tool_001", + name="browser", + input={"action": "screenshot"} + ), + Mock(type="text", 
text="Now navigating"), + Mock( + type="tool_use", + id="tool_002", + name="browser", + input={"action": "navigate", "url": "example.com"} + ) + ] + + mock_client.beta.messages.create = Mock(return_value=mock_response) + + mock_browser = AsyncMock() + mock_browser.return_value = ToolResult(output="Action completed") + + messages = [{"role": "user", "content": "Do multiple things"}] + + updated_messages = await sampling_loop( + model="claude-sonnet-4-5", + provider=APIProvider.ANTHROPIC, + system_prompt_suffix="", + messages=messages, + output_callback=lambda x: None, + tool_output_callback=lambda r, i: None, + api_response_callback=lambda *args: None, + api_key="test_key", + browser_tool=mock_browser + ) + + assistant_msgs = [m for m in updated_messages if m["role"] == "assistant"] + last_assistant = assistant_msgs[-1] + + text_blocks = [ + block for block in last_assistant["content"] + if block.get("type") == "text" + ] + tool_blocks = [ + block for block in last_assistant["content"] + if block.get("type") == "tool_use" + ] + + assert len(text_blocks) == 2 + assert len(tool_blocks) == 2 + + asyncio.run(run_test()) + + @patch("browser_use_demo.loop.Anthropic") + def test_tool_choice_parameter_set(self, mock_anthropic): + """Test that tool_choice is explicitly set to auto.""" + + async def run_test(): + mock_client = Mock() + mock_anthropic.return_value = mock_client + + mock_response = Mock() + mock_response.content = [Mock(type="text", text="Response")] + + mock_client.beta.messages.create = Mock(return_value=mock_response) + + await sampling_loop( + model="claude-sonnet-4-5", + provider=APIProvider.ANTHROPIC, + system_prompt_suffix="", + messages=[{"role": "user", "content": "Test"}], + output_callback=lambda x: None, + tool_output_callback=lambda r, i: None, + api_response_callback=lambda *args: None, + api_key="test_key" + ) + + call_args = mock_client.beta.messages.create.call_args[1] + assert "tool_choice" in call_args + assert call_args["tool_choice"] == 
{"type": "auto"} + + asyncio.run(run_test()) diff --git a/browser-use-demo/tests/test_streamlit_helpers.py b/browser-use-demo/tests/test_streamlit_helpers.py new file mode 100644 index 000000000..4e7a33d36 --- /dev/null +++ b/browser-use-demo/tests/test_streamlit_helpers.py @@ -0,0 +1,325 @@ +"""Tests for Streamlit helper functions with edge case coverage.""" + +import asyncio +from unittest.mock import MagicMock, Mock, patch + +import pytest +from browser_use_demo.loop import APIProvider +from browser_use_demo.streamlit import ( + authenticate, + get_or_create_event_loop, + setup_state, +) + + +class TestSetupState: + """Test suite for setup_state function.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_fresh_initialization(self, mock_state, mock_environment): + """Test setup_state with completely empty session state.""" + # Simulate empty session state + mock_state.__contains__.return_value = False + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + + with patch("browser_use_demo.tools.BrowserTool") as mock_browser: + setup_state() + + # Check all defaults were set + assert "messages" in mock_state.__setitem__.call_args_list[0][0] + assert "api_key" in str(mock_state.__setitem__.call_args_list) + assert "event_loop" in str(mock_state.__setitem__.call_args_list) + + # Browser tool should be created + mock_browser.assert_called_once() + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_partial_initialization(self, mock_state): + """Test setup_state when some keys already exist.""" + + # Simulate partial state + existing_keys = ["messages", "api_key"] + + def contains_side_effect(key): + return key in existing_keys + + mock_state.__contains__.side_effect = contains_side_effect + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + + with patch("browser_use_demo.tools.BrowserTool"): + 
setup_state() + + # Only missing keys should be set + set_keys = [call[0][0] for call in mock_state.__setitem__.call_args_list] + assert "messages" not in set_keys + assert "api_key" not in set_keys + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_missing_env_variables(self, mock_state, clean_environment): + """Test setup_state when environment variables are missing.""" + + mock_state.__contains__.return_value = False + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + + with patch("browser_use_demo.tools.BrowserTool") as mock_browser: + setup_state() + + # BrowserTool no longer takes dimensions as arguments + mock_browser.assert_called_with() + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_lambda_evaluation(self, mock_state, mock_provider): + """Test that lambda functions are evaluated correctly.""" + + mock_state.__contains__.return_value = False + mock_state.provider = mock_provider.ANTHROPIC + + setup_state() + + # Model should be set based on provider + model_calls = [ + call + for call in mock_state.__setitem__.call_args_list + if call[0][0] == "model" + ] + assert len(model_calls) > 0 + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_browser_tool_error(self, mock_state): + """Test setup_state when BrowserTool initialization fails.""" + + mock_state.__contains__.return_value = False + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + + with patch("browser_use_demo.tools.BrowserTool") as mock_browser: + mock_browser.side_effect = Exception("Browser init failed") + + # Should raise the exception + with pytest.raises(Exception, match="Browser init failed"): + setup_state() + + # Test removed - BrowserTool no longer reads dimensions from environment + + +class TestGetOrCreateEventLoop: + """Test suite for get_or_create_event_loop function.""" + 
+ @patch("streamlit.session_state", new_callable=MagicMock) + @patch("asyncio.new_event_loop") + @patch("asyncio.set_event_loop") + def test_create_new_loop_when_none(self, mock_set_loop, mock_new_loop, mock_state): + """Test creating new event loop when none exists.""" + + mock_state.event_loop = None + new_loop = Mock() + mock_new_loop.return_value = new_loop + + result = get_or_create_event_loop() + + mock_new_loop.assert_called_once() + mock_set_loop.assert_called_once_with(new_loop) + assert mock_state.event_loop == new_loop + assert result == new_loop + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("asyncio.new_event_loop") + @patch("asyncio.set_event_loop") + def test_create_new_loop_when_closed( + self, mock_set_loop, mock_new_loop, mock_state + ): + """Test creating new event loop when existing is closed.""" + + closed_loop = Mock() + closed_loop.is_closed.return_value = True + mock_state.event_loop = closed_loop + + new_loop = Mock() + mock_new_loop.return_value = new_loop + + result = get_or_create_event_loop() + + mock_new_loop.assert_called_once() + mock_set_loop.assert_called_once_with(new_loop) + assert mock_state.event_loop == new_loop + assert result == new_loop + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("asyncio.new_event_loop") + @patch("asyncio.set_event_loop") + def test_reuse_existing_open_loop(self, mock_set_loop, mock_new_loop, mock_state): + """Test reusing existing open event loop.""" + + existing_loop = Mock() + existing_loop.is_closed.return_value = False + mock_state.event_loop = existing_loop + + result = get_or_create_event_loop() + + mock_new_loop.assert_not_called() + mock_set_loop.assert_called_once_with(existing_loop) + assert result == existing_loop + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("asyncio.new_event_loop") + def test_event_loop_creation_error(self, mock_new_loop, mock_state): + """Test handling error during event loop creation.""" + + 
mock_state.event_loop = None + mock_new_loop.side_effect = RuntimeError("Cannot create loop") + + with pytest.raises(RuntimeError, match="Cannot create loop"): + get_or_create_event_loop() + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("asyncio.set_event_loop") + def test_set_event_loop_error(self, mock_set_loop, mock_state): + """Test handling error when setting event loop.""" + + mock_state.event_loop = None + mock_set_loop.side_effect = RuntimeError("Cannot set loop") + + with pytest.raises(RuntimeError, match="Cannot set loop"): + get_or_create_event_loop() + + +class TestAuthenticate: + """Test suite for authenticate function.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.error") + @patch("streamlit.stop") + def test_authenticate_with_valid_key( + self, mock_stop, mock_error, mock_state, mock_provider + ): + """Test authenticate with valid API key.""" + + mock_state.provider = mock_provider.ANTHROPIC + mock_state.api_key = "valid-key" + + result = authenticate() + + assert result is True + mock_error.assert_not_called() + mock_stop.assert_not_called() + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.error") + @patch("streamlit.stop") + def test_authenticate_with_missing_key( + self, mock_stop, mock_error, mock_state, mock_provider + ): + """Test authenticate with missing API key.""" + + mock_state.provider = mock_provider.ANTHROPIC + mock_state.api_key = "" + + authenticate() + + mock_error.assert_called_once_with( + "Please provide your Anthropic API key in the sidebar" + ) + mock_stop.assert_called_once() + # Function doesn't return after stop() in real scenario + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("streamlit.error") + @patch("streamlit.stop") + def test_authenticate_with_none_key( + self, mock_stop, mock_error, mock_state, mock_provider + ): + """Test authenticate with None API key.""" + + mock_state.provider = 
mock_provider.ANTHROPIC + mock_state.api_key = None + + authenticate() + + mock_error.assert_called_once() + mock_stop.assert_called_once() + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_authenticate_non_anthropic_provider(self, mock_state, mock_provider): + """Test authenticate with non-Anthropic provider.""" + + mock_state.provider = mock_provider.BEDROCK + mock_state.api_key = "" # Empty key should be OK for non-Anthropic + + result = authenticate() + + assert result is True + + +class TestEdgeCasesAndErrors: + """Test edge cases and error conditions for helper functions.""" + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_with_corrupted_state(self, mock_state): + """Test setup_state with corrupted session state.""" + + # Simulate corrupted state that raises on access + mock_state.__contains__.side_effect = Exception("State corrupted") + + with pytest.raises(Exception, match="State corrupted"): + setup_state() + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_concurrent_setup_state_calls(self, mock_state): + """Test concurrent calls to setup_state.""" + import threading + + mock_state.__contains__.return_value = False + # Set provider to valid enum value so lambda can access it + mock_state.provider = APIProvider.ANTHROPIC + errors = [] + + def run_setup(): + try: + with patch("browser_use_demo.tools.BrowserTool"): + setup_state() + except Exception as e: + errors.append(e) + + threads = [threading.Thread(target=run_setup) for _ in range(5)] + for t in threads: + t.start() + for t in threads: + t.join() + + # Should handle concurrent access without crashes + assert len(errors) == 0 + + @patch("streamlit.session_state", new_callable=MagicMock) + @patch("asyncio.get_event_loop") + def test_get_or_create_with_running_loop(self, mock_get_loop, mock_state): + """Test get_or_create_event_loop when another loop is running.""" + + mock_state.event_loop = None + running_loop = 
Mock(spec=asyncio.AbstractEventLoop) + running_loop.is_running.return_value = True + mock_get_loop.return_value = running_loop + + # Should create new loop despite running loop exists + with patch("asyncio.new_event_loop") as mock_new: + new_loop = Mock(spec=asyncio.AbstractEventLoop) + new_loop.is_closed.return_value = False + mock_new.return_value = new_loop + + with patch("asyncio.set_event_loop"): + result = get_or_create_event_loop() + assert result == new_loop + + @patch("streamlit.session_state", new_callable=MagicMock) + def test_setup_state_with_readonly_state(self, mock_state): + """Test setup_state when session state is read-only.""" + + mock_state.__contains__.return_value = False + mock_state.__setitem__.side_effect = AttributeError("Read-only state") + + with pytest.raises(AttributeError, match="Read-only state"): + setup_state() + + # Test removed - BrowserTool no longer reads dimensions from environment diff --git a/browser-use-demo/validate_env.py b/browser-use-demo/validate_env.py new file mode 100644 index 000000000..6ccffbb6c --- /dev/null +++ b/browser-use-demo/validate_env.py @@ -0,0 +1,53 @@ +#!/usr/bin/env python3 +"""Validate environment configuration before startup.""" + +import os +import sys +from pathlib import Path + +# Import constants for display information +try: + from browser_use_demo.display_constants import BROWSER_WIDTH, BROWSER_HEIGHT, DISPLAY_WIDTH, DISPLAY_HEIGHT +except ImportError: + # Fallback if running outside the package + DISPLAY_WIDTH = 1920 + DISPLAY_HEIGHT = 1080 + BROWSER_WIDTH = 1920 + BROWSER_HEIGHT = 1080 + + +def validate_env(): + """Validate required environment variables are set.""" + # Check API key + api_key = os.environ.get("ANTHROPIC_API_KEY") + + if not api_key: + print("\n" + "=" * 60) + print("ERROR: Missing required configuration!") + print("=" * 60) + print("\nThe Browser Use Demo requires proper configuration to run.") + print("\n🔧 RECOMMENDED: Use docker-compose with a .env file:") + print(" 1. 
Copy the example environment file:") + print(" cp .env.example .env") + print(" 2. Edit .env and add your Anthropic API key") + print(" 3. Run with docker-compose:") + print(" docker-compose up --build") + print("=" * 60) + sys.exit(1) + + if api_key == "your_anthropic_api_key_here" or len(api_key) < 10: + print("\n" + "=" * 60) + print("ERROR: Invalid API key!") + print("=" * 60) + print(" ANTHROPIC_API_KEY: Must be a valid API key") + print("\nTo fix this, please edit your .env file with a valid API key") + print("=" * 60) + sys.exit(1) + + print("\n✓ Environment validation passed") + print(f" Display: {DISPLAY_WIDTH}x{DISPLAY_HEIGHT}") + print(f" Browser: {BROWSER_WIDTH}x{BROWSER_HEIGHT}") + + +if __name__ == "__main__": + validate_env() diff --git a/computer-use-demo/CONTRIBUTING.md b/computer-use-demo/CONTRIBUTING.md new file mode 100644 index 000000000..63b8dfbca --- /dev/null +++ b/computer-use-demo/CONTRIBUTING.md @@ -0,0 +1,129 @@ +# Contributing Guidelines + +Thank you for your interest in contributing to the Computer Use Demo quickstart! This document outlines the process and guidelines for contributing to this project. + +## Code of Conduct + +Please be respectful and constructive in all interactions. We aim to maintain a welcoming and inclusive environment for all contributors. + +## Contribution Policy + +- Bugfixes and updates to our documentation that address correctness issues are always welcome +- Feature additions, refactors, and documentation updates beyond the scope of correctness (major changes) are accepted at the sole determination of the maintainers. We require each major change to be submitted in a separate Pull Request. 
We will assess new features under the following criteria: + - Adherence to coding standards + - Ease of use as a reference implementation + - User experience + - Applicability to a wide audience of developers + - Minimization of third-party dependencies + - Does not promote a product or service + +Please open a GitHub issue if you need clarification on this policy or want to discuss a new feature addition. + +## Development Setup + +1. Create and activate a Python virtual environment: + + ```bash + python -m venv .venv + source .venv/bin/activate # On Unix + # or + .venv\Scripts\activate # On Windows + ``` + +2. Install development dependencies: + + ```bash + pip install -r dev-requirements.txt + ``` + +3. Install pre-commit hooks: + ```bash + pre-commit install + ``` + +## Development Process + +1. Fork the repository and create a branch for your changes +2. Make your changes following our coding standards +3. Submit a pull request with a clear description of the changes + +## Coding Standards + +- Use clear, descriptive variable and function names +- Follow PEP 8 style guidelines for Python code +- Keep functions focused and single-purpose +- Avoid inline comments - code should be self-documenting +- Use type hints for all Python functions +- Use dataclasses for structured data (see `tools/base.py` for examples) +- All tools must inherit from `BaseAnthropicTool` and implement required methods +- Use abstract base classes (ABC) for defining interfaces +- Handle errors using `ToolError` and `ToolFailure` classes + +## Code Quality Tools + +We use several tools to maintain code quality: + +- **Ruff**: For linting and formatting + - Run `ruff check .` for linting + - Run `ruff format .` for formatting + - See `ruff.toml` for enabled rules +- **Pyright**: For type checking + - Configuration in `pyproject.toml` +- **Pre-commit**: For automated checks before commits + +## Testing + +- Add tests for new functionality in the `tests/` directory +- Follow existing
test patterns (see `tests/tools/` for examples) +- Use pytest fixtures where appropriate +- Run tests with: + ```bash + pytest + ``` +- Tests must pass in async mode (configured in pyproject.toml) + +## Commit Guidelines + +- All commits MUST be signed (use `git commit -S`) +- Write clear, descriptive commit messages +- Use present tense ("Add feature" not "Added feature") +- Reference issue numbers when applicable + +## Pull Request Process + +1. Update documentation as needed +2. Add tests for new functionality +3. Ensure all checks pass: + - All tests pass + - Ruff linting passes + - Type checking passes + - Pre-commit hooks pass +4. Request review from maintainers +5. Address review feedback + +## Tool Development + +When creating new tools: + +1. Inherit from `BaseAnthropicTool` +2. Implement `__call__` and `to_params` methods +3. Use appropriate result types (`ToolResult`, `CLIResult`, or `ToolFailure`) +4. Add comprehensive tests +5. Document parameters and return types + +## Documentation + +- Keep README.md up to date +- Document new features and changes +- Use clear, concise language +- Include docstrings for all public classes and methods +- Use concise, single-line docstrings for simple functions +- For complex functions, include: + - A brief description + - Args/parameters if not obvious + - Return value if not obvious + - Any important notes about behavior + +## Questions? + +If you have questions, please open an issue for discussion. 
diff --git a/computer-use-demo/Dockerfile b/computer-use-demo/Dockerfile index f3b00255a..8a0e9b472 100644 --- a/computer-use-demo/Dockerfile +++ b/computer-use-demo/Dockerfile @@ -6,7 +6,6 @@ ENV DEBIAN_PRIORITY=high RUN apt-get update && \ apt-get -y upgrade && \ apt-get -y install \ - build-essential \ # UI Requirements xvfb \ xterm \ diff --git a/computer-use-demo/README.md b/computer-use-demo/README.md index 2b896da88..6e05b255c 100644 --- a/computer-use-demo/README.md +++ b/computer-use-demo/README.md @@ -1,5 +1,8 @@ # Anthropic Computer Use Demo +> [!NOTE] +> Now featuring support for the new Claude 4 models! The latest Claude Opus 4.5 (claude-opus-4-5-20251101) is now available, along with Claude Sonnet 4.5 (claude-sonnet-4-5-20250929), Claude Sonnet 4 (claude-sonnet-4-20250514), Claude Opus 4 (claude-opus-4-20250514), and Claude Haiku 4.5 (claude-haiku-4-5-20251001). These models bring next-generation capabilities with the updated str_replace_based_edit_tool that replaces the previous str_replace_editor tool. The undo_edit command has been removed in this latest version for a more streamlined experience. + > [!CAUTION] > Computer use is a beta feature. Please be aware that computer use poses unique risks that are distinct from standard API features or chat interfaces. These risks are heightened when using computer use to interact with the internet. 
To minimize risks, consider taking precautions such as: > @@ -14,25 +17,25 @@ This repository helps you get started with computer use on Claude, with reference implementations of: -* Build files to create a Docker container with all necessary dependencies -* A computer use agent loop using the Anthropic API, Bedrock, or Vertex to access the updated Claude 3.5 Sonnet model -* Anthropic-defined computer use tools -* A streamlit app for interacting with the agent loop +- Build files to create a Docker container with all necessary dependencies +- A computer use agent loop using the Claude API, Bedrock, or Vertex to access Claude Opus 4.5, Claude Sonnet 4.5, Claude Sonnet 4, Claude Opus 4, Claude Haiku 4.5, Claude 3.7 Sonnet, and Claude 3.5 Sonnet models +- Anthropic-defined computer use tools +- A streamlit app for interacting with the agent loop Please use [this form](https://forms.gle/BT1hpBrqDPDUrCqo7) to provide feedback on the quality of the model responses, the API itself, or the quality of the documentation - we cannot wait to hear from you! > [!IMPORTANT] -> The Beta API used in this reference implementation is subject to change. Please refer to the [API release notes](https://docs.anthropic.com/en/release-notes/api) for the most up-to-date information. +> The Beta API used in this reference implementation is subject to change. Please refer to the [API release notes](https://docs.claude.com/en/release-notes/api) for the most up-to-date information. > [!IMPORTANT] > The components are weakly separated: the agent loop runs in the container being controlled by Claude, can only be used by one session at a time, and must be restarted or reset between sessions if necessary. ## Quickstart: running the Docker container -### Anthropic API +### Claude API > [!TIP] -> You can find your API key in the [Anthropic Console](https://console.anthropic.com/). +> You can find your API key in the [Claude Console](https://console.anthropic.com/). 
```bash export ANTHROPIC_API_KEY=%your_api_key% @@ -51,10 +54,9 @@ Once the container is running, see the [Accessing the demo app](#accessing-the-d ### Bedrock > [!TIP] -> To use the new Claude 3.5 Sonnet on Bedrock, you first need to [request model access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access-modify.html). +> To use the new Claude 3.7 Sonnet on Bedrock, you first need to [request model access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access-modify.html). You'll need to pass in AWS credentials with appropriate permissions to use Claude on Bedrock. - You have a few options for authenticating with Bedrock. See the [boto3 documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#environment-variables) for more details and options. #### Option 1: (suggested) Use the host's AWS credentials file and AWS profile @@ -122,7 +124,6 @@ docker run \ Once the container is running, see the [Accessing the demo app](#accessing-the-demo-app) section below for instructions on how to connect to the interface. This example shows how to use the Google Cloud Application Default Credentials to authenticate with Vertex. - You can also set `GOOGLE_APPLICATION_CREDENTIALS` to use an arbitrary credential file, see the [Google Cloud Authentication documentation](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) for more details. ### Accessing the demo app @@ -154,9 +155,14 @@ docker run \ -it ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest ``` -We do not recommend sending screenshots in resolutions above [XGA/WXGA](https://en.wikipedia.org/wiki/Display_resolution_standards#XGA) to avoid issues related to [image resizing](https://docs.anthropic.com/en/docs/build-with-claude/vision#evaluate-image-size). 
+We do not recommend sending screenshots in resolutions above [XGA/WXGA](https://en.wikipedia.org/wiki/Display_resolution_standards#XGA) to avoid issues related to [image resizing](https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size). Relying on the image resizing behavior in the API will result in lower model accuracy and slower performance than implementing scaling in your tools directly. The `computer` tool implementation in this project demonstrates how to scale both images and coordinates from higher resolutions to the suggested resolutions. +When implementing computer use yourself, we recommend using XGA resolution (1024x768): + +- For higher resolutions: Scale the image down to XGA and let the model interact with this scaled version, then map the coordinates back to the original resolution proportionally. +- For lower resolutions or smaller devices (e.g. mobile devices): Add black padding around the display area until it reaches 1024x768. + ## Development ```bash diff --git a/computer-use-demo/computer_use_demo/loop.py b/computer-use-demo/computer_use_demo/loop.py index 9db525a39..0a51d84ea 100644 --- a/computer-use-demo/computer_use_demo/loop.py +++ b/computer-use-demo/computer_use_demo/loop.py @@ -1,5 +1,5 @@ """ -Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools. +Agentic sampling loop that calls the Claude API and local implementation of anthropic-defined computer use tools. 
""" import platform @@ -29,9 +29,13 @@ BetaToolUseBlockParam, ) -from .tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult +from .tools import ( + TOOL_GROUPS_BY_VERSION, + ToolCollection, + ToolResult, + ToolVersion, +) -COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22" PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31" @@ -41,13 +45,6 @@ class APIProvider(StrEnum): VERTEX = "vertex" -PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { - APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022", - APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", - APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", -} - - # This system prompt is optimized for the Docker environment in this repository and # specific tool combinations enabled. # We encourage modifying this system prompt to ensure the model has context for the @@ -58,15 +55,15 @@ class APIProvider(StrEnum): * You can feel free to install Ubuntu applications with your bash tool. Use curl instead of wget. * To open firefox, please just click on the firefox icon. Note, firefox-esr is what is installed on your system. * Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example "(DISPLAY=:1 xterm &)". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did. -* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B -A ` to confirm output. +* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_based_edit_tool or `grep -n -B -A ` to confirm output. * When viewing a page it can be helpful to zoom out so that you can see everything on the page. 
Either that, or make sure you scroll down to see everything before deciding something isn't available. * When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request. -* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}. +* The current date is {datetime.today().strftime("%A, %B %-d, %Y")}. * When using Firefox, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there. -* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool. +* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your str_replace_based_edit_tool. """ @@ -84,15 +81,15 @@ async def sampling_loop( api_key: str, only_n_most_recent_images: int | None = None, max_tokens: int = 4096, + tool_version: ToolVersion, + thinking_budget: int | None = None, + token_efficient_tools_beta: bool = False, ): """ Agentic sampling loop for the assistant/tool interaction of computer use. 
""" - tool_collection = ToolCollection( - ComputerTool(), - BashTool(), - EditTool(), - ) + tool_group = TOOL_GROUPS_BY_VERSION[tool_version] + tool_collection = ToolCollection(*(ToolCls() for ToolCls in tool_group.tools)) system = BetaTextBlockParam( type="text", text=f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}", @@ -100,10 +97,12 @@ async def sampling_loop( while True: enable_prompt_caching = False - betas = [COMPUTER_USE_BETA_FLAG] - image_truncation_threshold = 10 + betas = [tool_group.beta_flag] if tool_group.beta_flag else [] + if token_efficient_tools_beta: + betas.append("token-efficient-tools-2025-02-19") + image_truncation_threshold = only_n_most_recent_images or 0 if provider == APIProvider.ANTHROPIC: - client = Anthropic(api_key=api_key) + client = Anthropic(api_key=api_key, max_retries=4) enable_prompt_caching = True elif provider == APIProvider.VERTEX: client = AnthropicVertex() @@ -113,9 +112,11 @@ async def sampling_loop( if enable_prompt_caching: betas.append(PROMPT_CACHING_BETA_FLAG) _inject_prompt_caching(messages) - # Is it ever worth it to bust the cache with prompt caching? - image_truncation_threshold = 50 - system["cache_control"] = {"type": "ephemeral"} + # Because cached reads are 10% of the price, we don't think it's + # ever sensible to break the cache by truncating images + only_n_most_recent_images = 0 + # Use type ignore to bypass TypedDict check until SDK types are updated + system["cache_control"] = {"type": "ephemeral"} # type: ignore if only_n_most_recent_images: _maybe_filter_to_n_most_recent_images( @@ -123,6 +124,12 @@ async def sampling_loop( only_n_most_recent_images, min_removal_threshold=image_truncation_threshold, ) + extra_body = {} + if thinking_budget: + # Ensure we only send the required fields for thinking + extra_body = { + "thinking": {"type": "enabled", "budget_tokens": thinking_budget} + } # Call the API # we use raw_response to provide debug information to streamlit. 
Your @@ -136,6 +143,7 @@ async def sampling_loop( system=[system], tools=tool_collection.to_params(), betas=betas, + extra_body=extra_body, ) except (APIStatusError, APIResponseValidationError) as e: api_response_callback(e.request, e.response, e) @@ -161,15 +169,20 @@ async def sampling_loop( tool_result_content: list[BetaToolResultBlockParam] = [] for content_block in response_params: output_callback(content_block) - if content_block["type"] == "tool_use": + if ( + isinstance(content_block, dict) + and content_block.get("type") == "tool_use" + ): + # Type narrowing for tool use blocks + tool_use_block = cast(BetaToolUseBlockParam, content_block) result = await tool_collection.run( - name=content_block["name"], - tool_input=cast(dict[str, Any], content_block["input"]), + name=tool_use_block["name"], + tool_input=cast(dict[str, Any], tool_use_block.get("input", {})), ) tool_result_content.append( - _make_api_tool_result(result, content_block["id"]) + _make_api_tool_result(result, tool_use_block["id"]) ) - tool_output_callback(result, content_block["id"]) + tool_output_callback(result, tool_use_block["id"]) if not tool_result_content: return messages @@ -228,12 +241,23 @@ def _maybe_filter_to_n_most_recent_images( def _response_to_params( response: BetaMessage, -) -> list[BetaTextBlockParam | BetaToolUseBlockParam]: - res: list[BetaTextBlockParam | BetaToolUseBlockParam] = [] +) -> list[BetaContentBlockParam]: + res: list[BetaContentBlockParam] = [] for block in response.content: if isinstance(block, BetaTextBlock): - res.append({"type": "text", "text": block.text}) + if block.text: + res.append(BetaTextBlockParam(type="text", text=block.text)) + elif getattr(block, "type", None) == "thinking": + # Handle thinking blocks - include signature field + thinking_block = { + "type": "thinking", + "thinking": getattr(block, "thinking", None), + } + if hasattr(block, "signature"): + thinking_block["signature"] = getattr(block, "signature", None) + 
res.append(cast(BetaContentBlockParam, thinking_block)) else: + # Handle tool use blocks normally res.append(cast(BetaToolUseBlockParam, block.model_dump())) return res @@ -253,11 +277,13 @@ def _inject_prompt_caching( ): if breakpoints_remaining: breakpoints_remaining -= 1 - content[-1]["cache_control"] = BetaCacheControlEphemeralParam( + # Use type ignore to bypass TypedDict check until SDK types are updated + content[-1]["cache_control"] = BetaCacheControlEphemeralParam( # type: ignore {"type": "ephemeral"} ) else: - content[-1].pop("cache_control", None) + if isinstance(content[-1], dict) and "cache_control" in content[-1]: + del content[-1]["cache_control"] # type: ignore # we'll only every have one extra turn per loop break diff --git a/computer-use-demo/computer_use_demo/requirements.txt b/computer-use-demo/computer_use_demo/requirements.txt index 115da8df3..510e1c915 100644 --- a/computer-use-demo/computer_use_demo/requirements.txt +++ b/computer-use-demo/computer_use_demo/requirements.txt @@ -1,5 +1,5 @@ -streamlit>=1.38.0 -anthropic[bedrock,vertex]>=0.37.1 +streamlit==1.41.0 +anthropic[bedrock,vertex]>=0.39.0 jsonschema==4.22.0 boto3>=1.28.57 google-auth<3,>=2 diff --git a/computer-use-demo/computer_use_demo/streamlit.py b/computer-use-demo/computer_use_demo/streamlit.py index 45318ed91..a3428d882 100644 --- a/computer-use-demo/computer_use_demo/streamlit.py +++ b/computer-use-demo/computer_use_demo/streamlit.py @@ -7,11 +7,13 @@ import os import subprocess import traceback +from contextlib import contextmanager +from dataclasses import dataclass from datetime import datetime, timedelta from enum import StrEnum from functools import partial from pathlib import PosixPath -from typing import cast +from typing import cast, get_args import httpx import streamlit as st @@ -19,24 +21,84 @@ from anthropic.types.beta import ( BetaContentBlockParam, BetaTextBlockParam, + BetaToolResultBlockParam, ) from streamlit.delta_generator import DeltaGenerator from 
computer_use_demo.loop import ( - PROVIDER_TO_DEFAULT_MODEL_NAME, APIProvider, sampling_loop, ) -from computer_use_demo.tools import ToolResult +from computer_use_demo.tools import ToolResult, ToolVersion + +PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = { + APIProvider.ANTHROPIC: "claude-sonnet-4-5-20250929", + APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0", + APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022", +} + + +@dataclass(kw_only=True, frozen=True) +class ModelConfig: + tool_version: ToolVersion + max_output_tokens: int + default_output_tokens: int + has_thinking: bool = False + + +CLAUDE_4 = ModelConfig( + tool_version="computer_use_20250429", + max_output_tokens=64_000, + default_output_tokens=1024 * 16, + has_thinking=True, +) + +CLAUDE_4_5 = ModelConfig( + tool_version="computer_use_20250124", + max_output_tokens=128_000, + default_output_tokens=1024 * 16, + has_thinking=True, +) + +CLAUDE_4_WITH_ZOOMABLE_TOOL = ModelConfig( + tool_version="computer_use_20251124", + max_output_tokens=64_000, + default_output_tokens=1024 * 16, + has_thinking=True, +) + +HAIKU_4_5 = ModelConfig( + tool_version="computer_use_20250124", + max_output_tokens=1024 * 8, + default_output_tokens=1024 * 4, + has_thinking=False, +) + +MODEL_TO_MODEL_CONF: dict[str, ModelConfig] = { + "claude-opus-4-1-20250805": CLAUDE_4, + "claude-sonnet-4-20250514": CLAUDE_4, + "claude-opus-4-20250514": CLAUDE_4, + "claude-sonnet-4-5-20250929": CLAUDE_4_5, + "anthropic.claude-sonnet-4-5-20250929-v1:0": CLAUDE_4_5, + "claude-sonnet-4-5@20250929": CLAUDE_4_5, + "claude-haiku-4-5-20251001": HAIKU_4_5, + "anthropic.claude-haiku-4-5-20251001-v1:0": HAIKU_4_5, # Bedrock + "claude-haiku-4-5@20251001": HAIKU_4_5, # Vertex + "claude-opus-4-5-20251101": CLAUDE_4_WITH_ZOOMABLE_TOOL, +} CONFIG_DIR = PosixPath("~/.anthropic").expanduser() API_KEY_FILE = CONFIG_DIR / "api_key" STREAMLIT_STYLE = """