+---
+ref_X | role | name | type | reason why this matches
+ref_Y | role | name | type | reason why this matches
+...
+
+If there are more than 20 matches, add this line at the end:
+MORE: Use a more specific query to see additional results
+
+If no matching elements are found, return only:
+FOUND: 0
+ERROR: explanation of why no elements were found"""
+
+ response = await client.messages.create(
+ model="claude-3-5-sonnet-20241022",
+ max_tokens=800,
+ temperature=1.0,
+ messages=[{"role": "user", "content": prompt}],
+ )
+
+ # Handle the response properly
+ first_content = response.content[0]
+ if hasattr(first_content, "text"):
+ response_text = first_content.text.strip()
+ else:
+ # Handle other content types if needed
+ response_text = str(first_content)
+ lines = [
+ line.strip()
+ for line in response_text.split("\n")
+ if line.strip()
+ ]
+
+ total_found = 0
+ elements = []
+ has_more = False
+ error_message = None
+
+ for line in lines:
+ if line.startswith("FOUND:"):
+ try:
+ total_found = int(line.split(":")[1].strip())
+ except (ValueError, IndexError):
+ total_found = 0
+ elif line.startswith("SHOWING:"):
+ pass
+ elif line.startswith("ERROR:"):
+ error_message = line[6:].strip()
+ elif line.startswith("MORE:"):
+ has_more = True
+ elif line.startswith("ref_") and "|" in line:
+ parts = [p.strip() for p in line.split("|")]
+ if len(parts) >= 4:
+ elements.append(
+ {
+ "ref": parts[0],
+ "role": parts[1],
+ "name": parts[2] if len(parts) > 2 else "",
+ "type": parts[3] if len(parts) > 3 else "",
+ "description": parts[4]
+ if len(parts) > 4
+ else "",
+ }
+ )
+
+ if total_found == 0 or len(elements) == 0:
+ return ToolResult(
+ output=error_message or "No matching elements found",
+ error=None,
+ )
+
+ message = f"Found {total_found} matching element{'s' if total_found != 1 else ''}"
+ if has_more:
+ message += f" (showing first {len(elements)}, use a more specific query to narrow results)"
+
+ # Format elements for output
+ elements_output = []
+ for el in elements:
+ element_str = f"- {el['ref']}: {el['role']}"
+ if el.get("name"):
+ element_str += f" {el['name']}"
+ if el.get("type"):
+ element_str += f" {el['type']}"
+ if el.get("description"):
+ element_str += f" - {el['description']}"
+ elements_output.append(element_str)
+
+ elements_str = "\n".join(elements_output)
+ return ToolResult(output=f"{message}\n\n{elements_str}", error=None)
+
+ except Exception:
+ pass # Failed to use AI for find, falling back to simple search
+
+ # Fallback to simple text search if AI is not available
+ elements = await self._page.query_selector_all(
+ f"*:has-text('{search_query}')"
+ )
+
+ if not elements:
+ return ToolResult(
+ output=f"No matching elements found for: {search_query}", error=None
+ )
+
+ # For simple fallback, just report count (no ref_ids without AI analysis)
+ return ToolResult(
+ output=f"Found {len(elements)} matching element{'s' if len(elements) != 1 else ''} (Note: AI-based search with ref_ids requires ANTHROPIC_API_KEY)",
+ error=None,
+ )
+
+ except Exception as e:
+ raise ToolError(f"Failed to find elements: {str(e)}") from e
+
+ async def _form_input(self, ref: str, value: Any) -> ToolResult:
+ """Fill a form field with a value."""
+ if self._page is None:
+ raise ToolError("Browser not initialized")
+
+ try:
+ # Use the browser_form_input_script.js from reference implementation
+ result = await self._execute_js_from_file(
+ "browser_form_input_script.js", ref, value
+ )
+
+ if isinstance(result, dict) and not result.get("success", False):
+ raise ToolError(result.get("message", "Failed to fill form field"))
+
+ return ToolResult(
+ output=f"Filled form field {ref} with value: {value}", error=None
+ )
+
+ except Exception as e:
+ raise ToolError(f"Failed to fill form field: {str(e)}") from e
+
+ async def _wait(self, duration: float) -> ToolResult:
+ """Wait for a specified duration."""
+ try:
+ await asyncio.sleep(duration)
+ return ToolResult(
+ output=f"Waited for {duration} second{'s' if duration != 1 else ''}",
+ error=None,
+ )
+ except Exception as e:
+ raise ToolError(f"Failed to wait: {str(e)}") from e
+
+ async def _execute_js(self, code: str) -> ToolResult:
+ """
+ Execute JavaScript code in the page context.
+ Returns the result of the last expression.
+ """
+ if self._page is None:
+ raise ToolError("Browser not initialized")
+
+ try:
+ # Execute the code in page context
+ # Playwright's evaluate handles async/await automatically
+ result = await self._page.evaluate(code)
+
+ # Format the result
+ if result is None:
+ result_str = "undefined"
+ elif isinstance(result, (dict, list)):
+ result_str = json.dumps(result, indent=2)
+ else:
+ result_str = str(result)
+
+ return ToolResult(output=result_str, error=None)
+
+ except Exception as e:
+ raise ToolError(f"JavaScript execution error: {str(e)}") from e
+
+ async def __call__(
+ self,
+ *,
+ action: Actions,
+ text: Optional[str] = None,
+ ref: Optional[str] = None,
+ coordinate: Optional[tuple[int, int]] = None,
+ start_coordinate: Optional[tuple[int, int]] = None,
+ scroll_direction: Optional[Literal["up", "down", "left", "right"]] = None,
+ scroll_amount: Optional[int] = None,
+ duration: Optional[float] = None,
+ value: Optional[Any] = None,
+ region: Optional[tuple[int, int, int, int]] = None,
+ **kwargs,
+ ) -> ToolResult:
+ """
+ Execute browser actions.
+
+ Parameters:
+ - action: The action to perform
+ - text: Text input for type, key, navigate, find actions
+ - ref: Element reference for element-based actions
+ - coordinate: (x, y) coordinates for mouse actions
+ - start_coordinate: Starting point for drag actions
+ - scroll_direction: Direction for scroll action
+ - scroll_amount: Amount to scroll
+ - duration: Duration for wait or hold_key actions
+ - value: Value for form_input action
+ - region: (x, y, width, height) for zoom screenshot
+ """
+
+ # Ensure browser is running for all actions
+ await self._ensure_browser()
+
+ if action == "navigate":
+ if not text:
+ raise ToolError("URL is required for navigate action")
+ return await self._navigate(text)
+
+ elif action == "screenshot":
+ return await self._take_screenshot()
+
+ elif action == "zoom":
+ if not region:
+ raise ToolError(
+ "Region (x1, y1, x2, y2) is required for zoom action"
+ )
+ x1, y1, x2, y2 = region
+ # Convert corner coordinates to x, y, width, height
+ x = min(x1, x2)
+ y = min(y1, y2)
+ width = abs(x2 - x1)
+ height = abs(y2 - y1)
+ return await self._zoom_screenshot(x, y, width, height)
+
+ elif action in [
+ "left_click",
+ "right_click",
+ "middle_click",
+ "double_click",
+ "triple_click",
+ ]:
+ return await self._click(action, coordinate, ref, text)
+
+ elif action == "hover":
+ return await self._hover(coordinate, ref)
+
+ elif action == "type":
+ if not text:
+ raise ToolError("Text is required for type action")
+ return await self._type_text(text)
+
+ elif action == "key":
+ if not text:
+ raise ToolError("Key is required for key action")
+ return await self._press_key(text)
+
+ elif action == "hold_key":
+ if not text:
+ raise ToolError("Key is required for hold_key action")
+ if not duration:
+ duration = 1.0
+ return await self._press_key(text, hold=True, duration=duration)
+
+ elif action == "scroll":
+ return await self._scroll(coordinate, scroll_direction, scroll_amount)
+
+ elif action == "scroll_to":
+ if not ref:
+ raise ToolError("Element reference is required for scroll_to action")
+ return await self._scroll_to(ref)
+
+ elif action == "left_click_drag":
+ if not start_coordinate or not coordinate:
+ raise ToolError(
+ "Both start_coordinate and coordinate are required for drag action"
+ )
+ start_x, start_y = start_coordinate
+ end_x, end_y = coordinate
+ return await self._drag(start_x, start_y, end_x, end_y)
+
+ elif action == "left_mouse_down":
+ if not coordinate:
+ raise ToolError("Coordinate is required for mouse_down action")
+ x, y = coordinate
+ return await self._mouse_down(x, y)
+
+ elif action == "left_mouse_up":
+ if not coordinate:
+ raise ToolError("Coordinate is required for mouse_up action")
+ x, y = coordinate
+ return await self._mouse_up(x, y)
+
+ elif action == "read_page":
+ filter_type = text if text in ["interactive", ""] else ""
+ return await self._read_page(filter_type)
+
+ elif action == "get_page_text":
+ return await self._get_page_text()
+
+ elif action == "find":
+ if not text:
+ raise ToolError("Text is required for find action")
+ return await self._find(text)
+
+ elif action == "form_input":
+ if not ref:
+ raise ToolError("Element reference is required for form_input action")
+ if value is None:
+ raise ToolError("Value is required for form_input action")
+ return await self._form_input(ref, value)
+
+ elif action == "wait":
+ if not duration:
+ duration = 1.0
+ return await self._wait(duration)
+
+ elif action == "execute_js":
+ if not text:
+ raise ToolError("JavaScript code is required for execute_js action")
+ return await self._execute_js(text)
+
+ else:
+ raise ToolError(f"Unknown action: {action}")
+
+ async def cleanup(self):
+ """Cleanup method to ensure browser is closed properly."""
+ # Clean up browser resources
+ if self.cdp_url:
+ # When connected to CDP server, just disconnect without closing tabs
+ self._page = None
+ self._context = None
+ self._browser = None
+ else:
+ # For local browser, close everything
+ if self._page:
+ await self._page.close()
+ self._page = None
+
+ if self._context:
+ await self._context.close()
+ self._context = None
+
+ if self._browser:
+ await self._browser.close()
+ self._browser = None
+
+ if self._playwright:
+ await self._playwright.stop()
+ self._playwright = None
+
+ self._initialized = False
diff --git a/browser-use-demo/browser_use_demo/tools/collection.py b/browser-use-demo/browser_use_demo/tools/collection.py
new file mode 100644
index 000000000..2b4b94da1
--- /dev/null
+++ b/browser-use-demo/browser_use_demo/tools/collection.py
@@ -0,0 +1,17 @@
+from typing import Any
+
+from anthropic.types.beta import BetaToolUnionParam
+
+from .base import BaseAnthropicTool
+
+
+class ToolCollection:
+ """Collection of tools for browser automation."""
+
+ def __init__(self, *tools: BaseAnthropicTool):
+ self.tools = tools
+ self.tool_map = {tool.name: tool for tool in tools}
+
+ def to_params(self) -> list[BetaToolUnionParam]:
+ """Convert all tools to API parameters."""
+ return [tool.to_params() for tool in self.tools]
diff --git a/browser-use-demo/browser_use_demo/tools/coordinate_scaling.py b/browser-use-demo/browser_use_demo/tools/coordinate_scaling.py
new file mode 100644
index 000000000..91d956790
--- /dev/null
+++ b/browser-use-demo/browser_use_demo/tools/coordinate_scaling.py
@@ -0,0 +1,178 @@
+"""
+Coordinate scaling utilities for browser tool.
+
+This module handles the scaling of coordinates from Claude's vision model
+resolution to the actual browser viewport resolution.
+"""
+
+
+class CoordinateScaler:
+ """Handles coordinate scaling between Claude's vision and actual viewport."""
+
+ # Claude's image processing resolution for 16:9 aspect ratio
+ # According to the official documentation:
+ # https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size
+ # When images exceed size thresholds, they are resized while preserving aspect ratio.
+ # For 16:9 aspect ratio (like 1920x1080 screenshots), images are resized to these exact dimensions:
+ CLAUDE_ACTUAL_WIDTH = 1456 # Width for 16:9 landscape (from documentation)
+ CLAUDE_ACTUAL_HEIGHT = 819 # Height for 16:9 landscape (from documentation)
+
+ # Documented maximum image sizes for different aspect ratios
+ # Source: https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size
+ DOCUMENTED_SIZES = {
+ # aspect_ratio: (width, height)
+ (1, 1): (1092, 1092), # Square
+ (3, 4): (951, 1268), # Portrait
+ (4, 3): (1268, 951), # Landscape
+ (2, 3): (896, 1344), # Portrait
+ (3, 2): (1344, 896), # Landscape
+ (9, 16): (819, 1456), # Portrait (phone)
+ (16, 9): (1456, 819), # Landscape (widescreen)
+ (1, 2): (784, 1568), # Portrait (tall)
+ (2, 1): (1568, 784), # Landscape (wide)
+ }
+
+ @classmethod
+ def get_documented_size_for_aspect_ratio(cls, viewport_width: int, viewport_height: int) -> tuple[int, int]:
+ """
+ Get the documented size for the given viewport's aspect ratio.
+
+ Args:
+ viewport_width: Actual viewport width
+ viewport_height: Actual viewport height
+
+ Returns:
+ Tuple of (width, height) from documented sizes that matches the aspect ratio
+
+ Raises:
+ ValueError: If the aspect ratio doesn't match any documented sizes
+ """
+ viewport_ratio = viewport_width / viewport_height
+
+ # Tolerance for aspect ratio matching (to handle minor differences)
+ ASPECT_RATIO_TOLERANCE = 0.02
+
+ # Try to find an exact or very close aspect ratio match
+ for (ratio_w, ratio_h), (doc_width, doc_height) in cls.DOCUMENTED_SIZES.items():
+ doc_ratio = ratio_w / ratio_h
+ if abs(viewport_ratio - doc_ratio) < ASPECT_RATIO_TOLERANCE:
+ return (doc_width, doc_height)
+
+ # If no match found, raise an error with helpful information
+ supported_ratios = [f"{w}:{h} ({w/h:.3f})" for (w, h) in cls.DOCUMENTED_SIZES.keys()]
+ raise ValueError(
+ f"Viewport aspect ratio {viewport_ratio:.3f} ({viewport_width}x{viewport_height}) "
+ f"does not match any documented aspect ratios. "
+ f"Supported ratios: {', '.join(supported_ratios)}. "
+ f"See https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size"
+ )
+
+ @classmethod
+ def get_scale_factors(
+ cls,
+ viewport_width: int,
+ viewport_height: int,
+ match_aspect_ratio: bool = False
+ ) -> tuple[float, float]:
+ """
+ Calculate scale factors for converting Claude coordinates to viewport coordinates.
+
+ Args:
+ viewport_width: Actual browser viewport width
+ viewport_height: Actual browser viewport height
+ match_aspect_ratio: If True, match the viewport's aspect ratio to documented sizes.
+ If False (default), use 16:9 for browser scenarios.
+
+ Returns:
+ Tuple of (scale_x, scale_y) factors
+
+ Raises:
+ ValueError: If match_aspect_ratio is True and aspect ratio doesn't match documented sizes
+ """
+ if match_aspect_ratio:
+ # Match the documented size based on aspect ratio (will raise if no match)
+ base_width, base_height = cls.get_documented_size_for_aspect_ratio(viewport_width, viewport_height)
+ else:
+ # Use default 16:9 dimensions (typical for browser screenshots)
+ base_width = cls.CLAUDE_ACTUAL_WIDTH
+ base_height = cls.CLAUDE_ACTUAL_HEIGHT
+
+ # Calculate scale factors
+ scale_x = viewport_width / base_width
+ scale_y = viewport_height / base_height
+
+ return scale_x, scale_y
+
+ @classmethod
+ def scale_coordinates(
+ cls,
+ x: int,
+ y: int,
+ viewport_width: int,
+ viewport_height: int,
+ apply_threshold: bool = True
+ ) -> tuple[int, int]:
+ """
+ Scale coordinates from Claude's vision to actual viewport.
+
+ Args:
+ x: X coordinate from Claude
+ y: Y coordinate from Claude
+ viewport_width: Actual browser viewport width
+ viewport_height: Actual browser viewport height
+ apply_threshold: Whether to check if coordinates need scaling
+
+ Returns:
+ Tuple of (scaled_x, scaled_y)
+ """
+ scale_x, scale_y = cls.get_scale_factors(viewport_width, viewport_height)
+
+ # If scaling factors are close to 1.0, no scaling needed
+ if abs(scale_x - 1.0) < 0.05 and abs(scale_y - 1.0) < 0.05:
+ return x, y
+
+ if apply_threshold:
+ # Check if coordinates appear to be in Claude's resolution
+ # (with 20% margin for edge cases)
+ max_expected_x = cls.CLAUDE_ACTUAL_WIDTH * 1.2
+ max_expected_y = cls.CLAUDE_ACTUAL_HEIGHT * 1.2
+
+ # If coordinates are larger than expected Claude resolution,
+ # they might already be in viewport coordinates
+ if x > max_expected_x or y > max_expected_y:
+ return x, y
+
+ # Apply scaling
+ scaled_x = int(x * scale_x)
+ scaled_y = int(y * scale_y)
+
+ # Ensure we don't exceed viewport bounds
+ scaled_x = min(scaled_x, viewport_width - 1)
+ scaled_y = min(scaled_y, viewport_height - 1)
+
+ return scaled_x, scaled_y
+
+ @classmethod
+ def scale_coordinate_list(
+ cls,
+ coords: list | tuple,
+ viewport_width: int,
+ viewport_height: int
+ ) -> list:
+ """
+ Scale a coordinate pair list/tuple.
+
+ Args:
+ coords: [x, y] coordinate pair
+ viewport_width: Actual browser viewport width
+ viewport_height: Actual browser viewport height
+
+ Returns:
+ Scaled [x, y] coordinate pair
+ """
+ if not isinstance(coords, (list, tuple)) or len(coords) != 2:
+ return list(coords) if isinstance(coords, tuple) else coords
+
+ x, y = coords[0], coords[1]
+ scaled_x, scaled_y = cls.scale_coordinates(x, y, viewport_width, viewport_height)
+ return [scaled_x, scaled_y]
\ No newline at end of file
diff --git a/browser-use-demo/build.sh b/browser-use-demo/build.sh
new file mode 100755
index 000000000..f28d0b1e3
--- /dev/null
+++ b/browser-use-demo/build.sh
@@ -0,0 +1,17 @@
+#!/bin/bash
+set -e
+
+echo "🏗️ Building Browser Use Demo Docker image..."
+docker build . -t browser-use-demo:latest
+
+echo "✅ Build complete!"
+echo ""
+echo "To run the demo:"
+echo " docker run -e ANTHROPIC_API_KEY=\$ANTHROPIC_API_KEY \\"
+echo " -v \$(pwd)/browser_use_demo:/home/browseruse/browser_use_demo/ \\"
+echo " -p 5900:5900 -p 8501:8501 -p 6080:6080 -p 8080:8080 \\"
+echo " -it browser-use-demo:latest"
+echo ""
+echo "Then open:"
+echo " - http://localhost:8501 for the Streamlit interface"
+echo " - http://localhost:8080 to see the browser"
\ No newline at end of file
diff --git a/browser-use-demo/docker-compose.yml b/browser-use-demo/docker-compose.yml
new file mode 100644
index 000000000..976439ef2
--- /dev/null
+++ b/browser-use-demo/docker-compose.yml
@@ -0,0 +1,27 @@
+services:
+ browser-use:
+ build:
+ context: .
+ image: browser-use-demo:latest
+ environment:
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY}
+ volumes:
+ - ./validate_env.py:/home/browseruse/validate_env.py
+ - ${HOME}/.anthropic:/home/browseruse/.anthropic
+ ports:
+ - "${VNC_PORT:-5900}:5900" # VNC
+ - "${STREAMLIT_PORT:-8501}:8501" # Streamlit
+ - "${NOVNC_PORT:-6080}:6080" # noVNC
+ - "${HTTP_PORT:-8080}:8080" # HTTP server
+ stdin_open: true
+ tty: true
+ env_file:
+ - .env
+ develop:
+ watch:
+ - path: ./browser_use_demo
+ action: sync+restart
+ target: /home/browseruse/browser_use_demo
+ - path: ./validate_env.py
+ action: rebuild
+ target: /home/browseruse/validate_env.py
\ No newline at end of file
diff --git a/browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop b/browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop
new file mode 100755
index 000000000..948021262
--- /dev/null
+++ b/browser-use-demo/image/.config/tint2/applications/firefox-custom.desktop
@@ -0,0 +1,8 @@
+[Desktop Entry]
+Name=Firefox Custom
+Comment=Open Firefox with custom URL
+Exec=firefox-esr -new-window
+Icon=firefox-esr
+Terminal=false
+Type=Application
+Categories=Network;WebBrowser;
diff --git a/browser-use-demo/image/.config/tint2/applications/gedit.desktop b/browser-use-demo/image/.config/tint2/applications/gedit.desktop
new file mode 100755
index 000000000..d5af03f40
--- /dev/null
+++ b/browser-use-demo/image/.config/tint2/applications/gedit.desktop
@@ -0,0 +1,8 @@
+[Desktop Entry]
+Name=Gedit
+Comment=Open gedit
+Exec=gedit
+Icon=text-editor-symbolic
+Terminal=false
+Type=Application
+Categories=TextEditor;
diff --git a/browser-use-demo/image/.config/tint2/applications/terminal.desktop b/browser-use-demo/image/.config/tint2/applications/terminal.desktop
new file mode 100644
index 000000000..0c2d45d4d
--- /dev/null
+++ b/browser-use-demo/image/.config/tint2/applications/terminal.desktop
@@ -0,0 +1,8 @@
+[Desktop Entry]
+Name=Terminal
+Comment=Open Terminal
+Exec=xterm
+Icon=utilities-terminal
+Terminal=false
+Type=Application
+Categories=System;TerminalEmulator;
diff --git a/browser-use-demo/image/.config/tint2/tint2rc b/browser-use-demo/image/.config/tint2/tint2rc
new file mode 100644
index 000000000..5db6d3128
--- /dev/null
+++ b/browser-use-demo/image/.config/tint2/tint2rc
@@ -0,0 +1,100 @@
+#-------------------------------------
+# Panel
+panel_items = TL
+panel_size = 100% 60
+panel_margin = 0 0
+panel_padding = 2 0 2
+panel_background_id = 1
+wm_menu = 0
+panel_dock = 0
+panel_position = bottom center horizontal
+panel_layer = top
+panel_monitor = all
+panel_shrink = 0
+autohide = 0
+autohide_show_timeout = 0
+autohide_hide_timeout = 0.5
+autohide_height = 2
+strut_policy = follow_size
+panel_window_name = tint2
+disable_transparency = 1
+mouse_effects = 1
+font_shadow = 0
+mouse_hover_icon_asb = 100 0 10
+mouse_pressed_icon_asb = 100 0 0
+scale_relative_to_dpi = 0
+scale_relative_to_screen_height = 0
+
+#-------------------------------------
+# Taskbar
+taskbar_mode = single_desktop
+taskbar_hide_if_empty = 0
+taskbar_padding = 0 0 2
+taskbar_background_id = 0
+taskbar_active_background_id = 0
+taskbar_name = 1
+taskbar_hide_inactive_tasks = 0
+taskbar_hide_different_monitor = 0
+taskbar_hide_different_desktop = 0
+taskbar_always_show_all_desktop_tasks = 0
+taskbar_name_padding = 4 2
+taskbar_name_background_id = 0
+taskbar_name_active_background_id = 0
+taskbar_name_font_color = #e3e3e3 100
+taskbar_name_active_font_color = #ffffff 100
+taskbar_distribute_size = 0
+taskbar_sort_order = none
+task_align = left
+
+#-------------------------------------
+# Launcher
+launcher_padding = 4 8 4
+launcher_background_id = 0
+launcher_icon_background_id = 0
+launcher_icon_size = 48
+launcher_icon_asb = 100 0 0
+launcher_icon_theme_override = 0
+startup_notifications = 1
+launcher_tooltip = 1
+
+#-------------------------------------
+# Launcher icon
+launcher_item_app = /usr/share/applications/libreoffice-calc.desktop
+launcher_item_app = /home/computeruse/.config/tint2/applications/terminal.desktop
+launcher_item_app = /home/computeruse/.config/tint2/applications/firefox-custom.desktop
+launcher_item_app = /usr/share/applications/xpaint.desktop
+launcher_item_app = /usr/share/applications/xpdf.desktop
+launcher_item_app = /home/computeruse/.config/tint2/applications/gedit.desktop
+launcher_item_app = /usr/share/applications/galculator.desktop
+
+#-------------------------------------
+# Background definitions
+# ID 1
+rounded = 0
+border_width = 0
+background_color = #000000 60
+border_color = #000000 30
+
+# ID 2
+rounded = 4
+border_width = 1
+background_color = #777777 20
+border_color = #777777 30
+
+# ID 3
+rounded = 4
+border_width = 1
+background_color = #777777 20
+border_color = #ffffff 40
+
+# ID 4
+rounded = 4
+border_width = 1
+background_color = #aa4400 100
+border_color = #aa7733 100
+
+# ID 5
+rounded = 4
+border_width = 1
+background_color = #aaaa00 100
+border_color = #aaaa00 100
diff --git a/browser-use-demo/image/entrypoint.sh b/browser-use-demo/image/entrypoint.sh
new file mode 100755
index 000000000..ef7fc6fc9
--- /dev/null
+++ b/browser-use-demo/image/entrypoint.sh
@@ -0,0 +1,23 @@
+#!/bin/bash
+set -e
+
+# Validate environment configuration
+echo "Validating environment configuration..."
+python validate_env.py
+if [ $? -ne 0 ]; then
+ echo "Environment validation failed. Exiting."
+ exit 1
+fi
+
+./start_all.sh
+./novnc_startup.sh
+
+python http_server.py > /tmp/server_logs.txt 2>&1 &
+
+STREAMLIT_SERVER_PORT=8501 python -m streamlit run browser_use_demo/streamlit.py > /tmp/streamlit_stdout.log &
+
+echo "✨ Browser Use Demo is ready!"
+echo "➡️ Open http://localhost:8080 in your browser to begin"
+
+# Keep the container running
+tail -f /dev/null
diff --git a/browser-use-demo/image/http_server.py b/browser-use-demo/image/http_server.py
new file mode 100644
index 000000000..082ff4de1
--- /dev/null
+++ b/browser-use-demo/image/http_server.py
@@ -0,0 +1,19 @@
+import os
+import socket
+from http.server import HTTPServer, SimpleHTTPRequestHandler
+
+
+class HTTPServerV6(HTTPServer):
+ address_family = socket.AF_INET6
+
+
+def run_server():
+ os.chdir(os.path.dirname(__file__) + "/static_content")
+ server_address = ("::", 8080)
+ httpd = HTTPServerV6(server_address, SimpleHTTPRequestHandler)
+ print("Starting HTTP server on port 8080...") # noqa: T201
+ httpd.serve_forever()
+
+
+if __name__ == "__main__":
+ run_server()
diff --git a/browser-use-demo/image/index.html b/browser-use-demo/image/index.html
new file mode 100644
index 000000000..7df7c2e65
--- /dev/null
+++ b/browser-use-demo/image/index.html
@@ -0,0 +1,43 @@
+
+
+
+ Browser Use Demo
+
+
+
+
+
+
+
+
+
+
diff --git a/browser-use-demo/image/mutter_startup.sh b/browser-use-demo/image/mutter_startup.sh
new file mode 100755
index 000000000..5f714f74f
--- /dev/null
+++ b/browser-use-demo/image/mutter_startup.sh
@@ -0,0 +1,20 @@
+echo "starting mutter"
+XDG_SESSION_TYPE=x11 mutter --replace --sm-disable 2>/tmp/mutter_stderr.log &
+
+# Wait for tint2 window properties to appear
+timeout=30
+while [ $timeout -gt 0 ]; do
+ if xdotool search --class "mutter" >/dev/null 2>&1; then
+ break
+ fi
+ sleep 1
+ ((timeout--))
+done
+
+if [ $timeout -eq 0 ]; then
+ echo "mutter stderr output:" >&2
+ cat /tmp/mutter_stderr.log >&2
+ exit 1
+fi
+
+rm /tmp/mutter_stderr.log
diff --git a/browser-use-demo/image/novnc_startup.sh b/browser-use-demo/image/novnc_startup.sh
new file mode 100755
index 000000000..da56816ca
--- /dev/null
+++ b/browser-use-demo/image/novnc_startup.sh
@@ -0,0 +1,21 @@
+#!/bin/bash
+echo "starting noVNC"
+
+# Start noVNC with explicit websocket settings
+/opt/noVNC/utils/novnc_proxy \
+ --vnc localhost:5900 \
+ --listen 6080 \
+ --web /opt/noVNC \
+ > /tmp/novnc.log 2>&1 &
+
+# Wait for noVNC to start
+timeout=10
+while [ $timeout -gt 0 ]; do
+ if netstat -tuln | grep -q ":6080 "; then
+ break
+ fi
+ sleep 1
+ ((timeout--))
+done
+
+echo "noVNC started successfully"
diff --git a/browser-use-demo/image/start_all.sh b/browser-use-demo/image/start_all.sh
new file mode 100755
index 000000000..ce14e3917
--- /dev/null
+++ b/browser-use-demo/image/start_all.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+set -e
+
+export DISPLAY=:${DISPLAY_NUM}
+./xvfb_startup.sh
+./tint2_startup.sh # Keep taskbar for window management
+./mutter_startup.sh # Keep window manager for controls
+./x11vnc_startup.sh
diff --git a/browser-use-demo/image/static_content/index.html b/browser-use-demo/image/static_content/index.html
new file mode 100644
index 000000000..6fadf3382
--- /dev/null
+++ b/browser-use-demo/image/static_content/index.html
@@ -0,0 +1,72 @@
+
+
+
+ Browser Use Demo
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/browser-use-demo/image/tint2_startup.sh b/browser-use-demo/image/tint2_startup.sh
new file mode 100755
index 000000000..34f39a18b
--- /dev/null
+++ b/browser-use-demo/image/tint2_startup.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+echo "starting tint2 on display :$DISPLAY_NUM ..."
+
+# Start tint2 and capture its stderr
+tint2 -c $HOME/.config/tint2/tint2rc 2>/tmp/tint2_stderr.log &
+
+# Wait for tint2 window properties to appear
+timeout=30
+while [ $timeout -gt 0 ]; do
+ if xdotool search --class "tint2" >/dev/null 2>&1; then
+ break
+ fi
+ sleep 1
+ ((timeout--))
+done
+
+if [ $timeout -eq 0 ]; then
+ echo "tint2 stderr output:" >&2
+ cat /tmp/tint2_stderr.log >&2
+ exit 1
+fi
+
+# Remove the temporary stderr log file
+rm /tmp/tint2_stderr.log
diff --git a/browser-use-demo/image/x11vnc_startup.sh b/browser-use-demo/image/x11vnc_startup.sh
new file mode 100755
index 000000000..ad4b352c3
--- /dev/null
+++ b/browser-use-demo/image/x11vnc_startup.sh
@@ -0,0 +1,46 @@
+#!/bin/bash
+echo "starting vnc"
+
+(x11vnc -display $DISPLAY \
+ -forever \
+ -shared \
+ -wait 50 \
+ -rfbport 5900 \
+ -nopw \
+ 2>/tmp/x11vnc_stderr.log) &
+
+x11vnc_pid=$!
+
+# Wait for x11vnc to start
+timeout=10
+while [ $timeout -gt 0 ]; do
+ if netstat -tuln | grep -q ":5900 "; then
+ break
+ fi
+ sleep 1
+ ((timeout--))
+done
+
+if [ $timeout -eq 0 ]; then
+ echo "x11vnc failed to start, stderr output:" >&2
+ cat /tmp/x11vnc_stderr.log >&2
+ exit 1
+fi
+
+: > /tmp/x11vnc_stderr.log
+
+# Monitor x11vnc process in the background
+(
+ while true; do
+ if ! kill -0 $x11vnc_pid 2>/dev/null; then
+ echo "x11vnc process crashed, restarting..." >&2
+ if [ -f /tmp/x11vnc_stderr.log ]; then
+ echo "x11vnc stderr output:" >&2
+ cat /tmp/x11vnc_stderr.log >&2
+ rm /tmp/x11vnc_stderr.log
+ fi
+ exec "$0"
+ fi
+ sleep 5
+ done
+) &
diff --git a/browser-use-demo/image/xvfb_startup.sh b/browser-use-demo/image/xvfb_startup.sh
new file mode 100755
index 000000000..205da2933
--- /dev/null
+++ b/browser-use-demo/image/xvfb_startup.sh
@@ -0,0 +1,57 @@
+#!/bin/bash
+set -e # Exit on error
+
+DPI=96
+RES_AND_DEPTH=${WIDTH}x${HEIGHT}x24
+
+# Function to check if Xvfb is already running
+check_xvfb_running() {
+ if [ -e /tmp/.X${DISPLAY_NUM}-lock ]; then
+ return 0 # Xvfb is already running
+ else
+ return 1 # Xvfb is not running
+ fi
+}
+
+# Function to check if Xvfb is ready
+wait_for_xvfb() {
+ local timeout=10
+ local start_time=$(date +%s)
+ while ! xdpyinfo >/dev/null 2>&1; do
+ if [ $(($(date +%s) - start_time)) -gt $timeout ]; then
+ echo "Xvfb failed to start within $timeout seconds" >&2
+ return 1
+ fi
+ sleep 0.1
+ done
+ return 0
+}
+
+# Check if Xvfb is already running
+if check_xvfb_running; then
+ echo "Lock file found for display ${DISPLAY}, checking if display is accessible..."
+ # Verify that the display is actually accessible
+ if xdpyinfo >/dev/null 2>&1; then
+ echo "Xvfb is already running and accessible on display ${DISPLAY}"
+ exit 0
+ else
+ echo "Display ${DISPLAY} is not accessible, cleaning up stale lock file..."
+ rm -f /tmp/.X${DISPLAY_NUM}-lock
+ # Also clean up the socket if it exists
+ rm -f /tmp/.X11-unix/X${DISPLAY_NUM}
+ fi
+fi
+
+# Start Xvfb
+Xvfb $DISPLAY -ac -screen 0 $RES_AND_DEPTH -retro -dpi $DPI -nolisten tcp -nolisten unix &
+XVFB_PID=$!
+
+# Wait for Xvfb to start
+if wait_for_xvfb; then
+ echo "Xvfb started successfully on display ${DISPLAY}"
+ echo "Xvfb PID: $XVFB_PID"
+else
+ echo "Xvfb failed to start"
+ kill $XVFB_PID
+ exit 1
+fi
diff --git a/browser-use-demo/pytest.ini b/browser-use-demo/pytest.ini
new file mode 100644
index 000000000..eb62803d4
--- /dev/null
+++ b/browser-use-demo/pytest.ini
@@ -0,0 +1,34 @@
+[pytest]
+# Test discovery patterns
+python_files = test_*.py
+python_classes = Test*
+python_functions = test_*
+
+# Test directories
+testpaths = tests
+
+# Markers for test categorization
+markers =
+ integration: Integration tests that test multiple components
+ slow: Tests that take longer than usual to run
+ asyncio: Tests that use asyncio
+
+# Coverage settings
+addopts =
+ -v
+ --strict-markers
+ --tb=short
+ --disable-warnings
+ -p pytest_asyncio
+
+# Asyncio configuration
+asyncio_mode = auto
+asyncio_default_fixture_loop_scope = function
+
+# Minimum Python version
+minversion = 3.11
+
+# Ignore specific warnings
+filterwarnings =
+ ignore::DeprecationWarning
+ ignore::PendingDeprecationWarning
\ No newline at end of file
diff --git a/browser-use-demo/setup.py b/browser-use-demo/setup.py
new file mode 100644
index 000000000..300a31630
--- /dev/null
+++ b/browser-use-demo/setup.py
@@ -0,0 +1,29 @@
+from setuptools import setup, find_packages
+
+setup(
+ name="browser-use-demo",
+ version="0.1.0",
+ packages=find_packages(),
+ python_requires=">=3.11",
+ install_requires=[
+ "streamlit==1.41.0",
+ "anthropic[bedrock,vertex]>=0.39.0",
+ "jsonschema==4.22.0",
+ "boto3>=1.28.57",
+ "google-auth<3,>=2",
+ "playwright>=1.40.0",
+ ],
+ extras_require={
+ "test": [
+ "pytest==8.3.3",
+ "pytest-cov==4.1.0",
+ "pytest-mock==3.11.1",
+ "pytest-asyncio==0.23.6",
+ ],
+ "dev": [
+ "ruff==0.6.7",
+ "pyright>=1.1.300",
+ "pre-commit==3.8.0",
+ ],
+ },
+)
diff --git a/browser-use-demo/test-requirements.txt b/browser-use-demo/test-requirements.txt
new file mode 100644
index 000000000..6efd10ea4
--- /dev/null
+++ b/browser-use-demo/test-requirements.txt
@@ -0,0 +1,4 @@
+pytest==8.3.3
+pytest-cov==4.1.0
+pytest-mock==3.11.1
+pytest-asyncio==0.23.6
\ No newline at end of file
diff --git a/browser-use-demo/tests/README.md b/browser-use-demo/tests/README.md
new file mode 100644
index 000000000..1ddeee764
--- /dev/null
+++ b/browser-use-demo/tests/README.md
@@ -0,0 +1,174 @@
+# Browser Use Demo - Test Suite
+
+Comprehensive test suite for the refactored Browser Use Demo with extensive edge case coverage.
+
+## Installation
+
+```bash
+# Install test dependencies
+pip install -r test-requirements.txt
+
+# Or install with extras
+pip install -e ".[test]"
+```
+
+## Running Tests
+
+### Run all tests
+```bash
+pytest tests/
+```
+
+### Run with coverage report
+```bash
+pytest tests/ --cov=browser_tools_api_demo --cov-report=html
+# Open htmlcov/index.html to view coverage report
+```
+
+### Run specific test file
+```bash
+pytest tests/test_message_renderer.py -v
+```
+
+### Run specific test class or method
+```bash
+pytest tests/test_message_renderer.py::TestMessageRenderer -v
+pytest tests/test_message_renderer.py::TestRenderMethod::test_render_string_message -v
+```
+
+### Run tests by marker
+```bash
+# Run only integration tests
+pytest -m integration
+
+# Run tests excluding integration
+pytest -m "not integration"
+
+# Run async tests
+pytest -m asyncio
+```
+
+## Test Structure
+
+```
+tests/
+├── conftest.py # Shared fixtures and mocks
+├── test_message_renderer.py # MessageRenderer class tests (~300 test cases)
+├── test_streamlit_helpers.py # Helper function tests (~150 test cases)
+└── test_integration.py # End-to-end integration tests (~50 test cases)
+```
+
+## Test Coverage
+
+The test suite covers:
+
+### MessageRenderer (`test_message_renderer.py`)
+- Initialization with various state configurations
+- Rendering all message types (string, dict, ToolResult)
+- Conversation history rendering with complex structures
+- Edge cases: empty messages, None values, circular references
+- Error handling: malformed data, missing fields, exceptions
+- Unicode and special character handling
+- Performance with large messages
+
+### Streamlit Helpers (`test_streamlit_helpers.py`)
+- `setup_state()` with fresh and partial initialization
+- Environment variable handling (present/missing/invalid)
+- Lambda evaluation in state initialization
+- `get_or_create_event_loop()` with various loop states
+- `authenticate()` with different providers and key states
+- Concurrent access and thread safety
+- Error recovery scenarios
+
+### Integration Tests (`test_integration.py`)
+- Complete message rendering pipeline
+- State initialization and persistence
+- Event loop management with async operations
+- Error propagation across components
+- Full user interaction workflow
+- Performance with large datasets (1000+ messages)
+- Deeply nested content structures
+
+## Edge Cases Covered
+
+1. **Boundary Conditions**
+ - Empty strings, lists, dictionaries
+ - Single item collections
+ - Maximum size inputs (100k+ character messages)
+ - Null/None values
+
+2. **Type Mismatches**
+ - Wrong types for expected fields
+ - Missing required fields
+ - Extra unexpected fields
+ - Invalid message structures
+
+3. **State Inconsistencies**
+ - Tools referenced but not in session_state
+ - Partially initialized state
+ - Concurrent modifications
+ - Corrupted state
+
+4. **Error Conditions**
+ - Import errors
+ - Asyncio exceptions
+ - Environment variable errors
+ - Lambda evaluation failures
+ - Base64 decode errors
+
+5. **Performance Edge Cases**
+ - Very large message histories (1000+ messages)
+ - Deeply nested content (100+ levels)
+ - Circular references
+ - Unicode and special characters
+
+## Mocking Strategy
+
+### Streamlit Components
+All Streamlit components are mocked to enable testing without a running Streamlit server:
+- `st.session_state`
+- `st.chat_message`
+- `st.markdown`, `st.write`, `st.error`, `st.code`, `st.image`
+- `st.chat_input`, `st.stop`
+
+### External Dependencies
+- `BrowserTool` - Mocked to avoid Playwright dependencies
+- `asyncio` event loops - Mocked for controlled testing
+- Environment variables - Mocked via `monkeypatch`
+
+## Fixtures
+
+Key fixtures provided in `conftest.py`:
+
+- `mock_streamlit` - Complete Streamlit mocking setup
+- `mock_browser_tool` - BrowserTool mock
+- `sample_tool_result` - Various ToolResult configurations
+- `sample_messages` - Diverse message structures for testing
+- `edge_case_messages` - Messages designed to trigger edge cases
+- `mock_asyncio_loop` - Controlled event loop for testing
+- `mock_environment` - Environment variable setup
+- `clean_environment` - Remove environment variables
+
+## Continuous Integration
+
+To run tests in CI:
+
+```bash
+# Install dependencies
+pip install -e ".[test]"
+
+# Run tests with coverage
+pytest tests/ --cov=browser_tools_api_demo --cov-report=xml --cov-report=term
+
+# Generate coverage badge
+coverage-badge -o coverage.svg
+```
+
+## Contributing
+
+When adding new features or refactoring:
+1. Add corresponding tests for new functionality
+2. Ensure all edge cases are covered
+3. Run the full test suite before committing
+4. Maintain >95% code coverage
+5. Update this README if test structure changes
\ No newline at end of file
diff --git a/browser-use-demo/tests/__init__.py b/browser-use-demo/tests/__init__.py
new file mode 100644
index 000000000..af8757ad5
--- /dev/null
+++ b/browser-use-demo/tests/__init__.py
@@ -0,0 +1 @@
+"""Test suite for Browser Use Demo."""
diff --git a/browser-use-demo/tests/conftest.py b/browser-use-demo/tests/conftest.py
new file mode 100644
index 000000000..5446cecb8
--- /dev/null
+++ b/browser-use-demo/tests/conftest.py
@@ -0,0 +1,301 @@
+"""Shared fixtures and configuration for tests."""
+
+import asyncio
+import sys
+from pathlib import Path
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+
+# Add the parent directory to the path
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from browser_use_demo.tools import ToolResult
+
+
+@pytest.fixture
+def mock_streamlit():
+ """Mock Streamlit module and session_state."""
+ with patch("streamlit.session_state") as mock_state:
+ # Initialize with default values
+ mock_state.hide_screenshots = False
+ mock_state.tools = {}
+ mock_state.messages = []
+ mock_state.api_key = "test-key"
+ mock_state.provider = MagicMock()
+ mock_state.event_loop = None
+
+ # Mock other streamlit components
+ with patch("streamlit.chat_message") as mock_chat:
+ mock_chat.return_value.__enter__ = Mock()
+ mock_chat.return_value.__exit__ = Mock()
+
+ with patch("streamlit.markdown") as mock_markdown:
+ with patch("streamlit.write") as mock_write:
+ with patch("streamlit.error") as mock_error:
+ with patch("streamlit.code") as mock_code:
+ with patch("streamlit.image") as mock_image:
+ yield {
+ "session_state": mock_state,
+ "chat_message": mock_chat,
+ "markdown": mock_markdown,
+ "write": mock_write,
+ "error": mock_error,
+ "code": mock_code,
+ "image": mock_image,
+ }
+
+
+@pytest.fixture
+def mock_browser_tool():
+ """Mock BrowserTool to avoid Playwright dependencies."""
+ with patch("browser_use_demo.tools.BrowserTool") as mock_tool:
+ mock_instance = MagicMock()
+ mock_tool.return_value = mock_instance
+ yield mock_instance
+
+
+@pytest.fixture
+def sample_tool_result():
+ """Create sample ToolResult objects for testing."""
+ return {
+ "success": ToolResult(output="Success message"),
+ "error": ToolResult(error="Error message"),
+ "with_image": ToolResult(
+ output="With screenshot",
+ base64_image="iVBORw0KGgoAAAANSUhEUgAAAAEAAAABCAYAAAAfFcSJAAAADUlEQVR42mNkYPhfDwAChwGA60e6kgAAAABJRU5ErkJggg==",
+ ),
+ "empty": ToolResult(),
+ "all_fields": ToolResult(
+ output="Output text",
+ error="Error text",
+ base64_image="base64data",
+ system="System message",
+ ),
+ }
+
+
+@pytest.fixture
+def sample_messages():
+ """Provide various message structures for testing edge cases."""
+ return [
+ # Normal messages
+ {"role": "user", "content": "Hello"},
+ {"role": "assistant", "content": "Hi there!"},
+ # Complex content structures
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Multiple items"},
+ {"type": "image", "source": "data:image/png;base64,abc123"},
+ ],
+ },
+ # Assistant with tool results
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "Let me help with that"},
+ {
+ "type": "tool_use",
+ "name": "browser",
+ "input": {"url": "example.com"},
+ },
+ {"type": "tool_result", "tool_use_id": "tool_123"},
+ ],
+ },
+ # Edge cases
+ {"role": "user", "content": ""}, # Empty content
+ {"role": "assistant", "content": None}, # None content
+ {"role": "unknown", "content": "Unknown role"}, # Unknown role
+ {"role": "user"}, # Missing content field
+ {"content": "No role"}, # Missing role field
+ # Unicode and special characters
+ {"role": "user", "content": "Hello 世界 🌍 \n\t\r"},
+ # Very long content
+ {"role": "user", "content": "x" * 10000},
+ # Nested structures
+ {
+ "role": "assistant",
+ "content": [
+ {
+ "type": "text",
+ "text": "Nested",
+ "metadata": {"nested": {"deeply": {"value": 123}}},
+ }
+ ],
+ },
+ ]
+
+
+@pytest.fixture
+def edge_case_messages():
+ """Messages specifically designed to test edge cases and error conditions."""
+ return {
+ "empty_list": [],
+ "none": None,
+ "malformed_dict": {"not": "valid", "message": "structure"},
+ "circular_ref": _create_circular_reference(),
+ "missing_tool": {
+ "role": "assistant",
+ "content": [{"type": "tool_result", "tool_use_id": "nonexistent_tool"}],
+ },
+ "invalid_types": [
+ {"role": 123, "content": "Invalid role type"},
+ {"role": "user", "content": {"invalid": "content structure"}},
+ ],
+ "huge_message": {
+ "role": "user",
+ "content": [{"type": "text", "text": "x" * 1000000}],
+ },
+ }
+
+
+def _create_circular_reference():
+ """Helper to create a message with circular reference."""
+ msg = {"role": "user", "content": []}
+ msg["content"].append(msg) # Circular reference
+ return msg
+
+
+@pytest.fixture
+def mock_asyncio_loop():
+ """Mock asyncio event loop for testing."""
+ loop = Mock(spec=asyncio.AbstractEventLoop)
+ loop.is_closed.return_value = False
+ loop.run_until_complete = Mock(side_effect=lambda coro: asyncio.run(coro))
+ return loop
+
+
+@pytest.fixture
+def mock_environment(monkeypatch):
+ """Mock environment variables for testing."""
+ env_vars = {
+ "ANTHROPIC_API_KEY": "test-api-key",
+ }
+
+ for key, value in env_vars.items():
+ monkeypatch.setenv(key, value)
+
+ return env_vars
+
+
+@pytest.fixture
+def clean_environment(monkeypatch):
+ """Remove environment variables for testing missing env scenarios."""
+ keys_to_remove = [
+ "ANTHROPIC_API_KEY",
+ ]
+
+ for key in keys_to_remove:
+ monkeypatch.delenv(key, raising=False)
+
+ return keys_to_remove
+
+
+@pytest.fixture
+def mock_provider():
+ """Mock APIProvider enum."""
+ with patch("browser_use_demo.loop.APIProvider") as mock:
+ mock.ANTHROPIC = "anthropic"
+ mock.BEDROCK = "bedrock"
+ mock.VERTEX = "vertex"
+ yield mock
+
+
+@pytest.fixture
+def mock_api_response_with_text_and_tools():
+ """Mock API response containing both text and tool uses."""
+ response = Mock()
+ response.content = [
+ Mock(type="text", text="I'll help you with that task"),
+ Mock(
+ type="tool_use",
+ id="tool_001",
+ name="browser",
+ input={"action": "screenshot"}
+ ),
+ Mock(type="text", text="Here's what I found"),
+ Mock(
+ type="tool_use",
+ id="tool_002",
+ name="browser",
+ input={"action": "navigate", "url": "example.com"}
+ )
+ ]
+ return response
+
+
+@pytest.fixture
+def mock_tool_collection():
+ """Mock ToolCollection for testing."""
+ from unittest.mock import AsyncMock
+
+ mock_browser = AsyncMock()
+ mock_browser.return_value = ToolResult(output="Tool executed successfully")
+
+ collection = Mock()
+ collection.tool_map = {"browser": mock_browser}
+ collection.to_params = Mock(return_value=[
+ {
+ "name": "browser",
+ "description": "Browser automation tool",
+ "input_schema": {}
+ }
+ ])
+
+ return collection
+
+
+@pytest.fixture
+def sample_mixed_content_messages():
+ """Sample messages with mixed text and tool content."""
+ return [
+ {
+ "role": "user",
+ "content": "Can you help me navigate and take a screenshot?"
+ },
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "Sure, I'll navigate to the page first"},
+ {
+ "type": "tool_use",
+ "id": "tool_nav",
+ "name": "browser",
+ "input": {"action": "navigate", "url": "example.com"}
+ },
+ {"type": "text", "text": "Now taking a screenshot"},
+ {
+ "type": "tool_use",
+ "id": "tool_shot",
+ "name": "browser",
+ "input": {"action": "screenshot"}
+ }
+ ]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "tool_result",
+ "tool_use_id": "tool_nav",
+ "content": [{"type": "text", "text": "Navigation successful"}]
+ },
+ {
+ "type": "tool_result",
+ "tool_use_id": "tool_shot",
+ "content": [
+ {"type": "text", "text": "Screenshot taken"},
+ {
+ "type": "image",
+ "source": {
+ "type": "base64",
+ "media_type": "image/png",
+ "data": "base64imagedata"
+ }
+ }
+ ]
+ }
+ ]
+ }
+ ]
diff --git a/browser-use-demo/tests/test_integration.py b/browser-use-demo/tests/test_integration.py
new file mode 100644
index 000000000..4e0920f50
--- /dev/null
+++ b/browser-use-demo/tests/test_integration.py
@@ -0,0 +1,360 @@
+"""Integration tests for the refactored Browser Use Demo."""
+
+from unittest.mock import AsyncMock, MagicMock, Mock, patch
+
+import pytest
+from browser_use_demo.loop import APIProvider
+from browser_use_demo.message_renderer import MessageRenderer
+from browser_use_demo.streamlit import (
+ get_or_create_event_loop,
+ setup_state,
+)
+from browser_use_demo.tools import ToolResult
+
+
+@pytest.mark.integration
+class TestFullMessageRenderingPipeline:
+ """Test complete message rendering pipeline."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.chat_message")
+ @patch("streamlit.markdown")
+ @patch("streamlit.write")
+ @patch("streamlit.error")
+ def test_full_conversation_rendering(
+ self, mock_error, mock_write, mock_markdown, mock_chat, mock_state
+ ):
+ """Test rendering a complete conversation with various message types."""
+
+ # Setup mock state
+ mock_state.hide_screenshots = False
+ mock_state.tools = {
+ "tool_1": ToolResult(output="Tool output 1"),
+ "tool_2": ToolResult(error="Tool error 2"),
+ }
+
+ # Create complex conversation
+ messages = [
+ {"role": "user", "content": "Hello, can you help me?"},
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "Sure, let me help you."},
+ {
+ "type": "tool_use",
+ "name": "browser",
+ "input": {"url": "example.com"},
+ },
+ {"type": "tool_result", "tool_use_id": "tool_1"},
+ ],
+ },
+ {"role": "user", "content": [{"type": "text", "text": "Thank you!"}]},
+ {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "You're welcome!"},
+ {"type": "tool_result", "tool_use_id": "tool_2"},
+ ],
+ },
+ ]
+
+ # Mock chat_message context manager
+ mock_chat.return_value.__enter__ = Mock()
+ mock_chat.return_value.__exit__ = Mock()
+
+ # Render full conversation
+ renderer = MessageRenderer(mock_state)
+ renderer.render_conversation_history(messages)
+
+ # Verify all message types were rendered
+ assert mock_markdown.call_count >= 3 # Text messages
+ assert mock_write.call_count >= 2 # Tool use and text blocks
+ assert mock_error.call_count == 1 # Tool error
+
+
+@pytest.mark.integration
+class TestStateInitializationAndPersistence:
+ """Test state initialization and persistence across operations."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("browser_use_demo.tools.BrowserTool")
+ def test_complete_state_initialization(self, mock_browser_tool, mock_state):
+ """Test complete state initialization flow."""
+
+ # Simulate fresh state
+ initialized_keys = set()
+
+ def setitem_side_effect(key, value):
+ initialized_keys.add(key)
+ setattr(mock_state, key, value)
+
+ mock_state.__contains__.return_value = False
+ mock_state.__setitem__.side_effect = setitem_side_effect
+
+ # Initialize state
+ setup_state()
+
+ # Verify all required keys were initialized
+ expected_keys = {
+ "messages",
+ "api_key",
+ "provider",
+ "model",
+ "max_tokens",
+ "system_prompt",
+ "hide_screenshots",
+ "tools",
+ "browser_tool",
+ "event_loop",
+ "rendered_message_count",
+ "is_agent_running",
+ "active_messages",
+ "active_response_container",
+ }
+
+ assert expected_keys.issubset(initialized_keys)
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_state_persistence_across_renders(self, mock_state):
+ """Test that state persists across multiple render calls."""
+
+ # Initialize state
+ mock_state.tools = {"tool_1": ToolResult(output="Persistent tool")}
+ mock_state.messages = [{"role": "user", "content": "Initial message"}]
+
+ # Create renderer and render
+ renderer1 = MessageRenderer(mock_state)
+ with patch("streamlit.chat_message"), patch("streamlit.markdown"):
+ renderer1.render_conversation_history(mock_state.messages)
+
+ # Add more messages
+ mock_state.messages.append({"role": "assistant", "content": "Response"})
+
+ # Create new renderer instance and render again
+ renderer2 = MessageRenderer(mock_state)
+ with patch("streamlit.chat_message"), patch("streamlit.markdown") as mock_md:
+ renderer2.render_conversation_history(mock_state.messages)
+
+ # Should render both messages
+ assert mock_md.call_count >= 2
+
+
+@pytest.mark.integration
+class TestEventLoopManagementWithAsync:
+ """Test event loop management with async operations."""
+
+ def test_async_agent_execution(self):
+ """Test running async agent with event loop management."""
+
+ with patch("streamlit.session_state", new_callable=MagicMock) as mock_state:
+ mock_state.event_loop = None
+
+ with patch("asyncio.set_event_loop"):
+ with patch("asyncio.new_event_loop") as mock_new_loop:
+ mock_loop = MagicMock()
+ mock_loop.is_closed.return_value = False
+ mock_new_loop.return_value = mock_loop
+
+ loop = get_or_create_event_loop()
+
+ # Verify loop was created and set
+ assert loop == mock_loop
+ assert mock_state.event_loop == mock_loop
+
+ # Test that the loop can handle async operations (mocked)
+ async def mock_agent(input_text):
+ return f"Processed: {input_text}"
+
+ # Mock running the async function
+ mock_loop.run_until_complete = MagicMock(
+ return_value="Processed: Test input"
+ )
+ result = mock_loop.run_until_complete(mock_agent("Test input"))
+ assert result == "Processed: Test input"
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_concurrent_async_operations(self, mock_state):
+ """Test handling concurrent async operations."""
+
+ mock_state.event_loop = None
+
+ with patch("asyncio.set_event_loop"):
+ with patch("asyncio.new_event_loop") as mock_new_loop:
+ mock_loop = MagicMock()
+ mock_loop.is_closed.return_value = False
+ mock_new_loop.return_value = mock_loop
+
+ loop = get_or_create_event_loop()
+
+ # Simulate concurrent operations
+ async def async_task(task_id):
+ return f"Task {task_id} complete"
+
+ # Mock gather to simulate concurrent execution
+ expected_results = [f"Task {i} complete" for i in range(5)]
+
+ with patch("asyncio.gather") as mock_gather:
+ mock_gather.return_value = expected_results
+
+ # Test that multiple tasks can be handled
+ tasks = [async_task(i) for i in range(5)]
+ results = mock_gather(*tasks)
+
+ assert len(results) == 5
+ assert all("complete" in r for r in results)
+ mock_gather.assert_called_once()
+
+
+@pytest.mark.integration
+class TestErrorPropagationAndHandling:
+ """Test error propagation and handling across the system."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.error")
+ def test_rendering_error_propagation(self, mock_error, mock_state):
+ """Test that rendering errors are properly propagated."""
+
+ mock_state.tools = {}
+
+ # Create message that will cause error
+ messages = [
+ {
+ "role": "assistant",
+ "content": [{"type": "tool_result", "tool_use_id": "nonexistent"}],
+ }
+ ]
+
+ renderer = MessageRenderer(mock_state)
+ with patch("streamlit.chat_message"):
+ # Should handle missing tool gracefully
+ renderer.render_conversation_history(messages)
+
+ # Error should not be called for missing tool (handled gracefully)
+ mock_error.assert_not_called()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_initialization_error_recovery(self, mock_state):
+ """Test recovery from initialization errors."""
+
+ mock_state.__contains__.return_value = False
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+
+ # First call fails
+ with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
+ mock_browser.side_effect = [Exception("Init failed"), MagicMock()]
+
+ # First attempt should fail
+ with pytest.raises(Exception, match="Init failed"):
+ setup_state()
+
+ # Second attempt should succeed
+ setup_state()
+ assert mock_browser.call_count == 2
+
+
+@pytest.mark.integration
+class TestCompleteWorkflow:
+ """Test complete workflow from initialization to rendering."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.chat_input")
+ @patch("streamlit.chat_message")
+ @patch("streamlit.markdown")
+ @patch("browser_use_demo.tools.BrowserTool")
+ @patch("browser_use_demo.streamlit.run_agent", new_callable=AsyncMock)
+ def test_complete_user_interaction_flow(
+ self,
+ mock_run_agent,
+ mock_browser_tool,
+ mock_markdown,
+ mock_chat_message,
+ mock_chat_input,
+ mock_state,
+ ):
+ """Test complete flow from user input to message rendering."""
+
+ # Setup initial state
+ mock_state.__contains__.return_value = False
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+ setup_state()
+
+ # Simulate user input
+ user_input = "Browse to example.com"
+ mock_chat_input.return_value = user_input
+
+ # Setup event loop
+ with patch("asyncio.new_event_loop") as mock_new_loop:
+ mock_loop = MagicMock()
+ mock_loop.is_closed.return_value = False
+ mock_loop.run_until_complete = MagicMock()
+ mock_new_loop.return_value = mock_loop
+
+ with patch("asyncio.set_event_loop"):
+ loop = get_or_create_event_loop()
+
+ # Simulate agent response
+ mock_run_agent.return_value = None
+
+ # Mock chat message context
+ mock_chat_message.return_value.__enter__ = Mock()
+ mock_chat_message.return_value.__exit__ = Mock()
+
+ # Simulate the workflow
+ # User provides input
+ if user_input:
+ loop.run_until_complete(mock_run_agent(user_input))
+
+ mock_loop.run_until_complete.assert_called_once()
+ mock_run_agent.assert_called_once_with(user_input)
+
+
+@pytest.mark.integration
+class TestPerformanceAndScalability:
+ """Test performance with large datasets and edge cases."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.chat_message")
+ @patch("streamlit.markdown")
+ def test_large_conversation_history(
+ self, mock_markdown, mock_chat_message, mock_state
+ ):
+ """Test rendering very large conversation history."""
+
+ # Create large conversation (1000 messages)
+ large_conversation = []
+ for i in range(1000):
+ role = "user" if i % 2 == 0 else "assistant"
+ large_conversation.append({"role": role, "content": f"Message {i}"})
+
+ mock_state.tools = {}
+ mock_chat_message.return_value.__enter__ = Mock()
+ mock_chat_message.return_value.__exit__ = Mock()
+
+ renderer = MessageRenderer(mock_state)
+
+ # Should handle large conversation without issues
+ renderer.render_conversation_history(large_conversation)
+
+ # Verify all messages were processed
+ assert mock_markdown.call_count == 1000
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_deeply_nested_content_performance(self, mock_state):
+ """Test performance with deeply nested content structures."""
+
+ # Create deeply nested structure
+ content = {"type": "text", "text": "Base"}
+ for i in range(100):
+ content = {"type": "wrapper", "content": [content], "depth": i}
+
+ messages = [{"role": "user", "content": [content]}]
+
+ mock_state.tools = {}
+
+ with patch("streamlit.chat_message"), patch("streamlit.write"):
+ renderer = MessageRenderer(mock_state)
+
+ # Should handle deep nesting without stack overflow
+ renderer.render_conversation_history(messages)
diff --git a/browser-use-demo/tests/test_message_renderer.py b/browser-use-demo/tests/test_message_renderer.py
new file mode 100644
index 000000000..a52489084
--- /dev/null
+++ b/browser-use-demo/tests/test_message_renderer.py
@@ -0,0 +1,385 @@
+"""Tests for MessageRenderer class with comprehensive edge case coverage."""
+
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from browser_use_demo.message_renderer import MessageRenderer, Sender
+from browser_use_demo.tools import ToolResult
+
+
+class TestMessageRenderer:
+ """Test suite for MessageRenderer class."""
+
+ def test_initialization(self, mock_streamlit):
+ """Test MessageRenderer initialization."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ assert renderer.session_state == mock_streamlit["session_state"]
+
+ def test_initialization_with_none_state(self):
+ """Test initialization with None session state."""
+ renderer = MessageRenderer(None)
+ assert renderer.session_state is None
+
+ def test_initialization_with_empty_state(self):
+ """Test initialization with empty session state."""
+ empty_state = MagicMock()
+ renderer = MessageRenderer(empty_state)
+ assert renderer.session_state == empty_state
+
+
+class TestRenderMethod:
+ """Test the main render method with various inputs."""
+
+ def test_render_string_message(self, mock_streamlit):
+ """Test rendering a simple string message."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.USER, "Hello world")
+
+ mock_streamlit["chat_message"].assert_called_with(Sender.USER)
+ mock_streamlit["markdown"].assert_called_with("Hello world")
+
+ def test_render_empty_string(self, mock_streamlit):
+ """Test rendering an empty string (should skip)."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.USER, "")
+
+ mock_streamlit["chat_message"].assert_not_called()
+
+ def test_render_none_message(self, mock_streamlit):
+ """Test rendering None message (should skip)."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.BOT, None)
+
+ mock_streamlit["chat_message"].assert_not_called()
+
+ def test_render_tool_result_with_output(self, mock_streamlit, sample_tool_result):
+ """Test rendering ToolResult with output."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.TOOL, sample_tool_result["success"])
+
+ mock_streamlit["markdown"].assert_called_with("Success message")
+
+ def test_render_tool_result_with_error(self, mock_streamlit, sample_tool_result):
+ """Test rendering ToolResult with error."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.TOOL, sample_tool_result["error"])
+
+ mock_streamlit["error"].assert_called_with("Error message")
+
+ def test_render_tool_result_with_image(self, mock_streamlit, sample_tool_result):
+ """Test rendering ToolResult with image."""
+ mock_streamlit["session_state"].hide_screenshots = False
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.TOOL, sample_tool_result["with_image"])
+
+ mock_streamlit["markdown"].assert_called_with("With screenshot")
+ # Image should be decoded and displayed
+ assert mock_streamlit["image"].called
+
+ def test_render_tool_result_with_hidden_screenshots(
+ self, mock_streamlit, sample_tool_result
+ ):
+ """Test that images are hidden when hide_screenshots is True."""
+ mock_streamlit["session_state"].hide_screenshots = True
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render(Sender.TOOL, sample_tool_result["with_image"])
+
+ # Should render text but not image
+ mock_streamlit["markdown"].assert_called_with("With screenshot")
+ mock_streamlit["image"].assert_not_called()
+
+ def test_render_dict_message_text_type(self, mock_streamlit):
+ """Test rendering dictionary message with text type."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ message = {"type": "text", "text": "Hello from dict"}
+ renderer.render(Sender.USER, message)
+
+ mock_streamlit["write"].assert_called_with("Hello from dict")
+
+ def test_render_dict_message_tool_use_type(self, mock_streamlit):
+ """Test rendering dictionary message with tool_use type."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ message = {
+ "type": "tool_use",
+ "name": "browser_tool",
+ "input": {"url": "example.com"},
+ }
+ renderer.render(Sender.BOT, message)
+
+ expected_code = "Tool Use: browser_tool\nInput: {'url': 'example.com'}"
+ mock_streamlit["code"].assert_called_with(expected_code)
+
+ def test_render_dict_message_unknown_type(self, mock_streamlit):
+ """Test rendering dictionary message with unknown type."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ message = {"type": "unknown", "data": "some data"}
+ renderer.render(Sender.BOT, message)
+
+ # Should fall back to generic write
+ mock_streamlit["write"].assert_called_with(message)
+
+ def test_render_very_long_message(self, mock_streamlit):
+ """Test rendering extremely long messages."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ long_message = "x" * 100000 # 100k characters
+ renderer.render(Sender.USER, long_message)
+
+ mock_streamlit["markdown"].assert_called_with(long_message)
+
+ def test_render_unicode_special_chars(self, mock_streamlit):
+ """Test rendering messages with unicode and special characters."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ special_message = "Hello 世界 🌍 \n\t\r ñáéíóú"
+ renderer.render(Sender.USER, special_message)
+
+ mock_streamlit["markdown"].assert_called_with(special_message)
+
+
+class TestConversationHistory:
+ """Test render_conversation_history method with various scenarios."""
+
+ def test_render_empty_history(self, mock_streamlit):
+ """Test rendering empty conversation history."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render_conversation_history([])
+
+ # No rendering should occur
+ mock_streamlit["chat_message"].assert_not_called()
+
+ def test_render_single_message(self, mock_streamlit):
+ """Test rendering single message in history."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [{"role": "user", "content": "Hello"}]
+ renderer.render_conversation_history(messages)
+
+ mock_streamlit["markdown"].assert_called_with("Hello")
+
+ def test_render_multiple_messages(self, mock_streamlit, sample_messages):
+ """Test rendering multiple messages with different roles."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ renderer.render_conversation_history(sample_messages[:2])
+
+ # Should render both messages
+ assert mock_streamlit["markdown"].call_count >= 2
+
+ def test_render_unknown_role(self, mock_streamlit):
+ """Test handling messages with unknown roles."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [{"role": "unknown_role", "content": "Test"}]
+ renderer.render_conversation_history(messages)
+
+ # Should not crash, but won't render
+ mock_streamlit["markdown"].assert_not_called()
+
+ def test_render_missing_content_field(self, mock_streamlit):
+ """Test handling messages missing content field."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [{"role": "user"}] # Missing content
+
+ # Should not crash - will get KeyError but handler should manage it gracefully
+ try:
+ renderer.render_conversation_history(messages)
+ except KeyError:
+ pass # Expected when content field is missing
+
+ def test_render_none_content(self, mock_streamlit):
+ """Test handling messages with None content."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [{"role": "user", "content": None}]
+ renderer.render_conversation_history(messages)
+
+ # Should handle gracefully without rendering
+ mock_streamlit["markdown"].assert_not_called()
+
+ def test_render_list_content(self, mock_streamlit):
+ """Test rendering messages with list content."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "First"},
+ {"type": "text", "text": "Second"},
+ ],
+ }
+ ]
+ renderer.render_conversation_history(messages)
+
+ # Should render both text blocks
+ calls = mock_streamlit["markdown"].call_args_list
+ assert any("First" in str(call) for call in calls)
+ assert any("Second" in str(call) for call in calls)
+
+ def test_skip_image_blocks_in_history(self, mock_streamlit):
+ """Test that image blocks are skipped in conversation history."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {"type": "text", "text": "Text message"},
+ {"type": "image", "source": "data:image/png;base64,abc"},
+ ],
+ }
+ ]
+ renderer.render_conversation_history(messages)
+
+ # Should only render text, not image
+ mock_streamlit["markdown"].assert_called_with("Text message")
+ mock_streamlit["image"].assert_not_called()
+
+ def test_tool_result_in_assistant_message(self, mock_streamlit, sample_tool_result):
+ """Test rendering tool results from assistant messages."""
+ mock_streamlit["session_state"].tools = {
+ "tool_123": sample_tool_result["success"]
+ }
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [
+ {
+ "role": "assistant",
+ "content": [{"type": "tool_result", "tool_use_id": "tool_123"}],
+ }
+ ]
+ renderer.render_conversation_history(messages)
+
+ # Should render the tool result from session state
+ mock_streamlit["markdown"].assert_called_with("Success message")
+
+ def test_missing_tool_in_session_state(self, mock_streamlit):
+ """Test handling tool_use_id that doesn't exist in session state."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ messages = [
+ {
+ "role": "assistant",
+ "content": [{"type": "tool_result", "tool_use_id": "nonexistent"}],
+ }
+ ]
+ renderer.render_conversation_history(messages)
+
+ # Should handle gracefully without crashing
+ mock_streamlit["markdown"].assert_not_called()
+
+
+class TestEdgeCases:
+ """Test edge cases and error conditions."""
+
+ def test_circular_reference_handling(self, mock_streamlit):
+ """Test handling circular references in messages."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # Create circular reference
+ content = []
+ content.append({"type": "text", "text": "Normal", "ref": content})
+ messages = [{"role": "user", "content": content}]
+
+ # Should not crash or infinite loop
+ renderer.render_conversation_history(messages)
+
+ def test_malformed_tool_result(self, mock_streamlit):
+ """Test handling malformed ToolResult objects."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # Create a mock that doesn't have expected attributes
+ malformed = Mock(spec=[]) # No attributes
+ renderer.render(Sender.TOOL, malformed)
+
+ # Should handle gracefully
+ mock_streamlit["markdown"].assert_not_called()
+
+ def test_exception_in_rendering(self, mock_streamlit):
+ """Test that exceptions during rendering are propagated."""
+ # Setup the chat_message context manager properly
+ mock_chat_cm = MagicMock()
+ mock_chat_cm.__enter__ = Mock(return_value=None)
+ mock_chat_cm.__exit__ = Mock(return_value=None)
+ mock_streamlit["chat_message"].return_value = mock_chat_cm
+
+ # Set markdown to raise an exception
+ mock_streamlit["markdown"].side_effect = Exception("Render error")
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # Should let the exception propagate for markdown rendering
+ with pytest.raises(Exception, match="Render error"):
+ renderer.render(Sender.USER, "Test message")
+
+ def test_normalize_content_with_various_inputs(self, mock_streamlit):
+ """Test _normalize_content with various input types."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # String input
+ assert renderer._normalize_content("test") == ["test"]
+
+ # List input
+ assert renderer._normalize_content([1, 2, 3]) == [1, 2, 3]
+
+ # None input
+ assert renderer._normalize_content(None) == [None]
+
+ # Dict input
+ assert renderer._normalize_content({"key": "value"}) == [{"key": "value"}]
+
+ def test_deeply_nested_content(self, mock_streamlit):
+ """Test handling deeply nested content structures."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # Create deeply nested structure
+ nested = {"type": "text", "text": "Deep"}
+ for _ in range(100):
+ nested = {"type": "wrapper", "content": nested}
+
+ messages = [{"role": "user", "content": [nested]}]
+ renderer.render_conversation_history(messages)
+
+ # Should handle without stack overflow
+
+ def test_concurrent_modification(self, mock_streamlit):
+ """Test behavior when session state is modified during rendering."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # Setup mock properly for context manager
+ mock_cm = MagicMock()
+ mock_cm.__enter__ = Mock(return_value=None)
+ mock_cm.__exit__ = Mock(return_value=None)
+
+ # Simulate modification during rendering
+ def modify_state(*args, **kwargs):
+ mock_streamlit["session_state"].tools = {}
+ return mock_cm
+
+ mock_streamlit["chat_message"].side_effect = modify_state
+
+ # Should complete rendering despite modifications
+ renderer.render(Sender.USER, "Test")
+
+ def test_invalid_sender_type(self, mock_streamlit):
+ """Test handling invalid sender types."""
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+
+ # Use invalid sender
+ renderer.render("invalid_sender", "Message")
+
+ # Should still render with the provided sender
+ mock_streamlit["chat_message"].assert_called_with("invalid_sender")
+
+ def test_base64_decode_error(self, mock_streamlit):
+ """Test handling invalid base64 image data."""
+ # Setup the chat_message context manager properly
+ mock_chat_cm = MagicMock()
+ mock_chat_cm.__enter__ = Mock(return_value=None)
+ mock_chat_cm.__exit__ = Mock(return_value=None)
+ mock_streamlit["chat_message"].return_value = mock_chat_cm
+
+ # Setup session state to not hide screenshots
+ mock_streamlit["session_state"].hide_screenshots = False
+
+ renderer = MessageRenderer(mock_streamlit["session_state"])
+ tool_result = ToolResult(
+ output="With bad image", base64_image="invalid_base64_!@#$"
+ )
+
+ with patch("base64.b64decode") as mock_decode:
+ mock_decode.side_effect = Exception("Invalid base64")
+
+ # Should handle the error gracefully - the exception should propagate
+ with pytest.raises(Exception, match="Invalid base64"):
+ renderer.render(Sender.TOOL, tool_result)
diff --git a/browser-use-demo/tests/test_sampling_loop.py b/browser-use-demo/tests/test_sampling_loop.py
new file mode 100644
index 000000000..5f82e81f6
--- /dev/null
+++ b/browser-use-demo/tests/test_sampling_loop.py
@@ -0,0 +1,512 @@
+"""
+Comprehensive tests for the sampling loop and message handling.
+"""
+
+import asyncio
+from unittest.mock import AsyncMock, Mock, patch
+
+import pytest
+from browser_use_demo.loop import APIProvider, sampling_loop
+from browser_use_demo.message_handler import (
+ MessageBuilder,
+ ResponseProcessor,
+)
+from browser_use_demo.tools import ToolResult
+
+
+class TestResponseProcessor:
+ """Test the ResponseProcessor class."""
+
+ def test_process_response_text_only(self):
+ """Test processing a response with only text content."""
+ mock_response = Mock()
+ mock_response.content = [
+ Mock(type="text", text="This is a text response")
+ ]
+
+ processor = ResponseProcessor()
+ result = processor.process_response(mock_response)
+
+ assert result.has_text is True
+ assert result.has_tools is False
+ assert len(result.assistant_content) == 1
+ assert result.assistant_content[0]["type"] == "text"
+ assert result.assistant_content[0]["text"] == "This is a text response"
+ assert len(result.tool_uses) == 0
+
+ def test_process_response_tool_only(self):
+ """Test processing a response with only tool use."""
+ mock_response = Mock()
+ mock_tool_use = Mock(
+ type="tool_use",
+ id="tool_123",
+ name="browser",
+ input={"action": "screenshot"}
+ )
+ mock_response.content = [mock_tool_use]
+
+ processor = ResponseProcessor()
+ result = processor.process_response(mock_response)
+
+ assert result.has_text is False
+ assert result.has_tools is True
+ assert len(result.assistant_content) == 1
+ assert result.assistant_content[0]["type"] == "tool_use"
+ assert len(result.tool_uses) == 1
+
+ def test_process_response_mixed_content(self):
+ """Test processing a response with both text and tool use."""
+ mock_response = Mock()
+ mock_response.content = [
+ Mock(type="text", text="Let me take a screenshot"),
+ Mock(
+ type="tool_use",
+ id="tool_456",
+ name="browser",
+ input={"action": "screenshot"}
+ )
+ ]
+
+ processor = ResponseProcessor()
+ result = processor.process_response(mock_response)
+
+ assert result.has_text is True
+ assert result.has_tools is True
+ assert len(result.assistant_content) == 2
+ assert result.assistant_content[0]["type"] == "text"
+ assert result.assistant_content[0]["text"] == "Let me take a screenshot"
+ assert result.assistant_content[1]["type"] == "tool_use"
+ assert len(result.tool_uses) == 1
+
+ def test_execute_tools_success(self):
+ """Test successful tool execution."""
+
+ async def run_test():
+ mock_tool = AsyncMock(return_value=ToolResult(output="Tool executed"))
+ mock_collection = Mock()
+ mock_collection.tool_map = {"browser": mock_tool}
+
+ tool_uses = [
+ {
+ "type": "tool_use",
+ "id": "tool_789",
+ "name": "browser",
+ "input": {"action": "screenshot"}
+ }
+ ]
+
+ processor = ResponseProcessor()
+ results = await processor.execute_tools(tool_uses, mock_collection)
+
+ assert len(results) == 1
+ assert results[0]["type"] == "tool_result"
+ assert results[0]["tool_use_id"] == "tool_789"
+ assert any(
+ block.get("text") == "Tool executed"
+ for block in results[0]["content"]
+ )
+
+ asyncio.run(run_test())
+
+ def test_execute_tools_with_error(self):
+ """Test tool execution with error."""
+
+ async def run_test():
+ mock_tool = AsyncMock(side_effect=Exception("Tool failed"))
+ mock_collection = Mock()
+ mock_collection.tool_map = {"browser": mock_tool}
+
+ tool_uses = [
+ {
+ "type": "tool_use",
+ "id": "tool_error",
+ "name": "browser",
+ "input": {"action": "invalid"}
+ }
+ ]
+
+ processor = ResponseProcessor()
+ results = await processor.execute_tools(tool_uses, mock_collection)
+
+ assert len(results) == 1
+ assert results[0]["type"] == "tool_result"
+ assert results[0]["is_error"] is True
+ assert any(
+ "Tool failed" in block.get("text", "")
+ for block in results[0]["content"]
+ )
+
+ asyncio.run(run_test())
+
+ def test_build_tool_result_with_image(self):
+ """Test building tool result with base64 image."""
+ result = ToolResult(base64_image="base64_data_here")
+ processor = ResponseProcessor()
+
+ tool_result = processor._build_tool_result(result, "tool_img")
+
+ assert tool_result["type"] == "tool_result"
+ assert tool_result["tool_use_id"] == "tool_img"
+ assert any(
+ block.get("type") == "image"
+ for block in tool_result["content"]
+ )
+
+ def test_build_tool_result_with_text_extraction_markers(self):
+ """Test handling of text extraction markers in tool results."""
+ result = ToolResult(
+ output="__PAGE_EXTRACTED__\nSome content\n__FULL_CONTENT__\nThe actual content"
+ )
+ processor = ResponseProcessor()
+
+ tool_result = processor._build_tool_result(result, "tool_extract")
+
+ assert any(
+ block.get("text") == "The actual content"
+ for block in tool_result["content"]
+ )
+
+
+class TestMessageBuilder:
+ """Test the MessageBuilder class."""
+
+ def test_add_assistant_message(self):
+ """Test adding an assistant message."""
+ messages = []
+ content = [
+ {"type": "text", "text": "Hello"},
+ {"type": "tool_use", "id": "123", "name": "test", "input": {}}
+ ]
+
+ builder = MessageBuilder()
+ builder.add_assistant_message(messages, content)
+
+ assert len(messages) == 1
+ assert messages[0]["role"] == "assistant"
+ assert messages[0]["content"] == content
+
+ def test_add_assistant_message_empty_content(self):
+ """Test that empty content is not added."""
+ messages = []
+ content = []
+
+ builder = MessageBuilder()
+ builder.add_assistant_message(messages, content)
+
+ assert len(messages) == 0
+
+ def test_add_tool_results(self):
+ """Test adding tool results."""
+ messages = []
+ tool_results = [
+ {
+ "type": "tool_result",
+ "tool_use_id": "123",
+ "content": [{"type": "text", "text": "Result"}]
+ }
+ ]
+
+ builder = MessageBuilder()
+ builder.add_tool_results(messages, tool_results)
+
+ assert len(messages) == 1
+ assert messages[0]["role"] == "user"
+ assert messages[0]["content"] == tool_results
+
+ def test_add_tool_results_empty(self):
+ """Test that empty tool results are not added."""
+ messages = []
+ tool_results = []
+
+ builder = MessageBuilder()
+ builder.add_tool_results(messages, tool_results)
+
+ assert len(messages) == 0
+
+ def test_ensure_message_integrity_valid(self):
+ """Test message integrity validation with valid messages."""
+ messages = [
+ {"role": "user", "content": "Hello"},
+ {"role": "assistant", "content": [{"type": "text", "text": "Hi"}]},
+ ]
+
+ builder = MessageBuilder()
+ assert builder.ensure_message_integrity(messages) is True
+
+ def test_ensure_message_integrity_missing_role(self):
+ """Test message integrity with missing role."""
+ messages = [
+ {"content": "Hello"},
+ ]
+
+ builder = MessageBuilder()
+ assert builder.ensure_message_integrity(messages) is False
+
+ def test_ensure_message_integrity_missing_content(self):
+ """Test message integrity with missing content."""
+ messages = [
+ {"role": "user"},
+ ]
+
+ builder = MessageBuilder()
+ assert builder.ensure_message_integrity(messages) is False
+
+ def test_ensure_message_integrity_empty_list_content(self):
+ """Test message integrity with empty content list."""
+ messages = [
+ {"role": "user", "content": []},
+ ]
+
+ builder = MessageBuilder()
+ assert builder.ensure_message_integrity(messages) is False
+
+ def test_extract_text_from_message(self):
+ """Test extracting text from assistant message."""
+ message = {
+ "role": "assistant",
+ "content": [
+ {"type": "text", "text": "First part"},
+ {"type": "tool_use", "id": "123", "name": "test", "input": {}},
+ {"type": "text", "text": "Second part"}
+ ]
+ }
+
+ builder = MessageBuilder()
+ text = builder.extract_text_from_message(message)
+
+ assert text == "First part Second part"
+
+ def test_extract_text_from_message_no_text(self):
+ """Test extracting text when there's no text content."""
+ message = {
+ "role": "assistant",
+ "content": [
+ {"type": "tool_use", "id": "123", "name": "test", "input": {}}
+ ]
+ }
+
+ builder = MessageBuilder()
+ text = builder.extract_text_from_message(message)
+
+ assert text is None
+
+ def test_extract_text_from_user_message(self):
+ """Test that text extraction returns None for non-assistant messages."""
+ message = {
+ "role": "user",
+ "content": "User message"
+ }
+
+ builder = MessageBuilder()
+ text = builder.extract_text_from_message(message)
+
+ assert text is None
+
+
+@pytest.mark.integration
+class TestSamplingLoopIntegration:
+ """Integration tests for the sampling loop."""
+
+ @patch("browser_use_demo.loop.Anthropic")
+ def test_sampling_loop_preserves_text_with_tools(self, mock_anthropic):
+ """Test that text is preserved when tools are used."""
+
+ async def run_test():
+ mock_client = Mock()
+ mock_anthropic.return_value = mock_client
+
+ mock_response = Mock()
+ mock_response.content = [
+ Mock(type="text", text="I'll help you with that"),
+ Mock(
+ type="tool_use",
+ id="tool_001",
+ name="browser",
+ input={"action": "screenshot"}
+ )
+ ]
+
+ mock_client.beta.messages.create = Mock(return_value=mock_response)
+
+ mock_browser = AsyncMock()
+ mock_browser.return_value = ToolResult(output="Screenshot taken")
+
+ messages = [{"role": "user", "content": "Take a screenshot"}]
+ output_messages = []
+ tool_outputs = {}
+
+ def output_callback(content):
+ output_messages.append(content)
+
+ def tool_output_callback(result, tool_id):
+ tool_outputs[tool_id] = result
+
+ updated_messages = await sampling_loop(
+ model="claude-sonnet-4-5",
+ provider=APIProvider.ANTHROPIC,
+ system_prompt_suffix="",
+ messages=messages,
+ output_callback=output_callback,
+ tool_output_callback=tool_output_callback,
+ api_response_callback=lambda *args: None,
+ api_key="test_key",
+ browser_tool=mock_browser
+ )
+
+ api_call_args = mock_client.beta.messages.create.call_args[1]
+ assert api_call_args["tool_choice"] == {"type": "auto"}
+
+ assert len(output_messages) >= 2
+ assert any(
+ msg.get("type") == "text" and "help you" in msg.get("text", "")
+ for msg in output_messages
+ )
+ assert any(msg.get("type") == "tool_use" for msg in output_messages)
+
+ assistant_msgs = [m for m in updated_messages if m["role"] == "assistant"]
+ assert len(assistant_msgs) > 0
+
+ last_assistant = assistant_msgs[-1]
+ assert isinstance(last_assistant["content"], list)
+
+ has_text = any(
+ block.get("type") == "text"
+ for block in last_assistant["content"]
+ )
+ has_tool = any(
+ block.get("type") == "tool_use"
+ for block in last_assistant["content"]
+ )
+
+ assert has_text and has_tool, "Assistant message should contain both text and tool use"
+
+ asyncio.run(run_test())
+
+ @patch("browser_use_demo.loop.Anthropic")
+ def test_sampling_loop_text_only_response(self, mock_anthropic):
+ """Test handling of text-only responses."""
+
+ async def run_test():
+ mock_client = Mock()
+ mock_anthropic.return_value = mock_client
+
+ mock_response = Mock()
+ mock_response.content = [
+ Mock(type="text", text="This is just a text response")
+ ]
+
+ mock_client.beta.messages.create = Mock(return_value=mock_response)
+
+ messages = [{"role": "user", "content": "Hello"}]
+
+ updated_messages = await sampling_loop(
+ model="claude-sonnet-4-5",
+ provider=APIProvider.ANTHROPIC,
+ system_prompt_suffix="",
+ messages=messages,
+ output_callback=lambda x: None,
+ tool_output_callback=lambda r, i: None,
+ api_response_callback=lambda *args: None,
+ api_key="test_key"
+ )
+
+ assert len(updated_messages) == 2
+ assert updated_messages[-1]["role"] == "assistant"
+ assert any(
+ block.get("text") == "This is just a text response"
+ for block in updated_messages[-1]["content"]
+ )
+
+ asyncio.run(run_test())
+
+ @patch("browser_use_demo.loop.Anthropic")
+ def test_sampling_loop_multiple_tools_with_text(self, mock_anthropic):
+ """Test handling of multiple tool uses with text."""
+
+ async def run_test():
+ mock_client = Mock()
+ mock_anthropic.return_value = mock_client
+
+ mock_response = Mock()
+ mock_response.content = [
+ Mock(type="text", text="I'll perform multiple actions"),
+ Mock(
+ type="tool_use",
+ id="tool_001",
+ name="browser",
+ input={"action": "screenshot"}
+ ),
+ Mock(type="text", text="Now navigating"),
+ Mock(
+ type="tool_use",
+ id="tool_002",
+ name="browser",
+ input={"action": "navigate", "url": "example.com"}
+ )
+ ]
+
+ mock_client.beta.messages.create = Mock(return_value=mock_response)
+
+ mock_browser = AsyncMock()
+ mock_browser.return_value = ToolResult(output="Action completed")
+
+ messages = [{"role": "user", "content": "Do multiple things"}]
+
+ updated_messages = await sampling_loop(
+ model="claude-sonnet-4-5",
+ provider=APIProvider.ANTHROPIC,
+ system_prompt_suffix="",
+ messages=messages,
+ output_callback=lambda x: None,
+ tool_output_callback=lambda r, i: None,
+ api_response_callback=lambda *args: None,
+ api_key="test_key",
+ browser_tool=mock_browser
+ )
+
+ assistant_msgs = [m for m in updated_messages if m["role"] == "assistant"]
+ last_assistant = assistant_msgs[-1]
+
+ text_blocks = [
+ block for block in last_assistant["content"]
+ if block.get("type") == "text"
+ ]
+ tool_blocks = [
+ block for block in last_assistant["content"]
+ if block.get("type") == "tool_use"
+ ]
+
+ assert len(text_blocks) == 2
+ assert len(tool_blocks) == 2
+
+ asyncio.run(run_test())
+
+ @patch("browser_use_demo.loop.Anthropic")
+ def test_tool_choice_parameter_set(self, mock_anthropic):
+ """Test that tool_choice is explicitly set to auto."""
+
+ async def run_test():
+ mock_client = Mock()
+ mock_anthropic.return_value = mock_client
+
+ mock_response = Mock()
+ mock_response.content = [Mock(type="text", text="Response")]
+
+ mock_client.beta.messages.create = Mock(return_value=mock_response)
+
+ await sampling_loop(
+ model="claude-sonnet-4-5",
+ provider=APIProvider.ANTHROPIC,
+ system_prompt_suffix="",
+ messages=[{"role": "user", "content": "Test"}],
+ output_callback=lambda x: None,
+ tool_output_callback=lambda r, i: None,
+ api_response_callback=lambda *args: None,
+ api_key="test_key"
+ )
+
+ call_args = mock_client.beta.messages.create.call_args[1]
+ assert "tool_choice" in call_args
+ assert call_args["tool_choice"] == {"type": "auto"}
+
+ asyncio.run(run_test())
diff --git a/browser-use-demo/tests/test_streamlit_helpers.py b/browser-use-demo/tests/test_streamlit_helpers.py
new file mode 100644
index 000000000..4e7a33d36
--- /dev/null
+++ b/browser-use-demo/tests/test_streamlit_helpers.py
@@ -0,0 +1,325 @@
+"""Tests for Streamlit helper functions with edge case coverage."""
+
+import asyncio
+from unittest.mock import MagicMock, Mock, patch
+
+import pytest
+from browser_use_demo.loop import APIProvider
+from browser_use_demo.streamlit import (
+ authenticate,
+ get_or_create_event_loop,
+ setup_state,
+)
+
+
+class TestSetupState:
+ """Test suite for setup_state function."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_fresh_initialization(self, mock_state, mock_environment):
+ """Test setup_state with completely empty session state."""
+ # Simulate empty session state
+ mock_state.__contains__.return_value = False
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+
+ with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
+ setup_state()
+
+ # Check all defaults were set
+ assert "messages" in mock_state.__setitem__.call_args_list[0][0]
+ assert "api_key" in str(mock_state.__setitem__.call_args_list)
+ assert "event_loop" in str(mock_state.__setitem__.call_args_list)
+
+ # Browser tool should be created
+ mock_browser.assert_called_once()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_partial_initialization(self, mock_state):
+ """Test setup_state when some keys already exist."""
+
+ # Simulate partial state
+ existing_keys = ["messages", "api_key"]
+
+ def contains_side_effect(key):
+ return key in existing_keys
+
+ mock_state.__contains__.side_effect = contains_side_effect
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+
+ with patch("browser_use_demo.tools.BrowserTool"):
+ setup_state()
+
+ # Only missing keys should be set
+ set_keys = [call[0][0] for call in mock_state.__setitem__.call_args_list]
+ assert "messages" not in set_keys
+ assert "api_key" not in set_keys
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_missing_env_variables(self, mock_state, clean_environment):
+ """Test setup_state when environment variables are missing."""
+
+ mock_state.__contains__.return_value = False
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+
+ with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
+ setup_state()
+
+ # BrowserTool no longer takes dimensions as arguments
+ mock_browser.assert_called_with()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_lambda_evaluation(self, mock_state, mock_provider):
+ """Test that lambda functions are evaluated correctly."""
+
+ mock_state.__contains__.return_value = False
+ mock_state.provider = mock_provider.ANTHROPIC
+
+ setup_state()
+
+ # Model should be set based on provider
+ model_calls = [
+ call
+ for call in mock_state.__setitem__.call_args_list
+ if call[0][0] == "model"
+ ]
+ assert len(model_calls) > 0
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_browser_tool_error(self, mock_state):
+ """Test setup_state when BrowserTool initialization fails."""
+
+ mock_state.__contains__.return_value = False
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+
+ with patch("browser_use_demo.tools.BrowserTool") as mock_browser:
+ mock_browser.side_effect = Exception("Browser init failed")
+
+ # Should raise the exception
+ with pytest.raises(Exception, match="Browser init failed"):
+ setup_state()
+
+ # Test removed - BrowserTool no longer reads dimensions from environment
+
+
+class TestGetOrCreateEventLoop:
+ """Test suite for get_or_create_event_loop function."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("asyncio.new_event_loop")
+ @patch("asyncio.set_event_loop")
+ def test_create_new_loop_when_none(self, mock_set_loop, mock_new_loop, mock_state):
+ """Test creating new event loop when none exists."""
+
+ mock_state.event_loop = None
+ new_loop = Mock()
+ mock_new_loop.return_value = new_loop
+
+ result = get_or_create_event_loop()
+
+ mock_new_loop.assert_called_once()
+ mock_set_loop.assert_called_once_with(new_loop)
+ assert mock_state.event_loop == new_loop
+ assert result == new_loop
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("asyncio.new_event_loop")
+ @patch("asyncio.set_event_loop")
+ def test_create_new_loop_when_closed(
+ self, mock_set_loop, mock_new_loop, mock_state
+ ):
+ """Test creating new event loop when existing is closed."""
+
+ closed_loop = Mock()
+ closed_loop.is_closed.return_value = True
+ mock_state.event_loop = closed_loop
+
+ new_loop = Mock()
+ mock_new_loop.return_value = new_loop
+
+ result = get_or_create_event_loop()
+
+ mock_new_loop.assert_called_once()
+ mock_set_loop.assert_called_once_with(new_loop)
+ assert mock_state.event_loop == new_loop
+ assert result == new_loop
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("asyncio.new_event_loop")
+ @patch("asyncio.set_event_loop")
+ def test_reuse_existing_open_loop(self, mock_set_loop, mock_new_loop, mock_state):
+ """Test reusing existing open event loop."""
+
+ existing_loop = Mock()
+ existing_loop.is_closed.return_value = False
+ mock_state.event_loop = existing_loop
+
+ result = get_or_create_event_loop()
+
+ mock_new_loop.assert_not_called()
+ mock_set_loop.assert_called_once_with(existing_loop)
+ assert result == existing_loop
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("asyncio.new_event_loop")
+ def test_event_loop_creation_error(self, mock_new_loop, mock_state):
+ """Test handling error during event loop creation."""
+
+ mock_state.event_loop = None
+ mock_new_loop.side_effect = RuntimeError("Cannot create loop")
+
+ with pytest.raises(RuntimeError, match="Cannot create loop"):
+ get_or_create_event_loop()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("asyncio.set_event_loop")
+ def test_set_event_loop_error(self, mock_set_loop, mock_state):
+ """Test handling error when setting event loop."""
+
+ mock_state.event_loop = None
+ mock_set_loop.side_effect = RuntimeError("Cannot set loop")
+
+ with pytest.raises(RuntimeError, match="Cannot set loop"):
+ get_or_create_event_loop()
+
+
+class TestAuthenticate:
+ """Test suite for authenticate function."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.error")
+ @patch("streamlit.stop")
+ def test_authenticate_with_valid_key(
+ self, mock_stop, mock_error, mock_state, mock_provider
+ ):
+ """Test authenticate with valid API key."""
+
+ mock_state.provider = mock_provider.ANTHROPIC
+ mock_state.api_key = "valid-key"
+
+ result = authenticate()
+
+ assert result is True
+ mock_error.assert_not_called()
+ mock_stop.assert_not_called()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.error")
+ @patch("streamlit.stop")
+ def test_authenticate_with_missing_key(
+ self, mock_stop, mock_error, mock_state, mock_provider
+ ):
+ """Test authenticate with missing API key."""
+
+ mock_state.provider = mock_provider.ANTHROPIC
+ mock_state.api_key = ""
+
+ authenticate()
+
+ mock_error.assert_called_once_with(
+ "Please provide your Anthropic API key in the sidebar"
+ )
+ mock_stop.assert_called_once()
+ # Function doesn't return after stop() in real scenario
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("streamlit.error")
+ @patch("streamlit.stop")
+ def test_authenticate_with_none_key(
+ self, mock_stop, mock_error, mock_state, mock_provider
+ ):
+ """Test authenticate with None API key."""
+
+ mock_state.provider = mock_provider.ANTHROPIC
+ mock_state.api_key = None
+
+ authenticate()
+
+ mock_error.assert_called_once()
+ mock_stop.assert_called_once()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_authenticate_non_anthropic_provider(self, mock_state, mock_provider):
+ """Test authenticate with non-Anthropic provider."""
+
+ mock_state.provider = mock_provider.BEDROCK
+ mock_state.api_key = "" # Empty key should be OK for non-Anthropic
+
+ result = authenticate()
+
+ assert result is True
+
+
+class TestEdgeCasesAndErrors:
+ """Test edge cases and error conditions for helper functions."""
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_with_corrupted_state(self, mock_state):
+ """Test setup_state with corrupted session state."""
+
+ # Simulate corrupted state that raises on access
+ mock_state.__contains__.side_effect = Exception("State corrupted")
+
+ with pytest.raises(Exception, match="State corrupted"):
+ setup_state()
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_concurrent_setup_state_calls(self, mock_state):
+ """Test concurrent calls to setup_state."""
+ import threading
+
+ mock_state.__contains__.return_value = False
+ # Set provider to valid enum value so lambda can access it
+ mock_state.provider = APIProvider.ANTHROPIC
+ errors = []
+
+ def run_setup():
+ try:
+ with patch("browser_use_demo.tools.BrowserTool"):
+ setup_state()
+ except Exception as e:
+ errors.append(e)
+
+ threads = [threading.Thread(target=run_setup) for _ in range(5)]
+ for t in threads:
+ t.start()
+ for t in threads:
+ t.join()
+
+ # Should handle concurrent access without crashes
+ assert len(errors) == 0
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ @patch("asyncio.get_event_loop")
+ def test_get_or_create_with_running_loop(self, mock_get_loop, mock_state):
+ """Test get_or_create_event_loop when another loop is running."""
+
+ mock_state.event_loop = None
+ running_loop = Mock(spec=asyncio.AbstractEventLoop)
+ running_loop.is_running.return_value = True
+ mock_get_loop.return_value = running_loop
+
+ # Should create new loop despite running loop exists
+ with patch("asyncio.new_event_loop") as mock_new:
+ new_loop = Mock(spec=asyncio.AbstractEventLoop)
+ new_loop.is_closed.return_value = False
+ mock_new.return_value = new_loop
+
+ with patch("asyncio.set_event_loop"):
+ result = get_or_create_event_loop()
+ assert result == new_loop
+
+ @patch("streamlit.session_state", new_callable=MagicMock)
+ def test_setup_state_with_readonly_state(self, mock_state):
+ """Test setup_state when session state is read-only."""
+
+ mock_state.__contains__.return_value = False
+ mock_state.__setitem__.side_effect = AttributeError("Read-only state")
+
+ with pytest.raises(AttributeError, match="Read-only state"):
+ setup_state()
+
+ # Test removed - BrowserTool no longer reads dimensions from environment
diff --git a/browser-use-demo/validate_env.py b/browser-use-demo/validate_env.py
new file mode 100644
index 000000000..6ccffbb6c
--- /dev/null
+++ b/browser-use-demo/validate_env.py
@@ -0,0 +1,53 @@
+#!/usr/bin/env python3
+"""Validate environment configuration before startup."""
+
+import os
+import sys
+from pathlib import Path
+
+# Import constants for display information
+try:
+ from browser_use_demo.display_constants import BROWSER_WIDTH, BROWSER_HEIGHT, DISPLAY_WIDTH, DISPLAY_HEIGHT
+except ImportError:
+ # Fallback if running outside the package
+ DISPLAY_WIDTH = 1920
+ DISPLAY_HEIGHT = 1080
+ BROWSER_WIDTH = 1920
+ BROWSER_HEIGHT = 1080
+
+
+def validate_env():
+ """Validate required environment variables are set."""
+ # Check API key
+ api_key = os.environ.get("ANTHROPIC_API_KEY")
+
+ if not api_key:
+ print("\n" + "=" * 60)
+ print("ERROR: Missing required configuration!")
+ print("=" * 60)
+ print("\nThe Browser Use Demo requires proper configuration to run.")
+ print("\n🔧 RECOMMENDED: Use docker-compose with a .env file:")
+ print(" 1. Copy the example environment file:")
+ print(" cp .env.example .env")
+ print(" 2. Edit .env and add your Anthropic API key")
+ print(" 3. Run with docker-compose:")
+ print(" docker-compose up --build")
+ print("=" * 60)
+ sys.exit(1)
+
+ if api_key == "your_anthropic_api_key_here" or len(api_key) < 10:
+ print("\n" + "=" * 60)
+ print("ERROR: Invalid API key!")
+ print("=" * 60)
+ print(" ANTHROPIC_API_KEY: Must be a valid API key")
+ print("\nTo fix this, please edit your .env file with a valid API key")
+ print("=" * 60)
+ sys.exit(1)
+
+ print("\n✓ Environment validation passed")
+ print(f" Display: {DISPLAY_WIDTH}x{DISPLAY_HEIGHT}")
+ print(f" Browser: {BROWSER_WIDTH}x{BROWSER_HEIGHT}")
+
+
+if __name__ == "__main__":
+ validate_env()
diff --git a/computer-use-demo/CONTRIBUTING.md b/computer-use-demo/CONTRIBUTING.md
new file mode 100644
index 000000000..63b8dfbca
--- /dev/null
+++ b/computer-use-demo/CONTRIBUTING.md
@@ -0,0 +1,129 @@
+# Contributing Guidelines
+
+Thank you for your interest in contributing to the Computer Use Demo quickstart! This document outlines the process and guidelines for contributing to this project.
+
+## Code of Conduct
+
+Please be respectful and constructive in all interactions. We aim to maintain a welcoming and inclusive environment for all contributors.
+
+## Contribution Policy
+
+- Bugfixes and updates to our documentation that address correctness issues are always welcome
+- Feature additions, refactors, and documentation updates beyond the scope of correctness (major changes) are accepted at the sole determination of the maintainers. We require each major change to be submitted in a separate Pull Request. We will assess new features under the following criteria:
+ - Adherence to coding standards
+ - Ease of use as a reference implementation
+ - User experience
+ - Applicability to a wide audience of developers
+ - Minimization of third-party dependencies
+ - Does not promote a product or service
+
+Please open a github issue if you are need clarification on this policy or you want to discuss a new feature addition.
+
+## Development Setup
+
+1. Create and activate a Python virtual environment:
+
+ ```bash
+ python -m venv .venv
+ source .venv/bin/activate # On Unix
+ # or
+ .venv\Scripts\activate # On Windows
+ ```
+
+2. Install development dependencies:
+
+ ```bash
+ pip install -r dev-requirements.txt
+ ```
+
+3. Install pre-commit hooks:
+ ```bash
+ pre-commit install
+ ```
+
+## Development Process
+
+1. Fork the repository and create a branch for your changes
+2. Make your changes following our coding standards
+3. Submit a pull request with a clear description of the changes
+
+## Coding Standards
+
+- Use clear, descriptive variable and function names
+- Follow PEP 8 style guidelines for Python code
+- Keep functions focused and single-purpose
+- Avoid inline comments - code should be self-documenting
+- Use type hints for all Python functions
+- Use dataclasses for structured data (see `tools/base.py` for examples)
+- All tools must inherit from `BaseAnthropicTool` and implement required methods
+- Use abstract base classes (ABC) for defining interfaces
+- Handle errors using `ToolError` and `ToolFailure` classes
+
+## Code Quality Tools
+
+We use several tools to maintain code quality:
+
+- **Ruff**: For linting and formatting
+ - Run `ruff check .` for linting
+ - Run `ruff format .` for formatting
+ - See `ruff.toml` for enabled rules
+- **Pyright**: For type checking
+ - Configuration in `pyproject.toml`
+- **Pre-commit**: For automated checks before commits
+
+## Testing
+
+- Add tests for new functionality in the `tests/` directory
+- Follow existing test patterns (see `tests/tools/` for examples)
+- Use pytest fixtures where appropriate
+- Run tests with:
+ ```bash
+ pytest
+ ```
+- Tests must pass in async mode (configured in pyproject.toml)
+
+## Commit Guidelines
+
+- All commits MUST be signed (use `git commit -S`)
+- Write clear, descriptive commit messages
+- Use present tense ("Add feature" not "Added feature")
+- Reference issue numbers when applicable
+
+## Pull Request Process
+
+1. Update documentation as needed
+2. Add tests for new functionality
+3. Ensure all checks pass:
+ - All tests pass
+ - Ruff linting passes
+ - Type checking passes
+ - Pre-commit hooks pass
+4. Request review from maintainers
+5. Address review feedback
+
+## Tool Development
+
+When creating new tools:
+
+1. Inherit from `BaseAnthropicTool`
+2. Implement `__call__` and `to_params` methods
+3. Use appropriate result types (`ToolResult`, `CLIResult`, or `ToolFailure`)
+4. Add comprehensive tests
+5. Document parameters and return types
+
+## Documentation
+
+- Keep README.md up to date
+- Document new features and changes
+- Use clear, concise language
+- Include docstrings for all public classes and methods
+- Use concise, single-line docstrings for simple functions
+- For complex functions, include:
+ - A brief description
+ - Args/parameters if not obvious
+ - Return value if not obvious
+ - Any important notes about behavior
+
+## Questions?
+
+If you have questions, please open an issue for discussion.
diff --git a/computer-use-demo/Dockerfile b/computer-use-demo/Dockerfile
index f3b00255a..8a0e9b472 100644
--- a/computer-use-demo/Dockerfile
+++ b/computer-use-demo/Dockerfile
@@ -6,7 +6,6 @@ ENV DEBIAN_PRIORITY=high
RUN apt-get update && \
apt-get -y upgrade && \
apt-get -y install \
- build-essential \
# UI Requirements
xvfb \
xterm \
diff --git a/computer-use-demo/README.md b/computer-use-demo/README.md
index 2b896da88..6e05b255c 100644
--- a/computer-use-demo/README.md
+++ b/computer-use-demo/README.md
@@ -1,5 +1,8 @@
# Anthropic Computer Use Demo
+> [!NOTE]
+> Now featuring support for the new Claude 4 models! The latest Claude Opus 4.5 (claude-opus-4-5-20251101) is now available, along with Claude Sonnet 4.5 (claude-sonnet-4-5-20250929), Claude Sonnet 4 (claude-sonnet-4-20250514), Claude Opus 4 (claude-opus-4-20250514), and Claude Haiku 4.5 (claude-haiku-4-5-20251001). These models bring next-generation capabilities with the updated str_replace_based_edit_tool that replaces the previous str_replace_editor tool. The undo_edit command has been removed in this latest version for a more streamlined experience.
+
> [!CAUTION]
> Computer use is a beta feature. Please be aware that computer use poses unique risks that are distinct from standard API features or chat interfaces. These risks are heightened when using computer use to interact with the internet. To minimize risks, consider taking precautions such as:
>
@@ -14,25 +17,25 @@
This repository helps you get started with computer use on Claude, with reference implementations of:
-* Build files to create a Docker container with all necessary dependencies
-* A computer use agent loop using the Anthropic API, Bedrock, or Vertex to access the updated Claude 3.5 Sonnet model
-* Anthropic-defined computer use tools
-* A streamlit app for interacting with the agent loop
+- Build files to create a Docker container with all necessary dependencies
+- A computer use agent loop using the Claude API, Bedrock, or Vertex to access Claude Opus 4.5, Claude Sonnet 4.5, Claude Sonnet 4, Claude Opus 4, Claude Haiku 4.5, Claude 3.7 Sonnet, and Claude 3.5 Sonnet models
+- Anthropic-defined computer use tools
+- A streamlit app for interacting with the agent loop
Please use [this form](https://forms.gle/BT1hpBrqDPDUrCqo7) to provide feedback on the quality of the model responses, the API itself, or the quality of the documentation - we cannot wait to hear from you!
> [!IMPORTANT]
-> The Beta API used in this reference implementation is subject to change. Please refer to the [API release notes](https://docs.anthropic.com/en/release-notes/api) for the most up-to-date information.
+> The Beta API used in this reference implementation is subject to change. Please refer to the [API release notes](https://docs.claude.com/en/release-notes/api) for the most up-to-date information.
> [!IMPORTANT]
> The components are weakly separated: the agent loop runs in the container being controlled by Claude, can only be used by one session at a time, and must be restarted or reset between sessions if necessary.
## Quickstart: running the Docker container
-### Anthropic API
+### Claude API
> [!TIP]
-> You can find your API key in the [Anthropic Console](https://console.anthropic.com/).
+> You can find your API key in the [Claude Console](https://console.anthropic.com/).
```bash
export ANTHROPIC_API_KEY=%your_api_key%
@@ -51,10 +54,9 @@ Once the container is running, see the [Accessing the demo app](#accessing-the-d
### Bedrock
> [!TIP]
-> To use the new Claude 3.5 Sonnet on Bedrock, you first need to [request model access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access-modify.html).
+> To use the new Claude 3.7 Sonnet on Bedrock, you first need to [request model access](https://docs.aws.amazon.com/bedrock/latest/userguide/model-access-modify.html).
You'll need to pass in AWS credentials with appropriate permissions to use Claude on Bedrock.
-
You have a few options for authenticating with Bedrock. See the [boto3 documentation](https://boto3.amazonaws.com/v1/documentation/api/latest/guide/credentials.html#environment-variables) for more details and options.
#### Option 1: (suggested) Use the host's AWS credentials file and AWS profile
@@ -122,7 +124,6 @@ docker run \
Once the container is running, see the [Accessing the demo app](#accessing-the-demo-app) section below for instructions on how to connect to the interface.
This example shows how to use the Google Cloud Application Default Credentials to authenticate with Vertex.
-
You can also set `GOOGLE_APPLICATION_CREDENTIALS` to use an arbitrary credential file, see the [Google Cloud Authentication documentation](https://cloud.google.com/docs/authentication/application-default-credentials#GAC) for more details.
### Accessing the demo app
@@ -154,9 +155,14 @@ docker run \
-it ghcr.io/anthropics/anthropic-quickstarts:computer-use-demo-latest
```
-We do not recommend sending screenshots in resolutions above [XGA/WXGA](https://en.wikipedia.org/wiki/Display_resolution_standards#XGA) to avoid issues related to [image resizing](https://docs.anthropic.com/en/docs/build-with-claude/vision#evaluate-image-size).
+We do not recommend sending screenshots in resolutions above [XGA/WXGA](https://en.wikipedia.org/wiki/Display_resolution_standards#XGA) to avoid issues related to [image resizing](https://docs.claude.com/en/docs/build-with-claude/vision#evaluate-image-size).
Relying on the image resizing behavior in the API will result in lower model accuracy and slower performance than implementing scaling in your tools directly. The `computer` tool implementation in this project demonstrates how to scale both images and coordinates from higher resolutions to the suggested resolutions.
+When implementing computer use yourself, we recommend using XGA resolution (1024x768):
+
+- For higher resolutions: Scale the image down to XGA and let the model interact with this scaled version, then map the coordinates back to the original resolution proportionally.
+- For lower resolutions or smaller devices (e.g. mobile devices): Add black padding around the display area until it reaches 1024x768.
+
## Development
```bash
diff --git a/computer-use-demo/computer_use_demo/loop.py b/computer-use-demo/computer_use_demo/loop.py
index 9db525a39..0a51d84ea 100644
--- a/computer-use-demo/computer_use_demo/loop.py
+++ b/computer-use-demo/computer_use_demo/loop.py
@@ -1,5 +1,5 @@
"""
-Agentic sampling loop that calls the Anthropic API and local implementation of anthropic-defined computer use tools.
+Agentic sampling loop that calls the Claude API and local implementation of anthropic-defined computer use tools.
"""
import platform
@@ -29,9 +29,13 @@
BetaToolUseBlockParam,
)
-from .tools import BashTool, ComputerTool, EditTool, ToolCollection, ToolResult
+from .tools import (
+ TOOL_GROUPS_BY_VERSION,
+ ToolCollection,
+ ToolResult,
+ ToolVersion,
+)
-COMPUTER_USE_BETA_FLAG = "computer-use-2024-10-22"
PROMPT_CACHING_BETA_FLAG = "prompt-caching-2024-07-31"
@@ -41,13 +45,6 @@ class APIProvider(StrEnum):
VERTEX = "vertex"
-PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
- APIProvider.ANTHROPIC: "claude-3-5-sonnet-20241022",
- APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
- APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
-}
-
-
# This system prompt is optimized for the Docker environment in this repository and
# specific tool combinations enabled.
# We encourage modifying this system prompt to ensure the model has context for the
@@ -58,15 +55,15 @@ class APIProvider(StrEnum):
* You can feel free to install Ubuntu applications with your bash tool. Use curl instead of wget.
* To open firefox, please just click on the firefox icon. Note, firefox-esr is what is installed on your system.
* Using bash tool you can start GUI applications, but you need to set export DISPLAY=:1 and use a subshell. For example "(DISPLAY=:1 xterm &)". GUI apps run with bash tool will appear within your desktop environment, but they may take some time to appear. Take a screenshot to confirm it did.
-* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_editor or `grep -n -B -A ` to confirm output.
+* When using your bash tool with commands that are expected to output very large quantities of text, redirect into a tmp file and use str_replace_based_edit_tool or `grep -n -B -A ` to confirm output.
* When viewing a page it can be helpful to zoom out so that you can see everything on the page. Either that, or make sure you scroll down to see everything before deciding something isn't available.
* When using your computer function calls, they take a while to run and send back to you. Where possible/feasible, try to chain multiple of these calls all into one function calls request.
-* The current date is {datetime.today().strftime('%A, %B %-d, %Y')}.
+* The current date is {datetime.today().strftime("%A, %B %-d, %Y")}.
* When using Firefox, if a startup wizard appears, IGNORE IT. Do not even click "skip this step". Instead, click on the address bar where it says "Search or enter address", and enter the appropriate search term or URL there.
-* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your StrReplaceEditTool.
+* If the item you are looking at is a pdf, if after taking a single screenshot of the pdf it seems that you want to read the entire document instead of trying to continue to read the pdf from your screenshots + navigation, determine the URL, use curl to download the pdf, install and use pdftotext to convert it to a text file, and then read that text file directly with your str_replace_based_edit_tool.
"""
@@ -84,15 +81,15 @@ async def sampling_loop(
api_key: str,
only_n_most_recent_images: int | None = None,
max_tokens: int = 4096,
+ tool_version: ToolVersion,
+ thinking_budget: int | None = None,
+ token_efficient_tools_beta: bool = False,
):
"""
Agentic sampling loop for the assistant/tool interaction of computer use.
"""
- tool_collection = ToolCollection(
- ComputerTool(),
- BashTool(),
- EditTool(),
- )
+ tool_group = TOOL_GROUPS_BY_VERSION[tool_version]
+ tool_collection = ToolCollection(*(ToolCls() for ToolCls in tool_group.tools))
system = BetaTextBlockParam(
type="text",
text=f"{SYSTEM_PROMPT}{' ' + system_prompt_suffix if system_prompt_suffix else ''}",
@@ -100,10 +97,12 @@ async def sampling_loop(
while True:
enable_prompt_caching = False
- betas = [COMPUTER_USE_BETA_FLAG]
- image_truncation_threshold = 10
+ betas = [tool_group.beta_flag] if tool_group.beta_flag else []
+ if token_efficient_tools_beta:
+ betas.append("token-efficient-tools-2025-02-19")
+ image_truncation_threshold = only_n_most_recent_images or 0
if provider == APIProvider.ANTHROPIC:
- client = Anthropic(api_key=api_key)
+ client = Anthropic(api_key=api_key, max_retries=4)
enable_prompt_caching = True
elif provider == APIProvider.VERTEX:
client = AnthropicVertex()
@@ -113,9 +112,11 @@ async def sampling_loop(
if enable_prompt_caching:
betas.append(PROMPT_CACHING_BETA_FLAG)
_inject_prompt_caching(messages)
- # Is it ever worth it to bust the cache with prompt caching?
- image_truncation_threshold = 50
- system["cache_control"] = {"type": "ephemeral"}
+ # Because cached reads are 10% of the price, we don't think it's
+ # ever sensible to break the cache by truncating images
+ only_n_most_recent_images = 0
+ # Use type ignore to bypass TypedDict check until SDK types are updated
+ system["cache_control"] = {"type": "ephemeral"} # type: ignore
if only_n_most_recent_images:
_maybe_filter_to_n_most_recent_images(
@@ -123,6 +124,12 @@ async def sampling_loop(
only_n_most_recent_images,
min_removal_threshold=image_truncation_threshold,
)
+ extra_body = {}
+ if thinking_budget:
+ # Ensure we only send the required fields for thinking
+ extra_body = {
+ "thinking": {"type": "enabled", "budget_tokens": thinking_budget}
+ }
# Call the API
# we use raw_response to provide debug information to streamlit. Your
@@ -136,6 +143,7 @@ async def sampling_loop(
system=[system],
tools=tool_collection.to_params(),
betas=betas,
+ extra_body=extra_body,
)
except (APIStatusError, APIResponseValidationError) as e:
api_response_callback(e.request, e.response, e)
@@ -161,15 +169,20 @@ async def sampling_loop(
tool_result_content: list[BetaToolResultBlockParam] = []
for content_block in response_params:
output_callback(content_block)
- if content_block["type"] == "tool_use":
+ if (
+ isinstance(content_block, dict)
+ and content_block.get("type") == "tool_use"
+ ):
+ # Type narrowing for tool use blocks
+ tool_use_block = cast(BetaToolUseBlockParam, content_block)
result = await tool_collection.run(
- name=content_block["name"],
- tool_input=cast(dict[str, Any], content_block["input"]),
+ name=tool_use_block["name"],
+ tool_input=cast(dict[str, Any], tool_use_block.get("input", {})),
)
tool_result_content.append(
- _make_api_tool_result(result, content_block["id"])
+ _make_api_tool_result(result, tool_use_block["id"])
)
- tool_output_callback(result, content_block["id"])
+ tool_output_callback(result, tool_use_block["id"])
if not tool_result_content:
return messages
@@ -228,12 +241,23 @@ def _maybe_filter_to_n_most_recent_images(
def _response_to_params(
response: BetaMessage,
-) -> list[BetaTextBlockParam | BetaToolUseBlockParam]:
- res: list[BetaTextBlockParam | BetaToolUseBlockParam] = []
+) -> list[BetaContentBlockParam]:
+ res: list[BetaContentBlockParam] = []
for block in response.content:
if isinstance(block, BetaTextBlock):
- res.append({"type": "text", "text": block.text})
+ if block.text:
+ res.append(BetaTextBlockParam(type="text", text=block.text))
+ elif getattr(block, "type", None) == "thinking":
+ # Handle thinking blocks - include signature field
+ thinking_block = {
+ "type": "thinking",
+ "thinking": getattr(block, "thinking", None),
+ }
+ if hasattr(block, "signature"):
+ thinking_block["signature"] = getattr(block, "signature", None)
+ res.append(cast(BetaContentBlockParam, thinking_block))
else:
+ # Handle tool use blocks normally
res.append(cast(BetaToolUseBlockParam, block.model_dump()))
return res
@@ -253,11 +277,13 @@ def _inject_prompt_caching(
):
if breakpoints_remaining:
breakpoints_remaining -= 1
- content[-1]["cache_control"] = BetaCacheControlEphemeralParam(
+ # Use type ignore to bypass TypedDict check until SDK types are updated
+ content[-1]["cache_control"] = BetaCacheControlEphemeralParam( # type: ignore
{"type": "ephemeral"}
)
else:
- content[-1].pop("cache_control", None)
+ if isinstance(content[-1], dict) and "cache_control" in content[-1]:
+ del content[-1]["cache_control"] # type: ignore
# we'll only every have one extra turn per loop
break
diff --git a/computer-use-demo/computer_use_demo/requirements.txt b/computer-use-demo/computer_use_demo/requirements.txt
index 115da8df3..510e1c915 100644
--- a/computer-use-demo/computer_use_demo/requirements.txt
+++ b/computer-use-demo/computer_use_demo/requirements.txt
@@ -1,5 +1,5 @@
-streamlit>=1.38.0
-anthropic[bedrock,vertex]>=0.37.1
+streamlit==1.41.0
+anthropic[bedrock,vertex]>=0.39.0
jsonschema==4.22.0
boto3>=1.28.57
google-auth<3,>=2
diff --git a/computer-use-demo/computer_use_demo/streamlit.py b/computer-use-demo/computer_use_demo/streamlit.py
index 45318ed91..a3428d882 100644
--- a/computer-use-demo/computer_use_demo/streamlit.py
+++ b/computer-use-demo/computer_use_demo/streamlit.py
@@ -7,11 +7,13 @@
import os
import subprocess
import traceback
+from contextlib import contextmanager
+from dataclasses import dataclass
from datetime import datetime, timedelta
from enum import StrEnum
from functools import partial
from pathlib import PosixPath
-from typing import cast
+from typing import cast, get_args
import httpx
import streamlit as st
@@ -19,24 +21,84 @@
from anthropic.types.beta import (
BetaContentBlockParam,
BetaTextBlockParam,
+ BetaToolResultBlockParam,
)
from streamlit.delta_generator import DeltaGenerator
from computer_use_demo.loop import (
- PROVIDER_TO_DEFAULT_MODEL_NAME,
APIProvider,
sampling_loop,
)
-from computer_use_demo.tools import ToolResult
+from computer_use_demo.tools import ToolResult, ToolVersion
+
+PROVIDER_TO_DEFAULT_MODEL_NAME: dict[APIProvider, str] = {
+ APIProvider.ANTHROPIC: "claude-sonnet-4-5-20250929",
+ APIProvider.BEDROCK: "anthropic.claude-3-5-sonnet-20241022-v2:0",
+ APIProvider.VERTEX: "claude-3-5-sonnet-v2@20241022",
+}
+
+
+@dataclass(kw_only=True, frozen=True)
+class ModelConfig:
+ tool_version: ToolVersion
+ max_output_tokens: int
+ default_output_tokens: int
+ has_thinking: bool = False
+
+
+CLAUDE_4 = ModelConfig(
+ tool_version="computer_use_20250429",
+ max_output_tokens=64_000,
+ default_output_tokens=1024 * 16,
+ has_thinking=True,
+)
+
+CLAUDE_4_5 = ModelConfig(
+ tool_version="computer_use_20250124",
+ max_output_tokens=128_000,
+ default_output_tokens=1024 * 16,
+ has_thinking=True,
+)
+
+CLAUDE_4_WITH_ZOOMABLE_TOOL = ModelConfig(
+ tool_version="computer_use_20251124",
+ max_output_tokens=64_000,
+ default_output_tokens=1024 * 16,
+ has_thinking=True,
+)
+
+HAIKU_4_5 = ModelConfig(
+ tool_version="computer_use_20250124",
+ max_output_tokens=1024 * 8,
+ default_output_tokens=1024 * 4,
+ has_thinking=False,
+)
+
+MODEL_TO_MODEL_CONF: dict[str, ModelConfig] = {
+ "claude-opus-4-1-20250805": CLAUDE_4,
+ "claude-sonnet-4-20250514": CLAUDE_4,
+ "claude-opus-4-20250514": CLAUDE_4,
+ "claude-sonnet-4-5-20250929": CLAUDE_4_5,
+ "anthropic.claude-sonnet-4-5-20250929-v1:0": CLAUDE_4_5,
+ "claude-sonnet-4-5@20250929": CLAUDE_4_5,
+ "claude-haiku-4-5-20251001": HAIKU_4_5,
+ "anthropic.claude-haiku-4-5-20251001-v1:0": HAIKU_4_5, # Bedrock
+ "claude-haiku-4-5@20251001": HAIKU_4_5, # Vertex
+ "claude-opus-4-5-20251101": CLAUDE_4_WITH_ZOOMABLE_TOOL,
+}
CONFIG_DIR = PosixPath("~/.anthropic").expanduser()
API_KEY_FILE = CONFIG_DIR / "api_key"
STREAMLIT_STYLE = """