diff --git a/requirements.txt b/requirements.txt
index f7055242..016dc7fe 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -8,3 +8,4 @@ langchain-ibm==0.3.10
 langchain_mcp_adapters==0.0.9
 langgraph==0.3.34
 langchain-community
+lxml[html_clean]==4.9.3
\ No newline at end of file
diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py
index 00e050c5..6ca3bb13 100644
--- a/src/controller/custom_controller.py
+++ b/src/controller/custom_controller.py
@@ -106,6 +106,26 @@ async def upload_file(index: int, path: str, browser: BrowserContext, available_
                 logger.info(msg)
                 return ActionResult(error=msg)
 
+        # --- Dynamically register skills defined in `src.custom_skills` ---
+        # This allows JSON scenarios to call actions like "goto", "clickCss", etc.
+        from src.custom_skills import CUSTOM_SKILLS
+        from typing import Any, Dict
+
+        # NOTE(review): the skills read `ctx.page` and return plain dicts, while
+        # the action above returns ActionResult - confirm the registry accepts
+        # this wrapper signature/return type and that BrowserContext has `.page`.
+        for _skill_name, _skill_func in CUSTOM_SKILLS.items():
+            # Create a wrapper that adapts the controller action signature
+            # Add explicit type annotations to avoid Pydantic JSON schema errors
+            # Avoid leading underscore in function name to prevent Pydantic errors
+            async def skill_wrapper(*args: Any, browser: BrowserContext, _func=_skill_func) -> Dict[str, Any]:
+                # Original skill expects (ctx, args_list)
+                return await _func(browser, list(args))
+
+            # Avoid overwriting if an action with the same name already exists
+            if _skill_name not in self.registry.registry.actions:
+                self.registry.action(_skill_name)(skill_wrapper)
+
     @time_execution_sync('--act')
     async def act(
             self,
diff --git a/src/custom_skills/__init__.py b/src/custom_skills/__init__.py
new file mode 100644
index 00000000..4f9df6c1
--- /dev/null
+++ b/src/custom_skills/__init__.py
@@ -0,0 +1,117 @@
+from typing import Dict, Any, Callable, Awaitable, List, Optional
+from .click_helper import ClickHelper
+
+# Dictionary to store registered skills
+CUSTOM_SKILLS: Dict[str, Callable[..., Awaitable[Dict[str, Any]]]] = {}
+
+def register_skill(name: str):
+    """Decorator that stores the decorated function in CUSTOM_SKILLS under `name`"""
+    def decorator(func):
+        CUSTOM_SKILLS[name] = func
+        return func
+    return decorator
+
+def get_custom_skill(name: str) -> Optional[Callable[..., Awaitable[Dict[str, Any]]]]:
+    """Get a registered custom skill by name, or None if no such skill exists"""
+    return CUSTOM_SKILLS.get(name)
+
+# Register our click helper as a skill
+@register_skill("clickCss")
+async def click_css_skill(ctx, args: List[Any]) -> Dict[str, Any]:
+    """
+    Skill to click an element by CSS selector with reliability enhancements.
+
+    Args:
+        ctx: Browser context
+        args: List containing [selector, x (optional), y (optional)]
+
+    Returns:
+        Dict with status and message
+    """
+    return await ClickHelper.click_css(ctx, args)
+
+# Register a goto skill to match the example
+@register_skill("goto")
+async def goto_skill(ctx, args: List[Any]) -> Dict[str, Any]:
+    """
+    Navigate to a URL.
+
+    Args:
+        ctx: Browser context
+        args: List containing [url]
+    """
+    if not args:
+        return {"status": "error", "message": "Missing required argument: url"}
+
+    try:
+        url = args[0]
+        await ctx.page.goto(url, timeout=30000)
+        return {"status": "success", "message": f"Navigated to {url}"}
+    except Exception as e:
+        return {"status": "error", "message": f"Failed to navigate to {args[0]}: {str(e)}"}
+
+# Register a waitFor skill
+@register_skill("waitFor")
+async def wait_for_skill(ctx, args: List[Any]) -> Dict[str, Any]:
+    """
+    Wait for an element to reach a specific state.
+
+    Args:
+        ctx: Browser context
+        args: List containing [selector, state, timeout_ms (optional, default 5000)]
+    """
+    if not args or len(args) < 2:
+        return {"status": "error", "message": "Missing required arguments: selector and state"}
+
+    selector = args[0]
+    state = args[1]  # 'attached', 'detached', 'visible', 'hidden'
+    timeout = args[2] if len(args) > 2 else 5000  # Default 5 seconds
+
+    try:
+        # Every state goes through the same Playwright call, so no per-state
+        # branching is needed; unknown states are rejected with a clear error.
+        if state not in ('attached', 'detached', 'visible', 'hidden'):
+            return {"status": "error", "message": f"Invalid state: {state}"}
+
+        await ctx.page.wait_for_selector(selector, state=state, timeout=timeout)
+
+        return {"status": "success", "message": f"Element {selector} reached state {state}"}
+    except Exception as e:
+        return {"status": "error", "message": f"Wait for {selector} {state} failed: {str(e)}"}
+
+# Register a waitForUrl skill
+@register_skill("waitForUrl")
+async def wait_for_url_skill(ctx, args: List[Any]) -> Dict[str, Any]:
+    """
+    Wait for the URL to contain a specific string.
+
+    Args:
+        ctx: Browser context
+        args: List containing [url_fragment, timeout_ms (optional, default 7000)]
+    """
+    if not args:
+        return {"status": "error", "message": "Missing required argument: url_fragment"}
+
+    url_fragment = args[0]
+    timeout = args[1] if len(args) > 1 else 7000  # Default 7 seconds
+
+    try:
+        # Playwright globs: "*" does not cross "/", so "**" is required to
+        # match the fragment anywhere in the full URL.
+        await ctx.page.wait_for_url(
+            f"**{url_fragment}**", timeout=timeout, wait_until="networkidle"
+        )
+        return {"status": "success", "message": f"URL contains {url_fragment}"}
+    except Exception as e:
+        return {"status": "error", "message": f"URL did not contain {url_fragment}: {str(e)}"}
+
+# Export all skills
+__all__ = [
+    'CUSTOM_SKILLS',
+    'register_skill',
+    'get_custom_skill',
+    'click_css_skill',
+    'goto_skill',
+    'wait_for_skill',
+    'wait_for_url_skill'
+]
diff --git a/src/custom_skills/click_helper.py b/src/custom_skills/click_helper.py
new file mode 100644
index 00000000..e66bf50f
--- /dev/null
+++ b/src/custom_skills/click_helper.py
@@ -0,0 +1,37 @@
+from typing import Any, Dict, List, Optional
+from dataclasses import dataclass
+
+@dataclass
+class ClickHelper:
+    """Helper class for reliable clicking with hover"""
+
+    @staticmethod
+    async def click_css(ctx, args: List[Any]) -> Dict[str, Any]:
+        """
+        Click an element by CSS selector with reliability enhancements.
+
+        Args:
+            ctx: The browser context
+            args: List containing [selector, x (optional), y (optional)]
+
+        Returns:
+            Dict with status and message
+        """
+        if not args:
+            return {"status": "error", "message": "Missing required argument: selector"}
+
+        selector = args[0]
+        x = args[1] if len(args) > 1 else 5  # Click offset inside the element; defaults to 5
+        y = args[2] if len(args) > 2 else 5
+
+        try:
+            page = ctx.page
+            el = page.locator(selector)
+            await el.wait_for(state='visible', timeout=5000)
+            await page.mouse.move(20, 20)  # Move to neutral position
+            await el.hover()  # Hover over the element
+            await el.click(position={'x': x, 'y': y}, force=True)
+
+            return {"status": "success", "message": f"Clicked {selector}"}
+        except Exception as e:
+            return {"status": "error", "message": f"Failed to click {selector}: {str(e)}"}
diff --git a/src/docs/custom_skills_integration.md b/src/docs/custom_skills_integration.md
new file mode 100644
index 00000000..b3ac452b
--- /dev/null
+++ b/src/docs/custom_skills_integration.md
@@ -0,0 +1,112 @@
+# Custom Skills Integration for Browser Use Agent
+
+This document outlines the changes made to enable deterministic execution of JSON-based browser automation scenarios in the Browser Use Agent.
+
+## Problem Statement
+
+The Browser Use Agent initially did not properly register custom skills (such as `goto`, `clickCss`, `waitFor`, `waitForUrl`) for use in JSON scenarios. When attempting to run JSON scenarios with these skills, the agent would fall back to built-in actions or fail to execute the steps correctly.
+
+## Solution
+
+We implemented dynamic registration of custom skills as controller actions to enable deterministic execution of JSON-based browser automation scenarios.
+
+### Key Changes
+
+1. 
**Dynamic Registration of Custom Skills** + + In `src/controller/custom_controller.py`, we added code to dynamically register all skills from `src.custom_skills` as controller actions: + + ```python + # This allows JSON scenarios to call actions like "goto", "clickCss", etc. + from src.custom_skills import CUSTOM_SKILLS + from typing import Any, Dict + from playwright.async_api import BrowserContext + + for _skill_name, _skill_func in CUSTOM_SKILLS.items(): + # Create a wrapper that adapts the controller action signature + # Add explicit type annotations to avoid Pydantic JSON schema errors + # Avoid leading underscore in function name to prevent Pydantic errors + async def skill_wrapper(*args: Any, browser: BrowserContext, _func=_skill_func) -> Dict[str, Any]: + # Original skill expects (ctx, args_list) + return await _func(browser, list(args)) + + # Avoid overwriting if an action with the same name already exists + if _skill_name not in self.registry.registry.actions: + self.registry.action(_skill_name)(skill_wrapper) + ``` + +2. **Fixed Pydantic JSON Schema Error** + + We encountered two issues with Pydantic when generating JSON schemas for the dynamically created wrapper functions: + + - **Missing Type Annotations**: Added proper type annotations to the wrapper function + - **Leading Underscore in Function Name**: Renamed the function from `_wrapper` to `skill_wrapper` to avoid the Pydantic error: "Fields must not use names with leading underscores" + +## Usage + +With these changes, JSON scenarios can now use custom skills directly. 
Here's an example scenario that navigates through Wikipedia and compares language counts (it also uses an `evaluate` step, which is not one of the skills registered in `src/custom_skills/__init__.py` and must be provided separately): + +```json +{ + "steps": [ + { + "skill": "goto", + "args": ["https://www.wikipedia.org/"] + }, + { + "skill": "waitForUrl", + "args": ["wikipedia.org", 10000] + }, + { + "skill": "waitFor", + "args": [".central-featured", "visible", 5000] + }, + { + "skill": "evaluate", + "args": [ + "(() => { const languages = document.querySelectorAll('.central-featured-lang'); const count = languages.length; localStorage.setItem('languageCount', count); return { mainPageLanguageCount: count }; })()" + ] + }, + { + "skill": "waitFor", + "args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']", "visible", 5000] + }, + { + "skill": "clickCss", + "args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']"] + }, + { + "skill": "waitForUrl", + "args": ["meta.wikimedia.org/wiki/List_of_Wikipedias", 10000] + }, + { + "skill": "waitFor", + "args": [".wikitable", "visible", 5000] + }, + { + "skill": "evaluate", + "args": [ + "(() => { const rows = document.querySelectorAll('.wikitable tr'); const activeLanguages = Array.from(rows).filter(row => row.querySelector('td')).length; const mainPageCount = localStorage.getItem('languageCount'); return { mainPageLanguageCount: mainPageCount, listPageLanguageCount: activeLanguages, note: 'The counts may differ as the main page shows top languages while the list page shows all languages' }; })()" + ] + } + ] +} +``` + +## Available Custom Skills + +The following custom skills are now available for use in JSON scenarios (arguments are passed positionally in `args`): + +- **goto**: Navigate to a URL (`[url]`) +- **clickCss**: Click an element by CSS selector (`[selector, x (optional), y (optional)]`) +- **waitFor**: Wait for an element to reach a specific state (`[selector, state, timeout_ms (optional)]`, where `state` is `attached`, `detached`, `visible` or `hidden`) +- **waitForUrl**: Wait for the URL to contain a specific string (`[url_fragment, timeout_ms (optional)]`) + +## Troubleshooting + +If you encounter issues with JSON scenarios: + +1. Check the browser console for errors +2. Verify that selectors are correct for the target website +3. Adjust timeout values if elements take longer to load +4. 
Use the `evaluate` skill to debug page state with JavaScript
diff --git a/src/test_scenarios/reliable_click_demo.json b/src/test_scenarios/reliable_click_demo.json
new file mode 100644
index 00000000..158a90d2
--- /dev/null
+++ b/src/test_scenarios/reliable_click_demo.json
@@ -0,0 +1,24 @@
+{
+  "name": "Reliable Click Demo",
+  "description": "Demonstrates using the reliable_click custom skill",
+  "steps": [
+    {
+      "action": "navigate_to",
+      "url": "https://torath.gov.eg"
+    },
+    {
+      "action": "reliable_click",
+      "selector": "a[href='/about']",
+      "x": 10,
+      "y": 10
+    },
+    {
+      "action": "wait_for_navigation",
+      "timeout": 5000
+    },
+    {
+      "action": "assert_url_contains",
+      "expected": "/about"
+    }
+  ]
+}
diff --git a/tests/test_skill_integration.py b/tests/test_skill_integration.py
new file mode 100644
index 00000000..7d010840
--- /dev/null
+++ b/tests/test_skill_integration.py
@@ -0,0 +1,198 @@
+"""
+Integration tests for custom skills with the browser-use agent.
+"""
+import asyncio
+import json
+import logging
+import os
+from pathlib import Path
+from unittest.mock import AsyncMock, MagicMock, patch
+
+import pytest
+from browser_use.agent.views import AgentHistoryList
+from browser_use.browser.browser import Browser
+from browser_use.browser.context import BrowserContext, BrowserContextConfig
+
+# Add the project root to the Python path
+import sys
+sys.path.append(str(Path(__file__).parent.parent))
+
+from src.agent.browser_use.browser_use_agent import BrowserUseAgent
+from src.browser.custom_browser import CustomBrowser
+from src.browser.skill_registry import SkillRegistry
+
+# Configure logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+@pytest.fixture
+def mock_browser():
+    """Create a mock browser instance."""
+    browser = MagicMock(spec=Browser)
+    browser.new_context.return_value = AsyncMock(spec=BrowserContext)
+    return browser
+
+@pytest.fixture
+def mock_context():
+    """Create a mock browser context whose `page` mimics the calls the skills make."""
+    context = AsyncMock(spec=BrowserContext)
+    context.page = AsyncMock()
+    context.page.locator = MagicMock(return_value=AsyncMock())  # locator() is called without await
+    context.page.wait_for_selector = AsyncMock()
+    context.page.wait_for_url = AsyncMock()
+    context.page.goto = AsyncMock()
+    return context
+
+@pytest.fixture
+def mock_llm():
+    """Create a mock LLM instance."""
+    return AsyncMock()
+
+@pytest.fixture
+def agent_config():
+    """Default agent configuration."""
+    return {
+        "task": "Test task",
+        "use_vision": False,
+        "max_actions_per_step": 10,
+        "max_input_tokens": 128000,
+    }
+
+@pytest.mark.asyncio
+async def test_skill_registration(mock_browser, mock_context, mock_llm, agent_config):
+    """Test that custom skills are properly registered with the agent."""
+    # Create a test agent
+    agent = BrowserUseAgent(
+        **agent_config,
+        llm=mock_llm,
+        browser=mock_browser,
+        browser_context=mock_context,
+    )
+
+    # Verify the agent has the register_skill method
+    assert hasattr(agent, 'register_skill'), "Agent must have register_skill method"
+
+    # Register our custom skills
+    SkillRegistry.register_skills(agent)
+
+    # Verify the skills were registered
+    from src.custom_skills import CUSTOM_SKILLS
+    for skill_name in CUSTOM_SKILLS:
+        assert hasattr(agent, f'skill_{skill_name}'), f"Skill {skill_name} not registered"
+
+@pytest.mark.asyncio
+async def test_click_css_skill_execution(mock_browser, mock_context, mock_llm, agent_config):
+    """Test execution of the clickCss skill."""
+    # Create a test agent
+    agent = BrowserUseAgent(
+        **agent_config,
+        llm=mock_llm,
+        browser=mock_browser,
+        browser_context=mock_context,
+    )
+
+    # Register our custom skills
+    SkillRegistry.register_skills(agent)
+
+    # Execute the clickCss skill
+    selector = "button#test"
+    x, y = 10, 10
+
+    # Mock the page locator (sync call) and the element it returns (async methods)
+    mock_element = AsyncMock()
+    mock_context.page.locator = MagicMock(return_value=mock_element)
+
+    # Call the skill
+    result = await agent.skill_clickCss(selector, x, y)
+
+    # Verify the interactions
+    mock_context.page.locator.assert_called_once_with(selector)
+    mock_element.wait_for.assert_called_once_with(state='visible', timeout=5000)
+    mock_context.page.mouse.move.assert_called_once_with(20, 20)
+    mock_element.hover.assert_awaited_once()
+    mock_element.click.assert_awaited_once_with(position={'x': x, 'y': y}, force=True)
+    assert result["status"] == "success"
+
+@pytest.mark.asyncio
+async def test_goto_skill_execution(mock_browser, mock_context, mock_llm, agent_config):
+    """Test execution of the goto skill."""
+    # Create a test agent
+    agent = BrowserUseAgent(
+        **agent_config,
+        llm=mock_llm,
+        browser=mock_browser,
+        browser_context=mock_context,
+    )
+
+    # Register our custom skills
+    SkillRegistry.register_skills(agent)
+
+    # Execute the goto skill
+    url = "https://example.com"
+    result = await agent.skill_goto(url)
+
+    # Verify the interactions
+    mock_context.page.goto.assert_awaited_once_with(url, timeout=30000)
+    assert result["status"] == "success"
+
+@pytest.mark.asyncio
+async def test_wait_for_skill_execution(mock_browser, mock_context, mock_llm, agent_config):
+    """Test execution of the waitFor skill."""
+    # Create a test agent
+    agent = BrowserUseAgent(
+        **agent_config,
+        llm=mock_llm,
+        browser=mock_browser,
+        browser_context=mock_context,
+    )
+
+    # Register our custom skills
+    SkillRegistry.register_skills(agent)
+
+    # Test waiting for an element to be attached
+    selector = ".loading"
+    state = "attached"
+    timeout = 5000
+
+    # Execute the waitFor skill
+    result = await agent.skill_waitFor(selector, state, timeout)
+
+    # Verify the interactions (the skill forwards the timeout unchanged, in milliseconds)
+    mock_context.page.wait_for_selector.assert_awaited_once_with(
+        selector, state=state, timeout=timeout
+    )
+    assert result["status"] == "success"
+
+@pytest.mark.asyncio
+async def test_wait_for_url_skill_execution(mock_browser, mock_context, mock_llm, agent_config):
+    """Test execution of the waitForUrl skill."""
+    # Create a test agent
+    agent = BrowserUseAgent(
+        **agent_config,
+        llm=mock_llm,
+        browser=mock_browser,
+        browser_context=mock_context,
+    )
+
+    # Register our custom skills
+    SkillRegistry.register_skills(agent)
+
+    # Test waiting for URL
+    url_fragment = "/test"
+    timeout = 3000
+
+    # Execute the waitForUrl skill
+    result = await agent.skill_waitForUrl(url_fragment, timeout)
+
+    # Verify the interactions (the skill forwards the timeout unchanged, in milliseconds)
+    mock_context.page.wait_for_url.assert_awaited_once()
+    awaited = mock_context.page.wait_for_url.await_args
+    # Check the pattern by content so the test does not pin the glob syntax
+    assert url_fragment in awaited.args[0]
+    assert awaited.kwargs == {"timeout": timeout, "wait_until": "networkidle"}
+    assert result["status"] == "success"
+
+# This allows running the tests directly with: python -m pytest tests/test_skill_integration.py -v
+if __name__ == "__main__":
+    import pytest
+    sys.exit(pytest.main(["-v", __file__]))