Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ langchain-ibm==0.3.10
langchain_mcp_adapters==0.0.9
langgraph==0.3.34
langchain-community
lxml[html_clean]==4.9.3
17 changes: 17 additions & 0 deletions src/controller/custom_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,23 @@ async def upload_file(index: int, path: str, browser: BrowserContext, available_
logger.info(msg)
return ActionResult(error=msg)

# --- Dynamically register skills defined in `src.custom_skills` ---
# This allows JSON scenarios to call actions like "goto", "clickCss", etc.
from src.custom_skills import CUSTOM_SKILLS
from typing import Any


def _make_skill_action(skill_name: str, skill_func):
    """Build a controller action that forwards its arguments to one custom skill.

    The skill is captured by this factory's closure rather than by a default
    argument, so the wrapper's signature (which the registry introspects to
    build its parameter schema) contains only ``*args`` and ``browser``.
    """

    async def skill_wrapper(*args: Any, browser: BrowserContext) -> Dict[str, Any]:
        # Original skill expects (ctx, args_list)
        return await skill_func(browser, list(args))

    # Give every wrapper the skill's own name; otherwise all wrappers are
    # called "skill_wrapper" and collide wherever the function name is used.
    # NOTE(review): browser-use appears to key registered actions by the
    # function's __name__ -- confirm against the pinned version.
    skill_wrapper.__name__ = skill_name
    skill_wrapper.__qualname__ = skill_name
    return skill_wrapper


for _skill_name, _skill_func in CUSTOM_SKILLS.items():
    # Avoid overwriting if an action with the same name already exists
    if _skill_name in self.registry.registry.actions:
        continue
    self.registry.action(_skill_name)(_make_skill_action(_skill_name, _skill_func))

@time_execution_sync('--act')
async def act(
self,
Expand Down
117 changes: 117 additions & 0 deletions src/custom_skills/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from typing import Any, Awaitable, Callable, Dict, List, Optional

from .click_helper import ClickHelper

# Registry of skills, keyed by the name used in JSON scenarios.
# Each value is an async callable invoked as ``skill(ctx, args)``.
CUSTOM_SKILLS: Dict[str, Callable[..., Awaitable[Dict[str, Any]]]] = {}


def register_skill(
    name: str,
) -> Callable[
    [Callable[..., Awaitable[Dict[str, Any]]]],
    Callable[..., Awaitable[Dict[str, Any]]],
]:
    """Return a decorator that registers a skill function under ``name``.

    Args:
        name: Key under which the function is stored in ``CUSTOM_SKILLS``.

    Returns:
        A decorator that stores the function and returns it unchanged.
        Registering a second function under an existing name replaces the
        earlier entry.
    """
    def decorator(
        func: Callable[..., Awaitable[Dict[str, Any]]],
    ) -> Callable[..., Awaitable[Dict[str, Any]]]:
        CUSTOM_SKILLS[name] = func
        return func

    return decorator


def get_custom_skill(name: str) -> Optional[Callable[..., Awaitable[Dict[str, Any]]]]:
    """Look up a registered custom skill by name.

    Args:
        name: Skill name as passed to ``register_skill``.

    Returns:
        The registered function, or ``None`` when no skill has that name.
    """
    return CUSTOM_SKILLS.get(name)

# Expose ClickHelper's click routine under the "clickCss" skill name
@register_skill("clickCss")
async def click_css_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Click an element located by CSS selector; delegates to ``ClickHelper.click_css``.

    Args:
        ctx: Browser context, passed through unchanged
        args: List containing [selector, x (optional), y (optional)]

    Returns:
        The status/message dict produced by ``ClickHelper.click_css``
    """
    outcome = await ClickHelper.click_css(ctx, args)
    return outcome

# Register a goto skill to match the example
@register_skill("goto")
async def goto_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Navigate to a URL.

    Args:
        ctx: Browser context; ``ctx.page.goto`` is awaited.
            NOTE(review): assumes ctx exposes a ``page`` attribute -- confirm
            against the context type the controller passes in.
        args: List containing [url, timeout_ms (optional)]. The timeout
            defaults to 30000 ms, matching the previous fixed value.

    Returns:
        Dict with "status" ("success" or "error") and a "message".
    """
    if not args:
        return {"status": "error", "message": "Missing required argument: url"}

    url = args[0]
    timeout = args[1] if len(args) > 1 else 30000  # Default 30 seconds

    try:
        await ctx.page.goto(url, timeout=timeout)
    except Exception as e:
        # Skills report failures as data instead of raising, so a scenario
        # runner can decide how to proceed.
        return {"status": "error", "message": f"Failed to navigate to {url}: {str(e)}"}
    return {"status": "success", "message": f"Navigated to {url}"}

# Register a waitFor skill
@register_skill("waitFor")
async def wait_for_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Wait for an element to reach a specific state.

    Args:
        ctx: Browser context; ``ctx.page.wait_for_selector`` is awaited.
            NOTE(review): assumes ctx exposes a ``page`` attribute -- confirm
            against the context type the controller passes in.
        args: List containing [selector, state, timeout_ms (optional)].
            ``state`` is one of 'attached', 'detached', 'visible', 'hidden';
            the timeout defaults to 5000 ms. The shorthand
            [selector, timeout_ms] is also accepted and waits for 'visible'.

    Returns:
        Dict with "status" ("success" or "error") and a "message".
    """
    if len(args) < 2:
        return {"status": "error", "message": "Missing required arguments: selector and state"}

    selector = args[0]
    second = args[1]
    if isinstance(second, (int, float)) and not isinstance(second, bool):
        # Shorthand [selector, timeout_ms]: a numeric second argument can
        # never be a state name, so treat it as the timeout.
        state = 'visible'
        timeout = second
    else:
        state = second
        timeout = args[2] if len(args) > 2 else 5000  # Default 5 seconds

    valid_states = ('attached', 'detached', 'visible', 'hidden')
    if state not in valid_states:
        return {
            "status": "error",
            "message": (
                f"Wait for {selector} {state} failed: "
                f"state must be one of {', '.join(valid_states)}"
            ),
        }

    try:
        # One call covers every state, including 'detached'.
        await ctx.page.wait_for_selector(selector, state=state, timeout=timeout)
    except Exception as e:
        return {"status": "error", "message": f"Wait for {selector} {state} failed: {str(e)}"}
    return {"status": "success", "message": f"Element {selector} reached state {state}"}

# Register a waitForUrl skill
@register_skill("waitForUrl")
async def wait_for_url_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Wait for the URL to contain a specific string.

    Args:
        ctx: Browser context; ``ctx.page.wait_for_url`` is awaited.
            NOTE(review): assumes ctx exposes a ``page`` attribute -- confirm
            against the context type the controller passes in.
        args: List containing [url_fragment, timeout_ms (optional)]. The
            timeout defaults to 7000 ms.

    Returns:
        Dict with "status" ("success" or "error") and a "message".
    """
    if not args:
        return {"status": "error", "message": "Missing required argument: url_fragment"}

    url_fragment = args[0]
    timeout = args[1] if len(args) > 1 else 7000  # Default 7 seconds
    needle = str(url_fragment)

    try:
        # Match with a predicate instead of the glob f"*{fragment}*": in
        # Playwright globs a single "*" does not match "/", so that pattern
        # cannot match a full URL such as "https://host/path". A substring
        # test also keeps glob metacharacters in the fragment literal.
        await ctx.page.wait_for_url(
            lambda url: needle in url,
            timeout=timeout,
            wait_until="networkidle",
        )
    except Exception as e:
        return {"status": "error", "message": f"URL did not contain {url_fragment}: {str(e)}"}
    return {"status": "success", "message": f"URL contains {url_fragment}"}

# Public API of this package: the skill registry, its two helper functions,
# and the four skill coroutines defined above.
__all__ = [
'CUSTOM_SKILLS',
'register_skill',
'get_custom_skill',
'click_css_skill',
'goto_skill',
'wait_for_skill',
'wait_for_url_skill'
]
37 changes: 37 additions & 0 deletions src/custom_skills/click_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Any, Dict, List, Optional
from dataclasses import dataclass

@dataclass
class ClickHelper:
    """Namespace for a hover-then-click routine driven by a CSS selector."""

    @staticmethod
    async def click_css(ctx, args: List[Any]) -> Dict[str, Any]:
        """
        Hover over and click the element matching a CSS selector.

        Args:
            ctx: Object whose ``page`` attribute provides ``locator`` and
                ``mouse`` (NOTE(review): presumably a Playwright page --
                confirm against the caller)
            args: ``[selector, x, y]``; ``x`` and ``y`` are the click offset
                inside the element and each defaults to 5 when omitted

        Returns:
            Dict with "status" ("success" or "error") and a "message"
        """
        if not args:
            return {"status": "error", "message": "Missing required argument: selector"}

        # Pad with the default offsets, then keep the first three entries.
        selector, x, y = (list(args) + [5, 5])[:3]

        try:
            page = ctx.page
            target = page.locator(selector)
            await target.wait_for(state='visible', timeout=5000)
            # Park the pointer at a neutral spot before hovering the target.
            await page.mouse.move(20, 20)
            await target.hover()
            await target.click(position={'x': x, 'y': y}, force=True)
        except Exception as e:
            return {"status": "error", "message": f"Failed to click {selector}: {str(e)}"}
        else:
            return {"status": "success", "message": f"Clicked {selector}"}
112 changes: 112 additions & 0 deletions src/docs/custom_skills_integration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
# Custom Skills Integration for Browser Use Agent

This document outlines the changes made to enable deterministic execution of JSON-based browser automation scenarios in the Browser Use Agent.

## Problem Statement

The Browser Use Agent initially did not properly register custom skills (such as `goto`, `clickCss`, `waitFor`, `waitForUrl`) for use in JSON scenarios. When attempting to run JSON scenarios with these skills, the agent would fall back to built-in actions or fail to execute the steps correctly.

## Solution

We implemented dynamic registration of custom skills as controller actions to enable deterministic execution of JSON-based browser automation scenarios.

### Key Changes

1. **Dynamic Registration of Custom Skills**

In `src/controller/custom_controller.py`, we added code to dynamically register all skills from `src.custom_skills` as controller actions:

```python
# This allows JSON scenarios to call actions like "goto", "clickCss", etc.
from src.custom_skills import CUSTOM_SKILLS
from typing import Any, Dict
from playwright.async_api import BrowserContext

for _skill_name, _skill_func in CUSTOM_SKILLS.items():
# Create a wrapper that adapts the controller action signature
# Add explicit type annotations to avoid Pydantic JSON schema errors
# Avoid leading underscore in function name to prevent Pydantic errors
async def skill_wrapper(*args: Any, browser: BrowserContext, _func=_skill_func) -> Dict[str, Any]:
# Original skill expects (ctx, args_list)
return await _func(browser, list(args))

# Avoid overwriting if an action with the same name already exists
if _skill_name not in self.registry.registry.actions:
self.registry.action(_skill_name)(skill_wrapper)
```

2. **Fixed Pydantic JSON Schema Error**

We encountered two issues with Pydantic when generating JSON schemas for the dynamically created wrapper functions:

- **Missing Type Annotations**: Added proper type annotations to the wrapper function
- **Leading Underscore in Function Name**: Renamed the function from `_wrapper` to `skill_wrapper` to avoid the Pydantic error: "Fields must not use names with leading underscores"

## Usage

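With these changes, JSON scenarios can now use custom skills directly. Here's an example scenario that navigates through Wikipedia and compares language counts. Note that the `evaluate` skill used below is not one of the skills registered in `src/custom_skills`; the scenario only runs if an action with that name is provided elsewhere: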

```json
{
"steps": [
{
"skill": "goto",
"args": ["https://www.wikipedia.org/"]
},
{
"skill": "waitForUrl",
"args": ["wikipedia.org", 10000]
},
{
"skill": "waitFor",
"args": [".central-featured", "visible", 5000]
},
{
"skill": "evaluate",
"args": [
"(() => { const languages = document.querySelectorAll('.central-featured-lang'); const count = languages.length; localStorage.setItem('languageCount', count); return { mainPageLanguageCount: count }; })()"
]
},
{
"skill": "waitFor",
"args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']", "visible", 5000]
},
{
"skill": "clickCss",
"args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']"]
},
{
"skill": "waitForUrl",
"args": ["meta.wikimedia.org/wiki/List_of_Wikipedias", 10000]
},
{
"skill": "waitFor",
"args": [".wikitable", "visible", 5000]
},
{
"skill": "evaluate",
"args": [
"(() => { const rows = document.querySelectorAll('.wikitable tr'); const activeLanguages = Array.from(rows).filter(row => row.querySelector('td')).length; const mainPageCount = localStorage.getItem('languageCount'); return { mainPageLanguageCount: mainPageCount, listPageLanguageCount: activeLanguages, note: 'The counts may differ as the main page shows top languages while the list page shows all languages' }; })()"
]
}
]
}
```

## Available Custom Skills

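The following custom skills are now available for use in JSON scenarios (arguments are passed positionally in `args`):

- **goto**: Navigate to a URL. Args: `[url]`
- **clickCss**: Click an element by CSS selector. Args: `[selector, x (optional), y (optional)]`
- **waitFor**: Wait for an element to reach a specific state (`attached`, `detached`, `visible` or `hidden`). Args: `[selector, state, timeout_ms (optional, default 5000)]`
- **waitForUrl**: Wait for the URL to contain a specific string. Args: `[url_fragment, timeout_ms (optional, default 7000)]`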

## Troubleshooting

If you encounter issues with JSON scenarios:

1. Check the browser console for errors
2. Verify that selectors are correct for the target website
3. Adjust timeout values if elements take longer to load
4. Use the `evaluate` skill, if one is available in your setup (it is not registered by `src/custom_skills`), to debug page state with JavaScript
25 changes: 25 additions & 0 deletions src/test_scenarios/reliable_click_demo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "Reliable Click Demo",
"description": "Demonstrates using the reliable_click custom skill",
"steps": [
{
"action": "navigate_to",
"url": "https://torath.gov.eg"
},
{
"action": "reliable_click",
"selector": "a[href='/about']",
"x": 10,
"y": 10
},
{
"action": "wait_for_navigation",
"timeout": 5000
},
{
"action": "assert_url_contains",
"expected": "/about"
}
]
}

Loading