Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ langchain-ibm==0.3.10
langchain_mcp_adapters==0.0.9
langgraph==0.3.34
langchain-community
lxml[html_clean]
17 changes: 17 additions & 0 deletions src/controller/custom_controller.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,23 @@ async def upload_file(index: int, path: str, browser: BrowserContext, available_
logger.info(msg)
return ActionResult(error=msg)

# --- Dynamically register skills defined in `src.custom_skills` ---
# Every entry of CUSTOM_SKILLS is exposed as a controller action so that JSON
# scenarios can call actions like "goto", "clickCss", etc.
from src.custom_skills import CUSTOM_SKILLS
from typing import Any

for _skill_name, _skill_func in CUSTOM_SKILLS.items():
    # Never replace an action that is already registered under this name.
    if _skill_name in self.registry.registry.actions:
        continue

    # Adapter from the controller action signature to the skill signature
    # (ctx, args_list). `_func=_skill_func` binds the current skill at
    # definition time; a plain closure would only ever see the last skill
    # of the loop. Explicit annotations are kept because the original author
    # reported Pydantic JSON-schema errors without them.
    async def skill_wrapper(*args: Any, browser: BrowserContext, _func=_skill_func) -> Dict[str, Any]:
        return await _func(browser, list(args))

    # The registry stores an action under the decorated function's __name__;
    # the string given to `action(...)` is only its description. Without the
    # rename every skill would be stored as "skill_wrapper" and overwrite the
    # previous one, so none would be reachable as "goto", "clickCss", ...
    # Skill names must not start with an underscore (Pydantic rejected that
    # for the original `_wrapper` name).
    skill_wrapper.__name__ = _skill_name
    skill_wrapper.__qualname__ = _skill_name

    # NOTE(review): the wrapper takes *args and returns a plain dict, while
    # the other actions in this controller return ActionResult — confirm the
    # registry/`act` path accepts both before relying on these actions.
    self.registry.action(_skill_name)(skill_wrapper)

@time_execution_sync('--act')
async def act(
self,
Expand Down
117 changes: 117 additions & 0 deletions src/custom_skills/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,117 @@
from typing import Dict, Any, Callable, Awaitable, List
from .click_helper import ClickHelper

# Registry of custom skills: maps the name passed to `register_skill` to the
# function that was registered under it.
CUSTOM_SKILLS: Dict[str, Callable[..., Awaitable[Dict[str, Any]]]] = {}

def register_skill(name: str):
    """Build a decorator that records a function in CUSTOM_SKILLS.

    Args:
        name: Key under which the decorated function is stored. A later
            registration with the same name replaces the earlier one.

    Returns:
        A decorator that stores the function and hands it back unchanged.
    """
    def _store(skill):
        CUSTOM_SKILLS[name] = skill
        return skill

    return _store

def get_custom_skill(name: str) -> Callable[..., Awaitable[Dict[str, Any]]]:
    """Look up a skill in CUSTOM_SKILLS by its registered name.

    Args:
        name: Name the skill was registered under.

    Returns:
        The registered function, or None when no skill has that name.
    """
    if name in CUSTOM_SKILLS:
        return CUSTOM_SKILLS[name]
    return None

# Register our click helper as a skill
@register_skill("clickCss")
async def click_css_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Click an element located by a CSS selector.

    Thin adapter: both arguments are forwarded unchanged to
    ClickHelper.click_css and its result is returned as-is.

    Args:
        ctx: Browser context
        args: List containing [selector, x (optional), y (optional)]
    Returns:
        Dict with status and message, as produced by ClickHelper.click_css
    """
    outcome = await ClickHelper.click_css(ctx, args)
    return outcome

# Register a goto skill to match the example
@register_skill("goto")
async def goto_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Load a URL in the page of the given context.

    Args:
        ctx: Browser context; its `page.goto` is awaited with a 30000 timeout
        args: List containing [url]
    Returns:
        Dict with "status" ("success" or "error") and a human-readable "message"
    """
    # An empty (or missing) argument list means no URL was supplied.
    if not args:
        return {"status": "error", "message": "Missing required argument: url"}

    target = args[0]
    try:
        await ctx.page.goto(target, timeout=30000)
    except Exception as e:
        # Navigation problems are reported to the caller instead of raised.
        return {"status": "error", "message": f"Failed to navigate to {target}: {str(e)}"}
    return {"status": "success", "message": f"Navigated to {target}"}

# Register a waitFor skill
@register_skill("waitFor")
async def wait_for_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Wait for an element to reach a specific state.

    Args:
        ctx: Browser context
        args: List containing [selector, state, timeout_ms].
            state is one of 'attached', 'detached', 'visible', 'hidden'.
            timeout_ms is optional and defaults to 5000 (5 seconds).
    Returns:
        Dict with "status" ("success" or "error") and a "message"
    """
    if not args or len(args) < 2:
        return {"status": "error", "message": "Missing required arguments: selector and state"}

    selector, state = args[0], args[1]
    timeout_ms = args[2] if len(args) > 2 else 5000  # Default 5 seconds

    try:
        # Playwright takes the timeout in milliseconds, so it is passed through
        # unscaled. The state is forwarded as given: 'detached' is a native
        # Playwright state and, unlike 'hidden', is not satisfied by an element
        # that is merely invisible.
        await ctx.page.wait_for_selector(selector, state=state, timeout=timeout_ms)
    except Exception as e:
        return {"status": "error", "message": f"Wait for {selector} {state} failed: {str(e)}"}
    return {"status": "success", "message": f"Element {selector} reached state {state}"}

# Register a waitForUrl skill
@register_skill("waitForUrl")
async def wait_for_url_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Wait for the URL to contain a specific string.

    Args:
        ctx: Browser context
        args: List containing [url_fragment, timeout_ms].
            timeout_ms is optional and defaults to 7000 (7 seconds).
    Returns:
        Dict with "status" ("success" or "error") and a "message"
    """
    if not args:
        return {"status": "error", "message": "Missing required argument: url_fragment"}

    url_fragment = args[0]
    timeout_ms = args[1] if len(args) > 1 else 7000  # Default 7 seconds
    needle = str(url_fragment)

    try:
        await ctx.page.wait_for_url(
            # A substring predicate is used instead of a "*fragment*" glob:
            # in Playwright globs a single "*" does not match "/", so such a
            # pattern can never match a full URL like "https://host/path".
            lambda current_url: needle in current_url,
            # Playwright expects milliseconds; pass the value through unscaled.
            timeout=timeout_ms,
            wait_until="networkidle",
        )
    except Exception as e:
        return {"status": "error", "message": f"URL did not contain {url_fragment}: {str(e)}"}
    return {"status": "success", "message": f"URL contains {url_fragment}"}

# Public API of this package: the skill registry, its two helper functions,
# and the four skill coroutines defined in this module.
__all__ = [
    'CUSTOM_SKILLS',
    'register_skill',
    'get_custom_skill',
    'click_css_skill',
    'goto_skill',
    'wait_for_skill',
    'wait_for_url_skill'
]
37 changes: 37 additions & 0 deletions src/custom_skills/click_helper.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
from typing import Any, Dict, List, Optional
from dataclasses import dataclass

@dataclass
class ClickHelper:
    """Namespace for click routines that hover over the target before clicking."""

    @staticmethod
    async def click_css(ctx, args: List[Any]) -> Dict[str, Any]:
        """
        Click the element matched by a CSS selector.

        The element is awaited until visible (5000 timeout), the mouse is
        parked at (20, 20), the element is hovered, and then it is clicked
        with force=True at the requested offset.

        Args:
            ctx: The browser context; its `page` attribute is used
            args: List containing [selector, x (optional), y (optional)];
                x and y default to 5

        Returns:
            Dict with "status" ("success" or "error") and a "message"
        """
        if not args:
            return {"status": "error", "message": "Missing required argument: selector"}

        selector = args[0]
        supplied = len(args)
        # Click offset inside the element; each coordinate falls back to 5.
        click_at = {
            'x': args[1] if supplied > 1 else 5,
            'y': args[2] if supplied > 2 else 5,
        }

        try:
            page = ctx.page
            target = page.locator(selector)
            await target.wait_for(state='visible', timeout=5000)
            await page.mouse.move(20, 20)  # Park the pointer at a neutral spot first
            await target.hover()
            await target.click(position=click_at, force=True)
        except Exception as e:
            # Any failure is reported to the caller rather than raised.
            return {"status": "error", "message": f"Failed to click {selector}: {str(e)}"}
        return {"status": "success", "message": f"Clicked {selector}"}
111 changes: 111 additions & 0 deletions src/docs/custom_skills_integration.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Custom Skills Integration for Browser Use Agent

This document outlines the changes made to enable deterministic execution of JSON-based browser automation scenarios in the Browser Use Agent.

## Problem Statement

The Browser Use Agent initially did not properly register custom skills (such as `goto`, `clickCss`, `waitFor`, `waitForUrl`) for use in JSON scenarios. When attempting to run JSON scenarios with these skills, the agent would fall back to built-in actions or fail to execute the steps correctly.

## Solution

We implemented dynamic registration of custom skills as controller actions to enable deterministic execution of JSON-based browser automation scenarios.

### Key Changes

1. **Dynamic Registration of Custom Skills**

In `src/controller/custom_controller.py`, we added code to dynamically register all skills from `src.custom_skills` as controller actions:

```python
# This allows JSON scenarios to call actions like "goto", "clickCss", etc.
from src.custom_skills import CUSTOM_SKILLS
from typing import Any

for _skill_name, _skill_func in CUSTOM_SKILLS.items():
# Create a wrapper that adapts the controller action signature
# Add explicit type annotations to avoid Pydantic JSON schema errors
# Avoid leading underscore in function name to prevent Pydantic errors
async def skill_wrapper(*args: Any, browser: BrowserContext, _func=_skill_func) -> Dict[str, Any]:
# Original skill expects (ctx, args_list)
return await _func(browser, list(args))

# Avoid overwriting if an action with the same name already exists
if _skill_name not in self.registry.registry.actions:
self.registry.action(_skill_name)(skill_wrapper)
```

2. **Fixed Pydantic JSON Schema Error**

We encountered two issues with Pydantic when generating JSON schemas for the dynamically created wrapper functions:

- **Missing Type Annotations**: Added proper type annotations to the wrapper function
- **Leading Underscore in Function Name**: Renamed the function from `_wrapper` to `skill_wrapper` to avoid the Pydantic error: "Fields must not use names with leading underscores"

## Usage

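With these changes, JSON scenarios can now use custom skills directly. Here's an example scenario that navigates through Wikipedia and compares language counts. Note that the `evaluate` steps assume an `evaluate` skill is registered separately; it is not one of the skills defined in `src/custom_skills`: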

```json
{
"steps": [
{
"skill": "goto",
"args": ["https://www.wikipedia.org/"]
},
{
"skill": "waitForUrl",
"args": ["wikipedia.org", 10000]
},
{
"skill": "waitFor",
"args": [".central-featured", "visible", 5000]
},
{
"skill": "evaluate",
"args": [
"(() => { const languages = document.querySelectorAll('.central-featured-lang'); const count = languages.length; localStorage.setItem('languageCount', count); return { mainPageLanguageCount: count }; })()"
]
},
{
"skill": "waitFor",
"args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']", "visible", 5000]
},
{
"skill": "clickCss",
"args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']"]
},
{
"skill": "waitForUrl",
"args": ["meta.wikimedia.org/wiki/List_of_Wikipedias", 10000]
},
{
"skill": "waitFor",
"args": [".wikitable", "visible", 5000]
},
{
"skill": "evaluate",
"args": [
"(() => { const rows = document.querySelectorAll('.wikitable tr'); const activeLanguages = Array.from(rows).filter(row => row.querySelector('td')).length; const mainPageCount = localStorage.getItem('languageCount'); return { mainPageLanguageCount: mainPageCount, listPageLanguageCount: activeLanguages, note: 'The counts may differ as the main page shows top languages while the list page shows all languages' }; })()"
]
}
]
}
```

## Available Custom Skills

The following custom skills are now available for use in JSON scenarios:

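- **goto**: Navigate to a URL. Args: `[url]`
- **clickCss**: Click an element by CSS selector. Args: `[selector, x (optional), y (optional)]`
- **waitFor**: Wait for an element to reach a specific state (`attached`, `detached`, `visible`, or `hidden`). Args: `[selector, state, timeout_ms (optional)]`
- **waitForUrl**: Wait for the URL to contain a specific string. Args: `[url_fragment, timeout_ms (optional)]`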

## Troubleshooting

If you encounter issues with JSON scenarios:

1. Check the browser console for errors
2. Verify that selectors are correct for the target website
3. Adjust timeout values if elements take longer to load
4. Use the `evaluate` skill to debug page state with JavaScript
25 changes: 25 additions & 0 deletions src/test_scenarios/reliable_click_demo.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
{
"name": "Reliable Click Demo",
"description": "Demonstrates using the reliable_click custom skill",
"steps": [
{
"action": "navigate_to",
"url": "https://torath.gov.eg"
},
{
"action": "reliable_click",
"selector": "a[href='/about']",
"x": 10,
"y": 10
},
{
"action": "wait_for_navigation",
"timeout": 5000
},
{
"action": "assert_url_contains",
"expected": "/about"
}
]
}

Loading