-
Notifications
You must be signed in to change notification settings - Fork 2.6k
feat: Enable custom skills in JSON scenarios for Browser Use Agent #645
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Open
glglak
wants to merge
2
commits into
browser-use:main
Choose a base branch
from
glglak:fix-custom-skills-registration
base: main
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Open
Changes from all commits
Commits
File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,3 +8,4 @@ langchain-ibm==0.3.10 | |
langchain_mcp_adapters==0.0.9 | ||
langgraph==0.3.34 | ||
langchain-community | ||
lxml[html_clean]==4.9.3 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,117 @@ | ||
from typing import Any, Awaitable, Callable, Dict, List, Optional

from .click_helper import ClickHelper
|
||
# Module-level registry: skill name -> coroutine function implementing it.
# Populated as a side effect of the @register_skill decorator.
CUSTOM_SKILLS: Dict[str, Callable[..., Awaitable[Dict[str, Any]]]] = dict()
|
||
def register_skill(name: str):
    """Build a decorator that files a function under ``name`` in CUSTOM_SKILLS.

    The decorated function is stored in the module-level ``CUSTOM_SKILLS``
    dictionary and handed back unchanged, so it remains usable under its own
    name. Registering another function under the same name replaces the
    earlier entry.
    """
    def _store(skill):
        CUSTOM_SKILLS.update({name: skill})
        return skill

    return _store
|
||
def get_custom_skill(name: str) -> Optional[Callable[..., Awaitable[Dict[str, Any]]]]:
    """Look up a registered custom skill by name.

    Args:
        name: Key the skill was registered under in ``CUSTOM_SKILLS``.

    Returns:
        The registered skill function, or ``None`` when nothing is registered
        under ``name`` (``dict.get`` does not raise for unknown keys).
    """
    return CUSTOM_SKILLS.get(name)
|
||
# Register our click helper as a skill | ||
@register_skill("clickCss") | ||
async def click_css_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Click an element located by CSS selector.

    This skill is a thin adapter: it forwards both arguments untouched to
    ``ClickHelper.click_css`` and returns whatever that call produces.

    Args:
        ctx: Browser context
        args: List containing [selector, x (optional), y (optional)]

    Returns:
        Dict with status and message
    """
    outcome = await ClickHelper.click_css(ctx, args)
    return outcome
|
||
# Register a goto skill to match the example | ||
@register_skill("goto") | ||
async def goto_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Navigate to a URL.

    Args:
        ctx: Browser context; the page is read from ``ctx.page``.
        args: List containing [url, timeout_ms (optional)]. The timeout
            defaults to 30000 ms when omitted.

    Returns:
        Dict with "status" ("success" or "error") and a "message".
    """
    if not args:
        return {"status": "error", "message": "Missing required argument: url"}

    url = args[0]
    # Optional trailing timeout, in line with the waitFor / waitForUrl skills.
    timeout = args[1] if len(args) > 1 else 30000

    try:
        await ctx.page.goto(url, timeout=timeout)
    except Exception as e:
        # Skills report failures through the result dict instead of raising.
        return {"status": "error", "message": f"Failed to navigate to {url}: {str(e)}"}

    return {"status": "success", "message": f"Navigated to {url}"}
|
||
# Register a waitFor skill | ||
@register_skill("waitFor") | ||
async def wait_for_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Wait for an element to reach a specific state.

    Args:
        ctx: Browser context; the page is read from ``ctx.page``.
        args: List containing [selector, state, timeout_ms (optional)], where
            ``state`` is one of 'attached', 'detached', 'visible', 'hidden'.
            The short form [selector, timeout_ms] is also accepted and waits
            for the 'visible' state. The timeout defaults to 5000 ms.

    Returns:
        Dict with "status" ("success" or "error") and a "message".
    """
    if not args or len(args) < 2:
        return {"status": "error", "message": "Missing required arguments: selector and state"}

    selector = args[0]
    second = args[1]

    if isinstance(second, (int, float)) and not isinstance(second, bool):
        # Short form [selector, timeout_ms]: a number can never be a state
        # name, so treat it as the timeout and wait for visibility.
        state = 'visible'
        timeout = second
    else:
        state = second
        timeout = args[2] if len(args) > 2 else 5000  # Default 5 seconds

    try:
        # One call covers every state, including 'detached'.
        await ctx.page.wait_for_selector(selector, state=state, timeout=timeout)
    except Exception as e:
        # Skills report failures through the result dict instead of raising.
        return {"status": "error", "message": f"Wait for {selector} {state} failed: {str(e)}"}

    return {"status": "success", "message": f"Element {selector} reached state {state}"}
|
||
# Register a waitForUrl skill | ||
@register_skill("waitForUrl") | ||
async def wait_for_url_skill(ctx, args: List[Any]) -> Dict[str, Any]:
    """
    Wait for the URL to contain a specific string.

    Args:
        ctx: Browser context; the page is read from ``ctx.page``.
        args: List containing [url_fragment, timeout_ms (optional)]. The
            timeout defaults to 7000 ms.

    Returns:
        Dict with "status" ("success" or "error") and a "message".
    """
    if not args:
        return {"status": "error", "message": "Missing required argument: url_fragment"}

    url_fragment = args[0]
    timeout = args[1] if len(args) > 1 else 7000  # Default 7 seconds
    needle = str(url_fragment)

    def _contains_fragment(current_url: str) -> bool:
        # Plain substring test. A glob such as "*fragment*" is unsuitable
        # here: in Playwright's glob syntax a single "*" does not match "/",
        # so it can never span the "https://" prefix of a full URL.
        return needle in current_url

    try:
        await ctx.page.wait_for_url(
            _contains_fragment,
            timeout=timeout,
            wait_until="networkidle",
        )
    except Exception as e:
        # Skills report failures through the result dict instead of raising.
        return {"status": "error", "message": f"URL did not contain {url_fragment}: {str(e)}"}

    return {"status": "success", "message": f"URL contains {url_fragment}"}
|
||
# Export all skills | ||
# Public names of this module: the registry, its two accessors, and the
# four built-in skill coroutines.
__all__ = [
    "CUSTOM_SKILLS",
    "register_skill",
    "get_custom_skill",
    "click_css_skill",
    "goto_skill",
    "wait_for_skill",
    "wait_for_url_skill",
]
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,37 @@ | ||
from typing import Any, Dict, List, Optional | ||
from dataclasses import dataclass | ||
|
||
@dataclass
class ClickHelper:
    """Groups the hover-then-click routine used for reliable clicking."""

    @staticmethod
    async def click_css(ctx, args: List[Any]) -> Dict[str, Any]:
        """
        Hover over an element found by CSS selector, then click it.

        The sequence is: wait up to 5000 ms for the element to be visible,
        park the mouse at (20, 20), hover the element, and click at the
        requested offset with ``force=True``. Any exception raised along the
        way is converted into an error result.

        Args:
            ctx: The browser context; the page is read from ``ctx.page``
            args: List containing [selector, x (optional), y (optional)];
                x and y each default to 5

        Returns:
            Dict with status and message
        """
        if not args:
            return {"status": "error", "message": "Missing required argument: selector"}

        supplied = len(args)
        selector = args[0]
        click_point = {
            'x': args[1] if supplied > 1 else 5,
            'y': args[2] if supplied > 2 else 5,
        }

        try:
            current_page = ctx.page
            target = current_page.locator(selector)
            await target.wait_for(state='visible', timeout=5000)
            # Park the pointer at a neutral spot before hovering the target.
            await current_page.mouse.move(20, 20)
            await target.hover()
            await target.click(position=click_point, force=True)
        except Exception as e:
            return {"status": "error", "message": f"Failed to click {selector}: {str(e)}"}

        return {"status": "success", "message": f"Clicked {selector}"}
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,112 @@ | ||
# Custom Skills Integration for Browser Use Agent | ||
|
||
This document outlines the changes made to enable deterministic execution of JSON-based browser automation scenarios in the Browser Use Agent. | ||
|
||
## Problem Statement | ||
|
||
The Browser Use Agent initially did not properly register custom skills (such as `goto`, `clickCss`, `waitFor`, `waitForUrl`) for use in JSON scenarios. When attempting to run JSON scenarios with these skills, the agent would fall back to built-in actions or fail to execute the steps correctly. | ||
|
||
## Solution | ||
|
||
We implemented dynamic registration of custom skills as controller actions to enable deterministic execution of JSON-based browser automation scenarios. | ||
|
||
### Key Changes | ||
|
||
1. **Dynamic Registration of Custom Skills** | ||
|
||
In `src/controller/custom_controller.py`, we added code to dynamically register all skills from `src.custom_skills` as controller actions: | ||
|
||
```python | ||
# This allows JSON scenarios to call actions like "goto", "clickCss", etc. | ||
from src.custom_skills import CUSTOM_SKILLS | ||
from typing import Any, Dict | ||
from playwright.async_api import BrowserContext | ||
|
||
for _skill_name, _skill_func in CUSTOM_SKILLS.items(): | ||
# Create a wrapper that adapts the controller action signature | ||
# Add explicit type annotations to avoid Pydantic JSON schema errors | ||
# Avoid leading underscore in function name to prevent Pydantic errors | ||
async def skill_wrapper(*args: Any, browser: BrowserContext, _func=_skill_func) -> Dict[str, Any]: | ||
# Original skill expects (ctx, args_list) | ||
return await _func(browser, list(args)) | ||
|
||
# Avoid overwriting if an action with the same name already exists | ||
if _skill_name not in self.registry.registry.actions: | ||
self.registry.action(_skill_name)(skill_wrapper) | ||
``` | ||
|
||
2. **Fixed Pydantic JSON Schema Error** | ||
|
||
We encountered two issues with Pydantic when generating JSON schemas for the dynamically created wrapper functions: | ||
|
||
- **Missing Type Annotations**: Added proper type annotations to the wrapper function | ||
- **Leading Underscore in Function Name**: Renamed the function from `_wrapper` to `skill_wrapper` to avoid the Pydantic error: "Fields must not use names with leading underscores" | ||
|
||
## Usage | ||
|
||
With these changes, JSON scenarios can now use custom skills directly. Here's an example scenario that navigates through Wikipedia and compares language counts: | ||
|
||
```json | ||
{ | ||
"steps": [ | ||
{ | ||
"skill": "goto", | ||
"args": ["https://www.wikipedia.org/"] | ||
}, | ||
{ | ||
"skill": "waitForUrl", | ||
"args": ["wikipedia.org", 10000] | ||
}, | ||
{
"skill": "waitFor",
"args": [".central-featured", "visible", 5000]
},
{ | ||
"skill": "evaluate", | ||
"args": [ | ||
"(() => { const languages = document.querySelectorAll('.central-featured-lang'); const count = languages.length; localStorage.setItem('languageCount', count); return { mainPageLanguageCount: count }; })()" | ||
] | ||
}, | ||
{
"skill": "waitFor",
"args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']", "visible", 5000]
},
{ | ||
"skill": "clickCss", | ||
"args": ["a[href='//meta.wikimedia.org/wiki/List_of_Wikipedias']"] | ||
}, | ||
{ | ||
"skill": "waitForUrl", | ||
"args": ["meta.wikimedia.org/wiki/List_of_Wikipedias", 10000] | ||
}, | ||
{
"skill": "waitFor",
"args": [".wikitable", "visible", 5000]
},
{ | ||
"skill": "evaluate", | ||
"args": [ | ||
"(() => { const rows = document.querySelectorAll('.wikitable tr'); const activeLanguages = Array.from(rows).filter(row => row.querySelector('td')).length; const mainPageCount = localStorage.getItem('languageCount'); return { mainPageLanguageCount: mainPageCount, listPageLanguageCount: activeLanguages, note: 'The counts may differ as the main page shows top languages while the list page shows all languages' }; })()" | ||
] | ||
} | ||
] | ||
} | ||
``` | ||
|
||
## Available Custom Skills | ||
|
||
The following custom skills are now available for use in JSON scenarios: | ||
|
||
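- **goto**: Navigate to a URL. Args: `[url]`
- **clickCss**: Click an element by CSS selector. Args: `[selector, x (optional), y (optional)]`
- **waitFor**: Wait for an element to reach a specific state (`attached`, `detached`, `visible` or `hidden`). Args: `[selector, state, timeout_ms (optional)]`
- **waitForUrl**: Wait for the URL to contain a specific string. Args: `[url_fragment, timeout_ms (optional)]`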
|
||
## Troubleshooting | ||
|
||
If you encounter issues with JSON scenarios: | ||
|
||
1. Check the browser console for errors | ||
2. Verify that selectors are correct for the target website | ||
3. Adjust timeout values if elements take longer to load | ||
4. Use the `evaluate` skill to debug page state with JavaScript |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
{ | ||
"name": "Reliable Click Demo", | ||
"description": "Demonstrates using the reliable_click custom skill", | ||
"steps": [ | ||
{ | ||
"action": "navigate_to", | ||
"url": "https://torath.gov.eg" | ||
}, | ||
{ | ||
"action": "reliable_click", | ||
"selector": "a[href='/about']", | ||
"x": 10, | ||
"y": 10 | ||
}, | ||
{ | ||
"action": "wait_for_navigation", | ||
"timeout": 5000 | ||
}, | ||
{ | ||
"action": "assert_url_contains", | ||
"expected": "/about" | ||
} | ||
] | ||
} | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
Uh oh!
There was an error while loading. Please reload this page.