Skip to content

Commit 1df8ea0

Browse files
committed
Add BrowserUse step
1 parent e33580f commit 1df8ea0

File tree

7 files changed

+997
-2
lines changed

7 files changed

+997
-2
lines changed
Lines changed: 90 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,90 @@
1+
import asyncio
2+
3+
from browser_use import Agent, Browser, BrowserConfig
4+
from langchain_google_genai import ChatGoogleGenerativeAI
5+
from langchain_openai import ChatOpenAI
6+
from langchain_anthropic import ChatAnthropic
7+
from datetime import datetime
8+
9+
from patchwork.step import Step
10+
from patchwork.steps import SimplifiedLLMOnce
11+
from patchwork.steps.BrowserUse.typed import BrowserUseInputs, BrowserUseOutputs
12+
13+
# Module-level browser configuration; BrowserUse.__init__ passes this same
# object to every Browser it creates.
# NOTE(review): disable_security=True presumably relaxes browser security
# features for the automated session -- confirm this is intended outside of
# local testing.
config = BrowserConfig(headless=True, disable_security=True)
14+
15+
16+
class BrowserUse(Step, input_class=BrowserUseInputs, output_class=BrowserUseOutputs):
    """Run a natural-language task in a browser through an LLM-driven agent.

    Inputs read by this step:
        task: required; handed to the agent as its task.
        google_api_key / openai_api_key / anthropic_api_key: at least one is
            required. When several are present the first match in that order
            is used.
        json_example_schema: optional; when truthy, the agent's final result
            is reformatted through ``SimplifiedLLMOnce`` using this schema.
        generate_gif / debug: optional; when either is truthy a timestamped
            GIF file name is passed to the agent as ``generate_gif``.
    """

    required_keys = {"task"}

    def __init__(self, inputs):
        """Validate ``inputs`` and build the LLM client and browser.

        Raises:
            ValueError: if ``task`` is missing, or if none of the supported
                provider API keys is present.
        """
        super().__init__(inputs)

        if not self.required_keys.issubset(inputs.keys()):
            raise ValueError(f'Missing required data: "{self.required_keys}"')

        # Pick the LLM before creating the browser so that a configuration
        # error does not leave a Browser object behind.
        if "google_api_key" in self.inputs:
            self.llm = ChatGoogleGenerativeAI(
                model="gemini-2.0-flash", google_api_key=self.inputs["google_api_key"]
            )
        elif "openai_api_key" in self.inputs:
            self.llm = ChatOpenAI(model="gpt-4o", api_key=self.inputs["openai_api_key"])
        elif "anthropic_api_key" in self.inputs:
            self.llm = ChatAnthropic(
                model="claude-3-7-sonnet-latest",
                api_key=self.inputs["anthropic_api_key"],
            )
        else:
            # Without this, self.llm would be unset and run() would fail
            # later with an unhelpful AttributeError.
            raise ValueError(
                'Missing required data: one of "google_api_key", '
                '"openai_api_key" or "anthropic_api_key"'
            )

        self.browser = Browser(config=config)

        # Either flag enables the recording; the file name is fixed at
        # construction time. ``False`` disables it.
        wants_gif = bool(self.inputs.get("generate_gif") or self.inputs.get("debug"))
        self.generate_gif = (
            f"agent_history_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.gif"
            if wants_gif
            else False
        )

    def run(self) -> dict:
        """Execute the task and return the outcome.

        Returns:
            The output of ``SimplifiedLLMOnce`` when ``json_example_schema``
            is provided and truthy; otherwise a dict with ``history`` (the
            agent's run history), ``result`` (``history.final_result()``) and
            ``generated_gif`` (the GIF file name, or ``False``).
        """
        agent = Agent(
            browser=self.browser,
            task=self.inputs["task"],
            llm=self.llm,
            generate_gif=self.generate_gif,
            validate_output=True,
        )

        # A private loop is used (rather than asyncio.run) so the thread's
        # current-event-loop setting is left untouched for other steps.
        loop = asyncio.new_event_loop()
        try:
            self.history = loop.run_until_complete(self._run_agent(agent))
        finally:
            loop.close()

        # The schema is optional, so it must not be indexed directly.
        if self.inputs.get("json_example_schema"):
            return self.__format_history_as_json()

        return {
            "history": self.history,
            "result": self.history.final_result(),
            "generated_gif": self.generate_gif,
        }

    async def _run_agent(self, agent):
        """Run ``agent`` and close the browser on the same event loop.

        The browser is created by this step and handed to the agent, so it is
        released here once the run finishes, whether it succeeded or raised.
        NOTE(review): assumes ``Browser.close()`` is a coroutine and that the
        browser re-initialises lazily if ``run()`` is called again -- confirm
        against the pinned browser_use version.
        """
        try:
            return await agent.run()
        finally:
            await self.browser.close()

    def __format_history_as_json(self):
        """Reformat the agent's final result as JSON via ``SimplifiedLLMOnce``.

        Uses the same provider key precedence as ``__init__`` but selects the
        smaller model listed below for each provider.
        """
        inputs = dict(
            user_prompt=f"""
You are a helpful assistant that formats a history of browser actions and conversations into a JSON object.
You are provided with a JSON schema for the history.
Only include the JSON object in your response, nothing else.

Here is the history:
<history>
{self.history.final_result()}
</history>
""",
            json_schema=self.inputs["json_example_schema"],
            prompt_value=dict(),
        )

        if "google_api_key" in self.inputs:
            inputs["google_api_key"] = self.inputs["google_api_key"]
            inputs["model"] = "gemini-2.0-flash"
        elif "openai_api_key" in self.inputs:
            inputs["openai_api_key"] = self.inputs["openai_api_key"]
            inputs["model"] = "gpt-4o-mini"
        elif "anthropic_api_key" in self.inputs:
            inputs["anthropic_api_key"] = self.inputs["anthropic_api_key"]
            inputs["model"] = "claude-3-5-haiku-latest"
        return SimplifiedLLMOnce(inputs).run()
Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,151 @@
1+
# BrowserUse Documentation
2+
3+
## Overview
4+
The `BrowserUse` module provides functionality to automate browser interactions using an LLM-powered agent. This module enables tasks to be performed in a web browser environment, with the agent navigating websites, filling forms, clicking buttons, and extracting information as needed. The module supports multiple LLM providers and can generate visual recordings of the browser session for debugging purposes.
5+
6+
This documentation covers the three main files in the module:
7+
- `typed.py`: Defines the input and output types.
8+
- `__init__.py`: Initializes the module.
9+
- `BrowserUse.py`: Implements the core functionality for browser automation.
10+
11+
## Files
12+
13+
### 1. `typed.py`
14+
15+
#### Description
16+
This file defines the input and output types required by the `BrowserUse` step using Python's `TypedDict` and `Annotated` for enhanced type safety and validation.
17+
18+
#### Inputs
19+
- `task` (str): The description of the task to be performed in the browser.
20+
- `json_example_schema` (str): Optional schema for formatting the output as JSON.
21+
- API keys (`openai_api_key`, `anthropic_api_key`, `google_api_key`): Provide at least one. When several are present, `google_api_key` takes precedence, then `openai_api_key`, then `anthropic_api_key`.
22+
23+
#### Outputs
24+
- `result` (str): The final result of the browser interaction.
25+
- `request_tokens` (int): The number of tokens used in the request.
26+
- `response_tokens` (int): The number of tokens used in the response.
27+
28+
#### Example Code
29+
```python
30+
class BrowserUseInputs(TypedDict, total=False):
    task: str
    json_example_schema: str
    openai_api_key: Annotated[
        str,
        StepTypeConfig(is_config=True, or_op=["google_api_key", "anthropic_api_key"]),
    ]
    anthropic_api_key: Annotated[
        str, StepTypeConfig(is_config=True, or_op=["google_api_key", "openai_api_key"])
    ]
    google_api_key: Annotated[
        str,
        StepTypeConfig(is_config=True, or_op=["openai_api_key", "anthropic_api_key"]),
    ]
    # Declared in typed.py: when truthy, the session is recorded as a GIF.
    generate_gif: Annotated[bool, StepTypeConfig(is_config=True)]
44+
45+
46+
class BrowserUseOutputs(TypedDict):
    # Final result of the browser interaction.
    result: str
    # Number of tokens used in the request.
    request_tokens: int
    # Number of tokens used in the response.
    response_tokens: int
50+
```
51+
52+
### 2. `__init__.py`
53+
54+
#### Description
55+
An empty file used to mark the directory as a Python package.
56+
57+
#### Example Code
58+
```python
59+
```
60+
61+
### 3. `BrowserUse.py`
62+
63+
#### Description
64+
Implements the core functionality of the `BrowserUse` step. This class initializes a browser instance, sets up the LLM agent, and executes the specified task in the browser environment.
65+
66+
#### Inputs
67+
- The inputs are defined as per the `BrowserUseInputs` class in `typed.py`.
68+
69+
#### Outputs
70+
- A dictionary containing the browser interaction history and final result, optionally formatted as JSON according to the provided schema.
71+
72+
#### Example Code
73+
```python
74+
# Abridged excerpt of BrowserUse.py; the JSON-formatting helper called at the
# end of run() is omitted here.
class BrowserUse(Step, input_class=BrowserUseInputs, output_class=BrowserUseOutputs):
    required_keys = {"task"}

    def __init__(self, inputs):
        super().__init__(inputs)

        if not all(key in inputs.keys() for key in self.required_keys):
            raise ValueError(f'Missing required data: "{self.required_keys}"')

        self.browser = Browser(config=config)
        # Provider precedence: Google, then OpenAI, then Anthropic.
        if "google_api_key" in self.inputs:
            self.llm = ChatGoogleGenerativeAI(
                model="gemini-2.0-flash", google_api_key=self.inputs["google_api_key"]
            )
        elif "openai_api_key" in self.inputs:
            self.llm = ChatOpenAI(model="gpt-4o", api_key=self.inputs["openai_api_key"])
        elif "anthropic_api_key" in self.inputs:
            self.llm = ChatAnthropic(
                model="claude-3-7-sonnet-latest",
                api_key=self.inputs["anthropic_api_key"],
            )
        # Either `generate_gif` or `debug` turns the GIF on; both are
        # optional, so membership is checked before indexing.
        self.generate_gif = (
            f"agent_history_{datetime.now().strftime('%Y-%m-%d_%H-%M-%S')}.gif"
            if ("generate_gif" in self.inputs and self.inputs["generate_gif"])
            or ("debug" in self.inputs and self.inputs["debug"])
            else False
        )

    def run(self) -> dict:
        agent = Agent(
            browser=self.browser,
            task=self.inputs["task"],
            llm=self.llm,
            generate_gif=self.generate_gif,
            validate_output=True,
        )

        loop = asyncio.new_event_loop()
        self.history = loop.run_until_complete(agent.run())

        if self.inputs["json_example_schema"]:
            return self.__format_history_as_json()

        return {
            "history": self.history,
            "result": self.history.final_result(),
            "generated_gif": self.generate_gif,
        }
114+
```
115+
116+
## Usage
117+
118+
1. **Initialize inputs**: Define the necessary inputs including the `task` description and at least one API key for the LLM provider.
119+
120+
2. **Instantiate the `BrowserUse` class**: Pass in the inputs during instantiation.
121+
122+
3. **Run the instance**: Call the `run()` method to execute the browser automation task and retrieve the output.
123+
124+
### Example
125+
126+
```python
127+
# Import the class from `patchwork.steps`, which re-exports it. The
# `patchwork.steps.BrowserUse` package has an empty __init__.py, so importing
# `BrowserUse` from it would yield the submodule rather than the class.
from patchwork.steps import BrowserUse

# Define inputs
inputs = {
    "task": "Go to example.com and extract the main heading",
    "openai_api_key": "your-openai-api-key",
    "debug": True,  # Enable GIF recording for debugging
}

# Create and run the BrowserUse step
browser_step = BrowserUse(inputs)
result = browser_step.run()

# Access the result
print(result["result"])
142+
```
143+
144+
## Features
145+
146+
- **Multi-LLM Support**: Compatible with OpenAI, Anthropic, and Google Generative AI models.
147+
- **Debug Mode**: Set `generate_gif` or `debug` to a truthy value to record the browser session as a timestamped GIF (`agent_history_<timestamp>.gif`) for debugging purposes.
148+
- **JSON Output**: Ability to format the output as JSON according to a provided schema.
149+
- **Headless Operation**: The browser always runs in headless mode for efficiency; this is set in the module-level `BrowserConfig` and is not exposed as an input.
150+
151+
This module is designed to automate browser-based tasks using LLM agents, making it useful for web scraping, form filling, and other web automation scenarios.

patchwork/steps/BrowserUse/__init__.py

Whitespace-only changes.
Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
from typing_extensions import Annotated, TypedDict
2+
3+
from patchwork.common.utils.step_typing import StepTypeConfig
4+
5+
6+
class BrowserUseInputs(TypedDict, total=False):
    """Inputs accepted by the BrowserUse step (every key is optional at the type level)."""

    # Task description handed to the agent; BrowserUse itself enforces its
    # presence through `required_keys`.
    task: str
    # When truthy, the final result is reformatted as JSON using this schema.
    json_example_schema: str
    # Provider API keys. Each entry names the other two keys in `or_op`,
    # presumably meaning any one of the three is sufficient -- confirm against
    # StepTypeConfig.
    openai_api_key: Annotated[
        str,
        StepTypeConfig(is_config=True, or_op=["google_api_key", "anthropic_api_key"]),
    ]
    anthropic_api_key: Annotated[
        str, StepTypeConfig(is_config=True, or_op=["google_api_key", "openai_api_key"])
    ]
    google_api_key: Annotated[
        str,
        StepTypeConfig(is_config=True, or_op=["openai_api_key", "anthropic_api_key"]),
    ]
    # When truthy, BrowserUse passes a timestamped GIF file name to the agent.
    # NOTE(review): BrowserUse also reads a `debug` key that is not declared
    # here -- confirm whether it should be.
    generate_gif: Annotated[bool, StepTypeConfig(is_config=True)]
21+
22+
23+
class BrowserUseOutputs(TypedDict):
    """Declared outputs of the BrowserUse step."""

    # Final result of the browser interaction.
    result: str
    # NOTE(review): BrowserUse.run() returns `history`, `result` and
    # `generated_gif` on its non-JSON path; the token counts below presumably
    # come from SimplifiedLLMOnce on the JSON path -- confirm.
    request_tokens: int
    response_tokens: int

patchwork/steps/__init__.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,7 @@
4848
from patchwork.steps.SimplifiedLLM.SimplifiedLLM import SimplifiedLLM
4949
from patchwork.steps.SimplifiedLLMOnce.SimplifiedLLMOnce import SimplifiedLLMOnce
5050
from patchwork.steps.SlackMessage.SlackMessage import SlackMessage
51+
from patchwork.steps.BrowserUse.BrowserUse import BrowserUse
5152

5253
# Compatibility Aliases
5354
JoinListPB = JoinList
@@ -104,4 +105,5 @@
104105
"JoinList",
105106
"JoinListPB",
106107
"GetTypescriptTypeInfo",
108+
"BrowserUse",
107109
]

0 commit comments

Comments
 (0)