Skip to content

Commit 041dc55

Browse files
committed
feat: adapt to new version of browser-use
1 parent dcb3914 commit 041dc55

File tree

8 files changed

+254
-162
lines changed

8 files changed

+254
-162
lines changed

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
browser-use==0.1.17
2-
langchain-google-genai
1+
browser-use>=0.1.18
2+
langchain-google-genai>=2.0.8
33
pyperclip
44
gradio
55
langchain-ollama

src/agent/custom_agent.py

Lines changed: 80 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import json
88
import logging
9+
import pdb
10+
import traceback
911
from typing import Optional, Type
1012

1113
from browser_use.agent.prompts import SystemPrompt
@@ -37,51 +39,53 @@
3739

3840
class CustomAgent(Agent):
3941
def __init__(
40-
self,
41-
task: str,
42-
llm: BaseChatModel,
43-
add_infos: str = "",
44-
browser: Browser | None = None,
45-
browser_context: BrowserContext | None = None,
46-
controller: Controller = Controller(),
47-
use_vision: bool = True,
48-
save_conversation_path: Optional[str] = None,
49-
max_failures: int = 5,
50-
retry_delay: int = 10,
51-
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
52-
max_input_tokens: int = 128000,
53-
validate_output: bool = False,
54-
include_attributes: list[str] = [
55-
"title",
56-
"type",
57-
"name",
58-
"role",
59-
"tabindex",
60-
"aria-label",
61-
"placeholder",
62-
"value",
63-
"alt",
64-
"aria-expanded",
65-
],
66-
max_error_length: int = 400,
67-
max_actions_per_step: int = 10,
42+
self,
43+
task: str,
44+
llm: BaseChatModel,
45+
add_infos: str = "",
46+
browser: Browser | None = None,
47+
browser_context: BrowserContext | None = None,
48+
controller: Controller = Controller(),
49+
use_vision: bool = True,
50+
save_conversation_path: Optional[str] = None,
51+
max_failures: int = 5,
52+
retry_delay: int = 10,
53+
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
54+
max_input_tokens: int = 128000,
55+
validate_output: bool = False,
56+
include_attributes: list[str] = [
57+
"title",
58+
"type",
59+
"name",
60+
"role",
61+
"tabindex",
62+
"aria-label",
63+
"placeholder",
64+
"value",
65+
"alt",
66+
"aria-expanded",
67+
],
68+
max_error_length: int = 400,
69+
max_actions_per_step: int = 10,
70+
tool_call_in_content: bool = True,
6871
):
6972
super().__init__(
70-
task,
71-
llm,
72-
browser,
73-
browser_context,
74-
controller,
75-
use_vision,
76-
save_conversation_path,
77-
max_failures,
78-
retry_delay,
79-
system_prompt_class,
80-
max_input_tokens,
81-
validate_output,
82-
include_attributes,
83-
max_error_length,
84-
max_actions_per_step,
73+
task=task,
74+
llm=llm,
75+
browser=browser,
76+
browser_context=browser_context,
77+
controller=controller,
78+
use_vision=use_vision,
79+
save_conversation_path=save_conversation_path,
80+
max_failures=max_failures,
81+
retry_delay=retry_delay,
82+
system_prompt_class=system_prompt_class,
83+
max_input_tokens=max_input_tokens,
84+
validate_output=validate_output,
85+
include_attributes=include_attributes,
86+
max_error_length=max_error_length,
87+
max_actions_per_step=max_actions_per_step,
88+
tool_call_in_content=tool_call_in_content,
8589
)
8690
self.add_infos = add_infos
8791
self.message_manager = CustomMassageManager(
@@ -93,6 +97,7 @@ def __init__(
9397
include_attributes=self.include_attributes,
9498
max_error_length=self.max_error_length,
9599
max_actions_per_step=self.max_actions_per_step,
100+
tool_call_in_content=tool_call_in_content,
96101
)
97102

98103
def _setup_action_models(self) -> None:
@@ -122,7 +127,7 @@ def _log_response(self, response: CustomAgentOutput) -> None:
122127
)
123128

124129
def update_step_info(
125-
self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
130+
self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
126131
):
127132
"""
128133
update step info
@@ -133,9 +138,9 @@ def update_step_info(
133138
step_info.step_number += 1
134139
important_contents = model_output.current_state.important_contents
135140
if (
136-
important_contents
137-
and "None" not in important_contents
138-
and important_contents not in step_info.memory
141+
important_contents
142+
and "None" not in important_contents
143+
and important_contents not in step_info.memory
139144
):
140145
step_info.memory += important_contents + "\n"
141146

@@ -146,16 +151,35 @@ def update_step_info(
146151
@time_execution_async("--get_next_action")
147152
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
148153
"""Get next action from LLM based on current state"""
154+
try:
155+
structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
156+
response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore
157+
158+
parsed: AgentOutput = response['parsed']
159+
# cut the number of actions to max_actions_per_step
160+
parsed.action = parsed.action[: self.max_actions_per_step]
161+
self._log_response(parsed)
162+
self.n_steps += 1
163+
164+
return parsed
165+
except Exception as e:
166+
# If something goes wrong, try to invoke the LLM again without structured output,
167+
# and manually parse the response. Temporary solution for DeepSeek
168+
ret = self.llm.invoke(input_messages)
169+
if isinstance(ret.content, list):
170+
parsed_json = json.loads(ret.content[0].replace("```json", "").replace("```", ""))
171+
else:
172+
parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
173+
parsed: AgentOutput = self.AgentOutput(**parsed_json)
174+
if parsed is None:
175+
raise ValueError(f'Could not parse response.')
149176

150-
ret = self.llm.invoke(input_messages)
151-
parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
152-
parsed: AgentOutput = self.AgentOutput(**parsed_json)
153-
# cut the number of actions to max_actions_per_step
154-
parsed.action = parsed.action[: self.max_actions_per_step]
155-
self._log_response(parsed)
156-
self.n_steps += 1
177+
# cut the number of actions to max_actions_per_step
178+
parsed.action = parsed.action[: self.max_actions_per_step]
179+
self._log_response(parsed)
180+
self.n_steps += 1
157181

158-
return parsed
182+
return parsed
159183

160184
@time_execution_async("--step")
161185
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
@@ -233,7 +257,7 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList:
233257

234258
if self.history.is_done():
235259
if (
236-
self.validate_output and step < max_steps - 1
260+
self.validate_output and step < max_steps - 1
237261
): # if last step, we dont need to validate
238262
if not await self._validate_output():
239263
continue

src/agent/custom_massage_manager.py

Lines changed: 58 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from langchain_core.language_models import BaseChatModel
1818
from langchain_core.messages import (
1919
HumanMessage,
20+
AIMessage
2021
)
2122

2223
from .custom_prompts import CustomAgentMessagePrompt
@@ -26,40 +27,70 @@
2627

2728
class CustomMassageManager(MessageManager):
2829
def __init__(
29-
self,
30-
llm: BaseChatModel,
31-
task: str,
32-
action_descriptions: str,
33-
system_prompt_class: Type[SystemPrompt],
34-
max_input_tokens: int = 128000,
35-
estimated_tokens_per_character: int = 3,
36-
image_tokens: int = 800,
37-
include_attributes: list[str] = [],
38-
max_error_length: int = 400,
39-
max_actions_per_step: int = 10,
30+
self,
31+
llm: BaseChatModel,
32+
task: str,
33+
action_descriptions: str,
34+
system_prompt_class: Type[SystemPrompt],
35+
max_input_tokens: int = 128000,
36+
estimated_tokens_per_character: int = 3,
37+
image_tokens: int = 800,
38+
include_attributes: list[str] = [],
39+
max_error_length: int = 400,
40+
max_actions_per_step: int = 10,
41+
tool_call_in_content: bool = False,
4042
):
4143
super().__init__(
42-
llm,
43-
task,
44-
action_descriptions,
45-
system_prompt_class,
46-
max_input_tokens,
47-
estimated_tokens_per_character,
48-
image_tokens,
49-
include_attributes,
50-
max_error_length,
51-
max_actions_per_step,
44+
llm=llm,
45+
task=task,
46+
action_descriptions=action_descriptions,
47+
system_prompt_class=system_prompt_class,
48+
max_input_tokens=max_input_tokens,
49+
estimated_tokens_per_character=estimated_tokens_per_character,
50+
image_tokens=image_tokens,
51+
include_attributes=include_attributes,
52+
max_error_length=max_error_length,
53+
max_actions_per_step=max_actions_per_step,
54+
tool_call_in_content=tool_call_in_content,
5255
)
5356

54-
# Move Task info to state_message
57+
# Custom: Move Task info to state_message
5558
self.history = MessageHistory()
5659
self._add_message_with_tokens(self.system_prompt)
60+
tool_calls = [
61+
{
62+
'name': 'AgentOutput',
63+
'args': {
64+
'current_state': {
65+
'evaluation_previous_goal': 'Unknown - No previous actions to evaluate.',
66+
'memory': '',
67+
'next_goal': 'Obtain task from user',
68+
},
69+
'action': [],
70+
},
71+
'id': '',
72+
'type': 'tool_call',
73+
}
74+
]
75+
if self.tool_call_in_content:
76+
# openai throws error if tool_calls are not responded -> move to content
77+
example_tool_call = AIMessage(
78+
content=f'{tool_calls}',
79+
tool_calls=[],
80+
)
81+
else:
82+
example_tool_call = AIMessage(
83+
content=f'',
84+
tool_calls=tool_calls,
85+
)
86+
87+
self._add_message_with_tokens(example_tool_call)
5788

5889
def add_state_message(
59-
self,
60-
state: BrowserState,
61-
result: Optional[List[ActionResult]] = None,
62-
step_info: Optional[AgentStepInfo] = None,
90+
self,
91+
state: BrowserState,
92+
result: Optional[List[ActionResult]] = None,
93+
step_info: Optional[AgentStepInfo] = None,
6394
) -> None:
6495
"""Add browser state as human message"""
6596

@@ -72,7 +103,7 @@ def add_state_message(
72103
self._add_message_with_tokens(msg)
73104
if r.error:
74105
msg = HumanMessage(
75-
content=str(r.error)[-self.max_error_length :]
106+
content=str(r.error)[-self.max_error_length:]
76107
)
77108
self._add_message_with_tokens(msg)
78109
result = None # if result in history, we dont want to add it again

src/agent/custom_prompts.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def important_rules(self) -> str:
2424
{
2525
"current_state": {
2626
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
27-
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output \"None\".",
27+
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
2828
"completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
2929
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
3030
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
@@ -148,12 +148,12 @@ def get_system_message(self) -> SystemMessage:
148148

149149
class CustomAgentMessagePrompt:
150150
def __init__(
151-
self,
152-
state: BrowserState,
153-
result: Optional[List[ActionResult]] = None,
154-
include_attributes: list[str] = [],
155-
max_error_length: int = 400,
156-
step_info: Optional[CustomAgentStepInfo] = None,
151+
self,
152+
state: BrowserState,
153+
result: Optional[List[ActionResult]] = None,
154+
include_attributes: list[str] = [],
155+
max_error_length: int = 400,
156+
step_info: Optional[CustomAgentStepInfo] = None,
157157
):
158158
self.state = state
159159
self.result = result
@@ -183,7 +183,7 @@ def get_user_message(self) -> HumanMessage:
183183
state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
184184
if result.error:
185185
# only use last 300 characters of error
186-
error = result.error[-self.max_error_length :]
186+
error = result.error[-self.max_error_length:]
187187
state_description += (
188188
f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
189189
)

src/browser/custom_context.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ def __init__(
2323
config: BrowserContextConfig = BrowserContextConfig(),
2424
context: BrowserContext = None,
2525
):
26-
super(CustomBrowserContext, self).__init__(browser, config)
26+
super(CustomBrowserContext, self).__init__(browser=browser, config=config)
2727
self.context = context
2828

2929
async def _create_context(self, browser: PlaywrightBrowser):
3030
"""Creates a new browser context with anti-detection measures and loads cookies if available."""
31+
# If we have a context, return it directly
3132
if self.context:
3233
return self.context
3334
if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
@@ -46,7 +47,7 @@ async def _create_context(self, browser: PlaywrightBrowser):
4647
bypass_csp=self.config.disable_security,
4748
ignore_https_errors=self.config.disable_security,
4849
record_video_dir=self.config.save_recording_path,
49-
record_video_size=self.config.browser_window_size, # set record video size
50+
record_video_size=self.config.browser_window_size, # set record video size, same as windows size
5051
)
5152

5253
if self.config.trace_path:

src/utils/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def get_llm_model(provider: str, **kwargs):
8686
return ChatOllama(
8787
model=kwargs.get("model_name", "qwen2.5:7b"),
8888
temperature=kwargs.get("temperature", 0.0),
89+
num_ctx=128000,
8990
)
9091
elif provider == "azure_openai":
9192
if not kwargs.get("base_url", ""):

0 commit comments

Comments
 (0)