Skip to content

Commit 566bca7

Browse files
authored
Merge pull request #173 from vvincent1234/fix/adapt_latest_browser-use
Fix/adapt latest browser use
2 parents 284f0b7 + b9080c3 commit 566bca7

14 files changed

+454
-486
lines changed

requirements.txt

Lines changed: 2 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,3 @@
1-
browser-use==0.1.19
2-
langchain-google-genai==2.0.8
1+
browser-use==0.1.29
32
pyperclip==1.9.0
4-
gradio==5.9.1
5-
langchain-ollama==0.2.2
6-
langchain-openai==0.2.14
3+
gradio==5.10.0

src/agent/custom_agent.py

Lines changed: 200 additions & 193 deletions
Large diffs are not rendered by default.

src/agent/custom_massage_manager.py

Lines changed: 17 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@
1515
AIMessage,
1616
BaseMessage,
1717
HumanMessage,
18+
ToolMessage
1819
)
1920
from langchain_openai import ChatOpenAI
2021
from ..utils.llm import DeepSeekR1ChatOpenAI
@@ -31,69 +32,44 @@ def __init__(
3132
action_descriptions: str,
3233
system_prompt_class: Type[SystemPrompt],
3334
max_input_tokens: int = 128000,
34-
estimated_tokens_per_character: int = 3,
35+
estimated_characters_per_token: int = 3,
3536
image_tokens: int = 800,
3637
include_attributes: list[str] = [],
3738
max_error_length: int = 400,
3839
max_actions_per_step: int = 10,
39-
tool_call_in_content: bool = False,
40-
use_function_calling: bool = True
40+
message_context: Optional[str] = None,
41+
use_deepseek_r1: bool = False
4142
):
4243
super().__init__(
4344
llm=llm,
4445
task=task,
4546
action_descriptions=action_descriptions,
4647
system_prompt_class=system_prompt_class,
4748
max_input_tokens=max_input_tokens,
48-
estimated_tokens_per_character=estimated_tokens_per_character,
49+
estimated_characters_per_token=estimated_characters_per_token,
4950
image_tokens=image_tokens,
5051
include_attributes=include_attributes,
5152
max_error_length=max_error_length,
5253
max_actions_per_step=max_actions_per_step,
53-
tool_call_in_content=tool_call_in_content,
54+
message_context=message_context
5455
)
55-
self.use_function_calling = use_function_calling
56+
self.tool_id = 1
57+
self.use_deepseek_r1 = use_deepseek_r1
5658
# Custom: Move Task info to state_message
5759
self.history = MessageHistory()
5860
self._add_message_with_tokens(self.system_prompt)
5961

60-
if self.use_function_calling:
61-
tool_calls = [
62-
{
63-
'name': 'CustomAgentOutput',
64-
'args': {
65-
'current_state': {
66-
'prev_action_evaluation': 'Unknown - No previous actions to evaluate.',
67-
'important_contents': '',
68-
'completed_contents': '',
69-
'thought': 'Now Google is open. Need to type OpenAI to search.',
70-
'summary': 'Type OpenAI to search.',
71-
},
72-
'action': [],
73-
},
74-
'id': '',
75-
'type': 'tool_call',
76-
}
77-
]
78-
if self.tool_call_in_content:
79-
# openai throws error if tool_calls are not responded -> move to content
80-
example_tool_call = AIMessage(
81-
content=f'{tool_calls}',
82-
tool_calls=[],
83-
)
84-
else:
85-
example_tool_call = AIMessage(
86-
content=f'',
87-
tool_calls=tool_calls,
88-
)
89-
90-
self._add_message_with_tokens(example_tool_call)
62+
if self.message_context:
63+
context_message = HumanMessage(content=self.message_context)
64+
self._add_message_with_tokens(context_message)
9165

9266
def cut_messages(self):
9367
"""Get current message list, potentially trimmed to max tokens"""
9468
diff = self.history.total_tokens - self.max_input_tokens
95-
while diff > 0 and len(self.history.messages) > 1:
96-
self.history.remove_message(1) # alway remove the oldest one
69+
min_message_len = 2 if self.message_context is not None else 1
70+
71+
while diff > 0 and len(self.history.messages) > min_message_len:
72+
self.history.remove_message(min_message_len) # always remove the oldest message after the system prompt (and the context message, if one is set)
9773
diff = self.history.total_tokens - self.max_input_tokens
9874

9975
def add_state_message(
@@ -119,10 +95,10 @@ def _count_text_tokens(self, text: str) -> int:
11995
tokens = self.llm.get_num_tokens(text)
12096
except Exception:
12197
tokens = (
122-
len(text) // self.ESTIMATED_TOKENS_PER_CHARACTER
98+
len(text) // self.estimated_characters_per_token
12399
) # Rough estimate if no tokenizer available
124100
else:
125101
tokens = (
126-
len(text) // self.ESTIMATED_TOKENS_PER_CHARACTER
102+
len(text) // self.estimated_characters_per_token
127103
) # Rough estimate if no tokenizer available
128104
return tokens

src/agent/custom_prompts.py

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
import pdb
22
from typing import List, Optional
33

4-
from browser_use.agent.prompts import SystemPrompt
4+
from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
55
from browser_use.agent.views import ActionResult
66
from browser_use.browser.views import BrowserState
77
from langchain_core.messages import HumanMessage, SystemMessage
@@ -19,19 +19,14 @@ def important_rules(self) -> str:
1919
{
2020
"current_state": {
2121
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
22-
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
22+
"important_contents": "Output important contents closely related to user\'s instruction on the current page. If there is, please output the contents. If not, please output empty string ''.",
2323
"task_progress": "Task Progress is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the content at current step and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button. Please return string type not a list.",
2424
"future_plans": "Based on the user's request and the current state, outline the remaining steps needed to complete the task. This should be a concise list of actions yet to be performed, such as: 1. Select a date. 2. Choose a specific time slot. 3. Confirm booking. Please return string type not a list.",
2525
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If your output of prev_action_evaluation is 'Failed', please reflect and output your reflection here.",
2626
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
2727
},
2828
"action": [
29-
{
30-
"action_name": {
31-
// action-specific parameters
32-
}
33-
},
34-
// ... more actions in sequence
29+
* actions in sequences, please refer to **Common action sequences**. Each output action MUST be formated as: \{action_name\: action_params\}*
3530
]
3631
}
3732
@@ -44,7 +39,6 @@ def important_rules(self) -> str:
4439
{"click_element": {"index": 3}}
4540
]
4641
- Navigation and extraction: [
47-
{"open_new_tab": {}},
4842
{"go_to_url": {"url": "https://example.com"}},
4943
{"extract_page_content": {}}
5044
]
@@ -127,7 +121,7 @@ def get_system_message(self) -> SystemMessage:
127121
AGENT_PROMPT = f"""You are a precise browser automation agent that interacts with websites through structured commands. Your role is to:
128122
1. Analyze the provided webpage elements and structure
129123
2. Plan a sequence of actions to accomplish the given task
130-
3. Respond with valid JSON containing your action sequence and state assessment
124+
3. Your final result MUST be a valid JSON as the **RESPONSE FORMAT** described, containing your action sequence and state assessment, No need extra content to expalin.
131125
132126
Current date and time: {time_str}
133127
@@ -142,7 +136,7 @@ def get_system_message(self) -> SystemMessage:
142136
return SystemMessage(content=AGENT_PROMPT)
143137

144138

145-
class CustomAgentMessagePrompt:
139+
class CustomAgentMessagePrompt(AgentMessagePrompt):
146140
def __init__(
147141
self,
148142
state: BrowserState,
@@ -151,11 +145,12 @@ def __init__(
151145
max_error_length: int = 400,
152146
step_info: Optional[CustomAgentStepInfo] = None,
153147
):
154-
self.state = state
155-
self.result = result
156-
self.max_error_length = max_error_length
157-
self.include_attributes = include_attributes
158-
self.step_info = step_info
148+
super(CustomAgentMessagePrompt, self).__init__(state=state,
149+
result=result,
150+
include_attributes=include_attributes,
151+
max_error_length=max_error_length,
152+
step_info=step_info
153+
)
159154

160155
def get_user_message(self) -> HumanMessage:
161156
if self.step_info:
@@ -164,8 +159,26 @@ def get_user_message(self) -> HumanMessage:
164159
step_info_description = ''
165160

166161
elements_text = self.state.element_tree.clickable_elements_to_string(include_attributes=self.include_attributes)
167-
if not elements_text:
162+
163+
has_content_above = (self.state.pixels_above or 0) > 0
164+
has_content_below = (self.state.pixels_below or 0) > 0
165+
166+
if elements_text != '':
167+
if has_content_above:
168+
elements_text = (
169+
f'... {self.state.pixels_above} pixels above - scroll or extract content to see more ...\n{elements_text}'
170+
)
171+
else:
172+
elements_text = f'[Start of page]\n{elements_text}'
173+
if has_content_below:
174+
elements_text = (
175+
f'{elements_text}\n... {self.state.pixels_below} pixels below - scroll or extract content to see more ...'
176+
)
177+
else:
178+
elements_text = f'{elements_text}\n[End of page]'
179+
else:
168180
elements_text = 'empty page'
181+
169182
state_description = f"""
170183
{step_info_description}
171184
1. Task: {self.step_info.task}
@@ -181,15 +194,17 @@ def get_user_message(self) -> HumanMessage:
181194
"""
182195

183196
if self.result:
197+
184198
for i, result in enumerate(self.result):
185-
if result.extracted_content:
186-
state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
187-
if result.error:
188-
# only use last 300 characters of error
189-
error = result.error[-self.max_error_length:]
190-
state_description += (
191-
f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
192-
)
199+
if result.include_in_memory:
200+
if result.extracted_content:
201+
state_description += f"\nResult of previous action {i + 1}/{len(self.result)}: {result.extracted_content}"
202+
if result.error:
203+
# only use the last max_error_length characters of the error
204+
error = result.error[-self.max_error_length:]
205+
state_description += (
206+
f"\nError of previous action {i + 1}/{len(self.result)}: ...{error}"
207+
)
193208

194209
if self.state.screenshot:
195210
# Format message for vision model

src/agent/custom_views.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def type_with_custom_actions(
4545
) -> Type["CustomAgentOutput"]:
4646
"""Extend actions with custom actions"""
4747
return create_model(
48-
"AgentOutput",
48+
"CustomAgentOutput",
4949
__base__=CustomAgentOutput,
5050
action=(
5151
list[custom_actions],

0 commit comments

Comments
 (0)