
Commit 41ddd2b

Merge commit with 2 parents: 79b61a9 + 0a6a386

10 files changed: +254 / -373 lines


README.md

Lines changed: 2 additions & 2 deletions
@@ -108,7 +108,7 @@ playwright install
    - `--dark-mode`: Enables dark mode for the user interface.
 3. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
 4. **Using Your Own Browser(Optional):**
-    - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser.
+    - Set `CHROME_PATH` to the executable path of your browser and `CHROME_USER_DATA` to the user data directory of your browser. Leave `CHROME_USER_DATA` empty if you want to use local user data.
     - Windows
     ```env
      CHROME_PATH="C:\Program Files\Google\Chrome\Application\chrome.exe"
@@ -118,7 +118,7 @@ playwright install
     - Mac
     ```env
      CHROME_PATH="/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"
-     CHROME_USER_DATA="~/Library/Application Support/Google/Chrome/Profile 1"
+     CHROME_USER_DATA="/Users/YourUsername/Library/Application Support/Google/Chrome"
     ```
    - Close all Chrome windows
    - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1,3 +1,4 @@
 browser-use==0.1.29
 pyperclip==1.9.0
 gradio==5.10.0
+json-repair
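
The `json-repair` addition backs the parsing change in `src/agent/custom_agent.py` below: the model reply is run through `repair_json` before `json.loads`, so minor syntax slips no longer abort the step. A minimal sketch of that idea, using a made-up reply string rather than real model output:

```python
import json

from json_repair import repair_json

# Hypothetical LLM reply: the trailing comma would make plain json.loads raise.
raw = '{"current_state": {"thought": "click the login button"}, "action": [{"click_element": {"index": 3}},]}'

repaired = repair_json(raw)   # returns a syntactically valid JSON string
parsed = json.loads(repaired)
print(parsed["action"])       # [{'click_element': {'index': 3}}]
```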

src/agent/custom_agent.py

Lines changed: 50 additions & 39 deletions
@@ -8,10 +8,11 @@
 import base64
 import io
 import platform
-from browser_use.agent.prompts import SystemPrompt
+from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
 from browser_use.agent.service import Agent
 from browser_use.agent.views import (
     ActionResult,
+    ActionModel,
     AgentHistoryList,
     AgentOutput,
     AgentHistory,
@@ -30,6 +31,7 @@
 from langchain_core.messages import (
     BaseMessage,
 )
+from json_repair import repair_json
 from src.utils.agent_state import AgentState
 
 from .custom_massage_manager import CustomMassageManager
@@ -52,6 +54,7 @@ def __init__(
             max_failures: int = 5,
             retry_delay: int = 10,
             system_prompt_class: Type[SystemPrompt] = SystemPrompt,
+            agent_prompt_class: Type[AgentMessagePrompt] = AgentMessagePrompt,
             max_input_tokens: int = 128000,
             validate_output: bool = False,
             include_attributes: list[str] = [
@@ -98,28 +101,31 @@ def __init__(
             register_done_callback=register_done_callback,
             tool_calling_method=tool_calling_method
         )
-        if self.model_name in ["deepseek-reasoner"] or self.model_name.startswith("deepseek-r1"):
+        if self.model_name in ["deepseek-reasoner"] or "deepseek-r1" in self.model_name:
             # deepseek-reasoner does not support function calling
             self.use_deepseek_r1 = True
             # deepseek-reasoner only support 64000 context
             self.max_input_tokens = 64000
         else:
             self.use_deepseek_r1 = False
 
+        # record last actions
+        self._last_actions = None
         # custom new info
         self.add_infos = add_infos
         # agent_state for Stop
         self.agent_state = agent_state
+        self.agent_prompt_class = agent_prompt_class
         self.message_manager = CustomMassageManager(
             llm=self.llm,
             task=self.task,
             action_descriptions=self.controller.registry.get_prompt_description(),
             system_prompt_class=self.system_prompt_class,
+            agent_prompt_class=agent_prompt_class,
             max_input_tokens=self.max_input_tokens,
             include_attributes=self.include_attributes,
             max_error_length=self.max_error_length,
-            max_actions_per_step=self.max_actions_per_step,
-            use_deepseek_r1=self.use_deepseek_r1
+            max_actions_per_step=self.max_actions_per_step
         )
 
     def _setup_action_models(self) -> None:
@@ -178,38 +184,39 @@ def update_step_info(
     @time_execution_async("--get_next_action")
     async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
         """Get next action from LLM based on current state"""
+        messages_to_process = (
+            self.message_manager.merge_successive_human_messages(input_messages)
+            if self.use_deepseek_r1
+            else input_messages
+        )
+
+        ai_message = self.llm.invoke(messages_to_process)
+        self.message_manager._add_message_with_tokens(ai_message)
+
         if self.use_deepseek_r1:
-            merged_input_messages = self.message_manager.merge_successive_human_messages(input_messages)
-            ai_message = self.llm.invoke(merged_input_messages)
-            self.message_manager._add_message_with_tokens(ai_message)
-            logger.info(f"🤯 Start Deep Thinking: ")
+            logger.info("🤯 Start Deep Thinking: ")
             logger.info(ai_message.reasoning_content)
-            logger.info(f"🤯 End Deep Thinking")
-            if isinstance(ai_message.content, list):
-                parsed_json = json.loads(ai_message.content[0].replace("```json", "").replace("```", ""))
-            else:
-                parsed_json = json.loads(ai_message.content.replace("```json", "").replace("```", ""))
-            parsed: AgentOutput = self.AgentOutput(**parsed_json)
-            if parsed is None:
-                logger.debug(ai_message.content)
-                raise ValueError(f'Could not parse response.')
+            logger.info("🤯 End Deep Thinking")
+
+        if isinstance(ai_message.content, list):
+            ai_content = ai_message.content[0]
         else:
-            ai_message = self.llm.invoke(input_messages)
-            self.message_manager._add_message_with_tokens(ai_message)
-            if isinstance(ai_message.content, list):
-                parsed_json = json.loads(ai_message.content[0].replace("```json", "").replace("```", ""))
-            else:
-                parsed_json = json.loads(ai_message.content.replace("```json", "").replace("```", ""))
-            parsed: AgentOutput = self.AgentOutput(**parsed_json)
-            if parsed is None:
-                logger.debug(ai_message.content)
-                raise ValueError(f'Could not parse response.')
+            ai_content = ai_message.content
 
-        # cut the number of actions to max_actions_per_step
+        ai_content = ai_content.replace("```json", "").replace("```", "")
+        ai_content = repair_json(ai_content)
+        parsed_json = json.loads(ai_content)
+        parsed: AgentOutput = self.AgentOutput(**parsed_json)
+
+        if parsed is None:
+            logger.debug(ai_message.content)
+            raise ValueError('Could not parse response.')
+
+        # Limit actions to maximum allowed per step
         parsed.action = parsed.action[: self.max_actions_per_step]
         self._log_response(parsed)
         self.n_steps += 1
-
+
         return parsed
 
     @time_execution_async("--step")
@@ -222,7 +229,7 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
 
         try:
             state = await self.browser_context.get_state(use_vision=self.use_vision)
-            self.message_manager.add_state_message(state, self._last_result, step_info)
+            self.message_manager.add_state_message(state, self._last_actions, self._last_result, step_info)
             input_messages = self.message_manager.get_messages()
             try:
                 model_output = await self.get_next_action(input_messages)
@@ -231,27 +238,31 @@ async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
                 self.update_step_info(model_output, step_info)
                 logger.info(f"🧠 All Memory: \n{step_info.memory}")
                 self._save_conversation(input_messages, model_output)
-                # should we remove last state message? at least, deepseek-reasoner cannot remove
                 if self.model_name != "deepseek-reasoner":
-                    self.message_manager._remove_last_state_message()
+                    # remove prev message
+                    self.message_manager._remove_state_message_by_index(-1)
             except Exception as e:
                 # model call failed, remove last state message from history
-                self.message_manager._remove_last_state_message()
+                self.message_manager._remove_state_message_by_index(-1)
                 raise e
 
+            actions: list[ActionModel] = model_output.action
             result: list[ActionResult] = await self.controller.multi_act(
-                model_output.action, self.browser_context
+                actions, self.browser_context
             )
-            if len(result) != len(model_output.action):
+            if len(result) != len(actions):
                 # I think something changes, such information should let LLM know
-                for ri in range(len(result), len(model_output.action)):
+                for ri in range(len(result), len(actions)):
                     result.append(ActionResult(extracted_content=None,
                                                include_in_memory=True,
-                                               error=f"{model_output.action[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
-                                                Something new appeared after action {model_output.action[len(result) - 1].model_dump_json(exclude_unset=True)}",
+                                               error=f"{actions[ri].model_dump_json(exclude_unset=True)} is Failed to execute. \
+                                                Something new appeared after action {actions[len(result) - 1].model_dump_json(exclude_unset=True)}",
                                                is_done=False))
+            if len(actions) == 0:
+                # TODO: fix no action case
+                result = [ActionResult(is_done=True, extracted_content=step_info.memory, include_in_memory=True)]
            self._last_result = result
-
+            self._last_actions = actions
             if len(result) > 0 and result[-1].is_done:
                 logger.info(f"📄 Result: {result[-1].extracted_content}")

src/agent/custom_massage_manager.py

Lines changed: 20 additions & 7 deletions
@@ -5,8 +5,8 @@
 
 from browser_use.agent.message_manager.service import MessageManager
 from browser_use.agent.message_manager.views import MessageHistory
-from browser_use.agent.prompts import SystemPrompt
-from browser_use.agent.views import ActionResult, AgentStepInfo
+from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
+from browser_use.agent.views import ActionResult, AgentStepInfo, ActionModel
 from browser_use.browser.views import BrowserState
 from langchain_core.language_models import BaseChatModel
 from langchain_anthropic import ChatAnthropic
@@ -31,14 +31,14 @@ def __init__(
             task: str,
             action_descriptions: str,
             system_prompt_class: Type[SystemPrompt],
+            agent_prompt_class: Type[AgentMessagePrompt],
             max_input_tokens: int = 128000,
             estimated_characters_per_token: int = 3,
             image_tokens: int = 800,
             include_attributes: list[str] = [],
             max_error_length: int = 400,
             max_actions_per_step: int = 10,
-            message_context: Optional[str] = None,
-            use_deepseek_r1: bool = False
+            message_context: Optional[str] = None
     ):
         super().__init__(
             llm=llm,
@@ -53,8 +53,7 @@ def __init__(
             max_actions_per_step=max_actions_per_step,
             message_context=message_context
         )
-        self.tool_id = 1
-        self.use_deepseek_r1 = use_deepseek_r1
+        self.agent_prompt_class = agent_prompt_class
         # Custom: Move Task info to state_message
         self.history = MessageHistory()
         self._add_message_with_tokens(self.system_prompt)
@@ -75,13 +74,15 @@ def cut_messages(self):
     def add_state_message(
             self,
             state: BrowserState,
+            actions: Optional[List[ActionModel]] = None,
             result: Optional[List[ActionResult]] = None,
             step_info: Optional[AgentStepInfo] = None,
     ) -> None:
         """Add browser state as human message"""
         # otherwise add state message and result to next message (which will not stay in memory)
-        state_message = CustomAgentMessagePrompt(
+        state_message = self.agent_prompt_class(
             state,
+            actions,
             result,
             include_attributes=self.include_attributes,
             max_error_length=self.max_error_length,
@@ -102,3 +103,15 @@ def _count_text_tokens(self, text: str) -> int:
             len(text) // self.estimated_characters_per_token
         )  # Rough estimate if no tokenizer available
         return tokens
+
+    def _remove_state_message_by_index(self, remove_ind=-1) -> None:
+        """Remove last state message from history"""
+        i = len(self.history.messages) - 1
+        remove_cnt = 0
+        while i >= 0:
+            if isinstance(self.history.messages[i].message, HumanMessage):
+                remove_cnt += 1
+            if remove_cnt == abs(remove_ind):
+                self.history.remove_message(i)
+                break
+            i -= 1
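
The new `_remove_state_message_by_index` walks the history from the end and deletes the Nth-most-recent human (state) message: `-1` removes the latest one, `-2` the one before it. A toy illustration of the same backwards scan over a plain list; the `(text, is_human)` tuples are made up for the example:

```python
def remove_nth_last_human(msgs: list[tuple[str, bool]], remove_ind: int = -1) -> None:
    """Mirror of the counting logic above, applied to (text, is_human) pairs."""
    i = len(msgs) - 1
    remove_cnt = 0
    while i >= 0:
        if msgs[i][1]:                      # only human messages are counted
            remove_cnt += 1
        if remove_cnt == abs(remove_ind):   # reached the Nth-most-recent one
            del msgs[i]
            break
        i -= 1


history = [("system", False), ("state step 1", True), ("ai step 1", False), ("state step 2", True)]
remove_nth_last_human(history, -1)   # drops "state step 2"
print([text for text, _ in history])
```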

src/agent/custom_prompts.py

Lines changed: 13 additions & 8 deletions
@@ -2,7 +2,7 @@
 from typing import List, Optional
 
 from browser_use.agent.prompts import SystemPrompt, AgentMessagePrompt
-from browser_use.agent.views import ActionResult
+from browser_use.agent.views import ActionResult, ActionModel
 from browser_use.browser.views import BrowserState
 from langchain_core.messages import HumanMessage, SystemMessage
 
@@ -56,7 +56,7 @@ def important_rules(self) -> str:
    - Use scroll to find elements you are looking for
 
 5. TASK COMPLETION:
-   - If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the done action to terminate the operation process.
+   - If you think all the requirements of user\'s instruction have been completed and no further operation is required, output the **Done** action to terminate the operation process.
    - Don't hallucinate actions.
    - If the task requires specific information - make sure to include everything in the done function. This is what the user will see.
    - If you are running out of steps (current step), think about speeding it up, and ALWAYS use the done action as the last action.
@@ -140,6 +140,7 @@ class CustomAgentMessagePrompt(AgentMessagePrompt):
     def __init__(
             self,
             state: BrowserState,
+            actions: Optional[List[ActionModel]] = None,
             result: Optional[List[ActionResult]] = None,
             include_attributes: list[str] = [],
             max_error_length: int = 400,
@@ -151,10 +152,11 @@ def __init__(
             max_error_length=max_error_length,
             step_info=step_info
         )
+        self.actions = actions
 
     def get_user_message(self) -> HumanMessage:
         if self.step_info:
-            step_info_description = f'Current step: {self.step_info.step_number + 1}/{self.step_info.max_steps}'
+            step_info_description = f'Current step: {self.step_info.step_number}/{self.step_info.max_steps}\n'
         else:
             step_info_description = ''
 
@@ -181,7 +183,7 @@ def get_user_message(self) -> HumanMessage:
 
         state_description = f"""
 {step_info_description}
-1. Task: {self.step_info.task}
+1. Task: {self.step_info.task}.
 2. Hints(Optional):
 {self.step_info.add_infos}
 3. Memory:
@@ -193,17 +195,20 @@ def get_user_message(self) -> HumanMessage:
 {elements_text}
 """
 
-        if self.result:
-
+        if self.actions and self.result:
+            state_description += "\n **Previous Actions** \n"
+            state_description += f'Previous step: {self.step_info.step_number-1}/{self.step_info.max_steps} \n'
             for i, result in enumerate(self.result):
+                action = self.actions[i]
+                state_description += f"Previous action {i + 1}/{len(self.result)}: {action.model_dump_json(exclude_unset=True)}\n"
                 if result.include_in_memory:
                     if result.extracted_content:
-                        state_description += f"\nResult of previous action {i + 1}/{len(self.result)}: {result.extracted_content}"
+                        state_description += f"Result of previous action {i + 1}/{len(self.result)}: {result.extracted_content}\n"
                     if result.error:
                         # only use last 300 characters of error
                         error = result.error[-self.max_error_length:]
                         state_description += (
-                            f"\nError of previous action {i + 1}/{len(self.result)}: ...{error}"
+                            f"Error of previous action {i + 1}/{len(self.result)}: ...{error}\n"
                         )
 
         if self.state.screenshot:
src/controller/custom_controller.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
from pydantic import BaseModel
44
from browser_use.agent.views import ActionResult
55
from browser_use.browser.context import BrowserContext
6-
from browser_use.controller.service import Controller
6+
from browser_use.controller.service import Controller, DoneAction
77

88

99
class CustomController(Controller):
