Skip to content

Commit 041dc55

Browse files
committed
feat: adapt to new version of browser-use
1 parent dcb3914 commit 041dc55

File tree

8 files changed

+254
-162
lines changed

8 files changed

+254
-162
lines changed

requirements.txt

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
1-
browser-use==0.1.17
2-
langchain-google-genai
1+
browser-use>=0.1.18
2+
langchain-google-genai>=2.0.8
33
pyperclip
44
gradio
55
langchain-ollama

src/agent/custom_agent.py

Lines changed: 80 additions & 56 deletions
Original file line numberDiff line numberDiff line change
@@ -6,6 +6,8 @@
66

77
import json
88
import logging
9+
import pdb
10+
import traceback
911
from typing import Optional, Type
1012

1113
from browser_use.agent.prompts import SystemPrompt
@@ -37,51 +39,53 @@
3739

3840
class CustomAgent(Agent):
3941
def __init__(
40-
self,
41-
task: str,
42-
llm: BaseChatModel,
43-
add_infos: str = "",
44-
browser: Browser | None = None,
45-
browser_context: BrowserContext | None = None,
46-
controller: Controller = Controller(),
47-
use_vision: bool = True,
48-
save_conversation_path: Optional[str] = None,
49-
max_failures: int = 5,
50-
retry_delay: int = 10,
51-
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
52-
max_input_tokens: int = 128000,
53-
validate_output: bool = False,
54-
include_attributes: list[str] = [
55-
"title",
56-
"type",
57-
"name",
58-
"role",
59-
"tabindex",
60-
"aria-label",
61-
"placeholder",
62-
"value",
63-
"alt",
64-
"aria-expanded",
65-
],
66-
max_error_length: int = 400,
67-
max_actions_per_step: int = 10,
42+
self,
43+
task: str,
44+
llm: BaseChatModel,
45+
add_infos: str = "",
46+
browser: Browser | None = None,
47+
browser_context: BrowserContext | None = None,
48+
controller: Controller = Controller(),
49+
use_vision: bool = True,
50+
save_conversation_path: Optional[str] = None,
51+
max_failures: int = 5,
52+
retry_delay: int = 10,
53+
system_prompt_class: Type[SystemPrompt] = SystemPrompt,
54+
max_input_tokens: int = 128000,
55+
validate_output: bool = False,
56+
include_attributes: list[str] = [
57+
"title",
58+
"type",
59+
"name",
60+
"role",
61+
"tabindex",
62+
"aria-label",
63+
"placeholder",
64+
"value",
65+
"alt",
66+
"aria-expanded",
67+
],
68+
max_error_length: int = 400,
69+
max_actions_per_step: int = 10,
70+
tool_call_in_content: bool = True,
6871
):
6972
super().__init__(
70-
task,
71-
llm,
72-
browser,
73-
browser_context,
74-
controller,
75-
use_vision,
76-
save_conversation_path,
77-
max_failures,
78-
retry_delay,
79-
system_prompt_class,
80-
max_input_tokens,
81-
validate_output,
82-
include_attributes,
83-
max_error_length,
84-
max_actions_per_step,
73+
task=task,
74+
llm=llm,
75+
browser=browser,
76+
browser_context=browser_context,
77+
controller=controller,
78+
use_vision=use_vision,
79+
save_conversation_path=save_conversation_path,
80+
max_failures=max_failures,
81+
retry_delay=retry_delay,
82+
system_prompt_class=system_prompt_class,
83+
max_input_tokens=max_input_tokens,
84+
validate_output=validate_output,
85+
include_attributes=include_attributes,
86+
max_error_length=max_error_length,
87+
max_actions_per_step=max_actions_per_step,
88+
tool_call_in_content=tool_call_in_content,
8589
)
8690
self.add_infos = add_infos
8791
self.message_manager = CustomMassageManager(
@@ -93,6 +97,7 @@ def __init__(
9397
include_attributes=self.include_attributes,
9498
max_error_length=self.max_error_length,
9599
max_actions_per_step=self.max_actions_per_step,
100+
tool_call_in_content=tool_call_in_content,
96101
)
97102

98103
def _setup_action_models(self) -> None:
@@ -122,7 +127,7 @@ def _log_response(self, response: CustomAgentOutput) -> None:
122127
)
123128

124129
def update_step_info(
125-
self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
130+
self, model_output: CustomAgentOutput, step_info: CustomAgentStepInfo = None
126131
):
127132
"""
128133
update step info
@@ -133,9 +138,9 @@ def update_step_info(
133138
step_info.step_number += 1
134139
important_contents = model_output.current_state.important_contents
135140
if (
136-
important_contents
137-
and "None" not in important_contents
138-
and important_contents not in step_info.memory
141+
important_contents
142+
and "None" not in important_contents
143+
and important_contents not in step_info.memory
139144
):
140145
step_info.memory += important_contents + "\n"
141146

@@ -146,16 +151,35 @@ def update_step_info(
146151
@time_execution_async("--get_next_action")
147152
async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
148153
"""Get next action from LLM based on current state"""
154+
try:
155+
structured_llm = self.llm.with_structured_output(self.AgentOutput, include_raw=True)
156+
response: dict[str, Any] = await structured_llm.ainvoke(input_messages) # type: ignore
157+
158+
parsed: AgentOutput = response['parsed']
159+
# cut the number of actions to max_actions_per_step
160+
parsed.action = parsed.action[: self.max_actions_per_step]
161+
self._log_response(parsed)
162+
self.n_steps += 1
163+
164+
return parsed
165+
except Exception as e:
166+
# If something goes wrong, try to invoke the LLM again without structured output,
167+
# and manually parse the response. Temporary solution for DeepSeek
168+
ret = self.llm.invoke(input_messages)
169+
if isinstance(ret.content, list):
170+
parsed_json = json.loads(ret.content[0].replace("```json", "").replace("```", ""))
171+
else:
172+
parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
173+
parsed: AgentOutput = self.AgentOutput(**parsed_json)
174+
if parsed is None:
175+
raise ValueError(f'Could not parse response.')
149176

150-
ret = self.llm.invoke(input_messages)
151-
parsed_json = json.loads(ret.content.replace("```json", "").replace("```", ""))
152-
parsed: AgentOutput = self.AgentOutput(**parsed_json)
153-
# cut the number of actions to max_actions_per_step
154-
parsed.action = parsed.action[: self.max_actions_per_step]
155-
self._log_response(parsed)
156-
self.n_steps += 1
177+
# cut the number of actions to max_actions_per_step
178+
parsed.action = parsed.action[: self.max_actions_per_step]
179+
self._log_response(parsed)
180+
self.n_steps += 1
157181

158-
return parsed
182+
return parsed
159183

160184
@time_execution_async("--step")
161185
async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
@@ -233,7 +257,7 @@ async def run(self, max_steps: int = 100) -> AgentHistoryList:
233257

234258
if self.history.is_done():
235259
if (
236-
self.validate_output and step < max_steps - 1
260+
self.validate_output and step < max_steps - 1
237261
): # if last step, we dont need to validate
238262
if not await self._validate_output():
239263
continue

src/agent/custom_massage_manager.py

Lines changed: 58 additions & 27 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
from langchain_core.language_models import BaseChatModel
1818
from langchain_core.messages import (
1919
HumanMessage,
20+
AIMessage
2021
)
2122

2223
from .custom_prompts import CustomAgentMessagePrompt
@@ -26,40 +27,70 @@
2627

2728
class CustomMassageManager(MessageManager):
2829
def __init__(
29-
self,
30-
llm: BaseChatModel,
31-
task: str,
32-
action_descriptions: str,
33-
system_prompt_class: Type[SystemPrompt],
34-
max_input_tokens: int = 128000,
35-
estimated_tokens_per_character: int = 3,
36-
image_tokens: int = 800,
37-
include_attributes: list[str] = [],
38-
max_error_length: int = 400,
39-
max_actions_per_step: int = 10,
30+
self,
31+
llm: BaseChatModel,
32+
task: str,
33+
action_descriptions: str,
34+
system_prompt_class: Type[SystemPrompt],
35+
max_input_tokens: int = 128000,
36+
estimated_tokens_per_character: int = 3,
37+
image_tokens: int = 800,
38+
include_attributes: list[str] = [],
39+
max_error_length: int = 400,
40+
max_actions_per_step: int = 10,
41+
tool_call_in_content: bool = False,
4042
):
4143
super().__init__(
42-
llm,
43-
task,
44-
action_descriptions,
45-
system_prompt_class,
46-
max_input_tokens,
47-
estimated_tokens_per_character,
48-
image_tokens,
49-
include_attributes,
50-
max_error_length,
51-
max_actions_per_step,
44+
llm=llm,
45+
task=task,
46+
action_descriptions=action_descriptions,
47+
system_prompt_class=system_prompt_class,
48+
max_input_tokens=max_input_tokens,
49+
estimated_tokens_per_character=estimated_tokens_per_character,
50+
image_tokens=image_tokens,
51+
include_attributes=include_attributes,
52+
max_error_length=max_error_length,
53+
max_actions_per_step=max_actions_per_step,
54+
tool_call_in_content=tool_call_in_content,
5255
)
5356

54-
# Move Task info to state_message
57+
# Custom: Move Task info to state_message
5558
self.history = MessageHistory()
5659
self._add_message_with_tokens(self.system_prompt)
60+
tool_calls = [
61+
{
62+
'name': 'AgentOutput',
63+
'args': {
64+
'current_state': {
65+
'evaluation_previous_goal': 'Unknown - No previous actions to evaluate.',
66+
'memory': '',
67+
'next_goal': 'Obtain task from user',
68+
},
69+
'action': [],
70+
},
71+
'id': '',
72+
'type': 'tool_call',
73+
}
74+
]
75+
if self.tool_call_in_content:
76+
# openai throws error if tool_calls are not responded -> move to content
77+
example_tool_call = AIMessage(
78+
content=f'{tool_calls}',
79+
tool_calls=[],
80+
)
81+
else:
82+
example_tool_call = AIMessage(
83+
content=f'',
84+
tool_calls=tool_calls,
85+
)
86+
87+
self._add_message_with_tokens(example_tool_call)
5788

5889
def add_state_message(
59-
self,
60-
state: BrowserState,
61-
result: Optional[List[ActionResult]] = None,
62-
step_info: Optional[AgentStepInfo] = None,
90+
self,
91+
state: BrowserState,
92+
result: Optional[List[ActionResult]] = None,
93+
step_info: Optional[AgentStepInfo] = None,
6394
) -> None:
6495
"""Add browser state as human message"""
6596

@@ -72,7 +103,7 @@ def add_state_message(
72103
self._add_message_with_tokens(msg)
73104
if r.error:
74105
msg = HumanMessage(
75-
content=str(r.error)[-self.max_error_length :]
106+
content=str(r.error)[-self.max_error_length:]
76107
)
77108
self._add_message_with_tokens(msg)
78109
result = None # if result in history, we dont want to add it again

src/agent/custom_prompts.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ def important_rules(self) -> str:
2424
{
2525
"current_state": {
2626
"prev_action_evaluation": "Success|Failed|Unknown - Analyze the current elements and the image to check if the previous goals/actions are successful like intended by the task. Ignore the action result. The website is the ground truth. Also mention if something unexpected happened like new suggestions in an input field. Shortly state why/why not. Note that the result you output must be consistent with the reasoning you output afterwards. If you consider it to be 'Failed,' you should reflect on this during your thought.",
27-
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output \"None\".",
27+
"important_contents": "Output important contents closely related to user\'s instruction or task on the current page. If there is, please output the contents. If not, please output empty string ''.",
2828
"completed_contents": "Update the input Task Progress. Completed contents is a general summary of the current contents that have been completed. Just summarize the contents that have been actually completed based on the current page and the history operations. Please list each completed item individually, such as: 1. Input username. 2. Input Password. 3. Click confirm button",
2929
"thought": "Think about the requirements that have been completed in previous operations and the requirements that need to be completed in the next one operation. If the output of prev_action_evaluation is 'Failed', please reflect and output your reflection here. If you think you have entered the wrong page, consider to go back to the previous page in next action.",
3030
"summary": "Please generate a brief natural language description for the operation in next actions based on your Thought."
@@ -148,12 +148,12 @@ def get_system_message(self) -> SystemMessage:
148148

149149
class CustomAgentMessagePrompt:
150150
def __init__(
151-
self,
152-
state: BrowserState,
153-
result: Optional[List[ActionResult]] = None,
154-
include_attributes: list[str] = [],
155-
max_error_length: int = 400,
156-
step_info: Optional[CustomAgentStepInfo] = None,
151+
self,
152+
state: BrowserState,
153+
result: Optional[List[ActionResult]] = None,
154+
include_attributes: list[str] = [],
155+
max_error_length: int = 400,
156+
step_info: Optional[CustomAgentStepInfo] = None,
157157
):
158158
self.state = state
159159
self.result = result
@@ -183,7 +183,7 @@ def get_user_message(self) -> HumanMessage:
183183
state_description += f"\nResult of action {i + 1}/{len(self.result)}: {result.extracted_content}"
184184
if result.error:
185185
# only use last 300 characters of error
186-
error = result.error[-self.max_error_length :]
186+
error = result.error[-self.max_error_length:]
187187
state_description += (
188188
f"\nError of action {i + 1}/{len(self.result)}: ...{error}"
189189
)

src/browser/custom_context.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,11 +23,12 @@ def __init__(
2323
config: BrowserContextConfig = BrowserContextConfig(),
2424
context: BrowserContext = None,
2525
):
26-
super(CustomBrowserContext, self).__init__(browser, config)
26+
super(CustomBrowserContext, self).__init__(browser=browser, config=config)
2727
self.context = context
2828

2929
async def _create_context(self, browser: PlaywrightBrowser):
3030
"""Creates a new browser context with anti-detection measures and loads cookies if available."""
31+
# If we have a context, return it directly
3132
if self.context:
3233
return self.context
3334
if self.browser.config.chrome_instance_path and len(browser.contexts) > 0:
@@ -46,7 +47,7 @@ async def _create_context(self, browser: PlaywrightBrowser):
4647
bypass_csp=self.config.disable_security,
4748
ignore_https_errors=self.config.disable_security,
4849
record_video_dir=self.config.save_recording_path,
49-
record_video_size=self.config.browser_window_size, # set record video size
50+
record_video_size=self.config.browser_window_size, # set record video size, same as windows size
5051
)
5152

5253
if self.config.trace_path:

src/utils/utils.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -86,6 +86,7 @@ def get_llm_model(provider: str, **kwargs):
8686
return ChatOllama(
8787
model=kwargs.get("model_name", "qwen2.5:7b"),
8888
temperature=kwargs.get("temperature", 0.0),
89+
num_ctx=128000,
8990
)
9091
elif provider == "azure_openai":
9192
if not kwargs.get("base_url", ""):

0 commit comments

Comments
 (0)