
Commit 73cc098

Merge pull request #3 from warmshao/dev

add deepseek

2 parents bab6627 + c0de4b7

File tree

6 files changed: +83 -12 lines

README.md

Lines changed: 2 additions & 1 deletion
@@ -6,7 +6,7 @@ This project builds upon the foundation of the [browser-use](https://github.com/
 
 1. **A Brand New WebUI:** We offer a comprehensive web interface that supports a wide range of `browser-use` functionalities. This UI is designed to be user-friendly and enables easy interaction with the browser agent.
 
-2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic etc. And we plan to add support for even more models in the future.
+2. **Expanded LLM Support:** We've integrated support for various Large Language Models (LLMs), including: Gemini, OpenAI, Azure OpenAI, Anthropic, DeepSeek etc. And we plan to add support for even more models in the future.
 
 3. **Custom Browser Support:** You can use your own browser with our tool, eliminating the need to re-login to sites or deal with other authentication challenges. This feature also supports high-definition screen recording.
 
@@ -43,5 +43,6 @@ This project builds upon the foundation of the [browser-use](https://github.com/
     ```
 2. **Access the WebUI:** Open your web browser and navigate to `http://127.0.0.1:7788`.
 3. **Using Your Own Browser:**
+    - Close all chrome windows
     - Open the WebUI in a non-Chrome browser, such as Firefox or Edge. This is important because the persistent browser context will use the Chrome data when running the agent.
     - Check the "Use Own Browser" option within the Browser Settings.

src/agent/custom_agent.py

Lines changed: 14 additions & 0 deletions
@@ -151,6 +151,20 @@ def update_step_info(self, model_output: CustomAgentOutput, step_info: CustomAge
         if completed_contents and 'None' not in completed_contents:
             step_info.task_progress = completed_contents
 
+    @time_execution_async('--get_next_action')
+    async def get_next_action(self, input_messages: list[BaseMessage]) -> AgentOutput:
+        """Get next action from LLM based on current state"""
+
+        ret = self.llm.invoke(input_messages)
+        parsed_json = json.loads(ret.content.replace('```json', '').replace("```", ""))
+        parsed: AgentOutput = self.AgentOutput(**parsed_json)
+        # cut the number of actions to max_actions_per_step
+        parsed.action = parsed.action[: self.max_actions_per_step]
+        self._log_response(parsed)
+        self.n_steps += 1
+
+        return parsed
+
     @time_execution_async('--step')
     async def step(self, step_info: Optional[CustomAgentStepInfo] = None) -> None:
         """Execute one step of the task"""

src/utils/utils.py

Lines changed: 17 additions & 0 deletions
@@ -48,6 +48,23 @@ def get_llm_model(provider: str, **kwargs):
         else:
             api_key = kwargs.get("api_key")
 
+        return ChatOpenAI(
+            model=kwargs.get("model_name", 'gpt-4o'),
+            temperature=kwargs.get("temperature", 0.0),
+            base_url=base_url,
+            api_key=api_key
+        )
+    elif provider == 'deepseek':
+        if not kwargs.get("base_url", ""):
+            base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
+        else:
+            base_url = kwargs.get("base_url")
+
+        if not kwargs.get("api_key", ""):
+            api_key = os.getenv("DEEPSEEK_API_KEY", "")
+        else:
+            api_key = kwargs.get("api_key")
+
         return ChatOpenAI(
             model=kwargs.get("model_name", 'gpt-4o'),
             temperature=kwargs.get("temperature", 0.0),
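Note: the new `deepseek` branch follows the same precedence as the other providers: explicit `base_url`/`api_key` kwargs win, otherwise the `DEEPSEEK_ENDPOINT` and `DEEPSEEK_API_KEY` environment variables are read, and the OpenAI-compatible `ChatOpenAI` client is reused with that base URL. A rough usage sketch; the endpoint and key values are placeholders, not part of this commit:

```python
import os
from src.utils import utils

# Option 1: rely on the environment variables (placeholder values shown).
os.environ.setdefault("DEEPSEEK_ENDPOINT", "https://api.deepseek.com")  # placeholder URL
os.environ.setdefault("DEEPSEEK_API_KEY", "sk-...")                     # placeholder key
llm = utils.get_llm_model(provider="deepseek", model_name="deepseek-chat")

# Option 2: pass them explicitly; kwargs take precedence over the env vars.
llm = utils.get_llm_model(
    provider="deepseek",
    model_name="deepseek-chat",
    temperature=0.0,
    base_url="https://api.deepseek.com",  # placeholder
    api_key="sk-...",                     # placeholder
)
```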

tests/test_browser_use.py

Lines changed: 13 additions & 5 deletions
@@ -98,16 +98,23 @@ async def test_browser_use_custom():
     #     api_key=os.getenv("AZURE_OPENAI_API_KEY", "")
     # )
 
+    # llm = utils.get_llm_model(
+    #     provider="gemini",
+    #     model_name="gemini-2.0-flash-exp",
+    #     temperature=1.0,
+    #     api_key=os.getenv("GOOGLE_API_KEY", "")
+    # )
+
     llm = utils.get_llm_model(
-        provider="gemini",
-        model_name="gemini-2.0-flash-exp",
-        temperature=1.0,
-        api_key=os.getenv("GOOGLE_API_KEY", "")
+        provider="deepseek",
+        model_name="deepseek-chat",
+        temperature=0.8
     )
 
     controller = CustomController()
     use_own_browser = False
     disable_security = True
+    use_vision = False
     playwright = None
     browser_context_ = None
     try:
@@ -156,7 +163,8 @@ async def test_browser_use_custom():
             llm=llm,
             browser_context=browser_context,
             controller=controller,
-            system_prompt_class=CustomSystemPrompt
+            system_prompt_class=CustomSystemPrompt,
+            use_vision=use_vision
         )
         history: AgentHistoryList = await agent.run(max_steps=10)
 
tests/test_llm_api.py

Lines changed: 23 additions & 1 deletion
@@ -95,7 +95,29 @@ def test_azure_openai_model():
     print(ai_msg.content)
 
 
+def test_deepseek_model():
+    from langchain_core.messages import HumanMessage
+    from src.utils import utils
+
+    llm = utils.get_llm_model(
+        provider="deepseek",
+        model_name="deepseek-chat",
+        temperature=0.8,
+        base_url=os.getenv("DEEPSEEK_ENDPOINT", ""),
+        api_key=os.getenv("DEEPSEEK_API_KEY", "")
+    )
+    pdb.set_trace()
+    message = HumanMessage(
+        content=[
+            {"type": "text", "text": "who are you?"}
+        ]
+    )
+    ai_msg = llm.invoke([message])
+    print(ai_msg.content)
+
+
 if __name__ == '__main__':
     # test_openai_model()
-    test_gemini_model()
+    # test_gemini_model()
     # test_azure_openai_model()
+    test_deepseek_model()
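Note: the `__main__` block now runs `test_deepseek_model()` (the Gemini call is commented out), so `python tests/test_llm_api.py` exercises the DeepSeek path directly, assuming `DEEPSEEK_ENDPOINT` and `DEEPSEEK_API_KEY` are set and the repository root is on `PYTHONPATH` so that `src.utils` resolves. The `pdb.set_trace()` left in the test pauses at a `(Pdb)` prompt before the request is sent; type `c` to continue.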

webui.py

Lines changed: 14 additions & 5 deletions
@@ -52,7 +52,8 @@ async def run_browser_agent(
         save_recording_path,
         task,
         add_infos,
-        max_steps
+        max_steps,
+        use_vision
 ):
     """
     Runs the browser agent based on user configurations.
@@ -75,6 +76,7 @@ async def run_browser_agent(
             save_recording_path=save_recording_path,
             task=task,
             max_steps=max_steps,
+            use_vision=use_vision
         )
     elif agent_type == "custom":
         return await run_custom_agent(
@@ -88,6 +90,7 @@ async def run_browser_agent(
             task=task,
             add_infos=add_infos,
             max_steps=max_steps,
+            use_vision=use_vision
         )
     else:
         raise ValueError(f"Invalid agent type: {agent_type}")
@@ -101,7 +104,8 @@ async def run_org_agent(
         window_h,
         save_recording_path,
         task,
-        max_steps
+        max_steps,
+        use_vision
 ):
     browser = Browser(
         config=BrowserConfig(
@@ -121,6 +125,7 @@ async def run_org_agent(
         agent = Agent(
             task=task,
             llm=llm,
+            use_vision=use_vision,
            browser_context=browser_context,
         )
         history = await agent.run(max_steps=max_steps)
@@ -143,7 +148,8 @@ async def run_custom_agent(
         save_recording_path,
         task,
         add_infos,
-        max_steps
+        max_steps,
+        use_vision
 ):
     controller = CustomController()
     playwright = None
@@ -190,6 +196,7 @@ async def run_custom_agent(
         agent = CustomAgent(
             task=task,
            add_infos=add_infos,
+            use_vision=use_vision,
             llm=llm,
             browser_context=browser_context,
             controller=controller,
@@ -245,9 +252,10 @@ def main():
         with gr.Row():
             agent_type = gr.Radio(["org", "custom"], label="Agent Type", value="custom")
             max_steps = gr.Number(label="max run steps", value=100)
+            use_vision = gr.Checkbox(label="use vision", value=True)
         with gr.Row():
             llm_provider = gr.Dropdown(
-                ["anthropic", "openai", "gemini", "azure_openai"], label="LLM Provider", value="gemini"
+                ["anthropic", "openai", "gemini", "azure_openai", "deepseek"], label="LLM Provider", value="gemini"
            )
             llm_model_name = gr.Textbox(label="LLM Model Name", value="gemini-2.0-flash-exp")
             llm_temperature = gr.Number(label="LLM Temperature", value=1.0)
@@ -293,7 +301,8 @@ def main():
                 save_recording_path,
                 task,
                 add_infos,
-                max_steps
+                max_steps,
+                use_vision
             ],
             outputs=[final_result_output, errors_output, model_actions_output, model_thoughts_output],
         )
