add custom browser

warmshao · warmshao · commit 6c07ec260341 · 2025-01-02T14:11:10.000+08:00
diff --git a/.env.example b/.env.example
@@ -1,8 +1,12 @@
+OPENAI_ENDPOINT=https://api.openai.com/v1
 OPENAI_API_KEY=
+
 ANTHROPIC_API_KEY=
+
 GOOGLE_API_KEY=
+
 AZURE_OPENAI_ENDPOINT=
-AZURE_OPENAI_KEY=
+AZURE_OPENAI_API_KEY=
 
 # Set to false to disable anonymized telemetry
 ANONYMIZED_TELEMETRY=true
diff --git a/requirements.txt b/requirements.txt
@@ -0,0 +1,3 @@
+browser-use
+langchain-google-genai
+pyperclip
diff --git a/src/agent/custom_agent.py b/src/agent/custom_agent.py
@@ -0,0 +1,117 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/2
+# @Author  : wenshao
+# @ProjectName: browser-use-webui
+# @FileName: custom_agent.py
+
+import asyncio
+import json
+import logging
+import os
+import time
+import uuid
+from pathlib import Path
+from typing import Any, Optional, Type, TypeVar
+
+from dotenv import load_dotenv
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import (
+    BaseMessage,
+    SystemMessage,
+)
+from openai import RateLimitError
+from pydantic import BaseModel, ValidationError
+
+from browser_use.agent.message_manager.service import MessageManager
+from browser_use.agent.prompts import AgentMessagePrompt, SystemPrompt
+from browser_use.agent.views import (
+    ActionResult,
+    AgentError,
+    AgentHistory,
+    AgentHistoryList,
+    AgentOutput,
+    AgentStepInfo,
+)
+from browser_use.telemetry.views import (
+    AgentEndTelemetryEvent,
+    AgentRunTelemetryEvent,
+    AgentStepErrorTelemetryEvent,
+)
+from browser_use.agent.service import Agent
+from browser_use.utils import time_execution_async
+
+from .custom_views import CustomAgentOutput
+
+logger = logging.getLogger(__name__)
+
+
+class CustomAgent(Agent):
+    """Agent whose LLM output schema is ``CustomAgentOutput``.
+
+    Overrides the base ``Agent``'s action-model setup, response logging and
+    single-step execution.
+    """
+
+    def _setup_action_models(self) -> None:
+        """Build the action and output models from the controller's registry."""
+        # Get the dynamic action model from controller's registry
+        self.ActionModel = self.controller.registry.create_action_model()
+        # Create output model with the dynamic actions
+        self.AgentOutput = CustomAgentOutput.type_with_custom_actions(self.ActionModel)
+
+    def _log_response(self, response: CustomAgentOutput) -> None:
+        """Log the model's response.
+
+        Reads only fields that ``CustomAgentBrain`` declares
+        (``prev_action_evaluation``, ``memory``, ``progress``, ``thought``,
+        ``summary``); the base agent's ``evaluation_previous_goal`` and
+        ``next_goal`` do not exist on that model and would raise
+        ``AttributeError``.
+
+        Args:
+            response: Parsed model output for the current step.
+        """
+        brain = response.current_state
+        evaluation = brain.prev_action_evaluation
+        if 'Success' in evaluation:
+            emoji = '👍'
+        elif 'Failed' in evaluation:
+            emoji = '⚠'
+        else:
+            emoji = '🤷'
+
+        logger.info(f'{emoji} Eval: {evaluation}')
+        logger.info(f'🧠 Memory: {brain.memory}')
+        logger.info(f'⏳ Progress: {brain.progress}')
+        logger.info(f'🤔 Thought: {brain.thought}')
+        logger.info(f'🎯 Summary: {brain.summary}')
+        for i, action in enumerate(response.action):
+            logger.info(
+                f'🛠️  Action {i + 1}/{len(response.action)}: {action.model_dump_json(exclude_unset=True)}'
+            )
+
+    @time_execution_async('--step')
+    async def step(self, step_info: Optional[AgentStepInfo] = None) -> None:
+        """Execute one step of the task.
+
+        Fetches the browser state, asks the model for the next action(s),
+        runs them through the controller and stores the outcome in
+        ``self._last_result``. Any ``Exception`` is routed to
+        ``self._handle_step_error``; other exceptions propagate.
+
+        Args:
+            step_info: Optional step information forwarded to the message
+                manager together with the browser state.
+        """
+        logger.info(f'\n📍 Step {self.n_steps}')
+        state = None
+        model_output = None
+        result: list[ActionResult] = []
+
+        try:
+            state = await self.browser_context.get_state(use_vision=self.use_vision)
+            self.message_manager.add_state_message(state, self._last_result, step_info)
+            input_messages = self.message_manager.get_messages()
+            model_output = await self.get_next_action(input_messages)
+            self._save_conversation(input_messages, model_output)
+            self.message_manager._remove_last_state_message()  # we dont want the whole state in the chat history
+            self.message_manager.add_model_output(model_output)
+
+            result = await self.controller.multi_act(
+                model_output.action, self.browser_context
+            )
+            self._last_result = result
+
+            if len(result) > 0 and result[-1].is_done:
+                logger.info(f'📄 Result: {result[-1].extracted_content}')
+
+            self.consecutive_failures = 0
+
+        except Exception as e:
+            result = self._handle_step_error(e)
+            self._last_result = result
+
+        finally:
+            # Deliberately no ``return`` in this block: returning from
+            # ``finally`` would discard an in-flight exception that the
+            # ``except Exception`` above does not catch (task cancellation,
+            # KeyboardInterrupt).
+            if result:
+                for r in result:
+                    if r.error:
+                        self.telemetry.capture(
+                            AgentStepErrorTelemetryEvent(
+                                agent_id=self.agent_id,
+                                error=r.error,
+                            )
+                        )
+                if state:
+                    self._make_history_item(model_output, state, result)
diff --git a/src/agent/custom_prompts.py b/src/agent/custom_prompts.py
@@ -0,0 +1,17 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/2
+# @Author  : wenshao
+# @ProjectName: browser-use-webui
+# @FileName: custom_prompts.py
+
+from datetime import datetime
+from typing import List, Optional
+
+from langchain_core.messages import HumanMessage, SystemMessage
+
+from browser_use.agent.views import ActionResult, AgentStepInfo
+from browser_use.browser.views import BrowserState
+from browser_use.agent.prompts import SystemPrompt
+
+class CustomSystemPrompt(SystemPrompt):
+    """Project-specific system prompt; currently identical to ``SystemPrompt``.
+
+    No attributes or methods are overridden yet.
+    """
+    pass
diff --git a/src/agent/custom_views.py b/src/agent/custom_views.py
@@ -0,0 +1,51 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/2
+# @Author  : wenshao
+# @ProjectName: browser-use-webui
+# @FileName: custom_views.py
+
+from dataclasses import dataclass
+from typing import Type
+from pydantic import BaseModel, ConfigDict, Field, ValidationError, create_model
+from browser_use.controller.registry.views import ActionModel
+
+
+@dataclass
+class CustomAgentStepInfo:
+    """Plain record holding a step counter, a step limit and a memory string.
+
+    NOTE(review): nothing in this module consumes it yet; presumably a
+    counterpart of browser_use's ``AgentStepInfo`` with an extra ``memory``
+    field — confirm against the caller.
+    """
+    step_number: int  # presumably the index of the current step — TODO confirm 0- or 1-based
+    max_steps: int  # presumably the maximum number of steps for a run — TODO confirm
+    memory: str  # presumably free-form text carried between steps — TODO confirm
+
+
+
+class CustomAgentBrain(BaseModel):
+    """Current state of the agent.
+
+    All fields are required strings. Used as the type of
+    ``CustomAgentOutput.current_state``.
+    """
+
+    # NOTE(review): the per-field meanings below are inferred from the field
+    # names only — confirm against the system prompt that instructs the model.
+    prev_action_evaluation: str  # presumably the verdict on the previous action
+    memory: str  # presumably what the agent wants to remember
+    progress: str  # presumably a description of task progress so far
+    thought: str  # presumably the reasoning for the next move
+    summary: str  # presumably a short summary of the next move
+    action: str  # NOTE(review): distinct from CustomAgentOutput.action (a list of action models)
+
+
+class CustomAgentOutput(BaseModel):
+    """Output model for agent.
+
+    Pairs the agent's self-reported state (``current_state``) with the list of
+    actions to run (``action``).
+
+    @dev note: this model is not used directly; ``type_with_custom_actions``
+    derives a subclass whose ``action`` items are the dynamically generated
+    action model.
+    """
+
+    # Allow field types that pydantic cannot validate natively.
+    model_config = ConfigDict(arbitrary_types_allowed=True)
+
+    current_state: CustomAgentBrain
+    action: list[ActionModel]
+
+    @staticmethod
+    def type_with_custom_actions(custom_actions: Type[ActionModel]) -> Type['CustomAgentOutput']:
+        """Extend actions with custom actions.
+
+        Args:
+            custom_actions: Action model class to use as the item type of the
+                ``action`` list.
+
+        Returns:
+            A new model class named ``'AgentOutput'`` that subclasses
+            ``CustomAgentOutput`` and redefines ``action`` as a required
+            ``list[custom_actions]``.
+        """
+        return create_model(
+            'AgentOutput',
+            __base__=CustomAgentOutput,
+            action=(list[custom_actions], Field(...)),  # Properly annotated field with no default
+            __module__=CustomAgentOutput.__module__,
+        )
diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py
@@ -0,0 +1,19 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/2
+# @Author  : wenshao
+# @ProjectName: browser-use-webui
+# @FileName: custom_browser.py
+
+from typing import Optional
+
+from browser_use.browser.browser import Browser, BrowserConfig
+from browser_use.browser.context import BrowserContextConfig, BrowserContext
+
+from .custom_context import CustomBrowserContext
+
+
+class CustomBrowser(Browser):
+    """Browser that hands out ``CustomBrowserContext`` instances."""
+
+    async def new_context(
+            self, config: Optional[BrowserContextConfig] = None
+    ) -> BrowserContext:
+        """Create a browser context.
+
+        Args:
+            config: Context configuration. When omitted, a fresh
+                ``BrowserContextConfig()`` is created for this call.
+
+        Returns:
+            A ``CustomBrowserContext`` bound to this browser.
+        """
+        # Build the default per call: a default evaluated in the signature is
+        # created once at definition time and would be shared by every context
+        # that did not pass its own config.
+        if config is None:
+            config = BrowserContextConfig()
+        return CustomBrowserContext(config=config, browser=self)
diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py
@@ -19,15 +19,6 @@
 
 
 class CustomBrowserContext(BrowserContext):
-    """
-    定制BrowserContext
-    """
-
-    def __init__(self,
-                 browser: 'Browser',
-                 config: BrowserContextConfig = BrowserContextConfig(),
-                 ):
-        super(CustomBrowserContext, self).__init__(browser, config)
 
     async def _create_context(self, browser: PlaywrightBrowser):
         """Creates a new browser context with anti-detection measures and loads cookies if available."""
diff --git a/src/controller/__init__.py b/src/controller/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/2
+# @Author  : wenshao
+# @ProjectName: browser-use-webui
+# @FileName: __init__.py
diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py
@@ -0,0 +1,34 @@
+# -*- coding: utf-8 -*-
+# @Time    : 2025/1/2
+# @Author  : wenshao
+# @ProjectName: browser-use-webui
+# @FileName: custom_controller.py
+
+import pyperclip
+
+from browser_use.controller.service import Controller
+from browser_use.agent.views import ActionResult
+from browser_use.browser.context import BrowserContext
+
+
+class CustomController(Controller):
+    """Controller that adds clipboard copy/paste actions to the base registry."""
+
+    def __init__(self):
+        """Initialise the base controller, then register the custom actions."""
+        super().__init__()
+        self._register_custom_actions()
+
+    def _register_custom_actions(self):
+        """Register all custom browser actions"""
+
+        # The functions below are registered through the registry's ``action``
+        # decorator; they are not called directly from this module.
+        @self.registry.action('Copy text to clipboard')
+        def copy_to_clipboard(text: str):
+            # Put ``text`` on the system clipboard and echo it back as the result.
+            pyperclip.copy(text)
+            return ActionResult(extracted_content=text)
+
+        @self.registry.action('Paste text from clipboard', requires_browser=True)
+        async def paste_from_clipboard(browser: BrowserContext):
+            # Read the clipboard, then type its content into the current page
+            # via the keyboard (goes to whatever element has focus).
+            text = pyperclip.paste()
+            # send text to browser
+            page = await browser.get_current_page()
+            await page.keyboard.type(text)
+
+            return ActionResult(extracted_content=text)
diff --git a/src/utils/utils.py b/src/utils/utils.py
@@ -18,7 +18,7 @@ def get_llm_model(provider: str, **kwargs):
     :param kwargs:
     :return:
     """
-    if provider == 'claude':
+    if provider == 'anthropic':
         return ChatAnthropic(
             model_name=kwargs.get("model_name", 'claude-3-5-sonnet-20240620'),
             temperature=kwargs.get("temperature", 0.0),
diff --git a/tests/test_browser_use.py b/tests/test_browser_use.py
diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,3 @@`
	`1`	`+browser-use`
	`2`	`+langchain-google-genai`
	`3`	`+pyperclip`