Add MCP tool support

vvincent1234 · vvincent1234 · commit 3c0a089fc5eb · 2025-04-26T23:14:40.000+08:00
diff --git a/requirements.txt b/requirements.txt
@@ -1,8 +1,8 @@
-browser-use==0.1.40
+browser-use==0.1.41
 pyperclip==1.9.0
-gradio==5.23.1
+gradio==5.27.0
 json-repair
 langchain-mistralai==0.2.4
-langchain-google-genai==2.0.8
 MainContentExtractor==0.0.4
-langchain-ibm==0.3.10
+langchain-ibm==0.3.10
+langchain_mcp_adapters==0.0.9
diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py
@@ -1,7 +1,7 @@
 import pdb
 
 import pyperclip
-from typing import Optional, Type
+from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable
 from pydantic import BaseModel
 from browser_use.agent.views import ActionResult
 from browser_use.browser.context import BrowserContext
@@ -20,30 +20,78 @@
     SwitchTabAction,
 )
 import logging
+import inspect
+import os
+from src.utils import utils
 
 logger = logging.getLogger(__name__)
 
 
 class CustomController(Controller):
     def __init__(self, exclude_actions: list[str] = [],
-                 output_model: Optional[Type[BaseModel]] = None
+                 output_model: Optional[Type[BaseModel]] = None,
+                 ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[
+                     [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None,
+
                   ):
         super().__init__(exclude_actions=exclude_actions, output_model=output_model)
         self._register_custom_actions()
+        self.ask_assistant_callback = ask_assistant_callback  # read by ask_for_assistant at call time, so assigning after registration is fine
 
     def _register_custom_actions(self):
         """Register all custom browser actions"""
 
-        @self.registry.action("Copy text to clipboard")
-        def copy_to_clipboard(text: str):
-            pyperclip.copy(text)
-            return ActionResult(extracted_content=text)
+        @self.registry.action(
+            "When executing tasks, prioritize autonomous completion. However, if you encounter a definitive blocker "
+            "that prevents you from proceeding independently – such as needing credentials you don't possess, "
+            "requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, "
+            "or facing limitations in your capabilities – you must request human assistance."
+        )
+        async def ask_for_assistant(query: str, browser: BrowserContext):
+            if self.ask_assistant_callback:
+                if inspect.iscoroutinefunction(self.ask_assistant_callback):  # callback may be sync or async; only coroutine functions are awaited
+                    user_response = await self.ask_assistant_callback(query, browser)
+                else:
+                    user_response = self.ask_assistant_callback(query, browser)
+                msg = f"AI ask: {query}. User response: {user_response['response']}"  # requires a 'response' key in the callback's result
+                logger.info(msg)
+                return ActionResult(extracted_content=msg, include_in_memory=True)
+            else:  # no callback configured: tell the agent no human help is available
+                return ActionResult(extracted_content="Human cannot help you. Please try another way.",
+                                    include_in_memory=True)
+
+        @self.registry.action(
+            'Upload file to interactive element with file path ',
+        )
+        async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
+            if path not in available_file_paths:  # only paths listed in available_file_paths may be uploaded
+                return ActionResult(error=f'File path {path} is not available')
+
+            if not os.path.exists(path):
+                return ActionResult(error=f'File {path} does not exist')
+
+            dom_el = await browser.get_dom_element_by_index(index)
+
+            file_upload_dom_el = dom_el.get_file_upload_element()  # presumably resolves the file input for this node — confirm in browser_use
+
+            if file_upload_dom_el is None:
+                msg = f'No file upload element found at index {index}'
+                logger.info(msg)
+                return ActionResult(error=msg)
+
+            file_upload_el = await browser.get_locate_element(file_upload_dom_el)
 
-        @self.registry.action("Paste text from clipboard")
-        async def paste_from_clipboard(browser: BrowserContext):
-            text = pyperclip.paste()
-            # send text to browser
-            page = await browser.get_current_page()
-            await page.keyboard.type(text)
+            if file_upload_el is None:  # NOTE(review): reuses the same message as the DOM-element check above
+                msg = f'No file upload element found at index {index}'
+                logger.info(msg)
+                return ActionResult(error=msg)
 
-            return ActionResult(extracted_content=text)
+            try:
+                await file_upload_el.set_input_files(path)
+                msg = f'Successfully uploaded file to index {index}'
+                logger.info(msg)
+                return ActionResult(extracted_content=msg, include_in_memory=True)
+            except Exception as e:  # failure is returned as ActionResult(error=...) rather than raised
+                msg = f'Failed to upload file to index {index}: {str(e)}'
+                logger.info(msg)
+                return ActionResult(error=msg)
diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py
@@ -0,0 +1,42 @@
+import os
+import asyncio
+import base64
+import pdb
+from typing import List, Tuple, Optional
+from langchain_core.tools import BaseTool
+from langchain_mcp_adapters.client import MultiServerMCPClient
+import base64
+import json
+import logging
+from typing import Optional, Dict, Any, Type
+from langchain_core.tools import BaseTool
+from pydantic.v1 import BaseModel, Field
+from langchain_core.runnables import RunnableConfig
+
+logger = logging.getLogger(__name__)
+
+
+async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Tuple[
+    Optional[List[BaseTool]], Optional[MultiServerMCPClient]]:
+    """
+    Creates a MultiServerMCPClient from `mcp_server_config`, enters its async
+    context, and returns the tools from `client.get_tools()` with the client.
+
+    Returns:
+        A tuple containing:
+        - list[BaseTool]: The tools from `client.get_tools()`, or an empty list on failure.
+        - MultiServerMCPClient | None: The entered client instance, or None on failure.
+    """
+
+    logger.info("Initializing MultiServerMCPClient...")
+
+    try:
+        client = MultiServerMCPClient(mcp_server_config)
+        await client.__aenter__()  # entered manually; this function never calls __aexit__
+        mcp_tools = client.get_tools()
+        logger.info(f"Total usable MCP tools collected: {len(mcp_tools)}")
+        return mcp_tools, client
+
+    except Exception as e:
+        logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True)
+        return [], None  # any failure is logged and reported as an empty tool list with no client
diff --git a/tests/test_controller.py b/tests/test_controller.py
@@ -0,0 +1,31 @@
+import asyncio
+import pdb
+import sys
+
+sys.path.append(".")
+
+from dotenv import load_dotenv
+
+load_dotenv()
+
+
+async def test_mcp_client():  # manual check: builds an MCP client from a sample config, then drops into pdb
+    from src.utils.mcp_client import setup_mcp_client_and_tools
+
+    test_server_config = {
+        "playwright": {  # server spec: @playwright/mcp@latest run through npx, stdio transport
+            "command": "npx",
+            "args": [
+                "@playwright/mcp@latest",
+            ],
+            "transport": "stdio",
+        }
+    }
+
+    mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)
+
+    pdb.set_trace()  # NOTE(review): interactive breakpoint to inspect mcp_tools/mcp_client; blocks non-interactive runs
+
+
+if __name__ == '__main__':
+    asyncio.run(test_mcp_client())  # drive the async check when this file is executed as a script