diff --git a/Dockerfile b/Dockerfile
index d093f829..bd17da28 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -45,6 +45,7 @@ RUN apt-get update && apt-get install -y \
fonts-dejavu-core \
fonts-dejavu-extra \
vim \
+ pipx \
&& rm -rf /var/lib/apt/lists/*
# Install noVNC
@@ -70,20 +71,28 @@ WORKDIR /app
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt
-# Playwright setup
-ENV PLAYWRIGHT_BROWSERS_PATH=/ms-browsers
-RUN mkdir -p $PLAYWRIGHT_BROWSERS_PATH
+# Install uv and uvx for browser-use
+RUN pip install --no-cache-dir uv
-# Install Chromium via Playwright without --with-deps
-RUN PLAYWRIGHT_SKIP_BROWSER_DOWNLOAD=0 playwright install chromium
+# Install Chromium browser for browser-use
+RUN apt-get update \
+ && apt-get install -y chromium chromium-driver \
+ && rm -rf /var/lib/apt/lists/*
+
+# Set Chrome path for browser-use
+ENV CHROME_BIN=/usr/bin/chromium
+ENV DISPLAY=:99
+
+# Also create a symlink for uvx
+RUN ln -s /usr/local/bin/uv /usr/local/bin/uvx || true
# Copy application code
COPY . .
-# Set up supervisor configuration
-RUN mkdir -p /var/log/supervisor
+# Set up supervisor configuration and DBus
+RUN mkdir -p /var/log/supervisor /run/dbus
COPY supervisord.conf /etc/supervisor/conf.d/supervisord.conf
-EXPOSE 7788 6080 5901 9222
+EXPOSE 7788 6080 5901 9222 3000
CMD ["/usr/bin/supervisord", "-c", "/etc/supervisor/conf.d/supervisord.conf"]
diff --git a/README.md b/README.md
index e5a24ea4..ad1e13b7 100644
--- a/README.md
+++ b/README.md
@@ -55,21 +55,12 @@ Activate the virtual environment:
source .venv/bin/activate
```
-#### Step 3: Install Dependencies
-Install Python packages:
+#### Step 3: Install Python Packages
+Install the required Python packages using uv:
```bash
uv pip install -r requirements.txt
```
-Install Browsers in playwright.
-```bash
-playwright install --with-deps
-```
-Or you can install specific browsers by running:
-```bash
-playwright install chromium --with-deps
-```
-
#### Step 4: Configure Environment
1. Create a copy of the example environment file:
- Windows (Command Prompt):
diff --git a/docker-compose.yml b/docker-compose.yml
index 97fdd2c4..96e85fa4 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -54,9 +54,6 @@ services:
# Display Settings
- DISPLAY=:99
- # This ENV is used by the Dockerfile during build time if playwright respects it.
- # It's not strictly needed at runtime by docker-compose unless your app or scripts also read it.
- - PLAYWRIGHT_BROWSERS_PATH=/ms-browsers # Matches Dockerfile ENV
- RESOLUTION=${RESOLUTION:-1920x1080x24}
- RESOLUTION_WIDTH=${RESOLUTION_WIDTH:-1920}
- RESOLUTION_HEIGHT=${RESOLUTION_HEIGHT:-1080}
diff --git a/requirements.txt b/requirements.txt
index f7055242..9926dfea 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,10 +1,5 @@
-browser-use==0.1.48
+browser-use==0.9.4
pyperclip==1.9.0
-gradio==5.27.0
-json-repair
-langchain-mistralai==0.2.4
+gradio==5.49.1
+json-repair==0.49.0
MainContentExtractor==0.0.4
-langchain-ibm==0.3.10
-langchain_mcp_adapters==0.0.9
-langgraph==0.3.34
-langchain-community
diff --git a/src/agent/browser_use/browser_use_agent.py b/src/agent/browser_use/browser_use_agent.py
index f7f6107b..ab2dced5 100644
--- a/src/agent/browser_use/browser_use_agent.py
+++ b/src/agent/browser_use/browser_use_agent.py
@@ -6,18 +6,18 @@
# from lmnr.sdk.decorators import observe
from browser_use.agent.gif import create_history_gif
-from browser_use.agent.service import Agent, AgentHookFunc
+from browser_use import Agent
+from browser_use.agent.service import AgentHookFunc
from browser_use.agent.views import (
ActionResult,
AgentHistory,
AgentHistoryList,
AgentStepInfo,
- ToolCallingMethod,
)
from browser_use.browser.views import BrowserStateHistory
from browser_use.utils import time_execution_async
from dotenv import load_dotenv
-from browser_use.agent.message_manager.utils import is_model_without_tool_support
+# is_model_without_tool_support removed in browser_use 0.6.0
load_dotenv()
logger = logging.getLogger(__name__)
@@ -28,16 +28,13 @@
class BrowserUseAgent(Agent):
- def _set_tool_calling_method(self) -> ToolCallingMethod | None:
+ def _set_tool_calling_method(self) -> str | None:
tool_calling_method = self.settings.tool_calling_method
if tool_calling_method == 'auto':
- if is_model_without_tool_support(self.model_name):
- return 'raw'
- elif self.chat_model_library == 'ChatGoogleGenerativeAI':
+ # Simplified logic for browser_use 0.6.0
+ if self.chat_model_library == 'ChatGoogleGenerativeAI':
return None
- elif self.chat_model_library == 'ChatOpenAI':
- return 'function_calling'
- elif self.chat_model_library == 'AzureChatOpenAI':
+ elif self.chat_model_library in ['ChatOpenAI', 'AzureChatOpenAI']:
return 'function_calling'
else:
return None
@@ -141,23 +138,6 @@ async def run(
# Unregister signal handlers before cleanup
signal_handler.unregister()
- if self.settings.save_playwright_script_path:
- logger.info(
- f'Agent run finished. Attempting to save Playwright script to: {self.settings.save_playwright_script_path}'
- )
- try:
- # Extract sensitive data keys if sensitive_data is provided
- keys = list(self.sensitive_data.keys()) if self.sensitive_data else None
- # Pass browser and context config to the saving method
- self.state.history.save_as_playwright_script(
- self.settings.save_playwright_script_path,
- sensitive_data_keys=keys,
- browser_config=self.browser.config,
- context_config=self.browser_context.config,
- )
- except Exception as script_gen_err:
- # Log any error during script generation/saving
- logger.error(f'Failed to save Playwright script: {script_gen_err}', exc_info=True)
await self.close()
diff --git a/src/agent/deep_research/deep_research_agent.py b/src/agent/deep_research/deep_research_agent.py
index 86be3016..05fd3efb 100644
--- a/src/agent/deep_research/deep_research_agent.py
+++ b/src/agent/deep_research/deep_research_agent.py
@@ -5,36 +5,14 @@
import threading
import uuid
from pathlib import Path
-from typing import Any, Dict, List, Optional, TypedDict
+from typing import Any, Dict, List, Optional
-from browser_use.browser.browser import BrowserConfig
-from langchain_community.tools.file_management import (
- ListDirectoryTool,
- ReadFileTool,
- WriteFileTool,
-)
-
-# Langchain imports
-from langchain_core.messages import (
- AIMessage,
- BaseMessage,
- HumanMessage,
- SystemMessage,
- ToolMessage,
-)
-from langchain_core.prompts import ChatPromptTemplate
-from langchain_core.tools import StructuredTool, Tool
-
-# Langgraph imports
-from langgraph.graph import StateGraph
+from browser_use.browser.profile import BrowserProfile
+from browser_use.llm import BaseChatModel, UserMessage, SystemMessage, AssistantMessage
from pydantic import BaseModel, Field
-from browser_use.browser.context import BrowserContextConfig
-
-from src.agent.browser_use.browser_use_agent import BrowserUseAgent
-from src.browser.custom_browser import CustomBrowser
-from src.controller.custom_controller import CustomController
-from src.utils.mcp_client import setup_mcp_client_and_tools
+# Using browser_use.Agent directly
+from browser_use import Controller
logger = logging.getLogger(__name__)
@@ -43,1219 +21,223 @@
PLAN_FILENAME = "research_plan.md"
SEARCH_INFO_FILENAME = "search_info.json"
+# Global state management
_AGENT_STOP_FLAGS = {}
_BROWSER_AGENT_INSTANCES = {}
+# Simple file operations to replace langchain tools
+def read_file_content(file_path: str) -> str:
+ """Read content from a file"""
+ try:
+ with open(file_path, 'r', encoding='utf-8') as f:
+ return f.read()
+ except Exception as e:
+ return f"Error reading file: {str(e)}"
+
+def write_file_content(file_path: str, content: str) -> str:
+ """Write content to a file"""
+ try:
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
+ with open(file_path, 'w', encoding='utf-8') as f:
+ f.write(content)
+ return f"Successfully wrote to {file_path}"
+ except Exception as e:
+ return f"Error writing file: {str(e)}"
+
+def list_directory_contents(directory_path: str) -> str:
+ """List contents of a directory"""
+ try:
+ contents = os.listdir(directory_path)
+ return "\n".join(contents)
+ except Exception as e:
+ return f"Error listing directory: {str(e)}"
+
async def run_single_browser_task(
task_query: str,
task_id: str,
- llm: Any, # Pass the main LLM
+ llm: BaseChatModel,
browser_config: Dict[str, Any],
stop_event: threading.Event,
use_vision: bool = False,
) -> Dict[str, Any]:
"""
- Runs a single BrowserUseAgent task.
- Manages browser creation and closing for this specific task.
+ Simplified browser task runner using browser-use Agent
"""
- if not BrowserUseAgent:
- return {
- "query": task_query,
- "error": "BrowserUseAgent components not available.",
- }
-
- # --- Browser Setup ---
- # These should ideally come from the main agent's config
- headless = browser_config.get("headless", False)
- window_w = browser_config.get("window_width", 1280)
- window_h = browser_config.get("window_height", 1100)
- browser_user_data_dir = browser_config.get("user_data_dir", None)
- use_own_browser = browser_config.get("use_own_browser", False)
- browser_binary_path = browser_config.get("browser_binary_path", None)
- wss_url = browser_config.get("wss_url", None)
- cdp_url = browser_config.get("cdp_url", None)
- disable_security = browser_config.get("disable_security", False)
-
- bu_browser = None
- bu_browser_context = None
try:
- logger.info(f"Starting browser task for query: {task_query}")
- extra_args = []
- if use_own_browser:
- browser_binary_path = os.getenv("BROWSER_PATH", None) or browser_binary_path
- if browser_binary_path == "":
- browser_binary_path = None
- browser_user_data = browser_user_data_dir or os.getenv("BROWSER_USER_DATA", None)
- if browser_user_data:
- extra_args += [f"--user-data-dir={browser_user_data}"]
- else:
- browser_binary_path = None
-
- bu_browser = CustomBrowser(
- config=BrowserConfig(
- headless=headless,
- browser_binary_path=browser_binary_path,
- extra_browser_args=extra_args,
- wss_url=wss_url,
- cdp_url=cdp_url,
- new_context_config=BrowserContextConfig(
- window_width=window_w,
- window_height=window_h,
- )
- )
+ logger.info(f"Running browser task: {task_query}")
+
+ # Create browser profile
+ browser_profile = BrowserProfile(
+ headless=browser_config.get("headless", True),
+ window_width=browser_config.get("window_width", 1280),
+ window_height=browser_config.get("window_height", 1100),
+ browser_binary_path=browser_config.get("browser_binary_path"),
+ user_data_dir=browser_config.get("user_data_dir"),
)
-
- context_config = BrowserContextConfig(
- save_downloads_path="./tmp/downloads",
- window_height=window_h,
- window_width=window_w,
- force_new_context=True,
- )
- bu_browser_context = await bu_browser.new_context(config=context_config)
-
- # Simple controller example, replace with your actual implementation if needed
- bu_controller = CustomController()
-
- # Construct the task prompt for BrowserUseAgent
- # Instruct it to find specific info and return title/URL
- bu_task_prompt = f"""
- Research Task: {task_query}
- Objective: Find relevant information answering the query.
- Output Requirements: For each relevant piece of information found, please provide:
- 1. A concise summary of the information.
- 2. The title of the source page or document.
- 3. The URL of the source.
- Focus on accuracy and relevance. Avoid irrelevant details.
- PDF cannot directly extract _content, please try to download first, then using read_file, if you can't save or read, please try other methods.
- """
-
- bu_agent_instance = BrowserUseAgent(
- task=bu_task_prompt,
- llm=llm, # Use the passed LLM
- browser=bu_browser,
- browser_context=bu_browser_context,
- controller=bu_controller,
+
+ # Use browser-use Agent directly with browser profile
+ controller = Controller()
+
+ # Create browser agent using browser-use Agent class
+ from browser_use import Agent
+ browser_agent = Agent(
+ task=task_query,
+ llm=llm,
+ browser_profile=browser_profile,
+ controller=controller,
use_vision=use_vision,
- source="webui",
)
-
- # Store instance for potential stop() call
- task_key = f"{task_id}_{uuid.uuid4()}"
- _BROWSER_AGENT_INSTANCES[task_key] = bu_agent_instance
-
- # --- Run with Stop Check ---
- # BrowserUseAgent needs to internally check a stop signal or have a stop method.
- # We simulate checking before starting and assume `run` might be interruptible
- # or have its own stop mechanism we can trigger via bu_agent_instance.stop().
+
+ # Store instance for potential stop
+ _BROWSER_AGENT_INSTANCES[task_id] = browser_agent
+
if stop_event.is_set():
- logger.info(f"Browser task for '{task_query}' cancelled before start.")
return {"query": task_query, "result": None, "status": "cancelled"}
-
- # The run needs to be awaitable and ideally accept a stop signal or have a .stop() method
- # result = await bu_agent_instance.run(max_steps=max_steps) # Add max_steps if applicable
- # Let's assume a simplified run for now
- logger.info(f"Running BrowserUseAgent for: {task_query}")
- result = await bu_agent_instance.run() # Assuming run is the main method
- logger.info(f"BrowserUseAgent finished for: {task_query}")
-
- final_data = result.final_result()
-
- if stop_event.is_set():
- logger.info(f"Browser task for '{task_query}' stopped during execution.")
- return {"query": task_query, "result": final_data, "status": "stopped"}
- else:
- logger.info(f"Browser result for '{task_query}': {final_data}")
- return {"query": task_query, "result": final_data, "status": "completed"}
-
- except Exception as e:
- logger.error(
- f"Error during browser task for query '{task_query}': {e}", exc_info=True
- )
- return {"query": task_query, "error": str(e), "status": "failed"}
- finally:
- if bu_browser_context:
- try:
- await bu_browser_context.close()
- bu_browser_context = None
- logger.info("Closed browser context.")
- except Exception as e:
- logger.error(f"Error closing browser context: {e}")
- if bu_browser:
- try:
- await bu_browser.close()
- bu_browser = None
- logger.info("Closed browser.")
- except Exception as e:
- logger.error(f"Error closing browser: {e}")
-
- if task_key in _BROWSER_AGENT_INSTANCES:
- del _BROWSER_AGENT_INSTANCES[task_key]
-
-
-class BrowserSearchInput(BaseModel):
- queries: List[str] = Field(
- description="List of distinct search queries to find information relevant to the research task."
- )
-
-
-async def _run_browser_search_tool(
- queries: List[str],
- task_id: str, # Injected dependency
- llm: Any, # Injected dependency
- browser_config: Dict[str, Any],
- stop_event: threading.Event,
- max_parallel_browsers: int = 1,
-) -> List[Dict[str, Any]]:
- """
- Internal function to execute parallel browser searches based on LLM-provided queries.
- Handles concurrency and stop signals.
- """
-
- # Limit queries just in case LLM ignores the description
- queries = queries[:max_parallel_browsers]
- logger.info(
- f"[Browser Tool {task_id}] Running search for {len(queries)} queries: {queries}"
- )
-
- results = []
- semaphore = asyncio.Semaphore(max_parallel_browsers)
-
- async def task_wrapper(query):
- async with semaphore:
- if stop_event.is_set():
- logger.info(
- f"[Browser Tool {task_id}] Skipping task due to stop signal: {query}"
- )
- return {"query": query, "result": None, "status": "cancelled"}
- # Pass necessary injected configs and the stop event
- return await run_single_browser_task(
- query,
- task_id,
- llm, # Pass the main LLM (or a dedicated one if needed)
- browser_config,
- stop_event,
- # use_vision could be added here if needed
- )
-
- tasks = [task_wrapper(query) for query in queries]
- search_results = await asyncio.gather(*tasks, return_exceptions=True)
-
- processed_results = []
- for i, res in enumerate(search_results):
- query = queries[i] # Get corresponding query
- if isinstance(res, Exception):
- logger.error(
- f"[Browser Tool {task_id}] Gather caught exception for query '{query}': {res}",
- exc_info=True,
- )
- processed_results.append(
- {"query": query, "error": str(res), "status": "failed"}
- )
- elif isinstance(res, dict):
- processed_results.append(res)
- else:
- logger.error(
- f"[Browser Tool {task_id}] Unexpected result type for query '{query}': {type(res)}"
- )
- processed_results.append(
- {"query": query, "error": "Unexpected result type", "status": "failed"}
- )
-
- logger.info(
- f"[Browser Tool {task_id}] Finished search. Results count: {len(processed_results)}"
- )
- return processed_results
-
-
-def create_browser_search_tool(
- llm: Any,
- browser_config: Dict[str, Any],
- task_id: str,
- stop_event: threading.Event,
- max_parallel_browsers: int = 1,
-) -> StructuredTool:
- """Factory function to create the browser search tool with necessary dependencies."""
- # Use partial to bind the dependencies that aren't part of the LLM call arguments
- from functools import partial
-
- bound_tool_func = partial(
- _run_browser_search_tool,
- task_id=task_id,
- llm=llm,
- browser_config=browser_config,
- stop_event=stop_event,
- max_parallel_browsers=max_parallel_browsers,
- )
-
- return StructuredTool.from_function(
- coroutine=bound_tool_func,
- name="parallel_browser_search",
- description=f"""Use this tool to actively search the web for information related to a specific research task or question.
-It runs up to {max_parallel_browsers} searches in parallel using a browser agent for better results than simple scraping.
-Provide a list of distinct search queries(up to {max_parallel_browsers}) that are likely to yield relevant information.""",
- args_schema=BrowserSearchInput,
- )
-
-
-# --- Langgraph State Definition ---
-
-
-class ResearchTaskItem(TypedDict):
- # step: int # Maybe step within category, or just implicit by order
- task_description: str
- status: str # "pending", "completed", "failed"
- queries: Optional[List[str]]
- result_summary: Optional[str]
-
-
-class ResearchCategoryItem(TypedDict):
- category_name: str
- tasks: List[ResearchTaskItem]
- # Optional: category_status: str # Could be "pending", "in_progress", "completed"
-
-
-class DeepResearchState(TypedDict):
- task_id: str
- topic: str
- research_plan: List[ResearchCategoryItem] # CHANGED
- search_results: List[Dict[str, Any]]
- llm: Any
- tools: List[Tool]
- output_dir: Path
- browser_config: Dict[str, Any]
- final_report: Optional[str]
- current_category_index: int
- current_task_index_in_category: int
- stop_requested: bool
- error_message: Optional[str]
- messages: List[BaseMessage]
-
-
-# --- Langgraph Nodes ---
-
-
-def _load_previous_state(task_id: str, output_dir: str) -> Dict[str, Any]:
- state_updates = {}
- plan_file = os.path.join(output_dir, PLAN_FILENAME)
- search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
-
- loaded_plan: List[ResearchCategoryItem] = []
- next_cat_idx, next_task_idx = 0, 0
- found_pending = False
-
- if os.path.exists(plan_file):
- try:
- with open(plan_file, "r", encoding="utf-8") as f:
- current_category: Optional[ResearchCategoryItem] = None
- lines = f.readlines()
- cat_counter = 0
- task_counter_in_cat = 0
-
- for line_num, line_content in enumerate(lines):
- line = line_content.strip()
- if line.startswith("## "): # Category
- if current_category: # Save previous category
- loaded_plan.append(current_category)
- if not found_pending: # If previous category was all done, advance cat counter
- cat_counter += 1
- task_counter_in_cat = 0
- category_name = line[line.find(" "):].strip() # Get text after "## X. "
- current_category = ResearchCategoryItem(category_name=category_name, tasks=[])
- elif (line.startswith("- [ ]") or line.startswith("- [x]") or line.startswith(
- "- [-]")) and current_category: # Task
- status = "pending"
- if line.startswith("- [x]"):
- status = "completed"
- elif line.startswith("- [-]"):
- status = "failed"
-
- task_desc = line[5:].strip()
- current_category["tasks"].append(
- ResearchTaskItem(task_description=task_desc, status=status, queries=None,
- result_summary=None)
- )
- if status == "pending" and not found_pending:
- next_cat_idx = cat_counter
- next_task_idx = task_counter_in_cat
- found_pending = True
- if not found_pending: # only increment if previous tasks were completed/failed
- task_counter_in_cat += 1
-
- if current_category: # Append last category
- loaded_plan.append(current_category)
-
- if loaded_plan:
- state_updates["research_plan"] = loaded_plan
- if not found_pending and loaded_plan: # All tasks were completed or failed
- next_cat_idx = len(loaded_plan) # Points beyond the last category
- next_task_idx = 0
- state_updates["current_category_index"] = next_cat_idx
- state_updates["current_task_index_in_category"] = next_task_idx
- logger.info(
- f"Loaded hierarchical research plan from {plan_file}. "
- f"Next task: Category {next_cat_idx}, Task {next_task_idx} in category."
- )
- else:
- logger.warning(f"Plan file {plan_file} was empty or malformed.")
-
- except Exception as e:
- logger.error(f"Failed to load or parse research plan {plan_file}: {e}", exc_info=True)
- state_updates["error_message"] = f"Failed to load research plan: {e}"
- else:
- logger.info(f"Plan file {plan_file} not found. Will start fresh.")
-
- if os.path.exists(search_file):
- try:
- with open(search_file, "r", encoding="utf-8") as f:
- state_updates["search_results"] = json.load(f)
- logger.info(f"Loaded search results from {search_file}")
- except Exception as e:
- logger.error(f"Failed to load search results {search_file}: {e}")
- state_updates["error_message"] = (
- state_updates.get("error_message", "") + f" Failed to load search results: {e}").strip()
-
- return state_updates
-
-
-def _save_plan_to_md(plan: List[ResearchCategoryItem], output_dir: str):
- plan_file = os.path.join(output_dir, PLAN_FILENAME)
- try:
- with open(plan_file, "w", encoding="utf-8") as f:
- f.write(f"# Research Plan\n\n")
- for cat_idx, category in enumerate(plan):
- f.write(f"## {cat_idx + 1}. {category['category_name']}\n\n")
- for task_idx, task in enumerate(category['tasks']):
- marker = "- [x]" if task["status"] == "completed" else "- [ ]" if task[
- "status"] == "pending" else "- [-]" # [-] for failed
- f.write(f" {marker} {task['task_description']}\n")
- f.write("\n")
- logger.info(f"Hierarchical research plan saved to {plan_file}")
- except Exception as e:
- logger.error(f"Failed to save research plan to {plan_file}: {e}")
-
-
-def _save_search_results_to_json(results: List[Dict[str, Any]], output_dir: str):
- """Appends or overwrites search results to a JSON file."""
- search_file = os.path.join(output_dir, SEARCH_INFO_FILENAME)
- try:
- # Simple overwrite for now, could be append
- with open(search_file, "w", encoding="utf-8") as f:
- json.dump(results, f, indent=2, ensure_ascii=False)
- logger.info(f"Search results saved to {search_file}")
- except Exception as e:
- logger.error(f"Failed to save search results to {search_file}: {e}")
-
-
-def _save_report_to_md(report: str, output_dir: Path):
- """Saves the final report to a markdown file."""
- report_file = os.path.join(output_dir, REPORT_FILENAME)
- try:
- with open(report_file, "w", encoding="utf-8") as f:
- f.write(report)
- logger.info(f"Final report saved to {report_file}")
- except Exception as e:
- logger.error(f"Failed to save final report to {report_file}: {e}")
-
-
-async def planning_node(state: DeepResearchState) -> Dict[str, Any]:
- logger.info("--- Entering Planning Node ---")
- if state.get("stop_requested"):
- logger.info("Stop requested, skipping planning.")
- return {"stop_requested": True}
-
- llm = state["llm"]
- topic = state["topic"]
- existing_plan = state.get("research_plan")
- output_dir = state["output_dir"]
-
- if existing_plan and (
- state.get("current_category_index", 0) > 0 or state.get("current_task_index_in_category", 0) > 0):
- logger.info("Resuming with existing plan.")
- _save_plan_to_md(existing_plan, output_dir) # Ensure it's saved initially
- # current_category_index and current_task_index_in_category should be set by _load_previous_state
- return {"research_plan": existing_plan}
-
- logger.info(f"Generating new research plan for topic: {topic}")
-
- prompt_text = f"""You are a meticulous research assistant. Your goal is to create a hierarchical research plan to thoroughly investigate the topic: "{topic}".
-The plan should be structured into several main research categories. Each category should contain a list of specific, actionable research tasks or questions.
-Format the output as a JSON list of objects. Each object represents a research category and should have:
-1. "category_name": A string for the name of the research category.
-2. "tasks": A list of strings, where each string is a specific research task for that category.
-
-Example JSON Output:
-[
- {{
- "category_name": "Understanding Core Concepts and Definitions",
- "tasks": [
- "Define the primary terminology associated with '{topic}'.",
- "Identify the fundamental principles and theories underpinning '{topic}'."
- ]
- }},
- {{
- "category_name": "Historical Development and Key Milestones",
- "tasks": [
- "Trace the historical evolution of '{topic}'.",
- "Identify key figures, events, or breakthroughs in the development of '{topic}'."
- ]
- }},
- {{
- "category_name": "Current State-of-the-Art and Applications",
- "tasks": [
- "Analyze the current advancements and prominent applications of '{topic}'.",
- "Investigate ongoing research and active areas of development related to '{topic}'."
- ]
- }},
- {{
- "category_name": "Challenges, Limitations, and Future Outlook",
- "tasks": [
- "Identify the major challenges and limitations currently facing '{topic}'.",
- "Explore potential future trends, ethical considerations, and societal impacts of '{topic}'."
- ]
- }}
-]
-
-Generate a plan with 3-10 categories, and 2-6 tasks per category for the topic: "{topic}" according to the complexity of the topic.
-Ensure the output is a valid JSON array.
-"""
- messages = [
- SystemMessage(content="You are a research planning assistant outputting JSON."),
- HumanMessage(content=prompt_text)
- ]
-
- try:
- response = await llm.ainvoke(messages)
- raw_content = response.content
- # The LLM might wrap the JSON in backticks
- if raw_content.strip().startswith("```json"):
- raw_content = raw_content.strip()[7:-3].strip()
- elif raw_content.strip().startswith("```"):
- raw_content = raw_content.strip()[3:-3].strip()
-
- logger.debug(f"LLM response for plan: {raw_content}")
- parsed_plan_from_llm = json.loads(raw_content)
-
- new_plan: List[ResearchCategoryItem] = []
- for cat_idx, category_data in enumerate(parsed_plan_from_llm):
- if not isinstance(category_data,
- dict) or "category_name" not in category_data or "tasks" not in category_data:
- logger.warning(f"Skipping invalid category data: {category_data}")
- continue
-
- tasks: List[ResearchTaskItem] = []
- for task_idx, task_desc in enumerate(category_data["tasks"]):
- if isinstance(task_desc, str):
- tasks.append(
- ResearchTaskItem(
- task_description=task_desc,
- status="pending",
- queries=None,
- result_summary=None,
- )
- )
- else: # Sometimes LLM puts tasks as {"task": "description"}
- if isinstance(task_desc, dict) and "task_description" in task_desc:
- tasks.append(
- ResearchTaskItem(
- task_description=task_desc["task_description"],
- status="pending",
- queries=None,
- result_summary=None,
- )
- )
- elif isinstance(task_desc, dict) and "task" in task_desc: # common LLM mistake
- tasks.append(
- ResearchTaskItem(
- task_description=task_desc["task"],
- status="pending",
- queries=None,
- result_summary=None,
- )
- )
- else:
- logger.warning(
- f"Skipping invalid task data: {task_desc} in category {category_data['category_name']}")
-
- new_plan.append(
- ResearchCategoryItem(
- category_name=category_data["category_name"],
- tasks=tasks,
- )
- )
-
- if not new_plan:
- logger.error("LLM failed to generate a valid plan structure from JSON.")
- return {"error_message": "Failed to generate research plan structure."}
-
- logger.info(f"Generated research plan with {len(new_plan)} categories.")
- _save_plan_to_md(new_plan, output_dir) # Save the hierarchical plan
-
- return {
- "research_plan": new_plan,
- "current_category_index": 0,
- "current_task_index_in_category": 0,
- "search_results": [],
- }
-
- except json.JSONDecodeError as e:
- logger.error(f"Failed to parse JSON from LLM for plan: {e}. Response was: {raw_content}", exc_info=True)
- return {"error_message": f"LLM generated invalid JSON for research plan: {e}"}
- except Exception as e:
- logger.error(f"Error during planning: {e}", exc_info=True)
- return {"error_message": f"LLM Error during planning: {e}"}
-
-
-async def research_execution_node(state: DeepResearchState) -> Dict[str, Any]:
- logger.info("--- Entering Research Execution Node ---")
- if state.get("stop_requested"):
- logger.info("Stop requested, skipping research execution.")
- return {
- "stop_requested": True,
- "current_category_index": state["current_category_index"],
- "current_task_index_in_category": state["current_task_index_in_category"],
- }
-
- plan = state["research_plan"]
- cat_idx = state["current_category_index"]
- task_idx = state["current_task_index_in_category"]
- llm = state["llm"]
- tools = state["tools"]
- output_dir = str(state["output_dir"])
- task_id = state["task_id"] # For _AGENT_STOP_FLAGS
-
- # This check should ideally be handled by `should_continue`
- if not plan or cat_idx >= len(plan):
- logger.info("Research plan complete or categories exhausted.")
- return {} # should route to synthesis
-
- current_category = plan[cat_idx]
- if task_idx >= len(current_category["tasks"]):
- logger.info(f"All tasks in category '{current_category['category_name']}' completed. Moving to next category.")
- # This logic is now effectively handled by should_continue and the index updates below
- # The next iteration will be caught by should_continue or this node with updated indices
- return {
- "current_category_index": cat_idx + 1,
- "current_task_index_in_category": 0,
- "messages": state["messages"] # Pass messages along
- }
-
- current_task = current_category["tasks"][task_idx]
-
- if current_task["status"] == "completed":
- logger.info(
- f"Task '{current_task['task_description']}' in category '{current_category['category_name']}' already completed. Skipping.")
- # Logic to find next task
- next_task_idx = task_idx + 1
- next_cat_idx = cat_idx
- if next_task_idx >= len(current_category["tasks"]):
- next_cat_idx += 1
- next_task_idx = 0
- return {
- "current_category_index": next_cat_idx,
- "current_task_index_in_category": next_task_idx,
- "messages": state["messages"] # Pass messages along
- }
-
- logger.info(
- f"Executing research task: '{current_task['task_description']}' (Category: '{current_category['category_name']}')"
- )
-
- llm_with_tools = llm.bind_tools(tools)
-
- # Construct messages for LLM invocation
- task_prompt_content = (
- f"Current Research Category: {current_category['category_name']}\n"
- f"Specific Task: {current_task['task_description']}\n\n"
- "Please use the available tools, especially 'parallel_browser_search', to gather information for this specific task. "
- "Provide focused search queries relevant ONLY to this task. "
- "If you believe you have sufficient information from previous steps for this specific task, you can indicate that you are ready to summarize or that no further search is needed."
- )
- current_task_message_history = [
- HumanMessage(content=task_prompt_content)
- ]
- if not state["messages"]: # First actual execution message
- invocation_messages = [
- SystemMessage(
- content="You are a research assistant executing one task of a research plan. Focus on the current task only."),
- ] + current_task_message_history
- else:
- invocation_messages = state["messages"] + current_task_message_history
-
- try:
- logger.info(f"Invoking LLM with tools for task: {current_task['task_description']}")
- ai_response: BaseMessage = await llm_with_tools.ainvoke(invocation_messages)
- logger.info("LLM invocation complete.")
-
- tool_results = []
- executed_tool_names = []
- current_search_results = state.get("search_results", []) # Get existing search results
-
- if not isinstance(ai_response, AIMessage) or not ai_response.tool_calls:
- logger.warning(
- f"LLM did not call any tool for task '{current_task['task_description']}'. Response: {ai_response.content[:100]}..."
- )
- current_task["status"] = "pending" # Or "completed_no_tool" if LLM explains it's done
- current_task["result_summary"] = f"LLM did not use a tool. Response: {ai_response.content}"
- current_task["current_category_index"] = cat_idx
- current_task["current_task_index_in_category"] = task_idx
- return current_task
- # We still save the plan and advance.
- else:
- # Process tool calls
- for tool_call in ai_response.tool_calls:
- tool_name = tool_call.get("name")
- tool_args = tool_call.get("args", {})
- tool_call_id = tool_call.get("id")
-
- logger.info(f"LLM requested tool call: {tool_name} with args: {tool_args}")
- executed_tool_names.append(tool_name)
- selected_tool = next((t for t in tools if t.name == tool_name), None)
-
- if not selected_tool:
- logger.error(f"LLM called tool '{tool_name}' which is not available.")
- tool_results.append(
- ToolMessage(content=f"Error: Tool '{tool_name}' not found.", tool_call_id=tool_call_id))
- continue
-
- try:
- stop_event = _AGENT_STOP_FLAGS.get(task_id)
- if stop_event and stop_event.is_set():
- logger.info(f"Stop requested before executing tool: {tool_name}")
- current_task["status"] = "pending" # Or a new "stopped" status
- _save_plan_to_md(plan, output_dir)
- return {"stop_requested": True, "research_plan": plan, "current_category_index": cat_idx,
- "current_task_index_in_category": task_idx}
-
- logger.info(f"Executing tool: {tool_name}")
- tool_output = await selected_tool.ainvoke(tool_args)
- logger.info(f"Tool '{tool_name}' executed successfully.")
-
- if tool_name == "parallel_browser_search":
- current_search_results.extend(tool_output) # tool_output is List[Dict]
- else: # For other tools, we might need specific handling or just log
- logger.info(f"Result from tool '{tool_name}': {str(tool_output)[:200]}...")
- # Storing non-browser results might need a different structure or key in search_results
- current_search_results.append(
- {"tool_name": tool_name, "args": tool_args, "output": str(tool_output),
- "status": "completed"})
-
- tool_results.append(ToolMessage(content=json.dumps(tool_output), tool_call_id=tool_call_id))
-
- except Exception as e:
- logger.error(f"Error executing tool '{tool_name}': {e}", exc_info=True)
- tool_results.append(
- ToolMessage(content=f"Error executing tool {tool_name}: {e}", tool_call_id=tool_call_id))
- current_search_results.append(
- {"tool_name": tool_name, "args": tool_args, "status": "failed", "error": str(e)})
-
- # After processing all tool calls for this task
- step_failed_tool_execution = any("Error:" in str(tr.content) for tr in tool_results)
- # Consider a task successful if a browser search was attempted and didn't immediately error out during call
- # The browser search itself returns status for each query.
- browser_tool_attempted_successfully = "parallel_browser_search" in executed_tool_names and not step_failed_tool_execution
-
- if step_failed_tool_execution:
- current_task["status"] = "failed"
- current_task[
- "result_summary"] = f"Tool execution failed. Errors: {[tr.content for tr in tool_results if 'Error' in str(tr.content)]}"
- elif executed_tool_names: # If any tool was called
- current_task["status"] = "completed"
- current_task["result_summary"] = f"Executed tool(s): {', '.join(executed_tool_names)}."
- # TODO: Could ask LLM to summarize the tool_results for this task if needed, rather than just listing tools.
- else: # No tool calls but AI response had .tool_calls structure (empty)
- current_task["status"] = "failed" # Or a more specific status
- current_task["result_summary"] = "LLM prepared for tool call but provided no tools."
-
- # Save progress
- _save_plan_to_md(plan, output_dir)
- _save_search_results_to_json(current_search_results, output_dir)
-
- # Determine next indices
- next_task_idx = task_idx + 1
- next_cat_idx = cat_idx
- if next_task_idx >= len(current_category["tasks"]):
- next_cat_idx += 1
- next_task_idx = 0
-
- updated_messages = state["messages"] + current_task_message_history + [ai_response] + tool_results
-
+
+ # Run the browser agent
+ result = await browser_agent.run()
+
return {
- "research_plan": plan,
- "search_results": current_search_results,
- "current_category_index": next_cat_idx,
- "current_task_index_in_category": next_task_idx,
- "messages": updated_messages,
+ "query": task_query,
+ "result": str(result),
+ "status": "completed"
}
-
+
except Exception as e:
- logger.error(f"Unhandled error during research execution for task '{current_task['task_description']}': {e}",
- exc_info=True)
- current_task["status"] = "failed"
- _save_plan_to_md(plan, output_dir)
- # Determine next indices even on error to attempt to move on
- next_task_idx = task_idx + 1
- next_cat_idx = cat_idx
- if next_task_idx >= len(current_category["tasks"]):
- next_cat_idx += 1
- next_task_idx = 0
+ logger.error(f"Browser task failed: {e}", exc_info=True)
return {
- "research_plan": plan,
- "current_category_index": next_cat_idx,
- "current_task_index_in_category": next_task_idx,
- "error_message": f"Core Execution Error on task '{current_task['task_description']}': {e}",
- "messages": state["messages"] + current_task_message_history # Preserve messages up to error
+ "query": task_query,
+ "result": f"Error: {str(e)}",
+ "status": "failed"
}
-
-
-async def synthesis_node(state: DeepResearchState) -> Dict[str, Any]:
- """Synthesizes the final report from the collected search results."""
- logger.info("--- Entering Synthesis Node ---")
- if state.get("stop_requested"):
- logger.info("Stop requested, skipping synthesis.")
- return {"stop_requested": True}
-
- llm = state["llm"]
- topic = state["topic"]
- search_results = state.get("search_results", [])
- output_dir = state["output_dir"]
- plan = state["research_plan"] # Include plan for context
-
- if not search_results:
- logger.warning("No search results found to synthesize report.")
- report = f"# Research Report: {topic}\n\nNo information was gathered during the research process."
- _save_report_to_md(report, output_dir)
- return {"final_report": report}
-
- logger.info(
- f"Synthesizing report from {len(search_results)} collected search result entries."
- )
-
- # Prepare context for the LLM
- # Format search results nicely, maybe group by query or original plan step
- formatted_results = ""
- references = {}
- ref_count = 1
- for i, result_entry in enumerate(search_results):
- query = result_entry.get("query", "Unknown Query") # From parallel_browser_search
- tool_name = result_entry.get("tool_name") # From other tools
- status = result_entry.get("status", "unknown")
- result_data = result_entry.get("result") # From BrowserUseAgent's final_result
- tool_output_str = result_entry.get("output") # From other tools
-
- if tool_name == "parallel_browser_search" and status == "completed" and result_data:
- # result_data is the summary from BrowserUseAgent
- formatted_results += f'### Finding from Web Search Query: "{query}"\n'
- formatted_results += f"- **Summary:**\n{result_data}\n" # result_data is already a summary string here
- # If result_data contained title/URL, you'd format them here.
- # The current BrowserUseAgent returns a string summary directly as 'final_data' in run_single_browser_task
- formatted_results += "---\n"
- elif tool_name != "parallel_browser_search" and status == "completed" and tool_output_str:
- formatted_results += f'### Finding from Tool: "{tool_name}" (Args: {result_entry.get("args")})\n'
- formatted_results += f"- **Output:**\n{tool_output_str}\n"
- formatted_results += "---\n"
- elif status == "failed":
- error = result_entry.get("error")
- q_or_t = f"Query: \"{query}\"" if query != "Unknown Query" else f"Tool: \"{tool_name}\""
- formatted_results += f'### Failed {q_or_t}\n'
- formatted_results += f"- **Error:** {error}\n"
- formatted_results += "---\n"
-
- # Prepare the research plan context
- plan_summary = "\nResearch Plan Followed:\n"
- for cat_idx, category in enumerate(plan):
- plan_summary += f"\n#### Category {cat_idx + 1}: {category['category_name']}\n"
- for task_idx, task in enumerate(category['tasks']):
- marker = "[x]" if task["status"] == "completed" else "[ ]" if task["status"] == "pending" else "[-]"
- plan_summary += f" - {marker} {task['task_description']}\n"
-
- synthesis_prompt = ChatPromptTemplate.from_messages(
- [
- (
- "system",
- """You are a professional researcher tasked with writing a comprehensive and well-structured report based on collected findings.
- The report should address the research topic thoroughly, synthesizing the information gathered from various sources.
- Structure the report logically:
- 1. Briefly introduce the topic and the report's scope (mentioning the research plan followed, including categories and tasks, is good).
- 2. Discuss the key findings, organizing them thematically, possibly aligning with the research categories. Analyze, compare, and contrast information.
- 3. Summarize the main points and offer concluding thoughts.
-
- Ensure the tone is objective and professional.
- If findings are contradictory or incomplete, acknowledge this.
- """, # Removed citation part for simplicity for now, as browser agent returns summaries.
- ),
- (
- "human",
- f"""
- **Research Topic:** {topic}
-
- {plan_summary}
-
- **Collected Findings:**
- ```
- {formatted_results}
- ```
-
- Please generate the final research report in Markdown format based **only** on the information above.
- """,
- ),
- ]
- )
-
- try:
- response = await llm.ainvoke(
- synthesis_prompt.format_prompt(
- topic=topic,
- plan_summary=plan_summary,
- formatted_results=formatted_results,
- ).to_messages()
- )
- final_report_md = response.content
-
- # Append the reference list automatically to the end of the generated markdown
- if references:
- report_references_section = "\n\n## References\n\n"
- # Sort refs by ID for consistent output
- sorted_refs = sorted(references.values(), key=lambda x: x["id"])
- for ref in sorted_refs:
- report_references_section += (
- f"[{ref['id']}] {ref['title']} - {ref['url']}\n"
- )
- final_report_md += report_references_section
-
- logger.info("Successfully synthesized the final report.")
- _save_report_to_md(final_report_md, output_dir)
- return {"final_report": final_report_md}
-
- except Exception as e:
- logger.error(f"Error during report synthesis: {e}", exc_info=True)
- return {"error_message": f"LLM Error during synthesis: {e}"}
-
-
-# --- Langgraph Edges and Conditional Logic ---
-
-
-def should_continue(state: DeepResearchState) -> str:
- logger.info("--- Evaluating Condition: Should Continue? ---")
- if state.get("stop_requested"):
- logger.info("Stop requested, routing to END.")
- return "end_run"
- if state.get("error_message") and "Core Execution Error" in state["error_message"]: # Critical error in node
- logger.warning(f"Critical error detected: {state['error_message']}. Routing to END.")
- return "end_run"
-
- plan = state.get("research_plan")
- cat_idx = state.get("current_category_index", 0)
- task_idx = state.get("current_task_index_in_category", 0) # This is the *next* task to check
-
- if not plan:
- logger.warning("No research plan found. Routing to END.")
- return "end_run"
-
- # Check if the current indices point to a valid pending task
- if cat_idx < len(plan):
- current_category = plan[cat_idx]
- if task_idx < len(current_category["tasks"]):
- # We are trying to execute the task at plan[cat_idx]["tasks"][task_idx]
- # The research_execution_node will handle if it's already completed.
- logger.info(
- f"Plan has potential pending tasks (next up: Category {cat_idx}, Task {task_idx}). Routing to Research Execution."
- )
- return "execute_research"
- else: # task_idx is out of bounds for current category, means we need to check next category
- if cat_idx + 1 < len(plan): # If there is a next category
- logger.info(
- f"Finished tasks in category {cat_idx}. Moving to category {cat_idx + 1}. Routing to Research Execution."
- )
- # research_execution_node will update state to {current_category_index: cat_idx + 1, current_task_index_in_category: 0}
- # Or rather, the previous execution node already set these indices to the start of the next category.
- return "execute_research"
-
- # If we've gone through all categories and tasks (cat_idx >= len(plan))
- logger.info("All plan categories and tasks processed or current indices are out of bounds. Routing to Synthesis.")
- return "synthesize_report"
-
-
-# --- DeepSearchAgent Class ---
+ finally:
+ # Cleanup
+ if task_id in _BROWSER_AGENT_INSTANCES:
+ del _BROWSER_AGENT_INSTANCES[task_id]
class DeepResearchAgent:
def __init__(
self,
- llm: Any,
+ llm: BaseChatModel,
browser_config: Dict[str, Any],
- mcp_server_config: Optional[Dict[str, Any]] = None,
):
"""
- Initializes the DeepSearchAgent.
+ Simplified Deep Research Agent without langchain dependencies.
Args:
- llm: The Langchain compatible language model instance.
+ llm: Browser-use compatible language model instance.
browser_config: Configuration dictionary for the BrowserUseAgent tool.
Example: {"headless": True, "window_width": 1280, ...}
- mcp_server_config: Optional configuration for the MCP client.
"""
self.llm = llm
self.browser_config = browser_config
- self.mcp_server_config = mcp_server_config
- self.mcp_client = None
self.stopped = False
- self.graph = self._compile_graph()
self.current_task_id: Optional[str] = None
self.stop_event: Optional[threading.Event] = None
- self.runner: Optional[asyncio.Task] = None # To hold the asyncio task for run
-
- async def _setup_tools(
- self, task_id: str, stop_event: threading.Event, max_parallel_browsers: int = 1
- ) -> List[Tool]:
- """Sets up the basic tools (File I/O) and optional MCP tools."""
- tools = [
- WriteFileTool(),
- ReadFileTool(),
- ListDirectoryTool(),
- ] # Basic file operations
- browser_use_tool = create_browser_search_tool(
- llm=self.llm,
- browser_config=self.browser_config,
- task_id=task_id,
- stop_event=stop_event,
- max_parallel_browsers=max_parallel_browsers,
- )
- tools += [browser_use_tool]
- # Add MCP tools if config is provided
- if self.mcp_server_config:
- try:
- logger.info("Setting up MCP client and tools...")
- if not self.mcp_client:
- self.mcp_client = await setup_mcp_client_and_tools(
- self.mcp_server_config
- )
- mcp_tools = self.mcp_client.get_tools()
- logger.info(f"Loaded {len(mcp_tools)} MCP tools.")
- tools.extend(mcp_tools)
- except Exception as e:
- logger.error(f"Failed to set up MCP tools: {e}", exc_info=True)
- elif self.mcp_server_config:
- logger.warning(
- "MCP server config provided, but setup function unavailable."
- )
- tools_map = {tool.name: tool for tool in tools}
- return tools_map.values()
-
- async def close_mcp_client(self):
- if self.mcp_client:
- await self.mcp_client.__aexit__(None, None, None)
- self.mcp_client = None
-
- def _compile_graph(self) -> StateGraph:
- """Compiles the Langgraph state machine."""
- workflow = StateGraph(DeepResearchState)
-
- # Add nodes
- workflow.add_node("plan_research", planning_node)
- workflow.add_node("execute_research", research_execution_node)
- workflow.add_node("synthesize_report", synthesis_node)
- workflow.add_node(
- "end_run", lambda state: logger.info("--- Reached End Run Node ---") or {}
- ) # Simple end node
-
- # Define edges
- workflow.set_entry_point("plan_research")
-
- workflow.add_edge(
- "plan_research", "execute_research"
- ) # Always execute after planning
-
- # Conditional edge after execution
- workflow.add_conditional_edges(
- "execute_research",
- should_continue,
- {
- "execute_research": "execute_research", # Loop back if more steps
- "synthesize_report": "synthesize_report", # Move to synthesis if done
- "end_run": "end_run", # End if stop requested or error
- },
- )
-
- workflow.add_edge("synthesize_report", "end_run") # End after synthesis
-
- app = workflow.compile()
- return app
+ self.runner: Optional[asyncio.Task] = None
- async def run(
- self,
- topic: str,
- task_id: Optional[str] = None,
- save_dir: str = "./tmp/deep_research",
- max_parallel_browsers: int = 1,
- ) -> Dict[str, Any]:
+ async def research(self, query: str, output_dir: str = "./research_output") -> str:
"""
- Starts the deep research process (Async Generator Version).
-
+ Simplified research method that conducts research and writes a report.
+
Args:
- topic: The research topic.
- task_id: Optional existing task ID to resume. If None, a new ID is generated.
-
- Yields:
- Intermediate state updates or messages during execution.
+ query: The research question or topic
+ output_dir: Directory to save research output
+
+ Returns:
+ Path to the generated report
"""
- if self.runner and not self.runner.done():
- logger.warning(
- "Agent is already running. Please stop the current task first."
- )
- # Return an error status instead of yielding
- return {
- "status": "error",
- "message": "Agent already running.",
- "task_id": self.current_task_id,
- }
-
- self.current_task_id = task_id if task_id else str(uuid.uuid4())
- safe_root_dir = "./tmp/deep_research"
- normalized_save_dir = os.path.normpath(save_dir)
- if not normalized_save_dir.startswith(os.path.abspath(safe_root_dir)):
- logger.warning(f"Unsafe save_dir detected: {save_dir}. Using default directory.")
- normalized_save_dir = os.path.abspath(safe_root_dir)
- output_dir = os.path.join(normalized_save_dir, self.current_task_id)
- os.makedirs(output_dir, exist_ok=True)
-
- logger.info(
- f"[AsyncGen] Starting research task ID: {self.current_task_id} for topic: '{topic}'"
- )
- logger.info(f"[AsyncGen] Output directory: {output_dir}")
-
- self.stop_event = threading.Event()
- _AGENT_STOP_FLAGS[self.current_task_id] = self.stop_event
- agent_tools = await self._setup_tools(
- self.current_task_id, self.stop_event, max_parallel_browsers
- )
- initial_state: DeepResearchState = {
- "task_id": self.current_task_id,
- "topic": topic,
- "research_plan": [],
- "search_results": [],
- "messages": [],
- "llm": self.llm,
- "tools": agent_tools,
- "output_dir": Path(output_dir),
- "browser_config": self.browser_config,
- "final_report": None,
- "current_category_index": 0,
- "current_task_index_in_category": 0,
- "stop_requested": False,
- "error_message": None,
- }
-
- if task_id:
- logger.info(f"Attempting to resume task {task_id}...")
- loaded_state = _load_previous_state(task_id, output_dir)
- initial_state.update(loaded_state)
- if loaded_state.get("research_plan"):
- logger.info(
- f"Resuming with {len(loaded_state['research_plan'])} plan categories "
- f"and {len(loaded_state.get('search_results', []))} existing results. "
- f"Next task: Cat {initial_state['current_category_index']}, Task {initial_state['current_task_index_in_category']}"
- )
- initial_state["topic"] = (
- topic # Allow overriding topic even when resuming? Or use stored topic? Let's use new one.
- )
- else:
- logger.warning(
- f"Resume requested for {task_id}, but no previous plan found. Starting fresh."
- )
-
- # --- Execute Graph using ainvoke ---
- final_state = None
- status = "unknown"
- message = None
try:
- logger.info(f"Invoking graph execution for task {self.current_task_id}...")
- self.runner = asyncio.create_task(self.graph.ainvoke(initial_state))
- final_state = await self.runner
- logger.info(f"Graph execution finished for task {self.current_task_id}.")
-
- # Determine status based on final state
- if self.stop_event and self.stop_event.is_set():
- status = "stopped"
- message = "Research process was stopped by request."
- logger.info(message)
- elif final_state and final_state.get("error_message"):
- status = "error"
- message = final_state["error_message"]
- logger.error(f"Graph execution completed with error: {message}")
- elif final_state and final_state.get("final_report"):
- status = "completed"
- message = "Research process completed successfully."
- logger.info(message)
- else:
- # If it ends without error/report (e.g., empty plan, stopped before synthesis)
- status = "finished_incomplete"
- message = "Research process finished, but may be incomplete (no final report generated)."
- logger.warning(message)
-
- except asyncio.CancelledError:
- status = "cancelled"
- message = f"Agent run task cancelled for {self.current_task_id}."
- logger.info(message)
- # final_state will remain None or the state before cancellation if checkpointing was used
+ # Create output directory
+ os.makedirs(output_dir, exist_ok=True)
+
+ # Generate task ID
+ task_id = f"research_{uuid.uuid4().hex[:8]}"
+ self.current_task_id = task_id
+
+ logger.info(f"Starting research on: {query}")
+
+ # Use browser agent to gather information
+ browser_results = await self._conduct_browser_research(query, task_id)
+
+ # Generate report using LLM
+ report_content = await self._generate_report(query, browser_results)
+
+ # Save report
+ report_path = os.path.join(output_dir, REPORT_FILENAME)
+ write_file_content(report_path, report_content)
+
+ logger.info(f"Research complete. Report saved to: {report_path}")
+ return report_path
+
except Exception as e:
- status = "error"
- message = f"Unhandled error during graph execution for {self.current_task_id}: {e}"
- logger.error(message, exc_info=True)
- # final_state will remain None or the state before the error
+ logger.error(f"Research failed: {e}", exc_info=True)
+ raise
finally:
- logger.info(f"Cleaning up resources for task {self.current_task_id}")
- task_id_to_clean = self.current_task_id
-
- self.stop_event = None
self.current_task_id = None
- self.runner = None # Mark runner as finished
- if self.mcp_client:
- await self.mcp_client.__aexit__(None, None, None)
- # Return a result dictionary including the status and the final state if available
- return {
- "status": status,
- "message": message,
- "task_id": task_id_to_clean, # Use the stored task_id
- "final_state": final_state
- if final_state
- else {}, # Return the final state dict
- }
+ async def _conduct_browser_research(self, query: str, task_id: str) -> str:
+ """Conduct research using browser agent"""
+ try:
+ self.stop_event = stop_event = threading.Event()
+ result = await run_single_browser_task(
+ task_query=f"Research and gather comprehensive information about: {query}",
+ task_id=task_id,
+ llm=self.llm,
+ browser_config=self.browser_config,
+ stop_event=stop_event,
+ use_vision=False
+ )
+ return result.get("result", "No results found")
+ except Exception as e:
+ logger.error(f"Browser research failed: {e}")
+ return f"Error conducting research: {str(e)}"
- async def _stop_lingering_browsers(self, task_id):
- """Attempts to stop any BrowserUseAgent instances associated with the task_id."""
- keys_to_stop = [
- key for key in _BROWSER_AGENT_INSTANCES if key.startswith(f"{task_id}_")
- ]
- if not keys_to_stop:
- return
+ async def _generate_report(self, query: str, research_data: str) -> str:
+ """Generate a comprehensive research report"""
+ try:
+ prompt = f"""
+You are a research analyst tasked with creating a comprehensive report.
- logger.warning(
- f"Found {len(keys_to_stop)} potentially lingering browser agents for task {task_id}. Attempting stop..."
- )
- for key in keys_to_stop:
- agent_instance = _BROWSER_AGENT_INSTANCES.get(key)
- try:
- if agent_instance:
- # Assuming BU agent has an async stop method
- await agent_instance.stop()
- logger.info(f"Called stop() on browser agent instance {key}")
- except Exception as e:
- logger.error(
- f"Error calling stop() on browser agent instance {key}: {e}"
- )
+Research Question: {query}
- async def stop(self):
- """Signals the currently running agent task to stop."""
- if not self.current_task_id or not self.stop_event:
- logger.info("No agent task is currently running.")
- return
+Research Data:
+{research_data}
- logger.info(f"Stop requested for task ID: {self.current_task_id}")
- self.stop_event.set() # Signal the stop event
- self.stopped = True
- await self._stop_lingering_browsers(self.current_task_id)
+Please create a well-structured, comprehensive research report that includes:
+1. Executive Summary
+2. Key Findings
+3. Detailed Analysis
+4. Conclusions and Recommendations
+5. Sources (if any were found)
- def close(self):
- self.stopped = False
+Format the report in clear markdown with appropriate headings and structure.
+"""
+
+ messages = [
+ SystemMessage(content="You are an expert research analyst who creates comprehensive, well-structured reports."),
+ UserMessage(content=prompt)
+ ]
+
+ response = await self.llm.ainvoke(messages)
+ # Handle different response types from browser-use LLMs
+ if hasattr(response, 'content'):
+ return response.content
+ elif hasattr(response, 'text'):
+ return response.text
+ else:
+ return str(response)
+
+ except Exception as e:
+ logger.error(f"Report generation failed: {e}")
+ return f"# Research Report\n\n## Error\nFailed to generate report: {str(e)}\n\n## Raw Data\n{research_data}"
+
+ def stop(self):
+ """Stop the research process"""
+ self.stopped = True
+ if self.current_task_id:
+ _AGENT_STOP_FLAGS[self.current_task_id] = True
+ if self.stop_event:
+ self.stop_event.set()
diff --git a/src/browser/browser_compat.py b/src/browser/browser_compat.py
new file mode 100644
index 00000000..33a35a76
--- /dev/null
+++ b/src/browser/browser_compat.py
@@ -0,0 +1,66 @@
+"""
+Compatibility layer for browser_use 0.6.0
+Provides shims for the old API to work with the new one
+"""
+
+from browser_use.browser import BrowserSession, BrowserProfile
+from browser_use import Controller, Agent
+from typing import Optional, Dict, Any
+import logging
+
+logger = logging.getLogger(__name__)
+
+class BrowserConfig:
+ """Compatibility shim for old BrowserConfig"""
+ def __init__(self,
+ headless: bool = False,
+ disable_security: bool = False,
+ browser_binary_path: Optional[str] = None,
+ extra_browser_args: list = None,
+ wss_url: Optional[str] = None,
+ cdp_url: Optional[str] = None,
+ new_context_config: Optional[Any] = None,
+ **kwargs):
+ self.headless = headless
+ self.disable_security = disable_security
+ self.browser_binary_path = browser_binary_path
+ self.extra_browser_args = extra_browser_args or []
+ self.wss_url = wss_url
+ self.cdp_url = cdp_url
+ self.new_context_config = new_context_config
+ self._extra = kwargs
+
+class BrowserContextConfig:
+ """Compatibility shim for old BrowserContextConfig"""
+ def __init__(self,
+ window_width: int = 1280,
+ window_height: int = 720,
+ trace_path: Optional[str] = None,
+ save_recording_path: Optional[str] = None,
+ save_downloads_path: Optional[str] = None,
+ **kwargs):
+ self.window_width = window_width
+ self.window_height = window_height
+ self.trace_path = trace_path
+ self.save_recording_path = save_recording_path
+ self.save_downloads_path = save_downloads_path
+ self._extra = kwargs
+
+ def model_dump(self) -> Dict[str, Any]:
+ """Compatibility method for pydantic model_dump"""
+ return {
+ 'window_width': self.window_width,
+ 'window_height': self.window_height,
+ 'trace_path': self.trace_path,
+ 'save_recording_path': self.save_recording_path,
+ 'save_downloads_path': self.save_downloads_path,
+ **self._extra
+ }
+
+class BrowserState:
+ """Compatibility shim for BrowserState"""
+ pass
+
+# Re-export the new API classes with compatibility
+Browser = BrowserSession
+BrowserContext = BrowserSession # In v0.6.0, context is merged with session
\ No newline at end of file
diff --git a/src/browser/custom_browser.py b/src/browser/custom_browser.py
index 1556959d..ff267523 100644
--- a/src/browser/custom_browser.py
+++ b/src/browser/custom_browser.py
@@ -1,29 +1,23 @@
import asyncio
import pdb
-from playwright.async_api import Browser as PlaywrightBrowser
-from playwright.async_api import (
- BrowserContext as PlaywrightBrowserContext,
-)
-from playwright.async_api import (
- Playwright,
- async_playwright,
-)
-from browser_use.browser.browser import Browser, IN_DOCKER
-from browser_use.browser.context import BrowserContext, BrowserContextConfig
-from playwright.async_api import BrowserContext as PlaywrightBrowserContext
+from browser_use.browser.session import BrowserSession
+from browser_use.browser.profile import BrowserProfile
import logging
-from browser_use.browser.chrome import (
- CHROME_ARGS,
- CHROME_DETERMINISTIC_RENDERING_ARGS,
- CHROME_DISABLE_SECURITY_ARGS,
- CHROME_DOCKER_ARGS,
+# Updated imports for browser_use 0.6.0
+from browser_use.browser.profile import (
+ CHROME_DEFAULT_ARGS,
CHROME_HEADLESS_ARGS,
+ CHROME_DOCKER_ARGS,
+ CHROME_DISABLE_SECURITY_ARGS,
+ CHROME_DETERMINISTIC_RENDERING_ARGS,
+ get_display_size,
+ get_window_adjustments,
)
-from browser_use.browser.context import BrowserContext, BrowserContextConfig
-from browser_use.browser.utils.screen_resolution import get_screen_resolution, get_window_adjustments
+from browser_use.config import CONFIG
from browser_use.utils import time_execution_async
+from .browser_compat import BrowserContextConfig
import socket
from .custom_context import CustomBrowserContext
@@ -31,7 +25,7 @@
logger = logging.getLogger(__name__)
-class CustomBrowser(Browser):
+class CustomBrowser(BrowserSession):
async def new_context(self, config: BrowserContextConfig | None = None) -> CustomBrowserContext:
"""Create a browser context"""
@@ -40,8 +34,8 @@ async def new_context(self, config: BrowserContextConfig | None = None) -> Custo
merged_config = {**browser_config, **context_config}
return CustomBrowserContext(config=BrowserContextConfig(**merged_config), browser=self)
- async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrowser:
- """Sets up and returns a Playwright Browser instance with anti-detection measures."""
+ async def _setup_builtin_browser(self, playwright_instance) -> object:
+ """Sets up and returns a Browser instance with anti-detection measures."""
assert self.config.browser_binary_path is None, 'browser_binary_path should be None if trying to use the builtin browsers'
# Use the configured window size from new_context_config if available
@@ -60,13 +54,14 @@ async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrow
screen_size = {'width': 1920, 'height': 1080}
offset_x, offset_y = 0, 0
else:
- screen_size = get_screen_resolution()
+ display_size = get_display_size()
+ screen_size = {'width': display_size.width, 'height': display_size.height} if display_size else {'width': 1920, 'height': 1080}
offset_x, offset_y = get_window_adjustments()
chrome_args = {
f'--remote-debugging-port={self.config.chrome_remote_debugging_port}',
- *CHROME_ARGS,
- *(CHROME_DOCKER_ARGS if IN_DOCKER else []),
+ *CHROME_DEFAULT_ARGS,
+ *(CHROME_DOCKER_ARGS if CONFIG.IN_DOCKER else []),
*(CHROME_HEADLESS_ARGS if self.config.headless else []),
*(CHROME_DISABLE_SECURITY_ARGS if self.config.disable_security else []),
*(CHROME_DETERMINISTIC_RENDERING_ARGS if self.config.deterministic_rendering else []),
@@ -81,7 +76,7 @@ async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrow
if s.connect_ex(('localhost', self.config.chrome_remote_debugging_port)) == 0:
chrome_args.remove(f'--remote-debugging-port={self.config.chrome_remote_debugging_port}')
- browser_class = getattr(playwright, self.config.browser_class)
+ browser_class = getattr(playwright_instance, self.config.browser_class)
args = {
'chromium': list(chrome_args),
'firefox': [
@@ -99,7 +94,7 @@ async def _setup_builtin_browser(self, playwright: Playwright) -> PlaywrightBrow
}
browser = await browser_class.launch(
- channel='chromium', # https://github.com/microsoft/playwright/issues/33566
+ channel='chromium',
headless=self.config.headless,
args=args[self.config.browser_class],
proxy=self.config.proxy.model_dump() if self.config.proxy else None,
diff --git a/src/browser/custom_context.py b/src/browser/custom_context.py
index 674191af..fa74fb48 100644
--- a/src/browser/custom_context.py
+++ b/src/browser/custom_context.py
@@ -4,8 +4,6 @@
from browser_use.browser.browser import Browser, IN_DOCKER
from browser_use.browser.context import BrowserContext, BrowserContextConfig
-from playwright.async_api import Browser as PlaywrightBrowser
-from playwright.async_api import BrowserContext as PlaywrightBrowserContext
from typing import Optional
from browser_use.browser.context import BrowserContextState
diff --git a/src/controller/custom_controller.py b/src/controller/custom_controller.py
index 00e050c5..9a525045 100644
--- a/src/controller/custom_controller.py
+++ b/src/controller/custom_controller.py
@@ -1,52 +1,37 @@
-import pdb
-
-import pyperclip
+# import pdb # Unused
+# import pyperclip # Unused
from typing import Optional, Type, Callable, Dict, Any, Union, Awaitable, TypeVar
from pydantic import BaseModel
from browser_use.agent.views import ActionResult
-from browser_use.browser.context import BrowserContext
-from browser_use.controller.service import Controller, DoneAction
-from browser_use.controller.registry.service import Registry, RegisteredAction
-from main_content_extractor import MainContentExtractor
-from browser_use.controller.views import (
- ClickElementAction,
- DoneAction,
- ExtractPageContentAction,
- GoToUrlAction,
- InputTextAction,
- OpenTabAction,
- ScrollAction,
- SearchGoogleAction,
- SendKeysAction,
- SwitchTabAction,
-)
+from browser_use.browser import BrowserSession
+from browser_use import Controller
+# from browser_use.controller.registry.service import RegisteredAction # Not available in browser-use 0.7.10
+# Removed unused imports: DoneAction, Registry, MainContentExtractor, and view classes
import logging
import inspect
-import asyncio
-import os
-from langchain_core.language_models.chat_models import BaseChatModel
+# import asyncio # Unused
+# import os # Unused
+from browser_use.llm import BaseChatModel
from browser_use.agent.views import ActionModel, ActionResult
-
-from src.utils.mcp_client import create_tool_param_model, setup_mcp_client_and_tools
+from src.utils.mcp_client import get_mcp_manager, MCPManager
from browser_use.utils import time_execution_sync
logger = logging.getLogger(__name__)
-Context = TypeVar('Context')
+# Context = TypeVar('Context') # Removed - not used in browser-use 0.7.10
class CustomController(Controller):
def __init__(self, exclude_actions: list[str] = [],
output_model: Optional[Type[BaseModel]] = None,
- ask_assistant_callback: Optional[Union[Callable[[str, BrowserContext], Dict[str, Any]], Callable[
- [str, BrowserContext], Awaitable[Dict[str, Any]]]]] = None,
+ ask_assistant_callback: Optional[Union[Callable[[str, BrowserSession], Dict[str, Any]], Callable[
+ [str, BrowserSession], Awaitable[Dict[str, Any]]]]] = None,
):
super().__init__(exclude_actions=exclude_actions, output_model=output_model)
self._register_custom_actions()
self.ask_assistant_callback = ask_assistant_callback
- self.mcp_client = None
- self.mcp_server_config = None
+ self.mcp_manager: Optional[MCPManager] = None
def _register_custom_actions(self):
"""Register all custom browser actions"""
@@ -57,12 +42,12 @@ def _register_custom_actions(self):
"requiring subjective human judgment, needing a physical action performed, encountering complex CAPTCHAs, "
"or facing limitations in your capabilities – you must request human assistance."
)
- async def ask_for_assistant(query: str, browser: BrowserContext):
+ async def ask_for_assistant(query: str, browser_session: BrowserSession):
if self.ask_assistant_callback:
if inspect.iscoroutinefunction(self.ask_assistant_callback):
- user_response = await self.ask_assistant_callback(query, browser)
+ user_response = await self.ask_assistant_callback(query, browser_session)
else:
- user_response = self.ask_assistant_callback(query, browser)
+ user_response = self.ask_assistant_callback(query, browser_session)
msg = f"AI ask: {query}. User response: {user_response['response']}"
logger.info(msg)
return ActionResult(extracted_content=msg, include_in_memory=True)
@@ -70,113 +55,76 @@ async def ask_for_assistant(query: str, browser: BrowserContext):
return ActionResult(extracted_content="Human cannot help you. Please try another way.",
include_in_memory=True)
- @self.registry.action(
- 'Upload file to interactive element with file path ',
- )
- async def upload_file(index: int, path: str, browser: BrowserContext, available_file_paths: list[str]):
- if path not in available_file_paths:
- return ActionResult(error=f'File path {path} is not available')
-
- if not os.path.exists(path):
- return ActionResult(error=f'File {path} does not exist')
-
- dom_el = await browser.get_dom_element_by_index(index)
-
- file_upload_dom_el = dom_el.get_file_upload_element()
-
- if file_upload_dom_el is None:
- msg = f'No file upload element found at index {index}'
- logger.info(msg)
- return ActionResult(error=msg)
-
- file_upload_el = await browser.get_locate_element(file_upload_dom_el)
-
- if file_upload_el is None:
- msg = f'No file upload element found at index {index}'
- logger.info(msg)
- return ActionResult(error=msg)
-
- try:
- await file_upload_el.set_input_files(path)
- msg = f'Successfully uploaded file to index {index}'
- logger.info(msg)
- return ActionResult(extracted_content=msg, include_in_memory=True)
- except Exception as e:
- msg = f'Failed to upload file to index {index}: {str(e)}'
- logger.info(msg)
- return ActionResult(error=msg)
+ # TODO: Update upload_file action for browser_use 0.6.0 API
+ # The old implementation used methods that don't exist in the new version
+ # Commenting out for now to avoid schema generation errors
+
+ # @self.registry.action(
+ # 'Upload file to interactive element with file path ',
+ # )
+ # async def upload_file(index: int, path: str, browser_session: BrowserSession, available_file_paths: list[str]):
+ # # This needs to be reimplemented using browser_use 0.6.0 API
+ # return ActionResult(error='File upload not yet implemented for browser_use 0.6.0')
@time_execution_sync('--act')
async def act(
self,
action: ActionModel,
- browser_context: Optional[BrowserContext] = None,
+ browser_session: BrowserSession,
#
page_extraction_llm: Optional[BaseChatModel] = None,
sensitive_data: Optional[Dict[str, str]] = None,
available_file_paths: Optional[list[str]] = None,
+ file_system: Optional[Any] = None, # Added for compatibility with 0.6.0
#
- context: Context | None = None,
) -> ActionResult:
- """Execute an action"""
+ """Execute an action using parent class - MCP functionality removed"""
+
+ # Delegate to parent class for all actions
+ return await super().act(
+ action=action,
+ browser_session=browser_session,
+ page_extraction_llm=page_extraction_llm,
+ sensitive_data=sensitive_data,
+ available_file_paths=available_file_paths,
+ file_system=file_system,
+ )
+ async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
+ """Set up MCP servers using browser-use's native MCP implementation."""
+ if not mcp_server_config:
+ logger.info("No MCP server configuration provided")
+ return
+
try:
- for action_name, params in action.model_dump(exclude_unset=True).items():
- if params is not None:
- if action_name.startswith("mcp"):
- # this is a mcp tool
- logger.debug(f"Invoke MCP tool: {action_name}")
- mcp_tool = self.registry.registry.actions.get(action_name).function
- result = await mcp_tool.ainvoke(params)
- else:
- result = await self.registry.execute_action(
- action_name,
- params,
- browser=browser_context,
- page_extraction_llm=page_extraction_llm,
- sensitive_data=sensitive_data,
- available_file_paths=available_file_paths,
- context=context,
- )
-
- if isinstance(result, str):
- return ActionResult(extracted_content=result)
- elif isinstance(result, ActionResult):
- return result
- elif result is None:
- return ActionResult()
- else:
- raise ValueError(f'Invalid action result type: {type(result)} of {result}')
- return ActionResult()
+ logger.info("Setting up MCP servers using browser-use implementation...")
+
+ # Get the global MCP manager
+ self.mcp_manager = get_mcp_manager()
+
+ # Set up MCP servers
+ success = await self.mcp_manager.setup_mcp_servers(mcp_server_config)
+
+ if success:
+ # Register MCP tools to this controller
+ tool_count = self.mcp_manager.register_tools_to_controller(self)
+ logger.info(f"Successfully set up MCP with {tool_count} server(s)")
+
+ connected_servers = self.mcp_manager.get_connected_servers()
+ logger.info(f"Connected MCP servers: {connected_servers}")
+ else:
+ logger.warning("Failed to set up MCP servers")
+
except Exception as e:
- raise e
-
- async def setup_mcp_client(self, mcp_server_config: Optional[Dict[str, Any]] = None):
- self.mcp_server_config = mcp_server_config
- if self.mcp_server_config:
- self.mcp_client = await setup_mcp_client_and_tools(self.mcp_server_config)
- self.register_mcp_tools()
-
- def register_mcp_tools(self):
- """
- Register the MCP tools used by this controller.
- """
- if self.mcp_client:
- for server_name in self.mcp_client.server_name_to_tools:
- for tool in self.mcp_client.server_name_to_tools[server_name]:
- tool_name = f"mcp.{server_name}.{tool.name}"
- self.registry.registry.actions[tool_name] = RegisteredAction(
- name=tool_name,
- description=tool.description,
- function=tool,
- param_model=create_tool_param_model(tool),
- )
- logger.info(f"Add mcp tool: {tool_name}")
- logger.debug(
- f"Registered {len(self.mcp_client.server_name_to_tools[server_name])} mcp tools for {server_name}")
- else:
- logger.warning(f"MCP client not started.")
-
+ logger.error(f"Error setting up MCP client: {e}", exc_info=True)
+
async def close_mcp_client(self):
- if self.mcp_client:
- await self.mcp_client.__aexit__(None, None, None)
+ """Close MCP connections."""
+ if self.mcp_manager:
+ try:
+ await self.mcp_manager.disconnect_all()
+ logger.info("Closed all MCP connections")
+ except Exception as e:
+ logger.error(f"Error closing MCP connections: {e}")
+ finally:
+ self.mcp_manager = None
diff --git a/src/utils/config.py b/src/utils/config.py
index de82bb9e..b47952d8 100644
--- a/src/utils/config.py
+++ b/src/utils/config.py
@@ -13,7 +13,7 @@
# Predefined model names for common providers
model_names = {
- "anthropic": ["claude-3-5-sonnet-20241022", "claude-3-5-sonnet-20240620", "claude-3-opus-20240229"],
+ "anthropic": ["claude-haiku-4-5", "claude-sonnet-4-5", "claude-opus-4-1"],
"openai": ["gpt-4o", "gpt-4", "gpt-3.5-turbo", "o3-mini"],
"deepseek": ["deepseek-chat", "deepseek-reasoner"],
"google": ["gemini-2.0-flash", "gemini-2.0-flash-thinking-exp", "gemini-1.5-flash-latest",
diff --git a/src/utils/llm_provider.py b/src/utils/llm_provider.py
index 2ef3d638..de0a8250 100644
--- a/src/utils/llm_provider.py
+++ b/src/utils/llm_provider.py
@@ -1,162 +1,35 @@
-from openai import OpenAI
-import pdb
-from langchain_openai import ChatOpenAI
-from langchain_core.globals import get_llm_cache
-from langchain_core.language_models.base import (
- BaseLanguageModel,
- LangSmithParams,
- LanguageModelInput,
-)
import os
-from langchain_core.load import dumpd, dumps
-from langchain_core.messages import (
- AIMessage,
- SystemMessage,
- AnyMessage,
- BaseMessage,
- BaseMessageChunk,
- HumanMessage,
- convert_to_messages,
- message_chunk_to_message,
-)
-from langchain_core.outputs import (
- ChatGeneration,
- ChatGenerationChunk,
- ChatResult,
- LLMResult,
- RunInfo,
-)
-from langchain_ollama import ChatOllama
-from langchain_core.output_parsers.base import OutputParserLike
-from langchain_core.runnables import Runnable, RunnableConfig
-from langchain_core.tools import BaseTool
-
-from typing import (
- TYPE_CHECKING,
- Any,
- Callable,
- Literal,
- Optional,
- Union,
- cast, List,
-)
-from langchain_anthropic import ChatAnthropic
-from langchain_mistralai import ChatMistralAI
-from langchain_google_genai import ChatGoogleGenerativeAI
-from langchain_ollama import ChatOllama
-from langchain_openai import AzureChatOpenAI, ChatOpenAI
-from langchain_ibm import ChatWatsonx
-from langchain_aws import ChatBedrock
-from pydantic import SecretStr
+import logging
+from typing import Any, Optional
+
+# Browser-use LLM imports
+from browser_use.llm.openai.chat import ChatOpenAI
+from browser_use.llm.anthropic.chat import ChatAnthropic
+from browser_use.llm.google.chat import ChatGoogle
+from browser_use.llm.groq.chat import ChatGroq
+from browser_use.llm.ollama.chat import ChatOllama
+from browser_use.llm.azure.chat import ChatAzureOpenAI
+from browser_use.llm.deepseek.chat import ChatDeepSeek
+from browser_use.llm import BaseChatModel, UserMessage, SystemMessage, AssistantMessage
from src.utils import config
+logger = logging.getLogger(__name__)
-class DeepSeekR1ChatOpenAI(ChatOpenAI):
-
- def __init__(self, *args: Any, **kwargs: Any) -> None:
- super().__init__(*args, **kwargs)
- self.client = OpenAI(
- base_url=kwargs.get("base_url"),
- api_key=kwargs.get("api_key")
- )
-
- async def ainvoke(
- self,
- input: LanguageModelInput,
- config: Optional[RunnableConfig] = None,
- *,
- stop: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> AIMessage:
- message_history = []
- for input_ in input:
- if isinstance(input_, SystemMessage):
- message_history.append({"role": "system", "content": input_.content})
- elif isinstance(input_, AIMessage):
- message_history.append({"role": "assistant", "content": input_.content})
- else:
- message_history.append({"role": "user", "content": input_.content})
-
- response = self.client.chat.completions.create(
- model=self.model_name,
- messages=message_history
- )
-
- reasoning_content = response.choices[0].message.reasoning_content
- content = response.choices[0].message.content
- return AIMessage(content=content, reasoning_content=reasoning_content)
-
- def invoke(
- self,
- input: LanguageModelInput,
- config: Optional[RunnableConfig] = None,
- *,
- stop: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> AIMessage:
- message_history = []
- for input_ in input:
- if isinstance(input_, SystemMessage):
- message_history.append({"role": "system", "content": input_.content})
- elif isinstance(input_, AIMessage):
- message_history.append({"role": "assistant", "content": input_.content})
- else:
- message_history.append({"role": "user", "content": input_.content})
-
- response = self.client.chat.completions.create(
- model=self.model_name,
- messages=message_history
- )
-
- reasoning_content = response.choices[0].message.reasoning_content
- content = response.choices[0].message.content
- return AIMessage(content=content, reasoning_content=reasoning_content)
-
-
-class DeepSeekR1ChatOllama(ChatOllama):
-
- async def ainvoke(
- self,
- input: LanguageModelInput,
- config: Optional[RunnableConfig] = None,
- *,
- stop: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> AIMessage:
- org_ai_message = await super().ainvoke(input=input)
- org_content = org_ai_message.content
- reasoning_content = org_content.split("")[0].replace("", "")
- content = org_content.split("")[1]
- if "**JSON Response:**" in content:
- content = content.split("**JSON Response:**")[-1]
- return AIMessage(content=content, reasoning_content=reasoning_content)
- def invoke(
- self,
- input: LanguageModelInput,
- config: Optional[RunnableConfig] = None,
- *,
- stop: Optional[list[str]] = None,
- **kwargs: Any,
- ) -> AIMessage:
- org_ai_message = super().invoke(input=input)
- org_content = org_ai_message.content
- reasoning_content = org_content.split("")[0].replace("", "")
- content = org_content.split("")[1]
- if "**JSON Response:**" in content:
- content = content.split("**JSON Response:**")[-1]
- return AIMessage(content=content, reasoning_content=reasoning_content)
+# Custom DeepSeek reasoning model implementations are removed
+# as they require specific reasoning content handling not available in browser-use
-def get_llm_model(provider: str, **kwargs):
+def get_llm_model(provider: str, **kwargs) -> BaseChatModel:
"""
- Get LLM model
- :param provider: LLM provider
- :param kwargs:
- :return:
+ Get LLM model using browser-use LLM implementations
+ :param provider: LLM provider name
+ :param kwargs: Additional parameters
+ :return: BaseChatModel instance
"""
- if provider not in ["ollama", "bedrock"]:
+ # Handle API key requirement for most providers
+ if provider not in ["ollama"]:
env_var = f"{provider.upper()}_API_KEY"
api_key = kwargs.get("api_key", "") or os.getenv(env_var, "")
if not api_key:
@@ -166,190 +39,108 @@ def get_llm_model(provider: str, **kwargs):
kwargs["api_key"] = api_key
if provider == "anthropic":
- if not kwargs.get("base_url", ""):
- base_url = "https://api.anthropic.com"
- else:
- base_url = kwargs.get("base_url")
-
+ base_url = kwargs.get("base_url") or os.getenv("ANTHROPIC_ENDPOINT", "https://api.anthropic.com")
return ChatAnthropic(
model=kwargs.get("model_name", "claude-3-5-sonnet-20241022"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key,
)
- elif provider == 'mistral':
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1")
- else:
- base_url = kwargs.get("base_url")
- if not kwargs.get("api_key", ""):
- api_key = os.getenv("MISTRAL_API_KEY", "")
- else:
- api_key = kwargs.get("api_key")
-
- return ChatMistralAI(
- model=kwargs.get("model_name", "mistral-large-latest"),
- temperature=kwargs.get("temperature", 0.0),
- base_url=base_url,
- api_key=api_key,
- )
+
elif provider == "openai":
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
- else:
- base_url = kwargs.get("base_url")
-
+ base_url = kwargs.get("base_url") or os.getenv("OPENAI_ENDPOINT", "https://api.openai.com/v1")
return ChatOpenAI(
model=kwargs.get("model_name", "gpt-4o"),
- temperature=kwargs.get("temperature", 0.0),
+ temperature=kwargs.get("temperature", 0.2),
base_url=base_url,
api_key=api_key,
)
- elif provider == "grok":
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("GROK_ENDPOINT", "https://api.x.ai/v1")
- else:
- base_url = kwargs.get("base_url")
-
- return ChatOpenAI(
- model=kwargs.get("model_name", "grok-3"),
+
+ elif provider == "google":
+ return ChatGoogle(
+ model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
temperature=kwargs.get("temperature", 0.0),
- base_url=base_url,
api_key=api_key,
)
- elif provider == "deepseek":
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("DEEPSEEK_ENDPOINT", "")
- else:
- base_url = kwargs.get("base_url")
-
- if kwargs.get("model_name", "deepseek-chat") == "deepseek-reasoner":
- return DeepSeekR1ChatOpenAI(
- model=kwargs.get("model_name", "deepseek-reasoner"),
- temperature=kwargs.get("temperature", 0.0),
- base_url=base_url,
- api_key=api_key,
- )
- else:
- return ChatOpenAI(
- model=kwargs.get("model_name", "deepseek-chat"),
- temperature=kwargs.get("temperature", 0.0),
- base_url=base_url,
- api_key=api_key,
- )
- elif provider == "google":
- return ChatGoogleGenerativeAI(
- model=kwargs.get("model_name", "gemini-2.0-flash-exp"),
+
+ elif provider == "groq":
+ base_url = kwargs.get("base_url") or os.getenv("GROQ_ENDPOINT", "https://api.groq.com/openai/v1")
+ return ChatGroq(
+ model=kwargs.get("model_name", "llama-3.1-8b-instant"),
temperature=kwargs.get("temperature", 0.0),
+ base_url=base_url,
api_key=api_key,
)
+
elif provider == "ollama":
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
- else:
- base_url = kwargs.get("base_url")
-
- if "deepseek-r1" in kwargs.get("model_name", "qwen2.5:7b"):
- return DeepSeekR1ChatOllama(
- model=kwargs.get("model_name", "deepseek-r1:14b"),
- temperature=kwargs.get("temperature", 0.0),
- num_ctx=kwargs.get("num_ctx", 32000),
- base_url=base_url,
- )
- else:
- return ChatOllama(
- model=kwargs.get("model_name", "qwen2.5:7b"),
- temperature=kwargs.get("temperature", 0.0),
- num_ctx=kwargs.get("num_ctx", 32000),
- num_predict=kwargs.get("num_predict", 1024),
- base_url=base_url,
- )
+ host = kwargs.get("base_url") or os.getenv("OLLAMA_ENDPOINT", "http://localhost:11434")
+ return ChatOllama(
+ model=kwargs.get("model_name", "qwen2.5:7b"),
+ host=host,
+ )
+
elif provider == "azure_openai":
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("AZURE_OPENAI_ENDPOINT", "")
- else:
- base_url = kwargs.get("base_url")
- api_version = kwargs.get("api_version", "") or os.getenv("AZURE_OPENAI_API_VERSION", "2025-01-01-preview")
- return AzureChatOpenAI(
+ base_url = kwargs.get("base_url") or os.getenv("AZURE_OPENAI_ENDPOINT", "")
+ if not base_url:
+ raise ValueError("Azure OpenAI endpoint is required")
+ return ChatAzureOpenAI(
model=kwargs.get("model_name", "gpt-4o"),
- temperature=kwargs.get("temperature", 0.0),
- api_version=api_version,
- azure_endpoint=base_url,
+ temperature=kwargs.get("temperature", 0.2),
+ base_url=base_url,
api_key=api_key,
)
- elif provider == "alibaba":
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1")
- else:
- base_url = kwargs.get("base_url")
-
- return ChatOpenAI(
- model=kwargs.get("model_name", "qwen-plus"),
+
+ elif provider == "deepseek":
+ base_url = kwargs.get("base_url") or os.getenv("DEEPSEEK_ENDPOINT", "https://api.deepseek.com/v1")
+ return ChatDeepSeek(
+ model=kwargs.get("model_name", "deepseek-chat"),
temperature=kwargs.get("temperature", 0.0),
base_url=base_url,
api_key=api_key,
)
- elif provider == "ibm":
- parameters = {
- "temperature": kwargs.get("temperature", 0.0),
- "max_tokens": kwargs.get("num_ctx", 32000)
+
+ # For providers not directly supported by browser-use, use OpenAI-compatible API
+ elif provider in ["grok", "alibaba", "moonshot", "unbound", "siliconflow", "modelscope", "mistral", "ibm"]:
+ base_url_map = {
+ "grok": os.getenv("GROK_ENDPOINT", "https://api.x.ai/v1"),
+ "alibaba": os.getenv("ALIBABA_ENDPOINT", "https://dashscope.aliyuncs.com/compatible-mode/v1"),
+ "moonshot": os.getenv("MOONSHOT_ENDPOINT"),
+ "unbound": os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
+ "siliconflow": os.getenv("SILICONFLOW_ENDPOINT", ""),
+ "modelscope": os.getenv("MODELSCOPE_ENDPOINT", ""),
+ "mistral": os.getenv("MISTRAL_ENDPOINT", "https://api.mistral.ai/v1"),
+ "ibm": os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com")
}
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("IBM_ENDPOINT", "https://us-south.ml.cloud.ibm.com")
- else:
- base_url = kwargs.get("base_url")
-
- return ChatWatsonx(
- model_id=kwargs.get("model_name", "ibm/granite-vision-3.1-2b-preview"),
- url=base_url,
- project_id=os.getenv("IBM_PROJECT_ID"),
- apikey=os.getenv("IBM_API_KEY"),
- params=parameters
- )
- elif provider == "moonshot":
- return ChatOpenAI(
- model=kwargs.get("model_name", "moonshot-v1-32k-vision-preview"),
- temperature=kwargs.get("temperature", 0.0),
- base_url=os.getenv("MOONSHOT_ENDPOINT"),
- api_key=os.getenv("MOONSHOT_API_KEY"),
- )
- elif provider == "unbound":
- return ChatOpenAI(
- model=kwargs.get("model_name", "gpt-4o-mini"),
- temperature=kwargs.get("temperature", 0.0),
- base_url=os.getenv("UNBOUND_ENDPOINT", "https://api.getunbound.ai"),
- api_key=api_key,
- )
- elif provider == "siliconflow":
- if not kwargs.get("api_key", ""):
- api_key = os.getenv("SiliconFLOW_API_KEY", "")
- else:
- api_key = kwargs.get("api_key")
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("SiliconFLOW_ENDPOINT", "")
- else:
- base_url = kwargs.get("base_url")
+
+ model_defaults = {
+ "grok": "grok-3",
+ "alibaba": "qwen-plus",
+ "moonshot": "moonshot-v1-32k-vision-preview",
+ "unbound": "gpt-4o-mini",
+ "siliconflow": "Qwen/QwQ-32B",
+ "modelscope": "Qwen/QwQ-32B",
+ "mistral": "pixtral-large-latest",
+ "ibm": "ibm/granite-vision-3.1-2b-preview"
+ }
+
+ base_url = kwargs.get("base_url") or base_url_map[provider]
+ if not base_url:
+ raise ValueError(f"{provider} endpoint is required")
+
+ # Special handling for IBM which may require project_id in headers
+ extra_headers = {}
+ if provider == "ibm":
+ project_id = kwargs.get("project_id") or os.getenv("IBM_PROJECT_ID")
+ if project_id:
+ extra_headers["X-Project-ID"] = project_id
+
return ChatOpenAI(
- api_key=api_key,
+ model=kwargs.get("model_name", model_defaults[provider]),
+ temperature=kwargs.get("temperature", 0.2),
base_url=base_url,
- model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
- temperature=kwargs.get("temperature", 0.0),
- )
- elif provider == "modelscope":
- if not kwargs.get("api_key", ""):
- api_key = os.getenv("MODELSCOPE_API_KEY", "")
- else:
- api_key = kwargs.get("api_key")
- if not kwargs.get("base_url", ""):
- base_url = os.getenv("MODELSCOPE_ENDPOINT", "")
- else:
- base_url = kwargs.get("base_url")
- return ChatOpenAI(
api_key=api_key,
- base_url=base_url,
- model_name=kwargs.get("model_name", "Qwen/QwQ-32B"),
- temperature=kwargs.get("temperature", 0.0),
- extra_body = {"enable_thinking": False}
+ extra_headers=extra_headers if extra_headers else None,
)
+
else:
- raise ValueError(f"Unsupported provider: {provider}")
+ raise ValueError(f"Unsupported provider: {provider}. Supported providers: anthropic, openai, google, groq, ollama, azure_openai, deepseek, grok, alibaba, moonshot, unbound, siliconflow, modelscope, mistral, ibm")
diff --git a/src/utils/mcp_client.py b/src/utils/mcp_client.py
index 126d49da..a50de585 100644
--- a/src/utils/mcp_client.py
+++ b/src/utils/mcp_client.py
@@ -1,254 +1,168 @@
-import inspect
import logging
-import uuid
-from datetime import date, datetime, time
-from enum import Enum
-from typing import Any, Dict, List, Optional, Set, Type, Union, get_type_hints
-
-from browser_use.controller.registry.views import ActionModel
-from langchain.tools import BaseTool
-from langchain_mcp_adapters.client import MultiServerMCPClient
-from pydantic import BaseModel, Field, create_model
-from pydantic.v1 import BaseModel, Field
+from typing import Dict, List, Optional, Any
+from browser_use.mcp import MCPClient
+from browser_use import Controller
logger = logging.getLogger(__name__)
-async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> Optional[MultiServerMCPClient]:
+class MCPManager:
"""
- Initializes the MultiServerMCPClient, connects to servers, fetches tools,
- filters them, and returns a flat list of usable tools and the client instance.
-
+ Manages multiple MCP server connections using browser-use's native MCP implementation.
+ """
+
+ def __init__(self):
+ self.clients: Dict[str, MCPClient] = {}
+ self.connected_servers: List[str] = []
+
+ async def setup_mcp_servers(self, mcp_server_config: Dict[str, Any]) -> bool:
+ """
+ Set up MCP servers from configuration.
+
+ Args:
+ mcp_server_config: Configuration dict with server definitions
+
+ Returns:
+ bool: True if setup successful, False otherwise
+ """
+ try:
+ if not mcp_server_config:
+ logger.warning("No MCP server configuration provided")
+ return False
+
+ # Handle nested mcpServers structure
+ servers_config = mcp_server_config
+ if "mcpServers" in mcp_server_config:
+ servers_config = mcp_server_config["mcpServers"]
+
+ logger.info(f"Setting up {len(servers_config)} MCP servers...")
+
+ # Clear existing clients
+ await self.disconnect_all()
+
+ # Create and connect clients
+ for server_name, server_config in servers_config.items():
+ try:
+ command = server_config.get("command")
+ args = server_config.get("args", [])
+ env = server_config.get("env", {})
+
+ if not command:
+ logger.warning(f"No command specified for MCP server '{server_name}', skipping")
+ continue
+
+ logger.info(f"Creating MCP client for server '{server_name}' with command: {command}")
+
+ # Create MCP client
+ client = MCPClient(
+ server_name=server_name,
+ command=command,
+ args=args,
+ env=env
+ )
+
+ # Connect to the server
+ await client.connect()
+
+ # Store client
+ self.clients[server_name] = client
+ self.connected_servers.append(server_name)
+
+ logger.info(f"Successfully connected to MCP server '{server_name}'")
+
+ except Exception as e:
+ logger.error(f"Failed to setup MCP server '{server_name}': {e}", exc_info=True)
+ continue
+
+ logger.info(f"Successfully connected to {len(self.connected_servers)} MCP servers: {self.connected_servers}")
+ return len(self.connected_servers) > 0
+
+ except Exception as e:
+ logger.error(f"Failed to setup MCP servers: {e}", exc_info=True)
+ return False
+
+ def register_tools_to_controller(self, controller: Controller, tool_filter: Optional[List[str]] = None) -> int:
+ """
+ Register MCP tools to a browser-use controller.
+
+ Args:
+ controller: Browser-use controller instance
+ tool_filter: Optional list of tool names to include (None = all tools)
+
+ Returns:
+ int: Number of tools registered
+ """
+ total_tools = 0
+
+ try:
+ for server_name, client in self.clients.items():
+ try:
+ # Register tools with server name prefix
+ client.register_to_controller(
+ controller=controller,
+ tool_filter=tool_filter,
+ prefix=f"mcp_{server_name}_"
+ )
+
+ # Note: browser-use doesn't provide a direct way to count registered tools
+ # We'll estimate based on successful registration
+ logger.info(f"Registered MCP tools from server '{server_name}' to controller")
+ total_tools += 1 # Increment per server for now
+
+ except Exception as e:
+ logger.error(f"Failed to register tools from MCP server '{server_name}': {e}")
+ continue
+
+ logger.info(f"Total MCP servers registered to controller: {total_tools}")
+ return total_tools
+
+ except Exception as e:
+ logger.error(f"Failed to register MCP tools to controller: {e}", exc_info=True)
+ return 0
+
+ async def disconnect_all(self):
+ """Disconnect from all MCP servers."""
+ for server_name, client in self.clients.items():
+ try:
+ await client.disconnect()
+ logger.info(f"Disconnected from MCP server '{server_name}'")
+ except Exception as e:
+ logger.error(f"Error disconnecting from MCP server '{server_name}': {e}")
+
+ self.clients.clear()
+ self.connected_servers.clear()
+
+ def get_connected_servers(self) -> List[str]:
+ """Get list of connected server names."""
+ return self.connected_servers.copy()
+
+ def is_connected(self, server_name: str) -> bool:
+ """Check if a specific server is connected."""
+ return server_name in self.connected_servers
+
+
+# Global MCP manager instance
+_mcp_manager: Optional[MCPManager] = None
+
+
+def get_mcp_manager() -> MCPManager:
+ """Get or create the global MCP manager instance."""
+ global _mcp_manager
+ if _mcp_manager is None:
+ _mcp_manager = MCPManager()
+ return _mcp_manager
+
+
+async def setup_mcp_client_and_tools(mcp_server_config: Dict[str, Any]) -> MCPManager:
+ """
+ Compatibility function that sets up MCP servers and returns the manager.
+
+ Args:
+ mcp_server_config: MCP server configuration dictionary
+
Returns:
- A tuple containing:
- - list[BaseTool]: The filtered list of usable LangChain tools.
- - MultiServerMCPClient | None: The initialized and started client instance, or None on failure.
+ MCPManager: The MCP manager instance
"""
-
- logger.info("Initializing MultiServerMCPClient...")
-
- if not mcp_server_config:
- logger.error("No MCP server configuration provided.")
- return None
-
- try:
- if "mcpServers" in mcp_server_config:
- mcp_server_config = mcp_server_config["mcpServers"]
- client = MultiServerMCPClient(mcp_server_config)
- await client.__aenter__()
- return client
-
- except Exception as e:
- logger.error(f"Failed to setup MCP client or fetch tools: {e}", exc_info=True)
- return None
-
-
-def create_tool_param_model(tool: BaseTool) -> Type[BaseModel]:
- """Creates a Pydantic model from a LangChain tool's schema"""
-
- # Get tool schema information
- json_schema = tool.args_schema
- tool_name = tool.name
-
- # If the tool already has a schema defined, convert it to a new param_model
- if json_schema is not None:
-
- # Create new parameter model
- params = {}
-
- # Process properties if they exist
- if 'properties' in json_schema:
- # Find required fields
- required_fields: Set[str] = set(json_schema.get('required', []))
-
- for prop_name, prop_details in json_schema['properties'].items():
- field_type = resolve_type(prop_details, f"{tool_name}_{prop_name}")
-
- # Check if parameter is required
- is_required = prop_name in required_fields
-
- # Get default value and description
- default_value = prop_details.get('default', ... if is_required else None)
- description = prop_details.get('description', '')
-
- # Add field constraints
- field_kwargs = {'default': default_value}
- if description:
- field_kwargs['description'] = description
-
- # Add additional constraints if present
- if 'minimum' in prop_details:
- field_kwargs['ge'] = prop_details['minimum']
- if 'maximum' in prop_details:
- field_kwargs['le'] = prop_details['maximum']
- if 'minLength' in prop_details:
- field_kwargs['min_length'] = prop_details['minLength']
- if 'maxLength' in prop_details:
- field_kwargs['max_length'] = prop_details['maxLength']
- if 'pattern' in prop_details:
- field_kwargs['pattern'] = prop_details['pattern']
-
- # Add to parameters dictionary
- params[prop_name] = (field_type, Field(**field_kwargs))
-
- return create_model(
- f'{tool_name}_parameters',
- __base__=ActionModel,
- **params, # type: ignore
- )
-
- # If no schema is defined, extract parameters from the _run method
- run_method = tool._run
- sig = inspect.signature(run_method)
-
- # Get type hints for better type information
- try:
- type_hints = get_type_hints(run_method)
- except Exception:
- type_hints = {}
-
- params = {}
- for name, param in sig.parameters.items():
- # Skip 'self' parameter and any other parameters you want to exclude
- if name == 'self':
- continue
-
- # Get annotation from type hints if available, otherwise from signature
- annotation = type_hints.get(name, param.annotation)
- if annotation == inspect.Parameter.empty:
- annotation = Any
-
- # Use default value if available, otherwise make it required
- if param.default != param.empty:
- params[name] = (annotation, param.default)
- else:
- params[name] = (annotation, ...)
-
- return create_model(
- f'{tool_name}_parameters',
- __base__=ActionModel,
- **params, # type: ignore
- )
-
-
-def resolve_type(prop_details: Dict[str, Any], prefix: str = "") -> Any:
- """Recursively resolves JSON schema type to Python/Pydantic type"""
-
- # Handle reference types
- if '$ref' in prop_details:
- # In a real application, reference resolution would be needed
- return Any
-
- # Basic type mapping
- type_mapping = {
- 'string': str,
- 'integer': int,
- 'number': float,
- 'boolean': bool,
- 'array': List,
- 'object': Dict,
- 'null': type(None),
- }
-
- # Handle formatted strings
- if prop_details.get('type') == 'string' and 'format' in prop_details:
- format_mapping = {
- 'date-time': datetime,
- 'date': date,
- 'time': time,
- 'email': str,
- 'uri': str,
- 'url': str,
- 'uuid': uuid.UUID,
- 'binary': bytes,
- }
- return format_mapping.get(prop_details['format'], str)
-
- # Handle enum types
- if 'enum' in prop_details:
- enum_values = prop_details['enum']
- # Create dynamic enum class with safe names
- enum_dict = {}
- for i, v in enumerate(enum_values):
- # Ensure enum names are valid Python identifiers
- if isinstance(v, str):
- key = v.upper().replace(' ', '_').replace('-', '_')
- if not key.isidentifier():
- key = f"VALUE_{i}"
- else:
- key = f"VALUE_{i}"
- enum_dict[key] = v
-
- # Only create enum if we have values
- if enum_dict:
- return Enum(f"{prefix}_Enum", enum_dict)
- return str # Fallback
-
- # Handle array types
- if prop_details.get('type') == 'array' and 'items' in prop_details:
- item_type = resolve_type(prop_details['items'], f"{prefix}_item")
- return List[item_type] # type: ignore
-
- # Handle object types with properties
- if prop_details.get('type') == 'object' and 'properties' in prop_details:
- nested_params = {}
- for nested_name, nested_details in prop_details['properties'].items():
- nested_type = resolve_type(nested_details, f"{prefix}_{nested_name}")
- # Get required field info
- required_fields = prop_details.get('required', [])
- is_required = nested_name in required_fields
- default_value = nested_details.get('default', ... if is_required else None)
- description = nested_details.get('description', '')
-
- field_kwargs = {'default': default_value}
- if description:
- field_kwargs['description'] = description
-
- nested_params[nested_name] = (nested_type, Field(**field_kwargs))
-
- # Create nested model
- nested_model = create_model(f"{prefix}_Model", **nested_params)
- return nested_model
-
- # Handle union types (oneOf, anyOf)
- if 'oneOf' in prop_details or 'anyOf' in prop_details:
- union_schema = prop_details.get('oneOf') or prop_details.get('anyOf')
- union_types = []
- for i, t in enumerate(union_schema):
- union_types.append(resolve_type(t, f"{prefix}_{i}"))
-
- if union_types:
- return Union.__getitem__(tuple(union_types)) # type: ignore
- return Any
-
- # Handle allOf (intersection types)
- if 'allOf' in prop_details:
- nested_params = {}
- for i, schema_part in enumerate(prop_details['allOf']):
- if 'properties' in schema_part:
- for nested_name, nested_details in schema_part['properties'].items():
- nested_type = resolve_type(nested_details, f"{prefix}_allOf_{i}_{nested_name}")
- # Check if required
- required_fields = schema_part.get('required', [])
- is_required = nested_name in required_fields
- nested_params[nested_name] = (nested_type, ... if is_required else None)
-
- # Create composite model
- if nested_params:
- composite_model = create_model(f"{prefix}_CompositeModel", **nested_params)
- return composite_model
- return Dict
-
- # Default to basic types
- schema_type = prop_details.get('type', 'string')
- if isinstance(schema_type, list):
- # Handle multiple types (e.g., ["string", "null"])
- non_null_types = [t for t in schema_type if t != 'null']
- if non_null_types:
- primary_type = type_mapping.get(non_null_types[0], Any)
- if 'null' in schema_type:
- return Optional[primary_type] # type: ignore
- return primary_type
- return Any
-
- return type_mapping.get(schema_type, Any)
+ manager = get_mcp_manager()
+ await manager.setup_mcp_servers(mcp_server_config)
+ return manager
diff --git a/src/webui/components/agent_settings_tab.py b/src/webui/components/agent_settings_tab.py
index a93eb76a..36038cef 100644
--- a/src/webui/components/agent_settings_tab.py
+++ b/src/webui/components/agent_settings_tab.py
@@ -26,7 +26,7 @@ def update_model_dropdown(llm_provider):
async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
"""
- Update the MCP server.
+ Update the MCP server configuration using browser-use MCP implementation.
"""
if hasattr(webui_manager, "bu_controller") and webui_manager.bu_controller:
logger.warning("⚠️ Close controller because mcp file has changed!")
@@ -35,7 +35,7 @@ async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
logger.warning(f"{mcp_file} is not a valid MCP file.")
- return None, gr.update(visible=False)
+ return '', gr.update(visible=False)
with open(mcp_file, 'r') as f:
mcp_server = json.load(f)
@@ -258,12 +258,12 @@ def create_agent_settings_tab(webui_manager: WebuiManager):
)
async def update_wrapper(mcp_file):
- """Wrapper for handle_pause_resume."""
+ """Wrapper for MCP server update."""
update_dict = await update_mcp_server(mcp_file, webui_manager)
- yield update_dict
+ return update_dict
mcp_json_file.change(
- update_wrapper,
+ fn=update_wrapper,
inputs=[mcp_json_file],
outputs=[mcp_server_config, mcp_server_config]
)
diff --git a/src/webui/components/browser_settings_tab.py b/src/webui/components/browser_settings_tab.py
index 77fbfb52..649e1147 100644
--- a/src/webui/components/browser_settings_tab.py
+++ b/src/webui/components/browser_settings_tab.py
@@ -1,5 +1,14 @@
import os
-from distutils.util import strtobool
+# distutils removed in Python 3.13
+def strtobool(val):
+ """Convert a string representation of truth to true or false."""
+ val = val.lower()
+ if val in ('y', 'yes', 't', 'true', 'on', '1'):
+ return 1
+ elif val in ('n', 'no', 'f', 'false', 'off', '0'):
+ return 0
+ else:
+ raise ValueError(f"invalid truth value {val!r}")
import gradio as gr
import logging
from gradio.components import Component
diff --git a/src/webui/components/browser_use_agent_tab.py b/src/webui/components/browser_use_agent_tab.py
index b51a1663..6e34be01 100644
--- a/src/webui/components/browser_use_agent_tab.py
+++ b/src/webui/components/browser_use_agent_tab.py
@@ -12,14 +12,29 @@
AgentHistoryList,
AgentOutput,
)
-from browser_use.browser.browser import BrowserConfig
-from browser_use.browser.context import BrowserContext, BrowserContextConfig
-from browser_use.browser.views import BrowserState
+# Use compatibility layer for browser_use 0.6.0
+from src.browser.browser_compat import (
+ BrowserConfig,
+ BrowserContextConfig,
+ BrowserState,
+ Browser,
+ BrowserContext
+)
+from browser_use import Controller
+from browser_use.browser import BrowserSession
from gradio.components import Component
-from langchain_core.language_models.chat_models import BaseChatModel
-
-from src.agent.browser_use.browser_use_agent import BrowserUseAgent
-from src.browser.custom_browser import CustomBrowser
+# Import browser_use LLM classes
+from browser_use.llm.anthropic.chat import ChatAnthropic
+from browser_use.llm.openai.chat import ChatOpenAI
+from browser_use.llm.ollama.chat import ChatOllama
+from browser_use.llm.google.chat import ChatGoogle
+from browser_use.llm.groq.chat import ChatGroq
+from browser_use.llm.base import BaseChatModel
+
+# Use standard Agent from browser_use 0.6.0 instead of custom BrowserUseAgent
+from browser_use import Agent
+# Custom browser classes need to be updated for browser_use 0.6.0
+# from src.browser.custom_browser import CustomBrowser
from src.controller.custom_controller import CustomController
from src.utils import llm_provider
from src.webui.webui_manager import WebuiManager
@@ -38,25 +53,60 @@ async def _initialize_llm(
api_key: Optional[str],
num_ctx: Optional[int] = None,
) -> Optional[BaseChatModel]:
- """Initializes the LLM based on settings. Returns None if provider/model is missing."""
+ """Initializes the LLM based on settings using browser_use LLM classes. Returns None if provider/model is missing."""
if not provider or not model_name:
logger.info("LLM Provider or Model Name not specified, LLM will be None.")
return None
try:
- # Use your actual LLM provider logic here
logger.info(
f"Initializing LLM: Provider={provider}, Model={model_name}, Temp={temperature}"
)
- # Example using a placeholder function
- llm = llm_provider.get_llm_model(
- provider=provider,
- model_name=model_name,
- temperature=temperature,
- base_url=base_url or None,
- api_key=api_key or None,
- # Add other relevant params like num_ctx for ollama
- num_ctx=num_ctx if provider == "ollama" else None,
- )
+
+ # Use browser_use specific LLM classes
+ if provider == "anthropic":
+ llm = ChatAnthropic(
+ model=model_name,
+ api_key=api_key or os.getenv("ANTHROPIC_API_KEY"),
+ temperature=temperature,
+ base_url=base_url if base_url else None,
+ )
+ elif provider == "openai":
+ llm = ChatOpenAI(
+ model=model_name,
+ api_key=api_key or os.getenv("OPENAI_API_KEY"),
+ temperature=temperature,
+ base_url=base_url if base_url else None,
+ )
+ elif provider == "ollama":
+ llm = ChatOllama(
+ model=model_name,
+ base_url=base_url or "http://localhost:11434",
+ temperature=temperature,
+ )
+ elif provider == "google":
+ llm = ChatGoogle(
+ model=model_name,
+ api_key=api_key or os.getenv("GOOGLE_API_KEY"),
+ temperature=temperature,
+ )
+ elif provider == "groq":
+ llm = ChatGroq(
+ model=model_name,
+ api_key=api_key or os.getenv("GROQ_API_KEY"),
+ temperature=temperature,
+ )
+ else:
+ # Use the centralized LLM provider for all other providers
+ logger.info(f"Using LLM provider for '{provider}'")
+ llm = llm_provider.get_llm_model(
+ provider=provider,
+ model_name=model_name,
+ temperature=temperature,
+ base_url=base_url or None,
+ api_key=api_key or None,
+ num_ctx=num_ctx if provider == "ollama" else None,
+ )
+
return llm
except Exception as e:
logger.error(f"Failed to initialize LLM: {e}", exc_info=True)
@@ -199,12 +249,14 @@ async def _handle_new_step(
def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
"""Callback when the agent finishes the task (success or failure)."""
+ # Get token usage if available
+ total_tokens = history.usage.total_tokens if history.usage else 0
logger.info(
- f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {history.total_input_tokens()}"
+ f"Agent task finished. Duration: {history.total_duration_seconds():.2f}s, Tokens: {total_tokens}"
)
final_summary = "**Task Completed**\n"
final_summary += f"- Duration: {history.total_duration_seconds():.2f} seconds\n"
- final_summary += f"- Total Input Tokens: {history.total_input_tokens()}\n" # Or total tokens if available
+ final_summary += f"- Total Tokens: {total_tokens}\n" # Use total_tokens from usage
final_result = history.final_result()
if final_result:
@@ -222,7 +274,7 @@ def _handle_done(webui_manager: WebuiManager, history: AgentHistoryList):
async def _ask_assistant_callback(
- webui_manager: WebuiManager, query: str, browser_context: BrowserContext
+ webui_manager: WebuiManager, query: str, browser_session: BrowserSession
) -> Dict[str, Any]:
"""Callback triggered by the agent's ask_for_assistant action."""
logger.info("Agent requires assistance. Waiting for user input.")
@@ -350,9 +402,13 @@ def get_setting(key, default=None):
mcp_server_config_str = (
components.get(mcp_server_config_comp) if mcp_server_config_comp else None
)
- mcp_server_config = (
- json.loads(mcp_server_config_str) if mcp_server_config_str else None
- )
+ mcp_server_config = None
+ if mcp_server_config_str:
+ try:
+ mcp_server_config = json.loads(mcp_server_config_str)
+ except json.JSONDecodeError as e:
+ logger.warning(f"Invalid MCP config JSON: {e}. Continuing without MCP.")
+ mcp_server_config = None
# Planner LLM Settings (Optional)
planner_llm_provider_name = get_setting("planner_llm_provider") or None
@@ -422,14 +478,13 @@ def get_browser_setting(key, default=None):
# Pass the webui_manager instance to the callback when wrapping it
async def ask_callback_wrapper(
- query: str, browser_context: BrowserContext
+ query: str, browser_session: BrowserSession
) -> Dict[str, Any]:
- return await _ask_assistant_callback(webui_manager, query, browser_context)
+ return await _ask_assistant_callback(webui_manager, query, browser_session)
if not webui_manager.bu_controller:
- webui_manager.bu_controller = CustomController(
- ask_assistant_callback=ask_callback_wrapper
- )
+ # Set the controller with MCP setup using browser-use
+ webui_manager.bu_controller = CustomController(ask_assistant_callback=ask_callback_wrapper)
await webui_manager.bu_controller.setup_mcp_client(mcp_server_config)
# --- 4. Initialize Browser and Context ---
@@ -461,38 +516,25 @@ async def ask_callback_wrapper(
else:
browser_binary_path = None
- webui_manager.bu_browser = CustomBrowser(
- config=BrowserConfig(
- headless=headless,
- disable_security=disable_security,
- browser_binary_path=browser_binary_path,
- extra_browser_args=extra_args,
- wss_url=wss_url,
- cdp_url=cdp_url,
- new_context_config=BrowserContextConfig(
- window_width=window_w,
- window_height=window_h,
- )
- )
+ # Use BrowserSession for browser_use 0.6.0 with CDP
+ from browser_use.browser.profile import BrowserProfile
+
+ browser_profile = BrowserProfile(
+ headless=headless,
+ viewport={'width': window_w, 'height': window_h}
+ )
+
+ # For CDP-based browser_use, create session
+ webui_manager.bu_browser = BrowserSession(
+ cdp_url=cdp_url if cdp_url else None,
+ browser_profile=browser_profile,
+ is_local=True if not cdp_url else False
)
- # Create Context if needed
+ # In browser_use 0.6.0, context is part of BrowserSession
if not webui_manager.bu_browser_context:
- logger.info("Creating new browser context.")
- context_config = BrowserContextConfig(
- trace_path=save_trace_path if save_trace_path else None,
- save_recording_path=save_recording_path
- if save_recording_path
- else None,
- save_downloads_path=save_download_path if save_download_path else None,
- window_height=window_h,
- window_width=window_w,
- )
- if not webui_manager.bu_browser:
- raise ValueError("Browser not initialized, cannot create context.")
- webui_manager.bu_browser_context = (
- await webui_manager.bu_browser.new_context(config=context_config)
- )
+ logger.info("Browser context is same as browser session in v0.6.0")
+ webui_manager.bu_browser_context = webui_manager.bu_browser
# --- 5. Initialize or Update Agent ---
webui_manager.bu_agent_task_id = str(uuid.uuid4()) # New ID for this task run
@@ -526,33 +568,43 @@ def done_callback_wrapper(history: AgentHistoryList):
raise ValueError(
"Browser or Context not initialized, cannot create agent."
)
- webui_manager.bu_agent = BrowserUseAgent(
+ # Use standard Agent class with browser_use 0.6.0 parameters
+ webui_manager.bu_agent = Agent(
task=task,
llm=main_llm,
- browser=webui_manager.bu_browser,
- browser_context=webui_manager.bu_browser_context,
+ browser_session=webui_manager.bu_browser, # Use browser_session in v0.6.0
controller=webui_manager.bu_controller,
register_new_step_callback=step_callback_wrapper,
register_done_callback=done_callback_wrapper,
use_vision=use_vision,
override_system_message=override_system_prompt,
extend_system_message=extend_system_prompt,
- max_input_tokens=max_input_tokens,
max_actions_per_step=max_actions,
- tool_calling_method=tool_calling_method,
- planner_llm=planner_llm,
- use_vision_for_planner=planner_use_vision if planner_llm else False,
+ # Remove deprecated/incompatible parameters:
+ # max_input_tokens, tool_calling_method, planner_llm, use_vision_for_planner
source="webui",
)
- webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
- webui_manager.bu_agent.settings.generate_gif = gif_path
+ # Note: agent_id and generate_gif may need different handling in 0.6.0
+ # webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+ webui_manager.bu_agent.generate_gif = gif_path
else:
- webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
- webui_manager.bu_agent.add_new_task(task)
- webui_manager.bu_agent.settings.generate_gif = gif_path
- webui_manager.bu_agent.browser = webui_manager.bu_browser
- webui_manager.bu_agent.browser_context = webui_manager.bu_browser_context
- webui_manager.bu_agent.controller = webui_manager.bu_controller
+ # Note: add_new_task may not exist in 0.6.0, need to recreate agent
+ # webui_manager.bu_agent.state.agent_id = webui_manager.bu_agent_task_id
+ # Instead of reusing, create a new agent for new tasks
+ webui_manager.bu_agent = Agent(
+ task=task,
+ llm=main_llm,
+ browser_session=webui_manager.bu_browser,
+ controller=webui_manager.bu_controller,
+ register_new_step_callback=step_callback_wrapper,
+ register_done_callback=done_callback_wrapper,
+ use_vision=use_vision,
+ override_system_message=override_system_prompt,
+ extend_system_message=extend_system_prompt,
+ max_actions_per_step=max_actions,
+ source="webui",
+ )
+ webui_manager.bu_agent.generate_gif = gif_path
# --- 6. Run Agent Task and Stream Updates ---
agent_run_coro = webui_manager.bu_agent.run(max_steps=max_steps)
@@ -561,8 +613,9 @@ def done_callback_wrapper(history: AgentHistoryList):
last_chat_len = len(webui_manager.bu_chat_history)
while not agent_task.done():
- is_paused = webui_manager.bu_agent.state.paused
- is_stopped = webui_manager.bu_agent.state.stopped
+ # Access paused/stopped state from agent.state
+ is_paused = webui_manager.bu_agent.state.paused if hasattr(webui_manager.bu_agent, 'state') else False
+ is_stopped = webui_manager.bu_agent.state.stopped if hasattr(webui_manager.bu_agent, 'state') else False
# Check for pause state
if is_paused:
@@ -911,7 +964,11 @@ async def handle_clear(webui_manager: WebuiManager):
task = webui_manager.bu_current_task
if task and not task.done():
logger.info("Clearing requires stopping the current task.")
- webui_manager.bu_agent.stop()
+ if hasattr(webui_manager.bu_agent, 'stop'):
+ webui_manager.bu_agent.stop()
+ else:
+ # Fallback: set stopped flag directly
+ webui_manager.bu_agent.state.stopped = True
task.cancel()
try:
await asyncio.wait_for(task, timeout=2.0) # Wait briefly
diff --git a/src/webui/components/deep_research_agent_tab.py b/src/webui/components/deep_research_agent_tab.py
index 88faea09..75df3eae 100644
--- a/src/webui/components/deep_research_agent_tab.py
+++ b/src/webui/components/deep_research_agent_tab.py
@@ -15,6 +15,24 @@
logger = logging.getLogger(__name__)
+async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
+ """
+ Update the MCP server configuration for deep research agent.
+ """
+ if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent:
+ logger.warning("⚠️ Close agent because mcp file has changed!")
+ # For deep research agent, we'll handle MCP setup when creating the agent
+
+ if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
+ logger.warning(f"{mcp_file} is not a valid MCP file.")
+ return '', gr.update(visible=False)
+
+ with open(mcp_file, 'r') as f:
+ mcp_server = json.load(f)
+
+ return json.dumps(mcp_server, indent=2), gr.update(visible=True)
+
+
async def _initialize_llm(provider: Optional[str], model_name: Optional[str], temperature: float,
base_url: Optional[str], api_key: Optional[str], num_ctx: Optional[int] = None):
"""Initializes the LLM based on settings. Returns None if provider/model is missing."""
@@ -68,7 +86,7 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon
stop_button_comp = webui_manager.get_component_by_id("deep_research_agent.stop_button")
markdown_display_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_display")
markdown_download_comp = webui_manager.get_component_by_id("deep_research_agent.markdown_download")
- mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config")
+# MCP functionality removed
# --- 1. Get Task and Settings ---
task_topic = components.get(research_task_comp, "").strip()
@@ -81,8 +99,17 @@ async def run_deep_research(webui_manager: WebuiManager, components: Dict[Compon
logger.warning(f"Unsafe base_save_dir detected: {base_save_dir}. Using default directory.")
normalized_base_save_dir = os.path.abspath(safe_root_dir)
base_save_dir = normalized_base_save_dir
+
+ # Get MCP configuration
+ mcp_server_config_comp = webui_manager.get_component_by_id("deep_research_agent.mcp_server_config")
mcp_server_config_str = components.get(mcp_server_config_comp)
- mcp_config = json.loads(mcp_server_config_str) if mcp_server_config_str else None
+ mcp_config = None
+ if mcp_server_config_str:
+ try:
+ mcp_config = json.loads(mcp_server_config_str)
+ except json.JSONDecodeError as e:
+ logger.warning(f"Invalid MCP config JSON: {e}. Continuing without MCP.")
+ mcp_config = None
if not task_topic:
gr.Warning("Please enter a research task.")
@@ -151,16 +178,15 @@ def get_setting(tab: str, key: str, default: Any = None):
webui_manager.dr_agent = DeepResearchAgent(
llm=llm,
browser_config=browser_config_dict,
- mcp_server_config=mcp_config
+ # Note: Deep research agent doesn't directly support MCP in the simplified version
+ # but browser agents within it can use MCP through the controller
)
logger.info("DeepResearchAgent initialized.")
# --- 5. Start Agent Run ---
- agent_run_coro = webui_manager.dr_agent.run(
- topic=task_topic,
- task_id=task_id_to_resume,
- save_dir=base_save_dir,
- max_parallel_browsers=max_parallel_agents
+ agent_run_coro = webui_manager.dr_agent.research(
+ query=task_topic,
+ output_dir=base_save_dir
)
agent_task = asyncio.create_task(agent_run_coro)
webui_manager.dr_current_task = agent_task
@@ -230,20 +256,34 @@ def get_setting(tab: str, key: str, default: Any = None):
# --- 7. Task Finalization ---
logger.info("Agent task processing finished. Awaiting final result...")
- final_result_dict = await agent_task # Get result or raise exception
- logger.info(f"Agent run completed. Result keys: {final_result_dict.keys() if final_result_dict else 'None'}")
+ final_result_path = await agent_task # Get result path or raise exception
+ logger.info(f"Agent run completed. Result path: {final_result_path}")
- # Try to get task ID from result if not known before
- if not running_task_id and final_result_dict and 'task_id' in final_result_dict:
- running_task_id = final_result_dict['task_id']
+ # Try to get task ID from agent's current state if not known before
+ if not running_task_id and webui_manager.dr_agent.current_task_id:
+ running_task_id = webui_manager.dr_agent.current_task_id
webui_manager.dr_task_id = running_task_id
task_specific_dir = os.path.join(base_save_dir, str(running_task_id))
report_file_path = os.path.join(task_specific_dir, "report.md")
- logger.info(f"Task ID confirmed from result: {running_task_id}")
+ logger.info(f"Task ID confirmed from agent state: {running_task_id}")
final_ui_update = {}
- if report_file_path and os.path.exists(report_file_path):
- logger.info(f"Loading final report from: {report_file_path}")
+
+ # Use the returned report path directly
+ if final_result_path and os.path.exists(final_result_path):
+ logger.info(f"Loading final report from returned path: {final_result_path}")
+ report_content = _read_file_safe(final_result_path)
+ if report_content:
+ final_ui_update[markdown_display_comp] = gr.update(value=report_content)
+ final_ui_update[markdown_download_comp] = gr.File(value=final_result_path,
+ label=f"Report ({running_task_id or 'research'}.md)",
+ interactive=True)
+ else:
+ final_ui_update[markdown_display_comp] = gr.update(
+ value="# Research Complete\n\n*Error reading final report file.*")
+ elif report_file_path and os.path.exists(report_file_path):
+ # Fallback to expected report path if direct path doesn't work
+ logger.info(f"Loading final report from expected path: {report_file_path}")
report_content = _read_file_safe(report_file_path)
if report_content:
final_ui_update[markdown_display_comp] = gr.update(value=report_content)
@@ -253,15 +293,8 @@ def get_setting(tab: str, key: str, default: Any = None):
else:
final_ui_update[markdown_display_comp] = gr.update(
value="# Research Complete\n\n*Error reading final report file.*")
- elif final_result_dict and 'report' in final_result_dict:
- logger.info("Using report content directly from agent result.")
- # If agent directly returns report content
- final_ui_update[markdown_display_comp] = gr.update(value=final_result_dict['report'])
- # Cannot offer download if only content is available
- final_ui_update[markdown_download_comp] = gr.update(value=None, label="Download Research Report",
- interactive=False)
else:
- logger.warning("Final report file not found and not in result dict.")
+ logger.warning("Final report file not found at returned path or expected location.")
final_ui_update[markdown_display_comp] = gr.update(value="# Research Complete\n\n*Final report not found.*")
yield final_ui_update
@@ -355,22 +388,7 @@ async def stop_deep_research(webui_manager: WebuiManager) -> Dict[Component, Any
return final_update
-async def update_mcp_server(mcp_file: str, webui_manager: WebuiManager):
- """
- Update the MCP server.
- """
- if hasattr(webui_manager, "dr_agent") and webui_manager.dr_agent:
- logger.warning("⚠️ Close controller because mcp file has changed!")
- await webui_manager.dr_agent.close_mcp_client()
-
- if not mcp_file or not os.path.exists(mcp_file) or not mcp_file.endswith('.json'):
- logger.warning(f"{mcp_file} is not a valid MCP file.")
- return None, gr.update(visible=False)
-
- with open(mcp_file, 'r') as f:
- mcp_server = json.load(f)
-
- return json.dumps(mcp_server, indent=2), gr.update(visible=True)
+# MCP functionality removed
def create_deep_research_agent_tab(webui_manager: WebuiManager):
@@ -381,9 +399,8 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager):
tab_components = {}
with gr.Group():
- with gr.Row():
- mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
- mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
+ mcp_json_file = gr.File(label="MCP server json", interactive=True, file_types=[".json"])
+ mcp_server_config = gr.Textbox(label="MCP server", lines=6, interactive=True, visible=False)
with gr.Group():
research_task = gr.Textbox(label="Research Task", lines=5,
@@ -421,12 +438,12 @@ def create_deep_research_agent_tab(webui_manager: WebuiManager):
webui_manager.init_deep_research_agent()
async def update_wrapper(mcp_file):
- """Wrapper for handle_pause_resume."""
+ """Wrapper for MCP server update."""
update_dict = await update_mcp_server(mcp_file, webui_manager)
- yield update_dict
+ return update_dict
mcp_json_file.change(
- update_wrapper,
+ fn=update_wrapper,
inputs=[mcp_json_file],
outputs=[mcp_server_config, mcp_server_config]
)
diff --git a/src/webui/webui_manager.py b/src/webui/webui_manager.py
index 0a9d5e16..5aadf88e 100644
--- a/src/webui/webui_manager.py
+++ b/src/webui/webui_manager.py
@@ -10,12 +10,8 @@
import time
from gradio.components import Component
-from browser_use.browser.browser import Browser
-from browser_use.browser.context import BrowserContext
-from browser_use.agent.service import Agent
-from src.browser.custom_browser import CustomBrowser
-from src.browser.custom_context import CustomBrowserContext
-from src.controller.custom_controller import CustomController
+from browser_use import BrowserSession, Controller, Agent
+# Custom browser classes removed - using CDP-based browser_use 0.6.0 directly
from src.agent.deep_research.deep_research_agent import DeepResearchAgent
@@ -32,9 +28,9 @@ def init_browser_use_agent(self) -> None:
init browser use agent
"""
self.bu_agent: Optional[Agent] = None
- self.bu_browser: Optional[CustomBrowser] = None
- self.bu_browser_context: Optional[CustomBrowserContext] = None
- self.bu_controller: Optional[CustomController] = None
+ self.bu_browser: Optional[BrowserSession] = None
+ self.bu_browser_context: Optional[BrowserSession] = None # In v0.6.0, context is merged with session
+ self.bu_controller: Optional[Controller] = None
self.bu_chat_history: List[Dict[str, Optional[str]]] = []
self.bu_response_event: Optional[asyncio.Event] = None
self.bu_user_help_response: Optional[str] = None
diff --git a/supervisord.conf b/supervisord.conf
index 60107669..f3d89a4d 100644
--- a/supervisord.conf
+++ b/supervisord.conf
@@ -65,9 +65,20 @@ startretries=5
startsecs=3
depends_on=x11vnc
+[program:dbus]
+command=dbus-daemon --system --nofork
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+priority=50
+startsecs=1
+
[program:webui]
command=python webui.py --ip 0.0.0.0 --port 7788
directory=/app
+environment=DISPLAY=":99",CHROME_BIN="/usr/bin/chromium"
autorestart=true
stdout_logfile=/dev/stdout
stdout_logfile_maxbytes=0
@@ -77,4 +88,19 @@ priority=400
startretries=3
startsecs=3
stopsignal=TERM
+stopwaitsecs=10
+
+[program:mcp_server]
+command=python -m browser_use.mcp.server --http --port 3000
+directory=/app
+environment=DISPLAY=":99",CHROME_BIN="/usr/bin/chromium"
+autorestart=true
+stdout_logfile=/dev/stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr
+stderr_logfile_maxbytes=0
+priority=450
+startretries=3
+startsecs=3
+stopsignal=TERM
stopwaitsecs=10
\ No newline at end of file
diff --git a/tests/test_agents.py b/tests/test_agents.py
index a36561e4..c4efd3ea 100644
--- a/tests/test_agents.py
+++ b/tests/test_agents.py
@@ -93,7 +93,7 @@ async def test_browser_use_agent():
}
}
controller = CustomController()
- await controller.setup_mcp_client(mcp_server_config)
+# MCP setup removed
use_own_browser = True
use_vision = True # Set to False when using DeepSeek
@@ -160,7 +160,7 @@ async def test_browser_use_agent():
if browser:
await browser.close()
if controller:
- await controller.close_mcp_client()
+# MCP cleanup removed
async def test_browser_use_parallel():
@@ -249,7 +249,7 @@ async def test_browser_use_parallel():
}
}
controller = CustomController()
- await controller.setup_mcp_client(mcp_server_config)
+# MCP setup removed
use_own_browser = True
use_vision = True # Set to False when using DeepSeek
@@ -323,7 +323,7 @@ async def test_browser_use_parallel():
if browser:
await browser.close()
if controller:
- await controller.close_mcp_client()
+# MCP cleanup removed
async def test_deep_research_agent():
@@ -340,20 +340,9 @@ async def test_deep_research_agent():
# provider="bedrock",
# )
- mcp_server_config = {
- "mcpServers": {
- "desktop-commander": {
- "command": "npx",
- "args": [
- "-y",
- "@wonderwhy-er/desktop-commander"
- ]
- },
- }
- }
-
+# MCP server config removed
browser_config = {"headless": False, "window_width": 1280, "window_height": 1100, "use_own_browser": False}
- agent = DeepResearchAgent(llm=llm, browser_config=browser_config, mcp_server_config=mcp_server_config)
+ agent = DeepResearchAgent(llm=llm, browser_config=browser_config)
research_topic = "Give me investment advices of nvidia and tesla."
task_id_to_resume = "" # Set this to resume a previous task ID
diff --git a/tests/test_controller.py b/tests/test_controller.py
index 173bae44..e7257015 100644
--- a/tests/test_controller.py
+++ b/tests/test_controller.py
@@ -1,131 +1,10 @@
-import asyncio
-import pdb
-import sys
-import time
+# Test file for controller functionality
-sys.path.append(".")
-from dotenv import load_dotenv
+# MCP functionality has been removed from the application.
+# Controller tests related to MCP have been removed since the functionality no longer exists.
+# This file is kept for potential future controller tests.
-load_dotenv()
-
-
-async def test_mcp_client():
- from src.utils.mcp_client import setup_mcp_client_and_tools, create_tool_param_model
-
- test_server_config = {
- "mcpServers": {
- # "markitdown": {
- # "command": "docker",
- # "args": [
- # "run",
- # "--rm",
- # "-i",
- # "markitdown-mcp:latest"
- # ]
- # },
- "desktop-commander": {
- "command": "npx",
- "args": [
- "-y",
- "@wonderwhy-er/desktop-commander"
- ]
- },
- # "filesystem": {
- # "command": "npx",
- # "args": [
- # "-y",
- # "@modelcontextprotocol/server-filesystem",
- # "/Users/xxx/ai_workspace",
- # ]
- # },
- }
- }
-
- mcp_tools, mcp_client = await setup_mcp_client_and_tools(test_server_config)
-
- for tool in mcp_tools:
- tool_param_model = create_tool_param_model(tool)
- print(tool.name)
- print(tool.description)
- print(tool_param_model.model_json_schema())
- pdb.set_trace()
-
-
-async def test_controller_with_mcp():
- import os
- from src.controller.custom_controller import CustomController
- from browser_use.controller.registry.views import ActionModel
-
- mcp_server_config = {
- "mcpServers": {
- # "markitdown": {
- # "command": "docker",
- # "args": [
- # "run",
- # "--rm",
- # "-i",
- # "markitdown-mcp:latest"
- # ]
- # },
- "desktop-commander": {
- "command": "npx",
- "args": [
- "-y",
- "@wonderwhy-er/desktop-commander"
- ]
- },
- # "filesystem": {
- # "command": "npx",
- # "args": [
- # "-y",
- # "@modelcontextprotocol/server-filesystem",
- # "/Users/xxx/ai_workspace",
- # ]
- # },
- }
- }
-
- controller = CustomController()
- await controller.setup_mcp_client(mcp_server_config)
- action_name = "mcp.desktop-commander.execute_command"
- action_info = controller.registry.registry.actions[action_name]
- param_model = action_info.param_model
- print(param_model.model_json_schema())
- params = {"command": f"python ./tmp/test.py"
- }
- validated_params = param_model(**params)
- ActionModel_ = controller.registry.create_action_model()
- # Create ActionModel instance with the validated parameters
- action_model = ActionModel_(**{action_name: validated_params})
- result = await controller.act(action_model)
- result = result.extracted_content
- print(result)
- if result and "Command is still running. Use read_output to get more output." in result and "PID" in \
- result.split("\n")[0]:
- pid = int(result.split("\n")[0].split("PID")[-1].strip())
- action_name = "mcp.desktop-commander.read_output"
- action_info = controller.registry.registry.actions[action_name]
- param_model = action_info.param_model
- print(param_model.model_json_schema())
- params = {"pid": pid}
- validated_params = param_model(**params)
- action_model = ActionModel_(**{action_name: validated_params})
- output_result = ""
- while True:
- time.sleep(1)
- result = await controller.act(action_model)
- result = result.extracted_content
- if result:
- pdb.set_trace()
- output_result = result
- break
- print(output_result)
- pdb.set_trace()
- await controller.close_mcp_client()
- pdb.set_trace()
-
-
-if __name__ == '__main__':
- # asyncio.run(test_mcp_client())
- asyncio.run(test_controller_with_mcp())
+def test_placeholder():
+ """Placeholder test to maintain test file structure."""
+ pass
diff --git a/tests/test_llm_api.py b/tests/test_llm_api.py
index 938f8256..6e865684 100644
--- a/tests/test_llm_api.py
+++ b/tests/test_llm_api.py
@@ -3,8 +3,8 @@
from dataclasses import dataclass
from dotenv import load_dotenv
-from langchain_core.messages import HumanMessage, SystemMessage
-from langchain_ollama import ChatOllama
+from browser_use.llm import UserMessage, SystemMessage, BaseChatModel
+from src.utils import llm_provider
load_dotenv()
@@ -55,21 +55,7 @@ def get_env_value(key, provider):
def test_llm(config, query, image_path=None, system_message=None):
from src.utils import utils, llm_provider
- # Special handling for Ollama-based models
- if config.provider == "ollama":
- if "deepseek-r1" in config.model_name:
- from src.utils.llm_provider import DeepSeekR1ChatOllama
- llm = DeepSeekR1ChatOllama(model=config.model_name)
- else:
- llm = ChatOllama(model=config.model_name)
-
- ai_msg = llm.invoke(query)
- print(ai_msg.content)
- if "deepseek-r1" in config.model_name:
- pdb.set_trace()
- return
-
- # For other providers, use the standard configuration
+ # Use the unified LLM provider for all models
llm = llm_provider.get_llm_model(
provider=config.provider,
model_name=config.model_name,
@@ -78,17 +64,24 @@ def test_llm(config, query, image_path=None, system_message=None):
api_key=config.api_key or get_env_value("api_key", config.provider)
)
- # Prepare messages for non-Ollama models
+ # Prepare messages using browser-use message types
messages = []
if system_message:
- messages.append(SystemMessage(content=create_message_content(system_message)))
- messages.append(HumanMessage(content=create_message_content(query, image_path)))
+ messages.append(SystemMessage(content=system_message))
+
+ # Use create_message_content to handle both text and image content
+ user_content = create_message_content(query, image_path) if image_path else query
+ messages.append(UserMessage(content=user_content))
+
+ # Call the LLM
ai_msg = llm.invoke(messages)
-
+
# Handle different response types
if hasattr(ai_msg, "reasoning_content"):
print(ai_msg.reasoning_content)
- print(ai_msg.content)
+
+ content = ai_msg.content if hasattr(ai_msg, 'content') else str(ai_msg)
+ print(content)
def test_openai_model():
config = LLMConfig(provider="openai", model_name="gpt-4o")
diff --git a/tests/test_playwright.py b/tests/test_playwright.py
deleted file mode 100644
index 6704a02a..00000000
--- a/tests/test_playwright.py
+++ /dev/null
@@ -1,31 +0,0 @@
-import pdb
-from dotenv import load_dotenv
-
-load_dotenv()
-
-
-def test_connect_browser():
- import os
- from playwright.sync_api import sync_playwright
-
- chrome_exe = os.getenv("CHROME_PATH", "")
- chrome_use_data = os.getenv("CHROME_USER_DATA", "")
-
- with sync_playwright() as p:
- browser = p.chromium.launch_persistent_context(
- user_data_dir=chrome_use_data,
- executable_path=chrome_exe,
- headless=False # Keep browser window visible
- )
-
- page = browser.new_page()
- page.goto("https://mail.google.com/mail/u/0/#inbox")
- page.wait_for_load_state()
-
- input("Press the Enter key to close the browser...")
-
- browser.close()
-
-
-if __name__ == '__main__':
- test_connect_browser()