Commit 86b0afc

Merge pull request #41 from TencentCloudADP/Lightblues/issue36
Feat: customize search & crawl backend
2 parents ad6b9ff + 6ff6ae1 commit 86b0afc

16 files changed (+449 −86 lines)

configs/agents/tools/search.yaml

Lines changed: 11 additions & 2 deletions

@@ -2,9 +2,18 @@ name: search
 mode: builtin
 activated_tools: null
 config:
+  # search config
+  # - `JINA_API_KEY` is required for jina. Ref: https://jina.ai/
+  # - `SERPER_API_KEY` is required for google. Ref: https://serper.dev/
+  search_engine: google # google | jina | baidu | duckduckgo
+  search_params: {"gl": "cn", "hl": "zh-cn"} # search params for google & jina
+  search_banned_sites: []
+  # crawl config
+  # - `JINA_API_KEY` is required for jina
+  # - `crawl4ai` and `playwright` should be installed for crawl4ai. Ref: https://github.com/unclecode/crawl4ai
+  crawl_engine: jina # jina | crawl4ai
+  # llm config used in web_qa
   summary_token_limit: 10_000
-  SERPER_API_KEY: ${oc.env:SERPER_API_KEY}
-  JINA_API_KEY: ${oc.env:JINA_API_KEY}
   config_llm:
     model_provider:
       type: ${oc.env:UTU_LLM_TYPE}
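For reference, a minimal sketch of exercising a toolkit configured as above, following the pattern in tests/tools/test_search_toolkit.py further down in this diff; constructing SearchToolkit directly from the loaded config is an assumption, and the query string is only an example:

    import asyncio

    from utu.config import ConfigLoader
    from utu.tools import SearchToolkit

    async def main():
        # loads configs/agents/tools/search.yaml; search_engine / crawl_engine come from the block above
        config = ConfigLoader.load_toolkit_config("search")
        toolkit = SearchToolkit(config)
        # unified `search` entry point (renamed from `search_google_api` in this PR)
        print(await toolkit.search("Youtu-agent framework", num_results=10))

    asyncio.run(main())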

configs/eval/ww.yaml

Lines changed: 6 additions & 2 deletions

@@ -66,9 +66,13 @@ agent:
     mode: builtin
     activated_tools: null
     config:
+      search_engine: google # google | jina | baidu | duckduckgo
+      search_params: {"gl": "cn", "hl": "zh-cn"} # search params for google & jina
+      # https://huggingface.co/datasets/callanwu/WebWalkerQA
+      # https://huggingface.co/spaces/dobval/WebThinker
+      search_banned_sites: ["https://huggingface.co/", "https://grok.com/share/", "https://modelscope.cn/datasets/"]
+      crawl_engine: jina # jina | crawl4ai
       summary_token_limit: 10_000
-      SERPER_API_KEY: ${oc.env:SERPER_API_KEY}
-      JINA_API_KEY: ${oc.env:JINA_API_KEY}
       config_llm:
         model_provider:
           type: ${oc.env:UTU_LLM_TYPE}

docs/tools.md

Lines changed: 1 addition & 1 deletion

@@ -16,7 +16,7 @@ Here is a summary of some key toolkits available in the framework:

 | Toolkit Class | Provided Tools (Functions) | Core Functionality & Mechanism |
 | :--- | :--- | :--- |
-| **[SearchToolkit][utu.tools.search_toolkit.SearchToolkit]** | `search_google_api`, `web_qa` | Performs web searches using the Serper API and reads webpage content using the Jina API. It can use an LLM to answer questions based on page content. |
+| **[SearchToolkit][utu.tools.search_toolkit.SearchToolkit]** | `search`, `web_qa` | Performs web searches using the Serper API and reads webpage content using the Jina API. It can use an LLM to answer questions based on page content. |
 | **[DocumentToolkit][utu.tools.document_toolkit.DocumentToolkit]** | `document_qa` | Processes local or remote documents (PDF, DOCX, etc.). It uses the `chunkr.ai` service to parse the document and an LLM to answer questions or provide a summary. |
 | **[PythonExecutorToolkit][utu.tools.python_executor_toolkit.PythonExecutorToolkit]** | `execute_python_code` | Executes Python code snippets in an isolated environment using `IPython.core.interactiveshell`. It runs in a separate thread to prevent blocking and can capture outputs, errors, and even `matplotlib` plots. |
 | **[BashToolkit][utu.tools.bash_toolkit.BashToolkit]** | `run_bash` | Provides a persistent local shell session using the `pexpect` library. This allows the agent to run a series of commands that maintain state (e.g., current directory). |
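A short sketch of the two tools named in the updated row, using query/URL pairs taken from tests/tools/test_search_toolkit.py; the exact `web_qa` signature is inferred from the (url, question) tuples in that test file and may differ in the actual implementation:

    # inside an async context, with `search_toolkit` built as in the tests
    snippets = await search_toolkit.search("python-dotenv usage", num_results=5)
    answer = await search_toolkit.web_qa("https://github.com/theskumar/python-dotenv", "Summary this page")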

examples/wide_research/prompts.yaml

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@ planner: |

   You should obey the following workflow:
   1. **Clarify the user query**: investigate the user's query carefully, and figure out the subtasks.
-     - Use the "search_google_api" when you need to gather background information from the web.
+     - Use the "search" when you need to gather background information from the web.
      - The returned snippet is very simple, so use the "web_qa" to collect detailed information from a specific webpage.
   2. **Collect information parallelly**:
      - use the "search_wide" tool to collect structured information from the web.

tests/models/test_react_standalone.py

Lines changed: 4 additions & 4 deletions

@@ -34,10 +34,10 @@
             },
         },
     },
-    "search_google_api": {
+    "search": {
         "type": "function",
         "function": {
-            "name": "search_google_api",
+            "name": "search",
             "description": "Search the query via Google api, the query should be a search query like humans search in Google, concrete and not vague or super long. More the single most important items.",  # pylint: disable=line-too-long
             "parameters": {
                 "type": "object",
@@ -86,7 +86,7 @@
         {
             "id": "0",
             "type": "function",
-            "function": {"name": "search_google_api", "arguments": str({"query": "smolagents package"})},
+            "function": {"name": "search", "arguments": str({"query": "smolagents package"})},
         }
     ],
@@ -118,7 +118,7 @@
             ),
         },
     ],
-    "tools": [tools["search_google_api"], tools["web_qa"]],
+    "tools": [tools["search"], tools["web_qa"]],
     }
 ]

tests/tools/test_search_toolkit.py

Lines changed: 44 additions & 11 deletions

@@ -1,12 +1,43 @@
-import hashlib
 import json

 import pytest

 from utu.config import ConfigLoader
 from utu.tools import SearchToolkit
+from utu.tools.search.baidu_search import BaiduSearch
+from utu.tools.search.crawl4ai_crawl import Crawl4aiCrawl
+from utu.tools.search.duckduckgo_search import DuckDuckGoSearch
+from utu.tools.search.google_search import GoogleSearch
+from utu.tools.search.jina_crawl import JinaCrawl
+from utu.tools.search.jina_search import JinaSearch
+
+
+# ----------------------------------------------------------------------------
+async def test_baidu_search():
+    baidu_search = BaiduSearch()
+    result = await baidu_search.search_baidu("上海天气")
+    print(result)
+
+
+async def test_google_search():
+    google_search = GoogleSearch()
+    result = await google_search.search_google("上海天气")
+    print(result)
+

+async def test_jina_search():
+    jina_search = JinaSearch()
+    result = await jina_search.search_jina("明天上海天气")
+    print(result)

+
+async def test_duckduckgo_search():
+    duckduckgo_search = DuckDuckGoSearch()
+    result = await duckduckgo_search.search_duckduckgo("明天上海天气")
+    print(result)
+
+
+# ----------------------------------------------------------------------------
 @pytest.fixture
 def search_toolkit() -> SearchToolkit:
     config = ConfigLoader.load_toolkit_config("search")
@@ -25,28 +56,30 @@ async def test_tool_schema(search_toolkit: SearchToolkit):
 TEST_QUERY = "南京工业大学计算机与信息工程学院 更名 报道"


-async def test_search_google_api(search_toolkit: SearchToolkit):
-    result = await search_toolkit.search_google_api(TEST_QUERY, num_results=10)
+async def test_search(search_toolkit: SearchToolkit):
+    result = await search_toolkit.search(TEST_QUERY, num_results=10)
     print(result)


+# ----------------------------------------------------------------------------
 TEST_URL = "https://docs.crawl4ai.com/core/simple-crawling/"


-async def test_get_content(search_toolkit: SearchToolkit):
-    result = await search_toolkit.get_content(TEST_URL)
+async def test_jina_crawl():
+    jina_crawl = JinaCrawl()
+    result = await jina_crawl.crawl(TEST_URL)
     print(result)


-async def test_cache(search_toolkit: SearchToolkit):
-    for _ in range(2):
-        res = await search_toolkit.get_content(TEST_URL)
-        hash = hashlib.md5(res.encode()).hexdigest()
-        print(hash)
+async def test_crawl4ai_crawl():
+    crawl4ai_crawl = Crawl4aiCrawl()
+    result = await crawl4ai_crawl.crawl(TEST_URL)
+    print(result)


+# ----------------------------------------------------------------------------
 queries = (
-    ("https://docs.crawl4ai.com/core/simple-crawling/", ""),
+    ("https://github.com/TencentCloudADP/Youtu-agent", ""),
     ("https://docs.crawl4ai.com/core/simple-crawling/", "How to log?"),
     ("https://github.com/theskumar/python-dotenv", "Summary this page"),
 )

utu/tools/search/baidu_search.py

Lines changed: 84 additions & 0 deletions

@@ -0,0 +1,84 @@
+import aiohttp
+from bs4 import BeautifulSoup
+
+from ...utils import get_logger
+from ..utils import ContentFilter
+
+logger = get_logger(__name__)
+
+
+class BaiduSearch:
+    """Baidu Search."""
+
+    def __init__(self, config: dict = None) -> None:
+        self.url = "https://www.baidu.com/s"
+        self.headers = {
+            "User-Agent": (
+                "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
+                "AppleWebKit/537.36 (KHTML, like Gecko) "
+                "Chrome/120.0.0.0 Safari/537.36"
+            ),
+            "Referer": "https://www.baidu.com",
+        }
+        config = config or {}
+        search_banned_sites = config.get("search_banned_sites", [])
+        self.content_filter = ContentFilter(search_banned_sites) if search_banned_sites else None
+
+    async def search(self, query: str, num_results: int = 5) -> str:
+        """standard search interface."""
+        res = await self.search_baidu(query)
+        # filter
+        if self.content_filter:
+            results = self.content_filter.filter_results(res["data"], num_results, key="url")
+        else:
+            results = res["data"][:num_results]
+        # format
+        formatted_results = []
+        for i, r in enumerate(results, 1):
+            formatted_results.append(f"{i}. {r['title']} ({r['url']})")
+            if "description" in r:
+                formatted_results[-1] += f"\ndescription: {r['description']}"
+        msg = "\n".join(formatted_results)
+        return msg
+
+    # @async_file_cache(expire_time=None)
+    async def search_baidu(self, query: str) -> dict:
+        """Search Baidu using web scraping to retrieve relevant search results.
+
+        - WARNING: Uses web scraping which may be subject to rate limiting or anti-bot measures.
+
+        Returns:
+            Example result:
+            {
+                'result_id': 1,
+                'title': '百度百科',
+                'description': '百度百科是一部内容开放、自由的网络百科全书...',
+                'url': 'https://baike.baidu.com/'
+            }
+        """
+        params = {"wd": query, "rn": "20"}
+        async with aiohttp.ClientSession() as session:
+            async with session.get(self.url, headers=self.headers, params=params) as response:
+                response.raise_for_status()  # avoid cache error!
+                results = await response.text(encoding="utf-8")
+
+        soup = BeautifulSoup(results, "html.parser")
+        results = []
+        for idx, item in enumerate(soup.select(".result"), 1):
+            title_element = item.select_one("h3 > a")
+            title = title_element.get_text(strip=True) if title_element else ""
+            link = title_element["href"] if title_element else ""
+            desc_element = item.select_one(".c-abstract, .c-span-last")
+            desc = desc_element.get_text(strip=True) if desc_element else ""
+
+            results.append(
+                {
+                    "result_id": idx,
+                    "title": title,
+                    "description": desc,
+                    "url": link,
+                }
+            )
+        if len(results) == 0:
+            logger.warning(f"No results found from Baidu search: {query}")
+        return {"data": results}
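A quick usage sketch for the new backend, mirroring test_baidu_search in the test diff above; the query is just an example, and results depend on Baidu's live HTML:

    import asyncio

    from utu.tools.search.baidu_search import BaiduSearch

    async def main():
        baidu = BaiduSearch()
        # raw scraped results: {"data": [{"result_id", "title", "description", "url"}, ...]}
        print(await baidu.search_baidu("上海天气"))
        # formatted top-5 string via the standard `search` interface (banned-site filtering applies if configured)
        print(await baidu.search("上海天气", num_results=5))

    asyncio.run(main())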

utu/tools/search/crawl4ai_crawl.py

Lines changed: 32 additions & 0 deletions

@@ -0,0 +1,32 @@
+try:
+    from crawl4ai import AsyncWebCrawler
+except ImportError as e:
+    raise ImportError(
+        "Please install crawl4ai: `uv pip install crawl4ai && python -m playwright install --with-deps chromium`"
+    ) from e  # noqa: E501
+from ...utils import async_file_cache, get_logger
+
+logger = get_logger(__name__)
+
+
+class Crawl4aiCrawl:
+    """Crawl4ai Crawl.
+
+    - repo: https://github.com/unclecode/crawl4ai
+    """
+
+    def __init__(self, config: dict = None) -> None:
+        config = config or {}
+
+    async def crawl(self, url: str) -> str:
+        """standard crawl interface."""
+        return await self.crawl_crawl4ai(url)
+
+    @async_file_cache(expire_time=None)
+    async def crawl_crawl4ai(self, url: str) -> str:
+        # Get the content of the url
+        async with AsyncWebCrawler() as crawler:
+            result = await crawler.arun(
+                url=url,
+            )
+        return result.markdown
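A usage sketch for the crawl4ai backend (crawl4ai and the Playwright Chromium browser must be installed, as the ImportError message above notes); the URL is the one used in the tests:

    import asyncio

    from utu.tools.search.crawl4ai_crawl import Crawl4aiCrawl

    async def main():
        crawler = Crawl4aiCrawl()
        # returns the page rendered as markdown; results are cached on disk via async_file_cache
        markdown = await crawler.crawl("https://docs.crawl4ai.com/core/simple-crawling/")
        print(markdown[:500])

    asyncio.run(main())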

utu/tools/search/duckduckgo_search.py

Lines changed: 51 additions & 0 deletions

@@ -0,0 +1,51 @@
+try:
+    from ddgs import DDGS
+except ImportError as e:
+    raise ImportError("Please install ddgs first: `uv pip install ddgs`") from e
+from ...utils import get_logger
+from ..utils import ContentFilter
+
+logger = get_logger(__name__)
+
+
+class DuckDuckGoSearch:
+    """DuckDuckGo Search.
+
+    - repo: https://github.com/deedy5/ddgs
+    """
+
+    def __init__(self, config: dict = None) -> None:
+        self.ddgs = DDGS()
+        config = config or {}
+        search_banned_sites = config.get("search_banned_sites", [])
+        self.content_filter = ContentFilter(search_banned_sites) if search_banned_sites else None
+
+    async def search(self, query: str, num_results: int = 5) -> str:
+        """standard search interface."""
+        res = await self.search_duckduckgo(query)
+        # filter
+        if self.content_filter:
+            results = self.content_filter.filter_results(res, num_results, key="href")
+        else:
+            results = res[:num_results]
+        # format
+        formatted_results = []
+        for i, r in enumerate(results, 1):
+            formatted_results.append(f"{i}. {r['title']} ({r['href']})")
+            if "body" in r:
+                formatted_results[-1] += f"\nbody: {r['body']}"
+        msg = "\n".join(formatted_results)
+        return msg
+
+    async def search_duckduckgo(self, query: str) -> list:
+        """Use DuckDuckGo search engine to search for information on the given query.
+
+        Returns:
+            [{
+                "title": ...
+                "href": ...
+                "body": ...
+            }]
+        """
+        results = self.ddgs.text(query, max_results=100)
+        return results
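A usage sketch (the ddgs package must be installed, per the ImportError above); the banned-site entry is purely illustrative:

    import asyncio

    from utu.tools.search.duckduckgo_search import DuckDuckGoSearch

    async def main():
        ddg = DuckDuckGoSearch({"search_banned_sites": ["https://example.com/"]})
        # formatted, filtered top-5 results; search_duckduckgo() returns the raw list of result dicts
        print(await ddg.search("明天上海天气", num_results=5))

    asyncio.run(main())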

utu/tools/search/google_search.py

Lines changed: 50 additions & 0 deletions

@@ -0,0 +1,50 @@
+import aiohttp
+
+from ...utils import EnvUtils, async_file_cache, get_logger
+from ..utils import ContentFilter
+
+logger = get_logger(__name__)
+
+
+class GoogleSearch:
+    """Google Search.
+
+    - API key: `SERPER_API_KEY`
+    """
+
+    def __init__(self, config: dict = None) -> None:
+        self.serper_url = r"https://google.serper.dev/search"
+        self.serper_header = {"X-API-KEY": EnvUtils.get_env("SERPER_API_KEY"), "Content-Type": "application/json"}
+        config = config or {}
+        self.search_params = config.get("search_params", {})
+        search_banned_sites = config.get("search_banned_sites", [])
+        self.content_filter = ContentFilter(search_banned_sites) if search_banned_sites else None
+
+    async def search(self, query: str, num_results: int = 5) -> str:
+        """standard search interface."""
+        res = await self.search_google(query)
+        # filter
+        if self.content_filter:
+            results = self.content_filter.filter_results(res["organic"], num_results)
+        else:
+            results = res["organic"][:num_results]
+        # format
+        formatted_results = []
+        for i, r in enumerate(results, 1):
+            formatted_results.append(f"{i}. {r['title']} ({r['link']})")
+            if "snippet" in r:
+                formatted_results[-1] += f"\nsnippet: {r['snippet']}"
+            if "sitelinks" in r:
+                formatted_results[-1] += f"\nsitelinks: {r['sitelinks']}"
+        msg = "\n".join(formatted_results)
+        return msg
+
+    @async_file_cache(expire_time=None)
+    async def search_google(self, query: str) -> dict:
+        """Call the serper.dev API and cache the results."""
+        params = {"q": query, **self.search_params, "num": 100}
+        async with aiohttp.ClientSession() as session:
+            async with session.post(self.serper_url, headers=self.serper_header, json=params) as response:
+                response.raise_for_status()  # avoid cache error!
+                results = await response.json()
+        return results
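A usage sketch for the serper.dev backend (SERPER_API_KEY must be set in the environment, per the class docstring); the search_params value mirrors the YAML defaults shown earlier, and the query is only an example:

    import asyncio

    from utu.tools.search.google_search import GoogleSearch

    async def main():
        google = GoogleSearch({"search_params": {"gl": "cn", "hl": "zh-cn"}})
        # formatted top-5 results; search_google() returns the raw serper.dev JSON (cached on disk)
        print(await google.search("Youtu-agent framework", num_results=5))

    asyncio.run(main())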
