Skip to content

Commit d3eda6a

Browse files
committed
add miroapi support
1 parent 9ccab07 commit d3eda6a

File tree

6 files changed

+361
-18
lines changed

6 files changed

+361
-18
lines changed

docs/mkdocs/docs/miro_api.md

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
# MiroAPI
2+
3+
!!! warning "Preview Documentation"
4+
This service is currently in preview and limited to internal access. Public release will follow once it is production-ready.
5+
6+
## Overview
7+
MiroAPI provides an internal caching layer for Serper Search and Jina Scrape to reduce costs, speed up development, and enable reproducible "go-back-in-time" sandbox runs by serving recorded results when available.
8+
9+
### Step 1: Apply for a MiroAPI key
10+
Request a MiroAPI key through the internal portal.
11+
12+
### Step 2: Configure .env
13+
```bash
14+
# API for Google Search (recommended)
15+
SERPER_API_KEY="svc-miro-api01-replace-with-your-key"
16+
SERPER_BASE_URL="https://miro-api.miromind.site/serper"
17+
18+
# API for Web Scraping (recommended)
19+
JINA_API_KEY="svc-miro-api01-replace-with-your-key"
20+
JINA_BASE_URL="https://miro-api.miromind.site/jina"
21+
```
22+
23+
24+

docs/mkdocs/mkdocs.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,7 @@ nav:
7474
- tool-python: tool_python.md
7575
- Advanced Features:
7676
- E2B Advanced Features: e2b_advanced_features.md
77+
- MiroAPI: miro_api.md
7778
- Add New Tools: contribute_tools.md
7879

7980
- LLM Clients:
Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
# Copyright 2025 Miromind.ai
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
15+
"""
16+
adapted from
17+
https://github.com/MiroMindAI/MiroRL/blob/5073693549ffe05a157a1886e87650ef3be6606e/mirorl/tools/serper_search.py#L1
18+
"""
19+
20+
import os
21+
from typing import Any, Dict
22+
23+
import requests
24+
from mcp.server.fastmcp import FastMCP
25+
from tenacity import (
26+
retry,
27+
retry_if_exception_type,
28+
stop_after_attempt,
29+
wait_exponential,
30+
)
31+
32+
from .utils.url_unquote import decode_http_urls_in_dict
33+
34+
# Serper endpoint; SERPER_BASE_URL may be pointed at the internal MiroAPI
# caching proxy instead of the public https://google.serper.dev service.
SERPER_BASE_URL = os.getenv("SERPER_BASE_URL", "https://google.serper.dev")
# API key for Serper; empty string means "not configured" (validated per call).
SERPER_API_KEY = os.getenv("SERPER_API_KEY", "")


# Initialize FastMCP server
mcp = FastMCP("serper-mcp-server")
40+
41+
42+
@retry(
    stop=stop_after_attempt(3),
    wait=wait_exponential(multiplier=1, min=4, max=10),
    retry=retry_if_exception_type(
        (requests.ConnectionError, requests.Timeout, requests.HTTPError)
    ),
)
def make_serper_request(
    payload: Dict[str, Any], headers: Dict[str, str], timeout: float = 120.0
) -> requests.Response:
    """Make an HTTP POST to the Serper ``/search`` endpoint with retry logic.

    Retries up to 3 times with exponential backoff on connection errors,
    timeouts, and HTTP error statuses (surfaced via ``raise_for_status``).

    Args:
        payload: JSON body of the search request (query plus options).
        headers: Request headers; must include ``X-API-KEY``.
        timeout: Per-request timeout in seconds. Without it, a stalled
            connection would hang forever and the ``requests.Timeout``
            retry branch above could never trigger. Backward-compatible
            keyword with a generous default.

    Returns:
        The successful ``requests.Response``.

    Raises:
        requests.HTTPError: If the final attempt returns a non-2xx status.
        requests.ConnectionError | requests.Timeout: If all retries fail.
    """
    response = requests.post(
        f"{SERPER_BASE_URL}/search", json=payload, headers=headers, timeout=timeout
    )
    response.raise_for_status()
    return response
56+
57+
58+
def _is_huggingface_dataset_or_space_url(url):
59+
"""
60+
Check if the URL is a HuggingFace dataset or space URL.
61+
:param url: The URL to check
62+
:return: True if it's a HuggingFace dataset or space URL, False otherwise
63+
"""
64+
if not url:
65+
return False
66+
return "huggingface.co/datasets" in url or "huggingface.co/spaces" in url
67+
68+
69+
@mcp.tool()
def google_search(
    q: str,
    gl: str = "us",
    hl: str = "en",
    location: str | None = None,
    num: int | None = None,
    tbs: str | None = None,
    page: int | None = None,
    autocorrect: bool | None = None,
) -> Dict[str, Any]:
    """
    Tool to perform web searches via Serper API and retrieve rich results.

    It is able to retrieve organic search results, people also ask,
    related searches, and knowledge graph.

    Args:
        q: Search query string
        gl: Optional region code for search results in ISO 3166-1 alpha-2 format (e.g., 'us')
        hl: Optional language code for search results in ISO 639-1 format (e.g., 'en')
        location: Optional location for search results (e.g., 'SoHo, New York, United States', 'California, United States')
        num: Number of results to return (default: 10)
        tbs: Time-based search filter ('qdr:h' for past hour, 'qdr:d' for past day, 'qdr:w' for past week,
             'qdr:m' for past month, 'qdr:y' for past year)
        page: Page number of results to return (default: 1)
        autocorrect: Whether to autocorrect spelling in query

    Returns:
        Dictionary containing search results and metadata.
    """
    # Fail fast when the request cannot be authenticated.
    if not SERPER_API_KEY:
        return {
            "success": False,
            "error": "SERPER_API_KEY environment variable not set",
            "results": [],
        }

    # Guard clause: an empty query would be rejected by the API anyway.
    if not q or not q.strip():
        return {
            "success": False,
            "error": "Search query 'q' is required and cannot be empty",
            "results": [],
        }

    try:
        # Required parameters first, then optionals only when supplied.
        payload: dict[str, Any] = {"q": q.strip(), "gl": gl, "hl": hl}
        if location:
            payload["location"] = location
        # Serper's default page size; set explicitly when caller omitted it.
        payload["num"] = 10 if num is None else num
        if tbs:
            payload["tbs"] = tbs
        if page is not None:
            payload["page"] = page
        if autocorrect is not None:
            payload["autocorrect"] = autocorrect

        headers = {"X-API-KEY": SERPER_API_KEY, "Content-Type": "application/json"}

        data = make_serper_request(payload, headers).json()

        # Drop organic hits pointing at HuggingFace datasets/spaces while
        # keeping every other field of the response untouched.
        response_data = dict(data)
        response_data["organic"] = [
            item
            for item in data.get("organic", [])
            if not _is_huggingface_dataset_or_space_url(item.get("link", ""))
        ]
        return decode_http_urls_in_dict(response_data)

    except Exception as e:
        return {"success": False, "error": f"Unexpected error: {str(e)}", "results": []}
162+
163+
164+
# Entry point: launch the FastMCP server when executed as a script/module.
if __name__ == "__main__":
    mcp.run()

src/tool/mcp_servers/searching_mcp_server.py

Lines changed: 38 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@
22
#
33
# SPDX-License-Identifier: Apache-2.0
44

5+
import sys
56
import os
67
import json
78
import requests
@@ -17,7 +18,11 @@
1718

1819

1920
SERPER_API_KEY = os.environ.get("SERPER_API_KEY", "")
21+
SERPER_BASE_URL = os.environ.get("SERPER_BASE_URL", "https://google.serper.dev")
2022
JINA_API_KEY = os.environ.get("JINA_API_KEY", "")
23+
JINA_BASE_URL = os.environ.get("JINA_BASE_URL", "https://r.jina.ai")
24+
25+
IS_MIRO_API = True if "miro" in SERPER_BASE_URL or "miro" in JINA_BASE_URL else False
2126

2227
# Google search result filtering environment variables
2328
REMOVE_SNIPPETS = os.environ.get("REMOVE_SNIPPETS", "").lower() in ("true", "1", "yes")
@@ -122,11 +127,18 @@ async def google_search(
122127
arguments["location"] = location
123128
if tbs:
124129
arguments["tbs"] = tbs
125-
server_params = StdioServerParameters(
126-
command="npx",
127-
args=["-y", "serper-search-scrape-mcp-server"],
128-
env={"SERPER_API_KEY": SERPER_API_KEY},
129-
)
130+
if IS_MIRO_API:
131+
server_params = StdioServerParameters(
132+
command=sys.executable,
133+
args=["-m", "src.tool.mcp_servers.miroapi_serper_mcp_server"],
134+
env={"SERPER_API_KEY": SERPER_API_KEY, "SERPER_BASE_URL": SERPER_BASE_URL},
135+
)
136+
else:
137+
server_params = StdioServerParameters(
138+
command="npx",
139+
args=["-y", "serper-search-scrape-mcp-server"],
140+
env={"SERPER_API_KEY": SERPER_API_KEY},
141+
)
130142
result_content = ""
131143
retry_count = 0
132144
max_retries = 5
@@ -348,7 +360,12 @@ async def search_wiki_revision(
348360
content = await smart_request(
349361
url=base_url,
350362
params=params,
351-
env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY},
363+
env={
364+
"SERPER_API_KEY": SERPER_API_KEY,
365+
"JINA_API_KEY": JINA_API_KEY,
366+
"SERPER_BASE_URL": SERPER_BASE_URL,
367+
"JINA_BASE_URL": JINA_BASE_URL,
368+
},
352369
)
353370
data = request_to_json(content)
354371

@@ -527,6 +544,8 @@ async def search_archived_webpage(url: str, year: int, month: int, day: int) ->
527544
env={
528545
"SERPER_API_KEY": SERPER_API_KEY,
529546
"JINA_API_KEY": JINA_API_KEY,
547+
"SERPER_BASE_URL": SERPER_BASE_URL,
548+
"JINA_BASE_URL": JINA_BASE_URL,
530549
},
531550
)
532551
data = request_to_json(content)
@@ -585,7 +604,12 @@ async def search_archived_webpage(url: str, year: int, month: int, day: int) ->
585604
content = await smart_request(
586605
url=base_url,
587606
params={"url": url},
588-
env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY},
607+
env={
608+
"SERPER_API_KEY": SERPER_API_KEY,
609+
"JINA_API_KEY": JINA_API_KEY,
610+
"SERPER_BASE_URL": SERPER_BASE_URL,
611+
"JINA_BASE_URL": JINA_BASE_URL,
612+
},
589613
)
590614
data = request_to_json(content)
591615
if "archived_snapshots" in data and "closest" in data["archived_snapshots"]:
@@ -664,7 +688,13 @@ async def scrape_website(url: str) -> str:
664688
"""
665689
# TODO: Long Content Handling
666690
return await smart_request(
667-
url, env={"SERPER_API_KEY": SERPER_API_KEY, "JINA_API_KEY": JINA_API_KEY}
691+
url,
692+
env={
693+
"SERPER_API_KEY": SERPER_API_KEY,
694+
"JINA_API_KEY": JINA_API_KEY,
695+
"SERPER_BASE_URL": SERPER_BASE_URL,
696+
"JINA_BASE_URL": JINA_BASE_URL,
697+
},
668698
)
669699

670700

src/tool/mcp_servers/utils/smart_request.py

Lines changed: 20 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,8 @@
1313
import urllib.parse
1414
from markitdown import MarkItDown
1515
import io
16+
from typing import Optional
17+
import os
1618

1719

1820
def request_to_json(content: str) -> dict:
@@ -30,13 +32,16 @@ async def smart_request(url: str, params: dict = None, env: dict = None) -> str:
3032
if env:
3133
JINA_API_KEY = env.get("JINA_API_KEY", "")
3234
SERPER_API_KEY = env.get("SERPER_API_KEY", "")
35+
JINA_BASE_URL = env.get("JINA_BASE_URL", "https://r.jina.ai")
3336
else:
3437
JINA_API_KEY = ""
3538
SERPER_API_KEY = ""
3639

3740
if JINA_API_KEY == "" and SERPER_API_KEY == "":
3841
return "[ERROR]: JINA_API_KEY and SERPER_API_KEY are not set, smart_request is not available."
3942

43+
IS_MIRO_API = True if "miro" in JINA_BASE_URL else False
44+
4045
# Auto-add https:// if no protocol is specified
4146
protocol_hint = ""
4247
if not url.startswith(("http://", "https://")):
@@ -65,21 +70,24 @@ async def smart_request(url: str, params: dict = None, env: dict = None) -> str:
6570
):
6671
youtube_hint = "[NOTE]: If you need to get information about its visual or audio content, please use tool 'visual_audio_youtube_analyzing' instead. This tool may not be able to provide visual and audio content of a YouTube Video.\n\n"
6772

68-
content, jina_err = await scrape_jina(url, JINA_API_KEY)
73+
content, jina_err = await scrape_jina(url, JINA_API_KEY, JINA_BASE_URL)
6974
if jina_err:
7075
error_msg += f"Failed to get content from Jina.ai: {jina_err}\n"
7176
elif content is None or content.strip() == "":
7277
error_msg += "No content got from Jina.ai.\n"
7378
else:
7479
return protocol_hint + youtube_hint + content
7580

76-
content, serper_err = await scrape_serper(url, SERPER_API_KEY)
77-
if serper_err:
78-
error_msg += f"Failed to get content from SERPER: {serper_err}\n"
79-
elif content is None or content.strip() == "":
80-
error_msg += "No content got from SERPER.\n"
81-
else:
82-
return protocol_hint + youtube_hint + content
81+
if not IS_MIRO_API:
82+
# Try Serper API for scraping if not using Miro API
83+
# (Miro API does not support caching Serper scraping results)
84+
content, serper_err = await scrape_serper(url, SERPER_API_KEY)
85+
if serper_err:
86+
error_msg += f"Failed to get content from SERPER: {serper_err}\n"
87+
elif content is None or content.strip() == "":
88+
error_msg += "No content got from SERPER.\n"
89+
else:
90+
return protocol_hint + youtube_hint + content
8391

8492
content, request_err = scrape_request(url)
8593
if request_err:
@@ -99,7 +107,9 @@ async def smart_request(url: str, params: dict = None, env: dict = None) -> str:
99107
await asyncio.sleep(4**retry_count)
100108

101109

102-
async def scrape_jina(url: str, jina_api_key: str) -> tuple[str, str]:
110+
async def scrape_jina(
111+
url: str, jina_api_key: str, jina_base_url: str
112+
) -> tuple[str, str]:
103113
# Use Jina.ai reader API to convert URL to LLM-friendly text
104114
if jina_api_key == "":
105115
return (
@@ -116,7 +126,7 @@ async def scrape_jina(url: str, jina_api_key: str) -> tuple[str, str]:
116126
"X-With-Shadow-Dom": "true",
117127
}
118128

119-
jina_url = f"https://r.jina.ai/{url}"
129+
jina_url = f"{jina_base_url}/{url}"
120130
try:
121131
response = requests.get(jina_url, headers=jina_headers, timeout=120)
122132
if response.status_code == 422:

0 commit comments

Comments
 (0)