feat: add search_datasets tool

pengqun · pengqun · commit 026a223e1340 · 2025-07-29T10:47:57.000+08:00
diff --git a/README.md b/README.md
@@ -142,10 +142,12 @@ For HTTP/SSE mode, connect using a local URL in your MCP client configuration:
 You can also debug the server using the [MCP Inspector](https://github.com/modelcontextprotocol/inspector) tool:
 
 ```bash
+# Run in UI mode with stdio transport (can switch to HTTP/SSE in the Web UI as needed)
 npx @modelcontextprotocol/inspector uv run modelscope-mcp-server
-```
 
-Uses stdio transport by default; switch to HTTP/SSE in the Web UI as needed.
+# Run in CLI mode with HTTP transport (can do operations across tools, resources, and prompts)
+npx @modelcontextprotocol/inspector --cli http://127.0.0.1:8000/mcp/ --transport http --method tools/list
+```
 
 ### Testing
 
diff --git a/demo.py b/demo.py
@@ -13,6 +13,17 @@
 from modelscope_mcp_server.settings import settings
 from modelscope_mcp_server.utils.metadata import get_server_name_with_version
 
+# Global counter for demo step numbering
+demo_step = 0
+
+
+def print_step_title(tool_name: str, task_description: str) -> None:
+    """Print demo step title."""
+    global demo_step
+    demo_step += 1
+    print(f"{demo_step}. 🛠️ Tool: {tool_name}")
+    print(f"   • Task: {task_description}")
+
 
 def parse_tool_response(result) -> dict:
     """Parse tool response and return JSON data."""
@@ -27,10 +38,10 @@ def parse_tool_response(result) -> dict:
 
 async def demo_user_info(client: Client) -> None:
     """Demo getting current user information."""
-    print("1. 🛠️ Tool: get_current_user")
-    print("   • Task: 👤 Get current user information")
+    tool_name = "get_current_user"
+    print_step_title(tool_name, "👤 Get current user information")
 
-    result = await client.call_tool("get_current_user", {})
+    result = await client.call_tool(tool_name, {})
     data = parse_tool_response(result)
 
     username = data.get("username", "N/A")
@@ -43,10 +54,10 @@ async def demo_user_info(client: Client) -> None:
 
 async def demo_environment_info(client: Client) -> None:
     """Demo getting environment information."""
-    print("2. 🛠️ Tool: get_environment_info")
-    print("   • Task: 🔧 Get current MCP server environment information")
+    tool_name = "get_environment_info"
+    print_step_title(tool_name, "🔧 Get current MCP server environment information")
 
-    result = await client.call_tool("get_environment_info", {})
+    result = await client.call_tool(tool_name, {})
     data = parse_tool_response(result)
 
     print(f"   • Result: {data}")
@@ -55,11 +66,13 @@ async def demo_environment_info(client: Client) -> None:
 
 async def demo_search_models(client: Client) -> None:
     """Demo searching models."""
-    print("3. 🛠️ Tool: search_models")
-    print("   • Task: 🔍 Search text-generation models (keyword='DeepSeek', support inference, limit 3 results)")
+    tool_name = "search_models"
+    print_step_title(
+        tool_name, "🔍 Search text-generation models (keyword='DeepSeek', support inference, limit 3 results)"
+    )
 
     result = await client.call_tool(
-        "search_models",
+        tool_name,
         {
             "query": "DeepSeek",
             "task": "text-generation",
@@ -82,40 +95,73 @@ async def demo_search_models(client: Client) -> None:
     print()
 
 
+async def demo_search_datasets(client: Client) -> None:
+    """Demo searching datasets."""
+    tool_name = "search_datasets"
+    print_step_title(tool_name, "📊 Search datasets (keyword='金融', sort='downloads', limit 3 results)")
+
+    result = await client.call_tool(
+        tool_name,
+        {
+            "query": "金融",
+            "sort": "downloads",
+            "limit": 3,
+        },
+    )
+    data = parse_tool_response(result)
+
+    if isinstance(data, list) and data:
+        summaries = []
+        for dataset in data:
+            name = dataset.get("name", "N/A")
+            chinese_name = dataset.get("chinese_name", "N/A")
+            downloads = dataset.get("downloads_count", 0)
+            likes = dataset.get("likes_count", 0)
+            summaries.append(f"{name} ({chinese_name}) - Downloads {downloads:,}, Likes {likes}")
+        print(f"   • Result: Found {len(data)} items - {' | '.join(summaries)}")
+    else:
+        print("   • Result: No datasets found")
+    print()
+
+
 async def demo_search_papers(client: Client) -> None:
     """Demo searching papers."""
-    print("4. 🛠️ Tool: search_papers")
-    print("   • Task: 📚 Search academic papers (keyword='Qwen3', sort='hot', limit 1 result)")
+    tool_name = "search_papers"
+    print_step_title(tool_name, "📚 Search papers (keyword='Qwen3', sort='hot', limit 3 results)")
 
     result = await client.call_tool(
-        "search_papers",
+        tool_name,
         {
             "query": "Qwen3",
             "sort": "hot",
-            "limit": 1,
+            "limit": 3,
         },
     )
     data = parse_tool_response(result)
 
     if isinstance(data, list) and data:
-        paper = data[0]
-        title = paper.get("title", "N/A")
-        arxiv_id = paper.get("arxiv_id", "N/A")
-        view_count = paper.get("view_count", 0)
-        modelscope_url = paper.get("modelscope_url", "N/A")
-        print(f"   • Result: '{title}' ArXiv ID={arxiv_id}, Views={view_count:,} ModelScope URL={modelscope_url}")
+        summaries = []
+        for paper in data:
+            title = paper.get("title", "N/A")
+            arxiv_id = paper.get("arxiv_id", "N/A")
+            view_count = paper.get("view_count", 0)
+            modelscope_url = paper.get("modelscope_url", "N/A")
+            summaries.append(f"{title} (ArXiv={arxiv_id}, Views={view_count:,} URL={modelscope_url})")
+        print(f"   • Result: Found {len(data)} items - {' | '.join(summaries)}")
     else:
         print("   • Result: No papers found")
     print()
 
 
 async def demo_search_mcp_servers(client: Client) -> None:
     """Demo searching MCP servers."""
-    print("5. 🛠️ Tool: search_mcp_servers")
-    print("   • Task: 🔍 Search MCP servers (keyword='Chrome', category='browser-automation', limit 3 results)")
+    tool_name = "search_mcp_servers"
+    print_step_title(
+        tool_name, "🔍 Search MCP servers (keyword='Chrome', category='browser-automation', limit 3 results)"
+    )
 
     result = await client.call_tool(
-        "search_mcp_servers",
+        tool_name,
         {
             "search": "Chrome",
             "category": "browser-automation",
@@ -139,24 +185,15 @@ async def demo_search_mcp_servers(client: Client) -> None:
 
 async def demo_generate_image(client: Client) -> None:
     """Demo image generation."""
-    print("6. 🛠️ Tool: generate_image")
-    print("   • Task: 🎨 Generate image (prompt='A curious cat wearing a tiny wizard hat in candy cloud kingdom')")
+    tool_name = "generate_image"
+    prompt = "A curious cat wearing a tiny wizard hat in candy cloud kingdom"
+    print_step_title(tool_name, f"🎨 Generate image with prompt: {prompt}")
 
-    result = await client.call_tool(
-        "generate_image",
-        {
-            "prompt": "A curious cat wearing a tiny wizard hat in candy cloud kingdom",
-        },
-    )
+    result = await client.call_tool(tool_name, {"prompt": prompt})
     data = parse_tool_response(result)
 
-    image_url = data.get("image_url")
-    model = data.get("model")
-
-    if not image_url:
-        raise RuntimeError("Missing required field 'image_url' in response")
-    if not model:
-        raise RuntimeError("Missing required field 'model' in response")
+    image_url = data.get("image_url", "N/A")
+    model = data.get("model", "N/A")
 
     print(f"   • Result: Image generated using model '{model}' - URL: {image_url}")
     print()
@@ -204,6 +241,7 @@ async def main() -> None:
         await demo_user_info(client)
         await demo_environment_info(client)
         await demo_search_models(client)
+        await demo_search_datasets(client)
         await demo_search_papers(client)
         await demo_search_mcp_servers(client)
 
diff --git a/src/modelscope_mcp_server/server.py b/src/modelscope_mcp_server/server.py
@@ -14,6 +14,7 @@
 from .settings import settings
 from .tools.aigc import register_aigc_tools
 from .tools.context import register_context_tools
+from .tools.dataset import register_dataset_tools
 from .tools.mcp import register_mcp_tools
 from .tools.model import register_model_tools
 from .tools.paper import register_paper_tools
@@ -28,9 +29,7 @@ def create_mcp_server() -> FastMCP:
 
     mcp = FastMCP(
         name=get_server_name_with_version(),
-        instructions="""
-            This server provides tools for calling ModelScope (魔搭社区) API.
-        """,
+        instructions="This server provides tools for calling ModelScope (魔搭社区) API.",
     )
 
     # Add middleware in logical order
@@ -42,6 +41,7 @@ def create_mcp_server() -> FastMCP:
     # Register all tools
     register_context_tools(mcp)
     register_model_tools(mcp)
+    register_dataset_tools(mcp)
     register_paper_tools(mcp)
     register_mcp_tools(mcp)
     register_aigc_tools(mcp)
diff --git a/src/modelscope_mcp_server/tools/dataset.py b/src/modelscope_mcp_server/tools/dataset.py
@@ -0,0 +1,86 @@
+"""ModelScope MCP Server Dataset tools.
+
+Provides tools for dataset-related operations in the ModelScope MCP Server,
+such as searching for datasets and retrieving dataset details.
+"""
+
+from typing import Annotated, Literal
+
+from fastmcp import FastMCP
+from fastmcp.utilities import logging
+from pydantic import Field
+
+from ..client import default_client
+from ..settings import settings
+from ..types import Dataset
+
+logger = logging.get_logger(__name__)
+
+
+def register_dataset_tools(mcp: FastMCP) -> None:
+    """Register all dataset-related tools with the MCP server.
+
+    Args:
+        mcp (FastMCP): The MCP server instance
+
+    """
+
+    @mcp.tool(
+        annotations={
+            "title": "Search Datasets",
+        }
+    )
+    async def search_datasets(
+        query: Annotated[
+            str,
+            Field(
+                description="Keyword to search for related datasets. "
+                "Leave empty to get all datasets based on other filters."
+            ),
+        ] = "",
+        sort: Annotated[
+            Literal["default", "downloads", "likes", "gmt_modified"],
+            Field(description="Sort order"),
+        ] = "default",
+        limit: Annotated[int, Field(description="Maximum number of datasets to return", ge=1, le=30)] = 10,
+    ) -> list[Dataset]:
+        """Search for datasets on ModelScope."""
+        url = f"{settings.main_domain}/api/v1/dolphin/datasets"
+
+        params = {
+            "Query": query,
+            "Sort": sort,
+            "PageNumber": 1,
+            "PageSize": limit,
+        }
+
+        response = default_client.get(url, params=params)
+
+        datasets_data = response.get("Data", [])
+
+        datasets = []
+        for dataset_data in datasets_data:
+            path = dataset_data.get("Namespace", "")
+            name = dataset_data.get("Name", "")
+            modelscope_url = f"{settings.main_domain}/datasets/{path}/{name}"
+
+            if not path or not name:
+                logger.warning(f"Skipping dataset with invalid path or name: {dataset_data}")
+                continue
+
+            dataset = Dataset(
+                id=f"{path}/{name}",
+                path=path,
+                name=name,
+                chinese_name=dataset_data.get("ChineseName", ""),
+                created_by=dataset_data.get("CreatedBy", ""),
+                license=dataset_data.get("License", ""),
+                modelscope_url=modelscope_url,
+                downloads_count=dataset_data.get("Downloads", 0),
+                likes_count=dataset_data.get("Likes", 0),
+                created_at=dataset_data.get("GmtCreate", 0),
+                updated_at=dataset_data.get("LastUpdatedTime", 0),
+            )
+            datasets.append(dataset)
+
+        return datasets
diff --git a/src/modelscope_mcp_server/tools/model.py b/src/modelscope_mcp_server/tools/model.py
@@ -34,8 +34,8 @@ async def search_models(
         query: Annotated[
             str,
             Field(
-                description="Keyword to search for related models (e.g., 'Flux' will find models related to Flux). "
-                "Leave empty to skip keyword matching and get all models based on other filters."
+                description="Keyword to search for related models. "
+                "Leave empty to get all models based on other filters."
             ),
         ] = "",
         task: Annotated[
@@ -118,6 +118,7 @@ async def search_models(
                 name=name,
                 chinese_name=model_data.get("ChineseName", ""),
                 created_by=model_data.get("CreatedBy"),
+                license=model_data.get("License", ""),
                 modelscope_url=modelscope_url,
                 # Non-empty value means True, else False
                 support_inference=bool(model_data.get("SupportInference", "")),
diff --git a/src/modelscope_mcp_server/types.py b/src/modelscope_mcp_server/types.py
@@ -39,6 +39,7 @@ class Model(BaseModel):
     name: Annotated[str, Field(description="Model name, for example 'DeepSeek-R1'")]
     chinese_name: Annotated[str, Field(description="Chinese name")]
     created_by: Annotated[str, Field(description="User who created the model")]
+    license: Annotated[str, Field(description="Open source license")]
 
     # Links
     modelscope_url: Annotated[str, Field(description="Detail page URL on ModelScope")]
@@ -55,6 +56,29 @@ class Model(BaseModel):
     updated_at: Annotated[int, Field(description="Last updated time (unix timestamp, seconds)")] = 0
 
 
+class Dataset(BaseModel):
+    """Dataset information."""
+
+    # Basic information
+    id: Annotated[str, Field(description="Unique dataset ID, formatted as 'path/name'")]
+    path: Annotated[str, Field(description="Dataset path, for example 'opencompass'")]
+    name: Annotated[str, Field(description="Dataset name, for example 'mmlu'")]
+    chinese_name: Annotated[str, Field(description="Chinese name")]
+    created_by: Annotated[str, Field(description="User who created the dataset")]
+    license: Annotated[str, Field(description="Open source license")]
+
+    # Links
+    modelscope_url: Annotated[str, Field(description="Detail page URL on ModelScope")]
+
+    # Metrics
+    downloads_count: Annotated[int, Field(description="Number of downloads")] = 0
+    likes_count: Annotated[int, Field(description="Number of likes")] = 0
+
+    # Timestamps
+    created_at: Annotated[int, Field(description="Created time (unix timestamp, seconds)")] = 0
+    updated_at: Annotated[int, Field(description="Last updated time (unix timestamp, seconds)")] = 0
+
+
 class Paper(BaseModel):
     """Paper information."""
 
diff --git a/tests/tools/test_search_datasets.py b/tests/tools/test_search_datasets.py