diff --git a/README.md b/README.md
index e917c58..4d9bdac 100644
--- a/README.md
+++ b/README.md
@@ -13,6 +13,8 @@ A production-ready [Model Context Protocol](https://modelcontextprotocol.io/intr
 - [Quick Start](#quick-start)
 - [Available Tools](#available-tools)
 - [Setup Instructions](#setup-instructions)
+- [Local Usage](#local-usage)
+- [Google ADK Integration](#google-adk-integration)
 - [Example Use Cases](#example-use-cases)
 - [Error Handling](#error-handling)
 - [Common Issues](#common-issues)
@@ -212,6 +214,247 @@ Add the ScrapeGraphAI MCP server on the settings:
 
 ![Cursor MCP Integration](assets/cursor_mcp.png)
 
+## Local Usage
+
+To run the MCP server locally for development or testing, follow these steps:
+
+### Prerequisites
+
+- Python 3.10 or higher
+- pip or uv package manager
+- ScrapeGraph API key
+
+### Installation
+
+1. **Clone the repository** (if you haven't already):
+
+```bash
+git clone https://github.com/ScrapeGraphAI/scrapegraph-mcp
+cd scrapegraph-mcp
+```
+
+2. **Install the package**:
+
+```bash
+# Using pip
+pip install -e .
+
+# Or using uv (faster)
+uv pip install -e .
+```
+
+3. **Set your API key**:
+
+```bash
+# macOS/Linux
+export SGAI_API_KEY=your-api-key-here
+
+# Windows (PowerShell)
+$env:SGAI_API_KEY="your-api-key-here"
+
+# Windows (CMD)
+set SGAI_API_KEY=your-api-key-here
+```
+
+### Running the Server Locally
+
+You can run the server directly:
+
+```bash
+# Using the installed command
+scrapegraph-mcp
+
+# Or using the Python module
+python -m scrapegraph_mcp.server
+```
+
+The server will start and communicate via stdio (standard input/output), which is the standard MCP transport method.
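+
+You can also drive the local server programmatically, for example from a test script. Below is a minimal sketch using the official `mcp` Python SDK (`pip install mcp`); the tool name `markdownify` and its `website_url` argument are assumptions based on the tools listed in this README, so adjust them to the tool you actually call:
+
+```python
+import asyncio
+import os
+
+from mcp import ClientSession, StdioServerParameters
+from mcp.client.stdio import stdio_client
+
+
+async def main() -> None:
+    # Launch the server as a subprocess and talk to it over stdio.
+    server_params = StdioServerParameters(
+        command="python",
+        args=["-m", "scrapegraph_mcp.server"],
+        env={"SGAI_API_KEY": os.environ["SGAI_API_KEY"]},
+    )
+    async with stdio_client(server_params) as (read, write):
+        async with ClientSession(read, write) as session:
+            await session.initialize()
+            # Discover the tools the server exposes.
+            tools = await session.list_tools()
+            print([tool.name for tool in tools.tools])
+            # Call one tool as a smoke test (parameter name assumed).
+            result = await session.call_tool(
+                "markdownify", {"website_url": "https://example.com"}
+            )
+            print(result.content)
+
+
+asyncio.run(main())
+```
+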
+### Testing with MCP Inspector
+
+Test your local server using the MCP Inspector tool:
+
+```bash
+npx @modelcontextprotocol/inspector python -m scrapegraph_mcp.server
+```
+
+This provides a web interface to test all available tools interactively.
+
+### Configuring Claude Desktop for Local Server
+
+To use your locally running server with Claude Desktop, update your configuration file:
+
+- **macOS**: `~/Library/Application Support/Claude/claude_desktop_config.json`
+- **Windows**: `%APPDATA%\Claude\claude_desktop_config.json`
+
+```json
+{
+  "mcpServers": {
+    "scrapegraph-mcp-local": {
+      "command": "python",
+      "args": [
+        "-m",
+        "scrapegraph_mcp.server"
+      ],
+      "env": {
+        "SGAI_API_KEY": "your-api-key-here"
+      }
+    }
+  }
+}
+```
+
+**Note**: Make sure Python is in your PATH. You can verify this by running `python --version` in your terminal.
+
+### Configuring Cursor for Local Server
+
+In Cursor's MCP settings, add a new server with:
+
+- **Command**: `python`
+- **Args**: `["-m", "scrapegraph_mcp.server"]`
+- **Environment Variables**: `{"SGAI_API_KEY": "your-api-key-here"}`
+
+### Troubleshooting Local Setup
+
+**Server not starting:**
+- Verify Python is installed: `python --version`
+- Check that the package is installed: `pip list | grep scrapegraph-mcp`
+- Ensure the API key is set: `echo $SGAI_API_KEY` (macOS/Linux) or `echo %SGAI_API_KEY%` (Windows)
+
+**Tools not appearing:**
+- Check Claude Desktop logs:
+  - macOS: `~/Library/Logs/Claude/`
+  - Windows: `%APPDATA%\Claude\Logs\`
+- Verify the server starts without errors when run directly
+- Check that the configuration JSON is valid
+
+**Import errors:**
+- Reinstall the package: `pip install -e . --force-reinstall`
+- Verify dependencies: `pip install -r requirements.txt` (if available)
+
+## Google ADK Integration
+
+The ScrapeGraph MCP server can be integrated with [Google ADK (Agent Development Kit)](https://github.com/google/adk) to create AI agents with web scraping capabilities.
+
+### Prerequisites
+
+- Python 3.10 or higher
+- Google ADK installed
+- ScrapeGraph API key
+
+### Installation
+
+1. **Install Google ADK** (if not already installed):
+
+```bash
+pip install google-adk
+```
+
+2. **Set your API key**:
+
+```bash
+export SGAI_API_KEY=your-api-key-here
+```
+
+### Basic Integration Example
+
+Create an agent file (e.g., `agent.py`) with the following configuration:
+
+```python
+import os
+
+from google.adk.agents import LlmAgent
+from google.adk.tools.mcp_tool.mcp_toolset import MCPToolset
+from google.adk.tools.mcp_tool.mcp_session_manager import StdioConnectionParams
+from mcp import StdioServerParameters
+
+# Path to the scrapegraph-mcp server directory
+SCRAPEGRAPH_MCP_PATH = "/path/to/scrapegraph-mcp"
+
+# Path to the server.py file
+SERVER_SCRIPT_PATH = os.path.join(
+    SCRAPEGRAPH_MCP_PATH,
+    "src",
+    "scrapegraph_mcp",
+    "server.py"
+)
+
+root_agent = LlmAgent(
+    model='gemini-2.0-flash',
+    name='scrapegraph_assistant_agent',
+    instruction='Help the user with web scraping and data extraction using ScrapeGraph AI. '
+                'You can convert webpages to markdown, extract structured data using AI, '
+                'perform web searches, crawl multiple pages, and automate complex scraping workflows.',
+    tools=[
+        MCPToolset(
+            connection_params=StdioConnectionParams(
+                server_params=StdioServerParameters(
+                    command='python3',
+                    args=[
+                        SERVER_SCRIPT_PATH,
+                    ],
+                    env={
+                        'SGAI_API_KEY': os.getenv('SGAI_API_KEY'),
+                    },
+                ),
+                timeout=300.0,
+            ),
+            # Optional: Filter which tools from the MCP server are exposed
+            # tool_filter=['markdownify', 'smartscraper', 'searchscraper'],
+        )
+    ],
+)
+```
+
+### Configuration Options
+
+**Timeout Settings:**
+- The default timeout is 5 seconds, which may be too short for web scraping operations
+- Recommended: set `timeout=300.0` (5 minutes)
+- Adjust based on your use case (crawling operations may need even longer timeouts)
+
+**Tool Filtering:**
+- By default, all 8 tools are exposed to the agent
+- Use `tool_filter` to limit which tools are available:
+  ```python
+  tool_filter=['markdownify', 'smartscraper', 'searchscraper']
+  ```
+
+**API Key Configuration:**
+- Set via environment variable: `export SGAI_API_KEY=your-key`
+- Or pass it directly in the `env` dict: `'SGAI_API_KEY': 'your-key-here'`
+- The environment variable approach is recommended for security
+
+### Usage Example
+
+Once configured, your agent can use natural language to interact with the web scraping tools:
+
+```python
+# The agent can now handle queries like:
+# - "Convert https://example.com to markdown"
+# - "Extract all product prices from this e-commerce page"
+# - "Search for recent AI research papers and summarize them"
+# - "Crawl this documentation site and extract all API endpoints"
+```
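+
+To try the agent end-to-end, you can run it with the ADK runner. The sketch below is one possible wiring, assuming a recent `google-adk` release (where `InMemorySessionService.create_session` is async) and the `agent.py` defined above; the app name and user/session IDs are arbitrary placeholders:
+
+```python
+import asyncio
+
+from google.adk.runners import Runner
+from google.adk.sessions import InMemorySessionService
+from google.genai import types
+
+from agent import root_agent  # the LlmAgent defined above
+
+
+async def main() -> None:
+    # Sessions hold conversation state; in-memory is fine for local testing.
+    session_service = InMemorySessionService()
+    session = await session_service.create_session(
+        app_name="scrapegraph_app", user_id="user_1"
+    )
+    runner = Runner(
+        agent=root_agent,
+        app_name="scrapegraph_app",
+        session_service=session_service,
+    )
+    message = types.Content(
+        role="user",
+        parts=[types.Part(text="Convert https://example.com to markdown")],
+    )
+    # Stream agent events and print the final answer.
+    async for event in runner.run_async(
+        user_id="user_1", session_id=session.id, new_message=message
+    ):
+        if event.is_final_response():
+            print(event.content.parts[0].text)
+
+
+asyncio.run(main())
+```
+
+Alternatively, the ADK CLI (`adk run`, `adk web`) can launch an agent package directly.
+
+For more information about Google ADK, visit the [official documentation](https://github.com/google/adk).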
+
 ## Example Use Cases
 
 The server enables sophisticated queries across various scraping scenarios:
 
diff --git a/src/scrapegraph_mcp/server.py b/src/scrapegraph_mcp/server.py
index 4c74d38..ff732a4 100644
--- a/src/scrapegraph_mcp/server.py
+++ b/src/scrapegraph_mcp/server.py
@@ -12,7 +12,7 @@ import json
 import logging
 import os
 
-from typing import Any, Dict, Optional, List, Union
+from typing import Any, Dict, Optional, List, Union, Annotated
 
 import httpx
 from fastmcp import Context, FastMCP
@@ -916,7 +916,16 @@ def smartscraper(
     website_url: Optional[str] = None,
     website_html: Optional[str] = None,
     website_markdown: Optional[str] = None,
-    output_schema: Optional[Union[str, Dict[str, Any]]] = None,
+    output_schema: Optional[Annotated[Union[str, Dict[str, Any]], Field(
+        default=None,
+        description="JSON schema dict or JSON string defining the expected output structure",
+        json_schema_extra={
+            "oneOf": [
+                {"type": "string"},
+                {"type": "object"}
+            ]
+        }
+    )]] = None,
     number_of_scrolls: Optional[int] = None,
     total_pages: Optional[int] = None,
     render_heavy_js: Optional[bool] = None,
@@ -1157,8 +1166,26 @@ def agentic_scrapper(
     url: str,
     ctx: Context,
     user_prompt: Optional[str] = None,
-    output_schema: Optional[Union[str, Dict[str, Any]]] = None,
-    steps: Optional[Union[str, List[str]]] = None,
+    output_schema: Optional[Annotated[Union[str, Dict[str, Any]], Field(
+        default=None,
+        description="Desired output structure as a JSON schema dict or JSON string",
+        json_schema_extra={
+            "oneOf": [
+                {"type": "string"},
+                {"type": "object"}
+            ]
+        }
+    )]] = None,
+    steps: Optional[Annotated[Union[str, List[str]], Field(
+        default=None,
+        description="Step-by-step instructions for the agent as a list of strings or JSON array string",
+        json_schema_extra={
+            "oneOf": [
+                {"type": "string"},
+                {"type": "array", "items": {"type": "string"}}
+            ]
+        }
+    )]] = None,
     ai_extraction: Optional[bool] = None,
     persistent_session: Optional[bool] = None,
     timeout_seconds: Optional[float] = None
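
The `Annotated`/`Field` pattern above advertises, via `json_schema_extra`, that `output_schema` and `steps` accept either a JSON string or a native object, so MCP clients with stricter schema validation can pass whichever form they produce. As a companion illustration (not code from this diff), a hypothetical `coerce_schema` helper sketches the normalization a server can then perform on such a parameter:

```python
import json
from typing import Any, Dict, Optional, Union


def coerce_schema(
    output_schema: Optional[Union[str, Dict[str, Any]]],
) -> Optional[Dict[str, Any]]:
    """Accept a schema as a dict or a JSON string and return a dict."""
    if output_schema is None or isinstance(output_schema, dict):
        return output_schema
    # json.JSONDecodeError (a ValueError) propagates on malformed input.
    return json.loads(output_schema)


assert coerce_schema('{"title": "Product"}') == {"title": "Product"}
assert coerce_schema({"title": "Product"}) == {"title": "Product"}
```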