diff --git a/README.md b/README.md
index ba1db2e..5d15aee 100644
--- a/README.md
+++ b/README.md
@@ -40,17 +40,20 @@ python mcp_server.py
 WebCat is an **MCP (Model Context Protocol) server** that provides AI models with:
 
 - 🔍 **Web Search** - Serper API (premium) or DuckDuckGo (free fallback)
-- 📄 **Content Extraction** - Clean markdown conversion with Readability + html2text
+- 📄 **Content Extraction** - Serper scrape API (premium) or Trafilatura (free fallback)
 - 🌐 **Modern HTTP Transport** - Streamable HTTP with JSON-RPC 2.0
 - 🐳 **Multi-Platform Docker** - Works on Intel, ARM, and Apple Silicon
+- 🎯 **Composite Tool** - Single SERPER_API_KEY enables both search + scraping
 
-Built with **FastMCP**, **Readability**, and **html2text** for seamless AI integration.
+Built with **FastMCP**, **Serper.dev**, and **Trafilatura** for seamless AI integration.
 
 ## Features
 
 - ✅ **Optional Authentication** - Bearer token auth when needed, or run without (v2.3.1)
-- ✅ **Automatic Fallback** - Serper API → DuckDuckGo if needed
-- ✅ **Smart Content Extraction** - Readability + html2text removes navigation/ads/chrome
+- ✅ **Composite Search Tool** - Single Serper API key enables both search + scraping
+- ✅ **Automatic Fallback** - Search: Serper → DuckDuckGo | Scraping: Serper → Trafilatura
+- ✅ **Premium Scraping** - Serper's optimized infrastructure for fast, clean content extraction
+- ✅ **Smart Content Extraction** - Returns markdown with preserved document structure
 - ✅ **MCP Compliant** - Works with Claude Desktop, LiteLLM, and other MCP clients
 - ✅ **Parallel Processing** - Fast concurrent scraping
 - ✅ **Multi-Platform Docker** - Linux (amd64/arm64) support
@@ -114,11 +117,12 @@ make mcp  # Start MCP server
 
 ### Get API Keys
 
-**Serper API (for web search):**
+**Serper API (for web search + scraping):**
 1. Visit [serper.dev](https://serper.dev)
-2. Sign up for free tier (2,500 searches/month)
+2. Sign up for free tier (2,500 searches/month + scraping)
 3. Copy your API key
 4. Add to `.env` file: `SERPER_API_KEY=your_key`
+5. **Note:** One API key enables both search AND content scraping!
 
 **Perplexity API (for deep research):**
 1. Visit [perplexity.ai/settings/api](https://www.perplexity.ai/settings/api)
@@ -157,10 +161,8 @@ FastMCP Server (Streamable HTTP with JSON-RPC 2.0)
 Authentication (optional bearer token)
     ↓
 Search Decision
-    ├─ Serper API (premium) → Content Scraper
-    └─ DuckDuckGo (free) → Content Scraper
-    ↓
-    Readability + html2text
+    ├─ Serper API (premium) → Serper Scrape API (premium)
+    └─ DuckDuckGo (free) → Trafilatura (free)
     ↓
 Markdown Response
 ```
@@ -168,9 +170,9 @@ Search Decision
 
 **Tech Stack:**
 - **FastMCP** - MCP protocol implementation with modern HTTP transport
 - **JSON-RPC 2.0** - Standard protocol for client-server communication
-- **Readability** - Content extraction (removes navigation/ads)
-- **html2text** - HTML to markdown conversion
-- **Serper/DuckDuckGo** - Search APIs with automatic fallback
+- **Serper API** - Google-powered search + optimized web scraping
+- **Trafilatura** - Fallback content extraction (removes navigation/ads)
+- **DuckDuckGo** - Free search fallback
 
 ## Testing
@@ -231,11 +233,11 @@ docker/
 ├── health.py                 # Health check endpoint
 ├── api_tools.py              # API tooling utilities
 ├── clients/                  # External API clients
-│   ├── serper_client.py      # Serper API integration
+│   ├── serper_client.py      # Serper API (search + scrape)
 │   └── duckduckgo_client.py  # DuckDuckGo fallback
 ├── services/                 # Core business logic
 │   ├── search_service.py     # Search orchestration
-│   └── content_scraper.py    # Readability + html2text
+│   └── content_scraper.py    # Serper scrape → Trafilatura fallback
 ├── tools/                    # MCP tool implementations
 │   └── search_tool.py        # Search tool with auth
 ├── models/                   # Pydantic data models
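
The README above describes the streamable HTTP transport with JSON-RPC 2.0. For reviewers who want to exercise it by hand, here is a minimal client sketch. The endpoint path (`/mcp`), port, and tool name (`search`) are assumptions about a typical FastMCP deployment, not values confirmed by this diff; check the server config before relying on them.

```python
# Minimal JSON-RPC 2.0 "tools/call" sketch against the MCP endpoint.
# Assumptions: server on localhost:8000, path /mcp, a tool named "search",
# and auth disabled. Adjust all four to match your deployment.
import requests

payload = {
    "jsonrpc": "2.0",
    "id": 1,
    "method": "tools/call",
    "params": {"name": "search", "arguments": {"query": "model context protocol"}},
}
headers = {
    "Content-Type": "application/json",
    # Streamable HTTP servers may answer with plain JSON or an SSE stream:
    "Accept": "application/json, text/event-stream",
    # "Authorization": "Bearer <token>",  # only when bearer auth is enabled
}

response = requests.post("http://localhost:8000/mcp", json=payload, headers=headers)
print(response.status_code, response.text[:500])
```
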
diff --git a/docker/clients/serper_client.py b/docker/clients/serper_client.py
index 0cc496d..aafe7c5 100644
--- a/docker/clients/serper_client.py
+++ b/docker/clients/serper_client.py
@@ -3,11 +3,11 @@
 # This source code is licensed under the MIT license found in the
 # LICENSE file in the root directory of this source tree.
 
-"""Serper API client - fetches search results from Serper API."""
+"""Serper API client - fetches search results and scrapes webpages via Serper API."""
 
 import json
 import logging
-from typing import List
+from typing import List, Optional
 
 import requests
 
@@ -66,3 +66,52 @@ def fetch_search_results(
     except Exception as e:
         logger.error(f"Error fetching search results: {str(e)}")
         return []
+
+
+def scrape_webpage(url: str, api_key: str) -> Optional[str]:
+    """
+    Scrapes webpage content using Serper's scrape API.
+
+    Uses Serper's optimized scraping infrastructure to extract clean
+    markdown-formatted content from any URL. Returns text with preserved
+    document structure, including metadata and JSON-LD data.
+
+    Args:
+        url: The webpage URL to scrape
+        api_key: The Serper API key
+
+    Returns:
+        Markdown-formatted content from the webpage, or None if scraping fails
+    """
+    scrape_url = "https://scrape.serper.dev"
+    payload = json.dumps({"url": url})
+    headers = {"X-API-KEY": api_key, "Content-Type": "application/json"}
+
+    try:
+        logger.info(f"Scraping webpage via Serper: {url}")
+        response = requests.post(scrape_url, headers=headers, data=payload, timeout=10)
+        response.raise_for_status()
+        data = response.json()
+
+        # Extract markdown content from response
+        # Serper returns text and optional markdown in the response
+        markdown_content = data.get("text", "")
+
+        if markdown_content:
+            logger.info(
+                f"Successfully scraped {len(markdown_content)} chars from {url}"
+            )
+            return markdown_content
+
+        logger.warning(f"No content returned from Serper scrape for {url}")
+        return None
+
+    except requests.exceptions.Timeout:
+        logger.error(f"Timeout scraping webpage via Serper: {url}")
+        return None
+    except requests.exceptions.RequestException as e:
+        logger.error(f"Request error scraping webpage via Serper: {str(e)}")
+        return None
+    except Exception as e:
+        logger.exception(f"Unexpected error scraping webpage via Serper: {str(e)}")
+        return None
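
A quick way to exercise the new `scrape_webpage` helper in isolation, before it is wired into the scraper service below. The `SERPER_API_KEY` environment variable is the same one the README's setup steps create; the target URL is arbitrary.

```python
# Standalone smoke test for scrape_webpage; run from the docker/ directory
# so the clients package is importable.
import os

from clients.serper_client import scrape_webpage

api_key = os.environ["SERPER_API_KEY"]
content = scrape_webpage("https://example.com", api_key)

if content is None:
    # None is the signal content_scraper.py uses to fall back to Trafilatura.
    print("Scrape failed or returned no content")
else:
    print(f"Got {len(content)} chars of markdown content")
    print(content[:300])
```
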
diff --git a/docker/constants.py b/docker/constants.py
index 55c6492..7625eea 100644
--- a/docker/constants.py
+++ b/docker/constants.py
@@ -8,7 +8,7 @@
 import os
 
 # Application version
-VERSION = "2.5.0"
+VERSION = "2.5.1"
 
 # Service information
 SERVICE_NAME = "WebCat MCP Server"
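
The `content_scraper.py` changes below insert the Serper path ahead of the existing Trafilatura path. For context, this is roughly what the free fallback looks like in isolation; a sketch only, since the real implementation goes through `_fetch_content` and the module's timeout and length constants, which this diff does not show in full.

```python
# Minimal Trafilatura extraction sketch (the free fallback path).
# Assumes a Trafilatura version recent enough to support markdown output;
# older versions only offer txt/xml/csv/json formats here.
import trafilatura

downloaded = trafilatura.fetch_url("https://example.com")
if downloaded:
    markdown = trafilatura.extract(downloaded, output_format="markdown")
    print(markdown if markdown else "No extractable main content")
else:
    print("Fetch failed")
```
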
diff --git a/docker/services/content_scraper.py b/docker/services/content_scraper.py
index 8327e63..4b78ecd 100644
--- a/docker/services/content_scraper.py
+++ b/docker/services/content_scraper.py
@@ -6,10 +6,12 @@
 """Content scraper service - extracts and converts web content to markdown."""
 
 import logging
+import os
 
 import requests
 import trafilatura
 
+from clients.serper_client import scrape_webpage as serper_scrape_webpage
 from constants import MAX_CONTENT_LENGTH, REQUEST_TIMEOUT_SECONDS
 from models.search_result import SearchResult
 
@@ -85,6 +87,9 @@ def scrape_search_result(result: SearchResult) -> SearchResult:
     """
     Scrapes the content of a search result URL and converts it to markdown.
 
+    Uses Serper's scrape API if SERPER_API_KEY is available (faster, more reliable).
+    Falls back to Trafilatura for local scraping if Serper is not configured.
+
     Args:
         result: SearchResult object with URL to scrape
 
@@ -95,7 +100,35 @@
         result.content = "Error: Missing URL for content scraping."
         return result
 
+    # Try Serper scraping first if API key is available
+    serper_api_key = os.environ.get("SERPER_API_KEY", "")
+    if serper_api_key:
+        try:
+            logger.info(f"Using Serper scrape API for {result.url}")
+            scraped_content = serper_scrape_webpage(result.url, serper_api_key)
+
+            if scraped_content:
+                # Format with title and source
+                full_content = (
+                    f"# {result.title}\n\n*Source: {result.url}*\n\n{scraped_content}"
+                )
+                result.content = _truncate_if_needed(full_content)
+                logger.info(
+                    f"Successfully scraped via Serper: {len(result.content)} chars"
+                )
+                return result
+
+            logger.warning(
+                f"Serper scrape returned no content for {result.url}, falling back to Trafilatura"
+            )
+        except Exception as e:
+            logger.warning(
+                f"Serper scrape failed for {result.url}: {str(e)}, falling back to Trafilatura"
+            )
+
+    # Fallback to Trafilatura scraping
     try:
+        logger.info(f"Using Trafilatura for {result.url}")
         response = _fetch_content(result.url)
         content_type = response.headers.get("Content-Type", "").lower()
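
To verify the new ordering end to end, a reviewer could run `scrape_search_result` twice, with and without the key. The `SearchResult(title=..., url=...)` construction is an assumption about the Pydantic model's fields; only `title` and `url` usage is visible in this diff.

```python
# Hypothetical smoke test for the Serper-first / Trafilatura-fallback order.
import os

from models.search_result import SearchResult
from services.content_scraper import scrape_search_result

# Premium path: SERPER_API_KEY set -> Serper scrape API.
os.environ["SERPER_API_KEY"] = "your_key_here"
result = SearchResult(title="Example Domain", url="https://example.com")
print(scrape_search_result(result).content[:200])

# Free path: key removed -> Trafilatura. Watch the logs to confirm the
# "Using Trafilatura for ..." line appears.
os.environ.pop("SERPER_API_KEY", None)
result = SearchResult(title="Example Domain", url="https://example.com")
print(scrape_search_result(result).content[:200])
```
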