32 changes: 17 additions & 15 deletions README.md
@@ -40,17 +40,20 @@ python mcp_server.py

WebCat is an **MCP (Model Context Protocol) server** that provides AI models with:
- 🔍 **Web Search** - Serper API (premium) or DuckDuckGo (free fallback)
-- 📄 **Content Extraction** - Clean markdown conversion with Readability + html2text
+- 📄 **Content Extraction** - Serper scrape API (premium) or Trafilatura (free fallback)
- 🌐 **Modern HTTP Transport** - Streamable HTTP with JSON-RPC 2.0
- 🐳 **Multi-Platform Docker** - Works on Intel, ARM, and Apple Silicon
+- 🎯 **Composite Tool** - Single SERPER_API_KEY enables both search + scraping

-Built with **FastMCP**, **Readability**, and **html2text** for seamless AI integration.
+Built with **FastMCP**, **Serper.dev**, and **Trafilatura** for seamless AI integration.
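
To make the "Streamable HTTP with JSON-RPC 2.0" bullet concrete, here is a minimal sketch of a raw tool call. The endpoint path and tool name are assumptions for illustration, and real MCP clients perform an initialize handshake before calling tools; this only shows the envelope shape.

```python
import requests

# Hypothetical endpoint and tool name; adjust to your deployment.
MCP_URL = "http://localhost:8000/mcp"

payload = {
    "jsonrpc": "2.0",        # JSON-RPC 2.0 envelope
    "id": 1,
    "method": "tools/call",  # MCP method for invoking a tool
    "params": {"name": "search", "arguments": {"query": "fastmcp streamable http"}},
}

# Streamable HTTP servers typically expect both accept types.
headers = {"Accept": "application/json, text/event-stream"}

resp = requests.post(MCP_URL, json=payload, headers=headers, timeout=30)
print(resp.text)
```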

## Features

- ✅ **Optional Authentication** - Bearer token auth when needed, or run without (v2.3.1)
-- ✅ **Automatic Fallback** - Serper API → DuckDuckGo if needed
-- ✅ **Smart Content Extraction** - Readability + html2text removes navigation/ads/chrome
+- ✅ **Composite Search Tool** - Single Serper API key enables both search + scraping
+- ✅ **Automatic Fallback** - Search: Serper → DuckDuckGo | Scraping: Serper → Trafilatura
+- ✅ **Premium Scraping** - Serper's optimized infrastructure for fast, clean content extraction
+- ✅ **Smart Content Extraction** - Returns markdown with preserved document structure
- ✅ **MCP Compliant** - Works with Claude Desktop, LiteLLM, and other MCP clients
- ✅ **Parallel Processing** - Fast concurrent scraping
- ✅ **Multi-Platform Docker** - Linux (amd64/arm64) support
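
The "Parallel Processing" bullet amounts to fanning per-URL scrape calls out over a worker pool. A minimal sketch, assuming `scrape_search_result` (added in this PR) is the per-result entry point:

```python
from concurrent.futures import ThreadPoolExecutor

from services.content_scraper import scrape_search_result  # path per the project tree in this PR

def scrape_all(results, max_workers: int = 5):
    # Scrape several search results concurrently; map() preserves input order.
    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(scrape_search_result, results))
```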
@@ -114,11 +117,12 @@ make mcp # Start MCP server

### Get API Keys

-**Serper API (for web search):**
+**Serper API (for web search + scraping):**
1. Visit [serper.dev](https://serper.dev)
-2. Sign up for free tier (2,500 searches/month)
+2. Sign up for free tier (2,500 searches/month + scraping)
3. Copy your API key
4. Add to `.env` file: `SERPER_API_KEY=your_key`
+5. **Note:** One API key enables both search AND content scraping!
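
The note holds because both Serper endpoints authenticate with the same `X-API-KEY` header. A hedged sketch; the search endpoint URL comes from Serper's public docs, not from this diff:

```python
import requests

API_KEY = "your_key"  # placeholder
HEADERS = {"X-API-KEY": API_KEY, "Content-Type": "application/json"}

# Search and scrape hit different hosts but share the same credential.
search = requests.post("https://google.serper.dev/search",
                       headers=HEADERS, json={"q": "model context protocol"})
scrape = requests.post("https://scrape.serper.dev",
                       headers=HEADERS, json={"url": "https://example.com"})
```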

**Perplexity API (for deep research):**
1. Visit [perplexity.ai/settings/api](https://www.perplexity.ai/settings/api)
@@ -157,20 +161,18 @@
```
FastMCP Server (Streamable HTTP with JSON-RPC 2.0)
Authentication (optional bearer token)
Search Decision
-├─ Serper API (premium) → Content Scraper
-└─ DuckDuckGo (free) → Content Scraper
-Readability + html2text
+├─ Serper API (premium) → Serper Scrape API (premium)
+└─ DuckDuckGo (free) → Trafilatura (free)
Markdown Response
```
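
A sketch of the decision flow above, with hypothetical helper names rather than the repo's actual functions:

```python
import os

def run_query(query: str) -> str:
    # Premium path when a Serper key is configured, free path otherwise.
    api_key = os.environ.get("SERPER_API_KEY", "")
    if api_key:
        results = serper_search(query, api_key)         # hypothetical helper
        pages = [serper_scrape(r.url, api_key) for r in results]
    else:
        results = duckduckgo_search(query)              # hypothetical helper
        pages = [trafilatura_extract(r.url) for r in results]
    return "\n\n".join(p for p in pages if p)           # markdown response
```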

**Tech Stack:**
- **FastMCP** - MCP protocol implementation with modern HTTP transport
- **JSON-RPC 2.0** - Standard protocol for client-server communication
-- **Readability** - Content extraction (removes navigation/ads)
-- **html2text** - HTML to markdown conversion
-- **Serper/DuckDuckGo** - Search APIs with automatic fallback
+- **Serper API** - Google-powered search + optimized web scraping
+- **Trafilatura** - Fallback content extraction (removes navigation/ads)
+- **DuckDuckGo** - Free search fallback

## Testing

@@ -231,11 +233,11 @@ docker/
├── health.py # Health check endpoint
├── api_tools.py # API tooling utilities
├── clients/ # External API clients
-│ ├── serper_client.py # Serper API integration
+│ ├── serper_client.py # Serper API (search + scrape)
│ └── duckduckgo_client.py # DuckDuckGo fallback
├── services/ # Core business logic
│ ├── search_service.py # Search orchestration
-│ └── content_scraper.py # Readability + html2text
+│ └── content_scraper.py # Serper scrape → Trafilatura fallback
├── tools/ # MCP tool implementations
│ └── search_tool.py # Search tool with auth
├── models/ # Pydantic data models
53 changes: 51 additions & 2 deletions docker/clients/serper_client.py
@@ -3,11 +3,11 @@
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.

"""Serper API client - fetches search results from Serper API."""
"""Serper API client - fetches search results and scrapes webpages via Serper API."""

import json
import logging
-from typing import List
+from typing import List, Optional

import requests

@@ -66,3 +66,52 @@ def fetch_search_results(
    except Exception as e:
        logger.error(f"Error fetching search results: {str(e)}")
        return []


def scrape_webpage(url: str, api_key: str) -> Optional[str]:
    """
    Scrapes webpage content using Serper's scrape API.

    Uses Serper's optimized scraping infrastructure to extract clean
    markdown-formatted content from any URL. Returns text with preserved
    document structure, including metadata and JSON-LD data.

    Args:
        url: The webpage URL to scrape
        api_key: The Serper API key

    Returns:
        Markdown-formatted content from the webpage, or None if scraping fails
    """
    scrape_url = "https://scrape.serper.dev"
    payload = json.dumps({"url": url})
    headers = {"X-API-KEY": api_key, "Content-Type": "application/json"}

    try:
        logger.info(f"Scraping webpage via Serper: {url}")
        response = requests.post(scrape_url, headers=headers, data=payload, timeout=10)
        response.raise_for_status()
        data = response.json()

        # Extract markdown content from response
        # Serper returns text and optional markdown in the response
        markdown_content = data.get("text", "")

        if markdown_content:
            logger.info(
                f"Successfully scraped {len(markdown_content)} chars from {url}"
            )
            return markdown_content

        logger.warning(f"No content returned from Serper scrape for {url}")
        return None

    except requests.exceptions.Timeout:
        logger.error(f"Timeout scraping webpage via Serper: {url}")
        return None
    except requests.exceptions.RequestException as e:
        logger.error(f"Request error scraping webpage via Serper: {str(e)}")
        return None
    except Exception as e:
        logger.exception(f"Unexpected error scraping webpage via Serper: {str(e)}")
        return None
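
A quick usage sketch for the new function; the URL is a placeholder:

```python
import os

from clients.serper_client import scrape_webpage

api_key = os.environ.get("SERPER_API_KEY", "")
if api_key:
    markdown = scrape_webpage("https://example.com/article", api_key)
    if markdown:
        print(markdown[:300])  # preview the extracted content
```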
2 changes: 1 addition & 1 deletion docker/constants.py
@@ -8,7 +8,7 @@
import os

# Application version
VERSION = "2.5.0"
VERSION = "2.5.1"

# Service information
SERVICE_NAME = "WebCat MCP Server"
33 changes: 33 additions & 0 deletions docker/services/content_scraper.py
@@ -6,10 +6,12 @@
"""Content scraper service - extracts and converts web content to markdown."""

import logging
+import os

import requests
import trafilatura

+from clients.serper_client import scrape_webpage as serper_scrape_webpage
from constants import MAX_CONTENT_LENGTH, REQUEST_TIMEOUT_SECONDS
from models.search_result import SearchResult

@@ -85,6 +87,9 @@ def scrape_search_result(result: SearchResult) -> SearchResult:
"""
Scrapes the content of a search result URL and converts it to markdown.

Uses Serper's scrape API if SERPER_API_KEY is available (faster, more reliable).
Falls back to Trafilatura for local scraping if Serper is not configured.

Args:
result: SearchResult object with URL to scrape

@@ -95,7 +100,35 @@
result.content = "Error: Missing URL for content scraping."
return result

# Try Serper scraping first if API key is available
serper_api_key = os.environ.get("SERPER_API_KEY", "")
if serper_api_key:
try:
logger.info(f"Using Serper scrape API for {result.url}")
scraped_content = serper_scrape_webpage(result.url, serper_api_key)

if scraped_content:
# Format with title and source
full_content = (
f"# {result.title}\n\n*Source: {result.url}*\n\n{scraped_content}"
)
result.content = _truncate_if_needed(full_content)
logger.info(
f"Successfully scraped via Serper: {len(result.content)} chars"
)
return result

logger.warning(
f"Serper scrape returned no content for {result.url}, falling back to Trafilatura"
)
except Exception as e:
logger.warning(
f"Serper scrape failed for {result.url}: {str(e)}, falling back to Trafilatura"
)

# Fallback to Trafilatura scraping
try:
logger.info(f"Using Trafilatura for {result.url}")
response = _fetch_content(result.url)
content_type = response.headers.get("Content-Type", "").lower()
