Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions python-mcp-crdb-docs/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Build stage: install the package and its dependencies with pip.
FROM python:3.11-slim AS build
WORKDIR /app
COPY pyproject.toml README.md ./
COPY src ./src
RUN pip install --upgrade pip && pip install .

# Runtime stage: copy the installed files into a distroless Python image.
FROM gcr.io/distroless/python3
# NOTE(review): this path hard-codes python3.11 - confirm the interpreter in the
# distroless image is the same minor version as the build stage.
ENV PYTHONPATH=/usr/local/lib/python3.11/site-packages
WORKDIR /app
# /usr/local from the build stage carries everything pip installed there.
COPY --from=build /usr/local /usr/local
COPY --from=build /app/src /app/src
# Run the server module as the container's main process.
ENTRYPOINT ["python", "-m", "python_mcp_crdb_docs.server"]
16 changes: 16 additions & 0 deletions python-mcp-crdb-docs/Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
# Developer shortcuts; none of these targets produce a file named after themselves.
# NOTE(review): recipe lines must start with a tab character; the indentation
# appears to have been lost in this copy - confirm against the real file.
.PHONY: install lint test run docker-build

# Editable install including the "dev" extras.
install:
pip install -e .[dev]

# Byte-compile everything under src as a syntax check.
lint:
python -m compileall src

# Run the test suite.
test:
pytest

# Start the MCP server module.
run:
python -m python_mcp_crdb_docs.server

# Build the container image from the Dockerfile in this directory.
docker-build:
docker build -t python-mcp-crdb-docs .
25 changes: 25 additions & 0 deletions python-mcp-crdb-docs/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
# Python MCP Server for CockroachDB Docs

This package ships a FastMCP-compatible server that exposes CockroachDB documentation via:

* `search_docs` – Algolia-backed search results
* `get_page` – Fetch Markdown for a docs page, with HTML-to-source discovery
* `list_versions` – Enumerate available version folders from GitHub
* `doc://` resource scheme – Direct access to Markdown via version + path

## Local development

```bash
cd python-mcp-crdb-docs
pip install -e .[dev]
pytest
FASTMCP_LOG_LEVEL=INFO python -m python_mcp_crdb_docs.server
```

Configuration is handled with environment variables:

* `ALGOLIA_APP_ID` (default `HPNPWALV9D`)
* `ALGOLIA_SEARCH_KEY` (default search-only key for staging index)
* `ALGOLIA_INDEX` (default `stage_cockroach_docs`)

All HTTP requests are routed through a hardened client that enforces a 10s timeout, 512KB body limit, and allowlisted domains.
26 changes: 26 additions & 0 deletions python-mcp-crdb-docs/pyproject.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Package metadata and runtime requirements.
[project]
name = "python-mcp-crdb-docs"
version = "0.1.0"
description = "FastMCP server exposing CockroachDB docs search and fetching"
readme = "README.md"
requires-python = ">=3.11"
# Runtime dependencies (minimum versions).
dependencies = [
"fastmcp>=0.1.7",
"httpx>=0.27.0",
"pydantic>=2.7.0",
"algoliasearch>=3.0.0",
]

# Extras selected with `pip install -e .[dev]`: test tooling only.
[project.optional-dependencies]
dev = [
"pytest>=8.2.0",
"pytest-asyncio>=0.23.0",
"respx>=0.21.1",
]

[build-system]
requires = ["setuptools", "wheel"]
build-backend = "setuptools.build_meta"

# Add src/ to the import path used by pytest.
[tool.pytest.ini_options]
pythonpath = ["src"]
Empty file.
26 changes: 26 additions & 0 deletions python-mcp-crdb-docs/src/python_mcp_crdb_docs/core/allowlist.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
"""Domain allowlist enforcement."""

from __future__ import annotations

from urllib.parse import urlparse

# Host suffixes that requests may target. Every entry starts with a dot so it
# covers subdomains; the bare domain is accepted by a separate equality check.
ALLOWED_HOST_SUFFIXES = (
    ".cockroachlabs.com",
    ".github.com",
    ".githubusercontent.com",
    ".algolia.net",
    ".algolianet.com",
)


class DomainBlockedError(RuntimeError):
    """Signal that a URL's host is not covered by the allowlist."""


def ensure_allowed(url: str) -> None:
    """Raise ``DomainBlockedError`` unless the host of *url* is allowlisted.

    A host passes when it equals an allowlisted domain or is one of its
    subdomains. URLs without a host are rejected.
    """
    host = urlparse(url).hostname or ""
    permitted = any(
        host == entry.lstrip(".") or host.endswith(entry)
        for entry in ALLOWED_HOST_SUFFIXES
    )
    if not permitted:
        raise DomainBlockedError(f"URL host '{host}' not in allowlist")
43 changes: 43 additions & 0 deletions python-mcp-crdb-docs/src/python_mcp_crdb_docs/core/cache.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Simple TTL-aware LRU cache."""

from __future__ import annotations

import time
from collections import OrderedDict
from typing import Generic, Optional, TypeVar

K = TypeVar("K")
V = TypeVar("V")


class TTLCache(Generic[K, V]):
    """In-memory cache with per-entry expiry and least-recently-used eviction.

    Args:
        maxsize: Maximum number of entries kept; the least recently used
            entries are evicted first once the limit is exceeded.
        ttl: Lifetime of an entry in seconds, counted from the last ``set``.
    """

    def __init__(self, maxsize: int = 128, ttl: float = 60.0) -> None:
        self.maxsize = maxsize
        self.ttl = ttl
        # key -> (expiry timestamp, value), ordered least to most recently used.
        self._data: "OrderedDict[K, tuple[float, V]]" = OrderedDict()

    def _purge(self) -> None:
        """Drop expired entries, then evict from the LRU end down to ``maxsize``."""
        now = time.time()
        expired = [key for key, (expires, _) in self._data.items() if expires < now]
        for key in expired:
            self._data.pop(key, None)
        # The emptiness check keeps a non-positive maxsize from popping an empty dict.
        while self._data and len(self._data) > self.maxsize:
            self._data.popitem(last=False)

    def get(self, key: K) -> Optional[V]:
        """Return the live value for *key*, or ``None`` when missing or expired.

        A hit marks the entry as most recently used.
        """
        self._purge()
        entry = self._data.get(key)
        if entry is None:
            return None
        expires, value = entry
        if expires < time.time():
            del self._data[key]
            return None
        self._data.move_to_end(key)
        return value

    def set(self, key: K, value: V) -> None:
        """Store *value* under *key* with a fresh TTL and mark it most recently used."""
        self._data[key] = (time.time() + self.ttl, value)
        # Assigning to an existing OrderedDict key keeps its old position, so
        # move it explicitly; otherwise a just-updated entry could be evicted first.
        self._data.move_to_end(key)
        self._purge()

    def clear(self) -> None:
        """Remove every entry."""
        self._data.clear()
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
"""Fallback shim for fastmcp when the real package is unavailable."""

from __future__ import annotations

try:  # pragma: no cover
    from fastmcp import FastMCP, HTTPTransport, ResourceResult, TextResource, ToolResult
except ModuleNotFoundError:  # pragma: no cover - exercised when dependency missing

    class HTTPTransport:  # type: ignore[override]
        """Stand-in transport that only records where it would bind."""

        def __init__(self, host: str = "127.0.0.1", port: int = 3000):
            self.host, self.port = host, port

    class ResourceResult:  # type: ignore[override]
        """Stand-in container for a resource handler's content."""

        def __init__(self, content):
            self.content = content

    class TextResource(ResourceResult):  # type: ignore[override]
        """Resource result carrying text plus its MIME type."""

        def __init__(self, content: str, mime_type: str = "text/plain"):
            super().__init__(content)
            self.mime_type = mime_type

    class ToolResult:  # type: ignore[override]
        """Stand-in container for a tool call's content."""

        def __init__(self, content):
            self.content = content

    class FastMCP:  # type: ignore[override]
        """Registry-only stand-in: records tools and resources but cannot serve."""

        def __init__(self, name: str):
            self.name = name
            self.tools = {}
            self.resources = {}

        def add_tool(self, name, func, args_model=None):
            """Register *func* (and its optional argument model) under *name*."""
            self.tools[name] = (func, args_model)

        def add_resource(self, name, scheme, handler):
            """Register *handler* keyed by its URI *scheme*."""
            self.resources[scheme] = handler

        def run(self, transport=None):
            """Always fail: the shim has no server implementation."""
            raise RuntimeError("fastmcp package not installed; server cannot run")


__all__ = ["FastMCP", "HTTPTransport", "ToolResult", "ResourceResult", "TextResource"]
82 changes: 82 additions & 0 deletions python-mcp-crdb-docs/src/python_mcp_crdb_docs/core/fetch.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
"""Markdown fetching utilities."""

from __future__ import annotations

import re
from typing import Optional

from .cache import TTLCache
from .http import create_client
from .logging import debug
from .mapping import build_raw_url

# Module-wide cache shared by the fetch helpers below: at most 128 entries,
# each kept for 120 seconds.
_MD_CACHE = TTLCache[str, str](maxsize=128, ttl=120)


async def _fetch_first(urls: list[str]) -> Optional[str]:
    """Try *urls* in order and return the body of the first HTTP 200 response.

    Returns ``None`` when no candidate answers with status 200.
    """
    async with create_client() as session:
        for candidate in urls:
            response = await session.get(candidate)
            if response.status_code != 200:
                continue
            return response.text
    return None


async def fetch_markdown_from_raw(version: str, path: str) -> Optional[str]:
    """Fetch the Markdown for *path* in *version* from its raw-URL candidates.

    Non-empty results are cached under a ``raw:`` key; returns ``None`` when
    no candidate URL yields a 200 response.
    """
    key = f"raw:{version}:{path}"
    hit = _MD_CACHE.get(key)
    if hit:
        return hit
    body = await _fetch_first(build_raw_url(version, path))
    if body:
        # Only non-empty bodies are worth caching.
        _MD_CACHE.set(key, body)
    return body


def _extract_source_href(html: str) -> Optional[str]:
anchor_pattern = re.compile(r"<a[^>]+href=\"([^\"]+)\"[^>]*>(.*?)</a>", re.IGNORECASE | re.DOTALL)
tag_pattern = re.compile(r"<[^>]+>")
for match in anchor_pattern.finditer(html):
text = tag_pattern.sub("", match.group(2)).strip().lower()
if "view page source" in text:
return match.group(1)
return None


async def fetch_markdown_from_html_page(url: str) -> Optional[str]:
    """Fetch the Markdown source behind a rendered docs page.

    Downloads *url*, finds its "View page source" link, rewrites that link
    from the ``/blob/`` form to the ``/raw/`` form, and fetches the result.
    Non-empty results are cached under an ``html:`` key.

    Returns:
        The Markdown text, or ``None`` when the page is not a 200 response,
        has no source link, or the source cannot be fetched.
    """
    from urllib.parse import urljoin

    cache_key = f"html:{url}"
    cached = _MD_CACHE.get(cache_key)
    if cached:
        return cached
    async with create_client() as client:
        resp = await client.get(url)
    if resp.status_code != 200:
        return None
    href = _extract_source_href(resp.text)
    if not href:
        return None
    # Resolve the link against the page it came from, so protocol-relative,
    # root-relative and plain relative hrefs all become absolute URLs.
    source_url = urljoin(url, href)
    # Rewrite only the first /blob/ segment; a later one belongs to the file path.
    raw_candidates = [source_url.replace("/blob/", "/raw/", 1)]
    content = await _fetch_first(raw_candidates)
    if content:
        _MD_CACHE.set(cache_key, content)
    return content


async def list_versions_from_github() -> list[str]:
    """Return the sorted directory names found under ``src`` in the docs repo.

    The listing is cached as a single comma-joined string. A non-success
    response from the GitHub API raises through ``raise_for_status``.
    """
    key = "versions"
    joined = _MD_CACHE.get(key)
    if joined:
        return joined.split(",")
    async with create_client() as session:
        response = await session.get("https://api.github.com/repos/cockroachdb/docs/contents/src")
        response.raise_for_status()
        entries = response.json()
    names = [entry["name"] for entry in entries if entry.get("type") == "dir"]
    names.sort()
    _MD_CACHE.set(key, ",".join(names))
    return names
32 changes: 32 additions & 0 deletions python-mcp-crdb-docs/src/python_mcp_crdb_docs/core/http.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
"""HTTP client with timeouts, allowlist, and response size guards."""

from __future__ import annotations

import asyncio
from typing import Any

import httpx

from .allowlist import ensure_allowed
from .logging import debug

# Largest response body accepted, in bytes (512 KiB).
MAX_RESPONSE_BYTES = 512 * 1024
# Default request timeout handed to httpx, in seconds.
REQUEST_TIMEOUT = 10.0


class SafeAsyncClient(httpx.AsyncClient):
    """``httpx.AsyncClient`` that enforces the allowlist, a timeout, and a body cap.

    The guards live in :meth:`send`, the public method that ``get``, ``post``,
    ``request`` and the other helpers funnel through. ``httpx.AsyncClient``
    offers no ``_request`` hook, so an override with that name is never called.

    The body cap is applied to non-streaming requests only; a caller that
    passes ``stream=True`` receives the unread response and consumes it itself.
    NOTE(review): only the URL of the request handed to ``send`` is checked;
    if redirects are ever enabled, intermediate hops are not re-validated here.
    """

    def __init__(self, *args: Any, **kwargs: Any) -> None:
        """Create the client, defaulting ``timeout`` to ``REQUEST_TIMEOUT``."""
        kwargs.setdefault("timeout", REQUEST_TIMEOUT)
        super().__init__(*args, **kwargs)

    async def send(self, request: httpx.Request, **kwargs: Any) -> httpx.Response:  # type: ignore[override]
        """Send *request* after validating its host, then read the body under the cap.

        Raises:
            DomainBlockedError: If the request host is not allowlisted.
            httpx.HTTPStatusError: If the body exceeds ``MAX_RESPONSE_BYTES``.
        """
        url = str(request.url)
        ensure_allowed(url)
        debug("http_request", method=request.method, url=url)
        caller_streams = bool(kwargs.pop("stream", False))
        # Always stream from the transport so the cap is enforced while
        # downloading instead of after the whole body is already in memory.
        response = await super().send(request, stream=True, **kwargs)
        if caller_streams:
            return response
        try:
            body = await self._read_capped(response)
        finally:
            await response.aclose()
        # Store the bytes where httpx keeps a read body, so .content/.text/.json() work.
        response._content = body  # type: ignore[attr-defined]
        return response

    @staticmethod
    async def _read_capped(response: httpx.Response) -> bytes:
        """Read the decoded body, aborting once it exceeds ``MAX_RESPONSE_BYTES``."""

        def too_large() -> httpx.HTTPStatusError:
            return httpx.HTTPStatusError(
                "response too large", request=response.request, response=response
            )

        # Reject early when the server already declares an oversized body.
        declared = response.headers.get("content-length", "")
        if declared.isdigit() and int(declared) > MAX_RESPONSE_BYTES:
            raise too_large()
        chunks: list[bytes] = []
        received = 0
        async for chunk in response.aiter_bytes():
            received += len(chunk)
            if received > MAX_RESPONSE_BYTES:
                raise too_large()
            chunks.append(chunk)
        return b"".join(chunks)


def create_client() -> SafeAsyncClient:
    """Build a ``SafeAsyncClient`` that sends the project's User-Agent header."""
    default_headers = {"User-Agent": "python-mcp-crdb-docs/0.1"}
    return SafeAsyncClient(headers=default_headers)
48 changes: 48 additions & 0 deletions python-mcp-crdb-docs/src/python_mcp_crdb_docs/core/logging.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""Structured logging helpers for the MCP server."""

from __future__ import annotations

import json
import os
import sys
import time
from typing import Any, Dict

_LOG_LEVEL = os.getenv("FASTMCP_LOG_LEVEL", "INFO").upper()
_LEVELS = {"DEBUG": 10, "INFO": 20, "WARNING": 30, "ERROR": 40}


def _should_log(level: str) -> bool:
return _LEVELS.get(level, 20) >= _LEVELS.get(_LOG_LEVEL, 20)


def log(level: str, message: str, **fields: Any) -> None:
"""Emit a JSON log line."""
if not _should_log(level):
return
payload: Dict[str, Any] = {
"level": level,
"message": message,
"time": time.strftime("%Y-%m-%dT%H:%M:%SZ", time.gmtime()),
}
if fields:
payload.update(fields)
json.dump(payload, sys.stdout)
sys.stdout.write("\n")
sys.stdout.flush()


def info(message: str, **fields: Any) -> None:
    """Log *message* with any extra *fields* at INFO level."""
    log("INFO", message, **fields)


def warning(message: str, **fields: Any) -> None:
    """Log *message* with any extra *fields* at WARNING level."""
    log("WARNING", message, **fields)


def error(message: str, **fields: Any) -> None:
    """Log *message* with any extra *fields* at ERROR level."""
    log("ERROR", message, **fields)


def debug(message: str, **fields: Any) -> None:
    """Log *message* with any extra *fields* at DEBUG level."""
    log("DEBUG", message, **fields)
Loading
Loading