Commit e0234c7

Merge pull request #130 from modelcontextprotocol/jadamson/fetch-use-readabilityjs

feat(fetch): add fetching of raw text, pagination and keeping links in the markdown

2 parents: bccd33f + ea42a21

3 files changed: +98 -25 lines changed
src/fetch/README.md

Lines changed: 9 additions & 2 deletions
@@ -2,20 +2,27 @@
 
 A Model Context Protocol server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption.
 
-Presently the server only supports fetching HTML content.
+The fetch tool will truncate the response, but by using the `start_index` argument, you can specify where to start the content extraction. This lets models read a webpage in chunks, until they find the information they need.
 
 ### Available Tools
 
 - `fetch` - Fetches a URL from the internet and extracts its contents as markdown.
+    - `url` (string, required): URL to fetch
+    - `max_length` (integer, optional): Maximum number of characters to return (default: 5000)
+    - `start_index` (integer, optional): Start content from this character index (default: 0)
+    - `raw` (boolean, optional): Get raw content without markdown conversion (default: false)
 
 ### Prompts
 
 - **fetch**
   - Fetch a URL and extract its contents as markdown
-  - Argument: `url` (string, required): URL to fetch
+  - Arguments:
+    - `url` (string, required): URL to fetch
 
 ## Installation
 
+Optionally: install node.js; this will cause the fetch server to use a different HTML simplifier that is more robust.
+
 ### Using uv (recommended)
 
 When using [`uv`](https://docs.astral.sh/uv/) no specific installation is needed. We will
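
As a concrete illustration of the chunked reading described above: the model simply replays the same `fetch` call with a growing `start_index`. A minimal sketch of that loop, assuming a hypothetical MCP client `session` object with a `call_tool` coroutine (only the tool name and argument names come from this commit):

```python
# Hypothetical client-side paging loop; `session.call_tool` stands in for
# whichever MCP client library is in use.
async def read_in_chunks(session, url: str, chunk_size: int = 5000) -> str:
    parts, start = [], 0
    while True:
        result = await session.call_tool(
            "fetch",
            {"url": url, "max_length": chunk_size, "start_index": start},
        )
        text = result.content[0].text
        parts.append(text)
        # The server appends "<error>Content truncated ...</error>" whenever
        # more content remains; stop once that marker no longer appears.
        if "<error>Content truncated" not in text:
            break
        start += chunk_size
    return "".join(parts)
```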

src/fetch/pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-server-fetch"
-version = "0.1.2"
+version = "0.1.3"
 description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs"
 readme = "README.md"
 requires-python = ">=3.10"

src/fetch/src/mcp_server_fetch/server.py

Lines changed: 88 additions & 22 deletions
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Optional, Tuple
 from urllib.parse import urlparse, urlunparse
 
 import markdownify
@@ -17,34 +17,52 @@
     INTERNAL_ERROR,
 )
 from protego import Protego
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, AnyUrl, conint
 
 DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
 DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
 
 
-def extract_content(html: str) -> str:
-    ret = readabilipy.simple_json.simple_json_from_html_string(html)
-    if not ret["plain_content"]:
+def extract_content_from_html(html: str) -> str:
+    """Extract and convert HTML content to Markdown format.
+
+    Args:
+        html: Raw HTML content to process
+
+    Returns:
+        Simplified markdown version of the content
+    """
+    ret = readabilipy.simple_json.simple_json_from_html_string(
+        html, use_readability=True
+    )
+    if not ret["content"]:
         return "<error>Page failed to be simplified from HTML</error>"
     content = markdownify.markdownify(
-        ret["plain_content"],
+        ret["content"],
         heading_style=markdownify.ATX,
     )
     return content
 
 
-def get_robots_txt_url(url: str) -> str:
+def get_robots_txt_url(url: AnyUrl | str) -> str:
+    """Get the robots.txt URL for a given website URL.
+
+    Args:
+        url: Website URL to get robots.txt for
+
+    Returns:
+        URL of the robots.txt file
+    """
     # Parse the URL into components
-    parsed = urlparse(url)
+    parsed = urlparse(str(url))
 
     # Reconstruct the base URL with just scheme, netloc, and /robots.txt path
     robots_url = urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", ""))
 
     return robots_url
 
 
-async def check_may_autonomously_fetch_url(url: str, user_agent: str):
+async def check_may_autonomously_fetch_url(url: AnyUrl | str, user_agent: str) -> None:
     """
     Check if the URL can be fetched by the user agent according to the robots.txt file.
     Raises a McpError if not.
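
To see in isolation what the switch from `plain_content` to `content` (plus `use_readability=True`) changes, here is a small standalone sketch of the same readabilipy and markdownify calls used above; the sample HTML is made up:

```python
# Standalone sketch of the new extraction path. With use_readability=True,
# readabilipy runs Mozilla's Readability.js, which requires node.js to be
# installed (the optional dependency the README note refers to).
import markdownify
import readabilipy.simple_json

html = (
    "<html><body><article>"
    "<h1>Title</h1><p>Some text with a <a href='https://example.com'>link</a>.</p>"
    "</article></body></html>"
)
ret = readabilipy.simple_json.simple_json_from_html_string(html, use_readability=True)
if ret["content"]:
    # ret["content"] is simplified HTML that keeps attributes such as href,
    # unlike plain_content, which is why links now survive into the markdown.
    print(markdownify.markdownify(ret["content"], heading_style=markdownify.ATX))
```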
@@ -87,34 +105,72 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str):
     )
 
 
-async def fetch_url(url: str, user_agent: str) -> str:
+async def fetch_url(
+    url: AnyUrl | str, user_agent: str, force_raw: bool = False
+) -> Tuple[str, str]:
+    """
+    Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
+    """
     from httpx import AsyncClient, HTTPError
 
     async with AsyncClient() as client:
         try:
             response = await client.get(
-                url, follow_redirects=True, headers={"User-Agent": user_agent}
+                str(url),
+                follow_redirects=True,
+                headers={"User-Agent": user_agent},
+                timeout=30,
             )
-        except HTTPError:
-            raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}")
+        except HTTPError as e:
+            raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}: {e!r}")
         if response.status_code >= 400:
             raise McpError(
                 INTERNAL_ERROR,
                 f"Failed to fetch {url} - status code {response.status_code}",
             )
 
-        page_html = response.text
+        page_raw = response.text
 
-    return extract_content(page_html)
+    content_type = response.headers.get("content-type", "")
+    is_page_html = (
+        "<html" in page_raw[:100] or "text/html" in content_type or not content_type
+    )
+
+    if is_page_html and not force_raw:
+        return extract_content_from_html(page_raw), ""
+
+    return (
+        page_raw,
+        f"Content type {content_type} cannot be simplified to markdown, but here is the raw content:\n",
+    )
 
 
 class Fetch(BaseModel):
-    url: str = Field(..., description="URL to fetch")
+    """Parameters for fetching a URL."""
+
+    url: AnyUrl = Field(..., description="URL to fetch")
+    max_length: conint(gt=0, lt=1000000) = Field(
+        5000, description="Maximum number of characters to return."
+    )
+    start_index: conint(ge=0) = Field(
+        0,
+        description="Start returning output at this character index; useful if a previous fetch was truncated and more context is required.",
+    )
+    raw: bool = Field(
+        False,
+        description="Get the actual HTML content of the requested page, without simplification.",
+    )
 
 
 async def serve(
     custom_user_agent: Optional[str] = None, ignore_robots_txt: bool = False
 ) -> None:
+    """Run the fetch MCP server.
+
+    Args:
+        custom_user_agent: Optional custom User-Agent string to use for requests
+        ignore_robots_txt: Whether to ignore robots.txt restrictions
+    """
     server = Server("mcp-fetch")
     user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
     user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL
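
The branch between markdown conversion and raw passthrough hinges on the small sniffing heuristic inside `fetch_url`. Restating that expression as a helper and applying it to a few example responses shows which path each would take:

```python
# The HTML-detection expression from fetch_url above, restated as a function.
def is_page_html(page_raw: str, content_type: str) -> bool:
    return "<html" in page_raw[:100] or "text/html" in content_type or not content_type

assert is_page_html("<html><head><title>Hi</title>", "")               # sniffed from the body
assert is_page_html("not obviously html", "text/html; charset=utf-8")  # header decides
assert is_page_html('{"a": 1}', "")                                    # no header: assume HTML
assert not is_page_html('{"a": 1}', "application/json")                # served raw, with prefix
```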
@@ -124,7 +180,7 @@ async def list_tools() -> list[Tool]:
         return [
             Tool(
                 name="fetch",
-                description="""Fetches a URL from the internet and extracts its contents as markdown.
+                description="""Fetches a URL from the internet and optionally extracts its contents as markdown.
 
 Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.""",
                 inputSchema=Fetch.model_json_schema(),
@@ -147,15 +203,25 @@ async def list_prompts() -> list[Prompt]:
 
     @server.call_tool()
     async def call_tool(name, arguments: dict) -> list[TextContent]:
-        url = arguments.get("url")
+        try:
+            args = Fetch(**arguments)
+        except ValueError as e:
+            raise McpError(INVALID_PARAMS, str(e))
+
+        url = args.url
         if not url:
             raise McpError(INVALID_PARAMS, "URL is required")
 
         if not ignore_robots_txt:
             await check_may_autonomously_fetch_url(url, user_agent_autonomous)
 
-        content = await fetch_url(url, user_agent_autonomous)
-        return [TextContent(type="text", text=f"Contents of {url}:\n{content}")]
+        content, prefix = await fetch_url(
+            url, user_agent_autonomous, force_raw=args.raw
+        )
+        if len(content) > args.max_length:
+            content = content[args.start_index : args.start_index + args.max_length]
+            content += f"\n\n<error>Content truncated. Call the fetch tool with a start_index of {args.start_index + args.max_length} to get more content.</error>"
+        return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")]
 
     @server.get_prompt()
     async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
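
Two parts of the new `call_tool` are worth seeing concretely: the pydantic validation now guarding the arguments, and the truncation arithmetic behind the `start_index` hint. A sketch, assuming pydantic v2 (implied by the `model_json_schema()` call above):

```python
from pydantic import ValidationError

from mcp_server_fetch.server import Fetch

args = Fetch(url="https://example.com")
print(args.max_length, args.start_index, args.raw)  # defaults: 5000 0 False

try:
    Fetch(url="not a url", max_length=-1)  # AnyUrl and conint(gt=0) both reject this
except ValidationError as e:               # ValidationError subclasses ValueError,
    print(len(e.errors()), "errors")       # so the `except ValueError` above catches it

# Truncation arithmetic: only the requested window is returned, and the
# <error> note advertises exactly where the next call should resume.
content = "x" * 12000                      # pretend fetch_url returned 12000 chars
if len(content) > args.max_length:
    window = content[args.start_index : args.start_index + args.max_length]  # chars 0-4999
    next_start = args.start_index + args.max_length                          # 5000
```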
@@ -165,7 +231,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
         url = arguments["url"]
 
         try:
-            content = await fetch_url(url, user_agent_manual)
+            content, prefix = await fetch_url(url, user_agent_manual)
         # TODO: after SDK bug is addressed, don't catch the exception
         except McpError as e:
             return GetPromptResult(
@@ -181,7 +247,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
             description=f"Contents of {url}",
             messages=[
                 PromptMessage(
-                    role="user", content=TextContent(type="text", text=content)
+                    role="user", content=TextContent(type="text", text=prefix + content)
                 )
             ],
         )
