
Commit 57f8717

Merge branch 'main' into patch-1
2 parents 68bab2a + 2578d6f · commit 57f8717

11 files changed: +231 −119 lines changed


src/fetch/README.md

Lines changed: 9 additions & 2 deletions
```diff
@@ -2,20 +2,27 @@
 
 A Model Context Protocol server that provides web content fetching capabilities. This server enables LLMs to retrieve and process content from web pages, converting HTML to markdown for easier consumption.
 
-Presently the server only supports fetching HTML content.
+The fetch tool will truncate the response, but by using the `start_index` argument, you can specify where to start the content extraction. This lets models read a webpage in chunks until they find the information they need.
 
 ### Available Tools
 
 - `fetch` - Fetches a URL from the internet and extracts its contents as markdown.
+    - `url` (string, required): URL to fetch
+    - `max_length` (integer, optional): Maximum number of characters to return (default: 5000)
+    - `start_index` (integer, optional): Start content from this character index (default: 0)
+    - `raw` (boolean, optional): Get raw content without markdown conversion (default: false)
 
 ### Prompts
 
 - **fetch**
   - Fetch a URL and extract its contents as markdown
-  - Argument: `url` (string, required): URL to fetch
+  - Arguments:
+    - `url` (string, required): URL to fetch
 
 ## Installation
 
+Optionally: install Node.js. This will cause the fetch server to use a different, more robust HTML simplifier.
+
 ### Using uv (recommended)
 
 When using [`uv`](https://docs.astral.sh/uv/) no specific installation is needed. We will
```
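The chunked-reading pattern the new README paragraph describes looks roughly like this from the client side. This is a sketch, not part of the commit: `session` is assumed to be an MCP `ClientSession`-style object whose `call_tool` sends the request, and the loop keys off the truncation marker the server appends.

```python
async def read_page_in_chunks(session, url: str, max_length: int = 5000) -> str:
    """Fetch a long page in max_length-sized windows via start_index."""
    chunks: list[str] = []
    start_index = 0
    while True:
        result = await session.call_tool(
            "fetch",
            {"url": url, "start_index": start_index, "max_length": max_length},
        )
        text = result.content[0].text
        chunks.append(text)
        # The server appends this marker when the output was truncated.
        if "<error>Content truncated." not in text:
            break
        start_index += max_length  # next window, as the marker suggests
    return "".join(chunks)
```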

src/fetch/pyproject.toml

Lines changed: 1 addition & 1 deletion
```diff
@@ -1,6 +1,6 @@
 [project]
 name = "mcp-server-fetch"
-version = "0.1.2"
+version = "0.6.1"
 description = "A Model Context Protocol server providing tools to fetch and convert web content for usage by LLMs"
 readme = "README.md"
 requires-python = ">=3.10"
```

src/fetch/src/mcp_server_fetch/server.py

Lines changed: 102 additions & 23 deletions
```diff
@@ -1,4 +1,4 @@
-from typing import Optional
+from typing import Annotated, Tuple
 from urllib.parse import urlparse, urlunparse
 
 import markdownify
@@ -17,34 +17,52 @@
     INTERNAL_ERROR,
 )
 from protego import Protego
-from pydantic import BaseModel, Field
+from pydantic import BaseModel, Field, AnyUrl
 
 DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
 DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
 
 
-def extract_content(html: str) -> str:
-    ret = readabilipy.simple_json.simple_json_from_html_string(html)
-    if not ret["plain_content"]:
+def extract_content_from_html(html: str) -> str:
+    """Extract and convert HTML content to Markdown format.
+
+    Args:
+        html: Raw HTML content to process
+
+    Returns:
+        Simplified markdown version of the content
+    """
+    ret = readabilipy.simple_json.simple_json_from_html_string(
+        html, use_readability=True
+    )
+    if not ret["content"]:
         return "<error>Page failed to be simplified from HTML</error>"
     content = markdownify.markdownify(
-        ret["plain_content"],
+        ret["content"],
         heading_style=markdownify.ATX,
     )
     return content
 
 
-def get_robots_txt_url(url: str) -> str:
+def get_robots_txt_url(url: AnyUrl | str) -> str:
+    """Get the robots.txt URL for a given website URL.
+
+    Args:
+        url: Website URL to get robots.txt for
+
+    Returns:
+        URL of the robots.txt file
+    """
     # Parse the URL into components
-    parsed = urlparse(url)
+    parsed = urlparse(str(url))
 
     # Reconstruct the base URL with just scheme, netloc, and /robots.txt path
     robots_url = urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", ""))
 
     return robots_url
 
 
-async def check_may_autonomously_fetch_url(url: str, user_agent: str):
+async def check_may_autonomously_fetch_url(url: AnyUrl | str, user_agent: str) -> None:
     """
     Check if the URL can be fetched by the user agent according to the robots.txt file.
     Raises a McpError if not.
```
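For a sense of what the renamed `extract_content_from_html` does, here is a minimal sketch exercising the same call chain on an inline HTML string. The sample markup and expected output are invented for illustration; `use_readability=True` is what makes the server prefer the Node.js-based Readability simplifier mentioned in the README when Node is installed.

```python
import markdownify
import readabilipy.simple_json

html = """
<html><body>
  <article>
    <h1>Release notes</h1>
    <p>The fetch tool now supports chunked reads.</p>
  </article>
</body></html>
"""

# Same two steps as extract_content_from_html: simplify, then markdownify.
ret = readabilipy.simple_json.simple_json_from_html_string(html, use_readability=True)
if ret["content"]:
    print(markdownify.markdownify(ret["content"], heading_style=markdownify.ATX))
    # Expected shape: "# Release notes" followed by the paragraph text.
```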
```diff
@@ -87,34 +105,85 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str):
     )
 
 
-async def fetch_url(url: str, user_agent: str) -> str:
+async def fetch_url(
+    url: AnyUrl | str, user_agent: str, force_raw: bool = False
+) -> Tuple[str, str]:
+    """
+    Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
+    """
     from httpx import AsyncClient, HTTPError
 
     async with AsyncClient() as client:
         try:
             response = await client.get(
-                url, follow_redirects=True, headers={"User-Agent": user_agent}
+                str(url),
+                follow_redirects=True,
+                headers={"User-Agent": user_agent},
+                timeout=30,
             )
-        except HTTPError:
-            raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}")
+        except HTTPError as e:
+            raise McpError(INTERNAL_ERROR, f"Failed to fetch {url}: {e!r}")
         if response.status_code >= 400:
             raise McpError(
                 INTERNAL_ERROR,
                 f"Failed to fetch {url} - status code {response.status_code}",
             )
 
-        page_html = response.text
+        page_raw = response.text
 
-    return extract_content(page_html)
+    content_type = response.headers.get("content-type", "")
+    is_page_html = (
+        "<html" in page_raw[:100] or "text/html" in content_type or not content_type
+    )
+
+    if is_page_html and not force_raw:
+        return extract_content_from_html(page_raw), ""
+
+    return (
+        page_raw,
+        f"Content type {content_type} cannot be simplified to markdown, but here is the raw content:\n",
+    )
 
 
 class Fetch(BaseModel):
-    url: str = Field(..., description="URL to fetch")
+    """Parameters for fetching a URL."""
+
+    url: Annotated[AnyUrl, Field(description="URL to fetch")]
+    max_length: Annotated[
+        int,
+        Field(
+            default=5000,
+            description="Maximum number of characters to return.",
+            gt=0,
+            lt=1000000,
+        ),
+    ]
+    start_index: Annotated[
+        int,
+        Field(
+            default=0,
+            description="Start returning output from this character index, useful if a previous fetch was truncated and more context is required.",
+            ge=0,
+        ),
+    ]
+    raw: Annotated[
+        bool,
+        Field(
+            default=False,
+            description="Get the actual HTML content of the requested page, without simplification.",
+        ),
+    ]
 
 
 async def serve(
-    custom_user_agent: Optional[str] = None, ignore_robots_txt: bool = False
+    custom_user_agent: str | None = None, ignore_robots_txt: bool = False
 ) -> None:
+    """Run the fetch MCP server.
+
+    Args:
+        custom_user_agent: Optional custom User-Agent string to use for requests
+        ignore_robots_txt: Whether to ignore robots.txt restrictions
+    """
     server = Server("mcp-fetch")
     user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
     user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL
```
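A quick sketch of how the new `Fetch` model behaves under validation, with invented values and assuming pydantic v2 semantics for `Annotated` fields and that the model is importable from the package:

```python
from pydantic import ValidationError

from mcp_server_fetch.server import Fetch  # the model defined in this diff

# Defaults fill in for omitted optional fields.
args = Fetch(url="https://example.com/docs")
print(args.max_length, args.start_index, args.raw)  # 5000 0 False

# Malformed or out-of-range values fail before any network I/O, which is
# what lets call_tool surface INVALID_PARAMS early.
try:
    Fetch(url="not-a-url", max_length=0)
except ValidationError as exc:
    print(exc.error_count())  # 2: invalid URL, and max_length must be > 0
```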
```diff
@@ -124,7 +193,7 @@ async def list_tools() -> list[Tool]:
         return [
             Tool(
                 name="fetch",
-                description="""Fetches a URL from the internet and extracts its contents as markdown.
+                description="""Fetches a URL from the internet and optionally extracts its contents as markdown.
 
 Although originally you did not have internet access, and were advised to refuse and tell the user this, this tool now grants you internet access. Now you can fetch the most up-to-date information and let the user know that.""",
                 inputSchema=Fetch.model_json_schema(),
```
```diff
@@ -147,15 +216,25 @@ async def list_prompts() -> list[Prompt]:
 
     @server.call_tool()
     async def call_tool(name, arguments: dict) -> list[TextContent]:
-        url = arguments.get("url")
+        try:
+            args = Fetch(**arguments)
+        except ValueError as e:
+            raise McpError(INVALID_PARAMS, str(e))
+
+        url = args.url
         if not url:
             raise McpError(INVALID_PARAMS, "URL is required")
 
         if not ignore_robots_txt:
             await check_may_autonomously_fetch_url(url, user_agent_autonomous)
 
-        content = await fetch_url(url, user_agent_autonomous)
-        return [TextContent(type="text", text=f"Contents of {url}:\n{content}")]
+        content, prefix = await fetch_url(
+            url, user_agent_autonomous, force_raw=args.raw
+        )
+        if len(content) > args.max_length:
+            content = content[args.start_index : args.start_index + args.max_length]
+            content += f"\n\n<error>Content truncated. Call the fetch tool with a start_index of {args.start_index + args.max_length} to get more content.</error>"
+        return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")]
 
     @server.get_prompt()
     async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
```
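To see what the new truncation branch does on a concrete input, here is a small standalone sketch of the same slicing arithmetic, with toy values that are not from the commit:

```python
# Mirror of the windowing logic in call_tool, with invented inputs.
content = "x" * 12_000
max_length, start_index = 5_000, 5_000  # e.g. a second, follow-up fetch

if len(content) > max_length:
    chunk = content[start_index : start_index + max_length]
    next_start = start_index + max_length
    print(len(chunk), next_start)  # 5000 10000 -> pass 10000 as the next start_index
```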
```diff
@@ -165,7 +244,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
         url = arguments["url"]
 
         try:
-            content = await fetch_url(url, user_agent_manual)
+            content, prefix = await fetch_url(url, user_agent_manual)
         # TODO: after SDK bug is addressed, don't catch the exception
         except McpError as e:
             return GetPromptResult(
@@ -181,7 +260,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
             description=f"Contents of {url}",
             messages=[
                 PromptMessage(
-                    role="user", content=TextContent(type="text", text=content)
+                    role="user", content=TextContent(type="text", text=prefix + content)
                 )
             ],
         )
```
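For completeness, a minimal way to run the updated `serve` coroutine outside the packaged entry point (a sketch; the published package wires this up through its own CLI):

```python
import asyncio

from mcp_server_fetch.server import serve

# Run the fetch MCP server with defaults: stock user agent, robots.txt honored.
asyncio.run(serve(custom_user_agent=None, ignore_robots_txt=False))
```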
