Skip to content

Commit e8dcd29

Browse files
committed
Add pagination of fetches so models can stop reading a page once they have the information they need
1 parent 960321f commit e8dcd29

File tree

1 file changed

+21
-9
lines changed

1 file changed

+21
-9
lines changed

src/fetch/src/mcp_server_fetch/server.py

Lines changed: 21 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@
1717
INTERNAL_ERROR,
1818
)
1919
from protego import Protego
20-
from pydantic import BaseModel, Field
20+
from pydantic import BaseModel, Field, ValidationError
2121

2222
DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
2323
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
@@ -89,7 +89,10 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str):
8989
)
9090

9191

92-
async def fetch_url(url: str, user_agent: str) -> str:
92+
async def fetch_url(url: str, user_agent: str) -> (str, str):
93+
"""
94+
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
95+
"""
9396
from httpx import AsyncClient, HTTPError
9497

9598
async with AsyncClient() as client:
@@ -109,13 +112,14 @@ async def fetch_url(url: str, user_agent: str) -> str:
109112

110113
content_type = response.headers.get("content-type", "")
111114
if "<html" in page_raw[:100] or "text/html" in content_type or not content_type:
112-
return extract_content_from_html(page_raw)
115+
return extract_content_from_html(page_raw), ""
113116

114-
return f"Content type {content_type} cannot be simplified to markdown, but here is the raw content:\n{page_raw}"
117+
return page_raw, f"Content type {content_type} cannot be simplified to markdown, but here is the raw content:\n"
115118

116119

117120
# Arguments accepted by the fetch tool. `max_length` and `start_index`
# together select a window of characters from the fetched content, so a
# caller can page through a long document across several calls.
class Fetch(BaseModel):
    # Address of the page to retrieve.
    url: str = Field(..., description="URL to fetch")
    # Window size in characters. Must be positive: a zero or negative size
    # would return no content while still asking the caller to continue.
    max_length: int = Field(
        5000,
        gt=0,
        description="Maximum number of characters to return.",
    )
    # Offset of the first character to return. Must be non-negative: a
    # negative offset would make the slice count from the end of the content.
    start_index: int = Field(
        0,
        ge=0,
        description="On return output starting at this character index, useful if a previous fetch was truncated and more context is required.",
    )
120124

121125

@@ -154,15 +158,23 @@ async def list_prompts() -> list[Prompt]:
154158

155159
@server.call_tool()
async def call_tool(name, arguments: dict) -> list[TextContent]:
    """Fetch a URL and return one window of its content as text.

    The arguments are validated through the ``Fetch`` model. The window
    starts at ``start_index`` and holds at most ``max_length`` characters.
    When content remains after the window, a notice naming the next
    ``start_index`` is appended so the caller can request the following page.

    Raises:
        McpError: with ``INVALID_PARAMS`` when the arguments fail validation,
            the URL is empty, ``start_index`` is negative, or ``max_length``
            is not positive. Errors raised by the robots.txt check or by the
            fetch itself propagate unchanged.
    """
    try:
        args = Fetch(**arguments)
    except ValueError as e:
        # pydantic's ValidationError derives from ValueError, so this covers
        # model validation failures.
        raise McpError(INVALID_PARAMS, str(e)) from e

    url = args.url
    if not url:
        raise McpError(INVALID_PARAMS, "URL is required")

    # Reject windows that cannot make progress: a negative start would slice
    # from the end of the page, and a non-positive size would return nothing
    # while still telling the caller to continue from the same index.
    if args.start_index < 0:
        raise McpError(INVALID_PARAMS, "start_index must not be negative")
    if args.max_length <= 0:
        raise McpError(INVALID_PARAMS, "max_length must be greater than zero")

    if not ignore_robots_txt:
        await check_may_autonomously_fetch_url(url, user_agent_autonomous)

    content, prefix = await fetch_url(url, user_agent_autonomous)

    total_length = len(content)
    window_start = args.start_index
    window_end = window_start + args.max_length

    if window_start > 0 and window_start >= total_length:
        # The caller paged past the end; say so instead of returning an
        # empty body with a misleading continuation hint.
        content = "<error>No more content available.</error>"
    else:
        # Always honour start_index, even when the whole page would fit in
        # a single window.
        content = content[window_start:window_end]
        # Only advertise a next page when characters remain after the window.
        if window_end < total_length:
            content += f"\n\n<error>Content truncated. Call the fetch tool with a start_index of {window_end} to get more content.</error>"

    return [TextContent(type="text", text=f"{prefix}Contents of {url}:\n{content}")]
166178

167179
@server.get_prompt()
168180
async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
@@ -172,7 +184,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
172184
url = arguments["url"]
173185

174186
try:
175-
content = await fetch_url(url, user_agent_manual)
187+
content, prefix = await fetch_url(url, user_agent_manual)
176188
# TODO: after SDK bug is addressed, don't catch the exception
177189
except McpError as e:
178190
return GetPromptResult(
@@ -188,7 +200,7 @@ async def get_prompt(name: str, arguments: dict | None) -> GetPromptResult:
188200
description=f"Contents of {url}",
189201
messages=[
190202
PromptMessage(
191-
role="user", content=TextContent(type="text", text=content)
203+
role="user", content=TextContent(type="text", text=prefix + content)
192204
)
193205
],
194206
)

0 commit comments

Comments
 (0)