1- from typing import Optional
1+ from typing import Optional , Tuple
22from urllib .parse import urlparse , urlunparse
33
44import markdownify
1717 INTERNAL_ERROR ,
1818)
1919from protego import Protego
20- from pydantic import BaseModel , Field , ValidationError
20+ from pydantic import BaseModel , Field , AnyUrl , conint
2121
2222DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
2323DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
2424
2525
2626def extract_content_from_html (html : str ) -> str :
27+ """Extract and convert HTML content to Markdown format.
28+
29+ Args:
30+ html: Raw HTML content to process
31+
32+ Returns:
33+ Simplified markdown version of the content
34+ """
2735 ret = readabilipy .simple_json .simple_json_from_html_string (
2836 html , use_readability = True
2937 )
@@ -36,17 +44,25 @@ def extract_content_from_html(html: str) -> str:
3644 return content
3745
3846
def get_robots_txt_url(url: AnyUrl | str) -> str:
    """Derive the robots.txt location for a website URL.

    Args:
        url: Website URL (pydantic AnyUrl or plain string) to get robots.txt for

    Returns:
        URL of the site's robots.txt file
    """
    # Split the URL into its components, coercing AnyUrl to str first.
    components = urlparse(str(url))

    # Keep only scheme and netloc; point the path at /robots.txt and
    # drop params/query/fragment before reassembling.
    return components._replace(
        path="/robots.txt", params="", query="", fragment=""
    ).geturl()
4763
4864
49- async def check_may_autonomously_fetch_url (url : str , user_agent : str ):
65+ async def check_may_autonomously_fetch_url (url : AnyUrl | str , user_agent : str ) -> None :
5066 """
5167 Check if the URL can be fetched by the user agent according to the robots.txt file.
5268 Raises a McpError if not.
@@ -89,7 +105,9 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str):
89105 )
90106
91107
92- async def fetch_url (url : str , user_agent : str , force_raw : bool = False ) -> (str , str ):
108+ async def fetch_url (
109+ url : AnyUrl | str , user_agent : str , force_raw : bool = False
110+ ) -> Tuple [str , str ]:
93111 """
94112 Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
95113 """
@@ -98,7 +116,7 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str,
98116 async with AsyncClient () as client :
99117 try :
100118 response = await client .get (
101- url ,
119+ str ( url ) ,
102120 follow_redirects = True ,
103121 headers = {"User-Agent" : user_agent },
104122 timeout = 30 ,
@@ -128,9 +146,13 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str,
128146
129147
130148class Fetch (BaseModel ):
131- url : str = Field (..., description = "URL to fetch" )
132- max_length : int = Field (5000 , description = "Maximum number of characters to return." )
133- start_index : int = Field (
149+ """Parameters for fetching a URL."""
150+
151+ url : AnyUrl = Field (..., description = "URL to fetch" )
152+ max_length : conint (gt = 0 , lt = 1000000 ) = Field (
153+ 5000 , description = "Maximum number of characters to return."
154+ )
155+ start_index : conint (ge = 0 ) = Field (
134156 0 ,
135157 description = "On return output starting at this character index, useful if a previous fetch was truncated and more context is required." ,
136158 )
@@ -143,6 +165,12 @@ class Fetch(BaseModel):
143165async def serve (
144166 custom_user_agent : Optional [str ] = None , ignore_robots_txt : bool = False
145167) -> None :
168+ """Run the fetch MCP server.
169+
170+ Args:
171+ custom_user_agent: Optional custom User-Agent string to use for requests
172+ ignore_robots_txt: Whether to ignore robots.txt restrictions
173+ """
146174 server = Server ("mcp-fetch" )
147175 user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
148176 user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL
0 commit comments