1
- from typing import Optional
1
+ from typing import Optional , Tuple
2
2
from urllib .parse import urlparse , urlunparse
3
3
4
4
import markdownify
17
17
INTERNAL_ERROR ,
18
18
)
19
19
from protego import Protego
20
- from pydantic import BaseModel , Field , ValidationError
20
+ from pydantic import BaseModel , Field , AnyUrl , conint
21
21
22
22
DEFAULT_USER_AGENT_AUTONOMOUS = "ModelContextProtocol/1.0 (Autonomous; +https://github.com/modelcontextprotocol/servers)"
23
23
DEFAULT_USER_AGENT_MANUAL = "ModelContextProtocol/1.0 (User-Specified; +https://github.com/modelcontextprotocol/servers)"
24
24
25
25
26
26
def extract_content_from_html (html : str ) -> str :
27
+ """Extract and convert HTML content to Markdown format.
28
+
29
+ Args:
30
+ html: Raw HTML content to process
31
+
32
+ Returns:
33
+ Simplified markdown version of the content
34
+ """
27
35
ret = readabilipy .simple_json .simple_json_from_html_string (
28
36
html , use_readability = True
29
37
)
@@ -36,17 +44,25 @@ def extract_content_from_html(html: str) -> str:
36
44
return content
37
45
38
46
39
def get_robots_txt_url(url: "AnyUrl | str") -> str:
    """Get the robots.txt URL for a given website URL.

    Only the scheme and network location of ``url`` are kept; the path,
    params, query string and fragment are discarded and the path is
    replaced with ``/robots.txt``.

    Args:
        url: Website URL to get robots.txt for. It is converted with
            ``str()`` before parsing, so a URL object is accepted as well
            as a plain string.

    Returns:
        URL of the robots.txt file
    """
    # Parse the URL into components (str() normalises URL objects to text)
    parsed = urlparse(str(url))

    # Reconstruct the base URL with just scheme, netloc, and /robots.txt path
    robots_url = urlunparse((parsed.scheme, parsed.netloc, "/robots.txt", "", "", ""))

    return robots_url
47
63
48
64
49
- async def check_may_autonomously_fetch_url (url : str , user_agent : str ):
65
+ async def check_may_autonomously_fetch_url (url : AnyUrl | str , user_agent : str ) -> None :
50
66
"""
51
67
Check if the URL can be fetched by the user agent according to the robots.txt file.
52
68
Raises a McpError if not.
@@ -89,7 +105,9 @@ async def check_may_autonomously_fetch_url(url: str, user_agent: str):
89
105
)
90
106
91
107
92
- async def fetch_url (url : str , user_agent : str , force_raw : bool = False ) -> (str , str ):
108
+ async def fetch_url (
109
+ url : AnyUrl | str , user_agent : str , force_raw : bool = False
110
+ ) -> Tuple [str , str ]:
93
111
"""
94
112
Fetch the URL and return the content in a form ready for the LLM, as well as a prefix string with status information.
95
113
"""
@@ -98,7 +116,7 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str,
98
116
async with AsyncClient () as client :
99
117
try :
100
118
response = await client .get (
101
- url ,
119
+ str ( url ) ,
102
120
follow_redirects = True ,
103
121
headers = {"User-Agent" : user_agent },
104
122
timeout = 30 ,
@@ -128,9 +146,13 @@ async def fetch_url(url: str, user_agent: str, force_raw: bool = False) -> (str,
128
146
129
147
130
148
class Fetch (BaseModel ):
131
- url : str = Field (..., description = "URL to fetch" )
132
- max_length : int = Field (5000 , description = "Maximum number of characters to return." )
133
- start_index : int = Field (
149
+ """Parameters for fetching a URL."""
150
+
151
+ url : AnyUrl = Field (..., description = "URL to fetch" )
152
+ max_length : conint (gt = 0 , lt = 1000000 ) = Field (
153
+ 5000 , description = "Maximum number of characters to return."
154
+ )
155
+ start_index : conint (ge = 0 ) = Field (
134
156
0 ,
135
157
description = "On return output starting at this character index, useful if a previous fetch was truncated and more context is required." ,
136
158
)
@@ -143,6 +165,12 @@ class Fetch(BaseModel):
143
165
async def serve (
144
166
custom_user_agent : Optional [str ] = None , ignore_robots_txt : bool = False
145
167
) -> None :
168
+ """Run the fetch MCP server.
169
+
170
+ Args:
171
+ custom_user_agent: Optional custom User-Agent string to use for requests
172
+ ignore_robots_txt: Whether to ignore robots.txt restrictions
173
+ """
146
174
server = Server ("mcp-fetch" )
147
175
user_agent_autonomous = custom_user_agent or DEFAULT_USER_AGENT_AUTONOMOUS
148
176
user_agent_manual = custom_user_agent or DEFAULT_USER_AGENT_MANUAL
0 commit comments