Skip to content

Commit 014dea2

Browse files
habemaseratch
andauthored
Enhance exception handling in MCP server initialization and cleanup (#2268)
Co-authored-by: Kazuhiro Sera <[email protected]>
1 parent 8455af0 commit 014dea2

File tree

2 files changed

+186
-21
lines changed

2 files changed

+186
-21
lines changed

src/agents/mcp/server.py

Lines changed: 179 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -3,12 +3,17 @@
33
import abc
44
import asyncio
55
import inspect
6+
import sys
67
from collections.abc import Awaitable
78
from contextlib import AbstractAsyncContextManager, AsyncExitStack
89
from datetime import timedelta
910
from pathlib import Path
1011
from typing import TYPE_CHECKING, Any, Callable, Literal, TypeVar
1112

13+
import httpx
14+
15+
if sys.version_info < (3, 11):
16+
from exceptiongroup import BaseExceptionGroup # pyright: ignore[reportMissingImports]
1217
from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
1318
from mcp import ClientSession, StdioServerParameters, Tool as MCPTool, stdio_client
1419
from mcp.client.session import MessageHandlerFnT
@@ -251,6 +256,35 @@ def invalidate_tools_cache(self):
251256
"""Invalidate the tools cache."""
252257
self._cache_dirty = True
253258

259+
def _extract_http_error_from_exception(self, e: Exception) -> Exception | None:
260+
"""Extract HTTP error from exception or ExceptionGroup."""
261+
if isinstance(e, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)):
262+
return e
263+
264+
# Check if it's an ExceptionGroup containing HTTP errors
265+
if isinstance(e, BaseExceptionGroup):
266+
for exc in e.exceptions:
267+
if isinstance(
268+
exc, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)
269+
):
270+
return exc
271+
272+
return None
273+
274+
def _raise_user_error_for_http_error(self, http_error: Exception) -> None:
275+
"""Raise appropriate UserError for HTTP error."""
276+
error_message = f"Failed to connect to MCP server '{self.name}': "
277+
if isinstance(http_error, httpx.HTTPStatusError):
278+
error_message += f"HTTP error {http_error.response.status_code} ({http_error.response.reason_phrase})" # noqa: E501
279+
280+
elif isinstance(http_error, httpx.ConnectError):
281+
error_message += "Could not reach the server."
282+
283+
elif isinstance(http_error, httpx.TimeoutException):
284+
error_message += "Connection timeout."
285+
286+
raise UserError(error_message) from http_error
287+
254288
async def _run_with_retries(self, func: Callable[[], Awaitable[T]]) -> T:
255289
attempts = 0
256290
while True:
@@ -265,6 +299,7 @@ async def _run_with_retries(self, func: Callable[[], Awaitable[T]]) -> T:
265299

266300
async def connect(self):
267301
"""Connect to the server."""
302+
connection_succeeded = False
268303
try:
269304
transport = await self.exit_stack.enter_async_context(self.create_streams())
270305
# streamablehttp_client returns (read, write, get_session_id)
@@ -285,10 +320,49 @@ async def connect(self):
285320
server_result = await session.initialize()
286321
self.server_initialize_result = server_result
287322
self.session = session
323+
connection_succeeded = True
288324
except Exception as e:
289-
logger.error(f"Error initializing MCP server: {e}")
290-
await self.cleanup()
325+
# Try to extract HTTP error from exception or ExceptionGroup
326+
http_error = self._extract_http_error_from_exception(e)
327+
if http_error:
328+
self._raise_user_error_for_http_error(http_error)
329+
330+
# For CancelledError, preserve cancellation semantics - don't wrap it.
331+
# If it's masking an HTTP error, cleanup() will extract and raise UserError.
332+
if isinstance(e, asyncio.CancelledError):
333+
raise
334+
335+
# For HTTP-related errors, wrap them
336+
if isinstance(e, (httpx.HTTPStatusError, httpx.ConnectError, httpx.TimeoutException)):
337+
self._raise_user_error_for_http_error(e)
338+
339+
# For other errors, re-raise as-is (don't wrap non-HTTP errors)
291340
raise
341+
finally:
342+
# Always attempt cleanup on error, but suppress cleanup errors that mask the original
343+
if not connection_succeeded:
344+
try:
345+
await self.cleanup()
346+
except UserError:
347+
# Re-raise UserError from cleanup (contains the real HTTP error)
348+
raise
349+
except Exception as cleanup_error:
350+
# Suppress RuntimeError about cancel scopes during cleanup - this is a known
351+
# issue with the MCP library's async generator cleanup and shouldn't mask the
352+
# original error
353+
if isinstance(cleanup_error, RuntimeError) and "cancel scope" in str(
354+
cleanup_error
355+
):
356+
logger.debug(
357+
f"Ignoring cancel scope error during cleanup of MCP server "
358+
f"'{self.name}': {cleanup_error}"
359+
)
360+
else:
361+
# Log other cleanup errors but don't raise - original error is more
362+
# important
363+
logger.warning(
364+
f"Error during cleanup of MCP server '{self.name}': {cleanup_error}"
365+
)
292366

293367
async def list_tools(
294368
self,
@@ -301,21 +375,32 @@ async def list_tools(
301375
session = self.session
302376
assert session is not None
303377

304-
# Return from cache if caching is enabled, we have tools, and the cache is not dirty
305-
if self.cache_tools_list and not self._cache_dirty and self._tools_list:
306-
tools = self._tools_list
307-
else:
308-
# Fetch the tools from the server
309-
result = await self._run_with_retries(lambda: session.list_tools())
310-
self._tools_list = result.tools
311-
self._cache_dirty = False
312-
tools = self._tools_list
313-
314-
# Filter tools based on tool_filter
315-
filtered_tools = tools
316-
if self.tool_filter is not None:
317-
filtered_tools = await self._apply_tool_filter(filtered_tools, run_context, agent)
318-
return filtered_tools
378+
try:
379+
# Return from cache if caching is enabled, we have tools, and the cache is not dirty
380+
if self.cache_tools_list and not self._cache_dirty and self._tools_list:
381+
tools = self._tools_list
382+
else:
383+
# Fetch the tools from the server
384+
result = await self._run_with_retries(lambda: session.list_tools())
385+
self._tools_list = result.tools
386+
self._cache_dirty = False
387+
tools = self._tools_list
388+
389+
# Filter tools based on tool_filter
390+
filtered_tools = tools
391+
if self.tool_filter is not None:
392+
filtered_tools = await self._apply_tool_filter(filtered_tools, run_context, agent)
393+
return filtered_tools
394+
except httpx.HTTPStatusError as e:
395+
status_code = e.response.status_code
396+
raise UserError(
397+
f"Failed to list tools from MCP server '{self.name}': HTTP error {status_code}"
398+
) from e
399+
except httpx.ConnectError as e:
400+
raise UserError(
401+
f"Failed to list tools from MCP server '{self.name}': Connection lost. "
402+
f"The server may have disconnected."
403+
) from e
319404

320405
async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> CallToolResult:
321406
"""Invoke a tool on the server."""
@@ -324,7 +409,19 @@ async def call_tool(self, tool_name: str, arguments: dict[str, Any] | None) -> C
324409
session = self.session
325410
assert session is not None
326411

327-
return await self._run_with_retries(lambda: session.call_tool(tool_name, arguments))
412+
try:
413+
return await self._run_with_retries(lambda: session.call_tool(tool_name, arguments))
414+
except httpx.HTTPStatusError as e:
415+
status_code = e.response.status_code
416+
raise UserError(
417+
f"Failed to call tool '{tool_name}' on MCP server '{self.name}': "
418+
f"HTTP error {status_code}"
419+
) from e
420+
except httpx.ConnectError as e:
421+
raise UserError(
422+
f"Failed to call tool '{tool_name}' on MCP server '{self.name}': Connection lost. "
423+
f"The server may have disconnected."
424+
) from e
328425

329426
async def list_prompts(
330427
self,
@@ -347,10 +444,73 @@ async def get_prompt(
347444
async def cleanup(self):
348445
"""Cleanup the server."""
349446
async with self._cleanup_lock:
447+
# Only raise HTTP errors if we're cleaning up after a failed connection.
448+
# During normal teardown (via __aexit__), log but don't raise to avoid
449+
# masking the original exception.
450+
is_failed_connection_cleanup = self.session is None
451+
350452
try:
351453
await self.exit_stack.aclose()
454+
except BaseExceptionGroup as eg:
455+
# Extract HTTP errors from ExceptionGroup raised during cleanup
456+
# This happens when background tasks fail (e.g., HTTP errors)
457+
http_error = None
458+
connect_error = None
459+
timeout_error = None
460+
error_message = f"Failed to connect to MCP server '{self.name}': "
461+
462+
for exc in eg.exceptions:
463+
if isinstance(exc, httpx.HTTPStatusError):
464+
http_error = exc
465+
elif isinstance(exc, httpx.ConnectError):
466+
connect_error = exc
467+
elif isinstance(exc, httpx.TimeoutException):
468+
timeout_error = exc
469+
470+
# Only raise HTTP errors if we're cleaning up after a failed connection.
471+
# During normal teardown, log them instead.
472+
if http_error:
473+
if is_failed_connection_cleanup:
474+
error_message += f"HTTP error {http_error.response.status_code} ({http_error.response.reason_phrase})" # noqa: E501
475+
raise UserError(error_message) from http_error
476+
else:
477+
# Normal teardown - log but don't raise
478+
logger.warning(
479+
f"HTTP error during cleanup of MCP server '{self.name}': {http_error}"
480+
)
481+
elif connect_error:
482+
if is_failed_connection_cleanup:
483+
error_message += "Could not reach the server."
484+
raise UserError(error_message) from connect_error
485+
else:
486+
logger.warning(
487+
f"Connection error during cleanup of MCP server '{self.name}': {connect_error}" # noqa: E501
488+
)
489+
elif timeout_error:
490+
if is_failed_connection_cleanup:
491+
error_message += "Connection timeout."
492+
raise UserError(error_message) from timeout_error
493+
else:
494+
logger.warning(
495+
f"Timeout error during cleanup of MCP server '{self.name}': {timeout_error}" # noqa: E501
496+
)
497+
else:
498+
# No HTTP error found, suppress RuntimeError about cancel scopes
499+
has_cancel_scope_error = any(
500+
isinstance(exc, RuntimeError) and "cancel scope" in str(exc)
501+
for exc in eg.exceptions
502+
)
503+
if has_cancel_scope_error:
504+
logger.debug(f"Ignoring cancel scope error during cleanup: {eg}")
505+
else:
506+
logger.error(f"Error cleaning up server: {eg}")
352507
except Exception as e:
353-
logger.error(f"Error cleaning up server: {e}")
508+
# Suppress RuntimeError about cancel scopes - this is a known issue with the MCP
509+
# library when background tasks fail during async generator cleanup
510+
if isinstance(e, RuntimeError) and "cancel scope" in str(e):
511+
logger.debug(f"Ignoring cancel scope error during cleanup: {e}")
512+
else:
513+
logger.error(f"Error cleaning up server: {e}")
354514
finally:
355515
self.session = None
356516

src/agents/mcp/util.py

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -201,9 +201,14 @@ async def invoke_mcp_tool(
201201

202202
try:
203203
result = await server.call_tool(tool.name, json_data)
204+
except UserError:
205+
# Re-raise UserError as-is (it already has a good message)
206+
raise
204207
except Exception as e:
205-
logger.error(f"Error invoking MCP tool {tool.name}: {e}")
206-
raise AgentsException(f"Error invoking MCP tool {tool.name}: {e}") from e
208+
logger.error(f"Error invoking MCP tool {tool.name} on server '{server.name}': {e}")
209+
raise AgentsException(
210+
f"Error invoking MCP tool {tool.name} on server '{server.name}': {e}"
211+
) from e
207212

208213
if _debug.DONT_LOG_TOOL_DATA:
209214
logger.debug(f"MCP tool {tool.name} completed.")

0 commit comments

Comments
 (0)