Fix SSE parsing of Unicode line separator characters

pja-ant · pja-ant · commit d1d4fb272e39 · 2025-10-07T16:11:39.000+01:00
diff --git a/src/mcp/client/sse.py b/src/mcp/client/sse.py
@@ -1,4 +1,5 @@
 import logging
+from collections.abc import AsyncIterator
 from contextlib import asynccontextmanager
 from typing import Any
 from urllib.parse import urljoin, urlparse
@@ -7,7 +8,8 @@
 import httpx
 from anyio.abc import TaskStatus
 from anyio.streams.memory import MemoryObjectReceiveStream, MemoryObjectSendStream
-from httpx_sse import aconnect_sse
+from httpx_sse import EventSource, ServerSentEvent, aconnect_sse
+from httpx_sse._decoders import SSEDecoder
 
 import mcp.types as types
 from mcp.shared._httpx_utils import McpHttpClientFactory, create_mcp_http_client
@@ -18,6 +20,43 @@
 
 def remove_request_params(url: str) -> str:
     return urljoin(url, urlparse(url).path)
+        
+async def compliant_aiter_sse(event_source: EventSource) -> AsyncIterator[ServerSentEvent]:
+    """
+    Iterate over the server-sent events carried by ``event_source``.
+
+    Replacement for ``event_source.aiter_sse()`` that works around an httpx issue where
+    U+2028 and U+2029 are incorrectly treated as newlines, breaking SSE stream parsing.
+    The raw byte stream is read directly and split into lines only on the terminators
+    the SSE specification defines: CRLF, LF and a lone CR.
+    https://html.spec.whatwg.org/multipage/server-sent-events.html#parsing-an-event-stream
+
+    NOTE(review): ``EventSource.aiter_sse()`` appears to validate the response
+    Content-Type before iterating; this replacement does not - confirm whether that
+    check still needs to happen somewhere.
+
+    Args:
+        event_source: The EventSource whose response body is parsed
+
+    Yields:
+        ServerSentEvent objects parsed from the stream
+    """
+    decoder = SSEDecoder()
+    buffer = b""
+
+    async for chunk in event_source.response.aiter_bytes():
+        buffer += chunk
+
+        # A trailing CR may be the first half of a CRLF pair whose LF arrives in the
+        # next chunk, so hold it back until we know what follows it.
+        held_cr = b"\r" if buffer.endswith(b"\r") else b""
+        complete = buffer[: len(buffer) - len(held_cr)]
+
+        # Normalise the three legal terminators to LF and split once per chunk.
+        # The last element is the still-unterminated tail and stays buffered.
+        *lines, tail = complete.replace(b"\r\n", b"\n").replace(b"\r", b"\n").split(b"\n")
+        buffer = tail + held_cr
+
+        for line_bytes in lines:
+            # Decoding per line is safe even when a chunk boundary cut a character in
+            # half: the bytes 0x0A/0x0D never occur inside a multi-byte UTF-8 sequence,
+            # so every complete line holds only whole characters from a valid stream.
+            sse = decoder.decode(line_bytes.decode("utf-8", errors="replace"))
+            if sse is not None:
+                yield sse
+
+    # Process any remaining data in buffer. It holds no LF at this point; a held-back
+    # CR is a line terminator in its own right and is dropped before decoding.
+    if buffer:
+        line = buffer.removesuffix(b"\r").decode("utf-8", errors="replace")
+        sse = decoder.decode(line)
+        if sse is not None:
+            yield sse
 
 
 @asynccontextmanager
@@ -69,7 +108,8 @@ async def sse_reader(
                         task_status: TaskStatus[str] = anyio.TASK_STATUS_IGNORED,
                     ):
                         try:
-                            async for sse in event_source.aiter_sse():
+                            # Use our compliant SSE iterator to handle Unicode correctly (issue #1356)
+                            async for sse in compliant_aiter_sse(event_source):
                                 logger.debug(f"Received SSE event: {sse.event}")
                                 match sse.event:
                                     case "endpoint":
diff --git a/tests/client/test_sse_unicode.py b/tests/client/test_sse_unicode.py
@@ -0,0 +1,139 @@
+"""Test for SSE client Unicode handling."""
+
+from collections.abc import AsyncIterator
+from unittest.mock import AsyncMock, MagicMock
+
+import pytest
+from httpx_sse import EventSource
+
+from mcp.client.sse import compliant_aiter_sse
+
+pytestmark = pytest.mark.anyio
+
+
+def create_mock_event_source(data_chunks: list[bytes]) -> EventSource:
+    """Build an EventSource stand-in whose response streams ``data_chunks`` in order."""
+
+    async def replay_chunks() -> AsyncIterator[bytes]:
+        for piece in data_chunks:
+            yield piece
+
+    # Only ``response.aiter_bytes`` is given real behaviour; everything else on the
+    # response is left to AsyncMock.
+    fake_response = AsyncMock()
+    fake_response.aiter_bytes = replay_chunks
+
+    fake_source = MagicMock(spec=EventSource)
+    fake_source.response = fake_response
+    return fake_source
+
+
+async def test_compliant_aiter_sse_handles_unicode_line_separators():
+    """U+2028 inside a data field must be preserved rather than splitting the line."""
+    # Wire format being simulated:
+    #   event: message\ndata: {"text":"Hello\u2028World"}\n\n
+    # chunked so that the separator arrives in a chunk of its own.
+    line_separator_utf8 = b"\xe2\x80\xa8"  # UTF-8 encoding of U+2028
+    chunks = [
+        b"event: message\n",
+        b'data: {"text":"Hello',
+        line_separator_utf8,
+        b'World"}\n',
+        b"\n",
+    ]
+    source = create_mock_event_source(chunks)
+
+    received = [sse async for sse in compliant_aiter_sse(source)]
+
+    # Exactly one message event, with the separator intact inside its payload.
+    assert len(received) == 1
+    message = received[0]
+    assert message.event == "message"
+    assert "\u2028" in message.data
+    assert message.data == '{"text":"Hello\u2028World"}'
+
+
+async def test_compliant_aiter_sse_handles_paragraph_separator():
+    """U+2029 (PARAGRAPH SEPARATOR) inside a data field is kept, not read as a newline."""
+    paragraph_separator_utf8 = b"\xe2\x80\xa9"  # UTF-8 encoding of U+2029
+    chunks = [
+        b"event: test\ndata: Line1",
+        paragraph_separator_utf8,
+        b"Line2\n\n",
+    ]
+    source = create_mock_event_source(chunks)
+
+    received = [sse async for sse in compliant_aiter_sse(source)]
+
+    assert len(received) == 1
+    only_event = received[0]
+    assert only_event.event == "test"
+    # The separator must still be part of the single data line.
+    assert "\u2029" in only_event.data
+    assert only_event.data == "Line1\u2029Line2"
+
+
+async def test_compliant_aiter_sse_handles_crlf():
+    """A stream using \\r\\n line endings parses into a single clean event."""
+    crlf_chunks = [
+        b"event: message\r\n",
+        b"data: test data\r\n",
+        b"\r\n",  # blank line that terminates the event
+    ]
+    source = create_mock_event_source(crlf_chunks)
+
+    received = [sse async for sse in compliant_aiter_sse(source)]
+
+    assert len(received) == 1
+    parsed = received[0]
+    assert parsed.event == "message"
+    assert parsed.data == "test data"
+
+
+async def test_compliant_aiter_sse_handles_split_utf8():
+    """A multi-byte UTF-8 character cut in two by a chunk boundary is reassembled."""
+    # 🎉 encodes to four bytes (\xf0\x9f\x8e\x89); deliver them as two halves.
+    emoji_bytes = "🎉".encode("utf-8")
+    first_half, second_half = emoji_bytes[:2], emoji_bytes[2:]
+    chunks = [
+        b"event: message\n",
+        b"data: Party ",
+        first_half,
+        second_half,
+        b" time!\n\n",
+    ]
+    source = create_mock_event_source(chunks)
+
+    received = [sse async for sse in compliant_aiter_sse(source)]
+
+    assert len(received) == 1
+    parsed = received[0]
+    assert parsed.event == "message"
+    assert parsed.data == "Party 🎉 time!"
+
+
+async def test_compliant_aiter_sse_handles_multiple_events():
+    """Several events in one stream, two of them carrying U+2028/U+2029, all parse."""
+    chunks = [
+        b"event: first\ndata: Hello\xe2\x80\xa8World\n\n",
+        b"event: second\ndata: Test\xe2\x80\xa9Data\n\n",
+        b"data: No event name\n\n",
+    ]
+    source = create_mock_event_source(chunks)
+
+    received = [sse async for sse in compliant_aiter_sse(source)]
+
+    assert len(received) == 3
+    first, second, unnamed = received[0], received[1], received[2]
+
+    assert first.event == "first"
+    assert "\u2028" in first.data
+
+    assert second.event == "second"
+    assert "\u2029" in second.data
+
+    # An event without an explicit "event:" field is reported as "message".
+    assert unnamed.event == "message"
+    assert unnamed.data == "No event name"
diff --git a/tests/issues/test_1356_sse_parsing_line_separator.py b/tests/issues/test_1356_sse_parsing_line_separator.py
@@ -0,0 +1,161 @@
+"""Test for issue #1356: SSE parsing fails with Unicode line separator characters."""
+
+import multiprocessing
+import socket
+import time
+from collections.abc import Generator
+from typing import Any
+
+import anyio
+import pytest
+import uvicorn
+from starlette.applications import Starlette
+from starlette.requests import Request
+from starlette.responses import Response
+from starlette.routing import Mount, Route
+
+from mcp.client.session import ClientSession
+from mcp.client.sse import sse_client
+from mcp.server import Server
+from mcp.server.sse import SseServerTransport
+from mcp.server.transport_security import TransportSecuritySettings
+from mcp.shared.exceptions import McpError
+from mcp.types import TextContent, Tool
+
+pytestmark = pytest.mark.anyio
+
+
+class ProblematicUnicodeServer(Server):
+    """MCP test server exposing one tool whose result text contains U+2028."""
+
+    def __init__(self):
+        super().__init__("ProblematicUnicodeServer")
+
+        @self.list_tools()
+        async def handle_list_tools() -> list[Tool]:
+            # The single advertised tool takes no arguments.
+            unicode_tool = Tool(
+                name="get_problematic_unicode",
+                description="Returns text with problematic Unicode character U+2028",
+                inputSchema={"type": "object", "properties": {}},
+            )
+            return [unicode_tool]
+
+        @self.call_tool()
+        async def handle_call_tool(name: str, args: dict[str, Any]) -> list[TextContent]:
+            if name != "get_problematic_unicode":
+                return [TextContent(type="text", text=f"Unknown tool: {name}")]
+
+            # U+2028 (LINE SEPARATOR) is a valid Unicode character, but a parser that
+            # treats it as a line break will cut the JSON payload in two.
+            problematic_text = "This text contains a line separator\u2028character that may break JSON parsing"
+            return [TextContent(type="text", text=problematic_text)]
+
+
+def make_problematic_server_app() -> Starlette:
+    """Assemble a Starlette app that serves ProblematicUnicodeServer over SSE."""
+    sse = SseServerTransport(
+        "/messages/",
+        security_settings=TransportSecuritySettings(
+            allowed_hosts=["127.0.0.1:*", "localhost:*"],
+            allowed_origins=["http://127.0.0.1:*", "http://localhost:*"],
+        ),
+    )
+    server = ProblematicUnicodeServer()
+
+    async def handle_sse(request: Request) -> Response:
+        # Run the MCP server over the stream pair for as long as the SSE connection lives.
+        async with sse.connect_sse(request.scope, request.receive, request._send) as (read_stream, write_stream):
+            await server.run(read_stream, write_stream, server.create_initialization_options())
+        return Response()
+
+    routes = [
+        Route("/sse", endpoint=handle_sse),
+        Mount("/messages/", app=sse.handle_post_message),
+    ]
+    return Starlette(routes=routes)
+
+
+def run_problematic_server(server_port: int) -> None:
+    """Serve the problematic Unicode test app with uvicorn on 127.0.0.1:``server_port``."""
+    config = uvicorn.Config(
+        app=make_problematic_server_app(),
+        host="127.0.0.1",
+        port=server_port,
+        log_level="error",
+    )
+    uvicorn.Server(config=config).run()
+
+
+@pytest.fixture
+def problematic_server_port() -> int:
+    """Return a port number the OS reported as free on 127.0.0.1."""
+    probe = socket.socket()
+    with probe:
+        # Binding to port 0 lets the OS choose; the socket is closed again on exit.
+        probe.bind(("127.0.0.1", 0))
+        _host, port = probe.getsockname()
+        return port
+
+
+@pytest.fixture
+def problematic_server(problematic_server_port: int) -> Generator[str, None, None]:
+    """Start the problematic Unicode test server in a separate process.
+
+    Yields the server's base URL once a TCP connection to it succeeds.
+
+    Raises:
+        RuntimeError: if the server does not accept connections after all attempts.
+    """
+    proc = multiprocessing.Process(
+        target=run_problematic_server, kwargs={"server_port": problematic_server_port}, daemon=True
+    )
+    proc.start()
+
+    # The try/finally covers the startup wait as well as the test itself, so the
+    # child process is also reaped when the server never comes up.
+    try:
+        # Wait for server to be running
+        max_attempts = 20
+        for _ in range(max_attempts):
+            try:
+                with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
+                    s.connect(("127.0.0.1", problematic_server_port))
+                break
+            except ConnectionRefusedError:
+                time.sleep(0.1)
+        else:
+            raise RuntimeError(f"Server failed to start after {max_attempts} attempts")
+
+        yield f"http://127.0.0.1:{problematic_server_port}"
+    finally:
+        # Clean up
+        proc.kill()
+        proc.join(timeout=2)
+
+
+async def test_json_parsing_with_problematic_unicode(problematic_server: str) -> None:
+    """Test that special Unicode characters like U+2028 are handled properly.
+
+    This test reproduces issue #1356 where special Unicode characters
+    cause JSON parsing to fail and the raw exception is sent to the stream,
+    preventing proper error handling. The tool call must finish within the
+    timeout and return the text unchanged.
+    """
+    # Connect to the server using SSE client
+    async with sse_client(problematic_server + "/sse") as streams:
+        async with ClientSession(*streams) as session:
+            # Initialize the connection
+            result = await session.initialize()
+            assert result.serverInfo.name == "ProblematicUnicodeServer"
+
+            # Call the tool that returns problematic Unicode; this should succeed and not hang.
+            # The try wraps the whole fail_after block because anyio raises TimeoutError
+            # when that context manager exits, not inside its body. Only the call itself
+            # is guarded, so assertion failures below are reported as themselves.
+            try:
+                with anyio.fail_after(5):  # 5 second timeout
+                    response = await session.call_tool("get_problematic_unicode", {})
+            except McpError:
+                pytest.fail("Unexpected error with tool call")
+            except TimeoutError:
+                # If we timeout, the issue is confirmed - the client hangs
+                pytest.fail("Client hangs when handling problematic Unicode (issue #1356 confirmed)")
+            except Exception as e:
+                # We should not get raw exceptions - they should be wrapped as McpError
+                pytest.fail(f"Got raw exception instead of McpError: {type(e).__name__}: {e}")
+
+            # If we get here, the Unicode was handled properly
+            assert len(response.content) == 1
+            text_content = response.content[0]
+            assert hasattr(text_content, "text"), f"Response doesn't have text: {text_content}"
+
+            expected = "This text contains a line separator\u2028character that may break JSON parsing"
+            assert text_content.text == expected, f"Expected: {expected!r}, Got: {text_content.text!r}"