Fix SSE parsing to handle split CRLF across chunks

pja-ant · pja-ant · commit 094c76d42bbe · 2025-10-07T16:11:40.000+01:00
Previously, the SSE parser could incorrectly handle CRLF line endings when
\r appeared at the end of one chunk and \n at the beginning of the next
chunk, potentially treating them as two separate line breaks instead of
a single CRLF sequence.

This fix implements proper CRLF handling by:
- Splitting lines on a bare \r as well as on \n and \r\n
- Tracking when a line was terminated by \r using a skip_leading_lf flag
- Skipping a \n that immediately follows that \r, even when it arrives at
  the start of the next chunk
- Ensuring Unicode line/paragraph separators (U+2028/U+2029) are treated
  as regular content, not line breaks, per the SSE specification

Adds a regression test for the edge case of a CRLF sequence split across
a chunk boundary.
diff --git a/src/mcp/client/sse.py b/src/mcp/client/sse.py
@@ -37,23 +37,47 @@ async def compliant_aiter_sse(event_source: EventSource) -> AsyncIterator[Server
     """
     decoder = SSEDecoder()
     buffer = b""
+
+    # Split on "\r\n", "\r", or "\n" only, no other new line characters.
+    # https://html.spec.whatwg.org/multipage/server-sent-events.html#parsing-an-event-stream
+
+    # Note: this is tricky, because we could have a "\r" at the end of a chunk and not yet
+    # know if the next chunk starts with a "\n" or not.
+    skip_leading_lf = False
     
     async for chunk in event_source.response.aiter_bytes():
         buffer += chunk
         
-        # Split on "\n" only (not U+2028/U+2029 or other anything else)
-        # https://html.spec.whatwg.org/multipage/server-sent-events.html#parsing-an-event-stream
-        while b"\n" in buffer:
-            line_bytes, buffer = buffer.split(b"\n", 1)
-            line = line_bytes.decode('utf-8', errors='replace').rstrip("\r")
+        while len(buffer) != 0:
+            if skip_leading_lf and buffer.startswith(b"\n"):
+                buffer = buffer[1:]
+            skip_leading_lf = False
+
+            # Find first "\r" or "\n"
+            cr = buffer.find(b"\r")
+            lf = buffer.find(b"\n")
+            pos = cr if lf == -1 else lf if cr == -1 else min(cr, lf)
+
+            if pos == -1:
+                # No lines, need another chunk
+                break
+
+            line_bytes = buffer[:pos]
+            buffer = buffer[pos + 1:]
+
+            # If we have a CR first, skip any LF immediately after (may be in next chunk)
+            skip_leading_lf = (pos == cr)
+
+            line = line_bytes.decode('utf-8', errors='replace')
             sse = decoder.decode(line)
             if sse is not None:
                 yield sse
     
     # Process any remaining data in buffer
     if buffer:
         assert b"\n" not in buffer
-        line = buffer.decode('utf-8', errors='replace').rstrip("\r")
+        assert b"\r" not in buffer
+        line = buffer.decode('utf-8', errors='replace')
         sse = decoder.decode(line)
         if sse is not None:
             yield sse
diff --git a/tests/client/test_sse_unicode.py b/tests/client/test_sse_unicode.py
@@ -136,4 +136,26 @@ async def test_compliant_aiter_sse_handles_multiple_events():
     
     # Default event type is "message"
     assert events[2].event == "message"
-    assert events[2].data == "No event name"
+    assert events[2].data == "No event name"
+
+
+async def test_compliant_aiter_sse_handles_split_crlf():
+    """Test that CR at end of chunk followed by LF in next chunk is treated as one newline."""
+    
+    # Test case where \r is at the end of one chunk and \n starts the next
+    # This should be treated as a single CRLF line ending, not two separate newlines
+    test_data = [
+        b'event: test\r',  # \r at end of chunk
+        b'\ndata: line1\r',  # \n at start of next chunk, then another \r at end
+        b'\ndata: line2\n\n',  # \n at start, completing the CRLF
+    ]
+    
+    event_source = create_mock_event_source(test_data)
+    
+    events = [event async for event in compliant_aiter_sse(event_source)]
+    
+    # Should get exactly one event with both data lines
+    assert len(events) == 1
+    assert events[0].event == "test"
+    # The SSE decoder concatenates multiple data fields with \n
+    assert events[0].data == "line1\nline2"