Commit b6b155d

fix(anthropic): handle partial JSON chunks in streaming responses (BerriAI#17493)
Fixes BerriAI#17473 - Anthropic streaming fails with JSONDecodeError when network fragmentation causes SSE data to arrive in partial chunks.

Changes:
- Add accumulated_json buffer and chunk_type to ModelResponseIterator
- Add _handle_accumulated_json_chunk() to accumulate partial JSON
- Add _parse_sse_data() to handle both complete and partial chunks
- Modify __next__ and __anext__ to use accumulation logic
- Add unit tests for partial chunk handling
1 parent 0f5694c commit b6b155d
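The fix boils down to an accumulate-until-parse buffer: each "data:" payload fragment is appended to a string buffer, the buffer is re-parsed after every fragment, and it is cleared once json.loads succeeds. Below is a minimal standalone sketch of that idea (illustrative only; the JSONAccumulator name is not part of this commit, and the real logic lives in ModelResponseIterator in the diff further down):

import json
from typing import Optional


class JSONAccumulator:
    """Buffers SSE "data:" payload fragments until they parse as one JSON event."""

    def __init__(self) -> None:
        self.buffer = ""

    def feed(self, data_str: str) -> Optional[dict]:
        self.buffer += data_str
        try:
            event = json.loads(self.buffer)
        except json.JSONDecodeError:
            return None  # incomplete fragment: keep accumulating
        self.buffer = ""  # reset after a successful parse
        return event


acc = JSONAccumulator()
assert acc.feed('{"type":"content_block_delta","delta":{"text":"Hel') is None
assert acc.feed('lo"}}') == {"type": "content_block_delta", "delta": {"text": "Hello"}}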

File tree

2 files changed (+214, -62 lines):

- litellm/llms/anthropic/chat/handler.py
- tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_handler.py
litellm/llms/anthropic/chat/handler.py

Lines changed: 142 additions & 62 deletions
@@ -10,6 +10,7 @@
     Callable,
     Dict,
     List,
+    Literal,
     Optional,
     Tuple,
     Union,
@@ -498,6 +499,11 @@ def __init__(
         # Track if we've converted any response_format tools (affects finish_reason)
         self.converted_response_format_tool: bool = False
 
+        # For handling partial JSON chunks from fragmentation
+        # See: https://github.com/BerriAI/litellm/issues/17473
+        self.accumulated_json: str = ""
+        self.chunk_type: Literal["valid_json", "accumulated_json"] = "valid_json"
+
     def check_empty_tool_call_args(self) -> bool:
         """
         Check if the tool call block so far has been an empty string
@@ -866,80 +872,154 @@ def _handle_message_delta(self, chunk: dict) -> Tuple[str, Optional[Usage]]:
         usage = self._handle_usage(anthropic_usage_chunk=message_delta["usage"])
         return finish_reason, usage
 
+    def _handle_accumulated_json_chunk(
+        self, data_str: str
+    ) -> Optional[GenericStreamingChunk]:
+        """
+        Handle partial JSON chunks by accumulating them until valid JSON is received.
+
+        This fixes network fragmentation issues where SSE data chunks may be split
+        across TCP packets. See: https://github.com/BerriAI/litellm/issues/17473
+
+        Args:
+            data_str: The JSON string to parse (without "data:" prefix)
+
+        Returns:
+            GenericStreamingChunk if JSON is complete, None if still accumulating
+        """
+        # Accumulate JSON data
+        self.accumulated_json += data_str
+
+        # Try to parse the accumulated JSON
+        try:
+            data_json = json.loads(self.accumulated_json)
+            self.accumulated_json = ""  # Reset after successful parsing
+            return self.chunk_parser(chunk=data_json)
+        except json.JSONDecodeError:
+            # If it's not valid JSON yet, continue to the next chunk
+            return None
+
+    def _parse_sse_data(self, str_line: str) -> Optional[GenericStreamingChunk]:
+        """
+        Parse SSE data line, handling both complete and partial JSON chunks.
+
+        Args:
+            str_line: The SSE line starting with "data:"
+
+        Returns:
+            GenericStreamingChunk if parsing succeeded, None if accumulating partial JSON
+        """
+        data_str = str_line[5:]  # Remove "data:" prefix
+
+        if self.chunk_type == "accumulated_json":
+            # Already in accumulation mode, keep accumulating
+            return self._handle_accumulated_json_chunk(data_str)
+
+        # Try to parse as valid JSON first
+        try:
+            data_json = json.loads(data_str)
+            return self.chunk_parser(chunk=data_json)
+        except json.JSONDecodeError:
+            # Switch to accumulation mode and start accumulating
+            self.chunk_type = "accumulated_json"
+            return self._handle_accumulated_json_chunk(data_str)
+
     # Sync iterator
     def __iter__(self):
         return self
 
     def __next__(self):
-        try:
-            chunk = self.response_iterator.__next__()
-        except StopIteration:
-            raise StopIteration
-        except ValueError as e:
-            raise RuntimeError(f"Error receiving chunk from stream: {e}")
-
-        try:
-            str_line = chunk
-            if isinstance(chunk, bytes):  # Handle binary data
-                str_line = chunk.decode("utf-8")  # Convert bytes to string
-                index = str_line.find("data:")
-                if index != -1:
-                    str_line = str_line[index:]
-
-            if str_line.startswith("data:"):
-                data_json = json.loads(str_line[5:])
-                return self.chunk_parser(chunk=data_json)
-            else:
-                return GenericStreamingChunk(
-                    text="",
-                    is_finished=False,
-                    finish_reason="",
-                    usage=None,
-                    index=0,
-                    tool_use=None,
-                )
-        except StopIteration:
-            raise StopIteration
-        except ValueError as e:
-            raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
+        while True:
+            try:
+                chunk = self.response_iterator.__next__()
+            except StopIteration:
+                # If we have accumulated JSON when stream ends, try to parse it
+                if self.accumulated_json:
+                    try:
+                        data_json = json.loads(self.accumulated_json)
+                        self.accumulated_json = ""
+                        return self.chunk_parser(chunk=data_json)
+                    except json.JSONDecodeError:
+                        pass
+                raise StopIteration
+            except ValueError as e:
+                raise RuntimeError(f"Error receiving chunk from stream: {e}")
+
+            try:
+                str_line = chunk
+                if isinstance(chunk, bytes):  # Handle binary data
+                    str_line = chunk.decode("utf-8")  # Convert bytes to string
+                    index = str_line.find("data:")
+                    if index != -1:
+                        str_line = str_line[index:]
+
+                if str_line.startswith("data:"):
+                    result = self._parse_sse_data(str_line)
+                    if result is not None:
+                        return result
+                    # If None, continue loop to get more chunks for accumulation
+                else:
+                    return GenericStreamingChunk(
+                        text="",
+                        is_finished=False,
+                        finish_reason="",
+                        usage=None,
+                        index=0,
+                        tool_use=None,
+                    )
+            except StopIteration:
+                raise StopIteration
+            except ValueError as e:
+                raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
 
     # Async iterator
     def __aiter__(self):
         self.async_response_iterator = self.streaming_response.__aiter__()
         return self
 
     async def __anext__(self):
-        try:
-            chunk = await self.async_response_iterator.__anext__()
-        except StopAsyncIteration:
-            raise StopAsyncIteration
-        except ValueError as e:
-            raise RuntimeError(f"Error receiving chunk from stream: {e}")
-
-        try:
-            str_line = chunk
-            if isinstance(chunk, bytes):  # Handle binary data
-                str_line = chunk.decode("utf-8")  # Convert bytes to string
-                index = str_line.find("data:")
-                if index != -1:
-                    str_line = str_line[index:]
-
-            if str_line.startswith("data:"):
-                data_json = json.loads(str_line[5:])
-                return self.chunk_parser(chunk=data_json)
-            else:
-                return GenericStreamingChunk(
-                    text="",
-                    is_finished=False,
-                    finish_reason="",
-                    usage=None,
-                    index=0,
-                    tool_use=None,
-                )
-        except StopAsyncIteration:
-            raise StopAsyncIteration
-        except ValueError as e:
-            raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
+        while True:
+            try:
+                chunk = await self.async_response_iterator.__anext__()
+            except StopAsyncIteration:
+                # If we have accumulated JSON when stream ends, try to parse it
+                if self.accumulated_json:
+                    try:
+                        data_json = json.loads(self.accumulated_json)
+                        self.accumulated_json = ""
+                        return self.chunk_parser(chunk=data_json)
+                    except json.JSONDecodeError:
+                        pass
+                raise StopAsyncIteration
+            except ValueError as e:
+                raise RuntimeError(f"Error receiving chunk from stream: {e}")
+
+            try:
+                str_line = chunk
+                if isinstance(chunk, bytes):  # Handle binary data
+                    str_line = chunk.decode("utf-8")  # Convert bytes to string
+                    index = str_line.find("data:")
+                    if index != -1:
+                        str_line = str_line[index:]
+
+                if str_line.startswith("data:"):
+                    result = self._parse_sse_data(str_line)
+                    if result is not None:
+                        return result
+                    # If None, continue loop to get more chunks for accumulation
+                else:
+                    return GenericStreamingChunk(
+                        text="",
+                        is_finished=False,
+                        finish_reason="",
+                        usage=None,
+                        index=0,
+                        tool_use=None,
+                    )
+            except StopAsyncIteration:
+                raise StopAsyncIteration
+            except ValueError as e:
+                raise RuntimeError(f"Error parsing chunk: {e},\nReceived chunk: {chunk}")
 
     def convert_str_chunk_to_generic_chunk(self, chunk: str) -> ModelResponseStream:
         """

tests/test_litellm/llms/anthropic/chat/test_anthropic_chat_handler.py

Lines changed: 72 additions & 0 deletions
@@ -460,3 +460,75 @@ def test_streaming_chunks_have_stable_ids():
     response_two = iterator.chunk_parser(chunk=second_chunk)
 
     assert response_one.id == response_two.id == iterator.response_id
+
+
+def test_partial_json_chunk_accumulation():
+    """
+    Test that partial JSON chunks are accumulated correctly.
+
+    This tests the fix for https://github.com/BerriAI/litellm/issues/17473
+    where network fragmentation can cause SSE data to arrive in partial chunks.
+    """
+    iterator = ModelResponseIterator(
+        streaming_response=MagicMock(), sync_stream=True, json_mode=False
+    )
+
+    # Simulate a complete JSON chunk being split into two parts
+    partial_chunk_1 = '{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hel'
+    partial_chunk_2 = 'lo"}}'
+
+    # First partial chunk should return None (still accumulating)
+    result1 = iterator._parse_sse_data(f"data:{partial_chunk_1}")
+    assert result1 is None, "First partial chunk should return None while accumulating"
+    assert iterator.chunk_type == "accumulated_json", "Should switch to accumulated_json mode"
+    assert iterator.accumulated_json == partial_chunk_1, "Should have accumulated first part"
+
+    # Second partial chunk should complete the JSON and return a parsed result
+    result2 = iterator._parse_sse_data(f"data:{partial_chunk_2}")
+    assert result2 is not None, "Second chunk should return parsed result"
+    assert iterator.accumulated_json == "", "Buffer should be cleared after successful parse"
+    assert result2.choices[0].delta.content == "Hello", f"Expected 'Hello', got '{result2.choices[0].delta.content}'"
+
+
+def test_complete_json_chunk_no_accumulation():
+    """
+    Test that complete JSON chunks are parsed immediately without accumulation.
+    """
+    iterator = ModelResponseIterator(
+        streaming_response=MagicMock(), sync_stream=True, json_mode=False
+    )
+
+    complete_chunk = '{"type":"content_block_delta","index":0,"delta":{"type":"text_delta","text":"Hello"}}'
+
+    result = iterator._parse_sse_data(f"data:{complete_chunk}")
+    assert result is not None, "Complete chunk should return parsed result immediately"
+    assert iterator.chunk_type == "valid_json", "Should remain in valid_json mode"
+    assert iterator.accumulated_json == "", "Buffer should remain empty"
+    assert result.choices[0].delta.content == "Hello", f"Expected 'Hello', got '{result.choices[0].delta.content}'"
+
+
+def test_multiple_partial_chunks_accumulation():
+    """
+    Test that multiple partial chunks can be accumulated across several iterations.
+    """
+    iterator = ModelResponseIterator(
+        streaming_response=MagicMock(), sync_stream=True, json_mode=False
+    )
+
+    # Split a JSON chunk into three parts
+    part1 = '{"type":"content_block_del'
+    part2 = 'ta","index":0,"delta":{"type":"text_del'
+    part3 = 'ta","text":"Hello"}}'
+
+    result1 = iterator._parse_sse_data(f"data:{part1}")
+    assert result1 is None
+    assert iterator.accumulated_json == part1
+
+    result2 = iterator._parse_sse_data(f"data:{part2}")
+    assert result2 is None
+    assert iterator.accumulated_json == part1 + part2
+
+    result3 = iterator._parse_sse_data(f"data:{part3}")
+    assert result3 is not None
+    assert iterator.accumulated_json == ""
+    assert result3.choices[0].delta.content == "Hello"