Commit ed5f056

Fix streaming thinking tags split across multiple chunks
1 parent c5b1495 commit ed5f056

4 files changed: +366 −16 lines changed

4 files changed

+366
-16
lines changed

pydantic_ai_slim/pydantic_ai/_parts_manager.py

Lines changed: 154 additions & 15 deletions
@@ -58,6 +58,8 @@ class ModelResponsePartsManager:
     """A list of parts (text or tool calls) that make up the current state of the model's response."""
     _vendor_id_to_part_index: dict[VendorId, int] = field(default_factory=dict, init=False)
     """Maps a vendor's "part" ID (if provided) to the index in `_parts` where that part resides."""
+    _tag_buffer: dict[VendorId, str] = field(default_factory=dict, init=False)
+    """Buffer for accumulating content when thinking tags may be split across chunks."""

     def get_parts(self) -> list[ModelResponsePart]:
         """Return only model response parts that are complete (i.e., not ToolCallPartDelta's).
@@ -82,6 +84,9 @@ def handle_text_delta(
         otherwise, a new TextPart is created. When a non-None ID is specified, the TextPart corresponding
         to that vendor ID is either created or updated.

+        This method now supports thinking tags that may be split across multiple chunks by buffering
+        content until complete tags can be detected.
+
         Args:
             vendor_part_id: The ID the vendor uses to identify this piece
                 of text. If None, a new part will be created unless the latest part is already
@@ -99,6 +104,33 @@ def handle_text_delta(
         Raises:
             UnexpectedModelBehavior: If attempting to apply text content to a part that is not a TextPart.
         """
+        # If thinking tags are enabled, use the buffering logic to handle split tags
+        if thinking_tags:
+            return self._handle_text_delta_with_thinking_tags(
+                vendor_part_id=vendor_part_id,
+                content=content,
+                id=id,
+                thinking_tags=thinking_tags,
+                ignore_leading_whitespace=ignore_leading_whitespace,
+            )
+
+        # Original logic for non-thinking-tag case
+        return self._handle_text_delta_simple(
+            vendor_part_id=vendor_part_id,
+            content=content,
+            id=id,
+            ignore_leading_whitespace=ignore_leading_whitespace,
+        )
+
+    def _handle_text_delta_simple(
+        self,
+        *,
+        vendor_part_id: VendorId | None,
+        content: str,
+        id: str | None = None,
+        ignore_leading_whitespace: bool = False,
+    ) -> ModelResponseStreamEvent | None:
+        """Handle text delta without thinking tag logic."""
         existing_text_part_and_index: tuple[TextPart, int] | None = None

         if vendor_part_id is None:
@@ -113,25 +145,11 @@ def handle_text_delta(
             part_index = self._vendor_id_to_part_index.get(vendor_part_id)
             if part_index is not None:
                 existing_part = self._parts[part_index]
-
-                if thinking_tags and isinstance(existing_part, ThinkingPart):
-                    # We may be building a thinking part instead of a text part if we had previously seen a thinking tag
-                    if content == thinking_tags[1]:
-                        # When we see the thinking end tag, we're done with the thinking part and the next text delta will need a new part
-                        self._vendor_id_to_part_index.pop(vendor_part_id)
-                        return None
-                    else:
-                        return self.handle_thinking_delta(vendor_part_id=vendor_part_id, content=content)
-                elif isinstance(existing_part, TextPart):
+                if isinstance(existing_part, TextPart):
                     existing_text_part_and_index = existing_part, part_index
                 else:
                     raise UnexpectedModelBehavior(f'Cannot apply a text delta to {existing_part=}')

-        if thinking_tags and content == thinking_tags[0]:
-            # When we see a thinking start tag (which is a single token), we'll build a new thinking part instead
-            self._vendor_id_to_part_index.pop(vendor_part_id, None)
-            return self.handle_thinking_delta(vendor_part_id=vendor_part_id, content='')
-
         if existing_text_part_and_index is None:
             # This is a workaround for models that emit `<think>\n</think>\n\n` or an empty text part ahead of tool calls (e.g. Ollama + Qwen3),
             # which we don't want to end up treating as a final result when using `run_stream` with `str` a valid `output_type`.
@@ -152,6 +170,127 @@ def handle_text_delta(
             self._parts[part_index] = part_delta.apply(existing_text_part)
             return PartDeltaEvent(index=part_index, delta=part_delta)

+    def _handle_text_delta_with_thinking_tags(
+        self,
+        *,
+        vendor_part_id: VendorId | None,
+        content: str,
+        id: str | None = None,
+        thinking_tags: tuple[str, str],
+        ignore_leading_whitespace: bool = False,
+    ) -> ModelResponseStreamEvent | None:
+        """Handle text delta with thinking tag detection and buffering for split tags."""
+        start_tag, end_tag = thinking_tags
+
+        # Combine any buffered content with the new content
+        buffered = self._tag_buffer.get(vendor_part_id, '') if vendor_part_id is not None else ''
+        combined_content = buffered + content
+
+        # Check if we're currently building a thinking part
+        part_index = self._vendor_id_to_part_index.get(vendor_part_id) if vendor_part_id is not None else None
+        in_thinking_mode = part_index is not None and isinstance(self._parts[part_index], ThinkingPart)
+
+        if in_thinking_mode:
+            # Look for the end tag
+            if end_tag in combined_content:
+                # Found complete end tag
+                before_end, after_end = combined_content.split(end_tag, 1)
+
+                # Add any content before the end tag to the thinking part
+                last_event = None
+                if before_end:
+                    last_event = self.handle_thinking_delta(vendor_part_id=vendor_part_id, content=before_end)
+
+                # Close the thinking part
+                self._vendor_id_to_part_index.pop(vendor_part_id)
+                self._tag_buffer.pop(vendor_part_id, None)
+
+                # Process any remaining content after the end tag
+                if after_end:
+                    return self._handle_text_delta_with_thinking_tags(
+                        vendor_part_id=vendor_part_id,
+                        content=after_end,
+                        id=id,
+                        thinking_tags=thinking_tags,
+                        ignore_leading_whitespace=ignore_leading_whitespace,
+                    )
+                return last_event
+            elif self._could_be_tag_start(combined_content, end_tag):
+                # Might be start of end tag, buffer it
+                self._tag_buffer[vendor_part_id] = combined_content
+                return None
+            else:
+                # Not an end tag, add to thinking content
+                self._tag_buffer.pop(vendor_part_id, None)
+                return self.handle_thinking_delta(vendor_part_id=vendor_part_id, content=combined_content)
+        else:
+            # Not in thinking mode, look for start tag
+            if start_tag in combined_content:
+                # Found complete start tag
+                before_start, after_start = combined_content.split(start_tag, 1)
+
+                # Handle any text before the start tag
+                text_event = None
+                if before_start:
+                    text_event = self._handle_text_delta_simple(
+                        vendor_part_id=vendor_part_id,
+                        content=before_start,
+                        id=id,
+                        ignore_leading_whitespace=ignore_leading_whitespace,
+                    )
+
+                # Clear any state for this vendor_part_id and start thinking part
+                self._vendor_id_to_part_index.pop(vendor_part_id, None)
+                self._tag_buffer.pop(vendor_part_id, None)
+                thinking_event = self.handle_thinking_delta(vendor_part_id=vendor_part_id, content='')
+
+                # Process any remaining content after the start tag recursively
+                if after_start:
+                    self._handle_text_delta_with_thinking_tags(
+                        vendor_part_id=vendor_part_id,
+                        content=after_start,
+                        id=id,
+                        thinking_tags=thinking_tags,
+                        ignore_leading_whitespace=ignore_leading_whitespace,
+                    )
+                    # Return the first event that was created (text part or thinking part)
+                    return text_event if text_event is not None else thinking_event
+                else:
+                    # No content after start tag
+                    return text_event if text_event is not None else thinking_event
+            elif self._could_be_tag_start(combined_content, start_tag):
+                # Might be start of start tag, buffer it
+                if vendor_part_id is not None:
+                    self._tag_buffer[vendor_part_id] = combined_content
+                return None
+            else:
+                # Not a start tag, process as normal text
+                if vendor_part_id is not None:
+                    self._tag_buffer.pop(vendor_part_id, None)
+                return self._handle_text_delta_simple(
+                    vendor_part_id=vendor_part_id,
+                    content=combined_content,
+                    id=id,
+                    ignore_leading_whitespace=ignore_leading_whitespace,
+                )
+
+    def _could_be_tag_start(self, content: str, tag: str) -> bool:
+        """Check if content could be the beginning of a tag.
+
+        This is used to determine whether we should buffer content or process it immediately.
+        We check if the tag starts with the content, which means the content could be
+        a partial tag that will be completed in a future chunk.
+        """
+        if not content:
+            return False
+        # Check if the tag starts with any suffix of the content
+        # E.g., for content="<thi" and tag="<think>", we check if "<think>" starts with "<thi"
+        for i in range(len(content)):
+            suffix = content[i:]
+            if tag.startswith(suffix):
+                return True
+        return False
+
     def handle_thinking_delta(
         self,
         *,
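The buffering decision above rests on the `_could_be_tag_start` suffix check. As a minimal standalone sketch (hypothetical function name and inputs of my own choosing; not part of the commit), the same logic can be written as:

    def could_be_tag_start(content: str, tag: str) -> bool:
        # True if some trailing suffix of `content` is a prefix of `tag`,
        # i.e. a later chunk could still complete the tag.
        return any(tag.startswith(content[i:]) for i in range(len(content)))

    assert could_be_tag_start('<thi', '<think>')            # partial tag: buffer
    assert could_be_tag_start('text <', '<think>')          # trailing '<': buffer
    assert not could_be_tag_start('plain text', '<think>')  # safe to emit now

For empty input, `any()` over an empty range is False, matching the explicit `return False` in the committed version.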

pyproject.toml

Lines changed: 1 addition & 1 deletion
@@ -311,4 +311,4 @@ skip = '.git*,*.svg,*.lock,*.css,*.yaml'
 check-hidden = true
 # Ignore "formatting" like **L**anguage
 ignore-regex = '\*\*[A-Z]\*\*[a-z]+\b'
-ignore-words-list = 'asend,aci'
+ignore-words-list = 'asend,aci,thi'
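(The `thi` entry is presumably needed because the new comment in `_parts_manager.py` uses `content="<thi"` as its example of a partial tag, and codespell would otherwise flag `thi` as a typo.)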

tests/models/test_openai.py

Lines changed: 35 additions & 0 deletions
@@ -631,6 +631,41 @@ async def test_stream_text_empty_think_tag_and_text_before_tool_call(allow_model
     assert await result.get_output() == snapshot({'first': 'One', 'second': 'Two'})


+async def test_stream_thinking_tags_split_across_chunks(allow_model_requests: None):
+    """Test that thinking tags split across multiple chunks are properly detected and extracted.
+
+    This test addresses issue #3007: https://github.com/pydantic/pydantic-ai/issues/3007
+    where models like Gemini via LiteLLM split thinking tags across multiple streaming chunks.
+    """
+    # Simulate thinking tags split across chunks as reported in the issue
+    stream = [
+        text_chunk('<'),  # Start of start tag
+        text_chunk('think>'),  # Complete start tag
+        text_chunk('\nthinking content'),  # Thinking content
+        text_chunk('</think>'),  # Complete end tag
+        text_chunk('\nNormal content.'),  # Normal text after thinking
+        chunk([]),
+    ]
+    mock_client = MockOpenAI.create_mock_stream(stream)
+    m = OpenAIChatModel('gpt-4o', provider=OpenAIProvider(openai_client=mock_client))
+    agent = Agent(m)
+
+    async with agent.run_stream('') as result:
+        assert not result.is_complete
+        # Should stream the normal content, not the thinking content
+        assert [c async for c in result.stream_text(debounce_by=None)] == snapshot(['\nNormal content.'])
+        assert result.is_complete
+
+        # Verify the message parts are correctly separated
+        msgs = result.new_messages()
+        parts = msgs[-1].parts
+        assert len(parts) == 2
+        assert isinstance(parts[0], ThinkingPart)
+        assert parts[0].content.strip() == 'thinking content'
+        assert isinstance(parts[1], TextPart)
+        assert parts[1].content.strip() == 'Normal content.'
+
+
 async def test_no_delta(allow_model_requests: None):
     stream = [
         chunk([]),
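For intuition about what the test exercises, here is a small driver sketch (mine, not from the commit) that feeds split deltas straight into the parts manager; it assumes the module path `pydantic_ai._parts_manager` implied by the file header above and uses only the `handle_text_delta` signature and `get_parts` method shown in the diff:

    from pydantic_ai._parts_manager import ModelResponsePartsManager

    manager = ModelResponsePartsManager()
    tags = ('<think>', '</think>')
    for delta in ['<', 'think>', 'reasoning', '</th', 'ink>', 'answer']:
        # Returns None while '<' or '</th' sits in the tag buffer; otherwise
        # an event describing the part that was started or updated.
        event = manager.handle_text_delta(
            vendor_part_id='content', content=delta, thinking_tags=tags
        )
        print(repr(delta), '->', event)

    # Expect two completed parts: a ThinkingPart('reasoning') and a TextPart('answer').
    print(manager.get_parts())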
