Skip to content

Commit d1a7cda

Browse files
Anthropic streaming support (#684)
Co-authored-by: sydney-runkle <[email protected]>
1 parent 38e5b16 commit d1a7cda

File tree

2 files changed

+226
-26
lines changed

2 files changed

+226
-26
lines changed

pydantic_ai_slim/pydantic_ai/models/anthropic.py

Lines changed: 79 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -1,21 +1,24 @@
11
from __future__ import annotations as _annotations
22

3-
from collections.abc import AsyncIterator
3+
from collections.abc import AsyncIterable, AsyncIterator
44
from contextlib import asynccontextmanager
55
from dataclasses import dataclass, field
6+
from datetime import datetime, timezone
7+
from json import JSONDecodeError, loads as json_loads
68
from typing import Any, Literal, Union, cast, overload
79

810
from httpx import AsyncClient as AsyncHTTPClient
911
from typing_extensions import assert_never
1012

11-
from .. import usage
13+
from .. import UnexpectedModelBehavior, _utils, usage
1214
from .._utils import guard_tool_call_id as _guard_tool_call_id
1315
from ..messages import (
1416
ArgsDict,
1517
ModelMessage,
1618
ModelRequest,
1719
ModelResponse,
1820
ModelResponsePart,
21+
ModelResponseStreamEvent,
1922
RetryPromptPart,
2023
SystemPromptPart,
2124
TextPart,
@@ -38,11 +41,16 @@
3841
from anthropic.types import (
3942
Message as AnthropicMessage,
4043
MessageParam,
44+
RawContentBlockDeltaEvent,
45+
RawContentBlockStartEvent,
46+
RawContentBlockStopEvent,
4147
RawMessageDeltaEvent,
4248
RawMessageStartEvent,
49+
RawMessageStopEvent,
4350
RawMessageStreamEvent,
4451
TextBlock,
4552
TextBlockParam,
53+
TextDelta,
4654
ToolChoiceParam,
4755
ToolParam,
4856
ToolResultBlockParam,
@@ -234,24 +242,15 @@ def _process_response(self, response: AnthropicMessage) -> ModelResponse:
234242

235243
return ModelResponse(items, model_name=self.model_name)
236244

237-
@staticmethod
238-
async def _process_streamed_response(response: AsyncStream[RawMessageStreamEvent]) -> StreamedResponse:
239-
"""TODO: Process a streamed response, and prepare a streaming response to return."""
240-
# We don't yet support streamed responses from Anthropic, so we raise an error here for now.
241-
# Streamed responses will be supported in a future release.
242-
243-
raise RuntimeError('Streamed responses are not yet supported for Anthropic models.')
244-
245-
# Should be returning some sort of AnthropicStreamTextResponse or AnthropicStreamedResponse
246-
# depending on the type of chunk we get, but we need to establish how we handle (and when we get) the following:
247-
# RawMessageStartEvent
248-
# RawMessageDeltaEvent
249-
# RawMessageStopEvent
250-
# RawContentBlockStartEvent
251-
# RawContentBlockDeltaEvent
252-
# RawContentBlockDeltaEvent
253-
#
254-
# We might refactor streaming internally before we implement this...
245+
async def _process_streamed_response(self, response: AsyncStream[RawMessageStreamEvent]) -> StreamedResponse:
    """Wrap a raw Anthropic event stream in an `AnthropicStreamedResponse`.

    Peeks at the stream first so an empty stream is reported as a model error
    rather than silently producing an empty response.
    """
    stream = _utils.PeekableAsyncStream(response)
    head = await stream.peek()
    if isinstance(head, _utils.Unset):
        raise UnexpectedModelBehavior('Streamed response ended without content or tool calls')

    # Since Anthropic doesn't provide a timestamp in the message, we'll use the current time
    timestamp = datetime.now(tz=timezone.utc)
    return AnthropicStreamedResponse(_model_name=self.model_name, _response=stream, _timestamp=timestamp)
255254

256255
@staticmethod
257256
def _map_message(messages: list[ModelMessage]) -> tuple[str, list[MessageParam]]:
@@ -347,3 +346,63 @@ def _map_usage(message: AnthropicMessage | RawMessageStreamEvent) -> usage.Usage
347346
response_tokens=response_usage.output_tokens,
348347
total_tokens=(request_tokens or 0) + response_usage.output_tokens,
349348
)
349+
350+
351+
@dataclass
class AnthropicStreamedResponse(StreamedResponse):
    """Implementation of `StreamedResponse` for Anthropic models."""

    # The raw event stream from the Anthropic SDK (already peeked/verified non-empty).
    _response: AsyncIterable[RawMessageStreamEvent]
    # Creation time of the response; Anthropic events carry no timestamp themselves.
    _timestamp: datetime

    async def _get_event_iterator(self) -> AsyncIterator[ModelResponseStreamEvent]:
        """Translate raw Anthropic stream events into `ModelResponseStreamEvent`s.

        Text deltas are forwarded immediately; tool-call argument JSON arrives in
        fragments and is buffered until it parses as a complete JSON document.
        """
        block: TextBlock | ToolUseBlock | None = None
        json_buffer: str = ''

        async for raw_event in self._response:
            self._usage += _map_usage(raw_event)

            if isinstance(raw_event, RawContentBlockStartEvent):
                block = raw_event.content_block
                if isinstance(block, TextBlock) and block.text:
                    yield self._parts_manager.handle_text_delta(vendor_part_id='content', content=block.text)
                elif isinstance(block, ToolUseBlock):
                    tool_event = self._parts_manager.handle_tool_call_delta(
                        vendor_part_id=block.id,
                        tool_name=block.name,
                        args=cast(dict[str, Any], block.input),
                        tool_call_id=block.id,
                    )
                    if tool_event is not None:
                        yield tool_event

            elif isinstance(raw_event, RawContentBlockDeltaEvent):
                delta = raw_event.delta
                if isinstance(delta, TextDelta):
                    yield self._parts_manager.handle_text_delta(vendor_part_id='content', content=delta.text)
                elif block and delta.type == 'input_json_delta' and isinstance(block, ToolUseBlock):
                    # Try to parse the JSON immediately, otherwise cache the value for later.
                    # This handles cases where the JSON is not currently valid but will be
                    # valid once we stream more tokens.
                    try:
                        parsed_args = json_loads(json_buffer + delta.partial_json)
                    except JSONDecodeError:
                        json_buffer += delta.partial_json
                        continue
                    json_buffer = ''

                    # For tool calls, we need to handle partial JSON updates
                    tool_event = self._parts_manager.handle_tool_call_delta(
                        vendor_part_id=block.id,
                        tool_name='',
                        args=parsed_args,
                        tool_call_id=block.id,
                    )
                    if tool_event is not None:
                        yield tool_event

            elif isinstance(raw_event, (RawContentBlockStopEvent, RawMessageStopEvent)):
                # Block is finished; subsequent deltas belong to a new block.
                block = None

    def timestamp(self) -> datetime:
        """Return the (locally generated) creation time of this response."""
        return self._timestamp

tests/models/test_anthropic.py

Lines changed: 147 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
from dataclasses import dataclass, field
55
from datetime import timezone
66
from functools import cached_property
7-
from typing import Any, cast
7+
from typing import Any, TypeVar, cast
88

99
import pytest
1010
from inline_snapshot import snapshot
@@ -25,16 +25,27 @@
2525
from pydantic_ai.settings import ModelSettings
2626

2727
from ..conftest import IsNow, try_import
28+
from .mock_async_stream import MockAsyncStream
2829

2930
with try_import() as imports_successful:
3031
from anthropic import NOT_GIVEN, AsyncAnthropic
3132
from anthropic.types import (
3233
ContentBlock,
34+
InputJSONDelta,
3335
Message as AnthropicMessage,
36+
MessageDeltaUsage,
37+
RawContentBlockDeltaEvent,
38+
RawContentBlockStartEvent,
39+
RawContentBlockStopEvent,
40+
RawMessageDeltaEvent,
41+
RawMessageStartEvent,
42+
RawMessageStopEvent,
43+
RawMessageStreamEvent,
3444
TextBlock,
3545
ToolUseBlock,
3646
Usage as AnthropicUsage,
3747
)
48+
from anthropic.types.raw_message_delta_event import Delta
3849

3950
from pydantic_ai.models.anthropic import AnthropicModel
4051

@@ -43,6 +54,9 @@
4354
pytest.mark.anyio,
4455
]
4556

57+
# Type variable for generic AsyncStream
58+
T = TypeVar('T')
59+
4660

4761
def test_init():
4862
m = AnthropicModel('claude-3-5-haiku-latest', api_key='foobar')
@@ -53,6 +67,7 @@ def test_init():
5367
@dataclass
5468
class MockAnthropic:
5569
messages_: AnthropicMessage | list[AnthropicMessage] | None = None
70+
stream: list[RawMessageStreamEvent] | list[list[RawMessageStreamEvent]] | None = None
5671
index = 0
5772
chat_completion_kwargs: list[dict[str, Any]] = field(default_factory=list)
5873

@@ -64,14 +79,31 @@ def messages(self) -> Any:
6479
def create_mock(cls, messages_: AnthropicMessage | list[AnthropicMessage]) -> AsyncAnthropic:
6580
return cast(AsyncAnthropic, cls(messages_=messages_))
6681

67-
async def messages_create(self, *_args: Any, **kwargs: Any) -> AnthropicMessage:
82+
@classmethod
def create_stream_mock(
    cls, stream: list[RawMessageStreamEvent] | list[list[RawMessageStreamEvent]]
) -> AsyncAnthropic:
    """Build a mock client that replays `stream` when `stream=True` is requested."""
    mock = cls(stream=stream)
    return cast(AsyncAnthropic, mock)
87+
88+
async def messages_create(
    self, *_args: Any, stream: bool = False, **kwargs: Any
) -> AnthropicMessage | MockAsyncStream[RawMessageStreamEvent]:
    """Mock of `AsyncAnthropic.messages.create`.

    Records the kwargs of every call, then returns either the next canned
    message or (when `stream=True`) a `MockAsyncStream` over the canned events.
    A list-of-lists `stream` fixture is indexed per call to simulate multiple
    sequential streaming requests.
    """
    self.chat_completion_kwargs.append({k: v for k, v in kwargs.items() if v is not NOT_GIVEN})

    if stream:
        assert self.stream is not None, 'you can only use `stream=True` if `stream` is provided'
        if isinstance(self.stream[0], list):
            # One event list per call: pick the list for the current call index.
            events = cast(list[RawMessageStreamEvent], self.stream[self.index])
        else:
            events = cast(list[RawMessageStreamEvent], self.stream)
        response = MockAsyncStream(iter(events))
    else:
        assert self.messages_ is not None, '`messages` must be provided'
        if isinstance(self.messages_, list):
            response = self.messages_[self.index]
        else:
            response = self.messages_
    self.index += 1
    return response
77109

@@ -298,3 +330,112 @@ async def get_location(loc_name: str) -> str:
298330
assert get_mock_chat_completion_kwargs(mock_client)[0]['tool_choice']['disable_parallel_tool_use'] == (
299331
not parallel_tool_calls
300332
)
333+
334+
335+
async def test_stream_structured(allow_model_requests: None):
    """Test streaming structured responses with Anthropic's API.

    This test simulates how Anthropic streams tool calls:
    1. Message start
    2. Tool block start with initial data
    3. Tool block delta with additional data
    4. Tool block stop
    5. Update usage
    6. Message stop
    """
    # First request: a tool call whose arguments arrive as split partial-JSON deltas.
    tool_call_stream: list[RawMessageStreamEvent] = [
        RawMessageStartEvent(
            type='message_start',
            message=AnthropicMessage(
                id='msg_123',
                model='claude-3-5-haiku-latest',
                role='assistant',
                type='message',
                content=[],
                stop_reason=None,
                usage=AnthropicUsage(input_tokens=20, output_tokens=0),
            ),
        ),
        # Start tool block with initial data
        RawContentBlockStartEvent(
            type='content_block_start',
            index=0,
            content_block=ToolUseBlock(type='tool_use', id='tool_1', name='my_tool', input={'first': 'One'}),
        ),
        # Add more data through an incomplete JSON delta
        RawContentBlockDeltaEvent(
            type='content_block_delta',
            index=0,
            delta=InputJSONDelta(type='input_json_delta', partial_json='{"second":'),
        ),
        RawContentBlockDeltaEvent(
            type='content_block_delta',
            index=0,
            delta=InputJSONDelta(type='input_json_delta', partial_json='"Two"}'),
        ),
        # Mark tool block as complete
        RawContentBlockStopEvent(type='content_block_stop', index=0),
        # Update the top-level message with usage
        RawMessageDeltaEvent(
            type='message_delta',
            delta=Delta(stop_reason='end_turn'),
            usage=MessageDeltaUsage(output_tokens=5),
        ),
        # Mark message as complete
        RawMessageStopEvent(type='message_stop'),
    ]

    # Second request: the model replies with plain text after the tool returns.
    final_text_stream: list[RawMessageStreamEvent] = [
        RawMessageStartEvent(
            type='message_start',
            message=AnthropicMessage(
                id='msg_123',
                model='claude-3-5-haiku-latest',
                role='assistant',
                type='message',
                content=[],
                stop_reason=None,
                usage=AnthropicUsage(input_tokens=0, output_tokens=0),
            ),
        ),
        # Text block with final data
        RawContentBlockStartEvent(
            type='content_block_start',
            index=0,
            content_block=TextBlock(type='text', text='FINAL_PAYLOAD'),
        ),
        RawContentBlockStopEvent(type='content_block_stop', index=0),
        RawMessageStopEvent(type='message_stop'),
    ]

    mock_client = MockAnthropic.create_stream_mock([tool_call_stream, final_text_stream])
    m = AnthropicModel('claude-3-5-haiku-latest', anthropic_client=mock_client)
    agent = Agent(m)

    tool_called = False

    @agent.tool_plain
    async def my_tool(first: str, second: str) -> int:
        nonlocal tool_called
        tool_called = True
        return len(first) + len(second)

    async with agent.run_stream('') as result:
        assert not result.is_complete
        chunks = [c async for c in result.stream(debounce_by=None)]

    # The tool output doesn't echo any content to the stream, so we only get the final payload once when
    # the block starts and once when it ends.
    assert chunks == snapshot(
        [
            'FINAL_PAYLOAD',
            'FINAL_PAYLOAD',
        ]
    )
    assert result.is_complete
    assert result.usage() == snapshot(Usage(requests=2, request_tokens=20, response_tokens=5, total_tokens=25))
    assert tool_called

0 commit comments

Comments
 (0)