
Commit f0dbcf2

feat: add anthropic_cache_all in AnthropicModel for auto caching all messages
1 parent 359c6d2 commit f0dbcf2

File tree

- docs/models/anthropic.md
- pydantic_ai_slim/pydantic_ai/models/anthropic.py
- tests/models/test_anthropic.py

3 files changed: +185 -2 lines changed

docs/models/anthropic.md

Lines changed: 46 additions & 2 deletions
@@ -80,13 +80,14 @@ agent = Agent(model)

 ## Prompt Caching

-Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides three ways to use prompt caching:
+Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides four ways to use prompt caching:

 1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it
 2. **Cache System Instructions**: Enable the [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] [model setting](../agents.md#model-run-settings) to cache your system prompt
 3. **Cache Tool Definitions**: Enable the [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] [model setting](../agents.md#model-run-settings) to cache your tool definitions
+4. **Cache Entire Conversation**: Enable the [`AnthropicModelSettings.anthropic_cache_all`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_all] [model setting](../agents.md#model-run-settings) to automatically cache the entire conversation history by adding `cache_control` to the last message

-You can combine all three strategies for maximum savings:
+You can combine multiple strategies for maximum savings:

 ```python {test="skip"}
 from pydantic_ai import Agent, CachePoint, RunContext
@@ -124,6 +125,49 @@ async def main():
     print(f'Second: {result2.output}')
 ```

+### Cache Entire Conversation with `anthropic_cache_all`
+
+For long conversations where you want to cache the entire conversation history automatically, use the `anthropic_cache_all` setting:
+
+```python {test="skip"}
+from pydantic_ai import Agent
+from pydantic_ai.models.anthropic import AnthropicModelSettings
+
+agent = Agent(
+    'anthropic:claude-sonnet-4-5',
+    system_prompt='You are a helpful assistant.',
+    model_settings=AnthropicModelSettings(
+        anthropic_cache_all=True,
+        anthropic_cache_instructions=True,
+        anthropic_cache_tool_definitions=True,
+    ),
+)
+
+async def main():
+    # First message - writes to cache
+    result1 = await agent.run('What is machine learning?')
+
+    # Subsequent messages reuse cached conversation history
+    result2 = await agent.run(
+        'Can you explain that in simpler terms?',
+        message_history=result1.all_messages()
+    )
+
+    # Each new message benefits from cached history
+    result3 = await agent.run(
+        'Give me an example.',
+        message_history=result2.all_messages()
+    )
+```
+
+This is particularly useful when:
+
+- Building chatbots or conversational agents with long conversations
+- Iterating on queries while reusing the same context
+- Working with large amounts of conversation history that don't change between requests
+
+### Cache Usage Statistics
+
 Access cache usage statistics via `result.usage()`:

 ```python {test="skip"}

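The `result.usage()` example continues past the end of this hunk. As a rough sketch, inspecting cache statistics after a run with `anthropic_cache_all` enabled could look like the following; the `cache_write_tokens` and `cache_read_tokens` attribute names on the returned usage object are assumptions for illustration, not taken from this diff:

```python {test="skip"}
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='You are a helpful assistant.',
    model_settings=AnthropicModelSettings(anthropic_cache_all=True),
)

async def main():
    result = await agent.run('What is machine learning?')
    usage = result.usage()
    # Attribute names below are assumed for illustration; check the usage
    # object in your installed Pydantic AI version for the exact fields.
    print(usage.cache_write_tokens)
    print(usage.cache_read_tokens)
```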
pydantic_ai_slim/pydantic_ai/models/anthropic.py

Lines changed: 26 additions & 0 deletions
@@ -166,6 +166,15 @@ class AnthropicModelSettings(ModelSettings, total=False):
     See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
     """

+    anthropic_cache_all: bool
+    """Whether to add `cache_control` to the last message in the conversation.
+
+    When enabled, the last message content will have `cache_control` set,
+    allowing Anthropic to cache the entire conversation history and reduce costs.
+    This is particularly useful for long conversations or when reusing context.
+    See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
+    """
+

 @dataclass(init=False)
 class AnthropicModel(Model):
@@ -669,6 +678,23 @@ async def _map_message( # noqa: C901
                 anthropic_messages.append(BetaMessageParam(role='assistant', content=assistant_content_params))
             else:
                 assert_never(m)
+        # Apply cache_control to the last message if anthropic_cache_all is enabled
+        # This allows Anthropic to cache the entire conversation history, reducing costs
+        # for subsequent requests that reuse the same context
+        if anthropic_messages and model_settings.get('anthropic_cache_all'):
+            m = anthropic_messages[-1]
+            content = m['content']
+            if isinstance(content, str):
+                # Convert string content to structured format with cache_control
+                # This typically happens with assistant messages containing plain text
+                m['content'] = [
+                    {'text': content, 'type': 'text', 'cache_control': BetaCacheControlEphemeralParam(type='ephemeral')}
+                ]
+            else:
+                # For structured content (lists), add cache_control to the last block
+                content = cast(list[BetaContentBlockParam], content)
+                self._add_cache_control_to_last_param(content)
+
         if instructions := self._get_instructions(messages, model_request_parameters):
             system_prompt_parts.insert(0, instructions)
         system_prompt = '\n\n'.join(system_prompt_parts)

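As a self-contained illustration of the transformation above, here is a minimal sketch of the same idea on plain dicts (the helper name is hypothetical, not part of the library): string content is wrapped in a single text block carrying `cache_control`, while structured list content gets `cache_control` on its final block.

```python
from typing import Any

def mark_last_message_for_caching(messages: list[dict[str, Any]]) -> None:
    """Hypothetical stand-in for the diff's logic, using plain dicts."""
    if not messages:
        return
    content = messages[-1]['content']
    if isinstance(content, str):
        # Wrap plain string content in a text block so it can carry cache_control
        messages[-1]['content'] = [
            {'type': 'text', 'text': content, 'cache_control': {'type': 'ephemeral'}}
        ]
    else:
        # Structured content: mark the final block
        content[-1]['cache_control'] = {'type': 'ephemeral'}

msgs = [{'role': 'user', 'content': 'test prompt'}]
mark_last_message_for_caching(msgs)
assert msgs[-1]['content'][0]['cache_control'] == {'type': 'ephemeral'}
```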
tests/models/test_anthropic.py

Lines changed: 113 additions & 0 deletions
@@ -549,6 +549,119 @@ def my_tool(value: str) -> str: # pragma: no cover
 )


+async def test_anthropic_cache_all_text_message(allow_model_requests: None):
+    """Test that anthropic_cache_all adds cache_control to last text message."""
+    c = completion_message(
+        [BetaTextBlock(text='Response', type='text')],
+        usage=BetaUsage(input_tokens=10, output_tokens=5),
+    )
+    mock_client = MockAnthropic.create_mock(c)
+    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
+    agent = Agent(
+        m,
+        system_prompt='System prompt',
+        model_settings=AnthropicModelSettings(anthropic_cache_all=True),
+    )
+
+    await agent.run('test prompt')
+
+    # Verify cache_control was added to the last message content
+    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
+    messages = completion_kwargs['messages']
+    assert len(messages) == 1
+    assert messages[0]['role'] == 'user'
+    content = messages[0]['content']
+    assert isinstance(content, list)
+    assert len(content) == 1  # pyright: ignore[reportUnknownArgumentType]
+    assert content[0] == snapshot({'type': 'text', 'text': 'test prompt', 'cache_control': {'type': 'ephemeral'}})
+
+
+async def test_anthropic_cache_all_with_multiple_messages(allow_model_requests: None):
+    """Test that anthropic_cache_all only caches the last message in a conversation."""
+    c1 = completion_message(
+        [BetaTextBlock(text='First response', type='text')],
+        usage=BetaUsage(input_tokens=10, output_tokens=5),
+    )
+    c2 = completion_message(
+        [BetaTextBlock(text='Second response', type='text')],
+        usage=BetaUsage(input_tokens=12, output_tokens=6),
+    )
+    mock_client = MockAnthropic.create_mock([c1, c2])
+    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
+    agent = Agent(
+        m,
+        model_settings=AnthropicModelSettings(anthropic_cache_all=True),
+    )
+
+    # First run
+    result1 = await agent.run('first prompt')
+    assert result1.output == 'First response'
+
+    # Second run with conversation history
+    result2 = await agent.run('second prompt', message_history=result1.all_messages())
+    assert result2.output == 'Second response'
+
+    # Check second call - should have cache_control on last user message only
+    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[1]
+    messages = completion_kwargs['messages']
+
+    # Should have 3 messages: user1, assistant1, user2
+    assert len(messages) == 3
+
+    # First user message should NOT have cache_control
+    first_user_content = messages[0]['content']
+    assert isinstance(first_user_content, list)
+    assert len(first_user_content) == 1  # pyright: ignore[reportUnknownArgumentType]
+    assert 'cache_control' not in first_user_content[0]
+
+    # Second user message should have cache_control
+    last_user_content = messages[2]['content']
+    assert isinstance(last_user_content, list)
+    assert last_user_content[0]['cache_control'] == {'type': 'ephemeral'}
+
+
+async def test_anthropic_cache_all_combined_with_other_cache_settings(allow_model_requests: None):
+    """Test that anthropic_cache_all works together with other cache settings."""
+    c = completion_message(
+        [BetaTextBlock(text='Response', type='text')],
+        usage=BetaUsage(input_tokens=10, output_tokens=5),
+    )
+    mock_client = MockAnthropic.create_mock(c)
+    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
+    agent = Agent(
+        m,
+        system_prompt='System instructions to cache.',
+        model_settings=AnthropicModelSettings(
+            anthropic_cache_tool_definitions=True,
+            anthropic_cache_instructions=True,
+            anthropic_cache_all=True,
+        ),
+    )
+
+    @agent.tool_plain
+    def my_tool(value: str) -> str:  # pragma: no cover
+        return f'Result: {value}'
+
+    await agent.run('test prompt')
+
+    # Verify all cache settings are applied
+    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
+
+    # Check tools have cache_control
+    tools = completion_kwargs['tools']
+    assert tools[0]['cache_control'] == {'type': 'ephemeral'}
+
+    # Check system has cache_control
+    system = completion_kwargs['system']
+    assert system[0]['cache_control'] == {'type': 'ephemeral'}
+
+    # Check last message has cache_control
+    messages = completion_kwargs['messages']
+    last_message_content = messages[-1]['content']
+    assert isinstance(last_message_content, list)
+    assert last_message_content[0]['cache_control'] == {'type': 'ephemeral'}
+
+
 async def test_async_request_text_response(allow_model_requests: None):
     c = completion_message(
         [BetaTextBlock(text='world', type='text')],
