
Commit 8bf1d94

use anthropic_cache_messages
1 parent 0f0dd76 commit 8bf1d94

File tree

3 files changed: +132 −107 lines changed

docs/models/anthropic.md

Lines changed: 58 additions & 16 deletions
````diff
@@ -85,20 +85,20 @@ Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching)
 1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it
 2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
 3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
-4. **Cache All (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_all`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_all] to `True` to automatically cache both system instructions and the last user message
+4. **Cache Last Message (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_messages`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_messages] to `True` to automatically cache the last user message
 
 You can combine multiple strategies for maximum savings:
 
 ```python {test="skip"}
 from pydantic_ai import Agent, CachePoint, RunContext
 from pydantic_ai.models.anthropic import AnthropicModelSettings
 
-# Option 1: Use anthropic_cache_all for convenience (caches system + last message)
+# Option 1: Use anthropic_cache_messages for convenience (caches last message only)
 agent = Agent(
     'anthropic:claude-sonnet-4-5',
     system_prompt='Detailed instructions...',
     model_settings=AnthropicModelSettings(
-        anthropic_cache_all=True,  # Caches both system prompt and last message
+        anthropic_cache_messages=True,  # Caches the last user message
     ),
 )
 
@@ -159,35 +159,77 @@ async def main():
 
 ### Cache Point Limits
 
-Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit:
+Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit to ensure your requests always comply without errors.
 
-- **`anthropic_cache_all`**: Uses 2 cache points (system instructions + last message)
-- **`anthropic_cache_instructions`**: Uses 1 cache point
-- **`anthropic_cache_tool_definitions`**: Uses 1 cache point
-- **`CachePoint` markers**: Use remaining available cache points
+#### How Cache Points Are Allocated
 
-When the total exceeds 4 cache points, Pydantic AI automatically removes cache points from **older messages** (keeping the most recent ones), ensuring your requests always comply with Anthropic's limits without errors.
+Cache points can be placed in three locations:
+
+1. **System Prompt**: Via `anthropic_cache_instructions` setting (adds cache point to last system prompt block)
+2. **Tool Definitions**: Via `anthropic_cache_tool_definitions` setting (adds cache point to last tool definition)
+3. **Messages**: Via `CachePoint` markers or `anthropic_cache_messages` setting (adds cache points to message content)
+
+Each setting uses **at most 1 cache point**, but you can combine them:
 
 ```python {test="skip"}
 from pydantic_ai import Agent, CachePoint
 from pydantic_ai.models.anthropic import AnthropicModelSettings
 
+# Example: Using all 3 cache point sources
+agent = Agent(
+    'anthropic:claude-sonnet-4-5',
+    system_prompt='Detailed instructions...',
+    model_settings=AnthropicModelSettings(
+        anthropic_cache_instructions=True,  # 1 cache point
+        anthropic_cache_tool_definitions=True,  # 1 cache point
+        anthropic_cache_messages=True,  # 1 cache point
+    ),
+)
+
+@agent.tool_plain
+def my_tool() -> str:
+    return 'result'
+
+async def main():
+    # This uses 3 cache points (instructions + tools + last message)
+    # You can add 1 more CachePoint marker before hitting the limit
+    result = await agent.run([
+        'Context', CachePoint(),  # 4th cache point - OK
+        'Question'
+    ])
+```
+
+#### Automatic Cache Point Limiting
+
+When cache points from all sources (settings + `CachePoint` markers) exceed 4, Pydantic AI automatically removes excess cache points from **older message content** (keeping the most recent ones):
+
+```python {test="skip"}
 agent = Agent(
     'anthropic:claude-sonnet-4-5',
     system_prompt='Instructions...',
     model_settings=AnthropicModelSettings(
-        anthropic_cache_all=True,  # Uses 2 cache points
+        anthropic_cache_instructions=True,  # 1 cache point
+        anthropic_cache_tool_definitions=True,  # 1 cache point
     ),
 )
 
+@agent.tool_plain
+def search() -> str:
+    return 'data'
+
 async def main():
-    # Even with multiple CachePoint markers, only 2 more will be kept
-    # (4 total limit - 2 from cache_all = 2 available)
+    # Already using 2 cache points (instructions + tools)
+    # Can add 2 more CachePoint markers (4 total limit)
     result = await agent.run([
-        'Context 1', CachePoint(),  # Will be kept
-        'Context 2', CachePoint(),  # Will be kept
-        'Context 3', CachePoint(),  # Automatically removed (oldest)
+        'Context 1', CachePoint(),  # Oldest - will be removed
+        'Context 2', CachePoint(),  # Will be kept (3rd point)
+        'Context 3', CachePoint(),  # Will be kept (4th point)
         'Question'
     ])
-    print(result.output)
+    # Final cache points: instructions + tools + Context 2 + Context 3 = 4
 ```
+
+**Key Points**:
+- System and tool cache points are **always preserved**
+- Message cache points are removed from oldest to newest when limit is exceeded
+- This ensures critical caching (instructions/tools) is maintained while still benefiting from message-level caching
````
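The documented keep-newest/drop-oldest rule can be sketched standalone with plain dicts standing in for Anthropic content blocks. This is an illustration, not the library's code: the helper name `limit_message_cache_points` and the `reserved` parameter are hypothetical.

```python
# Hypothetical sketch of the documented rule: with `reserved` cache points
# already spent on settings (instructions/tools), only the newest message
# cache points fit in the remaining budget; older ones are stripped.

MAX_CACHE_POINTS = 4

def limit_message_cache_points(messages: list[list[dict]], reserved: int) -> None:
    """Drop 'cache_control' from the oldest blocks once the budget is spent.

    `messages` is a list of message contents, each a list of content blocks.
    """
    budget = MAX_CACHE_POINTS - reserved
    for content in reversed(messages):          # newest message first
        for block in reversed(content):         # newest block first
            if 'cache_control' in block:
                if budget > 0:
                    budget -= 1                 # keep this cache point
                else:
                    del block['cache_control']  # over the limit: remove

# Three markers, 2 points reserved (instructions + tools): the oldest is dropped.
msgs = [
    [{'text': 'Context 1', 'cache_control': {'type': 'ephemeral'}}],
    [{'text': 'Context 2', 'cache_control': {'type': 'ephemeral'}}],
    [{'text': 'Context 3', 'cache_control': {'type': 'ephemeral'}}],
    [{'text': 'Question'}],
]
limit_message_cache_points(msgs, reserved=2)
kept = [b['text'] for m in msgs for b in m if 'cache_control' in b]
print(kept)  # ['Context 2', 'Context 3']
```

This mirrors the example in the docs above: `Context 1` loses its cache point while `Context 2` and `Context 3` keep theirs.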

pydantic_ai_slim/pydantic_ai/models/anthropic.py

Lines changed: 53 additions & 67 deletions
````diff
@@ -169,18 +169,15 @@ class AnthropicModelSettings(ModelSettings, total=False):
     See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
     """
 
-    anthropic_cache_all: bool | Literal['5m', '1h']
-    """Convenience setting to enable caching for both system instructions and the last user message.
+    anthropic_cache_messages: bool | Literal['5m', '1h']
+    """Convenience setting to enable caching for the last user message.
 
-    When enabled, this automatically adds cache points to:
-    1. The last system prompt block (system instructions)
-    2. The last content block in the final user message
-
-    This is equivalent to setting both `anthropic_cache_instructions` and adding a cache point
-    to the last message, but more convenient for common use cases.
+    When enabled, this automatically adds a cache point to the last content block
+    in the final user message, which is useful for caching conversation history
+    or context in multi-turn conversations.
     If `True`, uses TTL='5m'. You can also specify '5m' or '1h' directly.
 
-    Note: Uses 2 of Anthropic's 4 available cache points per request. Any additional CachePoint
+    Note: Uses 1 of Anthropic's 4 available cache points per request. Any additional CachePoint
     markers in messages will be automatically limited to respect the 4-cache-point maximum.
     See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
     """
@@ -349,7 +346,7 @@ async def _messages_create(
         tool_choice = self._infer_tool_choice(tools, model_settings, model_request_parameters)
 
         system_prompt, anthropic_messages = await self._map_message(messages, model_request_parameters, model_settings)
-
+        self._limit_cache_points(system_prompt, anthropic_messages, tools)
         try:
             extra_headers = self._map_extra_headers(beta_features, model_settings)
 
@@ -392,7 +389,7 @@ async def _messages_count_tokens(
         tool_choice = self._infer_tool_choice(tools, model_settings, model_request_parameters)
 
         system_prompt, anthropic_messages = await self._map_message(messages, model_request_parameters, model_settings)
-
+        self._limit_cache_points(system_prompt, anthropic_messages, tools)
        try:
             extra_headers = self._map_extra_headers(beta_features, model_settings)
 
@@ -494,10 +491,7 @@ def _get_tools(
         ]
 
         # Add cache_control to the last tool if enabled
-        if tools and (
-            cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')
-            or model_settings.get('anthropic_cache_all')
-        ):
+        if tools and (cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')):
             # If True, use '5m'; otherwise use the specified ttl value
             ttl: Literal['5m', '1h'] = '5m' if cache_tool_defs is True else cache_tool_defs
             last_tool = tools[-1]
@@ -766,9 +760,9 @@ async def _map_message(  # noqa: C901
             system_prompt_parts.insert(0, instructions)
         system_prompt = '\n\n'.join(system_prompt_parts)
 
-        # Add cache_control to the last message content if anthropic_cache_all is enabled
-        if anthropic_messages and (cache_all := model_settings.get('anthropic_cache_all')):
-            ttl: Literal['5m', '1h'] = '5m' if cache_all is True else cache_all
+        # Add cache_control to the last message content if anthropic_cache_messages is enabled
+        if anthropic_messages and (cache_messages := model_settings.get('anthropic_cache_messages')):
+            ttl: Literal['5m', '1h'] = '5m' if cache_messages is True else cache_messages
             m = anthropic_messages[-1]
             content = m['content']
             if isinstance(content, str):
@@ -785,13 +779,8 @@ async def _map_message(  # noqa: C901
             content = cast(list[BetaContentBlockParam], content)
             self._add_cache_control_to_last_param(content, ttl)
 
-        # Ensure total cache points don't exceed Anthropic's limit of 4
-        self._limit_cache_points(anthropic_messages, model_settings)
         # If anthropic_cache_instructions is enabled, return system prompt as a list with cache_control
-        if system_prompt and (
-            cache_instructions := model_settings.get('anthropic_cache_instructions')
-            or model_settings.get('anthropic_cache_all')
-        ):
+        if system_prompt and (cache_instructions := model_settings.get('anthropic_cache_instructions')):
             # If True, use '5m'; otherwise use the specified ttl value
             ttl: Literal['5m', '1h'] = '5m' if cache_instructions is True else cache_instructions
             system_prompt_blocks = [
@@ -806,60 +795,57 @@ async def _map_message(  # noqa: C901
         return system_prompt, anthropic_messages
 
     @staticmethod
-    def _limit_cache_points(messages: list[BetaMessageParam], model_settings: AnthropicModelSettings) -> None:
-        """Limit the number of cache points in messages to comply with Anthropic's 4-cache-point maximum.
-
-        Anthropic allows a maximum of 4 cache points per request. This method ensures compliance by:
-        1. Calculating how many cache points are already used by system-level settings
-           (anthropic_cache_instructions, anthropic_cache_tool_definitions, anthropic_cache_all)
-        2. Determining how many cache points remain available for message-level caching
-        3. Traversing messages from newest to oldest, keeping only the allowed number of cache points
-        4. Removing cache_control from older cache points that exceed the limit
+    def _limit_cache_points(
+        system_prompt: str | list[BetaTextBlockParam],
+        anthropic_messages: list[BetaMessageParam],
+        tools: list[BetaToolUnionParam],
+    ) -> None:
+        """Limit the number of cache points in the request to Anthropic's maximum.
+
+        Strategy:
+        1. Keep the last cache point in system_prompt and tools (if present)
+        2. Count cache points already used in system_prompt and tools
+        3. Traverse messages from newest to oldest, keeping the most recent cache points
+           until the maximum limit is reached
+        """
+        MAX_CACHE_POINTS = 4
 
-        This prioritizes recent cache points, which are typically more valuable for conversation continuity.
+        # Count existing cache points in system prompt
+        used_cache_points = (
+            sum(1 for block in system_prompt if 'cache_control' in cast(dict[str, Any], block))
+            if isinstance(system_prompt, list)
+            else 0
+        )
 
-        Args:
-            messages: List of message parameters to limit cache points in.
-            model_settings: Model settings containing cache configuration.
-        """
-        # Anthropic's maximum cache points per request
-        max_cache_points = 4
-        used_cache_points = 0
-
-        # Calculate cache points used by system-level settings
-        if model_settings.get('anthropic_cache_all'):
-            # anthropic_cache_all adds cache points for both system instructions and last message
-            used_cache_points += 2
-        else:
-            if model_settings.get('anthropic_cache_instructions'):
-                used_cache_points += 1
-            if model_settings.get('anthropic_cache_tool_definitions'):
-                # Assume used one cache point for tool definitions
+        # Count existing cache points in tools (any tool may have cache_control)
+        # Note: cache_control can be in the middle of tools list if builtin tools are added after
+        for tool in tools:
+            if 'cache_control' in tool:
                 used_cache_points += 1
 
-        # Calculate remaining cache points available for message content
-        keep_cache_points = max_cache_points - used_cache_points
-
-        # Traverse messages from back to front (newest to oldest)
-        remaining_cache_points = keep_cache_points
-        for message in reversed(messages):
+        # Calculate remaining cache points budget for messages
+        remaining_budget = MAX_CACHE_POINTS - used_cache_points
+        if remaining_budget < 0:  # pragma: no cover
+            raise UserError(
+                f'Too many cache points for Anthropic request. '
+                f'System prompt and tool definitions already use {used_cache_points} cache points, '
+                f'which exceeds the maximum of {MAX_CACHE_POINTS}.'
+            )
+        # Remove excess cache points from messages (newest to oldest)
+        for message in reversed(anthropic_messages):
             content = message['content']
-            # Skip if content is a string or None
             if isinstance(content, str):  # pragma: no cover
                 continue
-            content = cast(list[BetaContentBlockParam], content)
-            # Traverse content blocks from back to front within each message
-            for block in reversed(content):
-                # Cast to dict for TypedDict manipulation
+
+            # Process content blocks in reverse order (newest first)
+            for block in reversed(cast(list[BetaContentBlockParam], content)):
                 block_dict = cast(dict[str, Any], block)
 
-                # Check if this block has cache_control
                 if 'cache_control' in block_dict:
-                    if remaining_cache_points > 0:
-                        # Keep this cache point (within limit)
-                        remaining_cache_points -= 1
+                    if remaining_budget > 0:
+                        remaining_budget -= 1
                    else:
-                        # Remove cache_control as we've exceeded the limit
+                        # Exceeded limit, remove this cache point
                        del block_dict['cache_control']
 
     @staticmethod
````
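The budget arithmetic in the reworked `_limit_cache_points` can be sketched outside the class with plain dicts standing in for the SDK's `BetaTextBlockParam` and `BetaToolUnionParam` TypedDicts. The helper name `count_reserved_cache_points` is hypothetical, not part of the library:

```python
# Hedged sketch of the new budget computation: count cache points already
# spent on the system prompt and tool definitions, then derive how many
# remain for message-level CachePoint markers.

MAX_CACHE_POINTS = 4  # Anthropic's per-request limit

def count_reserved_cache_points(system_prompt, tools) -> int:
    """Count cache points already used by the system prompt and tools."""
    used = 0
    if isinstance(system_prompt, list):  # a plain str carries no cache points
        used += sum(1 for block in system_prompt if 'cache_control' in block)
    # cache_control may sit on any tool, not just the last one,
    # e.g. when builtin tools are appended after user-defined ones
    used += sum(1 for tool in tools if 'cache_control' in tool)
    return used

system = [{'type': 'text', 'text': 'Instructions...', 'cache_control': {'type': 'ephemeral'}}]
tools = [{'name': 'search', 'cache_control': {'type': 'ephemeral'}}]

used = count_reserved_cache_points(system, tools)
budget = MAX_CACHE_POINTS - used
if budget < 0:
    # corresponds to the UserError raised in the committed code
    raise ValueError(f'{used} cache points exceed the maximum of {MAX_CACHE_POINTS}')
print(used, budget)  # 2 2
```

With `anthropic_cache_instructions` and `anthropic_cache_tool_definitions` both enabled, two of the four points are reserved, leaving a budget of two for message content, which the newest-to-oldest traversal then enforces.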
