50 changes: 48 additions & 2 deletions docs/models/anthropic.md
@@ -80,18 +80,29 @@ agent = Agent(model)

## Prompt Caching

Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides four ways to use prompt caching:

1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it
2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
4. **Cache All (Convenience)**: Set [`AnthropicModelSettings.anthropic_cache_all`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_all] to `True` to automatically cache both system instructions and the last user message

You can combine multiple strategies for maximum savings:

```python {test="skip"}
from pydantic_ai import Agent, CachePoint, RunContext
from pydantic_ai.models.anthropic import AnthropicModelSettings

# Option 1: Use anthropic_cache_all for convenience (caches system + last message)
agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Detailed instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_all=True,  # Caches both system prompt and last message
    ),
)

# Option 2: Fine-grained control with individual settings
agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Detailed instructions...',
@@ -145,3 +156,38 @@ async def main():
    print(f'Cache write tokens: {usage.cache_write_tokens}')
    print(f'Cache read tokens: {usage.cache_read_tokens}')
```

### Cache Point Limits

Anthropic enforces a maximum of 4 cache points per request. Pydantic AI automatically manages this limit:

- **`anthropic_cache_all`**: Uses 2 cache points (system instructions + last message)
- **`anthropic_cache_instructions`**: Uses 1 cache point
- **`anthropic_cache_tool_definitions`**: Uses 1 cache point
- **`CachePoint` markers**: Use remaining available cache points

When the total would exceed 4 cache points, Pydantic AI automatically removes cache points from **older messages** (keeping the most recent ones), so your requests always stay within Anthropic's limit without errors.

```python {test="skip"}
from pydantic_ai import Agent, CachePoint
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_all=True,  # Uses 2 cache points
    ),
)

async def main():
    # Even with multiple CachePoint markers, only 2 more will be kept
    # (4 total limit - 2 from cache_all = 2 available)
    result = await agent.run([
        'Context 1', CachePoint(),  # Automatically removed (oldest)
        'Context 2', CachePoint(),  # Will be kept
        'Context 3', CachePoint(),  # Will be kept
        'Question'
    ])
    print(result.output)
```
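
The same budgeting applies when the fine-grained settings are combined instead of `anthropic_cache_all`. The sketch below is a minimal illustration mirroring the scenario exercised in this PR's tests (it introduces no new API): `anthropic_cache_instructions` and `anthropic_cache_tool_definitions` each take one cache point, leaving two for `CachePoint` markers, so again only the two most recent markers survive.

```python {test="skip"}
from pydantic_ai import Agent, CachePoint
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_instructions=True,  # 1 cache point
        anthropic_cache_tool_definitions=True,  # 1 cache point
    ),
)

async def main():
    # 4 total - 2 from settings = 2 cache points left for CachePoint markers,
    # so the oldest marker below is dropped automatically.
    result = await agent.run([
        'Context 1', CachePoint(),  # Automatically removed (oldest)
        'Context 2', CachePoint(),  # Will be kept
        'Context 3', CachePoint(),  # Will be kept
        'Question'
    ])
    print(result.output)
```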
104 changes: 102 additions & 2 deletions pydantic_ai_slim/pydantic_ai/models/anthropic.py
@@ -169,6 +169,22 @@ class AnthropicModelSettings(ModelSettings, total=False):
    See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
    """

    anthropic_cache_all: bool | Literal['5m', '1h']
    """Convenience setting to enable caching for both system instructions and the last user message.

    When enabled, this automatically adds cache points to:
    1. The last system prompt block (system instructions)
    2. The last content block in the final user message

    This is equivalent to setting both `anthropic_cache_instructions` and adding a cache point
    to the last message, but more convenient for common use cases.
    If `True`, uses TTL='5m'. You can also specify '5m' or '1h' directly.

    Note: Uses 2 of Anthropic's 4 available cache points per request. Any additional CachePoint
    markers in messages will be automatically limited to respect the 4-cache-point maximum.
    See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
    """


@dataclass(init=False)
class AnthropicModel(Model):
@@ -478,7 +494,10 @@ def _get_tools(
        ]

        # Add cache_control to the last tool if enabled
        if tools and (
            cache_tool_defs := model_settings.get('anthropic_cache_tool_definitions')
            or model_settings.get('anthropic_cache_all')
        ):
            # If True, use '5m'; otherwise use the specified ttl value
            ttl: Literal['5m', '1h'] = '5m' if cache_tool_defs is True else cache_tool_defs
            last_tool = tools[-1]
@@ -747,8 +766,32 @@ async def _map_message( # noqa: C901
            system_prompt_parts.insert(0, instructions)
        system_prompt = '\n\n'.join(system_prompt_parts)

        # Add cache_control to the last message content if anthropic_cache_all is enabled
        if anthropic_messages and (cache_all := model_settings.get('anthropic_cache_all')):
            ttl: Literal['5m', '1h'] = '5m' if cache_all is True else cache_all
            m = anthropic_messages[-1]
            content = m['content']
            if isinstance(content, str):
                # Convert string content to list format with cache_control
                m['content'] = [  # pragma: no cover
                    BetaTextBlockParam(
                        text=content,
                        type='text',
                        cache_control=BetaCacheControlEphemeralParam(type='ephemeral', ttl=ttl),
                    )
                ]
            else:
                # Add cache_control to the last content block
                content = cast(list[BetaContentBlockParam], content)
                self._add_cache_control_to_last_param(content, ttl)

        # Ensure total cache points don't exceed Anthropic's limit of 4
        self._limit_cache_points(anthropic_messages, model_settings)
        # If anthropic_cache_instructions is enabled, return system prompt as a list with cache_control
        if system_prompt and (
            cache_instructions := model_settings.get('anthropic_cache_instructions')
            or model_settings.get('anthropic_cache_all')
        ):
            # If True, use '5m'; otherwise use the specified ttl value
            ttl: Literal['5m', '1h'] = '5m' if cache_instructions is True else cache_instructions
            system_prompt_blocks = [
@@ -762,6 +805,63 @@

        return system_prompt, anthropic_messages

    @staticmethod
    def _limit_cache_points(messages: list[BetaMessageParam], model_settings: AnthropicModelSettings) -> None:
        """Limit the number of cache points in messages to comply with Anthropic's 4-cache-point maximum.

        Anthropic allows a maximum of 4 cache points per request. This method ensures compliance by:
        1. Calculating how many cache points are already used by system-level settings
           (anthropic_cache_instructions, anthropic_cache_tool_definitions, anthropic_cache_all)
        2. Determining how many cache points remain available for message-level caching
        3. Traversing messages from newest to oldest, keeping only the allowed number of cache points
        4. Removing cache_control from older cache points that exceed the limit

        This prioritizes recent cache points, which are typically more valuable for conversation continuity.

        Args:
            messages: List of message parameters to limit cache points in.
            model_settings: Model settings containing cache configuration.
        """
        # Anthropic's maximum cache points per request
        max_cache_points = 4
        used_cache_points = 0

        # Calculate cache points used by system-level settings
        if model_settings.get('anthropic_cache_all'):
            # anthropic_cache_all adds cache points for both system instructions and the last message
            used_cache_points += 2
        else:
            if model_settings.get('anthropic_cache_instructions'):
                used_cache_points += 1
            if model_settings.get('anthropic_cache_tool_definitions'):
                # Assume one cache point is used for tool definitions
                used_cache_points += 1

        # Calculate remaining cache points available for message content
        keep_cache_points = max_cache_points - used_cache_points

        # Traverse messages from back to front (newest to oldest)
        remaining_cache_points = keep_cache_points
        for message in reversed(messages):
            content = message['content']
            # Skip if content is a string or None
            if isinstance(content, str):  # pragma: no cover
                continue
            content = cast(list[BetaContentBlockParam], content)
            # Traverse content blocks from back to front within each message
            for block in reversed(content):
                # Cast to dict for TypedDict manipulation
                block_dict = cast(dict[str, Any], block)

                # Check if this block has cache_control
                if 'cache_control' in block_dict:
                    if remaining_cache_points > 0:
                        # Keep this cache point (within limit)
                        remaining_cache_points -= 1
                    else:
                        # Remove cache_control as we've exceeded the limit
                        del block_dict['cache_control']

    @staticmethod
    def _add_cache_control_to_last_param(params: list[BetaContentBlockParam], ttl: Literal['5m', '1h'] = '5m') -> None:
        """Add cache control to the last content block param.
160 changes: 160 additions & 0 deletions tests/models/test_anthropic.py
@@ -588,6 +588,166 @@ def my_tool(value: str) -> str: # pragma: no cover
    assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '5m'})


async def test_anthropic_cache_all(allow_model_requests: None):
    """Test that anthropic_cache_all caches both system instructions and last message."""
    c = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    mock_client = MockAnthropic.create_mock(c)
    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
    agent = Agent(
        m,
        system_prompt='System instructions to cache.',
        model_settings=AnthropicModelSettings(
            anthropic_cache_all=True,
        ),
    )

    await agent.run('User message')

    # Verify both system and last message have cache_control
    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
    system = completion_kwargs['system']
    messages = completion_kwargs['messages']

    # System should have cache_control
    assert system == snapshot(
        [{'type': 'text', 'text': 'System instructions to cache.', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}}]
    )

    # Last message content should have cache_control
    assert messages[-1]['content'][-1] == snapshot(
        {'type': 'text', 'text': 'User message', 'cache_control': {'type': 'ephemeral', 'ttl': '5m'}}
    )


async def test_anthropic_cache_all_with_custom_ttl(allow_model_requests: None):
    """Test that anthropic_cache_all supports custom TTL values."""
    c = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    mock_client = MockAnthropic.create_mock(c)
    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
    agent = Agent(
        m,
        system_prompt='System instructions.',
        model_settings=AnthropicModelSettings(
            anthropic_cache_all='1h',  # Custom 1h TTL
        ),
    )

    await agent.run('User message')

    # Verify both use 1h TTL
    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
    system = completion_kwargs['system']
    messages = completion_kwargs['messages']

    assert system[0]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'})
    assert messages[-1]['content'][-1]['cache_control'] == snapshot({'type': 'ephemeral', 'ttl': '1h'})


async def test_limit_cache_points_with_cache_all(allow_model_requests: None):
    """Test that cache points are limited when using cache_all + CachePoint markers."""
    c = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    mock_client = MockAnthropic.create_mock(c)
    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))
    agent = Agent(
        m,
        system_prompt='System instructions.',
        model_settings=AnthropicModelSettings(
            anthropic_cache_all=True,  # Uses 2 cache points
        ),
    )

    # Add 3 CachePoint markers (total would be 5: 2 from cache_all + 3 from markers)
    # Only 2 CachePoint markers should be kept (newest ones)
    await agent.run(
        [
            'Context 1',
            CachePoint(),  # Oldest, should be removed
            'Context 2',
            CachePoint(),  # Should be kept
            'Context 3',
            CachePoint(),  # Should be kept
            'Question',
        ]
    )

    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
    messages = completion_kwargs['messages']

    # Count cache_control occurrences in messages
    cache_count = 0
    for msg in messages:
        for block in msg['content']:
            if 'cache_control' in block:
                cache_count += 1

    # anthropic_cache_all uses 2 cache points (system + last message)
    # With 3 CachePoint markers, we'd have 5 total
    # Limit is 4, so 1 oldest CachePoint should be removed
    # Result: 2 cache points in messages (from the 2 newest CachePoints)
    # The cache_all's last message cache is applied after limiting
    assert cache_count == 2


async def test_limit_cache_points_all_settings(allow_model_requests: None):
    """Test cache point limiting with all cache settings enabled."""
    c = completion_message(
        [BetaTextBlock(text='Response', type='text')],
        usage=BetaUsage(input_tokens=10, output_tokens=5),
    )
    mock_client = MockAnthropic.create_mock(c)
    m = AnthropicModel('claude-haiku-4-5', provider=AnthropicProvider(anthropic_client=mock_client))

    agent = Agent(
        m,
        system_prompt='System instructions.',
        model_settings=AnthropicModelSettings(
            anthropic_cache_instructions=True,  # 1 cache point
            anthropic_cache_tool_definitions=True,  # 1 cache point
        ),
    )

    @agent.tool_plain
    def my_tool() -> str:  # pragma: no cover
        return 'result'

    # Add 3 CachePoint markers (total would be 5: 2 from settings + 3 from markers)
    # Only 2 CachePoint markers should be kept
    await agent.run(
        [
            'Context 1',
            CachePoint(),  # Oldest, should be removed
            'Context 2',
            CachePoint(),  # Should be kept
            'Context 3',
            CachePoint(),  # Should be kept
            'Question',
        ]
    )

    completion_kwargs = get_mock_chat_completion_kwargs(mock_client)[0]
    messages = completion_kwargs['messages']

    # Count cache_control in messages (excluding system and tools)
    cache_count = 0
    for msg in messages:
        for block in msg['content']:
            if 'cache_control' in block:
                cache_count += 1

    # Should have exactly 2 cache points in messages
    # (4 total - 1 system - 1 tool = 2 available for messages)
    assert cache_count == 2


async def test_async_request_text_response(allow_model_requests: None):
    c = completion_message(
        [BetaTextBlock(text='world', type='text')],