130 changes: 130 additions & 0 deletions docs/models/anthropic.md
@@ -77,3 +77,133 @@ model = AnthropicModel(
agent = Agent(model)
...
```

## Prompt Caching

Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. PydanticAI provides three ways to use prompt caching:

### 1. Cache User Messages with `CachePoint`

Insert a [`CachePoint`][pydantic_ai.messages.CachePoint] marker in your user messages to cache everything before it:

```python {test="skip"}
from pydantic_ai import Agent, CachePoint

agent = Agent('anthropic:claude-sonnet-4-5')

async def main():
    # Everything before CachePoint will be cached
    result = await agent.run([
        'Long context that should be cached...',
        CachePoint(),
        'Your question here'
    ])
    print(result.output)
```
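
`CachePoint` only marks a boundary, so if your prompt has more than one stable prefix you can insert several markers. The sketch below assumes multiple cache points are supported in a single message list; Anthropic currently allows up to four cache breakpoints per request.

```python {test="skip"}
from pydantic_ai import Agent, CachePoint

agent = Agent('anthropic:claude-sonnet-4-5')

async def main():
    # Each CachePoint caches everything that precedes it.
    result = await agent.run([
        'Long, rarely-changing context (e.g. a style guide)...',
        CachePoint(),
        'Semi-stable context (e.g. retrieved documentation)...',
        CachePoint(),
        'Your question here',
    ])
    print(result.output)
```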

### 2. Cache System Instructions

Use `anthropic_cache_instructions=True` to cache your system prompt:

```python {test="skip"}
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Long detailed instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_instructions=True
    ),
)

async def main():
    result = await agent.run('Your question')
    print(result.output)
```

### 3. Cache Tool Definitions

Use `anthropic_cache_tools=True` to cache your tool definitions:

```python {test="skip"}
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    model_settings=AnthropicModelSettings(
        anthropic_cache_tools=True
    ),
)

@agent.tool_plain
def my_tool() -> str:
    """Tool definition will be cached."""
    return 'result'

async def main():
    result = await agent.run('Use the tool')
    print(result.output)
```

### Combining Cache Strategies

You can combine all three caching strategies for maximum savings:

```python {test="skip"}
from pydantic_ai import Agent, CachePoint, RunContext
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Detailed instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_instructions=True,
        anthropic_cache_tools=True,
    ),
)

@agent.tool
def search_docs(ctx: RunContext, query: str) -> str:
    """Search documentation."""
    return f'Results for {query}'

async def main():
    # First call - writes to cache
    result1 = await agent.run([
        'Long context from documentation...',
        CachePoint(),
        'First question'
    ])

    # Subsequent calls - read from cache (cached tokens cost ~10% of the base input price)
    result2 = await agent.run([
        'Long context from documentation...',  # Same content
        CachePoint(),
        'Second question'
    ])
    print(f'First: {result1.output}')
    print(f'Second: {result2.output}')
```

Access cache usage statistics via `result.usage()`:

```python {test="skip"}
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_instructions=True
    ),
)

async def main():
    result = await agent.run('Your question')
    usage = result.usage()
    print(f'Cache write tokens: {usage.cache_write_tokens}')
    print(f'Cache read tokens: {usage.cache_read_tokens}')
```
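
Note that Anthropic only caches prompts above a minimum length (1024 tokens for most Claude models); shorter prefixes are processed normally and no cache entry is written. To confirm a run actually used the cache, compare the usage of consecutive runs, as in this sketch (assuming the instructions are long enough to be cacheable):

```python {test="skip"}
from pydantic_ai import Agent
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Long detailed instructions...',  # must exceed the minimum cacheable length
    model_settings=AnthropicModelSettings(anthropic_cache_instructions=True),
)

async def main():
    first = await agent.run('First question')
    second = await agent.run('Second question')
    # The first call writes the cache entry; the second should read from it.
    print(first.usage().cache_write_tokens)
    print(second.usage().cache_read_tokens)
```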
2 changes: 2 additions & 0 deletions pydantic_ai_slim/pydantic_ai/__init__.py
@@ -42,6 +42,7 @@
BinaryImage,
BuiltinToolCallPart,
BuiltinToolReturnPart,
CachePoint,
DocumentFormat,
DocumentMediaType,
DocumentUrl,
@@ -141,6 +142,7 @@
'BinaryContent',
'BuiltinToolCallPart',
'BuiltinToolReturnPart',
'CachePoint',
'DocumentFormat',
'DocumentMediaType',
'DocumentUrl',
21 changes: 20 additions & 1 deletion pydantic_ai_slim/pydantic_ai/messages.py
@@ -612,8 +612,24 @@ def __init__(
raise ValueError('`BinaryImage` must have a media type that starts with "image/"') # pragma: no cover


@dataclass
class CachePoint:
"""A cache point marker for prompt caching.

Can be inserted into UserPromptPart.content to mark cache boundaries.
Models that don't support caching will filter these out.

Supported by:

- Anthropic
"""

kind: Literal['cache-point'] = 'cache-point'
"""Type identifier, this is available on all parts as a discriminator."""


MultiModalContent = ImageUrl | AudioUrl | DocumentUrl | VideoUrl | BinaryContent
UserContent: TypeAlias = str | MultiModalContent
UserContent: TypeAlias = str | MultiModalContent | CachePoint


@dataclass(repr=False)
@@ -730,6 +746,9 @@ def otel_message_parts(self, settings: InstrumentationSettings) -> list[_otel_me
if settings.include_content and settings.include_binary_content:
converted_part['content'] = base64.b64encode(part.data).decode()
parts.append(converted_part)
elif isinstance(part, CachePoint):
# CachePoint is a marker, not actual content - skip it for otel
pass
else:
parts.append({'type': part.kind}) # pragma: no cover
return parts
86 changes: 78 additions & 8 deletions pydantic_ai_slim/pydantic_ai/models/anthropic.py
@@ -19,6 +19,7 @@
BinaryContent,
BuiltinToolCallPart,
BuiltinToolReturnPart,
CachePoint,
DocumentUrl,
FilePart,
FinishReason,
@@ -58,6 +59,7 @@
from anthropic.types.beta import (
BetaBase64PDFBlockParam,
BetaBase64PDFSourceParam,
BetaCacheControlEphemeralParam,
BetaCitationsDelta,
BetaCodeExecutionTool20250522Param,
BetaCodeExecutionToolResultBlock,
@@ -148,6 +150,22 @@ class AnthropicModelSettings(ModelSettings, total=False):
See [the Anthropic docs](https://docs.anthropic.com/en/docs/build-with-claude/extended-thinking) for more information.
"""

anthropic_cache_tools: bool
"""Whether to add cache_control to the last tool definition.

When enabled, the last tool in the tools array will have cache_control set,
allowing Anthropic to cache tool definitions and reduce costs.
See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
"""

anthropic_cache_instructions: bool
"""Whether to add cache_control to the last system prompt block.

When enabled, the last system prompt will have cache_control set,
allowing Anthropic to cache system instructions and reduce costs.
See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
"""


@dataclass(init=False)
class AnthropicModel(Model):
@@ -289,7 +307,7 @@ async def _messages_create(
model_request_parameters: ModelRequestParameters,
) -> BetaMessage | AsyncStream[BetaRawMessageStreamEvent]:
# standalone function to make it easier to override
tools = self._get_tools(model_request_parameters)
tools = self._get_tools(model_request_parameters, model_settings)
tools, mcp_servers, beta_features = self._add_builtin_tools(tools, model_request_parameters)

tool_choice: BetaToolChoiceParam | None
@@ -305,7 +323,7 @@
if (allow_parallel_tool_calls := model_settings.get('parallel_tool_calls')) is not None:
tool_choice['disable_parallel_tool_use'] = not allow_parallel_tool_calls

system_prompt, anthropic_messages = await self._map_message(messages, model_request_parameters)
system_prompt, anthropic_messages = await self._map_message(messages, model_request_parameters, model_settings)

try:
extra_headers = model_settings.get('extra_headers', {})
@@ -411,8 +429,19 @@ async def _process_streamed_response(
_provider_url=self._provider.base_url,
)

def _get_tools(self, model_request_parameters: ModelRequestParameters) -> list[BetaToolUnionParam]:
return [self._map_tool_definition(r) for r in model_request_parameters.tool_defs.values()]
def _get_tools(
self, model_request_parameters: ModelRequestParameters, model_settings: AnthropicModelSettings
) -> list[BetaToolUnionParam]:
tools: list[BetaToolUnionParam] = [
self._map_tool_definition(r) for r in model_request_parameters.tool_defs.values()
]

# Add cache_control to the last tool if enabled
if tools and model_settings.get('anthropic_cache_tools'):
last_tool = cast(dict[str, Any], tools[-1])
last_tool['cache_control'] = BetaCacheControlEphemeralParam(type='ephemeral')

return tools

def _add_builtin_tools(
self, tools: list[BetaToolUnionParam], model_request_parameters: ModelRequestParameters
@@ -464,8 +493,11 @@ def _add_builtin_tools(
return tools, mcp_servers, beta_features

async def _map_message( # noqa: C901
self, messages: list[ModelMessage], model_request_parameters: ModelRequestParameters
) -> tuple[str, list[BetaMessageParam]]:
self,
messages: list[ModelMessage],
model_request_parameters: ModelRequestParameters,
model_settings: AnthropicModelSettings,
) -> tuple[str | list[BetaTextBlockParam], list[BetaMessageParam]]:
"""Just maps a `pydantic_ai.Message` to a `anthropic.types.MessageParam`."""
system_prompt_parts: list[str] = []
anthropic_messages: list[BetaMessageParam] = []
@@ -477,7 +509,10 @@ async def _map_message( # noqa: C901
system_prompt_parts.append(request_part.content)
elif isinstance(request_part, UserPromptPart):
async for content in self._map_user_prompt(request_part):
user_content_params.append(content)
if isinstance(content, CachePoint):
self._add_cache_control_to_last_param(user_content_params)
else:
user_content_params.append(content)
elif isinstance(request_part, ToolReturnPart):
tool_result_block_param = BetaToolResultBlockParam(
tool_use_id=_guard_tool_call_id(t=request_part),
@@ -637,12 +672,43 @@ async def _map_message( # noqa: C901
if instructions := self._get_instructions(messages, model_request_parameters):
system_prompt_parts.insert(0, instructions)
system_prompt = '\n\n'.join(system_prompt_parts)

# If anthropic_cache_instructions is enabled, return system prompt as a list with cache_control
if system_prompt and model_settings.get('anthropic_cache_instructions'):
system_prompt_blocks = [
BetaTextBlockParam(
type='text', text=system_prompt, cache_control=BetaCacheControlEphemeralParam(type='ephemeral')
)
]
return system_prompt_blocks, anthropic_messages

return system_prompt, anthropic_messages

@staticmethod
def _add_cache_control_to_last_param(params: list[BetaContentBlockParam]) -> None:
"""Add cache control to the last content block param.

See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching for more information.
"""
if not params:
raise UserError(
'CachePoint cannot be the first content in a user message - there must be previous content to attach the CachePoint to.'
Collaborator:
Copying in context from https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#what-can-be-cached:

- Tools: Tool definitions in the `tools` array
- System messages: Content blocks in the `system` array
- Text messages: Content blocks in the `messages.content` array, for both user and assistant turns
- Images & Documents: Content blocks in the `messages.content` array, in user turns
- Tool use and tool results: Content blocks in the `messages.content` array, in both user and assistant turns

I think we should support inserting a cache point after tool defs and system messages as well.

In the original PR I suggested doing this by supporting CachePoint as the first content in a user message (by adding it to whatever came before it: the system message, tool definition, or the last message of the assistant output), but that doesn't really feel natural from a code perspective.

What do you think about adding anthropic_cache_tools and anthropic_cache_instructions fields to AnthropicModelSettings, and setting cache_control on the relevant parts when set?
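
For illustration, a rough sketch of the Messages API payload shape those settings would aim for (based on the prompt-caching docs rather than this PR's exact output; names and values are only placeholders):

```python
request = {
    'model': 'claude-sonnet-4-5',
    'system': [
        {
            'type': 'text',
            'text': 'Long detailed instructions...',
            # anthropic_cache_instructions: breakpoint on the last system block
            'cache_control': {'type': 'ephemeral'},
        },
    ],
    'tools': [
        {'name': 'search_docs', 'description': '...', 'input_schema': {'type': 'object'}},
        {
            'name': 'last_tool',
            'description': '...',
            'input_schema': {'type': 'object'},
            # anthropic_cache_tools: breakpoint on the last tool caches all tool definitions before it
            'cache_control': {'type': 'ephemeral'},
        },
    ],
    'messages': [{'role': 'user', 'content': 'Your question'}],
}
```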

Author:
Seems reasonable, I'll look into it!

)

# Only certain types support cache_control
# See https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching#what-can-be-cached
cacheable_types = {'text', 'tool_use', 'server_tool_use', 'image', 'tool_result'}
Collaborator:
Can you please link to the doc this came from?

last_param = cast(dict[str, Any], params[-1]) # Cast to dict for mutation
if last_param['type'] not in cacheable_types:
raise UserError(f'Cache control not supported for param type: {last_param["type"]}')

# Add cache_control to the last param
last_param['cache_control'] = BetaCacheControlEphemeralParam(type='ephemeral')

@staticmethod
async def _map_user_prompt(
part: UserPromptPart,
) -> AsyncGenerator[BetaContentBlockParam]:
) -> AsyncGenerator[BetaContentBlockParam | CachePoint]:
if isinstance(part.content, str):
if part.content: # Only yield non-empty text
yield BetaTextBlockParam(text=part.content, type='text')
@@ -651,6 +717,8 @@ async def _map_user_prompt(
if isinstance(item, str):
if item: # Only yield non-empty text
yield BetaTextBlockParam(text=item, type='text')
elif isinstance(item, CachePoint):
yield item
elif isinstance(item, BinaryContent):
if item.is_image:
yield BetaImageBlockParam(
@@ -717,6 +785,8 @@ def _map_usage(
key: value for key, value in response_usage.model_dump().items() if isinstance(value, int)
}

# Note: genai-prices already extracts cache_creation_input_tokens and cache_read_input_tokens
# from the Anthropic response and maps them to cache_write_tokens and cache_read_tokens
return usage.RequestUsage.extract(
dict(model=model, usage=details),
provider=provider,
4 changes: 4 additions & 0 deletions pydantic_ai_slim/pydantic_ai/models/bedrock.py
@@ -19,6 +19,7 @@
BinaryContent,
BuiltinToolCallPart,
BuiltinToolReturnPart,
CachePoint,
DocumentUrl,
FinishReason,
ImageUrl,
@@ -672,6 +673,9 @@ async def _map_user_prompt(part: UserPromptPart, document_count: Iterator[int])
content.append({'video': video})
elif isinstance(item, AudioUrl): # pragma: no cover
raise NotImplementedError('Audio is not supported yet.')
elif isinstance(item, CachePoint):
# Bedrock doesn't support prompt caching via CachePoint in this implementation
pass
else:
assert_never(item)
return [{'role': 'user', 'content': content}]
4 changes: 4 additions & 0 deletions pydantic_ai_slim/pydantic_ai/models/gemini.py
@@ -21,6 +21,7 @@
BinaryContent,
BuiltinToolCallPart,
BuiltinToolReturnPart,
CachePoint,
FilePart,
FileUrl,
ModelMessage,
@@ -391,6 +392,9 @@ async def _map_user_prompt(self, part: UserPromptPart) -> list[_GeminiPartUnion]:
else: # pragma: lax no cover
file_data = _GeminiFileDataPart(file_data={'file_uri': item.url, 'mime_type': item.media_type})
content.append(file_data)
elif isinstance(item, CachePoint):
# Gemini doesn't support prompt caching via CachePoint
pass
else:
assert_never(item) # pragma: lax no cover
return content
Expand Down