#!/usr/bin/env python3
"""Example demonstrating Anthropic prompt caching.

This example shows how to use CachePoint to reduce costs by caching:
- Long system prompts
- Large context (like documentation)
- Tool definitions

Run with: uv run -m pydantic_ai_examples.anthropic_prompt_caching
"""

from pydantic_ai import Agent, CachePoint

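# Note (per Anthropic's published pricing at the time of writing, worth
# re-checking): cache writes are billed at ~125% of the base input-token
# price, cache reads at ~10%, and ephemeral cache entries expire after
# roughly 5 minutes without reuse.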
# Sample long context to demonstrate caching.
# Anthropic enforces a minimum cacheable prefix (1024 tokens on most models),
# so the document is repeated 10x below to be safe.
LONG_CONTEXT = (
    """
# Product Documentation

## Overview
Our API provides comprehensive data access with the following features:

### Authentication
All requests require a Bearer token in the Authorization header.
Rate limits: 1000 requests/hour for standard tier.

### Endpoints

#### GET /api/users
Returns a list of users with pagination support.
Parameters:
- page: Page number (default: 1)
- limit: Items per page (default: 20, max: 100)
- filter: Optional filter expression

#### GET /api/products
Returns product catalog with detailed specifications.
Parameters:
- category: Filter by category
- in_stock: Boolean, filter available items
- sort: Sort order (price_asc, price_desc, name)

#### POST /api/orders
Create a new order. Requires authentication.
Request body:
- user_id: Integer, required
- items: Array of {product_id, quantity}
- shipping_address: Object with address details

### Error Handling
Standard HTTP status codes are used:
- 200: Success
- 400: Bad request
- 401: Unauthorized
- 404: Not found
- 500: Server error

## Best Practices
1. Always handle rate limiting with exponential backoff
2. Cache responses where appropriate
3. Use pagination for large datasets
4. Validate input before submission
5. Monitor API usage through dashboard

## Code Examples
See detailed examples in our GitHub repository.
"""
    * 10
)  # Repeat 10x to ensure we exceed Anthropic's minimum cache size (1024 tokens)


async def main() -> None:
    """Demonstrate prompt caching with Anthropic."""
    print('=== Anthropic Prompt Caching Demo ===\n')

    agent = Agent(
        'anthropic:claude-sonnet-4-5',
        system_prompt='You are a helpful API documentation assistant.',
    )

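    # Note: CachePoint is pydantic_ai's provider-agnostic cache marker; for
    # Anthropic it should translate to a `cache_control` breakpoint on the
    # preceding content block, which is what triggers prefix caching.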
    # First request with cache point - this will write to cache
    print('First request (will cache context)...')
    result1 = await agent.run(
        [
            LONG_CONTEXT,
            CachePoint(),  # Everything before this will be cached
            'What authentication method does the API use?',
        ]
    )

    print(f'Response: {result1.output}\n')
    usage1 = result1.usage()
    print(f'Usage: {usage1}')
    if usage1.cache_write_tokens:
        print(
            f'  Cache write tokens: {usage1.cache_write_tokens} (tokens written to cache)'
        )
    print()

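    # A cache hit requires the prefix before the CachePoint (including the
    # system prompt and any tool definitions) to match the earlier request
    # exactly, and to arrive before the cache entry expires.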
    # Second request with same cached context - should use cache
    print('Second request (should read from cache)...')
    result2 = await agent.run(
        [
            LONG_CONTEXT,
            CachePoint(),  # Same content, should hit cache
            'What are the available API endpoints?',
        ]
    )

    print(f'Response: {result2.output}\n')
    usage2 = result2.usage()
    print(f'Usage: {usage2}')
    if usage2.cache_read_tokens:
        print(
            f'  Cache read tokens: {usage2.cache_read_tokens} (tokens read from cache)'
        )
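        # Cache reads are billed at ~10% of the base input-token price,
        # hence the 0.9 savings factor below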
        print(
            f'  Cache savings: ~{usage2.cache_read_tokens * 0.9:.0f} token-equivalents (90% discount)'
        )
    print()

    # Third request with different question, same cache
    print('Third request (should also read from cache)...')
    result3 = await agent.run(
        [
            LONG_CONTEXT,
            CachePoint(),
            'How should I handle rate limiting?',
        ]
    )

    print(f'Response: {result3.output}\n')
    usage3 = result3.usage()
    print(f'Usage: {usage3}')
    if usage3.cache_read_tokens:
        print(f'  Cache read tokens: {usage3.cache_read_tokens}')
    print()

    print('=== Summary ===')
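    # Usage objects can be added together to aggregate token counts across runs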
    total_usage = usage1 + usage2 + usage3
    print(f'Total input tokens: {total_usage.input_tokens}')
    print(f'Total cache write: {total_usage.cache_write_tokens}')
    print(f'Total cache read: {total_usage.cache_read_tokens}')
    if total_usage.cache_read_tokens:
        savings = total_usage.cache_read_tokens * 0.9
        print(f'Estimated savings: ~{savings:.0f} token-equivalents')


if __name__ == '__main__':
    import asyncio

    asyncio.run(main())