+1. The optional `metadata` parameter can attach arbitrary context to deferred tool calls, accessible in `DeferredToolRequests.metadata` keyed by `tool_call_id`.

_(This example is complete, it can be run "as is")_

result = ModelRetry('No result for this tool call was found.')
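As a rough illustration of how the new `metadata` field might be consumed when supplying results for deferred calls, here is a minimal sketch; the import paths, the `deferred_tool_results` keyword, and the `look_up_task_result` helper are assumptions rather than the docs' own example:

```python
# Minimal sketch, not the docs' own example: names and signatures here are
# assumptions sketched from the note above, not copied from the library.
from pydantic_ai import Agent, ModelRetry
from pydantic_ai.tools import DeferredToolRequests, DeferredToolResults  # assumed path

agent = Agent('openai:gpt-5', output_type=[str, DeferredToolRequests])


def look_up_task_result(task_id: str):
    """Hypothetical lookup in whatever store the background task writes to."""
    ...


def resume_run(requests: DeferredToolRequests, message_history: list) -> None:
    results = DeferredToolResults()
    for call in requests.calls:
        # Metadata is keyed by tool_call_id, as described in the note above.
        task_id = requests.metadata[call.tool_call_id]['task_id']
        outcome = look_up_task_result(task_id)
        if outcome is None:
            # Same fallback as the docs' example above.
            outcome = ModelRetry('No result for this tool call was found.')
        results.calls[call.tool_call_id] = outcome
    # Resume the run with the collected results.
    agent.run_sync(message_history=message_history, deferred_tool_results=results)
```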
@@ -324,8 +328,9 @@ async def main():
"""
```
-1. In reality, you'd likely use Celery or a similar task queue to run the task in the background.
-2. In reality, this would typically happen in a separate process that polls for the task status or is notified when all pending tasks are complete.
+1. Generate a task ID that can be tracked independently of the tool call ID.
+2. The optional `metadata` parameter passes the `task_id` so it can be matched with results later, accessible in `DeferredToolRequests.metadata` keyed by `tool_call_id`.
+3. In reality, this would typically happen in a separate process that polls for the task status or is notified when all pending tasks are complete.
_(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_
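The tool side of the flow described in footnotes 1-3 might look roughly like the following; whether the new `metadata` keyword is accepted by `CallDeferred`, and the `start_background_task` helper, are assumptions for illustration:

```python
# Sketch of the tool side from footnotes 1-3 above. `start_background_task` is a
# hypothetical stand-in for Celery or similar, and passing `metadata` through
# `CallDeferred` is an assumption about where the new parameter is accepted.
from uuid import uuid4

from pydantic_ai import Agent, CallDeferred  # CallDeferred assumed to be a top-level export

agent = Agent('openai:gpt-5')


def start_background_task(task_id: str, name: str) -> None:
    """Hypothetical hand-off to a task queue such as Celery."""
    ...


@agent.tool_plain
def update_user_profile(name: str) -> str:
    task_id = str(uuid4())  # 1. trackable independently of the tool call ID
    start_background_task(task_id, name)
    # 2. the metadata travels with the call and shows up in DeferredToolRequests.metadata
    raise CallDeferred(metadata={'task_id': task_id})
```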
docs/durable_execution/temporal.md (1 addition & 1 deletion)
@@ -172,7 +172,7 @@ As workflows and activities run in separate processes, any values passed between
To account for these limitations, tool functions and the [event stream handler](#streaming) running inside activities receive a limited version of the agent's [`RunContext`][pydantic_ai.tools.RunContext], and it's your responsibility to make sure that the [dependencies](../dependencies.md) object provided to [`TemporalAgent.run()`][pydantic_ai.durable_exec.temporal.TemporalAgent.run] can be serialized using Pydantic.
-Specifically, only the `deps`, `run_id`, `retries`, `tool_call_id`, `tool_name`, `tool_call_approved`, `retry`, `max_retries`, `run_step` and `partial_output` fields are available by default, and trying to access `model`, `usage`, `prompt`, `messages`, or `tracer` will raise an error.
+Specifically, only the `deps`, `run_id`, `retries`, `tool_call_id`, `tool_name`, `tool_call_approved`, `retry`, `max_retries`, `run_step`, `usage`, and `partial_output` fields are available by default, and trying to access `model`, `prompt`, `messages`, or `tracer` will raise an error.
If you need one or more of these attributes to be available inside activities, you can create a [`TemporalRunContext`][pydantic_ai.durable_exec.temporal.TemporalRunContext] subclass with custom `serialize_run_context` and `deserialize_run_context` class methods and pass it to [`TemporalAgent`][pydantic_ai.durable_exec.temporal.TemporalAgent] as `run_context_type`.
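A minimal sketch of that customization, assuming the classmethods take and return a plain dict (the real signatures may differ, so check `TemporalRunContext` before relying on this):

```python
# Sketch only: the classmethod signatures are assumptions based on their names,
# not copied from the library; `agent` is any previously defined Agent.
from pydantic_ai import Agent
from pydantic_ai.durable_exec.temporal import TemporalAgent, TemporalRunContext

agent = Agent('openai:gpt-5')


class RunContextWithPrompt(TemporalRunContext):
    @classmethod
    def serialize_run_context(cls, ctx):
        data = super().serialize_run_context(ctx)  # assumed to return a JSON-safe dict
        data['prompt'] = ctx.prompt  # extra field to expose inside activities
        return data

    # deserialize_run_context may also need overriding so the extra field is
    # restored on the reconstructed RunContext; omitted here for brevity.


temporal_agent = TemporalAgent(agent, run_context_type=RunContextWithPrompt)
```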
docs/gateway.md (7 additions & 7 deletions)
@@ -5,7 +5,7 @@ status: new
# Pydantic AI Gateway
-**[Pydantic AI Gateway](https://pydantic.dev/ai-gateway)** (PAIG) is a unified interface for accessing multiple AI providers with a single key. Features include built-in OpenTelemetry observability, real-time cost monitoring, failover management, and native integration with the other tools in the [Pydantic stack](https://pydantic.dev/).
+**[Pydantic AI Gateway](https://pydantic.dev/ai-gateway)** is a unified interface for accessing multiple AI providers with a single key. Features include built-in OpenTelemetry observability, real-time cost monitoring, failover management, and native integration with the other tools in the [Pydantic stack](https://pydantic.dev/).
!!! note "Free while in Beta"
The Pydantic AI Gateway is currently in Beta. You can bring your own key (BYOK) or buy inference through the Gateway (we will eat the card fee for now).
@@ -26,8 +26,8 @@ To help you get started with [Pydantic AI Gateway](https://gateway.pydantic.dev)
- **BYOK and managed providers:** Bring your own API keys (BYOK) from LLM providers, or pay for inference directly through the platform.
- **Multi-provider support:** Access models from OpenAI, Anthropic, Google Vertex, Groq, and AWS Bedrock. _More providers coming soon_.
- **Backend observability:** Log every request through [Pydantic Logfire](https://pydantic.dev/logfire) or any OpenTelemetry backend (_coming soon_).
-- **Zero translation**: Unlike traditional AI gateways that translate everything to one common schema, PAIG allows requests to flow through directly in each provider's native format. This gives you immediate access to the new model features as soon as they are released.
-- **Open source with self-hosting**: PAIG's core is [open source](https://github.com/pydantic/pydantic-ai-gateway/) (under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html)), allowing self-hosting with file-based configuration, instead of using the managed service.
+- **Zero translation**: Unlike traditional AI gateways that translate everything to one common schema, **Pydantic AI Gateway** allows requests to flow through directly in each provider's native format. This gives you immediate access to the new model features as soon as they are released.
+- **Open source with self-hosting**: Pydantic AI Gateway core is [open source](https://github.com/pydantic/pydantic-ai-gateway/) (under [AGPL-3.0](https://www.gnu.org/licenses/agpl-3.0.en.html)), allowing self-hosting with file-based configuration, instead of using the managed service.
- **Enterprise ready**: Includes SSO (with OIDC support), granular permissions, and flexible deployment options. Deploy to your Cloudflare account, or run on-premises with our [consulting support](https://pydantic.dev/contact).
```python {title="hello_world.py"}
@@ -80,7 +80,7 @@ Users can only create personal keys, that will inherit spending caps from both U
## Usage
After setting up your account with the instructions above, you will be able to make an AI model request with the Pydantic AI Gateway.
-The code snippets below show how you can use PAIG with different frameworks and SDKs.
+The code snippets below show how you can use Pydantic AI Gateway with different frameworks and SDKs.
You can add `gateway/` as prefix on every known provider that
To use different models, change the model string `gateway/<api_format>:<model_name>` to other models offered by the supported providers.
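For example, assuming your Gateway project has access to Anthropic, the prefixed model string would look like this (the specific model is illustrative):

```python
import os

from pydantic_ai import Agent

# Assumes you already have a Gateway API key; see the environment-variable note below.
os.environ.setdefault('PYDANTIC_AI_GATEWAY_API_KEY', 'your-gateway-api-key')

# `gateway/<api_format>:<model_name>`: here the Anthropic API format with a model
# from the supported list; swap in any other supported combination.
agent = Agent('gateway/anthropic:claude-sonnet-4-5')
result = agent.run_sync('Hello, world!')
print(result.output)
```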
@@ -114,7 +114,7 @@ Before you start, make sure you are on version 1.16 or later of `pydantic-ai`. T
Set the `PYDANTIC_AI_GATEWAY_API_KEY` environment variable to your Gateway API key:
docs/models/anthropic.md (5 additions & 4 deletions)
@@ -83,8 +83,8 @@ agent = Agent(model)
Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides three ways to use prompt caching:
1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it
-2. **Cache System Instructions**: Enable the [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] [model setting](../agents.md#model-run-settings) to cache your system prompt
-3. **Cache Tool Definitions**: Enable the [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] [model setting](../agents.md#model-run-settings) to cache your tool definitions
+2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
+3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
You can combine all three strategies for maximum savings:
@@ -96,8 +96,9 @@ agent = Agent(
'anthropic:claude-sonnet-4-5',
system_prompt='Detailed instructions...',
model_settings=AnthropicModelSettings(
+# Use True for default 5m TTL, or specify '5m' / '1h' directly
anthropic_cache_instructions=True,
-anthropic_cache_tool_definitions=True,
+anthropic_cache_tool_definitions='1h', # Longer cache for tool definitions
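Pulling the pieces together, an illustrative complete version of the combined-caching example might look like this; the `run_sync` call with `CachePoint` is an assumption based on item 1 in the list above, not the docs' own snippet:

```python
# Illustrative combination of the three strategies listed above; the message
# structure around CachePoint is an assumption, not copied from the docs.
from pydantic_ai import Agent
from pydantic_ai.messages import CachePoint
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Detailed instructions...',
    model_settings=AnthropicModelSettings(
        anthropic_cache_instructions=True,      # 5m TTL by default
        anthropic_cache_tool_definitions='1h',  # longer cache for tool definitions
    ),
)

result = agent.run_sync([
    'Long reference document...',  # cached once the CachePoint below is reached
    CachePoint(),                  # everything before this marker is cached
    'What does the document say about pricing?',
])
print(result.output)
```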