Commit 2151c8c

Merge branch 'main' into add-googlemodel-google-specific-error-handling
2 parents 94768e0 + e72170e commit 2151c8c

51 files changed: +2696 additions, -372 deletions

docs/.hooks/main.py

Lines changed: 8 additions & 8 deletions
@@ -16,12 +16,12 @@
 
 def on_page_markdown(markdown: str, page: Page, config: Config, files: Files) -> str:
     """Called on each file after it is read and before it is converted to HTML."""
-    relative_path_root = (DOCS_ROOT / page.file.src_uri).parent
-    markdown = inject_snippets(markdown, relative_path_root)
+    relative_path = DOCS_ROOT / page.file.src_uri
+    markdown = inject_snippets(markdown, relative_path.parent)
     markdown = replace_uv_python_run(markdown)
     markdown = render_examples(markdown)
     markdown = render_video(markdown)
-    markdown = create_gateway_toggle(markdown, relative_path_root)
+    markdown = create_gateway_toggle(markdown, relative_path)
     return markdown
 
 
@@ -120,13 +120,13 @@ def sub_cf_video(m: re.Match[str]) -> str:
     """
 
 
-def create_gateway_toggle(markdown: str, relative_path_root: Path) -> str:
+def create_gateway_toggle(markdown: str, relative_path: Path) -> str:
     """Transform Python code blocks with Agent() calls to show both Pydantic AI and Gateway versions."""
     # Pattern matches Python code blocks with or without attributes, and optional annotation definitions after
     # Annotation definitions are numbered list items like "1. Some text" that follow the code block
     return re.sub(
        r'```py(?:thon)?(?: *\{?([^}\n]*)\}?)?\n(.*?)\n```(\n\n(?:\d+\..+?\n)+?\n)?',
-       lambda m: transform_gateway_code_block(m, relative_path_root),
+       lambda m: transform_gateway_code_block(m, relative_path),
        markdown,
        flags=re.MULTILINE | re.DOTALL,
    )
@@ -136,7 +136,7 @@ def create_gateway_toggle(markdown: str, relative_path_root: Path) -> str:
 GATEWAY_MODELS = ('anthropic', 'openai', 'openai-responses', 'openai-chat', 'bedrock', 'google-vertex', 'groq')
 
 
-def transform_gateway_code_block(m: re.Match[str], relative_path_root: Path) -> str:
+def transform_gateway_code_block(m: re.Match[str], relative_path: Path) -> str:
     """Transform a single code block to show both versions if it contains Agent() calls."""
     attrs = m.group(1) or ''
     code = m.group(2)
@@ -186,9 +186,9 @@ def replace_agent_model(match: re.Match[str]) -> str:
 
     # Build attributes string
     docs_path = DOCS_ROOT / 'gateway'
-    relative_path = docs_path.relative_to(relative_path_root, walk_up=True)
-    link = f"<a href='{relative_path}' style='float: right;'>Learn about Gateway</a>"
 
+    relative_path_to_gateway = docs_path.relative_to(relative_path, walk_up=True)
+    link = f"<a href='{relative_path_to_gateway}' style='float: right;'>Learn about Gateway</a>"
     attrs_str = f' {{{attrs}}}' if attrs else ''
 
     if 'title="' in attrs:

docs/deferred-tools.md

Lines changed: 18 additions & 13 deletions
@@ -47,7 +47,7 @@ PROTECTED_FILES = {'.env'}
 @agent.tool
 def update_file(ctx: RunContext, path: str, content: str) -> str:
     if path in PROTECTED_FILES and not ctx.tool_call_approved:
-        raise ApprovalRequired
+        raise ApprovalRequired(metadata={'reason': 'protected'})  # (1)!
     return f'File {path!r} updated: {content!r}'
 
 
@@ -77,6 +77,7 @@ DeferredToolRequests(
             tool_call_id='delete_file',
         ),
     ],
+    metadata={'update_file_dotenv': {'reason': 'protected'}},
 )
 """
 
@@ -175,6 +176,8 @@ print(result.all_messages())
 """
 ```
 
+1. The optional `metadata` parameter can attach arbitrary context to deferred tool calls, accessible in `DeferredToolRequests.metadata` keyed by `tool_call_id`.
+
 _(This example is complete, it can be run "as is")_
 
 ## External Tool Execution
@@ -209,13 +212,13 @@ from pydantic_ai import (
 
 @dataclass
 class TaskResult:
-    tool_call_id: str
+    task_id: str
     result: Any
 
 
-async def calculate_answer_task(tool_call_id: str, question: str) -> TaskResult:
+async def calculate_answer_task(task_id: str, question: str) -> TaskResult:
     await asyncio.sleep(1)
-    return TaskResult(tool_call_id=tool_call_id, result=42)
+    return TaskResult(task_id=task_id, result=42)
 
 
 agent = Agent('openai:gpt-5', output_type=[str, DeferredToolRequests])
@@ -225,12 +228,11 @@ tasks: list[asyncio.Task[TaskResult]] = []
 
 @agent.tool
 async def calculate_answer(ctx: RunContext, question: str) -> str:
-    assert ctx.tool_call_id is not None
-
-    task = asyncio.create_task(calculate_answer_task(ctx.tool_call_id, question))  # (1)!
+    task_id = f'task_{len(tasks)}'  # (1)!
+    task = asyncio.create_task(calculate_answer_task(task_id, question))
     tasks.append(task)
 
-    raise CallDeferred
+    raise CallDeferred(metadata={'task_id': task_id})  # (2)!
 
 
 async def main():
@@ -252,17 +254,19 @@ async def main():
             )
         ],
         approvals=[],
+        metadata={'pyd_ai_tool_call_id': {'task_id': 'task_0'}},
     )
     """
 
-    done, _ = await asyncio.wait(tasks)  # (2)!
+    done, _ = await asyncio.wait(tasks)  # (3)!
     task_results = [task.result() for task in done]
-    task_results_by_tool_call_id = {result.tool_call_id: result.result for result in task_results}
+    task_results_by_task_id = {result.task_id: result.result for result in task_results}
 
     results = DeferredToolResults()
     for call in requests.calls:
         try:
-            result = task_results_by_tool_call_id[call.tool_call_id]
+            task_id = requests.metadata[call.tool_call_id]['task_id']
+            result = task_results_by_task_id[task_id]
         except KeyError:
             result = ModelRetry('No result for this tool call was found.')
 
@@ -324,8 +328,9 @@ async def main():
     """
 ```
 
-1. In reality, you'd likely use Celery or a similar task queue to run the task in the background.
-2. In reality, this would typically happen in a separate process that polls for the task status or is notified when all pending tasks are complete.
+1. Generate a task ID that can be tracked independently of the tool call ID.
+2. The optional `metadata` parameter passes the `task_id` so it can be matched with results later, accessible in `DeferredToolRequests.metadata` keyed by `tool_call_id`.
+3. In reality, this would typically happen in a separate process that polls for the task status or is notified when all pending tasks are complete.
 
 _(This example is complete, it can be run "as is" — you'll need to add `asyncio.run(main())` to run `main`)_
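The new `metadata` dict can also inform the approval decision itself. A minimal sketch (not part of this commit), assuming `DeferredToolResults.approvals` accepts per-call booleans or the `ToolDenied` marker from the deferred tools docs:

```python
from pydantic_ai import DeferredToolRequests, DeferredToolResults, ToolDenied


def review_approvals(requests: DeferredToolRequests) -> DeferredToolResults:
    """Approve or deny deferred calls based on the metadata attached when ApprovalRequired was raised."""
    results = DeferredToolResults()
    for call in requests.approvals:
        # Metadata is keyed by tool_call_id; an entry exists only if the tool attached one.
        info = requests.metadata.get(call.tool_call_id, {})
        if info.get('reason') == 'protected':
            results.approvals[call.tool_call_id] = ToolDenied('Protected files need a human reviewer.')
        else:
            results.approvals[call.tool_call_id] = True
    return results
```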

docs/durable_execution/temporal.md

Lines changed: 1 addition & 1 deletion
@@ -172,7 +172,7 @@ As workflows and activities run in separate processes, any values passed between
 
 To account for these limitations, tool functions and the [event stream handler](#streaming) running inside activities receive a limited version of the agent's [`RunContext`][pydantic_ai.tools.RunContext], and it's your responsibility to make sure that the [dependencies](../dependencies.md) object provided to [`TemporalAgent.run()`][pydantic_ai.durable_exec.temporal.TemporalAgent.run] can be serialized using Pydantic.
 
-Specifically, only the `deps`, `run_id`, `retries`, `tool_call_id`, `tool_name`, `tool_call_approved`, `retry`, `max_retries`, `run_step` and `partial_output` fields are available by default, and trying to access `model`, `usage`, `prompt`, `messages`, or `tracer` will raise an error.
+Specifically, only the `deps`, `run_id`, `retries`, `tool_call_id`, `tool_name`, `tool_call_approved`, `retry`, `max_retries`, `run_step`, `usage`, and `partial_output` fields are available by default, and trying to access `model`, `prompt`, `messages`, or `tracer` will raise an error.
 If you need one or more of these attributes to be available inside activities, you can create a [`TemporalRunContext`][pydantic_ai.durable_exec.temporal.TemporalRunContext] subclass with custom `serialize_run_context` and `deserialize_run_context` class methods and pass it to [`TemporalAgent`][pydantic_ai.durable_exec.temporal.TemporalAgent] as `run_context_type`.
 
 ### Streaming
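For reviewers of the doc change above: exposing an extra attribute follows the subclassing route this paragraph describes. A minimal sketch (illustrative, not from this commit), assuming overriding `serialize_run_context` alone is enough, as the `TemporalRunContext` docstring suggests:

```python
from typing import Any

from pydantic_ai import Agent, RunContext
from pydantic_ai.durable_exec.temporal import TemporalAgent, TemporalRunContext


class RunContextWithPrompt(TemporalRunContext):
    """Also carry `prompt` across the workflow/activity boundary."""

    @classmethod
    def serialize_run_context(cls, ctx: RunContext[Any]) -> dict[str, Any]:
        # Start from the default serialized fields and add the extra attribute.
        return {**super().serialize_run_context(ctx), 'prompt': ctx.prompt}


agent = Agent('openai:gpt-5', name='my_agent')
temporal_agent = TemporalAgent(agent, run_context_type=RunContextWithPrompt)
```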

docs/models/anthropic.md

Lines changed: 5 additions & 4 deletions
@@ -83,8 +83,8 @@ agent = Agent(model)
 Anthropic supports [prompt caching](https://docs.anthropic.com/en/docs/build-with-claude/prompt-caching) to reduce costs by caching parts of your prompts. Pydantic AI provides three ways to use prompt caching:
 
 1. **Cache User Messages with [`CachePoint`][pydantic_ai.messages.CachePoint]**: Insert a `CachePoint` marker in your user messages to cache everything before it
-2. **Cache System Instructions**: Enable the [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] [model setting](../agents.md#model-run-settings) to cache your system prompt
-3. **Cache Tool Definitions**: Enable the [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] [model setting](../agents.md#model-run-settings) to cache your tool definitions
+2. **Cache System Instructions**: Set [`AnthropicModelSettings.anthropic_cache_instructions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_instructions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
+3. **Cache Tool Definitions**: Set [`AnthropicModelSettings.anthropic_cache_tool_definitions`][pydantic_ai.models.anthropic.AnthropicModelSettings.anthropic_cache_tool_definitions] to `True` (uses 5m TTL by default) or specify `'5m'` / `'1h'` directly
 
 You can combine all three strategies for maximum savings:
 
@@ -96,8 +96,9 @@ agent = Agent(
     'anthropic:claude-sonnet-4-5',
     system_prompt='Detailed instructions...',
     model_settings=AnthropicModelSettings(
+        # Use True for default 5m TTL, or specify '5m' / '1h' directly
         anthropic_cache_instructions=True,
-        anthropic_cache_tool_definitions=True,
+        anthropic_cache_tool_definitions='1h',  # Longer cache for tool definitions
     ),
 )
 
@@ -134,7 +135,7 @@ agent = Agent(
     'anthropic:claude-sonnet-4-5',
     system_prompt='Instructions...',
     model_settings=AnthropicModelSettings(
-        anthropic_cache_instructions=True
+        anthropic_cache_instructions=True  # Default 5m TTL
     ),
 )
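A quick illustration of the first caching strategy, combining a `CachePoint` marker with cached instructions. This is a sketch rather than part of the diff; it assumes the usual list-of-content form of a user prompt:

```python
from pydantic_ai import Agent
from pydantic_ai.messages import CachePoint
from pydantic_ai.models.anthropic import AnthropicModelSettings

agent = Agent(
    'anthropic:claude-sonnet-4-5',
    system_prompt='Detailed instructions...',
    model_settings=AnthropicModelSettings(anthropic_cache_instructions=True),
)

# Everything before the CachePoint marker becomes cacheable, so the large document
# is only billed at the full input-token rate on the first request.
result = agent.run_sync(
    [
        'Here is a long reference document: ...',
        CachePoint(),
        'Summarize the key points.',
    ]
)
print(result.output)
```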

docs/toolsets.md

Lines changed: 1 addition & 0 deletions
@@ -362,6 +362,7 @@ DeferredToolRequests(
             tool_call_id='pyd_ai_tool_call_id__temperature_fahrenheit',
         ),
     ],
+    metadata={},
 )
 """

pydantic_ai_slim/pydantic_ai/_agent_graph.py

Lines changed: 27 additions & 3 deletions
@@ -888,6 +888,7 @@ async def process_tool_calls( # noqa: C901
     calls_to_run = [call for call in calls_to_run if call.tool_call_id in calls_to_run_results]
 
     deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]] = defaultdict(list)
+    deferred_metadata: dict[str, dict[str, Any]] = {}
 
     if calls_to_run:
         async for event in _call_tools(
@@ -899,6 +900,7 @@ async def process_tool_calls( # noqa: C901
             usage_limits=ctx.deps.usage_limits,
             output_parts=output_parts,
             output_deferred_calls=deferred_calls,
+            output_deferred_metadata=deferred_metadata,
         ):
             yield event
 
@@ -932,6 +934,7 @@ async def process_tool_calls( # noqa: C901
         deferred_tool_requests = _output.DeferredToolRequests(
             calls=deferred_calls['external'],
             approvals=deferred_calls['unapproved'],
+            metadata=deferred_metadata,
         )
 
         final_result = result.FinalResult(cast(NodeRunEndT, deferred_tool_requests), None, None)
@@ -949,10 +952,12 @@ async def _call_tools(
     usage_limits: _usage.UsageLimits,
     output_parts: list[_messages.ModelRequestPart],
     output_deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]],
+    output_deferred_metadata: dict[str, dict[str, Any]],
 ) -> AsyncIterator[_messages.HandleResponseEvent]:
     tool_parts_by_index: dict[int, _messages.ModelRequestPart] = {}
     user_parts_by_index: dict[int, _messages.UserPromptPart] = {}
     deferred_calls_by_index: dict[int, Literal['external', 'unapproved']] = {}
+    deferred_metadata_by_index: dict[int, dict[str, Any] | None] = {}
 
     if usage_limits.tool_calls_limit is not None:
         projected_usage = deepcopy(usage)
@@ -987,10 +992,12 @@ async def handle_call_or_result(
             tool_part, tool_user_content = (
                 (await coro_or_task) if inspect.isawaitable(coro_or_task) else coro_or_task.result()
             )
-        except exceptions.CallDeferred:
+        except exceptions.CallDeferred as e:
             deferred_calls_by_index[index] = 'external'
-        except exceptions.ApprovalRequired:
+            deferred_metadata_by_index[index] = e.metadata
+        except exceptions.ApprovalRequired as e:
             deferred_calls_by_index[index] = 'unapproved'
+            deferred_metadata_by_index[index] = e.metadata
         else:
             tool_parts_by_index[index] = tool_part
             if tool_user_content:
@@ -1028,8 +1035,25 @@ async def handle_call_or_result(
     output_parts.extend([tool_parts_by_index[k] for k in sorted(tool_parts_by_index)])
     output_parts.extend([user_parts_by_index[k] for k in sorted(user_parts_by_index)])
 
+    _populate_deferred_calls(
+        tool_calls, deferred_calls_by_index, deferred_metadata_by_index, output_deferred_calls, output_deferred_metadata
+    )
+
+
+def _populate_deferred_calls(
+    tool_calls: list[_messages.ToolCallPart],
+    deferred_calls_by_index: dict[int, Literal['external', 'unapproved']],
+    deferred_metadata_by_index: dict[int, dict[str, Any] | None],
+    output_deferred_calls: dict[Literal['external', 'unapproved'], list[_messages.ToolCallPart]],
+    output_deferred_metadata: dict[str, dict[str, Any]],
+) -> None:
+    """Populate deferred calls and metadata from indexed mappings."""
     for k in sorted(deferred_calls_by_index):
-        output_deferred_calls[deferred_calls_by_index[k]].append(tool_calls[k])
+        call = tool_calls[k]
+        output_deferred_calls[deferred_calls_by_index[k]].append(call)
+        metadata = deferred_metadata_by_index[k]
+        if metadata is not None:
+            output_deferred_metadata[call.tool_call_id] = metadata
 
 
 async def _call_tool(

pydantic_ai_slim/pydantic_ai/_json_schema.py

Lines changed: 3 additions & 4 deletions
@@ -25,7 +25,7 @@ def __init__(
         *,
         strict: bool | None = None,
         prefer_inlined_defs: bool = False,
-        simplify_nullable_unions: bool = False,
+        simplify_nullable_unions: bool = False,  # TODO (v2): Remove this, no longer used
     ):
         self.schema = schema
 
@@ -146,10 +146,9 @@ def _handle_union(self, schema: JsonSchema, union_kind: Literal['anyOf', 'oneOf'
 
         handled = [self._handle(member) for member in members]
 
-        # convert nullable unions to nullable types
+        # TODO (v2): Remove this feature, no longer used
        if self.simplify_nullable_unions:
            handled = self._simplify_nullable_union(handled)
-
        if len(handled) == 1:
            # In this case, no need to retain the union
            return handled[0] | schema
@@ -161,7 +160,7 @@ def _handle_union(self, schema: JsonSchema, union_kind: Literal['anyOf', 'oneOf'
 
     @staticmethod
     def _simplify_nullable_union(cases: list[JsonSchema]) -> list[JsonSchema]:
-        # TODO: Should we move this to relevant subclasses? Or is it worth keeping here to make reuse easier?
+        # TODO (v2): Remove this method, no longer used
        if len(cases) == 2 and {'type': 'null'} in cases:
            # Find the non-null schema
            non_null_schema = next(

pydantic_ai_slim/pydantic_ai/durable_exec/temporal/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -67,6 +67,7 @@ def _workflow_runner(runner: WorkflowRunner | None) -> WorkflowRunner:
     'rich',
     'httpx',
     'anyio',
+    'sniffio',
     'httpcore',
     # Used by fastmcp via py-key-value-aio
     'beartype',

pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_run_context.py

Lines changed: 2 additions & 1 deletion
@@ -14,7 +14,7 @@
 class TemporalRunContext(RunContext[AgentDepsT]):
     """The [`RunContext`][pydantic_ai.tools.RunContext] subclass to use to serialize and deserialize the run context for use inside a Temporal activity.
 
-    By default, only the `deps`, `run_id`, `retries`, `tool_call_id`, `tool_name`, `tool_call_approved`, `retry`, `max_retries`, `run_step` and `partial_output` attributes will be available.
+    By default, only the `deps`, `run_id`, `retries`, `tool_call_id`, `tool_name`, `tool_call_approved`, `retry`, `max_retries`, `run_step`, `usage`, and `partial_output` attributes will be available.
     To make another attribute available, create a `TemporalRunContext` subclass with a custom `serialize_run_context` class method that returns a dictionary that includes the attribute and pass it to [`TemporalAgent`][pydantic_ai.durable_exec.temporal.TemporalAgent].
     """
 
@@ -51,6 +51,7 @@ def serialize_run_context(cls, ctx: RunContext[Any]) -> dict[str, Any]:
             'max_retries': ctx.max_retries,
             'run_step': ctx.run_step,
             'partial_output': ctx.partial_output,
+            'usage': ctx.usage,
         }
 
     @classmethod

pydantic_ai_slim/pydantic_ai/durable_exec/temporal/_toolset.py

Lines changed: 8 additions & 6 deletions
@@ -27,11 +27,13 @@ class CallToolParams:
 
 @dataclass
 class _ApprovalRequired:
+    metadata: dict[str, Any] | None = None
     kind: Literal['approval_required'] = 'approval_required'
 
 
 @dataclass
 class _CallDeferred:
+    metadata: dict[str, Any] | None = None
     kind: Literal['call_deferred'] = 'call_deferred'
 
 
@@ -75,20 +77,20 @@ async def _wrap_call_tool_result(self, coro: Awaitable[Any]) -> CallToolResult:
         try:
             result = await coro
             return _ToolReturn(result=result)
-        except ApprovalRequired:
-            return _ApprovalRequired()
-        except CallDeferred:
-            return _CallDeferred()
+        except ApprovalRequired as e:
+            return _ApprovalRequired(metadata=e.metadata)
+        except CallDeferred as e:
+            return _CallDeferred(metadata=e.metadata)
         except ModelRetry as e:
             return _ModelRetry(message=e.message)
 
     def _unwrap_call_tool_result(self, result: CallToolResult) -> Any:
         if isinstance(result, _ToolReturn):
             return result.result
         elif isinstance(result, _ApprovalRequired):
-            raise ApprovalRequired()
+            raise ApprovalRequired(metadata=result.metadata)
         elif isinstance(result, _CallDeferred):
-            raise CallDeferred()
+            raise CallDeferred(metadata=result.metadata)
         elif isinstance(result, _ModelRetry):
             raise ModelRetry(result.message)
         else:
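The `_ApprovalRequired` / `_CallDeferred` wrappers exist because exceptions cannot cross the Temporal activity boundary directly: they are flattened into serializable dataclasses on the activity side and re-raised on the workflow side, and with this change the `metadata` survives the round trip. A standalone sketch of that pattern (illustrative names, not the module's real API):

```python
from dataclasses import dataclass
from typing import Any

from pydantic_ai import ApprovalRequired


@dataclass
class SerializedApprovalRequired:
    """Plain data stand-in for the exception, safe to send across the activity boundary."""

    metadata: dict[str, Any] | None = None


def wrap(exc: ApprovalRequired) -> SerializedApprovalRequired:
    # Activity side: flatten the exception into serializable data.
    return SerializedApprovalRequired(metadata=exc.metadata)


def unwrap(result: SerializedApprovalRequired) -> None:
    # Workflow side: re-raise with the metadata preserved.
    raise ApprovalRequired(metadata=result.metadata)
```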
