|
17 | 17 | from .._thinking_part import split_content_into_text_and_thinking |
18 | 18 | from .._utils import guard_tool_call_id as _guard_tool_call_id, now_utc as _now_utc, number_to_datetime |
19 | 19 | from ..builtin_tools import CodeExecutionTool, WebSearchTool |
20 | | -from ..exceptions import UserError |
| 20 | +from ..exceptions import StreamCancelled, UserError |
21 | 21 | from ..messages import ( |
22 | 22 | AudioUrl, |
23 | 23 | BinaryContent, |
@@ -222,6 +222,17 @@ class OpenAIResponsesModelSettings(OpenAIChatModelSettings, total=False): |
222 | 222 | `medium`, and `high`. |
223 | 223 | """ |
224 | 224 |
|
| 225 | + openai_previous_response_id: Literal['auto'] | str |
| 226 | + """The ID of a previous response from the model to use as the starting point for a continued conversation. |
| 227 | +
|
| 228 | + When set to `'auto'`, the request automatically uses the most recent |
| 229 | + `provider_response_id` from the message history and omits earlier messages. |
| 230 | +
|
| 231 | + This enables the model to use server-side conversation state and faithfully reference previous reasoning. |
| 232 | + See the [OpenAI Responses API documentation](https://platform.openai.com/docs/guides/reasoning#keeping-reasoning-items-in-context) |
| 233 | + for more information. |
| 234 | + """ |
| 235 | + |
225 | 236 |
|
226 | 237 | @dataclass(init=False) |
227 | 238 | class OpenAIChatModel(Model): |
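As a rough usage sketch (not taken from this PR), the new setting could be enabled via agent-level model settings; the model name, result attributes, and import paths below are illustrative and may vary between pydantic-ai versions.

```python
# Hypothetical sketch: let pydantic-ai reuse OpenAI's server-side conversation
# state by picking the latest provider_response_id from the message history.
from pydantic_ai import Agent
from pydantic_ai.models.openai import OpenAIResponsesModel, OpenAIResponsesModelSettings

model = OpenAIResponsesModel('gpt-4o')  # any Responses-API-capable model name
settings = OpenAIResponsesModelSettings(openai_previous_response_id='auto')
agent = Agent(model, model_settings=settings)

first = agent.run_sync('Outline a three-step refactor of my parser.')

# On the follow-up run only the messages after the last OpenAI response are sent;
# earlier context is referenced server-side via previous_response_id.
followup = agent.run_sync(
    'Expand step two in more detail.',
    message_history=first.all_messages(),
)
print(followup.output)
```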
@@ -977,6 +988,10 @@ async def _responses_create( |
977 | 988 | else: |
978 | 989 | tool_choice = 'auto' |
979 | 990 |
|
| 991 | + previous_response_id = model_settings.get('openai_previous_response_id') |
| 992 | + if previous_response_id == 'auto': |
| 993 | + previous_response_id, messages = self._get_previous_response_id_and_new_messages(messages) |
| 994 | + |
980 | 995 | instructions, openai_messages = await self._map_messages(messages, model_settings) |
981 | 996 | reasoning = self._get_reasoning(model_settings) |
982 | 997 |
|
@@ -1027,6 +1042,7 @@ async def _responses_create( |
1027 | 1042 | truncation=model_settings.get('openai_truncation', NOT_GIVEN), |
1028 | 1043 | timeout=model_settings.get('timeout', NOT_GIVEN), |
1029 | 1044 | service_tier=model_settings.get('openai_service_tier', NOT_GIVEN), |
| 1045 | + previous_response_id=previous_response_id, |
1030 | 1046 | reasoning=reasoning, |
1031 | 1047 | user=model_settings.get('openai_user', NOT_GIVEN), |
1032 | 1048 | text=text or NOT_GIVEN, |
@@ -1092,6 +1108,28 @@ def _map_tool_definition(self, f: ToolDefinition) -> responses.FunctionToolParam |
1092 | 1108 | ), |
1093 | 1109 | } |
1094 | 1110 |
|
| 1111 | + def _get_previous_response_id_and_new_messages( |
| 1112 | + self, messages: list[ModelMessage] |
| 1113 | + ) -> tuple[str | None, list[ModelMessage]]: |
| 1114 | + # When `openai_previous_response_id` is set to 'auto', the most recent |
| 1115 | + # `provider_response_id` from the message history is selected and all |
| 1116 | + # earlier messages are omitted, so the OpenAI Responses API can reuse its |
| 1117 | + # server-side conversation state. Returns the `previous_response_id` and the |
| 1118 | + # trimmed messages, or `(None, messages)` if no prior response id (or no newer message) is found. |
| 1119 | + previous_response_id = None |
| 1120 | + trimmed_messages: list[ModelMessage] = [] |
| 1121 | + for m in reversed(messages): |
| 1122 | + if isinstance(m, ModelResponse) and m.provider_name == self.system: |
| 1123 | + previous_response_id = m.provider_response_id |
| 1124 | + break |
| 1125 | + else: |
| 1126 | + trimmed_messages.append(m) |
| 1127 | + |
| 1128 | + if previous_response_id and trimmed_messages: |
| 1129 | + return previous_response_id, list(reversed(trimmed_messages)) |
| 1130 | + else: |
| 1131 | + return None, messages |
| 1132 | + |
1095 | 1133 | async def _map_messages( # noqa: C901 |
1096 | 1134 | self, messages: list[ModelMessage], model_settings: OpenAIResponsesModelSettings |
1097 | 1135 | ) -> tuple[str | NotGiven, list[responses.ResponseInputItemParam]]: |
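To illustrate the trimming behaviour of the helper above, here is a toy, made-up history; the message classes come from `pydantic_ai.messages`, and the provider values are hypothetical.

```python
# Illustrative only: what _get_previous_response_id_and_new_messages would do
# to a short history when openai_previous_response_id='auto'.
from pydantic_ai.messages import ModelRequest, ModelResponse, TextPart, UserPromptPart

history = [
    ModelRequest(parts=[UserPromptPart(content='hello')]),
    ModelResponse(
        parts=[TextPart(content='hi!')],
        provider_name='openai',           # assumed to match self.system here
        provider_response_id='resp_123',  # made-up Responses API id
    ),
    ModelRequest(parts=[UserPromptPart(content='and again?')]),
]

# Expected outcome: previous_response_id == 'resp_123' and only the final
# ModelRequest is sent; everything at or before the matched ModelResponse is
# dropped because OpenAI already holds that context server-side. If no matching
# ModelResponse (or no newer message) exists, the full history is sent instead.
```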
@@ -1309,9 +1347,14 @@ class OpenAIStreamedResponse(StreamedResponse): |
1309 | 1347 | _response: AsyncIterable[ChatCompletionChunk] |
1310 | 1348 | _timestamp: datetime |
1311 | 1349 | _provider_name: str |
| 1350 | + _cancelled: bool = field(default=False, init=False) |
1312 | 1351 |
|
1313 | 1352 | async def _get_event_iterator(self) -> AsyncIterator[ModelResponseStreamEvent]: |
1314 | 1353 | async for chunk in self._response: |
| 1354 | + # Check for cancellation before processing each chunk |
| 1355 | + if self._cancelled: |
| 1356 | + raise StreamCancelled('OpenAI stream was cancelled') |
| 1357 | + |
1315 | 1358 | self._usage += _map_usage(chunk) |
1316 | 1359 |
|
1317 | 1360 | if chunk.id and self.provider_response_id is None: |
@@ -1380,6 +1423,14 @@ def timestamp(self) -> datetime: |
1380 | 1423 | """Get the timestamp of the response.""" |
1381 | 1424 | return self._timestamp |
1382 | 1425 |
|
| 1426 | + async def cancel(self) -> None: |
| 1427 | + """Cancel the streaming response. |
| 1428 | +
|
| 1429 | + This marks the stream as cancelled, which causes the iterator to raise |
| 1430 | + a `StreamCancelled` exception when the next chunk is processed. |
| 1431 | + """ |
| 1432 | + self._cancelled = True |
| 1433 | + |
1383 | 1434 |
|
1384 | 1435 | @dataclass |
1385 | 1436 | class OpenAIResponsesStreamedResponse(StreamedResponse): |
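A hedged sketch of driving the new cancellation hook from a watchdog task follows; `streamed_response` stands in for the `OpenAIStreamedResponse` created during streaming, and the `StreamCancelled` import path is an assumption that mirrors the relative `..exceptions` import added above.

```python
# Hypothetical sketch: cancel an in-flight OpenAI stream after a timeout.
import asyncio

from pydantic_ai.exceptions import StreamCancelled  # assumed path, mirrors `..exceptions` above


async def consume(streamed_response) -> None:
    try:
        async for _event in streamed_response:  # StreamedResponse yields ModelResponseStreamEvents
            ...  # handle partial text / tool-call deltas here
    except StreamCancelled:
        print('stream was cancelled before completion')


async def watchdog(streamed_response, timeout: float) -> None:
    await asyncio.sleep(timeout)
    # Sets the _cancelled flag; the iterator raises StreamCancelled when the
    # next chunk arrives.
    await streamed_response.cancel()


async def run_both(streamed_response) -> None:
    await asyncio.gather(consume(streamed_response), watchdog(streamed_response, 5.0))
```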