Merge main

dmontagu · dmontagu · commit b65db2a07096 · 2025-10-03T13:59:05.000-06:00
diff --git a/docs/evals.md b/docs/evals.md
@@ -653,10 +653,12 @@ async def main():
     print(output_file.read_text())
     """
     # yaml-language-server: $schema=questions_cases_schema.json
+    name: null
     cases:
     - name: Easy Capital Question
       inputs:
         question: What is the capital of France?
+        context: null
       metadata:
         difficulty: easy
         category: Geography
@@ -668,6 +670,7 @@ async def main():
     - name: Challenging Landmark Question
       inputs:
         question: Which world-famous landmark is located on the banks of the Seine River?
+        context: null
       metadata:
         difficulty: hard
         category: Landmarks
@@ -676,6 +679,7 @@ async def main():
         confidence: 0.9
       evaluators:
       - EqualsExpected
+    evaluators: []
     """
 ```
 
@@ -713,11 +717,13 @@ async def main():
     """
     {
       "$schema": "questions_cases_schema.json",
+      "name": null,
       "cases": [
         {
           "name": "Easy Capital Question",
           "inputs": {
-            "question": "What is the capital of France?"
+            "question": "What is the capital of France?",
+            "context": null
           },
           "metadata": {
             "difficulty": "easy",
@@ -734,7 +740,8 @@ async def main():
         {
           "name": "Challenging Landmark Question",
           "inputs": {
-            "question": "Which world-famous landmark is located on the banks of the Seine River?"
+            "question": "Which world-famous landmark is located on the banks of the Seine River?",
+            "context": null
           },
           "metadata": {
             "difficulty": "hard",
@@ -748,7 +755,8 @@ async def main():
             "EqualsExpected"
           ]
         }
-      ]
+      ],
+      "evaluators": []
     }
     """
 ```
diff --git a/pydantic_ai_slim/pydantic_ai/_agent_graph.py b/pydantic_ai_slim/pydantic_ai/_agent_graph.py
@@ -459,15 +459,13 @@ async def _prepare_request(
 
         original_history = ctx.state.message_history[:]
         message_history = await _process_message_history(original_history, ctx.deps.history_processors, run_context)
-        # Never merge the new `ModelRequest` with the one preceding it, to keep `new_messages()` from accidentally including part of the existing message history
-        message_history = [*_clean_message_history(message_history[:-1]), message_history[-1]]
         # `ctx.state.message_history` is the same list used by `capture_run_messages`, so we should replace its contents, not the reference
         ctx.state.message_history[:] = message_history
         # Update the new message index to ensure `result.new_messages()` returns the correct messages
         ctx.deps.new_message_index -= len(original_history) - len(message_history)
 
-        # Do one more cleaning pass to merge possible consecutive trailing `ModelRequest`s into one, with tool call parts before user parts,
-        # but don't store it in the message history on state.
+        # Merge possible consecutive trailing `ModelRequest`s into one, with tool call parts before user parts,
+        # but don't store it in the message history on state. This is just for the benefit of model classes that want clear user/assistant boundaries.
         # See `tests/test_tools.py::test_parallel_tool_return_with_deferred` for an example where this is necessary
         message_history = _clean_message_history(message_history)
 
diff --git a/pydantic_ai_slim/pydantic_ai/format_prompt.py b/pydantic_ai_slim/pydantic_ai/format_prompt.py
@@ -3,6 +3,7 @@
 from collections.abc import Iterable, Iterator, Mapping
 from dataclasses import asdict, dataclass, field, fields, is_dataclass
 from datetime import date
+from enum import Enum
 from typing import Any, Literal
 from xml.etree import ElementTree
 
@@ -26,8 +27,8 @@ def format_as_xml(
     This is useful since LLMs often find it easier to read semi-structured data (e.g. examples) as XML,
     rather than JSON etc.
 
-    Supports: `str`, `bytes`, `bytearray`, `bool`, `int`, `float`, `date`, `datetime`, `Mapping`,
-    `Iterable`, `dataclass`, and `BaseModel`.
+    Supports: `str`, `bytes`, `bytearray`, `bool`, `int`, `float`, `date`, `datetime`, `Enum`,
+    `Mapping`, `Iterable`, `dataclass`, and `BaseModel`.
 
     Args:
         obj: Python Object to serialize to XML.
@@ -101,7 +102,7 @@ def _to_xml(self, value: Any, path: str, tag: str | None = None) -> ElementTree.
             element.text = value
         elif isinstance(value, bytes | bytearray):
             element.text = value.decode(errors='ignore')
-        elif isinstance(value, bool | int | float):
+        elif isinstance(value, bool | int | float | Enum):
             element.text = str(value)
         elif isinstance(value, date):
             element.text = value.isoformat()
diff --git a/pydantic_ai_slim/pydantic_ai/models/__init__.py b/pydantic_ai_slim/pydantic_ai/models/__init__.py
@@ -141,12 +141,20 @@
         'google-gla:gemini-2.0-flash',
         'google-gla:gemini-2.0-flash-lite',
         'google-gla:gemini-2.5-flash',
+        'google-gla:gemini-2.5-flash-preview-09-2025',
+        'google-gla:gemini-flash-latest',
         'google-gla:gemini-2.5-flash-lite',
+        'google-gla:gemini-2.5-flash-lite-preview-09-2025',
+        'google-gla:gemini-flash-lite-latest',
         'google-gla:gemini-2.5-pro',
         'google-vertex:gemini-2.0-flash',
         'google-vertex:gemini-2.0-flash-lite',
         'google-vertex:gemini-2.5-flash',
+        'google-vertex:gemini-2.5-flash-preview-09-2025',
+        'google-vertex:gemini-flash-latest',
         'google-vertex:gemini-2.5-flash-lite',
+        'google-vertex:gemini-2.5-flash-lite-preview-09-2025',
+        'google-vertex:gemini-flash-lite-latest',
         'google-vertex:gemini-2.5-pro',
         'grok:grok-4',
         'grok:grok-4-0709',
diff --git a/pydantic_ai_slim/pydantic_ai/models/gemini.py b/pydantic_ai_slim/pydantic_ai/models/gemini.py
@@ -46,7 +46,11 @@
     'gemini-2.0-flash',
     'gemini-2.0-flash-lite',
     'gemini-2.5-flash',
+    'gemini-2.5-flash-preview-09-2025',
     'gemini-2.5-flash-lite',
+    'gemini-2.5-flash-lite-preview-09-2025',
+    'gemini-flash-latest',
+    'gemini-flash-lite-latest',
     'gemini-2.5-pro',
 ]
 """Latest Gemini models."""
diff --git a/pydantic_ai_slim/pydantic_ai/models/google.py b/pydantic_ai_slim/pydantic_ai/models/google.py
@@ -95,7 +95,11 @@
     'gemini-2.0-flash',
     'gemini-2.0-flash-lite',
     'gemini-2.5-flash',
+    'gemini-2.5-flash-preview-09-2025',
+    'gemini-flash-latest',
     'gemini-2.5-flash-lite',
+    'gemini-2.5-flash-lite-preview-09-2025',
+    'gemini-flash-lite-latest',
     'gemini-2.5-pro',
 ]
 """Latest Gemini models."""
diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py
@@ -646,15 +646,15 @@ def to_file(
 
         context: dict[str, Any] = {'use_short_form': True}
         if fmt == 'yaml':
-            dumped_data = self.model_dump(mode='json', by_alias=True, exclude_defaults=True, context=context)
+            dumped_data = self.model_dump(mode='json', by_alias=True, context=context)
             content = yaml.dump(dumped_data, sort_keys=False)
             if schema_ref:  # pragma: no branch
                 yaml_language_server_line = f'{_YAML_SCHEMA_LINE_PREFIX}{schema_ref}'
                 content = f'{yaml_language_server_line}\n{content}'
             path.write_text(content)
         else:
             context['$schema'] = schema_ref
-            json_data = self.model_dump_json(indent=2, by_alias=True, exclude_defaults=True, context=context)
+            json_data = self.model_dump_json(indent=2, by_alias=True, context=context)
             path.write_text(json_data + '\n')
 
     @classmethod
@@ -724,6 +724,7 @@ class Case(BaseModel, extra='forbid'):  # pyright: ignore[reportUnusedClass]  #
                 evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
 
         class Dataset(BaseModel, extra='forbid'):
+            name: str | None = None
             cases: list[Case]
             if evaluator_schema_types:  # pragma: no branch
                 evaluators: list[Union[tuple(evaluator_schema_types)]] = []  # pyright: ignore  # noqa UP007
diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py
@@ -4,7 +4,7 @@
 import sys
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any, cast
+from typing import Any, Literal, cast
 
 import pytest
 import yaml
@@ -864,6 +864,38 @@ async def test_serialization_to_json(example_dataset: Dataset[TaskInput, TaskOut
     assert (tmp_path / schema).exists()
 
 
+def test_serializing_parts_with_discriminators(tmp_path: Path):
+    class Foo(BaseModel):
+        foo: str
+        kind: Literal['foo'] = 'foo'
+
+    class Bar(BaseModel):
+        bar: str
+        kind: Literal['bar'] = 'bar'
+
+    items = [Foo(foo='foo'), Bar(bar='bar')]
+
+    dataset = Dataset[list[Foo | Bar]](cases=[Case(inputs=items)])
+    yaml_path = tmp_path / 'test_cases.yaml'
+    dataset.to_file(yaml_path)
+
+    loaded_dataset = Dataset[list[Foo | Bar]].from_file(yaml_path)
+    assert loaded_dataset == snapshot(
+        Dataset(
+            name='test_cases',
+            cases=[
+                Case(
+                    name=None,
+                    inputs=[
+                        Foo(foo='foo'),
+                        Bar(bar='bar'),
+                    ],
+                )
+            ],
+        )
+    )
+
+
 def test_serialization_errors(tmp_path: Path):
     with pytest.raises(ValueError) as exc_info:
         Dataset[TaskInput, TaskOutput, TaskMetadata].from_file(tmp_path / 'test_cases.abc')
diff --git a/tests/test_agent.py b/tests/test_agent.py
@@ -5377,3 +5377,137 @@ def dynamic_instr() -> str:
     sys_texts = [p.content for p in req.parts if isinstance(p, SystemPromptPart)]
     # The dynamic system prompt should still be present since overrides target instructions only
     assert dynamic_value in sys_texts
+
+
+def test_continue_conversation_that_ended_in_output_tool_call(allow_model_requests: None):
+    def llm(messages: list[ModelMessage], info: AgentInfo) -> ModelResponse:
+        if any(isinstance(p, ToolReturnPart) and p.tool_name == 'roll_dice' for p in messages[-1].parts):
+            return ModelResponse(
+                parts=[
+                    ToolCallPart(
+                        tool_name='final_result',
+                        args={'dice_roll': 4},
+                        tool_call_id='pyd_ai_tool_call_id__final_result',
+                    )
+                ]
+            )
+        return ModelResponse(
+            parts=[ToolCallPart(tool_name='roll_dice', args={}, tool_call_id='pyd_ai_tool_call_id__roll_dice')]
+        )
+
+    class Result(BaseModel):
+        dice_roll: int
+
+    agent = Agent(FunctionModel(llm), output_type=Result)
+
+    @agent.tool_plain
+    def roll_dice() -> int:
+        return 4
+
+    result = agent.run_sync('Roll me a dice.')
+    messages = result.all_messages()
+    assert messages == snapshot(
+        [
+            ModelRequest(
+                parts=[
+                    UserPromptPart(
+                        content='Roll me a dice.',
+                        timestamp=IsDatetime(),
+                    )
+                ]
+            ),
+            ModelResponse(
+                parts=[ToolCallPart(tool_name='roll_dice', args={}, tool_call_id='pyd_ai_tool_call_id__roll_dice')],
+                usage=RequestUsage(input_tokens=55, output_tokens=2),
+                model_name='function:llm:',
+                timestamp=IsDatetime(),
+            ),
+            ModelRequest(
+                parts=[
+                    ToolReturnPart(
+                        tool_name='roll_dice',
+                        content=4,
+                        tool_call_id='pyd_ai_tool_call_id__roll_dice',
+                        timestamp=IsDatetime(),
+                    )
+                ]
+            ),
+            ModelResponse(
+                parts=[
+                    ToolCallPart(
+                        tool_name='final_result',
+                        args={'dice_roll': 4},
+                        tool_call_id='pyd_ai_tool_call_id__final_result',
+                    )
+                ],
+                usage=RequestUsage(input_tokens=56, output_tokens=6),
+                model_name='function:llm:',
+                timestamp=IsDatetime(),
+            ),
+            ModelRequest(
+                parts=[
+                    ToolReturnPart(
+                        tool_name='final_result',
+                        content='Final result processed.',
+                        tool_call_id='pyd_ai_tool_call_id__final_result',
+                        timestamp=IsDatetime(),
+                    )
+                ]
+            ),
+        ]
+    )
+
+    result = agent.run_sync('Roll me a dice again.', message_history=messages)
+    new_messages = result.new_messages()
+    assert new_messages == snapshot(
+        [
+            ModelRequest(
+                parts=[
+                    UserPromptPart(
+                        content='Roll me a dice again.',
+                        timestamp=IsDatetime(),
+                    )
+                ]
+            ),
+            ModelResponse(
+                parts=[ToolCallPart(tool_name='roll_dice', args={}, tool_call_id='pyd_ai_tool_call_id__roll_dice')],
+                usage=RequestUsage(input_tokens=66, output_tokens=8),
+                model_name='function:llm:',
+                timestamp=IsDatetime(),
+            ),
+            ModelRequest(
+                parts=[
+                    ToolReturnPart(
+                        tool_name='roll_dice',
+                        content=4,
+                        tool_call_id='pyd_ai_tool_call_id__roll_dice',
+                        timestamp=IsDatetime(),
+                    )
+                ]
+            ),
+            ModelResponse(
+                parts=[
+                    ToolCallPart(
+                        tool_name='final_result',
+                        args={'dice_roll': 4},
+                        tool_call_id='pyd_ai_tool_call_id__final_result',
+                    )
+                ],
+                usage=RequestUsage(input_tokens=67, output_tokens=12),
+                model_name='function:llm:',
+                timestamp=IsDatetime(),
+            ),
+            ModelRequest(
+                parts=[
+                    ToolReturnPart(
+                        tool_name='final_result',
+                        content='Final result processed.',
+                        tool_call_id='pyd_ai_tool_call_id__final_result',
+                        timestamp=IsDatetime(),
+                    )
+                ]
+            ),
+        ]
+    )
+
+    assert not any(isinstance(p, ToolReturnPart) and p.tool_name == 'final_result' for p in new_messages[0].parts)
diff --git a/tests/test_format_as_xml.py b/tests/test_format_as_xml.py