
Commit 97834d6

dmontagu and DouweM authored
BREAKING CHANGE: Change type of 'source' field on EvaluationResult (#2388)
Co-authored-by: Douwe Maan <[email protected]>
1 parent d78b77e commit 97834d6


13 files changed: +147, -56 lines


docs/changelog.md

Lines changed: 4 additions & 0 deletions
@@ -12,6 +12,10 @@ Pydantic AI is still pre-version 1, so breaking changes will occur, however:
 !!! note
     Here's a filtered list of the breaking changes for each version to help you upgrade Pydantic AI.
 
+### v0.5.0 (2025-08-04)
+
+See [#2388](https://github.com/pydantic/pydantic-ai/pull/2388) - The `source` field of an `EvaluationResult` is now of type `EvaluatorSpec` rather than the actual source `Evaluator` instance, to help with serialization/deserialization.
+
 ### v0.4.0 (2025-07-08)
 
 See [#1799](https://github.com/pydantic/pydantic-ai/pull/1799) - Pydantic Evals `EvaluationReport` and `ReportCase` are now generic dataclasses instead of Pydantic models. If you were serializing them using `model_dump()`, you will now need to use the `EvaluationReportAdapter` and `ReportCaseAdapter` type adapters instead.
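
As a minimal migration sketch (not part of the commit itself): code that previously inspected `result.source` as an `Evaluator` instance can switch to reading the spec's `name` and `arguments` fields, which are the `EvaluatorSpec` fields shown later in this diff. The `summarize` helper and the `LLMJudge` mention in the comment are illustrative only.

    from pydantic_evals.evaluators import EvaluationResult

    def summarize(result: EvaluationResult) -> str:
        # Before v0.5.0, `result.source` was the Evaluator instance itself, so callers
        # could do things like `isinstance(result.source, LLMJudge)`. It is now an
        # EvaluatorSpec, so match on the serialization name (usually the class name)
        # and the recorded constructor arguments instead.
        spec = result.source
        return f'{result.name}={result.value!r} (from {spec.name}, arguments={spec.arguments!r})'

Code that still needs the original evaluator object (for example, to re-run it) now has to keep its own reference to the `Evaluator` instance, since only the spec is stored on the result.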

pydantic_evals/pydantic_evals/dataset.py

Lines changed: 1 addition & 1 deletion
@@ -38,9 +38,9 @@
 from ._utils import get_unwrapped_function_name, task_group_gather
 from .evaluators import EvaluationResult, Evaluator
 from .evaluators._run_evaluator import run_evaluator
-from .evaluators._spec import EvaluatorSpec
 from .evaluators.common import DEFAULT_EVALUATORS
 from .evaluators.context import EvaluatorContext
+from .evaluators.spec import EvaluatorSpec
 from .otel import SpanTree
 from .otel._context_subtree import context_subtree
 from .reporting import EvaluationReport, ReportCase, ReportCaseAggregate

pydantic_evals/pydantic_evals/evaluators/__init__.py

Lines changed: 3 additions & 2 deletions
@@ -10,7 +10,7 @@
     Python,
 )
 from .context import EvaluatorContext
-from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput
+from .evaluator import EvaluationReason, EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec
 
 __all__ = (
     # common
@@ -27,7 +27,8 @@
     'EvaluatorContext',
     # evaluator
     'Evaluator',
-    'EvaluationReason',
     'EvaluatorOutput',
+    'EvaluatorSpec',
+    'EvaluationReason',
     'EvaluationResult',
 )

pydantic_evals/pydantic_evals/evaluators/_run_evaluator.py

Lines changed: 3 additions & 1 deletion
@@ -48,7 +48,9 @@ async def run_evaluator(
     for name, result in results.items():
         if not isinstance(result, EvaluationReason):
             result = EvaluationReason(value=result)
-        details.append(EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator))
+        details.append(
+            EvaluationResult(name=name, value=result.value, reason=result.reason, source=evaluator.as_spec())
+        )
 
     return details
 

pydantic_evals/pydantic_evals/evaluators/evaluator.py

Lines changed: 13 additions & 8 deletions
@@ -17,15 +17,16 @@
 from pydantic_ai import _utils
 
 from .._utils import get_event_loop
-from ._spec import EvaluatorSpec
 from .context import EvaluatorContext
+from .spec import EvaluatorSpec
 
 __all__ = (
     'EvaluationReason',
     'EvaluationResult',
     'EvaluationScalar',
     'Evaluator',
     'EvaluatorOutput',
+    'EvaluatorSpec',
 )
 
 EvaluationScalar = Union[bool, int, float, str]
@@ -71,13 +72,13 @@ class EvaluationResult(Generic[EvaluationScalarT]):
         name: The name of the evaluation.
         value: The scalar result of the evaluation.
         reason: An optional explanation of the evaluation result.
-        source: The evaluator that produced this result.
+        source: The spec of the evaluator that produced this result.
     """
 
     name: str
     value: EvaluationScalarT
     reason: str | None
-    source: Evaluator
+    source: EvaluatorSpec
 
     def downcast(self, *value_types: type[T]) -> EvaluationResult[T] | None:
         """Attempt to downcast this result to a more specific type.
@@ -246,6 +247,13 @@ def serialize(self, info: SerializationInfo) -> Any:
         Returns:
             A JSON-serializable representation of this evaluator as an EvaluatorSpec.
         """
+        return to_jsonable_python(
+            self.as_spec(),
+            context=info.context,
+            serialize_unknown=True,
+        )
+
+    def as_spec(self) -> EvaluatorSpec:
         raw_arguments = self.build_serialization_arguments()
 
         arguments: None | tuple[Any,] | dict[str, Any]
@@ -255,11 +263,8 @@ def serialize(self, info: SerializationInfo) -> Any:
             arguments = (next(iter(raw_arguments.values())),)
         else:
             arguments = raw_arguments
-        return to_jsonable_python(
-            EvaluatorSpec(name=self.get_serialization_name(), arguments=arguments),
-            context=info.context,
-            serialize_unknown=True,
-        )
+
+        return EvaluatorSpec(name=self.get_serialization_name(), arguments=arguments)
 
     def build_serialization_arguments(self) -> dict[str, Any]:
         """Build the arguments for serialization.

pydantic_evals/pydantic_evals/evaluators/_spec.py renamed to pydantic_evals/pydantic_evals/evaluators/spec.py

Lines changed: 0 additions & 6 deletions
@@ -30,12 +30,6 @@ class EvaluatorSpec(BaseModel):
     * `'MyEvaluator'` - Just the (string) name of the Evaluator subclass is used if its `__init__` takes no arguments
     * `{'MyEvaluator': first_arg}` - A single argument is passed as the first positional argument to `MyEvaluator.__init__`
     * `{'MyEvaluator': {k1: v1, k2: v2}}` - Multiple kwargs are passed to `MyEvaluator.__init__`
-
-    Args:
-        name: The serialization name of the evaluator class returned by `EvaluatorClass.get_serialization_name()`;
-            this is usually just the class name itself.
-        arguments: The arguments to pass to the evaluator's constructor. Can be None (for no arguments),
-            a tuple (for a single positional argument), or a dict (for multiple keyword arguments).
     """
 
     name: str
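
For reference, a hedged sketch of how the three serialized shapes listed in this docstring correspond to explicit `EvaluatorSpec` values; `MyEvaluator` is the docstring's own placeholder, and the `arguments` typing follows the `None | tuple | dict` annotation in `evaluator.py` above.

    from pydantic_evals.evaluators import EvaluatorSpec

    # 'MyEvaluator'                        -> no constructor arguments
    no_args = EvaluatorSpec(name='MyEvaluator', arguments=None)

    # {'MyEvaluator': first_arg}           -> a single positional argument
    single_arg = EvaluatorSpec(name='MyEvaluator', arguments=('first_arg',))

    # {'MyEvaluator': {'k1': 1, 'k2': 2}}  -> keyword arguments
    keyword_args = EvaluatorSpec(name='MyEvaluator', arguments={'k1': 1, 'k2': 2})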

pydantic_evals/pydantic_evals/reporting/__init__.py

Lines changed: 5 additions & 1 deletion
@@ -669,7 +669,11 @@ def build_diff_row(
             row.append(scores_diff)
 
         if self.include_labels:  # pragma: no branch
-            labels_diff = self._render_dicts_diff(baseline.labels, new_case.labels, self.label_renderers)
+            labels_diff = self._render_dicts_diff(
+                {k: v.value for k, v in baseline.labels.items()},
+                {k: v.value for k, v in new_case.labels.items()},
+                self.label_renderers,
+            )
             row.append(labels_diff)
 
         if self.include_metrics:  # pragma: no branch

tests/evals/test_dataset.py

Lines changed: 76 additions & 9 deletions
@@ -9,7 +9,7 @@
 import pytest
 from dirty_equals import HasRepr, IsNumber
 from inline_snapshot import snapshot
-from pydantic import BaseModel
+from pydantic import BaseModel, TypeAdapter
 
 from ..conftest import IsStr, try_import
 from .utils import render_table
@@ -20,7 +20,7 @@
 
     from pydantic_evals import Case, Dataset
     from pydantic_evals.dataset import increment_eval_metric, set_eval_attribute
-    from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorOutput, LLMJudge, Python
+    from pydantic_evals.evaluators import EvaluationResult, Evaluator, EvaluatorOutput, EvaluatorSpec, LLMJudge, Python
     from pydantic_evals.evaluators.context import EvaluatorContext
 
     @dataclass
@@ -32,7 +32,7 @@ class MockEvaluator(Evaluator[object, object, object]):
         def evaluate(self, ctx: EvaluatorContext[object, object, object]) -> EvaluatorOutput:
             return self.output
 
-    from pydantic_evals.reporting import ReportCase, ReportCaseAdapter
+    from pydantic_evals.reporting import EvaluationReport, ReportCase, ReportCaseAdapter
 
 pytestmark = [pytest.mark.skipif(not imports_successful(), reason='pydantic-evals not installed'), pytest.mark.anyio]
 
@@ -456,13 +456,13 @@ async def my_task(inputs: TaskInput) -> TaskOutput:
             scores={},
             labels={
                 'output': EvaluationResult(
-                    name='output', value='a', reason=None, source=MockEvaluator(output={'output': 'a'})
+                    name='output', value='a', reason=None, source=MockEvaluator(output={'output': 'a'}).as_spec()
                 ),
                 'output_2': EvaluationResult(
-                    name='output', value='b', reason=None, source=MockEvaluator(output={'output': 'b'})
+                    name='output', value='b', reason=None, source=MockEvaluator(output={'output': 'b'}).as_spec()
                 ),
                 'output_3': EvaluationResult(
-                    name='output', value='c', reason=None, source=MockEvaluator(output={'output': 'c'})
+                    name='output', value='c', reason=None, source=MockEvaluator(output={'output': 'c'}).as_spec()
                 ),
             },
             assertions={},
@@ -482,13 +482,13 @@ async def my_task(inputs: TaskInput) -> TaskOutput:
             scores={},
             labels={
                 'output': EvaluationResult(
-                    name='output', value='a', reason=None, source=MockEvaluator(output={'output': 'a'})
+                    name='output', value='a', reason=None, source=MockEvaluator(output={'output': 'a'}).as_spec()
                 ),
                 'output_2': EvaluationResult(
-                    name='output', value='b', reason=None, source=MockEvaluator(output={'output': 'b'})
+                    name='output', value='b', reason=None, source=MockEvaluator(output={'output': 'b'}).as_spec()
                 ),
                 'output_3': EvaluationResult(
-                    name='output', value='c', reason=None, source=MockEvaluator(output={'output': 'c'})
+                    name='output', value='c', reason=None, source=MockEvaluator(output={'output': 'c'}).as_spec()
                 ),
             },
             assertions={},
@@ -501,6 +501,73 @@ async def my_task(inputs: TaskInput) -> TaskOutput:
     )
 
 
+async def test_report_round_trip_serialization(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]):
+    """Test round-trip serialization of an evaluation report."""
+
+    async def my_task(inputs: TaskInput) -> TaskOutput:
+        return TaskOutput(answer=f'answer to {inputs.query}')
+
+    example_dataset.add_evaluator(MockEvaluator({'output': 'a'}))
+
+    report = await example_dataset.evaluate(my_task)
+    assert report == snapshot(
+        EvaluationReport(
+            name='my_task',
+            cases=[
+                ReportCase(
+                    name='case1',
+                    inputs=TaskInput(query='What is 2+2?'),
+                    metadata=TaskMetadata(difficulty='easy', category='general'),
+                    expected_output=TaskOutput(answer='4', confidence=1.0),
+                    output=TaskOutput(answer='answer to What is 2+2?', confidence=1.0),
+                    metrics={},
+                    attributes={},
+                    scores={},
+                    labels={
+                        'output': EvaluationResult(
+                            name='output',
+                            value='a',
+                            reason=None,
+                            source=EvaluatorSpec(name='MockEvaluator', arguments=({'output': 'a'},)),
+                        )
+                    },
+                    assertions={},
+                    task_duration=1.0,
+                    total_duration=6.0,
+                    trace_id='00000000000000000000000000000001',
+                    span_id='0000000000000003',
+                ),
+                ReportCase(
+                    name='case2',
+                    inputs=TaskInput(query='What is the capital of France?'),
+                    metadata=TaskMetadata(difficulty='medium', category='geography'),
+                    expected_output=TaskOutput(answer='Paris', confidence=1.0),
+                    output=TaskOutput(answer='answer to What is the capital of France?', confidence=1.0),
+                    metrics={},
+                    attributes={},
+                    scores={},
+                    labels={
+                        'output': EvaluationResult(
+                            name='output',
+                            value='a',
+                            reason=None,
+                            source=EvaluatorSpec(name='MockEvaluator', arguments=({'output': 'a'},)),
+                        )
+                    },
+                    assertions={},
+                    task_duration=1.0,
+                    total_duration=4.0,
+                    trace_id='00000000000000000000000000000001',
+                    span_id='0000000000000007',
+                ),
+            ],
+        )
+    )
+
+    report_adapter = TypeAdapter(EvaluationReport[TaskInput, TaskOutput, TaskMetadata])
+    assert report == report_adapter.validate_json(report_adapter.dump_json(report, indent=2))
+
+
 async def test_genai_attribute_collection(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]):
     async def my_task(inputs: TaskInput) -> TaskOutput:
         with logfire.span(

tests/evals/test_evaluator_base.py

Lines changed: 2 additions & 2 deletions
@@ -52,11 +52,11 @@ def evaluate(self, ctx: EvaluatorContext) -> bool:
     evaluator = DummyEvaluator()
 
     # Test basic result
-    result = EvaluationResult(name='test', value=True, reason='Success', source=evaluator)
+    result = EvaluationResult(name='test', value=True, reason='Success', source=evaluator.as_spec())
     assert result.name == 'test'
     assert result.value is True
     assert result.reason == 'Success'
-    assert result.source == evaluator
+    assert result.source == evaluator.as_spec()
 
     # Test downcast with matching type
     downcast = result.downcast(bool)

tests/evals/test_evaluator_spec.py

Lines changed: 1 addition & 1 deletion
@@ -6,7 +6,7 @@
 from ..conftest import try_import
 
 with try_import() as imports_successful:
-    from pydantic_evals.evaluators._spec import (
+    from pydantic_evals.evaluators.spec import (
         EvaluatorSpec,
         _SerializedEvaluatorSpec,  # pyright: ignore[reportPrivateUsage]
     )
