
Commit 74d94ec

BREAKING CHANGE: Support printing reasons in the console output for pydantic-evals (#2163)
1 parent 5510e81 commit 74d94ec

3 files changed (+77 lines, -12 lines)

docs/changelog.md

Lines changed: 2 additions & 0 deletions
@@ -16,6 +16,8 @@ Pydantic AI is still pre-version 1, so breaking changes will occur, however:
 
 See [#2388](https://github.com/pydantic/pydantic-ai/pull/2388) - The `source` field of an `EvaluationResult` is now of type `EvaluatorSpec` rather than the actual source `Evaluator` instance, to help with serialization/deserialization.
 
+See [#2163](https://github.com/pydantic/pydantic-ai/pull/2163) - The `EvaluationReport.print` and `EvaluationReport.console_table` methods now require most arguments be passed by keyword.
+
 ### v0.4.0 (2025-07-08)
 
 See [#1799](https://github.com/pydantic/pydantic-ai/pull/1799) - Pydantic Evals `EvaluationReport` and `ReportCase` are now generic dataclasses instead of Pydantic models. If you were serializing them using `model_dump()`, you will now need to use the `EvaluationReportAdapter` and `ReportCaseAdapter` type adapters instead.
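A usage note on the #2163 entry above: display options are now keyword-only, and the `include_reasons` flag added in this commit surfaces evaluator reasons in the printed table. The following is a hedged sketch rather than code from this commit; the task, case, and `ExactMatch` evaluator are invented for illustration, and returning `EvaluationReason` is assumed to be the supported way to attach a reason to a result.

from dataclasses import dataclass

from pydantic_evals import Case, Dataset
from pydantic_evals.evaluators import EvaluationReason, Evaluator, EvaluatorContext


@dataclass
class ExactMatch(Evaluator):
    # Illustrative evaluator that records a reason alongside its boolean result.
    def evaluate(self, ctx: EvaluatorContext) -> EvaluationReason:
        ok = ctx.output == ctx.expected_output
        return EvaluationReason(value=ok, reason='exact match' if ok else f'got {ctx.output!r}')


async def answer(question: str) -> str:
    return '4'  # stand-in task for the sketch


dataset = Dataset(
    cases=[Case(name='case1', inputs='What is 2+2?', expected_output='4')],
    evaluators=[ExactMatch()],
)
report = dataset.evaluate_sync(answer)

# Everything after `width`/`baseline` must now be passed by keyword;
# `include_reasons=True` prints each evaluator's reason beneath its cell.
report.print(include_input=True, include_reasons=True)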

pydantic_evals/pydantic_evals/reporting/__init__.py

Lines changed: 33 additions & 11 deletions
@@ -4,7 +4,7 @@
 from collections.abc import Mapping
 from dataclasses import dataclass
 from io import StringIO
-from typing import Any, Callable, Generic, Literal, Protocol
+from typing import Any, Callable, Generic, Literal, Protocol, cast
 
 from pydantic import BaseModel, TypeAdapter
 from rich.console import Console
@@ -168,6 +168,7 @@ def print(
         self,
         width: int | None = None,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -183,6 +184,7 @@ def print(
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ):  # pragma: no cover
         """Print this report to the console, optionally comparing it to a baseline report.
 
@@ -205,12 +207,14 @@ def print(
             label_configs=label_configs,
             metric_configs=metric_configs,
             duration_config=duration_config,
+            include_reasons=include_reasons,
         )
         Console(width=width).print(table)
 
     def console_table(
         self,
         baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None,
+        *,
         include_input: bool = False,
         include_metadata: bool = False,
         include_expected_output: bool = False,
@@ -226,6 +230,7 @@ def console_table(
         label_configs: dict[str, RenderValueConfig] | None = None,
         metric_configs: dict[str, RenderNumberConfig] | None = None,
         duration_config: RenderNumberConfig | None = None,
+        include_reasons: bool = False,
     ) -> Table:
         """Return a table containing the data from this report, or the diff between this report and a baseline report.
 
@@ -247,6 +252,7 @@ def console_table(
             label_configs=label_configs or {},
             metric_configs=metric_configs or {},
             duration_config=duration_config or _DEFAULT_DURATION_CONFIG,
+            include_reasons=include_reasons,
         )
         if baseline is None:
             return renderer.build_table(self)
@@ -529,15 +535,16 @@ class ReportCaseRenderer:
     include_labels: bool
     include_metrics: bool
     include_assertions: bool
+    include_reasons: bool
     include_durations: bool
     include_total_duration: bool
 
     input_renderer: _ValueRenderer
     metadata_renderer: _ValueRenderer
     output_renderer: _ValueRenderer
-    score_renderers: dict[str, _NumberRenderer]
-    label_renderers: dict[str, _ValueRenderer]
-    metric_renderers: dict[str, _NumberRenderer]
+    score_renderers: Mapping[str, _NumberRenderer]
+    label_renderers: Mapping[str, _ValueRenderer]
+    metric_renderers: Mapping[str, _NumberRenderer]
     duration_renderer: _NumberRenderer
 
     def build_base_table(self, title: str) -> Table:
@@ -581,10 +588,10 @@ def build_row(self, case: ReportCase) -> list[str]:
         row.append(self.output_renderer.render_value(None, case.output) or EMPTY_CELL_STR)
 
         if self.include_scores:
-            row.append(self._render_dict({k: v.value for k, v in case.scores.items()}, self.score_renderers))
+            row.append(self._render_dict({k: v for k, v in case.scores.items()}, self.score_renderers))
 
         if self.include_labels:
-            row.append(self._render_dict({k: v.value for k, v in case.labels.items()}, self.label_renderers))
+            row.append(self._render_dict({k: v for k, v in case.labels.items()}, self.label_renderers))
 
         if self.include_metrics:
             row.append(self._render_dict(case.metrics, self.metric_renderers))
@@ -783,26 +790,36 @@ def _render_dicts_diff(
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR
 
-    @staticmethod
     def _render_dict(
-        case_dict: dict[str, T],
+        self,
+        case_dict: Mapping[str, EvaluationResult[T] | T],
         renderers: Mapping[str, _AbstractRenderer[T]],
         *,
         include_names: bool = True,
    ) -> str:
         diff_lines: list[str] = []
         for key, val in case_dict.items():
-            rendered = renderers[key].render_value(key if include_names else None, val)
+            value = cast(EvaluationResult[T], val).value if isinstance(val, EvaluationResult) else val
+            rendered = renderers[key].render_value(key if include_names else None, value)
+            if self.include_reasons and isinstance(val, EvaluationResult) and (reason := val.reason):
+                rendered += f'\n Reason: {reason}\n'
             diff_lines.append(rendered)
         return '\n'.join(diff_lines) if diff_lines else EMPTY_CELL_STR
 
-    @staticmethod
     def _render_assertions(
+        self,
         assertions: list[EvaluationResult[bool]],
     ) -> str:
         if not assertions:
             return EMPTY_CELL_STR
-        return ''.join(['[green]✔[/]' if a.value else '[red]✗[/]' for a in assertions])
+        lines: list[str] = []
+        for a in assertions:
+            line = '[green]✔[/]' if a.value else '[red]✗[/]'
+            if self.include_reasons:
+                line = f'{a.name}: {line}\n'
+                line = f'{line} Reason: {a.reason}\n\n' if a.reason else line
+            lines.append(line)
+        return ''.join(lines)
 
     @staticmethod
     def _render_aggregate_assertions(
@@ -863,6 +880,10 @@ class EvaluationRenderer:
     metric_configs: dict[str, RenderNumberConfig]
     duration_config: RenderNumberConfig
 
+    # TODO: Make this class kw-only so we can reorder the kwargs
+    # Data to include
+    include_reasons: bool  # only applies to reports, not to diffs
+
     def include_scores(self, report: EvaluationReport, baseline: EvaluationReport | None = None):
         return any(case.scores for case in self._all_cases(report, baseline))
 
@@ -909,6 +930,7 @@ def _get_case_renderer(
             include_labels=self.include_labels(report, baseline),
             include_metrics=self.include_metrics(report, baseline),
             include_assertions=self.include_assertions(report, baseline),
+            include_reasons=self.include_reasons,
             include_durations=self.include_durations,
             include_total_duration=self.include_total_duration,
             input_renderer=input_renderer,
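Relatedly, `console_table` gains the same keyword-only `include_reasons` flag and returns a rich `Table`, which `print` itself writes via `Console(width=width).print(table)` per the diff above. Continuing the hedged sketch from the changelog section (reusing its hypothetical `report`):

from rich.console import Console

# Build the table directly to control rendering, e.g. with a wider console.
table = report.console_table(include_input=True, include_reasons=True)
Console(width=200).print(table)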

tests/evals/test_reporting.py

Lines changed: 42 additions & 1 deletion
@@ -57,7 +57,7 @@ def sample_score(mock_evaluator: Evaluator[TaskInput, TaskOutput, TaskMetadata])
     return EvaluationResult(
         name='MockEvaluator',
         value=2.5,
-        reason=None,
+        reason='my reason',
         source=mock_evaluator.as_spec(),
     )
 
@@ -120,6 +120,7 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport):
         label_configs={},
         metric_configs={},
         duration_config={},
+        include_reasons=False,
     )
 
     table = renderer.build_table(sample_report)
@@ -137,6 +138,43 @@ async def test_evaluation_renderer_basic(sample_report: EvaluationReport):
 """)
 
 
+async def test_evaluation_renderer_with_reasons(sample_report: EvaluationReport):
+    """Test basic functionality of EvaluationRenderer."""
+    renderer = EvaluationRenderer(
+        include_input=True,
+        include_output=True,
+        include_metadata=True,
+        include_expected_output=True,
+        include_durations=True,
+        include_total_duration=True,
+        include_removed_cases=False,
+        include_averages=True,
+        input_config={},
+        metadata_config={},
+        output_config={},
+        score_configs={},
+        label_configs={},
+        metric_configs={},
+        duration_config={},
+        include_reasons=True,
+    )
+
+    table = renderer.build_table(sample_report)
+    assert render_table(table) == snapshot("""\
+Evaluation Summary: test_report
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┓
+┃ Case ID ┃ Inputs ┃ Metadata ┃ Expected Output ┃ Outputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Durations ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━┩
+│ test_case │ {'query': 'What is 2+2?'} │ {'difficulty': 'easy'} │ {'answer': '4'} │ {'answer': '4'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ MockEvaluator: ✔ │ task: 0.100 │
+│ │ │ │ │ │ Reason: my reason │ │ │ │ total: 0.200 │
+│ │ │ │ │ │ │ │ │ │ │
+├───────────┼───────────────────────────┼────────────────────────┼─────────────────┼─────────────────┼─────────────────────┼────────────────────────┼─────────────────┼──────────────────┼──────────────┤
+│ Averages │ │ │ │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ task: 0.100 │
+│ │ │ │ │ │ │ │ │ │ total: 0.200 │
+└───────────┴───────────────────────────┴────────────────────────┴─────────────────┴─────────────────┴─────────────────────┴────────────────────────┴─────────────────┴──────────────────┴──────────────┘
+""")
+
+
 async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport):
     """Test EvaluationRenderer with baseline comparison."""
     baseline_report = EvaluationReport(
@@ -191,6 +229,7 @@ async def test_evaluation_renderer_with_baseline(sample_report: EvaluationReport
         label_configs={},
         metric_configs={},
         duration_config={},
+        include_reasons=False,
     )
 
     table = renderer.build_diff_table(sample_report, baseline_report)
@@ -248,6 +287,7 @@ async def test_evaluation_renderer_with_removed_cases(sample_report: EvaluationR
         label_configs={},
         metric_configs={},
         duration_config={},
+        include_reasons=False,
     )
 
     table = renderer.build_diff_table(sample_report, baseline_report)
@@ -311,6 +351,7 @@ async def test_evaluation_renderer_with_custom_configs(sample_report: Evaluation
             'diff_increase_style': 'bold red',
             'diff_decrease_style': 'bold green',
         },
+        include_reasons=False,
     )
 
     table = renderer.build_table(sample_report)
