From 2c3a218b0b65b65f4b8d5431dbb1a2e73b76aadb Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Fri, 24 Oct 2025 09:50:40 -0600 Subject: [PATCH 1/8] Record experiment metadata --- pydantic_evals/pydantic_evals/dataset.py | 25 +++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 23dda99ef7..e5d5e92648 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -265,6 +265,8 @@ async def evaluate( retry_evaluators: RetryConfig | None = None, *, task_name: str | None = None, + metadata: dict[str, Any] | None = None, + tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -283,6 +285,8 @@ async def evaluate( retry_evaluators: Optional retry configuration for evaluator execution. task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. + metadata: Optional dict of experiment metadata. + tags: Optional sequence of logfire tags. Returns: A report containing the results of the evaluation. @@ -294,6 +298,9 @@ async def evaluate( limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack() + extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'} + if metadata is not None: + extra_attributes['metadata'] = metadata with ( logfire_span( 'evaluate {name}', @@ -301,7 +308,9 @@ async def evaluate( task_name=task_name, dataset_name=self.name, n_cases=len(self.cases), - **{'gen_ai.operation.name': 'experiment'}, # pyright: ignore[reportArgumentType] + metadata=metadata, + **extra_attributes, + _tags=tags, ) as eval_span, progress_bar or nullcontext(), ): @@ -342,10 +351,16 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name span_id=span_id, trace_id=trace_id, ) - if (averages := report.averages()) is not None and averages.assertions is not None: - experiment_metadata = {'n_cases': len(self.cases), 'averages': averages} - eval_span.set_attribute('logfire.experiment.metadata', experiment_metadata) - eval_span.set_attribute('assertion_pass_rate', averages.assertions) + full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)} + if metadata is not None: + full_experiment_metadata['metadata'] = metadata + if tags is not None: + full_experiment_metadata['tags'] = tags + if (averages := report.averages()) is not None: + full_experiment_metadata['averages'] = averages + if averages.assertions is not None: + eval_span.set_attribute('assertion_pass_rate', averages.assertions) + eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata) return report def evaluate_sync( From 3cbe4dc494626df70febe6794860f9567857f9c5 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:18:27 -0600 Subject: [PATCH 2/8] Add metadata for experiments --- pydantic_evals/pydantic_evals/dataset.py | 1 + .../pydantic_evals/reporting/__init__.py | 137 ++++++- tests/evals/test_reporting.py | 367 ++++++++++++++++++ 3 files changed, 494 insertions(+), 11 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index e5d5e92648..01caf0ce86 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py 
@@ -348,6 +348,7 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name name=name, cases=cases, failures=failures, + experiment_metadata=metadata, span_id=span_id, trace_id=trace_id, ) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 16bc00d261..8070c760d3 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -7,8 +7,10 @@ from typing import Any, Generic, Literal, Protocol, cast from pydantic import BaseModel, TypeAdapter -from rich.console import Console +from rich.console import Console, Group, RenderableType +from rich.panel import Panel from rich.table import Table +from rich.text import Text from typing_extensions import TypedDict, TypeVar from pydantic_evals._utils import UNSET, Unset @@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]): failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list) """The failures in the report. These are cases where task execution raised an exception.""" + experiment_metadata: dict[str, Any] | None = None + """Metadata associated with the specific experiment represented by this report.""" trace_id: str | None = None """The trace ID of the evaluation.""" span_id: str | None = None @@ -261,7 +265,6 @@ def render( duration_config=duration_config, include_reasons=include_reasons, ) - Console(file=io_file) return io_file.getvalue() def print( @@ -297,7 +300,8 @@ def print( if console is None: # pragma: no branch console = Console(width=width) - table = self.console_table( + metadata_panel = self._metadata_panel(baseline=baseline) + renderable: RenderableType = self.console_table( baseline=baseline, include_input=include_input, include_metadata=include_metadata, @@ -316,8 +320,12 @@ def print( metric_configs=metric_configs, duration_config=duration_config, include_reasons=include_reasons, + with_title=not metadata_panel, ) - console.print(table) + # Wrap table with experiment metadata panel if present + if metadata_panel: + renderable = Group(metadata_panel, renderable) + console.print(renderable) if include_errors and self.failures: # pragma: no cover failures_table = self.failures_table( include_input=include_input, @@ -330,6 +338,7 @@ def print( ) console.print(failures_table, style='red') + # TODO(DavidM): in v2, change the return type here to RenderableType def console_table( self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None, @@ -351,9 +360,11 @@ def console_table( metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, include_reasons: bool = False, + with_title: bool = True, ) -> Table: - """Return a table containing the data from this report, or the diff between this report and a baseline report. + """Return a table containing the data from this report. + If a baseline is provided, returns a diff between this report and the baseline report. Optionally include input and output details. 
""" renderer = EvaluationRenderer( @@ -378,10 +389,82 @@ def console_table( include_reasons=include_reasons, ) if baseline is None: - return renderer.build_table(self) + return renderer.build_table(self, with_title=with_title) else: # pragma: no cover - return renderer.build_diff_table(self, baseline) + return renderer.build_diff_table(self, baseline, with_title=with_title) + def _metadata_panel( + self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None + ) -> RenderableType | None: + """Wrap a table with an experiment metadata panel if metadata exists. + + Args: + table: The table to wrap + baseline: Optional baseline report for diff metadata + + Returns: + Either the table unchanged or a Group with Panel and Table + """ + if baseline is None: + # Single report - show metadata if present + if self.experiment_metadata: + metadata_text = Text() + items = list(self.experiment_metadata.items()) + for i, (key, value) in enumerate(items): + metadata_text.append(f'{key}: {value}', style='dim') + if i < len(items) - 1: + metadata_text.append('\n') + return Panel( + metadata_text, + title=f'Evaluation Summary: {self.name}', + title_align='left', + border_style='dim', + padding=(0, 1), + expand=False, + ) + else: + # Diff report - show metadata diff if either has metadata + if self.experiment_metadata or baseline.experiment_metadata: + diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}' + metadata_text = Text() + lines_styles: list[tuple[str, str]] = [] + if baseline.experiment_metadata and self.experiment_metadata: + # Collect all keys from both + all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys())) + for key in all_keys: + baseline_val = baseline.experiment_metadata.get(key) + report_val = self.experiment_metadata.get(key) + if baseline_val == report_val: + lines_styles.append((f'{key}: {report_val}', 'dim')) + elif baseline_val is None: + lines_styles.append((f'+ {key}: {report_val}', 'green')) + elif report_val is None: + lines_styles.append((f'- {key}: {baseline_val}', 'red')) + else: + lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow')) + elif self.experiment_metadata: + lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()] + else: # baseline.experiment_metadata only + assert baseline.experiment_metadata is not None + lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()] + + for i, (line, style) in enumerate(lines_styles): + metadata_text.append(line, style=style) + if i < len(lines_styles) - 1: + metadata_text.append('\n') + + return Panel( + metadata_text, + title=f'Evaluation Diff: {diff_name}', + title_align='left', + border_style='dim', + padding=(0, 1), + expand=False, + ) + + return None + + # TODO(DavidM): in v2, change the return type here to RenderableType def failures_table( self, *, @@ -705,6 +788,7 @@ class ReportCaseRenderer: metric_renderers: Mapping[str, _NumberRenderer] duration_renderer: _NumberRenderer + # TODO(DavidM): in v2, change the return type here to RenderableType def build_base_table(self, title: str) -> Table: """Build and return a Rich Table for the diff output.""" table = Table(title=title, show_lines=True) @@ -731,6 +815,7 @@ def build_base_table(self, title: str) -> Table: table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right') return table + # TODO(DavidM): in v2, change the return type here to RenderableType def 
build_failures_table(self, title: str) -> Table: """Build and return a Rich Table for the failures output.""" table = Table(title=title, show_lines=True) @@ -1190,9 +1275,22 @@ def _get_case_renderer( duration_renderer=duration_renderer, ) - def build_table(self, report: EvaluationReport) -> Table: + # TODO(DavidM): in v2, change the return type here to RenderableType + def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table: + """Build a table for the report. + + Args: + report: The evaluation report to render + with_title: Whether to include the title in the table (default True) + + Returns: + A Rich Table object + """ case_renderer = self._get_case_renderer(report) - table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}') + + title = f'Evaluation Summary: {report.name}' if with_title else '' + table = case_renderer.build_base_table(title) + for case in report.cases: table.add_row(*case_renderer.build_row(case)) @@ -1203,7 +1301,20 @@ def build_table(self, report: EvaluationReport) -> Table: return table - def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table: + # TODO(DavidM): in v2, change the return type here to RenderableType + def build_diff_table( + self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True + ) -> Table: + """Build a diff table comparing report to baseline. + + Args: + report: The evaluation report to compare + baseline: The baseline report to compare against + with_title: Whether to include the title in the table (default True) + + Returns: + A Rich Table object + """ report_cases = report.cases baseline_cases = self._baseline_cases_to_include(report, baseline) @@ -1228,7 +1339,10 @@ def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) case_renderer = self._get_case_renderer(report, baseline) diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}' - table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}') + + title = f'Evaluation Diff: {diff_name}' if with_title else '' + table = case_renderer.build_base_table(title) + for baseline_case, new_case in diff_cases: table.add_row(*case_renderer.build_diff_row(new_case, baseline_case)) for case in added_cases: @@ -1247,6 +1361,7 @@ def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) return table + # TODO(DavidM): in v2, change the return type here to RenderableType def build_failures_table(self, report: EvaluationReport) -> Table: case_renderer = self._get_case_renderer(report) table = case_renderer.build_failures_table('Case Failures') diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 06b86065ca..225ab74460 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -950,3 +950,370 @@ async def test_evaluation_renderer_no_evaluator_failures_column(): │ test_case │ {'query': 'What is 2+2?'} │ {'answer': '4'} │ accuracy: 0.950 │ 0.100 │ └───────────┴───────────────────────────┴─────────────────┴─────────────────┴──────────┘ """) + + +async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer with experiment metadata.""" + report = EvaluationReport( + cases=[sample_report_case], + name='test_report', + experiment_metadata={'model': 'gpt-4o', 'temperature': 0.7, 'prompt_version': 'v2'}, + ) + + output = report.render( + include_input=True, + include_metadata=False, + 
include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=True, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Summary: test_report ─╮ +│ model: gpt-4o │ +│ temperature: 0.7 │ +│ prompt_version: v2 │ +╰───────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ {'query': 'What is 2+2?'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +├───────────┼───────────────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ +│ Averages │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ +└───────────┴───────────────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_with_long_experiment_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer with very long experiment metadata.""" + report = EvaluationReport( + cases=[sample_report_case], + name='test_report', + experiment_metadata={ + 'model': 'gpt-4o-2024-08-06', + 'temperature': 0.7, + 'prompt_version': 'v2.1.5', + 'system_prompt': 'You are a helpful assistant', + 'max_tokens': 1000, + 'top_p': 0.9, + 'frequency_penalty': 0.1, + 'presence_penalty': 0.1, + }, + ) + + output = report.render( + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Summary: test_report ──────────╮ +│ model: gpt-4o-2024-08-06 │ +│ temperature: 0.7 │ +│ prompt_version: v2.1.5 │ +│ system_prompt: You are a helpful assistant │ +│ max_tokens: 1000 │ +│ top_p: 0.9 │ +│ frequency_penalty: 0.1 │ +│ presence_penalty: 0.1 │ +╰────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_experiment_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table with experiment metadata.""" + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata={'model': 'gpt-4', 'temperature': 0.5}, + ) + + new_report = EvaluationReport( + 
cases=[sample_report_case], + name='new_report', + experiment_metadata={'model': 'gpt-4o', 'temperature': 0.7}, + ) + + output = new_report.render( + baseline=baseline_report, + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=True, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ model: gpt-4 → gpt-4o │ +│ temperature: 0.5 → 0.7 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +├───────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ +│ Averages │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ +└───────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_only_new_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where only new report has metadata.""" + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata=None, # No metadata + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + experiment_metadata={'model': 'gpt-4o', 'temperature': 0.7}, + ) + + output = new_report.render( + baseline=baseline_report, + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ + model: gpt-4o │ +│ + temperature: 0.7 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_only_baseline_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where only baseline report has metadata.""" + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata={'model': 'gpt-4', 'temperature': 0.5}, + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + 
experiment_metadata=None, # No metadata + ) + + output = new_report.render( + baseline=baseline_report, + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ - model: gpt-4 │ +│ - temperature: 0.5 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_same_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where both reports have the same metadata.""" + metadata = {'model': 'gpt-4o', 'temperature': 0.7} + + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata=metadata, + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + experiment_metadata=metadata, + ) + + output = new_report.render( + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + baseline=baseline_report, + include_errors=False, # Prevent failures table from being added + ) + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ model: gpt-4o │ +│ temperature: 0.7 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where both reports have the same metadata.""" + + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata={ + 'updated-key': 'original value', + 'preserved-key': 'preserved value', + 'old-key': 'old value', + }, + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + experiment_metadata={ + 'updated-key': 'updated value', + 'preserved-key': 'preserved value', + 'new-key': 'new value', + }, + ) + + output = new_report.render( + include_input=False, + 
include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + baseline=baseline_report, + include_errors=False, # Prevent failures table from being added + ) + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ + new-key: new value │ +│ - old-key: old value │ +│ preserved-key: preserved value │ +│ updated-key: original value → updated value │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") From 140ddf32590d6843b707839b33cdfbec1b2f7862 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 09:52:56 -0600 Subject: [PATCH 3/8] Add docs and tests, and fix some bugs --- docs/evals/how-to/metrics-attributes.md | 277 +++++++++++++++++++++-- pydantic_evals/pydantic_evals/dataset.py | 16 +- tests/evals/test_dataset.py | 20 ++ 3 files changed, 297 insertions(+), 16 deletions(-) diff --git a/docs/evals/how-to/metrics-attributes.md b/docs/evals/how-to/metrics-attributes.md index c1601953cc..167cbedd3e 100644 --- a/docs/evals/how-to/metrics-attributes.md +++ b/docs/evals/how-to/metrics-attributes.md @@ -418,34 +418,285 @@ class QualityEvaluator(Evaluator): ``` +## Experiment-Level Metadata + +In addition to case-level metadata, you can also pass experiment-level metadata when calling [`evaluate()`][pydantic_evals.Dataset.evaluate]: + +```python +from pydantic_evals import Case, Dataset + +dataset = Dataset( + cases=[ + Case( + inputs='test', + metadata={'difficulty': 'easy'}, # Case-level metadata + ) + ] +) + + +async def task(inputs: str) -> str: + return f'Result: {inputs}' + + +# Pass experiment-level metadata +async def main(): + report = await dataset.evaluate( + task, + metadata={ + 'model': 'gpt-4o', + 'prompt_version': 'v2.1', + 'temperature': 0.7, + }, + ) + + # Access experiment metadata in the report + print(report.experiment_metadata) + #> {'model': 'gpt-4o', 'prompt_version': 'v2.1', 'temperature': 0.7} +``` + +### When to Use Experiment Metadata + +Experiment metadata is useful for tracking configuration that applies to the entire evaluation run: + +- **Model configuration**: Model name, version, parameters +- **Prompt versioning**: Which prompt template was used +- **Infrastructure**: Deployment environment, region +- **Experiment context**: Developer name, feature branch, commit hash + +This metadata is especially valuable when: + +- Comparing multiple evaluation runs over time +- Tracking which configuration produced which results +- Reproducing evaluation results from historical data + +### Viewing in Reports + +Experiment metadata appears at the top of printed reports: + +```python +from pydantic_evals import Case, Dataset + +dataset = Dataset(cases=[Case(inputs='hello', expected_output='HELLO')]) + + +async def 
task(text: str) -> str: + return text.upper() + +async def main(): + report = await dataset.evaluate( + task, + metadata={'model': 'gpt-4o', 'version': 'v1.0'}, + ) + + print(report.render()) + """ + ╭─ Evaluation Summary: task ─╮ + │ model: gpt-4o │ + │ version: v1.0 │ + ╰────────────────────────────╯ + ┏━━━━━━━━━━┳━━━━━━━━━━┓ + ┃ Case ID ┃ Duration ┃ + ┡━━━━━━━━━━╇━━━━━━━━━━┩ + │ Case 1 │ 10ms │ + ├──────────┼──────────┤ + │ Averages │ 10ms │ + └──────────┴──────────┘ + """ +``` + +## Synchronization between Tasks and Experiment Metadata + +Experiment metadata is for *recording* configuration, not *configuring* the task. +The metadata dict doesn't automatically configure your task's behavior; you must ensure the values in the metadata dict match what your task actually uses. +For example, it's easy to accidentally have metadata claim `temperature: 0.7` while your task actually uses `temperature: 1.0`, leading to incorrect experiment tracking and unreproducible results. + +To avoid this problem, we recommend establishing a single source of truth for configuration that both your task and metadata reference. +Below are a few suggested patterns for achieving this synchronization. + +### Pattern 1: Shared Module Constants + +For simpler cases, use module-level constants: + +```python +from pydantic_ai import Agent +from pydantic_evals import Case, Dataset + +# Module constants as single source of truth +MODEL_NAME = 'openai:gpt-5-mini' +TEMPERATURE = 0.7 +SYSTEM_PROMPT = 'You are a helpful assistant.' + +agent = Agent(MODEL_NAME, model_settings={'temperature': TEMPERATURE}, system_prompt=SYSTEM_PROMPT) + + +async def task(inputs: str) -> str: + result = await agent.run(inputs) + return result.output + + +async def main(): + dataset = Dataset(cases=[Case(inputs='What is the capital of France?')]) + + # Metadata references same constants + await dataset.evaluate( + task, + metadata={ + 'model': MODEL_NAME, + 'temperature': TEMPERATURE, + 'system_prompt': SYSTEM_PROMPT, + }, + ) +``` + +### Pattern 2: Configuration Object (Recommended) + +Define configuration once and use it everywhere: + +```python +from dataclasses import asdict, dataclass + +from pydantic_ai import Agent +from pydantic_evals import Case, Dataset + + +@dataclass +class TaskConfig: + """Single source of truth for task configuration. + + Includes all variables you'd like to see in experiment metadata. 
+ """ + + model: str + temperature: float + max_tokens: int + prompt_version: str + + +# Define configuration once +config = TaskConfig( + model='openai:gpt-5-mini', + temperature=0.7, + max_tokens=500, + prompt_version='v2.1', +) + +# Use config in task +agent = Agent( + config.model, + model_settings={'temperature': config.temperature, 'max_tokens': config.max_tokens}, +) + + +async def task(inputs: str) -> str: + """Task uses the same config that's recorded in metadata.""" + result = await agent.run(inputs) + return result.output + + +# Evaluate with metadata derived from the same config +async def main(): + dataset = Dataset(cases=[Case(inputs='What is the capital of France?')]) + + report = await dataset.evaluate( + task, + metadata=asdict(config), # Guaranteed to match task behavior + ) + + print(report.experiment_metadata) + """ + { + 'model': 'openai:gpt-5-mini', + 'temperature': 0.7, + 'max_tokens': 500, + 'prompt_version': 'v2.1', + } + """ +``` + +If it's problematic to have a global task configuration, you can also create your `TaskConfig` object at the task +call-site and pass it to the agent via `deps` or similar, but in this case you would still need to guarantee that the +value is always the same as the value passed to `metadata` in the call to `Dataset.evaluate`. + +### Anti-Pattern: Duplicate Configuration + +**Avoid this common mistake**: + +```python +from pydantic_ai import Agent +from pydantic_evals import Case, Dataset + +# ❌ BAD: Configuration defined in multiple places +agent = Agent('openai:gpt-5-mini', model_settings={'temperature': 0.7}) + + +async def task(inputs: str) -> str: + result = await agent.run(inputs) + return result.output + + +async def main(): + dataset = Dataset(cases=[Case(inputs='test')]) + + # ❌ BAD: Metadata manually typed - easy to get out of sync + await dataset.evaluate( + task, + metadata={ + 'model': 'openai:gpt-5-mini', # Duplicated! Could diverge from agent definition + 'temperature': 0.8, # ⚠️ WRONG! Task actually uses 0.7 + }, + ) +``` + +In this anti-pattern, the metadata claims `temperature: 0.8` but the task uses `0.7`. 
This leads to: + +- Incorrect experiment tracking +- Inability to reproduce results +- Confusion when comparing runs +- Wasted time debugging "why results differ" + ## Metrics vs Attributes vs Metadata Understanding the differences: -| Feature | Metrics | Attributes | Metadata | -|---------|---------|------------|----------| -| **Set in** | Task execution | Task execution | Case definition | -| **Type** | int, float | Any | Any | -| **Purpose** | Quantitative | Qualitative | Test data | -| **Used for** | Aggregation | Context | Input to task | -| **Available to** | Evaluators | Evaluators | Task & Evaluators | +| Feature | Metrics | Attributes | Case Metadata | Experiment Metadata | +|---------|---------|------------|---------------|---------------------| +| **Set in** | Task execution | Task execution | Case definition | `evaluate()` call | +| **Type** | int, float | Any | Any | Any | +| **Purpose** | Quantitative | Qualitative | Test data | Experiment config | +| **Used for** | Aggregation | Context | Input to task | Tracking runs | +| **Available to** | Evaluators | Evaluators | Task & Evaluators | Report only | +| **Scope** | Per case | Per case | Per case | Per experiment | ```python -from pydantic_evals import Case, increment_eval_metric, set_eval_attribute +from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute -# Metadata: Defined in case (before execution) -Case( +# Case Metadata: Defined in case (before execution) +case = Case( inputs='question', - metadata={'difficulty': 'hard', 'category': 'math'}, + metadata={'difficulty': 'hard', 'category': 'math'}, # Per-case metadata ) +dataset = Dataset(cases=[case]) + # Metrics & Attributes: Recorded during execution -def task(inputs): - # These are recorded during execution +async def task(inputs): + # These are recorded during execution for each case increment_eval_metric('tokens', 100) set_eval_attribute('model', 'gpt-4o') return f'Result: {inputs}' + + +async def main(): + # Experiment Metadata: Defined at evaluation time + await dataset.evaluate( + task, + metadata={ # Experiment-level metadata + 'prompt_version': 'v2.1', + 'temperature': 0.7, + }, + ) ``` ## Troubleshooting diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 01caf0ce86..479f3c27c4 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -286,7 +286,7 @@ async def evaluate( task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. - tags: Optional sequence of logfire tags. + tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. @@ -308,7 +308,6 @@ async def evaluate( task_name=task_name, dataset_name=self.name, n_cases=len(self.cases), - metadata=metadata, **extra_attributes, _tags=tags, ) as eval_span, @@ -372,6 +371,10 @@ def evaluate_sync( progress: bool = True, retry_task: RetryConfig | None = None, retry_evaluators: RetryConfig | None = None, + *, + task_name: str | None = None, + metadata: dict[str, Any] | None = None, + tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -387,6 +390,10 @@ def evaluate_sync( progress: Whether to show a progress bar for the evaluation. Defaults to True. retry_task: Optional retry configuration for the task execution. 
retry_evaluators: Optional retry configuration for evaluator execution. + task_name: Optional override to the name of the task being executed, otherwise the name of the task + function will be used. + metadata: Optional dict of experiment metadata. + tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. @@ -394,11 +401,14 @@ def evaluate_sync( return get_event_loop().run_until_complete( self.evaluate( task, - task_name=name, + name=name, max_concurrency=max_concurrency, progress=progress, retry_task=retry_task, retry_evaluators=retry_evaluators, + task_name=task_name, + metadata=metadata, + tags=tags, ) ) diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index 31f91bfc0b..72f5af648f 100644 --- a/tests/evals/test_dataset.py +++ b/tests/evals/test_dataset.py @@ -1750,3 +1750,23 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: ), ] ) + + +async def test_evaluate_with_experiment_metadata(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]): + """Test that experiment metadata passed to evaluate() appears in the report.""" + + async def task(inputs: TaskInput) -> TaskOutput: + return TaskOutput(answer=inputs.query.upper()) + + # Pass experiment metadata to evaluate() + experiment_metadata = { + 'model': 'gpt-4o', + 'prompt_version': 'v2.1', + 'temperature': 0.7, + 'max_tokens': 1000, + } + + report = await example_dataset.evaluate(task, metadata=experiment_metadata) + + # Verify that the report contains the experiment metadata + assert report.experiment_metadata == experiment_metadata From 6decf5f09ce5760b91b97fc38f0ff76d26de4925 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:18:11 -0600 Subject: [PATCH 4/8] Remove tags and fix test coverage --- pydantic_evals/pydantic_evals/dataset.py | 8 ----- tests/evals/test_reporting.py | 45 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 479f3c27c4..6e973fbaf0 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -266,7 +266,6 @@ async def evaluate( *, task_name: str | None = None, metadata: dict[str, Any] | None = None, - tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -286,7 +285,6 @@ async def evaluate( task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. - tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. 
@@ -309,7 +307,6 @@ async def evaluate( dataset_name=self.name, n_cases=len(self.cases), **extra_attributes, - _tags=tags, ) as eval_span, progress_bar or nullcontext(), ): @@ -354,8 +351,6 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)} if metadata is not None: full_experiment_metadata['metadata'] = metadata - if tags is not None: - full_experiment_metadata['tags'] = tags if (averages := report.averages()) is not None: full_experiment_metadata['averages'] = averages if averages.assertions is not None: @@ -374,7 +369,6 @@ def evaluate_sync( *, task_name: str | None = None, metadata: dict[str, Any] | None = None, - tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -393,7 +387,6 @@ def evaluate_sync( task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. - tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. @@ -408,7 +401,6 @@ def evaluate_sync( retry_evaluators=retry_evaluators, task_name=task_name, metadata=metadata, - tags=tags, ) ) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 225ab74460..58a2b399f4 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -1317,3 +1317,48 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case │ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ └───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) + + +async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where both reports have the same metadata.""" + + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + ) + + output = new_report.render( + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + baseline=baseline_report, + include_errors=False, # Prevent failures table from being added + ) + assert output == snapshot("""\ + Evaluation Diff: baseline_report → new_report +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") From 669fab8478d41003d915cde9fbe9e8826fd19cc8 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:20:34 -0600 Subject: [PATCH 5/8] Update tests to show impact on attributes of setting experiment metadata --- 
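Note: a minimal sketch of the call pattern this test exercises — the dataset and task below are illustrative stand-ins, not the actual test fixtures. A `metadata` dict passed to `Dataset.evaluate()` is recorded as a `metadata` attribute on the experiment span, echoed inside `logfire.experiment.metadata` alongside `n_cases` and the averages, and surfaced on the report as `experiment_metadata`.

```python
from pydantic_evals import Case, Dataset

# Hypothetical single-case dataset, used only to illustrate the call pattern.
dataset = Dataset(cases=[Case(inputs='What is the capital of France?', expected_output='Paris')])


async def answer(inputs: str) -> str:
    return 'Paris'


async def main():
    # The metadata dict is attached to the experiment span and exposed on the report.
    report = await dataset.evaluate(answer, metadata={'key': 'value'})
    print(report.experiment_metadata)
    #> {'key': 'value'}
```
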
tests/evals/test_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index 72f5af648f..ca5f90d44a 100644 --- a/tests/evals/test_dataset.py +++ b/tests/evals/test_dataset.py @@ -1530,7 +1530,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: return TaskOutput(answer='Paris') return TaskOutput(answer='Unknown') # pragma: no cover - await example_dataset.evaluate(mock_async_task) + await example_dataset.evaluate(mock_async_task, metadata={'key': 'value'}) spans = capfire.exporter.exported_spans_as_dict(parse_json_attributes=True) spans.sort(key=lambda s: s['start_time']) @@ -1556,6 +1556,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'gen_ai.operation.name': {}, 'n_cases': {}, 'name': {}, + 'metadata': {'type': 'object'}, 'logfire.experiment.metadata': { 'type': 'object', 'properties': { @@ -1571,11 +1572,13 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'type': 'object', }, 'logfire.msg': 'evaluate mock_async_task', + 'metadata': {'key': 'value'}, 'logfire.msg_template': 'evaluate {name}', 'logfire.span_type': 'span', 'n_cases': 2, 'logfire.experiment.metadata': { 'n_cases': 2, + 'metadata': {'key': 'value'}, 'averages': { 'name': 'Averages', 'scores': {'confidence': 1.0}, From 09a9b610ffeb6018aa35a9781a058c883e0e17c5 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 15:56:21 -0600 Subject: [PATCH 6/8] Fix failing test --- tests/evals/test_reporting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 58a2b399f4..8e575e4bfc 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -1355,7 +1355,7 @@ async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: Rep include_errors=False, # Prevent failures table from being added ) assert output == snapshot("""\ - Evaluation Diff: baseline_report → new_report + Evaluation Diff: baseline_report → new_report \n\ ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ From 7c747739b12dcdaa1ff1e412503b3f7d1d67f196 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 16:13:05 -0600 Subject: [PATCH 7/8] Fix docstring of evaluate_sync --- pydantic_evals/pydantic_evals/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 6e973fbaf0..28f0f2a1a5 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -377,11 +377,11 @@ def evaluate_sync( Args: task: The task to evaluate. This should be a callable that takes the inputs of the case and returns the output. - name: The name of the task being evaluated, this is used to identify the task in the report. - If omitted, the name of the task function will be used. + name: The name of the experiment being run, this is used to identify the experiment in the report. + If omitted, the task_name will be used; if that is not specified, the name of the task function is used. max_concurrency: The maximum number of concurrent evaluations of the task to allow. 
If None, all cases will be evaluated concurrently. - progress: Whether to show a progress bar for the evaluation. Defaults to True. + progress: Whether to show a progress bar for the evaluation. Defaults to `True`. retry_task: Optional retry configuration for the task execution. retry_evaluators: Optional retry configuration for evaluator execution. task_name: Optional override to the name of the task being executed, otherwise the name of the task From 2a0bf33d6cb199257527020afc08e9257a19d62f Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Tue, 28 Oct 2025 00:33:13 -0600 Subject: [PATCH 8/8] Fix coverage --- pydantic_evals/pydantic_evals/reporting/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 8070c760d3..622e70970f 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -234,7 +234,7 @@ def render( metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, include_reasons: bool = False, - ) -> str: # pragma: no cover + ) -> str: """Render this report to a nicely-formatted string, optionally comparing it to a baseline report. If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`. @@ -390,7 +390,7 @@ def console_table( ) if baseline is None: return renderer.build_table(self, with_title=with_title) - else: # pragma: no cover + else: return renderer.build_diff_table(self, baseline, with_title=with_title) def _metadata_panel(