From 2c3a218b0b65b65f4b8d5431dbb1a2e73b76aadb Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Fri, 24 Oct 2025 09:50:40 -0600 Subject: [PATCH 1/8] Record experiment metadata --- pydantic_evals/pydantic_evals/dataset.py | 25 +++++++++++++++++++----- 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 23dda99ef7..e5d5e92648 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -265,6 +265,8 @@ async def evaluate( retry_evaluators: RetryConfig | None = None, *, task_name: str | None = None, + metadata: dict[str, Any] | None = None, + tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -283,6 +285,8 @@ async def evaluate( retry_evaluators: Optional retry configuration for evaluator execution. task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. + metadata: Optional dict of experiment metadata. + tags: Optional sequence of logfire tags. Returns: A report containing the results of the evaluation. @@ -294,6 +298,9 @@ async def evaluate( limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack() + extra_attributes: dict[str, Any] = {'gen_ai.operation.name': 'experiment'} + if metadata is not None: + extra_attributes['metadata'] = metadata with ( logfire_span( 'evaluate {name}', @@ -301,7 +308,9 @@ async def evaluate( task_name=task_name, dataset_name=self.name, n_cases=len(self.cases), - **{'gen_ai.operation.name': 'experiment'}, # pyright: ignore[reportArgumentType] + metadata=metadata, + **extra_attributes, + _tags=tags, ) as eval_span, progress_bar or nullcontext(), ): @@ -342,10 +351,16 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name span_id=span_id, trace_id=trace_id, ) - if (averages := report.averages()) is not None and averages.assertions is not None: - experiment_metadata = {'n_cases': len(self.cases), 'averages': averages} - eval_span.set_attribute('logfire.experiment.metadata', experiment_metadata) - eval_span.set_attribute('assertion_pass_rate', averages.assertions) + full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)} + if metadata is not None: + full_experiment_metadata['metadata'] = metadata + if tags is not None: + full_experiment_metadata['tags'] = tags + if (averages := report.averages()) is not None: + full_experiment_metadata['averages'] = averages + if averages.assertions is not None: + eval_span.set_attribute('assertion_pass_rate', averages.assertions) + eval_span.set_attribute('logfire.experiment.metadata', full_experiment_metadata) return report def evaluate_sync( From 3cbe4dc494626df70febe6794860f9567857f9c5 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Fri, 24 Oct 2025 11:18:27 -0600 Subject: [PATCH 2/8] Add metadata for experiments --- pydantic_evals/pydantic_evals/dataset.py | 1 + .../pydantic_evals/reporting/__init__.py | 137 ++++++- tests/evals/test_reporting.py | 367 ++++++++++++++++++ 3 files changed, 494 insertions(+), 11 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index e5d5e92648..01caf0ce86 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py 
@@ -348,6 +348,7 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name name=name, cases=cases, failures=failures, + experiment_metadata=metadata, span_id=span_id, trace_id=trace_id, ) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 16bc00d261..8070c760d3 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -7,8 +7,10 @@ from typing import Any, Generic, Literal, Protocol, cast from pydantic import BaseModel, TypeAdapter -from rich.console import Console +from rich.console import Console, Group, RenderableType +from rich.panel import Panel from rich.table import Table +from rich.text import Text from typing_extensions import TypedDict, TypeVar from pydantic_evals._utils import UNSET, Unset @@ -196,6 +198,8 @@ class EvaluationReport(Generic[InputsT, OutputT, MetadataT]): failures: list[ReportCaseFailure[InputsT, OutputT, MetadataT]] = field(default_factory=list) """The failures in the report. These are cases where task execution raised an exception.""" + experiment_metadata: dict[str, Any] | None = None + """Metadata associated with the specific experiment represented by this report.""" trace_id: str | None = None """The trace ID of the evaluation.""" span_id: str | None = None @@ -261,7 +265,6 @@ def render( duration_config=duration_config, include_reasons=include_reasons, ) - Console(file=io_file) return io_file.getvalue() def print( @@ -297,7 +300,8 @@ def print( if console is None: # pragma: no branch console = Console(width=width) - table = self.console_table( + metadata_panel = self._metadata_panel(baseline=baseline) + renderable: RenderableType = self.console_table( baseline=baseline, include_input=include_input, include_metadata=include_metadata, @@ -316,8 +320,12 @@ def print( metric_configs=metric_configs, duration_config=duration_config, include_reasons=include_reasons, + with_title=not metadata_panel, ) - console.print(table) + # Wrap table with experiment metadata panel if present + if metadata_panel: + renderable = Group(metadata_panel, renderable) + console.print(renderable) if include_errors and self.failures: # pragma: no cover failures_table = self.failures_table( include_input=include_input, @@ -330,6 +338,7 @@ def print( ) console.print(failures_table, style='red') + # TODO(DavidM): in v2, change the return type here to RenderableType def console_table( self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None, @@ -351,9 +360,11 @@ def console_table( metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, include_reasons: bool = False, + with_title: bool = True, ) -> Table: - """Return a table containing the data from this report, or the diff between this report and a baseline report. + """Return a table containing the data from this report. + If a baseline is provided, returns a diff between this report and the baseline report. Optionally include input and output details. 
""" renderer = EvaluationRenderer( @@ -378,10 +389,82 @@ def console_table( include_reasons=include_reasons, ) if baseline is None: - return renderer.build_table(self) + return renderer.build_table(self, with_title=with_title) else: # pragma: no cover - return renderer.build_diff_table(self, baseline) + return renderer.build_diff_table(self, baseline, with_title=with_title) + def _metadata_panel( + self, baseline: EvaluationReport[InputsT, OutputT, MetadataT] | None = None + ) -> RenderableType | None: + """Wrap a table with an experiment metadata panel if metadata exists. + + Args: + table: The table to wrap + baseline: Optional baseline report for diff metadata + + Returns: + Either the table unchanged or a Group with Panel and Table + """ + if baseline is None: + # Single report - show metadata if present + if self.experiment_metadata: + metadata_text = Text() + items = list(self.experiment_metadata.items()) + for i, (key, value) in enumerate(items): + metadata_text.append(f'{key}: {value}', style='dim') + if i < len(items) - 1: + metadata_text.append('\n') + return Panel( + metadata_text, + title=f'Evaluation Summary: {self.name}', + title_align='left', + border_style='dim', + padding=(0, 1), + expand=False, + ) + else: + # Diff report - show metadata diff if either has metadata + if self.experiment_metadata or baseline.experiment_metadata: + diff_name = baseline.name if baseline.name == self.name else f'{baseline.name} → {self.name}' + metadata_text = Text() + lines_styles: list[tuple[str, str]] = [] + if baseline.experiment_metadata and self.experiment_metadata: + # Collect all keys from both + all_keys = sorted(set(baseline.experiment_metadata.keys()) | set(self.experiment_metadata.keys())) + for key in all_keys: + baseline_val = baseline.experiment_metadata.get(key) + report_val = self.experiment_metadata.get(key) + if baseline_val == report_val: + lines_styles.append((f'{key}: {report_val}', 'dim')) + elif baseline_val is None: + lines_styles.append((f'+ {key}: {report_val}', 'green')) + elif report_val is None: + lines_styles.append((f'- {key}: {baseline_val}', 'red')) + else: + lines_styles.append((f'{key}: {baseline_val} → {report_val}', 'yellow')) + elif self.experiment_metadata: + lines_styles = [(f'+ {k}: {v}', 'green') for k, v in self.experiment_metadata.items()] + else: # baseline.experiment_metadata only + assert baseline.experiment_metadata is not None + lines_styles = [(f'- {k}: {v}', 'red') for k, v in baseline.experiment_metadata.items()] + + for i, (line, style) in enumerate(lines_styles): + metadata_text.append(line, style=style) + if i < len(lines_styles) - 1: + metadata_text.append('\n') + + return Panel( + metadata_text, + title=f'Evaluation Diff: {diff_name}', + title_align='left', + border_style='dim', + padding=(0, 1), + expand=False, + ) + + return None + + # TODO(DavidM): in v2, change the return type here to RenderableType def failures_table( self, *, @@ -705,6 +788,7 @@ class ReportCaseRenderer: metric_renderers: Mapping[str, _NumberRenderer] duration_renderer: _NumberRenderer + # TODO(DavidM): in v2, change the return type here to RenderableType def build_base_table(self, title: str) -> Table: """Build and return a Rich Table for the diff output.""" table = Table(title=title, show_lines=True) @@ -731,6 +815,7 @@ def build_base_table(self, title: str) -> Table: table.add_column('Durations' if self.include_total_duration else 'Duration', justify='right') return table + # TODO(DavidM): in v2, change the return type here to RenderableType def 
build_failures_table(self, title: str) -> Table: """Build and return a Rich Table for the failures output.""" table = Table(title=title, show_lines=True) @@ -1190,9 +1275,22 @@ def _get_case_renderer( duration_renderer=duration_renderer, ) - def build_table(self, report: EvaluationReport) -> Table: + # TODO(DavidM): in v2, change the return type here to RenderableType + def build_table(self, report: EvaluationReport, *, with_title: bool = True) -> Table: + """Build a table for the report. + + Args: + report: The evaluation report to render + with_title: Whether to include the title in the table (default True) + + Returns: + A Rich Table object + """ case_renderer = self._get_case_renderer(report) - table = case_renderer.build_base_table(f'Evaluation Summary: {report.name}') + + title = f'Evaluation Summary: {report.name}' if with_title else '' + table = case_renderer.build_base_table(title) + for case in report.cases: table.add_row(*case_renderer.build_row(case)) @@ -1203,7 +1301,20 @@ def build_table(self, report: EvaluationReport) -> Table: return table - def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) -> Table: + # TODO(DavidM): in v2, change the return type here to RenderableType + def build_diff_table( + self, report: EvaluationReport, baseline: EvaluationReport, *, with_title: bool = True + ) -> Table: + """Build a diff table comparing report to baseline. + + Args: + report: The evaluation report to compare + baseline: The baseline report to compare against + with_title: Whether to include the title in the table (default True) + + Returns: + A Rich Table object + """ report_cases = report.cases baseline_cases = self._baseline_cases_to_include(report, baseline) @@ -1228,7 +1339,10 @@ def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) case_renderer = self._get_case_renderer(report, baseline) diff_name = baseline.name if baseline.name == report.name else f'{baseline.name} → {report.name}' - table = case_renderer.build_base_table(f'Evaluation Diff: {diff_name}') + + title = f'Evaluation Diff: {diff_name}' if with_title else '' + table = case_renderer.build_base_table(title) + for baseline_case, new_case in diff_cases: table.add_row(*case_renderer.build_diff_row(new_case, baseline_case)) for case in added_cases: @@ -1247,6 +1361,7 @@ def build_diff_table(self, report: EvaluationReport, baseline: EvaluationReport) return table + # TODO(DavidM): in v2, change the return type here to RenderableType def build_failures_table(self, report: EvaluationReport) -> Table: case_renderer = self._get_case_renderer(report) table = case_renderer.build_failures_table('Case Failures') diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 06b86065ca..225ab74460 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -950,3 +950,370 @@ async def test_evaluation_renderer_no_evaluator_failures_column(): │ test_case │ {'query': 'What is 2+2?'} │ {'answer': '4'} │ accuracy: 0.950 │ 0.100 │ └───────────┴───────────────────────────┴─────────────────┴─────────────────┴──────────┘ """) + + +async def test_evaluation_renderer_with_experiment_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer with experiment metadata.""" + report = EvaluationReport( + cases=[sample_report_case], + name='test_report', + experiment_metadata={'model': 'gpt-4o', 'temperature': 0.7, 'prompt_version': 'v2'}, + ) + + output = report.render( + include_input=True, + include_metadata=False, + 
include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=True, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Summary: test_report ─╮ +│ model: gpt-4o │ +│ temperature: 0.7 │ +│ prompt_version: v2 │ +╰───────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Inputs ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ {'query': 'What is 2+2?'} │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +├───────────┼───────────────────────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ +│ Averages │ │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ +└───────────┴───────────────────────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_with_long_experiment_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer with very long experiment metadata.""" + report = EvaluationReport( + cases=[sample_report_case], + name='test_report', + experiment_metadata={ + 'model': 'gpt-4o-2024-08-06', + 'temperature': 0.7, + 'prompt_version': 'v2.1.5', + 'system_prompt': 'You are a helpful assistant', + 'max_tokens': 1000, + 'top_p': 0.9, + 'frequency_penalty': 0.1, + 'presence_penalty': 0.1, + }, + ) + + output = report.render( + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Summary: test_report ──────────╮ +│ model: gpt-4o-2024-08-06 │ +│ temperature: 0.7 │ +│ prompt_version: v2.1.5 │ +│ system_prompt: You are a helpful assistant │ +│ max_tokens: 1000 │ +│ top_p: 0.9 │ +│ frequency_penalty: 0.1 │ +│ presence_penalty: 0.1 │ +╰────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_experiment_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table with experiment metadata.""" + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata={'model': 'gpt-4', 'temperature': 0.5}, + ) + + new_report = EvaluationReport( + 
cases=[sample_report_case], + name='new_report', + experiment_metadata={'model': 'gpt-4o', 'temperature': 0.7}, + ) + + output = new_report.render( + baseline=baseline_report, + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=True, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ model: gpt-4 → gpt-4o │ +│ temperature: 0.5 → 0.7 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +├───────────┼──────────────┼────────────────────────┼─────────────────┼────────────┼──────────┤ +│ Averages │ score1: 2.50 │ label1: {'hello': 1.0} │ accuracy: 0.950 │ 100.0% ✔ │ 100.0ms │ +└───────────┴──────────────┴────────────────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_only_new_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where only new report has metadata.""" + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata=None, # No metadata + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + experiment_metadata={'model': 'gpt-4o', 'temperature': 0.7}, + ) + + output = new_report.render( + baseline=baseline_report, + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ + model: gpt-4o │ +│ + temperature: 0.7 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_only_baseline_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where only baseline report has metadata.""" + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata={'model': 'gpt-4', 'temperature': 0.5}, + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + 
experiment_metadata=None, # No metadata + ) + + output = new_report.render( + baseline=baseline_report, + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_errors=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + ) + + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ - model: gpt-4 │ +│ - temperature: 0.5 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_same_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where both reports have the same metadata.""" + metadata = {'model': 'gpt-4o', 'temperature': 0.7} + + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata=metadata, + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + experiment_metadata=metadata, + ) + + output = new_report.render( + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + baseline=baseline_report, + include_errors=False, # Prevent failures table from being added + ) + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ model: gpt-4o │ +│ temperature: 0.7 │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") + + +async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where both reports have the same metadata.""" + + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + experiment_metadata={ + 'updated-key': 'original value', + 'preserved-key': 'preserved value', + 'old-key': 'old value', + }, + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + experiment_metadata={ + 'updated-key': 'updated value', + 'preserved-key': 'preserved value', + 'new-key': 'new value', + }, + ) + + output = new_report.render( + include_input=False, + 
include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + baseline=baseline_report, + include_errors=False, # Prevent failures table from being added + ) + assert output == snapshot("""\ +╭─ Evaluation Diff: baseline_report → new_report ─╮ +│ + new-key: new value │ +│ - old-key: old value │ +│ preserved-key: preserved value │ +│ updated-key: original value → updated value │ +╰─────────────────────────────────────────────────╯ +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") From 140ddf32590d6843b707839b33cdfbec1b2f7862 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 09:52:56 -0600 Subject: [PATCH 3/8] Add docs and tests, and fix some bugs --- docs/evals/how-to/metrics-attributes.md | 277 +++++++++++++++++++++-- pydantic_evals/pydantic_evals/dataset.py | 16 +- tests/evals/test_dataset.py | 20 ++ 3 files changed, 297 insertions(+), 16 deletions(-) diff --git a/docs/evals/how-to/metrics-attributes.md b/docs/evals/how-to/metrics-attributes.md index c1601953cc..167cbedd3e 100644 --- a/docs/evals/how-to/metrics-attributes.md +++ b/docs/evals/how-to/metrics-attributes.md @@ -418,34 +418,285 @@ class QualityEvaluator(Evaluator): ``` +## Experiment-Level Metadata + +In addition to case-level metadata, you can also pass experiment-level metadata when calling [`evaluate()`][pydantic_evals.Dataset.evaluate]: + +```python +from pydantic_evals import Case, Dataset + +dataset = Dataset( + cases=[ + Case( + inputs='test', + metadata={'difficulty': 'easy'}, # Case-level metadata + ) + ] +) + + +async def task(inputs: str) -> str: + return f'Result: {inputs}' + + +# Pass experiment-level metadata +async def main(): + report = await dataset.evaluate( + task, + metadata={ + 'model': 'gpt-4o', + 'prompt_version': 'v2.1', + 'temperature': 0.7, + }, + ) + + # Access experiment metadata in the report + print(report.experiment_metadata) + #> {'model': 'gpt-4o', 'prompt_version': 'v2.1', 'temperature': 0.7} +``` + +### When to Use Experiment Metadata + +Experiment metadata is useful for tracking configuration that applies to the entire evaluation run: + +- **Model configuration**: Model name, version, parameters +- **Prompt versioning**: Which prompt template was used +- **Infrastructure**: Deployment environment, region +- **Experiment context**: Developer name, feature branch, commit hash + +This metadata is especially valuable when: + +- Comparing multiple evaluation runs over time +- Tracking which configuration produced which results +- Reproducing evaluation results from historical data + +### Viewing in Reports + +Experiment metadata appears at the top of printed reports: + +```python +from pydantic_evals import Case, Dataset + +dataset = Dataset(cases=[Case(inputs='hello', expected_output='HELLO')]) + + +async def 
task(text: str) -> str: + return text.upper() + +async def main(): + report = await dataset.evaluate( + task, + metadata={'model': 'gpt-4o', 'version': 'v1.0'}, + ) + + print(report.render()) + """ + ╭─ Evaluation Summary: task ─╮ + │ model: gpt-4o │ + │ version: v1.0 │ + ╰────────────────────────────╯ + ┏━━━━━━━━━━┳━━━━━━━━━━┓ + ┃ Case ID ┃ Duration ┃ + ┡━━━━━━━━━━╇━━━━━━━━━━┩ + │ Case 1 │ 10ms │ + ├──────────┼──────────┤ + │ Averages │ 10ms │ + └──────────┴──────────┘ + """ +``` + +## Synchronization between Tasks and Experiment Metadata + +Experiment metadata is for *recording* configuration, not *configuring* the task. +The metadata dict doesn't automatically configure your task's behavior; you must ensure the values in the metadata dict match what your task actually uses. +For example, it's easy to accidentally have metadata claim `temperature: 0.7` while your task actually uses `temperature: 1.0`, leading to incorrect experiment tracking and unreproducible results. + +To avoid this problem, we recommend establishing a single source of truth for configuration that both your task and metadata reference. +Below are a few suggested patterns for achieving this synchronization. + +### Pattern 1: Shared Module Constants + +For simpler cases, use module-level constants: + +```python +from pydantic_ai import Agent +from pydantic_evals import Case, Dataset + +# Module constants as single source of truth +MODEL_NAME = 'openai:gpt-5-mini' +TEMPERATURE = 0.7 +SYSTEM_PROMPT = 'You are a helpful assistant.' + +agent = Agent(MODEL_NAME, model_settings={'temperature': TEMPERATURE}, system_prompt=SYSTEM_PROMPT) + + +async def task(inputs: str) -> str: + result = await agent.run(inputs) + return result.output + + +async def main(): + dataset = Dataset(cases=[Case(inputs='What is the capital of France?')]) + + # Metadata references same constants + await dataset.evaluate( + task, + metadata={ + 'model': MODEL_NAME, + 'temperature': TEMPERATURE, + 'system_prompt': SYSTEM_PROMPT, + }, + ) +``` + +### Pattern 2: Configuration Object (Recommended) + +Define configuration once and use it everywhere: + +```python +from dataclasses import asdict, dataclass + +from pydantic_ai import Agent +from pydantic_evals import Case, Dataset + + +@dataclass +class TaskConfig: + """Single source of truth for task configuration. + + Includes all variables you'd like to see in experiment metadata. 
+ """ + + model: str + temperature: float + max_tokens: int + prompt_version: str + + +# Define configuration once +config = TaskConfig( + model='openai:gpt-5-mini', + temperature=0.7, + max_tokens=500, + prompt_version='v2.1', +) + +# Use config in task +agent = Agent( + config.model, + model_settings={'temperature': config.temperature, 'max_tokens': config.max_tokens}, +) + + +async def task(inputs: str) -> str: + """Task uses the same config that's recorded in metadata.""" + result = await agent.run(inputs) + return result.output + + +# Evaluate with metadata derived from the same config +async def main(): + dataset = Dataset(cases=[Case(inputs='What is the capital of France?')]) + + report = await dataset.evaluate( + task, + metadata=asdict(config), # Guaranteed to match task behavior + ) + + print(report.experiment_metadata) + """ + { + 'model': 'openai:gpt-5-mini', + 'temperature': 0.7, + 'max_tokens': 500, + 'prompt_version': 'v2.1', + } + """ +``` + +If it's problematic to have a global task configuration, you can also create your `TaskConfig` object at the task +call-site and pass it to the agent via `deps` or similar, but in this case you would still need to guarantee that the +value is always the same as the value passed to `metadata` in the call to `Dataset.evaluate`. + +### Anti-Pattern: Duplicate Configuration + +**Avoid this common mistake**: + +```python +from pydantic_ai import Agent +from pydantic_evals import Case, Dataset + +# ❌ BAD: Configuration defined in multiple places +agent = Agent('openai:gpt-5-mini', model_settings={'temperature': 0.7}) + + +async def task(inputs: str) -> str: + result = await agent.run(inputs) + return result.output + + +async def main(): + dataset = Dataset(cases=[Case(inputs='test')]) + + # ❌ BAD: Metadata manually typed - easy to get out of sync + await dataset.evaluate( + task, + metadata={ + 'model': 'openai:gpt-5-mini', # Duplicated! Could diverge from agent definition + 'temperature': 0.8, # ⚠️ WRONG! Task actually uses 0.7 + }, + ) +``` + +In this anti-pattern, the metadata claims `temperature: 0.8` but the task uses `0.7`. 
This leads to: + +- Incorrect experiment tracking +- Inability to reproduce results +- Confusion when comparing runs +- Wasted time debugging "why results differ" + ## Metrics vs Attributes vs Metadata Understanding the differences: -| Feature | Metrics | Attributes | Metadata | -|---------|---------|------------|----------| -| **Set in** | Task execution | Task execution | Case definition | -| **Type** | int, float | Any | Any | -| **Purpose** | Quantitative | Qualitative | Test data | -| **Used for** | Aggregation | Context | Input to task | -| **Available to** | Evaluators | Evaluators | Task & Evaluators | +| Feature | Metrics | Attributes | Case Metadata | Experiment Metadata | +|---------|---------|------------|---------------|---------------------| +| **Set in** | Task execution | Task execution | Case definition | `evaluate()` call | +| **Type** | int, float | Any | Any | Any | +| **Purpose** | Quantitative | Qualitative | Test data | Experiment config | +| **Used for** | Aggregation | Context | Input to task | Tracking runs | +| **Available to** | Evaluators | Evaluators | Task & Evaluators | Report only | +| **Scope** | Per case | Per case | Per case | Per experiment | ```python -from pydantic_evals import Case, increment_eval_metric, set_eval_attribute +from pydantic_evals import Case, Dataset, increment_eval_metric, set_eval_attribute -# Metadata: Defined in case (before execution) -Case( +# Case Metadata: Defined in case (before execution) +case = Case( inputs='question', - metadata={'difficulty': 'hard', 'category': 'math'}, + metadata={'difficulty': 'hard', 'category': 'math'}, # Per-case metadata ) +dataset = Dataset(cases=[case]) + # Metrics & Attributes: Recorded during execution -def task(inputs): - # These are recorded during execution +async def task(inputs): + # These are recorded during execution for each case increment_eval_metric('tokens', 100) set_eval_attribute('model', 'gpt-4o') return f'Result: {inputs}' + + +async def main(): + # Experiment Metadata: Defined at evaluation time + await dataset.evaluate( + task, + metadata={ # Experiment-level metadata + 'prompt_version': 'v2.1', + 'temperature': 0.7, + }, + ) ``` ## Troubleshooting diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 01caf0ce86..479f3c27c4 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -286,7 +286,7 @@ async def evaluate( task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. - tags: Optional sequence of logfire tags. + tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. @@ -308,7 +308,6 @@ async def evaluate( task_name=task_name, dataset_name=self.name, n_cases=len(self.cases), - metadata=metadata, **extra_attributes, _tags=tags, ) as eval_span, @@ -372,6 +371,10 @@ def evaluate_sync( progress: bool = True, retry_task: RetryConfig | None = None, retry_evaluators: RetryConfig | None = None, + *, + task_name: str | None = None, + metadata: dict[str, Any] | None = None, + tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -387,6 +390,10 @@ def evaluate_sync( progress: Whether to show a progress bar for the evaluation. Defaults to True. retry_task: Optional retry configuration for the task execution. 
retry_evaluators: Optional retry configuration for evaluator execution. + task_name: Optional override to the name of the task being executed, otherwise the name of the task + function will be used. + metadata: Optional dict of experiment metadata. + tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. @@ -394,11 +401,14 @@ def evaluate_sync( return get_event_loop().run_until_complete( self.evaluate( task, - task_name=name, + name=name, max_concurrency=max_concurrency, progress=progress, retry_task=retry_task, retry_evaluators=retry_evaluators, + task_name=task_name, + metadata=metadata, + tags=tags, ) ) diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index 31f91bfc0b..72f5af648f 100644 --- a/tests/evals/test_dataset.py +++ b/tests/evals/test_dataset.py @@ -1750,3 +1750,23 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: ), ] ) + + +async def test_evaluate_with_experiment_metadata(example_dataset: Dataset[TaskInput, TaskOutput, TaskMetadata]): + """Test that experiment metadata passed to evaluate() appears in the report.""" + + async def task(inputs: TaskInput) -> TaskOutput: + return TaskOutput(answer=inputs.query.upper()) + + # Pass experiment metadata to evaluate() + experiment_metadata = { + 'model': 'gpt-4o', + 'prompt_version': 'v2.1', + 'temperature': 0.7, + 'max_tokens': 1000, + } + + report = await example_dataset.evaluate(task, metadata=experiment_metadata) + + # Verify that the report contains the experiment metadata + assert report.experiment_metadata == experiment_metadata From 6decf5f09ce5760b91b97fc38f0ff76d26de4925 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:18:11 -0600 Subject: [PATCH 4/8] Remove tags and fix test coverage --- pydantic_evals/pydantic_evals/dataset.py | 8 ----- tests/evals/test_reporting.py | 45 ++++++++++++++++++++++++ 2 files changed, 45 insertions(+), 8 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 479f3c27c4..6e973fbaf0 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -266,7 +266,6 @@ async def evaluate( *, task_name: str | None = None, metadata: dict[str, Any] | None = None, - tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -286,7 +285,6 @@ async def evaluate( task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. - tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. 
@@ -309,7 +307,6 @@ async def evaluate( dataset_name=self.name, n_cases=len(self.cases), **extra_attributes, - _tags=tags, ) as eval_span, progress_bar or nullcontext(), ): @@ -354,8 +351,6 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)} if metadata is not None: full_experiment_metadata['metadata'] = metadata - if tags is not None: - full_experiment_metadata['tags'] = tags if (averages := report.averages()) is not None: full_experiment_metadata['averages'] = averages if averages.assertions is not None: @@ -374,7 +369,6 @@ def evaluate_sync( *, task_name: str | None = None, metadata: dict[str, Any] | None = None, - tags: Sequence[str] | None = None, ) -> EvaluationReport[InputsT, OutputT, MetadataT]: """Evaluates the test cases in the dataset using the given task. @@ -393,7 +387,6 @@ def evaluate_sync( task_name: Optional override to the name of the task being executed, otherwise the name of the task function will be used. metadata: Optional dict of experiment metadata. - tags: Optional sequence of tags to add to the experiment span. Returns: A report containing the results of the evaluation. @@ -408,7 +401,6 @@ def evaluate_sync( retry_evaluators=retry_evaluators, task_name=task_name, metadata=metadata, - tags=tags, ) ) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 225ab74460..58a2b399f4 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -1317,3 +1317,48 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case │ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ └───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ """) + + +async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: ReportCase): + """Test EvaluationRenderer diff table where both reports have the same metadata.""" + + baseline_report = EvaluationReport( + cases=[sample_report_case], + name='baseline_report', + ) + + new_report = EvaluationReport( + cases=[sample_report_case], + name='new_report', + ) + + output = new_report.render( + include_input=False, + include_metadata=False, + include_expected_output=False, + include_output=False, + include_durations=True, + include_total_duration=False, + include_removed_cases=False, + include_averages=False, + include_error_stacktrace=False, + include_evaluator_failures=True, + input_config={}, + metadata_config={}, + output_config={}, + score_configs={}, + label_configs={}, + metric_configs={}, + duration_config={}, + include_reasons=False, + baseline=baseline_report, + include_errors=False, # Prevent failures table from being added + ) + assert output == snapshot("""\ + Evaluation Diff: baseline_report → new_report +┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ +┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ +┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ +│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔ │ 100.0ms │ +└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘ +""") From 669fab8478d41003d915cde9fbe9e8826fd19cc8 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 11:20:34 -0600 Subject: [PATCH 5/8] Update tests to show impact on attributes of setting experiment metadata --- 
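Note: a minimal sketch of the call pattern this test exercises — the dataset and task below are illustrative stand-ins, not the actual test fixtures. A `metadata` dict passed to `Dataset.evaluate()` is recorded as a `metadata` attribute on the experiment span, echoed inside `logfire.experiment.metadata` alongside `n_cases` and the averages, and surfaced on the report as `experiment_metadata`.

```python
from pydantic_evals import Case, Dataset

# Hypothetical single-case dataset, used only to illustrate the call pattern.
dataset = Dataset(cases=[Case(inputs='What is the capital of France?', expected_output='Paris')])


async def answer(inputs: str) -> str:
    return 'Paris'


async def main():
    # The metadata dict is attached to the experiment span and exposed on the report.
    report = await dataset.evaluate(answer, metadata={'key': 'value'})
    print(report.experiment_metadata)
    #> {'key': 'value'}
```
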
tests/evals/test_dataset.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py index 72f5af648f..ca5f90d44a 100644 --- a/tests/evals/test_dataset.py +++ b/tests/evals/test_dataset.py @@ -1530,7 +1530,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: return TaskOutput(answer='Paris') return TaskOutput(answer='Unknown') # pragma: no cover - await example_dataset.evaluate(mock_async_task) + await example_dataset.evaluate(mock_async_task, metadata={'key': 'value'}) spans = capfire.exporter.exported_spans_as_dict(parse_json_attributes=True) spans.sort(key=lambda s: s['start_time']) @@ -1556,6 +1556,7 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'gen_ai.operation.name': {}, 'n_cases': {}, 'name': {}, + 'metadata': {'type': 'object'}, 'logfire.experiment.metadata': { 'type': 'object', 'properties': { @@ -1571,11 +1572,13 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput: 'type': 'object', }, 'logfire.msg': 'evaluate mock_async_task', + 'metadata': {'key': 'value'}, 'logfire.msg_template': 'evaluate {name}', 'logfire.span_type': 'span', 'n_cases': 2, 'logfire.experiment.metadata': { 'n_cases': 2, + 'metadata': {'key': 'value'}, 'averages': { 'name': 'Averages', 'scores': {'confidence': 1.0}, From 09a9b610ffeb6018aa35a9781a058c883e0e17c5 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 15:56:21 -0600 Subject: [PATCH 6/8] Fix failing test --- tests/evals/test_reporting.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/evals/test_reporting.py b/tests/evals/test_reporting.py index 58a2b399f4..8e575e4bfc 100644 --- a/tests/evals/test_reporting.py +++ b/tests/evals/test_reporting.py @@ -1355,7 +1355,7 @@ async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: Rep include_errors=False, # Prevent failures table from being added ) assert output == snapshot("""\ - Evaluation Diff: baseline_report → new_report + Evaluation Diff: baseline_report → new_report \n\ ┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓ ┃ Case ID ┃ Scores ┃ Labels ┃ Metrics ┃ Assertions ┃ Duration ┃ ┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩ From 7c747739b12dcdaa1ff1e412503b3f7d1d67f196 Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Mon, 27 Oct 2025 16:13:05 -0600 Subject: [PATCH 7/8] Fix docstring of evaluate_sync --- pydantic_evals/pydantic_evals/dataset.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py index 6e973fbaf0..28f0f2a1a5 100644 --- a/pydantic_evals/pydantic_evals/dataset.py +++ b/pydantic_evals/pydantic_evals/dataset.py @@ -377,11 +377,11 @@ def evaluate_sync( Args: task: The task to evaluate. This should be a callable that takes the inputs of the case and returns the output. - name: The name of the task being evaluated, this is used to identify the task in the report. - If omitted, the name of the task function will be used. + name: The name of the experiment being run, this is used to identify the experiment in the report. + If omitted, the task_name will be used; if that is not specified, the name of the task function is used. max_concurrency: The maximum number of concurrent evaluations of the task to allow. 
If None, all cases will be evaluated concurrently. - progress: Whether to show a progress bar for the evaluation. Defaults to True. + progress: Whether to show a progress bar for the evaluation. Defaults to `True`. retry_task: Optional retry configuration for the task execution. retry_evaluators: Optional retry configuration for evaluator execution. task_name: Optional override to the name of the task being executed, otherwise the name of the task From 2a0bf33d6cb199257527020afc08e9257a19d62f Mon Sep 17 00:00:00 2001 From: David Montague <35119617+dmontagu@users.noreply.github.com> Date: Tue, 28 Oct 2025 00:33:13 -0600 Subject: [PATCH 8/8] Fix coverage --- pydantic_evals/pydantic_evals/reporting/__init__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pydantic_evals/pydantic_evals/reporting/__init__.py b/pydantic_evals/pydantic_evals/reporting/__init__.py index 8070c760d3..622e70970f 100644 --- a/pydantic_evals/pydantic_evals/reporting/__init__.py +++ b/pydantic_evals/pydantic_evals/reporting/__init__.py @@ -234,7 +234,7 @@ def render( metric_configs: dict[str, RenderNumberConfig] | None = None, duration_config: RenderNumberConfig | None = None, include_reasons: bool = False, - ) -> str: # pragma: no cover + ) -> str: """Render this report to a nicely-formatted string, optionally comparing it to a baseline report. If you want more control over the output, use `console_table` instead and pass it to `rich.Console.print`. @@ -390,7 +390,7 @@ def console_table( ) if baseline is None: return renderer.build_table(self, with_title=with_title) - else: # pragma: no cover + else: return renderer.build_diff_table(self, baseline, with_title=with_title) def _metadata_panel(