diff --git a/pydantic_evals/pydantic_evals/dataset.py b/pydantic_evals/pydantic_evals/dataset.py
index 98c703b048..26164ec5eb 100644
--- a/pydantic_evals/pydantic_evals/dataset.py
+++ b/pydantic_evals/pydantic_evals/dataset.py
@@ -324,6 +324,8 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
                 trace_id=trace_id,
             )
             if (averages := report.averages()) is not None and averages.assertions is not None:
+                experiment_metadata = {'n_cases': len(self.cases), 'averages': averages}
+                eval_span.set_attribute('experiment.metadata', experiment_metadata)
                 eval_span.set_attribute('assertion_pass_rate', averages.assertions)
         return report

diff --git a/tests/evals/test_dataset.py b/tests/evals/test_dataset.py
index d94b8f9825..eddbdcf5dd 100644
--- a/tests/evals/test_dataset.py
+++ b/tests/evals/test_dataset.py
@@ -1496,10 +1496,36 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput:
                 'assertion_pass_rate': 1.0,
                 'logfire.msg_template': 'evaluate {name}',
                 'logfire.msg': 'evaluate mock_async_task',
+                'experiment.metadata': {
+                    'n_cases': 2,
+                    'averages': {
+                        'name': 'Averages',
+                        'scores': {'confidence': 1.0},
+                        'labels': {},
+                        'metrics': {},
+                        'assertions': 1.0,
+                        'task_duration': 1.0,
+                        'total_duration': 9.0,
+                    },
+                },
                 'logfire.span_type': 'span',
                 'logfire.json_schema': {
                     'type': 'object',
-                    'properties': {'name': {}, 'n_cases': {}, 'assertion_pass_rate': {}},
+                    'properties': {
+                        'name': {},
+                        'n_cases': {},
+                        'experiment.metadata': {
+                            'type': 'object',
+                            'properties': {
+                                'averages': {
+                                    'type': 'object',
+                                    'title': 'ReportCaseAggregate',
+                                    'x-python-datatype': 'PydanticModel',
+                                }
+                            },
+                        },
+                        'assertion_pass_rate': {},
+                    },
                 },
             },
         ),