Skip to content

Commit e0e3798

Browse files
DouweM and dmontagu authored
Remove cases and averages from eval span (#2715)
Co-authored-by: David Montague <[email protected]>
1 parent: 996781d · commit: e0e3798

File tree

2 files changed

+9
-90
lines changed

2 files changed

+9
-90
lines changed

pydantic_evals/pydantic_evals/dataset.py

Lines changed: 6 additions & 10 deletions
Original file line number | Diff line number | Diff line change
@@ -282,7 +282,10 @@ async def evaluate(
282282

283283
limiter = anyio.Semaphore(max_concurrency) if max_concurrency is not None else AsyncExitStack()
284284

285-
with _logfire.span('evaluate {name}', name=name) as eval_span, progress_bar or nullcontext():
285+
with (
286+
_logfire.span('evaluate {name}', name=name, n_cases=len(self.cases)) as eval_span,
287+
progress_bar or nullcontext(),
288+
):
286289
task_id = progress_bar.add_task(f'Evaluating {name}', total=total_cases) if progress_bar else None
287290

288291
async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name: str):
@@ -320,15 +323,8 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
320323
span_id=span_id,
321324
trace_id=trace_id,
322325
)
323-
# TODO(DavidM): Address the following TODOs before V1...
324-
# TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
325-
eval_span.set_attribute('cases', _REPORT_CASES_ADAPTER.dump_python(report.cases))
326-
# TODO(DavidM): This attribute will be too big in general; remove it once we can use child spans in details panel:
327-
eval_span.set_attribute('failures', _REPORT_CASE_FAILURES_ADAPTER.dump_python(report.failures))
328-
# TODO(DavidM): Remove this 'averages' attribute once we compute it in the details panel
329-
averages = report.averages()
330-
if averages:
331-
eval_span.set_attribute('averages', _REPORT_CASE_AGGREGATE_ADAPTER.dump_python(averages))
326+
if (averages := report.averages()) is not None and averages.assertions is not None:
327+
eval_span.set_attribute('assertion_pass_rate', averages.assertions)
332328
return report
333329

334330
def evaluate_sync(

tests/evals/test_dataset.py

Lines changed: 3 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -1482,91 +1482,14 @@ async def mock_async_task(inputs: TaskInput) -> TaskOutput:
14821482
'evaluate {name}',
14831483
{
14841484
'name': 'mock_async_task',
1485+
'n_cases': 2,
1486+
'assertion_pass_rate': 1.0,
14851487
'logfire.msg_template': 'evaluate {name}',
14861488
'logfire.msg': 'evaluate mock_async_task',
14871489
'logfire.span_type': 'span',
1488-
'cases': [
1489-
{
1490-
'name': 'case1',
1491-
'inputs': {'query': 'What is 2+2?'},
1492-
'metadata': {'difficulty': 'easy', 'category': 'general'},
1493-
'expected_output': {'answer': '4', 'confidence': 1.0},
1494-
'output': {'answer': '4', 'confidence': 1.0},
1495-
'metrics': {},
1496-
'attributes': {},
1497-
'scores': {
1498-
'confidence': {
1499-
'name': 'confidence',
1500-
'value': 1.0,
1501-
'reason': None,
1502-
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1503-
}
1504-
},
1505-
'labels': {},
1506-
'assertions': {
1507-
'correct': {
1508-
'name': 'correct',
1509-
'value': True,
1510-
'reason': None,
1511-
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1512-
}
1513-
},
1514-
'task_duration': 1.0,
1515-
'total_duration': 10.0,
1516-
'trace_id': '00000000000000000000000000000001',
1517-
'span_id': '0000000000000003',
1518-
'evaluator_failures': [],
1519-
},
1520-
{
1521-
'name': 'case2',
1522-
'inputs': {'query': 'What is the capital of France?'},
1523-
'metadata': {'difficulty': 'medium', 'category': 'geography'},
1524-
'expected_output': {'answer': 'Paris', 'confidence': 1.0},
1525-
'output': {'answer': 'Paris', 'confidence': 1.0},
1526-
'metrics': {},
1527-
'attributes': {},
1528-
'scores': {
1529-
'confidence': {
1530-
'name': 'confidence',
1531-
'value': 1.0,
1532-
'reason': None,
1533-
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1534-
}
1535-
},
1536-
'labels': {},
1537-
'assertions': {
1538-
'correct': {
1539-
'name': 'correct',
1540-
'value': True,
1541-
'reason': None,
1542-
'source': {'name': 'SimpleEvaluator', 'arguments': None},
1543-
}
1544-
},
1545-
'task_duration': 1.0,
1546-
'total_duration': 8.0,
1547-
'trace_id': '00000000000000000000000000000001',
1548-
'span_id': '0000000000000007',
1549-
'evaluator_failures': [],
1550-
},
1551-
],
1552-
'failures': [],
1553-
'averages': {
1554-
'name': 'Averages',
1555-
'scores': {'confidence': 1.0},
1556-
'labels': {},
1557-
'metrics': {},
1558-
'assertions': 1.0,
1559-
'task_duration': 1.0,
1560-
'total_duration': 9.0,
1561-
},
15621490
'logfire.json_schema': {
15631491
'type': 'object',
1564-
'properties': {
1565-
'name': {},
1566-
'cases': {'type': 'array'},
1567-
'failures': {'type': 'array'},
1568-
'averages': {'type': 'object'},
1569-
},
1492+
'properties': {'name': {}, 'n_cases': {}, 'assertion_pass_rate': {}},
15701493
},
15711494
},
15721495
),

0 commit comments

Comments
 (0)