
Commit 6decf5f
Parent: 24dd5d0

Remove tags and fix test coverage

2 files changed: +45 −8 lines


pydantic_evals/pydantic_evals/dataset.py

Lines changed: 0 additions & 8 deletions
@@ -266,7 +266,6 @@ async def evaluate(
         *,
         task_name: str | None = None,
         metadata: dict[str, Any] | None = None,
-        tags: Sequence[str] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -286,7 +285,6 @@ async def evaluate(
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
             metadata: Optional dict of experiment metadata.
-            tags: Optional sequence of tags to add to the experiment span.

         Returns:
             A report containing the results of the evaluation.
@@ -309,7 +307,6 @@ async def evaluate(
                 dataset_name=self.name,
                 n_cases=len(self.cases),
                 **extra_attributes,
-                _tags=tags,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):
@@ -354,8 +351,6 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
         full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
         if metadata is not None:
             full_experiment_metadata['metadata'] = metadata
-        if tags is not None:
-            full_experiment_metadata['tags'] = tags
         if (averages := report.averages()) is not None:
             full_experiment_metadata['averages'] = averages
             if averages.assertions is not None:
@@ -374,7 +369,6 @@ def evaluate_sync(
         *,
         task_name: str | None = None,
         metadata: dict[str, Any] | None = None,
-        tags: Sequence[str] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -393,7 +387,6 @@ def evaluate_sync(
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
             metadata: Optional dict of experiment metadata.
-            tags: Optional sequence of tags to add to the experiment span.

         Returns:
             A report containing the results of the evaluation.
@@ -408,7 +401,6 @@ def evaluate_sync(
                 retry_evaluators=retry_evaluators,
                 task_name=task_name,
                 metadata=metadata,
-                tags=tags,
             )
         )
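Call sites that still pass tags= to evaluate or evaluate_sync will now fail with a TypeError, and tag values are no longer attached to the experiment span or copied into the experiment metadata. A minimal migration sketch follows: the one-case dataset and upper_task below are invented for illustration, and folding tags into the still-supported metadata dict is one workaround rather than anything this commit prescribes.

    from pydantic_evals import Case, Dataset

    # Hypothetical one-case dataset, for illustration only.
    dataset = Dataset(cases=[Case(name='smoke', inputs='hello', expected_output='HELLO')])

    async def upper_task(inputs: str) -> str:
        return inputs.upper()

    # Before this commit:  dataset.evaluate_sync(upper_task, tags=['nightly'])
    # After it, the tags keyword is rejected; carrying the same values in
    # the experiment metadata dict is one option (an assumption, not part
    # of this commit):
    report = dataset.evaluate_sync(upper_task, metadata={'tags': ['nightly']})
    print(report.averages())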

tests/evals/test_reporting.py

Lines changed: 45 additions & 0 deletions
@@ -1317,3 +1317,48 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case
 │ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔          │ 100.0ms  │
 └───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘
 """)
+
+
+async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: ReportCase):
+    """Test EvaluationRenderer diff table where neither report has metadata."""
+
+    baseline_report = EvaluationReport(
+        cases=[sample_report_case],
+        name='baseline_report',
+    )
+
+    new_report = EvaluationReport(
+        cases=[sample_report_case],
+        name='new_report',
+    )
+
+    output = new_report.render(
+        include_input=False,
+        include_metadata=False,
+        include_expected_output=False,
+        include_output=False,
+        include_durations=True,
+        include_total_duration=False,
+        include_removed_cases=False,
+        include_averages=False,
+        include_error_stacktrace=False,
+        include_evaluator_failures=True,
+        input_config={},
+        metadata_config={},
+        output_config={},
+        score_configs={},
+        label_configs={},
+        metric_configs={},
+        duration_config={},
+        include_reasons=False,
+        baseline=baseline_report,
+        include_errors=False,  # Prevent failures table from being added
+    )
+    assert output == snapshot("""\
+Evaluation Diff: baseline_report → new_report
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
+┃ Case ID   ┃ Scores       ┃ Labels        ┃ Metrics         ┃ Assertions ┃ Duration ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
+│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔          │ 100.0ms  │
+└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘
+""")
