
Commit 6decf5f
Parent: 24dd5d0

Remove tags and fix test coverage

2 files changed: +45 −8 lines


pydantic_evals/pydantic_evals/dataset.py

Lines changed: 0 additions & 8 deletions
@@ -266,7 +266,6 @@ async def evaluate(
         *,
         task_name: str | None = None,
         metadata: dict[str, Any] | None = None,
-        tags: Sequence[str] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -286,7 +285,6 @@ async def evaluate(
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
             metadata: Optional dict of experiment metadata.
-            tags: Optional sequence of tags to add to the experiment span.

         Returns:
             A report containing the results of the evaluation.
@@ -309,7 +307,6 @@ async def evaluate(
                 dataset_name=self.name,
                 n_cases=len(self.cases),
                 **extra_attributes,
-                _tags=tags,
             ) as eval_span,
             progress_bar or nullcontext(),
         ):
@@ -354,8 +351,6 @@ async def _handle_case(case: Case[InputsT, OutputT, MetadataT], report_case_name
         full_experiment_metadata: dict[str, Any] = {'n_cases': len(self.cases)}
         if metadata is not None:
             full_experiment_metadata['metadata'] = metadata
-        if tags is not None:
-            full_experiment_metadata['tags'] = tags
         if (averages := report.averages()) is not None:
             full_experiment_metadata['averages'] = averages
             if averages.assertions is not None:
@@ -374,7 +369,6 @@ def evaluate_sync(
         *,
         task_name: str | None = None,
         metadata: dict[str, Any] | None = None,
-        tags: Sequence[str] | None = None,
     ) -> EvaluationReport[InputsT, OutputT, MetadataT]:
         """Evaluates the test cases in the dataset using the given task.

@@ -393,7 +387,6 @@ def evaluate_sync(
             task_name: Optional override to the name of the task being executed, otherwise the name of the task
                 function will be used.
             metadata: Optional dict of experiment metadata.
-            tags: Optional sequence of tags to add to the experiment span.

         Returns:
             A report containing the results of the evaluation.
@@ -408,7 +401,6 @@ def evaluate_sync(
                 retry_evaluators=retry_evaluators,
                 task_name=task_name,
                 metadata=metadata,
-                tags=tags,
             )
         )
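Call sites that still pass tags= to evaluate or evaluate_sync will now fail with a TypeError, and tag values are no longer attached to the experiment span or copied into the experiment metadata. A minimal migration sketch follows: the one-case dataset and upper_task below are invented for illustration, and folding tags into the still-supported metadata dict is one workaround rather than anything this commit prescribes.

    from pydantic_evals import Case, Dataset

    # Hypothetical one-case dataset, for illustration only.
    dataset = Dataset(cases=[Case(name='smoke', inputs='hello', expected_output='HELLO')])

    async def upper_task(inputs: str) -> str:
        return inputs.upper()

    # Before this commit:  dataset.evaluate_sync(upper_task, tags=['nightly'])
    # After it, the tags keyword is rejected; carrying the same values in
    # the experiment metadata dict is one option (an assumption, not part
    # of this commit):
    report = dataset.evaluate_sync(upper_task, metadata={'tags': ['nightly']})
    print(report.averages())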

tests/evals/test_reporting.py

Lines changed: 45 additions & 0 deletions
@@ -1317,3 +1317,48 @@ async def test_evaluation_renderer_diff_with_changed_metadata(sample_report_case
 │ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔          │ 100.0ms  │
 └───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘
 """)
+
+
+async def test_evaluation_renderer_diff_with_no_metadata(sample_report_case: ReportCase):
+    """Test EvaluationRenderer diff table where neither report has metadata."""
+
+    baseline_report = EvaluationReport(
+        cases=[sample_report_case],
+        name='baseline_report',
+    )
+
+    new_report = EvaluationReport(
+        cases=[sample_report_case],
+        name='new_report',
+    )
+
+    output = new_report.render(
+        include_input=False,
+        include_metadata=False,
+        include_expected_output=False,
+        include_output=False,
+        include_durations=True,
+        include_total_duration=False,
+        include_removed_cases=False,
+        include_averages=False,
+        include_error_stacktrace=False,
+        include_evaluator_failures=True,
+        input_config={},
+        metadata_config={},
+        output_config={},
+        score_configs={},
+        label_configs={},
+        metric_configs={},
+        duration_config={},
+        include_reasons=False,
+        baseline=baseline_report,
+        include_errors=False,  # Prevent failures table from being added
+    )
+    assert output == snapshot("""\
+Evaluation Diff: baseline_report → new_report
+┏━━━━━━━━━━━┳━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━┳━━━━━━━━━━┓
+┃ Case ID   ┃ Scores       ┃ Labels        ┃ Metrics         ┃ Assertions ┃ Duration ┃
+┡━━━━━━━━━━━╇━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━╇━━━━━━━━━━┩
+│ test_case │ score1: 2.50 │ label1: hello │ accuracy: 0.950 │ ✔          │ 100.0ms  │
+└───────────┴──────────────┴───────────────┴─────────────────┴────────────┴──────────┘
+""")
