Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions Makefile
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
.PHONY: install pre-commit test help
.PHONY: install lint test help

install:
@echo "Installing from source..."
pip install -e src/[dev]

pre-commit:
lint:
@echo "Running pre-commit..."
pre-commit install
pre-commit run --all
Expand All @@ -19,6 +19,6 @@ test:
help:
@echo "Available targets:"
@echo " install Install from source with developer tools."
@echo " pre-commit Run pre-commit."
@echo " lint Run pre-commit."
@echo " test Run tests."
@echo " help Show this help message."
12 changes: 3 additions & 9 deletions src/valor_lite/classification/evaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -545,16 +545,12 @@ def iterate_values_with_tables(self, datums: pc.Expression | None = None):
matches = tbl["match"].to_numpy()
yield ids, scores, winners, matches, tbl

def compute_rocauc(
self, datums: pc.Expression | None = None
) -> dict[MetricType, list[Metric]]:
def compute_rocauc(self) -> dict[MetricType, list[Metric]]:
"""
Compute ROCAUC.

Parameters
----------
datums : pyarrow.compute.Expression, optional
Option to filter datums by an expression.
This function does not support direct filtering. To perform evaluation over a filtered
set you must first create a new evaluator using `Evaluator.filter`.

Returns
-------
Expand All @@ -567,7 +563,6 @@ def compute_rocauc(
label_counts = extract_groundtruth_count_per_label(
reader=self._reader,
number_of_labels=len(self._index_to_label),
datums=datums,
)

prev = np.zeros((n_labels, 2), dtype=np.uint64)
Expand All @@ -577,7 +572,6 @@ def compute_rocauc(
"cumulative_fp",
"cumulative_tp",
],
filter=datums,
):
rocauc, prev = compute_rocauc(
rocauc=rocauc,
Expand Down
46 changes: 46 additions & 0 deletions tests/classification/test_filtering.py
Original file line number Diff line number Diff line change
Expand Up @@ -636,3 +636,49 @@ def test_filtering_six_classifications_inline(
assert m in expected_metrics
for m in expected_metrics:
assert m in actual_metrics


def test_filtering_remove_all(
    loader: Loader,
    six_classifications: list[Classification],
    tmp_path: Path,
):
    """Evaluate with a filter expression that matches no datums.

    Every compute path that accepts `datums` should handle an empty
    selection gracefully (zeroed counts, empty examples) rather than
    raising, while `compute_rocauc` — which no longer accepts a
    `datums` kwarg — should raise a TypeError.
    """
    loader.add_data(six_classifications)
    evaluator = loader.finalize()

    # Expression that matches no rows in the dataset.
    datums = pc.field("datum_uid") == "does_not_exist"

    # test evaluation
    base_metrics = evaluator.compute_precision_recall(datums=datums)
    # Direct filtering was removed from `compute_rocauc`; filtered
    # evaluation must go through `Evaluator.filter` instead.
    with pytest.raises(TypeError) as e:
        evaluator.compute_rocauc(datums=datums)  # type: ignore - testing
    # Inspect the raised exception itself (`e.value`), not the
    # ExceptionInfo wrapper, per pytest's documented usage.
    assert "unexpected keyword" in str(e.value)
    confusion = evaluator.compute_confusion_matrix(datums=datums)
    examples = evaluator.compute_examples(datums=datums)

    # Counts must be non-negative ints; all other metrics must be
    # well-formed floats in [0, 1] even over an empty selection.
    for k, mlist in base_metrics.items():
        for m in mlist:
            if k == MetricType.Counts:
                assert isinstance(m.value, dict)
                for v in m.value.values():
                    assert isinstance(v, int)
                    assert v >= 0
            else:
                assert isinstance(m.value, float)
                assert m.value <= 1.0
                assert m.value >= 0.0
    # Confusion-matrix cells and unmatched ground truths must all be
    # non-negative integer counts.
    for cm in confusion:
        assert isinstance(cm.value, dict)
        for row in cm.value["confusion_matrix"].values():
            for v in row.values():
                assert isinstance(v, int)
                assert v >= 0
        for v in cm.value["unmatched_ground_truths"].values():
            assert isinstance(v, int)
            assert v >= 0
    # With no matching datums, every list-valued example field is empty.
    for example in examples:
        assert isinstance(example, dict)
        for v in example.values():
            if isinstance(v, list):
                assert len(v) == 0
48 changes: 48 additions & 0 deletions tests/classification/test_rocauc.py
Original file line number Diff line number Diff line change
Expand Up @@ -267,3 +267,51 @@ def test_rocauc_with_tabular_example(
assert m in expected_metrics
for m in expected_metrics:
assert m in actual_metrics


def test_rocauc_single_classification(loader: Loader):
    """Check ROCAUC and mROCAUC for a single perfectly-scored classification.

    With one datum whose ground truth receives score 1.0, both per-label
    ROCAUC values and the mean ROCAUC degenerate to 0.0.
    """
    classifications = [
        Classification(
            uid="uid",
            groundtruth="dog",
            predictions=["dog", "cat"],
            scores=[1.0, 0.0],
        )
    ]
    loader.add_data(classifications)
    evaluator = loader.finalize()

    metrics = evaluator.compute_rocauc()

    # test ROCAUC
    expected_rocauc = [
        {
            "type": "ROCAUC",
            "value": 0.0,
            "parameters": {"label": label},
        }
        for label in ("dog", "cat")
    ]
    actual_rocauc = [metric.to_dict() for metric in metrics[MetricType.ROCAUC]]
    for metric in actual_rocauc:
        assert metric in expected_rocauc
    for metric in expected_rocauc:
        assert metric in actual_rocauc

    # test mROCAUC
    expected_mrocauc = [
        {"type": "mROCAUC", "value": 0.0, "parameters": {}},
    ]
    actual_mrocauc = [metric.to_dict() for metric in metrics[MetricType.mROCAUC]]
    for metric in actual_mrocauc:
        assert metric in expected_mrocauc
    for metric in expected_mrocauc:
        assert metric in actual_mrocauc