
Commit acbaf4f

panagiotamoraiti authored, with pre-commit-ci[bot], Borda, and Copilot

Correct confusion matrix calculation in function evaluate_detection_batch (#1853)

* Correct confusion matrix calculation in function evaluate_detection_batch
* Correct confusion matrix computation
* Add 3 more tests for empty detections / ground truths
* Fix minor issues with too-long lines
* Replace deprecated mock_detections with _create_detections
* Fix indentation in test assertions and add missing `self` parameter in `test_confusion_matrix`
* Update metric computations to improve numerical stability and replace deprecated NumPy functions
* Add IoU+class matching tests and synthetic dataset fixtures for detection metrics
* Use `Optional` for type hinting the `classes` parameter in `_yolo_dataset_factory`; import `Optional` from `typing`
* Refactor detection metric tests to simplify confusion matrix assertions and enhance IoU+class matching validation
* Apply suggestions from code review
* fix(pre_commit): 🎨 auto format pre-commit hooks

---------

Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Co-authored-by: Jirka Borovec <6035284+Borda@users.noreply.github.com>
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
1 parent 124be19 commit acbaf4f
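
For orientation, the confusion matrix built in this commit is (num_classes + 1) x (num_classes + 1): rows index ground-truth classes, columns index predicted classes, and the extra row and column collect false positives and false negatives. A quick standalone sketch of the layout (illustration only, not part of the commit):

import numpy as np

num_classes = 2
m = np.zeros((num_classes + 1, num_classes + 1))
m[0, 0] += 1            # TP: GT class 0 matched by a class-0 detection
m[1, 0] += 1            # misclassification: GT class 1 predicted as class 0
m[1, num_classes] += 1  # FN: GT of class 1 with no matching detection
m[num_classes, 1] += 1  # FP: class-1 detection with no matching GT
print(m)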

File tree

7 files changed: +823 -63 lines changed


src/supervision/metrics/detection.py

Lines changed: 80 additions & 28 deletions
@@ -148,7 +148,6 @@ def from_detections(
 
         ```
         """
-
         prediction_tensors = []
         target_tensors = []
         for prediction, target in zip(predictions, targets):
@@ -274,9 +273,28 @@ def evaluate_detection_batch(
         """
         result_matrix = np.zeros((num_classes + 1, num_classes + 1))
 
+        # Filter predictions by confidence threshold
         conf_idx = 5
         confidence = predictions[:, conf_idx]
-        detection_batch_filtered = predictions[confidence > conf_threshold]
+        detection_batch_filtered = predictions[confidence >= conf_threshold]
+
+        if len(detection_batch_filtered) == 0:
+            # No detections pass confidence threshold - all GT are FN
+            class_id_idx = 4
+            true_classes = np.array(targets[:, class_id_idx], dtype=np.int16)
+            for gt_class in true_classes:
+                result_matrix[gt_class, num_classes] += 1
+            return result_matrix
+
+        if len(targets) == 0:
+            # No ground truth - all detections are FP
+            class_id_idx = 4
+            detection_classes = np.array(
+                detection_batch_filtered[:, class_id_idx], dtype=np.int16
+            )
+            for det_class in detection_classes:
+                result_matrix[num_classes, det_class] += 1
+            return result_matrix
 
         class_id_idx = 4
         true_classes = np.array(targets[:, class_id_idx], dtype=np.int16)
@@ -286,37 +304,71 @@ def evaluate_detection_batch(
         true_boxes = targets[:, :class_id_idx]
         detection_boxes = detection_batch_filtered[:, :class_id_idx]
 
+        # Calculate IoU matrix
         iou_batch = box_iou_batch(
             boxes_true=true_boxes, boxes_detection=detection_boxes
         )
-        matched_idx = np.asarray(iou_batch > iou_threshold).nonzero()
-
-        if matched_idx[0].shape[0]:
-            matches = np.stack(
-                (matched_idx[0], matched_idx[1], iou_batch[matched_idx]), axis=1
-            )
-            matches = ConfusionMatrix._drop_extra_matches(matches=matches)
-        else:
-            matches = np.zeros((0, 3))
 
-        matched_true_idx, matched_detection_idx, _ = matches.transpose().astype(
-            np.int16
-        )
+        # Find all valid matches (IoU > threshold, regardless of class)
+        # Use vectorized operations to avoid nested Python loops
+        iou_mask = iou_batch > iou_threshold
+        gt_indices, det_indices = np.nonzero(iou_mask)
 
-        for i, true_class_value in enumerate(true_classes):
-            j = matched_true_idx == i
-            if matches.shape[0] > 0 and sum(j) == 1:
-                result_matrix[
-                    true_class_value, detection_classes[matched_detection_idx[j]]
-                ] += 1  # TP
-            else:
-                result_matrix[true_class_value, num_classes] += 1  # FN
-
-        for i, detection_class_value in enumerate(detection_classes):
-            if not any(matched_detection_idx == i):
-                result_matrix[num_classes, detection_class_value] += 1  # FP
-        final_result_matrix: npt.NDArray[np.int32] = result_matrix
-        return final_result_matrix
+        # If no pairs exceed the IoU threshold, skip matching
+        if gt_indices.size == 0:
+            valid_matches = []
+        else:
+            ious = iou_batch[gt_indices, det_indices]
+            gt_match_classes = true_classes[gt_indices]
+            det_match_classes = detection_classes[det_indices]
+            class_matches = gt_match_classes == det_match_classes
+
+            # Sort matches by class match first (True before False),
+            # then by IoU descending.
+            # np.lexsort sorts by the last key first, in ascending order.
+            # We use ~class_matches so that True becomes 0
+            # and False becomes 1 (True first),
+            # and -ious so that larger IoUs come first.
+            sort_indices = np.lexsort((-ious, ~class_matches))
+
+            # Build list of matches in the same format as before:
+            # (gt_idx, det_idx, iou, class_match)
+            valid_matches = [
+                (
+                    int(gt_indices[idx]),
+                    int(det_indices[idx]),
+                    float(ious[idx]),
+                    bool(class_matches[idx]),
+                )
+                for idx in sort_indices
+            ]
+        # Greedily assign matches, ensuring each GT
+        # and detection is matched at most once
+        matched_gt_idx = set()
+        matched_det_idx = set()
+
+        for gt_idx, det_idx, iou, class_match in valid_matches:
+            if gt_idx not in matched_gt_idx and det_idx not in matched_det_idx:
+                # Valid spatial match - record the class prediction
+                gt_class = true_classes[gt_idx]
+                det_class = detection_classes[det_idx]
+
+                # This handles both correct classification (TP) and misclassification
+                result_matrix[gt_class, det_class] += 1
+                matched_gt_idx.add(gt_idx)
+                matched_det_idx.add(det_idx)
+
+        # Count unmatched ground truth as FN
+        for gt_idx, gt_class in enumerate(true_classes):
+            if gt_idx not in matched_gt_idx:
+                result_matrix[gt_class, num_classes] += 1
+
+        # Count unmatched detections as FP
+        for det_idx, det_class in enumerate(detection_classes):
+            if det_idx not in matched_det_idx:
+                result_matrix[num_classes, det_class] += 1
+
+        return result_matrix
 
     @staticmethod
     def _drop_extra_matches(
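
To see how the new matching prefers class-consistent pairs, here is a minimal standalone sketch (illustration only; the names mirror the code above) of the lexsort ordering on one GT box of class 0 overlapped by two detections: one with the wrong class at IoU 1.0, one with the correct class at IoU 0.9:

import numpy as np

true_classes = np.array([0])
detection_classes = np.array([1, 0])
iou_batch = np.array([[1.0, 0.9]])  # rows: GT boxes, cols: detections

gt_indices, det_indices = np.nonzero(iou_batch > 0.5)
ious = iou_batch[gt_indices, det_indices]
class_matches = true_classes[gt_indices] == detection_classes[det_indices]

# Class-consistent pairs sort first (~True == False), then IoU descending.
order = np.lexsort((-ious, ~class_matches))
print([(int(gt_indices[i]), int(det_indices[i])) for i in order])
# [(0, 1), (0, 0)]: the greedy pass pairs GT 0 with the correct-class
# detection despite its lower IoU; the wrong-class detection stays
# unmatched and is counted as a false positive.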

src/supervision/metrics/precision.py

Lines changed: 6 additions & 1 deletion
@@ -385,7 +385,12 @@ def _compute_precision(
         false_positives = confusion_matrix[..., 1]
 
         denominator = true_positives + false_positives
-        precision = np.where(denominator == 0, 0, true_positives / denominator)
+        precision = np.divide(
+            true_positives,
+            denominator,
+            out=np.zeros_like(true_positives),
+            where=denominator != 0,
+        )
 
         result_precision: npt.NDArray[np.float64] = precision
         return result_precision
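
The motivation for np.divide here: np.where evaluates true_positives / denominator for every element before masking, so zero denominators still trigger divide-by-zero warnings and produce NaN/inf intermediates, while np.divide with out= and where= never touches those entries. A standalone sketch (assuming float arrays, as in the metric code):

import numpy as np

tp = np.array([5.0, 0.0])
denominator = tp + np.array([5.0, 0.0])  # second entry is 0 / 0

# The masked entry keeps the 0.0 prefilled by np.zeros_like instead of NaN.
precision = np.divide(
    tp, denominator, out=np.zeros_like(tp), where=denominator != 0
)
print(precision)  # [0.5 0. ]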

src/supervision/metrics/recall.py

Lines changed: 6 additions & 1 deletion
@@ -383,7 +383,12 @@ def _compute_recall(
         false_negatives = confusion_matrix[..., 2]
 
         denominator = true_positives + false_negatives
-        recall = np.where(denominator == 0, 0, true_positives / denominator)
+        recall = np.divide(
+            true_positives,
+            denominator,
+            out=np.zeros_like(true_positives),
+            where=denominator != 0,
+        )
 
         result_recall: npt.NDArray[np.float64] = recall
         return result_recall

tests/helpers.py

Lines changed: 64 additions & 0 deletions
@@ -414,3 +414,67 @@ def create_yolo_dataset(
         "image_size": image_size,
         "image_annotations": image_annotations,
     }
+
+
+def create_predictions_with_class_iou_tests(
+    gt_detections: Detections, num_classes: int
+) -> Detections:
+    """
+    Create predictions that test IoU+class matching behavior.
+
+    For each ground truth detection, creates predictions with different patterns:
+    - Pattern 0 (i%3==0): Correct match (same bbox, correct class)
+    - Pattern 1 (i%3==1): Wrong class with perfect IoU + correct class with offset
+    - Pattern 2 (i%3==2): Correct class with slight offset
+
+    This tests that predictions with wrong class don't match even with high IoU,
+    which is the key fix in the confusion matrix calculation.
+
+    Args:
+        gt_detections: Ground truth detections to create predictions for
+        num_classes: Total number of classes in the dataset
+
+    Returns:
+        Detections object with predictions designed to test IoU+class matching
+    """
+    if len(gt_detections) == 0:
+        # No ground truth, return a single false positive
+        return _create_detections(
+            xyxy=[[10, 10, 50, 50]], class_id=[0], confidence=[0.9]
+        )
+
+    pred_boxes = []
+    pred_classes = []
+    pred_confs = []
+
+    for i, (box, cls) in enumerate(zip(gt_detections.xyxy, gt_detections.class_id)):
+        if i % 3 == 0:
+            # Pattern 1: Correct match
+            pred_boxes.append(box)
+            pred_classes.append(cls)
+            pred_confs.append(0.95)
+
+        elif i % 3 == 1:
+            # Pattern 2: Test the fix - add wrong class prediction with perfect IoU,
+            # then correct class with slightly offset bbox
+            wrong_cls = (cls + 1) % num_classes
+            pred_boxes.append(box)  # Perfect IoU
+            pred_classes.append(wrong_cls)  # Wrong class
+            pred_confs.append(0.90)
+
+            # Add correct class with slight offset
+            offset_box = box + np.array([2, 2, 2, 2], dtype=np.float32)
+            pred_boxes.append(offset_box)
+            pred_classes.append(cls)  # Correct class
+            pred_confs.append(0.85)
+
+        else:
+            # Pattern 3: Correct match with slight offset
+            offset_box = box + np.array([1, 1, 1, 1], dtype=np.float32)
+            pred_boxes.append(offset_box)
+            pred_classes.append(cls)
+            pred_confs.append(0.92)
+
+    return _create_detections(
+        xyxy=pred_boxes, class_id=pred_classes, confidence=pred_confs
+    )
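
A hypothetical usage sketch for this helper (assumptions: it is importable from tests.helpers, _create_detections returns sv.Detections, and sv.ConfusionMatrix.from_detections is invoked as in src/supervision/metrics/detection.py above):

import numpy as np
import supervision as sv

from tests.helpers import create_predictions_with_class_iou_tests

targets = sv.Detections(
    xyxy=np.array(
        [[0, 0, 50, 50], [100, 100, 150, 150], [200, 200, 250, 250]],
        dtype=np.float32,
    ),
    class_id=np.array([0, 1, 2]),
)
predictions = create_predictions_with_class_iou_tests(targets, num_classes=3)

matrix = sv.ConfusionMatrix.from_detections(
    predictions=[predictions],
    targets=[targets],
    classes=["dog", "cat", "person"],
)
# All three GTs should land on the diagonal; the wrong-class prediction
# from pattern 1 should show up as a false positive rather than stealing
# the match from the correct-class prediction.
print(matrix.matrix)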

tests/metrics/conftest.py

Lines changed: 84 additions & 0 deletions
@@ -1,3 +1,5 @@
+from typing import Optional
+
 import numpy as np
 import pytest
 
@@ -138,3 +140,85 @@ def target_class_1():
         xyxy=np.array([[60, 60, 100, 100]], dtype=np.float32),
         class_id=np.array([1]),
     )
+
+
+def _yolo_dataset_factory(
+    tmp_path,
+    num_images: int = 20,
+    classes: Optional[list[str]] = None,
+    objects_per_image_range: tuple[int, int] = (1, 3),
+):
+    """
+    Factory function to create synthetic YOLO-format datasets with custom parameters.
+
+    Args:
+        tmp_path: Pytest tmp_path fixture
+        num_images: Number of images to generate
+        classes: List of class names
+        objects_per_image_range: Range of objects per image as (min, max)
+
+    Returns:
+        dict with dataset paths and metadata
+    """
+    from tests.helpers import create_yolo_dataset
+
+    if classes is None:
+        classes = ["dog", "cat", "person"]
+
+    return create_yolo_dataset(
+        dataset_dir=str(tmp_path / "yolo_dataset"),
+        num_images=num_images,
+        image_size=(640, 640, 3),
+        classes=classes,
+        objects_per_image_range=objects_per_image_range,
+        seed=42,
+    )
+
+
+@pytest.fixture
+def yolo_dataset_structure(tmp_path):
+    """
+    Synthetic YOLO-format dataset for testing confusion matrix and detection metrics.
+
+    Configuration:
+    - 20 images
+    - 640x640 resolution
+    - 3 classes: ["dog", "cat", "person"]
+    - 1-3 objects per image
+
+    Use this for tests that need multi-class scenarios (3+ classes).
+
+    Returns:
+        dict with dataset paths and metadata
+    """
+    return _yolo_dataset_factory(
+        tmp_path,
+        num_images=20,
+        classes=["dog", "cat", "person"],
+        objects_per_image_range=(1, 3),
+    )
+
+
+@pytest.fixture
+def yolo_dataset_two_classes(tmp_path):
+    """
+    Synthetic YOLO-format dataset for testing mAR and binary classification metrics.
+
+    Configuration:
+    - 15 images
+    - 640x640 resolution
+    - 2 classes: ["class_0", "class_1"]
+    - 2-4 objects per image
+
+    Use this for tests that specifically need 2-class scenarios or depend on
+    specific class distributions (e.g., mAR @ K per-image limiting tests).
+
+    Returns:
+        dict with dataset paths and metadata
+    """
+    return _yolo_dataset_factory(
+        tmp_path,
+        num_images=15,
+        classes=["class_0", "class_1"],
+        objects_per_image_range=(2, 4),
+    )
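
A hypothetical sketch of a test consuming one of these fixtures (the keys "image_size" and "image_annotations" are the ones visible in create_yolo_dataset above; any other keys in the returned dict are not shown here):

def test_yolo_dataset_fixture(yolo_dataset_structure):
    # 20 images at 640x640 with 3 classes, per the fixture docstring.
    assert yolo_dataset_structure["image_size"] == (640, 640, 3)
    assert len(yolo_dataset_structure["image_annotations"]) == 20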
