
Commit ab56cdf: Filter invalid annotation by task (#4515)

* Add task parameter to pre-filtering and enhance annotation validation logic
* fix unit test

1 parent: 17d2efb

File tree: 5 files changed (+210, -22 lines)


src/otx/data/module.py

Lines changed: 1 addition & 0 deletions
@@ -105,6 +105,7 @@ def __init__(
             dataset,
             self.data_format,
             self.unannotated_items_ratio,
+            self.task,
             ignore_index=self.ignore_index if self.task == "SEMANTIC_SEGMENTATION" else None,
         )
         if isinstance(input_size, str) and input_size == "auto":
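
The data module now forwards its task type into pre-filtering, so the task-aware validation below also applies when the dataset is built through the module. A minimal standalone sketch of the updated call, assuming a Datumaro-importable dataset on disk (the path below is a placeholder, not part of this commit):

from datumaro.components.dataset import Dataset as DmDataset

from otx.data.utils.pre_filtering import pre_filtering
from otx.types.task import OTXTaskType

# Hypothetical dataset location; any format Datumaro can import works here.
dataset = DmDataset.import_from("path/to/dataset", format="datumaro")

# With task=DETECTION, annotations that are not valid Bbox instances are dropped
# during pre-filtering instead of reaching the training pipeline.
filtered = pre_filtering(
    dataset,
    data_format="datumaro",
    unannotated_items_ratio=0.0,
    task=OTXTaskType.DETECTION,
)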

src/otx/data/utils/pre_filtering.py

Lines changed: 29 additions & 3 deletions
@@ -1,4 +1,4 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2024-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 """Pre filtering data for OTX."""
@@ -7,11 +7,14 @@
 
 import secrets
 import warnings
+from functools import partial
 from typing import TYPE_CHECKING
 
-from datumaro.components.annotation import Annotation, Bbox, Polygon
+from datumaro.components.annotation import Annotation, Bbox, Ellipse, Polygon
 from datumaro.components.dataset import Dataset as DmDataset
 
+from otx.types.task import OTXTaskType
+
 if TYPE_CHECKING:
     from datumaro.components.dataset_base import DatasetItem
 
@@ -20,6 +23,7 @@ def pre_filtering(
     dataset: DmDataset,
     data_format: str,
     unannotated_items_ratio: float,
+    task: OTXTaskType,
     ignore_index: int | None = None,
 ) -> DmDataset:
     """Pre-filtering function to filter the dataset based on certain criteria.
@@ -29,6 +33,7 @@
         data_format (str): The format of the dataset.
         unannotated_items_ratio (float): The ratio of background unannotated items to be used.
             This must be a float between 0 and 1.
+        task (OTXTaskType): The task type of the dataset.
         ignore_index (int | None, optional): The index to be used for the ignored label. Defaults to None.
 
     Returns:
@@ -37,7 +42,7 @@
     used_background_items = set()
     msg = f"There are empty annotation items in train set, Of these, only {unannotated_items_ratio*100}% are used."
     warnings.warn(msg, stacklevel=2)
-    dataset = DmDataset.filter(dataset, is_valid_annot, filter_annotations=True)
+    dataset = DmDataset.filter(dataset, partial(is_valid_anno_for_task, task=task), filter_annotations=True)
     dataset = remove_unused_labels(dataset, data_format, ignore_index)
     if unannotated_items_ratio > 0:
         empty_items = [
@@ -77,6 +82,27 @@ def is_valid_annot(item: DatasetItem, annotation: Annotation) -> bool: # noqa:
     return True
 
 
+def is_valid_anno_for_task(item: DatasetItem, annotation: Annotation, task: OTXTaskType) -> bool:
+    """Return whether DatasetItem's annotation is valid for a specific task.
+
+    Args:
+        item (DatasetItem): The item to be checked.
+        annotation (Annotation): The annotation to be checked.
+        task (OTXTaskType): The task type of the dataset.
+
+    Returns:
+        bool: True if the annotation is valid for the task, False otherwise.
+    """
+    if task == OTXTaskType.DETECTION:
+        return isinstance(annotation, Bbox) and is_valid_annot(item, annotation)
+
+    # Rotated detection is a subset of instance segmentation
+    if task in [OTXTaskType.INSTANCE_SEGMENTATION, OTXTaskType.ROTATED_DETECTION]:
+        return isinstance(annotation, (Polygon, Bbox, Ellipse)) and is_valid_annot(item, annotation)
+
+    return is_valid_annot(item, annotation)
+
+
 def remove_unused_labels(
     dataset: DmDataset,
     data_format: str,
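
Datumaro's DmDataset.filter calls its predicate with (item, annotation) when filter_annotations=True, so pre_filtering binds the extra task argument with functools.partial before handing the callable over. A small sketch of the same pattern in isolation, assuming dm_dataset is an already-loaded DmDataset (this is not code from the commit):

from functools import partial

from datumaro.components.dataset import Dataset as DmDataset

from otx.data.utils.pre_filtering import is_valid_anno_for_task
from otx.types.task import OTXTaskType

# Bind the task once; Datumaro then evaluates the predicate per (item, annotation) pair.
keep_annotation = partial(is_valid_anno_for_task, task=OTXTaskType.INSTANCE_SEGMENTATION)

# For instance segmentation (and rotated detection), only Polygon, Bbox, and Ellipse
# annotations that also pass is_valid_annot survive the filter.
dm_dataset = DmDataset.filter(dm_dataset, keep_annotation, filter_annotations=True)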

tests/unit/data/test_module.py

Lines changed: 2 additions & 0 deletions
@@ -81,11 +81,13 @@ def func(
         dataset: DmDataset,
         data_format: str,
         unannotated_items_ratio: float,
+        task: OTXTaskType,
         ignore_index: int | None,
     ) -> DmDataset:
         del data_format
         del unannotated_items_ratio
         del ignore_index
+        del task
         return dataset
 
     return mocker.patch("otx.data.module.pre_filtering", side_effect=func)

tests/unit/data/test_pre_filtering.py

Lines changed: 163 additions & 3 deletions
@@ -1,12 +1,13 @@
-# Copyright (C) 2024 Intel Corporation
+# Copyright (C) 2024-2025 Intel Corporation
 # SPDX-License-Identifier: Apache-2.0
 
 import pytest
-from datumaro.components.annotation import AnnotationType, Bbox, Label, Polygon
+from datumaro.components.annotation import AnnotationType, Bbox, Ellipse, Label, Polygon
 from datumaro.components.dataset import Dataset as DmDataset
 from datumaro.components.dataset_base import DatasetItem
 
-from otx.data.utils.pre_filtering import pre_filtering
+from otx.data.utils.pre_filtering import is_valid_anno_for_task, pre_filtering
+from otx.types.task import OTXTaskType
 
 
 @pytest.fixture()
@@ -80,7 +81,166 @@ def test_pre_filtering(fxt_dm_dataset_with_unannotated: DmDataset, unannotated_i
     filtered_dataset = pre_filtering(
         dataset=fxt_dm_dataset_with_unannotated,
         data_format="datumaro",
+        task=OTXTaskType.MULTI_CLASS_CLS,
         unannotated_items_ratio=unannotated_items_ratio,
     )
     assert len(filtered_dataset) == 82 + int(len(empty_items) * unannotated_items_ratio)
     assert len(filtered_dataset.categories()[AnnotationType.label]) == 3
+
+
+@pytest.fixture()
+def fxt_dataset_item() -> DatasetItem:
+    """Create a sample dataset item for testing."""
+    return DatasetItem(
+        id="test_item",
+        subset="train",
+        media=None,
+        annotations=[],
+    )
+
+
+class TestIsValidAnnoForTask:
+    """Test cases for is_valid_anno_for_task function."""
+
+    @pytest.mark.parametrize(
+        ("task", "annotation", "expected"),
+        [
+            # DETECTION task tests
+            (OTXTaskType.DETECTION, Bbox(x=0, y=0, w=10, h=10, label=0), True),
+            (OTXTaskType.DETECTION, Bbox(x=0, y=0, w=-1, h=-1, label=0), False),  # Invalid bbox
+            (OTXTaskType.DETECTION, Bbox(x=10, y=10, w=5, h=5, label=0), True),
+            (OTXTaskType.DETECTION, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), False),  # Wrong type
+            (OTXTaskType.DETECTION, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), False),
+            (OTXTaskType.DETECTION, Label(label=0), False),  # Wrong type
+            # INSTANCE_SEGMENTATION task tests
+            (OTXTaskType.INSTANCE_SEGMENTATION, Bbox(x=0, y=0, w=10, h=10, label=0), True),
+            (OTXTaskType.INSTANCE_SEGMENTATION, Bbox(x=0, y=0, w=-1, h=-1, label=0), False),  # Invalid bbox
+            (OTXTaskType.INSTANCE_SEGMENTATION, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), True),
+            (OTXTaskType.INSTANCE_SEGMENTATION, Polygon(points=[0, 0, 0, 0, 0, 0], label=0), False),  # Invalid polygon
+            (OTXTaskType.INSTANCE_SEGMENTATION, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), True),
+            (OTXTaskType.INSTANCE_SEGMENTATION, Label(label=0), False),  # Wrong type
+            # Other task types (should use default is_valid_annot behavior)
+            (OTXTaskType.MULTI_LABEL_CLS, Bbox(x=0, y=0, w=10, h=10, label=0), True),
+            (OTXTaskType.MULTI_LABEL_CLS, Bbox(x=0, y=0, w=-1, h=-1, label=0), False),  # Invalid bbox
+            (OTXTaskType.MULTI_LABEL_CLS, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), True),
+            (OTXTaskType.MULTI_LABEL_CLS, Polygon(points=[0, 0, 0, 0, 0, 0], label=0), False),  # Invalid polygon
+            (OTXTaskType.MULTI_LABEL_CLS, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), True),
+            (OTXTaskType.MULTI_LABEL_CLS, Label(label=0), True),  # Label is always valid
+            (OTXTaskType.SEMANTIC_SEGMENTATION, Bbox(x=0, y=0, w=10, h=10, label=0), True),
+            (OTXTaskType.SEMANTIC_SEGMENTATION, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), True),
+            (OTXTaskType.SEMANTIC_SEGMENTATION, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), True),
+            (OTXTaskType.SEMANTIC_SEGMENTATION, Label(label=0), True),
+            (OTXTaskType.ANOMALY, Bbox(x=0, y=0, w=10, h=10, label=0), True),
+            (OTXTaskType.ANOMALY, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), True),
+            (OTXTaskType.ANOMALY, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), True),
+            (OTXTaskType.ROTATED_DETECTION, Bbox(x=0, y=0, w=10, h=10, label=0), True),
+            (OTXTaskType.ROTATED_DETECTION, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), True),
+            (OTXTaskType.ROTATED_DETECTION, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), True),
+            (OTXTaskType.ROTATED_DETECTION, Label(label=0), False),
+        ],
+    )
+    def test_is_valid_anno_for_task(
+        self,
+        fxt_dataset_item: DatasetItem,
+        task: OTXTaskType,
+        annotation,
+        expected: bool,
+    ) -> None:
+        """Test is_valid_anno_for_task with various task types and annotations.
+
+        Args:
+            fxt_dataset_item: The dataset item to test with
+            task: The task type to test
+            annotation: The annotation to test
+            expected: Expected result (True if valid, False if invalid)
+        """
+        result = is_valid_anno_for_task(fxt_dataset_item, annotation, task)
+        assert result == expected, f"Expected {expected} for task {task} with annotation {type(annotation).__name__}"
+
+    def test_detection_task_with_valid_bbox(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test DETECTION task with valid bounding box."""
+        bbox = Bbox(x=5, y=5, w=20, h=15, label=0)
+        result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION)
+        assert result is True
+
+    def test_detection_task_with_invalid_bbox(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test DETECTION task with invalid bounding box (negative dimensions)."""
+        bbox = Bbox(x=10, y=10, w=-5, h=-5, label=0)
+        result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION)
+        assert result is False
+
+    def test_detection_task_with_zero_dimension_bbox(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test DETECTION task with zero dimension bounding box."""
+        bbox = Bbox(x=10, y=10, w=0, h=0, label=0)
+        result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION)
+        assert result is False
+
+    def test_detection_task_with_wrong_annotation_type(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test DETECTION task with non-bbox annotation types."""
+        polygon = Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0)
+        ellipse = Ellipse(x1=0, y1=0, x2=10, y2=10, label=0)
+        label = Label(label=0)
+
+        assert is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.DETECTION) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.DETECTION) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.DETECTION) is False
+
+    def test_instance_segmentation_task_with_valid_annotations(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test INSTANCE_SEGMENTATION task with valid annotation types."""
+        bbox = Bbox(x=0, y=0, w=10, h=10, label=0)
+        polygon = Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0)
+        ellipse = Ellipse(x1=0, y1=0, x2=10, y2=10, label=0)
+
+        assert is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.INSTANCE_SEGMENTATION) is True
+        assert is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.INSTANCE_SEGMENTATION) is True
+        assert is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.INSTANCE_SEGMENTATION) is True
+
+    def test_instance_segmentation_task_with_invalid_annotations(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test INSTANCE_SEGMENTATION task with invalid annotation types."""
+        invalid_bbox = Bbox(x=0, y=0, w=-1, h=-1, label=0)
+        invalid_polygon = Polygon(points=[0, 0, 0, 0, 0, 0], label=0)  # Degenerate polygon
+        label = Label(label=0)  # Wrong type
+
+        assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.INSTANCE_SEGMENTATION) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.INSTANCE_SEGMENTATION) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.INSTANCE_SEGMENTATION) is False
+
+    def test_other_task_types_use_default_validation(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test that other task types use the default is_valid_annot behavior."""
+        valid_bbox = Bbox(x=0, y=0, w=10, h=10, label=0)
+        invalid_bbox = Bbox(x=0, y=0, w=-1, h=-1, label=0)
+        valid_polygon = Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0)
+        invalid_polygon = Polygon(points=[0, 0, 0, 0, 0, 0], label=0)
+        label = Label(label=0)
+
+        # Test with CLASSIFICATION task
+        assert is_valid_anno_for_task(fxt_dataset_item, valid_bbox, OTXTaskType.MULTI_CLASS_CLS) is True
+        assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.MULTI_CLASS_CLS) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, valid_polygon, OTXTaskType.MULTI_CLASS_CLS) is True
+        assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.MULTI_CLASS_CLS) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.MULTI_CLASS_CLS) is True
+
+        # Test with SEMANTIC_SEGMENTATION task
+        assert is_valid_anno_for_task(fxt_dataset_item, valid_bbox, OTXTaskType.SEMANTIC_SEGMENTATION) is True
+        assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.SEMANTIC_SEGMENTATION) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, valid_polygon, OTXTaskType.SEMANTIC_SEGMENTATION) is True
+        assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.SEMANTIC_SEGMENTATION) is False
+        assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.SEMANTIC_SEGMENTATION) is True
+
+    def test_edge_cases(self, fxt_dataset_item: DatasetItem) -> None:
+        """Test edge cases for annotation validation."""
+        # Very small but valid bbox
+        small_bbox = Bbox(x=0, y=0, w=0.1, h=0.1, label=0)
+        assert is_valid_anno_for_task(fxt_dataset_item, small_bbox, OTXTaskType.DETECTION) is True
+
+        # Bbox with equal coordinates (should be invalid)
+        equal_bbox = Bbox(x=5, y=5, w=0, h=0, label=0)
+        assert is_valid_anno_for_task(fxt_dataset_item, equal_bbox, OTXTaskType.DETECTION) is False
+
+        # Polygon with minimal valid area
+        minimal_polygon = Polygon(points=[0, 0, 1, 0, 1, 1, 0, 1], label=0)
+        assert is_valid_anno_for_task(fxt_dataset_item, minimal_polygon, OTXTaskType.INSTANCE_SEGMENTATION) is True
+
+        # Degenerate polygon (should be invalid)
+        degenerate_polygon = Polygon(points=[0, 0, 0, 0, 0, 0], label=0)
+        assert is_valid_anno_for_task(fxt_dataset_item, degenerate_polygon, OTXTaskType.INSTANCE_SEGMENTATION) is False
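
The new validation tests can be run in isolation; a small sketch, assuming a development install of OTX with pytest available:

import pytest

# Run only the task-aware annotation validation tests added in this commit.
pytest.main(["tests/unit/data/test_pre_filtering.py::TestIsValidAnnoForTask", "-v"])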

tests/unit/data/test_robust_dataset_statistics.py

Lines changed: 15 additions & 16 deletions
@@ -8,8 +8,8 @@
 import numpy as np
 import pytest
 from datumaro import Dataset as DmDataset
-from datumaro import DatasetSubset, DatasetItem
-from datumaro.components.annotation import AnnotationType, ExtractedMask, LabelCategories, Polygon, Bbox
+from datumaro import DatasetItem, DatasetSubset
+from datumaro.components.annotation import AnnotationType, Bbox, ExtractedMask, LabelCategories, Polygon
 from datumaro.components.media import Image
 
 from otx.data.utils.utils import compute_robust_dataset_statistics
@@ -19,17 +19,17 @@
 class TestComputeRobustDatasetStatistics:
     """Test cases for compute_robust_dataset_statistics function."""
 
-    @pytest.fixture
+    @pytest.fixture()
     def mock_semantic_seg_dataset(self):
         """Create a mock semantic segmentation dataset with mixed annotation types."""
         dataset = DmDataset(media_type=Image)
-
+
         # Create label categories
         categories = LabelCategories()
         categories.add("background")
         categories.add("foreground")
         dataset.categories()[AnnotationType.label] = categories
-
+
         for i in range(5):
             image = Image.from_numpy(np.zeros((100, 100, 3), dtype=np.uint8))
 
@@ -47,35 +47,34 @@ def mock_semantic_seg_dataset(self):
 
             # Bbox annotation (background, should be ignored for SEMANTIC_SEGMENTATION)
             bbox = Bbox(60, 60, 20, 20, label=0)
-
 
             dataset.put(
                 DatasetItem(
                     id=str(i),
                     media=image,
                     annotations=[ann_mask, polygon, bbox],
                     subset="train",
-                )
+                ),
             )
         return dataset
 
     def test_compute_robust_dataset_statistics_semantic_segmentation(self, mock_semantic_seg_dataset):
         """Test that semantic segmentation with ExtractedMask annotations is handled correctly."""
         # Get the train subset
         train_subset = DatasetSubset(mock_semantic_seg_dataset, "train")
-
+
         # Compute statistics
         stats = compute_robust_dataset_statistics(
             dataset=train_subset,
             task=OTXTaskType.SEMANTIC_SEGMENTATION,
             max_samples=10,
         )
-
+
         # Verify the function doesn't crash and returns expected structure
         assert isinstance(stats, dict)
         assert "image" in stats
         assert "annotation" in stats
-
+
         image_statistics_keys = ["avg", "min", "max", "std", "robust_min", "robust_max"]
         annotation_statistics_keys = ["avg", "min", "max", "std", "robust_min", "robust_max"]
 
@@ -87,35 +86,35 @@ def test_compute_robust_dataset_statistics_semantic_segmentation(self, mock_sema
 
         for key in stats["annotation"]["num_per_image"]:
             assert key in annotation_statistics_keys
-
+
        for key in stats["annotation"]["size_of_shape"]:
             assert key in annotation_statistics_keys
 
     def test_compute_robust_dataset_statistics_empty_dataset(self):
         """Test handling of empty dataset."""
         empty_dataset = DmDataset(media_type=Image)
         train_subset = DatasetSubset(empty_dataset, "train")
-
+
         stats = compute_robust_dataset_statistics(
             dataset=train_subset,
             task=OTXTaskType.SEMANTIC_SEGMENTATION,
         )
-
+
         # Should return empty statistics
         assert stats == {"image": {}, "annotation": {}}
 
     def test_compute_robust_dataset_statistics_max_samples_limit(self, mock_semantic_seg_dataset):
         """Test that max_samples parameter limits the number of processed samples."""
         train_subset = DatasetSubset(mock_semantic_seg_dataset, "train")
-
+
         # Test with max_samples=2 (should only process 2 items)
         stats = compute_robust_dataset_statistics(
             dataset=train_subset,
             task=OTXTaskType.SEMANTIC_SEGMENTATION,
             max_samples=2,
         )
-
+
         # Should still return valid statistics
         assert isinstance(stats, dict)
         assert "image" in stats
-        assert "annotation" in stats
+        assert "annotation" in stats
