Commit 4fb7e61

Fix confidence threshold cache invalidation and filtering logic (#4498)
* Refactor confidence threshold handling in detection and instance segmentation models
* Add stage parameter to model methods for validation and testing
* Refactor metric computation in OTX models by removing stage parameter and consolidating test step logic
* Fix inst-seg _filter_outputs_by_threshold
* Remove best_confidence_threshold_list from checkpoint during save and add unit tests for detection model confidence threshold logic
* Enhance unit tests for detection threshold logic to ensure compatibility with Python 3.10
* Fix tests and formatting; update unit tests
* Remove best_confidence_threshold_list and update related unit tests for checkpoint functionality
* Refactor checkpoint saving in OTXModel to remove an unnecessary line and update comments in OTXDetectionModel for clarity on best_confidence_threshold usage
1 parent 8eb3ae9 commit 4fb7e61
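The squash message mentions a save-side change (dropping `best_confidence_threshold_list` when writing checkpoints) whose diff is not among the files shown below. A minimal sketch of that idea, assuming the list lives under the checkpoint's `hyper_parameters` dict; `OTXModelSketch` and the hook body are illustrative, not the actual OTXModel code:

```python
from typing import Any


class OTXModelSketch:
    """Hypothetical stand-in for OTXModel's checkpoint-saving hook."""

    def on_save_checkpoint(self, checkpoint: dict[str, Any]) -> None:
        # Keep only the scalar best_confidence_threshold; drop the legacy
        # per-epoch list so reloading cannot resurrect a stale cached value.
        hparams = checkpoint.get("hyper_parameters", {})
        hparams.pop("best_confidence_threshold_list", None)
```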

File tree: 4 files changed (+638, -56 lines)

src/otx/backend/native/models/detection/base.py

Lines changed: 61 additions & 36 deletions
```diff
@@ -82,23 +82,33 @@ def __init__(
         self.model.feature_vector_fn = feature_vector_fn
         self.model.explain_fn = self.get_explain_fn()
 
-    def validation_step(self, batch: OTXDataBatch, batch_idx: int) -> OTXPredBatch:
-        """Perform a single validation step on a batch of data from the validation set.
-
-        :param batch: A batch of data (a tuple) containing the input tensor of images and target
-            labels.
-        :param batch_idx: The index of the current batch.
-        """
-        return self._filter_outputs_by_threshold(super().validation_step(batch, batch_idx))
-
     def test_step(self, batch: OTXDataBatch, batch_idx: int) -> OTXPredBatch:
         """Perform a single test step on a batch of data from the test set.
 
         :param batch: A batch of data (a tuple) containing the input tensor of images and target
             labels.
         :param batch_idx: The index of the current batch.
         """
-        return self._filter_outputs_by_threshold(super().test_step(batch, batch_idx))
+        preds = self.forward(inputs=batch)
+
+        if isinstance(preds, OTXBatchLossEntity):
+            raise TypeError(preds)
+
+        # 1. Filter outputs by threshold
+        preds = self._filter_outputs_by_threshold(preds)
+        metric_inputs = self._convert_pred_entity_to_compute_metric(preds, batch)
+
+        # 2. Update metric
+        if isinstance(metric_inputs, dict):
+            self.metric.update(**metric_inputs)
+            return preds
+
+        if isinstance(metric_inputs, list) and all(isinstance(inp, dict) for inp in metric_inputs):
+            for inp in metric_inputs:
+                self.metric.update(**inp)
+            return preds
+
+        raise TypeError(metric_inputs)
 
     def predict_step(
         self,
@@ -118,6 +128,10 @@ def predict_step(
         return outputs
 
     def _filter_outputs_by_threshold(self, outputs: OTXPredBatch) -> OTXPredBatch:
+        # NOTE: best_confidence_threshold comes from:
+        # 1. During validation: FMeasure metric computes optimal threshold, stored in hparams via _log_metrics
+        # 2. During test/predict: Uses the threshold computed during validation (from hparams)
+        # 3. If no threshold available: defaults to 0.5
         scores = []
         bboxes = []
         labels = []
@@ -316,53 +330,64 @@ def _convert_pred_entity_to_compute_metric(
     def on_load_checkpoint(self, ckpt: dict[str, Any]) -> None:
         """Load state_dict from checkpoint.
 
-        For detection, it is need to update confidence threshold information when
+        For detection, it is needed to update confidence threshold and F1 score information when
        the metric is FMeasure.
        """
-        if best_confidence_threshold := ckpt.get("confidence_threshold", None) or (
-            (hyper_parameters := ckpt.get("hyper_parameters", None))
-            and (best_confidence_threshold := hyper_parameters.get("best_confidence_threshold", None))
+        hyper_parameters = ckpt.get("hyper_parameters", {})
+
+        # Load best confidence threshold (legacy and new format)
+        if best_confidence_threshold := ckpt.get("confidence_threshold", None) or hyper_parameters.get(
+            "best_confidence_threshold",
+            None,
         ):
             self.hparams["best_confidence_threshold"] = best_confidence_threshold
         super().on_load_checkpoint(ckpt)
 
     def _log_metrics(self, meter: Metric, key: Literal["val", "test"], **compute_kwargs) -> None:
+        """This function is called every epoch.
+
+        Args:
+            meter: Metric object
+            key: "val" or "test"
+            compute_kwargs: Additional keyword arguments for the metric computation
+
+        """
         if key == "val":
-            retval = super()._log_metrics(meter, key)
+            super()._log_metrics(meter, key)
 
-            # NOTE: Validation metric logging can update `best_confidence_threshold`
-            if (
-                isinstance(meter, MetricCollection)
-                and (fmeasure := getattr(meter, "FMeasure", None))
-                and (best_confidence_threshold := getattr(fmeasure, "best_confidence_threshold", None))
-            ) or (
-                isinstance(meter, FMeasure)
-                and (best_confidence_threshold := getattr(meter, "best_confidence_threshold", None))
-            ):
-                self.hparams["best_confidence_threshold"] = best_confidence_threshold
+            fmeasure = None
+            if isinstance(meter, MetricCollection) and (fmeasure := getattr(meter, "FMeasure", None)):
+                pass  # fmeasure is set
+            elif isinstance(meter, FMeasure):
+                fmeasure = meter
 
-            return retval
+            if fmeasure is not None and hasattr(fmeasure, "best_confidence_threshold"):
+                self.hparams["best_confidence_threshold"] = fmeasure.best_confidence_threshold
 
         if key == "test":
-            # NOTE: Test metric logging should use `best_confidence_threshold` found previously.
+            # NOTE: Test metric logging should use `best_confidence_threshold` in the loaded checkpoint.
             best_confidence_threshold = self.hparams.get("best_confidence_threshold", None)
             compute_kwargs = (
                 {"best_confidence_threshold": best_confidence_threshold} if best_confidence_threshold else {}
             )
 
-            return super()._log_metrics(meter, key, **compute_kwargs)
-
-        raise ValueError(key)
+            super()._log_metrics(meter, key, **compute_kwargs)
 
     @property
     def best_confidence_threshold(self) -> float:
-        """Best confidence threshold to filter outputs."""
-        if not hasattr(self, "_best_confidence_threshold"):
-            self._best_confidence_threshold = self.hparams.get("best_confidence_threshold", None)
-        if self._best_confidence_threshold is None:
+        """Best confidence threshold to filter outputs.
+
+        Always returns the current value from hparams, with 0.5 as fallback.
+        This ensures the threshold is always up-to-date after validation updates it.
+        """
+        threshold = self.hparams.get("best_confidence_threshold", None)
+        if threshold is None:
+            # Only log warning once to avoid spam
+            if not getattr(self, "_threshold_warning_logged", False):
                log.warning("There is no predefined best_confidence_threshold, 0.5 will be used as default.")
-            self._best_confidence_threshold = 0.5
-        return self._best_confidence_threshold
+                self._threshold_warning_logged = True
+            return 0.5
+        return float(threshold)
 
     def get_dummy_input(self, batch_size: int = 1) -> OTXDataBatch:  # type: ignore[override]
         """Returns a dummy input for detection model."""
```

src/otx/backend/native/models/instance_segmentation/base.py

Lines changed: 114 additions & 19 deletions
```diff
@@ -8,6 +8,7 @@
 from __future__ import annotations
 
 import copy
+import logging as log
 import types
 from contextlib import contextmanager
 from typing import TYPE_CHECKING, Any, Callable, Iterator, Literal, Sequence
@@ -49,6 +50,11 @@
 class OTXInstanceSegModel(OTXModel):
     """Base class for the Instance Segmentation models used in OTX.
 
+    NOTE: OTXInstanceSegModel has many duplicate methods to OTXDetectionModel,
+    however, it is not a subclass of OTXDetectionModel because it has different
+    export parameters and different metric computation. Some refactor could be done
+    to reduce the code duplication in the future.
+
     Args:
         label_info (LabelInfoTypes | int | Sequence): Information about the labels used in the model.
             If `int` is given, label info will be constructed from number of classes,
@@ -264,35 +270,96 @@ def _export_parameters(self) -> TaskLevelExportParameters:
             label_info=modified_label_info,
         )
 
+    def test_step(self, batch: OTXDataBatch, batch_idx: int) -> OTXPredBatch:
+        """Perform a single test step on a batch of data from the test set.
+
+        :param batch: A batch of data (a tuple) containing the input tensor of images and target
+            labels.
+        :param batch_idx: The index of the current batch.
+        """
+        preds = self.forward(inputs=batch)
+
+        if isinstance(preds, OTXBatchLossEntity):
+            raise TypeError(preds)
+
+        # 1. Filter outputs by threshold
+        preds = self._filter_outputs_by_threshold(preds)
+        metric_inputs = self._convert_pred_entity_to_compute_metric(preds, batch)
+
+        # 2. Update metric
+        if isinstance(metric_inputs, dict):
+            self.metric.update(**metric_inputs)
+            return preds
+
+        if isinstance(metric_inputs, list) and all(isinstance(inp, dict) for inp in metric_inputs):
+            for inp in metric_inputs:
+                self.metric.update(**inp)
+            return preds
+
+        raise TypeError(metric_inputs)
+
+    def predict_step(
+        self,
+        batch: OTXDataBatch | OTXTileBatchDataEntity,
+        batch_idx: int,
+        dataloader_idx: int = 0,
+    ) -> OTXPredBatch:
+        """Step function called during PyTorch Lightning Trainer's predict."""
+        if self.explain_mode:
+            return self._filter_outputs_by_threshold(self.forward_explain(inputs=batch))  # type: ignore[arg-type]
+
+        outputs = self._filter_outputs_by_threshold(self.forward(inputs=batch))  # type: ignore[arg-type]
+
+        if isinstance(outputs, OTXBatchLossEntity):
+            raise TypeError(outputs)
+
+        return outputs
+
+    @property
+    def best_confidence_threshold(self) -> float:
+        """Best confidence threshold to filter outputs.
+
+        Always returns the current value from hparams, with 0.5 as fallback.
+        This ensures the threshold is always up-to-date after validation updates it.
+        """
+        threshold = self.hparams.get("best_confidence_threshold", None)
+        if threshold is None:
+            # Only log warning once to avoid spam
+            if not getattr(self, "_threshold_warning_logged", False):
+                log.warning("There is no predefined best_confidence_threshold, 0.5 will be used as default.")
+                self._threshold_warning_logged = True
+            return 0.5
+        return float(threshold)
+
     def on_load_checkpoint(self, ckpt: dict[str, Any]) -> None:
         """Load state_dict from checkpoint.
 
-        For detection, it is need to update confidence threshold information when
+        For instance segmentation, it is needed to update confidence threshold and F1 score information when
        the metric is FMeasure.
        """
-        if best_confidence_threshold := ckpt.get("confidence_threshold", None) or (
-            (hyper_parameters := ckpt.get("hyper_parameters", None))
-            and (best_confidence_threshold := hyper_parameters.get("best_confidence_threshold", None))
+        hyper_parameters = ckpt.get("hyper_parameters", {})
+
+        # Load best confidence threshold (legacy and new format)
+        if best_confidence_threshold := ckpt.get("confidence_threshold", None) or hyper_parameters.get(
+            "best_confidence_threshold",
+            None,
        ):
             self.hparams["best_confidence_threshold"] = best_confidence_threshold
         super().on_load_checkpoint(ckpt)
 
     def _log_metrics(self, meter: Metric, key: Literal["val", "test"], **compute_kwargs) -> None:
         if key == "val":
-            retval = super()._log_metrics(meter, key)
+            super()._log_metrics(meter, key)
 
-            # NOTE: Validation metric logging can update `best_confidence_threshold`
-            if (
-                isinstance(meter, MetricCollection)
-                and (fmeasure := getattr(meter, "FMeasure", None))
-                and (best_confidence_threshold := getattr(fmeasure, "best_confidence_threshold", None))
-            ) or (
-                isinstance(meter, FMeasure)
-                and (best_confidence_threshold := getattr(meter, "best_confidence_threshold", None))
-            ):
-                self.hparams["best_confidence_threshold"] = best_confidence_threshold
+            # NOTE: Only update best_confidence_threshold when we achieve a NEW best F1 score
+            fmeasure = None
+            if isinstance(meter, MetricCollection) and (fmeasure := getattr(meter, "FMeasure", None)):
+                pass  # fmeasure is set
+            elif isinstance(meter, FMeasure):
+                fmeasure = meter
 
-            return retval
+            if fmeasure is not None and hasattr(fmeasure, "best_confidence_threshold"):
+                self.hparams["best_confidence_threshold"] = fmeasure.best_confidence_threshold
 
         if key == "test":
             # NOTE: Test metric logging should use `best_confidence_threshold` found previously.
@@ -301,9 +368,37 @@ def _log_metrics(self, meter: Metric, key: Literal["val", "test"], **compute_kwargs) -> None:
                 {"best_confidence_threshold": best_confidence_threshold} if best_confidence_threshold else {}
             )
 
-            return super()._log_metrics(meter, key, **compute_kwargs)
-
-        raise ValueError(key)
+            super()._log_metrics(meter, key, **compute_kwargs)
+
+    def _filter_outputs_by_threshold(self, outputs: OTXPredBatch) -> OTXPredBatch:
+        scores = []
+        bboxes = []
+        labels = []
+        masks = []
+        polygons = []
+
+        for i in range(len(outputs.imgs_info)):  # type: ignore[arg-type]
+            _scores = outputs.scores[i] if outputs.scores is not None else None
+            _bboxes = outputs.bboxes[i] if outputs.bboxes is not None else None
+            _masks = outputs.masks[i] if outputs.masks is not None else None
+            _polygons = outputs.polygons[i] if outputs.polygons is not None else None
+            _labels = outputs.labels[i] if outputs.labels is not None else None
+
+            filtered_idx = torch.where(_scores > self.best_confidence_threshold)
+            scores.append(_scores[filtered_idx])
+            bboxes.append(_bboxes[filtered_idx])
+            labels.append(_labels[filtered_idx])
+            if _masks is not None and len(_masks) > 0:
+                masks.append(_masks[filtered_idx])
+            if _polygons is not None and len(_polygons) > 0:
+                polygons.append(_polygons[filtered_idx])
+
+        outputs.scores = scores
+        outputs.bboxes = bboxes
+        outputs.labels = labels
+        outputs.masks = masks
+        outputs.polygons = polygons
+        return outputs
 
     def _convert_pred_entity_to_compute_metric(
         self,
```
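For reference, the per-image filtering added in `_filter_outputs_by_threshold` is plain tensor indexing; a standalone PyTorch sketch with made-up scores and an assumed 0.5 threshold:

```python
import torch

best_confidence_threshold = 0.5  # assumed for the example
scores = torch.tensor([0.9, 0.3, 0.7])
bboxes = torch.tensor([[0.0, 0.0, 10.0, 10.0], [5.0, 5.0, 8.0, 8.0], [2.0, 2.0, 6.0, 6.0]])
labels = torch.tensor([1, 0, 2])

# torch.where with a single argument returns the indices of True entries.
keep = torch.where(scores > best_confidence_threshold)
print(scores[keep])  # tensor([0.9000, 0.7000])
print(bboxes[keep])  # the two boxes whose scores passed the threshold
print(labels[keep])  # tensor([1, 2])
```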

src/otx/backend/native/utils/utils.py

Lines changed: 1 addition & 1 deletion
```diff
@@ -85,7 +85,7 @@ def mock_modules_for_chkpt() -> Iterator[None]:
     sys.modules["otx.core.types"] = otx.types
     sys.modules["otx.core.types.task"] = otx.types.task
     sys.modules["otx.core.types.label"] = otx.types.label
-    sys.modules["otx.core.model"] = otx.backend.native.models
+    sys.modules["otx.core.model"] = otx.backend.native.models  # type: ignore[attr-defined]
     sys.modules["otx.core.metrics"] = otx.metrics
 
     yield
```
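The aliasing in `mock_modules_for_chkpt` exists so checkpoints pickled against the old `otx.core.*` paths resolve against the relocated packages. A standalone sketch of the same pattern; `legacy_pkg` and `MARKER` are invented for illustration:

```python
import importlib
import sys
import types

# Register a module object under a legacy import path so anything that
# imports (or unpickles against) the old path resolves to the new code.
relocated = types.ModuleType("relocated")
relocated.MARKER = "resolved from the new location"

sys.modules["legacy_pkg"] = types.ModuleType("legacy_pkg")
sys.modules["legacy_pkg.model"] = relocated

mod = importlib.import_module("legacy_pkg.model")
print(mod.MARKER)  # -> "resolved from the new location"
```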
