
Commit 464b45a

🔧 Fix auto batch size handling with tiling and improve error logging (#4233)
* 🔧 Fix auto batch size handling with tiling and improve error logging
* Update CHANGELOG to include link for auto batch size fix with tiling
* fix linter

Co-authored-by: Prokofiev Kirill <[email protected]>
1 parent: bccea76

File tree

4 files changed: +50 −7 lines

- CHANGELOG.md
- src/otx/core/utils/tile_merge.py
- src/otx/engine/adaptive_bs/bs_search_algo.py
- tests/unit/engine/adaptive_bs/test_bs_search_algo.py

CHANGELOG.md

Lines changed: 3 additions & 0 deletions
```diff
@@ -10,6 +10,9 @@ All notable changes to this project will be documented in this file.
 
 ### Bug fixes
 
+- Fix auto batch size with tiling
+  (<https://github.com/openvinotoolkit/training_extensions/pull/4233>)
+
 - Fix exportable code for tiling
   (<https://github.com/openvinotoolkit/training_extensions/pull/4234>)
 - Don't filter empty label from kp arrow
```

src/otx/core/utils/tile_merge.py

Lines changed: 38 additions & 1 deletion
```diff
@@ -7,10 +7,12 @@
 
 from abc import abstractmethod
 from collections import defaultdict
+from typing import Callable
 
 import cv2
 import numpy as np
 import torch
+from packaging import version
 from torchvision import tv_tensors
 from torchvision.ops import batched_nms
 
@@ -21,6 +23,38 @@
 from otx.core.data.entity.instance_segmentation import InstanceSegBatchPredEntity, InstanceSegPredEntity
 from otx.core.data.entity.segmentation import SegBatchPredEntity, SegPredEntity
 
+# Maximum number of elements: 2**31 - 1
+MAX_ELEMENTS: int = np.iinfo(np.int32).max
+
+
+# NOTE: RuntimeError: nonzero is not supported for tensors with more than INT_MAX elements.
+# See https://github.com/pytorch/pytorch/issues/51871
+int_max_check_condition: Callable[[torch.Tensor], bool] = (
+    lambda tile_masks: version.parse(torch.__version__) < version.parse("2.6")
+    and torch.numel(tile_masks) > MAX_ELEMENTS
+)
+
+
+def keep_chunkify(tensor: torch.Tensor, max_element: int = MAX_ELEMENTS) -> torch.Tensor:
+    """Splits tensor into chunks and processes each chunk separately.
+
+    Args:
+        tensor (torch.Tensor): Input tensor of shape (B, H, W).
+
+    Returns:
+        torch.Tensor: Boolean mask of shape (B,) indicating nonzero sum.
+    """
+    _, h, w = tensor.shape
+    max_batch_size = int(max_element) // (h * w)
+    chunk_size = max(1, min(max_batch_size, tensor.shape[0]))
+
+    keep_indices = []
+    for i in range(0, tensor.shape[0], chunk_size):
+        chunk = tensor[i : i + chunk_size]
+        keep_indices.append(chunk.sum(dim=(1, 2)) > 0)  # per-chunk keep mask
+
+    return torch.cat(keep_indices, dim=0)
+
 
 class TileMerge:
     """Base class for tile merge.
@@ -332,7 +366,10 @@ def merge(
             feature_vectors,
             strict=True,
         ):
-            keep_indices = tile_masks.to_sparse().sum((1, 2)).to_dense() > 0
+            if int_max_check_condition(tile_masks):
+                keep_indices = keep_chunkify(tile_masks)
+            else:
+                keep_indices = tile_masks.to_sparse().sum((1, 2)).to_dense() > 0
             keep_indices = keep_indices.nonzero(as_tuple=True)[0]
             _bboxes = tile_bboxes[keep_indices]
             _labels = tile_labels[keep_indices]
```
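
As a quick sanity check of the new chunked path (illustrative only, not part of the commit): the tiny shapes and the artificially small `max_element` below are made up to force several chunks, and the result is compared against the original sparse-sum computation.

```python
import torch

from otx.core.utils.tile_merge import keep_chunkify  # helper added in this commit

# Tiny (B, H, W) stack of tile masks: tiles 0 and 2 contain foreground pixels.
tile_masks = torch.zeros(3, 4, 4)
tile_masks[0, 1, 1] = 1
tile_masks[2, 0, 3] = 1

# An artificially small max_element forces chunk_size = 1 (three chunks); the
# chunked result should agree with the original sparse-sum path.
chunked = keep_chunkify(tile_masks, max_element=16)
dense = tile_masks.to_sparse().sum((1, 2)).to_dense() > 0
assert torch.equal(chunked, dense)  # tensor([True, False, True])
```

Chunking by `max_element // (h * w)` keeps each chunk under the INT_MAX element limit, so the pre-2.6 `nonzero` restriction is never hit while the per-tile keep decision stays identical.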

src/otx/engine/adaptive_bs/bs_search_algo.py

Lines changed: 7 additions & 2 deletions
```diff
@@ -113,8 +113,13 @@ def auto_decrease_batch_size(self) -> int:
 
         if available_bs == 0:
             if oom:
-                msg = "Current device can't train model even with 2."
-                raise RuntimeError(msg)
+                logger.warning(
+                    "The auto batch size algorithm attempted to use a batch size of 2 but still "
+                    "encountered a CUDA OOM error. OTX will proceed with training at batch size 2; "
+                    "however, you will likely encounter a CUDA OOM error once training starts. "
+                    "If the issue persists, please report it accordingly.",
+                )
+                return 2
             logger.warning(
                 "Even with a batch size of 2, most of the memory is used, "
                 "which could cause the training to fail midway.",
```

tests/unit/engine/adaptive_bs/test_bs_search_algo.py

Lines changed: 2 additions & 4 deletions
```diff
@@ -106,8 +106,7 @@ def test_find_max_usable_bs_gpu_memory_too_small(self):
         mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)
 
         bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
-        with pytest.raises(RuntimeError):
-            bs_search_algo.auto_decrease_batch_size()
+        assert bs_search_algo.auto_decrease_batch_size() == 2
 
     def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self):
         """Batch size 2 doesn't make oom but use most of memory."""
@@ -140,8 +139,7 @@ def test_find_big_enough_batch_size_gpu_memory_too_small(self):
         mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)
 
         bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
-        with pytest.raises(RuntimeError):
-            bs_search_algo.find_big_enough_batch_size()
+        assert bs_search_algo.find_big_enough_batch_size() == 2
 
     def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self):
         """Batch size 2 doesn't make oom but use most of memory."""
```
