Skip to content

Commit 7744c89

Browse files
authored
Relieve memory usage criteria on batch size 2 during adaptive_bs (#4009)
* relieve memory usage criteria on batch size 2 during adaptive_bs * update unit test * update unit test
1 parent 758ea97 commit 7744c89

File tree

2 files changed

+32
-6
lines changed

2 files changed

+32
-6
lines changed

src/otx/engine/adaptive_bs/bs_search_algo.py

Lines changed: 16 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -112,8 +112,14 @@ def auto_decrease_batch_size(self) -> int:
112112
break
113113

114114
if available_bs == 0:
115-
msg = "Current device can't train model even with 2."
116-
raise RuntimeError(msg)
115+
if oom:
116+
msg = "Current device can't train model even with 2."
117+
raise RuntimeError(msg)
118+
logger.warning(
119+
"Even with a batch size of 2, most of the memory is used, "
120+
"which could cause the training to fail midway.",
121+
)
122+
available_bs = 2
117123

118124
return available_bs
119125

@@ -141,8 +147,14 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
141147
if oom or bs_mem_usage > self._mem_upper_bound:
142148
self._default_bs -= 2
143149
if self._default_bs <= 0:
144-
msg = "Current device can't train model even with 2."
145-
raise RuntimeError(msg)
150+
if oom:
151+
msg = "Current device can't train model even with 2."
152+
raise RuntimeError(msg)
153+
logger.warning(
154+
"Even with a batch size of 2, most of the memory is used, "
155+
"which could cause the training to fail midway.",
156+
)
157+
return 2
146158

147159
return self.auto_decrease_batch_size()
148160

tests/unit/engine/adaptive_bs/test_bs_search_algo.py

Lines changed: 16 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -99,12 +99,19 @@ def test_auto_decrease_batch_size(self):
9999
assert adapted_bs == 80
100100

101101
def test_find_max_usable_bs_gpu_memory_too_small(self):
102-
mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1)
102+
mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)
103103

104104
bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
105105
with pytest.raises(RuntimeError):
106106
bs_search_algo.auto_decrease_batch_size()
107107

108+
def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self):
109+
"""Batch size 2 doesn't make oom but use most of memory."""
110+
mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1)
111+
112+
bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
113+
assert bs_search_algo.auto_decrease_batch_size() == 2
114+
108115
@pytest.mark.parametrize(
109116
("max_runnable_bs", "max_bs", "expected_bs"),
110117
[
@@ -126,12 +133,19 @@ def test_find_big_enough_batch_size(self, max_runnable_bs, max_bs, expected_bs):
126133
assert adapted_bs == expected_bs
127134

128135
def test_find_big_enough_batch_size_gpu_memory_too_small(self):
129-
mock_train_func = self.get_mock_train_func(cuda_oom_bound=4, max_runnable_bs=1)
136+
mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)
130137

131138
bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
132139
with pytest.raises(RuntimeError):
133140
bs_search_algo.find_big_enough_batch_size()
134141

142+
def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self):
143+
"""Batch size 2 doesn't make oom but use most of memory."""
144+
mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1)
145+
146+
bs_search_algo = BsSearchAlgo(mock_train_func, 2, 1000)
147+
assert bs_search_algo.find_big_enough_batch_size() == 2
148+
135149
def test_find_big_enough_batch_size_gradient_zero(self):
136150
def mock_train_func(batch_size) -> int:
137151
if batch_size > 1000:

0 commit comments

Comments
 (0)