Commit e8d7149

kprokofi and leoll2 authored
Fix OOM bug on XPU (#4872)
Co-authored-by: Leonardo Lai <[email protected]>
1 parent 236f32e commit e8d7149

7 files changed: 42 additions, 31 deletions

lib/src/otx/backend/native/callbacks/batchsize_finder.py (5 additions & 4 deletions)
@@ -27,7 +27,7 @@ class BatchSizeFinder(Callback):

     def __init__(
         self,
-        steps_per_trial: int = 3,
+        steps_per_trial: int = 5,
     ) -> None:
         self._steps_per_trial = steps_per_trial

@@ -52,11 +52,12 @@ def _try_loop_run(trainer: Trainer) -> None:
     loop.run()


-def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int) -> None:
+def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int, max_epochs: int = 1) -> None:
     trainer.logger = DummyLogger() if trainer.logger is not None else None
     trainer.callbacks = []
-    # For XPU devices 1 epoch sometimes is not enough to catch an error
-    max_epochs = 2 if is_xpu_available() else 1
+    # For XPU devices 1 epoch sometimes is not enough to catch an error.
+    # Emperically enlarge this to 15 iterations (steps_per_trial * epochs)
+    max_epochs = 3 if is_xpu_available() else 1

     loop = trainer._active_loop  # noqa: SLF001
     if loop is None:
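
For context, a minimal sketch (not part of the commit) of the trial-length arithmetic the new defaults aim for, assuming is_xpu_available() behaves like the helper imported in batchsize_finder.py; the function name below is hypothetical:

def trial_iterations(steps_per_trial: int = 5, xpu_available: bool = False) -> int:
    """Return how many optimization steps one batch-size trial runs (sketch only)."""
    # XPU sometimes needs more than one short epoch to surface an OOM,
    # so the trial is stretched to steps_per_trial * 3 = 15 iterations there.
    max_epochs = 3 if xpu_available else 1
    return steps_per_trial * max_epochs


assert trial_iterations(xpu_available=True) == 15   # XPU: 5 steps x 3 epochs
assert trial_iterations(xpu_available=False) == 5   # other devices: 5 steps x 1 epoch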

lib/src/otx/backend/native/tools/adaptive_bs/algorithm.py (13 additions & 10 deletions)
@@ -47,8 +47,8 @@ def __init__(
         self._max_bs = max_bs
         self._bs_try_history: dict[int, int] = {}
         self._total_mem = _get_total_memory_size()
-        self._mem_lower_bound = 0.8 * self._total_mem
-        self._mem_upper_bound = 0.85 * self._total_mem
+        self._mem_lower_bound = 0.75 * self._total_mem
+        self._mem_upper_bound = 0.9 * self._total_mem
         self._mp_ctx = mp.get_context("spawn")

     def _try_batch_size(self, bs: int) -> tuple[bool, int]:
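
To illustrate the widened acceptance window (a sketch under the assumption that peak memory is compared against these bounds, not the actual BsSearchAlgo code): a trial batch size is now considered a good fit when peak memory lands between 75% and 90% of total device memory, rather than the previous 80% to 85% band.

def within_memory_window(peak_mem: float, total_mem: float) -> bool:
    """Hypothetical helper: accept a batch size whose peak memory sits in the new band."""
    lower_bound = 0.75 * total_mem   # was 0.8
    upper_bound = 0.9 * total_mem    # was 0.85
    return lower_bound <= peak_mem <= upper_bound


assert within_memory_window(8_000, 10_000)        # 80% of memory: inside the band
assert not within_memory_window(9_500, 10_000)    # 95%: too close to OOM
assert not within_memory_window(5_000, 10_000)    # 50%: batch size can still grow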
@@ -115,16 +115,16 @@ def auto_decrease_batch_size(self) -> int:
             if oom:
                 logger.warning(
                     "The auto batch size algorithm attempted to use a batch size of 2 but still "
-                    "encountered a CUDA OOM error. OTX will proceed with training at batch size 2; "
-                    "however, you will likely encounter a CUDA OOM error once training starts. "
-                    "If the issue persists, please report it accordingly.",
+                    "encountered a CUDA OOM error. OTX will proceed with training at batch size 1; "
+                    "however, it is also possible to encounter a CUDA OOM error during training.",
                 )
-                return 2
+                return 1
             logger.warning(
                 "Even with a batch size of 2, most of the memory is used, "
-                "which could cause the training to fail midway.",
+                "which could cause the training to fail midway."
+                "For safety reasons, decrease bs to 1.",
             )
-            available_bs = 2
+            available_bs = 1

         return available_bs

@@ -157,9 +157,10 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
                 raise RuntimeError(msg)
             logger.warning(
                 "Even with a batch size of 2, most of the memory is used, "
-                "which could cause the training to fail midway.",
+                "which could cause the training to fail midway."
+                "For safety reasons, decrease bs to 1.",
             )
-            return 2
+            return 1

         return self.auto_decrease_batch_size()
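
Both hunks above change the fallback for the tightest devices: when even a batch size of 2 either triggers OOM or consumes most of the memory, the search now settles on 1 instead of 2. A hedged sketch of that decision with made-up names; the real logic lives in BsSearchAlgo:

def fallback_batch_size(oom_at_bs2: bool, bs2_uses_most_memory: bool) -> int:
    """Illustrative only: pick the safety floor when batch size 2 is already too big."""
    if oom_at_bs2 or bs2_uses_most_memory:
        return 1  # previously 2
    return 2


assert fallback_batch_size(oom_at_bs2=True, bs2_uses_most_memory=False) == 1
assert fallback_batch_size(oom_at_bs2=False, bs2_uses_most_memory=True) == 1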

@@ -270,6 +271,8 @@ def _run_trial(train_func: Callable[[int], Any], bs: int, trial_queue: mp.Queue)
             or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in str(e)
             or "UR error" in str(e)
             or "UR_RESULT_ERROR_UNKNOWN" in str(e)
+            or "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY" in str(e)
+            or "UR_RESULT_ERROR" in str(e)
         ):  # XPU OOM
             oom = True
         else:
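
The last hunk broadens the set of runtime-error messages that are treated as XPU out-of-memory. A minimal sketch of that substring check using the marker list from the diff; the helper name and tuple are illustrative, not OTX API:

XPU_OOM_MARKERS = (
    "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY",
    "UR error",
    "UR_RESULT_ERROR_UNKNOWN",
    "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY",
    "UR_RESULT_ERROR",  # broad catch-all added by this commit
)


def looks_like_xpu_oom(error: RuntimeError) -> bool:
    """Classify a RuntimeError as XPU OOM if its message contains any known UR marker."""
    return any(marker in str(error) for marker in XPU_OOM_MARKERS)


assert looks_like_xpu_oom(RuntimeError("UR_RESULT_ERROR_OUT_OF_HOST_MEMORY on device"))
assert not looks_like_xpu_oom(RuntimeError("shape mismatch in forward pass"))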

lib/src/otx/backend/native/tools/adaptive_bs/runner.py (0 additions & 4 deletions)
@@ -114,10 +114,6 @@ def _register_callback(callbacks: list[Callback] | Callback | None = None) -> li

 def _apply_new_batch_size(engine: OTXEngine, new_batch_size: int) -> None:
     origin_bs = engine.datamodule.train_subset.batch_size
-    if is_xpu_available() and new_batch_size != 1:
-        new_batch_size -= 1  # for safety reasons
-    if new_batch_size == origin_bs:
-        return
     engine.datamodule.train_subset.batch_size = new_batch_size
     engine.datamodule.val_subset.batch_size = new_batch_size
     engine.datamodule.test_subset.batch_size = new_batch_size
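
With the XPU-specific decrement removed, the batch size found by the search is applied to the datamodule as-is. A before/after sketch; the function and flag names are hypothetical and only mirror the deleted branch:

def applied_batch_size(found_bs: int, xpu: bool, legacy_behavior: bool) -> int:
    """Sketch: what ends up in the train/val/test subsets for a given found batch size."""
    if legacy_behavior and xpu and found_bs != 1:
        return found_bs - 1  # the old "for safety reasons" decrement
    return found_bs


assert applied_batch_size(8, xpu=True, legacy_behavior=True) == 7   # before this commit
assert applied_batch_size(8, xpu=True, legacy_behavior=False) == 8  # after this commit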

lib/src/otx/tools/converter.py (10 additions & 0 deletions)
@@ -272,6 +272,15 @@ def update_num_iters(param_value: int | None, config: dict) -> None:
     config["max_epochs"] = param_value


+def update_batch_size(param_value: int | None, config: dict) -> None:
+    """Update batch size in the config."""
+    if param_value is None:
+        logging.info("Batch size is not provided, skipping update.")
+        return
+    config["data"]["train_subset"]["batch_size"] = param_value
+    config["data"]["val_subset"]["batch_size"] = param_value
+
+
 def update_early_stopping(early_stopping_cfg: dict | None, config: dict) -> None:
     """Update early stopping parameters in the config."""
     if early_stopping_cfg is None:
@@ -483,6 +492,7 @@ def _update_params(config: dict, param_dict: dict) -> None:
     update_tiling(tiling, config)
     update_augmentations(augmentation_params, config)
     update_learning_rate(training_parameters.get("learning_rate", None), config)
+    update_batch_size(training_parameters.get("batch_size", None), config)
     update_num_iters(training_parameters.get("max_epochs", None), config)
     update_early_stopping(training_parameters.get("early_stopping", None), config)
     update_input_size(
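
A short usage sketch of the new update_batch_size helper added above, run against a toy config dict that only contains the keys the function touches; everything outside those keys is made up:

import logging


def update_batch_size(param_value, config):
    """Update batch size in the config (same shape as the helper added in this commit)."""
    if param_value is None:
        logging.info("Batch size is not provided, skipping update.")
        return
    config["data"]["train_subset"]["batch_size"] = param_value
    config["data"]["val_subset"]["batch_size"] = param_value


config = {"data": {"train_subset": {"batch_size": 8}, "val_subset": {"batch_size": 8}}}
update_batch_size(4, config)
assert config["data"]["train_subset"]["batch_size"] == 4
assert config["data"]["val_subset"]["batch_size"] == 4
update_batch_size(None, config)  # no value provided: logs and leaves the config unchanged
assert config["data"]["train_subset"]["batch_size"] == 4

Note that the helper only touches the train and val subsets, which is consistent with the converter tests below still expecting a test-subset batch size of 8.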

lib/tests/assets/geti/model_configs/detection.yaml (1 addition & 0 deletions)
@@ -70,6 +70,7 @@ hyperparameters:
       enable: true
       patience: 10
     learning_rate: 0.001
+    batch_size: 4
     input_size_width: 800
     input_size_height: 992
   evaluation:

lib/tests/unit/backend/native/tools/adaptive_bs/test_bs_search_algo.py (9 additions & 9 deletions)
@@ -68,9 +68,9 @@ def mock_train_func(batch_size) -> int:
                msg = "CUDA out of memory."
                raise RuntimeError(msg)
            if batch_size > max_runnable_bs:
-                mem_usage = 8500 + 1500 * batch_size / (cuda_oom_bound - max_runnable_bs)
+                mem_usage = 9000 + 1500 * batch_size / (cuda_oom_bound - max_runnable_bs)
            else:
-                mem_usage = 8500 * batch_size / max_runnable_bs
+                mem_usage = 9000 * batch_size / max_runnable_bs

            self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage
            return mem_usage
@@ -110,14 +110,14 @@ def test_find_max_usable_bs_gpu_memory_too_small(self):
        mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)

        bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
-        assert bs_search_algo.auto_decrease_batch_size() == 2
+        assert bs_search_algo.auto_decrease_batch_size() == 1

    def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self):
        """Batch size 2 doesn't make oom but use most of memory."""
        mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1)

        bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
-        assert bs_search_algo.auto_decrease_batch_size() == 2
+        assert bs_search_algo.auto_decrease_batch_size() == 1

    @pytest.mark.parametrize(
        ("max_runnable_bs", "max_bs", "expected_bs"),
@@ -135,22 +135,22 @@ def test_find_big_enough_batch_size(self, max_runnable_bs, max_bs, expected_bs):
        adapted_bs = bs_search_algo.find_big_enough_batch_size()

        if expected_bs is None:
-            assert 7500 <= mock_train_func(adapted_bs) <= 8500
+            assert 7500 <= mock_train_func(adapted_bs) <= 9000
        else:
            assert adapted_bs == expected_bs

    def test_find_big_enough_batch_size_gpu_memory_too_small(self):
        mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1)

        bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000)
-        assert bs_search_algo.find_big_enough_batch_size() == 2
+        assert bs_search_algo.find_big_enough_batch_size() == 1

    def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self):
        """Batch size 2 doesn't make oom but use most of memory."""
        mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1)

        bs_search_algo = BsSearchAlgo(mock_train_func, 2, 1000)
-        assert bs_search_algo.find_big_enough_batch_size() == 2
+        assert bs_search_algo.find_big_enough_batch_size() == 1

    def test_find_big_enough_batch_size_gradient_zero(self):
        def mock_train_func(batch_size) -> int:
@@ -167,7 +167,7 @@ def mock_train_func(batch_size) -> int:
        bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000)
        adapted_bs = bs_search_algo.find_big_enough_batch_size()

-        assert adapted_bs == 100
+        assert adapted_bs == 102

    def test_find_big_enough_batch_size_not_exceed_upper_bound(self):
        def mock_train_func(batch_size) -> int:
@@ -184,7 +184,7 @@ def mock_train_func(batch_size) -> int:
        bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000)
        adapted_bs = bs_search_algo.find_big_enough_batch_size()

-        assert mock_train_func(adapted_bs) <= 8500
+        assert mock_train_func(adapted_bs) <= 9000

    def test_find_big_enough_batch_size_drop_last(self):
        mock_train_func = self.get_mock_train_func(cuda_oom_bound=10000, max_runnable_bs=180)
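
The test updates track the new thresholds: the mocked memory model now scales to 9000 units at the largest runnable batch size, and the expected fallback is 1. Below is a standalone, simplified stand-in for the test's mock (the parameter values are illustrative, and the real mock also records memory via the patched torch module):

def make_mock_train_func(cuda_oom_bound: int, max_runnable_bs: int):
    """Return a fake train function that reports peak memory and OOMs past a bound."""
    def mock_train_func(batch_size: int) -> float:
        if batch_size >= cuda_oom_bound:
            raise RuntimeError("CUDA out of memory.")
        if batch_size > max_runnable_bs:
            return 9000 + 1500 * batch_size / (cuda_oom_bound - max_runnable_bs)
        return 9000 * batch_size / max_runnable_bs

    return mock_train_func


train = make_mock_train_func(cuda_oom_bound=10, max_runnable_bs=5)
assert train(5) == 9000   # largest runnable batch size hits the new 9000 scale
assert train(2) == 3600   # smaller batches scale linearly below it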

lib/tests/unit/tools/test_converter.py (4 additions & 4 deletions)
@@ -15,8 +15,8 @@ def test_convert(self):
        config = GetiConfigConverter.convert(asdict(otx_config))

        assert config["data"]["input_size"] == (992, 800)
-        assert config["data"]["train_subset"]["batch_size"] == 8
-        assert config["data"]["val_subset"]["batch_size"] == 8
+        assert config["data"]["train_subset"]["batch_size"] == 4
+        assert config["data"]["val_subset"]["batch_size"] == 4
        assert config["data"]["test_subset"]["batch_size"] == 8
        assert config["model"]["init_args"]["optimizer"]["init_args"]["lr"] == 0.001
        assert config["max_epochs"] == 100
@@ -266,8 +266,8 @@ def test_instantiate(self, tmp_path):
        assert engine.work_dir == tmp_path

        assert engine.datamodule.data_root == data_root
-        assert engine.datamodule.train_subset.batch_size == 8
-        assert engine.datamodule.val_subset.batch_size == 8
+        assert engine.datamodule.train_subset.batch_size == 4
+        assert engine.datamodule.val_subset.batch_size == 4
        assert engine.datamodule.test_subset.batch_size == 8
        assert engine.datamodule.train_subset.num_workers == 2
        assert engine.datamodule.val_subset.num_workers == 2
