
Commit 9ee2e33

Fix XPU training and optimization from Geti2.5 (#4486)
* apply fix to run XPU, change from_config
* fix typing
* add example
* fix XAI test
* fix lint
* fix auto batch size for XPU
* return max_epochs for atss
* add kwargs override for OTXEngine.from_config()
* use cache instead
* return train kwargs back
* minor fixes
* reply to comments
1 parent ebc78a1 commit 9ee2e33

11 files changed: +45, −35 lines


pyproject.toml

Lines changed: 1 addition & 0 deletions
@@ -71,6 +71,7 @@ cuda = ["torch==2.7.0"]
 xpu = [
     "torch==2.7.0+xpu",
     "pytorch-triton-xpu==3.3.0",
+    "torchvision==0.22.0+xpu"
 ]

 docs = [
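
A quick way to verify the pinned XPU wheels after installing the `xpu` extra is PyTorch's built-in XPU query; this snippet is illustrative and not part of the commit:

import torch

# Expect a "+xpu" build string when the xpu extra resolved correctly.
print(torch.__version__)
# True only when an Intel GPU and its drivers are visible to PyTorch.
print(torch.xpu.is_available())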

src/otx/backend/native/__init__.py

Lines changed: 7 additions & 0 deletions
@@ -2,3 +2,10 @@
 # SPDX-License-Identifier: Apache-2.0

 """Native backend."""
+
+from .lightning import accelerators, strategies
+
+__all__ = [
+    "accelerators",
+    "strategies",
+]
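
Re-exporting these submodules makes their Lightning registrations load as a side effect of importing the backend package, presumably so the XPU accelerator and strategy are available without an explicit import. Downstream code can now rely on:

# Importing the backend package also imports the Lightning pieces
# (side effect introduced by the __init__ change above).
from otx.backend.native import accelerators, strategies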

src/otx/backend/native/models/detection/heads/atss_head.py

Lines changed: 4 additions & 5 deletions
@@ -25,8 +25,7 @@
 )
 from otx.backend.native.models.detection.utils.prior_generators.utils import anchor_inside_flags
 from otx.backend.native.models.detection.utils.utils import unmap
-from otx.backend.native.models.modules.conv_module import Conv2dModule
-from otx.backend.native.models.modules.norm import build_norm_layer
+from otx.backend.native.models.modules import Conv2dModule, PatchedConv2d, build_norm_layer
 from otx.backend.native.models.modules.scale import Scale
 from otx.backend.native.models.utils.utils import InstanceData
 from otx.data.entity.torch import OTXDataBatch

@@ -123,19 +122,19 @@ def _init_layers(self) -> None:
             ),
         )
         pred_pad_size = self.pred_kernel_size // 2
-        self.atss_cls = nn.Conv2d(
+        self.atss_cls = PatchedConv2d(
             self.feat_channels,
             self.num_anchors * self.cls_out_channels,
             self.pred_kernel_size,
             padding=pred_pad_size,
         )
-        self.atss_reg = nn.Conv2d(
+        self.atss_reg = PatchedConv2d(
             self.feat_channels,
             self.num_base_priors * 4,
             self.pred_kernel_size,
             padding=pred_pad_size,
         )
-        self.atss_centerness = nn.Conv2d(
+        self.atss_centerness = PatchedConv2d(
             self.feat_channels,
             self.num_base_priors * 1,
             self.pred_kernel_size,

src/otx/backend/native/models/detection/heads/rtmdet_head.py

Lines changed: 7 additions & 7 deletions
@@ -28,7 +28,7 @@
     unmap,
 )
 from otx.backend.native.models.modules import build_activation_layer
-from otx.backend.native.models.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule
+from otx.backend.native.models.modules.conv_module import Conv2dModule, DepthwiseSeparableConvModule, PatchedConv2d
 from otx.backend.native.models.modules.norm import build_norm_layer, is_norm
 from otx.backend.native.models.modules.scale import Scale
 from otx.backend.native.models.utils.utils import InstanceData

@@ -91,20 +91,20 @@ def _init_layers(self) -> None:
             ),
         )
         pred_pad_size = self.pred_kernel_size // 2
-        self.rtm_cls = nn.Conv2d(
+        self.rtm_cls = PatchedConv2d(
             self.feat_channels,
             self.num_base_priors * self.cls_out_channels,
             self.pred_kernel_size,
             padding=pred_pad_size,
         )
-        self.rtm_reg = nn.Conv2d(
+        self.rtm_reg = PatchedConv2d(
             self.feat_channels,
             self.num_base_priors * 4,
             self.pred_kernel_size,
             padding=pred_pad_size,
         )
         if self.with_objectness:
-            self.rtm_obj = nn.Conv2d(self.feat_channels, 1, self.pred_kernel_size, padding=pred_pad_size)
+            self.rtm_obj = PatchedConv2d(self.feat_channels, 1, self.pred_kernel_size, padding=pred_pad_size)

         self.scales = nn.ModuleList([Scale(1.0) for _ in self.prior_generator.strides])

@@ -641,15 +641,15 @@ def _init_layers(self) -> None:
             self.reg_convs.append(reg_convs)

             self.rtm_cls.append(
-                nn.Conv2d(
+                PatchedConv2d(
                     self.feat_channels,
                     self.num_base_priors * self.cls_out_channels,
                     self.pred_kernel_size,
                     padding=self.pred_kernel_size // 2,
                 ),
             )
             self.rtm_reg.append(
-                nn.Conv2d(
+                PatchedConv2d(
                     self.feat_channels,
                     self.num_base_priors * 4,
                     self.pred_kernel_size,

@@ -658,7 +658,7 @@ def _init_layers(self) -> None:
             )
             if self.with_objectness:
                 self.rtm_obj.append(
-                    nn.Conv2d(self.feat_channels, 1, self.pred_kernel_size, padding=self.pred_kernel_size // 2),
+                    PatchedConv2d(self.feat_channels, 1, self.pred_kernel_size, padding=self.pred_kernel_size // 2),
                 )

         if self.share_conv:

src/otx/backend/native/models/modules/__init__.py

Lines changed: 2 additions & 1 deletion
@@ -4,7 +4,7 @@
 """Common module implementations."""

 from .activation import build_activation_layer
-from .conv_module import Conv2dModule, Conv3dModule, DepthwiseSeparableConvModule
+from .conv_module import Conv2dModule, Conv3dModule, DepthwiseSeparableConvModule, PatchedConv2d
 from .norm import FrozenBatchNorm2d, build_norm_layer
 from .padding import build_padding_layer

@@ -16,4 +16,5 @@
     "Conv3dModule",
     "DepthwiseSeparableConvModule",
     "FrozenBatchNorm2d",
+    "PatchedConv2d",
 ]

src/otx/backend/native/models/modules/conv_module.py

Lines changed: 4 additions & 2 deletions
@@ -383,7 +383,9 @@ def forward(self, x: Tensor) -> Tensor:
         x = super().forward(x)

         # Apply the fix to the output gradient of Conv2d.
-        return _patch_grad(x)
+        if is_xpu_available():
+            return _patch_grad(x)
+        return x


 class Conv2dModule(ConvModule):

@@ -392,7 +394,7 @@ class Conv2dModule(ConvModule):
     # Use the patched Conv2d if XPU is available.
     # This is to avoid issues with XPU performance.
     # TODO(kprokofi): Remove this when XPU performance is fixed.
-    _conv_nd = PatchedConv2d if is_xpu_available() else nn.Conv2d
+    _conv_nd = PatchedConv2d


 class Conv3dModule(ConvModule):
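
For readers without the full file: the two hunks converge on gating the gradient patch at runtime inside `forward` instead of choosing the class at import time, so CUDA/CPU runs go through the same `PatchedConv2d` class but skip the patch. A minimal sketch of that pattern follows; the real `_patch_grad` is defined elsewhere in conv_module.py, and the hook body here (forcing a contiguous gradient) is only an assumption about what the workaround might do:

import torch
from torch import nn


def is_xpu_available() -> bool:
    # Stand-in for the helper the diff calls.
    return hasattr(torch, "xpu") and torch.xpu.is_available()


def _patch_grad(x: torch.Tensor) -> torch.Tensor:
    # ASSUMPTION: the real patch rewrites the incoming gradient; a
    # contiguous-layout hook is shown purely as a plausible stand-in.
    if x.requires_grad:
        x.register_hook(lambda grad: grad.contiguous())
    return x


class PatchedConv2d(nn.Conv2d):
    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = super().forward(x)
        # Gate at runtime: CUDA/CPU outputs are returned untouched.
        if is_xpu_available():
            return _patch_grad(x)
        return x

Making `_conv_nd` unconditionally `PatchedConv2d` keeps module types identical across devices, which avoids import-time divergence between XPU and non-XPU hosts.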

src/otx/backend/native/tools/adaptive_bs/algorithm.py

Lines changed: 9 additions & 5 deletions
@@ -99,7 +99,6 @@ def auto_decrease_batch_size(self) -> int:

         while True:
             oom, max_memory_reserved = self._try_batch_size(current_bs)
-
             # If memory usage is too close to limit, OOM can be raised during training
             if oom or max_memory_reserved > self._mem_upper_bound:
                 if current_bs < lowest_unavailable_bs:

@@ -258,14 +257,19 @@ def check_bs_suitable(estimated_bs: int) -> bool:

 def _run_trial(train_func: Callable[[int], Any], bs: int, trial_queue: mp.Queue) -> None:
     mp.set_start_method(None, True)  # reset mp start method
-
     oom = False
     try:
         train_func(bs)
     except RuntimeError as e:
-        if str(e).startswith("CUDA out of memory.") or str(e).startswith(  # CUDA OOM
-            "Allocation is out of device memory on current platform.",  # XPU OOM
-        ):
+        if (
+            str(e).startswith("CUDA out of memory.")
+            or str(e).startswith(  # CUDA OOM
+                "Allocation is out of device memory on current platform.",
+            )
+            or "XPU out of memory" in str(e)
+            or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in str(e)
+            or "UR error" in str(e)
+        ):  # XPU OOM
             oom = True
         else:
             raise
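
Restated outside the except block, the broadened check amounts to the predicate below (a hypothetical helper; the commit keeps the conditions inline). Note that the bare "UR error" substring is a wide net: any oneAPI Unified Runtime failure during a trial run is treated as OOM.

def _is_oom_error(e: RuntimeError) -> bool:
    """Classify a RuntimeError raised during a batch-size trial as OOM."""
    msg = str(e)
    return (
        msg.startswith("CUDA out of memory.")  # CUDA OOM
        or msg.startswith("Allocation is out of device memory on current platform.")
        or "XPU out of memory" in msg  # XPU OOM variants added here
        or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in msg  # Unified Runtime OOM
        or "UR error" in msg  # any other Unified Runtime failure
    )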

src/otx/backend/native/tools/adaptive_bs/runner.py

Lines changed: 5 additions & 0 deletions
@@ -96,6 +96,9 @@ def _train_model(bs: int, engine: OTXEngine, callbacks: list[Callback] | Callback
     engine._cache.update(devices=1)  # noqa: SLF001

     engine.datamodule.train_subset.batch_size = bs
+    engine.datamodule.val_subset.batch_size = bs
+    engine.datamodule.test_subset.batch_size = bs
+    train_args["adaptive_bs"] = "None"
     engine.train(callbacks=_register_callback(callbacks), **train_args)


@@ -113,4 +116,6 @@ def _apply_new_batch_size(engine: OTXEngine, new_batch_size: int) -> None:
     if new_batch_size == origin_bs:
         return
     engine.datamodule.train_subset.batch_size = new_batch_size
+    engine.datamodule.val_subset.batch_size = new_batch_size
+    engine.datamodule.test_subset.batch_size = new_batch_size
     engine.model.optimizer_callable.optimizer_kwargs["lr"] *= sqrt(new_batch_size / origin_bs)  # type: ignore[attr-defined]
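
The final line applies square-root learning-rate scaling when the search settles on a different batch size. Worked arithmetic for a concrete (hypothetical) case:

from math import sqrt

origin_bs, new_batch_size = 32, 8
lr = 1e-3
# sqrt(8 / 32) = sqrt(0.25) = 0.5, so the learning rate is halved to 5e-4.
lr *= sqrt(new_batch_size / origin_bs)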

src/otx/tools/converter.py

Lines changed: 2 additions & 0 deletions
@@ -511,6 +511,8 @@ def instantiate(
         instantiated_kwargs = engine_parser.instantiate_classes(Namespace(**config))

         train_kwargs = {k: v for k, v in instantiated_kwargs.items() if k in train_arguments}
+        # enable auto batch size for training
+        train_kwargs["adaptive_bs"] = "Safe"

         return engine, train_kwargs
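
Read together with runner.py above, `adaptive_bs` travels to `OTXEngine.train()` as an ordinary keyword: the converter defaults it to "Safe" to turn the batch-size search on, while `_train_model` pins it to "None" so the nested trial runs do not recurse into another search. A sketch of the resulting call, with `engine` and `train_kwargs` being the pair returned by `instantiate()`:

# train_kwargs now carries adaptive_bs="Safe" alongside the other
# instantiated training arguments (hypothetical call site).
engine.train(**train_kwargs)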

src/otx/types/label.py

Lines changed: 4 additions & 1 deletion
@@ -149,7 +149,10 @@ def to_json(self) -> str:
     @classmethod
     def from_json(cls, serialized: str) -> LabelInfo:
         """Reconstruct it from the JSON serialized string."""
-        return cls(**json.loads(serialized))
+        labels_info = json.loads(serialized)
+        if "label_ids" not in labels_info:
+            labels_info["label_ids"] = labels_info["label_names"]
+        return cls(**labels_info)


 @dataclass
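
The guard keeps older serialized label metadata loadable: payloads written before `label_ids` existed are backfilled from `label_names`. A hedged round-trip sketch; the exact required field set of `LabelInfo` is assumed here:

import json

from otx.types.label import LabelInfo

# Hypothetical legacy payload lacking "label_ids"; other fields assumed.
legacy = json.dumps({"label_names": ["cat", "dog"], "label_groups": [["cat", "dog"]]})
info = LabelInfo.from_json(legacy)
assert info.label_ids == ["cat", "dog"]  # backfilled from label_names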
