open-edge-platform · kprokofi · Sep 8, 2025 · Sep 8, 2025 · Sep 18, 2025 · Sep 19, 2025
@@ -20,18 +20,20 @@ jobs:
         uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
         with:
           python-version: "3.12"
+      - name: Install build tools
+        run: python -m pip install build
       - name: Build sdist
-        run: python -m build --sdist
+        run: python -m build --sdist library/
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
           name: artifact-sdist
-          path: dist/*.tar.gz
+          path: library/dist/*.tar.gz
       - name: Build wheel
-        run: python -m build --wheel
+        run: python -m build --wheel library/
       - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
         with:
           name: artifact-wheel
-          path: dist/*.whl
+          path: library/dist/*.whl
 
   publish_package:
     name: Publish package
@@ -45,7 +47,7 @@ jobs:
       - name: Download artifacts
         uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
         with:
-          path: dist
+          path: library/dist
           pattern: artifact-*
           merge-multiple: true
       # to determine where to publish the package distribution to PyPI or TestPyPI
@@ -60,7 +62,7 @@ jobs:
         uses: svenstaro/upload-release-action@81c65b7cd4de9b2570615ce3aad67a41de5b1a13 # v2
         with:
           repo_token: ${{ secrets.GITHUB_TOKEN }}
-          file: dist/*
+          file: library/dist/*
           tag: ${{ github.ref }}
           overwrite: true
           file_glob: true
@@ -73,3 +75,4 @@ jobs:
         with:
           repository-url: https://test.pypi.org/legacy/
           verbose: true
+          packages-dir: library/dist
@@ -2,7 +2,7 @@
 
 All notable changes to this project will be documented in this file.
 
-## \[Unreleased\]
+## \[2.6.0\]
 
 ### New features
 
@@ -13,6 +13,29 @@ All notable changes to this project will be documented in this file.
 - Add DEIM-DFine model for Object Detection
   (<https://github.com/open-edge-platform/training_extensions/pull/4446>)
 
+### Bug fixes
+
+- Fix overriding train parameters
+  (<https://github.com/open-edge-platform/training_extensions/pull/4496>)
+- Fix adaptive batch size to run on CPU
+  (<https://github.com/open-edge-platform/training_extensions/pull/4499>)
+- Workaround for batch size search on XPU devices
+  (<https://github.com/open-edge-platform/training_extensions/pull/4513>)
+- Fix UFLow configuration
+  (<https://github.com/open-edge-platform/training_extensions/pull/4504>)
+- Fix cache args
+  (<https://github.com/open-edge-platform/training_extensions/pull/4522>)
+- Fix finding task type in IR
+  (<https://github.com/open-edge-platform/training_extensions/pull/4576>)
+- Fix loading checkpoint after 1st round of training for DFine-X model
+  (<https://github.com/open-edge-platform/training_extensions/pull/4738>)
+- Fix input size configuration during validation for DFine model
+  (<https://github.com/open-edge-platform/training_extensions/pull/4666>)
+- Fix training on CPU
+  (<https://github.com/open-edge-platform/training_extensions/pull/4788>)
+- Fix OOM bug on XPU
+  (<https://github.com/open-edge-platform/training_extensions/pull/4872>)
+
 ## \[2.5.0\]
 
 ### Enhancements

@@ -39,16 +39,15 @@ dependencies = [
     "einops==0.8.1",
     "decord==0.6.0",
     "typeguard>=4.3,<4.5",
-    # TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126
-    "setuptools<70",
+    "setuptools==78.1.1",
     "lightning==2.4.0",
     "torchmetrics==1.6.0",
     "pytorchcv==0.0.67",
     "timm==1.0.3",
     "openvino==2025.2",
     "openvino-model-api==0.3.0.2",
     "onnx==1.17.0",
-    "onnxconverter-common==1.14.0",
+    "onnxconverter-common==1.16.0",
     "nncf==2.17.0",
     "anomalib[core]==1.1.3",
 ]

@@ -3,7 +3,7 @@
 
 """OpenVINO Training Extensions."""
 
-__version__ = "2.6.0dev"
+__version__ = "2.7.0dev"
 
 import os
 from pathlib import Path

@@ -27,7 +27,7 @@ class BatchSizeFinder(Callback):
 
     def __init__(
         self,
-        steps_per_trial: int = 3,
+        steps_per_trial: int = 5,
     ) -> None:
         self._steps_per_trial = steps_per_trial
 
@@ -52,11 +52,12 @@ def _try_loop_run(trainer: Trainer) -> None:
     loop.run()
 
 
-def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int) -> None:
+def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int, max_epochs: int = 1) -> None:
     trainer.logger = DummyLogger() if trainer.logger is not None else None
     trainer.callbacks = []
-    # For XPU devices 1 epoch sometimes is not enough to catch an error
-    max_epochs = 2 if is_xpu_available() else 1
+    # For XPU devices 1 epoch sometimes is not enough to catch an error.
+    # Empirically enlarge this to 15 iterations (steps_per_trial * epochs)
+    max_epochs = 3 if is_xpu_available() else 1
 
     loop = trainer._active_loop  # noqa: SLF001
     if loop is None:

@@ -41,7 +41,7 @@
 from otx.types.export import OTXExportFormatType
 from otx.types.precision import OTXPrecisionType
 from otx.types.task import OTXTaskType
-from otx.utils.device import is_xpu_available
+from otx.utils.device import get_available_device, is_xpu_available
 from otx.utils.utils import measure_flops
 
 if TYPE_CHECKING:
@@ -909,6 +909,8 @@ def configure_accelerator(self) -> None:
                     ],
                 )
                 self._cache.args["precision"] = None
+        elif (self._device.accelerator == DeviceType.cpu) or (get_available_device() == "cpu"):
+            self._cache.args["precision"] = "32"
 
     def configure_loggers(self, logger: Logger | Iterable[Logger] | bool | None = None) -> Logger | Iterable[Logger]:
         """Sets up the loggers for the trainer.

@@ -92,6 +92,7 @@ def _create_model(self, num_classes: int | None = None) -> DETR:
         decoder = DFINETransformer(
             model_name=self.model_name,
             num_classes=num_classes,
+            eval_spatial_size=self.data_input_params.input_size,
         )
         criterion = DFINECriterion(
             weight_dict={
@@ -157,3 +158,17 @@ def _optimization_config(self) -> dict[str, Any]:
                 },
             },
         }
+
+    def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None:
+        """Load state dictionary from checkpoint state dictionary.
+
+        If a RuntimeError occurs due to size mismatch, non-trainable anchors and valid_mask
+        are removed from the checkpoint before loading.
+        """
+        try:
+            return super().load_state_dict(ckpt, *args, **kwargs)
+        except RuntimeError:
+            # Remove non-trainable anchors and valid_mask from the checkpoint to avoid size mismatch
+            ckpt.pop("model.decoder.anchors")
+            ckpt.pop("model.decoder.valid_mask")
+            return super().load_state_dict(ckpt, *args, strict=False, **kwargs)
@@ -408,7 +408,7 @@ class DFINETransformerModule(nn.Module):
         num_denoising (int, optional): Number of denoising samples. Defaults to 100.
         label_noise_ratio (float, optional): Ratio of label noise. Defaults to 0.5.
         box_noise_scale (float, optional): Scale of box noise. Defaults to 1.0.
-        eval_spatial_size (list[int], optional): Spatial size for evaluation. Defaults to [640, 640].
+        eval_spatial_size (tuple[int, int], optional): Spatial size for evaluation. Defaults to (640, 640).
         eval_idx (int, optional): Evaluation index. Defaults to -1.
         reg_scale (float, optional): The weight curvature. Defaults to 4.0.
         reg_max (int, optional): The number of bins for box regression. Defaults to 32.
@@ -431,7 +431,7 @@ def __init__(
         num_denoising: int = 100,
         label_noise_ratio: float = 0.5,
         box_noise_scale: float = 1.0,
-        eval_spatial_size: list[int] = [640, 640],  # noqa: B006
+        eval_spatial_size: tuple[int, int] = (640, 640),
         eval_idx: int = -1,
         reg_scale: float = 4.0,
         reg_max: int = 32,
@@ -693,7 +693,6 @@ def _get_decoder_input(
 
         if memory.shape[0] > 1:
             anchors = anchors.repeat(memory.shape[0], 1, 1)
-
         memory = valid_mask.to(memory.dtype) * memory
 
         output_memory = self.enc_output(memory)
@@ -933,26 +932,22 @@ class DFINETransformer:
             "num_decoder_layers": 3,
             "eval_idx": -1,
             "num_points_list": [6, 6],
-            "eval_spatial_size": [640, 640],
         },
         "dfine_hgnetv2_s": {
             "feat_channels": [256, 256, 256],
             "num_decoder_layers": 3,
             "eval_idx": -1,
-            "eval_spatial_size": [640, 640],
             "num_points_list": [3, 6, 3],
         },
         "dfine_hgnetv2_m": {
             "num_decoder_layers": 4,
             "eval_idx": -1,
-            "eval_spatial_size": [640, 640],
         },
         "dfine_hgnetv2_l": {},
         "dfine_hgnetv2_x": {
             "feat_channels": [384, 384, 384],
             "reg_scale": 8.0,
             "eval_idx": -1,
-            "eval_spatial_size": [640, 640],
         },
         "deim_dfine_hgnetv2_n": {
             "feat_channels": [128, 128],
@@ -963,21 +958,18 @@ class DFINETransformer:
             "num_decoder_layers": 3,
             "eval_idx": -1,
             "num_points_list": [6, 6],
-            "eval_spatial_size": [640, 640],
             "activation": nn.SiLU,
         },
         "deim_dfine_hgnetv2_s": {
             "feat_channels": [256, 256, 256],
             "num_decoder_layers": 3,
             "eval_idx": -1,
-            "eval_spatial_size": [640, 640],
             "num_points_list": [3, 6, 3],
             "activation": nn.SiLU,
         },
         "deim_dfine_hgnetv2_m": {
             "num_decoder_layers": 4,
             "eval_idx": -1,
-            "eval_spatial_size": [640, 640],
             "activation": nn.SiLU,
         },
         "deim_dfine_hgnetv2_l": {
@@ -987,12 +979,13 @@ class DFINETransformer:
             "feat_channels": [384, 384, 384],
             "reg_scale": 8.0,
             "eval_idx": -1,
-            "eval_spatial_size": [640, 640],
             "activation": nn.SiLU,
         },
     }
 
-    def __new__(cls, model_name: str, num_classes: int) -> DFINETransformerModule:
+    def __new__(
+        cls, model_name: str, num_classes: int, eval_spatial_size: tuple[int, int] = (640, 640)
+    ) -> DFINETransformerModule:
         """Constructor for DFINETransformerModule."""
         cfg = cls.decoder_cfg[model_name]
-        return DFINETransformerModule(num_classes=num_classes, **cfg)
+        return DFINETransformerModule(num_classes=num_classes, eval_spatial_size=eval_spatial_size, **cfg)
@@ -47,8 +47,8 @@ def __init__(
         self._max_bs = max_bs
         self._bs_try_history: dict[int, int] = {}
         self._total_mem = _get_total_memory_size()
-        self._mem_lower_bound = 0.8 * self._total_mem
-        self._mem_upper_bound = 0.85 * self._total_mem
+        self._mem_lower_bound = 0.75 * self._total_mem
+        self._mem_upper_bound = 0.9 * self._total_mem
         self._mp_ctx = mp.get_context("spawn")
 
     def _try_batch_size(self, bs: int) -> tuple[bool, int]:
@@ -115,16 +115,16 @@ def auto_decrease_batch_size(self) -> int:
             if oom:
                 logger.warning(
                     "The auto batch size algorithm attempted to use a batch size of 2 but still "
-                    "encountered a CUDA OOM error. OTX will proceed with training at batch size 2; "
-                    "however, you will likely encounter a CUDA OOM error once training starts. "
-                    "If the issue persists, please report it accordingly.",
+                    "encountered a CUDA OOM error. OTX will proceed with training at batch size 1; "
+                    "however, it is also possible to encounter a CUDA OOM error during training.",
                 )
-                return 2
+                return 1
             logger.warning(
                 "Even with a batch size of 2, most of the memory is used, "
-                "which could cause the training to fail midway.",
+                "which could cause the training to fail midway."
+                "For safety reasons, decrease bs to 1.",
             )
-            available_bs = 2
+            available_bs = 1
 
         return available_bs
 
@@ -157,9 +157,10 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
                     raise RuntimeError(msg)
                 logger.warning(
                     "Even with a batch size of 2, most of the memory is used, "
-                    "which could cause the training to fail midway.",
+                    "which could cause the training to fail midway."
+                    "For safety reasons, decrease bs to 1.",
                 )
-                return 2
+                return 1
 
             return self.auto_decrease_batch_size()
 
@@ -270,6 +271,8 @@ def _run_trial(train_func: Callable[[int], Any], bs: int, trial_queue: mp.Queue)
             or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in str(e)
             or "UR error" in str(e)
             or "UR_RESULT_ERROR_UNKNOWN" in str(e)
+            or "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY" in str(e)
+            or "UR_RESULT_ERROR" in str(e)
         ):  # XPU OOM
             oom = True
         else:

@@ -114,10 +114,6 @@ def _register_callback(callbacks: list[Callback] | Callback | None = None) -> li
 
 def _apply_new_batch_size(engine: OTXEngine, new_batch_size: int) -> None:
     origin_bs = engine.datamodule.train_subset.batch_size
-    if is_xpu_available() and new_batch_size != 1:
-        new_batch_size -= 1  # for safety reasons
-    if new_batch_size == origin_bs:
-        return
     engine.datamodule.train_subset.batch_size = new_batch_size
     engine.datamodule.val_subset.batch_size = new_batch_size
     engine.datamodule.test_subset.batch_size = new_batch_size

diff --git a/library/src/otx/data/module.py b/library/src/otx/data/module.py
@@ -98,9 +98,7 @@ def __init__(
         self.save_hyperparameters(ignore=["input_size"])
 
         dataset = DmDataset.import_from(self.data_root, format=self.data_format)
-        if self.task != OTXTaskType.H_LABEL_CLS and not (
-            self.task == OTXTaskType.KEYPOINT_DETECTION and self.data_format == "arrow"
-        ):
+        if self.task != OTXTaskType.H_LABEL_CLS:
             dataset = pre_filtering(
                 dataset,
                 self.data_format,