Skip to content
Open
Show file tree
Hide file tree
Changes from 16 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 9 additions & 6 deletions .github/workflows/publish.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,18 +20,20 @@ jobs:
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
with:
python-version: "3.12"
- name: Install build tools
run: python -m pip install build
- name: Build sdist
run: python -m build --sdist
run: python -m build --sdist library/
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: artifact-sdist
path: dist/*.tar.gz
path: library/dist/*.tar.gz
- name: Build wheel
run: python -m build --wheel
run: python -m build --wheel library/
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
with:
name: artifact-wheel
path: dist/*.whl
path: library/dist/*.whl

publish_package:
name: Publish package
Expand All @@ -45,7 +47,7 @@ jobs:
- name: Download artifacts
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
with:
path: dist
path: library/dist
pattern: artifact-*
merge-multiple: true
# to determine where to publish the package distribution to PyPI or TestPyPI
Expand All @@ -60,7 +62,7 @@ jobs:
uses: svenstaro/upload-release-action@81c65b7cd4de9b2570615ce3aad67a41de5b1a13 # v2
with:
repo_token: ${{ secrets.GITHUB_TOKEN }}
file: dist/*
file: library/dist/*
tag: ${{ github.ref }}
overwrite: true
file_glob: true
Expand All @@ -73,3 +75,4 @@ jobs:
with:
repository-url: https://test.pypi.org/legacy/
verbose: true
packages-dir: library/dist
25 changes: 24 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@

All notable changes to this project will be documented in this file.

## \[Unreleased\]
## \[2.6.0\]

### New features

Expand All @@ -13,6 +13,29 @@ All notable changes to this project will be documented in this file.
- Add DEIM-DFine model for Object Detection
(<https://github.com/open-edge-platform/training_extensions/pull/4446>)

### Bug fixes

- Fix overriding train parameters
(<https://github.com/open-edge-platform/training_extensions/pull/4496>)
- Fix adaptive batch size to run on CPU
(<https://github.com/open-edge-platform/training_extensions/pull/4499>)
- Workaround for batch size search on XPU devices
(<https://github.com/open-edge-platform/training_extensions/pull/4513>)
- Fix UFLow configuration
(<https://github.com/open-edge-platform/training_extensions/pull/4504>)
- Fix cache args
(<https://github.com/open-edge-platform/training_extensions/pull/4522>)
- Fix finding task type in IR
(<https://github.com/open-edge-platform/training_extensions/pull/4576>)
- Fix loading checkpoint after 1st round of training for DFine-X model
(<https://github.com/open-edge-platform/training_extensions/pull/4738>)
- Fix input size configuration during validation for DFine model
(<https://github.com/open-edge-platform/training_extensions/pull/4666>)
- Fix training on CPU
(<https://github.com/open-edge-platform/training_extensions/pull/4788>)
- Fix OOM bug on XPU
(<https://github.com/open-edge-platform/training_extensions/pull/4872>)

## \[2.5.0\]

### Enhancements
Expand Down
5 changes: 2 additions & 3 deletions library/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -39,16 +39,15 @@ dependencies = [
"einops==0.8.1",
"decord==0.6.0",
"typeguard>=4.3,<4.5",
# TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126
"setuptools<70",
"setuptools==78.1.1",
"lightning==2.4.0",
"torchmetrics==1.6.0",
"pytorchcv==0.0.67",
"timm==1.0.3",
"openvino==2025.2",
"openvino-model-api==0.3.0.2",
"onnx==1.17.0",
"onnxconverter-common==1.14.0",
"onnxconverter-common==1.16.0",
"nncf==2.17.0",
"anomalib[core]==1.1.3",
]
Expand Down
2 changes: 1 addition & 1 deletion library/src/otx/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

"""OpenVINO Training Extensions."""

__version__ = "2.6.0dev"
__version__ = "2.7.0dev"

import os
from pathlib import Path
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ class BatchSizeFinder(Callback):

def __init__(
self,
steps_per_trial: int = 3,
steps_per_trial: int = 5,
) -> None:
self._steps_per_trial = steps_per_trial

Expand All @@ -52,11 +52,12 @@ def _try_loop_run(trainer: Trainer) -> None:
loop.run()


def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int) -> None:
def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int, max_epochs: int = 1) -> None:
trainer.logger = DummyLogger() if trainer.logger is not None else None
trainer.callbacks = []
# For XPU devices 1 epoch sometimes is not enough to catch an error
max_epochs = 2 if is_xpu_available() else 1
# For XPU devices 1 epoch sometimes is not enough to catch an error.
# Empirically enlarge this to 15 iterations (steps_per_trial * epochs)
max_epochs = 3 if is_xpu_available() else 1

loop = trainer._active_loop # noqa: SLF001
if loop is None:
Expand Down
4 changes: 3 additions & 1 deletion library/src/otx/backend/native/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,7 @@
from otx.types.export import OTXExportFormatType
from otx.types.precision import OTXPrecisionType
from otx.types.task import OTXTaskType
from otx.utils.device import is_xpu_available
from otx.utils.device import get_available_device, is_xpu_available
from otx.utils.utils import measure_flops

if TYPE_CHECKING:
Expand Down Expand Up @@ -909,6 +909,8 @@ def configure_accelerator(self) -> None:
],
)
self._cache.args["precision"] = None
elif (self._device.accelerator == DeviceType.cpu) or (get_available_device() == "cpu"):
self._cache.args["precision"] = "32"

def configure_loggers(self, logger: Logger | Iterable[Logger] | bool | None = None) -> Logger | Iterable[Logger]:
"""Sets up the loggers for the trainer.
Expand Down
15 changes: 15 additions & 0 deletions library/src/otx/backend/native/models/detection/d_fine.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,6 +92,7 @@ def _create_model(self, num_classes: int | None = None) -> DETR:
decoder = DFINETransformer(
model_name=self.model_name,
num_classes=num_classes,
eval_spatial_size=self.data_input_params.input_size,
)
criterion = DFINECriterion(
weight_dict={
Expand Down Expand Up @@ -157,3 +158,17 @@ def _optimization_config(self) -> dict[str, Any]:
},
},
}

def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None:
    """Load state dictionary from checkpoint state dictionary.

    The checkpoint is first loaded as-is. If that raises a ``RuntimeError``
    (e.g. a size mismatch), the non-trainable ``model.decoder.anchors`` and
    ``model.decoder.valid_mask`` entries are removed from ``ckpt`` and the load
    is retried with ``strict=False``.

    Args:
        ckpt: Checkpoint state dictionary. The two decoder entries above are
            removed from it in place when the fallback path is taken.
        *args: Extra positional arguments forwarded to the parent implementation.
        **kwargs: Extra keyword arguments forwarded to the parent implementation.
            A caller-supplied ``strict`` is overridden with ``False`` on the retry.

    Returns:
        Whatever the parent ``load_state_dict`` returns.

    Raises:
        RuntimeError: Re-raised unchanged when the first attempt fails and the
            checkpoint contains neither of the two decoder entries, i.e. there
            is nothing this method can strip to recover.
    """
    try:
        return super().load_state_dict(ckpt, *args, **kwargs)
    except RuntimeError:
        removable_keys = ("model.decoder.anchors", "model.decoder.valid_mask")
        if not any(key in ckpt for key in removable_keys):
            # Nothing to strip, so the failure has another cause: surface the original
            # error instead of masking it with a KeyError or a silent non-strict load.
            raise
        # Remove non-trainable anchors and valid_mask from the checkpoint to avoid size mismatch
        for key in removable_keys:
            ckpt.pop(key, None)
        # Override rather than duplicate a caller-supplied ``strict`` keyword, which would
        # otherwise fail with "got multiple values for keyword argument 'strict'".
        retry_kwargs = {**kwargs, "strict": False}
        return super().load_state_dict(ckpt, *args, **retry_kwargs)
Original file line number Diff line number Diff line change
Expand Up @@ -408,7 +408,7 @@ class DFINETransformerModule(nn.Module):
num_denoising (int, optional): Number of denoising samples. Defaults to 100.
label_noise_ratio (float, optional): Ratio of label noise. Defaults to 0.5.
box_noise_scale (float, optional): Scale of box noise. Defaults to 1.0.
eval_spatial_size (list[int], optional): Spatial size for evaluation. Defaults to [640, 640].
eval_spatial_size (tuple[int, int], optional): Spatial size for evaluation. Defaults to (640, 640).
eval_idx (int, optional): Evaluation index. Defaults to -1.
reg_scale (float, optional): The weight curvature. Defaults to 4.0.
reg_max (int, optional): The number of bins for box regression. Defaults to 32.
Expand All @@ -431,7 +431,7 @@ def __init__(
num_denoising: int = 100,
label_noise_ratio: float = 0.5,
box_noise_scale: float = 1.0,
eval_spatial_size: list[int] = [640, 640], # noqa: B006
eval_spatial_size: tuple[int, int] = (640, 640),
eval_idx: int = -1,
reg_scale: float = 4.0,
reg_max: int = 32,
Expand Down Expand Up @@ -693,7 +693,6 @@ def _get_decoder_input(

if memory.shape[0] > 1:
anchors = anchors.repeat(memory.shape[0], 1, 1)

memory = valid_mask.to(memory.dtype) * memory

output_memory = self.enc_output(memory)
Expand Down Expand Up @@ -933,26 +932,22 @@ class DFINETransformer:
"num_decoder_layers": 3,
"eval_idx": -1,
"num_points_list": [6, 6],
"eval_spatial_size": [640, 640],
},
"dfine_hgnetv2_s": {
"feat_channels": [256, 256, 256],
"num_decoder_layers": 3,
"eval_idx": -1,
"eval_spatial_size": [640, 640],
"num_points_list": [3, 6, 3],
},
"dfine_hgnetv2_m": {
"num_decoder_layers": 4,
"eval_idx": -1,
"eval_spatial_size": [640, 640],
},
"dfine_hgnetv2_l": {},
"dfine_hgnetv2_x": {
"feat_channels": [384, 384, 384],
"reg_scale": 8.0,
"eval_idx": -1,
"eval_spatial_size": [640, 640],
},
"deim_dfine_hgnetv2_n": {
"feat_channels": [128, 128],
Expand All @@ -963,21 +958,18 @@ class DFINETransformer:
"num_decoder_layers": 3,
"eval_idx": -1,
"num_points_list": [6, 6],
"eval_spatial_size": [640, 640],
"activation": nn.SiLU,
},
"deim_dfine_hgnetv2_s": {
"feat_channels": [256, 256, 256],
"num_decoder_layers": 3,
"eval_idx": -1,
"eval_spatial_size": [640, 640],
"num_points_list": [3, 6, 3],
"activation": nn.SiLU,
},
"deim_dfine_hgnetv2_m": {
"num_decoder_layers": 4,
"eval_idx": -1,
"eval_spatial_size": [640, 640],
"activation": nn.SiLU,
},
"deim_dfine_hgnetv2_l": {
Expand All @@ -987,12 +979,13 @@ class DFINETransformer:
"feat_channels": [384, 384, 384],
"reg_scale": 8.0,
"eval_idx": -1,
"eval_spatial_size": [640, 640],
"activation": nn.SiLU,
},
}

def __new__(cls, model_name: str, num_classes: int) -> DFINETransformerModule:
def __new__(
cls, model_name: str, num_classes: int, eval_spatial_size: tuple[int, int] = (640, 640)
) -> DFINETransformerModule:
"""Constructor for DFINETransformerModule."""
cfg = cls.decoder_cfg[model_name]
return DFINETransformerModule(num_classes=num_classes, **cfg)
return DFINETransformerModule(num_classes=num_classes, eval_spatial_size=eval_spatial_size, **cfg)
23 changes: 13 additions & 10 deletions library/src/otx/backend/native/tools/adaptive_bs/algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,8 +47,8 @@ def __init__(
self._max_bs = max_bs
self._bs_try_history: dict[int, int] = {}
self._total_mem = _get_total_memory_size()
self._mem_lower_bound = 0.8 * self._total_mem
self._mem_upper_bound = 0.85 * self._total_mem
self._mem_lower_bound = 0.75 * self._total_mem
self._mem_upper_bound = 0.9 * self._total_mem
self._mp_ctx = mp.get_context("spawn")

def _try_batch_size(self, bs: int) -> tuple[bool, int]:
Expand Down Expand Up @@ -115,16 +115,16 @@ def auto_decrease_batch_size(self) -> int:
if oom:
logger.warning(
"The auto batch size algorithm attempted to use a batch size of 2 but still "
"encountered a CUDA OOM error. OTX will proceed with training at batch size 2; "
"however, you will likely encounter a CUDA OOM error once training starts. "
"If the issue persists, please report it accordingly.",
"encountered a CUDA OOM error. OTX will proceed with training at batch size 1; "
"however, it is also possible to encounter a CUDA OOM error during training.",
)
return 2
return 1
logger.warning(
"Even with a batch size of 2, most of the memory is used, "
"which could cause the training to fail midway.",
"which could cause the training to fail midway."
"For safety reasons, decrease bs to 1.",
)
available_bs = 2
available_bs = 1

return available_bs

Expand Down Expand Up @@ -157,9 +157,10 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
raise RuntimeError(msg)
logger.warning(
"Even with a batch size of 2, most of the memory is used, "
"which could cause the training to fail midway.",
"which could cause the training to fail midway."
"For safety reasons, decrease bs to 1.",
)
return 2
return 1

return self.auto_decrease_batch_size()

Expand Down Expand Up @@ -270,6 +271,8 @@ def _run_trial(train_func: Callable[[int], Any], bs: int, trial_queue: mp.Queue)
or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in str(e)
or "UR error" in str(e)
or "UR_RESULT_ERROR_UNKNOWN" in str(e)
or "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY" in str(e)
or "UR_RESULT_ERROR" in str(e)
): # XPU OOM
oom = True
else:
Expand Down
4 changes: 0 additions & 4 deletions library/src/otx/backend/native/tools/adaptive_bs/runner.py
Original file line number Diff line number Diff line change
Expand Up @@ -114,10 +114,6 @@ def _register_callback(callbacks: list[Callback] | Callback | None = None) -> li

def _apply_new_batch_size(engine: OTXEngine, new_batch_size: int) -> None:
origin_bs = engine.datamodule.train_subset.batch_size
if is_xpu_available() and new_batch_size != 1:
new_batch_size -= 1 # for safety reasons
if new_batch_size == origin_bs:
return
engine.datamodule.train_subset.batch_size = new_batch_size
engine.datamodule.val_subset.batch_size = new_batch_size
engine.datamodule.test_subset.batch_size = new_batch_size
Expand Down
4 changes: 1 addition & 3 deletions library/src/otx/data/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,9 +98,7 @@ def __init__(
self.save_hyperparameters(ignore=["input_size"])

dataset = DmDataset.import_from(self.data_root, format=self.data_format)
if self.task != OTXTaskType.H_LABEL_CLS and not (
self.task == OTXTaskType.KEYPOINT_DETECTION and self.data_format == "arrow"
):
if self.task != OTXTaskType.H_LABEL_CLS:
dataset = pre_filtering(
dataset,
self.data_format,
Expand Down
Loading
Loading