Skip to content

Commit 517f1d1

Browse files
kprokofi, eugene123tw, leoll2
authored
Merge release 2.6 fixes to develop (#4903)
Co-authored-by: Eugene Liu <[email protected]> Co-authored-by: Leonardo Lai <[email protected]>
1 parent 33f43c3 commit 517f1d1

File tree

27 files changed

+444
-153
lines changed

27 files changed

+444
-153
lines changed

.github/workflows/publish.yaml

Lines changed: 9 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -20,18 +20,20 @@ jobs:
2020
uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0
2121
with:
2222
python-version: "3.12"
23+
- name: Install build tools
24+
run: python -m pip install build
2325
- name: Build sdist
24-
run: python -m build --sdist
26+
run: python -m build --sdist library/
2527
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
2628
with:
2729
name: artifact-sdist
28-
path: dist/*.tar.gz
30+
path: library/dist/*.tar.gz
2931
- name: Build wheel
30-
run: python -m build --wheel
32+
run: python -m build --wheel library/
3133
- uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2
3234
with:
3335
name: artifact-wheel
34-
path: dist/*.whl
36+
path: library/dist/*.whl
3537

3638
publish_package:
3739
name: Publish package
@@ -45,7 +47,7 @@ jobs:
4547
- name: Download artifacts
4648
uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0
4749
with:
48-
path: dist
50+
path: library/dist
4951
pattern: artifact-*
5052
merge-multiple: true
5153
# to determine where to publish the package distribution to PyPI or TestPyPI
@@ -60,7 +62,7 @@ jobs:
6062
uses: svenstaro/upload-release-action@81c65b7cd4de9b2570615ce3aad67a41de5b1a13 # v2
6163
with:
6264
repo_token: ${{ secrets.GITHUB_TOKEN }}
63-
file: dist/*
65+
file: library/dist/*
6466
tag: ${{ github.ref }}
6567
overwrite: true
6668
file_glob: true
@@ -73,3 +75,4 @@ jobs:
7375
with:
7476
repository-url: https://test.pypi.org/legacy/
7577
verbose: true
78+
packages-dir: library/dist

CHANGELOG.md

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22

33
All notable changes to this project will be documented in this file.
44

5-
## \[Unreleased\]
5+
## \[2.6.0\]
66

77
### New features
88

@@ -13,6 +13,29 @@ All notable changes to this project will be documented in this file.
1313
- Add DEIM-DFine model for Object Detection
1414
(<https://github.com/open-edge-platform/training_extensions/pull/4446>)
1515

16+
### Bug fixes
17+
18+
- Fix overriding train parameters
19+
(<https://github.com/open-edge-platform/training_extensions/pull/4496>)
20+
- Fix adaptive batch size to run on CPU
21+
(<https://github.com/open-edge-platform/training_extensions/pull/4499>)
22+
- Workaround for batch size search on XPU devices
23+
(<https://github.com/open-edge-platform/training_extensions/pull/4513>)
24+
- Fix UFlow configuration
25+
(<https://github.com/open-edge-platform/training_extensions/pull/4504>)
26+
- Fix cache args
27+
(<https://github.com/open-edge-platform/training_extensions/pull/4522>)
28+
- Fix finding task type in IR
29+
(<https://github.com/open-edge-platform/training_extensions/pull/4576>)
30+
- Fix loading checkpoint after 1st round of training for DFine-X model
31+
(<https://github.com/open-edge-platform/training_extensions/pull/4738>)
32+
- Fix input size configuration during validation for DFine model
33+
(<https://github.com/open-edge-platform/training_extensions/pull/4666>)
34+
- Fix training on CPU
35+
(<https://github.com/open-edge-platform/training_extensions/pull/4788>)
36+
- Fix OOM bug on XPU
37+
(<https://github.com/open-edge-platform/training_extensions/pull/4872>)
38+
1639
## \[2.5.0\]
1740

1841
### Enhancements

library/pyproject.toml

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -39,17 +39,16 @@ dependencies = [
3939
"einops==0.8.1",
4040
"decord==0.6.0",
4141
"typeguard>=4.3,<4.5",
42-
# TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126
43-
"setuptools<70",
42+
"setuptools==78.1.1",
4443
"lightning==2.4.0",
4544
"torchmetrics==1.6.0",
4645
"pytorchcv==0.0.67",
4746
"timm==1.0.3",
4847
"openvino==2025.2",
4948
"openvino-model-api==0.3.0.2",
5049
"onnx==1.17.0",
50+
"onnxconverter-common==1.16.0",
5151
"onnxscript==0.5.3",
52-
"onnxconverter-common==1.14.0",
5352
"nncf==2.17.0",
5453
"anomalib[core]==1.1.3",
5554
"numpy<2.0",

library/src/otx/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33

44
"""OpenVINO Training Extensions."""
55

6-
__version__ = "2.6.0dev"
6+
__version__ = "2.7.0dev"
77

88
import os
99
from pathlib import Path

library/src/otx/backend/native/callbacks/batchsize_finder.py

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ class BatchSizeFinder(Callback):
2727

2828
def __init__(
2929
self,
30-
steps_per_trial: int = 3,
30+
steps_per_trial: int = 5,
3131
) -> None:
3232
self._steps_per_trial = steps_per_trial
3333

@@ -52,11 +52,12 @@ def _try_loop_run(trainer: Trainer) -> None:
5252
loop.run()
5353

5454

55-
def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int) -> None:
55+
def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int, max_epochs: int = 1) -> None:
5656
trainer.logger = DummyLogger() if trainer.logger is not None else None
5757
trainer.callbacks = []
58-
# For XPU devices 1 epoch sometimes is not enough to catch an error
59-
max_epochs = 2 if is_xpu_available() else 1
58+
# For XPU devices 1 epoch sometimes is not enough to catch an error.
59+
# Empirically enlarge this to 15 iterations (steps_per_trial * epochs)
60+
max_epochs = 3 if is_xpu_available() else 1
6061

6162
loop = trainer._active_loop # noqa: SLF001
6263
if loop is None:

library/src/otx/backend/native/engine.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,7 @@
4141
from otx.types.export import OTXExportFormatType
4242
from otx.types.precision import OTXPrecisionType
4343
from otx.types.task import OTXTaskType
44-
from otx.utils.device import is_xpu_available
44+
from otx.utils.device import get_available_device, is_xpu_available
4545
from otx.utils.utils import measure_flops
4646

4747
if TYPE_CHECKING:
@@ -915,6 +915,8 @@ def configure_accelerator(self) -> None:
915915
],
916916
)
917917
self._cache.args["precision"] = None
918+
elif (self._device.accelerator == DeviceType.cpu) or (get_available_device() == "cpu"):
919+
self._cache.args["precision"] = "32"
918920

919921
def configure_loggers(self, logger: Logger | Iterable[Logger] | bool | None = None) -> Logger | Iterable[Logger]:
920922
"""Sets up the loggers for the trainer.

library/src/otx/backend/native/models/detection/d_fine.py

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -92,6 +92,7 @@ def _create_model(self, num_classes: int | None = None) -> DETR:
9292
decoder = DFINETransformer(
9393
model_name=self.model_name,
9494
num_classes=num_classes,
95+
eval_spatial_size=self.data_input_params.input_size,
9596
)
9697
criterion = DFINECriterion(
9798
weight_dict={
@@ -157,3 +158,17 @@ def _optimization_config(self) -> dict[str, Any]:
157158
},
158159
},
159160
}
161+
162+
def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None:
    """Load a checkpoint state dictionary, tolerating stale decoder buffers.

    The checkpoint is first loaded as given. If that attempt raises a
    ``RuntimeError`` (e.g. a size mismatch), the non-trainable
    ``model.decoder.anchors`` and ``model.decoder.valid_mask`` entries are
    removed from ``ckpt`` in place and loading is retried with ``strict=False``.

    Args:
        ckpt: Checkpoint state dictionary to load.
        *args: Positional arguments forwarded to the parent ``load_state_dict``.
        **kwargs: Keyword arguments forwarded to the parent ``load_state_dict``.
            A caller-supplied ``strict`` is overridden with ``False`` on the retry.

    Raises:
        RuntimeError: If the first attempt fails and ``ckpt`` contains neither of
            the two buffers (the original error is re-raised), or if the retry
            itself fails.
    """
    try:
        return super().load_state_dict(ckpt, *args, **kwargs)
    except RuntimeError:
        stale_keys = ("model.decoder.anchors", "model.decoder.valid_mask")
        if not any(key in ckpt for key in stale_keys):
            # Neither buffer is present at this level of the checkpoint, so there is
            # nothing to strip: re-raise the original error instead of masking it
            # with a KeyError from ``pop``.
            raise
        # Remove non-trainable anchors and valid_mask from the checkpoint to avoid size mismatch
        for key in stale_keys:
            ckpt.pop(key, None)
        # Override rather than duplicate ``strict`` so a caller-supplied value
        # does not trigger "got multiple values for keyword argument".
        retry_kwargs = {**kwargs, "strict": False}
        return super().load_state_dict(ckpt, *args, **retry_kwargs)

library/src/otx/backend/native/models/detection/heads/dfine_decoder.py

Lines changed: 6 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ class DFINETransformerModule(nn.Module):
408408
num_denoising (int, optional): Number of denoising samples. Defaults to 100.
409409
label_noise_ratio (float, optional): Ratio of label noise. Defaults to 0.5.
410410
box_noise_scale (float, optional): Scale of box noise. Defaults to 1.0.
411-
eval_spatial_size (list[int], optional): Spatial size for evaluation. Defaults to [640, 640].
411+
eval_spatial_size (tuple[int, int], optional): Spatial size for evaluation. Defaults to (640, 640).
412412
eval_idx (int, optional): Evaluation index. Defaults to -1.
413413
reg_scale (float, optional): The weight curvature. Defaults to 4.0.
414414
reg_max (int, optional): The number of bins for box regression. Defaults to 32.
@@ -431,7 +431,7 @@ def __init__(
431431
num_denoising: int = 100,
432432
label_noise_ratio: float = 0.5,
433433
box_noise_scale: float = 1.0,
434-
eval_spatial_size: list[int] = [640, 640], # noqa: B006
434+
eval_spatial_size: tuple[int, int] = (640, 640),
435435
eval_idx: int = -1,
436436
reg_scale: float = 4.0,
437437
reg_max: int = 32,
@@ -693,7 +693,6 @@ def _get_decoder_input(
693693

694694
if memory.shape[0] > 1:
695695
anchors = anchors.repeat(memory.shape[0], 1, 1)
696-
697696
memory = valid_mask.to(memory.dtype) * memory
698697

699698
output_memory = self.enc_output(memory)
@@ -933,26 +932,22 @@ class DFINETransformer:
933932
"num_decoder_layers": 3,
934933
"eval_idx": -1,
935934
"num_points_list": [6, 6],
936-
"eval_spatial_size": [640, 640],
937935
},
938936
"dfine_hgnetv2_s": {
939937
"feat_channels": [256, 256, 256],
940938
"num_decoder_layers": 3,
941939
"eval_idx": -1,
942-
"eval_spatial_size": [640, 640],
943940
"num_points_list": [3, 6, 3],
944941
},
945942
"dfine_hgnetv2_m": {
946943
"num_decoder_layers": 4,
947944
"eval_idx": -1,
948-
"eval_spatial_size": [640, 640],
949945
},
950946
"dfine_hgnetv2_l": {},
951947
"dfine_hgnetv2_x": {
952948
"feat_channels": [384, 384, 384],
953949
"reg_scale": 8.0,
954950
"eval_idx": -1,
955-
"eval_spatial_size": [640, 640],
956951
},
957952
"deim_dfine_hgnetv2_n": {
958953
"feat_channels": [128, 128],
@@ -963,21 +958,18 @@ class DFINETransformer:
963958
"num_decoder_layers": 3,
964959
"eval_idx": -1,
965960
"num_points_list": [6, 6],
966-
"eval_spatial_size": [640, 640],
967961
"activation": nn.SiLU,
968962
},
969963
"deim_dfine_hgnetv2_s": {
970964
"feat_channels": [256, 256, 256],
971965
"num_decoder_layers": 3,
972966
"eval_idx": -1,
973-
"eval_spatial_size": [640, 640],
974967
"num_points_list": [3, 6, 3],
975968
"activation": nn.SiLU,
976969
},
977970
"deim_dfine_hgnetv2_m": {
978971
"num_decoder_layers": 4,
979972
"eval_idx": -1,
980-
"eval_spatial_size": [640, 640],
981973
"activation": nn.SiLU,
982974
},
983975
"deim_dfine_hgnetv2_l": {
@@ -987,12 +979,13 @@ class DFINETransformer:
987979
"feat_channels": [384, 384, 384],
988980
"reg_scale": 8.0,
989981
"eval_idx": -1,
990-
"eval_spatial_size": [640, 640],
991982
"activation": nn.SiLU,
992983
},
993984
}
994985

995-
def __new__(cls, model_name: str, num_classes: int) -> DFINETransformerModule:
986+
def __new__(
987+
cls, model_name: str, num_classes: int, eval_spatial_size: tuple[int, int] = (640, 640)
988+
) -> DFINETransformerModule:
996989
"""Constructor for DFINETransformerModule."""
997990
cfg = cls.decoder_cfg[model_name]
998-
return DFINETransformerModule(num_classes=num_classes, **cfg)
991+
return DFINETransformerModule(num_classes=num_classes, eval_spatial_size=eval_spatial_size, **cfg)

library/src/otx/backend/native/tools/adaptive_bs/algorithm.py

Lines changed: 13 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -47,8 +47,8 @@ def __init__(
4747
self._max_bs = max_bs
4848
self._bs_try_history: dict[int, int] = {}
4949
self._total_mem = _get_total_memory_size()
50-
self._mem_lower_bound = 0.8 * self._total_mem
51-
self._mem_upper_bound = 0.85 * self._total_mem
50+
self._mem_lower_bound = 0.75 * self._total_mem
51+
self._mem_upper_bound = 0.9 * self._total_mem
5252
self._mp_ctx = mp.get_context("spawn")
5353

5454
def _try_batch_size(self, bs: int) -> tuple[bool, int]:
@@ -115,16 +115,16 @@ def auto_decrease_batch_size(self) -> int:
115115
if oom:
116116
logger.warning(
117117
"The auto batch size algorithm attempted to use a batch size of 2 but still "
118-
"encountered a CUDA OOM error. OTX will proceed with training at batch size 2; "
119-
"however, you will likely encounter a CUDA OOM error once training starts. "
120-
"If the issue persists, please report it accordingly.",
118+
"encountered a CUDA OOM error. OTX will proceed with training at batch size 1; "
119+
"however, it is also possible to encounter a CUDA OOM error during training.",
121120
)
122-
return 2
121+
return 1
123122
logger.warning(
124123
"Even with a batch size of 2, most of the memory is used, "
125-
"which could cause the training to fail midway.",
124+
"which could cause the training to fail midway. "
125+
"For safety reasons, decrease bs to 1.",
126126
)
127-
available_bs = 2
127+
available_bs = 1
128128

129129
return available_bs
130130

@@ -157,9 +157,10 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
157157
raise RuntimeError(msg)
158158
logger.warning(
159159
"Even with a batch size of 2, most of the memory is used, "
160-
"which could cause the training to fail midway.",
160+
"which could cause the training to fail midway. "
161+
"For safety reasons, decrease bs to 1.",
161162
)
162-
return 2
163+
return 1
163164

164165
return self.auto_decrease_batch_size()
165166

@@ -270,6 +271,8 @@ def _run_trial(train_func: Callable[[int], Any], bs: int, trial_queue: mp.Queue)
270271
or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in str(e)
271272
or "UR error" in str(e)
272273
or "UR_RESULT_ERROR_UNKNOWN" in str(e)
274+
or "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY" in str(e)
275+
or "UR_RESULT_ERROR" in str(e)
273276
): # XPU OOM
274277
oom = True
275278
else:

library/src/otx/backend/native/tools/adaptive_bs/runner.py

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -114,10 +114,6 @@ def _register_callback(callbacks: list[Callback] | Callback | None = None) -> li
114114

115115
def _apply_new_batch_size(engine: OTXEngine, new_batch_size: int) -> None:
116116
origin_bs = engine.datamodule.train_subset.batch_size
117-
if is_xpu_available() and new_batch_size != 1:
118-
new_batch_size -= 1 # for safety reasons
119-
if new_batch_size == origin_bs:
120-
return
121117
engine.datamodule.train_subset.batch_size = new_batch_size
122118
engine.datamodule.val_subset.batch_size = new_batch_size
123119
engine.datamodule.test_subset.batch_size = new_batch_size

0 commit comments

Comments
 (0)