diff --git a/.github/workflows/publish.yaml b/.github/workflows/publish.yaml index 9f22e12cd8..d15b34e2fd 100644 --- a/.github/workflows/publish.yaml +++ b/.github/workflows/publish.yaml @@ -20,18 +20,20 @@ jobs: uses: actions/setup-python@e797f83bcb11b83ae66e0230d6156d7c80228e7c # v6.0.0 with: python-version: "3.12" + - name: Install build tools + run: python -m pip install build - name: Build sdist - run: python -m build --sdist + run: python -m build --sdist library/ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: artifact-sdist - path: dist/*.tar.gz + path: library/dist/*.tar.gz - name: Build wheel - run: python -m build --wheel + run: python -m build --wheel library/ - uses: actions/upload-artifact@ea165f8d65b6e75b540449e92b4886f43607fa02 # v4.6.2 with: name: artifact-wheel - path: dist/*.whl + path: library/dist/*.whl publish_package: name: Publish package @@ -45,7 +47,7 @@ jobs: - name: Download artifacts uses: actions/download-artifact@634f93cb2916e3fdff6788551b99b062d0335ce0 # v5.0.0 with: - path: dist + path: library/dist pattern: artifact-* merge-multiple: true # to determine where to publish the package distribution to PyPI or TestPyPI @@ -60,7 +62,7 @@ jobs: uses: svenstaro/upload-release-action@81c65b7cd4de9b2570615ce3aad67a41de5b1a13 # v2 with: repo_token: ${{ secrets.GITHUB_TOKEN }} - file: dist/* + file: library/dist/* tag: ${{ github.ref }} overwrite: true file_glob: true @@ -73,3 +75,4 @@ jobs: with: repository-url: https://test.pypi.org/legacy/ verbose: true + packages-dir: library/dist diff --git a/CHANGELOG.md b/CHANGELOG.md index 48e9ce748a..cd02051d45 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,7 +2,7 @@ All notable changes to this project will be documented in this file. -## \[Unreleased\] +## \[2.6.0\] ### New features @@ -13,6 +13,29 @@ All notable changes to this project will be documented in this file. 
- Add DEIM-DFine model for Object Detection
  ()
+
+### Bug fixes
+
+- Fix overriding train parameters
+  ()
+- Fix adaptive batch size to run on CPU
+  ()
+- Workaround for batch size search on XPU devices
+  ()
+- Fix UFlow configuration
+  ()
+- Fix cache args
+  ()
+- Fix finding task type in IR
+  ()
+- Fix loading checkpoint after the first round of training for the DFine-X model
+  ()
+- Fix input size configuration during validation for DFine model
+  ()
+- Fix training on CPU
+  (https://github.com/open-edge-platform/training_extensions/pull/4788)
+- Fix OOM bug on XPU
+  ()
+
 ## \[2.5.0\]

 ### Enhancements
diff --git a/library/pyproject.toml b/library/pyproject.toml
index c05a1d9b95..5d61f155bd 100644
--- a/library/pyproject.toml
+++ b/library/pyproject.toml
@@ -39,8 +39,7 @@ dependencies = [
     "einops==0.8.1",
     "decord==0.6.0",
     "typeguard>=4.3,<4.5",
-    # TODO(ashwinvaidya17): https://github.com/openvinotoolkit/anomalib/issues/2126
-    "setuptools<70",
+    "setuptools==78.1.1",
     "lightning==2.4.0",
     "torchmetrics==1.6.0",
     "pytorchcv==0.0.67",
@@ -48,8 +47,8 @@ dependencies = [
     "openvino==2025.2",
     "openvino-model-api==0.3.0.2",
     "onnx==1.17.0",
+    "onnxconverter-common==1.16.0",
     "onnxscript==0.5.3",
-    "onnxconverter-common==1.14.0",
     "nncf==2.17.0",
     "anomalib[core]==1.1.3",
     "numpy<2.0",
diff --git a/library/src/otx/__init__.py b/library/src/otx/__init__.py
index ab6963c0c1..117520f3c9 100644
--- a/library/src/otx/__init__.py
+++ b/library/src/otx/__init__.py
@@ -3,7 +3,7 @@

 """OpenVINO Training Extensions."""

-__version__ = "2.6.0dev"
+__version__ = "2.7.0dev"

 import os
 from pathlib import Path
diff --git a/library/src/otx/backend/native/callbacks/batchsize_finder.py b/library/src/otx/backend/native/callbacks/batchsize_finder.py
index a96e0898af..dc9ec21f7c 100644
--- a/library/src/otx/backend/native/callbacks/batchsize_finder.py
+++ b/library/src/otx/backend/native/callbacks/batchsize_finder.py
@@ -27,7 +27,7 @@ class BatchSizeFinder(Callback):

     def __init__(
         self,
-        steps_per_trial: int = 3,
+        steps_per_trial: int = 5,
     ) -> None:
         self._steps_per_trial = steps_per_trial

@@ -52,11 +52,12 @@ def _try_loop_run(trainer: Trainer) -> None:
     loop.run()


-def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int) -> None:
+def _scale_batch_reset_params(trainer: Trainer, steps_per_trial: int, max_epochs: int = 1) -> None:
     trainer.logger = DummyLogger() if trainer.logger is not None else None
     trainer.callbacks = []
-    # For XPU devices 1 epoch sometimes is not enough to catch an error
+    # For XPU devices, one epoch is sometimes not enough to catch an error.
+    # Empirically, enlarge this to 15 iterations (steps_per_trial * max_epochs).
+    max_epochs = 3 if is_xpu_available() else max_epochs

     loop = trainer._active_loop  # noqa: SLF001
     if loop is None:
diff --git a/library/src/otx/backend/native/engine.py b/library/src/otx/backend/native/engine.py
index c053c3c35b..1f566d70ed 100644
--- a/library/src/otx/backend/native/engine.py
+++ b/library/src/otx/backend/native/engine.py
@@ -41,7 +41,7 @@
 from otx.types.export import OTXExportFormatType
 from otx.types.precision import OTXPrecisionType
 from otx.types.task import OTXTaskType
-from otx.utils.device import is_xpu_available
+from otx.utils.device import get_available_device, is_xpu_available
 from otx.utils.utils import measure_flops

 if TYPE_CHECKING:
@@ -915,6 +915,8 @@ def configure_accelerator(self) -> None:
                 ],
             )
             self._cache.args["precision"] = None
+        elif (self._device.accelerator == DeviceType.cpu) or (get_available_device() == "cpu"):
+            self._cache.args["precision"] = "32"

     def configure_loggers(self, logger: Logger | Iterable[Logger] | bool | None = None) -> Logger | Iterable[Logger]:
         """Sets up the loggers for the trainer.
diff --git a/library/src/otx/backend/native/models/detection/d_fine.py b/library/src/otx/backend/native/models/detection/d_fine.py
index 2727ce8044..2c01388f79 100644
--- a/library/src/otx/backend/native/models/detection/d_fine.py
+++ b/library/src/otx/backend/native/models/detection/d_fine.py
@@ -92,6 +92,7 @@ def _create_model(self, num_classes: int | None = None) -> DETR:
         decoder = DFINETransformer(
             model_name=self.model_name,
             num_classes=num_classes,
+            eval_spatial_size=self.data_input_params.input_size,
         )
         criterion = DFINECriterion(
             weight_dict={
@@ -157,3 +158,17 @@ def _optimization_config(self) -> dict[str, Any]:
             },
         },
     }
+
+    def load_state_dict(self, ckpt: dict[str, Any], *args, **kwargs) -> None:
+        """Load the model state from a checkpoint state dictionary.
+
+        If a RuntimeError occurs due to a size mismatch, the non-trainable anchors and valid_mask
+        are removed from the checkpoint before retrying a non-strict load.
+        """
+        try:
+            return super().load_state_dict(ckpt, *args, **kwargs)
+        except RuntimeError:
+            # Remove non-trainable anchors and valid_mask from the checkpoint to avoid a size mismatch
+            ckpt.pop("model.decoder.anchors", None)
+            ckpt.pop("model.decoder.valid_mask", None)
+            return super().load_state_dict(ckpt, *args, strict=False, **kwargs)
diff --git a/library/src/otx/backend/native/models/detection/heads/dfine_decoder.py b/library/src/otx/backend/native/models/detection/heads/dfine_decoder.py
index 253190dedf..8e6e63cd6a 100644
--- a/library/src/otx/backend/native/models/detection/heads/dfine_decoder.py
+++ b/library/src/otx/backend/native/models/detection/heads/dfine_decoder.py
@@ -408,7 +408,7 @@ class DFINETransformerModule(nn.Module):
         num_denoising (int, optional): Number of denoising samples. Defaults to 100.
         label_noise_ratio (float, optional): Ratio of label noise. Defaults to 0.5.
         box_noise_scale (float, optional): Scale of box noise. Defaults to 1.0.
-        eval_spatial_size (list[int], optional): Spatial size for evaluation. Defaults to [640, 640].
+        eval_spatial_size (tuple[int, int], optional): Spatial size for evaluation. Defaults to (640, 640).
         eval_idx (int, optional): Evaluation index. Defaults to -1.
         reg_scale (float, optional): The weight curvature. Defaults to 4.0.
         reg_max (int, optional): The number of bins for box regression. Defaults to 32.
@@ -431,7 +431,7 @@ def __init__( num_denoising: int = 100, label_noise_ratio: float = 0.5, box_noise_scale: float = 1.0, - eval_spatial_size: list[int] = [640, 640], # noqa: B006 + eval_spatial_size: tuple[int, int] = (640, 640), eval_idx: int = -1, reg_scale: float = 4.0, reg_max: int = 32, @@ -693,7 +693,6 @@ def _get_decoder_input( if memory.shape[0] > 1: anchors = anchors.repeat(memory.shape[0], 1, 1) - memory = valid_mask.to(memory.dtype) * memory output_memory = self.enc_output(memory) @@ -933,26 +932,22 @@ class DFINETransformer: "num_decoder_layers": 3, "eval_idx": -1, "num_points_list": [6, 6], - "eval_spatial_size": [640, 640], }, "dfine_hgnetv2_s": { "feat_channels": [256, 256, 256], "num_decoder_layers": 3, "eval_idx": -1, - "eval_spatial_size": [640, 640], "num_points_list": [3, 6, 3], }, "dfine_hgnetv2_m": { "num_decoder_layers": 4, "eval_idx": -1, - "eval_spatial_size": [640, 640], }, "dfine_hgnetv2_l": {}, "dfine_hgnetv2_x": { "feat_channels": [384, 384, 384], "reg_scale": 8.0, "eval_idx": -1, - "eval_spatial_size": [640, 640], }, "deim_dfine_hgnetv2_n": { "feat_channels": [128, 128], @@ -963,21 +958,18 @@ class DFINETransformer: "num_decoder_layers": 3, "eval_idx": -1, "num_points_list": [6, 6], - "eval_spatial_size": [640, 640], "activation": nn.SiLU, }, "deim_dfine_hgnetv2_s": { "feat_channels": [256, 256, 256], "num_decoder_layers": 3, "eval_idx": -1, - "eval_spatial_size": [640, 640], "num_points_list": [3, 6, 3], "activation": nn.SiLU, }, "deim_dfine_hgnetv2_m": { "num_decoder_layers": 4, "eval_idx": -1, - "eval_spatial_size": [640, 640], "activation": nn.SiLU, }, "deim_dfine_hgnetv2_l": { @@ -987,12 +979,13 @@ class DFINETransformer: "feat_channels": [384, 384, 384], "reg_scale": 8.0, "eval_idx": -1, - "eval_spatial_size": [640, 640], "activation": nn.SiLU, }, } - def __new__(cls, model_name: str, num_classes: int) -> DFINETransformerModule: + def __new__( + cls, model_name: str, num_classes: int, eval_spatial_size: tuple[int, int] = (640, 640) + ) -> DFINETransformerModule: """Constructor for DFINETransformerModule.""" cfg = cls.decoder_cfg[model_name] - return DFINETransformerModule(num_classes=num_classes, **cfg) + return DFINETransformerModule(num_classes=num_classes, eval_spatial_size=eval_spatial_size, **cfg) diff --git a/library/src/otx/backend/native/tools/adaptive_bs/algorithm.py b/library/src/otx/backend/native/tools/adaptive_bs/algorithm.py index 0391756b07..6268d9e5d7 100644 --- a/library/src/otx/backend/native/tools/adaptive_bs/algorithm.py +++ b/library/src/otx/backend/native/tools/adaptive_bs/algorithm.py @@ -47,8 +47,8 @@ def __init__( self._max_bs = max_bs self._bs_try_history: dict[int, int] = {} self._total_mem = _get_total_memory_size() - self._mem_lower_bound = 0.8 * self._total_mem - self._mem_upper_bound = 0.85 * self._total_mem + self._mem_lower_bound = 0.75 * self._total_mem + self._mem_upper_bound = 0.9 * self._total_mem self._mp_ctx = mp.get_context("spawn") def _try_batch_size(self, bs: int) -> tuple[bool, int]: @@ -115,16 +115,16 @@ def auto_decrease_batch_size(self) -> int: if oom: logger.warning( "The auto batch size algorithm attempted to use a batch size of 2 but still " - "encountered a CUDA OOM error. OTX will proceed with training at batch size 2; " - "however, you will likely encounter a CUDA OOM error once training starts. " - "If the issue persists, please report it accordingly.", + "encountered a CUDA OOM error. 
OTX will proceed with training at batch size 1; "
+                "however, a CUDA OOM error may still occur during training.",
             )
-            return 2
+            return 1

         logger.warning(
             "Even with a batch size of 2, most of the memory is used, "
-            "which could cause the training to fail midway.",
+            "which could cause the training to fail midway. "
+            "For safety, the batch size is decreased to 1.",
         )
-        available_bs = 2
+        available_bs = 1

         return available_bs

@@ -157,9 +157,10 @@ def find_big_enough_batch_size(self, drop_last: bool = False) -> int:
                 raise RuntimeError(msg)
             logger.warning(
                 "Even with a batch size of 2, most of the memory is used, "
-                "which could cause the training to fail midway.",
+                "which could cause the training to fail midway. "
+                "For safety, the batch size is decreased to 1.",
             )
-            return 2
+            return 1

         return self.auto_decrease_batch_size()

@@ -270,6 +271,8 @@ def _run_trial(train_func: Callable[[int], Any], bs: int, trial_queue: mp.Queue)
             or "UR_RESULT_ERROR_OUT_OF_DEVICE_MEMORY" in str(e)
             or "UR error" in str(e)
             or "UR_RESULT_ERROR_UNKNOWN" in str(e)
+            or "UR_RESULT_ERROR_OUT_OF_HOST_MEMORY" in str(e)
+            or "UR_RESULT_ERROR" in str(e)
         ):  # XPU OOM
             oom = True
         else:
diff --git a/library/src/otx/backend/native/tools/adaptive_bs/runner.py b/library/src/otx/backend/native/tools/adaptive_bs/runner.py
index 3c149fec20..47cd817036 100644
--- a/library/src/otx/backend/native/tools/adaptive_bs/runner.py
+++ b/library/src/otx/backend/native/tools/adaptive_bs/runner.py
@@ -114,10 +114,6 @@ def _register_callback(callbacks: list[Callback] | Callback | None = None) -> li
 def _apply_new_batch_size(engine: OTXEngine, new_batch_size: int) -> None:
     origin_bs = engine.datamodule.train_subset.batch_size
-    if is_xpu_available() and new_batch_size != 1:
-        new_batch_size -= 1  # for safety reasons
-    if new_batch_size == origin_bs:
-        return
     engine.datamodule.train_subset.batch_size = new_batch_size
     engine.datamodule.val_subset.batch_size = new_batch_size
     engine.datamodule.test_subset.batch_size = new_batch_size
diff --git a/library/src/otx/data/transform_libs/torchvision.py b/library/src/otx/data/transform_libs/torchvision.py
index 16b25c6559..07eb4f3416 100644
--- a/library/src/otx/data/transform_libs/torchvision.py
+++ b/library/src/otx/data/transform_libs/torchvision.py
@@ -1159,7 +1159,6 @@ def __init__(
     ) -> None:
         super().__init__()
         self._validate_parameters(max_translate_ratio, scaling_ratio_range)
-
         self.max_rotate_degree = max_rotate_degree
         self.max_translate_ratio = max_translate_ratio
         self.scaling_ratio_range = scaling_ratio_range
@@ -1238,7 +1237,13 @@ def forward(self, *_inputs: OTXDataItem) -> OTXDataItem:
         homography_matrix = self._get_random_homography_matrix(height, width)
         output_shape = (height + self.border[0] * 2, width + self.border[1] * 2)

-        if hasattr(inputs, "bboxes") and inputs.bboxes is not None and len(inputs.bboxes) > 0:
+        transformed_img = self._warp_image(img, homography_matrix, output_shape)
+        inputs.image = transformed_img
+        inputs.img_info = _resize_image_info(inputs.img_info, transformed_img.shape[:2])
+        valid_index = None
+        valid_bboxes = hasattr(inputs, "bboxes") and inputs.bboxes is not None and len(inputs.bboxes) > 0
+
+        if valid_bboxes:
             # Test transform bboxes to see if any remain valid
             valid_index = self._transform_bboxes(inputs, homography_matrix, output_shape)
             # If no valid annotations will remain after transformation, skip entirely
@@ -1246,20 +1251,14 @@
                 inputs.image = img
                 return self.convert(inputs)  # type: ignore[return-value]
# If we reach here, transformation will produce valid results, so proceed - # Transform image - transformed_img = self._warp_image(img, homography_matrix, output_shape) - inputs.image = transformed_img - inputs.img_info = _resize_image_info(inputs.img_info, transformed_img.shape[:2]) - - if hasattr(inputs, "masks") and inputs.masks is not None and len(inputs.masks) > 0: - self._transform_masks(inputs, homography_matrix, output_shape, valid_index) + if hasattr(inputs, "masks") and inputs.masks is not None and len(inputs.masks) > 0: + self._transform_masks(inputs, homography_matrix, output_shape, valid_index) - if hasattr(inputs, "polygons") and inputs.polygons is not None and len(inputs.polygons) > 0: - self._transform_polygons(inputs, homography_matrix, output_shape, valid_index) + if hasattr(inputs, "polygons") and inputs.polygons is not None and len(inputs.polygons) > 0: + self._transform_polygons(inputs, homography_matrix, output_shape, valid_index) - if self.recompute_bbox: - self._recompute_bboxes(inputs, output_shape) + if valid_bboxes and self.recompute_bbox: + self._recompute_bboxes(inputs, output_shape) return self.convert(inputs) # type: ignore[return-value] @@ -1321,7 +1320,7 @@ def _transform_masks( inputs: OTXDataItem, warp_matrix: np.ndarray, output_size: tuple[int, int], - valid_index: np.ndarray, + valid_index: np.ndarray | None = None, ) -> None: """Transform masks using the warp matrix. @@ -1335,11 +1334,11 @@ def _transform_masks( return # Convert valid_index to numpy boolean array if it's a tensor - if hasattr(valid_index, "numpy"): + if valid_index is not None and hasattr(valid_index, "numpy"): valid_index = valid_index.numpy() # Filter masks using valid_index first - masks = inputs.masks[valid_index] + masks = inputs.masks[valid_index] if valid_index is not None else inputs.masks masks = masks.numpy() if not isinstance(masks, np.ndarray) else masks if masks.ndim == 3: @@ -1378,15 +1377,20 @@ def _warp_single_mask(self, mask: np.ndarray, warp_matrix: np.ndarray, output_si ) return warped_mask > 127 - msg = "Multi-class masks are not supported yet." - raise NotImplementedError(msg) + return cv2.warpPerspective( + mask.astype(np.uint8), + warp_matrix, + dsize=(width, height), + flags=cv2.INTER_NEAREST, + borderValue=0, + ) def _transform_polygons( self, inputs: OTXDataItem, warp_matrix: np.ndarray, output_shape: tuple[int, int], - valid_index: np.ndarray, + valid_index: np.ndarray | None = None, ) -> None: """Transform polygons using the warp matrix. 
@@ -1405,11 +1409,13 @@
             return

         # Convert valid_index to numpy boolean array if it's a tensor
-        if hasattr(valid_index, "numpy"):
+        if valid_index is not None and hasattr(valid_index, "numpy"):
             valid_index = valid_index.numpy()

-        # Filter polygons using valid_index
-        filtered_polygons = [p for p, keep in zip(inputs.polygons, valid_index) if keep]
+        # Filter polygons using valid_index if available
+        filtered_polygons = (
+            [p for p, keep in zip(inputs.polygons, valid_index) if keep] if valid_index is not None else inputs.polygons
+        )

         if filtered_polygons:
             inputs.polygons = project_polygons(filtered_polygons, warp_matrix, output_shape)
diff --git a/library/src/otx/data/utils/pre_filtering.py b/library/src/otx/data/utils/pre_filtering.py
index 7f4a265cb7..4bf59d9a3c 100644
--- a/library/src/otx/data/utils/pre_filtering.py
+++ b/library/src/otx/data/utils/pre_filtering.py
@@ -10,7 +10,7 @@
 from functools import partial
 from typing import TYPE_CHECKING

-from datumaro.components.annotation import Annotation, Bbox, Ellipse, Polygon
+from datumaro.components.annotation import Annotation, AnnotationType, Bbox, Ellipse, Points, Polygon
 from datumaro.components.dataset import Dataset as DmDataset

 from otx.types.task import OTXTaskType
@@ -19,6 +19,14 @@
     from datumaro.components.dataset_base import DatasetItem


+def get_labels(dataset: DmDataset, task: OTXTaskType) -> list[str]:
+    """Get the labels from the dataset."""
+    # Keypoint labels are stored under the points category rather than the generic
+    # label category (e.g., for arrow datasets).
+    if task == OTXTaskType.KEYPOINT_DETECTION:
+        return dataset.categories()[AnnotationType.points][0].labels
+    return dataset.categories()[AnnotationType.label]
+
+
 def pre_filtering(
     dataset: DmDataset,
     data_format: str,
@@ -42,7 +50,16 @@
         used_background_items = set()
         msg = f"There are empty annotation items in train set, Of these, only {unannotated_items_ratio*100}% are used."
         warnings.warn(msg, stacklevel=2)
-    dataset = DmDataset.filter(dataset, partial(is_valid_anno_for_task, task=task), filter_annotations=True)
+
+    labels = get_labels(dataset, task)
+
+    dataset = DmDataset.filter(
+        dataset,
+        partial(is_valid_anno_for_task, task=task, labels=labels),
+        filter_annotations=True,
+    )
+    if task == OTXTaskType.KEYPOINT_DETECTION:
+        return dataset
     dataset = remove_unused_labels(dataset, data_format, ignore_index)
     if unannotated_items_ratio > 0:
         empty_items = [
@@ -61,7 +78,7 @@
     )


-def is_valid_annot(item: DatasetItem, annotation: Annotation) -> bool:  # noqa: ARG001
+def is_valid_annot(item: DatasetItem, annotation: Annotation, labels: list[str]) -> bool:  # noqa: ARG001
     """Return whether DatasetItem's annotation is valid."""
     if isinstance(annotation, Bbox):
         x1, y1, x2, y2 = annotation.points
@@ -79,28 +96,45 @@ def is_valid_annot(item: DatasetItem, annotation: Annotation) -> bool:  # noqa:
             return True
         msg = "There are invalid polygon, they will be filtered out before training."
         return False
+    if isinstance(annotation, Points):
+        # For keypoint detection, the number of (x, y) pairs must equal the number of labels
+        if len(annotation.points) == 0:
+            msg = "There are invalid points, they will be filtered out before training."
+ warnings.warn(msg, stacklevel=2) + return False + return len(annotation.points) // 2 == len(labels) + return True -def is_valid_anno_for_task(item: DatasetItem, annotation: Annotation, task: OTXTaskType) -> bool: +def is_valid_anno_for_task( + item: DatasetItem, + annotation: Annotation, + task: OTXTaskType, + labels: list[str], +) -> bool: """Return whether DatasetItem's annotation is valid for a specific task. Args: item (DatasetItem): The item to be checked. annotation (Annotation): The annotation to be checked. task (OTXTaskType): The task type of the dataset. + labels (list[str]): The labels of the dataset. Returns: bool: True if the annotation is valid for the task, False otherwise. """ if task == OTXTaskType.DETECTION: - return isinstance(annotation, Bbox) and is_valid_annot(item, annotation) + return isinstance(annotation, Bbox) and is_valid_annot(item, annotation, labels) # Rotated detection is a subset of instance segmentation if task in [OTXTaskType.INSTANCE_SEGMENTATION, OTXTaskType.ROTATED_DETECTION]: - return isinstance(annotation, (Polygon, Bbox, Ellipse)) and is_valid_annot(item, annotation) + return isinstance(annotation, (Polygon, Bbox, Ellipse)) and is_valid_annot(item, annotation, labels) + + if task == OTXTaskType.KEYPOINT_DETECTION: + return isinstance(annotation, Points) and is_valid_annot(item, annotation, labels) - return is_valid_annot(item, annotation) + return is_valid_annot(item, annotation, labels) def remove_unused_labels( diff --git a/library/src/otx/recipe/detection/deim_dfine_l.yaml b/library/src/otx/recipe/detection/deim_dfine_l.yaml index 4753c0f8b0..df29f11c38 100644 --- a/library/src/otx/recipe/detection/deim_dfine_l.yaml +++ b/library/src/otx/recipe/detection/deim_dfine_l.yaml @@ -215,7 +215,22 @@ overrides: train_subset: batch_size: 8 num_workers: 4 - transforms: [] + transforms: + - class_path: otx.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + - class_path: otx.data.transform_libs.torchvision.RandomFlip + init_args: + probability: 0.5 + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: false + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [0.0, 0.0, 0.0] + std: [255.0, 255.0, 255.0] sampler: class_path: otx.data.samplers.balanced_sampler.BalancedSampler diff --git a/library/src/otx/recipe/detection/deim_dfine_m.yaml b/library/src/otx/recipe/detection/deim_dfine_m.yaml index 4b52e73c60..0a8337cceb 100644 --- a/library/src/otx/recipe/detection/deim_dfine_m.yaml +++ b/library/src/otx/recipe/detection/deim_dfine_m.yaml @@ -214,7 +214,22 @@ overrides: train_subset: batch_size: 8 num_workers: 4 - transforms: [] + transforms: + - class_path: otx.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + - class_path: otx.data.transform_libs.torchvision.RandomFlip + init_args: + probability: 0.5 + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: false + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [0.0, 0.0, 0.0] + std: [255.0, 255.0, 255.0] sampler: class_path: otx.data.samplers.balanced_sampler.BalancedSampler diff --git a/library/src/otx/recipe/detection/deim_dfine_x.yaml b/library/src/otx/recipe/detection/deim_dfine_x.yaml index d6d3d31b05..f3d30b8e93 100644 --- a/library/src/otx/recipe/detection/deim_dfine_x.yaml +++ b/library/src/otx/recipe/detection/deim_dfine_x.yaml @@ -215,7 +215,22 @@ 
overrides: train_subset: batch_size: 8 num_workers: 4 - transforms: [] + transforms: + - class_path: otx.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + - class_path: otx.data.transform_libs.torchvision.RandomFlip + init_args: + probability: 0.5 + - class_path: torchvision.transforms.v2.ToDtype + init_args: + dtype: ${as_torch_dtype:torch.float32} + scale: false + - class_path: torchvision.transforms.v2.Normalize + init_args: + mean: [0.0, 0.0, 0.0] + std: [255.0, 255.0, 255.0] sampler: class_path: otx.data.samplers.balanced_sampler.BalancedSampler diff --git a/library/src/otx/recipe/detection/dfine_x.yaml b/library/src/otx/recipe/detection/dfine_x.yaml index ec392d11b3..7b22280f16 100644 --- a/library/src/otx/recipe/detection/dfine_x.yaml +++ b/library/src/otx/recipe/detection/dfine_x.yaml @@ -66,26 +66,25 @@ overrides: batch_size: 8 num_workers: 4 transforms: - - class_path: torchvision.transforms.v2.RandomPhotometricDistort - init_args: - p: 0.5 - class_path: torchvision.transforms.v2.RandomZoomOut + enable: true init_args: fill: 0 - class_path: otx.data.transform_libs.torchvision.RandomIoUCrop + enable: true init_args: probability: 0.8 - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes init_args: min_size: 1 - - class_path: otx.data.transform_libs.torchvision.RandomFlip - init_args: - probability: 0.5 - class_path: otx.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) transform_bbox: true keep_ratio: false + - class_path: otx.data.transform_libs.torchvision.RandomFlip + init_args: + probability: 0.5 - class_path: torchvision.transforms.v2.RandomPhotometricDistort enable: false init_args: diff --git a/library/src/otx/recipe/detection/dfine_x_tile.yaml b/library/src/otx/recipe/detection/dfine_x_tile.yaml index 0f59712a53..c4f5e5c9fa 100644 --- a/library/src/otx/recipe/detection/dfine_x_tile.yaml +++ b/library/src/otx/recipe/detection/dfine_x_tile.yaml @@ -68,26 +68,25 @@ overrides: num_workers: 4 to_tv_image: true transforms: - - class_path: torchvision.transforms.v2.RandomPhotometricDistort - init_args: - p: 0.5 - class_path: torchvision.transforms.v2.RandomZoomOut + enable: true init_args: fill: 0 - class_path: otx.data.transform_libs.torchvision.RandomIoUCrop + enable: true init_args: probability: 0.8 - class_path: torchvision.transforms.v2.SanitizeBoundingBoxes init_args: min_size: 1 - - class_path: otx.data.transform_libs.torchvision.RandomFlip - init_args: - probability: 0.5 - class_path: otx.data.transform_libs.torchvision.Resize init_args: scale: $(input_size) transform_bbox: true keep_ratio: false + - class_path: otx.data.transform_libs.torchvision.RandomFlip + init_args: + probability: 0.5 - class_path: torchvision.transforms.v2.RandomPhotometricDistort enable: false init_args: diff --git a/library/src/otx/recipe/detection/rtdetr_101.yaml b/library/src/otx/recipe/detection/rtdetr_101.yaml index 06ea3f0f33..3078b46a98 100644 --- a/library/src/otx/recipe/detection/rtdetr_101.yaml +++ b/library/src/otx/recipe/detection/rtdetr_101.yaml @@ -63,6 +63,13 @@ overrides: train_subset: batch_size: 4 transforms: + - class_path: otx.data.transform_libs.torchvision.MinIoURandomCrop + enable: false + - class_path: otx.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true - class_path: torchvision.transforms.v2.RandomPhotometricDistort enable: false init_args: @@ -79,11 +86,6 @@ overrides: - -0.05 - 0.05 p: 0.5 - - class_path: 
otx.data.transform_libs.torchvision.Resize - init_args: - scale: $(input_size) - keep_ratio: false - transform_bbox: true - class_path: otx.data.transform_libs.torchvision.RandomAffine enable: false init_args: @@ -94,6 +96,7 @@ overrides: - 1.5 max_shear_degree: 2.0 - class_path: otx.data.transform_libs.torchvision.RandomFlip + enable: true init_args: probability: 0.5 - class_path: torchvision.transforms.v2.RandomVerticalFlip diff --git a/library/src/otx/recipe/detection/rtdetr_18.yaml b/library/src/otx/recipe/detection/rtdetr_18.yaml index 088b3b317e..44da32abb1 100644 --- a/library/src/otx/recipe/detection/rtdetr_18.yaml +++ b/library/src/otx/recipe/detection/rtdetr_18.yaml @@ -62,6 +62,13 @@ overrides: train_subset: batch_size: 4 transforms: + - class_path: otx.data.transform_libs.torchvision.MinIoURandomCrop + enable: false + - class_path: otx.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true - class_path: torchvision.transforms.v2.RandomPhotometricDistort enable: false init_args: @@ -78,11 +85,6 @@ overrides: - -0.05 - 0.05 p: 0.5 - - class_path: otx.data.transform_libs.torchvision.Resize - init_args: - scale: $(input_size) - keep_ratio: false - transform_bbox: true - class_path: otx.data.transform_libs.torchvision.RandomAffine enable: false init_args: @@ -93,6 +95,7 @@ overrides: - 1.5 max_shear_degree: 2.0 - class_path: otx.data.transform_libs.torchvision.RandomFlip + enable: true init_args: probability: 0.5 - class_path: torchvision.transforms.v2.RandomVerticalFlip diff --git a/library/src/otx/recipe/detection/rtdetr_50.yaml b/library/src/otx/recipe/detection/rtdetr_50.yaml index f1da587a62..096c73868a 100644 --- a/library/src/otx/recipe/detection/rtdetr_50.yaml +++ b/library/src/otx/recipe/detection/rtdetr_50.yaml @@ -63,6 +63,13 @@ overrides: train_subset: batch_size: 4 transforms: + - class_path: otx.data.transform_libs.torchvision.MinIoURandomCrop + enable: false + - class_path: otx.data.transform_libs.torchvision.Resize + init_args: + scale: $(input_size) + keep_ratio: false + transform_bbox: true - class_path: torchvision.transforms.v2.RandomPhotometricDistort enable: false init_args: @@ -79,11 +86,6 @@ overrides: - -0.05 - 0.05 p: 0.5 - - class_path: otx.data.transform_libs.torchvision.Resize - init_args: - scale: $(input_size) - keep_ratio: false - transform_bbox: true - class_path: otx.data.transform_libs.torchvision.RandomAffine enable: false init_args: @@ -94,6 +96,7 @@ overrides: - 1.5 max_shear_degree: 2.0 - class_path: otx.data.transform_libs.torchvision.RandomFlip + enable: true init_args: probability: 0.5 - class_path: torchvision.transforms.v2.RandomVerticalFlip diff --git a/library/src/otx/recipe/detection/rtmdet_tiny.yaml b/library/src/otx/recipe/detection/rtmdet_tiny.yaml index 577d77410c..99ed5c5047 100644 --- a/library/src/otx/recipe/detection/rtmdet_tiny.yaml +++ b/library/src/otx/recipe/detection/rtmdet_tiny.yaml @@ -81,15 +81,6 @@ overrides: - class_path: otx.data.transform_libs.torchvision.RandomCrop init_args: crop_size: $(input_size) - - class_path: otx.data.transform_libs.torchvision.RandomAffine - enable: false - init_args: - max_rotate_degree: 10.0 - max_translate_ratio: 0.1 - scaling_ratio_range: - - 0.5 - - 1.5 - max_shear_degree: 2.0 - class_path: torchvision.transforms.v2.RandomPhotometricDistort enable: false init_args: @@ -106,6 +97,15 @@ overrides: - -0.05 - 0.05 p: 0.5 + - class_path: otx.data.transform_libs.torchvision.RandomAffine + enable: false + init_args: + 
max_rotate_degree: 10.0 + max_translate_ratio: 0.1 + scaling_ratio_range: + - 0.5 + - 1.5 + max_shear_degree: 2.0 - class_path: otx.data.transform_libs.torchvision.YOLOXHSVRandomAug - class_path: otx.data.transform_libs.torchvision.RandomFlip init_args: diff --git a/library/src/otx/tools/converter.py b/library/src/otx/tools/converter.py index 1e8dbea15f..aaf2621046 100644 --- a/library/src/otx/tools/converter.py +++ b/library/src/otx/tools/converter.py @@ -272,6 +272,15 @@ def update_num_iters(param_value: int | None, config: dict) -> None: config["max_epochs"] = param_value +def update_batch_size(param_value: int | None, config: dict) -> None: + """Update batch size in the config.""" + if param_value is None: + logging.info("Batch size is not provided, skipping update.") + return + config["data"]["train_subset"]["batch_size"] = param_value + config["data"]["val_subset"]["batch_size"] = param_value + + def update_early_stopping(early_stopping_cfg: dict | None, config: dict) -> None: """Update early stopping parameters in the config.""" if early_stopping_cfg is None: @@ -483,6 +492,7 @@ def _update_params(config: dict, param_dict: dict) -> None: update_tiling(tiling, config) update_augmentations(augmentation_params, config) update_learning_rate(training_parameters.get("learning_rate", None), config) + update_batch_size(training_parameters.get("batch_size", None), config) update_num_iters(training_parameters.get("max_epochs", None), config) update_early_stopping(training_parameters.get("early_stopping", None), config) update_input_size( diff --git a/library/tests/assets/geti/model_configs/detection.yaml b/library/tests/assets/geti/model_configs/detection.yaml index 282b65e9a1..fea9c36b57 100644 --- a/library/tests/assets/geti/model_configs/detection.yaml +++ b/library/tests/assets/geti/model_configs/detection.yaml @@ -70,6 +70,7 @@ hyperparameters: enable: true patience: 10 learning_rate: 0.001 + batch_size: 4 input_size_width: 800 input_size_height: 992 evaluation: diff --git a/library/tests/unit/backend/native/tools/adaptive_bs/test_bs_search_algo.py b/library/tests/unit/backend/native/tools/adaptive_bs/test_bs_search_algo.py index a6ed580a01..6725d8f481 100644 --- a/library/tests/unit/backend/native/tools/adaptive_bs/test_bs_search_algo.py +++ b/library/tests/unit/backend/native/tools/adaptive_bs/test_bs_search_algo.py @@ -68,9 +68,9 @@ def mock_train_func(batch_size) -> int: msg = "CUDA out of memory." 
raise RuntimeError(msg) if batch_size > max_runnable_bs: - mem_usage = 8500 + 1500 * batch_size / (cuda_oom_bound - max_runnable_bs) + mem_usage = 9000 + 1500 * batch_size / (cuda_oom_bound - max_runnable_bs) else: - mem_usage = 8500 * batch_size / max_runnable_bs + mem_usage = 9000 * batch_size / max_runnable_bs self.mock_torch.cuda.max_memory_reserved.return_value = mem_usage return mem_usage @@ -110,14 +110,14 @@ def test_find_max_usable_bs_gpu_memory_too_small(self): mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) - assert bs_search_algo.auto_decrease_batch_size() == 2 + assert bs_search_algo.auto_decrease_batch_size() == 1 def test_auto_decrease_batch_size_bs2_not_oom_but_most_mem(self): """Batch size 2 doesn't make oom but use most of memory.""" mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) - assert bs_search_algo.auto_decrease_batch_size() == 2 + assert bs_search_algo.auto_decrease_batch_size() == 1 @pytest.mark.parametrize( ("max_runnable_bs", "max_bs", "expected_bs"), @@ -135,7 +135,7 @@ def test_find_big_enough_batch_size(self, max_runnable_bs, max_bs, expected_bs): adapted_bs = bs_search_algo.find_big_enough_batch_size() if expected_bs is None: - assert 7500 <= mock_train_func(adapted_bs) <= 8500 + assert 7500 <= mock_train_func(adapted_bs) <= 9000 else: assert adapted_bs == expected_bs @@ -143,14 +143,14 @@ def test_find_big_enough_batch_size_gpu_memory_too_small(self): mock_train_func = self.get_mock_train_func(cuda_oom_bound=1, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 128, 1000) - assert bs_search_algo.find_big_enough_batch_size() == 2 + assert bs_search_algo.find_big_enough_batch_size() == 1 def test_find_big_enough_batch_size_bs2_not_oom_but_most_mem(self): """Batch size 2 doesn't make oom but use most of memory.""" mock_train_func = self.get_mock_train_func(cuda_oom_bound=2, max_runnable_bs=1) bs_search_algo = BsSearchAlgo(mock_train_func, 2, 1000) - assert bs_search_algo.find_big_enough_batch_size() == 2 + assert bs_search_algo.find_big_enough_batch_size() == 1 def test_find_big_enough_batch_size_gradient_zero(self): def mock_train_func(batch_size) -> int: @@ -167,7 +167,7 @@ def mock_train_func(batch_size) -> int: bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000) adapted_bs = bs_search_algo.find_big_enough_batch_size() - assert adapted_bs == 100 + assert adapted_bs == 102 def test_find_big_enough_batch_size_not_exceed_upper_bound(self): def mock_train_func(batch_size) -> int: @@ -184,7 +184,7 @@ def mock_train_func(batch_size) -> int: bs_search_algo = BsSearchAlgo(mock_train_func, 64, 1000) adapted_bs = bs_search_algo.find_big_enough_batch_size() - assert mock_train_func(adapted_bs) <= 8500 + assert mock_train_func(adapted_bs) <= 9000 def test_find_big_enough_batch_size_drop_last(self): mock_train_func = self.get_mock_train_func(cuda_oom_bound=10000, max_runnable_bs=180) diff --git a/library/tests/unit/data/test_pre_filtering.py b/library/tests/unit/data/test_pre_filtering.py index 53a0831294..84371188e8 100644 --- a/library/tests/unit/data/test_pre_filtering.py +++ b/library/tests/unit/data/test_pre_filtering.py @@ -2,7 +2,7 @@ # SPDX-License-Identifier: Apache-2.0 import pytest -from datumaro.components.annotation import AnnotationType, Bbox, Ellipse, Label, Polygon +from datumaro.components.annotation import AnnotationType, Bbox, Ellipse, Label, Points, 
Polygon from datumaro.components.dataset import Dataset as DmDataset from datumaro.components.dataset_base import DatasetItem @@ -137,6 +137,26 @@ class TestIsValidAnnoForTask: (OTXTaskType.ROTATED_DETECTION, Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), True), (OTXTaskType.ROTATED_DETECTION, Ellipse(x1=0, y1=0, x2=10, y2=10, label=0), True), (OTXTaskType.ROTATED_DETECTION, Label(label=0), False), + # KEYPOINT_DETECTION task tests + ( + OTXTaskType.KEYPOINT_DETECTION, + Points(points=[10, 20, 30, 40], label=0), + True, + ), # 2 keypoints, will use 2 labels + ( + OTXTaskType.KEYPOINT_DETECTION, + Points(points=[10, 20, 30, 40, 50, 60], label=0), + True, + ), # 3 keypoints, will use 3 labels + (OTXTaskType.KEYPOINT_DETECTION, Points(points=[10, 20], label=0), True), # 1 keypoint, will use 1 label + (OTXTaskType.KEYPOINT_DETECTION, Points(points=[], label=0), False), # 0 keypoints, will use 0 labels + (OTXTaskType.KEYPOINT_DETECTION, Bbox(x=0, y=0, w=10, h=10, label=0), False), # Wrong type + ( + OTXTaskType.KEYPOINT_DETECTION, + Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0), + False, + ), # Wrong type + (OTXTaskType.KEYPOINT_DETECTION, Label(label=0), False), # Wrong type ], ) def test_is_valid_anno_for_task( @@ -154,25 +174,34 @@ def test_is_valid_anno_for_task( annotation: The annotation to test expected: Expected result (True if valid, False if invalid) """ - result = is_valid_anno_for_task(fxt_dataset_item, annotation, task) + # For keypoint detection, we need to provide the correct number of labels + # based on the number of keypoints in the annotation + if task == OTXTaskType.KEYPOINT_DETECTION and isinstance(annotation, Points): + # Calculate expected number of labels based on points (each keypoint is x,y pair) + expected_labels = len(annotation.points) // 2 + labels = [f"keypoint_{i}" for i in range(expected_labels)] + else: + labels = [0] + + result = is_valid_anno_for_task(fxt_dataset_item, annotation, task, labels) assert result == expected, f"Expected {expected} for task {task} with annotation {type(annotation).__name__}" def test_detection_task_with_valid_bbox(self, fxt_dataset_item: DatasetItem) -> None: """Test DETECTION task with valid bounding box.""" bbox = Bbox(x=5, y=5, w=20, h=15, label=0) - result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION) + result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION, [0]) assert result is True def test_detection_task_with_invalid_bbox(self, fxt_dataset_item: DatasetItem) -> None: """Test DETECTION task with invalid bounding box (negative dimensions).""" bbox = Bbox(x=10, y=10, w=-5, h=-5, label=0) - result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION) + result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION, [0]) assert result is False def test_detection_task_with_zero_dimension_bbox(self, fxt_dataset_item: DatasetItem) -> None: """Test DETECTION task with zero dimension bounding box.""" bbox = Bbox(x=10, y=10, w=0, h=0, label=0) - result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION) + result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.DETECTION, [0]) assert result is False def test_detection_task_with_wrong_annotation_type(self, fxt_dataset_item: DatasetItem) -> None: @@ -181,9 +210,9 @@ def test_detection_task_with_wrong_annotation_type(self, fxt_dataset_item: Datas ellipse = Ellipse(x1=0, y1=0, x2=10, y2=10, label=0) label = Label(label=0) - assert 
is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.DETECTION) is False - assert is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.DETECTION) is False - assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.DETECTION) is False + assert is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.DETECTION, [0]) is False + assert is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.DETECTION, [0]) is False + assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.DETECTION, [0]) is False def test_instance_segmentation_task_with_valid_annotations(self, fxt_dataset_item: DatasetItem) -> None: """Test INSTANCE_SEGMENTATION task with valid annotation types.""" @@ -191,9 +220,9 @@ def test_instance_segmentation_task_with_valid_annotations(self, fxt_dataset_ite polygon = Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0) ellipse = Ellipse(x1=0, y1=0, x2=10, y2=10, label=0) - assert is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.INSTANCE_SEGMENTATION) is True - assert is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.INSTANCE_SEGMENTATION) is True - assert is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.INSTANCE_SEGMENTATION) is True + assert is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is True + assert is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is True + assert is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is True def test_instance_segmentation_task_with_invalid_annotations(self, fxt_dataset_item: DatasetItem) -> None: """Test INSTANCE_SEGMENTATION task with invalid annotation types.""" @@ -201,9 +230,11 @@ def test_instance_segmentation_task_with_invalid_annotations(self, fxt_dataset_i invalid_polygon = Polygon(points=[0, 0, 0, 0, 0, 0], label=0) # Degenerate polygon label = Label(label=0) # Wrong type - assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.INSTANCE_SEGMENTATION) is False - assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.INSTANCE_SEGMENTATION) is False - assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.INSTANCE_SEGMENTATION) is False + assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is False + assert ( + is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is False + ) + assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is False def test_other_task_types_use_default_validation(self, fxt_dataset_item: DatasetItem) -> None: """Test that other task types use the default is_valid_annot behavior.""" @@ -214,33 +245,128 @@ def test_other_task_types_use_default_validation(self, fxt_dataset_item: Dataset label = Label(label=0) # Test with CLASSIFICATION task - assert is_valid_anno_for_task(fxt_dataset_item, valid_bbox, OTXTaskType.MULTI_CLASS_CLS) is True - assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.MULTI_CLASS_CLS) is False - assert is_valid_anno_for_task(fxt_dataset_item, valid_polygon, OTXTaskType.MULTI_CLASS_CLS) is True - assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.MULTI_CLASS_CLS) is False - assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.MULTI_CLASS_CLS) is True + assert is_valid_anno_for_task(fxt_dataset_item, valid_bbox, OTXTaskType.MULTI_CLASS_CLS, [0]) is True + 
assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.MULTI_CLASS_CLS, [0]) is False + assert is_valid_anno_for_task(fxt_dataset_item, valid_polygon, OTXTaskType.MULTI_CLASS_CLS, [0]) is True + assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.MULTI_CLASS_CLS, [0]) is False + assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.MULTI_CLASS_CLS, [0]) is True # Test with SEMANTIC_SEGMENTATION task - assert is_valid_anno_for_task(fxt_dataset_item, valid_bbox, OTXTaskType.SEMANTIC_SEGMENTATION) is True - assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.SEMANTIC_SEGMENTATION) is False - assert is_valid_anno_for_task(fxt_dataset_item, valid_polygon, OTXTaskType.SEMANTIC_SEGMENTATION) is True - assert is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.SEMANTIC_SEGMENTATION) is False - assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.SEMANTIC_SEGMENTATION) is True + assert is_valid_anno_for_task(fxt_dataset_item, valid_bbox, OTXTaskType.SEMANTIC_SEGMENTATION, [0]) is True + assert is_valid_anno_for_task(fxt_dataset_item, invalid_bbox, OTXTaskType.SEMANTIC_SEGMENTATION, [0]) is False + assert is_valid_anno_for_task(fxt_dataset_item, valid_polygon, OTXTaskType.SEMANTIC_SEGMENTATION, [0]) is True + assert ( + is_valid_anno_for_task(fxt_dataset_item, invalid_polygon, OTXTaskType.SEMANTIC_SEGMENTATION, [0]) is False + ) + assert is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.SEMANTIC_SEGMENTATION, [0]) is True def test_edge_cases(self, fxt_dataset_item: DatasetItem) -> None: """Test edge cases for annotation validation.""" # Very small but valid bbox small_bbox = Bbox(x=0, y=0, w=0.1, h=0.1, label=0) - assert is_valid_anno_for_task(fxt_dataset_item, small_bbox, OTXTaskType.DETECTION) is True + assert is_valid_anno_for_task(fxt_dataset_item, small_bbox, OTXTaskType.DETECTION, [0]) is True # Bbox with equal coordinates (should be invalid) equal_bbox = Bbox(x=5, y=5, w=0, h=0, label=0) - assert is_valid_anno_for_task(fxt_dataset_item, equal_bbox, OTXTaskType.DETECTION) is False + assert is_valid_anno_for_task(fxt_dataset_item, equal_bbox, OTXTaskType.DETECTION, [0]) is False # Polygon with minimal valid area minimal_polygon = Polygon(points=[0, 0, 1, 0, 1, 1, 0, 1], label=0) - assert is_valid_anno_for_task(fxt_dataset_item, minimal_polygon, OTXTaskType.INSTANCE_SEGMENTATION) is True + assert is_valid_anno_for_task(fxt_dataset_item, minimal_polygon, OTXTaskType.INSTANCE_SEGMENTATION, [0]) is True # Degenerate polygon (should be invalid) degenerate_polygon = Polygon(points=[0, 0, 0, 0, 0, 0], label=0) - assert is_valid_anno_for_task(fxt_dataset_item, degenerate_polygon, OTXTaskType.INSTANCE_SEGMENTATION) is False + assert ( + is_valid_anno_for_task(fxt_dataset_item, degenerate_polygon, OTXTaskType.INSTANCE_SEGMENTATION, [0]) + is False + ) + + def test_keypoint_detection_task_with_valid_points(self, fxt_dataset_item: DatasetItem) -> None: + """Test KEYPOINT_DETECTION task with valid Points annotations.""" + # Test with 2 keypoints (4 coordinates: x1, y1, x2, y2) + points_2_kp = Points(points=[10, 20, 30, 40], label=0) + labels_2 = ["left_eye", "right_eye"] + result = is_valid_anno_for_task(fxt_dataset_item, points_2_kp, OTXTaskType.KEYPOINT_DETECTION, labels_2) + assert result is True + + # Test with 4 keypoints (8 coordinates: x1, y1, x2, y2, x3, y3, x4, y4) + points_4_kp = Points(points=[10, 20, 30, 40, 50, 60, 70, 80], label=0) + labels_4 = ["left_eye", "right_eye", 
"nose", "mouth"] + result = is_valid_anno_for_task(fxt_dataset_item, points_4_kp, OTXTaskType.KEYPOINT_DETECTION, labels_4) + assert result is True + + # Test with single keypoint (2 coordinates: x1, y1) + points_1_kp = Points(points=[10, 20], label=0) + labels_1 = ["center"] + result = is_valid_anno_for_task(fxt_dataset_item, points_1_kp, OTXTaskType.KEYPOINT_DETECTION, labels_1) + assert result is True + + def test_keypoint_detection_task_with_invalid_points(self, fxt_dataset_item: DatasetItem) -> None: + """Test KEYPOINT_DETECTION task with invalid Points annotations.""" + # Test with empty points + empty_points = Points(points=[], label=0) + labels = ["keypoint1", "keypoint2"] + result = is_valid_anno_for_task(fxt_dataset_item, empty_points, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + # Test with wrong number of keypoints (too many) + too_many_points = Points(points=[10, 20, 30, 40, 50, 60], label=0) # 3 keypoints + labels = ["keypoint1", "keypoint2"] # Only 2 labels + result = is_valid_anno_for_task(fxt_dataset_item, too_many_points, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + # Test with wrong number of keypoints (too few) + too_few_points = Points(points=[10, 20], label=0) # 1 keypoint + labels = ["keypoint1", "keypoint2", "keypoint3"] # 3 labels + result = is_valid_anno_for_task(fxt_dataset_item, too_few_points, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + def test_keypoint_detection_task_with_wrong_annotation_types(self, fxt_dataset_item: DatasetItem) -> None: + """Test KEYPOINT_DETECTION task with non-Points annotation types.""" + labels = ["keypoint1", "keypoint2"] + + # Test with bbox (should be invalid) + bbox = Bbox(x=0, y=0, w=10, h=10, label=0) + result = is_valid_anno_for_task(fxt_dataset_item, bbox, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + # Test with polygon (should be invalid) + polygon = Polygon(points=[0, 0, 10, 0, 10, 10, 0, 10], label=0) + result = is_valid_anno_for_task(fxt_dataset_item, polygon, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + # Test with ellipse (should be invalid) + ellipse = Ellipse(x1=0, y1=0, x2=10, y2=10, label=0) + result = is_valid_anno_for_task(fxt_dataset_item, ellipse, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + # Test with label (should be invalid) + label = Label(label=0) + result = is_valid_anno_for_task(fxt_dataset_item, label, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is False + + def test_keypoint_detection_edge_cases(self, fxt_dataset_item: DatasetItem) -> None: + """Test edge cases for keypoint detection validation.""" + # Test with zero coordinates (empty points) + empty_points = Points(points=[], label=0) + empty_labels = [] + result = is_valid_anno_for_task(fxt_dataset_item, empty_points, OTXTaskType.KEYPOINT_DETECTION, empty_labels) + assert result is False # Empty points should be invalid + + # Test with many keypoints + many_points = Points(points=list(range(34)), label=0) # 17 keypoints (34 coordinates) + many_labels = [f"keypoint_{i}" for i in range(17)] + result = is_valid_anno_for_task(fxt_dataset_item, many_points, OTXTaskType.KEYPOINT_DETECTION, many_labels) + assert result is True + + # Test with negative coordinates (should still be valid as coordinates can be negative) + negative_points = Points(points=[-10, -20, -30, -40], label=0) + labels = ["keypoint1", "keypoint2"] + result = is_valid_anno_for_task(fxt_dataset_item, negative_points, 
OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is True + + # Test with floating point coordinates + float_points = Points(points=[10.5, 20.7, 30.1, 40.9], label=0) + labels = ["keypoint1", "keypoint2"] + result = is_valid_anno_for_task(fxt_dataset_item, float_points, OTXTaskType.KEYPOINT_DETECTION, labels) + assert result is True diff --git a/library/tests/unit/data/transform_libs/test_torchvision.py b/library/tests/unit/data/transform_libs/test_torchvision.py index 966f4f8c1a..02d39446ac 100644 --- a/library/tests/unit/data/transform_libs/test_torchvision.py +++ b/library/tests/unit/data/transform_libs/test_torchvision.py @@ -51,6 +51,17 @@ def close(self): return +@pytest.fixture() +def seg_data_entity() -> OTXDataItem: + masks = torch.randint(low=0, high=2, size=(1, 112, 224), dtype=torch.uint8) + return OTXDataItem( + image=tv_tensors.Image(torch.randint(low=0, high=256, size=(3, 112, 224), dtype=torch.uint8)), + img_info=ImageInfo(img_idx=0, img_shape=(112, 224), ori_shape=(112, 224)), + masks=tv_tensors.Mask(masks), + label=LongTensor([1]), + ) + + @pytest.fixture() def det_data_entity() -> OTXDataItem: return OTXDataItem( @@ -359,6 +370,22 @@ def test_forward(self, random_affine: RandomAffine, det_data_entity: OTXDataItem assert results.bboxes.dtype == torch.float32 assert results.img_info.img_shape == results.image.shape[:2] + def test_segmentation_transform( + self, random_affine_with_mask_transform: RandomAffine, seg_data_entity: OTXDataItem + ) -> None: + """Test forward for segmentation task.""" + original_entity = deepcopy(seg_data_entity) + results = random_affine_with_mask_transform(original_entity) + + assert hasattr(results, "masks") + assert results.masks is not None + assert results.masks.shape[0] > 0 # Should have masks + assert results.masks.shape[1:] == results.image.shape[:2] # Same spatial dimensions as image + + # Check that the number of masks matches the number of remaining bboxes and labels + assert results.masks.shape[0] == results.label.shape[0] + assert isinstance(results.masks, tv_tensors.Mask) + def test_forward_with_masks_transform_enabled( self, random_affine_with_mask_transform: RandomAffine, diff --git a/library/tests/unit/tools/test_converter.py b/library/tests/unit/tools/test_converter.py index f1856bbcd4..ac39edfab1 100644 --- a/library/tests/unit/tools/test_converter.py +++ b/library/tests/unit/tools/test_converter.py @@ -15,8 +15,8 @@ def test_convert(self): config = GetiConfigConverter.convert(asdict(otx_config)) assert config["data"]["input_size"] == (992, 800) - assert config["data"]["train_subset"]["batch_size"] == 8 - assert config["data"]["val_subset"]["batch_size"] == 8 + assert config["data"]["train_subset"]["batch_size"] == 4 + assert config["data"]["val_subset"]["batch_size"] == 4 assert config["data"]["test_subset"]["batch_size"] == 8 assert config["model"]["init_args"]["optimizer"]["init_args"]["lr"] == 0.001 assert config["max_epochs"] == 100 @@ -266,8 +266,8 @@ def test_instantiate(self, tmp_path): assert engine.work_dir == tmp_path assert engine.datamodule.data_root == data_root - assert engine.datamodule.train_subset.batch_size == 8 - assert engine.datamodule.val_subset.batch_size == 8 + assert engine.datamodule.train_subset.batch_size == 4 + assert engine.datamodule.val_subset.batch_size == 4 assert engine.datamodule.test_subset.batch_size == 8 assert engine.datamodule.train_subset.num_workers == 2 assert engine.datamodule.val_subset.num_workers == 2
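
The checkpoint fallback added to d_fine.py above is easiest to see in isolation. Below is a minimal sketch, assuming illustrative names (TinyHead, load_with_fallback) rather than the real OTX classes: a strict load fails when a size-dependent, non-trainable buffer was saved for a different input size, so the buffer is dropped and the load is retried with strict=False, mirroring the pops of model.decoder.anchors and model.decoder.valid_mask.

import torch
from torch import nn


class TinyHead(nn.Module):
    """Toy module with a non-trainable buffer whose shape depends on the input size."""

    def __init__(self, num_anchors: int) -> None:
        super().__init__()
        self.register_buffer("anchors", torch.zeros(num_anchors, 4))


def load_with_fallback(model: nn.Module, ckpt: dict) -> None:
    try:
        model.load_state_dict(ckpt)
    except RuntimeError:
        # Size mismatch: drop the regenerable buffer and retry non-strictly.
        ckpt.pop("anchors", None)
        model.load_state_dict(ckpt, strict=False)


src, dst = TinyHead(100), TinyHead(400)
load_with_fallback(dst, src.state_dict())  # succeeds via the non-strict path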
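
The keypoint pre-filtering rule added to pre_filtering.py boils down to one check. Here is a self-contained sketch of that rule, with is_valid_points as a hypothetical stand-in for the Points branch of is_valid_annot: a flat [x1, y1, x2, y2, ...] list is valid only when it is non-empty and contains exactly one (x, y) pair per keypoint label.

def is_valid_points(points: list[float], labels: list[str]) -> bool:
    """Keep a Points annotation only if it has one (x, y) pair per label."""
    if len(points) == 0:
        return False
    return len(points) // 2 == len(labels)


assert is_valid_points([10, 20, 30, 40], ["left_eye", "right_eye"])  # 2 pairs, 2 labels
assert not is_valid_points([10, 20], ["left_eye", "right_eye"])      # 1 pair, 2 labels
assert not is_valid_points([], [])                                   # empty is always dropped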
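
The adaptive batch size search above now targets a wider memory window: a trial is accepted when peak memory lands between 75% and 90% of total device memory, instead of the previous 80% to 85%. A sketch of that acceptance test, with within_target_window as an illustrative name rather than a BsSearchAlgo method:

def within_target_window(peak_mem: float, total_mem: float) -> bool:
    """Accept a trial batch size when peak memory falls in [75%, 90%] of total memory."""
    return 0.75 * total_mem <= peak_mem <= 0.9 * total_mem


assert within_target_window(8_000, 10_000)      # inside the widened window
assert not within_target_window(9_500, 10_000)  # too close to OOM; the search decreases bs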
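
The RandomAffine change replaces the NotImplementedError for multi-class masks with a nearest-neighbor cv2.warpPerspective, matching the call in the diff. A small sketch of why INTER_NEAREST is the right flag there: it never interpolates between class ids, so the warped mask contains only labels that existed in the input.

import cv2
import numpy as np

# Index mask with class ids 0..2; a pure translation homography for illustration.
mask = np.random.randint(0, 3, size=(64, 64), dtype=np.uint8)
warp = np.array([[1.0, 0.0, 5.0], [0.0, 1.0, 3.0], [0.0, 0.0, 1.0]], dtype=np.float32)

warped = cv2.warpPerspective(mask, warp, dsize=(64, 64), flags=cv2.INTER_NEAREST, borderValue=0)
assert set(np.unique(warped)).issubset({0, 1, 2})  # no interpolated class ids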