|
1 | | -# Copyright (C) 2024 Intel Corporation |
| 1 | +# Copyright (C) 2024-2025 Intel Corporation |
2 | 2 | # SPDX-License-Identifier: Apache-2.0 |
3 | 3 | # |
4 | 4 | """Implementation of common transformer layers.""" |
|
10 | 10 | from typing import Callable |
11 | 11 |
|
12 | 12 | import torch |
| 13 | +import torch.nn.functional as f |
13 | 14 | from otx.algo.common.utils.utils import get_clones |
14 | 15 | from otx.algo.modules.transformer import deformable_attention_core_func |
15 | 16 | from torch import Tensor, nn |
@@ -306,6 +307,151 @@ def forward( |
306 | 307 | return self.output_proj(output) |
307 | 308 |
|
308 | 309 |
|
| 310 | +class MSDeformableAttentionV2(nn.Module): |
| 311 | + """Multi-Scale Deformable Attention Module V2. |
| 312 | +
|
| 313 | + Note: |
| 314 | + Unlike the vanilla MSDeformableAttention, this module uses a distinct |
| 315 | + number of sampling points for the features at each scale. |
| 316 | + Refer to RTDETRv2. |
| 317 | +
|
| 318 | + Args: |
| 319 | + embed_dim (int): The number of expected features in the input. |
| 321 | + num_heads (int): The number of heads in the multi-head attention module. |
| 322 | + num_levels (int): The number of feature levels in MSDeformableAttention. |
| 323 | + num_points_list (list[int]): Number of sampling points for each feature level. Defaults to [3, 6, 3]. |
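| | + |
| | + Example: |
| | + A minimal usage sketch; the shapes and values below are illustrative assumptions, |
| | + not taken from the actual RT-DETR pipeline. |
| | + |
| | + >>> attn = MSDeformableAttentionV2(embed_dim=256, num_heads=8, num_levels=3) |
| | + >>> spatial_shapes = [[32, 32], [16, 16], [8, 8]] |
| | + >>> query = torch.rand(2, 100, 256) |
| | + >>> # normalized (cx, cy, w, h) boxes with a broadcastable level dim |
| | + >>> reference_points = torch.rand(2, 100, 1, 4) |
| | + >>> # one value tensor per level: [bs, n_head, head_dim, H_l * W_l] |
| | + >>> value = [torch.rand(2, 8, 32, h * w) for h, w in spatial_shapes] |
| | + >>> output = attn(query, reference_points, value, spatial_shapes)  # [2, 100, 256] |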
| 323 | + """ |
| 324 | + |
| 325 | + def __init__( |
| 326 | + self, |
| 327 | + embed_dim: int = 256, |
| 328 | + num_heads: int = 8, |
| 329 | + num_levels: int = 4, |
| 330 | + num_points_list: list[int] = [3, 6, 3], # noqa: B006 |
| 331 | + ) -> None: |
| 332 | + super().__init__() |
| 333 | + self.embed_dim = embed_dim |
| 334 | + self.num_heads = num_heads |
| 335 | + self.num_levels = num_levels |
| 336 | + self.num_points_list = num_points_list |
| 337 | + |
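| | + # per-point scale (1 / n for every point of a level with n points), applied when reference points are boxes |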
| 338 | + num_points_scale = [1 / n for n in num_points_list for _ in range(n)] |
| 339 | + self.register_buffer( |
| 340 | + "num_points_scale", |
| 341 | + torch.tensor(num_points_scale, dtype=torch.float32), |
| 342 | + ) |
| 343 | + |
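| | + # a 2-D offset and a scalar attention weight are predicted for every (head, point) pair |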
| 344 | + self.total_points = num_heads * sum(num_points_list) |
| 345 | + self.head_dim = embed_dim // num_heads |
| 346 | + |
| 347 | + self.sampling_offsets = nn.Linear(embed_dim, self.total_points * 2) |
| 348 | + self.attention_weights = nn.Linear(embed_dim, self.total_points) |
| 349 | + |
| 350 | + self._reset_parameters() |
| 351 | + |
| 352 | + def _reset_parameters(self) -> None: |
| 353 | + """Reset parameters of the model.""" |
| 354 | + init.constant_(self.sampling_offsets.weight, 0) |
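| | + # bias init: every head starts with a distinct offset direction, and the offset |
| | + # magnitude grows linearly (1..n) across each level's group of points |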
| 355 | + thetas = torch.arange(self.num_heads, dtype=torch.float32) * (2.0 * math.pi / self.num_heads) |
| 356 | + grid_init = torch.stack([thetas.cos(), thetas.sin()], -1) |
| 357 | + grid_init = grid_init / grid_init.abs().max(-1, keepdim=True).values # noqa: PD011 |
| 358 | + grid_init = grid_init.reshape(self.num_heads, 1, 2).tile([1, sum(self.num_points_list), 1]) |
| 359 | + scaling = torch.concat([torch.arange(1, n + 1) for n in self.num_points_list]).reshape(1, -1, 1) |
| 360 | + grid_init *= scaling |
| 361 | + self.sampling_offsets.bias.data[...] = grid_init.flatten() |
| 362 | + |
| 363 | + # attention_weights |
| 364 | + init.constant_(self.attention_weights.weight, 0) |
| 365 | + init.constant_(self.attention_weights.bias, 0) |
| 366 | + |
| 367 | + def forward( |
| 368 | + self, |
| 369 | + query: Tensor, |
| 370 | + reference_points: Tensor, |
| 371 | + value: list[Tensor], |
| 372 | + value_spatial_shapes: list[list[int]], |
| 373 | + ) -> Tensor: |
| 374 | + """Forward function of MSDeformableAttention. |
| 375 | +
|
| 376 | + Args: |
| 377 | + query (Tensor): [bs, query_length, C] |
| 378 | + reference_points (Tensor): [bs, query_length, n_levels, 2], range in [0, 1], top-left (0,0), |
| 379 | + bottom-right (1, 1), including padding area |
| 380 | + value (list[Tensor]): per-level value tensors, each [bs, n_head, head_dim, H_l * W_l] |
| 381 | + value_spatial_shapes (list[list[int]]): [n_levels, 2], [(H_0, W_0), (H_1, W_1), ..., (H_{L-1}, W_{L-1})] |
| 382 | +
|
| 383 | + Returns: |
| 384 | + output (Tensor): [bs, Length_{query}, C] |
| 385 | + """ |
| 386 | + bs, len_q = query.shape[:2] |
| 387 | + _, n_head, c, _ = value[0].shape |
| 388 | + num_points_list = self.num_points_list |
| 389 | + |
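| | + # predict a 2-D sampling offset and an attention weight per head for every sampling point |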
| 390 | + sampling_offsets = self.sampling_offsets(query).reshape( |
| 391 | + bs, |
| 392 | + len_q, |
| 393 | + self.num_heads, |
| 394 | + sum(self.num_points_list), |
| 395 | + 2, |
| 396 | + ) |
| 397 | + |
| 398 | + attention_weights = self.attention_weights(query).reshape( |
| 399 | + bs, |
| 400 | + len_q, |
| 401 | + self.num_heads, |
| 402 | + sum(self.num_points_list), |
| 403 | + ) |
| 404 | + attention_weights = f.softmax(attention_weights, dim=-1) |
| 405 | + |
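| | + # reference points with 2 coords are normalized centers; 4 coords are treated as normalized (cx, cy, w, h) boxes |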
| 406 | + if reference_points.shape[-1] == 2: |
| 407 | + offset_normalizer = torch.tensor(value_spatial_shapes) |
| 408 | + offset_normalizer = offset_normalizer.flip([1]).reshape(1, 1, 1, self.num_levels, 1, 2) |
| 409 | + sampling_locations = ( |
| 410 | + reference_points.reshape( |
| 411 | + bs, |
| 412 | + len_q, |
| 413 | + 1, |
| 414 | + self.num_levels, |
| 415 | + 1, |
| 416 | + 2, |
| 417 | + ) |
| 418 | + + sampling_offsets / offset_normalizer |
| 419 | + ) |
| 420 | + elif reference_points.shape[-1] == 4: |
| 421 | + num_points_scale = self.num_points_scale.to(query).unsqueeze(-1) |
| 422 | + offset = sampling_offsets * num_points_scale * reference_points[:, :, None, :, 2:] * 0.5 |
| 423 | + sampling_locations = reference_points[:, :, None, :, :2] + offset |
| 424 | + else: |
| 425 | + msg = (f"Last dim of reference_points must be 2 or 4, but get {reference_points.shape[-1]} instead.",) |
| 426 | + raise ValueError(msg) |
| 427 | + |
| 428 | + # map sampling locations from the [0, 1] range to the [-1, 1] range expected by grid_sample |
| 429 | + sampling_grids = 2 * sampling_locations - 1 |
| 430 | + |
| 431 | + sampling_grids = sampling_grids.permute(0, 2, 1, 3, 4).flatten(0, 1) |
| 432 | + sampling_locations_list = sampling_grids.split(num_points_list, dim=-2) |
| 433 | + |
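| | + # sample each level's value map at its share of the sampling grid |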
| 434 | + sampling_value_list = [] |
| 435 | + for level, (h, w) in enumerate(value_spatial_shapes): |
| 436 | + value_l = value[level].reshape(bs * n_head, c, h, w) |
| 437 | + sampling_grid_l = sampling_locations_list[level] |
| 438 | + sampling_value_l = f.grid_sample( |
| 439 | + value_l, |
| 440 | + sampling_grid_l, |
| 441 | + mode="bilinear", |
| 442 | + padding_mode="zeros", |
| 443 | + align_corners=False, |
| 444 | + ) |
| 445 | + |
| 446 | + sampling_value_list.append(sampling_value_l) |
| 447 | + |
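| | + # weight the sampled values by the attention weights and merge the heads back into embed_dim |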
| 448 | + attn_weights = attention_weights.permute(0, 2, 1, 3).reshape(bs * n_head, 1, len_q, sum(num_points_list)) |
| 449 | + weighted_sample_locs = torch.concat(sampling_value_list, dim=-1) * attn_weights |
| 450 | + output = weighted_sample_locs.sum(-1).reshape(bs, n_head * c, len_q) |
| 451 | + |
| 452 | + return output.permute(0, 2, 1) |
| 453 | + |
| 454 | + |
309 | 455 | class VisualEncoderLayer(nn.Module): |
310 | 456 | """VisualEncoderLayer module consisting of MSDeformableAttention and feed-forward network. |
311 | 457 |
|
|