6 | 6 | from __future__ import annotations |
7 | 7 |
8 | 8 | import math |
| 9 | +from typing import Literal |
9 | 10 |
10 | 11 | import torch |
11 | 12 | from torch import nn |
| 13 | +import numpy as np |
| 14 | +from torch import Tensor |
12 | 15 |
13 | 16 |
14 | 17 | class PositionEmbeddingSine(nn.Module): |
@@ -105,3 +108,111 @@ def gen_sineembed_for_position(pos_tensor: torch.Tensor) -> torch.Tensor: |
105 | 108 |         msg = f"Unknown pos_tensor shape(-1):{pos_tensor.size(-1)}" |
106 | 109 |         raise ValueError(msg) |
107 | 110 |     return pos |
| 111 | + |
| 112 | + |
| 113 | +class RopePositionEmbedding(nn.Module): |
| 114 | +    def __init__( |
| 115 | +        self, |
| 116 | +        embed_dim: int, |
| 117 | +        *, |
| 118 | +        num_heads: int, |
| 119 | +        base: float | None = 100.0, |
| 120 | +        min_period: float | None = None, |
| 121 | +        max_period: float | None = None, |
| 122 | +        normalize_coords: Literal["min", "max", "separate"] = "separate", |
| 123 | +        shift_coords: float | None = None, |
| 124 | +        jitter_coords: float | None = None, |
| 125 | +        rescale_coords: float | None = None, |
| 126 | +        dtype: torch.dtype | None = None, |
| 127 | +        device: torch.device | None = None, |
| 128 | +    ): |
| 129 | +        super().__init__() |
| 131 | +        assert embed_dim % (4 * num_heads) == 0, "embed_dim must be divisible by 4 * num_heads" |
| 132 | +        both_periods = min_period is not None and max_period is not None |
| 133 | +        if (base is None and not both_periods) or (base is not None and both_periods): |
| 134 | +            raise ValueError("Exactly one of `base` or `min_period`+`max_period` must be provided.") |
| 135 | + |
| 136 | +        D_head = embed_dim // num_heads |
| 137 | +        self.base = base |
| 138 | +        self.min_period = min_period |
| 139 | +        self.max_period = max_period |
| 140 | +        self.D_head = D_head |
| 141 | +        self.normalize_coords = normalize_coords |
| 142 | +        self.shift_coords = shift_coords |
| 143 | +        self.jitter_coords = jitter_coords |
| 144 | +        self.rescale_coords = rescale_coords |
| 145 | + |
| 146 | +        # Needs persistent=True because we do teacher.load_state_dict(student.state_dict()) to initialize the teacher |
| 147 | +        self.dtype = dtype  # Don't rely on self.periods.dtype |
| 148 | +        self.register_buffer( |
| 149 | +            "periods", |
| 150 | +            torch.empty(D_head // 4, device=device, dtype=dtype), |
| 151 | +            persistent=True, |
| 152 | +        ) |
| 153 | +        self._init_weights() |
| 154 | + |
| 155 | +    def forward(self, *, H: int, W: int) -> tuple[Tensor, Tensor]: |
| 156 | +        device = self.periods.device |
| 157 | +        dtype = self.dtype |
| 158 | +        dd = {"device": device, "dtype": dtype} |
| 159 | + |
| 160 | +        # Prepare coords in range [-1, +1] |
| 161 | +        if self.normalize_coords == "max": |
| 162 | +            max_HW = max(H, W) |
| 163 | +            coords_h = torch.arange(0.5, H, **dd) / max_HW  # [H] |
| 164 | +            coords_w = torch.arange(0.5, W, **dd) / max_HW  # [W] |
| 165 | +        elif self.normalize_coords == "min": |
| 166 | +            min_HW = min(H, W) |
| 167 | +            coords_h = torch.arange(0.5, H, **dd) / min_HW  # [H] |
| 168 | +            coords_w = torch.arange(0.5, W, **dd) / min_HW  # [W] |
| 169 | +        elif self.normalize_coords == "separate": |
| 170 | +            coords_h = torch.arange(0.5, H, **dd) / H  # [H] |
| 171 | +            coords_w = torch.arange(0.5, W, **dd) / W  # [W] |
| 172 | +        else: |
| 173 | +            raise ValueError(f"Unknown normalize_coords: {self.normalize_coords}") |
| 174 | +        coords = torch.stack(torch.meshgrid(coords_h, coords_w, indexing="ij"), dim=-1)  # [H, W, 2] |
| 175 | +        coords = coords.flatten(0, 1)  # [HW, 2] |
| 176 | +        coords = 2.0 * coords - 1.0  # Shift range [0, 1] to [-1, +1] |
| 177 | + |
| 178 | +        # Shift coords by adding a uniform value in [-shift, shift] |
| 179 | +        if self.training and self.shift_coords is not None: |
| 180 | +            shift_hw = torch.empty(2, **dd).uniform_(-self.shift_coords, self.shift_coords) |
| 181 | +            coords += shift_hw[None, :] |
| 182 | + |
| 183 | +        # Jitter coords by multiplying the range [-1, 1] by a log-uniform value in [1/jitter, jitter] |
| 184 | +        if self.training and self.jitter_coords is not None: |
| 185 | +            jitter_max = np.log(self.jitter_coords) |
| 186 | +            jitter_min = -jitter_max |
| 187 | +            jitter_hw = torch.empty(2, **dd).uniform_(jitter_min, jitter_max).exp() |
| 188 | +            coords *= jitter_hw[None, :] |
| 189 | + |
| 190 | +        # Rescale coords by multiplying the range [-1, 1] by a log-uniform value in [1/rescale, rescale] |
| 191 | +        if self.training and self.rescale_coords is not None: |
| 192 | +            rescale_max = np.log(self.rescale_coords) |
| 193 | +            rescale_min = -rescale_max |
| 194 | +            rescale_hw = torch.empty(1, **dd).uniform_(rescale_min, rescale_max).exp() |
| 195 | +            coords *= rescale_hw |
| 196 | + |
| 197 | +        # Prepare angles and sin/cos |
| 198 | +        angles = 2 * math.pi * coords[:, :, None] / self.periods[None, None, :]  # [HW, 2, D//4] |
| 199 | +        angles = angles.flatten(1, 2)  # [HW, D//2] |
| 200 | +        angles = angles.tile(2)  # [HW, D] |
| 201 | +        cos = torch.cos(angles)  # [HW, D] |
| 202 | +        sin = torch.sin(angles)  # [HW, D] |
| 203 | + |
| 204 | +        return (sin, cos)  # 2 * [HW, D] |
| 204 | + |
| 206 | +    def _init_weights(self): |
| 207 | +        device = self.periods.device |
| 208 | +        dtype = self.dtype |
| 209 | +        if self.base is not None: |
| 210 | +            periods = self.base ** ( |
| 211 | +                2 * torch.arange(self.D_head // 4, device=device, dtype=dtype) / (self.D_head // 2) |
| 212 | +            )  # [D//4] |
| 213 | +        else: |
| 214 | +            base = self.max_period / self.min_period |
| 215 | +            exponents = torch.linspace(0, 1, self.D_head // 4, device=device, dtype=dtype)  # [D//4] range [0, 1] |
| 216 | +            periods = base**exponents  # range [1, max_period / min_period] |
| 217 | +            periods = periods / base  # range [min_period / max_period, 1] |
| 218 | +            periods = periods * self.max_period  # range [min_period, max_period] |
| 219 | +        self.periods.data = periods |
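For orientation, here is a minimal usage sketch of the class added above. The embed_dim, num_heads, and grid size are illustrative values, not ones taken from this patch; shapes follow the comments in forward().

# Minimal usage sketch (illustrative values; embed_dim must be divisible by 4 * num_heads).
rope = RopePositionEmbedding(embed_dim=768, num_heads=12, base=100.0)
rope.eval()  # the shift/jitter/rescale augmentations only apply in training mode, and only when set
sin, cos = rope(H=16, W=16)
# sin and cos are each [H*W, embed_dim // num_heads] = [256, 64]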
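The (sin, cos) pair is meant to be applied rotary-style to queries and keys inside attention. The consumer is not part of this diff, but given the tile(2) layout in forward() (the first and second halves of the head dimension carry the same angles), the conventional pairing would look like the sketch below; the helper names are placeholders, not something this patch adds.

def rope_rotate_half(x: Tensor) -> Tensor:
    # Pair feature i with feature i + D/2: (x1, x2) -> (-x2, x1).
    x1, x2 = x.chunk(2, dim=-1)
    return torch.cat((-x2, x1), dim=-1)

def apply_rope(x: Tensor, sin: Tensor, cos: Tensor) -> Tensor:
    # x: [..., HW, D_head] queries or keys; sin/cos: [HW, D_head] as returned by forward().
    return x * cos + rope_rotate_half(x) * sin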