Adding Naver rope-vit compatibility to EVA ViT

rwightman · rwightman · commit 6f452afb9b23 · 2025-06-28T22:07:24.000-07:00
diff --git a/timm/layers/__init__.py b/timm/layers/__init__.py
@@ -49,7 +49,7 @@
     resize_rel_pos_bias_table, resize_rel_pos_bias_table_simple, resize_rel_pos_bias_table_levit
 from .pos_embed_sincos import pixel_freq_bands, freq_bands, build_sincos2d_pos_embed, build_fourier_pos_embed, \
     build_rotary_pos_embed, apply_rot_embed, apply_rot_embed_cat, apply_rot_embed_list, apply_keep_indices_nlc, \
-    FourierEmbed, RotaryEmbedding, RotaryEmbeddingCat
+    FourierEmbed, RotaryEmbedding, RotaryEmbeddingCat, RotaryEmbeddingMixed
 from .squeeze_excite import SEModule, SqueezeExcite, EffectiveSEModule, EffectiveSqueezeExcite
 from .selective_kernel import SelectiveKernel
 from .separable_conv import SeparableConv2d, SeparableConvNormAct
diff --git a/timm/layers/pos_embed_sincos.py b/timm/layers/pos_embed_sincos.py
@@ -219,8 +219,6 @@ def apply_rot_embed_list(x: List[torch.Tensor], sin_emb, cos_emb):
 
 def apply_rot_embed_cat(x: torch.Tensor, emb):
     sin_emb, cos_emb = emb.tensor_split(2, -1)
-    if sin_emb.ndim == 3:
-        return x * cos_emb.unsqueeze(1).expand_as(x) + rot(x) * sin_emb.unsqueeze(1).expand_as(x)
     return x * cos_emb + rot(x) * sin_emb
 
 
@@ -351,6 +349,7 @@ def __init__(
                 ref_feat_shape=self.ref_feat_shape,
                 grid_offset=self.grid_offset,
                 grid_indexing=self.grid_indexing,
+                temperature=self.temperature,
             )
             self.bands = None
             self.register_buffer(
@@ -446,6 +445,7 @@ def __init__(
                 ref_feat_shape=self.ref_feat_shape,
                 grid_offset=self.grid_offset,
                 grid_indexing=self.grid_indexing,
+                temperature=self.temperature,
             )
             self.bands = None
             self.register_buffer(
@@ -475,3 +475,125 @@ def forward(self, x):
         # assuming channel-first tensor where spatial dim are >= 2
         pos_embed = self.get_embed(x.shape[2:])
         return apply_rot_embed_cat(x, pos_embed)
+
+
+def init_random_2d_freqs(
+        head_dim: int,
+        depth: int,
+        num_heads: int,
+        temperature: float = 10.0,
+        rotate: bool = True,
+        *,
+        device=None,
+        dtype=torch.float32,
+) -> torch.Tensor:
+    """ Vectorised 2D ROPE frequencies with random rotation for mixed mode ROPE.
+    Returns:
+         Tensor (2, depth, num_heads, head_dim//2)
+    """
+    # base magnitudes, shape: (head_dim//4,)
+    mag = 1.0 / (temperature ** (torch.arange(0, head_dim, 4, device=device, dtype=dtype) / head_dim))
+
+    # (1,1,L) so it broadcasts over both depth and heads
+    mag = mag.unsqueeze(0).unsqueeze(0)  # (1,1,L)
+
+    # random (or zero) rotation per head *and* per block
+    if rotate:
+        angles = torch.rand(depth, num_heads, 1, device=device, dtype=dtype) * 2 * torch.pi
+    else:
+        angles = torch.zeros(depth, num_heads, 1, device=device, dtype=dtype)
+
+    # build (depth, num_heads, 2·L) == head_dim//2 on the last axis
+    fx = torch.cat([mag * torch.cos(angles), mag * torch.cos(angles + torch.pi / 2)], dim=-1)
+    fy = torch.cat([mag * torch.sin(angles), mag * torch.sin(angles + torch.pi / 2)], dim=-1)
+
+    # (2, depth, num_heads, head_dim//2)
+    return torch.stack([fx, fy], dim=0)
+
+
+class RotaryEmbeddingMixed(nn.Module):
+    """Rotary position embedding with depth-dependent learnable frequencies.
+
+    This implementation supports mixed (learnable) ROPE. In mixed mode,
+    each transformer block has its own set of learnable frequency parameters.
+    """
+    def __init__(
+            self,
+            dim: int,
+            depth: int,
+            num_heads: int,
+            temperature: float = 10.0,
+            feat_shape: Optional[List[int]] = None,
+            grid_indexing: str = 'xy',
+    ):
+        """Initialize rotary embeddings.
+
+        Args:
+            dim: Embedding dimension (should be divisible by 4)
+            depth: Number of transformer blocks
+            num_heads: Number of attention heads
+            temperature: Base for frequency computation
+            feat_shape: Spatial dimensions [H, W] if known in advance
+            grid_indexing: How to index grid positions ('xy' or 'ij')
+        """
+        super().__init__()
+        self.dim = dim
+        self.depth = depth
+        self.num_heads = num_heads
+        self.temperature = temperature
+        self.feat_shape = feat_shape
+        self.grid_indexing = grid_indexing
+
+        head_dim = dim // num_heads
+        assert head_dim % 4 == 0, f"head_dim must be divisible by 4, got {head_dim}"
+        freqs = init_random_2d_freqs(
+            head_dim,
+            depth,
+            num_heads,
+            temperature=temperature,
+            rotate=True,
+        )  # (2, depth, num_heads, head_dim//2)
+        self.freqs = nn.Parameter(freqs)
+
+    def get_mixed_freqs(self, H: int, W: int, device: torch.device, dtype: torch.dtype):
+        """Compute mixed (learnable) frequencies."""
+        # Create position indices
+        x_pos, y_pos = torch.meshgrid(
+            torch.arange(H, dtype=dtype, device=device),
+            torch.arange(W, dtype=dtype, device=device),
+            indexing=self.grid_indexing,
+        )
+        t_x = x_pos.flatten()
+        t_y = y_pos.flatten()
+        freqs_x = (t_x.unsqueeze(-1) @ self.freqs[0].unsqueeze(-2))
+        freqs_y = (t_y.unsqueeze(-1) @ self.freqs[1].unsqueeze(-2))
+        combined = freqs_x + freqs_y  # shape: (num_heads, N, dim//4)
+        sin_emb = torch.sin(combined).repeat_interleave(2, -1)  # (N, dim//2)
+        cos_emb = torch.cos(combined).repeat_interleave(2, -1)  # (N, dim//2)
+        rope_embeds = torch.cat([sin_emb, cos_emb], dim=-1)  # (num_heads, H*W, head_dim)
+
+        return rope_embeds
+
+    def get_embed(self, shape: Optional[List[int]] = None) -> torch.Tensor:
+        """Generate rotary embeddings for the given spatial shape.
+
+        Args:
+            shape: Spatial dimensions [H, W]
+
+        Returns:
+            Tensor of shape (depth, H*W, dim) containing concatenated sin/cos embeddings
+        """
+        assert shape is not None, "shape must be provided"
+        H, W = shape
+        device = self.freqs.device
+        dtype = self.freqs.dtype
+        return self.get_mixed_freqs(H, W, device, dtype)
+
+    def forward(self, x):
+        # assuming channel-first tensor where spatial dim are >= 2
+        pos_embed = self.get_embed(x.shape[2:])
+        return apply_rot_embed_cat(x, pos_embed)
+
+    def no_weight_decay(self):
+        """Exclude frequency parameters from weight decay."""
+        return {'freqs'}
diff --git a/timm/models/eva.py b/timm/models/eva.py