Skip to content

Commit 91b4a21

Browse files
author
Guillaume SANCHEZ
committed
vit
1 parent f05f98d commit 91b4a21

File tree

3 files changed

+116
-1
lines changed

3 files changed

+116
-1
lines changed

torchelie/models/__init__.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -17,4 +17,5 @@
1717
from .alexnet import *
1818
from .mlpmixer import *
1919
from .convnext import *
20-
#from .poolformer import *
20+
from .vit import ViTTrunk
21+
# from .poolformer import *

torchelie/models/vit.py

Lines changed: 58 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,58 @@
1+
import torch
2+
import torch.nn as nn
3+
from ..nn.transformer import ViTBlock
4+
5+
6+
class ViTTrunk(nn.Module):
    """
    Vision Transformer (ViT) trunk.

    Flattens a grid of patch embeddings into a token sequence, adds a
    learned positional encoding, prepends learnable "register" tokens,
    runs the sequence through a stack of ViTBlock layers, then drops the
    registers and restores the spatial layout.

    Args:
        seq_len (int): length of the flattened patch sequence; must equal
            (H/P) * (W/P) of the inputs given to ``forward``.
        d_model (int): embedding dimension; must equal the channel dim C
            of the inputs given to ``forward``.
        num_layers (int): number of ViTBlock layers.
        num_heads (int): number of attention heads per block.
        num_registers (int, optional): number of learnable register tokens
            prepended to the sequence. Default: 10.
    """

    def __init__(self, seq_len, d_model, num_layers, num_heads, num_registers=10):
        super().__init__()
        self.trunk = nn.ModuleList(
            [ViTBlock(d_model, num_heads) for _ in range(num_layers)]
        )
        # Learned additive positional encoding, one vector per patch position.
        self.pos_enc = nn.Parameter(torch.zeros(seq_len, d_model))
        # Registers initialized with std ~ 1/sqrt(d_model) so their scale
        # matches typical token embeddings.
        self.registers = nn.Parameter(
            torch.randn(num_registers, d_model) / (d_model**0.5)
        )

    def forward(self, x):
        """
        Forward pass for the ViTTrunk.

        Args:
            x (Tensor): patch embeddings of shape [B, C, H/P, W/P], with
                C == d_model and (H/P) * (W/P) == seq_len.

        Returns:
            Tensor: output tensor of the same shape, [B, C, H/P, W/P].
        """
        B, C, Hp, Wp = x.shape
        # [B, C, Hp, Wp] -> [B, L, C] with L = Hp * Wp
        x = x.view(B, C, Hp * Wp).permute(0, 2, 1)
        x = x + self.pos_enc
        # Prepend the register tokens, broadcast over the batch.
        x = torch.cat([self.registers.unsqueeze(0).expand(B, -1, -1), x], dim=1)
        for block in self.trunk:
            # Fix: ViTBlock.forward requires a conditioning argument `z`;
            # the trunk is unconditional, so pass None explicitly
            # (`block(x)` alone raised TypeError).
            x = block(x, None)
        # Drop the register tokens, keep only the patch tokens.
        x = x[:, len(self.registers):, :]
        # [B, L, C] -> [B, C, Hp, Wp]; reshape (not view) because permute
        # makes the tensor non-contiguous.
        x = x.permute(0, 2, 1).reshape(B, C, Hp, Wp)
        return x

torchelie/nn/transformer.py

Lines changed: 56 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import torch.nn as nn
44
import torchelie.utils as tu
55
from .conv import Conv1x1
6+
from ..nn.condseq import CondSeq
7+
from ..nn.llm import SelfAttention
68
from .functional.transformer import local_attention_2d
79

810
from typing import Optional
@@ -75,3 +77,57 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
7577

7678
x = self.out(x)
7779
return x
80+
81+
82+
class ViTBlock(nn.Module):
    """
    Vision Transformer (ViT) block: pre-norm self-attention followed by a
    pre-norm feed-forward MLP, each merged back into the residual stream
    through a learned per-channel tanh gate.

    Args:
        d_model (int): dimension of the model; must be divisible by
            ``num_heads``.
        num_heads (int): number of attention heads.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        self.sa = CondSeq(
            nn.RMSNorm(d_model),
            SelfAttention(
                d_model,
                num_heads,
                head_size=d_model // num_heads,
                # Bidirectional attention: vision tokens, no causal masking.
                causal=False,
                rotary=True,
            ),
        )
        self.mlp = CondSeq(
            nn.RMSNorm(d_model),
            tu.kaiming(nn.Linear(d_model, 4 * d_model)),
            nn.GELU(),
            tu.kaiming(nn.Linear(4 * d_model, d_model)),
        )
        # Learned gates modulating each residual branch.
        self.g1 = tu.kaiming(nn.Linear(d_model, d_model))
        self.g2 = tu.kaiming(nn.Linear(d_model, d_model))

    def forward(self, x, z=None):
        """
        Forward pass for the ViTBlock.

        Args:
            x (Tensor): input tensor of shape [B, L, d_model].
            z (Any, optional): conditioning input forwarded to the CondSeq
                modules. Default: None. (Fix: the docstring already
                described ``z`` as optional, but it previously had no
                default, which broke unconditional callers such as
                ``ViTTrunk`` invoking ``block(x)``.)

        Returns:
            Tensor: output tensor of shape [B, L, d_model].
        """
        x = self.sa(x, z) * torch.tanh(self.g1(x)) + x
        x = self.mlp(x, z) * torch.tanh(self.g2(x)) + x
        return x

0 commit comments

Comments
 (0)