Commit cad098a

[vlm] model code w/ siglip2 encoder
tmp
1 parent 2c3609b commit cad098a

3 files changed: +355 −0 lines changed
Lines changed: 31 additions & 0 deletions
@@ -0,0 +1,31 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

from dataclasses import dataclass, field

from torchtitan.models.llama3 import TransformerModelArgs as Llama3Args


@dataclass
class Siglip2ModelArgs:
    dim: int = 768
    ffn_dim: int = 3072
    n_layers: int = 12
    n_heads: int = 12

    n_pos_embs: int = 16  # Number of positional embeddings per side (height and width)
    n_channels: int = 3  # RGB channels
    patch_size: int = 16

    layer_norm_eps: float = 1e-6
    use_flex_attn: bool = True
    attn_mask_type: str = "causal"


@dataclass
class Llama3Siglip2ModelArgs(Llama3Args):
    encoder: Siglip2ModelArgs = field(default_factory=Siglip2ModelArgs)
    img_token_id: int = 1998
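
Editor's note: a minimal sketch of how these dataclasses compose; the overridden values below are illustrative, not defaults from this commit.

    # Illustrative composition sketch (not part of this commit).
    encoder_args = Siglip2ModelArgs(dim=768, n_layers=12, patch_size=16)
    model_args = Llama3Siglip2ModelArgs(
        dim=4096,                  # LLM hidden size (hypothetical value)
        encoder=encoder_args,
        img_token_id=1998,         # placeholder token id for image patches
    )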
Lines changed: 103 additions & 0 deletions
@@ -0,0 +1,103 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import einops as E
import torch
from torch import nn

from torchtitan.models.attention import init_attention_mask
from torchtitan.models.llama3 import Transformer as Llama3

from .args import Llama3Siglip2ModelArgs
from .siglip2 import VisionTransformer


class Projector(nn.Module):
    """Project the Encoder embedding to the LLM embedding."""

    def __init__(self, in_dim: int, out_dim: int) -> None:
        super().__init__()
        self.w1 = nn.Linear(in_dim, in_dim)
        self.w2 = nn.Linear(in_dim, out_dim)
        self.init_weights()

    def forward(self, x_NLD: torch.Tensor):
        x_NLD = self.w1(x_NLD)
        x_NLD = nn.functional.silu(x_NLD)
        x_NLD = self.w2(x_NLD)
        return x_NLD

    def init_weights(self):
        nn.init.xavier_uniform_(self.w1.weight)
        if self.w1.bias is not None:
            nn.init.zeros_(self.w1.bias)
        nn.init.xavier_uniform_(self.w2.weight)
        if self.w2.bias is not None:
            nn.init.zeros_(self.w2.bias)

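Editor's note: a tiny shape check for the projector; the dimensions below are hypothetical.

    # Hypothetical dimensions: SigLIP2 width 768 -> LLM width 4096.
    import torch

    proj = Projector(in_dim=768, out_dim=4096)
    i_NLD = torch.randn(2, 196, 768)     # (num_images, patches_per_image, encoder_dim)
    print(proj(i_NLD).shape)             # torch.Size([2, 196, 4096])
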
class Llama3Siglip2Transformer(Llama3):
    def __init__(self, model_args: Llama3Siglip2ModelArgs):
        super().__init__(model_args)
        self.model_args = model_args
        self.encoder = VisionTransformer(model_args.encoder)
        self.projector = Projector(
            in_dim=model_args.encoder.dim, out_dim=model_args.dim
        )
        self.n_pixels_per_token = model_args.encoder.patch_size**2
        self.init_encoder_weights()

    def init_encoder_weights(self, buffer_device=None):
        super().init_weights(buffer_device=buffer_device)
        if self.encoder is not None:
            self.encoder.init_weights()
        if self.projector is not None:
            self.projector.init_weights()

    def _scatter_img_tokens(self, h_BSD, tokens_BS, i_NLD, i_mask_NL, img_id=None):
        img_id = img_id if img_id is not None else self.model_args.img_token_id
        B, S, D = h_BSD.shape
        # Locate the image placeholder tokens in the LLM input; make the mask
        # broadcastable with h_BSD
        img_mask_h_BSD = E.repeat(tokens_BS == img_id, "b s -> b s 1")
        # Keep only valid (non-padded) visual tokens; the result is flattened
        i_flatten = torch.masked_select(i_NLD, mask=i_mask_NL.unsqueeze(-1))

        assert i_flatten.numel() // D == img_mask_h_BSD.sum(), (
            f"Number of visual embeddings {i_flatten.numel() // D} does not match "
            f"number of image placeholders in the input tokens {img_mask_h_BSD.sum()}"
        )
        h_BSD.masked_scatter_(mask=img_mask_h_BSD, source=i_flatten)
        return h_BSD

    def forward(
        self,
        tokens: torch.Tensor,
        eos_id: int | None = None,
        input_batch: torch.Tensor | None = None,
        pixel_values: torch.Tensor | None = None,
        grid_thw: torch.Tensor | None = None,
    ):
        if self.model_args.use_flex_attn:
            init_attention_mask(
                input_batch if input_batch is not None else tokens, eos_id=self.eos_id
            )

        # Passthrough for nonexistent layers; allows easy configuration of
        # pipeline-parallel stages
        h_BSD = self.tok_embeddings(tokens) if self.tok_embeddings else tokens

        if self.encoder is not None:
            grid_hw = grid_thw[:, :, 1:]  # Siglip2 only supports image (h, w)
            pixel_masks = E.reduce(grid_hw != -1, "n l hw -> n l", reduction="all")
            i_NLD = self.encoder(pixel_values, pixel_masks, grid_hw)
            i_NLD = self.projector(i_NLD)
            h_BSD = self._scatter_img_tokens(h_BSD, tokens, i_NLD, pixel_masks)

        for layer in self.layers.values():
            h_BSD = layer(h_BSD, self.freqs_cis)

        h_BSD = self.norm(h_BSD) if self.norm else h_BSD
        output = self.output(h_BSD) if self.output else h_BSD
        return output
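
Editor's note: to make the scatter step concrete, a standalone sketch of the masked_scatter_ mechanism used in _scatter_img_tokens; shapes and the placeholder id are illustrative, not from this commit.

    # Standalone sketch of the masked_scatter_ step (illustrative shapes and ids).
    import torch

    img_token_id = 1998
    tokens_BS = torch.tensor([[5, 1998, 1998, 7, 9]])     # (B=1, S=5)
    h_BSD = torch.zeros(1, 5, 4)                           # LLM token embeddings, D=4
    i_flatten = torch.arange(2 * 4, dtype=torch.float32)   # two visual embeddings, flattened

    img_mask = (tokens_BS == img_token_id).unsqueeze(-1)   # (B, S, 1), broadcasts over D
    h_BSD.masked_scatter_(img_mask, i_flatten)             # rows 1 and 2 now hold the visual embeddings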
Lines changed: 221 additions & 0 deletions
@@ -0,0 +1,221 @@
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

import einops as E
import torch
import torch.nn.functional as F
from torch import nn

from torchtitan.models.attention import build_attention, init_attention_mask

from .args import Siglip2ModelArgs


def resize_positional_embeddings(
    pos_embs_HWD: torch.Tensor,
    spatial_shapes_N2: torch.Tensor,
    max_length: int,
) -> torch.Tensor:
    """
    Resize the learned 2D positional embeddings to image-specific sizes and pad to a fixed length.

    Args:
        pos_embs_HWD (`torch.Tensor`):
            Position embeddings of shape (height, width, embed_dim)
        spatial_shapes_N2 (`torch.LongTensor`):
            Spatial shapes of shape (batch_size, 2) to resize the positional embeddings to
        max_length (`int`):
            Length to which the resized positional embeddings are padded

    Returns:
        `torch.Tensor`: Embeddings of shape (batch_size, max_length, embed_dim)
    """
    _, _, D = pos_embs_HWD.shape
    B, _ = spatial_shapes_N2.shape

    resized_embs_BLD = torch.empty(
        (B, max_length, D),
        device=pos_embs_HWD.device,
        dtype=pos_embs_HWD.dtype,
    )

    # TODO: group images by size and interpolate per group,
    # or cache the interpolation output so we do this once per size
    for i in range(B):
        height, width = spatial_shapes_N2[i].tolist()
        if (height + width) == 0:  # Skip empty padding images
            continue

        resized_emb = F.interpolate(
            E.rearrange(pos_embs_HWD, "h w d -> 1 d h w"),
            size=(height, width),
            mode="bilinear",
            align_corners=False,
            antialias=True,
        )

        resized_emb_LD = E.rearrange(resized_emb, "1 d h w -> (h w) d")
        resized_embs_BLD[i, : int(height * width)] = resized_emb_LD

    return resized_embs_BLD

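Editor's note: a shape-only sanity check of the helper above; the values are illustrative.

    # Shape-only sanity check (illustrative values).
    import torch

    pos_embs_HWD = torch.randn(16, 16, 768)              # learned 16x16 positional grid
    spatial_shapes_N2 = torch.tensor([[8, 12], [0, 0]])  # one 8x12 image, one padding entry
    out = resize_positional_embeddings(pos_embs_HWD, spatial_shapes_N2, max_length=256)
    print(out.shape)                                     # torch.Size([2, 256, 768])
    # Positions past h*w (and the padding entry) are left uninitialized by torch.empty.
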
class VisionEmbeddings(nn.Module):
    def __init__(self, args: Siglip2ModelArgs):
        super().__init__()
        self.patch_embedding = nn.Linear(
            in_features=args.n_channels * args.patch_size * args.patch_size,
            out_features=args.dim,
        )
        self.position_embedding = nn.Embedding(args.n_pos_embs**2, args.dim)
        self.n_pos_embs = args.n_pos_embs

    def init_weights(self):
        nn.init.trunc_normal_(self.patch_embedding.weight, mean=0.0, std=0.02)
        nn.init.normal_(self.position_embedding.weight)

    def forward(self, pixels_NLD: torch.Tensor, grid_hw: torch.Tensor) -> torch.Tensor:
        # Apply patch embeddings to already-patchified pixel values
        patch_embeds_NLD = self.patch_embedding(pixels_NLD)

        # Get resized and padded positional embeddings
        pos_emb_HWD = self.position_embedding.weight.reshape(
            self.n_pos_embs, self.n_pos_embs, -1
        )
        spatial_h = E.reduce(grid_hw[:, :, 0], "n l -> n", reduction="max") + 1
        spatial_w = E.reduce(grid_hw[:, :, 1], "n l -> n", reduction="max") + 1
        spatial_shapes = torch.stack([spatial_h, spatial_w], dim=-1).long()
        resized_positional_embeddings = resize_positional_embeddings(
            pos_emb_HWD,
            spatial_shapes,
            max_length=pixels_NLD.shape[1],
        )
        # Add positional embeddings to patch embeddings
        embeddings = patch_embeds_NLD + resized_positional_embeddings
        return embeddings

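Editor's note: a sketch of how an image might be patchified into the (num_patches, channels * patch * patch) layout that the linear patch_embedding expects; the exact channel/patch ordering in the real data pipeline is an assumption here.

    # Illustrative patchify sketch (not part of this commit).
    import einops as E
    import torch

    patch = 16
    img_CHW = torch.randn(3, 224, 224)                   # a single RGB image
    pixels_LD = E.rearrange(
        img_CHW, "c (h p1) (w p2) -> (h w) (c p1 p2)", p1=patch, p2=patch
    )
    print(pixels_LD.shape)                               # torch.Size([196, 768])
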
class Attention(nn.Module):
    """
    Multi-head attention module.

    Args:
        args (Siglip2ModelArgs): Model configuration arguments.

    Attributes:
        dim (int): Model embedding dimension.
        head_dim (int): Dimension size of each attention head.
        q_proj (Linear): Linear transformation for queries.
        k_proj (Linear): Linear transformation for keys.
        v_proj (Linear): Linear transformation for values.
        out_proj (Linear): Linear transformation for output.

    """

    def __init__(self, args: Siglip2ModelArgs):
        super().__init__()
        self.dim = args.dim
        self.head_dim = args.dim // args.n_heads

        self.q_proj = nn.Linear(self.dim, self.dim)
        self.k_proj = nn.Linear(self.dim, self.dim)
        self.v_proj = nn.Linear(self.dim, self.dim)
        self.out_proj = nn.Linear(self.dim, self.dim)

        self.attn = build_attention(
            use_flex_attn=True, attn_mask_type=args.attn_mask_type
        )

    def forward(self, x: torch.Tensor):
        xq, xk, xv = self.q_proj(x), self.k_proj(x), self.v_proj(x)

        # Use self.head_dim instead of `n_heads` to infer the actual number of
        # local heads from the sizes of xq, xk, and xv, as TP may have sharded them
        # after the above linear ops.
        xq = E.rearrange(xq, "b l (h d) -> b h l d", d=self.head_dim)
        xk = E.rearrange(xk, "b l (h d) -> b h l d", d=self.head_dim)
        xv = E.rearrange(xv, "b l (h d) -> b h l d", d=self.head_dim)

        output = self.attn(xq, xk, xv)
        output = E.rearrange(output, "b h l d -> b l (h d)").contiguous()

        return self.out_proj(output)

    def init_weights(self):
        for linear in (self.q_proj, self.k_proj, self.v_proj, self.out_proj):
            nn.init.trunc_normal_(linear.weight, mean=0.0, std=0.02)

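Editor's note: the head-split convention above can be checked in isolation; the sizes below are illustrative.

    # Illustrative check of the head-split rearrange: the number of (local) heads
    # is inferred from the last dimension and head_dim, so it stays correct even
    # when TP has sharded the projection outputs.
    import einops as E
    import torch

    head_dim = 64
    xq = torch.randn(2, 10, 256)                         # (batch, seq, local_dim)
    xq_heads = E.rearrange(xq, "b l (h d) -> b h l d", d=head_dim)
    print(xq_heads.shape)                                # torch.Size([2, 4, 10, 64])
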
class FeedForward(nn.Module):
    def __init__(self, args: Siglip2ModelArgs):
        super().__init__()
        self.fc1 = nn.Linear(args.dim, args.ffn_dim)
        self.fc2 = nn.Linear(args.ffn_dim, args.dim)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = self.fc1(x)
        x = F.gelu(x, approximate="tanh")
        x = self.fc2(x)
        return x

    def init_weights(self):
        nn.init.trunc_normal_(self.fc1.weight, mean=0.0, std=0.02)
        nn.init.trunc_normal_(self.fc2.weight, mean=0.0, std=0.02)

class TransformerLayer(nn.Module):
    def __init__(self, args: Siglip2ModelArgs):
        super().__init__()
        self.layer_norm1 = nn.LayerNorm(args.dim, eps=args.layer_norm_eps)
        self.self_attn = Attention(args)
        self.layer_norm2 = nn.LayerNorm(args.dim, eps=args.layer_norm_eps)
        self.mlp = FeedForward(args)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        x = x + self.self_attn(self.layer_norm1(x))
        x = x + self.mlp(self.layer_norm2(x))
        return x

    def init_weights(self):
        self.layer_norm1.reset_parameters()
        self.layer_norm2.reset_parameters()
        self.self_attn.init_weights()
        self.mlp.init_weights()

class VisionTransformer(nn.Module):
    def __init__(self, args: Siglip2ModelArgs):
        super().__init__()
        self.args = args
        self.eos_id = 11

        self.embeddings = VisionEmbeddings(args)
        self.layers = nn.ModuleDict(
            {str(idx): TransformerLayer(args) for idx in range(args.n_layers)}
        )
        self.post_layernorm = nn.LayerNorm(args.dim, eps=args.layer_norm_eps)

    def forward(
        self,
        pixel_values_NLD: torch.FloatTensor,
        pixel_masks_NL: torch.BoolTensor,
        grid_hw: torch.LongTensor,
    ):
        init_attention_mask(pixel_masks_NL, eos_id=self.eos_id)

        h = self.embeddings(pixel_values_NLD, grid_hw)

        for layer in self.layers.values():
            h = layer(h)
        h = self.post_layernorm(h)

        return h

    def init_weights(self):
        self.embeddings.init_weights()
        for layer in self.layers.values():
            layer.init_weights()
        self.post_layernorm.reset_parameters()
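
Editor's note: a hedged sketch (not part of this commit) of the input layout the encoder's forward consumes, inferred from how grid_hw and pixel_masks are used above; the 8x12 patch grid and padding length are illustrative.

    # Illustrative input layout for VisionTransformer.forward.
    import torch

    n_channels, patch_size, L = 3, 16, 128                 # one image of 8x12 = 96 patches, padded to L
    pixel_values_NLD = torch.randn(1, L, n_channels * patch_size**2)
    grid_hw = torch.full((1, L, 2), -1, dtype=torch.long)  # -1 marks padded patch slots
    grid_hw[0, :96, 0] = torch.arange(96) // 12             # per-patch row index (0..7)
    grid_hw[0, :96, 1] = torch.arange(96) % 12              # per-patch column index (0..11)
    pixel_masks_NL = grid_hw[..., 0] != -1                  # (1, L) valid-patch mask

    # These three tensors are what the encoder consumes:
    # encoder(pixel_values_NLD, pixel_masks_NL, grid_hw)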
