From f5398b3ca8739c37d2bd829158f0e8861d233b1f Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 6 Nov 2025 16:29:44 -0500
Subject: [PATCH 01/16] Vision model

---
 Dockerfile                                   |   2 +-
 fast_llm/layers/attention/rotary/config.py   |   8 +
 fast_llm/layers/attention/rotary/rotary.py   |  47 +++++
 fast_llm/layers/common/linear/config.py      |  50 ++++-
 fast_llm/layers/common/linear/convolution.py |  24 +++
 fast_llm/layers/language_model/embedding.py  |  67 +++++--
 fast_llm/layers/vision/__init__.py           |   0
 fast_llm/layers/vision/config.py             | 169 ++++++++++++++++
 fast_llm/layers/vision/patch_convolution.py  |  71 +++++++
 fast_llm/layers/vision/preprocessing.py      | 194 +++++++++++++++++++
 fast_llm/layers/vision/vision_encoder.py     |  67 +++++++
 fast_llm/models/multimodal/__init__.py       |   0
 fast_llm/models/multimodal/config.py         |  89 +++++++++
 fast_llm/models/multimodal/model.py          | 133 +++++++++++++
 fast_llm/models/multimodal/trainer.py        |  14 ++
 setup.cfg                                    |   9 +-
 16 files changed, 925 insertions(+), 19 deletions(-)
 create mode 100644 fast_llm/layers/vision/__init__.py
 create mode 100644 fast_llm/layers/vision/config.py
 create mode 100644 fast_llm/layers/vision/patch_convolution.py
 create mode 100644 fast_llm/layers/vision/preprocessing.py
 create mode 100644 fast_llm/layers/vision/vision_encoder.py
 create mode 100644 fast_llm/models/multimodal/__init__.py
 create mode 100644 fast_llm/models/multimodal/config.py
 create mode 100644 fast_llm/models/multimodal/model.py
 create mode 100644 fast_llm/models/multimodal/trainer.py

diff --git a/Dockerfile b/Dockerfile
index 00e13d957..6bc900ae7 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,7 +38,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
 COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/
 
 # Install dependencies within the virtual environment.
-RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,GENERATION,DEV]" triton==3.1.0
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,DEV]" triton==3.1.0
 
 # Copy the remaining source code with universal write permissions.
 COPY --chmod=777 ./Megatron-LM Megatron-LM
diff --git a/fast_llm/layers/attention/rotary/config.py b/fast_llm/layers/attention/rotary/config.py
index 26877ee0c..74b5cf21a 100644
--- a/fast_llm/layers/attention/rotary/config.py
+++ b/fast_llm/layers/attention/rotary/config.py
@@ -135,3 +135,11 @@ def _get_configurable_class(self) -> "type[YarnRotary]":
         from fast_llm.layers.attention.rotary.rotary import YarnRotary
 
         return YarnRotary
+
+
+@config_class(dynamic_type={RotaryConfig: "default_2d"})
+class Rotary2DConfig(DefaultRotaryConfig):
+    def _get_configurable_class(self) -> "type[Rotary2D]":
+        from fast_llm.layers.transformer.rotary.rotary import Rotary2D
+
+        return Rotary2D
diff --git a/fast_llm/layers/attention/rotary/rotary.py b/fast_llm/layers/attention/rotary/rotary.py
index d57d72947..6250fd4a9 100644
--- a/fast_llm/layers/attention/rotary/rotary.py
+++ b/fast_llm/layers/attention/rotary/rotary.py
@@ -12,6 +12,7 @@
     DefaultRotaryConfig,
     Llama3RotaryConfig,
     NoRotaryConfig,
+    Rotary2DConfig,
     RotaryConfig,
     YarnRotaryConfig,
 )
@@ -174,3 +175,49 @@ def _get_correction(self, beta: float, dim: int) -> float:
             * math.log(self._config.original_context_length / (beta * 2 * math.pi))
             / (2 * math.log(self._config.theta))
         )
+
+
+class Rotary2D[ConfigType: Rotary2DConfig](DefaultRotary[ConfigType]):
+    _rotary_embedding_frequencies: torch.Tensor
+    _tensor_cache_max_num_patches: int = -1
+    _config: ConfigType
+
+    def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None:
+        self._create_tensors(
+            kwargs[VisionEncoderKwargs.max_image_size] // kwargs[VisionEncoderKwargs.patch_size], batch.device
+        )
+        position_ids = kwargs[VisionTransformerKwargs.patch_position_ids]
+        kwargs[AttentionKwargs.rotary_freq_q] = self._rotary_embedding_frequencies[:, position_ids]
+        kwargs[AttentionKwargs.rotary_freq_k] = self._rotary_embedding_frequencies[:, position_ids]
+
+    def forward(
+        self, query: torch.Tensor, key: torch.Tensor, kwargs: dict[str, typing.Any]
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        rotary_fn = triton_rotary_autograd_ if self._config.triton else apply_rotary_embeddings
+        query = rotary_fn(query, kwargs[AttentionKwargs.rotary_freq_q])
+        key = rotary_fn(key, kwargs[AttentionKwargs.rotary_freq_k])
+        return query, key
+
+    def _get_frequencies(self, sequence_length: int, head_size: int, device: torch.device) -> torch.Tensor:
+        max_num_patches = sequence_length
+        # Calculate complex frequencies by using alternating channels for width and height
+        height_positions = torch.arange(max_num_patches, device=device, dtype=torch.float64)
+        width_positions = torch.arange(max_num_patches, device=device, dtype=torch.float64)
+        frequencies = self._config.theta ** -torch.arange(0, 1, 2 / head_size, device=device, dtype=torch.float64)
+        angles_h = torch.outer(height_positions, frequencies[::2])
+        angles_w = torch.outer(width_positions, frequencies[1::2])
+        angles = torch.cat(
+            [
+                angles_h[:, None, :].repeat(1, max_num_patches, 1),
+                angles_w[None, :, :].repeat(max_num_patches, 1, 1),
+            ],
+            dim=-1,
+        ).reshape(-1, head_size // 2)
+
+        frequencies = torch.polar(torch.ones_like(angles), angles)[None, :, None, :].to(torch.complex64)
+        if not self._config.complex_format:
+            frequencies = convert_rotary_complex_to_real(
+                torch.view_as_real(frequencies).flatten(-2), head_size, 3
+            ).contiguous()
+
+        return frequencies
diff --git a/fast_llm/layers/common/linear/config.py b/fast_llm/layers/common/linear/config.py
index e7c6d9e92..0dc118269 100644
--- a/fast_llm/layers/common/linear/config.py
+++ b/fast_llm/layers/common/linear/config.py
@@ -1,7 +1,12 @@
 import typing
 
 from fast_llm.config import Config, Field, FieldHint, check_field, config_class
-from fast_llm.engine.config_utils.initialization import Initialization, init_uniform_centered_, init_zeros_
+from fast_llm.engine.config_utils.initialization import (
+    Initialization,
+    init_normal_,
+    init_uniform_centered_,
+    init_zeros_,
+)
 from fast_llm.engine.config_utils.parameter import OptionalParameterConfig, ParameterConfig, combine_lr_scales
 from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
 from fast_llm.functional.config import ActivationType
@@ -9,7 +14,7 @@
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
-    from fast_llm.layers.common.linear.convolution import CausalConv1d
+    from fast_llm.layers.common.linear.convolution import CausalConv1d, Convolution2D
     from fast_llm.layers.common.linear.linear import LinearBase
 
 
@@ -217,3 +222,44 @@ def get_layer(
         return CausalConv1d(
             weight, bias, activation=default_activation if self.activation is None else self.activation
         )
+
+
+@config_class
+class Convolution2DConfig(AffineLinearBaseConfig):
+    def get_layer(
+        self,
+        in_dim: TensorDim,
+        out_dim: TensorDim,
+        kernel_dim_1: TensorDim,
+        kernel_dim_2: TensorDim,
+        *,
+        stride: tuple[int, int],
+        default_weight_initialization: Initialization | None = None,
+        default_bias_initialization: Initialization | None = None,
+        default_add_bias: bool = True,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ) -> "Convolution2D":
+        from fast_llm.layers.common.linear.convolution import Convolution2D
+
+        if default_weight_initialization is None:
+            default_weight_initialization = init_normal_()
+        if default_bias_initialization is None:
+            default_bias_initialization = init_normal_()
+
+        lr_scale = (combine_lr_scales(lr_scale, self.lr_scale),)
+        weight = self.weight.get_parameter(
+            (out_dim, in_dim, kernel_dim_1, kernel_dim_2),
+            default_initialization=default_weight_initialization,
+            lr_scale=lr_scale,
+            peft=peft,
+        )
+        bias = self.bias.get_parameter(
+            (out_dim,),
+            default_initialization=default_bias_initialization,
+            lr_scale=lr_scale,
+            default_enabled=default_add_bias,
+            peft=peft,
+        )
+
+        return Convolution2D(weight, bias, stride=stride)
diff --git a/fast_llm/layers/common/linear/convolution.py b/fast_llm/layers/common/linear/convolution.py
index b88b7b2e6..6281348e1 100644
--- a/fast_llm/layers/common/linear/convolution.py
+++ b/fast_llm/layers/common/linear/convolution.py
@@ -55,3 +55,27 @@ def _forward_causal_conv1d(self, input_: torch.Tensor) -> torch.Tensor:
 
     def get_compute_usage(self, input_: TensorMeta, config: ResourceUsageConfig) -> int:
         raise NotImplementedError()
+
+
+class Convolution2D(torch.nn.Module):
+    """
+    TODO: Generalize to other convolutions?
+    """
+
+    def __init__(
+        self,
+        weight: ParameterMeta,
+        bias: ParameterMeta | None,
+        *,
+        stride: tuple[int, int],
+    ):
+        super().__init__()
+        self.weight = weight
+        self.bias = bias
+        self._stride = stride
+
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.conv2d(input_, self.weight, self.bias, stride=self._stride)
+
+    def get_compute_usage(self, input_: TensorMeta, config: ResourceUsageConfig) -> int:
+        raise NotImplementedError()
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index 61ca1cfc0..b9d209274 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -3,7 +3,7 @@
 import torch
 
 from fast_llm.core.distributed import set_generator
-from fast_llm.core.ops import reduce_forward, split
+from fast_llm.core.ops import gather, reduce_forward, split
 from fast_llm.engine.base_model.config import ResourceUsageConfig
 from fast_llm.engine.config_utils.initialization import init_normal_
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
@@ -14,6 +14,8 @@
 from fast_llm.tensor import TensorMeta
 from fast_llm.utils import Assert
 
+WORD_EMBEDDINGS_WEIGHT = "word_embeddings_weight"
+
 
 class LanguageModelEmbedding[ConfigType: LanguageModelEmbeddingsConfig](Block[ConfigType]):
     """
@@ -26,7 +28,8 @@ class LanguageModelEmbedding[ConfigType: LanguageModelEmbeddingsConfig](Block[Co
     layer_count: float = 1000.0
     _config: ConfigType
 
-    # Position embedding preprocessing
+    # Preprocessing
+    _rotary_embedding_frequencies: torch.Tensor
     _position_ids: torch.Tensor
     _tensor_cache_max_sequence_length: int = -1
 
@@ -75,34 +78,62 @@ def __init__(
         )
 
     @torch.compile
-    def _forward(self, input_: torch.Tensor, position_ids: torch.Tensor | None, mask_inputs: bool) -> torch.Tensor:
+    def _forward(
+        self,
+        input_: torch.Tensor,
+        token_ids: torch.Tensor,
+        position_ids: torch.Tensor | None,
+        mask_inputs: bool,
+        # TODO: Flatten the batch and sequence in the map?
+        embedding_map: tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]] | None,
+    ) -> torch.Tensor:
         Assert.eq(position_ids is None, self.position_embeddings_weight is None)
         group = self._parallel_dim.group
         if self._vocab_parallel:
-            input_mask = (input_ >= self._vocab_start_index) * (input_ < self._vocab_end_index)
-            masked_input = (input_ - self._vocab_start_index) * input_mask
-            embeddings = torch.embedding(self.word_embeddings_weight, masked_input) * input_mask.unsqueeze(2)  # noqa
+            token_mask = (token_ids >= self._vocab_start_index) * (token_ids < self._vocab_end_index)
+            masked_input = (token_ids - self._vocab_start_index) * token_mask
+            embeddings = torch.embedding(self.word_embeddings_weight, masked_input) * token_mask.unsqueeze(2)  # noqa
             embeddings = reduce_forward(embeddings, group)
+            # TODO: Input masking of position embeddings inconsistant with non-vocab-parallel
             if self.position_embeddings_weight is not None:
                 embeddings = embeddings + torch.nn.functional.embedding(position_ids, self.position_embeddings_weight)
+
+            if embedding_map is not None:
+                # TODO: Accumulate redundant with masking?
+                input_index, embedding_index = embedding_map
+                if self._sequence_parallel:
+                    input_ = gather(input_, group=group, dim=0)
+                embeddings = embeddings.index_put(embedding_index, input_[input_index], accumulate=True)
+
             if self._sequence_parallel:
                 embeddings = split(embeddings, group=group, dim=0)
         else:
             if self._sequence_parallel:
-                input_ = split(input_, group=group, dim=0)
+                token_ids = split(token_ids, group=group, dim=0)
                 if self.position_embeddings_weight is not None:
                     position_ids = split(position_ids, group=group, dim=0)
             # handle masked tokens
             if mask_inputs:
-                input_mask = input_ >= 0
-                masked_input = input_ * input_mask
-                embeddings = torch.embedding(self.word_embeddings_weight, masked_input)
-            else:
-                embeddings = torch.embedding(self.word_embeddings_weight, input_)
+                token_mask = token_ids >= 0
+                token_ids = token_ids * token_mask
+            embeddings = torch.embedding(self.word_embeddings_weight, token_ids)
             if self.position_embeddings_weight is not None:
                 embeddings = embeddings + torch.nn.functional.embedding(position_ids, self.position_embeddings_weight)
             if mask_inputs:
-                embeddings = embeddings * input_mask.unsqueeze(2)
+                embeddings = embeddings * token_mask.unsqueeze(2)
+
+            if embedding_map is not None:
+                # TODO: Accumulate redundant with masking?
+                input_index, embedding_index = embedding_map
+                if self._sequence_parallel:
+                    #  TODO:: Filter and shift embedding map instead? (needs cuda sync)
+                    input_ = gather(input_, group=group, dim=0)
+                    embeddings_ = embeddings.new_zeros(embeddings.shape[0] * group.size(), *embeddings.shape[1:])
+                    embeddings_.index_put(embedding_index, input_[input_index], accumulate=True)
+                    embeddings = embeddings + split(embeddings_, group=group, dim=0)
+                else:
+                    embeddings = embeddings.index_put(embedding_index, input_[input_index], accumulate=True)
+
         with set_generator(
             self._distributed.tp_generator if self._sequence_parallel else self._distributed.pp_generator
         ):
@@ -119,11 +150,17 @@ def forward(
         if isinstance(input_, TensorMeta):
             return TensorMeta.from_dims(
                 kwargs[LanguageModelKwargs.hidden_dims],
-                tensor_name=f"{self.module_name} output",
+                tensor_name="Embedding output",
                 dtype=self._residual_dtype,
             )
+
         return self._forward(
-            input_, kwargs.get(LanguageModelKwargs.position_ids), kwargs.get(LanguageModelKwargs.mask_inputs)
+            input_,
+            kwargs.get(LanguageModelKwargs.token_ids),
+            kwargs.get(LanguageModelKwargs.position_ids),
+            # TODO ====== Vision ====== Review input masking.
+            kwargs.get(LanguageModelKwargs.mask_inputs),
+            kwargs.get(LanguageModelKwargs.embedding_map),
         )
 
     def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
diff --git a/fast_llm/layers/vision/__init__.py b/fast_llm/layers/vision/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
new file mode 100644
index 000000000..1af986eef
--- /dev/null
+++ b/fast_llm/layers/vision/config.py
@@ -0,0 +1,169 @@
+import typing
+
+from fast_llm.config import Config, Field, FieldHint, check_field, config_class
+from fast_llm.layers.block.config import BlockConfig, BlockSequenceConfig
+from fast_llm.layers.common.linear.config import Convolution2DConfig
+from fast_llm.layers.common.normalization.config import NormalizationConfig
+from fast_llm.layers.decoder.config import MLPBaseConfig
+from fast_llm.utils import Assert
+
+if typing.TYPE_CHECKING:
+    from fast_llm.layers.vision.vision_encoder import VisionEncoder
+
+
+@config_class()
+class ImageNormalizationConfig(Config):
+    mean_r: float = Field(
+        default=0.48145466,
+        desc="Mean value for the red channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    mean_g: float = Field(
+        default=0.4578275,
+        desc="Mean value for the green channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    mean_b: float = Field(
+        default=0.40821073,
+        desc="Mean value for the blue channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    std_r: float = Field(
+        default=0.26862954,
+        desc="Standard deviation value for the red channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    std_g: float = Field(
+        default=0.26130258,
+        desc="Standard deviation value for the green channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    std_b: float = Field(
+        default=0.27577711,
+        desc="Standard deviation value for the blue channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    rescale_factor: float = Field(
+        default=255.0,
+        desc="Rescale factor for the image normalization process.",
+        hint=FieldHint.optional,
+    )
+
+
+@config_class()
+class PatchConvolutionConfig(BlockConfig):
+    _abstract = False
+    convolution: Convolution2DConfig = Field(
+        desc="Configuration for the 2d convolution.",
+        hint=FieldHint.architecture,
+    )
+    normalization: NormalizationConfig = Field(
+        desc="Configuration for the normalization layer.",
+        hint=FieldHint.architecture,
+    )
+    patch_size: int = Field(
+        default=16,
+        desc="Size of image patches, in pixels (width and height).",
+        hint=FieldHint.core,
+    )
+    input_channels: int = Field(
+        default=3,
+        desc="Number of pixel channels (usually 3).",
+        hint=FieldHint.feature,
+    )
+
+
+@config_class(registry=True)
+class VisionEncoderConfig(BlockConfig):
+    _abstract = False
+    patch_convolution: PatchConvolutionConfig = Field(
+        desc="Configuration for the patch convolution layer.",
+        hint=FieldHint.architecture,
+    )
+    adapter: MLPBaseConfig = Field(
+        desc="Configuration for the adapter layer.",
+        hint=FieldHint.architecture,
+    )
+    # TODO: ====== Appropriate name?? ======
+    decoder: BlockSequenceConfig = Field(
+        desc="Configuration for the vision decoder.",
+        hint=FieldHint.architecture,
+    )
+    hidden_size: int = Field(
+        default=1024,
+        desc="Size of the vision encoder main hidden dimension.",
+        hint=FieldHint.architecture,
+        valid=check_field(Assert.gt, 0),
+    )
+
+    @property
+    def layer_class(self) -> "type[VisionEncoder]":
+        from fast_llm.layers.vision.vision_encoder import VisionEncoder
+
+        return VisionEncoder
+
+    # transformer: TransformerConfig = Field(
+    #    desc="Configuration for the vision transformer architecture.",
+    #    hint=FieldHint.core,
+    # )
+    # patch_size: int = Field(
+    #    default=16,
+    #    desc="Patch size for the image encoder.",
+    #    hint=FieldHint.core,
+    # )
+    # conv_bias: bool = Field(
+    #    default=False,
+    #    desc="Whether to use bias in the convolutional layer.",
+    #    hint=FieldHint.optional,
+    # )
+    # patch_norm: NormalizationConfig = Field(
+    #    desc="Configuration for the normalization layers applied to the image patches.",
+    #    hint=FieldHint.optional,
+    # )
+    # adapter_size: int = Field(
+    #    default=5120,
+    #    desc="Intermediate size for the adapter linear layers. Assuming 2 linear layers",
+    #    hint=FieldHint.core,
+    # )
+    # adapter_activation_type: ActivationType = Field(
+    #    default=ActivationType.gelu,
+    #    desc="The intermediate activation type for multi-modal adapter. Default: GeLU.",
+    #    hint=FieldHint.core,
+    # )
+    # adapter_bias: bool = Field(
+    #    default=True,
+    #    desc="Whether to use bias in the adapter linear layer.",
+    #    hint=FieldHint.optional,
+    # )
+    # image_normalization: ImageNormalizationConfig = Field(
+    #    desc="Configuration for the normalization layers applied to the image patches.",
+    #    hint=FieldHint.optional,
+    # )
+    # image_break_token: int | None = Field(
+    #    default=None,
+    #    desc="Token id to separate image rows. If None, no token id is applied.",
+    #    hint=FieldHint.optional,
+    # )
+    # image_end_token: int | None = Field(
+    #    default=None,
+    #    desc="Token id to indicate the end of an image. If None, no token id is applied.",
+    #    hint=FieldHint.optional,
+    # )
+    # adapter_lr_scale: float | None = Field(
+    #    default=None,
+    #    desc="Custom learning rate scale for the adapter weights.",
+    #    hint=FieldHint.feature,
+    #    valid=skip_valid_if_none(check_field(Assert.geq, 0)),
+    # )
+    # conv_lr_scale: float | None = Field(
+    #    default=None,
+    #    desc="Custom learning rate scale for the convolutional layer weights.",
+    #    hint=FieldHint.feature,
+    #    valid=skip_valid_if_none(check_field(Assert.geq, 0)),
+    # )
+    # adapter_init_method_std: float = Field(
+    #    default=None,
+    #    desc="Standard deviation for the normal initialization of the adapter weights. Default: adapter_size ** -0.5.",
+    #    hint=FieldHint.optional,
+    #    valid=check_field(Assert.geq, 0),
+    # )
diff --git a/fast_llm/layers/vision/patch_convolution.py b/fast_llm/layers/vision/patch_convolution.py
new file mode 100644
index 000000000..46cf86708
--- /dev/null
+++ b/fast_llm/layers/vision/patch_convolution.py
@@ -0,0 +1,71 @@
+import typing
+
+import torch
+
+from fast_llm.core.ops import split
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
+from fast_llm.engine.distributed.config import DistributedConfig, DistributedDimNames
+from fast_llm.layers.attention.config import AttentionKwargs
+from fast_llm.layers.block.block import Block
+from fast_llm.layers.common.peft.config import PeftConfig
+from fast_llm.layers.vision.config import PatchConvolutionConfig
+from fast_llm.tensor import TensorMeta
+
+
+class PatchConvolution[ConfigType: PatchConvolutionConfig](Block[ConfigType]):
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+        *,
+        # TODO: Input or output dim?
+        hidden_dim: TensorDim,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ):
+        super().__init__(
+            config,
+            distributed_config,
+            hidden_dim=hidden_dim,
+            lr_scale=lr_scale,
+            peft=peft,
+        )
+        input_dim = TensorDim("input_channels", self._config.input_channels)
+        patch_dim = TensorDim("patch", self._config.patch_size)
+        self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
+
+        self.convolution = self._config.convolution.get_layer(
+            self._hidden_dim,
+            input_dim,
+            patch_dim,
+            patch_dim,
+            stride=(self._config.patch_size, self._config.patch_size),
+            default_add_bias=False,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+        self.normalization = self._config.normalization.get_layer(hidden_dim, lr_scale=self._lr_scale, peft=self._peft)
+
+    def forward(
+        self,
+        input_: torch.Tensor,
+        kwargs: dict[str, typing.Any],
+        losses: dict[str, typing.Any] | None = None,
+        metrics: dict | None = None,
+    ) -> torch.Tensor:
+        if isinstance(input_, TensorMeta):
+            return TensorMeta.from_dims(
+                input_.dims[:-1] + (self._hidden_dim,), tensor_name="patch conv output", dtype=input_.dtype
+            )
+        # TODO: Avoid padding
+        input_ = self.convolution(input_)
+        patch_embeddings = self.normalization(input_.flatten(1)).view_as(input_)
+
+        # TODO: Permute earlier?
+        if kwargs[AttentionKwargs.sequence_first]:
+            patch_embeddings = patch_embeddings.permute(1, 0, 2).contiguous()
+        if self._sequence_parallel:
+            patch_embeddings = split(patch_embeddings, group=self._parallel_dim.group, dim=0)
+        return patch_embeddings
diff --git a/fast_llm/layers/vision/preprocessing.py b/fast_llm/layers/vision/preprocessing.py
new file mode 100644
index 000000000..83331c739
--- /dev/null
+++ b/fast_llm/layers/vision/preprocessing.py
@@ -0,0 +1,194 @@
+import math
+import typing
+
+import torch
+import torchvision.transforms.v2 as torchvision_transforms
+
+from fast_llm.engine.distributed.distributed import Distributed
+from fast_llm.layers.attention.config import AttentionKwargs
+from fast_llm.layers.language_model.config import LanguageModelKwargs
+from fast_llm.layers.vision.config import ImageNormalizationConfig, VisionEncoderConfig
+from fast_llm.utils import div
+
+
+def get_num_patches(height: int, width: int, patch_size: int) -> int:
+    """
+    Calculate the number of patches in height and width dimensions.
+    """
+    return div(height, patch_size) * div(width, patch_size)
+
+
+def get_num_image_tokens(height: int, width: int, patch_size: int, image_break: bool, image_end: bool) -> int:
+    """
+    Calculate the number of image tokens.
+    If image_break is True, we consider 1 additional token after every row of patches.
+    """
+    height_patches = div(height, patch_size)
+    width_patches = div(width, patch_size)
+    num_tokens = height_patches * width_patches
+    if image_break:
+        num_tokens += height_patches
+    elif image_end:
+        num_tokens += 1
+    return num_tokens
+
+
+def get_resize_dims(height: int, width: int, max_height: int, max_width: int, patch_size: int) -> tuple[int, int]:
+    """
+    Calculate the new dimensions for resizing an image while maintaining the aspect ratio.
+    If the image is larger than the max dimensions, it will be resized to fit within them.
+    If the image is smaller, it will be resized to the nearest multiple of the patch size.
+    """
+    ratio = max(height / max_height, width / max_width)
+    if ratio > 1:
+        # Resize to fit within max dimensions
+        height = int(height / ratio)
+        width = int(width / ratio)
+    return patch_size * math.ceil(height / patch_size), patch_size * math.ceil(width / patch_size)
+
+
+def resize(image: torch.Tensor, target_height: int, target_width: int) -> torch.Tensor:
+    # cap the resizing to half of the current size as a workaround for large images
+    # See pytorch issue: https://github.com/pytorch/pytorch/issues/103589
+    while max(image.size(1) / target_height, image.size(2) / target_width) > 2:
+        image = torchvision_transforms.functional.resize(
+            image,
+            size=(math.ceil(image.size(1) / 2), math.ceil(image.size(2) / 2)),
+            interpolation=torchvision_transforms.InterpolationMode.BICUBIC,
+        )
+
+    # TODO: options for interpolation mode?
+    return torchvision_transforms.functional.resize(
+        image, size=(target_height, target_width), interpolation=torchvision_transforms.InterpolationMode.BICUBIC
+    )
+
+
+def position_ids_in_meshgrid(height, width, max_size, patch_size) -> torch.Tensor:
+    patch_height = height // patch_size
+    patch_width = width // patch_size
+    return torch.arange(patch_height).repeat_interleave(patch_width) * max_size + torch.arange(patch_width).repeat(
+        patch_height
+    )
+
+
+class VisionPreprocessor:
+    def __init__(self, config: VisionEncoderConfig, distributed: Distributed):
+        self._config = config
+        self._distributed = distributed
+
+    def preprocess(self, tokens, kwargs: dict[str, typing.Any]) -> None:
+        max_image_size = kwargs.get(VisionEncoderKwargs.max_image_size)
+        patch_size = self._config.patch_size
+        image_sizes = []
+
+        norm_config: ImageNormalizationConfig = kwargs["norm_config"]
+
+        if LanguageModelKwargs.labels in kwargs:
+            labels = kwargs[LanguageModelKwargs.labels]
+            if (self._config.image_break_token is not None) or (self._config.image_end_token is not None):
+                # If image break or end token is present, we need to replace image token ids to -100 in labels
+                # TODO: avoid double cloning labels in case of loss masking spans?
+                labels = labels.clone()
+        patches = []
+        patch_position_ids = []
+        sequence_lengths = [0]
+        max_sequence_length = -1
+
+        for sample_index, (sample_images_, positions) in enumerate(
+            zip(kwargs[VisionEncoderKwargs.images], kwargs.get(VisionEncoderKwargs.image_positions), strict=True)
+        ):
+            image_sizes.append(sample_image_sizes := [])
+
+            sample_sequence_length = 0
+
+            for image, position in zip(sample_images_, positions, strict=True):
+                height, width = get_resize_dims(
+                    image.size(1), image.size(2), max_image_size, max_image_size, patch_size=patch_size
+                )
+
+                sample_image_sizes.append((height, width))
+
+                image = resize(image, height, width)
+
+                # TODO: Normalize with constant dtype instead?
+                image = image.to(dtype=self._distributed.config.training_dtype.torch)
+
+                image = torchvision_transforms.functional.normalize(
+                    image / norm_config.rescale_factor,
+                    mean=[norm_config.mean_r, norm_config.mean_g, norm_config.mean_b],
+                    std=[norm_config.std_r, norm_config.std_g, norm_config.std_b],
+                )
+                patches.extend(
+                    torch.nn.functional.unfold(image, kernel_size=patch_size, stride=patch_size).T.reshape(
+                        -1, 3, patch_size, patch_size
+                    )
+                )
+
+                num_height_patches = div(height, patch_size)
+                num_width_patches = div(width, patch_size)
+                grid_height = torch.arange(num_height_patches).repeat_interleave(num_width_patches)
+                grid_width = torch.arange(num_width_patches).repeat(num_height_patches)
+                grid_height * div(max_image_size, patch_size) + grid_width
+                patch_position_ids.append(grid_height * div(max_image_size, patch_size) + grid_width)
+
+                if LanguageModelKwargs.labels in kwargs:
+                    num_tokens = get_num_image_tokens(
+                        height,
+                        width,
+                        patch_size=patch_size,
+                        image_break=self._config.image_break_token is not None,
+                        image_end=self._config.image_end_token is not None,
+                    )
+                    # set labels for image patches to -100
+                    labels[sample_index, max(position - 1, 0) : position + num_tokens - 1] = -100
+
+                sequence_lengths.append(sequence_length := num_height_patches * num_width_patches)
+                if sequence_length > max_sequence_length:
+                    max_sequence_length = sequence_length
+                sample_sequence_length += sequence_length
+
+            # TODO: No need for padding with varlen?
+            padding_size = kwargs[AttentionKwargs.sequence_length] - sample_sequence_length
+            if padding_size > max_sequence_length:
+                max_sequence_length = padding_size
+            sequence_lengths.append(padding_size)
+
+            patches.append(
+                torch.zeros(padding_size, 3, patch_size, patch_size).to(
+                    dtype=self._tensor_space.distributed_config.training_dtype.torch,
+                    device=self._tensor_space.distributed.device,
+                ),
+            )
+            patch_position_ids.append(torch.full((padding_size,), 0, dtype=torch.int64))
+
+        kwargs[VisionEncoderKwargs.image_sizes] = image_sizes
+        kwargs[VisionEncoderKwargs.image_patches] = torch.cat(patches).to(device=self._distributed.device)
+        kwargs[VisionTransformerKwargs.patch_position_ids] = torch.cat(patch_position_ids).to(
+            device=self._distributed.device
+        )
+        kwargs[VisionEncoderKwargs.max_image_tokens] = div(max_image_size**2, patch_size**2)
+        # sequence data parallel is not yet supported for images, so we use the same cu_seqlens for q and k
+        kwargs[VisionTransformerKwargs.cu_seqlens_q] = torch.tensor(
+            cu_seqlens, device=self._distributed.device, dtype=torch.int32
+        )
+        kwargs[VisionTransformerKwargs.cu_seqlens_k] = torch.tensor(
+            cu_seqlens, device=self._distributed.device, dtype=torch.int32
+        )
+        kwargs[VisionTransformerKwargs.max_seqlen_q] = max_sequence_length
+        kwargs[VisionTransformerKwargs.max_seqlen_k] = max_sequence_length
+        if LanguageModelKwargs.labels in kwargs:
+            kwargs[LanguageModelKwargs.labels] = labels
+
+        # TODO: add proper preprocessing for attention-mask when not using flash attention
+        # Following is just a dummy code to run the tests.
+        kwargs[self._config.transformer._transformer_kwargs.attention_mask] = torch.ones(
+            (1, 1, kwargs[AttentionKwargs.sequence_length], 1, kwargs[AttentionKwargs.sequence_length]),
+            dtype=torch.bool,
+            device=self._tensor_space.distributed.device,
+        )
+        kwargs[self._config.transformer._transformer_kwargs.attention_mask_value] = torch.full(
+            [],
+            torch.finfo(self._distributed.config.training_dtype.torch).min,
+            dtype=self._distributed.config.training_dtype.torch,
+            device=self._distributed.device,
+        )
diff --git a/fast_llm/layers/vision/vision_encoder.py b/fast_llm/layers/vision/vision_encoder.py
new file mode 100644
index 000000000..b4fa189d5
--- /dev/null
+++ b/fast_llm/layers/vision/vision_encoder.py
@@ -0,0 +1,67 @@
+import logging
+import typing
+
+import torch
+
+from fast_llm.engine.base_model.base_model import Layer
+from fast_llm.engine.base_model.config import LossDef
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
+from fast_llm.engine.distributed.config import DistributedConfig
+from fast_llm.layers.block.block import BlockBase
+from fast_llm.layers.common.peft.config import PeftConfig
+from fast_llm.layers.vision.config import VisionEncoderConfig
+
+logger = logging.getLogger(__name__)
+
+
+class VisionEncoder[ConfigType: VisionEncoderConfig](BlockBase[VisionEncoderConfig]):
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+        *,
+        hidden_dim: TensorDim,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ):
+        vision_hidden_dim = TensorDim("hidden", self._config.hidden_size)
+        super().__init__(config, distributed_config, hidden_dim=hidden_dim, lr_scale=lr_scale, peft=peft)
+        self.patch_convolution = self._config.patch_convolution.get_layer(
+            distributed_config,
+            vision_hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+        # TODO: ====== Appropriate name?? ======
+        self.decoder = self._config.decoder.get_layer(
+            distributed_config,
+            vision_hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+        # TODO: ====== Hidden dim ======
+        self.adapter = self._config.adapter.get_layer(
+            distributed_config,
+            vision_hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+
+    def get_layers(self) -> list["Layer"]:
+        return self.patch_convolution.get_layers() + self.decoder.get_layers() + self.adapter.get_layers()
+
+    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+        # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
+        self.patch_convolution.preprocess(batch, kwargs)
+        self.decoder.preprocess(batch, kwargs)
+        self.adapter.preprocess(batch, kwargs)
+
+    def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
+        # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
+        return (
+            self.patch_convolution.get_loss_definitions(count)
+            + self.decoder.get_loss_definitions(count)
+            + self.adapter.get_loss_definitions(count)
+        )
diff --git a/fast_llm/models/multimodal/__init__.py b/fast_llm/models/multimodal/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/fast_llm/models/multimodal/config.py b/fast_llm/models/multimodal/config.py
new file mode 100644
index 000000000..2415734e4
--- /dev/null
+++ b/fast_llm/models/multimodal/config.py
@@ -0,0 +1,89 @@
+import logging
+import typing
+
+from fast_llm.config import Field, FieldHint, FieldUpdate, config_class
+from fast_llm.engine.checkpoint.config import CheckpointFormat
+from fast_llm.engine.config_utils.runnable import RunnableConfig
+from fast_llm.engine.multi_stage.config import FastLLMModelConfig
+from fast_llm.engine.training.config import TrainerConfig
+from fast_llm.layers.vision.config import VisionEncoderConfig
+from fast_llm.models.gpt.config import (
+    GPTBaseModelConfig,
+    GPTBatchConfig,
+    GPTModelConfig,
+    GPTTrainerConfig,
+    PretrainedGPTModelConfig,
+)
+
+if typing.TYPE_CHECKING:
+    from fast_llm.models.multimodal.huggingface import HuggingfaceMultiModalModelForCausalLM
+    from fast_llm.models.multimodal.model import MultiModalBaseModel, MultiModalModel, MultiModalModelInferenceRunner
+    from fast_llm.models.multimodal.trainer import MultiModalTrainer
+
+logger = logging.getLogger(__name__)
+
+
+@config_class()
+class MultiModalBatchConfig(GPTBatchConfig):
+    pass
+
+
+@config_class()
+class MultiModalBaseModelConfig(GPTBaseModelConfig):
+    vision_encoder: VisionEncoderConfig = Field(
+        hint=FieldHint.architecture,
+        desc="Configuration for the vision encoder.",
+    )
+
+    @property
+    def base_model_class(self) -> type["MultiModalBaseModel"]:
+        from fast_llm.models.multimodal.model import MultiModalBaseModel
+
+        return MultiModalBaseModel
+
+
+@config_class(dynamic_type={FastLLMModelConfig: "gpt"})
+class MultiModalModelConfig(GPTModelConfig):
+    _abstract = False
+    model_name: typing.ClassVar[str] = "gpt"
+    base_model: GPTBaseModelConfig = FieldUpdate()
+    # TODO: ====== Conversion ======
+    checkpoint_formats: typing.ClassVar[tuple[type[CheckpointFormat], ...]] = FastLLMModelConfig.checkpoint_formats
+
+    @classmethod
+    def get_model_class(cls) -> type["MultiModalModel"]:
+        from fast_llm.models.multimodal.model import MultiModalModel
+
+        return MultiModalModel
+
+    @classmethod
+    def get_inference_runner_class(cls) -> type["MultiModalModelInferenceRunner"]:
+        from fast_llm.models.multimodal.model import MultiModalModelInferenceRunner
+
+        return MultiModalModelInferenceRunner
+
+    @classmethod
+    def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceMultiModalModelForCausalLM"]:
+        from fast_llm.models.multimodal.huggingface import HuggingfaceMultiModalModelForCausalLM
+
+        return HuggingfaceMultiModalModelForCausalLM
+
+
+@config_class()
+class PretrainedMultiModalModelConfig(PretrainedGPTModelConfig):
+    _abstract = False
+    model: MultiModalModelConfig = FieldUpdate()
+
+
+@config_class(dynamic_type={RunnableConfig: "train_gpt", TrainerConfig: "gpt"})
+class MultiModalTrainerConfig(PretrainedMultiModalModelConfig, GPTTrainerConfig):
+    data: MultiModalDataConfig = FieldUpdate()
+    batch: MultiModalBatchConfig = FieldUpdate()
+    # TODO: Use dynamic model type?
+    reference_models: dict[str, PretrainedMultiModalModelConfig] = FieldUpdate()
+
+    @classmethod
+    def get_trainer_class(cls) -> type["MultiModalTrainer"]:
+        from fast_llm.models.multimodal.trainer import MultiModalTrainer
+
+        return MultiModalTrainer
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
new file mode 100644
index 000000000..7426191f7
--- /dev/null
+++ b/fast_llm/models/multimodal/model.py
@@ -0,0 +1,133 @@
+import logging
+import typing
+
+import torch
+
+from fast_llm.data.data.gpt.data import GPTBatch
+from fast_llm.engine.distributed.config import DistributedConfig, PhaseType
+from fast_llm.engine.inference.runner import InferenceRunner
+from fast_llm.models.gpt.config import GPTBatchConfig
+from fast_llm.models.gpt.model import GPTBaseModel, GPTModel
+from fast_llm.models.multimodal.config import MultiModalBaseModelConfig, MultiModalBatchConfig, MultiModalModelConfig
+from fast_llm.tensor import TensorMeta
+
+logger = logging.getLogger(__name__)
+
+
+class MultiModalBaseModel[ConfigType: MultiModalBaseModelConfig](GPTBaseModel[ConfigType]):
+    """
+    A transformer-based language model generalizing the GPT model architecture.
+    """
+
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+    ):
+        super().__init__(config, distributed_config)
+        self.vision_encoder = self._config.vision_encoder.get_layer(
+            distributed_config,
+            self._hidden_dim,
+            lr_scale=None,
+            peft=self._config.peft,
+        )
+
+    def preprocess_meta(
+        self, batch_meta: GPTBatchConfig | torch.Tensor, phase: PhaseType
+    ) -> list[tuple[TensorMeta, dict]]:
+        # TODO Remove (Move batch splitting elsewhere)
+        # TODO: Use parallel/sequential dims, distinguish micro and full batch/sequence
+        # TODO ====== Vision ======
+        # if self._config.vision_encoder.enabled:
+        #    try:
+        #        max_image_size = batch_meta.max_image_size
+        #    except AttributeError:
+        #        max_image_size = 256
+        #        logger.warning("Inference mode: max_image_size not provided, defaulting to 256")
+        #    vision_kwargs = {
+        #        VisionEncoderKwargs.patch_size: self._config.vision_encoder.patch_size,
+        #        VisionEncoderKwargs.max_image_size: max_image_size,
+        #        VisionEncoderKwargs.rope_theta: self._config.vision_encoder.transformer.rotary.theta,
+        #        VisionEncoderKwargs.kv_channels: self._tensor_space[VisionTransformerDimNames.kv_channels].size,
+        #        VisionEncoderKwargs.out_channels: self._tensor_space[VisionEncoderDimNames.out_channels].size,
+        #    }
+        #    vision_hidden_dim = self._tensor_space[VisionTransformerDimNames.hidden]
+        #    vision_hidden_dims = (
+        #        (hidden_sequence_q_dim, batch_dim, vision_hidden_dim)
+        #        if sequence_first
+        #        else (batch_dim, hidden_sequence_q_dim, vision_hidden_dim)
+        #    )
+        #    vision_kwargs.update(
+        #        {
+        #            VisionTransformerKwargs.hidden_dims: vision_hidden_dims,
+        #        }
+        #    )
+        #    common_kwargs.update(vision_kwargs)
+
+        # TODO ====== Vision ======
+        # if self._config.vision_encoder.enabled:
+        #     # patch_dimensions are (batch * sequence_length) x 3 x patch_size x patch_size
+        #     preprocessed_meta.append((kwargs[VisionEncoderKwargs.image_patches_meta], kwargs))
+        # else:
+        #     preprocessed_meta.append((tokens, kwargs))
+        pass
+
+    def preprocess_batch(
+        self,
+        batch: GPTBatch,
+        preprocessed_meta: list[tuple[TensorMeta, dict]] | None = None,
+        *,
+        phase: PhaseType,
+        iteration: int,
+        metrics: dict | None = None,
+    ) -> list[tuple[torch.Tensor, dict]]:
+        # TODO Move batch splitting elsewhere, align interface with LayerBase
+        # TODO ====== Vision ======
+        # if self._config.vision_encoder.enabled:
+        #    if self._config.vision_encoder.image_break_token is not None:
+        #        if not labels_cloned:
+        #            labels = labels.clone()
+        #            labels_cloned = True
+        #        labels = torch.where(labels == self._config.vision_encoder.image_break_token, -100, labels)
+        #    if self._config.vision_encoder.image_end_token is not None:
+        #        if not labels_cloned:
+        #            labels = labels.clone()
+        #            labels_cloned = True
+        #        labels = torch.where(labels == self._config.vision_encoder.image_end_token, -100, labels)
+        # Loss-masking for distillation losses
+        # TODO ====== Vision ======
+        # if self._config.vision_encoder.enabled:
+        #    batch_images = (
+        #        batch.images if batch.images is not None else [[]] * kwargs[AttentionKwargs.micro_batch_size]
+        #    )
+        #    kwargs[VisionEncoderKwargs.images] = [
+        #        [
+        #            img.to(device=self._tensor_space.distributed.device, dtype=torch.uint8, non_blocking=True)
+        #            for img in images
+        #        ]
+        #        for images in batch_images
+        #    ]
+        #    kwargs[VisionEncoderKwargs.image_positions] = (
+        #        batch.image_positions
+        #        if batch.image_positions is not None
+        #        else [[]] * kwargs[AttentionKwargs.micro_batch_size]
+        #    )
+        #    kwargs[LanguageModelKwargs.tokens] = tokens
+        # image_patches = kwargs.get(VisionEncoderKwargs.image_patches, None)
+        # if image_patches is not None:
+        #     preprocessed.append((image_patches, kwargs))
+        # else:
+        #     preprocessed.append((tokens, kwargs))
+        pass
+
+
+class MultiModalModel[ConfigType: MultiModalModelConfig](GPTModel[ConfigType]):
+    # TODO: Can we drop class?
+    pass
+
+
+class MultiModalInferenceRunner(InferenceRunner):
+    model_class: typing.ClassVar[type[MultiModalModel]] = MultiModalModel
+    batch_config_class: typing.ClassVar[type[MultiModalBatchConfig]] = MultiModalBatchConfig
diff --git a/fast_llm/models/multimodal/trainer.py b/fast_llm/models/multimodal/trainer.py
new file mode 100644
index 000000000..c4071aafe
--- /dev/null
+++ b/fast_llm/models/multimodal/trainer.py
@@ -0,0 +1,14 @@
+import logging
+
+from fast_llm.models.gpt.trainer import GPTTrainer
+from fast_llm.models.multimodal.config import MultiModalTrainerConfig
+
+logger = logging.getLogger(__name__)
+
+
+class MultiModalTrainer[ConfigType: MultiModalTrainerConfig](GPTTrainer[ConfigType]):
+    def _get_data(self) -> MultiModalData:
+        return MultiModalData(
+            config=self._config.data,
+            distributed_config=self._config.model.distributed,
+        )
diff --git a/setup.cfg b/setup.cfg
index 77073ab55..2a1614554 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -43,7 +43,7 @@ OPTIONAL =
 
 # Huggingface tools
 HUGGINGFACE =
-    transformers>=4.52.4
+    transformers==4.53.2
     hf-transfer>=0.1.9
     datasets>=3.6.0
     huggingface-hub>=0.32.6
@@ -59,6 +59,13 @@ GENERATION =
     lm_eval>=0.4.9
 
 
+# Required for supporting vision inputs
+VISION =
+    # Vision Tools
+    webp>=0.4.0
+    pillow-simd>=9.5.0
+    torchvision>=0.20.0
+
 DEV =
     # Pre-commit git hook
     pre-commit>=4.2.0

From 10938de41007deabc96a59494d8a8f8631ce8e95 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 6 Nov 2025 20:44:56 -0500
Subject: [PATCH 02/16] misc

---
 fast_llm/engine/base_model/base_model.py      |  59 +++++--
 fast_llm/layers/attention/attention.py        |  27 ++--
 fast_llm/layers/block/config.py               |   1 +
 fast_llm/layers/block/sequence.py             |   8 +-
 fast_llm/layers/decoder/block.py              |   6 +-
 fast_llm/layers/decoder/config.py             |  21 +++
 .../layers/decoder/mlp/mixture_of_experts.py  |   2 +
 fast_llm/layers/decoder/mlp/mlp.py            |   6 +-
 fast_llm/layers/language_model/config.py      |   2 +
 fast_llm/layers/language_model/embedding.py   |  21 ++-
 .../layers/language_model/language_model.py   |  10 +-
 .../language_model/multi_token_prediction.py  |   4 +-
 fast_llm/layers/vision/config.py              |  93 +++--------
 fast_llm/layers/vision/patch_convolution.py   |  10 +-
 fast_llm/layers/vision/vision_encoder.py      |  76 +++++++--
 fast_llm/models/gpt/model.py                  |   2 +-
 fast_llm/models/multimodal/config.py          |   9 +-
 fast_llm/models/multimodal/model.py           | 153 ++++++++----------
 18 files changed, 274 insertions(+), 236 deletions(-)

diff --git a/fast_llm/engine/base_model/base_model.py b/fast_llm/engine/base_model/base_model.py
index 5df59d4cd..9d8dc6c35 100644
--- a/fast_llm/engine/base_model/base_model.py
+++ b/fast_llm/engine/base_model/base_model.py
@@ -1,4 +1,5 @@
 import abc
+import functools
 import typing
 
 import torch.nn
@@ -52,10 +53,15 @@ def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
                 losses += layer.get_loss_definitions(count)
         return losses
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         for layer in self.get_layers():
             if layer is not self:
-                layer.preprocess(batch, kwargs)
+                layer.preprocess(kwargs)
+
+    def unwrap(self) -> "LayerBase":
+        # Get the actual module contained in this layer,
+        # undoing any wrapping for the Fast-LLM engine (ex. `LayerBaseWithNamespace`)
+        return self
 
 
 class Layer(LayerBase):
@@ -74,19 +80,17 @@ def forward(
         pass
 
     def unwrap(self) -> "Layer":
-        # Get the actual module contained in this layer,
-        # undoing any wrapping for the Fast-LLM engine (ex. `LayerWithNamespace`)
         return self
 
 
-class LayerWithNamespace(Layer):
+class LayerBaseWithNamespace(LayerBase):
     """
-    A layer with its own namespace for preprocessing (kwargs),
+    A layer base with its own namespace for preprocessing (kwargs),
      so that it doesn't inadvertently interact with other layers.
     TODO: Consider namespace for losses and metrics?
     """
 
-    def __init__(self, layer: Layer, namespace: str = None):
+    def __init__(self, layer: LayerBase, namespace: str = None):
         super().__init__(layer._distributed_config)
         self._layer = layer
         self._namespace = namespace
@@ -98,6 +102,42 @@ def setup(self, distributed: Distributed) -> None:
         self._layer.setup(distributed)
         super().setup(distributed)
 
+    def get_layers(self) -> list["Layer"]:
+        """
+        Wrap individual layers so the namespace is used in forward.
+        """
+        return self._layers_with_namespace
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        """
+        Preprocess with namespace.
+        """
+        if self._namespace not in kwargs:
+            kwargs[self._namespace] = kwargs.copy()
+        self._layer.preprocess(kwargs[self._namespace])
+
+    def unwrap(self) -> "LayerBase":
+        return self._layer.unwrap()
+
+    @functools.cached_property
+    def _layers_with_namespace(self) -> list[Layer]:
+        # This needs to be in a property because `module_name` is set after `__init__`.
+        # Wrap each set of blocks with identical config in a namespace
+        # using the unique module name of the first such block.
+        return [LayerWithNamespace(layer, self._namespace) for layer in self._layer.get_layers()]
+
+
+class LayerWithNamespace(LayerBaseWithNamespace, Layer):
+    _layer: Layer
+
+    def __init__(self, layer: Layer, namespace: str = None):
+        super().__init__(layer, namespace)
+        self.layer_count = self._layer.layer_count
+
+    def get_layers(self) -> list["Layer"]:
+        # Need to override since `LayerBaseWithNamespace.get_layers` comes first in the MRO.
+        return [self]
+
     def forward(
         self, input_: torch.Tensor, kwargs: dict, losses: dict | None = None, metrics: dict | None = None
     ) -> torch.Tensor:
@@ -109,11 +149,6 @@ def forward(
             assert isinstance(input_, TensorMeta)
         return self._layer.forward(input_, kwargs, losses, metrics)
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
-        assert self._namespace not in kwargs
-        kwargs[self._namespace] = kwargs.copy()
-        self._layer.preprocess(batch, kwargs[self._namespace])
-
     def unwrap(self) -> "Layer":
         return self._layer.unwrap()
 
diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index ffbe9955e..48ec15ea7 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -443,14 +443,14 @@ def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], c
             )
         )
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        self._rotary.preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._rotary.preprocess(kwargs)
         if self._implementation == AttentionImplementation.backup:
-            self._preprocess_for_backup_attention(batch, kwargs)
+            self._preprocess_for_backup_attention(kwargs)
         elif self._implementation == AttentionImplementation.flash:
-            self._preprocess_for_flash_attention(batch, kwargs)
+            self._preprocess_for_flash_attention(kwargs)
 
-    def _preprocess_for_backup_attention(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> None:
         if (
             sequence_length := kwargs[AttentionKwargs.sequence_length]
         ) > self._backup_attention_tensor_cache_max_sequence_length:
@@ -460,7 +460,7 @@ def _preprocess_for_backup_attention(self, batch: torch.Tensor, kwargs: dict[str
             self._backup_attention_mask = torch.ones(
                 (sequence_length, sequence_length),
                 dtype=torch.bool,
-                device=batch.device,
+                device=self._distributed.device,
             ).tril_()
 
             if self._config.window_size is not None:
@@ -469,9 +469,8 @@ def _preprocess_for_backup_attention(self, batch: torch.Tensor, kwargs: dict[str
                 [],
                 torch.finfo(self._distributed_config.compute_dtype.torch).min,
                 dtype=self._distributed_config.compute_dtype.torch,
-                device=batch.device,
+                device=self._distributed.device,
             )
-
         sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
         sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
         kwargs[AttentionKwargs.attention_mask] = self._backup_attention_mask[
@@ -484,14 +483,14 @@ def _preprocess_for_backup_attention(self, batch: torch.Tensor, kwargs: dict[str
                     for sample_lens in kwargs[AttentionKwargs.sequence_lengths]
                 ]
             )
-            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(batch.device)
+            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(self._distributed.device)
             kwargs[AttentionKwargs.attention_mask] = (
                 kwargs[AttentionKwargs.attention_mask]
                 & document_mask[:, None, sequence_k - sequence_q : sequence_k, None, :sequence_k]
             )
         kwargs[AttentionKwargs.attention_mask_value] = self._backup_attention_mask_value
 
-    def _preprocess_for_flash_attention(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def _preprocess_for_flash_attention(self, kwargs: dict[str, typing.Any]) -> None:
         """
         Prepares cu_seqlens_q and cu_seqlens_k for flash_attn_varlen_func:
         https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_interface.py#L1375
@@ -542,14 +541,14 @@ def _preprocess_for_flash_attention(self, batch: torch.Tensor, kwargs: dict[str,
             seqlens_k = torch.cat(sequence_lengths)
         kwargs[AttentionKwargs.cu_seqlens_q] = torch.cat(
             (
-                torch.zeros(1, dtype=torch.int32, device=batch.device),
-                torch.cumsum(seqlens_q, dim=0, dtype=torch.int32).to(batch.device),
+                torch.zeros(1, dtype=torch.int32, device=self._distributed.device),
+                torch.cumsum(seqlens_q, dim=0, dtype=torch.int32).to(self._distributed.device),
             )
         )
         kwargs[AttentionKwargs.cu_seqlens_k] = torch.cat(
             (
-                torch.zeros(1, dtype=torch.int32, device=batch.device),
-                torch.cumsum(seqlens_k, dim=0, dtype=torch.int32).to(batch.device),
+                torch.zeros(1, dtype=torch.int32, device=self._distributed.device),
+                torch.cumsum(seqlens_k, dim=0, dtype=torch.int32).to(self._distributed.device),
             )
         )
         kwargs[AttentionKwargs.max_seqlen_q] = seqlens_q.max()
diff --git a/fast_llm/layers/block/config.py b/fast_llm/layers/block/config.py
index f3e93edeb..fda873e9a 100644
--- a/fast_llm/layers/block/config.py
+++ b/fast_llm/layers/block/config.py
@@ -71,6 +71,7 @@ def get_layer(
         self,
         distributed_config: DistributedConfig,
         hidden_dim: TensorDim,
+        *,
         lr_scale: float | None,
         peft: PeftConfig | None,
     ) -> "BlockBase":
diff --git a/fast_llm/layers/block/sequence.py b/fast_llm/layers/block/sequence.py
index 530df950e..54a5b3471 100644
--- a/fast_llm/layers/block/sequence.py
+++ b/fast_llm/layers/block/sequence.py
@@ -55,8 +55,8 @@ def _layers_with_namespace(self) -> list[Layer]:
     def get_layers(self) -> list["Layer"]:
         return self._layers_with_namespace
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
-        self._layers_with_namespace[0].preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._layers_with_namespace[0].preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         return (
@@ -109,9 +109,9 @@ def _layers_with_namespace(self) -> list[Layer]:
     def get_layers(self) -> list[Layer]:
         return self._layers_with_namespace
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         for _, index in self._config.preprocessing_layers.items():
-            self._layers_with_namespace[index].preprocess(batch, kwargs)
+            self._layers_with_namespace[index].preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         # TODO: Prevent name conflicts.
diff --git a/fast_llm/layers/decoder/block.py b/fast_llm/layers/decoder/block.py
index 8b19db66a..2b71e1cec 100644
--- a/fast_llm/layers/decoder/block.py
+++ b/fast_llm/layers/decoder/block.py
@@ -175,9 +175,9 @@ def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], c
             )
         )
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        self.mixer.preprocess(batch, kwargs)
-        self.mlp.preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self.mixer.preprocess(kwargs)
+        self.mlp.preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         return self.mixer.get_loss_definitions(count=count) + self.mlp.get_loss_definitions(count=count)
diff --git a/fast_llm/layers/decoder/config.py b/fast_llm/layers/decoder/config.py
index 403b204c8..062388535 100644
--- a/fast_llm/layers/decoder/config.py
+++ b/fast_llm/layers/decoder/config.py
@@ -27,6 +27,7 @@ def get_layer(
         self,
         distributed_config: DistributedConfig,
         hidden_dim: TensorDim,
+        *,
         lr_scale: float | None,
         peft: PeftConfig | None,
         return_bias: bool = False,
@@ -45,6 +46,26 @@ def get_layer(
 class MLPBaseConfig(BlockWithBiasConfig):
     _abstract = True
 
+    def get_layer(
+        self,
+        distributed_config: DistributedConfig,
+        hidden_dim: TensorDim,
+        *,
+        output_dim: TensorDim | None = None,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+        return_bias: bool = False,
+    ) -> "BlockWithBias":
+        return self.layer_class(
+            self,
+            distributed_config,
+            hidden_dim=hidden_dim,
+            output_dim=output_dim,
+            lr_scale=combine_lr_scales(lr_scale, self.lr_scale),
+            peft=peft,
+            return_bias=return_bias,
+        )
+
     @classmethod
     def _from_dict(cls, default: dict[str, typing.Any], strict: bool = True) -> typing.Self:
         if cls is MLPBaseConfig and cls.get_subclass(default.get("type")) is None:
diff --git a/fast_llm/layers/decoder/mlp/mixture_of_experts.py b/fast_llm/layers/decoder/mlp/mixture_of_experts.py
index ffc9eadba..0b85025e0 100644
--- a/fast_llm/layers/decoder/mlp/mixture_of_experts.py
+++ b/fast_llm/layers/decoder/mlp/mixture_of_experts.py
@@ -88,6 +88,8 @@ def _get_intermediate_dims(self) -> tuple[TensorDim, TensorDim]:
     def _forward(
         self, input_: torch.Tensor, kwargs: dict, losses: dict | None = None, metrics: dict | None = None
     ) -> tuple[torch.Tensor, None]:
+        if isinstance(input_, TensorMeta):
+            return TensorMeta.from_dims(input_.dims[:-1] + (self._output_dim,), "MLP output"), None
         hidden_states = input_.flatten(0, -2)
         logits = self.router(hidden_states)
         if self._debug.enabled:
diff --git a/fast_llm/layers/decoder/mlp/mlp.py b/fast_llm/layers/decoder/mlp/mlp.py
index aaea94adb..f5decbf17 100644
--- a/fast_llm/layers/decoder/mlp/mlp.py
+++ b/fast_llm/layers/decoder/mlp/mlp.py
@@ -26,6 +26,7 @@ def __init__(
         *,
         # TODO: Review `hidden_dim` and `block_index`
         hidden_dim: TensorDim,
+        output_dim: TensorDim | None = None,
         lr_scale: float | None,
         peft: PeftConfig | None,
         return_bias: bool = True,
@@ -38,6 +39,7 @@ def __init__(
             peft=peft,
             return_bias=return_bias,
         )
+        self._output_dim = self._hidden_dim if output_dim is None else output_dim
         self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
         intermediate_1_dim, self._intermediate_2_dim = self._get_intermediate_dims()
 
@@ -55,7 +57,7 @@ def __init__(
         )
         self.layer_2 = self._config.layer_2.get_layer(
             self._intermediate_2_dim,
-            hidden_dim,
+            self._output_dim,
             default_weight_initialization=init_normal_(std=self._hidden_size**-0.5),
             default_add_bias=self._config.add_linear_biases,
             sequence_parallel=self._sequence_parallel,
@@ -111,6 +113,8 @@ def _forward(
         losses: dict[str, typing.Any] | None = None,
         metrics: dict[str, typing.Any] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if isinstance(input_, TensorMeta):
+            return TensorMeta.from_dims(input_.dims[:-1] + (self._output_dim,), "MLP output"), None
         return (
             mlp_autograd(
                 input_,
diff --git a/fast_llm/layers/language_model/config.py b/fast_llm/layers/language_model/config.py
index 18c64acc4..832808eaf 100644
--- a/fast_llm/layers/language_model/config.py
+++ b/fast_llm/layers/language_model/config.py
@@ -20,7 +20,9 @@
 
 
 class LanguageModelKwargs(BlockKwargs):
+    token_ids = "token_ids"
     position_ids = "position_ids"
+    embedding_map = "embedding_map"
     # TODO: These are generic
     labels = "labels"
     phase = "phase"
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index b9d209274..19fa211bd 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -80,7 +80,7 @@ def __init__(
     @torch.compile
     def _forward(
         self,
-        input_: torch.Tensor,
+        input_: torch.Tensor | None,
         token_ids: torch.Tensor,
         position_ids: torch.Tensor | None,
         mask_inputs: bool,
@@ -153,24 +153,33 @@ def forward(
                 tensor_name="Embedding output",
                 dtype=self._residual_dtype,
             )
+        if (embedding_map := kwargs.get(LanguageModelKwargs.embedding_map)) is None:
+            # Language model: input_ contains token ids.
+            token_ids = input_
+            input_ = None
+        else:
+            # Multimodal case: input_ contains encoder output, token ids stores in kwargs.
+            # TODO: Support multiple encoders.
+            # TODO: Support pipeline-parallel.
+            token_ids = kwargs.get(LanguageModelKwargs.token_ids)
 
         return self._forward(
             input_,
-            kwargs.get(LanguageModelKwargs.token_ids),
+            token_ids,
             kwargs.get(LanguageModelKwargs.position_ids),
             # TODO ====== Vision ====== Review input masking.
             kwargs.get(LanguageModelKwargs.mask_inputs),
-            kwargs.get(LanguageModelKwargs.embedding_map),
+            embedding_map,
         )
 
     def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
         # TODO: Add marginal compute? (embeddings)
         return 0
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         if not self._config.position_embeddings.enabled:
             return
-        self._create_position_embeddings(kwargs[LanguageModelKwargs.sequence_length], batch.device)
+        self._create_position_embeddings(kwargs[LanguageModelKwargs.sequence_length], self._distributed.device)
         sequence_k = kwargs[LanguageModelKwargs.sequence_k_dim].size
         sequence_q = kwargs[LanguageModelKwargs.sequence_q_dim].size
         if not self._config.cross_document_position_embeddings:
@@ -179,7 +188,7 @@ def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None
                     torch.cat([torch.arange(x) for x in sample_lens])
                     for sample_lens in kwargs[LanguageModelKwargs.sequence_lengths]
                 ]
-            ).to(batch.device, dtype=torch.int64)
+            ).to(self._distributed.device, dtype=torch.int64)
             position_ids = position_ids[:, sequence_k - sequence_q : sequence_k]
             if kwargs[LanguageModelKwargs.sequence_first]:
                 position_ids = position_ids.transpose(0, 1)
diff --git a/fast_llm/layers/language_model/language_model.py b/fast_llm/layers/language_model/language_model.py
index 2e46bb57a..385bab7ef 100644
--- a/fast_llm/layers/language_model/language_model.py
+++ b/fast_llm/layers/language_model/language_model.py
@@ -1,8 +1,6 @@
 import logging
 import typing
 
-import torch
-
 from fast_llm.engine.base_model.base_model import Layer
 from fast_llm.engine.base_model.config import LossDef
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
@@ -58,11 +56,11 @@ def __init__(
     def get_layers(self) -> list[Layer]:
         return self.embeddings.get_layers() + self.decoder.get_layers() + self.head.get_layers()
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
-        self.embeddings.preprocess(batch, kwargs)
-        self.decoder.preprocess(batch, kwargs)
-        self.head.preprocess(batch, kwargs)
+        self.embeddings.preprocess(kwargs)
+        self.decoder.preprocess(kwargs)
+        self.head.preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
diff --git a/fast_llm/layers/language_model/multi_token_prediction.py b/fast_llm/layers/language_model/multi_token_prediction.py
index e0eb8175d..ad3395a0f 100644
--- a/fast_llm/layers/language_model/multi_token_prediction.py
+++ b/fast_llm/layers/language_model/multi_token_prediction.py
@@ -83,8 +83,8 @@ def get_layers(self) -> list[Layer]:
     def get_output_weights(self) -> list[torch.Tensor]:
         return sum((head.get_output_weights() for head in self.heads), [])
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
-        self._layers_with_namespace[0].preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._layers_with_namespace[0].preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         return self.blocks[0].get_loss_definitions(count=count * self._config.prediction_heads) + [
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
index 1af986eef..1762953cd 100644
--- a/fast_llm/layers/vision/config.py
+++ b/fast_llm/layers/vision/config.py
@@ -5,10 +5,11 @@
 from fast_llm.layers.common.linear.config import Convolution2DConfig
 from fast_llm.layers.common.normalization.config import NormalizationConfig
 from fast_llm.layers.decoder.config import MLPBaseConfig
+from fast_llm.layers.language_model.config import LanguageModelConfig
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
-    from fast_llm.layers.vision.vision_encoder import VisionEncoder
+    from fast_llm.layers.vision.vision_encoder import VisionEncoder, VisionMultiModalModel
 
 
 @config_class()
@@ -61,9 +62,14 @@ class PatchConvolutionConfig(BlockConfig):
         desc="Configuration for the normalization layer.",
         hint=FieldHint.architecture,
     )
-    patch_size: int = Field(
+    patch_height: int = Field(
         default=16,
-        desc="Size of image patches, in pixels (width and height).",
+        desc="Height of image patches, in pixels.",
+        hint=FieldHint.core,
+    )
+    patch_width: int = Field(
+        default=16,
+        desc="Width of image patches, in pixels.",
         hint=FieldHint.core,
     )
     input_channels: int = Field(
@@ -84,8 +90,7 @@ class VisionEncoderConfig(BlockConfig):
         desc="Configuration for the adapter layer.",
         hint=FieldHint.architecture,
     )
-    # TODO: ====== Appropriate name?? ======
-    decoder: BlockSequenceConfig = Field(
+    encoder: BlockSequenceConfig = Field(
         desc="Configuration for the vision decoder.",
         hint=FieldHint.architecture,
     )
@@ -102,68 +107,16 @@ def layer_class(self) -> "type[VisionEncoder]":
 
         return VisionEncoder
 
-    # transformer: TransformerConfig = Field(
-    #    desc="Configuration for the vision transformer architecture.",
-    #    hint=FieldHint.core,
-    # )
-    # patch_size: int = Field(
-    #    default=16,
-    #    desc="Patch size for the image encoder.",
-    #    hint=FieldHint.core,
-    # )
-    # conv_bias: bool = Field(
-    #    default=False,
-    #    desc="Whether to use bias in the convolutional layer.",
-    #    hint=FieldHint.optional,
-    # )
-    # patch_norm: NormalizationConfig = Field(
-    #    desc="Configuration for the normalization layers applied to the image patches.",
-    #    hint=FieldHint.optional,
-    # )
-    # adapter_size: int = Field(
-    #    default=5120,
-    #    desc="Intermediate size for the adapter linear layers. Assuming 2 linear layers",
-    #    hint=FieldHint.core,
-    # )
-    # adapter_activation_type: ActivationType = Field(
-    #    default=ActivationType.gelu,
-    #    desc="The intermediate activation type for multi-modal adapter. Default: GeLU.",
-    #    hint=FieldHint.core,
-    # )
-    # adapter_bias: bool = Field(
-    #    default=True,
-    #    desc="Whether to use bias in the adapter linear layer.",
-    #    hint=FieldHint.optional,
-    # )
-    # image_normalization: ImageNormalizationConfig = Field(
-    #    desc="Configuration for the normalization layers applied to the image patches.",
-    #    hint=FieldHint.optional,
-    # )
-    # image_break_token: int | None = Field(
-    #    default=None,
-    #    desc="Token id to separate image rows. If None, no token id is applied.",
-    #    hint=FieldHint.optional,
-    # )
-    # image_end_token: int | None = Field(
-    #    default=None,
-    #    desc="Token id to indicate the end of an image. If None, no token id is applied.",
-    #    hint=FieldHint.optional,
-    # )
-    # adapter_lr_scale: float | None = Field(
-    #    default=None,
-    #    desc="Custom learning rate scale for the adapter weights.",
-    #    hint=FieldHint.feature,
-    #    valid=skip_valid_if_none(check_field(Assert.geq, 0)),
-    # )
-    # conv_lr_scale: float | None = Field(
-    #    default=None,
-    #    desc="Custom learning rate scale for the convolutional layer weights.",
-    #    hint=FieldHint.feature,
-    #    valid=skip_valid_if_none(check_field(Assert.geq, 0)),
-    # )
-    # adapter_init_method_std: float = Field(
-    #    default=None,
-    #    desc="Standard deviation for the normal initialization of the adapter weights. Default: adapter_size ** -0.5.",
-    #    hint=FieldHint.optional,
-    #    valid=check_field(Assert.geq, 0),
-    # )
+
+@config_class()
+class VisionMultiModalModelConfig(LanguageModelConfig):
+    vision_encoder: VisionEncoderConfig = Field(
+        hint=FieldHint.architecture,
+        desc="Configuration for the vision encoder.",
+    )
+
+    @property
+    def layer_class(self) -> "type[VisionMultiModalModel]":
+        from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
+
+        return VisionMultiModalModel
diff --git a/fast_llm/layers/vision/patch_convolution.py b/fast_llm/layers/vision/patch_convolution.py
index 46cf86708..1055f33cc 100644
--- a/fast_llm/layers/vision/patch_convolution.py
+++ b/fast_llm/layers/vision/patch_convolution.py
@@ -32,16 +32,14 @@ def __init__(
             lr_scale=lr_scale,
             peft=peft,
         )
-        input_dim = TensorDim("input_channels", self._config.input_channels)
-        patch_dim = TensorDim("patch", self._config.patch_size)
         self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
 
         self.convolution = self._config.convolution.get_layer(
             self._hidden_dim,
-            input_dim,
-            patch_dim,
-            patch_dim,
-            stride=(self._config.patch_size, self._config.patch_size),
+            TensorDim("input_channels", self._config.input_channels),
+            TensorDim("patch_height", self._config.patch_height),
+            TensorDim("patch_width", self._config.patch_width),
+            stride=(self._config.patch_height, self._config.patch_width),
             default_add_bias=False,
             lr_scale=self._lr_scale,
             peft=self._peft,
diff --git a/fast_llm/layers/vision/vision_encoder.py b/fast_llm/layers/vision/vision_encoder.py
index b4fa189d5..fab5c5a65 100644
--- a/fast_llm/layers/vision/vision_encoder.py
+++ b/fast_llm/layers/vision/vision_encoder.py
@@ -1,15 +1,15 @@
+import functools
 import logging
 import typing
 
-import torch
-
-from fast_llm.engine.base_model.base_model import Layer
+from fast_llm.engine.base_model.base_model import Layer, LayerBaseWithNamespace
 from fast_llm.engine.base_model.config import LossDef
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
 from fast_llm.engine.distributed.config import DistributedConfig
 from fast_llm.layers.block.block import BlockBase
 from fast_llm.layers.common.peft.config import PeftConfig
-from fast_llm.layers.vision.config import VisionEncoderConfig
+from fast_llm.layers.language_model.language_model import LanguageModel
+from fast_llm.layers.vision.config import VisionEncoderConfig, VisionMultiModalModelConfig
 
 logger = logging.getLogger(__name__)
 
@@ -34,34 +34,80 @@ def __init__(
             lr_scale=self._lr_scale,
             peft=self._peft,
         )
-        # TODO: ====== Appropriate name?? ======
-        self.decoder = self._config.decoder.get_layer(
+        self.encoder = self._config.encoder.get_layer(
             distributed_config,
             vision_hidden_dim,
             lr_scale=self._lr_scale,
             peft=self._peft,
         )
-        # TODO: ====== Hidden dim ======
         self.adapter = self._config.adapter.get_layer(
             distributed_config,
             vision_hidden_dim,
+            output_dim=self._hidden_dim,
             lr_scale=self._lr_scale,
             peft=self._peft,
         )
 
     def get_layers(self) -> list["Layer"]:
-        return self.patch_convolution.get_layers() + self.decoder.get_layers() + self.adapter.get_layers()
+        return self.patch_convolution.get_layers() + self.encoder.get_layers() + self.adapter.get_layers()
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
-        self.patch_convolution.preprocess(batch, kwargs)
-        self.decoder.preprocess(batch, kwargs)
-        self.adapter.preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        # Needed because the base class uses `get_layers` which may bypass the decoder. TODO: Avoidable?
+        self.patch_convolution.preprocess(kwargs)
+        self.encoder.preprocess(kwargs)
+        self.adapter.preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
-        # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
+        # Needed because the base class uses `get_layers` which may bypass the decoder. TODO: Avoidable?
         return (
             self.patch_convolution.get_loss_definitions(count)
-            + self.decoder.get_loss_definitions(count)
+            + self.encoder.get_loss_definitions(count)
             + self.adapter.get_loss_definitions(count)
         )
+
+
+class VisionMultiModalModel[ConfigType: VisionMultiModalModelConfig](LanguageModel[ConfigType]):
+
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+        *,
+        # TODO: Unused, but required by the `BlockBase` interface.
+        hidden_dim: TensorDim | None = None,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ):
+        super().__init__(
+            config,
+            distributed_config,
+            hidden_dim=TensorDim("hidden", config.hidden_size),
+            lr_scale=lr_scale,
+            peft=peft,
+        )
+        self.vision_encoder = self._config.vision_encoder.get_layer(
+            distributed_config,
+            hidden_dim=self._hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+
+    def get_layers(self) -> list[Layer]:
+        return self._vision_encoder_with_namespace.get_layers() + super().get_layers()
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._vision_encoder_with_namespace.preprocess(kwargs)
+        super().preprocess(kwargs)
+
+    def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
+        return self.vision_encoder.get_loss_definitions(count) + super().get_loss_definitions(count)
+
+    @functools.cached_property
+    def _vision_encoder_namespace(self) -> str:
+        return self.vision_encoder.module_name
+
+    @functools.cached_property
+    def _vision_encoder_with_namespace(self) -> LayerBaseWithNamespace:
+        return LayerBaseWithNamespace(self.vision_encoder, self._vision_encoder_namespace)
diff --git a/fast_llm/models/gpt/model.py b/fast_llm/models/gpt/model.py
index 3295295f6..65be1eb4b 100644
--- a/fast_llm/models/gpt/model.py
+++ b/fast_llm/models/gpt/model.py
@@ -235,7 +235,7 @@ def preprocess_batch(
                 if kwargs[AttentionKwargs.sequence_first]
                 else cropped_tokens.tokens
             ).contiguous()
-            self.preprocess(tokens, kwargs)
+            self.preprocess(kwargs)
             preprocessed.append((tokens, kwargs))
 
         return preprocessed
diff --git a/fast_llm/models/multimodal/config.py b/fast_llm/models/multimodal/config.py
index 2415734e4..1ce8cea33 100644
--- a/fast_llm/models/multimodal/config.py
+++ b/fast_llm/models/multimodal/config.py
@@ -6,7 +6,7 @@
 from fast_llm.engine.config_utils.runnable import RunnableConfig
 from fast_llm.engine.multi_stage.config import FastLLMModelConfig
 from fast_llm.engine.training.config import TrainerConfig
-from fast_llm.layers.vision.config import VisionEncoderConfig
+from fast_llm.layers.vision.config import VisionEncoderConfig, VisionMultiModalModelConfig
 from fast_llm.models.gpt.config import (
     GPTBaseModelConfig,
     GPTBatchConfig,
@@ -16,8 +16,7 @@
 )
 
 if typing.TYPE_CHECKING:
-    from fast_llm.models.multimodal.huggingface import HuggingfaceMultiModalModelForCausalLM
-    from fast_llm.models.multimodal.model import MultiModalBaseModel, MultiModalModel, MultiModalModelInferenceRunner
+    from fast_llm.models.multimodal.model import MultiModalBaseModel, MultiModalModel
     from fast_llm.models.multimodal.trainer import MultiModalTrainer
 
 logger = logging.getLogger(__name__)
@@ -29,7 +28,7 @@ class MultiModalBatchConfig(GPTBatchConfig):
 
 
 @config_class()
-class MultiModalBaseModelConfig(GPTBaseModelConfig):
+class MultiModalBaseModelConfig(VisionMultiModalModelConfig, GPTBaseModelConfig):
     vision_encoder: VisionEncoderConfig = Field(
         hint=FieldHint.architecture,
         desc="Configuration for the vision encoder.",
@@ -77,8 +76,6 @@ class PretrainedMultiModalModelConfig(PretrainedGPTModelConfig):
 
 @config_class(dynamic_type={RunnableConfig: "train_gpt", TrainerConfig: "gpt"})
 class MultiModalTrainerConfig(PretrainedMultiModalModelConfig, GPTTrainerConfig):
-    data: MultiModalDataConfig = FieldUpdate()
-    batch: MultiModalBatchConfig = FieldUpdate()
     # TODO: Use dynamic model type?
     reference_models: dict[str, PretrainedMultiModalModelConfig] = FieldUpdate()
 
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
index 7426191f7..bc9b5adea 100644
--- a/fast_llm/models/multimodal/model.py
+++ b/fast_llm/models/multimodal/model.py
@@ -3,9 +3,12 @@
 
 import torch
 
-from fast_llm.data.data.gpt.data import GPTBatch
-from fast_llm.engine.distributed.config import DistributedConfig, PhaseType
+from fast_llm.data.sample.language_model import LanguageModelBatch
+from fast_llm.engine.config_utils.tensor_dim import ConcatenatedTensorDim, TensorDim, scalar_dim
+from fast_llm.engine.distributed.config import PhaseType
 from fast_llm.engine.inference.runner import InferenceRunner
+from fast_llm.layers.language_model.config import LanguageModelKwargs
+from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
 from fast_llm.models.gpt.config import GPTBatchConfig
 from fast_llm.models.gpt.model import GPTBaseModel, GPTModel
 from fast_llm.models.multimodal.config import MultiModalBaseModelConfig, MultiModalBatchConfig, MultiModalModelConfig
@@ -14,112 +17,82 @@
 logger = logging.getLogger(__name__)
 
 
-class MultiModalBaseModel[ConfigType: MultiModalBaseModelConfig](GPTBaseModel[ConfigType]):
+class MultiModalBaseModel[ConfigType: MultiModalBaseModelConfig](
+    VisionMultiModalModel[ConfigType], GPTBaseModel[ConfigType]
+):
     """
     A transformer-based language model generalizing the GPT model architecture.
     """
 
     _config: ConfigType
 
-    def __init__(
-        self,
-        config: ConfigType,
-        distributed_config: DistributedConfig,
-    ):
-        super().__init__(config, distributed_config)
-        self.vision_encoder = self._config.vision_encoder.get_layer(
-            distributed_config,
-            self._hidden_dim,
-            lr_scale=None,
-            peft=self._config.peft,
-        )
-
     def preprocess_meta(
         self, batch_meta: GPTBatchConfig | torch.Tensor, phase: PhaseType
     ) -> list[tuple[TensorMeta, dict]]:
-        # TODO Remove (Move batch splitting elsewhere)
-        # TODO: Use parallel/sequential dims, distinguish micro and full batch/sequence
-        # TODO ====== Vision ======
-        # if self._config.vision_encoder.enabled:
-        #    try:
-        #        max_image_size = batch_meta.max_image_size
-        #    except AttributeError:
-        #        max_image_size = 256
-        #        logger.warning("Inference mode: max_image_size not provided, defaulting to 256")
-        #    vision_kwargs = {
-        #        VisionEncoderKwargs.patch_size: self._config.vision_encoder.patch_size,
-        #        VisionEncoderKwargs.max_image_size: max_image_size,
-        #        VisionEncoderKwargs.rope_theta: self._config.vision_encoder.transformer.rotary.theta,
-        #        VisionEncoderKwargs.kv_channels: self._tensor_space[VisionTransformerDimNames.kv_channels].size,
-        #        VisionEncoderKwargs.out_channels: self._tensor_space[VisionEncoderDimNames.out_channels].size,
-        #    }
-        #    vision_hidden_dim = self._tensor_space[VisionTransformerDimNames.hidden]
-        #    vision_hidden_dims = (
-        #        (hidden_sequence_q_dim, batch_dim, vision_hidden_dim)
-        #        if sequence_first
-        #        else (batch_dim, hidden_sequence_q_dim, vision_hidden_dim)
-        #    )
-        #    vision_kwargs.update(
-        #        {
-        #            VisionTransformerKwargs.hidden_dims: vision_hidden_dims,
-        #        }
-        #    )
-        #    common_kwargs.update(vision_kwargs)
-
-        # TODO ====== Vision ======
-        # if self._config.vision_encoder.enabled:
-        #     # patch_dimensions are (batch * sequence_length) x 3 x patch_size x patch_size
-        #     preprocessed_meta.append((kwargs[VisionEncoderKwargs.image_patches_meta], kwargs))
-        # else:
-        #     preprocessed_meta.append((tokens, kwargs))
-        pass
+        preprocessed_meta = []
+        for tokens, kwargs in super().preprocess_meta(batch_meta, phase):
+            kwargs[LanguageModelKwargs.token_ids] = tokens
+            image_patches = TensorMeta.from_dims(
+                (
+                    # We combine the batch and sequence dims to allow for variable sequence lengths.
+                    # Gives the same result, assuming we disable cross-image attention (TODO: Enforce)
+                    scalar_dim,
+                    # TODO: Wrong (variable size).
+                    ConcatenatedTensorDim("image_sequence", tokens.dims),
+                    # TODO: Relate to tensor dims in patch convolution.
+                    TensorDim("input_channels", self._config.vision_encoder.patch_convolution.input_channels),
+                    TensorDim("patch_height", self._config.vision_encoder.patch_convolution.patch_height),
+                    TensorDim("patch_width", self._config.vision_encoder.patch_convolution.patch_width),
+                )
+            )
+            preprocessed_meta.append((image_patches, kwargs))
+
+        return preprocessed_meta
 
     def preprocess_batch(
         self,
-        batch: GPTBatch,
+        batch: LanguageModelBatch,
         preprocessed_meta: list[tuple[TensorMeta, dict]] | None = None,
         *,
         phase: PhaseType,
         iteration: int,
         metrics: dict | None = None,
     ) -> list[tuple[torch.Tensor, dict]]:
-        # TODO Move batch splitting elsewhere, align interface with LayerBase
-        # TODO ====== Vision ======
-        # if self._config.vision_encoder.enabled:
-        #    if self._config.vision_encoder.image_break_token is not None:
-        #        if not labels_cloned:
-        #            labels = labels.clone()
-        #            labels_cloned = True
-        #        labels = torch.where(labels == self._config.vision_encoder.image_break_token, -100, labels)
-        #    if self._config.vision_encoder.image_end_token is not None:
-        #        if not labels_cloned:
-        #            labels = labels.clone()
-        #            labels_cloned = True
-        #        labels = torch.where(labels == self._config.vision_encoder.image_end_token, -100, labels)
-        # Loss-masking for distillation losses
-        # TODO ====== Vision ======
-        # if self._config.vision_encoder.enabled:
-        #    batch_images = (
-        #        batch.images if batch.images is not None else [[]] * kwargs[AttentionKwargs.micro_batch_size]
-        #    )
-        #    kwargs[VisionEncoderKwargs.images] = [
-        #        [
-        #            img.to(device=self._tensor_space.distributed.device, dtype=torch.uint8, non_blocking=True)
-        #            for img in images
-        #        ]
-        #        for images in batch_images
-        #    ]
-        #    kwargs[VisionEncoderKwargs.image_positions] = (
-        #        batch.image_positions
-        #        if batch.image_positions is not None
-        #        else [[]] * kwargs[AttentionKwargs.micro_batch_size]
-        #    )
-        #    kwargs[LanguageModelKwargs.tokens] = tokens
-        # image_patches = kwargs.get(VisionEncoderKwargs.image_patches, None)
-        # if image_patches is not None:
-        #     preprocessed.append((image_patches, kwargs))
-        # else:
-        #     preprocessed.append((tokens, kwargs))
+        preprocessed = super().preprocess_batch(
+            batch, preprocessed_meta, phase=phase, iteration=iteration, metrics=metrics
+        )
+        # TODO: Support micro-sequences.
+        assert len(preprocessed) == 1, "Micro-sequences not supported for MultiModalModel."
+        tokens, kwargs = preprocessed[0]
+
+        kwargs[LanguageModelKwargs.token_ids] = tokens
+
+        kwargs[LanguageModelKwargs.embedding_map] = batch.image_patches.token_map
+
+        image_patches = batch.image_patches.patches
+        sequence_length = image_patches.size(0)
+        sequence_dim = TensorDim("image_sequence", sequence_length)
+
+        hidden_dims = (
+            (sequence_dim, scalar_dim, self.vision_encoder._hidden_dim)
+            if (sequence_first := kwargs[LanguageModelKwargs.sequence_first])
+            else (scalar_dim, sequence_dim, self.vision_encoder._hidden_dim)
+        )
+        kwargs[self._vision_encoder_namespace] = {
+            LanguageModelKwargs.sequence_first: sequence_first,
+            LanguageModelKwargs.position_ids: batch.image_patches.position_ids,
+            LanguageModelKwargs.sequence_lengths: batch.image_patches.lengths,
+            LanguageModelKwargs.sequence_length: sequence_length,
+            LanguageModelKwargs.sequence_k_dim: sequence_dim,
+            LanguageModelKwargs.sequence_q_dim: sequence_dim,
+            LanguageModelKwargs.hidden_dims: hidden_dims,
+        }
+        super().preprocess(kwargs)
+
+        return [(image_patches, kwargs)]
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        # Hack to delay preprocessing in super().preprocess_batch (TODO: Improve)
         pass
 
 

From e4f3f020d6f4131f092050e63c88b4f5bdff4ad8 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 6 Nov 2025 20:48:23 -0500
Subject: [PATCH 03/16] cleanup

---
 fast_llm/layers/vision/preprocessing.py | 194 ------------------------
 1 file changed, 194 deletions(-)
 delete mode 100644 fast_llm/layers/vision/preprocessing.py

diff --git a/fast_llm/layers/vision/preprocessing.py b/fast_llm/layers/vision/preprocessing.py
deleted file mode 100644
index 83331c739..000000000
--- a/fast_llm/layers/vision/preprocessing.py
+++ /dev/null
@@ -1,194 +0,0 @@
-import math
-import typing
-
-import torch
-import torchvision.transforms.v2 as torchvision_transforms
-
-from fast_llm.engine.distributed.distributed import Distributed
-from fast_llm.layers.attention.config import AttentionKwargs
-from fast_llm.layers.language_model.config import LanguageModelKwargs
-from fast_llm.layers.vision.config import ImageNormalizationConfig, VisionEncoderConfig
-from fast_llm.utils import div
-
-
-def get_num_patches(height: int, width: int, patch_size: int) -> int:
-    """
-    Calculate the number of patches in height and width dimensions.
-    """
-    return div(height, patch_size) * div(width, patch_size)
-
-
-def get_num_image_tokens(height: int, width: int, patch_size: int, image_break: bool, image_end: bool) -> int:
-    """
-    Calculate the number of image tokens.
-    If image_break is True, we consider 1 additional token after every row of patches.
-    """
-    height_patches = div(height, patch_size)
-    width_patches = div(width, patch_size)
-    num_tokens = height_patches * width_patches
-    if image_break:
-        num_tokens += height_patches
-    elif image_end:
-        num_tokens += 1
-    return num_tokens
-
-
-def get_resize_dims(height: int, width: int, max_height: int, max_width: int, patch_size: int) -> tuple[int, int]:
-    """
-    Calculate the new dimensions for resizing an image while maintaining the aspect ratio.
-    If the image is larger than the max dimensions, it will be resized to fit within them.
-    If the image is smaller, it will be resized to the nearest multiple of the patch size.
-    """
-    ratio = max(height / max_height, width / max_width)
-    if ratio > 1:
-        # Resize to fit within max dimensions
-        height = int(height / ratio)
-        width = int(width / ratio)
-    return patch_size * math.ceil(height / patch_size), patch_size * math.ceil(width / patch_size)
-
-
-def resize(image: torch.Tensor, target_height: int, target_width: int) -> torch.Tensor:
-    # cap the resizing to half of the current size as a workaround for large images
-    # See pytorch issue: https://github.com/pytorch/pytorch/issues/103589
-    while max(image.size(1) / target_height, image.size(2) / target_width) > 2:
-        image = torchvision_transforms.functional.resize(
-            image,
-            size=(math.ceil(image.size(1) / 2), math.ceil(image.size(2) / 2)),
-            interpolation=torchvision_transforms.InterpolationMode.BICUBIC,
-        )
-
-    # TODO: options for interpolation mode?
-    return torchvision_transforms.functional.resize(
-        image, size=(target_height, target_width), interpolation=torchvision_transforms.InterpolationMode.BICUBIC
-    )
-
-
-def position_ids_in_meshgrid(height, width, max_size, patch_size) -> torch.Tensor:
-    patch_height = height // patch_size
-    patch_width = width // patch_size
-    return torch.arange(patch_height).repeat_interleave(patch_width) * max_size + torch.arange(patch_width).repeat(
-        patch_height
-    )
-
-
-class VisionPreprocessor:
-    def __init__(self, config: VisionEncoderConfig, distributed: Distributed):
-        self._config = config
-        self._distributed = distributed
-
-    def preprocess(self, tokens, kwargs: dict[str, typing.Any]) -> None:
-        max_image_size = kwargs.get(VisionEncoderKwargs.max_image_size)
-        patch_size = self._config.patch_size
-        image_sizes = []
-
-        norm_config: ImageNormalizationConfig = kwargs["norm_config"]
-
-        if LanguageModelKwargs.labels in kwargs:
-            labels = kwargs[LanguageModelKwargs.labels]
-            if (self._config.image_break_token is not None) or (self._config.image_end_token is not None):
-                # If image break or end token is present, we need to replace image token ids to -100 in labels
-                # TODO: avoid double cloning labels in case of loss masking spans?
-                labels = labels.clone()
-        patches = []
-        patch_position_ids = []
-        sequence_lengths = [0]
-        max_sequence_length = -1
-
-        for sample_index, (sample_images_, positions) in enumerate(
-            zip(kwargs[VisionEncoderKwargs.images], kwargs.get(VisionEncoderKwargs.image_positions), strict=True)
-        ):
-            image_sizes.append(sample_image_sizes := [])
-
-            sample_sequence_length = 0
-
-            for image, position in zip(sample_images_, positions, strict=True):
-                height, width = get_resize_dims(
-                    image.size(1), image.size(2), max_image_size, max_image_size, patch_size=patch_size
-                )
-
-                sample_image_sizes.append((height, width))
-
-                image = resize(image, height, width)
-
-                # TODO: Normalize with constant dtype instead?
-                image = image.to(dtype=self._distributed.config.training_dtype.torch)
-
-                image = torchvision_transforms.functional.normalize(
-                    image / norm_config.rescale_factor,
-                    mean=[norm_config.mean_r, norm_config.mean_g, norm_config.mean_b],
-                    std=[norm_config.std_r, norm_config.std_g, norm_config.std_b],
-                )
-                patches.extend(
-                    torch.nn.functional.unfold(image, kernel_size=patch_size, stride=patch_size).T.reshape(
-                        -1, 3, patch_size, patch_size
-                    )
-                )
-
-                num_height_patches = div(height, patch_size)
-                num_width_patches = div(width, patch_size)
-                grid_height = torch.arange(num_height_patches).repeat_interleave(num_width_patches)
-                grid_width = torch.arange(num_width_patches).repeat(num_height_patches)
-                grid_height * div(max_image_size, patch_size) + grid_width
-                patch_position_ids.append(grid_height * div(max_image_size, patch_size) + grid_width)
-
-                if LanguageModelKwargs.labels in kwargs:
-                    num_tokens = get_num_image_tokens(
-                        height,
-                        width,
-                        patch_size=patch_size,
-                        image_break=self._config.image_break_token is not None,
-                        image_end=self._config.image_end_token is not None,
-                    )
-                    # set labels for image patches to -100
-                    labels[sample_index, max(position - 1, 0) : position + num_tokens - 1] = -100
-
-                sequence_lengths.append(sequence_length := num_height_patches * num_width_patches)
-                if sequence_length > max_sequence_length:
-                    max_sequence_length = sequence_length
-                sample_sequence_length += sequence_length
-
-            # TODO: No need for padding with varlen?
-            padding_size = kwargs[AttentionKwargs.sequence_length] - sample_sequence_length
-            if padding_size > max_sequence_length:
-                max_sequence_length = padding_size
-            sequence_lengths.append(padding_size)
-
-            patches.append(
-                torch.zeros(padding_size, 3, patch_size, patch_size).to(
-                    dtype=self._tensor_space.distributed_config.training_dtype.torch,
-                    device=self._tensor_space.distributed.device,
-                ),
-            )
-            patch_position_ids.append(torch.full((padding_size,), 0, dtype=torch.int64))
-
-        kwargs[VisionEncoderKwargs.image_sizes] = image_sizes
-        kwargs[VisionEncoderKwargs.image_patches] = torch.cat(patches).to(device=self._distributed.device)
-        kwargs[VisionTransformerKwargs.patch_position_ids] = torch.cat(patch_position_ids).to(
-            device=self._distributed.device
-        )
-        kwargs[VisionEncoderKwargs.max_image_tokens] = div(max_image_size**2, patch_size**2)
-        # sequence data parallel is not yet supported for images, so we use the same cu_seqlens for q and k
-        kwargs[VisionTransformerKwargs.cu_seqlens_q] = torch.tensor(
-            cu_seqlens, device=self._distributed.device, dtype=torch.int32
-        )
-        kwargs[VisionTransformerKwargs.cu_seqlens_k] = torch.tensor(
-            cu_seqlens, device=self._distributed.device, dtype=torch.int32
-        )
-        kwargs[VisionTransformerKwargs.max_seqlen_q] = max_sequence_length
-        kwargs[VisionTransformerKwargs.max_seqlen_k] = max_sequence_length
-        if LanguageModelKwargs.labels in kwargs:
-            kwargs[LanguageModelKwargs.labels] = labels
-
-        # TODO: add proper preprocessing for attention-mask when not using flash attention
-        # Following is just a dummy code to run the tests.
-        kwargs[self._config.transformer._transformer_kwargs.attention_mask] = torch.ones(
-            (1, 1, kwargs[AttentionKwargs.sequence_length], 1, kwargs[AttentionKwargs.sequence_length]),
-            dtype=torch.bool,
-            device=self._tensor_space.distributed.device,
-        )
-        kwargs[self._config.transformer._transformer_kwargs.attention_mask_value] = torch.full(
-            [],
-            torch.finfo(self._distributed.config.training_dtype.torch).min,
-            dtype=self._distributed.config.training_dtype.torch,
-            device=self._distributed.device,
-        )

From 9d2fe10eeb3768eab5eb9a9d428f3bdf1f83d7f5 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Fri, 7 Nov 2025 18:41:59 -0500
Subject: [PATCH 04/16] stuff

---
 fast_llm/data/preprocessing/image_patch.py | 36 +++++++++++++++++-----
 fast_llm/layers/vision/config.py           | 11 ++++---
 2 files changed, 34 insertions(+), 13 deletions(-)

diff --git a/fast_llm/data/preprocessing/image_patch.py b/fast_llm/data/preprocessing/image_patch.py
index 992324a60..072e27f18 100644
--- a/fast_llm/data/preprocessing/image_patch.py
+++ b/fast_llm/data/preprocessing/image_patch.py
@@ -1,10 +1,11 @@
+import functools
 import io
 import math
 import typing
 
 import numpy as np
 
-from fast_llm.config import Config, Field, FieldHint, config_class
+from fast_llm.config import Config, Field, FieldHint, check_field, config_class
 from fast_llm.engine.config_utils.data_type import DataType
 from fast_llm.utils import Assert, div
 
@@ -23,20 +24,22 @@ class ImagePatchConfig(Config):
         default=16,
         desc="Height of the image patches, in pixels.",
         hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
     )
     width: int = Field(
         default=16,
         desc="Height of the image patches, in pixels.",
         hint=FieldHint.core,
+        valid=check_field(Assert.gt, 0),
     )
-    max_image_height: int | None = Field(
-        default=None,
+    max_image_height: int = Field(
+        default=1024,
         desc="Maximum height of the complete image, in pixels."
         "If the original image is larger than this, it will be resized to this height.",
         hint=FieldHint.optional,
     )
-    max_image_width: int | None = Field(
-        default=None,
+    max_image_width: int = Field(
+        default=1024,
         desc="Maximum width of the complete image, in pixels."
         "If the original image is larger than this, it will be resized to this width.",
         hint=FieldHint.optional,
@@ -53,6 +56,24 @@ class ImagePatchConfig(Config):
         hint=FieldHint.optional,
     )
 
+    @property
+    def num_channels(self) -> int:
+        # assume 3 channels (RGB) for all images
+        return 3
+
+    @functools.cached_property
+    def max_patches_height(self) -> int:
+        return div(self.max_image_height, self.height)
+
+    @functools.cached_property
+    def max_patches_width(self) -> int:
+        return div(self.max_image_width, self.width)
+
+    def _validate(self):
+        super()._validate()
+        Assert.gt(self.max_patches_height, 0)
+        Assert.gt(self.max_patches_width, 0)
+
     def get_patches(
         self, images: list[bytes], token_data_type: DataType = DataType.int64
     ) -> tuple["torch.Tensor", "torch.Tensor", "torch.Tensor", list["torch.Tensor"], list[int]]:
@@ -72,7 +93,7 @@ def get_patches(
         else:
             # Return empty tensors of appropriate shapes and data types so we can concatenate with other documents.
             return (
-                torch.empty(0, 3, self.height, self.width, dtype=torch.uint8),
+                torch.empty(0, self.num_channels, self.height, self.width, dtype=torch.uint8),
                 torch.empty(0, dtype=torch.int64),
                 torch.empty(0, dtype=torch.int64),
                 [],
@@ -85,7 +106,6 @@ def _get_patches(
         import PIL.Image
         import torch
 
-        # assume 3 channels (RGB) for all images
         with PIL.Image.open(io.BytesIO(image_bytes)) as image:
             if image.mode != "RGB":
                 # Convert all images to RGB
@@ -99,7 +119,7 @@ def _get_patches(
         num_patches_width = div(image.size(2), self.width)
         # Convert to patches. (`torch.nn.functional.unfold` not supported for uint8.)
         patches = (
-            image.view(3, num_patches_height, self.height, num_patches_width, self.width)
+            image.view(self.num_channels, num_patches_height, self.height, num_patches_width, self.width)
             .permute(3, 1, 0, 2, 4)
             .flatten(0, 1)
         )
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
index 1762953cd..65083fa2c 100644
--- a/fast_llm/layers/vision/config.py
+++ b/fast_llm/layers/vision/config.py
@@ -1,3 +1,4 @@
+import functools
 import typing
 
 from fast_llm.config import Config, Field, FieldHint, check_field, config_class
@@ -72,11 +73,11 @@ class PatchConvolutionConfig(BlockConfig):
         desc="Width of image patches, in pixels.",
         hint=FieldHint.core,
     )
-    input_channels: int = Field(
-        default=3,
-        desc="Number of pixel channels (usually 3).",
-        hint=FieldHint.feature,
-    )
+
+    @functools.cached_property
+    def input_channels(self):
+        # Number of input channels. Currently hard-coded to 3 (RGB).
+        return 3
 
 
 @config_class(registry=True)

From 358abe16777b92de955b18e15cf11fca3739aa27 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Fri, 7 Nov 2025 22:11:06 -0500
Subject: [PATCH 05/16] rotary

---
 fast_llm/layers/attention/rotary/config.py | 11 +++-
 fast_llm/layers/attention/rotary/rotary.py | 65 ++++++++++------------
 fast_llm/layers/vision/config.py           | 15 +++--
 fast_llm/models/multimodal/config.py       | 15 ++---
 fast_llm/models/multimodal/model.py        | 15 ++---
 tests/utils/model_configs.py               | 34 ++++-------
 6 files changed, 74 insertions(+), 81 deletions(-)

diff --git a/fast_llm/layers/attention/rotary/config.py b/fast_llm/layers/attention/rotary/config.py
index 74b5cf21a..92adc880e 100644
--- a/fast_llm/layers/attention/rotary/config.py
+++ b/fast_llm/layers/attention/rotary/config.py
@@ -10,7 +10,14 @@
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
-    from fast_llm.layers.attention.rotary.rotary import DefaultRotary, Llama3Rotary, NoRotary, Rotary, YarnRotary
+    from fast_llm.layers.attention.rotary.rotary import (
+        DefaultRotary,
+        Llama3Rotary,
+        NoRotary,
+        Rotary,
+        Rotary2D,
+        YarnRotary,
+    )
 
 
 @config_class(registry=True)
@@ -140,6 +147,6 @@ def _get_configurable_class(self) -> "type[YarnRotary]":
 @config_class(dynamic_type={RotaryConfig: "default_2d"})
 class Rotary2DConfig(DefaultRotaryConfig):
     def _get_configurable_class(self) -> "type[Rotary2D]":
-        from fast_llm.layers.transformer.rotary.rotary import Rotary2D
+        from fast_llm.layers.attention.rotary.rotary import Rotary2D
 
         return Rotary2D
diff --git a/fast_llm/layers/attention/rotary/rotary.py b/fast_llm/layers/attention/rotary/rotary.py
index 6250fd4a9..e8b4b6855 100644
--- a/fast_llm/layers/attention/rotary/rotary.py
+++ b/fast_llm/layers/attention/rotary/rotary.py
@@ -16,7 +16,8 @@
     RotaryConfig,
     YarnRotaryConfig,
 )
-from fast_llm.utils import div
+from fast_llm.layers.vision.config import VisionKwargs
+from fast_llm.utils import Assert, div
 
 
 def convert_rotary_complex_to_real(tensor: torch.Tensor, head_size: int, dim: int) -> torch.Tensor:
@@ -47,7 +48,7 @@ def __init__(
         head_size_dim: TensorDim,
     ):
         super().__init__(config)
-        self._head_size_dim = head_size_dim
+        self._head_size = head_size_dim.global_size
 
     @abc.abstractmethod
     def forward(
@@ -93,7 +94,7 @@ def _create_tensors(self, sequence_length: int, device: torch.device) -> None:
 
         self._rotary_embedding_frequencies = self._get_frequencies(
             sequence_length,
-            self._head_size_dim.global_size,
+            self._head_size,
             device=device,
         )
 
@@ -177,18 +178,36 @@ def _get_correction(self, beta: float, dim: int) -> float:
         )
 
 
-class Rotary2D[ConfigType: Rotary2DConfig](DefaultRotary[ConfigType]):
-    _rotary_embedding_frequencies: torch.Tensor
-    _tensor_cache_max_num_patches: int = -1
+class Rotary2D[ConfigType: Rotary2DConfig](Rotary[ConfigType]):
+    _frequencies: torch.Tensor
     _config: ConfigType
 
+    def __init__(
+        self,
+        config: ConfigType,
+        head_size_dim: TensorDim,
+    ):
+        super().__init__(config, head_size_dim)
+        Assert.multiple(self._head_size, 4)
+
     def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None:
-        self._create_tensors(
-            kwargs[VisionEncoderKwargs.max_image_size] // kwargs[VisionEncoderKwargs.patch_size], batch.device
+        patch_positions = kwargs[VisionKwargs.patch_positions]
+        if not hasattr(self, "_frequencies"):
+            self._frequencies = self._config.theta ** -torch.arange(
+                0, 1, 4 / self._head_size, device=patch_positions.device, dtype=torch.float64
+            )
+        # TODO: Pre-compute 2d frequencies?
+        angles = torch.outer(patch_positions.flatten(), self._frequencies).view(
+            len(patch_positions), self._head_size // 2
         )
-        position_ids = kwargs[VisionTransformerKwargs.patch_position_ids]
-        kwargs[AttentionKwargs.rotary_freq_q] = self._rotary_embedding_frequencies[:, position_ids]
-        kwargs[AttentionKwargs.rotary_freq_k] = self._rotary_embedding_frequencies[:, position_ids]
+        frequencies = torch.polar(torch.ones_like(angles), angles)[None, :, None, :].to(torch.complex64)
+        if not self._config.complex_format:
+            frequencies = convert_rotary_complex_to_real(
+                torch.view_as_real(frequencies).flatten(-2), self._head_size, 3
+            ).contiguous()
+        # TODO: Support different q and k frequencies.
+        kwargs[AttentionKwargs.rotary_freq_q] = frequencies
+        kwargs[AttentionKwargs.rotary_freq_k] = frequencies
 
     def forward(
         self, query: torch.Tensor, key: torch.Tensor, kwargs: dict[str, typing.Any]
@@ -197,27 +216,3 @@ def forward(
         query = rotary_fn(query, kwargs[AttentionKwargs.rotary_freq_q])
         key = rotary_fn(key, kwargs[AttentionKwargs.rotary_freq_k])
         return query, key
-
-    def _get_frequencies(self, sequence_length: int, head_size: int, device: torch.device) -> torch.Tensor:
-        max_num_patches = sequence_length
-        # Calculate complex frequencies by using alternating channels for width and height
-        height_positions = torch.arange(max_num_patches, device=device, dtype=torch.float64)
-        width_positions = torch.arange(max_num_patches, device=device, dtype=torch.float64)
-        frequencies = self._config.theta ** -torch.arange(0, 1, 2 / head_size, device=device, dtype=torch.float64)
-        angles_h = torch.outer(height_positions, frequencies[::2])
-        angles_w = torch.outer(width_positions, frequencies[1::2])
-        angles = torch.cat(
-            [
-                angles_h[:, None, :].repeat(1, max_num_patches, 1),
-                angles_w[None, :, :].repeat(max_num_patches, 1, 1),
-            ],
-            dim=-1,
-        ).reshape(-1, head_size // 2)
-
-        frequencies = torch.polar(torch.ones_like(angles), angles)[None, :, None, :].to(torch.complex64)
-        if not self._config.complex_format:
-            frequencies = convert_rotary_complex_to_real(
-                torch.view_as_real(frequencies).flatten(-2), head_size, 3
-            ).contiguous()
-
-        return frequencies
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
index 65083fa2c..f865231b9 100644
--- a/fast_llm/layers/vision/config.py
+++ b/fast_llm/layers/vision/config.py
@@ -2,7 +2,7 @@
 import typing
 
 from fast_llm.config import Config, Field, FieldHint, check_field, config_class
-from fast_llm.layers.block.config import BlockConfig, BlockSequenceConfig
+from fast_llm.layers.block.config import BlockConfig, BlockKwargs, BlockSequenceConfig
 from fast_llm.layers.common.linear.config import Convolution2DConfig
 from fast_llm.layers.common.normalization.config import NormalizationConfig
 from fast_llm.layers.decoder.config import MLPBaseConfig
@@ -13,6 +13,10 @@
     from fast_llm.layers.vision.vision_encoder import VisionEncoder, VisionMultiModalModel
 
 
+class VisionKwargs(BlockKwargs):
+    patch_positions = "patch_positions"
+
+
 @config_class()
 class ImageNormalizationConfig(Config):
     mean_r: float = Field(
@@ -87,14 +91,15 @@ class VisionEncoderConfig(BlockConfig):
         desc="Configuration for the patch convolution layer.",
         hint=FieldHint.architecture,
     )
-    adapter: MLPBaseConfig = Field(
-        desc="Configuration for the adapter layer.",
-        hint=FieldHint.architecture,
-    )
+    # TODO: Should use varlen mixer, 2d rotary, non-causal. Enforce?
     encoder: BlockSequenceConfig = Field(
         desc="Configuration for the vision decoder.",
         hint=FieldHint.architecture,
     )
+    adapter: MLPBaseConfig = Field(
+        desc="Configuration for the adapter layer.",
+        hint=FieldHint.architecture,
+    )
     hidden_size: int = Field(
         default=1024,
         desc="Size of the vision encoder main hidden dimension.",
diff --git a/fast_llm/models/multimodal/config.py b/fast_llm/models/multimodal/config.py
index 1ce8cea33..23c9fa401 100644
--- a/fast_llm/models/multimodal/config.py
+++ b/fast_llm/models/multimodal/config.py
@@ -1,12 +1,12 @@
 import logging
 import typing
 
-from fast_llm.config import Field, FieldHint, FieldUpdate, config_class
+from fast_llm.config import FieldUpdate, config_class
 from fast_llm.engine.checkpoint.config import CheckpointFormat
 from fast_llm.engine.config_utils.runnable import RunnableConfig
 from fast_llm.engine.multi_stage.config import FastLLMModelConfig
 from fast_llm.engine.training.config import TrainerConfig
-from fast_llm.layers.vision.config import VisionEncoderConfig, VisionMultiModalModelConfig
+from fast_llm.layers.vision.config import VisionMultiModalModelConfig
 from fast_llm.models.gpt.config import (
     GPTBaseModelConfig,
     GPTBatchConfig,
@@ -29,11 +29,6 @@ class MultiModalBatchConfig(GPTBatchConfig):
 
 @config_class()
 class MultiModalBaseModelConfig(VisionMultiModalModelConfig, GPTBaseModelConfig):
-    vision_encoder: VisionEncoderConfig = Field(
-        hint=FieldHint.architecture,
-        desc="Configuration for the vision encoder.",
-    )
-
     @property
     def base_model_class(self) -> type["MultiModalBaseModel"]:
         from fast_llm.models.multimodal.model import MultiModalBaseModel
@@ -41,10 +36,10 @@ def base_model_class(self) -> type["MultiModalBaseModel"]:
         return MultiModalBaseModel
 
 
-@config_class(dynamic_type={FastLLMModelConfig: "gpt"})
+@config_class(dynamic_type={FastLLMModelConfig: "multimodal"})
 class MultiModalModelConfig(GPTModelConfig):
     _abstract = False
-    model_name: typing.ClassVar[str] = "gpt"
+    model_name: typing.ClassVar[str] = "multimodal"
     base_model: GPTBaseModelConfig = FieldUpdate()
     # TODO: ====== Conversion ======
     checkpoint_formats: typing.ClassVar[tuple[type[CheckpointFormat], ...]] = FastLLMModelConfig.checkpoint_formats
@@ -74,7 +69,7 @@ class PretrainedMultiModalModelConfig(PretrainedGPTModelConfig):
     model: MultiModalModelConfig = FieldUpdate()
 
 
-@config_class(dynamic_type={RunnableConfig: "train_gpt", TrainerConfig: "gpt"})
+@config_class(dynamic_type={RunnableConfig: "train_multimodal", TrainerConfig: "multimodal"})
 class MultiModalTrainerConfig(PretrainedMultiModalModelConfig, GPTTrainerConfig):
     # TODO: Use dynamic model type?
     reference_models: dict[str, PretrainedMultiModalModelConfig] = FieldUpdate()
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
index bc9b5adea..907bc8f16 100644
--- a/fast_llm/models/multimodal/model.py
+++ b/fast_llm/models/multimodal/model.py
@@ -8,6 +8,7 @@
 from fast_llm.engine.distributed.config import PhaseType
 from fast_llm.engine.inference.runner import InferenceRunner
 from fast_llm.layers.language_model.config import LanguageModelKwargs
+from fast_llm.layers.vision.config import VisionKwargs
 from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
 from fast_llm.models.gpt.config import GPTBatchConfig
 from fast_llm.models.gpt.model import GPTBaseModel, GPTModel
@@ -79,13 +80,13 @@ def preprocess_batch(
             else (scalar_dim, sequence_dim, self.vision_encoder._hidden_dim)
         )
         kwargs[self._vision_encoder_namespace] = {
-            LanguageModelKwargs.sequence_first: sequence_first,
-            LanguageModelKwargs.position_ids: batch.image_patches.position_ids,
-            LanguageModelKwargs.sequence_lengths: batch.image_patches.lengths,
-            LanguageModelKwargs.sequence_length: sequence_length,
-            LanguageModelKwargs.sequence_k_dim: sequence_dim,
-            LanguageModelKwargs.sequence_q_dim: sequence_dim,
-            LanguageModelKwargs.hidden_dims: hidden_dims,
+            VisionKwargs.sequence_first: sequence_first,
+            VisionKwargs.patch_positions: batch.image_patches.positions,
+            VisionKwargs.sequence_lengths: batch.image_patches.lengths,
+            VisionKwargs.sequence_length: sequence_length,
+            VisionKwargs.sequence_k_dim: sequence_dim,
+            VisionKwargs.sequence_q_dim: sequence_dim,
+            VisionKwargs.hidden_dims: hidden_dims,
         }
         super().preprocess(kwargs)
 
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 956aaea5a..7b26b0664 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -653,35 +653,25 @@ def _update_and_add_testing_config(
 _update_and_add_testing_config(
     # Tests hybrid discrete Mamba 2.
     "llama",
-    "hybrid_discrete_mamba_2",
+    "llava",
     updates={
-        ("model", "base_model", "decoder"): {
-            "type": "pattern",
-            "blocks": {
-                "t": copy.deepcopy(_llama_block),
-                "m2d": {
-                    **copy.deepcopy(_llama_block),
-                    "mixer": {
-                        "type": "discrete_mamba_2",
-                        "d_inner": 512,
-                        "state_size": 8,
-                        "n_qk_heads": 8,
-                        "n_v_heads": 16,
-                        "chunk_size": 32,
-                        "add_linear_biases": False,
-                    },
-                },
-            },
-            "num_blocks": 2,
-            "pattern": ["t", "m2d"],
+        ("model", "type"): "multimodal",
+        ("model", "base_model", "vision_encoder"): {
+            "patch_convolution": {},
+            "encoder": copy.deepcopy(MODEL_CONFIGS["llama"].config_dict["model"]["base_model"]["decoder"]),
+            "adapter": {"intermediate_size": 512},
+            "hidden_size": 256,
         },
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "rotary", "type"): {"default_2d"},
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "causal"): False,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "cross_document_attention"): False,
     },
     megatron_args=None,
-    checkpoint_format=AprielHybridSSMCheckpointFormat,
+    checkpoint_format=None,
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
-        ModelTestingGroup.convert: ModelTestingGroupAction.normal,
+        ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
         # TODO: Implement

From 5aed2a7d0b5df943142c7221499ed68441d22c3f Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Mon, 10 Nov 2025 20:02:10 -0500
Subject: [PATCH 06/16] fix

---
 tests/utils/model_configs.py | 43 ++++++++++++++++++++++++++++++++++++
 1 file changed, 43 insertions(+)

diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 7b26b0664..5fd3495af 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -650,6 +650,49 @@ def _update_and_add_testing_config(
 )
 
 
+_update_and_add_testing_config(
+    # Tests hybrid discrete Mamba 2.
+    "llama",
+    "hybrid_discrete_mamba_2",
+    updates={
+        ("model", "base_model", "decoder"): {
+            "type": "pattern",
+            "blocks": {
+                "t": copy.deepcopy(_llama_block),
+                "m2d": {
+                    **copy.deepcopy(_llama_block),
+                    "mixer": {
+                        "type": "discrete_mamba_2",
+                        "d_inner": 512,
+                        "state_size": 8,
+                        "n_qk_heads": 8,
+                        "n_v_heads": 16,
+                        "chunk_size": 32,
+                        "add_linear_biases": False,
+                    },
+                },
+            },
+            "num_blocks": 2,
+            "pattern": ["t", "m2d"],
+        },
+    },
+    megatron_args=None,
+    checkpoint_format=AprielHybridSSMCheckpointFormat,
+    groups={
+        ModelTestingGroup.basic: ModelTestingGroupAction.normal,
+        ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
+        ModelTestingGroup.convert: ModelTestingGroupAction.normal,
+        ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
+        ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
+        # TODO: Implement
+        ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
+    },
+    compare_factor=2.0,
+    # Micro-sequence split and sequence-first not supported.
+    skip_tests=("sdp", "ms"),
+)
+
+
 _update_and_add_testing_config(
     # Tests hybrid discrete Mamba 2.
     "llama",

From 57d8c1f067f3e599844c258752e9d004d916e3c4 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 12 Nov 2025 22:29:29 -0500
Subject: [PATCH 07/16] fixes

---
 fast_llm/engine/base_model/base_model.py      |  1 -
 fast_llm/engine/config_utils/tensor_dim.py    | 14 +++-
 fast_llm/layers/attention/attention.py        | 82 +++++++------------
 fast_llm/layers/attention/rotary/rotary.py    | 10 +--
 fast_llm/layers/block/config.py               |  1 +
 fast_llm/layers/common/linear/config.py       |  2 +-
 fast_llm/layers/decoder/block.py              |  4 +-
 .../layers/decoder/mlp/mixture_of_experts.py  |  2 +
 fast_llm/layers/decoder/mlp/mlp.py            |  7 +-
 fast_llm/layers/language_model/config.py      |  2 +
 fast_llm/layers/language_model/embedding.py   | 20 ++---
 fast_llm/layers/vision/config.py              |  7 ++
 fast_llm/layers/vision/patch_convolution.py   | 33 +++++---
 fast_llm/layers/vision/vision_encoder.py      |  5 +-
 fast_llm/models/auto.py                       |  1 +
 fast_llm/models/gpt/model.py                  |  1 +
 fast_llm/models/multimodal/config.py          |  2 +-
 fast_llm/models/multimodal/model.py           | 57 ++++++++++---
 fast_llm/models/multimodal/trainer.py         |  6 +-
 fast_llm/tensor.py                            | 21 +++--
 tests/functional/test_triton_kernels.py       |  5 +-
 tests/test_attention.py                       |  3 +-
 tests/test_multi_stage.py                     |  3 +-
 tests/utils/dataset.py                        | 34 +++++++-
 tests/utils/model_configs.py                  | 50 +++++------
 tests/utils/run_test_script.py                |  3 +-
 26 files changed, 225 insertions(+), 151 deletions(-)

diff --git a/fast_llm/engine/base_model/base_model.py b/fast_llm/engine/base_model/base_model.py
index 9d8dc6c35..ffffbed50 100644
--- a/fast_llm/engine/base_model/base_model.py
+++ b/fast_llm/engine/base_model/base_model.py
@@ -94,7 +94,6 @@ def __init__(self, layer: LayerBase, namespace: str = None):
         super().__init__(layer._distributed_config)
         self._layer = layer
         self._namespace = namespace
-        self.layer_count = self._layer.layer_count
         self.get_compute_usage = self._layer.get_compute_usage
         self.module_name = self._layer.module_name
 
diff --git a/fast_llm/engine/config_utils/tensor_dim.py b/fast_llm/engine/config_utils/tensor_dim.py
index f67916a66..974cb74c4 100644
--- a/fast_llm/engine/config_utils/tensor_dim.py
+++ b/fast_llm/engine/config_utils/tensor_dim.py
@@ -14,12 +14,15 @@
 
 
 class TensorDim:
-    def __init__(self, name: str, global_size: int | None, parallel_dim: DistributedDim | None = None):
+    def __init__(
+        self, name: str, global_size: int, parallel_dim: DistributedDim | None = None, variable_size: bool = False
+    ):
         # TODO: Handle None for unknown sizes?
         self._name = name
         self._global_size = global_size
         self._size = self._global_size if parallel_dim is None else div(global_size, parallel_dim.size)
         self._parallel_dim = parallel_dim
+        self._variable_size = variable_size
 
     def __repr__(self) -> str:
         return (
@@ -28,6 +31,7 @@ def __repr__(self) -> str:
             f" size={self._size},"
             f" global_size={self._global_size},"
             f" parallel_dim={self._parallel_dim}"
+            f" variable_size={self._variable_size}"
             f")"
         )
 
@@ -60,9 +64,13 @@ def parallel_group(self) -> "ProcessGroup|None":
         # TODO: Make more flexible for derived classes?
         return None if self._parallel_dim is None else self._parallel_dim.group
 
+    @property
+    def variable_size(self) -> bool:
+        return self._variable_size
+
     def replace_parallel_dim(self, distributed_dim: DistributedDim) -> typing.Self:
         assert self.is_parallel
-        return TensorDim(self.name, self.size * distributed_dim.size, distributed_dim)
+        return TensorDim(self.name, self.size * distributed_dim.size, distributed_dim, self.variable_size)
 
     def local_to_global(self, tensor: "torch.Tensor", dim: int = 0) -> "torch.Tensor":
         if self.is_parallel:
@@ -99,6 +107,7 @@ def __init__(self, name: str, tensor_dims: tuple[TensorDim, ...]):
                 assert parallel_dim is None
                 parallel_dim = tensor_dim.parallel_dim
                 self._parallel_dim_index = dim
+            assert not tensor_dim.variable_size
 
         super().__init__(
             name=name,
@@ -142,6 +151,7 @@ def __init__(self, name: str, tensor_dims: tuple[TensorDim, ...]):
         for dim, tensor_dim in enumerate(tensor_dims[1:]):
             # TODO: Allow more flexibility?
             Assert.is_(tensor_dim.parallel_dim, parallel_dim)
+            assert not tensor_dim.variable_size
 
         super().__init__(
             name=name,
diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index 48ec15ea7..16f217b1e 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -15,7 +15,7 @@
 from fast_llm.layers.common.peft.config import PeftConfig
 from fast_llm.layers.decoder.block import BlockWithBias
 from fast_llm.tensor import TensorMeta
-from fast_llm.utils import div
+from fast_llm.utils import Assert, div
 
 try:
     from flash_attn.flash_attn_interface import flash_attn_func as _flash_attn_func  # noqa
@@ -451,6 +451,7 @@ def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
             self._preprocess_for_flash_attention(kwargs)
 
     def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> None:
+        device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
         if (
             sequence_length := kwargs[AttentionKwargs.sequence_length]
         ) > self._backup_attention_tensor_cache_max_sequence_length:
@@ -460,7 +461,7 @@ def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> Non
             self._backup_attention_mask = torch.ones(
                 (sequence_length, sequence_length),
                 dtype=torch.bool,
-                device=self._distributed.device,
+                device=device,
             ).tril_()
 
             if self._config.window_size is not None:
@@ -469,7 +470,7 @@ def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> Non
                 [],
                 torch.finfo(self._distributed_config.compute_dtype.torch).min,
                 dtype=self._distributed_config.compute_dtype.torch,
-                device=self._distributed.device,
+                device=device,
             )
         sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
         sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
@@ -483,7 +484,7 @@ def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> Non
                     for sample_lens in kwargs[AttentionKwargs.sequence_lengths]
                 ]
             )
-            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(self._distributed.device)
+            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(device)
             kwargs[AttentionKwargs.attention_mask] = (
                 kwargs[AttentionKwargs.attention_mask]
                 & document_mask[:, None, sequence_k - sequence_q : sequence_k, None, :sequence_k]
@@ -502,54 +503,27 @@ def _preprocess_for_flash_attention(self, kwargs: dict[str, typing.Any]) -> None
         """
         if self._config.cross_document_attention:
             return
-        sequence_lengths = kwargs[AttentionKwargs.sequence_lengths]
-        sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
-        sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
-        if sequence_q < kwargs[AttentionKwargs.sequence_length]:
-            cumsums = [torch.cumsum(x, dim=0) for x in sequence_lengths]
-            # The first and last documents in a microsequence need to be handled separately. Include all tokens from other documents
-            # in the microsequence. We need to consider all keys computed so far from the first sample. We also store the offsets
-            # of the first documents so that we can index into their kv pairs
-            start_seq_idx = [
-                torch.argmax((cu_seqlens >= sequence_k - sequence_q).to(torch.uint8), dim=0) for cu_seqlens in cumsums
-            ]
-            end_seq_idx = [torch.argmax((cu_seqlens >= sequence_k).to(torch.uint8), dim=0) for cu_seqlens in cumsums]
-            seqlens_q = []
-            seqlens_k = []
-            for idx, sample_seqlens in enumerate(sequence_lengths):
-                start_idx = start_seq_idx[idx]
-                end_idx = end_seq_idx[idx]
-                seqlens_q.extend([0] * start_idx)
-                n_attention_tokens = sample_seqlens[end_idx] - (cumsums[idx][end_idx] - sequence_k)
-                if start_idx == end_idx:
-                    seqlens_q.append(sequence_q)
-                else:
-                    start_q_tokens = cumsums[idx][start_idx] - (sequence_k - sequence_q)
-                    seqlens_q.extend(
-                        [
-                            start_q_tokens,
-                            *(sample_seqlens[idx] for idx in range(start_idx + 1, end_idx)),
-                            n_attention_tokens,
-                        ]
-                    )
-                seqlens_k.extend(sample_seqlens[: end_idx + 1])
-                seqlens_k[-1] = n_attention_tokens
-            seqlens_q = torch.tensor(seqlens_q, dtype=torch.int32)
-            seqlens_k = torch.tensor(seqlens_k, dtype=torch.int32)
-        else:
-            seqlens_q = torch.cat(sequence_lengths)
-            seqlens_k = torch.cat(sequence_lengths)
-        kwargs[AttentionKwargs.cu_seqlens_q] = torch.cat(
-            (
-                torch.zeros(1, dtype=torch.int32, device=self._distributed.device),
-                torch.cumsum(seqlens_q, dim=0, dtype=torch.int32).to(self._distributed.device),
-            )
-        )
-        kwargs[AttentionKwargs.cu_seqlens_k] = torch.cat(
-            (
-                torch.zeros(1, dtype=torch.int32, device=self._distributed.device),
-                torch.cumsum(seqlens_k, dim=0, dtype=torch.int32).to(self._distributed.device),
-            )
+        device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
+
+        # TODO: ====== Fix (need to know how much first sequence was cropped) ======
+        Assert.eq(kwargs[AttentionKwargs.sequence_k_dim].size, kwargs[AttentionKwargs.sequence_q_dim].size)
+
+        # TODO: Calculate these in batch preprocessing?
+        sequence_lengths_q = torch.tensor(
+            [
+                0,
+                *(
+                    sequence_length
+                    for sequence_lengths in kwargs[AttentionKwargs.sequence_lengths]
+                    for sequence_length in sequence_lengths
+                ),
+            ],
+            dtype=torch.int32,
         )
-        kwargs[AttentionKwargs.max_seqlen_q] = seqlens_q.max()
-        kwargs[AttentionKwargs.max_seqlen_k] = seqlens_k.max()
+        max_sequence_length = sequence_lengths_q.max().item()
+        cu_seqlens_q = sequence_lengths_q.cumsum_(0).to(device)
+        max_seqlen_q = cu_seqlens_q.new_full((1,), max_sequence_length)
+        kwargs[AttentionKwargs.cu_seqlens_q] = cu_seqlens_q
+        kwargs[AttentionKwargs.cu_seqlens_k] = cu_seqlens_q
+        kwargs[AttentionKwargs.max_seqlen_q] = max_seqlen_q
+        kwargs[AttentionKwargs.max_seqlen_k] = max_seqlen_q
diff --git a/fast_llm/layers/attention/rotary/rotary.py b/fast_llm/layers/attention/rotary/rotary.py
index e8b4b6855..55d929f8a 100644
--- a/fast_llm/layers/attention/rotary/rotary.py
+++ b/fast_llm/layers/attention/rotary/rotary.py
@@ -56,7 +56,7 @@ def forward(
     ) -> tuple[torch.Tensor, torch.Tensor]:
         pass
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         pass
 
 
@@ -71,8 +71,8 @@ class DefaultRotary[ConfigType: DefaultRotaryConfig](Rotary[ConfigType]):
     _rotary_embedding_frequencies: torch.Tensor
     _tensor_cache_max_sequence_length: int = -1
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        self._create_tensors(kwargs[AttentionKwargs.sequence_length], batch.device)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._create_tensors(kwargs[AttentionKwargs.sequence_length], kwargs[AttentionKwargs.device])
         sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
         kwargs[AttentionKwargs.rotary_freq_q] = self._rotary_embedding_frequencies[
             :, sequence_k - kwargs[AttentionKwargs.sequence_q_dim].size : sequence_k
@@ -190,11 +190,11 @@ def __init__(
         super().__init__(config, head_size_dim)
         Assert.multiple(self._head_size, 4)
 
-    def preprocess(self, batch, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         patch_positions = kwargs[VisionKwargs.patch_positions]
         if not hasattr(self, "_frequencies"):
             self._frequencies = self._config.theta ** -torch.arange(
-                0, 1, 4 / self._head_size, device=patch_positions.device, dtype=torch.float64
+                0, 1, 4 / self._head_size, device=kwargs[AttentionKwargs.device], dtype=torch.float64
             )
         # TODO: Pre-compute 2d frequencies?
         angles = torch.outer(patch_positions.flatten(), self._frequencies).view(
diff --git a/fast_llm/layers/block/config.py b/fast_llm/layers/block/config.py
index fda873e9a..04b16df3f 100644
--- a/fast_llm/layers/block/config.py
+++ b/fast_llm/layers/block/config.py
@@ -37,6 +37,7 @@ class BlockKwargs:
     sequence_lengths = "sequence_lengths"
     # TODO: Belongs elsewhere?
     grad_output = "grad_output"
+    device = "device"
 
 
 @config_class(registry=True)
diff --git a/fast_llm/layers/common/linear/config.py b/fast_llm/layers/common/linear/config.py
index 0dc118269..e2c586bb7 100644
--- a/fast_llm/layers/common/linear/config.py
+++ b/fast_llm/layers/common/linear/config.py
@@ -224,7 +224,7 @@ def get_layer(
         )
 
 
-@config_class
+@config_class()
 class Convolution2DConfig(AffineLinearBaseConfig):
     def get_layer(
         self,
diff --git a/fast_llm/layers/decoder/block.py b/fast_llm/layers/decoder/block.py
index 2b71e1cec..5713cbb62 100644
--- a/fast_llm/layers/decoder/block.py
+++ b/fast_llm/layers/decoder/block.py
@@ -90,7 +90,7 @@ def __init__(
         self.mixer = self._config.mixer.get_layer(
             self._distributed_config,
             self._hidden_dim,
-            self._lr_scale,
+            lr_scale=self._lr_scale,
             peft=peft,
             return_bias=True,
         )
@@ -98,7 +98,7 @@ def __init__(
         self.mlp = self._config.mlp.get_layer(
             self._distributed_config,
             self._hidden_dim,
-            self._lr_scale,
+            lr_scale=self._lr_scale,
             peft=peft,
             return_bias=True,
         )
diff --git a/fast_llm/layers/decoder/mlp/mixture_of_experts.py b/fast_llm/layers/decoder/mlp/mixture_of_experts.py
index 0b85025e0..4171e66ab 100644
--- a/fast_llm/layers/decoder/mlp/mixture_of_experts.py
+++ b/fast_llm/layers/decoder/mlp/mixture_of_experts.py
@@ -44,6 +44,7 @@ def __init__(
         *,
         # TODO: Review `hidden_dim` and `block_index`
         hidden_dim: TensorDim,
+        output_dim: TensorDim | None = None,
         lr_scale: float | None,
         peft: PeftConfig | None,
         return_bias: bool = True,
@@ -55,6 +56,7 @@ def __init__(
             config,
             distributed_config,
             hidden_dim=hidden_dim,
+            output_dim=output_dim,
             lr_scale=lr_scale,
             peft=peft,
             return_bias=return_bias,
diff --git a/fast_llm/layers/decoder/mlp/mlp.py b/fast_llm/layers/decoder/mlp/mlp.py
index f5decbf17..7a52539da 100644
--- a/fast_llm/layers/decoder/mlp/mlp.py
+++ b/fast_llm/layers/decoder/mlp/mlp.py
@@ -114,7 +114,12 @@ def _forward(
         metrics: dict[str, typing.Any] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
         if isinstance(input_, TensorMeta):
-            return TensorMeta.from_dims(input_.dims[:-1] + (self._output_dim,), "MLP output"), None
+            return (
+                TensorMeta.from_dims(
+                    input_.dims[:-1] + (self._output_dim,), tensor_name="MLP output", dtype=input_.dtype
+                ),
+                None,
+            )
         return (
             mlp_autograd(
                 input_,
diff --git a/fast_llm/layers/language_model/config.py b/fast_llm/layers/language_model/config.py
index 832808eaf..53dac2892 100644
--- a/fast_llm/layers/language_model/config.py
+++ b/fast_llm/layers/language_model/config.py
@@ -22,6 +22,8 @@
 class LanguageModelKwargs(BlockKwargs):
     token_ids = "token_ids"
     position_ids = "position_ids"
+    token_map = "token_map"
+    sample_map = "sample_map"
     embedding_map = "embedding_map"
     # TODO: These are generic
     labels = "labels"
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index 19fa211bd..d95ec6dfd 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -77,15 +77,14 @@ def __init__(
             peft=self._peft,
         )
 
-    @torch.compile
+    # @torch.compile
     def _forward(
         self,
         input_: torch.Tensor | None,
         token_ids: torch.Tensor,
         position_ids: torch.Tensor | None,
         mask_inputs: bool,
-        # TODO: Flatten the batch and sequence in the map?
-        embedding_map: tuple[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor, torch.Tensor]] | None,
+        embedding_map: tuple[torch.Tensor, torch.Tensor] | None,
     ) -> torch.Tensor:
         Assert.eq(position_ids is None, self.position_embeddings_weight is None)
         group = self._parallel_dim.group
@@ -98,12 +97,12 @@ def _forward(
             if self.position_embeddings_weight is not None:
                 embeddings = embeddings + torch.nn.functional.embedding(position_ids, self.position_embeddings_weight)
 
-            if embedding_map is not None:
+            if input_ is not None:
                 # TODO: Accumulate redundant with masking?
-                input_index, embedding_index = embedding_map
                 if self._sequence_parallel:
                     input_ = gather(input_, group=group, dim=0)
-                embeddings = embeddings.index_put(embedding_index, input_[input_index], accumulate=True)
+                # Out-of-place equivalent of `embeddings[embedding_map] += input_`
+                embeddings = embeddings.index_put(embedding_map, input_, accumulate=True)
 
             if self._sequence_parallel:
                 embeddings = split(embeddings, group=group, dim=0)
@@ -122,17 +121,16 @@ def _forward(
             if mask_inputs:
                 embeddings = embeddings * token_mask.unsqueeze(2)
 
-            if embedding_map is not None:
+            if input_ is not None:
                 # TODO: Accumulate redundant with masking?
-                input_index, embedding_index = embedding_map
                 if self._sequence_parallel:
                     #  TODO:: Filter and shift embedding map instead? (needs cuda sync)
                     input_ = gather(input_, group=group, dim=0)
                     embeddings_ = embeddings.new_zeros(embeddings.shape[0] * group.size(), *embeddings.shape[1:])
-                    embeddings_.index_put(embedding_index, input_[input_index], accumulate=True)
+                    embeddings_.index_put(embedding_map, input_, accumulate=True)
                     embeddings = embeddings + split(embeddings_, group=group, dim=0)
                 else:
-                    embeddings = embeddings.index_put(embedding_index, input_[input_index], accumulate=True)
+                    embeddings = embeddings.index_put(embedding_map, input_, accumulate=True)
 
         with set_generator(
             self._distributed.tp_generator if self._sequence_parallel else self._distributed.pp_generator
@@ -162,6 +160,8 @@ def forward(
             # TODO: Support multiple encoders.
             # TODO: Support pipeline-parallel.
             token_ids = kwargs.get(LanguageModelKwargs.token_ids)
+            # Drop the placeholder batch dimension, remove patch padding.
+            input_ = input_.squeeze(int(kwargs[LanguageModelKwargs.sequence_first]))[: embedding_map[0].size(0)]
 
         return self._forward(
             input_,
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
index f865231b9..1aa7231c1 100644
--- a/fast_llm/layers/vision/config.py
+++ b/fast_llm/layers/vision/config.py
@@ -10,6 +10,7 @@
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
+    from fast_llm.layers.vision.patch_convolution import PatchConvolution
     from fast_llm.layers.vision.vision_encoder import VisionEncoder, VisionMultiModalModel
 
 
@@ -83,6 +84,12 @@ def input_channels(self):
         # Number of input channels. Currently hard-coded to 3 (RGB).
         return 3
 
+    @property
+    def layer_class(self) -> "type[PatchConvolution]":
+        from fast_llm.layers.vision.patch_convolution import PatchConvolution
+
+        return PatchConvolution
+
 
 @config_class(registry=True)
 class VisionEncoderConfig(BlockConfig):
diff --git a/fast_llm/layers/vision/patch_convolution.py b/fast_llm/layers/vision/patch_convolution.py
index 1055f33cc..6f1231d1e 100644
--- a/fast_llm/layers/vision/patch_convolution.py
+++ b/fast_llm/layers/vision/patch_convolution.py
@@ -3,10 +3,11 @@
 import torch
 
 from fast_llm.core.ops import split
-from fast_llm.engine.config_utils.tensor_dim import TensorDim
+from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
 from fast_llm.engine.distributed.config import DistributedConfig, DistributedDimNames
 from fast_llm.layers.attention.config import AttentionKwargs
 from fast_llm.layers.block.block import Block
+from fast_llm.layers.block.config import BlockKwargs
 from fast_llm.layers.common.peft.config import PeftConfig
 from fast_llm.layers.vision.config import PatchConvolutionConfig
 from fast_llm.tensor import TensorMeta
@@ -35,8 +36,8 @@ def __init__(
         self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
 
         self.convolution = self._config.convolution.get_layer(
-            self._hidden_dim,
             TensorDim("input_channels", self._config.input_channels),
+            self._hidden_dim,
             TensorDim("patch_height", self._config.patch_height),
             TensorDim("patch_width", self._config.patch_width),
             stride=(self._config.patch_height, self._config.patch_width),
@@ -55,15 +56,27 @@ def forward(
     ) -> torch.Tensor:
         if isinstance(input_, TensorMeta):
             return TensorMeta.from_dims(
-                input_.dims[:-1] + (self._hidden_dim,), tensor_name="patch conv output", dtype=input_.dtype
+                (
+                    (
+                        input_.dims[0],
+                        scalar_dim,
+                        self._hidden_dim,
+                    )
+                    if kwargs[BlockKwargs.sequence_first]
+                    else (
+                        scalar_dim,
+                        input_.dims[0],
+                        self._hidden_dim,
+                    )
+                ),
+                tensor_name="patch convolution output",
+                dtype=input_.dtype,
             )
-        # TODO: Avoid padding
-        input_ = self.convolution(input_)
-        patch_embeddings = self.normalization(input_.flatten(1)).view_as(input_)
-
-        # TODO: Permute earlier?
-        if kwargs[AttentionKwargs.sequence_first]:
-            patch_embeddings = patch_embeddings.permute(1, 0, 2).contiguous()
+        patch_embeddings = (
+            self.normalization(self.convolution(input_).flatten(1))
+            .view(-1, self._hidden_dim.size)
+            .unsqueeze(int(kwargs[AttentionKwargs.sequence_first]))
+        )
         if self._sequence_parallel:
             patch_embeddings = split(patch_embeddings, group=self._parallel_dim.group, dim=0)
         return patch_embeddings
diff --git a/fast_llm/layers/vision/vision_encoder.py b/fast_llm/layers/vision/vision_encoder.py
index fab5c5a65..e62616006 100644
--- a/fast_llm/layers/vision/vision_encoder.py
+++ b/fast_llm/layers/vision/vision_encoder.py
@@ -14,7 +14,7 @@
 logger = logging.getLogger(__name__)
 
 
-class VisionEncoder[ConfigType: VisionEncoderConfig](BlockBase[VisionEncoderConfig]):
+class VisionEncoder[ConfigType: VisionEncoderConfig](BlockBase[ConfigType]):
     _config: ConfigType
 
     def __init__(
@@ -26,8 +26,8 @@ def __init__(
         lr_scale: float | None,
         peft: PeftConfig | None,
     ):
-        vision_hidden_dim = TensorDim("hidden", self._config.hidden_size)
         super().__init__(config, distributed_config, hidden_dim=hidden_dim, lr_scale=lr_scale, peft=peft)
+        vision_hidden_dim = TensorDim("hidden", self._config.hidden_size)
         self.patch_convolution = self._config.patch_convolution.get_layer(
             distributed_config,
             vision_hidden_dim,
@@ -67,7 +67,6 @@ def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
 
 
 class VisionMultiModalModel[ConfigType: VisionMultiModalModelConfig](LanguageModel[ConfigType]):
-
     _config: ConfigType
 
     def __init__(
diff --git a/fast_llm/models/auto.py b/fast_llm/models/auto.py
index 414314627..7830c69a1 100644
--- a/fast_llm/models/auto.py
+++ b/fast_llm/models/auto.py
@@ -5,4 +5,5 @@
 from fast_llm.layers.attention.config import AttentionConfig  # isort: skip
 from fast_llm.layers.ssm.config import MambaConfig, Mamba2Config, DiscreteMamba2Config  # isort: skip
 from fast_llm.models.gpt.config import GPTModelConfig, GPTTrainerConfig  # isort: skip
+from fast_llm.models.multimodal.config import MultiModalModelConfig, MultiModalTrainerConfig  # isort: skip
 from fast_llm.engine.evaluation.evaluators import EvaluatorsConfig  # isort: skip
diff --git a/fast_llm/models/gpt/model.py b/fast_llm/models/gpt/model.py
index 65be1eb4b..9e5533b84 100644
--- a/fast_llm/models/gpt/model.py
+++ b/fast_llm/models/gpt/model.py
@@ -199,6 +199,7 @@ def preprocess_batch(
                 AttentionKwargs.past_key_values: pasts,
                 AttentionKwargs.presents: presents,
                 AttentionKwargs.sequence_lengths: batch.tokens.lengths,
+                AttentionKwargs.device: self._distributed.device,
                 **reference_logits[i],
             }
 
diff --git a/fast_llm/models/multimodal/config.py b/fast_llm/models/multimodal/config.py
index 23c9fa401..7bce78853 100644
--- a/fast_llm/models/multimodal/config.py
+++ b/fast_llm/models/multimodal/config.py
@@ -40,7 +40,7 @@ def base_model_class(self) -> type["MultiModalBaseModel"]:
 class MultiModalModelConfig(GPTModelConfig):
     _abstract = False
     model_name: typing.ClassVar[str] = "multimodal"
-    base_model: GPTBaseModelConfig = FieldUpdate()
+    base_model: MultiModalBaseModelConfig = FieldUpdate()
     # TODO: ====== Conversion ======
     checkpoint_formats: typing.ClassVar[tuple[type[CheckpointFormat], ...]] = FastLLMModelConfig.checkpoint_formats
 
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
index 907bc8f16..21d7eac49 100644
--- a/fast_llm/models/multimodal/model.py
+++ b/fast_llm/models/multimodal/model.py
@@ -4,9 +4,10 @@
 import torch
 
 from fast_llm.data.sample.language_model import LanguageModelBatch
-from fast_llm.engine.config_utils.tensor_dim import ConcatenatedTensorDim, TensorDim, scalar_dim
+from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
 from fast_llm.engine.distributed.config import PhaseType
 from fast_llm.engine.inference.runner import InferenceRunner
+from fast_llm.layers.attention.config import AttentionKwargs
 from fast_llm.layers.language_model.config import LanguageModelKwargs
 from fast_llm.layers.vision.config import VisionKwargs
 from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
@@ -19,7 +20,7 @@
 
 
 class MultiModalBaseModel[ConfigType: MultiModalBaseModelConfig](
-    VisionMultiModalModel[ConfigType], GPTBaseModel[ConfigType]
+    GPTBaseModel[ConfigType], VisionMultiModalModel[ConfigType]
 ):
     """
     A transformer-based language model generalizing the GPT model architecture.
@@ -33,19 +34,31 @@ def preprocess_meta(
         preprocessed_meta = []
         for tokens, kwargs in super().preprocess_meta(batch_meta, phase):
             kwargs[LanguageModelKwargs.token_ids] = tokens
+            kwargs[LanguageModelKwargs.mask_inputs] = True
             image_patches = TensorMeta.from_dims(
                 (
                     # We combine the batch and sequence dims to allow for variable sequence lengths.
                     # Gives the same result, assuming we disable cross-image attention (TODO: Enforce)
-                    scalar_dim,
-                    # TODO: Wrong (variable size).
-                    ConcatenatedTensorDim("image_sequence", tokens.dims),
+                    sequence_dim := TensorDim("image_sequence", tokens.numel(), variable_size=True),
                     # TODO: Relate to tensor dims in patch convolution.
                     TensorDim("input_channels", self._config.vision_encoder.patch_convolution.input_channels),
                     TensorDim("patch_height", self._config.vision_encoder.patch_convolution.patch_height),
                     TensorDim("patch_width", self._config.vision_encoder.patch_convolution.patch_width),
                 )
             )
+
+            hidden_dims = (
+                (sequence_dim, scalar_dim, self.vision_encoder._hidden_dim)
+                if (sequence_first := kwargs[LanguageModelKwargs.sequence_first])
+                else (scalar_dim, sequence_dim, self.vision_encoder._hidden_dim)
+            )
+            kwargs[self._vision_encoder_namespace] = {
+                VisionKwargs.sequence_first: sequence_first,
+                VisionKwargs.sequence_k_dim: sequence_dim,
+                VisionKwargs.sequence_q_dim: sequence_dim,
+                VisionKwargs.hidden_dims: hidden_dims,
+            }
+
             preprocessed_meta.append((image_patches, kwargs))
 
         return preprocessed_meta
@@ -68,11 +81,25 @@ def preprocess_batch(
 
         kwargs[LanguageModelKwargs.token_ids] = tokens
 
-        kwargs[LanguageModelKwargs.embedding_map] = batch.image_patches.token_map
+        # If document cropping is enabled, extra tokens may belong to images and need to be removed.
+        # TODO: Handle earlier.
+        tokens_end = kwargs[AttentionKwargs.sequence_k_dim].size
+        tokens_begin = tokens_end - kwargs[AttentionKwargs.sequence_q_dim].size
+        cropped_image_patches = batch.image_patches.crop(tokens_begin, tokens_end)
 
-        image_patches = batch.image_patches.patches
-        sequence_length = image_patches.size(0)
+        sequence_length = tokens.shape[:2].numel()
         sequence_dim = TensorDim("image_sequence", sequence_length)
+        pad_size = sequence_length - cropped_image_patches.patches.size(0)
+
+        patches = cropped_image_patches.patches.to(self._distributed.config.compute_dtype.torch)
+        patches = torch.cat([patches, patches.new_zeros((pad_size,) + patches.shape[1:])])
+
+        positions = torch.cat(
+            [
+                cropped_image_patches.positions,
+                cropped_image_patches.positions.new_zeros((pad_size,) + cropped_image_patches.positions.shape[1:]),
+            ]
+        )
 
         hidden_dims = (
             (sequence_dim, scalar_dim, self.vision_encoder._hidden_dim)
@@ -81,16 +108,24 @@ def preprocess_batch(
         )
         kwargs[self._vision_encoder_namespace] = {
             VisionKwargs.sequence_first: sequence_first,
-            VisionKwargs.patch_positions: batch.image_patches.positions,
-            VisionKwargs.sequence_lengths: batch.image_patches.lengths,
+            VisionKwargs.patch_positions: positions,
+            VisionKwargs.sequence_lengths: [cropped_image_patches.lengths],
             VisionKwargs.sequence_length: sequence_length,
             VisionKwargs.sequence_k_dim: sequence_dim,
             VisionKwargs.sequence_q_dim: sequence_dim,
             VisionKwargs.hidden_dims: hidden_dims,
+            VisionKwargs.device: self._distributed.device,
         }
+
+        kwargs[LanguageModelKwargs.embedding_map] = (
+            (cropped_image_patches.token_map, cropped_image_patches.sample_map)
+            if kwargs[LanguageModelKwargs.sequence_first]
+            else (cropped_image_patches.sample_map, cropped_image_patches.token_map)
+        )
+
         super().preprocess(kwargs)
 
-        return [(image_patches, kwargs)]
+        return [(patches, kwargs)]
 
     def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         # Hack to delay preprocessing in super().preprocess_batch (TODO: Improve)
diff --git a/fast_llm/models/multimodal/trainer.py b/fast_llm/models/multimodal/trainer.py
index c4071aafe..2beee1097 100644
--- a/fast_llm/models/multimodal/trainer.py
+++ b/fast_llm/models/multimodal/trainer.py
@@ -7,8 +7,4 @@
 
 
 class MultiModalTrainer[ConfigType: MultiModalTrainerConfig](GPTTrainer[ConfigType]):
-    def _get_data(self) -> MultiModalData:
-        return MultiModalData(
-            config=self._config.data,
-            distributed_config=self._config.model.distributed,
-        )
+    pass
diff --git a/fast_llm/tensor.py b/fast_llm/tensor.py
index b709ea835..f4469df94 100644
--- a/fast_llm/tensor.py
+++ b/fast_llm/tensor.py
@@ -76,6 +76,7 @@ def __init__(
         self._reductions = reductions
         for dim, op in reductions:
             assert isinstance(dim, DistributedDim), dim
+        self._variable_shape = any(dim.variable_size for dim in self.dims)
 
     def __new__(
         cls,
@@ -142,6 +143,14 @@ def from_dims(
     def global_shape(self) -> torch.Size:
         return torch.Size([dim.global_size for dim in self.dims])
 
+    def verify_shape(self, tensor: torch.Tensor, global_: bool = False):
+        if self._variable_shape:
+            for size, dim in zip(tensor.shape, self.dims, strict=True):
+                if not dim.variable_size:
+                    Assert.eq(size, dim.global_size if global_ else dim.size, msg=self)
+        else:
+            Assert.eq(tensor.shape, self.global_shape if global_ else self.shape, msg=self)
+
     def local_to_global(self, tensor: torch.Tensor) -> tuple[torch.Tensor, ...]:
         """
         Reconstruct a global tensor from its distributed slices. Support lazy-loaded safetensor slices.
@@ -149,7 +158,7 @@ def local_to_global(self, tensor: torch.Tensor) -> tuple[torch.Tensor, ...]:
         """
         if tensor.ndim == 0:
             tensor = tensor[None]
-        Assert.eq(tensor.shape, self.shape)
+        self.verify_shape(tensor, False)
         # Tensors are always either split or duplicated in the tensor-parallel direction.
         # TODO: Avoid hard-coded assumptions on duplication
         is_first_rank, modified = True, False
@@ -167,7 +176,7 @@ def local_to_global(self, tensor: torch.Tensor) -> tuple[torch.Tensor, ...]:
                     tensor = tensor.clone()
                 tensor = reduce_op(tensor, distributed_dim.group, op=op)
                 is_first_rank, modified = is_first_rank and distributed_dim.group.rank() == 0, True
-        Assert.eq(tensor.shape, self.global_shape)
+        self.verify_shape(tensor, True)
         return tensor, is_first_rank
 
     def local_to_global_partial(self, tensor: torch.Tensor, fill_value: float | int = -1) -> torch.Tensor:
@@ -179,13 +188,13 @@ def local_to_global_partial(self, tensor: torch.Tensor, fill_value: float | int
         """
         if tensor.ndim == 0:
             tensor = tensor[None]
-        Assert.eq(tensor.shape, self.shape)
+        self.verify_shape(tensor, False)
         assert not self._reductions
         for dim, tensor_dim in enumerate(self.dims):
             if tensor_dim.is_parallel:
                 tensor = tensor_dim.local_to_global_partial(tensor, dim, fill_value)
 
-        Assert.eq(tensor.shape, self.global_shape)
+        self.verify_shape(tensor, True)
         return tensor
 
     def global_to_local(self, tensor: torch.Tensor | SafeTensorSlice) -> torch.Tensor:
@@ -198,12 +207,12 @@ def global_to_local(self, tensor: torch.Tensor | SafeTensorSlice) -> torch.Tenso
         assert not self._reductions
         if tensor.ndim == 0:
             tensor = tensor[None]
-        Assert.eq(tensor.shape, self.global_shape, msg=self)
+        self.verify_shape(tensor, True)
 
         for dim, tensor_dim in reversed(list(enumerate(self.dims))):
             tensor = tensor_dim.global_to_local(tensor, dim)
 
-        Assert.eq(tensor.shape, self.shape, msg=self)
+        self.verify_shape(tensor, False)
         return tensor
 
     @classmethod
diff --git a/tests/functional/test_triton_kernels.py b/tests/functional/test_triton_kernels.py
index b5d88e0ac..807d38804 100644
--- a/tests/functional/test_triton_kernels.py
+++ b/tests/functional/test_triton_kernels.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
 from fast_llm.functional.config import (
     MAX_DROPLESS_BLOCK_SIZE_ROW,
     ActivationType,
@@ -92,7 +93,7 @@ def test_triton_rotary(batch_size, sequence_length, num_heads, head_size):
     y1 = apply_rotary_embeddings(
         x,
         DefaultRotaryConfig(triton=False)
-        .get_layer(None)
+        .get_layer(TensorDim("", head_size))
         ._get_frequencies(
             sequence_length,
             head_size,
@@ -104,7 +105,7 @@ def test_triton_rotary(batch_size, sequence_length, num_heads, head_size):
         triton_rotary_(
             convert_rotary_complex_to_real(x, head_size, 3),
             DefaultRotaryConfig(triton=True)
-            .get_layer(None)
+            .get_layer(TensorDim("", head_size))
             ._get_frequencies(sequence_length, head_size, device="cuda"),
         ),
         head_size,
diff --git a/tests/test_attention.py b/tests/test_attention.py
index b86cc95fa..69af39503 100644
--- a/tests/test_attention.py
+++ b/tests/test_attention.py
@@ -43,7 +43,8 @@ def test_varlen_preprocessing():
             ),
             AttentionKwargs.sequence_length: sequence_length,
             AttentionKwargs.sequence_lengths: sequence_lengths,
+            AttentionKwargs.device: torch.device("cpu"),
         }
-        attention.preprocess(torch.empty(1, device="cpu"), kwargs)
+        attention.preprocess(kwargs)
         Assert.all_equal(kwargs[AttentionKwargs.cu_seqlens_q], cumulative_sequences_q[micro_seq_idx])
         Assert.all_equal(kwargs[AttentionKwargs.cu_seqlens_k], cumulative_sequences_k[micro_seq_idx])
diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py
index 407b47767..e3870a7b1 100644
--- a/tests/test_multi_stage.py
+++ b/tests/test_multi_stage.py
@@ -6,7 +6,6 @@
 from fast_llm.engine.multi_stage.config import FastLLMModelConfig
 from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel
 from fast_llm.utils import Assert
-from tests.utils.dataset import get_model_test_dataset
 from tests.utils.model_configs import ModelTestingGroup
 from tests.utils.utils import requires_cuda
 
@@ -22,7 +21,7 @@ def _get_model(config_dict: dict, model_type: str = "gpt") -> FastLLMModel:
 @requires_cuda
 @pytest.mark.model_testing_group(ModelTestingGroup.basic)
 def test_frozen_weights(model_testing_config):
-    get_model_test_dataset()
+    model_testing_config.get_dataset()
     frozen_config_dict = copy.deepcopy(model_testing_config.config_dict)
     decoder_config = frozen_config_dict["model"]["base_model"]["decoder"]
     if (decoder_type := decoder_config.get("type", "fixed")) == "fixed":
diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py
index b88f834cb..ed3f01307 100644
--- a/tests/utils/dataset.py
+++ b/tests/utils/dataset.py
@@ -172,7 +172,8 @@ def _get_test_dataset(
     image_patch_config: ImagePatchConfig | None = None,
     min_image_size: int = 4,
     max_image_size: int = 32,
-):
+    config_only: bool = False,
+) -> tuple[pathlib.Path, dict[str, typing.Any], pathlib.Path]:
     config_paths = (
         [path / "fast_llm_config.yaml"]
         if splits is None
@@ -180,7 +181,7 @@ def _get_test_dataset(
     )
     hf_path = path / "hf"
 
-    if not all(config_path.is_file() for config_path in config_paths):
+    if not config_only and not all(config_path.is_file() for config_path in config_paths):
         dataset = _get_hf_test_dataset(
             seed=seed,
             num_documents=num_documents,
@@ -284,5 +285,30 @@ def get_test_dataset_with_image_patches(image_break_token: int | None = None, im
     )
 
 
-def get_model_test_dataset():
-    return _get_test_dataset(DATASET_CACHE / "model_dataset", seed=1234, vocab_size=MODEL_TEST_VOCAB_SIZE)
+def get_model_test_dataset(config_only: bool = False):
+    return _get_test_dataset(
+        DATASET_CACHE / "model_dataset",
+        seed=1234,
+        vocab_size=MODEL_TEST_VOCAB_SIZE,
+        splits={"training": 969, "validation": 30, "test": 1},
+        config_only=config_only,
+    )
+
+
+def get_multimodal_test_dataset(config_only: bool = False):
+    return _get_test_dataset(
+        DATASET_CACHE / "model_dataset_multimodal",
+        seed=1234,
+        vocab_size=MODEL_TEST_VOCAB_SIZE,
+        max_images=2,
+        image_patch_config=ImagePatchConfig(
+            height=4,
+            width=4,
+            max_image_height=16,
+            max_image_width=16,
+            image_break_token=None,
+            image_end_token=None,
+        ),
+        splits={"training": 969, "validation": 30, "test": 1},
+        config_only=config_only,
+    )
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 5fd3495af..f3fdca77d 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -3,6 +3,7 @@
 import enum
 import functools
 import os
+import pathlib
 import typing
 
 import pytest
@@ -21,8 +22,9 @@
     MTPLlamaCheckpointFormat,
     Qwen2CheckpointFormat,
 )
+from tests.utils.dataset import get_model_test_dataset, get_multimodal_test_dataset
 from tests.utils.distributed_configs import DistributedTestingConfig
-from tests.utils.global_variables import MODEL_DATASET_SHARD_PATH, MODEL_TEST_VOCAB_SIZE
+from tests.utils.global_variables import MODEL_TEST_VOCAB_SIZE
 
 from fast_llm.engine.evaluation.evaluators import (  # isort:skip  # needed for dynamic type registration
     EvaluatorsConfig,
@@ -78,6 +80,13 @@ class ModelTestingConfig:
     compare_factor: float = 1.0
     # Option to skip specific distributed configuration with name containing any of the provided strings.
     skip_tests: tuple[str] = ()
+    get_dataset: typing.Callable[[bool], tuple[pathlib.Path, dict[str, typing.Any], pathlib.Path]] = (
+        get_model_test_dataset
+    )
+
+    def __post_init__(self):
+        _, config, _ = self.get_dataset(config_only=True)
+        self.config_dict["data"]["datasets"] = config
 
     @functools.cached_property
     def config_args(self):
@@ -205,6 +214,7 @@ def _update_and_add_testing_config(
                             "heads": 8,
                             "head_groups": 8,
                             "head_size": 32,
+                            # "cross_document_attention":False,
                         },
                         "mlp": {
                             "layer_1": {"weight": init_1},
@@ -231,27 +241,7 @@ def _update_and_add_testing_config(
             },
         },
         "batch": {"batch_size": 8, "sequence_length": 512},
-        "data": {
-            "datasets": {
-                "training": {
-                    "dataset": {"type": "memmap", "path": MODEL_DATASET_SHARD_PATH},
-                    "type": "slice",
-                    "end": 0.969,
-                },
-                "validation": {
-                    "dataset": {"type": "memmap", "path": MODEL_DATASET_SHARD_PATH},
-                    "type": "slice",
-                    "begin": 0.969,
-                    "end": 0.999,
-                },
-                "test": {
-                    "dataset": {"type": "memmap", "path": MODEL_DATASET_SHARD_PATH},
-                    "type": "slice",
-                    "begin": 0.999,
-                    "end": 1,
-                },
-            }
-        },
+        "data": {},
         "optimizer": {"learning_rate": {"base": 0.0001}},
     },
     megatron_args=[
@@ -697,18 +687,22 @@ def _update_and_add_testing_config(
     # Tests hybrid discrete Mamba 2.
     "llama",
     "llava",
+    model_type="multimodal",
     updates={
-        ("model", "type"): "multimodal",
         ("model", "base_model", "vision_encoder"): {
-            "patch_convolution": {},
+            "patch_convolution": {"patch_height": 4, "patch_width": 4},
             "encoder": copy.deepcopy(MODEL_CONFIGS["llama"].config_dict["model"]["base_model"]["decoder"]),
             "adapter": {"intermediate_size": 512},
             "hidden_size": 256,
         },
-        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "rotary", "type"): {"default_2d"},
-        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "causal"): False,
-        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "cross_document_attention"): False,
+        ("model", "base_model", "decoder", "num_blocks"): 1,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "rotary", "type"): "default_2d",
+        ("model", "base_model", "vision_encoder", "encoder", "num_blocks"): 1,
+        # TODO: ====== Make it work with these ======
+        # ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "causal"): False,
+        # ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "cross_document_attention"): False,
     },
+    get_dataset=get_multimodal_test_dataset,
     megatron_args=None,
     checkpoint_format=None,
     groups={
@@ -720,7 +714,7 @@ def _update_and_add_testing_config(
         # TODO: Implement
         ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
     },
-    compare_factor=2.0,
+    compare_factor=6.0,
     # Micro-sequence split and sequence-first not supported.
     skip_tests=("sdp", "ms"),
 )
diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py
index 7d706ebdb..5a24e5936 100644
--- a/tests/utils/run_test_script.py
+++ b/tests/utils/run_test_script.py
@@ -11,7 +11,6 @@
 
 from fast_llm.engine.distributed.config import DistributedConfig
 from fast_llm.utils import Assert
-from tests.utils.dataset import get_model_test_dataset
 from tests.utils.distributed_configs import DistributedTestingConfig
 from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig
 
@@ -72,7 +71,7 @@ def do_run_test_script_for_all_models(
     runnable_type: str = "train",
 ):
     Assert.leq(distributed_testing_config.num_gpus, DistributedConfig.default_world_size)
-    get_model_test_dataset()
+    model_testing_config.get_dataset()
     args = [
         "fast-llm",
         runnable_type,

From f9da7b336553702ae8b45e2a9800baeb6c4c8508 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 13 Nov 2025 16:41:08 -0500
Subject: [PATCH 08/16] Fix and test backup attention

---
 fast_llm/layers/attention/attention.py | 164 +++++++++++++------------
 fast_llm/layers/attention/config.py    |   3 +
 fast_llm/models/multimodal/model.py    |   2 +-
 tests/test_attention.py                |  51 +++++++-
 tests/utils/model_configs.py           |   5 +-
 5 files changed, 144 insertions(+), 81 deletions(-)

diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index 16f217b1e..3bbff4c13 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -179,11 +179,15 @@ def __init__(
             dense_dim,
         )
 
-    def _attn_fused(
-        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor
+    def _attn_backup(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kwargs: dict[str, typing.Any],
     ) -> torch.Tensor:
         # Backup attention (inefficient)
-        b, sq, hidden = query.shape
+        b, sq, _, _ = query.shape
         sk = key.size(1)
 
         if self._local_head_groups == 1:
@@ -191,14 +195,12 @@ def _attn_fused(
             key = key.transpose(-1, -2)
         else:
             query = (
-                query.unflatten(-1, (self._local_head_groups, self._local_heads_per_group, self._config.head_size))
+                query.unflatten(2, (self._local_head_groups, self._local_heads_per_group))
                 .transpose(1, 2)
                 .reshape(b * self._local_head_groups, sq * self._local_heads_per_group, self._config.head_size)
             )
-            key = key.unflatten(-1, (self._local_head_groups, self._config.head_size)).movedim(1, 3).flatten(0, 1)
-            value = (
-                value.unflatten(-1, (self._local_head_groups, self._config.head_size)).transpose(1, 2).flatten(0, 1)
-            )
+            key = key.movedim(1, 3).flatten(0, 1)
+            value = value.transpose(1, 2).flatten(0, 1)
 
         attn_weights = torch.empty(
             (b * self._local_head_groups, sq * self._local_heads_per_group, sk), device=query.device, dtype=query.dtype
@@ -212,7 +214,8 @@ def _attn_fused(
         ).view(b, self._local_head_groups, sq, self._local_heads_per_group, sk)
 
         attn_weights = attn_weights.to(torch.float32)
-        attn_weights = torch.where(mask, attn_weights, mask_value)
+        if (attention_mask := kwargs[AttentionKwargs.attention_mask]) is not None:
+            attn_weights = torch.where(attention_mask, attn_weights, kwargs[AttentionKwargs.attention_mask_value])
         attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1).to(query.dtype)
 
         attn_weights = torch.dropout(attn_weights, self._config.dropout, self.training)
@@ -229,6 +232,40 @@ def _attn_fused(
                 .flatten(2)
             )
 
+    def _attn_flash(
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kwargs: dict[str, typing.Any]
+    ) -> torch.Tensor:
+        assert _flash_available
+        window_size = (-1, -1) if self._config.window_size is None else (self._config.window_size - 1, 0)
+        if self._config.cross_document_attention:
+            return _flash_attn_func(
+                query,
+                key,
+                value,
+                window_size=window_size,
+                dropout_p=self._config.dropout if self.training else 0.0,
+                causal=self._config.causal,
+                softmax_scale=self._softmax_scale,
+            ).flatten(-2)
+        else:
+            return (
+                _flash_attn_varlen_func(
+                    query.view(-1, query.size(-2), query.size(-1)),
+                    key.view(-1, key.size(-2), key.size(-1)),
+                    value.view(-1, value.size(-2), value.size(-1)),
+                    kwargs[AttentionKwargs.cu_seqlens_q],
+                    kwargs[AttentionKwargs.cu_seqlens_k],
+                    kwargs[AttentionKwargs.max_seqlen_q],
+                    kwargs[AttentionKwargs.max_seqlen_k],
+                    dropout_p=self._config.dropout if self.training else 0.0,
+                    window_size=window_size,
+                    causal=self._config.causal,
+                    softmax_scale=self._softmax_scale,
+                )
+                .view(query.size())
+                .flatten(-2)
+            )
+
     def _query_key_value_forward(
         self, input_: torch.Tensor, sequence_first: bool
     ) -> tuple[torch.Tensor, torch.Tensor, dict[str, typing.Any]]:
@@ -332,47 +369,12 @@ def _forward(
             self._debug(key, "key_rotary_input", self._kv_dims, kwargs)
         query, key = self._rotary(query, key, kwargs)
 
-        window_size = (-1, -1) if self._config.window_size is None else (self._config.window_size - 1, 0)
         with set_generator(self._distributed.tp_generator):
             if self._implementation == AttentionImplementation.flash:
-                assert _flash_available
-                if self._config.cross_document_attention:
-                    input_ = _flash_attn_func(
-                        query,
-                        key,
-                        value,
-                        window_size=window_size,
-                        dropout_p=self._config.dropout if self.training else 0.0,
-                        causal=self._config.causal,
-                        softmax_scale=self._softmax_scale,
-                    ).flatten(-2)
-                else:
-                    input_ = (
-                        _flash_attn_varlen_func(
-                            query.view(-1, query.size(-2), query.size(-1)),
-                            key.view(-1, key.size(-2), key.size(-1)),
-                            value.view(-1, value.size(-2), value.size(-1)),
-                            cu_seqlens_q=kwargs.get(AttentionKwargs.cu_seqlens_q),
-                            cu_seqlens_k=kwargs.get(AttentionKwargs.cu_seqlens_k),
-                            max_seqlen_q=kwargs.get(AttentionKwargs.max_seqlen_q),
-                            max_seqlen_k=kwargs.get(AttentionKwargs.max_seqlen_k),
-                            dropout_p=self._config.dropout if self.training else 0.0,
-                            window_size=window_size,
-                            causal=self._config.causal,
-                            softmax_scale=self._softmax_scale,
-                        )
-                        .view(query.size())
-                        .flatten(-2)
-                    )
+                input_ = self._attn_flash(query, key, value, kwargs)
             elif self._implementation == AttentionImplementation.backup:
                 # TODO: Avoid the flattens.
-                input_ = self._attn_fused(
-                    query.flatten(-2),
-                    key.flatten(-2),
-                    value.flatten(-2),
-                    kwargs[AttentionKwargs.attention_mask],
-                    kwargs[AttentionKwargs.attention_mask_value],
-                )
+                input_ = self._attn_backup(query, key, value, kwargs)
             else:
                 raise NotImplementedError(self._implementation)
 
@@ -452,44 +454,54 @@ def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
 
     def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> None:
         device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
-        if (
-            sequence_length := kwargs[AttentionKwargs.sequence_length]
-        ) > self._backup_attention_tensor_cache_max_sequence_length:
-            # Create tensor cache.
-            self._backup_attention_tensor_cache_max_sequence_length = sequence_length
-
-            self._backup_attention_mask = torch.ones(
-                (sequence_length, sequence_length),
-                dtype=torch.bool,
-                device=device,
-            ).tril_()
-
-            if self._config.window_size is not None:
-                self._backup_attention_mask.triu_(-self._config.window_size + 1)
-            self._backup_attention_mask_value = torch.full(
-                [],
-                torch.finfo(self._distributed_config.compute_dtype.torch).min,
-                dtype=self._distributed_config.compute_dtype.torch,
-                device=device,
-            )
         sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
         sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
-        kwargs[AttentionKwargs.attention_mask] = self._backup_attention_mask[
-            None, None, sequence_k - sequence_q : sequence_k, None, :sequence_k
-        ]
+        if self._config.causal:
+            if (
+                sequence_length := kwargs[AttentionKwargs.sequence_length]
+            ) > self._backup_attention_tensor_cache_max_sequence_length:
+                # Create tensor cache.
+                self._backup_attention_tensor_cache_max_sequence_length = sequence_length
+
+                self._backup_attention_mask = torch.ones(
+                    (sequence_length, sequence_length),
+                    dtype=torch.bool,
+                    device=device,
+                ).tril_()
+
+                if self._config.window_size is not None:
+                    self._backup_attention_mask.triu_(-self._config.window_size + 1)
+            attention_mask = self._backup_attention_mask[
+                None, None, sequence_k - sequence_q : sequence_k, None, :sequence_k
+            ]
+        else:
+            attention_mask = None
         if not self._config.cross_document_attention:
             seq_ids = torch.stack(
                 [
-                    torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)])
+                    torch.cat([torch.full((x,), i, device=device) for i, x in enumerate(sample_lens)])
                     for sample_lens in kwargs[AttentionKwargs.sequence_lengths]
                 ]
             )
-            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(device)
-            kwargs[AttentionKwargs.attention_mask] = (
-                kwargs[AttentionKwargs.attention_mask]
-                & document_mask[:, None, sequence_k - sequence_q : sequence_k, None, :sequence_k]
-            )
-        kwargs[AttentionKwargs.attention_mask_value] = self._backup_attention_mask_value
+            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None])[
+                :, None, sequence_k - sequence_q : sequence_k, None, :sequence_k
+            ]
+            if attention_mask is None:
+                attention_mask = document_mask
+            else:
+                attention_mask = attention_mask & document_mask
+
+        kwargs[AttentionKwargs.attention_mask] = attention_mask
+
+        if attention_mask is not None:
+            if not hasattr(self, "_backup_attention_mask_value"):
+                self._backup_attention_mask_value = torch.full(
+                    [],
+                    torch.finfo(self._distributed_config.compute_dtype.torch).min,
+                    dtype=self._distributed_config.compute_dtype.torch,
+                    device=device,
+                )
+            kwargs[AttentionKwargs.attention_mask_value] = self._backup_attention_mask_value
 
     def _preprocess_for_flash_attention(self, kwargs: dict[str, typing.Any]) -> None:
         """
diff --git a/fast_llm/layers/attention/config.py b/fast_llm/layers/attention/config.py
index 206fa6e6f..d65c924e4 100644
--- a/fast_llm/layers/attention/config.py
+++ b/fast_llm/layers/attention/config.py
@@ -132,6 +132,9 @@ def _validate(self) -> None:
 
         Assert.multiple(self.heads, self.head_groups)
 
+        if not self.causal:
+            assert self.window_size is None, "Non-causal windowed attention is not supported."
+
     @property
     def layer_class(self) -> "type[Attention]":
         from fast_llm.layers.attention.attention import Attention
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
index 21d7eac49..ffd1554d5 100644
--- a/fast_llm/models/multimodal/model.py
+++ b/fast_llm/models/multimodal/model.py
@@ -109,7 +109,7 @@ def preprocess_batch(
         kwargs[self._vision_encoder_namespace] = {
             VisionKwargs.sequence_first: sequence_first,
             VisionKwargs.patch_positions: positions,
-            VisionKwargs.sequence_lengths: [cropped_image_patches.lengths],
+            VisionKwargs.sequence_lengths: [cropped_image_patches.lengths + [pad_size]],
             VisionKwargs.sequence_length: sequence_length,
             VisionKwargs.sequence_k_dim: sequence_dim,
             VisionKwargs.sequence_q_dim: sequence_dim,
diff --git a/tests/test_attention.py b/tests/test_attention.py
index 69af39503..533000f01 100644
--- a/tests/test_attention.py
+++ b/tests/test_attention.py
@@ -1,3 +1,4 @@
+import pytest
 import torch
 
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
@@ -6,10 +7,13 @@
 from fast_llm.layers.attention.config import AttentionConfig, AttentionImplementation, AttentionKwargs
 from fast_llm.layers.block.config import BlockDimNames
 from fast_llm.utils import Assert
+from tests.utils.utils import requires_cuda
 
 
+# TODO: ====== micro-sequence ======
+@pytest.mark.skip
 def test_varlen_preprocessing():
-    sequence_lengths = [torch.tensor([8, 13, 4, 11], dtype=torch.int32), torch.tensor([11, 16, 9], dtype=torch.int32)]
+    sequence_lengths = [[8, 13, 4, 11], [11, 16, 9]]
     # First micro-sequence:
     # [0...7,0...3] + [0...10,0] -> [0,8,12,23,24]
     # Second micro-sequence:
@@ -48,3 +52,48 @@ def test_varlen_preprocessing():
         attention.preprocess(kwargs)
         Assert.all_equal(kwargs[AttentionKwargs.cu_seqlens_q], cumulative_sequences_q[micro_seq_idx])
         Assert.all_equal(kwargs[AttentionKwargs.cu_seqlens_k], cumulative_sequences_k[micro_seq_idx])
+
+
+@requires_cuda
+@pytest.mark.parametrize("cross_document_attention", (True, False))
+@pytest.mark.parametrize(("causal", "window_size"), ((True, None), (True, 50), (False, None)))
+@pytest.mark.parametrize("padding", (0, 10))
+def test_attention_implementations(cross_document_attention: bool, causal: bool, window_size: int | None):
+    """
+    Check that the flash and backup attention implementation give the same result.
+    """
+    attention: Attention = AttentionConfig(
+        head_size=32,
+        heads=4,
+        head_groups=2,
+        window_size=window_size,
+        cross_document_attention=cross_document_attention,
+        causal=causal,
+    ).get_layer(
+        DistributedConfig(compute_dtype="bfloat16"),
+        TensorDim("hidden_size", 256),
+        lr_scale=None,
+        peft=None,
+    )
+    query = torch.empty(4, 100, 4, 32, dtype=torch.bfloat16, device="cuda").normal_()
+    key = torch.empty(4, 100, 2, 32, dtype=torch.bfloat16, device="cuda").normal_()
+    value = torch.empty(4, 100, 2, 32, dtype=torch.bfloat16, device="cuda").normal_()
+    kwargs = {
+        AttentionKwargs.device: torch.device("cuda"),
+        AttentionKwargs.sequence_length: 100,
+        AttentionKwargs.sequence_lengths: [
+            [20, 32, 10, 11, 9, 18],
+            [100],
+            [2, 8, 22, 7, 6, 5, 1, 10, 4, 11, 3, 8, 4, 9],
+            [5 for _ in range(20)],
+        ],
+        AttentionKwargs.sequence_q_dim: TensorDim("sequence_q", 100),
+        AttentionKwargs.sequence_k_dim: TensorDim("sequence_k", 100),
+    }
+    attention._preprocess_for_backup_attention(kwargs)
+    attention._preprocess_for_flash_attention(kwargs)
+
+    out_backup = attention._attn_backup(query, key, value, kwargs)
+    out_flash = attention._attn_flash(query, key, value, kwargs)
+
+    Assert.rms_close(out_backup, out_flash, 2e-3)
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index f3fdca77d..0f14d6b81 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -698,9 +698,8 @@ def _update_and_add_testing_config(
         ("model", "base_model", "decoder", "num_blocks"): 1,
         ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "rotary", "type"): "default_2d",
         ("model", "base_model", "vision_encoder", "encoder", "num_blocks"): 1,
-        # TODO: ====== Make it work with these ======
-        # ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "causal"): False,
-        # ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "cross_document_attention"): False,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "causal"): False,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "cross_document_attention"): False,
     },
     get_dataset=get_multimodal_test_dataset,
     megatron_args=None,

From 81c29d0a7a47c13885d31feea9dfadf014b848ce Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 13 Nov 2025 21:09:50 -0500
Subject: [PATCH 09/16] fixes

---
 fast_llm/core/distributed.py                |  16 +++
 fast_llm/engine/distributed/config.py       |  27 +++--
 fast_llm/engine/distributed/distributed.py  |  11 +-
 fast_llm/layers/attention/attention.py      |   8 +-
 fast_llm/layers/language_model/embedding.py |  10 +-
 fast_llm/layers/vision/config.py            |   5 +
 fast_llm/layers/vision/patch_convolution.py |  34 +++---
 fast_llm/models/multimodal/model.py         | 122 +++++++++++++++++---
 tests/utils/model_configs.py                |   3 +-
 9 files changed, 178 insertions(+), 58 deletions(-)

diff --git a/fast_llm/core/distributed.py b/fast_llm/core/distributed.py
index 16b7c3921..c03ee2d1c 100644
--- a/fast_llm/core/distributed.py
+++ b/fast_llm/core/distributed.py
@@ -91,6 +91,22 @@ def allreduce_scalar(
         return value
 
 
+def all_gather_scalar(
+    value: float | int,
+    dtype: torch.dtype = torch.float64,
+    group: torch.distributed.ProcessGroup | None = None,
+    timeout: float | None = None,
+):
+    if group:
+        value = torch.full([1], value, dtype=dtype, device=torch.cuda.current_device())
+        add_ephemeral_timeout(group, timeout)
+        output_tensor = value.new_empty((group.size(),))
+        torch.distributed.all_gather_into_tensor(output_tensor, value, group=group)
+        return output_tensor.tolist()
+    else:
+        return value
+
+
 def broadcast_scalar(
     value: float | int,
     dtype: torch.dtype = torch.float64,
diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py
index 602c44a4e..f4dab5a26 100644
--- a/fast_llm/engine/distributed/config.py
+++ b/fast_llm/engine/distributed/config.py
@@ -97,6 +97,7 @@ class DistributedDimNames:
     sequence_data = "sequence_data"
     batch_data = "batch_data"
     tensor_and_sequence_data = "tensor_and_sequence_data"
+    tensor_and_data = "tensor_and_data"
 
 
 @config_class()
@@ -255,8 +256,6 @@ def _validate(self) -> None:
         Assert.multiple(self.local_world_size, self.tensor_parallel)
 
         if self.pipeline_first:
-            # Case is useless and would cause too many complications.
-            Assert.eq(self.sequence_data_parallel, 1)
             # Smaller models can be more demanding on pipeline parallel.
             self.data_rank = (self.rank // self.tensor_parallel) // self.pipeline_parallel
             self.pipeline_rank = (self.rank // self.tensor_parallel) % self.pipeline_parallel
@@ -334,14 +333,24 @@ def _validate(self) -> None:
                     ),
                 )
             )
-            self._add_distributed_dim(
-                DistributedDim(
-                    name=DistributedDimNames.tensor_and_sequence_data,
-                    size=self.sequence_data_parallel * self.tensor_parallel,
-                    rank=self.tensor_rank + self.sequence_data_rank * self.tensor_parallel,
-                    global_ranks=self._get_global_ranks(self.sequence_data_parallel * self.tensor_parallel, 1),
+            # Global ranks wrong with pipeline first, so we hide the dims as a safety check.
+            if not self.pipeline_first:
+                self._add_distributed_dim(
+                    DistributedDim(
+                        name=DistributedDimNames.tensor_and_sequence_data,
+                        size=self.sequence_data_parallel * self.tensor_parallel,
+                        rank=self.tensor_rank + self.sequence_data_rank * self.tensor_parallel,
+                        global_ranks=self._get_global_ranks(self.sequence_data_parallel * self.tensor_parallel, 1),
+                    )
+                )
+                self._add_distributed_dim(
+                    DistributedDim(
+                        name=DistributedDimNames.tensor_and_data,
+                        size=self.data_parallel * self.tensor_parallel,
+                        rank=self.tensor_rank + self.data_rank * self.tensor_parallel,
+                        global_ranks=self._get_global_ranks(self.data_parallel * self.tensor_parallel, 1),
+                    )
                 )
-            )
 
         super()._validate()
 
diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py
index 2e2f9d401..302cfcdce 100644
--- a/fast_llm/engine/distributed/distributed.py
+++ b/fast_llm/engine/distributed/distributed.py
@@ -171,9 +171,14 @@ def __init__(self, config: DistributedConfig, use_cpu: bool = False):
         self.tensor_group = self.add_group(self._config.distributed_dims[DistributedDimNames.tensor])
         self.sequence_data_group = self.add_group(self._config.distributed_dims[DistributedDimNames.sequence_data])
         self.batch_data_group = self.add_group(self._config.distributed_dims[DistributedDimNames.batch_data])
-        self.tensor_and_sequence_data_group = self.add_group(
-            self._config.distributed_dims[DistributedDimNames.tensor_and_sequence_data]
-        )
+        # Global ranks wrong with pipeline first, so we hide the dims as a safety check.
+        if not self._config.pipeline_first:
+            self.tensor_and_sequence_data_group = self.add_group(
+                self._config.distributed_dims[DistributedDimNames.tensor_and_sequence_data]
+            )
+            self.tensor_and_data_group = self.add_group(
+                self._config.distributed_dims[DistributedDimNames.tensor_and_data]
+            )
 
         self._config.log_first_rank(f"Setting random seeds...")
 
diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index 3bbff4c13..d0e37e472 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -454,8 +454,8 @@ def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
 
     def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> None:
         device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
-        sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
-        sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
+        sequence_k = kwargs[AttentionKwargs.sequence_k_dim].global_size
+        sequence_q = kwargs[AttentionKwargs.sequence_q_dim].global_size
         if self._config.causal:
             if (
                 sequence_length := kwargs[AttentionKwargs.sequence_length]
@@ -518,7 +518,9 @@ def _preprocess_for_flash_attention(self, kwargs: dict[str, typing.Any]) -> None
         device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
 
         # TODO: ====== Fix (need to know how much first sequence was cropped) ======
-        Assert.eq(kwargs[AttentionKwargs.sequence_k_dim].size, kwargs[AttentionKwargs.sequence_q_dim].size)
+        Assert.eq(
+            kwargs[AttentionKwargs.sequence_k_dim].global_size, kwargs[AttentionKwargs.sequence_q_dim].global_size
+        )
 
         # TODO: Calculate these in batch preprocessing?
         sequence_lengths_q = torch.tensor(
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index d95ec6dfd..0337c7b9d 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -102,7 +102,7 @@ def _forward(
                 if self._sequence_parallel:
                     input_ = gather(input_, group=group, dim=0)
                 # Out-of-place equivalent of `embeddings[embedding_map] += input_`
-                embeddings = embeddings.index_put(embedding_map, input_, accumulate=True)
+                embeddings = embeddings.index_put(embedding_map, input_[: embedding_map[0].size(0)], accumulate=True)
 
             if self._sequence_parallel:
                 embeddings = split(embeddings, group=group, dim=0)
@@ -127,10 +127,12 @@ def _forward(
                     #  TODO:: Filter and shift embedding map instead? (needs cuda sync)
                     input_ = gather(input_, group=group, dim=0)
                     embeddings_ = embeddings.new_zeros(embeddings.shape[0] * group.size(), *embeddings.shape[1:])
-                    embeddings_.index_put(embedding_map, input_, accumulate=True)
+                    embeddings_.index_put(embedding_map, input_[: embedding_map[0].size(0)], accumulate=True)
                     embeddings = embeddings + split(embeddings_, group=group, dim=0)
                 else:
-                    embeddings = embeddings.index_put(embedding_map, input_, accumulate=True)
+                    embeddings = embeddings.index_put(
+                        embedding_map, input_[: embedding_map[0].size(0)], accumulate=True
+                    )
 
         with set_generator(
             self._distributed.tp_generator if self._sequence_parallel else self._distributed.pp_generator
@@ -161,7 +163,7 @@ def forward(
             # TODO: Support pipeline-parallel.
             token_ids = kwargs.get(LanguageModelKwargs.token_ids)
             # Drop the placeholder batch dimension, remove patch padding.
-            input_ = input_.squeeze(int(kwargs[LanguageModelKwargs.sequence_first]))[: embedding_map[0].size(0)]
+            input_ = input_.squeeze(int(kwargs[LanguageModelKwargs.sequence_first]))
 
         return self._forward(
             input_,
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
index 1aa7231c1..fb05a520c 100644
--- a/fast_llm/layers/vision/config.py
+++ b/fast_llm/layers/vision/config.py
@@ -78,6 +78,11 @@ class PatchConvolutionConfig(BlockConfig):
         desc="Width of image patches, in pixels.",
         hint=FieldHint.core,
     )
+    full_precision_residual: bool = Field(
+        default=False,
+        desc="Store the residuals for the model in full precision (`optimization_dtype`).",
+        hint=FieldHint.stability,
+    )
 
     @functools.cached_property
     def input_channels(self):
diff --git a/fast_llm/layers/vision/patch_convolution.py b/fast_llm/layers/vision/patch_convolution.py
index 6f1231d1e..e744044c7 100644
--- a/fast_llm/layers/vision/patch_convolution.py
+++ b/fast_llm/layers/vision/patch_convolution.py
@@ -3,13 +3,12 @@
 import torch
 
 from fast_llm.core.ops import split
-from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
 from fast_llm.engine.distributed.config import DistributedConfig, DistributedDimNames
 from fast_llm.layers.attention.config import AttentionKwargs
 from fast_llm.layers.block.block import Block
-from fast_llm.layers.block.config import BlockKwargs
 from fast_llm.layers.common.peft.config import PeftConfig
-from fast_llm.layers.vision.config import PatchConvolutionConfig
+from fast_llm.layers.vision.config import PatchConvolutionConfig, VisionKwargs
 from fast_llm.tensor import TensorMeta
 
 
@@ -33,6 +32,11 @@ def __init__(
             lr_scale=lr_scale,
             peft=peft,
         )
+        self._residual_dtype = (
+            self._distributed_config.optimization_dtype
+            if self._config.full_precision_residual
+            else self._distributed_config.compute_dtype
+        ).torch
         self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
 
         self.convolution = self._config.convolution.get_layer(
@@ -56,27 +60,15 @@ def forward(
     ) -> torch.Tensor:
         if isinstance(input_, TensorMeta):
             return TensorMeta.from_dims(
-                (
-                    (
-                        input_.dims[0],
-                        scalar_dim,
-                        self._hidden_dim,
-                    )
-                    if kwargs[BlockKwargs.sequence_first]
-                    else (
-                        scalar_dim,
-                        input_.dims[0],
-                        self._hidden_dim,
-                    )
-                ),
-                tensor_name="patch convolution output",
-                dtype=input_.dtype,
+                kwargs[VisionKwargs.hidden_dims],
+                tensor_name="Patch convolution output",
+                dtype=self._residual_dtype,
             )
+        if self._sequence_parallel:
+            input_ = split(input_, group=self._parallel_dim.group, dim=0)
         patch_embeddings = (
             self.normalization(self.convolution(input_).flatten(1))
             .view(-1, self._hidden_dim.size)
             .unsqueeze(int(kwargs[AttentionKwargs.sequence_first]))
         )
-        if self._sequence_parallel:
-            patch_embeddings = split(patch_embeddings, group=self._parallel_dim.group, dim=0)
-        return patch_embeddings
+        return patch_embeddings.to(self._residual_dtype)
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
index ffd1554d5..d1b5a8763 100644
--- a/fast_llm/models/multimodal/model.py
+++ b/fast_llm/models/multimodal/model.py
@@ -3,11 +3,13 @@
 
 import torch
 
+from fast_llm.core.distributed import all_gather_scalar
 from fast_llm.data.sample.language_model import LanguageModelBatch
 from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
-from fast_llm.engine.distributed.config import PhaseType
+from fast_llm.engine.distributed.config import DistributedDim, DistributedDimNames, PhaseType
 from fast_llm.engine.inference.runner import InferenceRunner
 from fast_llm.layers.attention.config import AttentionKwargs
+from fast_llm.layers.block.config import BlockDimNames
 from fast_llm.layers.language_model.config import LanguageModelKwargs
 from fast_llm.layers.vision.config import VisionKwargs
 from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
@@ -19,6 +21,63 @@
 logger = logging.getLogger(__name__)
 
 
+class PatchSequenceTensorDim(TensorDim):
+    """
+    A custom `TensorDim` class to handle the combined batch/sequence dimension in image patches.
+
+    A simple gather `TensorDim.local_to_global` yields inconsistent results between distributed configuration,
+    (because of the padding of image patches) which makes direct comparison in tests impossible.
+    This class solves the problem removing the padding in the tensor returned by `local_to_global`,
+    allowing for consistent results.
+    Note that `local_unpadded_size` must be set manually before any call to `local_to_global`.
+    """
+
+    local_unpadded_size: int
+
+    def __init__(self, name: str, global_size: int, parallel_dim: DistributedDim, batch_parallel_dim: DistributedDim):
+        super().__init__(name, global_size * batch_parallel_dim.size, parallel_dim, variable_size=True)
+        self._batch_parallel_dim = batch_parallel_dim
+
+    @property
+    def is_parallel(self) -> bool:
+        # Ensure `local_to_global` is called in non-parallel setting.
+        return True
+
+    def replace_parallel_dim(self, distributed_dim: DistributedDim) -> typing.Self:
+        raise NotImplementedError()
+
+    def local_to_global(self, tensor: "torch.Tensor", dim: int = 0) -> "torch.Tensor":
+        assert hasattr(self, "local_unpadded_size")
+        batch_parallel_group = self._batch_parallel_dim.group
+        global_padded_tensor = super().local_to_global(tensor, dim)
+
+        if batch_parallel_group is None:
+            return global_padded_tensor[*(slice(None) for _ in range(dim)), : self.local_unpadded_size]
+        else:
+            unpadded_sequence_lengths = all_gather_scalar(self.local_unpadded_size, torch.int32, batch_parallel_group)
+            return torch.cat(
+                [
+                    tensor[*(slice(None) for _ in range(dim)), :unpadded_sequence_length]
+                    for tensor, unpadded_sequence_length in zip(
+                        global_padded_tensor.chunk(batch_parallel_group.size(), dim=dim),
+                        unpadded_sequence_lengths,
+                        strict=True,
+                    )
+                ],
+                dim=dim,
+            )
+
+    def local_to_global_partial(
+        self, tensor: "torch.Tensor", dim: int = 0, fill_value: float | int = -1
+    ) -> "torch.Tensor":
+        # Not needed.
+        raise NotImplementedError()
+
+    def global_to_local(self, tensor: "torch.Tensor", dim: int = 0, expand: bool = False) -> "torch.Tensor":
+        # Not needed.
+        raise NotImplementedError()
+
+
 class MultiModalBaseModel[ConfigType: MultiModalBaseModelConfig](
     GPTBaseModel[ConfigType], VisionMultiModalModel[ConfigType]
 ):
@@ -35,27 +94,55 @@ def preprocess_meta(
         for tokens, kwargs in super().preprocess_meta(batch_meta, phase):
             kwargs[LanguageModelKwargs.token_ids] = tokens
             kwargs[LanguageModelKwargs.mask_inputs] = True
+            # TODO: What about sequence data?
+            batch_data_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.batch_data)
+
+            micro_sequence_length = tokens.global_shape.numel()
+
+            batch_and_sequence_q_dim = PatchSequenceTensorDim(
+                BlockDimNames.sequence_q,
+                micro_sequence_length,
+                self._distributed_config.get_distributed_dim(DistributedDimNames.data),
+                batch_data_dim,
+            )
+            hidden_batch_and_sequence_q_dim = (
+                PatchSequenceTensorDim(
+                    BlockDimNames.sequence_q_tp,
+                    micro_sequence_length,
+                    self._distributed_config.get_distributed_dim(DistributedDimNames.tensor_and_data),
+                    batch_data_dim,
+                )
+                if self._distributed_config.sequence_tensor_parallel
+                else batch_and_sequence_q_dim
+            )
+            # These are used by the model (preprocessing) and shouldn't see the batch-parallel dim.
+            sequence_q_dim = TensorDim(
+                BlockDimNames.sequence_q,
+                micro_sequence_length,
+                self._distributed_config.get_distributed_dim(DistributedDimNames.sequence_data),
+            )
+            sequence_k_dim = TensorDim(BlockDimNames.sequence_k, micro_sequence_length)
+
             image_patches = TensorMeta.from_dims(
                 (
                     # We combine the batch and sequence dims to allow for variable sequence lengths.
                     # Gives the same result, assuming we disable cross-image attention (TODO: Enforce)
-                    sequence_dim := TensorDim("image_sequence", tokens.numel(), variable_size=True),
+                    batch_and_sequence_q_dim,
                     # TODO: Relate to tensor dims in patch convolution.
                     TensorDim("input_channels", self._config.vision_encoder.patch_convolution.input_channels),
                     TensorDim("patch_height", self._config.vision_encoder.patch_convolution.patch_height),
                     TensorDim("patch_width", self._config.vision_encoder.patch_convolution.patch_width),
                 )
             )
-
             hidden_dims = (
-                (sequence_dim, scalar_dim, self.vision_encoder._hidden_dim)
+                (hidden_batch_and_sequence_q_dim, scalar_dim, self.vision_encoder._hidden_dim)
                 if (sequence_first := kwargs[LanguageModelKwargs.sequence_first])
-                else (scalar_dim, sequence_dim, self.vision_encoder._hidden_dim)
+                else (scalar_dim, hidden_batch_and_sequence_q_dim, self.vision_encoder._hidden_dim)
             )
             kwargs[self._vision_encoder_namespace] = {
                 VisionKwargs.sequence_first: sequence_first,
-                VisionKwargs.sequence_k_dim: sequence_dim,
-                VisionKwargs.sequence_q_dim: sequence_dim,
+                VisionKwargs.sequence_k_dim: sequence_k_dim,
+                VisionKwargs.sequence_q_dim: sequence_q_dim,
                 VisionKwargs.hidden_dims: hidden_dims,
             }
 
@@ -88,7 +175,6 @@ def preprocess_batch(
         cropped_image_patches = batch.image_patches.crop(tokens_begin, tokens_end)
 
         sequence_length = tokens.shape[:2].numel()
-        sequence_dim = TensorDim("image_sequence", sequence_length)
         pad_size = sequence_length - cropped_image_patches.patches.size(0)
 
         patches = cropped_image_patches.patches.to(self._distributed.config.compute_dtype.torch)
@@ -101,21 +187,23 @@ def preprocess_batch(
             ]
         )
 
-        hidden_dims = (
-            (sequence_dim, scalar_dim, self.vision_encoder._hidden_dim)
-            if (sequence_first := kwargs[LanguageModelKwargs.sequence_first])
-            else (scalar_dim, sequence_dim, self.vision_encoder._hidden_dim)
-        )
         kwargs[self._vision_encoder_namespace] = {
-            VisionKwargs.sequence_first: sequence_first,
+            **kwargs[self._vision_encoder_namespace],
             VisionKwargs.patch_positions: positions,
             VisionKwargs.sequence_lengths: [cropped_image_patches.lengths + [pad_size]],
             VisionKwargs.sequence_length: sequence_length,
-            VisionKwargs.sequence_k_dim: sequence_dim,
-            VisionKwargs.sequence_q_dim: sequence_dim,
-            VisionKwargs.hidden_dims: hidden_dims,
             VisionKwargs.device: self._distributed.device,
         }
+        # We need to modify `local_unpadded_size` directly in `preprocessed_meta` since it's the one used by the engine.
+        # Unsafe, but only needed for testing.
+        # TODO: Doesn't work with gradient accumulation (only sees the last value).
+        hidden_batch_and_sequence_q_dim = kwargs[self._vision_encoder_namespace][VisionKwargs.hidden_dims][
+            0 if kwargs[self._vision_encoder_namespace][VisionKwargs.sequence_first] else 1
+        ]
+        print(kwargs[self._vision_encoder_namespace][VisionKwargs.hidden_dims])
+        print(hidden_batch_and_sequence_q_dim)
+        assert isinstance(hidden_batch_and_sequence_q_dim, PatchSequenceTensorDim)
+        hidden_batch_and_sequence_q_dim.local_unpadded_size = cropped_image_patches.patches.size(0)
 
         kwargs[LanguageModelKwargs.embedding_map] = (
             (cropped_image_patches.token_map, cropped_image_patches.sample_map)
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 0f14d6b81..78c4d78b4 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -715,7 +715,8 @@ def _update_and_add_testing_config(
     },
     compare_factor=6.0,
     # Micro-sequence split and sequence-first not supported.
-    skip_tests=("sdp", "ms"),
+    # TODO: Gradient accumulation works but comparison is broken.
+    skip_tests=("sdp", "ms", "bf4", "df"),
 )
 
 

From a61121a0519425e90f8d6802040607cf3588c9e1 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 19 Nov 2025 10:20:49 -0500
Subject: [PATCH 10/16] fix

---
 fast_llm/data/sample/patch.py          |  2 +-
 fast_llm/layers/attention/attention.py |  4 ++--
 tests/test_attention.py                |  1 -
 tests/test_config.py                   | 25 +++++++++++++++++--------
 4 files changed, 20 insertions(+), 12 deletions(-)

diff --git a/fast_llm/data/sample/patch.py b/fast_llm/data/sample/patch.py
index dd7c98509..9d27d37cd 100644
--- a/fast_llm/data/sample/patch.py
+++ b/fast_llm/data/sample/patch.py
@@ -86,7 +86,7 @@ def get_padding(self, size: int) -> typing.Self:
         return PatchSample(
             self.patches.new_empty((0, *self.patches.shape[1:])),
             self.token_map.new_empty(0),
-            self.positions.new_empty(0),
+            self.positions.new_empty([0, self.patches.ndim - 2]),
             size,
             [],
         )
diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index d0e37e472..cf704a309 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -454,8 +454,8 @@ def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
 
     def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> None:
         device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
-        sequence_k = kwargs[AttentionKwargs.sequence_k_dim].global_size
-        sequence_q = kwargs[AttentionKwargs.sequence_q_dim].global_size
+        sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
+        sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
         if self._config.causal:
             if (
                 sequence_length := kwargs[AttentionKwargs.sequence_length]
diff --git a/tests/test_attention.py b/tests/test_attention.py
index 533000f01..f1409b95c 100644
--- a/tests/test_attention.py
+++ b/tests/test_attention.py
@@ -57,7 +57,6 @@ def test_varlen_preprocessing():
 @requires_cuda
 @pytest.mark.parametrize("cross_document_attention", (True, False))
 @pytest.mark.parametrize(("causal", "window_size"), ((True, None), (True, 50), (False, None)))
-@pytest.mark.parametrize("padding", (0, 10))
 def test_attention_implementations(cross_document_attention: bool, causal: bool, window_size: int | None):
     """
     Check that the flash and backup attention implementation give the same result.
diff --git a/tests/test_config.py b/tests/test_config.py
index 9a1f542a0..4020b6fbc 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -236,13 +236,23 @@ def test_distributed_global_ranks(bdp: int, sdp: int, tp: int, pp: int, pipeline
             tp,
             rank,
         )
-        _check_dim(
-            tp_sdp_dim := config.get_distributed_dim(DistributedDimNames.tensor_and_sequence_data),
-            DistributedDimNames.tensor_and_sequence_data,
-            dp_rank % sdp * tp + tp_rank,
-            tp * sdp,
-            rank,
-        )
+        if not pipeline_first:
+            _check_dim(
+                tp_sdp_dim := config.get_distributed_dim(DistributedDimNames.tensor_and_sequence_data),
+                DistributedDimNames.tensor_and_sequence_data,
+                dp_rank % sdp * tp + tp_rank,
+                tp * sdp,
+                rank,
+            )
+            _check_dim(
+                tp_dp_dim := config.get_distributed_dim(DistributedDimNames.tensor_and_data),
+                DistributedDimNames.tensor_and_data,
+                dp_rank * tp + tp_rank,
+                tp * dp,
+                rank,
+            )
+            all_global_ranks["tp_sdp"].add(tuple(tp_sdp_dim.global_ranks))
+            all_global_ranks["tp_dp"].add(tuple(tp_dp_dim.global_ranks))
         _check_dim(
             sdp_dim := config.get_distributed_dim(DistributedDimNames.sequence_data),
             DistributedDimNames.sequence_data,
@@ -273,7 +283,6 @@ def test_distributed_global_ranks(bdp: int, sdp: int, tp: int, pp: int, pipeline
         )
         all_global_ranks["world"].add(tuple(world_dim.global_ranks))
         all_global_ranks["tp"].add(tuple(tp_dim.global_ranks))
-        all_global_ranks["tp_sdp"].add(tuple(tp_sdp_dim.global_ranks))
         all_global_ranks["sdp"].add(tuple(sdp_dim.global_ranks))
         all_global_ranks["bdp"].add(tuple(bdp_dim.global_ranks))
         all_global_ranks["dp"].add(tuple(dp_dim.global_ranks))

From c2786a2192e90b2fa2c90858df1f2b2233a02a57 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 19 Nov 2025 13:44:34 -0500
Subject: [PATCH 11/16] fix

---
 fast_llm/data/sample/token.py | 6 +++---
 fast_llm/models/gpt/model.py  | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/fast_llm/data/sample/token.py b/fast_llm/data/sample/token.py
index 0944f5689..9fedf12b5 100644
--- a/fast_llm/data/sample/token.py
+++ b/fast_llm/data/sample/token.py
@@ -21,16 +21,16 @@ def crop_lengths(lengths: list[int], begin: int, end: int) -> list[int]:
         # Shortcut for the frequent case of a single document.
         return [end - begin]
     begin_ = 0
-    lengths = []
+    lengths_ = []
     for length in lengths:
         end_ = begin_ + length
         cropped_length = min(end_, end) - max(begin_, begin)
         if cropped_length > 0:
-            lengths.append(cropped_length)
+            lengths_.append(cropped_length)
         if end_ > end:
             break
         begin_ = end_
-    return lengths
+    return lengths_
 
 
 class TokenSample(Sample):
diff --git a/fast_llm/models/gpt/model.py b/fast_llm/models/gpt/model.py
index 9e5533b84..2c1947afe 100644
--- a/fast_llm/models/gpt/model.py
+++ b/fast_llm/models/gpt/model.py
@@ -198,7 +198,7 @@ def preprocess_batch(
                 **kwargs_meta,
                 AttentionKwargs.past_key_values: pasts,
                 AttentionKwargs.presents: presents,
-                AttentionKwargs.sequence_lengths: batch.tokens.lengths,
+                AttentionKwargs.sequence_lengths: cropped_tokens.lengths,
                 AttentionKwargs.device: self._distributed.device,
                 **reference_logits[i],
             }

From b0fbaf520d6c1c177a52f8c73fc802c07ac7bfe3 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 19 Nov 2025 15:18:49 -0500
Subject: [PATCH 12/16] fix

---
 fast_llm/layers/attention/attention.py      |  3 ++-
 fast_llm/layers/language_model/embedding.py |  2 +-
 tests/utils/model_configs.py                | 11 +++++------
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index cf704a309..94382b25a 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -192,7 +192,8 @@ def _attn_backup(
 
         if self._local_head_groups == 1:
             query = query.view(b, sq * self._local_heads, self._config.head_size)
-            key = key.transpose(-1, -2)
+            key = key.flatten(-2).transpose(-1, -2)
+            value = value.flatten(-2)
         else:
             query = (
                 query.unflatten(2, (self._local_head_groups, self._local_heads_per_group))
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index 0337c7b9d..b99d43e66 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -150,7 +150,7 @@ def forward(
         if isinstance(input_, TensorMeta):
             return TensorMeta.from_dims(
                 kwargs[LanguageModelKwargs.hidden_dims],
-                tensor_name="Embedding output",
+                tensor_name="embedding output",
                 dtype=self._residual_dtype,
             )
         if (embedding_map := kwargs.get(LanguageModelKwargs.embedding_map)) is None:
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 78c4d78b4..1ed99416e 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -669,13 +669,12 @@ def _update_and_add_testing_config(
     megatron_args=None,
     checkpoint_format=AprielHybridSSMCheckpointFormat,
     groups={
-        ModelTestingGroup.basic: ModelTestingGroupAction.normal,
-        ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
-        ModelTestingGroup.convert: ModelTestingGroupAction.normal,
+        ModelTestingGroup.basic: ModelTestingGroupAction.unimportant,
+        ModelTestingGroup.checkpoint: ModelTestingGroupAction.unimportant,
+        ModelTestingGroup.convert: ModelTestingGroupAction.unimportant,
         ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
-        # TODO: Implement
-        ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
+        ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant,
     },
     compare_factor=2.0,
     # Micro-sequence split and sequence-first not supported.
@@ -684,7 +683,7 @@ def _update_and_add_testing_config(
 
 
 _update_and_add_testing_config(
-    # Tests hybrid discrete Mamba 2.
+    # Tests vision multimodal.
     "llama",
     "llava",
     model_type="multimodal",

From 571e527d728362475d639c3a98ddb3f722c935fa Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 19 Nov 2025 15:24:35 -0500
Subject: [PATCH 13/16] fix

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index 2a1614554..329277a0e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -43,7 +43,7 @@ OPTIONAL =
 
 # Huggingface tools
 HUGGINGFACE =
-    transformers==4.53.2
+    transformers>=4.53.2
     hf-transfer>=0.1.9
     datasets>=3.6.0
     huggingface-hub>=0.32.6

From e11ee917bcc786338fcb2f2c5e00d45b6c121698 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 19 Nov 2025 16:11:16 -0500
Subject: [PATCH 14/16] fix

---
 fast_llm/models/multimodal/model.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
index d1b5a8763..c30a5d277 100644
--- a/fast_llm/models/multimodal/model.py
+++ b/fast_llm/models/multimodal/model.py
@@ -32,7 +32,7 @@ class PatchSequenceTensorDim(TensorDim):
     Note that `local_unpadded_size` must be set manually before any call to `local_to_global`.
     """
 
-    local_unpadded_size: int
+    local_unpadded_size: typing.ClassVar[int]
 
     def __init__(self, name: str, global_size: int, parallel_dim: DistributedDim, batch_parallel_dim: DistributedDim):
         super().__init__(name, global_size * batch_parallel_dim.size, parallel_dim, variable_size=True)
@@ -203,7 +203,7 @@ def preprocess_batch(
         print(kwargs[self._vision_encoder_namespace][VisionKwargs.hidden_dims])
         print(hidden_batch_and_sequence_q_dim)
         assert isinstance(hidden_batch_and_sequence_q_dim, PatchSequenceTensorDim)
-        hidden_batch_and_sequence_q_dim.local_unpadded_size = cropped_image_patches.patches.size(0)
+        PatchSequenceTensorDim.local_unpadded_size = cropped_image_patches.patches.size(0)
 
         kwargs[LanguageModelKwargs.embedding_map] = (
             (cropped_image_patches.token_map, cropped_image_patches.sample_map)

From 67d4a7c2ae85859e5fe367702362eec6bd7b9297 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Wed, 19 Nov 2025 16:19:12 -0500
Subject: [PATCH 15/16] fix

---
 fast_llm/layers/language_model/embedding.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index b99d43e66..e59e4b49c 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -150,7 +150,7 @@ def forward(
         if isinstance(input_, TensorMeta):
             return TensorMeta.from_dims(
                 kwargs[LanguageModelKwargs.hidden_dims],
-                tensor_name="embedding output",
+                tensor_name=f"{self.module_name} output",
                 dtype=self._residual_dtype,
             )
         if (embedding_map := kwargs.get(LanguageModelKwargs.embedding_map)) is None:

From 13701b84a210edcbeb670dfa8af9ca6bc062a808 Mon Sep 17 00:00:00 2001
From: Joel Lamy-Poirier <joel.lamy-poirier@servicenow.com>
Date: Thu, 20 Nov 2025 09:39:40 -0500
Subject: [PATCH 16/16] fix

---
 fast_llm/layers/language_model/embedding.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index e59e4b49c..321400ac3 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -77,7 +77,7 @@ def __init__(
             peft=self._peft,
         )
 
-    # @torch.compile
+    @torch.compile
     def _forward(
         self,
         input_: torch.Tensor | None,
@@ -127,7 +127,9 @@ def _forward(
                     #  TODO:: Filter and shift embedding map instead? (needs cuda sync)
                     input_ = gather(input_, group=group, dim=0)
                     embeddings_ = embeddings.new_zeros(embeddings.shape[0] * group.size(), *embeddings.shape[1:])
-                    embeddings_.index_put(embedding_map, input_[: embedding_map[0].size(0)], accumulate=True)
+                    embeddings_ = embeddings_.index_put(
+                        embedding_map, input_[: embedding_map[0].size(0)], accumulate=True
+                    )
                     embeddings = embeddings + split(embeddings_, group=group, dim=0)
                 else:
                     embeddings = embeddings.index_put(