diff --git a/Dockerfile b/Dockerfile
index 00e13d95..6bc900ae 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -38,7 +38,7 @@ COPY --chmod=777 ./fast_llm/__init__.py fast_llm/
 COPY --chmod=777 ./fast_llm/csrc/ fast_llm/csrc/
 
 # Install dependencies within the virtual environment.
-RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,GENERATION,DEV]" triton==3.1.0
+RUN pip install --no-cache-dir --no-build-isolation -e ".[CORE,OPTIONAL,HUGGINGFACE,SSM,VISION,GENERATION,DEV]" triton==3.1.0
 
 # Copy the remaining source code with universal write permissions.
 COPY --chmod=777 ./Megatron-LM Megatron-LM
diff --git a/fast_llm/core/distributed.py b/fast_llm/core/distributed.py
index 16b7c392..c03ee2d1 100644
--- a/fast_llm/core/distributed.py
+++ b/fast_llm/core/distributed.py
@@ -91,6 +91,22 @@ def allreduce_scalar(
         return value
 
 
+def all_gather_scalar(
+    value: float | int,
+    dtype: torch.dtype = torch.float64,
+    group: torch.distributed.ProcessGroup | None = None,
+    timeout: float | None = None,
+):
+    if group:
+        value = torch.full([1], value, dtype=dtype, device=torch.cuda.current_device())
+        add_ephemeral_timeout(group, timeout)
+        output_tensor = value.new_empty((group.size(),))
+        torch.distributed.all_gather_into_tensor(output_tensor, value, group=group)
+        return output_tensor.tolist()
+    else:
+        return value
+
+
 def broadcast_scalar(
     value: float | int,
     dtype: torch.dtype = torch.float64,
diff --git a/fast_llm/data/sample/patch.py b/fast_llm/data/sample/patch.py
index dd7c9850..9d27d37c 100644
--- a/fast_llm/data/sample/patch.py
+++ b/fast_llm/data/sample/patch.py
@@ -86,7 +86,7 @@ def get_padding(self, size: int) -> typing.Self:
         return PatchSample(
             self.patches.new_empty((0, *self.patches.shape[1:])),
             self.token_map.new_empty(0),
-            self.positions.new_empty(0),
+            self.positions.new_empty([0, self.patches.ndim - 2]),
             size,
             [],
         )
diff --git a/fast_llm/data/sample/token.py b/fast_llm/data/sample/token.py
index 0944f568..9fedf12b 100644
--- a/fast_llm/data/sample/token.py
+++ b/fast_llm/data/sample/token.py
@@ -21,16 +21,16 @@ def crop_lengths(lengths: list[int], begin: int, end: int) -> list[int]:
         # Shortcut for the frequent case of a single document.
         return [end - begin]
     begin_ = 0
-    lengths = []
+    lengths_ = []
     for length in lengths:
         end_ = begin_ + length
         cropped_length = min(end_, end) - max(begin_, begin)
         if cropped_length > 0:
-            lengths.append(cropped_length)
+            lengths_.append(cropped_length)
         if end_ > end:
             break
         begin_ = end_
-    return lengths
+    return lengths_
 
 
 class TokenSample(Sample):
diff --git a/fast_llm/engine/base_model/base_model.py b/fast_llm/engine/base_model/base_model.py
index 5df59d4c..ffffbed5 100644
--- a/fast_llm/engine/base_model/base_model.py
+++ b/fast_llm/engine/base_model/base_model.py
@@ -1,4 +1,5 @@
 import abc
+import functools
 import typing
 
 import torch.nn
@@ -52,10 +53,15 @@ def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
                 losses += layer.get_loss_definitions(count)
         return losses
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         for layer in self.get_layers():
             if layer is not self:
-                layer.preprocess(batch, kwargs)
+                layer.preprocess(kwargs)
+
+    def unwrap(self) -> "LayerBase":
+        # Get the actual module contained in this layer,
+        # undoing any wrapping for the Fast-LLM engine (ex. `LayerBaseWithNamespace`)
+        return self
 
 
 class Layer(LayerBase):
@@ -74,23 +80,20 @@ def forward(
         pass
 
     def unwrap(self) -> "Layer":
-        # Get the actual module contained in this layer,
-        # undoing any wrapping for the Fast-LLM engine (ex. `LayerWithNamespace`)
         return self
 
 
-class LayerWithNamespace(Layer):
+class LayerBaseWithNamespace(LayerBase):
     """
-    A layer with its own namespace for preprocessing (kwargs),
+    A layer base with its own namespace for preprocessing (kwargs),
      so that it doesn't inadvertently interact with other layers.
     TODO: Consider namespace for losses and metrics?
     """
 
-    def __init__(self, layer: Layer, namespace: str = None):
+    def __init__(self, layer: LayerBase, namespace: str = None):
         super().__init__(layer._distributed_config)
         self._layer = layer
         self._namespace = namespace
-        self.layer_count = self._layer.layer_count
         self.get_compute_usage = self._layer.get_compute_usage
         self.module_name = self._layer.module_name
 
@@ -98,6 +101,42 @@ def setup(self, distributed: Distributed) -> None:
         self._layer.setup(distributed)
         super().setup(distributed)
 
+    def get_layers(self) -> list["Layer"]:
+        """
+        Wrap individual layers so the namespace is used in forward.
+        """
+        return self._layers_with_namespace
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        """
+        Preprocess with namespace.
+        """
+        if self._namespace not in kwargs:
+            kwargs[self._namespace] = kwargs.copy()
+        self._layer.preprocess(kwargs[self._namespace])
+
+    def unwrap(self) -> "LayerBase":
+        return self._layer.unwrap()
+
+    @functools.cached_property
+    def _layers_with_namespace(self) -> list[Layer]:
+        # This needs to be in a property because `module_name` is set after `__init__`.
+        # Wrap each set of blocks with identical config in a namespace
+        # using the unique module name of the first such block.
+        return [LayerWithNamespace(layer, self._namespace) for layer in self._layer.get_layers()]
+
+
+class LayerWithNamespace(LayerBaseWithNamespace, Layer):
+    _layer: Layer
+
+    def __init__(self, layer: Layer, namespace: str = None):
+        super().__init__(layer, namespace)
+        self.layer_count = self._layer.layer_count
+
+    def get_layers(self) -> list["Layer"]:
+        # Need to override since `LayerBaseWithNamespace.get_layers` comes first in the MRO.
+        return [self]
+
     def forward(
         self, input_: torch.Tensor, kwargs: dict, losses: dict | None = None, metrics: dict | None = None
     ) -> torch.Tensor:
@@ -109,11 +148,6 @@ def forward(
             assert isinstance(input_, TensorMeta)
         return self._layer.forward(input_, kwargs, losses, metrics)
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
-        assert self._namespace not in kwargs
-        kwargs[self._namespace] = kwargs.copy()
-        self._layer.preprocess(batch, kwargs[self._namespace])
-
     def unwrap(self) -> "Layer":
         return self._layer.unwrap()
 
diff --git a/fast_llm/engine/config_utils/tensor_dim.py b/fast_llm/engine/config_utils/tensor_dim.py
index f67916a6..974cb74c 100644
--- a/fast_llm/engine/config_utils/tensor_dim.py
+++ b/fast_llm/engine/config_utils/tensor_dim.py
@@ -14,12 +14,15 @@
 
 
 class TensorDim:
-    def __init__(self, name: str, global_size: int | None, parallel_dim: DistributedDim | None = None):
+    def __init__(
+        self, name: str, global_size: int, parallel_dim: DistributedDim | None = None, variable_size: bool = False
+    ):
         # TODO: Handle None for unknown sizes?
         self._name = name
         self._global_size = global_size
         self._size = self._global_size if parallel_dim is None else div(global_size, parallel_dim.size)
         self._parallel_dim = parallel_dim
+        self._variable_size = variable_size
 
     def __repr__(self) -> str:
         return (
@@ -28,6 +31,7 @@ def __repr__(self) -> str:
             f" size={self._size},"
             f" global_size={self._global_size},"
             f" parallel_dim={self._parallel_dim}"
+            f" variable_size={self._variable_size}"
             f")"
         )
 
@@ -60,9 +64,13 @@ def parallel_group(self) -> "ProcessGroup|None":
         # TODO: Make more flexible for derived classes?
         return None if self._parallel_dim is None else self._parallel_dim.group
 
+    @property
+    def variable_size(self) -> bool:
+        return self._variable_size
+
     def replace_parallel_dim(self, distributed_dim: DistributedDim) -> typing.Self:
         assert self.is_parallel
-        return TensorDim(self.name, self.size * distributed_dim.size, distributed_dim)
+        return TensorDim(self.name, self.size * distributed_dim.size, distributed_dim, self.variable_size)
 
     def local_to_global(self, tensor: "torch.Tensor", dim: int = 0) -> "torch.Tensor":
         if self.is_parallel:
@@ -99,6 +107,7 @@ def __init__(self, name: str, tensor_dims: tuple[TensorDim, ...]):
                 assert parallel_dim is None
                 parallel_dim = tensor_dim.parallel_dim
                 self._parallel_dim_index = dim
+            assert not tensor_dim.variable_size
 
         super().__init__(
             name=name,
@@ -142,6 +151,7 @@ def __init__(self, name: str, tensor_dims: tuple[TensorDim, ...]):
         for dim, tensor_dim in enumerate(tensor_dims[1:]):
             # TODO: Allow more flexibility?
             Assert.is_(tensor_dim.parallel_dim, parallel_dim)
+            assert not tensor_dim.variable_size
 
         super().__init__(
             name=name,
diff --git a/fast_llm/engine/distributed/config.py b/fast_llm/engine/distributed/config.py
index 602c44a4..f4dab5a2 100644
--- a/fast_llm/engine/distributed/config.py
+++ b/fast_llm/engine/distributed/config.py
@@ -97,6 +97,7 @@ class DistributedDimNames:
     sequence_data = "sequence_data"
     batch_data = "batch_data"
     tensor_and_sequence_data = "tensor_and_sequence_data"
+    tensor_and_data = "tensor_and_data"
 
 
 @config_class()
@@ -255,8 +256,6 @@ def _validate(self) -> None:
         Assert.multiple(self.local_world_size, self.tensor_parallel)
 
         if self.pipeline_first:
-            # Case is useless and would cause too many complications.
-            Assert.eq(self.sequence_data_parallel, 1)
             # Smaller models can be more demanding on pipeline parallel.
             self.data_rank = (self.rank // self.tensor_parallel) // self.pipeline_parallel
             self.pipeline_rank = (self.rank // self.tensor_parallel) % self.pipeline_parallel
@@ -334,14 +333,24 @@ def _validate(self) -> None:
                     ),
                 )
             )
-            self._add_distributed_dim(
-                DistributedDim(
-                    name=DistributedDimNames.tensor_and_sequence_data,
-                    size=self.sequence_data_parallel * self.tensor_parallel,
-                    rank=self.tensor_rank + self.sequence_data_rank * self.tensor_parallel,
-                    global_ranks=self._get_global_ranks(self.sequence_data_parallel * self.tensor_parallel, 1),
+            # Global ranks wrong with pipeline first, so we hide the dims as a safety check.
+            if not self.pipeline_first:
+                self._add_distributed_dim(
+                    DistributedDim(
+                        name=DistributedDimNames.tensor_and_sequence_data,
+                        size=self.sequence_data_parallel * self.tensor_parallel,
+                        rank=self.tensor_rank + self.sequence_data_rank * self.tensor_parallel,
+                        global_ranks=self._get_global_ranks(self.sequence_data_parallel * self.tensor_parallel, 1),
+                    )
+                )
+                self._add_distributed_dim(
+                    DistributedDim(
+                        name=DistributedDimNames.tensor_and_data,
+                        size=self.data_parallel * self.tensor_parallel,
+                        rank=self.tensor_rank + self.data_rank * self.tensor_parallel,
+                        global_ranks=self._get_global_ranks(self.data_parallel * self.tensor_parallel, 1),
+                    )
                 )
-            )
 
         super()._validate()
 
diff --git a/fast_llm/engine/distributed/distributed.py b/fast_llm/engine/distributed/distributed.py
index 2e2f9d40..302cfcdc 100644
--- a/fast_llm/engine/distributed/distributed.py
+++ b/fast_llm/engine/distributed/distributed.py
@@ -171,9 +171,14 @@ def __init__(self, config: DistributedConfig, use_cpu: bool = False):
         self.tensor_group = self.add_group(self._config.distributed_dims[DistributedDimNames.tensor])
         self.sequence_data_group = self.add_group(self._config.distributed_dims[DistributedDimNames.sequence_data])
         self.batch_data_group = self.add_group(self._config.distributed_dims[DistributedDimNames.batch_data])
-        self.tensor_and_sequence_data_group = self.add_group(
-            self._config.distributed_dims[DistributedDimNames.tensor_and_sequence_data]
-        )
+        # Global ranks wrong with pipeline first, so we hide the dims as a safety check.
+        if not self._config.pipeline_first:
+            self.tensor_and_sequence_data_group = self.add_group(
+                self._config.distributed_dims[DistributedDimNames.tensor_and_sequence_data]
+            )
+            self.tensor_and_data_group = self.add_group(
+                self._config.distributed_dims[DistributedDimNames.tensor_and_data]
+            )
 
         self._config.log_first_rank(f"Setting random seeds...")
 
diff --git a/fast_llm/layers/attention/attention.py b/fast_llm/layers/attention/attention.py
index ffbe9955..94382b25 100644
--- a/fast_llm/layers/attention/attention.py
+++ b/fast_llm/layers/attention/attention.py
@@ -15,7 +15,7 @@
 from fast_llm.layers.common.peft.config import PeftConfig
 from fast_llm.layers.decoder.block import BlockWithBias
 from fast_llm.tensor import TensorMeta
-from fast_llm.utils import div
+from fast_llm.utils import Assert, div
 
 try:
     from flash_attn.flash_attn_interface import flash_attn_func as _flash_attn_func  # noqa
@@ -179,26 +179,29 @@ def __init__(
             dense_dim,
         )
 
-    def _attn_fused(
-        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, mask: torch.Tensor, mask_value: torch.Tensor
+    def _attn_backup(
+        self,
+        query: torch.Tensor,
+        key: torch.Tensor,
+        value: torch.Tensor,
+        kwargs: dict[str, typing.Any],
     ) -> torch.Tensor:
         # Backup attention (inefficient)
-        b, sq, hidden = query.shape
+        b, sq, _, _ = query.shape
         sk = key.size(1)
 
         if self._local_head_groups == 1:
             query = query.view(b, sq * self._local_heads, self._config.head_size)
-            key = key.transpose(-1, -2)
+            key = key.flatten(-2).transpose(-1, -2)
+            value = value.flatten(-2)
         else:
             query = (
-                query.unflatten(-1, (self._local_head_groups, self._local_heads_per_group, self._config.head_size))
+                query.unflatten(2, (self._local_head_groups, self._local_heads_per_group))
                 .transpose(1, 2)
                 .reshape(b * self._local_head_groups, sq * self._local_heads_per_group, self._config.head_size)
             )
-            key = key.unflatten(-1, (self._local_head_groups, self._config.head_size)).movedim(1, 3).flatten(0, 1)
-            value = (
-                value.unflatten(-1, (self._local_head_groups, self._config.head_size)).transpose(1, 2).flatten(0, 1)
-            )
+            key = key.movedim(1, 3).flatten(0, 1)
+            value = value.transpose(1, 2).flatten(0, 1)
 
         attn_weights = torch.empty(
             (b * self._local_head_groups, sq * self._local_heads_per_group, sk), device=query.device, dtype=query.dtype
@@ -212,7 +215,8 @@ def _attn_fused(
         ).view(b, self._local_head_groups, sq, self._local_heads_per_group, sk)
 
         attn_weights = attn_weights.to(torch.float32)
-        attn_weights = torch.where(mask, attn_weights, mask_value)
+        if (attention_mask := kwargs[AttentionKwargs.attention_mask]) is not None:
+            attn_weights = torch.where(attention_mask, attn_weights, kwargs[AttentionKwargs.attention_mask_value])
         attn_weights = torch.nn.functional.softmax(attn_weights, dim=-1).to(query.dtype)
 
         attn_weights = torch.dropout(attn_weights, self._config.dropout, self.training)
@@ -229,6 +233,40 @@ def _attn_fused(
                 .flatten(2)
             )
 
+    def _attn_flash(
+        self, query: torch.Tensor, key: torch.Tensor, value: torch.Tensor, kwargs: dict[str, typing.Any]
+    ) -> torch.Tensor:
+        assert _flash_available
+        window_size = (-1, -1) if self._config.window_size is None else (self._config.window_size - 1, 0)
+        if self._config.cross_document_attention:
+            return _flash_attn_func(
+                query,
+                key,
+                value,
+                window_size=window_size,
+                dropout_p=self._config.dropout if self.training else 0.0,
+                causal=self._config.causal,
+                softmax_scale=self._softmax_scale,
+            ).flatten(-2)
+        else:
+            return (
+                _flash_attn_varlen_func(
+                    query.view(-1, query.size(-2), query.size(-1)),
+                    key.view(-1, key.size(-2), key.size(-1)),
+                    value.view(-1, value.size(-2), value.size(-1)),
+                    kwargs[AttentionKwargs.cu_seqlens_q],
+                    kwargs[AttentionKwargs.cu_seqlens_k],
+                    kwargs[AttentionKwargs.max_seqlen_q],
+                    kwargs[AttentionKwargs.max_seqlen_k],
+                    dropout_p=self._config.dropout if self.training else 0.0,
+                    window_size=window_size,
+                    causal=self._config.causal,
+                    softmax_scale=self._softmax_scale,
+                )
+                .view(query.size())
+                .flatten(-2)
+            )
+
     def _query_key_value_forward(
         self, input_: torch.Tensor, sequence_first: bool
     ) -> tuple[torch.Tensor, torch.Tensor, dict[str, typing.Any]]:
@@ -332,47 +370,12 @@ def _forward(
             self._debug(key, "key_rotary_input", self._kv_dims, kwargs)
         query, key = self._rotary(query, key, kwargs)
 
-        window_size = (-1, -1) if self._config.window_size is None else (self._config.window_size - 1, 0)
         with set_generator(self._distributed.tp_generator):
             if self._implementation == AttentionImplementation.flash:
-                assert _flash_available
-                if self._config.cross_document_attention:
-                    input_ = _flash_attn_func(
-                        query,
-                        key,
-                        value,
-                        window_size=window_size,
-                        dropout_p=self._config.dropout if self.training else 0.0,
-                        causal=self._config.causal,
-                        softmax_scale=self._softmax_scale,
-                    ).flatten(-2)
-                else:
-                    input_ = (
-                        _flash_attn_varlen_func(
-                            query.view(-1, query.size(-2), query.size(-1)),
-                            key.view(-1, key.size(-2), key.size(-1)),
-                            value.view(-1, value.size(-2), value.size(-1)),
-                            cu_seqlens_q=kwargs.get(AttentionKwargs.cu_seqlens_q),
-                            cu_seqlens_k=kwargs.get(AttentionKwargs.cu_seqlens_k),
-                            max_seqlen_q=kwargs.get(AttentionKwargs.max_seqlen_q),
-                            max_seqlen_k=kwargs.get(AttentionKwargs.max_seqlen_k),
-                            dropout_p=self._config.dropout if self.training else 0.0,
-                            window_size=window_size,
-                            causal=self._config.causal,
-                            softmax_scale=self._softmax_scale,
-                        )
-                        .view(query.size())
-                        .flatten(-2)
-                    )
+                input_ = self._attn_flash(query, key, value, kwargs)
             elif self._implementation == AttentionImplementation.backup:
                 # TODO: Avoid the flattens.
-                input_ = self._attn_fused(
-                    query.flatten(-2),
-                    key.flatten(-2),
-                    value.flatten(-2),
-                    kwargs[AttentionKwargs.attention_mask],
-                    kwargs[AttentionKwargs.attention_mask_value],
-                )
+                input_ = self._attn_backup(query, key, value, kwargs)
             else:
                 raise NotImplementedError(self._implementation)
 
@@ -443,55 +446,65 @@ def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], c
             )
         )
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        self._rotary.preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._rotary.preprocess(kwargs)
         if self._implementation == AttentionImplementation.backup:
-            self._preprocess_for_backup_attention(batch, kwargs)
+            self._preprocess_for_backup_attention(kwargs)
         elif self._implementation == AttentionImplementation.flash:
-            self._preprocess_for_flash_attention(batch, kwargs)
-
-    def _preprocess_for_backup_attention(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        if (
-            sequence_length := kwargs[AttentionKwargs.sequence_length]
-        ) > self._backup_attention_tensor_cache_max_sequence_length:
-            # Create tensor cache.
-            self._backup_attention_tensor_cache_max_sequence_length = sequence_length
-
-            self._backup_attention_mask = torch.ones(
-                (sequence_length, sequence_length),
-                dtype=torch.bool,
-                device=batch.device,
-            ).tril_()
-
-            if self._config.window_size is not None:
-                self._backup_attention_mask.triu_(-self._config.window_size + 1)
-            self._backup_attention_mask_value = torch.full(
-                [],
-                torch.finfo(self._distributed_config.compute_dtype.torch).min,
-                dtype=self._distributed_config.compute_dtype.torch,
-                device=batch.device,
-            )
+            self._preprocess_for_flash_attention(kwargs)
 
+    def _preprocess_for_backup_attention(self, kwargs: dict[str, typing.Any]) -> None:
+        device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
         sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
         sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
-        kwargs[AttentionKwargs.attention_mask] = self._backup_attention_mask[
-            None, None, sequence_k - sequence_q : sequence_k, None, :sequence_k
-        ]
+        if self._config.causal:
+            if (
+                sequence_length := kwargs[AttentionKwargs.sequence_length]
+            ) > self._backup_attention_tensor_cache_max_sequence_length:
+                # Create tensor cache.
+                self._backup_attention_tensor_cache_max_sequence_length = sequence_length
+
+                self._backup_attention_mask = torch.ones(
+                    (sequence_length, sequence_length),
+                    dtype=torch.bool,
+                    device=device,
+                ).tril_()
+
+                if self._config.window_size is not None:
+                    self._backup_attention_mask.triu_(-self._config.window_size + 1)
+            attention_mask = self._backup_attention_mask[
+                None, None, sequence_k - sequence_q : sequence_k, None, :sequence_k
+            ]
+        else:
+            attention_mask = None
         if not self._config.cross_document_attention:
             seq_ids = torch.stack(
                 [
-                    torch.cat([torch.full((x,), i) for i, x in enumerate(sample_lens)])
+                    torch.cat([torch.full((x,), i, device=device) for i, x in enumerate(sample_lens)])
                     for sample_lens in kwargs[AttentionKwargs.sequence_lengths]
                 ]
             )
-            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None]).to(batch.device)
-            kwargs[AttentionKwargs.attention_mask] = (
-                kwargs[AttentionKwargs.attention_mask]
-                & document_mask[:, None, sequence_k - sequence_q : sequence_k, None, :sequence_k]
-            )
-        kwargs[AttentionKwargs.attention_mask_value] = self._backup_attention_mask_value
+            document_mask = (seq_ids[:, None, :] == seq_ids[:, :, None])[
+                :, None, sequence_k - sequence_q : sequence_k, None, :sequence_k
+            ]
+            if attention_mask is None:
+                attention_mask = document_mask
+            else:
+                attention_mask = attention_mask & document_mask
 
-    def _preprocess_for_flash_attention(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+        kwargs[AttentionKwargs.attention_mask] = attention_mask
+
+        if attention_mask is not None:
+            if not hasattr(self, "_backup_attention_mask_value"):
+                self._backup_attention_mask_value = torch.full(
+                    [],
+                    torch.finfo(self._distributed_config.compute_dtype.torch).min,
+                    dtype=self._distributed_config.compute_dtype.torch,
+                    device=device,
+                )
+            kwargs[AttentionKwargs.attention_mask_value] = self._backup_attention_mask_value
+
+    def _preprocess_for_flash_attention(self, kwargs: dict[str, typing.Any]) -> None:
         """
         Prepares cu_seqlens_q and cu_seqlens_k for flash_attn_varlen_func:
         https://github.com/Dao-AILab/flash-attention/blob/main/flash_attn/flash_attn_interface.py#L1375
@@ -503,54 +516,29 @@ def _preprocess_for_flash_attention(self, batch: torch.Tensor, kwargs: dict[str,
         """
         if self._config.cross_document_attention:
             return
-        sequence_lengths = kwargs[AttentionKwargs.sequence_lengths]
-        sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
-        sequence_q = kwargs[AttentionKwargs.sequence_q_dim].size
-        if sequence_q < kwargs[AttentionKwargs.sequence_length]:
-            cumsums = [torch.cumsum(x, dim=0) for x in sequence_lengths]
-            # The first and last documents in a microsequence need to be handled separately. Include all tokens from other documents
-            # in the microsequence. We need to consider all keys computed so far from the first sample. We also store the offsets
-            # of the first documents so that we can index into their kv pairs
-            start_seq_idx = [
-                torch.argmax((cu_seqlens >= sequence_k - sequence_q).to(torch.uint8), dim=0) for cu_seqlens in cumsums
-            ]
-            end_seq_idx = [torch.argmax((cu_seqlens >= sequence_k).to(torch.uint8), dim=0) for cu_seqlens in cumsums]
-            seqlens_q = []
-            seqlens_k = []
-            for idx, sample_seqlens in enumerate(sequence_lengths):
-                start_idx = start_seq_idx[idx]
-                end_idx = end_seq_idx[idx]
-                seqlens_q.extend([0] * start_idx)
-                n_attention_tokens = sample_seqlens[end_idx] - (cumsums[idx][end_idx] - sequence_k)
-                if start_idx == end_idx:
-                    seqlens_q.append(sequence_q)
-                else:
-                    start_q_tokens = cumsums[idx][start_idx] - (sequence_k - sequence_q)
-                    seqlens_q.extend(
-                        [
-                            start_q_tokens,
-                            *(sample_seqlens[idx] for idx in range(start_idx + 1, end_idx)),
-                            n_attention_tokens,
-                        ]
-                    )
-                seqlens_k.extend(sample_seqlens[: end_idx + 1])
-                seqlens_k[-1] = n_attention_tokens
-            seqlens_q = torch.tensor(seqlens_q, dtype=torch.int32)
-            seqlens_k = torch.tensor(seqlens_k, dtype=torch.int32)
-        else:
-            seqlens_q = torch.cat(sequence_lengths)
-            seqlens_k = torch.cat(sequence_lengths)
-        kwargs[AttentionKwargs.cu_seqlens_q] = torch.cat(
-            (
-                torch.zeros(1, dtype=torch.int32, device=batch.device),
-                torch.cumsum(seqlens_q, dim=0, dtype=torch.int32).to(batch.device),
-            )
+        device = kwargs[AttentionKwargs.device] if AttentionKwargs.device in kwargs else self._distributed.device
+
+        # TODO: ====== Fix (need to know how much first sequence was cropped) ======
+        Assert.eq(
+            kwargs[AttentionKwargs.sequence_k_dim].global_size, kwargs[AttentionKwargs.sequence_q_dim].global_size
         )
-        kwargs[AttentionKwargs.cu_seqlens_k] = torch.cat(
-            (
-                torch.zeros(1, dtype=torch.int32, device=batch.device),
-                torch.cumsum(seqlens_k, dim=0, dtype=torch.int32).to(batch.device),
-            )
+
+        # TODO: Calculate these in batch preprocessing?
+        sequence_lengths_q = torch.tensor(
+            [
+                0,
+                *(
+                    sequence_length
+                    for sequence_lengths in kwargs[AttentionKwargs.sequence_lengths]
+                    for sequence_length in sequence_lengths
+                ),
+            ],
+            dtype=torch.int32,
         )
-        kwargs[AttentionKwargs.max_seqlen_q] = seqlens_q.max()
-        kwargs[AttentionKwargs.max_seqlen_k] = seqlens_k.max()
+        max_sequence_length = sequence_lengths_q.max().item()
+        cu_seqlens_q = sequence_lengths_q.cumsum_(0).to(device)
+        max_seqlen_q = cu_seqlens_q.new_full((1,), max_sequence_length)
+        kwargs[AttentionKwargs.cu_seqlens_q] = cu_seqlens_q
+        kwargs[AttentionKwargs.cu_seqlens_k] = cu_seqlens_q
+        kwargs[AttentionKwargs.max_seqlen_q] = max_seqlen_q
+        kwargs[AttentionKwargs.max_seqlen_k] = max_seqlen_q
diff --git a/fast_llm/layers/attention/config.py b/fast_llm/layers/attention/config.py
index 206fa6e6..d65c924e 100644
--- a/fast_llm/layers/attention/config.py
+++ b/fast_llm/layers/attention/config.py
@@ -132,6 +132,9 @@ def _validate(self) -> None:
 
         Assert.multiple(self.heads, self.head_groups)
 
+        if not self.causal:
+            assert self.window_size is None, "Non-causal windowed attention is not supported."
+
     @property
     def layer_class(self) -> "type[Attention]":
         from fast_llm.layers.attention.attention import Attention
diff --git a/fast_llm/layers/attention/rotary/config.py b/fast_llm/layers/attention/rotary/config.py
index 26877ee0..92adc880 100644
--- a/fast_llm/layers/attention/rotary/config.py
+++ b/fast_llm/layers/attention/rotary/config.py
@@ -10,7 +10,14 @@
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
-    from fast_llm.layers.attention.rotary.rotary import DefaultRotary, Llama3Rotary, NoRotary, Rotary, YarnRotary
+    from fast_llm.layers.attention.rotary.rotary import (
+        DefaultRotary,
+        Llama3Rotary,
+        NoRotary,
+        Rotary,
+        Rotary2D,
+        YarnRotary,
+    )
 
 
 @config_class(registry=True)
@@ -135,3 +142,11 @@ def _get_configurable_class(self) -> "type[YarnRotary]":
         from fast_llm.layers.attention.rotary.rotary import YarnRotary
 
         return YarnRotary
+
+
+@config_class(dynamic_type={RotaryConfig: "default_2d"})
+class Rotary2DConfig(DefaultRotaryConfig):
+    def _get_configurable_class(self) -> "type[Rotary2D]":
+        from fast_llm.layers.attention.rotary.rotary import Rotary2D
+
+        return Rotary2D
diff --git a/fast_llm/layers/attention/rotary/rotary.py b/fast_llm/layers/attention/rotary/rotary.py
index d57d7294..55d929f8 100644
--- a/fast_llm/layers/attention/rotary/rotary.py
+++ b/fast_llm/layers/attention/rotary/rotary.py
@@ -12,10 +12,12 @@
     DefaultRotaryConfig,
     Llama3RotaryConfig,
     NoRotaryConfig,
+    Rotary2DConfig,
     RotaryConfig,
     YarnRotaryConfig,
 )
-from fast_llm.utils import div
+from fast_llm.layers.vision.config import VisionKwargs
+from fast_llm.utils import Assert, div
 
 
 def convert_rotary_complex_to_real(tensor: torch.Tensor, head_size: int, dim: int) -> torch.Tensor:
@@ -46,7 +48,7 @@ def __init__(
         head_size_dim: TensorDim,
     ):
         super().__init__(config)
-        self._head_size_dim = head_size_dim
+        self._head_size = head_size_dim.global_size
 
     @abc.abstractmethod
     def forward(
@@ -54,7 +56,7 @@ def forward(
     ) -> tuple[torch.Tensor, torch.Tensor]:
         pass
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         pass
 
 
@@ -69,8 +71,8 @@ class DefaultRotary[ConfigType: DefaultRotaryConfig](Rotary[ConfigType]):
     _rotary_embedding_frequencies: torch.Tensor
     _tensor_cache_max_sequence_length: int = -1
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        self._create_tensors(kwargs[AttentionKwargs.sequence_length], batch.device)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._create_tensors(kwargs[AttentionKwargs.sequence_length], kwargs[AttentionKwargs.device])
         sequence_k = kwargs[AttentionKwargs.sequence_k_dim].size
         kwargs[AttentionKwargs.rotary_freq_q] = self._rotary_embedding_frequencies[
             :, sequence_k - kwargs[AttentionKwargs.sequence_q_dim].size : sequence_k
@@ -92,7 +94,7 @@ def _create_tensors(self, sequence_length: int, device: torch.device) -> None:
 
         self._rotary_embedding_frequencies = self._get_frequencies(
             sequence_length,
-            self._head_size_dim.global_size,
+            self._head_size,
             device=device,
         )
 
@@ -174,3 +176,43 @@ def _get_correction(self, beta: float, dim: int) -> float:
             * math.log(self._config.original_context_length / (beta * 2 * math.pi))
             / (2 * math.log(self._config.theta))
         )
+
+
+class Rotary2D[ConfigType: Rotary2DConfig](Rotary[ConfigType]):
+    _frequencies: torch.Tensor
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        head_size_dim: TensorDim,
+    ):
+        super().__init__(config, head_size_dim)
+        Assert.multiple(self._head_size, 4)
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        patch_positions = kwargs[VisionKwargs.patch_positions]
+        if not hasattr(self, "_frequencies"):
+            self._frequencies = self._config.theta ** -torch.arange(
+                0, 1, 4 / self._head_size, device=kwargs[AttentionKwargs.device], dtype=torch.float64
+            )
+        # TODO: Pre-compute 2d frequencies?
+        angles = torch.outer(patch_positions.flatten(), self._frequencies).view(
+            len(patch_positions), self._head_size // 2
+        )
+        frequencies = torch.polar(torch.ones_like(angles), angles)[None, :, None, :].to(torch.complex64)
+        if not self._config.complex_format:
+            frequencies = convert_rotary_complex_to_real(
+                torch.view_as_real(frequencies).flatten(-2), self._head_size, 3
+            ).contiguous()
+        # TODO: Support different q and k frequencies.
+        kwargs[AttentionKwargs.rotary_freq_q] = frequencies
+        kwargs[AttentionKwargs.rotary_freq_k] = frequencies
+
+    def forward(
+        self, query: torch.Tensor, key: torch.Tensor, kwargs: dict[str, typing.Any]
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        rotary_fn = triton_rotary_autograd_ if self._config.triton else apply_rotary_embeddings
+        query = rotary_fn(query, kwargs[AttentionKwargs.rotary_freq_q])
+        key = rotary_fn(key, kwargs[AttentionKwargs.rotary_freq_k])
+        return query, key
diff --git a/fast_llm/layers/block/config.py b/fast_llm/layers/block/config.py
index f3e93ede..04b16df3 100644
--- a/fast_llm/layers/block/config.py
+++ b/fast_llm/layers/block/config.py
@@ -37,6 +37,7 @@ class BlockKwargs:
     sequence_lengths = "sequence_lengths"
     # TODO: Belongs elsewhere?
     grad_output = "grad_output"
+    device = "device"
 
 
 @config_class(registry=True)
@@ -71,6 +72,7 @@ def get_layer(
         self,
         distributed_config: DistributedConfig,
         hidden_dim: TensorDim,
+        *,
         lr_scale: float | None,
         peft: PeftConfig | None,
     ) -> "BlockBase":
diff --git a/fast_llm/layers/block/sequence.py b/fast_llm/layers/block/sequence.py
index 530df950..54a5b347 100644
--- a/fast_llm/layers/block/sequence.py
+++ b/fast_llm/layers/block/sequence.py
@@ -55,8 +55,8 @@ def _layers_with_namespace(self) -> list[Layer]:
     def get_layers(self) -> list["Layer"]:
         return self._layers_with_namespace
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
-        self._layers_with_namespace[0].preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._layers_with_namespace[0].preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         return (
@@ -109,9 +109,9 @@ def _layers_with_namespace(self) -> list[Layer]:
     def get_layers(self) -> list[Layer]:
         return self._layers_with_namespace
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         for _, index in self._config.preprocessing_layers.items():
-            self._layers_with_namespace[index].preprocess(batch, kwargs)
+            self._layers_with_namespace[index].preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         # TODO: Prevent name conflicts.
diff --git a/fast_llm/layers/common/linear/config.py b/fast_llm/layers/common/linear/config.py
index e7c6d9e9..e2c586bb 100644
--- a/fast_llm/layers/common/linear/config.py
+++ b/fast_llm/layers/common/linear/config.py
@@ -1,7 +1,12 @@
 import typing
 
 from fast_llm.config import Config, Field, FieldHint, check_field, config_class
-from fast_llm.engine.config_utils.initialization import Initialization, init_uniform_centered_, init_zeros_
+from fast_llm.engine.config_utils.initialization import (
+    Initialization,
+    init_normal_,
+    init_uniform_centered_,
+    init_zeros_,
+)
 from fast_llm.engine.config_utils.parameter import OptionalParameterConfig, ParameterConfig, combine_lr_scales
 from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
 from fast_llm.functional.config import ActivationType
@@ -9,7 +14,7 @@
 from fast_llm.utils import Assert
 
 if typing.TYPE_CHECKING:
-    from fast_llm.layers.common.linear.convolution import CausalConv1d
+    from fast_llm.layers.common.linear.convolution import CausalConv1d, Convolution2D
     from fast_llm.layers.common.linear.linear import LinearBase
 
 
@@ -217,3 +222,44 @@ def get_layer(
         return CausalConv1d(
             weight, bias, activation=default_activation if self.activation is None else self.activation
         )
+
+
+@config_class()
+class Convolution2DConfig(AffineLinearBaseConfig):
+    def get_layer(
+        self,
+        in_dim: TensorDim,
+        out_dim: TensorDim,
+        kernel_dim_1: TensorDim,
+        kernel_dim_2: TensorDim,
+        *,
+        stride: tuple[int, int],
+        default_weight_initialization: Initialization | None = None,
+        default_bias_initialization: Initialization | None = None,
+        default_add_bias: bool = True,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ) -> "Convolution2D":
+        from fast_llm.layers.common.linear.convolution import Convolution2D
+
+        if default_weight_initialization is None:
+            default_weight_initialization = init_normal_()
+        if default_bias_initialization is None:
+            default_bias_initialization = init_normal_()
+
+        lr_scale = (combine_lr_scales(lr_scale, self.lr_scale),)
+        weight = self.weight.get_parameter(
+            (out_dim, in_dim, kernel_dim_1, kernel_dim_2),
+            default_initialization=default_weight_initialization,
+            lr_scale=lr_scale,
+            peft=peft,
+        )
+        bias = self.bias.get_parameter(
+            (out_dim,),
+            default_initialization=default_bias_initialization,
+            lr_scale=lr_scale,
+            default_enabled=default_add_bias,
+            peft=peft,
+        )
+
+        return Convolution2D(weight, bias, stride=stride)
diff --git a/fast_llm/layers/common/linear/convolution.py b/fast_llm/layers/common/linear/convolution.py
index b88b7b2e..6281348e 100644
--- a/fast_llm/layers/common/linear/convolution.py
+++ b/fast_llm/layers/common/linear/convolution.py
@@ -55,3 +55,27 @@ def _forward_causal_conv1d(self, input_: torch.Tensor) -> torch.Tensor:
 
     def get_compute_usage(self, input_: TensorMeta, config: ResourceUsageConfig) -> int:
         raise NotImplementedError()
+
+
+class Convolution2D(torch.nn.Module):
+    """
+    TODO: Generalize to other convolutions?
+    """
+
+    def __init__(
+        self,
+        weight: ParameterMeta,
+        bias: ParameterMeta | None,
+        *,
+        stride: tuple[int, int],
+    ):
+        super().__init__()
+        self.weight = weight
+        self.bias = bias
+        self._stride = stride
+
+    def forward(self, input_: torch.Tensor) -> torch.Tensor:
+        return torch.nn.functional.conv2d(input_, self.weight, self.bias, stride=self._stride)
+
+    def get_compute_usage(self, input_: TensorMeta, config: ResourceUsageConfig) -> int:
+        raise NotImplementedError()
diff --git a/fast_llm/layers/decoder/block.py b/fast_llm/layers/decoder/block.py
index 8b19db66..5713cbb6 100644
--- a/fast_llm/layers/decoder/block.py
+++ b/fast_llm/layers/decoder/block.py
@@ -90,7 +90,7 @@ def __init__(
         self.mixer = self._config.mixer.get_layer(
             self._distributed_config,
             self._hidden_dim,
-            self._lr_scale,
+            lr_scale=self._lr_scale,
             peft=peft,
             return_bias=True,
         )
@@ -98,7 +98,7 @@ def __init__(
         self.mlp = self._config.mlp.get_layer(
             self._distributed_config,
             self._hidden_dim,
-            self._lr_scale,
+            lr_scale=self._lr_scale,
             peft=peft,
             return_bias=True,
         )
@@ -175,9 +175,9 @@ def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], c
             )
         )
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
-        self.mixer.preprocess(batch, kwargs)
-        self.mlp.preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self.mixer.preprocess(kwargs)
+        self.mlp.preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         return self.mixer.get_loss_definitions(count=count) + self.mlp.get_loss_definitions(count=count)
diff --git a/fast_llm/layers/decoder/config.py b/fast_llm/layers/decoder/config.py
index 403b204c..06238853 100644
--- a/fast_llm/layers/decoder/config.py
+++ b/fast_llm/layers/decoder/config.py
@@ -27,6 +27,7 @@ def get_layer(
         self,
         distributed_config: DistributedConfig,
         hidden_dim: TensorDim,
+        *,
         lr_scale: float | None,
         peft: PeftConfig | None,
         return_bias: bool = False,
@@ -45,6 +46,26 @@ def get_layer(
 class MLPBaseConfig(BlockWithBiasConfig):
     _abstract = True
 
+    def get_layer(
+        self,
+        distributed_config: DistributedConfig,
+        hidden_dim: TensorDim,
+        *,
+        output_dim: TensorDim | None = None,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+        return_bias: bool = False,
+    ) -> "BlockWithBias":
+        return self.layer_class(
+            self,
+            distributed_config,
+            hidden_dim=hidden_dim,
+            output_dim=output_dim,
+            lr_scale=combine_lr_scales(lr_scale, self.lr_scale),
+            peft=peft,
+            return_bias=return_bias,
+        )
+
     @classmethod
     def _from_dict(cls, default: dict[str, typing.Any], strict: bool = True) -> typing.Self:
         if cls is MLPBaseConfig and cls.get_subclass(default.get("type")) is None:
diff --git a/fast_llm/layers/decoder/mlp/mixture_of_experts.py b/fast_llm/layers/decoder/mlp/mixture_of_experts.py
index ffc9eadb..4171e66a 100644
--- a/fast_llm/layers/decoder/mlp/mixture_of_experts.py
+++ b/fast_llm/layers/decoder/mlp/mixture_of_experts.py
@@ -44,6 +44,7 @@ def __init__(
         *,
         # TODO: Review `hidden_dim` and `block_index`
         hidden_dim: TensorDim,
+        output_dim: TensorDim | None = None,
         lr_scale: float | None,
         peft: PeftConfig | None,
         return_bias: bool = True,
@@ -55,6 +56,7 @@ def __init__(
             config,
             distributed_config,
             hidden_dim=hidden_dim,
+            output_dim=output_dim,
             lr_scale=lr_scale,
             peft=peft,
             return_bias=return_bias,
@@ -88,6 +90,8 @@ def _get_intermediate_dims(self) -> tuple[TensorDim, TensorDim]:
     def _forward(
         self, input_: torch.Tensor, kwargs: dict, losses: dict | None = None, metrics: dict | None = None
     ) -> tuple[torch.Tensor, None]:
+        if isinstance(input_, TensorMeta):
+            return TensorMeta.from_dims(input_.dims[:-1] + (self._output_dim,), "MLP output"), None
         hidden_states = input_.flatten(0, -2)
         logits = self.router(hidden_states)
         if self._debug.enabled:
diff --git a/fast_llm/layers/decoder/mlp/mlp.py b/fast_llm/layers/decoder/mlp/mlp.py
index aaea94ad..7a52539d 100644
--- a/fast_llm/layers/decoder/mlp/mlp.py
+++ b/fast_llm/layers/decoder/mlp/mlp.py
@@ -26,6 +26,7 @@ def __init__(
         *,
         # TODO: Review `hidden_dim` and `block_index`
         hidden_dim: TensorDim,
+        output_dim: TensorDim | None = None,
         lr_scale: float | None,
         peft: PeftConfig | None,
         return_bias: bool = True,
@@ -38,6 +39,7 @@ def __init__(
             peft=peft,
             return_bias=return_bias,
         )
+        self._output_dim = self._hidden_dim if output_dim is None else output_dim
         self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
         intermediate_1_dim, self._intermediate_2_dim = self._get_intermediate_dims()
 
@@ -55,7 +57,7 @@ def __init__(
         )
         self.layer_2 = self._config.layer_2.get_layer(
             self._intermediate_2_dim,
-            hidden_dim,
+            self._output_dim,
             default_weight_initialization=init_normal_(std=self._hidden_size**-0.5),
             default_add_bias=self._config.add_linear_biases,
             sequence_parallel=self._sequence_parallel,
@@ -111,6 +113,13 @@ def _forward(
         losses: dict[str, typing.Any] | None = None,
         metrics: dict[str, typing.Any] | None = None,
     ) -> tuple[torch.Tensor, torch.Tensor | None]:
+        if isinstance(input_, TensorMeta):
+            return (
+                TensorMeta.from_dims(
+                    input_.dims[:-1] + (self._output_dim,), tensor_name="MLP output", dtype=input_.dtype
+                ),
+                None,
+            )
         return (
             mlp_autograd(
                 input_,
diff --git a/fast_llm/layers/language_model/config.py b/fast_llm/layers/language_model/config.py
index 18c64acc..53dac289 100644
--- a/fast_llm/layers/language_model/config.py
+++ b/fast_llm/layers/language_model/config.py
@@ -20,7 +20,11 @@
 
 
 class LanguageModelKwargs(BlockKwargs):
+    token_ids = "token_ids"
     position_ids = "position_ids"
+    token_map = "token_map"
+    sample_map = "sample_map"
+    embedding_map = "embedding_map"
     # TODO: These are generic
     labels = "labels"
     phase = "phase"
diff --git a/fast_llm/layers/language_model/embedding.py b/fast_llm/layers/language_model/embedding.py
index 61ca1cfc..321400ac 100644
--- a/fast_llm/layers/language_model/embedding.py
+++ b/fast_llm/layers/language_model/embedding.py
@@ -3,7 +3,7 @@
 import torch
 
 from fast_llm.core.distributed import set_generator
-from fast_llm.core.ops import reduce_forward, split
+from fast_llm.core.ops import gather, reduce_forward, split
 from fast_llm.engine.base_model.config import ResourceUsageConfig
 from fast_llm.engine.config_utils.initialization import init_normal_
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
@@ -14,6 +14,8 @@
 from fast_llm.tensor import TensorMeta
 from fast_llm.utils import Assert
 
+WORD_EMBEDDINGS_WEIGHT = "word_embeddings_weight"
+
 
 class LanguageModelEmbedding[ConfigType: LanguageModelEmbeddingsConfig](Block[ConfigType]):
     """
@@ -26,7 +28,8 @@ class LanguageModelEmbedding[ConfigType: LanguageModelEmbeddingsConfig](Block[Co
     layer_count: float = 1000.0
     _config: ConfigType
 
-    # Position embedding preprocessing
+    # Preprocessing
+    _rotary_embedding_frequencies: torch.Tensor
     _position_ids: torch.Tensor
     _tensor_cache_max_sequence_length: int = -1
 
@@ -75,34 +78,64 @@ def __init__(
         )
 
     @torch.compile
-    def _forward(self, input_: torch.Tensor, position_ids: torch.Tensor | None, mask_inputs: bool) -> torch.Tensor:
+    def _forward(
+        self,
+        input_: torch.Tensor | None,
+        token_ids: torch.Tensor,
+        position_ids: torch.Tensor | None,
+        mask_inputs: bool,
+        embedding_map: tuple[torch.Tensor, torch.Tensor] | None,
+    ) -> torch.Tensor:
         Assert.eq(position_ids is None, self.position_embeddings_weight is None)
         group = self._parallel_dim.group
         if self._vocab_parallel:
-            input_mask = (input_ >= self._vocab_start_index) * (input_ < self._vocab_end_index)
-            masked_input = (input_ - self._vocab_start_index) * input_mask
-            embeddings = torch.embedding(self.word_embeddings_weight, masked_input) * input_mask.unsqueeze(2)  # noqa
+            token_mask = (token_ids >= self._vocab_start_index) * (token_ids < self._vocab_end_index)
+            masked_input = (token_ids - self._vocab_start_index) * token_mask
+            embeddings = torch.embedding(self.word_embeddings_weight, masked_input) * token_mask.unsqueeze(2)  # noqa
             embeddings = reduce_forward(embeddings, group)
+            # TODO: Input masking of position embeddings inconsistant with non-vocab-parallel
             if self.position_embeddings_weight is not None:
                 embeddings = embeddings + torch.nn.functional.embedding(position_ids, self.position_embeddings_weight)
+
+            if input_ is not None:
+                # TODO: Accumulate redundant with masking?
+                if self._sequence_parallel:
+                    input_ = gather(input_, group=group, dim=0)
+                # Out-of-place equivalent of `embeddings[embedding_map] += input_`
+                embeddings = embeddings.index_put(embedding_map, input_[: embedding_map[0].size(0)], accumulate=True)
+
             if self._sequence_parallel:
                 embeddings = split(embeddings, group=group, dim=0)
         else:
             if self._sequence_parallel:
-                input_ = split(input_, group=group, dim=0)
+                token_ids = split(token_ids, group=group, dim=0)
                 if self.position_embeddings_weight is not None:
                     position_ids = split(position_ids, group=group, dim=0)
             # handle masked tokens
             if mask_inputs:
-                input_mask = input_ >= 0
-                masked_input = input_ * input_mask
-                embeddings = torch.embedding(self.word_embeddings_weight, masked_input)
-            else:
-                embeddings = torch.embedding(self.word_embeddings_weight, input_)
+                token_mask = token_ids >= 0
+                token_ids = token_ids * token_mask
+            embeddings = torch.embedding(self.word_embeddings_weight, token_ids)
             if self.position_embeddings_weight is not None:
                 embeddings = embeddings + torch.nn.functional.embedding(position_ids, self.position_embeddings_weight)
             if mask_inputs:
-                embeddings = embeddings * input_mask.unsqueeze(2)
+                embeddings = embeddings * token_mask.unsqueeze(2)
+
+            if input_ is not None:
+                # TODO: Accumulate redundant with masking?
+                if self._sequence_parallel:
+                    #  TODO:: Filter and shift embedding map instead? (needs cuda sync)
+                    input_ = gather(input_, group=group, dim=0)
+                    embeddings_ = embeddings.new_zeros(embeddings.shape[0] * group.size(), *embeddings.shape[1:])
+                    embeddings_ = embeddings_.index_put(
+                        embedding_map, input_[: embedding_map[0].size(0)], accumulate=True
+                    )
+                    embeddings = embeddings + split(embeddings_, group=group, dim=0)
+                else:
+                    embeddings = embeddings.index_put(
+                        embedding_map, input_[: embedding_map[0].size(0)], accumulate=True
+                    )
+
         with set_generator(
             self._distributed.tp_generator if self._sequence_parallel else self._distributed.pp_generator
         ):
@@ -122,18 +155,35 @@ def forward(
                 tensor_name=f"{self.module_name} output",
                 dtype=self._residual_dtype,
             )
+        if (embedding_map := kwargs.get(LanguageModelKwargs.embedding_map)) is None:
+            # Language model: input_ contains token ids.
+            token_ids = input_
+            input_ = None
+        else:
+            # Multimodal case: input_ contains encoder output, token ids stores in kwargs.
+            # TODO: Support multiple encoders.
+            # TODO: Support pipeline-parallel.
+            token_ids = kwargs.get(LanguageModelKwargs.token_ids)
+            # Drop the placeholder batch dimension, remove patch padding.
+            input_ = input_.squeeze(int(kwargs[LanguageModelKwargs.sequence_first]))
+
         return self._forward(
-            input_, kwargs.get(LanguageModelKwargs.position_ids), kwargs.get(LanguageModelKwargs.mask_inputs)
+            input_,
+            token_ids,
+            kwargs.get(LanguageModelKwargs.position_ids),
+            # TODO ====== Vision ====== Review input masking.
+            kwargs.get(LanguageModelKwargs.mask_inputs),
+            embedding_map,
         )
 
     def get_compute_usage(self, input_: TensorMeta, kwargs: dict[str, typing.Any], config: ResourceUsageConfig) -> int:
         # TODO: Add marginal compute? (embeddings)
         return 0
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         if not self._config.position_embeddings.enabled:
             return
-        self._create_position_embeddings(kwargs[LanguageModelKwargs.sequence_length], batch.device)
+        self._create_position_embeddings(kwargs[LanguageModelKwargs.sequence_length], self._distributed.device)
         sequence_k = kwargs[LanguageModelKwargs.sequence_k_dim].size
         sequence_q = kwargs[LanguageModelKwargs.sequence_q_dim].size
         if not self._config.cross_document_position_embeddings:
@@ -142,7 +192,7 @@ def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None
                     torch.cat([torch.arange(x) for x in sample_lens])
                     for sample_lens in kwargs[LanguageModelKwargs.sequence_lengths]
                 ]
-            ).to(batch.device, dtype=torch.int64)
+            ).to(self._distributed.device, dtype=torch.int64)
             position_ids = position_ids[:, sequence_k - sequence_q : sequence_k]
             if kwargs[LanguageModelKwargs.sequence_first]:
                 position_ids = position_ids.transpose(0, 1)
diff --git a/fast_llm/layers/language_model/language_model.py b/fast_llm/layers/language_model/language_model.py
index 2e46bb57..385bab7e 100644
--- a/fast_llm/layers/language_model/language_model.py
+++ b/fast_llm/layers/language_model/language_model.py
@@ -1,8 +1,6 @@
 import logging
 import typing
 
-import torch
-
 from fast_llm.engine.base_model.base_model import Layer
 from fast_llm.engine.base_model.config import LossDef
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
@@ -58,11 +56,11 @@ def __init__(
     def get_layers(self) -> list[Layer]:
         return self.embeddings.get_layers() + self.decoder.get_layers() + self.head.get_layers()
 
-    def preprocess(self, batch: torch.Tensor, kwargs: dict[str, typing.Any]) -> None:
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
         # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
-        self.embeddings.preprocess(batch, kwargs)
-        self.decoder.preprocess(batch, kwargs)
-        self.head.preprocess(batch, kwargs)
+        self.embeddings.preprocess(kwargs)
+        self.decoder.preprocess(kwargs)
+        self.head.preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         # Needed because the base class uses `get_layers` which may bypass the decoder and head. TODO: Avoidable?
diff --git a/fast_llm/layers/language_model/multi_token_prediction.py b/fast_llm/layers/language_model/multi_token_prediction.py
index e0eb8175..ad3395a0 100644
--- a/fast_llm/layers/language_model/multi_token_prediction.py
+++ b/fast_llm/layers/language_model/multi_token_prediction.py
@@ -83,8 +83,8 @@ def get_layers(self) -> list[Layer]:
     def get_output_weights(self) -> list[torch.Tensor]:
         return sum((head.get_output_weights() for head in self.heads), [])
 
-    def preprocess(self, batch: "torch.Tensor", kwargs: dict[str, typing.Any]) -> None:
-        self._layers_with_namespace[0].preprocess(batch, kwargs)
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._layers_with_namespace[0].preprocess(kwargs)
 
     def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
         return self.blocks[0].get_loss_definitions(count=count * self._config.prediction_heads) + [
diff --git a/fast_llm/layers/vision/__init__.py b/fast_llm/layers/vision/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/fast_llm/layers/vision/config.py b/fast_llm/layers/vision/config.py
new file mode 100644
index 00000000..fb05a520
--- /dev/null
+++ b/fast_llm/layers/vision/config.py
@@ -0,0 +1,140 @@
+import functools
+import typing
+
+from fast_llm.config import Config, Field, FieldHint, check_field, config_class
+from fast_llm.layers.block.config import BlockConfig, BlockKwargs, BlockSequenceConfig
+from fast_llm.layers.common.linear.config import Convolution2DConfig
+from fast_llm.layers.common.normalization.config import NormalizationConfig
+from fast_llm.layers.decoder.config import MLPBaseConfig
+from fast_llm.layers.language_model.config import LanguageModelConfig
+from fast_llm.utils import Assert
+
+if typing.TYPE_CHECKING:
+    from fast_llm.layers.vision.patch_convolution import PatchConvolution
+    from fast_llm.layers.vision.vision_encoder import VisionEncoder, VisionMultiModalModel
+
+
+class VisionKwargs(BlockKwargs):
+    patch_positions = "patch_positions"
+
+
+@config_class()
+class ImageNormalizationConfig(Config):
+    mean_r: float = Field(
+        default=0.48145466,
+        desc="Mean value for the red channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    mean_g: float = Field(
+        default=0.4578275,
+        desc="Mean value for the green channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    mean_b: float = Field(
+        default=0.40821073,
+        desc="Mean value for the blue channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    std_r: float = Field(
+        default=0.26862954,
+        desc="Standard deviation value for the red channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    std_g: float = Field(
+        default=0.26130258,
+        desc="Standard deviation value for the green channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    std_b: float = Field(
+        default=0.27577711,
+        desc="Standard deviation value for the blue channel in the image normalization process.",
+        hint=FieldHint.optional,
+    )
+    rescale_factor: float = Field(
+        default=255.0,
+        desc="Rescale factor for the image normalization process.",
+        hint=FieldHint.optional,
+    )
+
+
+@config_class()
+class PatchConvolutionConfig(BlockConfig):
+    _abstract = False
+    convolution: Convolution2DConfig = Field(
+        desc="Configuration for the 2d convolution.",
+        hint=FieldHint.architecture,
+    )
+    normalization: NormalizationConfig = Field(
+        desc="Configuration for the normalization layer.",
+        hint=FieldHint.architecture,
+    )
+    patch_height: int = Field(
+        default=16,
+        desc="Height of image patches, in pixels.",
+        hint=FieldHint.core,
+    )
+    patch_width: int = Field(
+        default=16,
+        desc="Width of image patches, in pixels.",
+        hint=FieldHint.core,
+    )
+    full_precision_residual: bool = Field(
+        default=False,
+        desc="Store the residuals for the model in full precision (`optimization_dtype`).",
+        hint=FieldHint.stability,
+    )
+
+    @functools.cached_property
+    def input_channels(self):
+        # Number of input channels. Currently hard-coded to 3 (RGB).
+        return 3
+
+    @property
+    def layer_class(self) -> "type[PatchConvolution]":
+        from fast_llm.layers.vision.patch_convolution import PatchConvolution
+
+        return PatchConvolution
+
+
+@config_class(registry=True)
+class VisionEncoderConfig(BlockConfig):
+    _abstract = False
+    patch_convolution: PatchConvolutionConfig = Field(
+        desc="Configuration for the patch convolution layer.",
+        hint=FieldHint.architecture,
+    )
+    # TODO: Should use varlen mixer, 2d rotary, non-causal. Enforce?
+    encoder: BlockSequenceConfig = Field(
+        desc="Configuration for the vision decoder.",
+        hint=FieldHint.architecture,
+    )
+    adapter: MLPBaseConfig = Field(
+        desc="Configuration for the adapter layer.",
+        hint=FieldHint.architecture,
+    )
+    hidden_size: int = Field(
+        default=1024,
+        desc="Size of the vision encoder main hidden dimension.",
+        hint=FieldHint.architecture,
+        valid=check_field(Assert.gt, 0),
+    )
+
+    @property
+    def layer_class(self) -> "type[VisionEncoder]":
+        from fast_llm.layers.vision.vision_encoder import VisionEncoder
+
+        return VisionEncoder
+
+
+@config_class()
+class VisionMultiModalModelConfig(LanguageModelConfig):
+    vision_encoder: VisionEncoderConfig = Field(
+        hint=FieldHint.architecture,
+        desc="Configuration for the vision encoder.",
+    )
+
+    @property
+    def layer_class(self) -> "type[VisionMultiModalModel]":
+        from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
+
+        return VisionMultiModalModel
diff --git a/fast_llm/layers/vision/patch_convolution.py b/fast_llm/layers/vision/patch_convolution.py
new file mode 100644
index 00000000..e744044c
--- /dev/null
+++ b/fast_llm/layers/vision/patch_convolution.py
@@ -0,0 +1,74 @@
+import typing
+
+import torch
+
+from fast_llm.core.ops import split
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
+from fast_llm.engine.distributed.config import DistributedConfig, DistributedDimNames
+from fast_llm.layers.attention.config import AttentionKwargs
+from fast_llm.layers.block.block import Block
+from fast_llm.layers.common.peft.config import PeftConfig
+from fast_llm.layers.vision.config import PatchConvolutionConfig, VisionKwargs
+from fast_llm.tensor import TensorMeta
+
+
+class PatchConvolution[ConfigType: PatchConvolutionConfig](Block[ConfigType]):
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+        *,
+        # TODO: Input or output dim?
+        hidden_dim: TensorDim,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ):
+        super().__init__(
+            config,
+            distributed_config,
+            hidden_dim=hidden_dim,
+            lr_scale=lr_scale,
+            peft=peft,
+        )
+        self._residual_dtype = (
+            self._distributed_config.optimization_dtype
+            if self._config.full_precision_residual
+            else self._distributed_config.compute_dtype
+        ).torch
+        self._parallel_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.tensor)
+
+        self.convolution = self._config.convolution.get_layer(
+            TensorDim("input_channels", self._config.input_channels),
+            self._hidden_dim,
+            TensorDim("patch_height", self._config.patch_height),
+            TensorDim("patch_width", self._config.patch_width),
+            stride=(self._config.patch_height, self._config.patch_width),
+            default_add_bias=False,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+        self.normalization = self._config.normalization.get_layer(hidden_dim, lr_scale=self._lr_scale, peft=self._peft)
+
+    def forward(
+        self,
+        input_: torch.Tensor,
+        kwargs: dict[str, typing.Any],
+        losses: dict[str, typing.Any] | None = None,
+        metrics: dict | None = None,
+    ) -> torch.Tensor:
+        if isinstance(input_, TensorMeta):
+            return TensorMeta.from_dims(
+                kwargs[VisionKwargs.hidden_dims],
+                tensor_name="Patch convolution output",
+                dtype=self._residual_dtype,
+            )
+        if self._sequence_parallel:
+            input_ = split(input_, group=self._parallel_dim.group, dim=0)
+        patch_embeddings = (
+            self.normalization(self.convolution(input_).flatten(1))
+            .view(-1, self._hidden_dim.size)
+            .unsqueeze(int(kwargs[AttentionKwargs.sequence_first]))
+        )
+        return patch_embeddings.to(self._residual_dtype)
diff --git a/fast_llm/layers/vision/vision_encoder.py b/fast_llm/layers/vision/vision_encoder.py
new file mode 100644
index 00000000..e6261600
--- /dev/null
+++ b/fast_llm/layers/vision/vision_encoder.py
@@ -0,0 +1,112 @@
+import functools
+import logging
+import typing
+
+from fast_llm.engine.base_model.base_model import Layer, LayerBaseWithNamespace
+from fast_llm.engine.base_model.config import LossDef
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
+from fast_llm.engine.distributed.config import DistributedConfig
+from fast_llm.layers.block.block import BlockBase
+from fast_llm.layers.common.peft.config import PeftConfig
+from fast_llm.layers.language_model.language_model import LanguageModel
+from fast_llm.layers.vision.config import VisionEncoderConfig, VisionMultiModalModelConfig
+
+logger = logging.getLogger(__name__)
+
+
+class VisionEncoder[ConfigType: VisionEncoderConfig](BlockBase[ConfigType]):
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+        *,
+        hidden_dim: TensorDim,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ):
+        super().__init__(config, distributed_config, hidden_dim=hidden_dim, lr_scale=lr_scale, peft=peft)
+        vision_hidden_dim = TensorDim("hidden", self._config.hidden_size)
+        self.patch_convolution = self._config.patch_convolution.get_layer(
+            distributed_config,
+            vision_hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+        self.encoder = self._config.encoder.get_layer(
+            distributed_config,
+            vision_hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+        self.adapter = self._config.adapter.get_layer(
+            distributed_config,
+            vision_hidden_dim,
+            output_dim=self._hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+
+    def get_layers(self) -> list["Layer"]:
+        return self.patch_convolution.get_layers() + self.encoder.get_layers() + self.adapter.get_layers()
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        # Needed because the base class uses `get_layers` which may bypass the decoder. TODO: Avoidable?
+        self.patch_convolution.preprocess(kwargs)
+        self.encoder.preprocess(kwargs)
+        self.adapter.preprocess(kwargs)
+
+    def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
+        # Needed because the base class uses `get_layers` which may bypass the decoder. TODO: Avoidable?
+        return (
+            self.patch_convolution.get_loss_definitions(count)
+            + self.encoder.get_loss_definitions(count)
+            + self.adapter.get_loss_definitions(count)
+        )
+
+
+class VisionMultiModalModel[ConfigType: VisionMultiModalModelConfig](LanguageModel[ConfigType]):
+    _config: ConfigType
+
+    def __init__(
+        self,
+        config: ConfigType,
+        distributed_config: DistributedConfig,
+        *,
+        # TODO: Unused, but required by the `BlockBase` interface.
+        hidden_dim: TensorDim | None = None,
+        lr_scale: float | None,
+        peft: PeftConfig | None,
+    ):
+        super().__init__(
+            config,
+            distributed_config,
+            hidden_dim=TensorDim("hidden", config.hidden_size),
+            lr_scale=lr_scale,
+            peft=peft,
+        )
+        self.vision_encoder = self._config.vision_encoder.get_layer(
+            distributed_config,
+            hidden_dim=self._hidden_dim,
+            lr_scale=self._lr_scale,
+            peft=self._peft,
+        )
+
+    def get_layers(self) -> list[Layer]:
+        return self._vision_encoder_with_namespace.get_layers() + super().get_layers()
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        self._vision_encoder_with_namespace.preprocess(kwargs)
+        super().preprocess(kwargs)
+
+    def get_loss_definitions(self, count: int = 1) -> list[LossDef]:
+        return self.vision_encoder.get_loss_definitions(count) + super().get_loss_definitions(count)
+
+    @functools.cached_property
+    def _vision_encoder_namespace(self) -> str:
+        return self.vision_encoder.module_name
+
+    @functools.cached_property
+    def _vision_encoder_with_namespace(self) -> LayerBaseWithNamespace:
+        return LayerBaseWithNamespace(self.vision_encoder, self._vision_encoder_namespace)
diff --git a/fast_llm/models/auto.py b/fast_llm/models/auto.py
index 41431462..7830c69a 100644
--- a/fast_llm/models/auto.py
+++ b/fast_llm/models/auto.py
@@ -5,4 +5,5 @@
 from fast_llm.layers.attention.config import AttentionConfig  # isort: skip
 from fast_llm.layers.ssm.config import MambaConfig, Mamba2Config, DiscreteMamba2Config  # isort: skip
 from fast_llm.models.gpt.config import GPTModelConfig, GPTTrainerConfig  # isort: skip
+from fast_llm.models.multimodal.config import MultiModalModelConfig, MultiModalTrainerConfig  # isort: skip
 from fast_llm.engine.evaluation.evaluators import EvaluatorsConfig  # isort: skip
diff --git a/fast_llm/models/gpt/model.py b/fast_llm/models/gpt/model.py
index 3295295f..2c1947af 100644
--- a/fast_llm/models/gpt/model.py
+++ b/fast_llm/models/gpt/model.py
@@ -198,7 +198,8 @@ def preprocess_batch(
                 **kwargs_meta,
                 AttentionKwargs.past_key_values: pasts,
                 AttentionKwargs.presents: presents,
-                AttentionKwargs.sequence_lengths: batch.tokens.lengths,
+                AttentionKwargs.sequence_lengths: cropped_tokens.lengths,
+                AttentionKwargs.device: self._distributed.device,
                 **reference_logits[i],
             }
 
@@ -235,7 +236,7 @@ def preprocess_batch(
                 if kwargs[AttentionKwargs.sequence_first]
                 else cropped_tokens.tokens
             ).contiguous()
-            self.preprocess(tokens, kwargs)
+            self.preprocess(kwargs)
             preprocessed.append((tokens, kwargs))
 
         return preprocessed
diff --git a/fast_llm/models/multimodal/__init__.py b/fast_llm/models/multimodal/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/fast_llm/models/multimodal/config.py b/fast_llm/models/multimodal/config.py
new file mode 100644
index 00000000..7bce7885
--- /dev/null
+++ b/fast_llm/models/multimodal/config.py
@@ -0,0 +1,81 @@
+import logging
+import typing
+
+from fast_llm.config import FieldUpdate, config_class
+from fast_llm.engine.checkpoint.config import CheckpointFormat
+from fast_llm.engine.config_utils.runnable import RunnableConfig
+from fast_llm.engine.multi_stage.config import FastLLMModelConfig
+from fast_llm.engine.training.config import TrainerConfig
+from fast_llm.layers.vision.config import VisionMultiModalModelConfig
+from fast_llm.models.gpt.config import (
+    GPTBaseModelConfig,
+    GPTBatchConfig,
+    GPTModelConfig,
+    GPTTrainerConfig,
+    PretrainedGPTModelConfig,
+)
+
+if typing.TYPE_CHECKING:
+    from fast_llm.models.multimodal.model import MultiModalBaseModel, MultiModalModel
+    from fast_llm.models.multimodal.trainer import MultiModalTrainer
+
+logger = logging.getLogger(__name__)
+
+
+@config_class()
+class MultiModalBatchConfig(GPTBatchConfig):
+    pass
+
+
+@config_class()
+class MultiModalBaseModelConfig(VisionMultiModalModelConfig, GPTBaseModelConfig):
+    @property
+    def base_model_class(self) -> type["MultiModalBaseModel"]:
+        from fast_llm.models.multimodal.model import MultiModalBaseModel
+
+        return MultiModalBaseModel
+
+
+@config_class(dynamic_type={FastLLMModelConfig: "multimodal"})
+class MultiModalModelConfig(GPTModelConfig):
+    _abstract = False
+    model_name: typing.ClassVar[str] = "multimodal"
+    base_model: MultiModalBaseModelConfig = FieldUpdate()
+    # TODO: ====== Conversion ======
+    checkpoint_formats: typing.ClassVar[tuple[type[CheckpointFormat], ...]] = FastLLMModelConfig.checkpoint_formats
+
+    @classmethod
+    def get_model_class(cls) -> type["MultiModalModel"]:
+        from fast_llm.models.multimodal.model import MultiModalModel
+
+        return MultiModalModel
+
+    @classmethod
+    def get_inference_runner_class(cls) -> type["MultiModalModelInferenceRunner"]:
+        from fast_llm.models.multimodal.model import MultiModalModelInferenceRunner
+
+        return MultiModalModelInferenceRunner
+
+    @classmethod
+    def get_huggingface_model_for_causal_lm_class(cls) -> type["HuggingfaceMultiModalModelForCausalLM"]:
+        from fast_llm.models.multimodal.huggingface import HuggingfaceMultiModalModelForCausalLM
+
+        return HuggingfaceMultiModalModelForCausalLM
+
+
+@config_class()
+class PretrainedMultiModalModelConfig(PretrainedGPTModelConfig):
+    _abstract = False
+    model: MultiModalModelConfig = FieldUpdate()
+
+
+@config_class(dynamic_type={RunnableConfig: "train_multimodal", TrainerConfig: "multimodal"})
+class MultiModalTrainerConfig(PretrainedMultiModalModelConfig, GPTTrainerConfig):
+    # TODO: Use dynamic model type?
+    reference_models: dict[str, PretrainedMultiModalModelConfig] = FieldUpdate()
+
+    @classmethod
+    def get_trainer_class(cls) -> type["MultiModalTrainer"]:
+        from fast_llm.models.multimodal.trainer import MultiModalTrainer
+
+        return MultiModalTrainer
diff --git a/fast_llm/models/multimodal/model.py b/fast_llm/models/multimodal/model.py
new file mode 100644
index 00000000..c30a5d27
--- /dev/null
+++ b/fast_llm/models/multimodal/model.py
@@ -0,0 +1,230 @@
+import logging
+import typing
+
+import torch
+
+from fast_llm.core.distributed import all_gather_scalar
+from fast_llm.data.sample.language_model import LanguageModelBatch
+from fast_llm.engine.config_utils.tensor_dim import TensorDim, scalar_dim
+from fast_llm.engine.distributed.config import DistributedDim, DistributedDimNames, PhaseType
+from fast_llm.engine.inference.runner import InferenceRunner
+from fast_llm.layers.attention.config import AttentionKwargs
+from fast_llm.layers.block.config import BlockDimNames
+from fast_llm.layers.language_model.config import LanguageModelKwargs
+from fast_llm.layers.vision.config import VisionKwargs
+from fast_llm.layers.vision.vision_encoder import VisionMultiModalModel
+from fast_llm.models.gpt.config import GPTBatchConfig
+from fast_llm.models.gpt.model import GPTBaseModel, GPTModel
+from fast_llm.models.multimodal.config import MultiModalBaseModelConfig, MultiModalBatchConfig, MultiModalModelConfig
+from fast_llm.tensor import TensorMeta
+
+logger = logging.getLogger(__name__)
+
+
+class PatchSequenceTensorDim(TensorDim):
+    """
+    A custom `TensorDim` class to handle the combined batch/sequence dimension in image patches.
+
+    A simple gather `TensorDim.local_to_global` yields inconsistent results between distributed configuration,
+    (because of the padding of image patches) which makes direct comparison in tests impossible.
+    This class solves the problem removing the padding in the tensor returned by `local_to_global`,
+    allowing for consistent results.
+    Note that `local_unpadded_size` must be set manually before any call to `local_to_global`.
+    """
+
+    local_unpadded_size: typing.ClassVar[int]
+
+    def __init__(self, name: str, global_size: int, parallel_dim: DistributedDim, batch_parallel_dim: DistributedDim):
+        super().__init__(name, global_size * batch_parallel_dim.size, parallel_dim, variable_size=True)
+        self._batch_parallel_dim = batch_parallel_dim
+
+    @property
+    def is_parallel(self) -> bool:
+        # Ensure `local_to_global` is called in non-parallel setting.
+        return True
+
+    def replace_parallel_dim(self, distributed_dim: DistributedDim) -> typing.Self:
+        raise NotImplementedError()
+
+    def local_to_global(self, tensor: "torch.Tensor", dim: int = 0) -> "torch.Tensor":
+        assert hasattr(self, "local_unpadded_size")
+        batch_parallel_group = self._batch_parallel_dim.group
+        global_padded_tensor = super().local_to_global(tensor, dim)
+
+        if batch_parallel_group is None:
+            return global_padded_tensor[*(slice(None) for _ in range(dim)), : self.local_unpadded_size]
+        else:
+            unpadded_sequence_lengths = all_gather_scalar(self.local_unpadded_size, torch.int32, batch_parallel_group)
+            return torch.cat(
+                [
+                    tensor[*(slice(None) for _ in range(dim)), :unpadded_sequence_length]
+                    for tensor, unpadded_sequence_length in zip(
+                        global_padded_tensor.chunk(batch_parallel_group.size(), dim=dim),
+                        unpadded_sequence_lengths,
+                        strict=True,
+                    )
+                ],
+                dim=dim,
+            )
+
+    def local_to_global_partial(
+        self, tensor: "torch.Tensor", dim: int = 0, fill_value: float | int = -1
+    ) -> "torch.Tensor":
+        # Not needed.
+        raise NotImplementedError()
+
+    def global_to_local(self, tensor: "torch.Tensor", dim: int = 0, expand: bool = False) -> "torch.Tensor":
+        # Not needed.
+        raise NotImplementedError()
+
+
+class MultiModalBaseModel[ConfigType: MultiModalBaseModelConfig](
+    GPTBaseModel[ConfigType], VisionMultiModalModel[ConfigType]
+):
+    """
+    A transformer-based language model generalizing the GPT model architecture.
+    """
+
+    _config: ConfigType
+
+    def preprocess_meta(
+        self, batch_meta: GPTBatchConfig | torch.Tensor, phase: PhaseType
+    ) -> list[tuple[TensorMeta, dict]]:
+        preprocessed_meta = []
+        for tokens, kwargs in super().preprocess_meta(batch_meta, phase):
+            kwargs[LanguageModelKwargs.token_ids] = tokens
+            kwargs[LanguageModelKwargs.mask_inputs] = True
+            # TODO: What about sequence data?
+            batch_data_dim = self._distributed_config.get_distributed_dim(DistributedDimNames.batch_data)
+
+            micro_sequence_length = tokens.global_shape.numel()
+
+            batch_and_sequence_q_dim = PatchSequenceTensorDim(
+                BlockDimNames.sequence_q,
+                micro_sequence_length,
+                self._distributed_config.get_distributed_dim(DistributedDimNames.data),
+                batch_data_dim,
+            )
+            hidden_batch_and_sequence_q_dim = (
+                PatchSequenceTensorDim(
+                    BlockDimNames.sequence_q_tp,
+                    micro_sequence_length,
+                    self._distributed_config.get_distributed_dim(DistributedDimNames.tensor_and_data),
+                    batch_data_dim,
+                )
+                if self._distributed_config.sequence_tensor_parallel
+                else batch_and_sequence_q_dim
+            )
+            # These are used by the model (preprocessing) and shouldn't see the batch-parallel dim.
+            sequence_q_dim = TensorDim(
+                BlockDimNames.sequence_q,
+                micro_sequence_length,
+                self._distributed_config.get_distributed_dim(DistributedDimNames.sequence_data),
+            )
+            sequence_k_dim = TensorDim(BlockDimNames.sequence_k, micro_sequence_length)
+
+            image_patches = TensorMeta.from_dims(
+                (
+                    # We combine the batch and sequence dims to allow for variable sequence lengths.
+                    # Gives the same result, assuming we disable cross-image attention (TODO: Enforce)
+                    batch_and_sequence_q_dim,
+                    # TODO: Relate to tensor dims in patch convolution.
+                    TensorDim("input_channels", self._config.vision_encoder.patch_convolution.input_channels),
+                    TensorDim("patch_height", self._config.vision_encoder.patch_convolution.patch_height),
+                    TensorDim("patch_width", self._config.vision_encoder.patch_convolution.patch_width),
+                )
+            )
+            hidden_dims = (
+                (hidden_batch_and_sequence_q_dim, scalar_dim, self.vision_encoder._hidden_dim)
+                if (sequence_first := kwargs[LanguageModelKwargs.sequence_first])
+                else (scalar_dim, hidden_batch_and_sequence_q_dim, self.vision_encoder._hidden_dim)
+            )
+            kwargs[self._vision_encoder_namespace] = {
+                VisionKwargs.sequence_first: sequence_first,
+                VisionKwargs.sequence_k_dim: sequence_k_dim,
+                VisionKwargs.sequence_q_dim: sequence_q_dim,
+                VisionKwargs.hidden_dims: hidden_dims,
+            }
+
+            preprocessed_meta.append((image_patches, kwargs))
+
+        return preprocessed_meta
+
+    def preprocess_batch(
+        self,
+        batch: LanguageModelBatch,
+        preprocessed_meta: list[tuple[TensorMeta, dict]] | None = None,
+        *,
+        phase: PhaseType,
+        iteration: int,
+        metrics: dict | None = None,
+    ) -> list[tuple[torch.Tensor, dict]]:
+        preprocessed = super().preprocess_batch(
+            batch, preprocessed_meta, phase=phase, iteration=iteration, metrics=metrics
+        )
+        # TODO: Support micro-sequences.
+        assert len(preprocessed) == 1, "Micro-sequences not supported for MultiModalModel."
+        tokens, kwargs = preprocessed[0]
+
+        kwargs[LanguageModelKwargs.token_ids] = tokens
+
+        # If document cropping is enabled, extra tokens may belong to images and need to be removed.
+        # TODO: Handle earlier.
+        tokens_end = kwargs[AttentionKwargs.sequence_k_dim].size
+        tokens_begin = tokens_end - kwargs[AttentionKwargs.sequence_q_dim].size
+        cropped_image_patches = batch.image_patches.crop(tokens_begin, tokens_end)
+
+        sequence_length = tokens.shape[:2].numel()
+        pad_size = sequence_length - cropped_image_patches.patches.size(0)
+
+        patches = cropped_image_patches.patches.to(self._distributed.config.compute_dtype.torch)
+        patches = torch.cat([patches, patches.new_zeros((pad_size,) + patches.shape[1:])])
+
+        positions = torch.cat(
+            [
+                cropped_image_patches.positions,
+                cropped_image_patches.positions.new_zeros((pad_size,) + cropped_image_patches.positions.shape[1:]),
+            ]
+        )
+
+        kwargs[self._vision_encoder_namespace] = {
+            **kwargs[self._vision_encoder_namespace],
+            VisionKwargs.patch_positions: positions,
+            VisionKwargs.sequence_lengths: [cropped_image_patches.lengths + [pad_size]],
+            VisionKwargs.sequence_length: sequence_length,
+            VisionKwargs.device: self._distributed.device,
+        }
+        # We need to modify `local_unpadded_size` directly in `preprocessed_meta` since it's the one used by the engine.
+        # Unsafe, but only needed for testing.
+        # TODO: Doesn't work with gradient accumulation (only sees the last value).
+        hidden_batch_and_sequence_q_dim = kwargs[self._vision_encoder_namespace][VisionKwargs.hidden_dims][
+            0 if kwargs[self._vision_encoder_namespace][VisionKwargs.sequence_first] else 1
+        ]
+        print(kwargs[self._vision_encoder_namespace][VisionKwargs.hidden_dims])
+        print(hidden_batch_and_sequence_q_dim)
+        assert isinstance(hidden_batch_and_sequence_q_dim, PatchSequenceTensorDim)
+        PatchSequenceTensorDim.local_unpadded_size = cropped_image_patches.patches.size(0)
+
+        kwargs[LanguageModelKwargs.embedding_map] = (
+            (cropped_image_patches.token_map, cropped_image_patches.sample_map)
+            if kwargs[LanguageModelKwargs.sequence_first]
+            else (cropped_image_patches.sample_map, cropped_image_patches.token_map)
+        )
+
+        super().preprocess(kwargs)
+
+        return [(patches, kwargs)]
+
+    def preprocess(self, kwargs: dict[str, typing.Any]) -> None:
+        # Hack to delay preprocessing in super().preprocess_batch (TODO: Improve)
+        pass
+
+
+class MultiModalModel[ConfigType: MultiModalModelConfig](GPTModel[ConfigType]):
+    # TODO: Can we drop class?
+    pass
+
+
+class MultiModalInferenceRunner(InferenceRunner):
+    model_class: typing.ClassVar[type[MultiModalModel]] = MultiModalModel
+    batch_config_class: typing.ClassVar[type[MultiModalBatchConfig]] = MultiModalBatchConfig
diff --git a/fast_llm/models/multimodal/trainer.py b/fast_llm/models/multimodal/trainer.py
new file mode 100644
index 00000000..2beee109
--- /dev/null
+++ b/fast_llm/models/multimodal/trainer.py
@@ -0,0 +1,10 @@
+import logging
+
+from fast_llm.models.gpt.trainer import GPTTrainer
+from fast_llm.models.multimodal.config import MultiModalTrainerConfig
+
+logger = logging.getLogger(__name__)
+
+
+class MultiModalTrainer[ConfigType: MultiModalTrainerConfig](GPTTrainer[ConfigType]):
+    pass
diff --git a/fast_llm/tensor.py b/fast_llm/tensor.py
index b709ea83..f4469df9 100644
--- a/fast_llm/tensor.py
+++ b/fast_llm/tensor.py
@@ -76,6 +76,7 @@ def __init__(
         self._reductions = reductions
         for dim, op in reductions:
             assert isinstance(dim, DistributedDim), dim
+        self._variable_shape = any(dim.variable_size for dim in self.dims)
 
     def __new__(
         cls,
@@ -142,6 +143,14 @@ def from_dims(
     def global_shape(self) -> torch.Size:
         return torch.Size([dim.global_size for dim in self.dims])
 
+    def verify_shape(self, tensor: torch.Tensor, global_: bool = False):
+        if self._variable_shape:
+            for size, dim in zip(tensor.shape, self.dims, strict=True):
+                if not dim.variable_size:
+                    Assert.eq(size, dim.global_size if global_ else dim.size, msg=self)
+        else:
+            Assert.eq(tensor.shape, self.global_shape if global_ else self.shape, msg=self)
+
     def local_to_global(self, tensor: torch.Tensor) -> tuple[torch.Tensor, ...]:
         """
         Reconstruct a global tensor from its distributed slices. Support lazy-loaded safetensor slices.
@@ -149,7 +158,7 @@ def local_to_global(self, tensor: torch.Tensor) -> tuple[torch.Tensor, ...]:
         """
         if tensor.ndim == 0:
             tensor = tensor[None]
-        Assert.eq(tensor.shape, self.shape)
+        self.verify_shape(tensor, False)
         # Tensors are always either split or duplicated in the tensor-parallel direction.
         # TODO: Avoid hard-coded assumptions on duplication
         is_first_rank, modified = True, False
@@ -167,7 +176,7 @@ def local_to_global(self, tensor: torch.Tensor) -> tuple[torch.Tensor, ...]:
                     tensor = tensor.clone()
                 tensor = reduce_op(tensor, distributed_dim.group, op=op)
                 is_first_rank, modified = is_first_rank and distributed_dim.group.rank() == 0, True
-        Assert.eq(tensor.shape, self.global_shape)
+        self.verify_shape(tensor, True)
         return tensor, is_first_rank
 
     def local_to_global_partial(self, tensor: torch.Tensor, fill_value: float | int = -1) -> torch.Tensor:
@@ -179,13 +188,13 @@ def local_to_global_partial(self, tensor: torch.Tensor, fill_value: float | int
         """
         if tensor.ndim == 0:
             tensor = tensor[None]
-        Assert.eq(tensor.shape, self.shape)
+        self.verify_shape(tensor, False)
         assert not self._reductions
         for dim, tensor_dim in enumerate(self.dims):
             if tensor_dim.is_parallel:
                 tensor = tensor_dim.local_to_global_partial(tensor, dim, fill_value)
 
-        Assert.eq(tensor.shape, self.global_shape)
+        self.verify_shape(tensor, True)
         return tensor
 
     def global_to_local(self, tensor: torch.Tensor | SafeTensorSlice) -> torch.Tensor:
@@ -198,12 +207,12 @@ def global_to_local(self, tensor: torch.Tensor | SafeTensorSlice) -> torch.Tenso
         assert not self._reductions
         if tensor.ndim == 0:
             tensor = tensor[None]
-        Assert.eq(tensor.shape, self.global_shape, msg=self)
+        self.verify_shape(tensor, True)
 
         for dim, tensor_dim in reversed(list(enumerate(self.dims))):
             tensor = tensor_dim.global_to_local(tensor, dim)
 
-        Assert.eq(tensor.shape, self.shape, msg=self)
+        self.verify_shape(tensor, False)
         return tensor
 
     @classmethod
diff --git a/setup.cfg b/setup.cfg
index 77073ab5..329277a0 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -43,7 +43,7 @@ OPTIONAL =
 
 # Huggingface tools
 HUGGINGFACE =
-    transformers>=4.52.4
+    transformers>=4.53.2
     hf-transfer>=0.1.9
     datasets>=3.6.0
     huggingface-hub>=0.32.6
@@ -59,6 +59,13 @@ GENERATION =
     lm_eval>=0.4.9
 
 
+# Required for supporting vision inputs
+VISION =
+    # Vision Tools
+    webp>=0.4.0
+    pillow-simd>=9.5.0
+    torchvision>=0.20.0
+
 DEV =
     # Pre-commit git hook
     pre-commit>=4.2.0
diff --git a/tests/functional/test_triton_kernels.py b/tests/functional/test_triton_kernels.py
index b5d88e0a..807d3880 100644
--- a/tests/functional/test_triton_kernels.py
+++ b/tests/functional/test_triton_kernels.py
@@ -1,6 +1,7 @@
 import pytest
 import torch
 
+from fast_llm.engine.config_utils.tensor_dim import TensorDim
 from fast_llm.functional.config import (
     MAX_DROPLESS_BLOCK_SIZE_ROW,
     ActivationType,
@@ -92,7 +93,7 @@ def test_triton_rotary(batch_size, sequence_length, num_heads, head_size):
     y1 = apply_rotary_embeddings(
         x,
         DefaultRotaryConfig(triton=False)
-        .get_layer(None)
+        .get_layer(TensorDim("", head_size))
         ._get_frequencies(
             sequence_length,
             head_size,
@@ -104,7 +105,7 @@ def test_triton_rotary(batch_size, sequence_length, num_heads, head_size):
         triton_rotary_(
             convert_rotary_complex_to_real(x, head_size, 3),
             DefaultRotaryConfig(triton=True)
-            .get_layer(None)
+            .get_layer(TensorDim("", head_size))
             ._get_frequencies(sequence_length, head_size, device="cuda"),
         ),
         head_size,
diff --git a/tests/test_attention.py b/tests/test_attention.py
index b86cc95f..f1409b95 100644
--- a/tests/test_attention.py
+++ b/tests/test_attention.py
@@ -1,3 +1,4 @@
+import pytest
 import torch
 
 from fast_llm.engine.config_utils.tensor_dim import TensorDim
@@ -6,10 +7,13 @@
 from fast_llm.layers.attention.config import AttentionConfig, AttentionImplementation, AttentionKwargs
 from fast_llm.layers.block.config import BlockDimNames
 from fast_llm.utils import Assert
+from tests.utils.utils import requires_cuda
 
 
+# TODO: ====== micro-sequence ======
+@pytest.mark.skip
 def test_varlen_preprocessing():
-    sequence_lengths = [torch.tensor([8, 13, 4, 11], dtype=torch.int32), torch.tensor([11, 16, 9], dtype=torch.int32)]
+    sequence_lengths = [[8, 13, 4, 11], [11, 16, 9]]
     # First micro-sequence:
     # [0...7,0...3] + [0...10,0] -> [0,8,12,23,24]
     # Second micro-sequence:
@@ -43,7 +47,52 @@ def test_varlen_preprocessing():
             ),
             AttentionKwargs.sequence_length: sequence_length,
             AttentionKwargs.sequence_lengths: sequence_lengths,
+            AttentionKwargs.device: torch.device("cpu"),
         }
-        attention.preprocess(torch.empty(1, device="cpu"), kwargs)
+        attention.preprocess(kwargs)
         Assert.all_equal(kwargs[AttentionKwargs.cu_seqlens_q], cumulative_sequences_q[micro_seq_idx])
         Assert.all_equal(kwargs[AttentionKwargs.cu_seqlens_k], cumulative_sequences_k[micro_seq_idx])
+
+
+@requires_cuda
+@pytest.mark.parametrize("cross_document_attention", (True, False))
+@pytest.mark.parametrize(("causal", "window_size"), ((True, None), (True, 50), (False, None)))
+def test_attention_implementations(cross_document_attention: bool, causal: bool, window_size: int | None):
+    """
+    Check that the flash and backup attention implementation give the same result.
+    """
+    attention: Attention = AttentionConfig(
+        head_size=32,
+        heads=4,
+        head_groups=2,
+        window_size=window_size,
+        cross_document_attention=cross_document_attention,
+        causal=causal,
+    ).get_layer(
+        DistributedConfig(compute_dtype="bfloat16"),
+        TensorDim("hidden_size", 256),
+        lr_scale=None,
+        peft=None,
+    )
+    query = torch.empty(4, 100, 4, 32, dtype=torch.bfloat16, device="cuda").normal_()
+    key = torch.empty(4, 100, 2, 32, dtype=torch.bfloat16, device="cuda").normal_()
+    value = torch.empty(4, 100, 2, 32, dtype=torch.bfloat16, device="cuda").normal_()
+    kwargs = {
+        AttentionKwargs.device: torch.device("cuda"),
+        AttentionKwargs.sequence_length: 100,
+        AttentionKwargs.sequence_lengths: [
+            [20, 32, 10, 11, 9, 18],
+            [100],
+            [2, 8, 22, 7, 6, 5, 1, 10, 4, 11, 3, 8, 4, 9],
+            [5 for _ in range(20)],
+        ],
+        AttentionKwargs.sequence_q_dim: TensorDim("sequence_q", 100),
+        AttentionKwargs.sequence_k_dim: TensorDim("sequence_k", 100),
+    }
+    attention._preprocess_for_backup_attention(kwargs)
+    attention._preprocess_for_flash_attention(kwargs)
+
+    out_backup = attention._attn_backup(query, key, value, kwargs)
+    out_flash = attention._attn_flash(query, key, value, kwargs)
+
+    Assert.rms_close(out_backup, out_flash, 2e-3)
diff --git a/tests/test_config.py b/tests/test_config.py
index 9a1f542a..4020b6fb 100644
--- a/tests/test_config.py
+++ b/tests/test_config.py
@@ -236,13 +236,23 @@ def test_distributed_global_ranks(bdp: int, sdp: int, tp: int, pp: int, pipeline
             tp,
             rank,
         )
-        _check_dim(
-            tp_sdp_dim := config.get_distributed_dim(DistributedDimNames.tensor_and_sequence_data),
-            DistributedDimNames.tensor_and_sequence_data,
-            dp_rank % sdp * tp + tp_rank,
-            tp * sdp,
-            rank,
-        )
+        if not pipeline_first:
+            _check_dim(
+                tp_sdp_dim := config.get_distributed_dim(DistributedDimNames.tensor_and_sequence_data),
+                DistributedDimNames.tensor_and_sequence_data,
+                dp_rank % sdp * tp + tp_rank,
+                tp * sdp,
+                rank,
+            )
+            _check_dim(
+                tp_dp_dim := config.get_distributed_dim(DistributedDimNames.tensor_and_data),
+                DistributedDimNames.tensor_and_data,
+                dp_rank * tp + tp_rank,
+                tp * dp,
+                rank,
+            )
+            all_global_ranks["tp_sdp"].add(tuple(tp_sdp_dim.global_ranks))
+            all_global_ranks["tp_dp"].add(tuple(tp_dp_dim.global_ranks))
         _check_dim(
             sdp_dim := config.get_distributed_dim(DistributedDimNames.sequence_data),
             DistributedDimNames.sequence_data,
@@ -273,7 +283,6 @@ def test_distributed_global_ranks(bdp: int, sdp: int, tp: int, pp: int, pipeline
         )
         all_global_ranks["world"].add(tuple(world_dim.global_ranks))
         all_global_ranks["tp"].add(tuple(tp_dim.global_ranks))
-        all_global_ranks["tp_sdp"].add(tuple(tp_sdp_dim.global_ranks))
         all_global_ranks["sdp"].add(tuple(sdp_dim.global_ranks))
         all_global_ranks["bdp"].add(tuple(bdp_dim.global_ranks))
         all_global_ranks["dp"].add(tuple(dp_dim.global_ranks))
diff --git a/tests/test_multi_stage.py b/tests/test_multi_stage.py
index 407b4776..e3870a7b 100644
--- a/tests/test_multi_stage.py
+++ b/tests/test_multi_stage.py
@@ -6,7 +6,6 @@
 from fast_llm.engine.multi_stage.config import FastLLMModelConfig
 from fast_llm.engine.multi_stage.fast_llm_model import FastLLMModel
 from fast_llm.utils import Assert
-from tests.utils.dataset import get_model_test_dataset
 from tests.utils.model_configs import ModelTestingGroup
 from tests.utils.utils import requires_cuda
 
@@ -22,7 +21,7 @@ def _get_model(config_dict: dict, model_type: str = "gpt") -> FastLLMModel:
 @requires_cuda
 @pytest.mark.model_testing_group(ModelTestingGroup.basic)
 def test_frozen_weights(model_testing_config):
-    get_model_test_dataset()
+    model_testing_config.get_dataset()
     frozen_config_dict = copy.deepcopy(model_testing_config.config_dict)
     decoder_config = frozen_config_dict["model"]["base_model"]["decoder"]
     if (decoder_type := decoder_config.get("type", "fixed")) == "fixed":
diff --git a/tests/utils/dataset.py b/tests/utils/dataset.py
index b88f834c..ed3f0130 100644
--- a/tests/utils/dataset.py
+++ b/tests/utils/dataset.py
@@ -172,7 +172,8 @@ def _get_test_dataset(
     image_patch_config: ImagePatchConfig | None = None,
     min_image_size: int = 4,
     max_image_size: int = 32,
-):
+    config_only: bool = False,
+) -> tuple[pathlib.Path, dict[str, typing.Any], pathlib.Path]:
     config_paths = (
         [path / "fast_llm_config.yaml"]
         if splits is None
@@ -180,7 +181,7 @@ def _get_test_dataset(
     )
     hf_path = path / "hf"
 
-    if not all(config_path.is_file() for config_path in config_paths):
+    if not config_only and not all(config_path.is_file() for config_path in config_paths):
         dataset = _get_hf_test_dataset(
             seed=seed,
             num_documents=num_documents,
@@ -284,5 +285,30 @@ def get_test_dataset_with_image_patches(image_break_token: int | None = None, im
     )
 
 
-def get_model_test_dataset():
-    return _get_test_dataset(DATASET_CACHE / "model_dataset", seed=1234, vocab_size=MODEL_TEST_VOCAB_SIZE)
+def get_model_test_dataset(config_only: bool = False):
+    return _get_test_dataset(
+        DATASET_CACHE / "model_dataset",
+        seed=1234,
+        vocab_size=MODEL_TEST_VOCAB_SIZE,
+        splits={"training": 969, "validation": 30, "test": 1},
+        config_only=config_only,
+    )
+
+
+def get_multimodal_test_dataset(config_only: bool = False):
+    return _get_test_dataset(
+        DATASET_CACHE / "model_dataset_multimodal",
+        seed=1234,
+        vocab_size=MODEL_TEST_VOCAB_SIZE,
+        max_images=2,
+        image_patch_config=ImagePatchConfig(
+            height=4,
+            width=4,
+            max_image_height=16,
+            max_image_width=16,
+            image_break_token=None,
+            image_end_token=None,
+        ),
+        splits={"training": 969, "validation": 30, "test": 1},
+        config_only=config_only,
+    )
diff --git a/tests/utils/model_configs.py b/tests/utils/model_configs.py
index 956aaea5..1ed99416 100644
--- a/tests/utils/model_configs.py
+++ b/tests/utils/model_configs.py
@@ -3,6 +3,7 @@
 import enum
 import functools
 import os
+import pathlib
 import typing
 
 import pytest
@@ -21,8 +22,9 @@
     MTPLlamaCheckpointFormat,
     Qwen2CheckpointFormat,
 )
+from tests.utils.dataset import get_model_test_dataset, get_multimodal_test_dataset
 from tests.utils.distributed_configs import DistributedTestingConfig
-from tests.utils.global_variables import MODEL_DATASET_SHARD_PATH, MODEL_TEST_VOCAB_SIZE
+from tests.utils.global_variables import MODEL_TEST_VOCAB_SIZE
 
 from fast_llm.engine.evaluation.evaluators import (  # isort:skip  # needed for dynamic type registration
     EvaluatorsConfig,
@@ -78,6 +80,13 @@ class ModelTestingConfig:
     compare_factor: float = 1.0
     # Option to skip specific distributed configuration with name containing any of the provided strings.
     skip_tests: tuple[str] = ()
+    get_dataset: typing.Callable[[bool], tuple[pathlib.Path, dict[str, typing.Any], pathlib.Path]] = (
+        get_model_test_dataset
+    )
+
+    def __post_init__(self):
+        _, config, _ = self.get_dataset(config_only=True)
+        self.config_dict["data"]["datasets"] = config
 
     @functools.cached_property
     def config_args(self):
@@ -205,6 +214,7 @@ def _update_and_add_testing_config(
                             "heads": 8,
                             "head_groups": 8,
                             "head_size": 32,
+                            # "cross_document_attention":False,
                         },
                         "mlp": {
                             "layer_1": {"weight": init_1},
@@ -231,27 +241,7 @@ def _update_and_add_testing_config(
             },
         },
         "batch": {"batch_size": 8, "sequence_length": 512},
-        "data": {
-            "datasets": {
-                "training": {
-                    "dataset": {"type": "memmap", "path": MODEL_DATASET_SHARD_PATH},
-                    "type": "slice",
-                    "end": 0.969,
-                },
-                "validation": {
-                    "dataset": {"type": "memmap", "path": MODEL_DATASET_SHARD_PATH},
-                    "type": "slice",
-                    "begin": 0.969,
-                    "end": 0.999,
-                },
-                "test": {
-                    "dataset": {"type": "memmap", "path": MODEL_DATASET_SHARD_PATH},
-                    "type": "slice",
-                    "begin": 0.999,
-                    "end": 1,
-                },
-            }
-        },
+        "data": {},
         "optimizer": {"learning_rate": {"base": 0.0001}},
     },
     megatron_args=[
@@ -678,18 +668,54 @@ def _update_and_add_testing_config(
     },
     megatron_args=None,
     checkpoint_format=AprielHybridSSMCheckpointFormat,
+    groups={
+        ModelTestingGroup.basic: ModelTestingGroupAction.unimportant,
+        ModelTestingGroup.checkpoint: ModelTestingGroupAction.unimportant,
+        ModelTestingGroup.convert: ModelTestingGroupAction.unimportant,
+        ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
+        ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
+        ModelTestingGroup.distributed: ModelTestingGroupAction.unimportant,
+    },
+    compare_factor=2.0,
+    # Micro-sequence split and sequence-first not supported.
+    skip_tests=("sdp", "ms"),
+)
+
+
+_update_and_add_testing_config(
+    # Tests vision multimodal.
+    "llama",
+    "llava",
+    model_type="multimodal",
+    updates={
+        ("model", "base_model", "vision_encoder"): {
+            "patch_convolution": {"patch_height": 4, "patch_width": 4},
+            "encoder": copy.deepcopy(MODEL_CONFIGS["llama"].config_dict["model"]["base_model"]["decoder"]),
+            "adapter": {"intermediate_size": 512},
+            "hidden_size": 256,
+        },
+        ("model", "base_model", "decoder", "num_blocks"): 1,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "rotary", "type"): "default_2d",
+        ("model", "base_model", "vision_encoder", "encoder", "num_blocks"): 1,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "causal"): False,
+        ("model", "base_model", "vision_encoder", "encoder", "block", "mixer", "cross_document_attention"): False,
+    },
+    get_dataset=get_multimodal_test_dataset,
+    megatron_args=None,
+    checkpoint_format=None,
     groups={
         ModelTestingGroup.basic: ModelTestingGroupAction.normal,
         ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
-        ModelTestingGroup.convert: ModelTestingGroupAction.normal,
+        ModelTestingGroup.convert: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.generate: ModelTestingGroupAction.not_implemented,
         ModelTestingGroup.megatron: ModelTestingGroupAction.not_implemented,
         # TODO: Implement
         ModelTestingGroup.distributed: ModelTestingGroupAction.normal,
     },
-    compare_factor=2.0,
+    compare_factor=6.0,
     # Micro-sequence split and sequence-first not supported.
-    skip_tests=("sdp", "ms"),
+    # TODO: Gradient accumulation works but comparison is broken.
+    skip_tests=("sdp", "ms", "bf4", "df"),
 )
 
 
diff --git a/tests/utils/run_test_script.py b/tests/utils/run_test_script.py
index 7d706ebd..5a24e593 100644
--- a/tests/utils/run_test_script.py
+++ b/tests/utils/run_test_script.py
@@ -11,7 +11,6 @@
 
 from fast_llm.engine.distributed.config import DistributedConfig
 from fast_llm.utils import Assert
-from tests.utils.dataset import get_model_test_dataset
 from tests.utils.distributed_configs import DistributedTestingConfig
 from tests.utils.model_configs import MODEL_CONFIGS, ModelTestingConfig
 
@@ -72,7 +71,7 @@ def do_run_test_script_for_all_models(
     runnable_type: str = "train",
 ):
     Assert.leq(distributed_testing_config.num_gpus, DistributedConfig.default_world_size)
-    get_model_test_dataset()
+    model_testing_config.get_dataset()
     args = [
         "fast-llm",
         runnable_type,