
Commit ab7434a

[feat] Enable TP and batching for PixtralVisionModel / Mistral3VLM (NVIDIA#6152)
Signed-off-by: William Zhang <[email protected]>
1 parent b7c8a67 commit ab7434a

File tree

4 files changed: +195 -50 lines changed

tensorrt_llm/_torch/models/modeling_clip.py

Lines changed: 1 addition & 1 deletion
@@ -202,7 +202,7 @@ def prepare_attn_metadata(self, batch_size):
             request_ids=request_ids,
             prompt_lens=prompt_lens,
         )
-        attn_metadata.max_seq_len = seq_len * batch_size
+        attn_metadata.max_seq_len = seq_len
         attn_metadata.prepare()
         return attn_metadata
 
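For context on the one-line fix above: `max_seq_len` describes the longest single sequence in the batch, while `seq_len * batch_size` is the total token count across the batch (which `max_num_tokens` already covers). A purely illustrative calculation, with made-up numbers:

# Illustrative only; numbers are invented and not taken from the CLIP config.
seq_len, batch_size = 576, 4
seq_lens = [seq_len] * batch_size

max_seq_len = max(seq_lens)     # 576 -- the per-request length the fix uses
max_num_tokens = sum(seq_lens)  # 2304 -- what `seq_len * batch_size` actually measures
assert max_seq_len == seq_len
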

tensorrt_llm/_torch/models/modeling_mistral.py

Lines changed: 44 additions & 13 deletions
@@ -3,6 +3,7 @@
 from typing import Any, Dict, List, Optional, Tuple
 
 import torch
+import torchvision
 from torch import nn
 from transformers import (AutoProcessor, AutoTokenizer, Mistral3Config,
                           MistralConfig, PretrainedConfig, PreTrainedModel)
@@ -347,7 +348,6 @@ def forward(
         attn_metadata: AttentionMetadata,
         input_ids: Optional[torch.LongTensor] = None,
         position_ids: Optional[torch.LongTensor] = None,
-        inputs_embeds: Optional[torch.Tensor] = None,
         return_context_logits: bool = False,
         **kwargs,
     ) -> torch.Tensor:
@@ -363,20 +363,26 @@ def forward(
             raise RuntimeError(
                 f"Number of multimodal tensors ({multimodal_params_len}) should be equal to number of "
                 f"context requests ({num_context_requests}) in the batch.")
-        # NOTES:
-        # 1. the pixel values in `multimodal_data["image"]` might vary in (height, width) between
-        #    images, making them unsafe to batch in general. The input processor also cannot produce
-        #    them in a batch, since it is always called with a single input - otherwise, we would
-        #    have been able to naturally leverage the padding / resizing capabilities of the underlying
-        #    `PixtralProcessor`.
-        # 2. After each `pixel_values` tensor has gone through the vision tower's `patch_conv` layer,
-        #    they are divided into patches that are then concatenated in order to treat them as a
-        #    single "sequence" in the vision tower's attention layers, so some form of batching still
-        #    happens in the vision tower.
-        image_features = [
-            self._get_image_features(**x.multimodal_data["image"])
+        pixel_values = [
+            x.multimodal_data["image"]["pixel_values"]
+            for x in multimodal_params
+        ]
+        image_sizes = [
+            x.multimodal_data["image"]["image_sizes"]
             for x in multimodal_params
         ]
+        if not (len(pixel_values) == len(image_sizes) ==
+                multimodal_params_len):
+            raise ValueError(
+                f"Expected as many `pixel_values` ({len(pixel_values)}) and "
+                f"`image_sizes` ({len(image_sizes)}) as number of multimodal parameters "
+                f"({multimodal_params_len}).")
+        batched_pixel_values, batched_image_sizes = self._batch_pixel_values(
+            pixel_values=pixel_values, image_sizes=image_sizes)
+        image_features = [
+            self._get_image_features(pixel_values=batched_pixel_values,
+                                     image_sizes=batched_image_sizes)
+        ]
 
         input_ids, inputs_embeds = fuse_input_embeds(
             embedding_layer=self.llm.model.embed_tokens,
@@ -429,6 +435,31 @@ def _get_image_features(
                                                     image_sizes)
         return image_features
 
+    # Original HF implementation:
+    # https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/pixtral/
+    # image_processing_pixtral.py#L276
+    # We switch to using torchvision's padding functionality since it supports torch tensors
+    # (the transformers one expected numpy arrays).
+    @staticmethod
+    @torch.inference_mode()
+    def _batch_pixel_values(
+        pixel_values: List[torch.Tensor],
+        image_sizes: List[torch.Tensor],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batched_image_sizes = torch.cat(image_sizes)
+        max_shape = batched_image_sizes.max(dim=0).values
+        pixel_values = [
+            torchvision.transforms.v2.functional.pad(
+                image,
+                # Per torchvision docs, this should be in LTRB order if it's a sequence of 4 numbers.
+                padding=[0, 0, max_shape[1] - size[1], max_shape[0] - size[0]],
+                # Values extracted from HF implementation.
+                fill=0.0,
+                padding_mode="constant",
+            ) for image, size in zip(pixel_values, batched_image_sizes)
+        ]
+        return torch.cat(pixel_values), batched_image_sizes
+
 
 # Original implementation:
 # https://github.com/huggingface/transformers/blob/v4.51.3/src/transformers/models/mistral3/modeling_mistral3.py#L66
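The idea behind `_batch_pixel_values` is standard pad-to-max batching: images of different (height, width) are padded on the right and bottom to the largest size in the batch so they can be concatenated into one tensor, while the original `image_sizes` are kept so the vision tower can crop away the padding after `patch_conv`. A minimal, self-contained sketch of that pattern (the helper name `pad_and_stack` and the shapes are illustrative, not taken from the repo):

from typing import List, Tuple

import torch
import torchvision.transforms.v2.functional as F


def pad_and_stack(images: List[torch.Tensor]) -> Tuple[torch.Tensor, torch.Tensor]:
    """Pad (C, H, W) tensors to a shared (H_max, W_max) and stack them into one batch."""
    sizes = torch.tensor([img.shape[-2:] for img in images])  # rows of (H, W)
    max_h, max_w = sizes.max(dim=0).values.tolist()
    padded = [
        # torchvision's v2 functional pad takes padding as [left, top, right, bottom].
        F.pad(img, padding=[0, 0, max_w - img.shape[-1], max_h - img.shape[-2]], fill=0.0)
        for img in images
    ]
    return torch.stack(padded), sizes


# Two differently sized images become a single (2, 3, 64, 80) batch.
batch, sizes = pad_and_stack([torch.randn(3, 64, 80), torch.randn(3, 48, 56)])
assert batch.shape == (2, 3, 64, 80)
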

tensorrt_llm/_torch/models/modeling_pixtral.py

Lines changed: 19 additions & 19 deletions
@@ -106,11 +106,18 @@ def forward(
 class PixtralTransformer(torch.nn.Module):
     def __init__(self, config: model_config_lib.ModelConfig[transformers.PixtralVisionConfig]):
         super().__init__()
+        tp_size = config.mapping.tp_size
+        num_heads = config.pretrained_config.num_attention_heads
+        if (num_heads % tp_size) > 0:
+            raise ValueError(f"{tp_size=} must divide {num_heads=}.")
+        num_heads //= tp_size
+
+        self._head_dim = config.pretrained_config.head_dim
+        self._num_heads = num_heads
+
         self.layers = torch.nn.ModuleList()
         for i in range(config.pretrained_config.num_hidden_layers):
             self.layers.append(PixtralAttentionLayer(config=config, layer_idx=i))
-        self._head_dim = config.pretrained_config.head_dim
-        self._num_heads = config.pretrained_config.num_attention_heads
 
     def forward(
         self,
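The new `__init__` logic gives each tensor-parallel rank an equal slice of the attention heads (per-rank head count = total heads / tp_size) and rejects configurations where the heads do not divide evenly. A tiny illustrative sketch of that arithmetic (the function name and numbers are hypothetical, not from the repo):

def heads_per_rank(num_attention_heads: int, tp_size: int) -> int:
    # Mirrors the divisibility check above: every rank must get the same number of heads.
    if num_attention_heads % tp_size:
        raise ValueError(f"{tp_size=} must divide {num_attention_heads=}.")
    return num_attention_heads // tp_size


assert heads_per_rank(16, 2) == 8  # e.g. 16 heads split across 2 tensor-parallel ranks
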
@@ -165,12 +172,6 @@ def __init__(
         self, model_config: model_config_lib.ModelConfig[transformers.PixtralVisionConfig]
     ):
         super().__init__()
-        tp_size = model_config.mapping.tp_size
-        # TODO: implement support for `tp_size > 1`.
-        if tp_size > 1:
-            raise NotImplementedError(
-                f"Mistral3VLM does not support `mapping.tp_size > 1` yet (got {tp_size})."
-            )
         # Both the below are needed in order to use `_load_weights_impl`.
         self.model_config = model_config
         self.config: transformers.PixtralVisionConfig = model_config.pretrained_config
@@ -204,12 +205,14 @@ def forward(
     ):
         with torch.autocast(device_type="cuda", dtype=self.config.torch_dtype):
             patch_embeds = self.patch_conv(pixel_values)
+
         patch_embeds_list = [
             embed[..., : (size[0] // self._patch_size), : (size[1] // self._patch_size)]
             for embed, size in zip(patch_embeds, image_sizes)
         ]
 
-        patch_embeds = torch.cat([p.flatten(1).T for p in patch_embeds_list], dim=0)
+        flattened_embeds = [p.flatten(1).T for p in patch_embeds_list]
+        patch_embeds = torch.cat(flattened_embeds, dim=0)
         patch_embeds = self.ln_pre(patch_embeds)
 
         position_ids = transformers.models.pixtral.modeling_pixtral.position_ids_in_meshgrid(
@@ -218,10 +221,8 @@ def forward(
         position_embeddings = self._patch_positional_embedding(patch_embeds, position_ids)
 
         attn_metadata = self._prepare_attn_metadata(
-            # The `torch.cat` that creates the `patch_embeds` flattens the conv features from multiple
-            # images into a single sequence - hence why we hardcode the batch size to 1 here.
-            batch_size=1,
-            seq_len=position_ids.size(0),
+            batch_size=pixel_values.size(0),
+            seq_lengths=[x.size(0) for x in flattened_embeds],
         )
         out = self.transformer(
             patch_embeds,
@@ -235,19 +236,18 @@ def forward(
     def load_weights(self, weights):
         modeling_utils._load_weights_impl(self, weights)
 
-    def _prepare_attn_metadata(self, batch_size: int, seq_len: int):
+    def _prepare_attn_metadata(self, batch_size: int, seq_lengths: List[int]):
         request_ids = list(range(1, batch_size + 1))
-        prompt_lens = [seq_len] * batch_size
         attn_metadata = self._metadata_cls(
-            seq_lens=torch.tensor([seq_len] * batch_size, dtype=torch.int),
+            seq_lens=torch.tensor(seq_lengths, dtype=torch.int),
             num_contexts=batch_size,
             max_num_requests=batch_size,
-            max_num_tokens=seq_len * batch_size,
+            max_num_tokens=sum(seq_lengths),
             kv_cache_manager=None,
             request_ids=request_ids,
-            prompt_lens=prompt_lens,
+            prompt_lens=seq_lengths,
         )
-        attn_metadata.max_seq_len = seq_len * batch_size
+        attn_metadata.max_seq_len = max(seq_lengths)
         attn_metadata.prepare()
         return attn_metadata
 
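With batching enabled, each image contributes its own sequence of patch tokens, so the metadata now carries one length per image instead of a single shared `seq_len`. Roughly, an image of size (H, W) yields (H // patch_size) * (W // patch_size) tokens after `patch_conv`; a hedged sketch of that bookkeeping (assuming a patch size of 16, as in typical Pixtral configs, and a hypothetical helper name):

from typing import List, Tuple


def per_image_seq_lengths(image_sizes: List[Tuple[int, int]], patch_size: int = 16) -> List[int]:
    # One entry per image: number of patch tokens fed to the vision transformer.
    return [(h // patch_size) * (w // patch_size) for h, w in image_sizes]


lengths = per_image_seq_lengths([(123, 456), (116, 445)])
assert lengths == [196, 189]  # 7*28 and 7*27 patches
assert max(lengths) == 196    # the kind of value `attn_metadata.max_seq_len` now takes
assert sum(lengths) == 385    # the kind of value `max_num_tokens` now takes
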

tests/unittest/_torch/modeling/test_modeling_pixtral.py

Lines changed: 131 additions & 17 deletions
@@ -1,12 +1,32 @@
+import gc
+import os
+import pathlib
+import pickle
+import sys
+
+import cloudpickle
+import mpi4py
 import pytest
 import torch
 import transformers
 from transformers.models.pixtral import modeling_pixtral as hf_modeling_pixtral
 
+import tensorrt_llm
 from tensorrt_llm import mapping as mapping_lib
 from tensorrt_llm._torch import model_config as model_config_lib
 from tensorrt_llm._torch.models import modeling_pixtral
 
+sys.path.append(os.path.join(os.path.dirname(__file__), ".."))
+cloudpickle.register_pickle_by_value(sys.modules[__name__])
+mpi4py.MPI.pickle.__init__(
+    cloudpickle.dumps,
+    cloudpickle.loads,
+    pickle.HIGHEST_PROTOCOL,
+)
+
+# needed since we reuse the mpi executor pool, first test running will leak a thread
+pytestmark = pytest.mark.threadleak(enabled=False)
+
 
 @pytest.fixture
 def pixtral_vision_config():
@@ -49,21 +69,6 @@ def init_hf_model(cls, config, dtype, device):
     return model
 
 
-@pytest.mark.parametrize(
-    "mapping",
-    [
-        mapping_lib.Mapping(world_size=2, tp_size=2),
-        mapping_lib.Mapping(world_size=3, tp_size=3),
-        mapping_lib.Mapping(world_size=4, tp_size=2, pp_size=2),
-        mapping_lib.Mapping(world_size=8, tp_size=2, pp_size=2, cp_size=2),
-    ],
-)
-def test_pixtral_vision_model_rejects_tp_size_greater_than_one(pixtral_vision_config, mapping):
-    pixtral_vision_config.mapping = mapping
-    with pytest.raises(NotImplementedError, match="tp_size > 1"):
-        modeling_pixtral.PixtralVisionModel(model_config=pixtral_vision_config)
-
-
 @torch.no_grad()
 @pytest.mark.usefixtures("set_seed")
 def test_pixtral_vision_model_vs_hf(pixtral_vision_config):
@@ -83,10 +88,10 @@ def test_pixtral_vision_model_vs_hf(pixtral_vision_config):
     # Make sure both models have the same weights.
     pixtral_model.load_weights(hf_pixtral_model.state_dict())
 
-    batch_size = 1
+    batch_size = 2
     height, width, channels = 123, 456, 3
     pixel_values = torch.randn(batch_size, channels, height, width, device=device, dtype=dtype)
-    image_sizes = torch.tensor([[height, width]])
+    image_sizes = torch.tensor([[height, width], [height - 7, width - 11]])
     out = pixtral_model(
         pixel_values=pixel_values,
         image_sizes=image_sizes,
@@ -102,3 +107,112 @@ def test_pixtral_vision_model_vs_hf(pixtral_vision_config):
     )
 
     torch.testing.assert_close(out, hf_out, atol=0.2, rtol=0.2)
+
+
+@pytest.mark.parametrize("mpi_pool_executor", [2], indirect=True)
+@torch.no_grad()
+def test_tensor_parallelism(pixtral_vision_config, mpi_pool_executor, tmp_path):
+    mapping = mapping_lib.Mapping(world_size=2, tp_size=2)
+    if (num_available_devices := torch.cuda.device_count()) < mapping.world_size:
+        pytest.skip(f"{num_available_devices=} is less than the requested {mapping.world_size}.")
+
+    dtype = torch.bfloat16
+    device = torch.device("cuda")
+    pretrained_config = pixtral_vision_config.pretrained_config
+
+    hf_pixtral_model = init_hf_model(
+        cls=hf_modeling_pixtral.PixtralVisionModel,
+        config=pretrained_config,
+        dtype=dtype,
+        device=device,
+    )
+    # Save HF weights to disk so they can be used by worker processes.
+    state_dict = hf_pixtral_model.state_dict()
+    hf_weights_path = tmp_path / "hf_weights.pt"
+    torch.save(state_dict, hf_weights_path)
+
+    pixtral_model = (
+        modeling_pixtral.PixtralVisionModel(model_config=pixtral_vision_config).eval().to("cuda")
+    )
+    pixtral_model.load_weights(state_dict)
+    # Save the number of params to check that the model gets shared in the workers.
+    num_params = sum(p.numel() for p in pixtral_model.parameters())
+
+    batch_size = 2
+    height, width, channels = 123, 456, 3
+    pixel_values = torch.randn(batch_size, channels, height, width, device=device, dtype=dtype)
+    image_sizes = torch.tensor([[height, width], [height - 7, width - 11]])
+
+    ref_out = pixtral_model(pixel_values=pixel_values, image_sizes=image_sizes)
+
+    # Move to CPU before sending across process barrier.
+    ref_out = ref_out.to("cpu")
+    pixel_values = pixel_values.to("cpu")
+    image_sizes = image_sizes.to("cpu")
+
+    # Free up GPU memory on rank 0.
+    del state_dict
+    del hf_pixtral_model
+    del pixtral_model
+    gc.collect()
+    torch.cuda.empty_cache()
+
+    world_size = mapping.world_size
+    pixtral_vision_config.mapping = mapping
+    results = mpi_pool_executor.starmap(
+        _run_pixtral_and_compare_against_ref,
+        [
+            (
+                pixtral_vision_config,
+                hf_weights_path,
+                pixel_values,
+                image_sizes,
+                ref_out,
+                num_params,
+            )
+            for _ in range(world_size)
+        ],
+    )
+
+    for r in results:
+        assert r
+
+
+def _run_pixtral_and_compare_against_ref(
+    pixtral_vision_config: model_config_lib.ModelConfig[transformers.PixtralVisionConfig],
+    hf_weights_path: pathlib.Path,
+    pixel_values: torch.Tensor,
+    image_sizes: torch.Tensor,
+    expected_output: torch.Tensor,
+    total_num_params: int,
+) -> bool:
+    rank = tensorrt_llm.mpi_rank()
+    # Smoke check.
+    world_size = tensorrt_llm.mpi_world_size()
+    assert world_size > 1
+
+    torch.cuda.set_device(rank)
+
+    pixel_values = pixel_values.to("cuda")
+    image_sizes = image_sizes.to("cuda")
+    expected_output = expected_output.to("cuda")
+
+    pixtral_vision_config.mapping.rank = rank
+    pixtral_model = (
+        modeling_pixtral.PixtralVisionModel(model_config=pixtral_vision_config).eval().to("cuda")
+    )
+    state_dict = torch.load(hf_weights_path, map_location="cuda")
+    pixtral_model.load_weights(state_dict)
+
+    # Smoke check to see that we are indeed sharding the model.
+    rank_num_params = sum(p.numel() for p in pixtral_model.parameters())
+    params_fraction = rank_num_params / total_num_params
+    assert params_fraction < 1.0
+    assert params_fraction == pytest.approx(1.0 / world_size, rel=1e-2)
+
+    out = pixtral_model(
+        pixel_values=pixel_values,
+        image_sizes=image_sizes,
+    )
+    torch.testing.assert_close(out, expected_output, atol=0.2, rtol=0.2)
+    return True
