Commit 73870ae

[None][feat] support Qwen3-VL dense model in pytorch backend (#9060)
Signed-off-by: Nekofish-L <[email protected]>
Parent: 827d12c
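For context, a minimal usage sketch of what this commit enables: loading a dense Qwen3-VL checkpoint through the PyTorch-backend LLM API. This is an assumption, not part of the commit: the model id is a placeholder, and the exact multimodal-input format depends on the installed TensorRT-LLM version, so only a plain text prompt is shown here (image and video inputs go through the registered Qwen3VLInputProcessorBase).

    # Hypothetical sketch: run a dense Qwen3-VL checkpoint via the LLM API,
    # assuming a TensorRT-LLM build where the LLM API uses the PyTorch backend.
    from tensorrt_llm import LLM, SamplingParams

    llm = LLM(model="Qwen/Qwen3-VL-8B-Instruct")  # placeholder model id
    params = SamplingParams(max_tokens=64)

    # Text-only prompt; multimodal inputs would be preprocessed by the
    # input processor registered for model_type "qwen3_vl".
    outputs = llm.generate(["Give a one-sentence summary of Qwen3-VL."], params)
    print(outputs[0].outputs[0].text)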

File tree

6 files changed, +360 −23 lines changed


tensorrt_llm/_torch/models/__init__.py

Lines changed: 2 additions & 0 deletions

@@ -28,6 +28,7 @@
 from .modeling_qwen3 import Qwen3ForCausalLM
 from .modeling_qwen3_moe import Qwen3MoeForCausalLM
 from .modeling_qwen3_next import Qwen3NextForCausalLM
+from .modeling_qwen3vl import Qwen3VLModel
 from .modeling_qwen3vl_moe import Qwen3MoeVLModel
 from .modeling_qwen_moe import Qwen2MoeForCausalLM
 from .modeling_seedoss import SeedOssForCausalLM
@@ -76,6 +77,7 @@
     "GptOssForCausalLM",
     "SeedOssForCausalLM",
     "Glm4MoeForCausalLM",
+    "Qwen3VLModel",
 ]

 if transformers.__version__ >= "4.45.1":

tensorrt_llm/_torch/models/checkpoints/__init__.py

Lines changed: 9 additions & 19 deletions

@@ -10,6 +10,7 @@
 from .hf.qwen2vl_weight_mapper import Qwen2VLHfWeightMapper
 from .hf.qwen3_moe_weight_mapper import Qwen3MoeHfWeightMapper
 from .hf.qwen3_next_weight_mapper import Qwen3NextHfWeightMapper
+from .hf.qwen3vl_weight_mapper import Qwen3VLHfWeightMapper
 from .hf.weight_loader import HfWeightLoader
 from .hf.weight_mapper import HfWeightMapper
 from .mistral.checkpoint_loader import (MistralCheckpointLoader,
@@ -19,23 +20,12 @@
                                         MistralWeightMapper)

 __all__ = [
-    "HfConfigLoader",
-    "HfWeightLoader",
-    "HfWeightMapper",
-    "MistralConfigLoader",
-    "MistralWeightMapper",
-    "MistralCheckpointLoader",
-    "BaseCheckpointLoader",
-    "HfCheckpointLoader",
-    "NemotronHHfWeightMapper",
-    "Gemma3HfWeightMapper",
-    "MixtralHfWeightMapper",
-    "Llama4HfWeightMapper",
-    "Qwen2MoeHfWeightMapper",
-    "Qwen3MoeHfWeightMapper",
-    "Qwen2VLHfWeightMapper",
-    "Qwen3NextHfWeightMapper",
-    "LlavaNextHfWeightMapper",
-    "MistralLarge3CheckpointLoader",
-    "MistralLarge3WeightMapper",
+    "HfConfigLoader", "HfWeightLoader", "HfWeightMapper", "MistralConfigLoader",
+    "MistralWeightMapper", "MistralCheckpointLoader", "BaseCheckpointLoader",
+    "HfCheckpointLoader", "NemotronHHfWeightMapper", "Gemma3HfWeightMapper",
+    "MixtralHfWeightMapper", "Llama4HfWeightMapper", "Qwen2MoeHfWeightMapper",
+    "Qwen3MoeHfWeightMapper", "Qwen2VLHfWeightMapper",
+    "Qwen3NextHfWeightMapper", "LlavaNextHfWeightMapper",
+    "MistralLarge3CheckpointLoader", "MistralLarge3WeightMapper",
+    "Qwen3VLHfWeightMapper"
 ]
tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_weight_mapper.py

Lines changed: 8 additions & 0 deletions

@@ -0,0 +1,8 @@
+from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper
+from tensorrt_llm._torch.models.modeling_utils import register_mapper
+
+
+@register_mapper("HF", "Qwen3VLForConditionalGeneration")
+class Qwen3VLHfWeightMapper(HfWeightMapper):
+    def preprocess_weights(self, weights: dict) -> dict:
+        return weights

tensorrt_llm/_torch/models/modeling_qwen3.py

Lines changed: 11 additions & 2 deletions

@@ -121,6 +121,8 @@ def forward(
         attn_metadata: AttentionMetadata,
         residual: Optional[torch.Tensor],
         spec_metadata: Optional[SpecMetadata] = None,
+        mrope_config: Optional[dict] = None,
+        deepstack_embeds: Optional[list[torch.Tensor]] = None,
         **kwargs,
     ) -> torch.Tensor:
         if residual is None:
@@ -137,6 +139,7 @@ def forward(
             attn_metadata=attn_metadata,
             all_reduce_params=AllReduceParams(
                 enable_allreduce=not self.disable_allreduce),
+            mrope_config=mrope_config,
             **kwargs,
         )

@@ -150,6 +153,9 @@ def forward(
                 enable_allreduce=not self.disable_allreduce),
             cutlass_min_latency_mode=False,
         )
+        if deepstack_embeds is not None and self.layer_idx in range(
+                len(deepstack_embeds)):
+            residual = residual + deepstack_embeds[self.layer_idx]

         if spec_metadata is not None:
             spec_metadata.maybe_capture_hidden_states(self.layer_idx,
@@ -191,6 +197,9 @@ def forward(
         position_ids: Optional[torch.IntTensor] = None,
         inputs_embeds: Optional[torch.FloatTensor] = None,
         spec_metadata: Optional[SpecMetadata] = None,
+        mrope_config: Optional[dict] = None,
+        # args for deepstack
+        deepstack_embeds: Optional[list[torch.Tensor]] = None,
         **kwargs,
     ) -> torch.Tensor:
         if (input_ids is None) ^ (inputs_embeds is not None):
@@ -211,8 +220,8 @@ def forward(
                 attn_metadata=attn_metadata,
                 residual=residual,
                 spec_metadata=spec_metadata,
-            )
-
+                mrope_config=mrope_config,
+                deepstack_embeds=deepstack_embeds)
         hidden_states, _ = self.norm(hidden_states, residual)
         return hidden_states
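To make the new deepstack_embeds path above easier to follow, here is a small, self-contained sketch of the idea: multi-level vision features are added to the residual stream of the first few decoder layers, one tensor per layer index. The tensor shapes and layer count below are illustrative assumptions, not values taken from the model.

    import torch

    # Illustrative shapes only: 3 "deepstack" vision feature tensors, one per early layer.
    hidden_size, num_tokens = 4096, 16
    deepstack_embeds = [torch.zeros(num_tokens, hidden_size) for _ in range(3)]

    residual = torch.zeros(num_tokens, hidden_size)
    for layer_idx in range(8):  # pretend the model has 8 decoder layers
        # ... attention / MLP for this layer would run here ...
        if deepstack_embeds is not None and layer_idx in range(len(deepstack_embeds)):
            # Only the first len(deepstack_embeds) layers receive the extra vision features.
            residual = residual + deepstack_embeds[layer_idx]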

tensorrt_llm/_torch/models/modeling_qwen3vl.py

Lines changed: 58 additions & 2 deletions

@@ -21,7 +21,10 @@
     BaseMultimodalDummyInputsBuilder,
     BaseMultimodalInputProcessor,
     ExtraProcessedInputs,
+    MultimodalPlaceholderMetadata,
+    MultimodalPlaceholderPlacement,
     TextPrompt,
+    register_input_processor,
 )
 from ...inputs.multimodal import MultimodalParams
 from ...logger import logger
@@ -33,14 +36,23 @@
 from ..modules.linear import Linear, TensorParallelMode
 from ..modules.mlp import MLP
 from ..modules.rotary_embedding import MRotaryEmbedding
+from .checkpoints.base_weight_mapper import BaseWeightMapper
+from .checkpoints.hf.qwen3vl_weight_mapper import Qwen3VLHfWeightMapper
 from .modeling_auto import AutoModelForCausalLM
 from .modeling_multimodal_utils import (
     find_input_mm_embeds,
     fuse_input_embeds,
     get_multimodal_embeddings,
 )
 from .modeling_qwen2vl import Qwen2_5_VLVisionAttention
-from .modeling_utils import ModelConfig, QuantConfig, _load_weights_impl, filter_weights
+from .modeling_utils import (
+    ModelConfig,
+    QuantConfig,
+    _load_weights_impl,
+    filter_weights,
+    register_auto_model,
+    register_vision_encoder,
+)


 class Qwen3VLInputProcessorBase(BaseMultimodalInputProcessor, BaseMultimodalDummyInputsBuilder):
@@ -807,7 +819,12 @@ def __init__(

         llm_model_config = copy.deepcopy(model_config)
         llm_model_config.pretrained_config = config.text_config
-        llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"]
+        if self.original_arch == "Qwen3VLForConditionalGeneration":
+            llm_model_config.pretrained_config.architectures = ["Qwen3ForCausalLM"]
+        elif self.original_arch == "Qwen3VLMoeForConditionalGeneration":
+            llm_model_config.pretrained_config.architectures = ["Qwen3MoeForCausalLM"]
+        else:
+            raise ValueError(f"Unsupported architecture: {self.original_arch}")
         self.llm = AutoModelForCausalLM.from_config(llm_model_config)

         if not _is_disagg():
@@ -990,3 +1007,42 @@ def forward(
         )
         logger.debug(f"output shape: {output_prob.shape}")
         return output_prob
+
+
+@register_vision_encoder(Qwen3VisionModelBase, vlm_base_model=Qwen3VisionModel)
+@register_auto_model("Qwen3VLForConditionalGeneration")
+@register_input_processor(
+    Qwen3VLInputProcessorBase,
+    model_type="qwen3_vl",
+    placeholder_metadata=MultimodalPlaceholderMetadata(
+        placeholder_map={
+            "image": "<|vision_start|><|image_pad|><|vision_end|>",
+            "video": "<|vision_start|><|video_pad|><|vision_end|>",
+        },
+        placeholder_placement=MultimodalPlaceholderPlacement.BEFORE_TEXT,
+    ),
+)
+class Qwen3VLModel(Qwen3VLModelBase):
+    def __init__(self, model_config: ModelConfig[PretrainedConfig], *args, **kwargs):
+        # NOTE: HF implementation.
+        kwargs["vision_model_class"] = Qwen3VisionModel
+        kwargs["disable_fuse_rope"] = kwargs.get(
+            "disable_fuse_rope", False
+        ) # TODO: Make this ModelConfig's argument
+        super().__init__(model_config, *args, **kwargs)

+    @property
+    def multimodal_data_device_paths(self) -> List[str]:
+        return ["image.pixel_values", "video.pixel_values_videos", "multimodal_embedding"]

+    def load_weights(self, weights: Dict[str, torch.Tensor], weight_mapper: BaseWeightMapper):
+        if not _is_disagg():
+            self.mm_encoder.load_weights(weights)

+        weight_mapper = Qwen3VLHfWeightMapper()
+        weight_mapper.init_model_and_config(self.llm, self.model_config)
+        filtered_weights = {k: v for k, v in weights.items() if not k.startswith("model.visual.")}
+        params_map = {
+            r"^model\.language_model\.(.*)$": r"model.\1",
+        }
+        self.llm.load_weights(filtered_weights, weight_mapper, params_map=params_map)
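As a standalone illustration of what this load_weights override does with params_map, the sketch below filters out the vision-tower weights (which the multimodal encoder loads separately) and rewrites HF checkpoint keys from the model.language_model. namespace into the LLM's model. namespace. The sample key names are made up for demonstration, and the actual remapping is applied inside the LLM's own load_weights path rather than by hand like this.

    import re

    # Made-up checkpoint keys for demonstration.
    weights = {
        "model.visual.blocks.0.attn.qkv.weight": 0,
        "model.language_model.layers.0.self_attn.q_proj.weight": 1,
        "lm_head.weight": 2,
    }

    params_map = {r"^model\.language_model\.(.*)$": r"model.\1"}

    # Drop vision-tower weights ...
    filtered = {k: v for k, v in weights.items() if not k.startswith("model.visual.")}
    # ... then remap the language-model prefix onto the plain "model." namespace.
    remapped = {}
    for key, value in filtered.items():
        for pattern, repl in params_map.items():
            key = re.sub(pattern, repl, key)
        remapped[key] = value

    print(sorted(remapped))
    # ['lm_head.weight', 'model.layers.0.self_attn.q_proj.weight']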
