 import numpy as np
 import torch
 import torch.nn as nn
-from transformers import (AutoConfig, AutoModel, AutoProcessor, AutoTokenizer,
-                          LlavaNextConfig, PretrainedConfig, PreTrainedModel)
-from transformers.modeling_utils import load_sharded_checkpoint
+from transformers import (AutoProcessor, AutoTokenizer, LlavaNextConfig,
+                          PretrainedConfig, PreTrainedModel)
 from transformers.models.llava_next.modeling_llava_next import (
     LlavaNextMultiModalProjector, get_anyres_image_grid_shape,
     image_size_to_num_patches, unpad_image)

+from tensorrt_llm._torch.models.checkpoints.base_weight_mapper import \
+    BaseWeightMapper
+from tensorrt_llm._torch.models.checkpoints.hf.llava_next_weight_mapper import \
+    LlavaNextHfWeightMapper
 from tensorrt_llm.inputs.multimodal import MultimodalParams

 from ...inputs import (BaseMultimodalInputProcessor, ExtraProcessedInputs,
                        InputProcessor, MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
                        register_input_processor,
                        support_multimodal_disaggregated)
-from ...llmapi.utils import download_hf_model
 from ...logger import logger
 from ...sampling_params import SamplingParams
 from ..attention_backend import AttentionMetadata
 from .modeling_clip import CLIPVisionModel
 from .modeling_multimodal_utils import (find_input_mm_embeds, fuse_input_embeds,
                                         get_multimodal_embeddings)
-from .modeling_utils import (filter_weights, register_auto_model,
-                             register_vision_encoder)
+from .modeling_utils import register_auto_model, register_vision_encoder

 DISAGG = os.getenv('TLLM_MULTIMODAL_DISAGGREGATED', '0') == '1'

@@ -295,62 +296,36 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], *args,
         super().__init__()
         self.model_config = model_config
         self.pretrained_config = model_config.pretrained_config
-        # TODO: use config.mapping.get_local_rank() instead
-        self.device = f"cuda:{torch.cuda.current_device()}"
-        model_path = self.pretrained_config._name_or_path

-        # Determine the actual local path for model files
-        if os.path.isdir(model_path):
-            local_model_path = model_path
-        else:
-            local_model_path = download_hf_model(model_path)
-
-        # Partially load the model to reduce memory usage (vision tower and multi-modal projector)
-        hf_model_config = AutoConfig.from_pretrained(local_model_path)
-        self.dtype = hf_model_config.text_config.torch_dtype
-        module_dict = nn.ModuleDict({
-            "vision_tower":
-            AutoModel.from_config(hf_model_config.vision_config),
-            "multi_modal_projector":
-            LlavaNextMultiModalProjector(hf_model_config)
-        })
-        module_dict.register_parameter(
-            "image_newline",
-            nn.Parameter(torch.empty(hf_model_config.text_config.hidden_size)))
-
-        missing_keys, _ = load_sharded_checkpoint(module_dict,
-                                                  local_model_path,
-                                                  strict=False)
-        assert len(missing_keys) == 0, f"Missing keys: {missing_keys}"
-        hf_vision_tower = module_dict["vision_tower"].to(self.dtype)
-        hf_mm_projector = module_dict["multi_modal_projector"].to(
-            self.dtype).to(self.device)
-        hf_image_newline = module_dict.image_newline.to(self.dtype).to(
-            self.device)
-
-        # For A100 GPU, fallback to HF vision tower due to accuracy issue in TRT-LLM CLIPAttention
-        # Otherwise, use TRTLLM vision tower (CLIPVisionModel)
-        prop = torch.cuda.get_device_properties(0)
-        sm_version = prop.major * 10 + prop.minor
-        self.use_hf_vision_tower = sm_version == 80
-        if self.use_hf_vision_tower:
-            self.vision_tower = hf_vision_tower.to(self.device)
-        else:
-            vision_model_config = ModelConfig(
-                pretrained_config=self.pretrained_config.vision_config,
-                attn_backend="TRTLLM")
-            self.vision_tower = CLIPVisionModel(vision_model_config).to(
-                self.device).to(self.dtype)
-            self.vision_tower.load_weights(hf_vision_tower.state_dict())
-
-        # Use HF multi-modal projector
-        self.mm_projector = hf_mm_projector
-        self.image_newline = hf_image_newline
+        clip_model_config = copy.deepcopy(self.model_config)
+        clip_model_config.pretrained_config = self.model_config.pretrained_config.vision_config
+        self.dtype = self.model_config.pretrained_config.text_config.torch_dtype
+        self.vision_model = CLIPVisionModel(clip_model_config).to(self.dtype)
+        self.mm_projector = LlavaNextMultiModalProjector(
+            self.pretrained_config).to(self.dtype)
+        self.image_newline = nn.Parameter(torch.empty(
+            self.pretrained_config.text_config.hidden_size),
+                                          requires_grad=False).to(self.dtype)
         self.vision_feature_select_strategy = getattr(
             self.pretrained_config, "vision_feature_select_strategy", "default")
-
         self.post_config()

+    def load_weights(self, weights):
+
+        def filter_weights(prefix, weights: Dict):
+            # Keep only the entries under `prefix`, with the prefix stripped,
+            # so each sub-module sees its own parameter names.
+            result = {}
+            for key, weight in weights.items():
+                if key.startswith(prefix):
+                    new_key = key[len(prefix):]
+                    result[new_key] = weight
+            return result
+
+        visual_model_weights = filter_weights("vision_tower.", weights)
+        self.vision_model.load_weights(visual_model_weights)
+        mm_projector_weights = filter_weights("multi_modal_projector.", weights)
+        self.mm_projector.load_state_dict(mm_projector_weights, strict=True)
+        self.image_newline.data.copy_(weights["image_newline"])
+
     def post_config(self):
         self.config = self.pretrained_config.vision_config

@@ -464,7 +439,6 @@ def forward(self, multimodal_params: List[MultimodalParams]):
             for multimodal_param in multimodal_params
         ]
         pixel_values = self._pad_for_batching(pixel_values)
-
         pixel_values = torch.cat(pixel_values, dim=0)
         image_sizes = torch.cat(image_sizes, dim=0)

@@ -484,23 +458,18 @@ def forward(self, multimodal_params: List[MultimodalParams]):
             ]
             pixel_values = torch.cat(_pixel_values_list, dim=0)

-        if self.use_hf_vision_tower:
-            image_features = self.vision_tower(
-                pixel_values, output_hidden_states=True).hidden_states
-        else:
-            attn_metadata = self.vision_tower.prepare_attn_metadata(
-                pixel_values.shape[0])
-            image_features = self.vision_tower(
-                pixel_values,
-                attn_metadata=attn_metadata,
-            )
+        attn_metadata = self.vision_model.prepare_attn_metadata(
+            pixel_values.shape[0])
+        image_features = self.vision_model(
+            pixel_values,
+            attn_metadata=attn_metadata,
+        )
         selected_image_feature = image_features[-2][:, 1:]
         image_features = self.mm_projector(selected_image_feature)
-
         image_features = torch.split(image_features, image_num_patches, dim=0)

-        # NOTE: 'pack_image_features' is directly copied from the HF's code
-        image_features, feature_lens = self.pack_image_features(
+        # NOTE: 'pack_image_features' is copied from HF's code
+        image_features, _ = self.pack_image_features(
             image_features,
             image_sizes,
             vision_feature_select_strategy=self.vision_feature_select_strategy,
@@ -526,6 +495,7 @@ class LlavaNextModel(PreTrainedModel):
     def __init__(self, model_config: ModelConfig[PretrainedConfig], *args,
                  **kwargs) -> None:
         config = model_config.pretrained_config
+        self._supports_sdpa = True
         super().__init__(config)
        if hasattr(self, "llm"):
            return
@@ -543,16 +513,29 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], *args,
         self.llm = AutoModelForCausalLM.from_config(llm_model_config)

         self.model_config = model_config
-        self.model_dtype = getattr(config.text_config, "torch_dtype",
-                                   torch.float16)
-        logger.info(f"{self.dtype=} {self.model_dtype=}")
-
         self.post_config()
-        self.is_loaded = True

-    def load_weights(self, weights):
-        weights = filter_weights("language_model", weights)
-        self.llm.load_weights(weights)
+    def load_weights(self, weights, weight_mapper: BaseWeightMapper):
+        if isinstance(weight_mapper, LlavaNextHfWeightMapper):
+            weights = weight_mapper.preprocess_weights(weights)
+
+        self.mm_encoder.load_weights(weights)
+
+        def filter_weights(weights: Dict):
+            # Keep only language-model weights, remapping the HF
+            # "language_model." prefix to "model." when the HF mapper is used.
+            transformed_weights = {}
+            for key, weight in weights.items():
+                if key.startswith("language_model."):
+                    if isinstance(weight_mapper, LlavaNextHfWeightMapper):
+                        new_key = "model." + key[len("language_model."):]
+                    else:
+                        new_key = key[len("language_model."):]
+                    transformed_weights[new_key] = weight
+                elif key.startswith("lm_head."):
+                    transformed_weights[key] = weight
+            return transformed_weights
+
+        language_model_weights = filter_weights(weights)
+        self.llm.load_weights(language_model_weights)

     def post_config(self):
         self.config = self.llm.config
@@ -590,7 +573,6 @@ def forward(
             mm_embeds, multimodal_params[:num_context_requests])
         input_ids, inputs_embeds = fuse_input_embeds(
             self.llm.model.embed_tokens, input_ids, mm_embeds, **kwargs)
-
         logits = self.llm.forward(attn_metadata, input_ids, position_ids,
                                   inputs_embeds, return_context_logits)
         return logits
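
For reference, here is a minimal standalone sketch of the checkpoint-key routing the two new load_weights paths perform: the vision encoder takes the "vision_tower.", "multi_modal_projector.", and "image_newline" entries, while the language model receives "language_model.*" keys (remapped under "model." when the HF weight mapper is in use) plus "lm_head.*". The example checkpoint keys and tensor shapes below are made up for illustration; only the prefixes match the code in this diff.

from typing import Dict

import torch


def split_by_prefix(weights: Dict[str, torch.Tensor],
                    prefix: str) -> Dict[str, torch.Tensor]:
    """Return the entries under `prefix` with the prefix stripped."""
    return {
        key[len(prefix):]: value
        for key, value in weights.items() if key.startswith(prefix)
    }


def route_language_model_keys(weights: Dict[str, torch.Tensor],
                              use_hf_mapper: bool) -> Dict[str, torch.Tensor]:
    """Mirror the key transform done in LlavaNextModel.load_weights above."""
    routed = {}
    for key, value in weights.items():
        if key.startswith("language_model."):
            suffix = key[len("language_model."):]
            routed[("model." + suffix) if use_hf_mapper else suffix] = value
        elif key.startswith("lm_head."):
            routed[key] = value
    return routed


if __name__ == "__main__":
    # Toy checkpoint: keys follow the prefixes used in the diff, values are dummies.
    checkpoint = {
        "vision_tower.embeddings.patch_embedding.weight": torch.zeros(2, 2),
        "multi_modal_projector.linear_1.weight": torch.zeros(2, 2),
        "image_newline": torch.zeros(2),
        "language_model.embed_tokens.weight": torch.zeros(4, 2),
        "lm_head.weight": torch.zeros(4, 2),
    }
    print(sorted(split_by_prefix(checkpoint, "vision_tower.")))
    print(sorted(route_language_model_keys(checkpoint, use_hf_mapper=True)))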