Commit a6a8898

[TRTLLM-9409][feat] Pass MRoPE tensors for EPD disagg (#9758)
* Why? Certain VLMs like the Qwen family need more than just the multimodal embeddings in the language model; they also need MRoPE position IDs and deltas. Prior to this commit, only the embeddings could be communicated from the encoder worker to the prefill worker.

* What? This commit extends `DisaggregatedParams` to include the MRoPE information. It also adjusts several pieces of code required to communicate that information between the E, P, and D workers.

Closes TRTLLM-9409.

Signed-off-by: William Zhang <[email protected]>
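For context, a minimal sketch of how the new fields can carry the MRoPE tensors from the encoder worker's result over to the prefill worker. `encoder_result` and `build_prefill_params` are hypothetical names used only for illustration; the handle and field names come from the diff below, and all other `DisaggregatedParams` fields are left at their defaults.

from tensorrt_llm.disaggregated_params import DisaggregatedParams

def build_prefill_params(encoder_result) -> DisaggregatedParams:
    # Each *_handle is a plain dict produced on the encoder worker by
    # SharedTensorContainer.from_tensor(...).dump_to_dict(), so it can be
    # serialized and shipped to the prefill worker next to the embeddings.
    return DisaggregatedParams(
        mrope_position_ids_handle=encoder_result.mrope_position_ids_handle,
        mrope_position_deltas_handle=encoder_result.mrope_position_deltas_handle,
    )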
1 parent 472fe49 commit a6a8898

File tree

10 files changed: +271 −97 lines

tensorrt_llm/_torch/models/modeling_llava_next.py

Lines changed: 4 additions & 1 deletion

@@ -527,6 +527,8 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig], *args,
             return
         if not DISAGG:
             self.mm_encoder = LlavaNextVisionModel(model_config)
+        else:
+            self.mm_encoder = None

         llm_model_config = copy.deepcopy(model_config)
         llm_model_config.pretrained_config = model_config.pretrained_config.text_config
@@ -545,7 +547,8 @@ def load_weights(self, weights, weight_mapper: BaseWeightMapper):
         if isinstance(weight_mapper, LlavaNextHfWeightMapper):
             weights = weight_mapper.preprocess_weights(weights)

-        self.mm_encoder.load_weights(weights)
+        if self.mm_encoder is not None:
+            self.mm_encoder.load_weights(weights)

         def filter_weights(weights: Dict):
             transformed_weights = {}

tensorrt_llm/_torch/models/modeling_qwen2vl.py

Lines changed: 104 additions & 11 deletions

@@ -32,7 +32,8 @@
                        BaseMultimodalInputProcessor, ExtraProcessedInputs,
                        MultimodalPlaceholderMetadata,
                        MultimodalPlaceholderPlacement, TextPrompt,
-                       register_input_processor)
+                       register_input_processor,
+                       support_multimodal_disaggregated)
 from ...logger import logger
 from ...sampling_params import SamplingParams
 from ..attention_backend import AttentionMetadata
@@ -865,6 +866,8 @@ def __init__(
             mm_encoder_config = copy.deepcopy(model_config)
             self.mm_encoder = Qwen2VisionModelBase(
                 mm_encoder_config, kwargs.get('vision_model_class', None))
+        else:
+            self.mm_encoder = None

     def init_mrope_embedding(self, model_config: ModelConfig[PretrainedConfig]):
         config = model_config.pretrained_config
@@ -953,24 +956,21 @@ def forward(
         """
         VLM forward logic with inflight batching support.
         """
-        num_context_requests, num_generation_requests = attn_metadata.num_contexts, attn_metadata.num_generations
+        num_context_requests = attn_metadata.num_contexts

         multimodal_params = kwargs.get("multimodal_params", [])
         mm_embeds = []
         mrope_config = {}
-        # NOTE: Qwen*-VL series has mrope_config even on the text-only prompts, so we need to separate the mm_multimodal_params from the text-only prompts.
-        mm_multimodal_params = [
-            multimodal_param for multimodal_param in multimodal_params
-            if multimodal_param.multimodal_data.get("image", {}).get(
-                "pixel_values") is not None or multimodal_param.multimodal_data.
-            get("video", {}).get("pixel_values_videos") is not None
-        ]
+        # NOTE: Qwen*-VL series has mrope_config even on the text-only prompts, so we need to separate
+        # the entries that do have multimodal data from those that correspond to text-only prompts.
+        mm_multimodal_params = self._get_requests_with_mm_data(
+            multimodal_params)
         if len(mm_multimodal_params) > 0:
             if not _is_disagg():
                 mm_embeds = get_multimodal_embeddings(
                     encoder_forward_fn=self.mm_encoder.forward,
                     multimodal_params=mm_multimodal_params)
-            else:
+            elif not getattr(self, "support_mm_disagg", False):
                 raise NotImplementedError(
                     "Qwen2VLModel does not support disaggregated inference yet. Please unset "
                     f"the TLLM_MULTIMODAL_DISAGGREGATED environment variable, or set it to '0'."
@@ -995,6 +995,21 @@ def forward(
         logger.debug(f'output shape: {output_prob.shape}')
         return output_prob

+    def _get_requests_with_mm_data(self, multimodal_params):
+        mm_multimodal_params = []
+        for multimodal_param in multimodal_params:
+            data = multimodal_param.multimodal_data
+            if (
+                    # The first 2 conditions check whether there is input on which inference should be run.
+                    data.get("image", {}).get("pixel_values") is not None or
+                    data.get("video", {}).get("pixel_values_videos") is not None
+                    # This condition corresponds to when the embeddings are already populated, as is e.g.
+                    # the case in EPD disagg in the prefill worker.
+                    or data.get("multimodal_embedding")):
+                mm_multimodal_params.append(multimodal_param)
+
+        return mm_multimodal_params
+

 @register_vision_encoder(Qwen2VisionModelBase,
                          vlm_base_model=Qwen2VisionTransformerPretrainedModel)
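To make the new helper's behavior concrete, the toy check below mirrors its three admission conditions on bare dicts. This is a simplification: the real inputs are multimodal parameter objects, and the example values are made up.

def has_mm_data(data: dict) -> bool:
    # Mirrors _get_requests_with_mm_data: keep requests with raw image or video
    # pixels, or with an embedding that was already computed (EPD prefill case).
    return (data.get("image", {}).get("pixel_values") is not None
            or data.get("video", {}).get("pixel_values_videos") is not None
            or bool(data.get("multimodal_embedding")))

assert has_mm_data({"image": {"pixel_values": [[0.1, 0.2]]}})             # raw image input
assert has_mm_data({"multimodal_embedding": {"handle": "..."}})           # embedding already populated
assert not has_mm_data({"mrope_config": {"mrope_position_deltas": [0]}})  # text-only prompt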
@@ -1032,11 +1047,89 @@ def load_weights(self, weights, weight_mapper: BaseWeightMapper):
         self.llm.load_weights(weights, weight_mapper)


+class Qwen2_5VLInputProcessorBase(Qwen2VLInputProcessorBase):
+
+    def get_prompt_token_ids(
+            self, inputs: TextPrompt,
+            mm_handles: List[Dict[str,
+                                  Any]]) -> Tuple[List[int], List[int], List[int]]:
+        """
+        Build input token ids with multimodal placeholders expanded to the number of MM tokens.
+
+        Args:
+            inputs: Text prompt input container. Must contain a non-empty prompt string.
+            mm_handles: List of multimodal embedding handles. Currently only a single handle is supported.
+
+        Returns:
+            Tuple[List[int], List[int], List[int]]:
+                - expanded_ids: token ids with each image token expanded to a placeholder repeated per MM token
+                - mm_token_length: per-image MM token lengths
+                - mm_token_offsets: start offsets (positions) for each image's MM tokens within expanded_ids
+        """
+        # TODO: Move this function to the base input processor class when extending for more models
+        text_prompt = inputs.get("prompt")
+        if not text_prompt:
+            raise ValueError("Text prompt is required but not provided")
+
+        if not isinstance(mm_handles, list):
+            raise TypeError("mm_handles must be a list")
+
+        if len(mm_handles) != 1:
+            # TODO: only support single multimodal item within a request for now
+            raise NotImplementedError(
+                "Only one mm_handle is supported for Qwen2.5 VL for now")
+        hidden_size = mm_handles[0]['tensor_size'][1]
+        assert hidden_size == self.config.text_config.hidden_size, "Multimodal embedding hidden size must match model hidden size"
+        input_ids = self.tokenizer(text_prompt,
+                                   return_tensors="pt").input_ids[0]
+
+        image_token_index = self.config.image_token_id
+
+        image_mask = input_ids == image_token_index
+        image_positions = torch.where(image_mask)[0]
+        num_images = len(image_positions)
+        assert num_images == len(
+            mm_handles), "Number of images must match number of mm_handles"
+        total_mm_tokens = sum(mm_handle["tensor_size"][0]
+                              for mm_handle in mm_handles)
+        final_length = len(input_ids) - num_images + total_mm_tokens
+        # Create output tensor
+        expanded_ids = torch.empty(final_length, dtype=input_ids.dtype)
+        placeholder_id = self.tllm_multimodal_token_id
+
+        # Fill the expanded sequence
+        write_pos = 0
+        image_cnt = 0
+        mm_token_length = []
+        mm_token_offsets = []
+        for read_pos in range(len(input_ids)):
+            if input_ids[read_pos] == image_token_index:
+                # Replace with placeholder id
+                mm_token_num = mm_handles[image_cnt]["tensor_size"][0]
+                expanded_ids[write_pos:write_pos + mm_token_num] = \
+                    placeholder_id
+                mm_token_offsets.append(write_pos)
+                mm_token_length.append(mm_token_num)
+                write_pos += mm_token_num
+                image_cnt += 1
+            else:
+                # Copy text token as-is
+                expanded_ids[write_pos] = input_ids[read_pos]
+                write_pos += 1
+
+        assert write_pos == final_length, f"Write position mismatch: {write_pos} != {final_length}"
+        assert mm_token_length[-1] + mm_token_offsets[
+            -1] <= final_length, f"mm_token_length[-1] + mm_token_offsets[-1] ({mm_token_length[-1] + mm_token_offsets[-1]}) should be less than or equal to final_length ({final_length})"
+        return expanded_ids.to(
+            torch.int32).tolist(), mm_token_length, mm_token_offsets
+
+
+@support_multimodal_disaggregated
 @register_vision_encoder(Qwen2VisionModelBase,
                          vlm_base_model=Qwen2_5_VisionModel)
 @register_auto_model("Qwen2_5_VLForConditionalGeneration")
 @register_input_processor(
-    Qwen2VLInputProcessorBase,
+    Qwen2_5VLInputProcessorBase,
     model_type="qwen2_5_vl",
     placeholder_metadata=MultimodalPlaceholderMetadata(
         placeholder_map={
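As a worked example of the expansion above, suppose the tokenizer yields four token ids containing one image token, and the single handle reports tensor_size = (4, hidden_size), i.e. four MM tokens. The values below are made up; the loop is a standalone simplification of the method's expansion step.

IMG, PH = 151655, -1            # illustrative ids: image token and TRT-LLM placeholder
input_ids = [15, 15, IMG, 27]
mm_tokens = 4                   # mm_handles[0]["tensor_size"][0]

expanded_ids, mm_token_offsets, mm_token_length = [], [], []
for tok in input_ids:
    if tok == IMG:
        mm_token_offsets.append(len(expanded_ids))
        mm_token_length.append(mm_tokens)
        expanded_ids.extend([PH] * mm_tokens)
    else:
        expanded_ids.append(tok)

assert expanded_ids == [15, 15, PH, PH, PH, PH, 27]       # final_length = 4 - 1 + 4 = 7
assert (mm_token_offsets, mm_token_length) == ([2], [4])  # placeholders start at position 2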

tensorrt_llm/_torch/pyexecutor/llm_request.py

Lines changed: 26 additions & 1 deletion

@@ -262,6 +262,8 @@ def __init__(self,
             chunk_size=self._chunk_size) if return_generation_logits else None
         self._log_probs = LogProbStorage() if return_log_probs else None
         self._mm_embeddings = None
+        self._mrope_position_ids = None
+        self._mrope_position_deltas = None
         self._additional_context_outputs = {
             name: []
             for name in additional_outputs
@@ -293,6 +295,16 @@ def append_mm_embeddings(self, mm_embeddings: torch.Tensor):
         self._mm_embeddings = SharedTensorContainer.from_tensor(
             mm_embeddings).dump_to_dict()

+    def set_mrope_position(
+        self,
+        mrope_position_ids: torch.Tensor,
+        mrope_position_deltas: torch.Tensor,
+    ):
+        self._mrope_position_ids = (SharedTensorContainer.from_tensor(
+            mrope_position_ids).dump_to_dict())
+        self._mrope_position_deltas = (SharedTensorContainer.from_tensor(
+            mrope_position_deltas).dump_to_dict())
+
     def transfer_remaining_device_logits(self):
         """Finalize any remaining generation logits transfers (for chunked mode)"""
         if self._generation_logits:
@@ -352,6 +364,18 @@ def cum_log_probs(self) -> list[float] | None:
     def mm_embedding_handle(self) -> Dict[str, Any] | None:
         return self._mm_embeddings

+    @property
+    def mrope_position_ids_handle(self) -> Dict[str, Any] | None:
+        # NOTE: when populated, the returned `dict` contains the information necessary to rebuild
+        # the `SharedTensorContainer` using the `from_dict` class method.
+        return self._mrope_position_ids
+
+    @property
+    def mrope_position_deltas_handle(self) -> Dict[str, Any] | None:
+        # NOTE: when populated, the returned `dict` contains the information necessary to rebuild
+        # the `SharedTensorContainer` using the `from_dict` class method.
+        return self._mrope_position_deltas
+
     @property
     def additional_context_outputs(self) -> Dict[str, torch.Tensor] | None:
         if self._additional_context_outputs is None:
@@ -382,7 +406,8 @@ class LlmResult:
     py_result_properties = frozenset(
         ('context_logits', 'generation_logits', 'log_probs', 'cum_log_probs',
          'mm_embedding_handle', 'additional_context_outputs',
-         'additional_generation_outputs'))
+         'additional_generation_outputs', 'mrope_position_ids_handle',
+         'mrope_position_deltas_handle'))

     def __init__(self,
                  result: Union[bytes, tensorrt_llm.bindings.executor.Result],
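A rough sketch of the round trip those handles enable, under the assumption that `SharedTensorContainer` is importable from `tensorrt_llm._torch.shared_tensor` (the import path is not shown in this diff); the final tensor-materialization step on the consumer side is likewise only indicated as a comment.

import torch
from tensorrt_llm._torch.shared_tensor import SharedTensorContainer  # assumed path

# Producer side (encoder worker): set_mrope_position() stores plain-dict handles.
mrope_position_ids = torch.zeros(3, 1, 128, dtype=torch.int64)   # illustrative shape
handle = SharedTensorContainer.from_tensor(mrope_position_ids).dump_to_dict()

# The handle is an ordinary dict, so it can be serialized and sent across
# processes as part of the disaggregated request/response payload.
assert isinstance(handle, dict)

# Consumer side (prefill worker): rebuild the container from the handle, as the
# NOTE on the new properties describes, then recover the tensor through the
# container's accessor (not shown in this diff).
container = SharedTensorContainer.from_dict(handle)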

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 31 additions & 10 deletions

@@ -2213,13 +2213,14 @@ def _prepare_tp_inputs(
                             mrope_position_deltas).expand(
                                 3, 1, 1)
                     mrope_position_ids.append(gen_mrope_position_ids)
-                    multimodal_params.to_device(
-                        "multimodal_data",
-                        "cuda",
-                        pin_memory=True,
-                        target_keywords=[
-                            "mrope_config.mrope_position_deltas"
-                        ])
+                    if mrope_position_deltas.device.type == "cpu":
+                        multimodal_params.to_device(
+                            "multimodal_data",
+                            "cuda",
+                            pin_memory=True,
+                            target_keywords=[
+                                "mrope_config.mrope_position_deltas"
+                            ])
                     multimodal_params_list.append(multimodal_params)

             request.py_batch_idx = request.py_seq_slot
@@ -2448,8 +2449,9 @@ def previous_seq_slots_device():
                 # NOTE: self.use_mrope is enough for differentiating whether to use mrope_position_ids but
                 # `_create_dummy_context_requests` from `kv_cache_creater` makes an exception that I can not add multimodal_data to the dummy_request
                 # so that we only replace position_ids with mrope_position_ids when it is not a dummy request and for models who is using mrope.
-                mrope_position_ids = torch.cat(mrope_position_ids,
-                                               dim=-1).pin_memory()
+                mrope_position_ids = torch.cat(mrope_position_ids, dim=-1)
+                if mrope_position_ids.device.type == "cpu":
+                    mrope_position_ids = mrope_position_ids.pin_memory()
                 self.mrope_position_ids_cuda[:, :, :total_num_tokens].copy_(
                     mrope_position_ids[:, :, :total_num_tokens], non_blocking=True)
                 final_position_ids = self.mrope_position_ids_cuda[:, :, :
@@ -3362,7 +3364,26 @@ def _forward_step_mm_encoder_only(
             mm_embeddings = list(
                 torch.split(mm_embeddings[0], multimodal_chunks, dim=0))

-        return {'mm_embeddings': mm_embeddings, 'logits': None}
+        # Extract mrope position data from multimodal_params if available
+        mrope_position_ids_list = []
+        mrope_position_deltas_list = []
+        for multimodal_param in multimodal_params:
+            mrope_config = multimodal_param.multimodal_data.get(
+                'mrope_config', {})
+            mrope_position_ids = mrope_config.get('mrope_position_ids')
+            mrope_position_deltas = mrope_config.get('mrope_position_deltas')
+            if mrope_position_ids is not None:
+                mrope_position_ids_list.append(mrope_position_ids)
+            if mrope_position_deltas is not None:
+                mrope_position_deltas_list.append(mrope_position_deltas)
+
+        result = {'mm_embeddings': mm_embeddings, 'logits': None}
+        if mrope_position_ids_list:
+            result['mrope_position_ids'] = mrope_position_ids_list
+        if mrope_position_deltas_list:
+            result['mrope_position_deltas'] = mrope_position_deltas_list
+
+        return result

     def _init_userbuffers(self, hidden_size):
         if self.mapping.tp_size <= 1 or self.mapping.pp_size > 1:
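The new `device.type == "cpu"` checks matter because page-locking only applies to host memory; presumably the MRoPE tensors can now already arrive as device tensors in the EPD disagg path, and unconditionally calling pin_memory() on them would be invalid. A minimal illustration of the guard, with made-up shapes:

import torch

def maybe_pin(t: torch.Tensor) -> torch.Tensor:
    # pin_memory() is only meaningful for CPU tensors; tensors already on the
    # GPU are returned untouched, mirroring the guard added above.
    return t.pin_memory() if t.device.type == "cpu" else t

if torch.cuda.is_available():
    cpu_ids = torch.zeros(3, 1, 16, dtype=torch.int64)
    assert maybe_pin(cpu_ids).is_pinned()      # host tensor gets page-locked
    gpu_ids = cpu_ids.cuda()
    assert maybe_pin(gpu_ids) is gpu_ids       # already on device: left alone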

tensorrt_llm/_torch/pyexecutor/sampler.py

Lines changed: 19 additions & 3 deletions

@@ -21,7 +21,7 @@
 from dataclasses import dataclass
 from functools import cached_property
 from itertools import repeat
-from typing import Any, Callable, Generic, List, Optional, Type, TypeVar, cast
+from typing import Any, Callable, Dict, Generic, List, Optional, Type, TypeVar, cast

 import numpy as np
 import torch
@@ -199,6 +199,8 @@ def is_generation_model(self) -> bool:
 @dataclass(kw_only=True)
 class MultimodalResult:
     mm_embeddings: List[torch.Tensor]
+    # Can be used to include e.g. `mrope_position_ids`, etc.
+    extra_data: Optional[Dict[str, Any]] = None

     def values(self):
         return vars(self).values()
@@ -262,7 +264,10 @@ def sample_async(
         resource_manager: Optional[ResourceManager] = None,
     ) -> SampleStateWithMMResult:
         # from model_outputs to MultimodalResult
-        data = MultimodalResult(mm_embeddings=model_outputs["mm_embeddings"])
+        data = MultimodalResult(
+            mm_embeddings=model_outputs.pop("mm_embeddings"),
+            extra_data={**model_outputs},
+        )
         return SampleStateWithMMResult(scheduled_requests=scheduled_requests, data=data)

     @override
@@ -276,7 +281,12 @@ def update_requests(
         scheduled_requests = state.scheduled_requests
         assert not scheduled_requests.generation_requests
         mm_embeddings = state.data.mm_embeddings
-        for request, mm_embedding in zip(scheduled_requests.context_requests, mm_embeddings):
+        extra_data = state.data.extra_data or {}
+        mrope_position_ids = extra_data.get("mrope_position_ids", None)
+        mrope_position_deltas = extra_data.get("mrope_position_deltas", None)
+        for i, (request, mm_embedding) in enumerate(
+            zip(scheduled_requests.context_requests, mm_embeddings)
+        ):
             request.state = LlmRequestState.GENERATION_COMPLETE
             # NOTE: This is a hack: set finish reason manually and set the beam 0
             request.set_finished_reason(FinishReason.LENGTH, 0)
@@ -287,6 +297,12 @@ def update_requests(

             request.py_result.append_mm_embeddings(mm_embedding)

+            # Store mrope data if available
+            if mrope_position_ids is not None and mrope_position_deltas is not None:
+                request.py_result.set_mrope_position(
+                    mrope_position_ids[i], mrope_position_deltas[i]
+                )
+
     @override
     def is_generation_model(self) -> bool:
         return False
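For orientation, the encoder worker's outputs consumed by this sampler now look roughly like the dict below (keys match `_forward_step_mm_encoder_only` above; shapes and values are made up). Everything except `mm_embeddings` is carried through `extra_data` and then indexed per request in `update_requests`.

import torch

model_outputs = {
    "mm_embeddings": [torch.randn(256, 3584)],                        # one entry per request
    "logits": None,
    "mrope_position_ids": [torch.zeros(3, 1, 256, dtype=torch.int64)],
    "mrope_position_deltas": [torch.zeros(1, 1, dtype=torch.int64)],
}

mm_embeddings = model_outputs.pop("mm_embeddings")
extra_data = {**model_outputs}
# update_requests() pairs request i with mm_embeddings[i] and, when both lists are
# present, extra_data["mrope_position_ids"][i] / extra_data["mrope_position_deltas"][i].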

tensorrt_llm/disaggregated_params.py

Lines changed: 2 additions & 0 deletions

@@ -40,6 +40,8 @@ class DisaggregatedParams:
     multimodal_hashes: Optional[List[List[int]]] = (
         None  # user provided mm hashes should be a list of 8 integers
     )
+    mrope_position_ids_handle: Optional[Dict[str, Any]] = None
+    mrope_position_deltas_handle: Optional[Dict[str, Any]] = None

     def get_context_phase_params(self) -> tllme.ContextPhaseParams:
         return tllme.ContextPhaseParams(
