@@ -172,13 +172,10 @@ def _create_dummy_mm_context_request(
172172
173173 max_num_tokens = len (prompt_token_ids )
174174 assert max_num_tokens > 0 , "the length of the prompt of the dummy mm req is less than or equal to 0"
175- remaining_tokens = max (max_num_tokens , input_seq_len )
175+ remaining_tokens = min (max_num_tokens , input_seq_len )
176176 if remaining_tokens > input_seq_len :
177177 logger .warning (f"Profiling with multimedia prompt which contains more tokens than the allowed input_seq_len. " \
178178 f"Multimodal prompt has { remaining_tokens } while the input_seq_len is: { input_seq_len } " )
179- ## add + 1 to avoid error: RuntimeError: The max KV cache length of input sequences (X + 1) exceeds the KV cache manager's maximum supported length X.
180- ## at line "/code/tensorrt_llm/tensorrt_llm/_torch/attention_backend/trtllm.py", line 837
181- self ._max_seq_len = remaining_tokens + 1
182179 while remaining_tokens > 0 :
183180 req_mm_input = trtllm .MultimodalInput (
184181 multimodal_hashes = multimodal_input .multimodal_hashes ,
@@ -193,6 +190,9 @@ def _create_dummy_mm_context_request(
193190 output_config = trtllm .OutputConfig (),
194191 end_id = - 1 ,
195192 multimodal_input = req_mm_input )
193+ # TODO:
194+ # create_input_processor_with_hash shouldn’t be required during profiling,
195+ # but is temporarily needed due to the multimodal input dependency for chunked prefill
196196 request .py_multimodal_data = multimodal_data
197197 remaining_tokens -= max_num_tokens
198198 requests .append (request )
@@ -205,11 +205,10 @@ def _create_dummy_mm_context_request(
205205 def _create_dummy_context_requests (
206206 self , input_seq_len : int ) -> List [trtllm .Request ]:
207207 requests = []
208- if hasattr (
209- self ._model_engine .model ,
210- "original_arch" ) and MODEL_CLASS_VISION_ENCODER_MAPPING .get (
211- self ._model_engine .model .original_arch , None
212- ) and self ._model_engine .attn_runtime_features .chunked_prefill :
208+ if hasattr (self ._model_engine .model ,
209+ "original_arch" ) and MODEL_CLASS_VISION_ENCODER_MAPPING .get (
210+ self ._model_engine .model .original_arch , None ):
211+ input_seq_len = min (self ._max_num_tokens , input_seq_len )
213212 requests = self ._create_dummy_mm_context_request (input_seq_len )
214213 # if succeed profiling with multimodal requests then return, otherwise profile
215214 # with default case
0 commit comments