Skip to content

Commit f0b86b6

Browse files
committed
include check for the chunked prefill flag during profiling
Signed-off-by: John Calderon <jcalderon@nvidia.com>
1 parent 85ec2fc commit f0b86b6

File tree

4 files changed

+7
-9
lines changed

4 files changed

+7
-9
lines changed

tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -205,9 +205,11 @@ def _create_dummy_mm_context_request(
205205
def _create_dummy_context_requests(
206206
self, input_seq_len: int) -> List[trtllm.Request]:
207207
requests = []
208-
if hasattr(self._model_engine.model,
209-
"original_arch") and MODEL_CLASS_VISION_ENCODER_MAPPING.get(
210-
self._model_engine.model.original_arch, None):
208+
if hasattr(
209+
self._model_engine.model,
210+
"original_arch") and MODEL_CLASS_VISION_ENCODER_MAPPING.get(
211+
self._model_engine.model.original_arch, None
212+
) and self._model_engine.attn_runtime_features.chunked_prefill:
211213
requests = self._create_dummy_mm_context_request(input_seq_len)
212214
# if succeed profiling with multimodal requests then return, otherwise profile
213215
# with default case

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3500,7 +3500,6 @@ class TestQwen2_VL_7B(LlmapiAccuracyTestHarness):
35003500
def test_auto_dtype(self):
35013501
with LLM(self.MODEL_PATH,
35023502
max_num_tokens=16384,
3503-
enable_chunked_prefill=True,
35043503
kv_cache_config=self.kv_cache_config) as llm:
35053504
task = MMMU(self.MODEL_NAME)
35063505
task.evaluate(llm, sampling_params=self.sampling_params)

tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -48,7 +48,7 @@ def server(model_name: str, temp_extra_llm_api_options_file: str):
4848
model_path = get_model_path(model_name)
4949
args = [
5050
"--extra_llm_api_options", temp_extra_llm_api_options_file,
51-
"--max_batch_size", "64", "--enable_chunked_prefill"
51+
"--max_batch_size", "64"
5252
]
5353
with RemoteOpenAIServer(model_path, args) as remote_server:
5454
yield remote_server

tests/unittest/llmapi/apps/_test_trtllm_serve_multimodal_example.py

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -46,10 +46,7 @@ def temp_extra_llm_api_options_file(request):
4646
@pytest.fixture(scope="module")
4747
def server(model_name: str, temp_extra_llm_api_options_file: str):
4848
model_path = get_model_path(model_name)
49-
args = [
50-
"--enable_chunked_prefill", "--extra_llm_api_options",
51-
temp_extra_llm_api_options_file
52-
]
49+
args = ["--extra_llm_api_options", temp_extra_llm_api_options_file]
5350
with RemoteOpenAIServer(model_path, port=8000,
5451
cli_args=args) as remote_server:
5552
yield remote_server

0 commit comments

Comments (0)