Skip to content

Commit 6f35665

Browse files
committed
test: add multimodal model fallback tests
Add TestMultimodalModelFallbacks class to verify the fallback mechanism works correctly for models in MULTIMODAL_MATRIX:
- HCXVisionForCausalLM (CHUNKED_PREFILL=NO)
- LlavaLlamaModel (VILA) (CHUNKED_PREFILL=NO, KV_CACHE_REUSE=NO)
- Llama4ForConditionalGeneration (in both matrices)
- Qwen2VLForConditionalGeneration (all features YES)

Also removes redundant inline comments from fallback code.

Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com>
1 parent fb6e36f commit 6f35665

File tree

2 files changed

+57
-9
lines changed

2 files changed

+57
-9
lines changed

tensorrt_llm/llmapi/llm.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -331,15 +331,13 @@ def _disable_kv_cache_reuse() -> None:
331331
),
332332
)
333333

334-
# CUDA graph: disable by setting cuda_graph_config to None
335334
_disable_if_unsupported(
336335
SupportFeature.CUDA_GRAPH,
337336
enabled=getattr(self.args, "cuda_graph_config", None) is not None,
338337
arg_path="cuda_graph_config",
339338
disable=lambda: setattr(self.args, "cuda_graph_config", None),
340339
)
341340

342-
# Guided decoding: disable by setting guided_decoding_backend to None
343341
_disable_if_unsupported(
344342
SupportFeature.GUIDED_DECODING,
345343
enabled=getattr(self.args, "guided_decoding_backend", None)
@@ -348,13 +346,6 @@ def _disable_kv_cache_reuse() -> None:
348346
disable=lambda: setattr(self.args, "guided_decoding_backend", None),
349347
)
350348

351-
# Features NOT handled here (with reasons):
352-
# - DISAGGREGATED_SERVING: Server/deployment-level config, not LLM init
353-
# - MTP, EAGLE3_*: User explicitly configures speculative decoding
354-
# - TORCH_SAMPLER, TLLM_CPP_SAMPLER: sampler_type=auto handles selection
355-
# - SLIDING_WINDOW_ATTENTION: Model architecture inherent, not configurable
356-
# - LOGITS_POST_PROCESSOR: User-provided callback, not a simple flag
357-
358349
@property
359350
@set_api_status("beta")
360351
def llm_id(self) -> str:

tests/unittest/llmapi/test_model_feature_fallbacks.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,5 +326,62 @@ def test_guided_decoding_no_change_when_not_configured(self):
326326
assert mock.args.guided_decoding_backend is None
327327

328328

329+
class TestMultimodalModelFallbacks:
330+
"""Tests for multimodal models (MULTIMODAL_MATRIX coverage)."""
331+
332+
def test_chunked_prefill_disabled_for_hcxvision(self):
333+
"""HCXVisionForCausalLM (multimodal) has CHUNKED_PREFILL=NO, should disable."""
334+
mock = _make_mock_llm(
335+
architecture="HCXVisionForCausalLM",
336+
chunked_prefill=True,
337+
)
338+
339+
BaseLLM._apply_model_feature_fallbacks(mock)
340+
341+
assert mock.args.enable_chunked_prefill is False
342+
343+
def test_multiple_features_disabled_for_llava_vila(self):
344+
"""LlavaLlamaModel (VILA) has CHUNKED_PREFILL=NO and KV_CACHE_REUSE=NO."""
345+
mock = _make_mock_llm(
346+
architecture="LlavaLlamaModel (VILA)",
347+
chunked_prefill=True,
348+
kv_cache_reuse=True,
349+
)
350+
351+
BaseLLM._apply_model_feature_fallbacks(mock)
352+
353+
# Both features should be disabled
354+
assert mock.args.enable_chunked_prefill is False
355+
assert mock.args.kv_cache_config.enable_block_reuse is False
356+
357+
def test_multimodal_llama4_chunked_prefill_disabled(self):
358+
"""Llama4ForConditionalGeneration (in both matrices) has CHUNKED_PREFILL=NO in multimodal."""
359+
mock = _make_mock_llm(
360+
architecture="Llama4ForConditionalGeneration",
361+
chunked_prefill=True,
362+
kv_cache_reuse=True,
363+
)
364+
365+
BaseLLM._apply_model_feature_fallbacks(mock)
366+
367+
# MULTIMODAL_MATRIX is checked first, has CHUNKED_PREFILL=NO and KV_CACHE_REUSE=NO
368+
assert mock.args.enable_chunked_prefill is False
369+
assert mock.args.kv_cache_config.enable_block_reuse is False
370+
371+
def test_multimodal_supported_features_stay_enabled(self):
372+
"""Qwen2VLForConditionalGeneration has most features as YES."""
373+
mock = _make_mock_llm(
374+
architecture="Qwen2VLForConditionalGeneration",
375+
chunked_prefill=True,
376+
kv_cache_reuse=True,
377+
)
378+
379+
BaseLLM._apply_model_feature_fallbacks(mock)
380+
381+
# Both features should stay enabled (YES in matrix)
382+
assert mock.args.enable_chunked_prefill is True
383+
assert mock.args.kv_cache_config.enable_block_reuse is True
384+
385+
329386
if __name__ == "__main__":
330387
pytest.main([__file__, "-v"])

0 commit comments

Comments
 (0)