Skip to content

Commit 6f35665

Browse files
committed
test: add multimodal model fallback tests
Add TestMultimodalModelFallbacks class to verify the fallback mechanism works correctly for models in MULTIMODAL_MATRIX:
- HCXVisionForCausalLM (CHUNKED_PREFILL=NO)
- LlavaLlamaModel (VILA) (CHUNKED_PREFILL=NO, KV_CACHE_REUSE=NO)
- Llama4ForConditionalGeneration (in both matrices)
- Qwen2VLForConditionalGeneration (all features YES)

Also removes redundant inline comments from fallback code.

Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com>
1 parent fb6e36f commit 6f35665

File tree

2 files changed

+57
-9
lines changed

2 files changed

+57
-9
lines changed

tensorrt_llm/llmapi/llm.py

Lines changed: 0 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -331,15 +331,13 @@ def _disable_kv_cache_reuse() -> None:
331331
),
332332
)
333333

334-
# CUDA graph: disable by setting cuda_graph_config to None
335334
_disable_if_unsupported(
336335
SupportFeature.CUDA_GRAPH,
337336
enabled=getattr(self.args, "cuda_graph_config", None) is not None,
338337
arg_path="cuda_graph_config",
339338
disable=lambda: setattr(self.args, "cuda_graph_config", None),
340339
)
341340

342-
# Guided decoding: disable by setting guided_decoding_backend to None
343341
_disable_if_unsupported(
344342
SupportFeature.GUIDED_DECODING,
345343
enabled=getattr(self.args, "guided_decoding_backend", None)
@@ -348,13 +346,6 @@ def _disable_kv_cache_reuse() -> None:
348346
disable=lambda: setattr(self.args, "guided_decoding_backend", None),
349347
)
350348

351-
# Features NOT handled here (with reasons):
352-
# - DISAGGREGATED_SERVING: Server/deployment-level config, not LLM init
353-
# - MTP, EAGLE3_*: User explicitly configures speculative decoding
354-
# - TORCH_SAMPLER, TLLM_CPP_SAMPLER: sampler_type=auto handles selection
355-
# - SLIDING_WINDOW_ATTENTION: Model architecture inherent, not configurable
356-
# - LOGITS_POST_PROCESSOR: User-provided callback, not a simple flag
357-
358349
@property
359350
@set_api_status("beta")
360351
def llm_id(self) -> str:

tests/unittest/llmapi/test_model_feature_fallbacks.py

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -326,5 +326,62 @@ def test_guided_decoding_no_change_when_not_configured(self):
326326
assert mock.args.guided_decoding_backend is None
327327

328328

329+
class TestMultimodalModelFallbacks:
330+
"""Tests for multimodal models (MULTIMODAL_MATRIX coverage)."""
331+
332+
def test_chunked_prefill_disabled_for_hcxvision(self):
333+
"""HCXVisionForCausalLM (multimodal) has CHUNKED_PREFILL=NO, should disable."""
334+
mock = _make_mock_llm(
335+
architecture="HCXVisionForCausalLM",
336+
chunked_prefill=True,
337+
)
338+
339+
BaseLLM._apply_model_feature_fallbacks(mock)
340+
341+
assert mock.args.enable_chunked_prefill is False
342+
343+
def test_multiple_features_disabled_for_llava_vila(self):
344+
"""LlavaLlamaModel (VILA) has CHUNKED_PREFILL=NO and KV_CACHE_REUSE=NO."""
345+
mock = _make_mock_llm(
346+
architecture="LlavaLlamaModel (VILA)",
347+
chunked_prefill=True,
348+
kv_cache_reuse=True,
349+
)
350+
351+
BaseLLM._apply_model_feature_fallbacks(mock)
352+
353+
# Both features should be disabled
354+
assert mock.args.enable_chunked_prefill is False
355+
assert mock.args.kv_cache_config.enable_block_reuse is False
356+
357+
def test_multimodal_llama4_chunked_prefill_disabled(self):
358+
"""Llama4ForConditionalGeneration (in both matrices) has CHUNKED_PREFILL=NO in multimodal."""
359+
mock = _make_mock_llm(
360+
architecture="Llama4ForConditionalGeneration",
361+
chunked_prefill=True,
362+
kv_cache_reuse=True,
363+
)
364+
365+
BaseLLM._apply_model_feature_fallbacks(mock)
366+
367+
# MULTIMODAL_MATRIX is checked first, has CHUNKED_PREFILL=NO and KV_CACHE_REUSE=NO
368+
assert mock.args.enable_chunked_prefill is False
369+
assert mock.args.kv_cache_config.enable_block_reuse is False
370+
371+
def test_multimodal_supported_features_stay_enabled(self):
372+
"""Qwen2VLForConditionalGeneration has most features as YES."""
373+
mock = _make_mock_llm(
374+
architecture="Qwen2VLForConditionalGeneration",
375+
chunked_prefill=True,
376+
kv_cache_reuse=True,
377+
)
378+
379+
BaseLLM._apply_model_feature_fallbacks(mock)
380+
381+
# Both features should stay enabled (YES in matrix)
382+
assert mock.args.enable_chunked_prefill is True
383+
assert mock.args.kv_cache_config.enable_block_reuse is True
384+
385+
329386
if __name__ == "__main__":
330387
pytest.main([__file__, "-v"])

0 commit comments

Comments
 (0)