Skip to content

Commit f2af195

Browse files
Wanli-Jiang and mikeiovine
authored and committed
[None][fix] Bypass key-word matching for multimodal tests (NVIDIA#9170)
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com> Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com> Signed-off-by: Mike Iovine <miovine@nvidia.com>
1 parent 4e19bea commit f2af195

File tree

9 files changed

+85
-78
lines changed

9 files changed

+85
-78
lines changed

tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,3 +15,5 @@ Efficient-Large-Model/VILA1.5-3b:
1515
# the metric here is for model sanity checking.
1616
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16:
1717
- accuracy: 26.67
18+
microsoft/Phi-4-multimodal-instruct:
19+
- accuracy: 53.67

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -192,3 +192,25 @@ def test_auto_dtype(self):
192192
sampling_params=self.sampling_params,
193193
extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS,
194194
)
195+
196+
197+
class TestPhi4MMFusedVisionLora(LlmapiAccuracyTestHarness):
198+
MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
199+
MODEL_PATH = f"{llm_models_root()}/multimodals/Phi-4-multimodal-instruct-fuse-vision-lora"
200+
MAX_NUM_TOKENS = 25600
201+
202+
sampling_params = SamplingParams(
203+
max_tokens=MAX_NUM_TOKENS, truncate_prompt_tokens=MMMU.MAX_INPUT_LEN, stop="<|USER|>"
204+
)
205+
206+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
207+
208+
def test_auto_dtype(self):
209+
with LLM(
210+
self.MODEL_PATH,
211+
max_batch_size=32,
212+
max_num_tokens=self.MAX_NUM_TOKENS,
213+
kv_cache_config=self.kv_cache_config,
214+
) as llm:
215+
task = MMMU(self.MODEL_NAME)
216+
task.evaluate(llm, sampling_params=self.sampling_params)

tests/integration/defs/test_e2e.py

Lines changed: 40 additions & 57 deletions
Original file line numberDiff line numberDiff line change
@@ -2558,7 +2558,9 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25582558
print("output:", output)
25592559
return
25602560

2561-
match_ratio = 4.0 / 5
2561+
# Set match ratio to 0.0 to bypass keyword matching.
2562+
match_ratio = 0.0
2563+
25622564
parsed_outputs = parse_output(output)
25632565
for prompt_output, prompt_keywords in zip(
25642566
parsed_outputs, expected_keywords[model_name][modality]):
@@ -2572,29 +2574,21 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
25722574

25732575

25742576
@pytest.mark.parametrize("modality", ["image", "video"])
2575-
@pytest.mark.parametrize(
2576-
"model_name,model_path,match_ratio",
2577-
[
2578-
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct",
2579-
0.8),
2580-
pytest.param("phi4-multimodal-instruct-fp4",
2581-
"multimodals/Phi-4-multimodal-instruct-FP4",
2582-
0.8,
2583-
marks=skip_pre_blackwell),
2584-
pytest.param("phi4-multimodal-instruct-fp8",
2585-
"multimodals/Phi-4-multimodal-instruct-FP8",
2586-
0.8,
2587-
marks=skip_pre_hopper),
2588-
pytest.param(
2589-
"mistral-small-3.1-24b-instruct",
2590-
"Mistral-Small-3.1-24B-Instruct-2503",
2591-
# Lower threshold to give some wiggle room for flakiness.
2592-
0.6,
2593-
marks=pytest.mark.skip_less_device_memory(80000)),
2594-
])
2577+
@pytest.mark.parametrize("model_name,model_path", [
2578+
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
2579+
pytest.param("phi4-multimodal-instruct-fp4",
2580+
"multimodals/Phi-4-multimodal-instruct-FP4",
2581+
marks=skip_pre_blackwell),
2582+
pytest.param("phi4-multimodal-instruct-fp8",
2583+
"multimodals/Phi-4-multimodal-instruct-FP8",
2584+
marks=skip_pre_hopper),
2585+
pytest.param("mistral-small-3.1-24b-instruct",
2586+
"Mistral-Small-3.1-24B-Instruct-2503",
2587+
marks=pytest.mark.skip_less_device_memory(80000)),
2588+
])
25952589
def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
25962590
model_name, model_path,
2597-
modality, match_ratio):
2591+
modality):
25982592
# NOTE: individual tests need to be enabled in
25992593
# tests/integration/test_lists/qa/examples_test_list.txt
26002594

@@ -2684,7 +2678,9 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
26842678
cmd.append("Phi4MMForCausalLM")
26852679

26862680
output = llm_venv.run_cmd(cmd, caller=check_output)
2687-
match_ratio = 4.0 / 5
2681+
2682+
# Set match ratio to 0.0 to bypass keyword matching.
2683+
match_ratio = 0.0
26882684
for prompt_output, prompt_keywords in zip(
26892685
parse_output(output), expected_keywords[model_name][modality]):
26902686
matches = [
@@ -2702,29 +2698,21 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
27022698

27032699

27042700
@pytest.mark.parametrize("modality", ["image", "video"])
2705-
@pytest.mark.parametrize(
2706-
"model_name,model_path,match_ratio",
2707-
[
2708-
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct",
2709-
0.8),
2710-
pytest.param("phi4-multimodal-instruct-fp4",
2711-
"multimodals/Phi-4-multimodal-instruct-FP4",
2712-
0.8,
2713-
marks=skip_pre_blackwell),
2714-
pytest.param("phi4-multimodal-instruct-fp8",
2715-
"multimodals/Phi-4-multimodal-instruct-FP8",
2716-
0.8,
2717-
marks=skip_pre_hopper),
2718-
pytest.param(
2719-
"mistral-small-3.1-24b-instruct",
2720-
"Mistral-Small-3.1-24B-Instruct-2503",
2721-
# Lower threshold to give some wiggle room for flakiness.
2722-
0.6,
2723-
marks=pytest.mark.skip_less_device_memory(80000)),
2724-
])
2701+
@pytest.mark.parametrize("model_name,model_path", [
2702+
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct"),
2703+
pytest.param("phi4-multimodal-instruct-fp4",
2704+
"multimodals/Phi-4-multimodal-instruct-FP4",
2705+
marks=skip_pre_blackwell),
2706+
pytest.param("phi4-multimodal-instruct-fp8",
2707+
"multimodals/Phi-4-multimodal-instruct-FP8",
2708+
marks=skip_pre_hopper),
2709+
pytest.param("mistral-small-3.1-24b-instruct",
2710+
"Mistral-Small-3.1-24B-Instruct-2503",
2711+
marks=pytest.mark.skip_less_device_memory(80000)),
2712+
])
27252713
def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv,
27262714
model_name, model_path,
2727-
modality, match_ratio):
2715+
modality):
27282716
# NOTE: individual tests need to be enabled in
27292717
# tests/integration/test_lists/qa/examples_test_list.txt
27302718

@@ -2843,6 +2831,8 @@ def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv,
28432831
cmd.append("Phi4MMForCausalLM")
28442832

28452833
output = llm_venv.run_cmd(cmd, caller=check_output)
2834+
# Set match ratio to 0.0 to bypass keyword matching.
2835+
match_ratio = 0.0
28462836
for prompt_output, prompt_keywords in zip(
28472837
parse_output(output), expected_keywords[model_name][modality]):
28482838
matches = [
@@ -2944,7 +2934,8 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, model_name,
29442934
]
29452935
output = llm_venv.run_cmd(cmd, caller=check_output)
29462936

2947-
match_ratio = 0.6
2937+
# Set match ratio to 0.0 to bypass keyword matching.
2938+
match_ratio = 0.0
29482939
parsed_outputs = parse_output(output)
29492940
for prompt_output, prompt_keywords in zip(parsed_outputs,
29502941
expected_keywords[modality]):
@@ -3069,12 +3060,8 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
30693060
print("output:", output)
30703061
return
30713062

3072-
# Set match ratio based on model
3073-
match_ratio = 4.0 / 5
3074-
if model_name.startswith("phi4-multimodal-instruct"):
3075-
match_ratio = 0.6
3076-
3077-
# Check output accuracy
3063+
# Set match ratio to 0.0 to bypass keyword matching.
3064+
match_ratio = 0.0
30783065
parsed_outputs = parse_output(output)
30793066
for prompt_output, prompt_keywords in zip(
30803067
parsed_outputs, expected_keywords[model_name]["image"]):
@@ -3197,12 +3184,8 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
31973184
)
31983185
return
31993186

3200-
# Set match ratio based on model
3201-
match_ratio = 4.0 / 5
3202-
if model_name.startswith("Phi-4-multimodal-instruct"):
3203-
match_ratio = 0.6
3204-
3205-
# Check output accuracy
3187+
# Set match ratio to 0.0 to bypass keyword matching.
3188+
match_ratio = 0.0
32063189
parsed_outputs = parse_output(output)
32073190
for prompt_output, prompt_keywords in zip(
32083191
parsed_outputs, expected_keywords[model_name]["image"]):

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 9 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -631,6 +631,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_aut
631631
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
632632
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
633633
accuracy/test_llm_api_pytorch_multimodal.py::TestNemotron_Nano_12B_V2_VL::test_auto_dtype
634+
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
634635

635636
test_e2e.py::test_llama_e2e[use_cpp_session-remove_input_padding-]
636637
test_e2e.py::test_llama_e2e[use_py_session-remove_input_padding-]
@@ -674,14 +675,14 @@ test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistr
674675
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
675676
test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-False]
676677
test_e2e.py::test_ptp_quickstart_multimodal[gemma-3-27b-it-gemma/gemma-3-27b-it-image-True]
677-
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
678-
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image]
679-
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image]
680-
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-0.8-image]
681-
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
682-
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image]
683-
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-0.8-image]
684-
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-0.8-image]
678+
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image]
679+
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
680+
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image]
681+
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image]
682+
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image]
683+
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
684+
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image]
685+
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image]
685686
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
686687
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
687688
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]

tests/integration/test_lists/qa/llm_function_l20.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -51,6 +51,7 @@ accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
5151
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
5252
accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype
5353
accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype
54+
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
5455

5556
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
5657
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 5 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -375,6 +375,7 @@ accuracy/test_llm_api_pytorch.py::TestKimiK2::test_fp8_blockscale[latency]
375375
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_VL_7B::test_auto_dtype
376376
accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype
377377
accuracy/test_llm_api_pytorch_multimodal.py::TestLlava_V1_6_Mistral_7B::test_auto_dtype
378+
accuracy/test_llm_api_pytorch_multimodal.py::TestPhi4MMFusedVisionLora::test_auto_dtype
378379
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
379380
accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
380381
test_e2e.py::test_openai_chat_harmony
@@ -387,10 +388,10 @@ test_e2e.py::test_llmapi_generation_logits[llama-3.1-model/Llama-3.1-8B-Instruct
387388
test_e2e.py::test_llmapi_generation_logits[llama-3.1-model/Llama-3.1-8B-Instruct-False]
388389
test_e2e.py::test_llmapi_generation_logits[llama-3.3-models/Llama-3.3-70B-Instruct-True]
389390
test_e2e.py::test_llmapi_generation_logits[llama-3.3-models/Llama-3.3-70B-Instruct-False]
390-
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
391-
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image]
392-
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
393-
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image]
391+
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image]
392+
test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
393+
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image]
394+
test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
394395

395396
examples/serve/test_serve.py::test_extra_llm_api_options
396397
examples/serve/test_serve_negative.py::test_invalid_max_tokens

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -265,8 +265,8 @@ l0_h100:
265265
- accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_guided_decoding_with_ngram[llguidance]
266266
- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
267267
- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
268-
- test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
269-
- test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
268+
- test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image]
269+
- test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image]
270270
- examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]
271271
- examples/test_phi.py::test_phi_4_mini_instruct_with_bf16_lora_torch[Phi-4-mini-instruct]
272272
- examples/test_llama.py::test_llama_3_x_with_bf16_lora_torch[llama-3.2-1b-instruct]

tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ l0_rtx_pro_6000:
3535
- test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B_nvfp4_hf-Qwen3/saved_models_Qwen3-30B-A3B_nvfp4_hf] # 2mins
3636
- test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-20B-gpt_oss/gpt-oss-20b]
3737
- test_e2e.py::test_ptp_quickstart_advanced[GPT-OSS-120B-gpt_oss/gpt-oss-120b]
38-
- test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-0.8-image]
38+
- test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
3939
- test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4-image_audio]
4040
- test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8-image_audio]
4141
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[latency_moe_cutlass-torch_compile=False] # 8mins

0 commit comments

Comments
 (0)