Skip to content

Commit b177156

Browse files
committed
[None][fix] Bypass key-word matching for multimodal tests
It will fix * https://nvbugs/5547437 * https://nvbugs/5568836 * https://nvbugs/5591109 * https://nvbugs/5630274 Also unwaived the below tests: * https://nvbugs/5509024 * https://nvbugs/5444095 * https://nvbugs/5453725 Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
1 parent 6d28e6c commit b177156

File tree

7 files changed

+42
-52
lines changed

7 files changed

+42
-52
lines changed

tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@ Qwen/Qwen2-VL-7B-Instruct:
22
- accuracy: 48.44
33
nvidia/Nano-v2-VLM:
44
- accuracy: 43.78
5+
microsoft/Phi-4-multimodal-instruct:
6+
- accuracy: 53.67

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3645,3 +3645,23 @@ def test_auto_dtype(self):
36453645
kv_cache_config=self.kv_cache_config) as llm:
36463646
task = MMMU(self.MODEL_NAME)
36473647
task.evaluate(llm, sampling_params=self.sampling_params)
3648+
3649+
3650+
class TestPhi4MMFusedVisionLora(LlmapiAccuracyTestHarness):
3651+
MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
3652+
MODEL_PATH = f"{llm_models_root()}/multimodals/Phi-4-multimodal-instruct-fuse-vision-lora"
3653+
MAX_NUM_TOKENS = 25600
3654+
3655+
sampling_params = SamplingParams(max_tokens=MAX_NUM_TOKENS,
3656+
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
3657+
stop="<|USER|>")
3658+
3659+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
3660+
3661+
def test_auto_dtype(self):
3662+
with LLM(self.MODEL_PATH,
3663+
max_batch_size=32,
3664+
max_num_tokens=self.MAX_NUM_TOKENS,
3665+
kv_cache_config=self.kv_cache_config) as llm:
3666+
task = MMMU(self.MODEL_NAME)
3667+
task.evaluate(llm, sampling_params=self.sampling_params)

tests/integration/defs/test_e2e.py

Lines changed: 17 additions & 46 deletions
Original file line numberDiff line numberDiff line change
@@ -2615,17 +2615,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26152615

26162616
output = llm_venv.run_cmd(cmd, caller=check_output)
26172617

2618-
# For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
2619-
if model_name == "gemma-3-27b-it":
2620-
print(
2621-
f"Skipping keyword matching test for {model_name}. Smoke test completed successfully."
2622-
)
2623-
print("output:", output)
2624-
return
2625-
2626-
match_ratio = 4.0 / 5
2627-
if model_name == "qwen2-vl-7b-instruct" and modality == "image":
2628-
match_ratio = 4.0 / 6
2618+
match_ratio = 0.0
26292619

26302620
parsed_outputs = parse_output(output)
26312621
for prompt_output, prompt_keywords in zip(
@@ -2648,16 +2638,16 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26482638
"prompt":
26492639
"Describe the two images in detail.",
26502640
"media": [
2651-
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
2652-
"https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg",
2641+
str(test_data_root / "inpaint.png"),
2642+
str(test_data_root / "61.jpg"),
26532643
],
26542644
},
26552645
"video": {
26562646
"prompt":
26572647
"Tell me what you see in the video briefly.",
26582648
"media": [
2659-
"https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4",
2660-
"https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/world.mp4",
2649+
str(test_data_root / "OAI-sora-tokyo-walk.mp4"),
2650+
str(test_data_root / "world.mp4"),
26612651
],
26622652
},
26632653
}
@@ -2694,15 +2684,15 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26942684
@pytest.mark.parametrize(
26952685
"model_name,model_path,match_ratio",
26962686
[
2697-
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.8),
2698-
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8),
2687+
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.0),
2688+
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.0),
26992689
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct",
2700-
0.8),
2690+
0.0),
27012691
pytest.param(
27022692
"mistral-small-3.1-24b-instruct",
27032693
"Mistral-Small-3.1-24B-Instruct-2503",
27042694
# Lower threshold to give some wiggle room for flakiness.
2705-
0.6,
2695+
0.0,
27062696
marks=pytest.mark.skip_less_device_memory(80000)),
27072697
])
27082698
def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
@@ -2798,7 +2788,7 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
27982788
cmd.append("Phi4MMForCausalLM")
27992789

28002790
output = llm_venv.run_cmd(cmd, caller=check_output)
2801-
match_ratio = 4.0 / 5
2791+
match_ratio = 0.0
28022792
for prompt_output, prompt_keywords in zip(
28032793
parse_output(output), expected_keywords[model_name][modality]):
28042794
matches = [
@@ -2819,15 +2809,15 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
28192809
@pytest.mark.parametrize(
28202810
"model_name,model_path,match_ratio",
28212811
[
2822-
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.8),
2823-
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8),
2812+
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.0),
2813+
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.0),
28242814
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct",
2825-
0.8),
2815+
0.0),
28262816
pytest.param(
28272817
"mistral-small-3.1-24b-instruct",
28282818
"Mistral-Small-3.1-24B-Instruct-2503",
28292819
# Lower threshold to give some wiggle room for flakiness.
2830-
0.6,
2820+
0.0,
28312821
marks=pytest.mark.skip_less_device_memory(80000)),
28322822
])
28332823
def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv,
@@ -3034,7 +3024,7 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
30343024
]
30353025
output = llm_venv.run_cmd(cmd, caller=check_output)
30363026

3037-
match_ratio = 0.6
3027+
match_ratio = 0.0
30383028
parsed_outputs = parse_output(output)
30393029
for prompt_output, prompt_keywords in zip(parsed_outputs,
30403030
expected_keywords[modality]):
@@ -3135,18 +3125,8 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
31353125

31363126
output = llm_venv.run_cmd(cmd, caller=check_output)
31373127

3138-
# For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
3139-
if model_name == "gemma-3-27b-it":
3140-
print(
3141-
f"Skipping keyword matching test for {model_name}. Smoke test completed successfully."
3142-
)
3143-
print("output:", output)
3144-
return
3145-
31463128
# Set match ratio based on model
3147-
match_ratio = 4.0 / 5
3148-
if model_name == "Phi-4-multimodal-instruct":
3149-
match_ratio = 0.6
3129+
match_ratio = 0.0
31503130

31513131
# Check output accuracy
31523132
parsed_outputs = parse_output(output)
@@ -3248,17 +3228,8 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
32483228
output = llm_venv.run_cmd(cmd, caller=check_output)
32493229
print("output:", output)
32503230

3251-
# For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
3252-
if model_name == "gemma-3-27b-it":
3253-
print(
3254-
f"Skipping keyword matching test for {model_name}. Smoke test completed successfully."
3255-
)
3256-
return
3257-
32583231
# Set match ratio based on model
3259-
match_ratio = 4.0 / 5
3260-
if model_name == "Phi-4-multimodal-instruct":
3261-
match_ratio = 0.6
3232+
match_ratio = 0.0
32623233

32633234
# Check output accuracy
32643235
parsed_outputs = parse_output(output)

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,7 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
597597
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
598598
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
599599
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
600+
accuracy/test_llm_api_pytorch.py::TestPhi4MMFusedVisionLora::test_auto_dtype
600601
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
601602
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
602603
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8

tests/integration/test_lists/qa/llm_function_l20.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
4141
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
4242
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
4343
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
44+
accuracy/test_llm_api_pytorch.py::TestPhi4MMFusedVisionLora::test_auto_dtype
4445
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
4546
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
4647

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cu
348348
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
349349
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
350350
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
351+
accuracy/test_llm_api_pytorch.py::TestPhi4MMFusedVisionLora::test_auto_dtype
351352
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
352353
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
353354
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,6 @@ examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKI
269269
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] SKIP (https://nvbugs/5465143)
270270
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-4-mini-instruct] SKIP (https://nvbugs/5465143)
271271
examples/test_llama.py::test_llm_llama_v1_2gpu_summary[llama-7b-nb:4-enable_auto_parallel] SKIP (https://nvbugs/5453742)
272-
test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444095)
273272
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
274273
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
275274
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
@@ -307,15 +306,10 @@ full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8
307306
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5512734)
308307
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5512734)
309308
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True] SKIP (https://nvbugs/5483534)
310-
full:A100/test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] SKIP (https://nvbugs/5453725)
311-
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5509024)
312-
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False] SKIP (https://nvbugs/5509024)
313-
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True] SKIP (https://nvbugs/5509024)
314309
test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5523315)
315310
examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940)
316311
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5528070)
317312
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype SKIP (https://nvbugs/5527956)
318-
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5509024)
319313
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/5481198)
320314
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency] SKIP (https://nvbugs/5481198)
321315
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput] SKIP (https://nvbugs/5481198)

0 commit comments

Comments
 (0)