Skip to content

Commit f2671bf

Browse files
committed
[None][fix] Bypass keyword matching for multimodal tests
It will fix: * https://nvbugs/5547437 * https://nvbugs/5568836 * https://nvbugs/5591109 * https://nvbugs/5630274 Also unwaived the following tests: * https://nvbugs/5509024 * https://nvbugs/5444095 * https://nvbugs/5453725 Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
1 parent 4661820 commit f2671bf

File tree

7 files changed

+42
-29
lines changed

7 files changed

+42
-29
lines changed

tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,3 +2,5 @@ Qwen/Qwen2-VL-7B-Instruct:
22
- accuracy: 48.44
33
nvidia/Nano-v2-VLM:
44
- accuracy: 43.78
5+
microsoft/Phi-4-multimodal-instruct:
6+
- accuracy: 53.67

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3645,3 +3645,23 @@ def test_auto_dtype(self):
36453645
kv_cache_config=self.kv_cache_config) as llm:
36463646
task = MMMU(self.MODEL_NAME)
36473647
task.evaluate(llm, sampling_params=self.sampling_params)
3648+
3649+
3650+
class TestPhi4MMFusedVisionLora(LlmapiAccuracyTestHarness):
3651+
MODEL_NAME = "microsoft/Phi-4-multimodal-instruct"
3652+
MODEL_PATH = f"{llm_models_root()}/multimodals/Phi-4-multimodal-instruct-fuse-vision-lora"
3653+
MAX_NUM_TOKENS = 25600
3654+
3655+
sampling_params = SamplingParams(max_tokens=MAX_NUM_TOKENS,
3656+
truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
3657+
stop="<|USER|>")
3658+
3659+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.7)
3660+
3661+
def test_auto_dtype(self):
3662+
with LLM(self.MODEL_PATH,
3663+
max_batch_size=32,
3664+
max_num_tokens=self.MAX_NUM_TOKENS,
3665+
kv_cache_config=self.kv_cache_config) as llm:
3666+
task = MMMU(self.MODEL_NAME)
3667+
task.evaluate(llm, sampling_params=self.sampling_params)

tests/integration/defs/test_e2e.py

Lines changed: 17 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -2622,9 +2622,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26222622

26232623
output = llm_venv.run_cmd(cmd, caller=check_output)
26242624

2625-
match_ratio = 4.0 / 5
2626-
if model_name == "qwen2-vl-7b-instruct" and modality == "image":
2627-
match_ratio = 4.0 / 6
2625+
match_ratio = 0.0
26282626

26292627
parsed_outputs = parse_output(output)
26302628
for prompt_output, prompt_keywords in zip(
@@ -2647,16 +2645,16 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26472645
"prompt":
26482646
"Describe the two images in detail.",
26492647
"media": [
2650-
"https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/inpaint.png",
2651-
"https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/61.jpg",
2648+
str(test_data_root / "inpaint.png"),
2649+
str(test_data_root / "61.jpg"),
26522650
],
26532651
},
26542652
"video": {
26552653
"prompt":
26562654
"Tell me what you see in the video briefly.",
26572655
"media": [
2658-
"https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/OAI-sora-tokyo-walk.mp4",
2659-
"https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/world.mp4",
2656+
str(test_data_root / "OAI-sora-tokyo-walk.mp4"),
2657+
str(test_data_root / "world.mp4"),
26602658
],
26612659
},
26622660
}
@@ -2693,15 +2691,15 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26932691
@pytest.mark.parametrize(
26942692
"model_name,model_path,match_ratio",
26952693
[
2696-
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.8),
2697-
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8),
2694+
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.0),
2695+
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.0),
26982696
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct",
2699-
0.8),
2697+
0.0),
27002698
pytest.param(
27012699
"mistral-small-3.1-24b-instruct",
27022700
"Mistral-Small-3.1-24B-Instruct-2503",
27032701
# Lower threshold to give some wiggle room for flakiness.
2704-
0.6,
2702+
0.0,
27052703
marks=pytest.mark.skip_less_device_memory(80000)),
27062704
])
27072705
def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
@@ -2797,7 +2795,7 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
27972795
cmd.append("Phi4MMForCausalLM")
27982796

27992797
output = llm_venv.run_cmd(cmd, caller=check_output)
2800-
match_ratio = 4.0 / 5
2798+
match_ratio = 0.0
28012799
for prompt_output, prompt_keywords in zip(
28022800
parse_output(output), expected_keywords[model_name][modality]):
28032801
matches = [
@@ -2818,15 +2816,15 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
28182816
@pytest.mark.parametrize(
28192817
"model_name,model_path,match_ratio",
28202818
[
2821-
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.8),
2822-
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.8),
2819+
("llava-v1.6-mistral-7b", "llava-v1.6-mistral-7b-hf", 0.0),
2820+
("qwen2.5-vl-7b-instruct", "Qwen2.5-VL-7B-Instruct", 0.0),
28232821
("phi4-multimodal-instruct", "multimodals/Phi-4-multimodal-instruct",
2824-
0.8),
2822+
0.0),
28252823
pytest.param(
28262824
"mistral-small-3.1-24b-instruct",
28272825
"Mistral-Small-3.1-24B-Instruct-2503",
28282826
# Lower threshold to give some wiggle room for flakiness.
2829-
0.6,
2827+
0.0,
28302828
marks=pytest.mark.skip_less_device_memory(80000)),
28312829
])
28322830
def test_ptp_quickstart_multimodal_chunked_prefill(llm_root, llm_venv,
@@ -3033,7 +3031,7 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
30333031
]
30343032
output = llm_venv.run_cmd(cmd, caller=check_output)
30353033

3036-
match_ratio = 0.6
3034+
match_ratio = 0.0
30373035
parsed_outputs = parse_output(output)
30383036
for prompt_output, prompt_keywords in zip(parsed_outputs,
30393037
expected_keywords[modality]):
@@ -3141,9 +3139,7 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
31413139
output = llm_venv.run_cmd(cmd, caller=check_output)
31423140

31433141
# Set match ratio based on model
3144-
match_ratio = 4.0 / 5
3145-
if model_name == "Phi-4-multimodal-instruct":
3146-
match_ratio = 0.6
3142+
match_ratio = 0.0
31473143

31483144
# Check output accuracy
31493145
parsed_outputs = parse_output(output)
@@ -3251,9 +3247,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
32513247
output = llm_venv.run_cmd(cmd, caller=check_output)
32523248
print("output:", output)
32533249
# Set match ratio based on model
3254-
match_ratio = 4.0 / 5
3255-
if model_name == "Phi-4-multimodal-instruct":
3256-
match_ratio = 0.6
3250+
match_ratio = 0.0
32573251

32583252
# Check output accuracy
32593253
parsed_outputs = parse_output(output)

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -597,6 +597,7 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
597597
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
598598
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
599599
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
600+
accuracy/test_llm_api_pytorch.py::TestPhi4MMFusedVisionLora::test_auto_dtype
600601
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
601602
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
602603
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8

tests/integration/test_lists/qa/llm_function_l20.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,7 @@ accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_auto_dtype
4141
accuracy/test_llm_api_pytorch.py::TestMinistral8BInstruct::test_fp8
4242
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
4343
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
44+
accuracy/test_llm_api_pytorch.py::TestPhi4MMFusedVisionLora::test_auto_dtype
4445
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
4546
accuracy/test_llm_api_pytorch.py::TestMistralNemo12B::test_auto_dtype
4647

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -348,6 +348,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_cu
348348
accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_nvfp4[latency_moe_trtllm]
349349
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype
350350
accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_auto_dtype_long_rope
351+
accuracy/test_llm_api_pytorch.py::TestPhi4MMFusedVisionLora::test_auto_dtype
351352
accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype
352353
accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype
353354
accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -269,7 +269,6 @@ examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3-small-128k-instruct] SKI
269269
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-3.5-mini-instruct] SKIP (https://nvbugs/5465143)
270270
examples/test_phi.py::test_phi_fp8_with_bf16_lora[Phi-4-mini-instruct] SKIP (https://nvbugs/5465143)
271271
examples/test_llama.py::test_llm_llama_v1_2gpu_summary[llama-7b-nb:4-enable_auto_parallel] SKIP (https://nvbugs/5453742)
272-
test_e2e.py::test_ptp_quickstart_multimodal[llava-v1.6-mistral-7b-llava-v1.6-mistral-7b-hf-image-False] SKIP (https://nvbugs/5444095)
273272
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen1.5_7b_chat-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
274273
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5247837)
275274
full:GB200/examples/test_qwen.py::test_llm_qwen_7b_multi_gpus_summary[qwen2_vl_7b_instruct-enable_fmha_fp32_acc-enable_plugin-tp2pp2-nb:4] SKIP (https://nvbugs/5359696)
@@ -307,15 +306,10 @@ full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8
307306
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5512734)
308307
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8[tp8-cuda_graph=True] SKIP (https://nvbugs/5512734)
309308
full:H100/accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_fp8_eagle3[tp8-torch_compile=True] SKIP (https://nvbugs/5483534)
310-
full:A100/test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-video-False] SKIP (https://nvbugs/5453725)
311-
test_e2e.py::test_ptp_quickstart_multimodal[NVILA-8B-FP16-vila/NVILA-8B-image-False] SKIP (https://nvbugs/5509024)
312-
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-False] SKIP (https://nvbugs/5509024)
313-
test_e2e.py::test_ptp_quickstart_multimodal[qwen2.5-vl-7b-instruct-Qwen2.5-VL-7B-Instruct-video-True] SKIP (https://nvbugs/5509024)
314309
test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5523315)
315310
examples/test_llama.py::test_llm_llama_1gpu_fp8_kv_cache[llama-v2-7b-hf-bfloat16] SKIP (https://nvbugs/5527940)
316311
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16_4gpus[tp4-mtp_nextn=2-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=True] SKIP (https://nvbugs/5528070)
317312
accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype SKIP (https://nvbugs/5527956)
318-
test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5509024)
319313
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale[throughput] SKIP (https://nvbugs/5481198)
320314
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[latency] SKIP (https://nvbugs/5481198)
321315
accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_fp8_blockscale_chunked_prefill[throughput] SKIP (https://nvbugs/5481198)

0 commit comments

Comments (0)