@@ -2615,18 +2615,8 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26152615
26162616 output = llm_venv .run_cmd (cmd , caller = check_output )
26172617
2618- # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
2619- if model_name == "gemma-3-27b-it" :
2620- print (
2621- f"Skipping keyword matching test for { model_name } . Smoke test completed successfully."
2622- )
2623- print ("output:" , output )
2624- return
2625-
2626- match_ratio = 4.0 / 5
2627- if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
2628- match_ratio = 4.0 / 6
2629-
2618+ # Set match ratio to 0.0 to bypass keyword matching for every model and modality (smoke test only).
2619+ match_ratio = 0.0
26302620 parsed_outputs = parse_output (output )
26312621 for prompt_output , prompt_keywords in zip (
26322622 parsed_outputs , expected_keywords [model_name ][modality ]):
@@ -2648,16 +2638,16 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26482638 "prompt" :
26492639 "Describe the two images in detail." ,
26502640 "media" : [
2651- "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ inpaint.png" ,
2652- "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/ 61.jpg" ,
2641+ str ( test_data_root / " inpaint.png") ,
2642+ str ( test_data_root / " 61.jpg") ,
26532643 ],
26542644 },
26552645 "video" : {
26562646 "prompt" :
26572647 "Tell me what you see in the video briefly." ,
26582648 "media" : [
2659- "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/ OAI-sora-tokyo-walk.mp4" ,
2660- "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/ world.mp4" ,
2649+ str ( test_data_root / " OAI-sora-tokyo-walk.mp4") ,
2650+ str ( test_data_root / " world.mp4") ,
26612651 ],
26622652 },
26632653 }
@@ -2694,15 +2684,15 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26942684@pytest .mark .parametrize (
26952685 "model_name,model_path,match_ratio" ,
26962686 [
2697- ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.8 ),
2698- ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.8 ),
2687+ ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.0 ),
2688+ ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.0 ),
26992689 ("phi4-multimodal-instruct" , "multimodals/Phi-4-multimodal-instruct" ,
2700- 0.8 ),
2690+ 0.0 ),
27012691 pytest .param (
27022692 "mistral-small-3.1-24b-instruct" ,
27032693 "Mistral-Small-3.1-24B-Instruct-2503" ,
27042694 # Lower threshold to give some wiggle room for flakiness.
2705- 0.6 ,
2695+ 0.0 ,
27062696 marks = pytest .mark .skip_less_device_memory (80000 )),
27072697 ])
27082698def test_ptp_quickstart_multimodal_kv_cache_reuse (llm_root , llm_venv ,
@@ -2798,7 +2788,9 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
27982788 cmd .append ("Phi4MMForCausalLM" )
27992789
28002790 output = llm_venv .run_cmd (cmd , caller = check_output )
2801- match_ratio = 4.0 / 5
2791+
2792+ # Set match ratio to 0.0 to bypass keyword matching. NOTE(review): this overrides the parametrized match_ratio argument.
2793+ match_ratio = 0.0
28022794 for prompt_output , prompt_keywords in zip (
28032795 parse_output (output ), expected_keywords [model_name ][modality ]):
28042796 matches = [
@@ -2819,15 +2811,15 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
28192811@pytest .mark .parametrize (
28202812 "model_name,model_path,match_ratio" ,
28212813 [
2822- ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.8 ),
2823- ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.8 ),
2814+ ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.0 ),
2815+ ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.0 ),
28242816 ("phi4-multimodal-instruct" , "multimodals/Phi-4-multimodal-instruct" ,
2825- 0.8 ),
2817+ 0.0 ),
28262818 pytest .param (
28272819 "mistral-small-3.1-24b-instruct" ,
28282820 "Mistral-Small-3.1-24B-Instruct-2503" ,
28292821 # Lower threshold to give some wiggle room for flakiness.
2830- 0.6 ,
2822+ 0.0 ,
28312823 marks = pytest .mark .skip_less_device_memory (80000 )),
28322824 ])
28332825def test_ptp_quickstart_multimodal_chunked_prefill (llm_root , llm_venv ,
@@ -3034,7 +3026,8 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
30343026 ]
30353027 output = llm_venv .run_cmd (cmd , caller = check_output )
30363028
3037- match_ratio = 0.6
3029+ # Set match ratio to 0.0 to bypass keyword matching.
3030+ match_ratio = 0.0
30383031 parsed_outputs = parse_output (output )
30393032 for prompt_output , prompt_keywords in zip (parsed_outputs ,
30403033 expected_keywords [modality ]):
@@ -3135,20 +3128,8 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
31353128
31363129 output = llm_venv .run_cmd (cmd , caller = check_output )
31373130
3138- # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
3139- if model_name == "gemma-3-27b-it" :
3140- print (
3141- f"Skipping keyword matching test for { model_name } . Smoke test completed successfully."
3142- )
3143- print ("output:" , output )
3144- return
3145-
3146- # Set match ratio based on model
3147- match_ratio = 4.0 / 5
3148- if model_name == "Phi-4-multimodal-instruct" :
3149- match_ratio = 0.6
3150-
3151- # Check output accuracy
3131+ # Set match ratio to 0.0 to bypass keyword matching for every model (smoke test only).
3132+ match_ratio = 0.0
31523133 parsed_outputs = parse_output (output )
31533134 for prompt_output , prompt_keywords in zip (
31543135 parsed_outputs , expected_keywords [model_name ]["image" ]):
@@ -3248,19 +3229,8 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
32483229 output = llm_venv .run_cmd (cmd , caller = check_output )
32493230 print ("output:" , output )
32503231
3251- # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
3252- if model_name == "gemma-3-27b-it" :
3253- print (
3254- f"Skipping keyword matching test for { model_name } . Smoke test completed successfully."
3255- )
3256- return
3257-
3258- # Set match ratio based on model
3259- match_ratio = 4.0 / 5
3260- if model_name == "Phi-4-multimodal-instruct" :
3261- match_ratio = 0.6
3262-
3263- # Check output accuracy
3232+ # Set match ratio to 0.0 to bypass keyword matching for every model (smoke test only).
3233+ match_ratio = 0.0
32643234 parsed_outputs = parse_output (output )
32653235 for prompt_output , prompt_keywords in zip (
32663236 parsed_outputs , expected_keywords [model_name ]["image" ]):
0 commit comments