@@ -2615,17 +2615,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26152615
26162616 output = llm_venv .run_cmd (cmd , caller = check_output )
26172617
2618- # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
2619- if model_name == "gemma-3-27b-it" :
2620- print (
2621- f"Skipping keyword matching test for { model_name } . Smoke test completed successfully."
2622- )
2623- print ("output:" , output )
2624- return
2625-
2626- match_ratio = 4.0 / 5
2627- if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
2628- match_ratio = 4.0 / 6
2618+ match_ratio = 0.0
26292619
26302620 parsed_outputs = parse_output (output )
26312621 for prompt_output , prompt_keywords in zip (
@@ -2648,16 +2638,16 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26482638 "prompt" :
26492639 "Describe the two images in detail." ,
26502640 "media" : [
2651- "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ inpaint.png" ,
2652- "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/ 61.jpg" ,
2641+ str ( test_data_root / " inpaint.png") ,
2642+ str ( test_data_root / " 61.jpg") ,
26532643 ],
26542644 },
26552645 "video" : {
26562646 "prompt" :
26572647 "Tell me what you see in the video briefly." ,
26582648 "media" : [
2659- "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/ OAI-sora-tokyo-walk.mp4" ,
2660- "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/ world.mp4" ,
2649+ str ( test_data_root / " OAI-sora-tokyo-walk.mp4") ,
2650+ str ( test_data_root / " world.mp4") ,
26612651 ],
26622652 },
26632653 }
@@ -2694,15 +2684,15 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26942684@pytest .mark .parametrize (
26952685 "model_name,model_path,match_ratio" ,
26962686 [
2697- ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.8 ),
2698- ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.8 ),
2687+ ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.0 ),
2688+ ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.0 ),
26992689 ("phi4-multimodal-instruct" , "multimodals/Phi-4-multimodal-instruct" ,
2700- 0.8 ),
2690+ 0.0 ),
27012691 pytest .param (
27022692 "mistral-small-3.1-24b-instruct" ,
27032693 "Mistral-Small-3.1-24B-Instruct-2503" ,
27042694 # Same 0.0 threshold as every other entry in this list.
2705- 0.6 ,
2695+ 0.0 ,
27062696 marks = pytest .mark .skip_less_device_memory (80000 )),
27072697 ])
27082698def test_ptp_quickstart_multimodal_kv_cache_reuse (llm_root , llm_venv ,
@@ -2798,7 +2788,7 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
27982788 cmd .append ("Phi4MMForCausalLM" )
27992789
28002790 output = llm_venv .run_cmd (cmd , caller = check_output )
2801- match_ratio = 4.0 / 5
2791+ match_ratio = 0.0
28022792 for prompt_output , prompt_keywords in zip (
28032793 parse_output (output ), expected_keywords [model_name ][modality ]):
28042794 matches = [
@@ -2819,15 +2809,15 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
28192809@pytest .mark .parametrize (
28202810 "model_name,model_path,match_ratio" ,
28212811 [
2822- ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.8 ),
2823- ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.8 ),
2812+ ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.0 ),
2813+ ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.0 ),
28242814 ("phi4-multimodal-instruct" , "multimodals/Phi-4-multimodal-instruct" ,
2825- 0.8 ),
2815+ 0.0 ),
28262816 pytest .param (
28272817 "mistral-small-3.1-24b-instruct" ,
28282818 "Mistral-Small-3.1-24B-Instruct-2503" ,
28292819 # Same 0.0 threshold as every other entry in this list.
2830- 0.6 ,
2820+ 0.0 ,
28312821 marks = pytest .mark .skip_less_device_memory (80000 )),
28322822 ])
28332823def test_ptp_quickstart_multimodal_chunked_prefill (llm_root , llm_venv ,
@@ -3034,7 +3024,7 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
30343024 ]
30353025 output = llm_venv .run_cmd (cmd , caller = check_output )
30363026
3037- match_ratio = 0.6
3027+ match_ratio = 0.0
30383028 parsed_outputs = parse_output (output )
30393029 for prompt_output , prompt_keywords in zip (parsed_outputs ,
30403030 expected_keywords [modality ]):
@@ -3135,18 +3125,8 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
31353125
31363126 output = llm_venv .run_cmd (cmd , caller = check_output )
31373127
3138- # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
3139- if model_name == "gemma-3-27b-it" :
3140- print (
3141- f"Skipping keyword matching test for { model_name } . Smoke test completed successfully."
3142- )
3143- print ("output:" , output )
3144- return
3145-
31463128 # Set match ratio (a single value, 0.0, for every model)
3147- match_ratio = 4.0 / 5
3148- if model_name == "Phi-4-multimodal-instruct" :
3149- match_ratio = 0.6
3129+ match_ratio = 0.0
31503130
31513131 # Check output accuracy
31523132 parsed_outputs = parse_output (output )
@@ -3248,17 +3228,8 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
32483228 output = llm_venv .run_cmd (cmd , caller = check_output )
32493229 print ("output:" , output )
32503230
3251- # For gemma-3-27b-it, we only smoke test the model. Keyword matching is flaky.
3252- if model_name == "gemma-3-27b-it" :
3253- print (
3254- f"Skipping keyword matching test for { model_name } . Smoke test completed successfully."
3255- )
3256- return
3257-
32583231 # Set match ratio (a single value, 0.0, for every model)
3259- match_ratio = 4.0 / 5
3260- if model_name == "Phi-4-multimodal-instruct" :
3261- match_ratio = 0.6
3232+ match_ratio = 0.0
32623233
32633234 # Check output accuracy
32643235 parsed_outputs = parse_output (output )
0 commit comments