@@ -2622,9 +2622,7 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26222622
26232623 output = llm_venv .run_cmd (cmd , caller = check_output )
26242624
2625- match_ratio = 4.0 / 5
2626- if model_name == "qwen2-vl-7b-instruct" and modality == "image" :
2627- match_ratio = 4.0 / 6
2625+ match_ratio = 0.0
26282626
26292627 parsed_outputs = parse_output (output )
26302628 for prompt_output , prompt_keywords in zip (
@@ -2647,16 +2645,16 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26472645 "prompt" :
26482646 "Describe the two images in detail." ,
26492647 "media" : [
2650- "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/ inpaint.png" ,
2651- "https://huggingface.co/datasets/Sayali9141/traffic_signal_images/resolve/main/ 61.jpg" ,
2648+ str ( test_data_root / " inpaint.png") ,
2649+ str ( test_data_root / " 61.jpg") ,
26522650 ],
26532651 },
26542652 "video" : {
26552653 "prompt" :
26562654 "Tell me what you see in the video briefly." ,
26572655 "media" : [
2658- "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/ OAI-sora-tokyo-walk.mp4" ,
2659- "https://huggingface.co/datasets/Efficient-Large-Model/VILA-inference-demos/resolve/main/ world.mp4" ,
2656+ str ( test_data_root / " OAI-sora-tokyo-walk.mp4") ,
2657+ str ( test_data_root / " world.mp4") ,
26602658 ],
26612659 },
26622660 }
@@ -2693,15 +2691,15 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
26932691@pytest .mark .parametrize (
26942692 "model_name,model_path,match_ratio" ,
26952693 [
2696- ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.8 ),
2697- ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.8 ),
2694+ ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.0 ),
2695+ ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.0 ),
26982696 ("phi4-multimodal-instruct" , "multimodals/Phi-4-multimodal-instruct" ,
2699- 0.8 ),
2697+ 0.0 ),
27002698 pytest .param (
27012699 "mistral-small-3.1-24b-instruct" ,
27022700 "Mistral-Small-3.1-24B-Instruct-2503" ,
27032701 # Lower threshold to give some wiggle room for flakiness.
2704- 0.6 ,
2702+ 0.0 ,
27052703 marks = pytest .mark .skip_less_device_memory (80000 )),
27062704 ])
27072705def test_ptp_quickstart_multimodal_kv_cache_reuse (llm_root , llm_venv ,
@@ -2797,7 +2795,7 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
27972795 cmd .append ("Phi4MMForCausalLM" )
27982796
27992797 output = llm_venv .run_cmd (cmd , caller = check_output )
2800- match_ratio = 4.0 / 5
2798+ match_ratio = 0.0
28012799 for prompt_output , prompt_keywords in zip (
28022800 parse_output (output ), expected_keywords [model_name ][modality ]):
28032801 matches = [
@@ -2818,15 +2816,15 @@ def test_ptp_quickstart_multimodal_kv_cache_reuse(llm_root, llm_venv,
28182816@pytest .mark .parametrize (
28192817 "model_name,model_path,match_ratio" ,
28202818 [
2821- ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.8 ),
2822- ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.8 ),
2819+ ("llava-v1.6-mistral-7b" , "llava-v1.6-mistral-7b-hf" , 0.0 ),
2820+ ("qwen2.5-vl-7b-instruct" , "Qwen2.5-VL-7B-Instruct" , 0.0 ),
28232821 ("phi4-multimodal-instruct" , "multimodals/Phi-4-multimodal-instruct" ,
2824- 0.8 ),
2822+ 0.0 ),
28252823 pytest .param (
28262824 "mistral-small-3.1-24b-instruct" ,
28272825 "Mistral-Small-3.1-24B-Instruct-2503" ,
28282826 # Lower threshold to give some wiggle room for flakiness.
2829- 0.6 ,
2827+ 0.0 ,
28302828 marks = pytest .mark .skip_less_device_memory (80000 )),
28312829 ])
28322830def test_ptp_quickstart_multimodal_chunked_prefill (llm_root , llm_venv ,
@@ -3033,7 +3031,7 @@ def test_ptp_quickstart_multimodal_phi4mm(llm_root, llm_venv, modality):
30333031 ]
30343032 output = llm_venv .run_cmd (cmd , caller = check_output )
30353033
3036- match_ratio = 0.6
3034+ match_ratio = 0.0
30373035 parsed_outputs = parse_output (output )
30383036 for prompt_output , prompt_keywords in zip (parsed_outputs ,
30393037 expected_keywords [modality ]):
@@ -3141,9 +3139,7 @@ def test_ptp_quickstart_multimodal_2gpu(llm_root, llm_venv, model_name,
31413139 output = llm_venv .run_cmd (cmd , caller = check_output )
31423140
31433141 # Set match ratio based on model
3144- match_ratio = 4.0 / 5
3145- if model_name == "Phi-4-multimodal-instruct" :
3146- match_ratio = 0.6
3142+ match_ratio = 0.0
31473143
31483144 # Check output accuracy
31493145 parsed_outputs = parse_output (output )
@@ -3251,9 +3247,7 @@ def test_ptp_quickstart_multimodal_multiturn(llm_root, llm_venv, model_name,
32513247 output = llm_venv .run_cmd (cmd , caller = check_output )
32523248 print ("output:" , output )
32533249 # Set match ratio based on model
3254- match_ratio = 4.0 / 5
3255- if model_name == "Phi-4-multimodal-instruct" :
3256- match_ratio = 0.6
3250+ match_ratio = 0.0
32573251
32583252 # Check output accuracy
32593253 parsed_outputs = parse_output (output )
0 commit comments