@@ -2536,9 +2536,6 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
 @pytest.mark.parametrize("use_cuda_graph", [False, True])
 @pytest.mark.parametrize("modality", ["image", "video", "mixture_text_image"])
 @pytest.mark.parametrize("model_name,model_path", [
-    pytest.param("mistral-small-3.1-24b-instruct",
-                 "Mistral-Small-3.1-24B-Instruct-2503",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param(
         "Nano-v2-VLM",
         "Nano-v2-VLM",
@@ -2588,21 +2585,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         }
     }
 
-    expected_keywords = {
-        "mistral-small-3.1-24b-instruct": {
-            "image": [
-                ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "monolith", "formation"],
-                [
-                    "multi-lane", "highway", "moderate", "traffic", "flow",
-                    "vehicles", "congestion"
-                ],
-            ],
-            "mixture_text_image":
-            [["invention", "person", "scientists", "Lick", "engineers"],
-             ["landscape", "trees", "road", "depicts", "scenic"]]
-        },
-    }
+    # TODO: remove this entire test if there are no plans to extend it for Nano v2 VL.
+    expected_keywords = {}
+
+    if model_name in expected_keywords and modality not in expected_keywords[model_name]:
+        pytest.skip(f"{modality=} not supported for {model_name}")
 
     cmd = [
         str(example_root / "quickstart_multimodal.py"),
@@ -2620,19 +2607,13 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
     if use_cuda_graph:
         cmd.append("--use_cuda_graph")
 
-    output = llm_venv.run_cmd(cmd, caller=check_output)
-
-    match_ratio = 4.0 / 5
-    parsed_outputs = parse_output(output)
-    for prompt_output, prompt_keywords in zip(
-            parsed_outputs, expected_keywords[model_name][modality]):
-        matches = [
-            keyword in prompt_output.lower() for keyword in prompt_keywords
-        ]
-        obs_match_ratio = 1. * sum(matches) / len(matches)
-        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\nMatched keywords: {matches}\nObserved match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
+    _ = llm_venv.run_cmd(cmd, caller=check_output)
 
-    print("All answers are correct!")
+    # NOTE: We deliberately do not check the LLM outputs against keyword match ratios as in
+    # the other tests, since keyword matching can be brittle and cause flakiness in CI.
+    # This test is now a smoke / functional test.
+    # Proper accuracy tests should be added to
+    # `tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py`.
 
 
 @pytest.mark.parametrize("modality", ["image", "video"])
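For reference, the verification logic removed above is a keyword match-ratio check: each prompt's output had to contain at least 4/5 of its expected keywords, case-insensitively. Below is a minimal, self-contained sketch of that technique as it could be ported into the accuracy suite mentioned in the NOTE; `keyword_match_ratio` and `assert_keywords` are illustrative names, not existing TensorRT-LLM test helpers.

from typing import Iterable


def keyword_match_ratio(output: str, keywords: Iterable[str]) -> float:
    """Return the fraction of expected keywords found in the output (case-insensitive)."""
    hits = [keyword.lower() in output.lower() for keyword in keywords]
    return sum(hits) / len(hits)


def assert_keywords(output: str, keywords: list, threshold: float = 4.0 / 5) -> None:
    """Fail when the output matches fewer than `threshold` of the expected keywords."""
    ratio = keyword_match_ratio(output, keywords)
    assert ratio >= threshold, (
        f'Incorrect output!\nGenerated "{output}"\n'
        f'Expected keywords "{keywords}"\n'
        f"Observed match ratio {ratio} below threshold {threshold}")


if __name__ == "__main__":
    # Toy usage mirroring the removed loop: one parsed output per prompt.
    parsed_outputs = [
        "A dramatic seascape with turbulent ocean waves under a dark sky."
    ]
    expected = [["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"]]
    for prompt_output, prompt_keywords in zip(parsed_outputs, expected):
        assert_keywords(prompt_output, prompt_keywords)
    print("All answers are correct!")

The trade-off called out in the NOTE still applies: a fixed threshold over a small keyword list is sensitive to benign rewordings of a correct answer, which is why the quickstart test above was downgraded to a smoke test and accuracy checking deferred to the dedicated suite.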