
Commit 6099568

[https://nvbugs/5669097][tests] Add MMMU test for mistral small
Signed-off-by: William Zhang <133824995+2ez4bz@users.noreply.github.com>
1 parent e033129 commit 6099568

File tree

8 files changed (+39 −44 lines changed)


tests/integration/defs/accuracy/references/mmmu.yaml

Lines changed: 2 additions & 0 deletions
@@ -27,3 +27,5 @@ Qwen/Qwen3-VL-30B-A3B-Instruct:
 mistral/Mistral-Large-3-675B:
   # Mistral Large 3 675B only supports single image input, so accuracy is lower.
   - accuracy: 47
+mistralai/Mistral-Small-3.1-24B-Instruct-2503:
+  - accuracy: 57.0
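
This commit does not show how the reference file is read; as a minimal sketch (assuming PyYAML, and a hypothetical check_reference helper rather than the harness's actual loader), gating an observed MMMU score against an entry like the one added above could look like:

import yaml

def check_reference(references_path: str, model_name: str, observed: float) -> None:
    # Hypothetical helper: each model key maps to a list of reference dicts,
    # e.g. [{"accuracy": 57.0}], per the YAML shown above.
    with open(references_path) as f:
        references = yaml.safe_load(f)
    threshold = references[model_name][0]["accuracy"]
    assert observed >= threshold, (
        f"{model_name}: MMMU accuracy {observed} below reference {threshold}")

# Example observed score; 57.5 is illustrative only.
check_reference("tests/integration/defs/accuracy/references/mmmu.yaml",
                "mistralai/Mistral-Small-3.1-24B-Instruct-2503", 57.5)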

tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py

Lines changed: 25 additions & 0 deletions
@@ -327,3 +327,28 @@ def test_nvfp4_4gpus(
         ) as llm:
             task = MMMU(self.MODEL_NAME)
             task.evaluate(llm, sampling_params=self.sampling_params)
+
+
+class TestMistralSmall24B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "mistralai/Mistral-Small-3.1-24B-Instruct-2503"
+    MODEL_PATH = f"{llm_models_root()}/Mistral-Small-3.1-24B-Instruct-2503"
+    MAX_NUM_TOKENS = 16384
+
+    # NOTE: MMMU adds <|endoftext|> to the stop token.
+    sampling_params = SamplingParams(
+        max_tokens=MMMU.MAX_OUTPUT_LEN,
+        truncate_prompt_tokens=MMMU.MAX_INPUT_LEN,
+        stop="<|endoftext|>",
+    )
+
+    @pytest.mark.skip_less_device_memory(80000)
+    def test_auto_dtype(self):
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75)
+        with LLM(
+                self.MODEL_PATH,
+                kv_cache_config=kv_cache_config,
+                enable_chunked_prefill=True,
+                max_num_tokens=self.MAX_NUM_TOKENS,
+        ) as llm:
+            task = MMMU(self.MODEL_NAME)
+            task.evaluate(llm, sampling_params=self.sampling_params)
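
As a usage note, the new test can be selected directly by its pytest node ID (the same ID added to the H100 test list below), assuming the model weights are available under llm_models_root():

pytest "tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype"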

tests/integration/defs/test_e2e.py

Lines changed: 11 additions & 30 deletions
@@ -2536,9 +2536,6 @@ def test_ptp_quickstart_advanced_mixed_precision(llm_root, llm_venv):
 @pytest.mark.parametrize("use_cuda_graph", [False, True])
 @pytest.mark.parametrize("modality", ["image", "video", "mixture_text_image"])
 @pytest.mark.parametrize("model_name,model_path", [
-    pytest.param("mistral-small-3.1-24b-instruct",
-                 "Mistral-Small-3.1-24B-Instruct-2503",
-                 marks=pytest.mark.skip_less_device_memory(80000)),
     pytest.param(
         "Nano-v2-VLM",
         "Nano-v2-VLM",
@@ -2588,21 +2585,11 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
         }
     }
 
-    expected_keywords = {
-        "mistral-small-3.1-24b-instruct": {
-            "image": [
-                ["dramatic", "seascape", "ocean", "turbulent", "waves", "dark"],
-                ["scenic", "rock", "landscape", "monolith", "formation"],
-                [
-                    "multi-lane", "highway", "moderate", "traffic", "flow",
-                    "vehicles", "congestion"
-                ],
-            ],
-            "mixture_text_image":
-            [["invention", "person", "scientists", "Lick", "engineers"],
-             ["landscape", "trees", "road", "depicts", "scenic"]]
-        },
-    }
+    # TODO: remove this entire test if there are no plans to extend them for Nano v2 VL.
+    expected_keywords = {}
+
+    if modality not in expected_keywords[model_name]:
+        pytest.skip(f"{modality=} not supported for {model_name}")
 
     cmd = [
         str(example_root / "quickstart_multimodal.py"),
@@ -2620,19 +2607,13 @@ def test_ptp_quickstart_multimodal(llm_root, llm_venv, model_name, model_path,
     if use_cuda_graph:
         cmd.append("--use_cuda_graph")
 
-    output = llm_venv.run_cmd(cmd, caller=check_output)
-
-    match_ratio = 4.0 / 5
-    parsed_outputs = parse_output(output)
-    for prompt_output, prompt_keywords in zip(
-            parsed_outputs, expected_keywords[model_name][modality]):
-        matches = [
-            keyword in prompt_output.lower() for keyword in prompt_keywords
-        ]
-        obs_match_ratio = 1. * sum(matches) / len(matches)
-        assert obs_match_ratio >= match_ratio, f"Incorrect output!\nGenerated \"{prompt_output}\"\nExpected keywords \"{prompt_keywords}\"\n Matched keywords: {matches}\n Observed match ratio {obs_match_ratio} below threshold {match_ratio}\n\nParsed output for all prompts: {parsed_outputs}"
+    _ = llm_venv.run_cmd(cmd, caller=check_output)
 
-    print("All answers are correct!")
+    # NOTE: we deliberately do not check the LLM outputs with keyword matching ratios as in the
+    # other tests, as it can be brittle and cause flakiness in CI.
+    # This test now becomes a smoke / functional test.
+    # Proper accuracy tests should be added to
+    # `tests/integration/defs/accuracy/test_llm_api_pytorch_multimodal.py`.
 
 
 @pytest.mark.parametrize("modality", ["image", "video"])

tests/integration/test_lists/qa/llm_function_core.txt

Lines changed: 0 additions & 3 deletions
@@ -741,9 +741,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]

tests/integration/test_lists/qa/llm_function_core_sanity.txt

Lines changed: 0 additions & 3 deletions
@@ -280,9 +280,6 @@ test_e2e.py::test_ptp_quickstart_advanced[Llama3.2-11B-BF16-llama-3.2-models/Lla
 test_e2e.py::test_ptp_quickstart_advanced[Qwen3-30B-A3B-Qwen3/Qwen3-30B-A3B]
 test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
 test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_scaffolding[DeepSeek-R1-Distill-Qwen-7B-DeepSeek-R1/DeepSeek-R1-Distill-Qwen-7B]
 test_e2e.py::test_ptp_quickstart_advanced_deepseek_r1_w4afp8_8gpus[DeepSeek-R1-W4AFP8-DeepSeek-R1/DeepSeek-R1-W4AFP8]
 test_e2e.py::test_ptp_star_attention_example[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B]

tests/integration/test_lists/qa/llm_function_nim.txt

Lines changed: 0 additions & 3 deletions
@@ -276,9 +276,6 @@ test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-M
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP8-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8-True]
 test_e2e.py::test_ptp_quickstart_advanced_8gpus_chunked_prefill_sq_22k[Llama-4-Scout-17B-16E-Instruct-FP4-llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4-True]
 test_e2e.py::test_relaxed_acceptance_quickstart_advanced_deepseek_r1_8gpus[DeepSeek-R1-DeepSeek-R1/DeepSeek-R1]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False]
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image]
 test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio]

tests/integration/test_lists/test-db/l0_h100.yml

Lines changed: 1 addition & 2 deletions
@@ -72,6 +72,7 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_chunked_prefill[quant_dtype=fp8-kv_cache_reuse=True-fp8kv=True-overlap_scheduler=True]
 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_dummy_load_format
 - accuracy/test_llm_api_pytorch_multimodal.py::TestGemma3_27BInstruct::test_fp8_prequantized
+- accuracy/test_llm_api_pytorch_multimodal.py::TestMistralSmall24B::test_auto_dtype
 - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_dummy_load_format
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False]
@@ -284,8 +285,6 @@ l0_h100:
 - accuracy/test_llm_api_pytorch.py::TestStarcoder2_3B::test_auto_dtype
 - accuracy/test_llm_api_pytorch.py::TestStarcoder2_7B::test_auto_dtype
 - accuracy/test_llm_api_pytorch.py::TestStarcoder2_15B::test_auto_dtype
-- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True]
-- test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True]
 - test_e2e.py::test_ptp_quickstart_multimodal_kv_cache_reuse[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 - test_e2e.py::test_ptp_quickstart_multimodal_chunked_prefill[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-0.6-image]
 - examples/test_mistral.py::test_mistral_with_bf16_lora_torch[mistral-7b-v0.1]

tests/integration/test_lists/waives.txt

Lines changed: 0 additions & 3 deletions
@@ -319,8 +319,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequan
 accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_auto_dtype[tp8ep4-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestNemotronH_47B_Base::test_reasoning_fp8_prequantized[tp8ep8-cuda_graph=True] SKIP (https://nvbugs/5640697)
 accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-True] SKIP (https://nvbugs/5648560)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-image-False] SKIP (https://nvbugs/5648560)
 examples/test_multimodal.py::test_llm_multimodal_general[nougat-base-pp:1-tp:1-bfloat16-bs:8-cpp_e2e:False-nb:1] SKIP (https://nvbugs/5568052)
 accuracy/test_llm_api_pytorch_multimodal.py::TestNVILA_8B::test_auto_dtype SKIP (https://nvbugs/5648441)
 accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype SKIP (https://nvbugs/5648441)
@@ -359,7 +357,6 @@ accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass] SKIP (https://nvbugs/5702795)
 accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm] SKIP (https://nvbugs/5702795)
-test_e2e.py::test_ptp_quickstart_multimodal[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503-mixture_text_image-True] SKIP (https://nvbugs/5648560)
 test_e2e.py::test_ptp_quickstart_multimodal_2gpu[mistral-small-3.1-24b-instruct-Mistral-Small-3.1-24B-Instruct-2503] SKIP (https://nvbugs/5648560)
 accuracy/test_cli_flow.py::TestGpt2::test_weight_only[int4] SKIP (https://nvbugs/5705193)
 accuracy/test_cli_flow.py::TestGpt2::test_int8_kv_cache SKIP (https://nvbugs/5705193)
