Commit fe070a0

test: Update Llama4 Scout FP4 & FP8 accuracy tests (NVIDIA#5901)
Signed-off-by: Chenfei Zhang <chenfeiz@nvidia.com>
1 parent 28385f6 commit fe070a0

File tree

6 files changed (+77, -10 lines)

tensorrt_llm/_torch/modules/fused_moe/quantization.py

Lines changed: 14 additions & 8 deletions
@@ -1096,10 +1096,13 @@ def load_expert_w3_w1_weight_scale_nvfp4(

         orig_shape = dst_w3_w1_weight_scale.shape

-        dst_w3_w1_weight_scale.copy_(
-            torch.ops.trtllm.nvfp4_block_scale_interleave(
-                dst_w3_w1_weight_scale.view(float4_sf_dtype)).view(
-                    self.block_scales_dtype).reshape(orig_shape))
+        dst_w3_w1_weight_scale_interleaved = torch.ops.trtllm.nvfp4_block_scale_interleave(
+            dst_w3_w1_weight_scale.view(float4_sf_dtype)).view(
+                self.block_scales_dtype).reshape(orig_shape)
+
+        torch.cuda.synchronize()
+
+        dst_w3_w1_weight_scale.copy_(dst_w3_w1_weight_scale_interleaved)

     def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,
                                           w2_weight_scale: torch.Tensor,
@@ -1113,10 +1116,13 @@ def load_expert_w2_weight_scale_nvfp4(self, module: torch.nn.Module,

         orig_shape = dst_w2_weight_scale.shape

-        dst_w2_weight_scale.copy_(
-            torch.ops.trtllm.nvfp4_block_scale_interleave(
-                dst_w2_weight_scale.view(float4_sf_dtype)).view(
-                    self.block_scales_dtype).reshape(orig_shape))
+        dst_w2_weight_scale_interleaved = torch.ops.trtllm.nvfp4_block_scale_interleave(
+            dst_w2_weight_scale.view(float4_sf_dtype)).view(
+                self.block_scales_dtype).reshape(orig_shape)
+
+        torch.cuda.synchronize()
+
+        dst_w2_weight_scale.copy_(dst_w2_weight_scale_interleaved)


 class NVFP4TRTLLMGenFusedMoEMethod(NVFP4FusedMoEMethod):
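The change in both loaders follows the same pattern: instead of copying the output of the interleave op directly back into the buffer it reads from, the interleaved scales are first materialized into a separate tensor, the device is synchronized, and only then is the destination overwritten in place. A minimal sketch of that pattern, with interleave_op standing in for torch.ops.trtllm.nvfp4_block_scale_interleave (the stand-in and the helper name below are illustrative, not the repository's API):

import torch


def interleave_op(scales: torch.Tensor) -> torch.Tensor:
    # Stand-in for torch.ops.trtllm.nvfp4_block_scale_interleave, which reorders
    # NVFP4 block scales into the layout the fused-MoE kernels expect.
    return scales.clone()


def load_weight_scale(dst_weight_scale: torch.Tensor) -> None:
    orig_shape = dst_weight_scale.shape
    # Materialize the interleaved result in a fresh tensor rather than writing
    # it back into the tensor the op is still reading from.
    interleaved = interleave_op(dst_weight_scale).reshape(orig_shape)
    # Let any queued GPU work finish before the source buffer is overwritten.
    if torch.cuda.is_available():
        torch.cuda.synchronize()
    dst_weight_scale.copy_(interleaved)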

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 0 deletions
@@ -18,6 +18,11 @@ meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 92.20
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 89.70
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 79.62
+  - quant_algo: FP8
+    accuracy: 80.37
 deepseek-ai/DeepSeek-V3-Lite:
   - accuracy: 64.74
   - quant_algo: NVFP4

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 0 deletions
@@ -68,6 +68,11 @@ meta-llama/Llama-4-Maverick-17B-128E-Instruct:
   - accuracy: 86.40
 meta-llama/Llama-4-Scout-17B-16E-Instruct:
   - accuracy: 80.00
+  - quant_algo: NVFP4
+    kv_cache_quant_algo: FP8
+    accuracy: 88.63
+  - quant_algo: FP8
+    accuracy: 89.46
 mistralai/Mistral-7B-v0.1:
   - accuracy: 66
 mistralai/Mistral-7B-Instruct-v0.3:
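Each list item under a model name ties an accuracy threshold to a quantization recipe; the new Scout entries cover NVFP4 weights with an FP8 KV cache, and plain FP8. A minimal sketch of how such an entry could be looked up for a given run, assuming the match is made on the quant_algo and kv_cache_quant_algo fields (pick_reference is a hypothetical helper, not the accuracy harness's actual lookup code):

from typing import Optional


def pick_reference(entries: list, quant_algo: Optional[str] = None,
                   kv_cache_quant_algo: Optional[str] = None) -> float:
    # Return the reference accuracy whose quantization fields match the run.
    for entry in entries:
        if (entry.get("quant_algo") == quant_algo
                and entry.get("kv_cache_quant_algo") == kv_cache_quant_algo):
            return entry["accuracy"]
    raise KeyError("no reference entry for this quantization recipe")


# The GSM8K entries added above for Llama-4 Scout.
scout_gsm8k = [
    {"accuracy": 89.70},
    {"quant_algo": "NVFP4", "kv_cache_quant_algo": "FP8", "accuracy": 79.62},
    {"quant_algo": "FP8", "accuracy": 80.37},
]
assert pick_reference(scout_gsm8k, quant_algo="FP8") == 80.37
assert pick_reference(scout_gsm8k, quant_algo="NVFP4",
                      kv_cache_quant_algo="FP8") == 79.62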

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 47 additions & 2 deletions
@@ -422,7 +422,6 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):

 class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
     MODEL_NAME = "meta-llama/Llama-4-Scout-17B-16E-Instruct"
-    MODEL_PATH = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"

     @skip_pre_hopper
     @pytest.mark.skip_less_mpi_world_size(8)
@@ -431,8 +430,9 @@ class TestLlama4ScoutInstruct(LlmapiAccuracyTestHarness):
                               (8, 1, 8)],
                              ids=["tp8", "tp8ep4", "tp8ep8"])
     def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct"
         with LLM(
-                self.MODEL_PATH,
+                model_path,
                 tensor_parallel_size=tp_size,
                 # Keep this low to avoid warmup OOM in CI
                 max_seq_len=8192,
@@ -445,6 +445,51 @@ def test_auto_dtype(self, cuda_graph, tp_size, pp_size, ep_size):
             task = GSM8K(self.MODEL_NAME)
             task.evaluate(llm)

+    @skip_pre_hopper
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
+                             ids=["tp8ep8", "tp4"])
+    def test_fp8(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP8"
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+    @skip_pre_blackwell
+    @pytest.mark.skip_less_mpi_world_size(8)
+    @parametrize_with_ids("cuda_graph", [True])
+    @pytest.mark.parametrize("tp_size,pp_size,ep_size", [(8, 1, 8), (4, 1, 1)],
+                             ids=["tp8ep8", "tp4"])
+    def test_fp4(self, cuda_graph, tp_size, pp_size, ep_size):
+        model_path = f"{llm_models_root()}/llama4-models/Llama-4-Scout-17B-16E-Instruct-FP4"
+        with LLM(
+                model_path,
+                tensor_parallel_size=tp_size,
+                # Keep this low to avoid warmup OOM in CI
+                max_seq_len=8192,
+                pipeline_parallel_size=pp_size,
+                moe_expert_parallel_size=ep_size,
+                cuda_graph_config=CudaGraphConfig()
+                if cuda_graph else None) as llm:
+            assert llm.args.quant_config.quant_algo == QuantAlgo.NVFP4
+            assert llm.args.quant_config.kv_cache_quant_algo == QuantAlgo.FP8
+            task = MMLU(self.MODEL_NAME)
+            task.evaluate(llm)
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+

 class TestMistral7B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "mistralai/Mistral-7B-v0.1"

tests/integration/test_lists/qa/examples_test_list.txt

Lines changed: 4 additions & 0 deletions
@@ -455,6 +455,10 @@ accuracy/test_llm_api_pytorch.py::TestLlama4MaverickInstruct::test_auto_dtype[tp
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8-cuda_graph=False]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_auto_dtype[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp8ep8-cuda_graph=True]
+accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_fp8_tp2
 accuracy/test_llm_api_pytorch.py::TestMixtral8x7B::test_nvfp4_tp2
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=0-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False]
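The bracketed suffix on each new entry is pytest's parametrize id string: the tp8ep8/tp4 ids from pytest.mark.parametrize are joined with the cuda_graph=True id from parametrize_with_ids. A small illustrative sketch of that composition (node_id is not a real helper in the repository):

def node_id(test_file: str, test_class: str, test_func: str, *param_ids: str) -> str:
    # Compose a pytest node id the way the entries in this list are written.
    return f"{test_file}::{test_class}::{test_func}[{'-'.join(param_ids)}]"


assert node_id("accuracy/test_llm_api_pytorch.py", "TestLlama4ScoutInstruct",
               "test_fp8", "tp4", "cuda_graph=True") == (
    "accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::"
    "test_fp8[tp4-cuda_graph=True]")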

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 2 additions & 0 deletions
@@ -62,3 +62,5 @@ l0_dgx_b200:
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm]
 - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass]
+- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp8[tp4-cuda_graph=True]
+- accuracy/test_llm_api_pytorch.py::TestLlama4ScoutInstruct::test_fp4[tp4-cuda_graph=True]
