Skip to content

Commit 51bf716

Browse files
authored
[None][feat] add qwen3-next CI test of accuracy on BF16 and NVFP4 (#9330)
Signed-off-by: jiant <107457950+JadoTu@users.noreply.github.com>
1 parent e47927e commit 51bf716

File tree

10 files changed

+115
-0
lines changed

10 files changed

+115
-0
lines changed

tests/integration/defs/accuracy/references/gsm8k.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -138,6 +138,11 @@ Qwen3/Qwen3-235B-A22B:
138138
accuracy: 85.78
139139
Qwen3/Qwen3-Next-80B-A3B-Thinking:
140140
- accuracy: 81.577
141+
Qwen3/Qwen3-Next-80B-A3B-Instruct:
142+
- accuracy: 92.72
143+
- quant_algo: NVFP4
144+
kv_cache_quant_algo: FP8
145+
accuracy: 90.86
141146
moonshotai/Kimi-K2-Instruct:
142147
- quant_algo: FP8_BLOCK_SCALES
143148
accuracy: 94.84

tests/integration/defs/accuracy/references/mmlu.yaml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -242,6 +242,11 @@ Qwen3/Qwen3-235B-A22B:
242242
accuracy: 86
243243
Qwen3/Qwen3-Next-80B-A3B-Thinking:
244244
- accuracy: 86
245+
Qwen3/Qwen3-Next-80B-A3B-Instruct:
246+
- accuracy: 86.03
247+
- quant_algo: NVFP4
248+
kv_cache_quant_algo: FP8
249+
accuracy: 85.08
245250
moonshotai/Kimi-K2-Instruct:
246251
- quant_algo: FP8_BLOCK_SCALES
247252
accuracy: 87.65

tests/integration/defs/accuracy/test_llm_api_pytorch.py

Lines changed: 80 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4198,6 +4198,86 @@ def test_auto_dtype(self, tp_size, pp_size, ep_size):
41984198
task.evaluate(llm)
41994199

42004200

4201+
@skip_pre_hopper
4202+
@pytest.mark.skip_less_device_memory(80000)
4203+
class TestQwen3NextInstruct(LlmapiAccuracyTestHarness):
4204+
MODEL_PATH = f"{llm_models_root()}/Qwen3-Next"
4205+
MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Instruct"
4206+
4207+
# Default setting of `256` is too small
4208+
GSM8K_MAX_OUTPUT_LEN = 512
4209+
4210+
@pytest.mark.skip_less_device(4)
4211+
@pytest.mark.parametrize(
4212+
"tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
4213+
[
4214+
(4, 1, 4, True, True),
4215+
],
4216+
ids=[
4217+
"tp4ep4_cudagraph_overlap",
4218+
],
4219+
)
4220+
def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph,
4221+
overlap_scheduler, mocker):
4222+
model_path = f"{self.MODEL_PATH}/Qwen3-Next-80B-A3B-Instruct"
4223+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
4224+
enable_block_reuse=False)
4225+
pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
4226+
cuda_graph_config=CudaGraphConfig(
4227+
max_batch_size=512) if cuda_graph else None)
4228+
4229+
with LLM(
4230+
model_path,
4231+
tensor_parallel_size=tp_size,
4232+
max_num_tokens=16384,
4233+
pipeline_parallel_size=pp_size,
4234+
moe_expert_parallel_size=ep_size,
4235+
kv_cache_config=kv_cache_config,
4236+
**pytorch_config,
4237+
) as llm:
4238+
task = MMLU(self.MODEL_NAME)
4239+
task.evaluate(llm)
4240+
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN",
4241+
self.GSM8K_MAX_OUTPUT_LEN)
4242+
task = GSM8K(self.MODEL_NAME)
4243+
task.evaluate(llm)
4244+
4245+
@skip_pre_blackwell
4246+
@pytest.mark.skip_less_device(4)
4247+
@pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM"],
4248+
ids=["cutlass", "trtllm"])
4249+
@pytest.mark.parametrize(
4250+
"tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
4251+
[(1, 1, 1, True, True), (4, 1, 1, True, True), (4, 1, 4, True, True),
4252+
(4, 1, 4, False, False)],
4253+
ids=["tp1", "tp4ep1", "tp4ep4", "no_cuda_graph_overlap"])
4254+
def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
4255+
overlap_scheduler, mocker):
4256+
model_path = f"{self.MODEL_PATH}/qwen3-next-80b-instruct-nvfp4-ptq-fp8kv"
4257+
4258+
kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6,
4259+
enable_block_reuse=False)
4260+
pytorch_config = dict(disable_overlap_scheduler=not overlap_scheduler,
4261+
cuda_graph_config=CudaGraphConfig(
4262+
max_batch_size=512) if cuda_graph else None)
4263+
moe_config = MoeConfig(backend=moe_backend)
4264+
4265+
with LLM(model_path,
4266+
tensor_parallel_size=tp_size,
4267+
max_num_tokens=16384,
4268+
pipeline_parallel_size=pp_size,
4269+
moe_expert_parallel_size=ep_size,
4270+
kv_cache_config=kv_cache_config,
4271+
**pytorch_config,
4272+
moe_config=moe_config) as llm:
4273+
task = MMLU(self.MODEL_NAME)
4274+
task.evaluate(llm)
4275+
mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN",
4276+
self.GSM8K_MAX_OUTPUT_LEN)
4277+
task = GSM8K(self.MODEL_NAME)
4278+
task.evaluate(llm)
4279+
4280+
42014281
class TestSeedOss_36B(LlmapiAccuracyTestHarness):
42024282
MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
42034283
MODEL_PATH = f"{llm_models_root()}/Seed-OSS/Seed-OSS-36B-Instruct"

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,11 @@ l0_b200:
5454
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM]
5555
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS]
5656
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM]
57+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
58+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
59+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
60+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
61+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
5762
- disaggregated/test_workers.py::test_workers_kv_cache_aware_router_eviction[TinyLlama-1.1B-Chat-v1.0] # nvbugs 5300551
5863
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
5964
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]

tests/integration/test_lists/test-db/l0_dgx_b200.yml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,12 @@ l0_dgx_b200:
3939
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False]
4040
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True] ISOLATION
4141
- accuracy/test_llm_api_pytorch.py::TestQwen3NextThinking::test_auto_dtype[tp4ep4]
42+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
43+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
44+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
45+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
46+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
47+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
4248
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
4349
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
4450
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[ep4-cutlass-auto]

tests/integration/test_lists/test-db/l0_dgx_b300.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -58,6 +58,11 @@ l0_dgx_b300:
5858
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_trtllm-torch_compile=False]
5959
- accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=True]
6060
- accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2]
61+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
62+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
63+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
64+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
65+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
6166
- disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8]
6267
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-fp8]
6368
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-cutlass-auto]

tests/integration/test_lists/test-db/l0_dgx_h100.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -93,6 +93,7 @@ l0_dgx_h100:
9393
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=2]
9494
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[GSM8K]
9595
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_multi_instance[MMLU]
96+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
9697
- disaggregated/test_auto_scaling.py::test_service_discovery[etcd-round_robin]
9798
- disaggregated/test_auto_scaling.py::test_service_discovery[etcd-load_balancing]
9899
- disaggregated/test_auto_scaling.py::test_worker_restart[etcd-round_robin]

tests/integration/test_lists/test-db/l0_dgx_h200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@ l0_dgx_h200:
3333
- accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=2-ctx_pp=4]
3434
- accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[False]
3535
- accuracy/test_disaggregated_serving.py::TestGPTOSS::test_auto_dtype[True]
36+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_bf16_4gpu[tp4ep4_cudagraph_overlap]
3637
- disaggregated/test_disaggregated.py::test_disaggregated_ctxtp2pp2_gentp2pp2[TinyLlama-1.1B-Chat-v1.0]
3738
- disaggregated/test_disaggregated.py::test_disaggregated_ctxpp4_genpp4[TinyLlama-1.1B-Chat-v1.0]
3839
- unittest/llmapi/test_llm_pytorch.py::test_nemotron_nas_lora

tests/integration/test_lists/test-db/l0_gb200_multi_gpus.yml

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,11 @@ l0_gb200_multi_gpus:
4343
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[tp4-trtllm-auto]
4444
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
4545
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus_online_eplb[fp8]
46+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
47+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep1-cutlass]
48+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-cutlass]
49+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[no_cuda_graph_overlap-cutlass]
50+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
4651
- condition:
4752
ranges:
4853
system_gpu_count:

tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,8 @@ l0_rtx_pro_6000:
4242
- accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto]
4343
- accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp4
4444
- accuracy/test_llm_api_pytorch.py::TestPhi4MM::test_fp8
45+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass]
46+
- accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp4ep4-trtllm]
4547

4648
- condition:
4749
ranges:

0 commit comments

Comments (0)