@@ -4198,6 +4198,86 @@ def test_auto_dtype(self, tp_size, pp_size, ep_size):
41984198 task .evaluate (llm )
41994199
42004200
@skip_pre_hopper
@pytest.mark.skip_less_device_memory(80000)
class TestQwen3NextInstruct(LlmapiAccuracyTestHarness):
    """Accuracy tests (MMLU + GSM8K) for Qwen3-Next-80B-A3B-Instruct."""
    MODEL_PATH = f"{llm_models_root()}/Qwen3-Next"
    MODEL_NAME = "Qwen3/Qwen3-Next-80B-A3B-Instruct"

    # Default setting of `256` is too small
    GSM8K_MAX_OUTPUT_LEN = 512

    @staticmethod
    def _common_llm_kwargs(cuda_graph, overlap_scheduler):
        """Return the LLM keyword arguments shared by all tests in this class.

        Covers the KV-cache settings and the PyTorch-backend options that
        depend on the ``cuda_graph``/``overlap_scheduler`` parametrization.
        """
        return dict(
            kv_cache_config=KvCacheConfig(free_gpu_memory_fraction=0.6,
                                          enable_block_reuse=False),
            disable_overlap_scheduler=not overlap_scheduler,
            cuda_graph_config=CudaGraphConfig(max_batch_size=512)
            if cuda_graph else None,
        )

    def _evaluate_accuracy(self, llm, mocker):
        """Run MMLU, then GSM8K with an enlarged output budget, on ``llm``.

        GSM8K is patched (via the pytest-mock ``mocker`` fixture) to allow
        ``GSM8K_MAX_OUTPUT_LEN`` tokens because the default limit truncates
        this model's chain-of-thought answers.
        """
        task = MMLU(self.MODEL_NAME)
        task.evaluate(llm)
        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN",
                            self.GSM8K_MAX_OUTPUT_LEN)
        task = GSM8K(self.MODEL_NAME)
        task.evaluate(llm)

    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
        [
            (4, 1, 4, True, True),
        ],
        ids=[
            "tp4ep4_cudagraph_overlap",
        ],
    )
    def test_bf16_4gpu(self, tp_size, pp_size, ep_size, cuda_graph,
                       overlap_scheduler, mocker):
        """BF16 checkpoint on 4 GPUs with TP/EP parallelism."""
        model_path = f"{self.MODEL_PATH}/Qwen3-Next-80B-A3B-Instruct"
        with LLM(model_path,
                 tensor_parallel_size=tp_size,
                 max_num_tokens=16384,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 **self._common_llm_kwargs(cuda_graph,
                                           overlap_scheduler)) as llm:
            self._evaluate_accuracy(llm, mocker)

    @skip_pre_blackwell
    @pytest.mark.skip_less_device(4)
    @pytest.mark.parametrize("moe_backend", ["CUTLASS", "TRTLLM"],
                             ids=["cutlass", "trtllm"])
    @pytest.mark.parametrize(
        "tp_size,pp_size,ep_size,cuda_graph,overlap_scheduler",
        [(1, 1, 1, True, True), (4, 1, 1, True, True), (4, 1, 4, True, True),
         (4, 1, 4, False, False)],
        ids=["tp1", "tp4ep1", "tp4ep4", "no_cuda_graph_overlap"])
    def test_nvfp4(self, moe_backend, tp_size, pp_size, ep_size, cuda_graph,
                   overlap_scheduler, mocker):
        """NVFP4-quantized checkpoint (FP8 KV cache) across MoE backends."""
        model_path = f"{self.MODEL_PATH}/qwen3-next-80b-instruct-nvfp4-ptq-fp8kv"
        with LLM(model_path,
                 tensor_parallel_size=tp_size,
                 max_num_tokens=16384,
                 pipeline_parallel_size=pp_size,
                 moe_expert_parallel_size=ep_size,
                 moe_config=MoeConfig(backend=moe_backend),
                 **self._common_llm_kwargs(cuda_graph,
                                           overlap_scheduler)) as llm:
            self._evaluate_accuracy(llm, mocker)
4279+
4280+
42014281class TestSeedOss_36B (LlmapiAccuracyTestHarness ):
42024282 MODEL_NAME = "ByteDance-Seed/Seed-OSS-36B-Instruct"
42034283 MODEL_PATH = f"{ llm_models_root ()} /Seed-OSS/Seed-OSS-36B-Instruct"
0 commit comments