3 files changed: +35 −0

Accuracy references (YAML):
@@ -106,6 +106,9 @@ deepseek-ai/DeepSeek-V3.2-Exp:
   - quant_algo: NVFP4
     spec_dec_algo: MTP
     accuracy: 95.6
+Qwen3/Qwen3-4B:
+  - spec_dec_algo: Eagle
+    accuracy: 85.823
 Qwen3/Qwen3-8B:
   - accuracy: 87.1114
   - spec_dec_algo: Eagle
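
(The new reference entry records the expected GSM8K accuracy, 85.823, for
Qwen3/Qwen3-4B with Eagle speculative decoding; the test added below is
presumably checked against this value by the accuracy harness.)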

accuracy/test_llm_api_pytorch.py:
@@ -3294,6 +3294,37 @@ def test_auto_dtype(self):
                        extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS)
 
 
+class TestQwen3_4B(LlmapiAccuracyTestHarness):
+    MODEL_NAME = "Qwen3/Qwen3-4B"
+
+    def test_eagle3(self):
+        "RCCA: https://nvbugspro.nvidia.com/bug/5698434"
+        pytorch_config = dict(
+            disable_overlap_scheduler=True,
+            cuda_graph_config=CudaGraphConfig(),
+        )
+        kv_cache_config = KvCacheConfig(
+            enable_block_reuse=False,
+            free_gpu_memory_fraction=0.6,
+        )
+
+        eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-4B_eagle3/"
+        target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-4B"
+
+        draft_len = 3
+        spec_config = EagleDecodingConfig(max_draft_len=draft_len,
+                                          speculative_model_dir=eagle_model_dir)
+
+        llm = LLM(model=target_model_dir,
+                  **pytorch_config,
+                  kv_cache_config=kv_cache_config,
+                  speculative_config=spec_config)
+
+        with llm:
+            task = GSM8K(self.MODEL_NAME)
+            task.evaluate(llm)
+
+
 class TestQwen3_8B(LlmapiAccuracyTestHarness):
     MODEL_NAME = "Qwen3/Qwen3-8B"
 
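
For orientation, here is the same Eagle3 setup pulled out of the test harness as a
standalone generation script. This is a minimal sketch, assuming the TensorRT-LLM
LLM API: the import paths and the /models/... directories are illustrative
stand-ins for llm_models_root(), not taken from this PR.

from tensorrt_llm import LLM, SamplingParams
from tensorrt_llm.llmapi import (CudaGraphConfig, EagleDecodingConfig,
                                 KvCacheConfig)

# The Eagle3 draft model proposes up to 3 tokens per step; the target verifies them.
spec_config = EagleDecodingConfig(
    max_draft_len=3,
    speculative_model_dir="/models/Qwen3/Qwen3-4B_eagle3",  # hypothetical path
)

llm = LLM(
    model="/models/Qwen3/Qwen3-4B",        # hypothetical path
    disable_overlap_scheduler=True,        # same settings as the test above
    cuda_graph_config=CudaGraphConfig(),
    kv_cache_config=KvCacheConfig(enable_block_reuse=False,
                                  free_gpu_memory_fraction=0.6),
    speculative_config=spec_config,
)

with llm:
    outputs = llm.generate(["Q: What is 12 * 7? A:"],
                           SamplingParams(max_tokens=64))
    print(outputs[0].outputs[0].text)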

Test list:
@@ -523,6 +523,7 @@ accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput]
 accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm]
 accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model]
 accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm]
+accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency]
 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache]
 accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False]
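
(Any single entry in this list can be run directly as a pytest node ID, e.g.
pytest accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3 from the
integration-test directory; the exact invocation depends on the repo's runner.)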