diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index d281411d12a..76cf65bcb3d 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -106,6 +106,9 @@ deepseek-ai/DeepSeek-V3.2-Exp: - quant_algo: NVFP4 spec_dec_algo: MTP accuracy: 95.6 +Qwen3/Qwen3-4B: + - spec_dec_algo: Eagle + accuracy: 85.823 Qwen3/Qwen3-8B: - accuracy: 87.1114 - spec_dec_algo: Eagle diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 58de46628f8..a9767f0b9e3 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3295,6 +3295,35 @@ def test_auto_dtype(self): extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS) +class TestQwen3_4B(LlmapiAccuracyTestHarness): + MODEL_NAME = "Qwen3/Qwen3-4B" + + def test_eagle3(self): + "RCCA: https://nvbugspro.nvidia.com/bug/5698434" + pytorch_config = dict( + disable_overlap_scheduler=True, + cuda_graph_config=CudaGraphConfig(), + ) + kv_cache_config = KvCacheConfig( + enable_block_reuse=False, + free_gpu_memory_fraction=0.6, + ) + + eagle_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-4B_eagle3/" + target_model_dir = f"{llm_models_root()}/Qwen3/Qwen3-4B" + + draft_len = 3 + spec_config = EagleDecodingConfig(max_draft_len=draft_len, + speculative_model_dir=eagle_model_dir) + + with LLM(model=target_model_dir, + **pytorch_config, + kv_cache_config=kv_cache_config, + speculative_config=spec_config) as llm: + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) + + class TestQwen3_8B(LlmapiAccuracyTestHarness): MODEL_NAME = "Qwen3/Qwen3-8B" diff --git a/tests/integration/test_lists/qa/llm_function_core.txt b/tests/integration/test_lists/qa/llm_function_core.txt index c0ad85da972..6e1dcf1728f 100644 --- a/tests/integration/test_lists/qa/llm_function_core.txt +++ 
b/tests/integration/test_lists/qa/llm_function_core.txt @@ -523,6 +523,7 @@ accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput] accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_multi_gpus[throughput_trtllm] accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model] accuracy/test_llm_api_pytorch.py::TestGLM4_6::test_nvfp4_2_model_mtp[2model_trtllm] +accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_bf16[multi_gpus_no_cache] accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8_block_scales[latency-torch_compile=False] diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 3087be35063..eedcc3e30c7 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -229,6 +229,7 @@ accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-laten +accuracy/test_llm_api_pytorch.py::TestQwen3_4B::test_eagle3 accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_fp8_block_scales[latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[fp8-latency] accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_w4a8_mxfp4[mxfp8-latency] accuracy/test_llm_api_pytorch.py::TestSeedOss_36B::test_auto_dtype