add eagle3 gpt-oss test

jhaotingc · jhaotingc · commit ff66d85b2704 · 2025-10-28T15:05:41.000-07:00
Signed-off-by: Jhao-Ting Chen <jhaotingc@nvidia.com>
diff --git a/tensorrt_llm/_torch/attention_backend/trtllm.py b/tensorrt_llm/_torch/attention_backend/trtllm.py
@@ -1071,10 +1071,10 @@ def update_spec_dec_param(
             spec_decoding_packed_mask = None
             spec_decoding_generation_lengths = None
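-        # spec_dec mode should only be enabled for pre-Blackwell machines and when there's a spec-dec tree.
+        # spec_dec mode should only be enabled for pre-Blackwell (SM < 100) or SM 120 machines and when there's a spec-dec tree.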
-        self.is_spec_decoding_enabled = is_spec_decoding_enabled and get_sm_version(
-        ) < 100
+        self.is_spec_decoding_enabled = is_spec_decoding_enabled and (
+            get_sm_version() < 100 or get_sm_version() == 120)
 
-        if get_sm_version() >= 100:
+        if get_sm_version() >= 100 and get_sm_version() != 120:
             if is_spec_dec_tree or is_spec_dec_dynamic_tree:
                 assert not is_spec_dec_tree, "Spec-dec tree is not supported on this machine. Please use a pre-Blackwell machine for a spec-dec tree."
                 assert not is_spec_dec_dynamic_tree, "Spec-dec dynamic tree is not supported on this machine. Please use a pre-Blackwell machine for a spec-dec dynamic tree."
diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml
@@ -212,6 +212,11 @@ GPT-OSS/BF16:
   - accuracy: 90.3
   - kv_cache_quant_algo: FP8
     accuracy: 90.3
+  - quant_algo: W4A16_MXFP4
+    accuracy: 90.3
+  - quant_algo: W4A16_MXFP4
+    spec_dec_algo: Eagle
+    accuracy: 90.3
 GPT-OSS/120B-MXFP4:
   - accuracy: 90.3
   - quant_algo: W4A8_MXFP4_MXFP8
diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py
@@ -3598,6 +3598,59 @@ def test_w4a16(self, kv_cache_dtype, tp_size, pp_size, ep_size,
             task.evaluate(llm,
                           extra_evaluator_kwargs=self.extra_evaluator_kwargs)
 
+    @pytest.mark.skip_less_device(4)
+    @pytest.mark.parametrize("kv_cache_dtype", ["auto"])
+    @pytest.mark.parametrize(
+        "tp_size,pp_size,ep_size,attention_dp,cuda_graph,overlap_scheduler", [
+            (4, 1, 4, False, True, True),
+        ],
+        ids=["tep4"])
+    # Upper-case values match the "TRITON" check; ids keep node ids stable.
+    @pytest.mark.parametrize(
+        "moe_backend",
+        ["TRITON", "CUTLASS",
+         pytest.param("TRTLLM", marks=skip_pre_blackwell)],
+        ids=["triton", "cutlass", "trtllm"])
+    def test_w4a16_eagle3(self, kv_cache_dtype, tp_size, pp_size, ep_size,
+                          attention_dp, cuda_graph, overlap_scheduler,
+                          moe_backend, monkeypatch, mocker):
+        """Evaluate GSM8K on GPT-OSS with W4A16_MXFP4 and one-model Eagle3."""
+        if moe_backend == "TRITON" and not IS_TRITON_KERNELS_AVAILABLE:
+            pytest.skip("Triton kernels are not available")
+        mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192)
+        mocker.patch.dict(GSM8K.EVALUATE_KWARGS,
+                          {"scores_filter": "exact_match,flexible-extract"})
+        monkeypatch.setenv("OVERRIDE_QUANT_ALGO", "W4A16_MXFP4")
+
+        # Build the padded CUDA graph config only when graphs are requested.
+        cuda_graph_config = CudaGraphConfig(
+            enable_padding=True, max_batch_size=8) if cuda_graph else None
+        pytorch_config = dict(
+            max_batch_size=8,
+            disable_overlap_scheduler=not overlap_scheduler,
+            cuda_graph_config=cuda_graph_config)
+        kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.5,
+                                        dtype=kv_cache_dtype)
+        spec_config = EagleDecodingConfig(
+            max_draft_len=3,
+            speculative_model_dir=
+            f"{llm_models_root()}/gpt_oss/gpt-oss-120b-Eagle3/",
+            eagle3_one_model=True)
+
+        llm = LLM(self.MODEL_PATH,
+                  tensor_parallel_size=tp_size,
+                  pipeline_parallel_size=pp_size,
+                  moe_expert_parallel_size=ep_size,
+                  kv_cache_config=kv_cache_config,
+                  **pytorch_config,
+                  enable_attention_dp=attention_dp,
+                  moe_config=MoeConfig(backend=moe_backend),
+                  speculative_config=spec_config)
+
+        with llm:
+            GSM8K("GPT-OSS/BF16").evaluate(
+                llm, extra_evaluator_kwargs=self.extra_evaluator_kwargs)
+
     @pytest.mark.skip_less_device(2)
     @pytest.mark.parametrize(
         "kv_cache_dtype",
diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml
@@ -49,6 +49,7 @@ l0_dgx_b200:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-trtllm-fp8]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-fp8]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16_eagle3[trtllm-tep4-auto]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[DeepSeek-V3-Lite-bf16]
   - disaggregated/test_disaggregated.py::test_disaggregated_benchmark_on_diff_backends[llama-3.1-8b-instruct-hf-fp8]
   - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8]
diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml
@@ -174,6 +174,7 @@ l0_dgx_h100:
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-cutlass-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_4gpus[dp4-triton-auto]
   - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16[dp4-auto]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16_eagle3[triton-tep4-auto]
 - condition:
     ranges:
       system_gpu_count:
diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml
@@ -107,5 +107,6 @@ l0_rtx_pro_6000:
   # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[dep4_latency_moe_cutlass-torch_compile=False] # failed
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=False]
   - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_nvfp4[tep4_latency_moe_cutlass-torch_compile=True]
+  - accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4a16_eagle3[cutlass-tep4-auto]
   - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp8-multimodals/Phi-4-multimodal-instruct-FP8]
   - test_e2e.py::test_ptp_quickstart_multimodal_2gpu[phi4-multimodal-instruct-fp4-multimodals/Phi-4-multimodal-instruct-FP4]