Commit 3888cd8

Add Eagle3 decoding_type
Introduce `speculative_config.decoding_type: Eagle3` for the PyTorch backend, warn when `Eagle` is used as an alias, and reject `Eagle3` on the TensorRT backend. Update docs/examples and add unit tests.

Signed-off-by: Venky Ganesh <23023424+venkywonka@users.noreply.github.com>
1 parent 10a4571 commit 3888cd8
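
In user-facing terms, the change looks roughly like this — a sketch distilled from the diffs below (the module path is taken from the test file's import; the model path is a placeholder):

```python
# Sketch of the behavior this commit introduces, distilled from the diffs below.
from tensorrt_llm.llmapi.llm_args import DecodingBaseConfig, Eagle3DecodingConfig

spec_cfg = DecodingBaseConfig.from_dict({
    "decoding_type": "Eagle3",  # new, preferred spelling
    "max_draft_len": 3,
    "speculative_model_dir": "/path/to/draft/model",  # placeholder path
})
assert isinstance(spec_cfg, Eagle3DecodingConfig)

# "Eagle" still parses as a PyTorch-backend alias but now logs a warning,
# and passing an Eagle3 config to the TensorRT backend raises ValueError.
```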

File tree

7 files changed: +111 / -45 lines

docs/source/blogs/tech_blog/blog11_GPT_OSS_Eagle3.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -84,7 +84,7 @@ kv_cache_config:
   enable_block_reuse: false
   free_gpu_memory_fraction: 0.8
 speculative_config:
-  decoding_type: Eagle
+  decoding_type: Eagle3
   max_draft_len: 3
   speculative_model_dir: /config/models/eagle/
 cuda_graph_config:
```

docs/source/blogs/tech_blog/blog6_Llama4_maverick_eagle_guide.md

Lines changed: 1 addition & 1 deletion
```diff
@@ -68,7 +68,7 @@ docker run -d --ipc=host --ulimit memlock=-1 --ulimit stack=67108864 \
   -p 8000:8000 --gpus=all -e "TRTLLM_ENABLE_PDL=1" \
   -v /path/to/maverick:/config/models/maverick -v /path/to/eagle:/config/models/eagle \
   docker.io/<username>/tensorrt_llm:main sh \
-  -c "echo -e 'enable_autotuner: false\nenable_attention_dp: false\nenable_min_latency: true\ncuda_graph_config:\n max_batch_size: 8\nspeculative_config:\n decoding_type: Eagle\n max_draft_len: 3\n speculative_model_dir: /config/models/eagle\n eagle3_one_model: true\nkv_cache_config:\n enable_block_reuse: false' > c.yaml && \
+  -c "echo -e 'enable_autotuner: false\nenable_attention_dp: false\nenable_min_latency: true\ncuda_graph_config:\n max_batch_size: 8\nspeculative_config:\n decoding_type: Eagle3\n max_draft_len: 3\n speculative_model_dir: /config/models/eagle\n eagle3_one_model: true\nkv_cache_config:\n enable_block_reuse: false' > c.yaml && \
   TRT_LLM_DISABLE_LOAD_WEIGHTS_IN_PARALLEL=True \
   trtllm-serve /config/models/maverick \
   --host 0.0.0.0 --port 8000 \
```

docs/source/features/speculative-decoding.md

Lines changed: 4 additions & 2 deletions
````diff
@@ -125,16 +125,18 @@ llm = LLM("/path/to/target_model", speculative_config=speculative_config)
 Speculative decoding options must be specified via `--extra_llm_api_options config.yaml` for both `trtllm-bench` and `trtllm-serve`. All speculative decoding options can be specified in this YAML file. An additional `decoding_type` option is used to specify the type of speculation to use. The available options are:
 
 * `MTP`
-* `Eagle` (for EAGLE 3)
+* `Eagle3` (EAGLE 3)
 * `NGram`
 * `DraftTarget`
 
+> Note: `decoding_type: Eagle` is accepted as a PyTorch-backend alias for `Eagle3`, but `Eagle3` is preferred for clarity.
+
 The rest of the argument names/valid values are the same as in their corresponding configuration class described in the Quick Start section. For example, a YAML configuration could look like this:
 
 ```
 disable_overlap_scheduler: true
 speculative_config:
-  decoding_type: Eagle
+  decoding_type: Eagle3
   max_draft_len: 4
   speculative_model: /path/to/draft/model
 ```
````
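
For readers following the LLM API rather than YAML, the programmatic counterpart of the config above would look roughly like this — a sketch: `Eagle3DecodingConfig` comes from this commit's `llm_args.py` diff, the field names follow the `EagleDecodingConfig` usage elsewhere in the commit, and the paths are placeholders:

```python
# Programmatic counterpart of the YAML above (a sketch, not verbatim docs code).
from tensorrt_llm import LLM
from tensorrt_llm.llmapi.llm_args import Eagle3DecodingConfig

speculative_config = Eagle3DecodingConfig(
    max_draft_len=4,
    speculative_model_dir="/path/to/draft/model",  # placeholder path
)
llm = LLM("/path/to/target_model", speculative_config=speculative_config)
```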

docs/source/features/torch_compile_and_piecewise_cuda_graph.md

Lines changed: 35 additions & 37 deletions
Large diffs are not rendered by default.

examples/models/core/qwen/README.md

Lines changed: 3 additions & 3 deletions
```diff
@@ -837,8 +837,8 @@ settings for your specific use case.
 
 Qwen3 now supports Eagle3 (Speculative Decoding with Eagle3). To enable Eagle3 on Qwen3, you need to set the following arguments when running `trtllm-bench` or `trtllm-serve`:
 
-- `speculative_config.decoding_type: Eagle`
-  Set the decoding type to "Eagle" to enable Eagle3 speculative decoding.
+- `speculative_config.decoding_type: Eagle3`
+  Set the decoding type to `Eagle3` to enable Eagle3 speculative decoding.
 - `speculative_config.max_draft_len: 3`
   Set the maximum number of draft tokens generated per step (this value can be adjusted as needed).
 - `speculative_config.speculative_model_dir: <EAGLE3_DRAFT_MODEL_PATH>`
@@ -855,7 +855,7 @@ Example `extra-llm-api-config.yml` snippet for Eagle3:
 echo "
 enable_attention_dp: false
 speculative_config:
-  decoding_type: Eagle
+  decoding_type: Eagle3
   max_draft_len: 3
   speculative_model_dir: <EAGLE3_DRAFT_MODEL_PATH>
 kv_cache_config:
```

tensorrt_llm/llmapi/llm_args.py

Lines changed: 16 additions & 1 deletion
```diff
@@ -729,6 +729,7 @@ def from_dict(cls, data: dict):
             "MTP": MTPDecodingConfig,
             "Medusa": MedusaDecodingConfig,
             "Eagle": EagleDecodingConfig,
+            "Eagle3": Eagle3DecodingConfig,
             "Lookahead": LookaheadDecodingConfig,
             "NGram": NGramDecodingConfig,
             "DraftTarget": DraftTargetDecodingConfig,
@@ -927,6 +928,10 @@ def is_linear_tree(self) -> bool:
         return False
 
 
+class Eagle3DecodingConfig(EagleDecodingConfig):
+    decoding_type: ClassVar[str] = "Eagle3"
+
+
 class SaveHiddenStatesDecodingConfig(DecodingBaseConfig):
     output_directory: str
     write_interval: int = 20
@@ -2422,9 +2427,15 @@ def validate_speculative_config(self):
                 decoding_mode=DecodingMode.Medusa(),
                 medusa_choices=self.speculative_config.medusa_choices)
 
+        elif isinstance(self.speculative_config, Eagle3DecodingConfig):
+            raise ValueError(
+                "speculative_config.decoding_type 'Eagle3' is only supported on the PyTorch backend. "
+                "Use decoding_type: Eagle with --backend tensorrt, or switch to --backend pytorch for Eagle3."
+            )
+
         elif isinstance(self.speculative_config, EagleDecodingConfig):
             assert self.speculative_config.max_draft_len > 0
-            assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
+            assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE weights must be specified."
             self.build_config.max_draft_len = self.speculative_config.max_draft_len
             self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
             eagle_config = _EagleConfig(
@@ -2940,6 +2951,10 @@ def validate_speculative_config(self):
                              f"support backend {self.backend}")
 
         if isinstance(self.speculative_config, EagleDecodingConfig):
+            if type(self.speculative_config) is EagleDecodingConfig:
+                logger.warning(
+                    "speculative_config.decoding_type 'Eagle' maps to Eagle3 in the PyTorch backend; "
+                    "use 'Eagle3' to be explicit.")
             assert self.speculative_config.max_draft_len > 0
             assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
         elif isinstance(self.speculative_config, NGramDecodingConfig):
```
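
Two design points in this diff are easy to miss: `from_dict` dispatches on a name-to-class map, and because `Eagle3DecodingConfig` subclasses `EagleDecodingConfig`, existing `isinstance(..., EagleDecodingConfig)` checks keep matching Eagle3 configs. That is why the TensorRT-backend validator tests for `Eagle3DecodingConfig` before the `EagleDecodingConfig` branch, and why the PyTorch-backend warning uses an exact `type(...) is` check. A self-contained miniature of the pattern (stand-in classes, not the real tensorrt_llm code):

```python
# Miniature of the decoding_type -> config-class dispatch used by from_dict.
# These stand-in classes are illustrative only; the real ones live in
# tensorrt_llm/llmapi/llm_args.py and are pydantic models.
from typing import ClassVar


class DecodingBase:
    decoding_type: ClassVar[str] = "Base"

    @classmethod
    def from_dict(cls, data: dict) -> "DecodingBase":
        # The real code uses a literal {"Eagle": ..., "Eagle3": ...} map.
        registry = {sub.decoding_type: sub for sub in (Eagle, Eagle3)}
        config_cls = registry[data.pop("decoding_type")]
        obj = config_cls()
        obj.__dict__.update(data)  # simplified stand-in for pydantic parsing
        return obj


class Eagle(DecodingBase):
    decoding_type: ClassVar[str] = "Eagle"


class Eagle3(Eagle):
    # Subclassing Eagle keeps isinstance(cfg, Eagle) checks matching Eagle3;
    # only an exact type check can single out the legacy "Eagle" alias.
    decoding_type: ClassVar[str] = "Eagle3"


cfg = DecodingBase.from_dict({"decoding_type": "Eagle3", "max_draft_len": 3})
assert isinstance(cfg, Eagle3) and isinstance(cfg, Eagle)
assert type(cfg) is not Eagle  # exact-type check distinguishes the alias
```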

tests/unittest/llmapi/test_llm_args.py

Lines changed: 51 additions & 0 deletions
```diff
@@ -139,6 +139,57 @@ def test_llm_args_with_pydantic_options(self):
         assert llm_args.max_seq_len == 128
 
 
+def test_decoding_type_eagle3_parses_to_eagle3_decoding_config():
+    spec_cfg = DecodingBaseConfig.from_dict({
+        "decoding_type":
+        "Eagle3",
+        "max_draft_len":
+        3,
+        "speculative_model_dir":
+        "/path/to/draft/model",
+    })
+    assert isinstance(spec_cfg, Eagle3DecodingConfig)
+
+
+def test_decoding_type_eagle_warns_on_pytorch_backend(monkeypatch):
+    import tensorrt_llm.llmapi.llm_args as llm_args_mod
+
+    warnings_seen: list[str] = []
+
+    def _capture_warning(msg, *args, **kwargs):
+        warnings_seen.append(str(msg))
+
+    monkeypatch.setattr(llm_args_mod.logger, "warning", _capture_warning)
+
+    spec_cfg = DecodingBaseConfig.from_dict({
+        "decoding_type":
+        "Eagle",
+        "max_draft_len":
+        3,
+        "speculative_model_dir":
+        "/path/to/draft/model",
+    })
+
+    TorchLlmArgs(model=llama_model_path, speculative_config=spec_cfg)
+
+    assert any("maps to Eagle3 in the PyTorch backend" in m
+               for m in warnings_seen)
+
+
+def test_decoding_type_eagle3_errors_on_tensorrt_backend():
+    spec_cfg = DecodingBaseConfig.from_dict({
+        "decoding_type":
+        "Eagle3",
+        "max_draft_len":
+        3,
+        "speculative_model_dir":
+        "/path/to/draft/model",
+    })
+    with pytest.raises(ValueError,
+                       match="only supported on the PyTorch backend"):
+        TrtLlmArgs(model=llama_model_path, speculative_config=spec_cfg)
+
+
 def check_defaults(py_config_cls, pybind_config_cls):
     py_config = py_config_cls()
     pybind_config = pybind_config_cls()
```