Skip to content

Commit ca82911

Browse files
authored
[None][fix] Fix MTP 2-model (#8115)
Signed-off-by: Mike Iovine <6158008+mikeiovine@users.noreply.github.com>
Signed-off-by: Mike Iovine <miovine@nvidia.com>
1 parent aaf2c3c commit ca82911

File tree

4 files changed

+29
-2
lines changed

4 files changed

+29
-2
lines changed

tensorrt_llm/_torch/speculative/eagle3.py

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,13 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
3939
# Reserve one more slot for the dummy request.
4040
slot_size = self.max_seq_len + 1
4141
self.slot_manager = SlotManager(slot_size)
42-
self.max_total_draft_tokens = config.max_total_draft_tokens
42+
# This class is reused by MTP_EAGLE
43+
from ...llmapi.llm_args import EagleDecodingConfig
44+
45+
if isinstance(config, EagleDecodingConfig):
46+
self.max_total_draft_tokens = config.max_total_draft_tokens
47+
else:
48+
self.max_total_draft_tokens = self.max_draft_len
4349

4450
# empty hidden states tensor
4551
max_num_tokens = min(max_num_tokens,
@@ -55,7 +61,9 @@ def __init__(self, config: "EagleDecodingConfig", dtype: torch.dtype,
5561
# whether the next draft forward is the first
5662
self.is_first_draft = True
5763
self.spec_tree_manager = None
58-
if config.eagle_choices is not None:
64+
65+
if isinstance(config,
66+
EagleDecodingConfig) and config.eagle_choices is not None:
5967
self.spec_tree_manager = SpecTreeManager(
6068
max_num_requests=self.max_num_requests,
6169
use_dynamic_tree=config.use_dynamic_tree,

tensorrt_llm/_torch/speculative/interface.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -67,6 +67,10 @@ def needs_kv_cache_rewind(self):
6767
) or self.is_ngram()
6868

6969
def support_overlap_scheduler(self):
70+
# TODO: fix accuracy issue
71+
if self.is_mtp_eagle():
72+
return False
73+
7074
return self.is_mtp_one_model() or self.is_eagle3_one_model(
7175
) or self.has_draft_model()
7276

tensorrt_llm/_torch/speculative/utils.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,20 @@ def get_spec_metadata(spec_config,
2828
max_num_requests=max_num_requests,
2929
mtp_hidden_states_manager=spec_resource_manager,
3030
)
31+
if spec_config.spec_dec_mode.is_mtp_eagle():
32+
return Eagle3SpecMetadata(
33+
max_draft_len=spec_config.max_draft_len,
34+
spec_dec_mode=spec_config.spec_dec_mode,
35+
max_num_requests=max_num_requests,
36+
num_layers=model_config.num_hidden_layers,
37+
hidden_size=model_config.hidden_size,
38+
max_num_tokens=max_num_tokens,
39+
dtype=model_config.torch_dtype,
40+
is_draft_model=is_draft_model,
41+
eagle3_resource_manager=spec_resource_manager,
42+
layers_to_capture=None,
43+
is_mtp_eagle=True,
44+
)
3145
if spec_config.spec_dec_mode.is_eagle3():
3246
return Eagle3SpecMetadata(
3347
max_draft_len=spec_config.max_draft_len,

tests/integration/test_lists/test-db/l0_b200.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,6 +55,7 @@ l0_b200:
5555
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-NVFP4-nvfp4-quantized/Meta-Llama-3.1-8B]
5656
- test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-FP8-llama-3.1-model/Llama-3.1-8B-Instruct-FP8]
5757
- test_e2e.py::test_ptp_quickstart_advanced_mtp[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
58+
- test_e2e.py::test_ptp_quickstart_advanced_mtp_eagle[DeepSeek-V3-Lite-BF16-DeepSeek-V3-Lite/bf16]
5859
- test_e2e.py::test_ptp_quickstart_advanced_mixed_precision
5960
- test_e2e.py::test_ptp_quickstart_advanced_eagle3[Llama-3.1-8b-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct-EAGLE3-LLaMA3.1-Instruct-8B]
6061
- test_e2e.py::test_ptp_quickstart_advanced_ngram[Llama-3.1-8B-Instruct-llama-3.1-model/Llama-3.1-8B-Instruct]

0 commit comments

Comments (0)