
Commit 8ecdeee

[refactor] Simplification of Speculative decoding configs - Part 2 (NVIDIA#5936)
Signed-off-by: wili-65535 <[email protected]>
Co-authored-by: wili-65535 <[email protected]>
1 parent bc2fb29 commit 8ecdeee

File tree

9 files changed: +60 -37 lines changed


tensorrt_llm/_torch/pyexecutor/_util.py

Lines changed: 2 additions & 2 deletions
@@ -18,7 +18,7 @@
 from tensorrt_llm.mapping import Mapping
 
 from ..model_config import ModelConfig
-from ..speculative import get_spec_decoder
+from ..speculative import get_num_extra_kv_tokens, get_spec_decoder
 from .config import PyTorchConfig
 from .config_utils import is_mla, is_nemotron_hybrid
 from .guided_decoder import GuidedDecoder
@@ -164,7 +164,7 @@ def _get_token_num_for_estimation(self) -> int:
 
         if spec_cfg is not None:
             num_extra_tokens_per_seq += spec_cfg.max_draft_len
-            num_extra_tokens_per_seq += spec_cfg.num_extra_kv_tokens
+            num_extra_tokens_per_seq += get_num_extra_kv_tokens(spec_cfg)
         for req in self._dummy_reqs:
             num_req_tokens = len(req.input_token_ids) + num_extra_tokens_per_seq
             # Requests cannot share KV cache blocks. Round up to nearest integer multiple of block size.
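For intuition, the estimate above folds the speculative budget into every dummy request before rounding up to whole KV-cache blocks. A minimal stand-alone sketch of that arithmetic (the helper name, the tokens_per_block value, and the example numbers are illustrative assumptions, not code from this change):

import math

def estimate_request_tokens(prompt_len: int, max_draft_len: int,
                            num_extra_kv_tokens: int, tokens_per_block: int) -> int:
    # Per-sequence extras: draft tokens plus any extra KV tokens the decoding mode needs.
    num_extra_tokens_per_seq = max_draft_len + num_extra_kv_tokens
    num_req_tokens = prompt_len + num_extra_tokens_per_seq
    # Requests cannot share KV cache blocks, so round up to a whole block count.
    return math.ceil(num_req_tokens / tokens_per_block) * tokens_per_block

# e.g. a 100-token prompt, max_draft_len = 3, 2 extra KV tokens, 32 tokens per block:
print(estimate_request_tokens(100, 3, 2, 32))  # -> 128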

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 5 additions & 3 deletions
@@ -18,6 +18,8 @@
 from tensorrt_llm._torch.models.checkpoints.base_checkpoint_loader import \
     BaseCheckpointLoader
 from tensorrt_llm._torch.pyexecutor.sampler import SampleStateTensors
+from tensorrt_llm._torch.speculative import (
+    get_num_extra_kv_tokens, update_spec_config_from_model_config)
 from tensorrt_llm._torch.speculative.mtp import SampleStateTensorsMTP
 from tensorrt_llm._utils import (is_trace_enabled, nvtx_range, release_gc,
                                  torch_dtype_to_str, trace_func)
@@ -353,7 +355,8 @@ def __init__(
 
         if self.is_spec_decode:
             self.spec_metadata = None
-            self.spec_config.update_from_model_config(self.model.config)
+            update_spec_config_from_model_config(self.spec_config,
+                                                 self.model.config)
             max_num_draft_tokens = self.spec_config.max_draft_len * batch_size
             self.draft_tokens_cuda = torch.empty((max_num_draft_tokens, ),
                                                  dtype=torch.int,
@@ -1442,8 +1445,7 @@ def previous_seq_slots_device():
         attn_metadata.kv_cache_params = KVCacheParams(
             use_cache=True,
             num_cached_tokens_per_seq=num_cached_tokens_per_seq,
-            num_extra_kv_tokens=0 if self.spec_config is None else
-            self.spec_config.num_extra_kv_tokens)
+            num_extra_kv_tokens=get_num_extra_kv_tokens(self.spec_config))
         attn_metadata.kv_cache_manager = kv_cache_manager
 
         attn_metadata.prepare()

tensorrt_llm/_torch/pyexecutor/py_executor_creator.py

Lines changed: 3 additions & 2 deletions
@@ -19,7 +19,8 @@
 
 from ..attention_backend.interface import AttentionRuntimeFeatures
 from ..distributed import MPIDist
-from ..speculative import get_spec_drafter, get_spec_resource_manager
+from ..speculative import (get_num_extra_kv_tokens, get_spec_drafter,
+                           get_spec_resource_manager)
 from ._util import (KvCacheCreator, _adjust_torch_mem_fraction,
                     create_py_executor_instance, instantiate_sampler, is_mla)
 from .config import PyTorchConfig
@@ -266,7 +267,7 @@ def create_py_executor(
            max_seq_len += spec_config.max_draft_len
 
    if spec_config is not None:
-        max_seq_len += spec_config.num_extra_kv_tokens
+        max_seq_len += get_num_extra_kv_tokens(spec_config)
        max_seq_len += spec_config.max_draft_len
 
    executor_config.max_seq_len = max_seq_len
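The same helper now also feeds the executor's sequence-length budget. As a rough worked example tracing only the "if spec_config is not None" block above, under assumed values (an EAGLE3 one-model config with max_draft_len = 3, so the helper returns 2, and a base max_seq_len of 4096):

max_seq_len = 4096
max_draft_len = 3
num_extra_kv_tokens = max_draft_len - 1   # what get_num_extra_kv_tokens returns for this mode
max_seq_len += num_extra_kv_tokens + max_draft_len
print(max_seq_len)                        # -> 4101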

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 3 additions & 1 deletion
@@ -176,7 +176,9 @@ def __init__(
         self.kv_factor = 1 if kv_cache_type == CacheTypeCpp.SELFKONLY else 2
         # Some speculative decoding methods need to use different kv lengths for the
         # draft/target layers. Add extra tokens to handle this issue.
-        self.num_extra_kv_tokens = 0 if spec_config is None else spec_config.num_extra_kv_tokens
+        # Import here to avoid circular imports
+        from ..speculative import get_num_extra_kv_tokens
+        self.num_extra_kv_tokens = get_num_extra_kv_tokens(spec_config)
         self.event_buffer_max_size = kv_cache_config.event_buffer_max_size
         self.max_num_tokens = max_num_tokens
 
tensorrt_llm/_torch/speculative/__init__.py

Lines changed: 6 additions & 3 deletions
@@ -2,9 +2,10 @@
 from .interface import SpecMetadata
 from .mtp import MTPEagleWorker, MTPSpecMetadata, MTPWorker
 from .ngram import NGramDrafter, NGramPoolManager
-from .utils import (get_num_spec_layers, get_spec_decoder, get_spec_drafter,
-                    get_spec_metadata, get_spec_resource_manager,
-                    get_spec_worker)
+from .utils import (get_num_extra_kv_tokens, get_num_spec_layers,
+                    get_spec_decoder, get_spec_drafter, get_spec_metadata,
+                    get_spec_resource_manager, get_spec_worker,
+                    update_spec_config_from_model_config)
 
 __all__ = [
     "Eagle3SpecMetadata",
@@ -14,10 +15,12 @@
     "NGramDrafter",
     "NGramPoolManager",
     "SpecMetadata",
+    "get_num_extra_kv_tokens",
     "get_num_spec_layers",
     "get_spec_decoder",
     "get_spec_drafter",
     "get_spec_metadata",
     "get_spec_resource_manager",
     "get_spec_worker",
+    "update_spec_config_from_model_config",
 ]

tensorrt_llm/_torch/speculative/model_drafter.py

Lines changed: 18 additions & 2 deletions
@@ -3,6 +3,8 @@
 import traceback
 from typing import TYPE_CHECKING, Any, Dict, List, Optional, Tuple
 
+import torch
+
 from tensorrt_llm._utils import nvtx_range
 from tensorrt_llm.logger import logger
 
@@ -15,6 +17,20 @@
 
 if TYPE_CHECKING:
     from ..pyexecutor.model_engine import ModelEngine
+from .interface import SpeculativeDecodingMode
+
+
+# Place the tool function here to avoid circular import
+def get_draft_model_prompt(spec_dec_mode: SpeculativeDecodingMode,
+                           input_tokens: torch.Tensor) -> torch.Tensor:
+    """
+    Can be used to modify prompts for speculative algorithms that need to update tokens
+    before drafting.
+    """
+    if spec_dec_mode.is_eagle3():
+        # EAGLE3 always throws away the first token when processing draft inputs
+        return input_tokens[1:]
+    return input_tokens
 
 
 class ModelDrafter(Drafter):
@@ -113,8 +129,8 @@ def _create_draft_request_for_request(
         """Create a draft request based on the original request state."""
         num_draft_tokens, num_accepted_tokens = self._initialize_draft_tokens(
             request)
-        input_tokens = self.spec_config.get_draft_model_prompt(
-            request.get_tokens()[0])
+        input_tokens = get_draft_model_prompt(self.spec_config.spec_dec_mode,
+                                              request.get_tokens()[0])
 
         # First time seeing this request - context request
         if request.max_beam_num_tokens - 1 == request.py_prompt_len:
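Since the EAGLE3-specific prompt handling now lives in a module-level helper instead of on the config object, its contract can be exercised in isolation. A self-contained sketch (the stub mode classes below are duck-typed stand-ins for SpeculativeDecodingMode, not the real enum; the token ids are made up; the import path follows the diff above):

import torch

from tensorrt_llm._torch.speculative.model_drafter import get_draft_model_prompt

class _Eagle3Stub:
    # Stand-in exposing only the method the helper consults.
    def is_eagle3(self) -> bool:
        return True

class _OtherStub:
    def is_eagle3(self) -> bool:
        return False

tokens = torch.tensor([1, 15043, 3186, 29991])  # made-up token ids
# EAGLE3 drops the first token before drafting; other modes pass the prompt through.
assert get_draft_model_prompt(_Eagle3Stub(), tokens).tolist() == [15043, 3186, 29991]
assert get_draft_model_prompt(_OtherStub(), tokens).tolist() == [1, 15043, 3186, 29991]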

tensorrt_llm/_torch/speculative/utils.py

Lines changed: 21 additions & 0 deletions
@@ -153,3 +153,24 @@ def get_spec_worker(spec_config, mapping):
     if spec_config.spec_dec_mode.is_eagle3_one_model():
         return Eagle3OneModelWorker(spec_config, mapping)
     return None
+
+
+def get_num_extra_kv_tokens(spec_config):
+    """
+    Implementation detail for one model implementations of speculative decoding. Extra
+    KV cache tokens are required.
+    """
+    if spec_config is None:
+        return 0
+    if spec_config.spec_dec_mode.is_eagle3_one_model(
+    ) or spec_config.spec_dec_mode.is_mtp_eagle():
+        return spec_config.max_draft_len - 1
+    return 0
+
+
+def update_spec_config_from_model_config(spec_config, model_config):
+    if spec_config.spec_dec_mode.is_mtp():
+        # Use `max_draft_len` for several low-level APIs. TODO: Remove this after distinguishing them.
+        spec_config.max_draft_len = spec_config.num_nextn_predict_layers
+        # Use `num_nextn_predict_layers_from_model_config` to decide decoding mode MTP / MTP_EAGLE.
+        spec_config.num_nextn_predict_layers_from_model_config = model_config.num_nextn_predict_layers
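Call sites now derive the extra-KV-token count from the decoding mode instead of reading a stored config field. A small usage sketch (the SimpleNamespace objects below are duck-typed stand-ins, not real DecodingBaseConfig instances):

from types import SimpleNamespace

from tensorrt_llm._torch.speculative import get_num_extra_kv_tokens

one_model_mode = SimpleNamespace(is_eagle3_one_model=lambda: True,
                                 is_mtp_eagle=lambda: False)
two_model_mode = SimpleNamespace(is_eagle3_one_model=lambda: False,
                                 is_mtp_eagle=lambda: False)

# No speculation configured: no extra KV tokens are reserved.
assert get_num_extra_kv_tokens(None) == 0
# One-model modes (EAGLE3 one-model, MTP-Eagle) reserve max_draft_len - 1 tokens.
assert get_num_extra_kv_tokens(
    SimpleNamespace(spec_dec_mode=one_model_mode, max_draft_len=3)) == 2
# Other modes reserve none.
assert get_num_extra_kv_tokens(
    SimpleNamespace(spec_dec_mode=two_model_mode, max_draft_len=3)) == 0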

tensorrt_llm/llmapi/llm_args.py

Lines changed: 1 addition & 22 deletions
@@ -248,7 +248,6 @@ class _ModelFormatKind(Enum):
 class DecodingBaseConfig(BaseModel):
     max_draft_len: Optional[int] = None
     speculative_model_dir: Optional[Union[str, Path]] = None
-    num_extra_kv_tokens: int = 0
 
     @classmethod
     def from_dict(cls, data: dict):
@@ -295,13 +294,6 @@ def spec_dec_mode(self):
         return TorchSpeculativeDecodingMode.from_string(
             self.decoding_type.upper())
 
-    def update_from_model_config(self, model_config):
-        pass
-
-    def get_draft_model_prompt(self,
-                               input_tokens: torch.Tensor) -> torch.Tensor:
-        return input_tokens
-
 
 class MedusaDecodingConfig(DecodingBaseConfig):
     medusa_choices: Optional[List[List[int]]] = None
@@ -345,13 +337,6 @@ def spec_dec_mode(self):
             return TorchSpeculativeDecodingMode.EAGLE3_ONE_MODEL
         return TorchSpeculativeDecodingMode.EAGLE3
 
-    def get_draft_model_prompt(self,
-                               input_tokens: torch.Tensor) -> torch.Tensor:
-        """
-        Eagle3 always throws away the first token when processing draft inputs
-        """
-        return input_tokens[1:]
-
 
 class UserProvidedDecodingConfig(DecodingBaseConfig):
     # Cannot use real type annotations due to circular imports
@@ -448,11 +433,6 @@ def spec_dec_mode(self):
             return TorchSpeculativeDecodingMode.MTP_EAGLE
         return TorchSpeculativeDecodingMode.MTP
 
-    def update_from_model_config(self, model_config):
-        assert self.num_nextn_predict_layers > 0
-        if model_config.num_nextn_predict_layers == 1 and not self.use_mtp_vanilla:
-            self.num_extra_kv_tokens = self.num_nextn_predict_layers - 1
-
 
 class PybindMirror(ABC):
     ''' A class containing the utilities for mirroring Python classes to
@@ -1468,8 +1448,6 @@ def validate_speculative_config(self):
             assert self.speculative_config.speculative_model_dir is not None, "Path to EAGLE3 weights must be specified."
             self.build_config.max_draft_len = self.speculative_config.max_draft_len
             self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.EAGLE
-            if self.speculative_config.eagle3_one_model:
-                self.speculative_config.num_extra_kv_tokens = self.speculative_config.max_draft_len - 1
             if self.backend not in ['pytorch', '_autodeploy']:
                 eagle_config = _EagleConfig(
                     self.speculative_config.eagle_choices,
@@ -1490,6 +1468,7 @@ def validate_speculative_config(self):
         elif isinstance(self.speculative_config, DraftTargetDecodingConfig):
             assert self.backend in ['pytorch']
             assert self.speculative_config.max_draft_len > 0
+            assert self.speculative_config.speculative_model_dir is not None, "Path to draft model must be specified."
             self.build_config.speculative_decoding_mode = SpeculativeDecodingMode.DRAFT_TOKENS_EXTERNAL
             self.build_config.max_draft_len = self.speculative_config.max_draft_len
 
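On the LLM API side, the net effect is that num_extra_kv_tokens disappears from the user-facing configs and is recomputed where needed. A rough sketch of the resulting flow, illustrative only (the keyword values and the path are placeholders; only fields visible in this diff are used, and the construction is assumed to validate):

from tensorrt_llm._torch.speculative import get_num_extra_kv_tokens
from tensorrt_llm.llmapi.llm_args import EagleDecodingConfig

spec_config = EagleDecodingConfig(
    max_draft_len=3,
    speculative_model_dir="/path/to/eagle3/draft",  # placeholder path
    eagle3_one_model=True)

# The config no longer carries num_extra_kv_tokens; the count is derived on demand
# from the decoding mode (max_draft_len - 1 = 2 for the one-model mode assumed here).
print(get_num_extra_kv_tokens(spec_config))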

tests/unittest/_torch/speculative/test_draft_target.py

Lines changed: 1 addition & 2 deletions
@@ -49,8 +49,7 @@ def test_llama_draft_target(use_cuda_graph: bool, attn_backend: str):
     )
 
     prompts = [
-        #"The capital of France is", # Waive this prompt to avoid a flaky error, https://nvbugspro.nvidia.com/bug/5374319
-        "The capital of Germany is",
+        "The capital of France is",
         "The president of the United States is",
     ]
     sampling_params = SamplingParams(max_tokens=32)
