@@ -12,10 +12,11 @@
 import copy
 from collections import defaultdict
 from dataclasses import dataclass
-from types import SimpleNamespace
+from types import MethodType, SimpleNamespace
 from typing import Dict, List, Optional, Tuple

 import torch
+import torch.nn.functional as F
 from strenum import StrEnum
 from torch._prims_common import DeviceLikeType

@@ -30,9 +31,11 @@
 from tensorrt_llm._torch.pyexecutor.py_executor_creator import get_guided_decoding_config
 from tensorrt_llm._torch.pyexecutor.seq_slot_manager import SeqSlotManager
 from tensorrt_llm._torch.speculative import get_spec_drafter
+from tensorrt_llm._torch.speculative.eagle3 import Eagle3ResourceManager
 from tensorrt_llm._utils import nvtx_range
 from tensorrt_llm.llmapi.llm_args import (
     ContextChunkingPolicy,
+    EagleDecodingConfig,
     LoadFormat,
     SamplerType,
     SpeculativeConfig,
@@ -51,6 +54,7 @@
 from ...pyexecutor.scheduler import (
     BindCapacityScheduler,
     BindMicroBatchScheduler,
+    RequestList,
     ScheduledRequests,
     SimpleScheduler,
 )
@@ -107,6 +111,90 @@ def calculate_max_num_blocks(
         return self.num_blocks, 0


+class ADHiddenStateManager(Eagle3ResourceManager):
+    def __init__(
+        self,
+        cache_seq_interface: CachedSequenceInterface,
+        config: EagleDecodingConfig,
+        max_num_requests: int,
+        max_seq_len: int,
+        max_num_tokens: int,
+    ):
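+        # Infer dtype and hidden size from the exported graph's hidden-state cache buffer.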
+        hidden_state_buffer = self._get_hidden_state_buffers(cache_seq_interface)[0]
+        dtype = hidden_state_buffer.dtype
+        hidden_size = hidden_state_buffer.shape[1]
+
+        super().__init__(config, dtype, hidden_size, max_num_requests, max_seq_len, max_num_tokens)
+
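+        # Device-side scratch buffer of flat write indices; refilled each iteration by
+        # prepare_hidden_states_capture().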
+        self.hidden_state_write_indices: torch.Tensor = torch.empty(
+            max_num_tokens, dtype=torch.long, device="cuda"
+        )
+
+    def _get_hidden_state_buffers(
+        self, cache_seq_interface: CachedSequenceInterface
+    ) -> List[torch.Tensor]:
+        hidden_state_buffers = []
+        for name, tensor in cache_seq_interface.named_args.items():
+            if "hidden_states_cache" in name:
+                hidden_state_buffers.append(tensor)
+
+        if not hidden_state_buffers:
+            raise ValueError(
+                "No hidden_state_buffers found in cache_seq_interface. Check if we are actually running Eagle3."
+            )
+        return hidden_state_buffers
+
+    def prepare_hidden_states_capture(
+        self, ordered_requests: RequestList, cache_seq_interface: CachedSequenceInterface
+    ) -> None:
+        """Prepare hidden-state capture by computing the indices at which the hidden states will be written."""
+        seq_lens = cache_seq_interface.info.seq_len
+        num_tokens = sum(seq_lens)
+
+        start_idx = 0
+        hidden_states_write_indices = []
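+        # Reserve at least max_total_draft_tokens + 1 rows per slot so hidden states written
+        # later during drafting still fit in the same region.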
+        for request, seq_len in zip(ordered_requests, seq_lens):
+            request_id = request.request_id
+            slot_id = self.slot_manager.get_slot(request_id)
+            self.start_indices[slot_id] = start_idx
+            hidden_states_write_indices.extend(range(start_idx, start_idx + seq_len))
+            start_idx += max(seq_len, self.max_total_draft_tokens + 1)
+            assert start_idx < self.hidden_states.shape[0], (
+                f"start_idx {start_idx} exceeds hidden_states capacity {self.hidden_states.shape[0]}"
+            )
+
+        if len(hidden_states_write_indices) != num_tokens:
+            raise ValueError(
+                f"len(hidden_states_write_indices) ({len(hidden_states_write_indices)}) != "
+                f"num_tokens ({num_tokens}). Check whether ordered_requests matches up with seq_lens."
+            )
+
+        hidden_state_write_indices_host = torch.tensor(
+            hidden_states_write_indices, dtype=torch.long
+        )
+
+        self.hidden_state_write_indices[:num_tokens].copy_(
+            hidden_state_write_indices_host, non_blocking=True
+        )
+
+    def capture_hidden_states(self, cache_seq_interface: CachedSequenceInterface) -> None:
+        """Capture the configured hidden states written by the model, in a format the
+        draft model can consume.
+        """
+        full_hidden_states = self._get_hidden_state_buffers(cache_seq_interface)
+        if not full_hidden_states:
+            return
+
+        num_tokens = sum(cache_seq_interface.info.seq_len)
+
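+        # Eagle3 can capture hidden states from several layers; concatenate the per-layer
+        # buffers along the hidden dimension.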
+        hidden_states = [hidden_state[:num_tokens] for hidden_state in full_hidden_states]
+        hidden_states = torch.cat(hidden_states, dim=1).to(dtype=self.dtype)
+
+        token_idx = self.hidden_state_write_indices[:num_tokens]
+        self.hidden_states[:, : hidden_states.shape[1]].index_copy_(0, token_idx, hidden_states)
+
+
+
 def construct_draft_llm_args(
     ad_config: LlmArgs,
 ) -> TorchLlmArgs:
@@ -331,6 +419,10 @@ def _prepare_inputs(
         kv_cache_manager = resource_manager.get_resource_manager(
             ResourceManagerType.KV_CACHE_MANAGER
         )
+        # resource manager for hidden state capture
+        spec_resource_manager = resource_manager.get_resource_manager(
+            ResourceManagerType.SPEC_RESOURCE_MANAGER
+        )

         # requests in order of context, generate
         context_requests = scheduled_requests.context_requests
@@ -341,6 +433,7 @@ def _prepare_inputs(
             r for r in scheduled_requests.generation_requests if get_draft_token_length(r) == 0
         ]
         gen_requests = extend_requests + generation_requests
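+        # The capture path assumes this ordering matches cache_seq_interface.info.seq_len
+        # positionally (see prepare_hidden_states_capture).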
+        ordered_requests = context_requests + gen_requests
         # info to be extracted
         input_ids: List[List[int]] = []
         input_pos: List[int] = []
@@ -481,17 +574,32 @@ def _build_input_ids(request) -> Tuple[List[int], List[int]]:
             scatter_ref=dummy_token,
         )

+        if spec_resource_manager is not None and isinstance(
+            spec_resource_manager, ADHiddenStateManager
+        ):
+            spec_resource_manager.prepare_hidden_states_capture(
+                ordered_requests, self.cache_seq_interface
+            )
+
         self.iter_states["num_ctx_requests"] = num_ctx_requests
         self.iter_states["num_ctx_tokens"] = num_ctx_tokens
         # TODO: handle extend requests and draft requests for specdec
         self.iter_states["num_generation_tokens"] = num_generation_tokens
         return last_logit_only

     @nvtx_range("ad_compute_logits")
-    def _compute_logits(self) -> List[torch.Tensor]:
+    def _compute_logits(self, resource_manager: ResourceManager) -> List[torch.Tensor]:
         # run the model
         logits: torch.Tensor = self.model(**self.cache_seq_interface.named_args)[0]

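+        # Capture hidden states right after the target forward pass so the draft model
+        # sees this iteration's activations.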
+        spec_resource_manager = resource_manager.get_resource_manager(
+            ResourceManagerType.SPEC_RESOURCE_MANAGER
+        )
+        if spec_resource_manager is not None and isinstance(
+            spec_resource_manager, ADHiddenStateManager
+        ):
+            spec_resource_manager.capture_hidden_states(self.cache_seq_interface)
+
         # TRTLLMSampler expects float32 logits. PyTorchModelEngine always casts to float32 regardless.
         logits = logits.float()

@@ -518,7 +626,7 @@ def forward(
         self.iter_counter += 1

         # compute all logits
-        logits = self._compute_logits()
+        logits = self._compute_logits(resource_manager)

         # gather+cat logits
         logits_flat = torch.cat(
@@ -529,8 +637,30 @@ def forward(
         return {"logits": logits_flat}


+def share_embedding_weights(
+    target_model_engine: "ADEngine", draft_model_engine: PyTorchModelEngine
+):
+    # This function is needed for Eagle and other speculative decoding methods that copy the
+    # embed_tokens submodule from the target model. It is not needed for MTP and other
+    # speculative decoding methods that use the draft model engine directly.
+
+    submodule = target_model_engine.model.model.embed_tokens
+
+    world_size = mpi_world_size()
+    assert world_size <= 1, f"This code assumes tp<=1. World size: {world_size}"
+
+    # Note: this simple forward implementation assumes tp=1.
+    # TODO(govind): Handle the tp>1 case.
+    def new_embedding_forward(self, input_ids):
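+        # Plain dense lookup; valid because tp == 1 is asserted above (no vocab sharding).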
+        return F.embedding(input_ids, self.weight)
+
+    submodule.forward = MethodType(new_embedding_forward, submodule)
+
+    draft_model_engine.load_weights_from_target_model(target_model_engine.model)
+
+
 def create_draft_model_engine_maybe(
-    ad_config: LlmArgs, engine, dist_mapping: Mapping, mpi_dist: MPIDist
+    ad_config: LlmArgs, target_engine: ADEngine, dist_mapping: Mapping, mpi_dist: MPIDist
 ) -> Optional[PyTorchModelEngine]:
     """Create a draft model engine for speculative decoding.

@@ -558,14 +688,18 @@ def create_draft_model_engine_maybe(
         chunked_prefill=ad_config.enable_chunked_prefill,
         cache_reuse=kv_cache_config.enable_block_reuse,
         has_speculative_draft_tokens=has_spec_drafter,
-        chunk_size=engine.llm_args.max_num_tokens,
+        chunk_size=target_engine.llm_args.max_num_tokens,
     )

     # Construct TorchLlmArgs for the draft model
     draft_llm_args = construct_draft_llm_args(
         ad_config=ad_config,
     )

+    # The chain drafter is not currently supported for AutoDeploy.
+    # TODO(govind): Add support when optimizing 2-model spec dec performance.
+    drafting_loop_wrapper = None
+
     draft_model_engine = PyTorchModelEngine(
         model_path=draft_spec_config.speculative_model_dir,
         llm_args=draft_llm_args,
@@ -574,7 +708,11 @@ def create_draft_model_engine_maybe(
         dist=mpi_dist,
         spec_config=draft_spec_config,
         is_draft_model=True,
-        drafting_loop_wrapper=None,
+        drafting_loop_wrapper=drafting_loop_wrapper,
+    )
+
+    share_embedding_weights(
+        target_model_engine=target_engine, draft_model_engine=draft_model_engine
     )

     draft_model_engine.kv_cache_manager_key = ResourceManagerType.DRAFT_KV_CACHE_MANAGER
@@ -668,21 +806,32 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
     engine = ADEngine.build_from_config(ad_config=ad_config)

     spec_config = ad_config.speculative_config
-    if spec_config is not None and not spec_config.spec_dec_mode.is_draft_target():
+    if spec_config is not None and not (
+        spec_config.spec_dec_mode.is_draft_target() or spec_config.spec_dec_mode.is_eagle3()
+    ):
         raise ValueError(
-            "Currently, AutoDeploy only supports speculative decoding in draft target mode."
+            "Currently, AutoDeploy only supports speculative decoding in draft-target or Eagle3 mode."
         )

     if spec_config is not None and ad_config.guided_decoding_backend is not None:
         raise ValueError(
             "Guided decoding is not currently supported for speculative decoding in AutoDeploy."
         )

-    # Speculative resource manager not needed for DraftTargetDecoding.
-    spec_resource_manager = None
-
     draft_model_engine = create_draft_model_engine_maybe(
-        ad_config=ad_config, engine=engine, dist_mapping=dist_mapping, mpi_dist=mpi_dist
+        ad_config=ad_config, target_engine=engine, dist_mapping=dist_mapping, mpi_dist=mpi_dist
+    )
+
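+    # Eagle3 drafting consumes hidden states captured from the target model, so it needs
+    # the hidden-state manager; draft-target mode does not.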
+    spec_resource_manager = (
+        ADHiddenStateManager(
+            cache_seq_interface=engine.cache_seq_interface,
+            config=spec_config,
+            max_num_requests=ad_config.max_batch_size,
+            max_seq_len=engine.llm_args.max_seq_len,
+            max_num_tokens=engine.llm_args.max_num_tokens,
+        )
+        if isinstance(spec_config, EagleDecodingConfig)
+        else None
     )

     # check kvcache config for partial block reuse