@@ -17,35 +17,36 @@
 
 
 @pytest.mark.parametrize(
-    "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter",
+    "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,fp8_target",
     [
-        [True, "TRTLLM", True, False, False, False, True],
-        [True, "TRTLLM", True, False, False, False, False],
-        [False, "TRTLLM", True, False, False, False, True],
-        [False, "TRTLLM", True, False, False, False, False],
-        [True, "FLASHINFER", True, False, False, False, True],
-        [False, "FLASHINFER", True, False, False, False, True],
-        [False, "TRTLLM", False, True, True, False, True],
-        [True, "TRTLLM", False, True, True, False, True],
-        [True, "TRTLLM", True, False, True, True, True],
-        [True, "TRTLLM", True, False, True, False, True],
+        [True, "TRTLLM", True, False, False, False, True, False],
+        [True, "TRTLLM", True, False, False, False, False, False],
+        [False, "TRTLLM", True, False, False, False, True, False],
+        [False, "TRTLLM", True, False, False, False, False, False],
+        [True, "FLASHINFER", True, False, False, False, True, False],
+        [False, "FLASHINFER", True, False, False, False, True, False],
+        [False, "TRTLLM", False, True, True, False, True, False],
+        [True, "TRTLLM", False, True, True, False, True, False],
+        [True, "TRTLLM", True, False, True, True, True, False],
+        [True, "TRTLLM", True, False, True, False, True, False],
         # TODO: nvbugs/5461761
-        # [True, "TRTLLM", True, False, False, True, True],
-        [True, "TRTLLM", False, False, False, False, True],
-        [False, "TRTLLM", False, False, False, False, True],
-        [True, "TRTLLM", False, False, False, False, False],
-        [False, "TRTLLM", False, False, False, False, False],
-        [True, "TRTLLM", False, False, False, True, True],
-        [True, "TRTLLM", False, False, False, True, False],
+        # [True, "TRTLLM", True, False, False, True, True, False],
+        [True, "TRTLLM", False, False, False, False, True, False],
+        [False, "TRTLLM", False, False, False, False, True, False],
+        [True, "TRTLLM", False, False, False, False, False, False],
+        [False, "TRTLLM", False, False, False, False, False, False],
+        [True, "TRTLLM", False, False, False, True, True, False],
+        [True, "TRTLLM", False, False, False, True, False, False],
         # TODO: nvbugs/5522851
-        # [True, "FLASHINFER", False, False, False, False, True],
-        [False, "FLASHINFER", False, False, False, False, True],
+        # [True, "FLASHINFER", False, False, False, False, True, False],
+        [False, "FLASHINFER", False, False, False, False, True, False],
+        [True, "TRTLLM", True, True, True, True, True, True],
     ])
 @pytest.mark.high_cuda_memory
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool, enable_chunked_prefill: bool,
-                      use_chain_drafter: bool):
+                      use_chain_drafter: bool, fp8_target: bool):
     # Eagle3 one model works with overlap scheduler and block reuse.
     total_mem_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
     if total_mem_gb < 35:
@@ -54,13 +55,18 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     models_path = llm_models_root()
     eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
     target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+    kv_cache_dtype = 'auto'
+    if fp8_target:
+        target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
+        kv_cache_dtype = 'fp8'
 
     # bs > 1 gives non-deterministic when doing IFB. There are slight chances
     # that ref and spec does not match 100%
     max_batch_size = 1
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
-                                    max_tokens=8192)
+                                    max_tokens=8192,
+                                    dtype=kv_cache_dtype)
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[1]) if use_cuda_graph else None
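A quick orientation, since the diff only shows the setup half of the test: `fp8_target` swaps in the FP8 target checkpoint and flips the KV cache dtype to `fp8`, while the Eagle3 draft model is left unchanged. The sketch below shows how those values would plausibly flow into the `LLM` construction later in the test. It reuses the names visible in the hunks above; `EagleDecodingConfig`, `LLM`, and the exact keyword set are assumptions about the unshown remainder of the file, not part of this commit.

```python
# Sketch only -- EagleDecodingConfig/LLM and these keyword arguments are
# assumed from the tensorrt_llm llmapi; they are not shown in this diff.
spec_config = EagleDecodingConfig(
    max_draft_len=max_draft_len,            # 4 in the test body above
    speculative_model_dir=eagle_model_dir,  # draft model unchanged by fp8_target
    eagle3_one_model=use_one_model,
)
llm = LLM(
    model=target_model_dir,           # FP8 checkpoint when fp8_target is True
    kv_cache_config=kv_cache_config,  # carries dtype='fp8' on the FP8 path
    speculative_config=spec_config,
    max_batch_size=max_batch_size,
    cuda_graph_config=cuda_graph_config,
    disable_overlap_scheduler=disable_overlap_scheduler,
)
```

Note that only the final parametrize row enables `fp8_target`, and it does so with CUDA graphs, block reuse, one-model mode, chunked prefill, and the chain drafter all switched on (and the overlap scheduler disabled), so the FP8 path gets a single high-coverage configuration rather than a full cross product.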