
Commit 2c063a2

fix eagle3 fp8 chunk

Signed-off-by: Dylan Chen <[email protected]>
Parent: d3059db

4 files changed: 57 additions, 31 deletions

cpp/tensorrt_llm/kernels/xqaDispatcher.cpp

Lines changed: 10 additions & 0 deletions
@@ -243,6 +243,16 @@ bool XqaDispatcher::shouldUse(XQAParams const& params)
 
         return true;
     }
+
+    if (params.kv_cache_data_type == DATA_TYPE_E4M3
+        && (params.data_type == DATA_TYPE_BF16 || params.data_type == DATA_TYPE_FP16))
+    {
+        TLLM_LOG_DEBUG(
+            "XQA kernels are selected in the generation phase for fp16/bf16 input and e4m3 kv cache because MMHA does "
+            "not support this combination.");
+        return true;
+    }
+
     return mDecoderXqaRunner->shouldUse(params, /*forConfigurePlugin=*/false);
 }
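The added guard routes fp16/bf16 activations paired with an e4m3 (FP8) KV cache to the XQA kernels in the generation phase, since MMHA does not handle that combination. Below is a minimal Python sketch of the same selection rule; the DataType enum and XQAParams dataclass are hypothetical stand-ins for the C++ types.

from dataclasses import dataclass
from enum import Enum, auto


class DataType(Enum):  # hypothetical stand-in for the C++ data-type enum
    FP16 = auto()
    BF16 = auto()
    E4M3 = auto()  # FP8 KV cache


@dataclass
class XQAParams:  # hypothetical subset of the real XQAParams struct
    data_type: DataType
    kv_cache_data_type: DataType


def should_use_xqa(params: XQAParams) -> bool:
    # fp16/bf16 activations with an e4m3 KV cache must use XQA in the
    # generation phase, because MMHA does not support this combination.
    if (params.kv_cache_data_type is DataType.E4M3
            and params.data_type in (DataType.FP16, DataType.BF16)):
        return True
    # Otherwise fall back to the regular dispatch heuristics (not modeled here).
    return False


print(should_use_xqa(XQAParams(DataType.BF16, DataType.E4M3)))  # True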

tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 5 additions & 0 deletions
@@ -1211,6 +1211,7 @@ def forward(
         output_sf: Optional[torch.Tensor] = None,
         attention_sinks: Optional[torch.Tensor] = None,
         chunked_prefill_buffer_batch_size: int = 1,
+        fp8_fmha_for_eagle3: bool = False,
         **kwargs,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, Optional[torch.Tensor]]]:
         assert isinstance(
@@ -1293,6 +1294,10 @@ def forward(
         if use_nvfp4_output:
             # Use UINT8 as the container dtype for NVFP4.
             out_dtype = torch.uint8
+        # elif fp8_fmha_for_eagle3:
+        elif self.has_fp8_kv_cache and not self.has_fp8_qdq and out_scale is not None:
+            # Force to use FP8 FMHA for (eagle3 + FP8 target model + BF16/FP16 draft model) in draft layers
+            out_dtype = torch.float8_e4m3fn
         elif (self.has_fp8_qdq or self.has_nvfp4 or self.has_fp8_block_wise
               or self.has_fp8_rowwise
               or self.has_w4a8_nvfp4_fp8) and (self.has_fp8_kv_cache
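The new branch slots in between the NVFP4 output path and the existing FP8/NVFP4 quantization checks: when the layer has an FP8 KV cache, no FP8 QDQ scales, and an out_scale is supplied, the FMHA output dtype is forced to e4m3. A simplified sketch of that selection order follows; the function name and default dtype are illustrative, not the backend's API.

import torch


def pick_out_dtype(use_nvfp4_output: bool, has_fp8_kv_cache: bool,
                   has_fp8_qdq: bool, out_scale,
                   default_dtype=torch.bfloat16):
    # Hypothetical, simplified version of the out_dtype selection in forward().
    if use_nvfp4_output:
        # UINT8 is the container dtype for NVFP4 output.
        return torch.uint8
    if has_fp8_kv_cache and not has_fp8_qdq and out_scale is not None:
        # BF16/FP16 eagle3 draft layers on top of an FP8 target: force FP8 FMHA.
        return torch.float8_e4m3fn
    return default_dtype


print(pick_out_dtype(False, True, False, torch.tensor([1.0])))  # torch.float8_e4m3fn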

tensorrt_llm/_torch/modules/attention.py

Lines changed: 14 additions & 1 deletion
@@ -404,6 +404,14 @@ def _attn_impl(
         if mrope_position_deltas is not None:
             mrope_config["mrope_position_deltas"] = mrope_position_deltas
 
+        # Be forced to use FP8 FMHA for BF16/FP16 model with FP8 KV cache (e.g. eagle3 + FP8 target model + BF16/FP16 draft model)
+        forced_to_fp8_fmha = not self.has_quant_scale and self.quant_config is not None and self.quant_config.layer_quant_mode.has_fp8_kv_cache(
+        ) and attn_metadata.num_contexts != 0
+        if forced_to_fp8_fmha:
+            out_scale = torch.tensor([1.0],
+                                     dtype=torch.float32,
+                                     device=q.device)
+
         attn_output = self.attn.forward(
             q,
             k,
@@ -425,7 +433,12 @@ def _attn_impl(
             assert len(
                 attn_output
             ) == 2, "attn_output should be a tuple of (output, output_sf)"
-            return attn_output[0], attn_output[1]
+            if forced_to_fp8_fmha:
+                return attn_output[0].to(q.dtype), attn_output[1]
+            else:
+                return attn_output[0], attn_output[1]
+        if forced_to_fp8_fmha:
+            return attn_output.to(q.dtype), None
         return attn_output, None
 
     def forward_impl(
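On the module side, forced_to_fp8_fmha is derived from the layer's quant config (FP8 KV cache but no activation quant scales), a unit out_scale is passed so the kernel can write e4m3, and the e4m3 output is cast back to the query dtype before it leaves _attn_impl. A small, self-contained sketch of that cast-back step; the helper name and shapes are hypothetical.

import torch


def cast_back_if_forced(attn_output, q_dtype, forced_to_fp8_fmha: bool):
    # Hypothetical helper mirroring the return path of _attn_impl: when FP8
    # FMHA was forced, the e4m3 kernel output is cast back to the original
    # activation dtype before it is returned.
    if isinstance(attn_output, tuple):
        out, out_sf = attn_output
        return (out.to(q_dtype), out_sf) if forced_to_fp8_fmha else (out, out_sf)
    return (attn_output.to(q_dtype), None) if forced_to_fp8_fmha else (attn_output, None)


q = torch.randn(4, 8, dtype=torch.bfloat16)
fp8_out = torch.randn(4, 8).to(torch.float8_e4m3fn)  # stand-in for the kernel output
out, _ = cast_back_if_forced(fp8_out, q.dtype, forced_to_fp8_fmha=True)
assert out.dtype == torch.bfloat16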

tests/unittest/_torch/speculative/test_eagle3.py

Lines changed: 28 additions & 30 deletions
@@ -2,7 +2,6 @@
 import os
 import sys
 import tempfile
-import unittest
 from pathlib import Path
 from unittest.mock import patch
 
@@ -24,40 +23,32 @@ def enforce_single_worker(monkeypatch):
 
 
 @pytest.mark.parametrize(
-    "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp",
+    "use_cuda_graph,attn_backend,disable_overlap_scheduler,enable_block_reuse,use_one_model,enable_chunked_prefill,use_chain_drafter,multi_batch,attention_dp,fp8_target",
     [
-        [True, "TRTLLM", True, False, False, False, True, False, False],
-        [True, "TRTLLM", True, False, False, False, False, False, False],
-        [False, "TRTLLM", True, False, False, False, True, False, False],
-        [False, "TRTLLM", True, False, False, False, False, False, False],
-        [True, "FLASHINFER", True, False, False, False, True, False, False],
-        [False, "FLASHINFER", True, False, False, False, True, False, False],
-        [False, "TRTLLM", False, True, True, False, True, False, False],
-        [True, "TRTLLM", False, True, True, False, True, False, False],
-        [True, "TRTLLM", True, False, True, True, True, False, False],
-        [True, "TRTLLM", True, False, True, False, True, False, False],
+        [True, "TRTLLM", True, False, False, False, True, False, False, False],
+        [True, "TRTLLM", True, False, False, False, False, False, False, False],
+        [False, "TRTLLM", True, False, False, False, True, False, False, False],
+        [False, "TRTLLM", True, False, False, False, False, False, False, False],
+        [True, "FLASHINFER", True, False, False, False, True, False, False, False],
+        [False, "FLASHINFER", True, False, False, False, True, False, False, False],
+        [False, "TRTLLM", False, True, True, False, True, False, False, False],
+        [True, "TRTLLM", False, True, True, False, True, False, False, False],
+        [True, "TRTLLM", True, False, True, True, True, False, False, False],
+        [True, "TRTLLM", True, False, True, False, True, False, False, False],
         # TODO: nvbugs/5461761
-        # [True, "TRTLLM", True, False, False, True, True, False],
-        [True, "TRTLLM", False, False, False, False, True, False, False],
-        [False, "TRTLLM", False, False, False, False, True, False, False],
-        [True, "TRTLLM", False, False, False, False, False, True, False],
-        [True, "TRTLLM", False, False, False, False, False, True, True],
-        [False, "TRTLLM", False, False, False, False, False, True, False],
-        [True, "TRTLLM", False, False, False, False, True, True, False],
-        [False, "TRTLLM", False, False, False, False, True, True, False],
-        [True, "TRTLLM", False, False, False, False, False, False, False],
-        [False, "TRTLLM", False, False, False, False, False, False, False],
-        [True, "TRTLLM", False, False, False, True, True, False, False],
-        [True, "TRTLLM", False, False, False, True, False, False, False],
-        [True, "FLASHINFER", False, False, False, False, True, False, False],
-        [False, "FLASHINFER", False, False, False, False, True, False, False],
+        # [True, "TRTLLM", True, False, False, True, True, False, False, False],
+        [True, "TRTLLM", False, False, False, False, True, False, False, False],
+        [False, "TRTLLM", False, False, False, False, True, False, False, False],
+        [True, "TRTLLM", False, False, False, False, False, True, False, False],
+        [True, "TRTLLM", False, False, False, False, False, True, True, False],
+        [True, "TRTLLM", False, True, True, True, True, True, True, True],
     ])
 @pytest.mark.high_cuda_memory
 def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
                       disable_overlap_scheduler: bool, enable_block_reuse: bool,
                       use_one_model: bool, enable_chunked_prefill: bool,
                       use_chain_drafter: bool, multi_batch: bool,
-                      attention_dp: bool, request):
+                      attention_dp: bool, fp8_target: bool, request):
     # Use enforce_single_worker fixture only when use_chain_drafter is False.
     # Otherwise, we can't modify the returned value of _get_allow_chain_drafter in multiprocessing.
     if not use_chain_drafter:
@@ -71,6 +62,8 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     models_path = llm_models_root()
     eagle_model_dir = f"{models_path}/EAGLE3-LLaMA3.1-Instruct-8B"
     target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct"
+    if fp8_target:
+        target_model_dir = f"{models_path}/llama-3.1-model/Llama-3.1-8B-Instruct-FP8"
 
     # Mock _get_allow_chain_drafter to return False when use_chain_drafter is False
     if not use_chain_drafter:
@@ -89,6 +82,8 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     max_draft_len = 4
     kv_cache_config = KvCacheConfig(enable_block_reuse=enable_block_reuse,
                                     max_tokens=8192)
+    if fp8_target:
+        kv_cache_config.dtype = 'fp8'
     cuda_graph_config = CudaGraphConfig(
         batch_sizes=[i for i in range(1, max_batch_size +
                                       1)]) if use_cuda_graph else None
@@ -169,9 +164,11 @@ def test_llama_eagle3(use_cuda_graph: bool, attn_backend: str,
     generated_text_ref = [result.outputs[0].text for result in results_ref]
     llm_ref.shutdown()
 
-    for text_spec, text_ref in zip(generated_text_spec, generated_text_ref):
-        # The spec decode algorithm currently guarantees identical results
-        assert text_spec == text_ref
+    if not fp8_target:
+        for text_spec, text_ref in zip(generated_text_spec,
+                                       generated_text_ref):
+            # The spec decode algorithm currently guarantees identical results
+            assert text_spec == text_ref
 
 
 def test_deepseek_eagle3():
@@ -377,6 +374,7 @@ def test_multi_eagle3(use_one_model: bool):
         pass
 
 
+<<<<<<< HEAD
 @pytest.mark.parametrize("disable_overlap_scheduler", [True, False])
 def test_eagle3_cuda_graph_padding(disable_overlap_scheduler: bool):
     """Test CUDA graph padding with 3 requests and max_batch_size=4.
