Commit cd928a7
support trtllm-gen prefill fp4 output (#1360)
## 📌 Description

Support nvfp4 output for the prefill function call (not the wrapper API yet). The nvfp4 test won't pass until the trtllm-gen kernels are updated, since the current kernels have a bug that ignores v_scale. I tested locally and will update the kernels later.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent d8e7d6a commit cd928a7
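
As an illustration of the nvfp4 prefill output path this commit adds, here is a minimal sketch of calling `trtllm_batch_context_with_kv_cache` with `out_dtype="nvfp4"`. This is not code from the PR: the shapes, dtypes, scale values, and workspace size are illustrative placeholders, parameter names follow the docstring in this diff, and an fp8 query plus a GPU supported by the trtllm-gen kernels are assumed.

```python
import torch
from flashinfer.prefill import trtllm_batch_context_with_kv_cache

# Illustrative sizes only.
batch_size, page_size, pages_per_seq = 2, 16, 4
num_qo_heads, num_kv_heads, head_dim = 8, 8, 128
q_len_per_req = 4
num_tokens = batch_size * q_len_per_req
num_pages = batch_size * pages_per_seq

query = torch.randn(num_tokens, num_qo_heads, head_dim,
                    device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn)
kv_cache = torch.randn(num_pages, 2, num_kv_heads, page_size, head_dim,
                       device="cuda", dtype=torch.bfloat16).to(torch.float8_e4m3fn)
workspace = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda")
block_tables = torch.arange(num_pages, dtype=torch.int32,
                            device="cuda").view(batch_size, pages_per_seq)
seq_lens = torch.full((batch_size,), page_size * pages_per_seq,
                      dtype=torch.int32, device="cuda")
cum_seq_lens_q = torch.arange(0, num_tokens + 1, q_len_per_req,
                              dtype=torch.int32, device="cuda")
cum_seq_lens_kv = torch.cat(
    [torch.zeros(1, dtype=torch.int32, device="cuda"), seq_lens]
).cumsum(0).to(torch.int32)

out = trtllm_batch_context_with_kv_cache(
    query=query, kv_cache=kv_cache, workspace_buffer=workspace,
    block_tables=block_tables, seq_lens=seq_lens,
    max_q_len=q_len_per_req, max_kv_len=int(seq_lens.max()),
    bmm1_scale=1.0 / head_dim**0.5, bmm2_scale=1.0,
    batch_size=batch_size, cum_seq_lens_q=cum_seq_lens_q,
    cum_seq_lens_kv=cum_seq_lens_kv,
    out_dtype="nvfp4", o_sf_scale=300.0, o_sf_vec_size=16,
)
# `out` is an FP4Tensor: packed fp4 payload in `out.data` (uint8) plus
# per-block scale factors in `out.scale` (float8_e4m3fn).
```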

File tree

6 files changed: 120 additions & 34 deletions

csrc/trtllm_fmha_kernel_launcher.cu

Lines changed: 10 additions & 8 deletions
@@ -184,7 +184,7 @@ inline Data_type torch_dtype_to_tllm_data_type(at::ScalarType dtype) {

 inline bool is_4bit(Data_type data_type) { return data_type == Data_type::DATA_TYPE_E2M1; }

-void trtllm_paged_attention_decode(at::Tensor out, std::optional<at::Tensor> const out_scale_factor,
+void trtllm_paged_attention_decode(at::Tensor out, std::optional<at::Tensor> out_scale_factor,
                                    at::Tensor query, at::Tensor key_value_cache,
                                    at::Tensor workspace_buffer, at::Tensor block_tables,
                                    at::Tensor seq_lens, int64_t max_kv_len, double bmm1_scale,
@@ -245,12 +245,14 @@ void trtllm_paged_attention_decode(at::Tensor out, std::optional<at::Tensor> con
       bmm2_scale, o_sf_scale, o_sf_vec_size, window_left, sum_seq_q, sm_count, stream);
 }

-void trtllm_paged_attention_context(at::Tensor out, at::Tensor query, at::Tensor key_value_cache,
+void trtllm_paged_attention_context(at::Tensor out, std::optional<at::Tensor> out_scale_factor,
+                                    at::Tensor query, at::Tensor key_value_cache,
                                     at::Tensor workspace_buffer, at::Tensor block_tables,
                                     at::Tensor seq_lens, int64_t max_q_len, int64_t max_kv_len,
-                                    double bmm1_scale, double bmm2_scale, int64_t batch_size,
-                                    int64_t window_left, at::Tensor cum_seq_lens_q,
-                                    at::Tensor cum_seq_lens_kv, int64_t sm_count) {
+                                    double bmm1_scale, double bmm2_scale, double o_sf_scale,
+                                    int64_t o_sf_vec_size, int64_t batch_size, int64_t window_left,
+                                    at::Tensor cum_seq_lens_q, at::Tensor cum_seq_lens_kv,
+                                    int64_t sm_count) {
   auto q_data_type = torch_dtype_to_tllm_data_type(query.scalar_type());
   auto kv_data_type = torch_dtype_to_tllm_data_type(key_value_cache.scalar_type());
   auto o_data_type = torch_dtype_to_tllm_data_type(out.scalar_type());
@@ -284,9 +286,10 @@ void trtllm_paged_attention_context(at::Tensor out, at::Tensor query, at::Tensor

   auto device = query.device();
   const auto stream = at::cuda::getCurrentCUDAStream(device.index());
+  void* output_sf_ptr = out_scale_factor ? out_scale_factor.value().data_ptr() : nullptr;

   trtllm_paged_attention_launcher(
-      out.data_ptr(), /*out_scale_factor=*/nullptr, query.data_ptr(), key_value_cache.data_ptr(),
+      out.data_ptr(), output_sf_ptr, query.data_ptr(), key_value_cache.data_ptr(),
       (char*)key_value_cache.data_ptr() +
           (share_kv_cache ? 0 : key_value_cache.stride(1) * key_value_cache.element_size()),
       workspace_buffer.data_ptr(), static_cast<int*>(block_tables.data_ptr()),
@@ -296,8 +299,7 @@ void trtllm_paged_attention_context(at::Tensor out, at::Tensor query, at::Tensor
       o_data_type, TllmPagedAttentionMode::Context, batch_size, max_q_len, max_kv_len,
       num_pages_in_mem_pool, num_qo_heads, num_kv_heads, head_dim_qk, head_dim_vo, page_size,
       kv_stride_keys_values, kv_stride_heads, kv_stride_batch, max_num_blocks_per_seq, bmm1_scale,
-      bmm2_scale, /* o_sf_scale =*/-1, /* o_sf_vec_size =*/-1, window_left, sum_seq_q, sm_count,
-      stream);
+      bmm2_scale, o_sf_scale, o_sf_vec_size, window_left, sum_seq_q, sm_count, stream);
 }

 namespace trtllm_cubin_loader {

flashinfer/decode.py

Lines changed: 4 additions & 4 deletions
@@ -1988,12 +1988,12 @@ def trtllm_batch_decode_with_kv_cache(
         block_tables: page_table of kv cache, [batch_size, num_pages]
         seq_lens: A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``
         max_seq_len: max sequence length for kv_cache
-        out: output tensor, if not provided, will be allocated with ``out_dtype``, if ``out_dtype`` is not provided, will use the type of ``query``.
-        out_dtype: output dtype, if not provided, will use the type of ``out``.
         bmm1_scale: fused scale for bmm1 input.
         bmm2_scale: fused scale for bmm2 input.
         window_left: The left (inclusive) window size for the attention window, when set to ``-1``, the window
             size will be set to the full length of the sequence. Defaults to ``-1``.
+        out: output tensor, if not provided, will be allocated with ``out_dtype``, if ``out_dtype`` is not provided, will use the type of ``query``.
+        out_dtype: output dtype, if not provided, will use the type of ``out``. For nvfp4, use string ``nvfp4``.
         o_sf_scale: scale for nvfp4 output tensor scale factor.
         o_sf_vec_size: vector size for nvfp4 output tensor scale factor.

@@ -2020,8 +2020,8 @@ def trtllm_batch_decode_with_kv_cache(
     )

     if isinstance(out, FP4Tensor):
-        out_scale_factor = out.scale_factor
-        out = out.tensor
+        out_scale_factor = out.scale
+        out = out.data
     elif out is None:
         out = torch.empty(fp4_out_shape, dtype=torch.uint8, device=query.device)
         out_scale_factor = torch.empty(
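
For reference, the decode path above now reads the packed payload and block scale factors through the renamed `FP4Tensor` fields (`.data` and `.scale` instead of `.tensor` and `.scale_factor`). A minimal sketch of consuming such a result; the helper name is hypothetical:

```python
from flashinfer.utils import FP4Tensor

def unpack_fp4(out: FP4Tensor):
    """Split an nvfp4 attention output into its two buffers."""
    packed = out.data    # torch.uint8, two fp4 values packed per byte
    scales = out.scale   # torch.float8_e4m3fn block scale factors
    return packed, scales
```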

flashinfer/prefill.py

Lines changed: 83 additions & 6 deletions
@@ -37,6 +37,7 @@
 from .page import block_sparse_indices_to_vector_sparse_offsets, get_seq_lens
 from .quantization import packbits, segment_packbits
 from .utils import (
+    FP4Tensor,
     MaskMode,
     PosEncodingMode,
     TensorLayout,
@@ -115,6 +116,7 @@ def _paged_run(
         out = torch.empty_like(query)
     op.trtllm_paged_attention_context(
         out,
+        None,  # fp4 output not supported in wrapper api yet.
         query,
         kv_cache,
         workspace_buffer,
@@ -124,6 +126,8 @@ def _paged_run(
         max_kv_len,
         bmm1_scale,
         bmm2_scale,
+        -1,  # o_sf_scale
+        -1,  # o_sf_vec_size
         batch_size,
         window_left,
         cum_seq_lens_q,
@@ -2964,18 +2968,87 @@ def trtllm_batch_context_with_kv_cache(
     cum_seq_lens_q: torch.Tensor,
     cum_seq_lens_kv: torch.Tensor,
     window_left: int = -1,
-    out: Optional[torch.Tensor] = None,
-) -> torch.Tensor:
+    out: Optional[Union[torch.Tensor, FP4Tensor]] = None,
+    out_dtype: Optional[Union[torch.dtype, str]] = None,
+    o_sf_scale: Optional[float] = None,
+    o_sf_vec_size: Optional[int] = None,
+) -> Union[torch.Tensor, FP4Tensor]:
+    """
+    Parameters:
+        query: query tensor with shape [num_tokens, num_heads, head_dim]
+        kv_cache: kv_cache tensor with shape [num_pages, 1 or 2, num_kv_heads, page_size, head_dim]
+        workspace_buffer: workspace
+        block_tables: page_table of kv cache, [batch_size, num_pages]
+        seq_lens: A uint32 1D tensor indicating the kv sequence length of each prompt. shape: ``[batch_size]``
+        max_q_len: max sequence length for query
+        max_kv_len: max sequence length for kv_cache
+        bmm1_scale: fused scale for bmm1 input.
+        bmm2_scale: fused scale for bmm2 input.
+        batch_size: batch size
+        cum_seq_lens_q: cumulative sequence length for query. shape: ``[batch_size + 1]``
+        cum_seq_lens_kv: cumulative sequence length for kv_cache. shape: ``[batch_size + 1]``
+        window_left: The left (inclusive) window size for the attention window, when set to ``-1``, the window
+            size will be set to the full length of the sequence. Defaults to ``-1``.
+        out: output tensor, if not provided, will be allocated with ``out_dtype``, if ``out_dtype`` is not provided, will use the type of ``query``.
+        out_dtype: output dtype, if not provided, will use the type of ``out``. For nvfp4, use string ``nvfp4``.
+        o_sf_scale: scale for nvfp4 output tensor scale factor.
+        o_sf_vec_size: vector size for nvfp4 output tensor scale factor.
+
+    Returns:
+        out: output torch.Tensor or FP4Tensor.
+    """
     run_func = get_trtllm_gen_fmha_module().trtllm_paged_attention_context
     sm_count = get_device_sm_count(query.device)

-    if out is None:
-        out = torch.empty_like(query)
-    else:
+    if out_dtype == "nvfp4" or (out_dtype is None and isinstance(out, FP4Tensor)):
+        assert (
+            query.dtype == torch.float8_e4m3fn
+        ), "query must be fp8 when out_dtype is nvfp4."
+        assert o_sf_scale is not None
+        assert o_sf_vec_size in [None, 16], "only o_sf_vec_size = 16 is supported"
+        o_sf_vec_size = o_sf_vec_size or 16
+
+        fp4_out_shape = query.shape[:-1] + (math.ceil(query.shape[-1] / 2),)
+
+        fp4_out_scale_shape = (
+            math.ceil(query.shape[0] / 128) * 128,
+            math.ceil(query.shape[1] * query.shape[2] / o_sf_vec_size / 4) * 4,
+        )
+
+        if isinstance(out, FP4Tensor):
+            out_scale_factor = out.scale
+            out = out.data
+        elif out is None:
+            out = torch.empty(fp4_out_shape, dtype=torch.uint8, device=query.device)
+            out_scale_factor = torch.empty(
+                fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=query.device
+            )
+        else:
+            raise ValueError(f"Invalid out: {out}")
+
+        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+
+        # Use uint8 as the container dtype to compliant with next fp4 gemm.
+        _check_shape_dtype_device(
+            out_scale_factor,
+            fp4_out_scale_shape,
+            torch.float8_e4m3fn,
+            query.device,
+            "out_scale_factor",
+        )
+    elif isinstance(out_dtype, torch.dtype) or out_dtype is None:
+        assert o_sf_scale is None
+        assert o_sf_vec_size is None
+        out_scale_factor = None
+        out_dtype = out_dtype or query.dtype
+        out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
         _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
+    else:
+        raise ValueError(f"Invalid out_dtype: {out_dtype}")

     run_func(
         out,
+        out_scale_factor,
         query,
         kv_cache,
         workspace_buffer,
@@ -2985,10 +3058,14 @@ def trtllm_batch_context_with_kv_cache(
         max_kv_len,
         bmm1_scale,
         bmm2_scale,
+        o_sf_scale or -1.0,
+        o_sf_vec_size or -1,
         batch_size,
         window_left,
         cum_seq_lens_q,
         cum_seq_lens_kv,
         sm_count,
     )
-    return out
+    return (
+        out if out_dtype != "nvfp4" else FP4Tensor(out, out_scale_factor, query.shape)
+    )
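
The nvfp4 branch added above sizes its output buffers from the query shape: the packed payload halves the head dimension (two fp4 values per uint8 byte), and the scale-factor buffer is padded to 128-token rows and groups of 4 scale columns. A small worked example of that arithmetic, with illustrative dimensions:

```python
import math

# Illustrative dimensions; the formulas mirror the allocation code above.
num_tokens, num_heads, head_dim, o_sf_vec_size = 300, 8, 128, 16

# Packed fp4 payload: last dim halved because two values share one byte.
fp4_out_shape = (num_tokens, num_heads, math.ceil(head_dim / 2))            # (300, 8, 64)

# Scale factors: rows padded to a multiple of 128 tokens, columns to a
# multiple of 4 groups of o_sf_vec_size elements.
fp4_out_scale_shape = (
    math.ceil(num_tokens / 128) * 128,                                       # 384
    math.ceil(num_heads * head_dim / o_sf_vec_size / 4) * 4,                 # 64
)
print(fp4_out_shape, fp4_out_scale_shape)
```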

include/flashinfer/trtllm/fmha/fmhaKernels.cuh

Lines changed: 2 additions & 2 deletions
@@ -237,7 +237,7 @@ class TllmGenFmhaKernel {
   static std::string getCubinPath() {
     const char* env_hash = std::getenv("FLASHINFER_CUBIN_ARTIFACTORY_HASH");
     std::string hash =
-        env_hash ? std::string(env_hash) : "4c7bdebb4eba13311fc652a069e64782d5c0723d";
+        env_hash ? std::string(env_hash) : "52e676342c67a3772e06f10b84600044c0c22b76";
     std::string cubin_path = hash + "/fmha/trtllm-gen/";
     return cubin_path;
   }
@@ -595,7 +595,7 @@ class TllmFmhaKernelFactory {
     if (!metainfo_loaded) {
       std::string metainfo_raw =
           getMetaInfo(TllmGenFmhaKernel::getCubinPath() + "flashInferMetaInfo",
-                      "b3907fa4e30a75a0f72cfded44e6cf0f04fe5868166659732487726cbc23c0b9", ".h");
+                      "8c5630020c0452fb1cd1ea7e3b8fdbb7bf94f71bd899ed5b704a490bdb4f7368", ".h");
       metainfo = KernelType::KernelMeta::loadFromMetaInfoRaw(metainfo_raw);
       metainfo_loaded = true;
     }

tests/test_trtllm_gen_context.py

Lines changed: 13 additions & 9 deletions
@@ -205,13 +205,9 @@ def test_trtllm_batch_context_wrapper(
     "q_dtype,kv_cache_dtype,o_dtype",
     [
         ("half", "half", "half"),
-        # ("half", "fp8", "half"),
         ("bf16", "bf16", "bf16"),
-        # ("bf16", "fp8", "bf16"),
         ("fp8", "fp8", "fp8"),
-        # ("fp8", "fp8", "half"),
-        # ("fp8", "fp8", "bf16"),
-        # ("fp8", "fp8", "nvfp4"),
+        ("fp8", "fp8", "nvfp4"),
     ],
 )
 def test_trtllm_batch_prefill(
@@ -329,7 +325,7 @@ def test_trtllm_batch_prefill(
     else:
         o_scale = 1.0
     o_sf_scale = (
-        0.2 if o_dtype == "nvfp4" else None
+        300 if o_dtype == "nvfp4" else None
    )  # choose a value to make error smaller by testing.

     sm_scale = float(1.0 / (head_dim**0.5))
@@ -362,6 +358,9 @@ def test_trtllm_batch_prefill(
         q_indptr,
         kv_indptr,
         window_left,  # window_left
+        out_dtype=dtype_map[o_dtype],
+        o_sf_scale=o_sf_scale,
+        o_sf_vec_size=16 if o_dtype == "nvfp4" else None,
     )

     # Handle different return types based on out_dtype
@@ -398,7 +397,7 @@ def test_trtllm_batch_prefill(
     output_ref = wrapper.run(ref_q, ref_kv_cache)

     if q_dtype == "fp8" and o_dtype == "nvfp4":
-        rtol, atol = 5e-1, 1.1e0
+        rtol, atol = 4e-1, 1e0
     elif q_dtype == "fp8" and o_dtype == "fp8":
         rtol, atol = 5e-2, 7e-2
     else:
@@ -414,9 +413,14 @@ def test_trtllm_batch_prefill(
         torch.testing.assert_close(
             out_scale_factor.float().reshape(out_scale_factor_ref.shape),
             out_scale_factor_ref.float(),
-            rtol=rtol,
-            atol=atol,
+            rtol=2e-1,
+            atol=2e-1,
         )
+        rmse = torch.sqrt(
+            torch.mean((output.float() * o_scale - output_ref.float()) ** 2)
+        )
+        assert rmse.item() < 0.3

     # convert to float32 for fp8 is not supported by assert_close
     torch.testing.assert_close(
         output.float() * o_scale, output_ref.float(), rtol=rtol, atol=atol
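
The test above adds an RMSE check alongside the loosened elementwise tolerances for the nvfp4 output, since per-element rtol/atol alone are a weak signal for such a coarsely quantized format. A standalone sketch of that criterion; the helper name is hypothetical:

```python
import torch

def rmse(actual: torch.Tensor, reference: torch.Tensor) -> float:
    """Root-mean-square error between an output and its reference, in float32."""
    return torch.sqrt(torch.mean((actual.float() - reference.float()) ** 2)).item()

# In the updated tests: rmse(output * o_scale, output_ref) must stay below 0.3.
```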

tests/test_trtllm_gen_decode.py

Lines changed: 8 additions & 5 deletions
@@ -222,7 +222,7 @@ def test_trtllm_batch_decode_fmha(
     else:
         o_scale = 1.0
     o_sf_scale = (
-        0.2 if o_dtype == "nvfp4" else None
+        300 if o_dtype == "nvfp4" else None
    )  # choose a value to make error smaller by testing.

     sm_scale = float(1.0 / (head_dim**0.5))
@@ -287,7 +287,7 @@ def test_trtllm_batch_decode_fmha(
     output_ref = wrapper.run(ref_q, ref_kv_cache)

     if q_dtype == "fp8" and o_dtype == "nvfp4":
-        rtol, atol = 5e-1, 1.1e0
+        rtol, atol = 3e-1, 1e0
     elif q_dtype == "fp8" and o_dtype == "fp8":
         rtol, atol = 5e-2, 7e-2
     else:
@@ -303,10 +303,13 @@ def test_trtllm_batch_decode_fmha(
         torch.testing.assert_close(
             out_scale_factor.float().reshape(out_scale_factor_ref.shape),
             out_scale_factor_ref.float(),
-            rtol=rtol,
-            atol=atol,
+            rtol=2e-1,
+            atol=2e-1,
         )
-
+        rmse = torch.sqrt(
+            torch.mean((output.float() * o_scale - output_ref.float()) ** 2)
+        )
+        assert rmse.item() < 0.3
     # convert to float32 for fp8 is not supported by assert_close
     torch.testing.assert_close(
         output.float() * o_scale, output_ref.float(), rtol=rtol, atol=atol
