
Commit 50dcd9a

feat: Support scale factor start index for fp4 mha prefill/decode (#1363)
## 📌 Description

The start index of the fp4 output scale factor, `o_sf_start_index`, is useful when the decode kernels reuse the scale factor tensor of the prefill kernels: decode can write from an offset even though the scale factor is swizzled. This is a follow-up of #1360; please only review the latest commit.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).
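To make the intended reuse concrete, here is a minimal sketch (not part of this commit) of two `FP4Tensor` views sharing one swizzled scale-factor buffer. The token counts and head sizes are made up, and it assumes the wrapper's shape checks accept a scale buffer sized for the combined token count.

```python
# Hedged sketch: prefill and decode outputs sharing one swizzled scale-factor buffer.
# The padding rule (rows to a multiple of 128, columns to a multiple of 4) follows
# the tests added in this commit; everything else here is illustrative.
import math

import torch

from flashinfer.utils import FP4Tensor

num_prefill_tokens, num_decode_tokens = 512, 16
num_qo_heads, head_dim, o_sf_vec_size = 8, 128, 16
total_tokens = num_prefill_tokens + num_decode_tokens

sf_rows = math.ceil(total_tokens / 128) * 128
sf_cols = math.ceil(num_qo_heads * head_dim / o_sf_vec_size / 4) * 4
shared_scale = torch.empty(sf_rows, sf_cols, dtype=torch.float8_e4m3fn, device="cuda")

# Prefill writes its scale factors starting at row 0 of the shared buffer ...
prefill_out = FP4Tensor(
    torch.empty(num_prefill_tokens, num_qo_heads, head_dim // 2, dtype=torch.uint8, device="cuda"),
    shared_scale,
    scale_start_index=0,
)
# ... and decode continues right after the prefill tokens, even though the
# scale tensor is stored in the swizzled layout.
decode_out = FP4Tensor(
    torch.empty(num_decode_tokens, num_qo_heads, head_dim // 2, dtype=torch.uint8, device="cuda"),
    shared_scale,
    scale_start_index=num_prefill_tokens,
)

# The two views would then be passed as `out=` to
# flashinfer.prefill.trtllm_batch_context_with_kv_cache(...) and
# flashinfer.decode.trtllm_batch_decode_with_kv_cache(...), respectively.
```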
1 parent db16e41 commit 50dcd9a

File tree

7 files changed, +119 −17 lines


csrc/trtllm_fmha_kernel_launcher.cu

Lines changed: 11 additions & 6 deletions
```diff
@@ -80,8 +80,8 @@ void trtllm_paged_attention_launcher(
     int64_t num_pages_in_mem_pool, int64_t num_qo_heads, int64_t num_kv_heads, int64_t head_dim_qk,
     int64_t head_dim_vo, int64_t page_size, int64_t kv_stride_keys_values, int64_t kv_stride_heads,
     int64_t kv_stride_batch, int64_t max_num_blocks_per_seq, double bmm1_scale, double bmm2_scale,
-    double o_sf_scale, int64_t o_sf_vec_size, int64_t window_left, int64_t sum_seq_q,
-    int64_t sm_count, cudaStream_t stream) {
+    double o_sf_scale, int64_t o_sf_vec_size, int64_t o_sf_start_index, int64_t window_left,
+    int64_t sum_seq_q, int64_t sm_count, cudaStream_t stream) {
   if (num_qo_heads % num_kv_heads != 0) {
     std::ostringstream err_msg;
     err_msg << "num_qo_heads must be a multiple of num_kv_heads, got num_kv_heads: " << num_kv_heads
@@ -118,6 +118,7 @@ void trtllm_paged_attention_launcher(
   runner_params.outputScale = bmm2_scale;
   runner_params.scaleSoftmaxLog2 = bmm1_scale * M_LOG2E;
   runner_params.oSfPtr = out_scale_factor;
+  runner_params.mSfStartTokenIdx = o_sf_start_index;
   runner_params.mScaleSfO = o_sf_scale;
   TORCH_CHECK(o_sf_vec_size == 16 || o_sf_vec_size == -1,
               "Only support o_sf_vec_size == 16 or -1(not used)");
@@ -189,7 +190,8 @@ void trtllm_paged_attention_decode(at::Tensor out, std::optional<at::Tensor> out
                                    at::Tensor workspace_buffer, at::Tensor block_tables,
                                    at::Tensor seq_lens, int64_t max_kv_len, double bmm1_scale,
                                    double bmm2_scale, double o_sf_scale, int64_t o_sf_vec_size,
-                                   int64_t window_left, int64_t sm_count) {
+                                   int64_t o_sf_start_index, int64_t window_left,
+                                   int64_t sm_count) {
   auto q_data_type = torch_dtype_to_tllm_data_type(query.scalar_type());
   auto kv_data_type = torch_dtype_to_tllm_data_type(key_value_cache.scalar_type());
   auto o_data_type = torch_dtype_to_tllm_data_type(out.scalar_type());
@@ -242,15 +244,17 @@ void trtllm_paged_attention_decode(at::Tensor out, std::optional<at::Tensor> out
       TllmPagedAttentionMode::ForGen, batch_size, /*max_q_len=*/q_len_per_request, max_kv_len,
       num_pages_in_mem_pool, num_qo_heads, num_kv_heads, head_dim_qk, head_dim_vo, page_size,
       kv_stride_keys_values, kv_stride_heads, kv_stride_batch, max_num_blocks_per_seq, bmm1_scale,
-      bmm2_scale, o_sf_scale, o_sf_vec_size, window_left, sum_seq_q, sm_count, stream);
+      bmm2_scale, o_sf_scale, o_sf_vec_size, o_sf_start_index, window_left, sum_seq_q, sm_count,
+      stream);
 }
 
 void trtllm_paged_attention_context(at::Tensor out, std::optional<at::Tensor> out_scale_factor,
                                     at::Tensor query, at::Tensor key_value_cache,
                                     at::Tensor workspace_buffer, at::Tensor block_tables,
                                     at::Tensor seq_lens, int64_t max_q_len, int64_t max_kv_len,
                                     double bmm1_scale, double bmm2_scale, double o_sf_scale,
-                                    int64_t o_sf_vec_size, int64_t batch_size, int64_t window_left,
+                                    int64_t o_sf_vec_size, int64_t o_sf_start_index,
+                                    int64_t batch_size, int64_t window_left,
                                     at::Tensor cum_seq_lens_q, at::Tensor cum_seq_lens_kv,
                                     int64_t sm_count) {
   auto q_data_type = torch_dtype_to_tllm_data_type(query.scalar_type());
@@ -299,7 +303,8 @@ void trtllm_paged_attention_context(at::Tensor out, std::optional<at::Tensor> ou
       o_data_type, TllmPagedAttentionMode::Context, batch_size, max_q_len, max_kv_len,
       num_pages_in_mem_pool, num_qo_heads, num_kv_heads, head_dim_qk, head_dim_vo, page_size,
       kv_stride_keys_values, kv_stride_heads, kv_stride_batch, max_num_blocks_per_seq, bmm1_scale,
-      bmm2_scale, o_sf_scale, o_sf_vec_size, window_left, sum_seq_q, sm_count, stream);
+      bmm2_scale, o_sf_scale, o_sf_vec_size, o_sf_start_index, window_left, sum_seq_q, sm_count,
+      stream);
 }
 
 namespace trtllm_cubin_loader {
```

flashinfer/decode.py

Lines changed: 10 additions & 2 deletions
```diff
@@ -1821,6 +1821,7 @@ def _paged_run(
             bmm2_scale,
             -1,  # o_sf_scale
             -1,  # o_sf_vec_size
+            0,  # o_sf_start_index
             window_left,
             self._sm_count,
         )
@@ -2021,12 +2022,14 @@ def trtllm_batch_decode_with_kv_cache(
 
     if isinstance(out, FP4Tensor):
         out_scale_factor = out.scale
+        o_sf_start_index = out.scale_start_index
        out = out.data
     elif out is None:
-        out = torch.empty(fp4_out_shape, dtype=torch.uint8, device=query.device)
         out_scale_factor = torch.empty(
             fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=query.device
         )
+        o_sf_start_index = 0
+        out = torch.empty(fp4_out_shape, dtype=torch.uint8, device=query.device)
     else:
         raise ValueError(f"Invalid out: {out}")
 
@@ -2044,6 +2047,7 @@ def trtllm_batch_decode_with_kv_cache(
         assert o_sf_scale is None
         assert o_sf_vec_size is None
         out_scale_factor = None
+        o_sf_start_index = 0
         out_dtype = out_dtype or query.dtype
         out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
         _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
@@ -2063,12 +2067,15 @@ def trtllm_batch_decode_with_kv_cache(
         bmm2_scale,
         o_sf_scale or -1.0,
         o_sf_vec_size or -1,
+        o_sf_start_index,
         window_left,
         sm_count,
     )
 
     return (
-        out if out_dtype != "nvfp4" else FP4Tensor(out, out_scale_factor, query.shape)
+        out
+        if out_dtype != "nvfp4"
+        else FP4Tensor(out, out_scale_factor, o_sf_start_index, query.shape)
     )
 
 
@@ -2217,6 +2224,7 @@ def trtllm_batch_decode_with_kv_cache_mla(
         bmm2_scale,
         -1,  # o_sf_scale
         -1,  # o_sf_vec_size
+        0,  # o_sf_start_index
         -1,  # window_left
         sm_count,
     )
```

flashinfer/prefill.py

Lines changed: 9 additions & 2 deletions
```diff
@@ -128,6 +128,7 @@ def _paged_run(
             bmm2_scale,
             -1,  # o_sf_scale
             -1,  # o_sf_vec_size
+            0,  # o_sf_start_index
             batch_size,
             window_left,
             cum_seq_lens_q,
@@ -3017,12 +3018,14 @@ def trtllm_batch_context_with_kv_cache(
 
     if isinstance(out, FP4Tensor):
         out_scale_factor = out.scale
+        o_sf_start_index = out.scale_start_index
         out = out.data
     elif out is None:
-        out = torch.empty(fp4_out_shape, dtype=torch.uint8, device=query.device)
         out_scale_factor = torch.empty(
             fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=query.device
         )
+        o_sf_start_index = 0
+        out = torch.empty(fp4_out_shape, dtype=torch.uint8, device=query.device)
     else:
         raise ValueError(f"Invalid out: {out}")
 
@@ -3040,6 +3043,7 @@ def trtllm_batch_context_with_kv_cache(
         assert o_sf_scale is None
         assert o_sf_vec_size is None
         out_scale_factor = None
+        o_sf_start_index = 0
         out_dtype = out_dtype or query.dtype
         out = out if out is not None else torch.empty_like(query, dtype=out_dtype)
         _check_shape_dtype_device(out, query.shape, query.dtype, query.device, "out")
@@ -3060,12 +3064,15 @@ def trtllm_batch_context_with_kv_cache(
         bmm2_scale,
         o_sf_scale or -1.0,
         o_sf_vec_size or -1,
+        o_sf_start_index,
         batch_size,
         window_left,
         cum_seq_lens_q,
         cum_seq_lens_kv,
         sm_count,
     )
     return (
-        out if out_dtype != "nvfp4" else FP4Tensor(out, out_scale_factor, query.shape)
+        out
+        if out_dtype != "nvfp4"
+        else FP4Tensor(out, out_scale_factor, o_sf_start_index, query.shape)
     )
```

flashinfer/utils.py

Lines changed: 4 additions & 0 deletions
```diff
@@ -524,6 +524,7 @@ def __init__(
         self,
         data: torch.Tensor,
         scale: torch.Tensor,
+        scale_start_index: int = 0,
         original_shape: Optional[Tuple[int, ...]] = None,
     ):
         """Initialize FP4Tensor.
@@ -534,6 +535,8 @@ def __init__(
             uint8 tensor storing the compressed FP4 data
         scale : torch.Tensor
             float8_e4m3fn tensor storing the scale factors
+        scale_start_index : int
+            The start token index of the scale factors. This is needed when two kernels (like prefill and decode kernels) are reusing the same scale factor tensor with different offsets.
         original_shape : Optional[Tuple[int, ...]]
             The original shape before compression.
         """
@@ -561,6 +564,7 @@ def __init__(
 
         self.data = data
         self.scale = scale
+        self.scale_start_index = scale_start_index
         self.original_shape = original_shape
         self.dtype = "nvfp4"
 
```
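A minimal, hedged example of the extended constructor; the shapes below are illustrative and assume the constructor's existing checks accept a scale buffer padded to the swizzled layout:

```python
import torch

from flashinfer.utils import FP4Tensor

num_tokens, num_heads, head_dim = 57, 8, 128
data = torch.empty(num_tokens, num_heads, head_dim // 2, dtype=torch.uint8)  # packed FP4 payload
scale = torch.empty(128, 64, dtype=torch.float8_e4m3fn)  # rows padded to 128, columns to a multiple of 4

# scale_start_index is a token-row offset into `scale`, not a byte offset:
# rows [13, 13 + 57) of the swizzled layout belong to this tensor.
t = FP4Tensor(data, scale, scale_start_index=13, original_shape=(num_tokens, num_heads, head_dim))
```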

tests/test_trtllm_gen_context.py

Lines changed: 41 additions & 3 deletions
```diff
@@ -5,6 +5,15 @@
 from utils_fp4 import cast_from_fp4, recover_swizzled_scales, ref_nvfp4_quant
 
 import flashinfer
+from flashinfer.utils import FP4Tensor
+
+
+def flip_coin(*args, **kwargs):
+    # Use any test parameters to deterministically decide branch
+    # This makes test configurations go through different paths
+    param_tuple = args + tuple(sorted(kwargs.items()))
+    hash_value = hash(param_tuple)
+    return (hash_value % 2) == 0
 
 
 def to_float8(x, dtype=torch.float8_e4m3fn):
@@ -327,7 +336,7 @@ def test_trtllm_batch_prefill(
     o_sf_scale = (
         300 if o_dtype == "nvfp4" else None
     )  # choose a value to make error smaller by testing.
-
+    o_sf_vec_size = 16 if o_dtype == "nvfp4" else None
     sm_scale = float(1.0 / (head_dim**0.5))
 
     workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device)
@@ -344,6 +353,29 @@
         ]
     )
 
+    if flip_coin(batch_size, page_size, num_kv_heads, head_grp_size, o_dtype):
+        if o_dtype == "nvfp4":
+            fp4_out_shape = q.shape[:-1] + (math.ceil(q.shape[-1] / 2),)
+
+            fp4_out_scale_shape = (
+                math.ceil(q.shape[0] / 128) * 128,
+                math.ceil(q.shape[1] * q.shape[2] / o_sf_vec_size / 4) * 4,
+            )
+
+            out_scale_factor = torch.empty(
+                fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=q.device
+            )
+            extra_size = fp4_out_scale_shape[0] - q.shape[0]
+            o_sf_start_index = (
+                torch.randint(0, extra_size, (1,)).item() if extra_size > 0 else 0
+            )
+            out_data = torch.empty(fp4_out_shape, dtype=torch.uint8, device=q.device)
+            out = FP4Tensor(out_data, out_scale_factor, o_sf_start_index)
+        else:
+            out = torch.empty_like(q, dtype=dtype_map[o_dtype])
+    else:
+        out = None
+
     output = flashinfer.prefill.trtllm_batch_context_with_kv_cache(
         q.contiguous(),
         kv_cache,
@@ -358,14 +390,16 @@
         q_indptr,
         kv_indptr,
         window_left,  # window_left
+        out=out,
         out_dtype=dtype_map[o_dtype],
         o_sf_scale=o_sf_scale,
-        o_sf_vec_size=16 if o_dtype == "nvfp4" else None,
+        o_sf_vec_size=o_sf_vec_size,
     )
 
     # Handle different return types based on out_dtype
     if o_dtype == "nvfp4":
         out_scale_factor = output.scale  # FP4Tensor.scale
+        o_sf_start_index = output.scale_start_index
         output = output.data  # FP4Tensor.data
     else:
         out_scale_factor = None
@@ -407,7 +441,11 @@
         output = cast_from_fp4(output)
         output_ref, out_scale_factor_ref = ref_nvfp4_quant(output_ref, o_sf_scale, 16)
         out_scale_factor = recover_swizzled_scales(
-            out_scale_factor, output.shape[0], output.shape[1] * output.shape[2], 16
+            out_scale_factor,
+            output.shape[0],
+            output.shape[1] * output.shape[2],
+            16,
+            o_sf_start_index,
         )
 
     torch.testing.assert_close(
```
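Worked numbers for the buffer this test allocates (a hedged illustration with made-up sizes, not test code): for a query of shape `(57, 8, 128)` and `o_sf_vec_size == 16`, the padded scale-factor shape and the range of valid start rows come out as follows.

```python
import math

num_tokens, num_heads, head_dim, o_sf_vec_size = 57, 8, 128, 16

rows = math.ceil(num_tokens / 128) * 128                        # 128: rows padded to a multiple of 128
cols = math.ceil(num_heads * head_dim / o_sf_vec_size / 4) * 4  # 64: ceil(1024 / 16 / 4) * 4
extra = rows - num_tokens                                       # 71 rows of slack in the padded buffer

# The test draws o_sf_start_index uniformly from [0, extra), so the written rows
# always stay inside the padded, swizzled scale tensor.
assert rows == 128 and cols == 64 and extra == 71
```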

tests/test_trtllm_gen_decode.py

Lines changed: 41 additions & 2 deletions
```diff
@@ -7,6 +7,15 @@
 from utils_fp4 import cast_from_fp4, recover_swizzled_scales, ref_nvfp4_quant
 
 import flashinfer
+from flashinfer.utils import FP4Tensor
+
+
+def flip_coin(*args, **kwargs):
+    # Use any test parameters to deterministically decide branch
+    # This makes test configurations go through different paths
+    param_tuple = args + tuple(sorted(kwargs.items()))
+    hash_value = hash(param_tuple)
+    return (hash_value % 2) == 0
 
 
 def to_float8(x, dtype=torch.float8_e4m3fn):
@@ -224,6 +233,7 @@ def test_trtllm_batch_decode_fmha(
     o_sf_scale = (
         300 if o_dtype == "nvfp4" else None
     )  # choose a value to make error smaller by testing.
+    o_sf_vec_size = 16 if o_dtype == "nvfp4" else None
 
     sm_scale = float(1.0 / (head_dim**0.5))
 
@@ -237,6 +247,29 @@
         ]
     )
 
+    if flip_coin(batch_size, page_size, num_kv_heads, head_grp_size, o_dtype):
+        if o_dtype == "nvfp4":
+            fp4_out_shape = q.shape[:-1] + (math.ceil(q.shape[-1] / 2),)
+
+            fp4_out_scale_shape = (
+                math.ceil(q.shape[0] / 128) * 128,
+                math.ceil(q.shape[1] * q.shape[2] / o_sf_vec_size / 4) * 4,
+            )
+
+            out_scale_factor = torch.empty(
+                fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=q.device
+            )
+            extra_size = fp4_out_scale_shape[0] - q.shape[0]
+            o_sf_start_index = (
+                torch.randint(0, extra_size, (1,)).item() if extra_size > 0 else 0
+            )
+            out_data = torch.empty(fp4_out_shape, dtype=torch.uint8, device=q.device)
+            out = FP4Tensor(out_data, out_scale_factor, o_sf_start_index)
+        else:
+            out = torch.empty_like(q, dtype=dtype_map[o_dtype])
+    else:
+        out = None
+
     output = flashinfer.decode.trtllm_batch_decode_with_kv_cache(
         q.contiguous(),
         kv_cache,
@@ -247,14 +280,16 @@
         q_scale * k_scale * sm_scale,  # bmm1_scale
         v_scale / o_scale,  # bmm2_scale
         window_left,  # window_left
+        out=out,
         out_dtype=dtype_map[o_dtype],
         o_sf_scale=o_sf_scale,
-        o_sf_vec_size=16 if o_dtype == "nvfp4" else None,
+        o_sf_vec_size=o_sf_vec_size,
     )
 
     # Handle different return types based on out_dtype
     if o_dtype == "nvfp4":
         out_scale_factor = output.scale  # FP4Tensor.scale
+        o_sf_start_index = output.scale_start_index
         output = output.data  # FP4Tensor.data
     else:
         out_scale_factor = None
@@ -297,7 +332,11 @@
         output = cast_from_fp4(output)
         output_ref, out_scale_factor_ref = ref_nvfp4_quant(output_ref, o_sf_scale, 16)
         out_scale_factor = recover_swizzled_scales(
-            out_scale_factor, output.shape[0], output.shape[1] * output.shape[2], 16
+            out_scale_factor,
+            output.shape[0],
+            output.shape[1] * output.shape[2],
+            16,
+            o_sf_start_index,
        )
 
     torch.testing.assert_close(
```

tests/utils_fp4.py

Lines changed: 3 additions & 2 deletions
```diff
@@ -84,12 +84,13 @@ def ref_nvfp4_quant(x, global_scale, block_size):
     return cast_to_fp4(clipped_x), scale.squeeze(-1)
 
 
-def recover_swizzled_scales(scale, m, n, block_size):
+def recover_swizzled_scales(scale, m, n, block_size, sf_start_index=0):
+    assert sf_start_index + m <= scale.shape[0]
     rounded_m = utils.round_up(m, 128)
     scale_n = n // block_size
     rounded_n = utils.round_up(scale_n, 4)
     # Recover the swizzled scaling factor to linear layout
     tmp = torch.reshape(scale, (1, rounded_m // 128, rounded_n // 4, 32, 4, 4))
     tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
     result = torch.reshape(tmp, (rounded_m, rounded_n)).to(torch.float32)
-    return result[:m, :scale_n]
+    return result[sf_start_index : sf_start_index + m, :scale_n]
```
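A hedged usage sketch of the updated helper (run from the `tests/` directory so `utils_fp4` is importable; the buffer contents are random and only the shapes matter here):

```python
import torch

from utils_fp4 import recover_swizzled_scales  # test helper in tests/utils_fp4.py

m, n, vec, start = 57, 8 * 128, 16, 13
# Padded swizzled buffer: 128 rows (multiple of 128), 64 columns (n // vec, multiple of 4).
swizzled = torch.randn(128, 64).to(torch.float8_e4m3fn)

linear = recover_swizzled_scales(swizzled, m, n, vec, start)
# `linear` has shape (m, n // vec) == (57, 64): one float32 scale per 16-element block,
# taken from rows [start, start + m) of the unswizzled layout.
assert linear.shape == (57, 64)
```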
