Commit 74b8785
bugfix: Fix TRTLLM NVFP4-out attention kernel scale factor dim issue (#1460)
## 📌 Description

Fixes the shape check for the FP4 scale-factor tensor. Since #1363, callers can pass `o_sf_start_index` to write into a scale-factor tensor shared by the prefill and decode kernels. The current implementation still assumes the batch dimension of the scale-factor tensor matches the query's, but the tensor should hold the combined prefill and decode scales. This PR fixes the check and performs the correct swizzle recovery.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent 7146ebc commit 74b8785
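
To make the intent concrete, here is a minimal sketch (sizes illustrative, not from the PR) of the shared scale-factor buffer this fix accommodates: one tensor whose rows hold the prefill scales first and the decode scales after them, addressed via the `scale_start_index` constructor argument the tests already use:

```python
import torch
from flashinfer.utils import FP4Tensor, round_up

# Illustrative sizes: 300 prefill tokens + 57 decode tokens,
# 16 heads, head_dim 128, scale vector size 16.
num_prefill, num_decode, h, d, vec = 300, 57, 16, 128, 16

# One scale buffer sized for the combined batch, padded to the
# 128-row swizzle tile.
scale = torch.empty(
    (round_up(num_prefill + num_decode, 128), round_up(h * d // vec, 4)),
    dtype=torch.float8_e4m3fn,
)

# Prefill scales occupy rows [0, 300); decode scales start at row 300.
prefill_out = FP4Tensor(
    torch.empty((num_prefill, h, d // 2), dtype=torch.uint8), scale, 0
)
decode_out = FP4Tensor(
    torch.empty((num_decode, h, d // 2), dtype=torch.uint8), scale, num_prefill
)
```

Before this fix, the decode path would reject `decode_out`, since `scale.shape[0]` (384 here) is not `round_up(57, 128)`.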

File tree: 5 files changed, +81 −23 lines

- flashinfer/decode.py
- flashinfer/prefill.py
- flashinfer/utils.py
- tests/test_trtllm_gen_attention.py
- tests/utils_fp4.py

flashinfer/decode.py (26 additions, 7 deletions)

```diff
@@ -60,6 +60,8 @@
     is_float8,
     register_custom_op,
     register_fake_op,
+    ceil_div,
+    round_up,
 )


@@ -2085,18 +2087,21 @@ def trtllm_batch_decode_with_kv_cache(
         assert o_sf_vec_size in [None, 16], "only o_sf_vec_size = 16 is supported"
         o_sf_vec_size = o_sf_vec_size or 16

-        fp4_out_shape = query.shape[:-1] + (math.ceil(query.shape[-1] / 2),)
-
-        fp4_out_scale_shape = (
-            math.ceil(query.shape[0] / 128) * 128,
-            math.ceil(query.shape[1] * query.shape[2] / o_sf_vec_size / 4) * 4,
-        )
+        fp4_out_shape = query.shape[:-1] + (ceil_div(query.shape[-1], 2),)

         if isinstance(out, FP4Tensor):
+            fp4_out_scale_shape = (
+                out.scale.shape[0],
+                round_up(query.shape[1] * query.shape[2] // o_sf_vec_size, 4),
+            )
             out_scale_factor = out.scale
             o_sf_start_index = out.scale_start_index
             out = out.data
         elif out is None:
+            fp4_out_scale_shape = (
+                round_up(query.shape[0], 128),
+                round_up(query.shape[1] * query.shape[2] // o_sf_vec_size, 4),
+            )
             out_scale_factor = torch.empty(
                 fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=query.device
             )
@@ -2105,16 +2110,30 @@ def trtllm_batch_decode_with_kv_cache(
         else:
             raise ValueError(f"Invalid out: {out}")

-        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+        assert isinstance(out, torch.Tensor)

         # Use uint8 as the container dtype to compliant with next fp4 gemm.
+        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+
         _check_shape_dtype_device(
             out_scale_factor,
             fp4_out_scale_shape,
             torch.float8_e4m3fn,
             query.device,
             "out_scale_factor",
         )
+
+        # Check o_sf_start_index is valid
+        if (
+            o_sf_start_index < 0
+            or o_sf_start_index + out.shape[0] > out_scale_factor.shape[0]
+        ):
+            raise ValueError(
+                f"o_sf_start_index is out of the valid range of out_scale_factor. "
+                f"o_sf_start_index={o_sf_start_index}, out.shape[0]={out.shape[0]}, "
+                f"out_scale_factor.shape[0]={out_scale_factor.shape[0]}"
+            )
+
     elif isinstance(out_dtype, torch.dtype) or out_dtype is None:
         assert o_sf_scale is None
         assert o_sf_vec_size is None
```
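
As a sanity check on the new arithmetic, here is a small standalone sketch; `ceil_div` and `round_up` are re-implemented locally to mirror the `flashinfer.utils` helpers, and the query shape is made up for illustration:

```python
def ceil_div(a, b):  # mirrors flashinfer.utils.ceil_div
    return -(a // -b)

def round_up(a, b):  # mirrors flashinfer.utils.round_up
    return ceil_div(a, b) * b

q_shape = (57, 16, 128)  # (num_tokens, num_heads, head_dim), illustrative
o_sf_vec_size = 16

# Packed FP4 output: two 4-bit values per uint8 byte.
fp4_out_shape = q_shape[:-1] + (ceil_div(q_shape[-1], 2),)
assert fp4_out_shape == (57, 16, 64)

# Scale-factor columns are computed the same way in both branches.
sf_cols = round_up(q_shape[1] * q_shape[2] // o_sf_vec_size, 4)
assert sf_cols == 128  # 16 * 128 / 16 = 128, already a multiple of 4

# out is None: allocate rows padded to the 128-row swizzle tile.
assert round_up(q_shape[0], 128) == 128
# out is an FP4Tensor: accept whatever row count the caller's buffer has
# (e.g. 384 rows covering prefill + decode) instead of requiring 128.
```

Taking the row count from `out.scale.shape[0]` in the `FP4Tensor` branch is what lets a caller-provided buffer be larger than the query's padded batch.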

flashinfer/prefill.py (26 additions, 7 deletions)

```diff
@@ -56,6 +56,8 @@
     is_sm100a_supported,
     register_custom_op,
     register_fake_op,
+    ceil_div,
+    round_up,
 )


@@ -3216,18 +3218,21 @@ def trtllm_batch_context_with_kv_cache(
         assert o_sf_vec_size in [None, 16], "only o_sf_vec_size = 16 is supported"
         o_sf_vec_size = o_sf_vec_size or 16

-        fp4_out_shape = query.shape[:-1] + (math.ceil(query.shape[-1] / 2),)
-
-        fp4_out_scale_shape = (
-            math.ceil(query.shape[0] / 128) * 128,
-            math.ceil(query.shape[1] * query.shape[2] / o_sf_vec_size / 4) * 4,
-        )
+        fp4_out_shape = query.shape[:-1] + (ceil_div(query.shape[-1], 2),)

         if isinstance(out, FP4Tensor):
+            fp4_out_scale_shape = (
+                out.scale.shape[0],
+                round_up(query.shape[1] * query.shape[2] // o_sf_vec_size, 4),
+            )
             out_scale_factor = out.scale
             o_sf_start_index = out.scale_start_index
             out = out.data
         elif out is None:
+            fp4_out_scale_shape = (
+                round_up(query.shape[0], 128),
+                round_up(query.shape[1] * query.shape[2] // o_sf_vec_size, 4),
+            )
             out_scale_factor = torch.empty(
                 fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=query.device
             )
@@ -3236,16 +3241,30 @@ def trtllm_batch_context_with_kv_cache(
         else:
             raise ValueError(f"Invalid out: {out}")

-        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+        assert isinstance(out, torch.Tensor)

         # Use uint8 as the container dtype to compliant with next fp4 gemm.
+        _check_shape_dtype_device(out, fp4_out_shape, torch.uint8, query.device, "out")
+
         _check_shape_dtype_device(
             out_scale_factor,
             fp4_out_scale_shape,
             torch.float8_e4m3fn,
             query.device,
             "out_scale_factor",
         )
+
+        # Check o_sf_start_index is valid
+        if (
+            o_sf_start_index < 0
+            or o_sf_start_index + out.shape[0] > out_scale_factor.shape[0]
+        ):
+            raise ValueError(
+                f"o_sf_start_index is out of the valid range of out_scale_factor. "
+                f"o_sf_start_index={o_sf_start_index}, out.shape[0]={out.shape[0]}, "
+                f"out_scale_factor.shape[0]={out_scale_factor.shape[0]}"
+            )
+
     elif isinstance(out_dtype, torch.dtype) or out_dtype is None:
         assert o_sf_scale is None
         assert o_sf_vec_size is None
```

flashinfer/utils.py (16 additions, 0 deletions)

```diff
@@ -546,8 +546,24 @@ def __init__(
         """
         if data.dtype != torch.uint8:
             raise ValueError(f"data must be uint8 tensor, got {data.dtype}")
+
+        # Validate scale factor tensor and scale start index
         if scale.dtype != torch.float8_e4m3fn:
             raise ValueError(f"scale must be float8_e4m3fn tensor, got {scale.dtype}")
+        if scale.shape[0] % 128 != 0:
+            raise ValueError(
+                f"scale.shape[0] must be a multiple of 128, got {scale.shape[0]}"
+            )
+        if scale_start_index < 0 or scale_start_index >= scale.shape[0]:
+            raise ValueError(
+                f"scale start index must be in the range [0, scale.shape[0]). "
+                f"scale_start_index={scale_start_index}, scale.shape[0]={scale.shape[0]}"
+            )
+        if scale_start_index + data.shape[0] > scale.shape[0]:
+            raise ValueError(
+                f"scale start index + data.shape[0] must not exceed scale.shape[0]. "
+                f"scale_start_index={scale_start_index}, data.shape[0]={data.shape[0]}, scale.shape[0]={scale.shape[0]}"
+            )

         # Validate shape relationship if original_shape is provided
         if original_shape is not None:
```
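
A quick sketch of how the new constructor checks behave, assuming the positional signature used in the tests (`FP4Tensor(data, scale, scale_start_index)`); the shapes are illustrative, and CPU tensors suffice to exercise the validation:

```python
import torch
from flashinfer.utils import FP4Tensor

data = torch.empty((96, 16, 64), dtype=torch.uint8)        # 96 token rows
scale = torch.empty((256, 64), dtype=torch.float8_e4m3fn)  # 256 % 128 == 0

FP4Tensor(data, scale, 128)  # ok: 128 + 96 <= 256

try:
    FP4Tensor(data, scale, 200)  # 200 + 96 > 256
except ValueError as e:
    print(e)  # "scale start index + data.shape[0] must not exceed ..."
```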

tests/test_trtllm_gen_attention.py (10 additions, 6 deletions)

```diff
@@ -5,7 +5,7 @@
 from utils_fp4 import cast_from_fp4, recover_swizzled_scales, ref_fp4_quant

 import flashinfer
-from flashinfer.utils import FP4Tensor
+from flashinfer.utils import FP4Tensor, ceil_div, round_up

 DTYPE_MAP = {
     "half": torch.float16,
@@ -162,19 +162,23 @@ def create_output(q, o_dtype, create_out_tensor):

     if create_out_tensor:
         if o_dtype == "nvfp4":
-            fp4_out_shape = q.shape[:-1] + (math.ceil(q.shape[-1] / 2),)
+            fp4_out_shape = q.shape[:-1] + (ceil_div(q.shape[-1], 2),)
+
+            extra_size = torch.randint(0, 256, (1,)).item()

             fp4_out_scale_shape = (
-                math.ceil(q.shape[0] / 128) * 128,
-                math.ceil(q.shape[1] * q.shape[2] / o_sf_vec_size / 4) * 4,
+                round_up(q.shape[0] + extra_size, 128),
+                round_up(q.shape[1] * q.shape[2] // o_sf_vec_size, 4),
             )

             out_scale_factor = torch.empty(
                 fp4_out_scale_shape, dtype=torch.float8_e4m3fn, device=q.device
             )
-            extra_size = fp4_out_scale_shape[0] - q.shape[0]
+            rounded_extra_size = fp4_out_scale_shape[0] - q.shape[0]
             o_sf_start_index = (
-                torch.randint(0, extra_size, (1,)).item() if extra_size > 0 else 0
+                torch.randint(0, rounded_extra_size, (1,)).item()
+                if rounded_extra_size > 0
+                else 0
             )
             out_data = torch.empty(fp4_out_shape, dtype=torch.uint8, device=q.device)
             out = FP4Tensor(out_data, out_scale_factor, o_sf_start_index)
```
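
Tracing the test's randomized padding by hand (a standalone sketch, with `round_up` redefined locally and an illustrative token count) shows why every drawn start index is valid:

```python
import torch

def round_up(a, b):  # mirrors flashinfer.utils.round_up
    return (a + b - 1) // b * b

q_rows = 57  # illustrative token count
extra_size = torch.randint(0, 256, (1,)).item()
rows = round_up(q_rows + extra_size, 128)  # scale buffer rows
rounded_extra_size = rows - q_rows         # always >= extra_size
o_sf_start_index = (
    torch.randint(0, rounded_extra_size, (1,)).item()
    if rounded_extra_size > 0
    else 0
)
# Any drawn start index leaves room for all q_rows scale rows.
assert o_sf_start_index + q_rows <= rows
```

Adding the random `extra_size` before rounding makes the buffer strictly taller than `round_up(q_rows, 128)` in most draws, so nonzero start indices actually get exercised.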

tests/utils_fp4.py (3 additions, 3 deletions)

```diff
@@ -90,11 +90,11 @@ def ref_fp4_quant(x, global_scale, block_size, sf_use_ue8m0=False):

 def recover_swizzled_scales(scale, m, n, block_size, sf_start_index=0):
     assert sf_start_index + m <= scale.shape[0]
-    rounded_m = utils.round_up(m, 128)
+    full_m = scale.shape[0]
     scale_n = n // block_size
     rounded_n = utils.round_up(scale_n, 4)
     # Recover the swizzled scaling factor to linear layout
-    tmp = torch.reshape(scale, (1, rounded_m // 128, rounded_n // 4, 32, 4, 4))
+    tmp = torch.reshape(scale, (1, full_m // 128, rounded_n // 4, 32, 4, 4))
     tmp = torch.permute(tmp, (0, 1, 4, 3, 2, 5))
-    result = torch.reshape(tmp, (rounded_m, rounded_n)).to(torch.float32)
+    result = torch.reshape(tmp, (full_m, rounded_n)).to(torch.float32)
     return result[sf_start_index : sf_start_index + m, :scale_n]
```
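
The reshape/permute in `recover_swizzled_scales` undoes a 128×4 tile swizzle, and the permutation `(0, 1, 4, 3, 2, 5)` is its own inverse, so the round trip can be checked with a standalone sketch (`swizzle` below is a hypothetical forward transform written only for this check, not a FlashInfer API):

```python
import torch

def swizzle(linear):
    # Hypothetical forward of the layout that recover_swizzled_scales undoes.
    m, n = linear.shape  # requires m % 128 == 0 and n % 4 == 0
    t = linear.reshape(1, m // 128, 4, 32, n // 4, 4)
    return t.permute(0, 1, 4, 3, 2, 5).reshape(m, n)

def recover(scale):
    # Mirrors tests/utils_fp4.py after this commit (full_m = scale.shape[0]).
    m, n = scale.shape
    t = scale.reshape(1, m // 128, n // 4, 32, 4, 4)
    return t.permute(0, 1, 4, 3, 2, 5).reshape(m, n)

linear = torch.arange(256 * 8, dtype=torch.float32).reshape(256, 8)
assert torch.equal(recover(swizzle(linear)), linear)
```

With `full_m = scale.shape[0]`, the recovery works for buffers taller than `round_up(m, 128)`, which the old `rounded_m` variant would have reshaped incorrectly.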
