
Commit ba3f324

bugfix: fix integer overflow in FA2 customized_mask & add buffer overflow warning. (#1290)
## 📌 Description

1. Per discussion with @haochengxi and @Radioheading, this PR moves the `plan` computation in `VariableBlockSparseAttentionWrapper` to the GPU side, to avoid expensive (hundreds of ms) host operations.
2. This PR also enlarges the default internal `_vector_sparse_indices_buffer` from 4M to 128M `int32` entries to accommodate video DiT use cases.
3. This PR fixes the **INT overflow** during offset calculation in the attention map, which causes errors in the `customized_mask` mode of the FA2 prefill template. E.g., with `kv_len = 128K`, the offset of the last element of the attention map is roughly `128K * 128K ≈ 1.6e10`, which is larger than `INT32_MAX`.

## 🔍 Related Issues

This PR should solve #1271.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).
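For point 3, a quick arithmetic check (not part of the commit) of the offset that overflows; `128K` is taken as `128 * 1024` here:

```python
# Worked example of the overflow described above: with qo_len = kv_len = 128K,
# the flattened mask offset of the last element no longer fits in 32 bits.
qo_len = kv_len = 128 * 1024
qo_idx, kv_idx = qo_len - 1, kv_len - 1

offset = qo_idx * kv_len + kv_idx            # exact value (Python ints are arbitrary precision)
print(f"offset      = {offset:,}")           # 17,179,869,183  (~1.7e10)
print(f"INT32_MAX   = {2**31 - 1:,}")        # 2,147,483,647
print(f"UINT32_MAX  = {2**32 - 1:,}")        # 4,294,967,295
print(f"uint32 wrap = {offset % 2**32:,}")   # the wrong value a 32-bit offset would silently wrap to
```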
1 parent e353f11 commit ba3f324

5 files changed: +94 -30 lines changed

flashinfer/__init__.py

Lines changed: 3 additions & 0 deletions
```diff
@@ -117,4 +117,7 @@
 from .sampling import top_p_renorm_probs as top_p_renorm_probs
 from .sampling import top_p_sampling_from_probs as top_p_sampling_from_probs
 from .sparse import BlockSparseAttentionWrapper as BlockSparseAttentionWrapper
+from .sparse import (
+    VariableBlockSparseAttentionWrapper as VariableBlockSparseAttentionWrapper,
+)
 from .utils import next_positive_power_of_2 as next_positive_power_of_2
```

flashinfer/sparse.py

Lines changed: 59 additions & 19 deletions
```diff
@@ -131,9 +131,10 @@ def __init__(
             (8 * 1024 * 1024,), dtype=torch.uint8, device=self.device
         )
         if backend in ["fa3", "auto"]:
-            # NOTE(Zihao): assume maximum accumulate kv length is 4M
+            # NOTE(Zihao): assume maximum accumulate kv length is 128M
+            # NOTE(Yilong): 128M is required by video DiT models
             self._vector_sparse_indices_buffer = torch.empty(
-                (4 * 1024 * 1024,), dtype=torch.int32, device=self.device
+                (128 * 1024 * 1024,), dtype=torch.int32, device=self.device
             )
             # NOTE(Zihao): assume maximum batch size is 32768
             self._vector_sparse_indptr_buffer = torch.empty(
@@ -164,7 +165,11 @@ def __init__(
         self._backend = backend

     def reset_workspace_buffer(
-        self, float_workspace_buffer: torch.Tensor, int_workspace_buffer: torch.Tensor
+        self,
+        float_workspace_buffer: torch.Tensor,
+        int_workspace_buffer: torch.Tensor,
+        vector_sparse_indices_buffer: Optional[torch.Tensor] = None,
+        vector_sparse_indptr_buffer: Optional[torch.Tensor] = None,
     ) -> None:
         r"""Reset the workspace buffer.

@@ -186,6 +191,12 @@ def reset_workspace_buffer(
             pin_memory=True,
         )

+        # Enable user-defined size
+        if vector_sparse_indices_buffer is not None:
+            self._vector_sparse_indices_buffer = vector_sparse_indices_buffer
+        if vector_sparse_indptr_buffer is not None:
+            self._vector_sparse_indptr_buffer = vector_sparse_indptr_buffer
+
     def plan(
         self,
         indptr: torch.Tensor,
@@ -589,6 +600,14 @@ def run(

         if self._use_tensor_cores:
             if self._backend == "fa3":
+                if (
+                    self._vector_sparse_indices_buffer.numel()
+                    <= self._paged_kv_indices_buf.numel() * self.C
+                ):
+                    raise ValueError(
+                        "_vector_sparse_indices_buffer is not large enough. Please increase the size."
+                    )
+
                 sparse_indices = block_sparse_indices_to_vector_sparse_offsets(
                     self._paged_kv_indices_buf,
                     self._paged_kv_indptr_buf,
@@ -725,11 +744,9 @@ def __init__(
             (8 * 1024 * 1024,), dtype=torch.uint8, device=self.device
         )
         if backend in ["fa3", "auto"]:
-            # NOTE(Zihao): assume maximum accumulate kv length is 4M
             self._vector_sparse_indices_buffer = torch.empty(
-                (4 * 1024 * 1024,), dtype=torch.int32, device=self.device
+                (128 * 1024 * 1024,), dtype=torch.int32, device=self.device
             )
-            # NOTE(Zihao): assume maximum batch size is 32768
             self._vector_sparse_indptr_buffer = torch.empty(
                 (32768,), dtype=torch.int32, device=self.device
             )
@@ -752,7 +769,11 @@ def __init__(
         self._backend = backend

     def reset_workspace_buffer(
-        self, float_workspace_buffer: torch.Tensor, int_workspace_buffer: torch.Tensor
+        self,
+        float_workspace_buffer: torch.Tensor,
+        int_workspace_buffer: torch.Tensor,
+        vector_sparse_indices_buffer: Optional[torch.Tensor] = None,
+        vector_sparse_indptr_buffer: Optional[torch.Tensor] = None,
     ) -> None:
         r"""Reset the workspace buffer.

@@ -774,6 +795,12 @@ def reset_workspace_buffer(
             pin_memory=True,
         )

+        # Enable user-defined size
+        if vector_sparse_indices_buffer is not None:
+            self._vector_sparse_indices_buffer = vector_sparse_indices_buffer
+        if vector_sparse_indptr_buffer is not None:
+            self._vector_sparse_indptr_buffer = vector_sparse_indptr_buffer
+
     def plan(
         self,
         block_mask_map: torch.Tensor,
@@ -860,14 +887,14 @@ def plan(

         # q layout: [seq_len, num_kv_heads, gqa_group_size, head_dim]
         # padded into: [seq_len * num_kv_heads, 1, gqa_group_size, head_dim]
-        qo_indptr_host = torch.cat(
+        qo_indptr = torch.cat(
             [
                 torch.zeros(1, dtype=torch.int32, device=block_row_sz.device),
                 torch.cumsum(block_row_sz.flatten(), dim=0, dtype=torch.int32),
             ],
             dim=0,
         )
-        qo_indptr = qo_indptr_host.to(block_mask_map.device, non_blocking=non_blocking)
+        qo_indptr_host = qo_indptr.to("cpu", non_blocking=non_blocking)
         last_block_len = torch.full(
             (num_blocks_row * num_kv_heads,),
             1,
@@ -926,36 +953,37 @@ def _block_mask_map_to_expanded_indices(
                 dtype=dtype_i, device=device
             )

-        kv_indptr_host, kv_indices_host = _block_mask_map_to_expanded_indices(
+        kv_indptr, kv_indices = _block_mask_map_to_expanded_indices(
             block_mask_map, block_col_sz
         )
+        kv_indptr_host = kv_indptr.to("cpu", non_blocking=non_blocking)
+        kv_indices_host = kv_indices.to("cpu", non_blocking=non_blocking)

         self._qo_indptr = qo_indptr.to(self.device, non_blocking=non_blocking)
-        self._paged_kv_indptr_buf = kv_indptr_host.to(
-            self.device, non_blocking=non_blocking
-        )
-        self._paged_kv_indices_buf = kv_indices_host.to(
+        self._paged_kv_indptr_buf = kv_indptr.to(self.device, non_blocking=non_blocking)
+        self._paged_kv_indices_buf = kv_indices.to(
             self.device, non_blocking=non_blocking
         )
         self._paged_kv_last_page_len = last_block_len.to(
             self.device, non_blocking=non_blocking
         )
+        torch.cuda.synchronize()  # for non-blocking copy
         self._mask_mode = MaskMode.CAUSAL.value if causal else MaskMode.NON_CAUSAL.value

         # Sanity check
         assert (
             num_qo_heads % num_kv_heads == 0
         ), "num_qo_heads must be a multiple of num_kv_heads"
         assert num_blocks_row * num_kv_heads + 1 == kv_indptr_host.shape[0]
-        assert kv_indptr_host[-1].item() == kv_indices_host.shape[0]
+        assert (
+            kv_indptr_host[-1].item() == kv_indices_host.shape[0]
+        ), f"{kv_indptr_host[-1].item()} != {kv_indices_host.shape[0]}"
         assert num_kv_heads == block_mask_map.shape[0]
         assert num_kv_heads == block_row_sz.shape[0]
         assert num_kv_heads == block_col_sz.shape[0]
         assert num_blocks_row == block_mask_map.shape[1]
         assert num_blocks_col == block_mask_map.shape[2]

-        kv_indptr_host = kv_indptr_host.to("cpu")
-
         if self._backend == "auto":
             self._backend = determine_attention_backend(
                 self.device,
@@ -986,8 +1014,12 @@ def _block_mask_map_to_expanded_indices(
             )

         if self._backend == "fa3":
-            self._vector_sparse_indptr_buffer[: len(kv_indptr_host)].copy_(
-                kv_indptr_host, non_blocking=non_blocking
+            if self._vector_sparse_indptr_buffer.numel() <= kv_indptr.numel():
+                raise ValueError(
+                    "_vector_sparse_indptr_buffer is not large enough. Please increase the buffer size."
+                )
+            self._vector_sparse_indptr_buffer[: len(kv_indptr)].copy_(
+                kv_indptr, non_blocking=non_blocking
             )

         self._plan_info = self._cached_module.plan(
@@ -1135,6 +1167,14 @@ def run(
             _check_shape_dtype_device(out, q.shape, self._o_dtype, q.device, "out")

         if self._backend == "fa3":
+            if (
+                self._vector_sparse_indices_buffer.numel()
+                <= self._paged_kv_indices_buf.numel()
+            ):
+                raise ValueError(
+                    "_vector_sparse_indices_buffer is not large enough. Please increase the buffer size."
+                )
+
             sparse_indices = block_sparse_indices_to_vector_sparse_offsets(
                 self._paged_kv_indices_buf,
                 self._paged_kv_indptr_buf,
```
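For reference, a hedged usage sketch of the extended `reset_workspace_buffer` signature from the diff above. The constructor call and every buffer size below are illustrative assumptions, not values recommended by the commit:

```python
import torch
import flashinfer

# Workspace for the wrapper itself (size is an illustrative assumption).
float_workspace = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
wrapper = flashinfer.VariableBlockSparseAttentionWrapper(float_workspace, backend="fa3")

# Override the internal sparse-index buffers when the new defaults
# (128M indices / 32768 indptr entries) still do not fit your workload.
wrapper.reset_workspace_buffer(
    float_workspace_buffer=float_workspace,
    int_workspace_buffer=torch.empty(8 * 1024 * 1024, dtype=torch.uint8, device="cuda:0"),
    vector_sparse_indices_buffer=torch.empty(
        256 * 1024 * 1024, dtype=torch.int32, device="cuda:0"
    ),
    vector_sparse_indptr_buffer=torch.empty(65536, dtype=torch.int32, device="cuda:0"),
)
```

If the optional buffers are left unset, the wrapper keeps the defaults from `__init__`, and `plan`/`run` now raise a `ValueError` instead of silently overflowing when those defaults are too small.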

include/flashinfer/attention/variants.cuh

Lines changed: 1 addition & 1 deletion
```diff
@@ -83,7 +83,7 @@ struct DefaultAttention : AttentionVariantBase {
       if (qo_idx >= qo_len || kv_idx >= kv_len) {
         mask = false;
       } else {
-        const uint32_t offset = qo_idx * kv_len + kv_idx;
+        const uint64_t offset = static_cast<uint64_t>(qo_idx) * kv_len + kv_idx;
         mask &= ((custom_mask_ptr[offset / 8] >> (offset % 8)) & 1);
       }
     }
```
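For intuition, a small Python model of the bit-packed mask lookup above. It mirrors the `offset / 8` byte index and `offset % 8` bit shift from the diff; the little-bit-order packing and the toy mask contents are assumptions for illustration only:

```python
# Mirrors the lookup pattern `(custom_mask_ptr[offset / 8] >> (offset % 8)) & 1`
# from the diff; assumes little bit order within each byte (bit i of byte j
# holds mask element 8 * j + i).
def mask_lookup(custom_mask: bytes, qo_idx: int, kv_idx: int, kv_len: int) -> bool:
    offset = qo_idx * kv_len + kv_idx  # must be 64-bit on device, hence the fix above
    return bool((custom_mask[offset // 8] >> (offset % 8)) & 1)

# Tiny 2 x 12 mask where only position (qo_idx=1, kv_idx=11) is allowed:
# offset = 1 * 12 + 11 = 23 -> byte 2, bit 7.
mask_bytes = bytes([0b00000000, 0b00000000, 0b10000000])
print(mask_lookup(mask_bytes, qo_idx=1, kv_idx=11, kv_len=12))  # True
print(mask_lookup(mask_bytes, qo_idx=0, kv_idx=5, kv_len=12))   # False
```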

include/flashinfer/quantization.cuh

Lines changed: 9 additions & 3 deletions
```diff
@@ -38,13 +38,19 @@ enum class BitOrder { kBig = 0U, kLittle = 1U };

 template <BitOrder BITORDER>
 __global__ void PackBitsKernel(bool* input, uint8_t* output, int64_t num_elements) {
-  int64_t start_offset = blockIdx.x * blockDim.x * 8, tx = threadIdx.x;
+  int64_t start_offset = static_cast<int64_t>(blockIdx.x) * blockDim.x * 8, tx = threadIdx.x;
   uint8_t ret = 0;
   bool input_vec[8];
   typedef cub::BlockLoad<bool, 256, 8, cub::BLOCK_LOAD_VECTORIZE> BlockLoad;
   __shared__ typename BlockLoad::TempStorage temp_storage;
-  BlockLoad(temp_storage)
-      .Load(input + start_offset, input_vec, num_elements - start_offset, /*default=*/0);
+
+  // This fix the INT32_T overflow issue, which is possible in DiT video models
+  // where the kv_len could be 128K.
+  // ref:
+  // https://github.com/NVIDIA/cub/blob/0fc3c3701632a4be906765b73be20a9ad0da603d/cub/block/block_load.cuh#L711C13-L711C100
+  int block_items_end =
+      (num_elements - start_offset > INT32_MAX) ? INT32_MAX : num_elements - start_offset;
+  BlockLoad(temp_storage).Load(input + start_offset, input_vec, block_items_end, /*default=*/0);

   if constexpr (BITORDER == BitOrder::kBig) {
     ret = (input_vec[0] << 7) | (input_vec[1] << 6) | (input_vec[2] << 5) | (input_vec[3] << 4) |
```
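As a host-side sketch of what `PackBitsKernel` computes in its `BitOrder::kBig` branch (visible in the last context line above), here is a minimal NumPy reference; the helper name and the example input are made up for illustration:

```python
import numpy as np

def pack_bits_big(bits) -> np.ndarray:
    """Pack a boolean sequence into bytes, big bit order: element 8*j + i -> bit (7 - i) of byte j."""
    bits = np.asarray(bits, dtype=bool)
    pad = (-len(bits)) % 8
    # The kernel pads the trailing partial group with /*default=*/0; do the same here.
    bits = np.concatenate([bits, np.zeros(pad, dtype=bool)])
    return np.packbits(bits)  # np.packbits defaults to bitorder="big"

example = [1, 0, 1, 1, 0, 0, 0, 1, 1]
print(pack_bits_big(example))  # [177 128] -> 0b10110001, 0b10000000
```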

tests/test_block_sparse.py

Lines changed: 22 additions & 7 deletions
```diff
@@ -67,6 +67,12 @@ def bsr_attention_ref(
     return o


+def set_seed(seed: int = 42):
+    torch.cuda.manual_seed(seed)
+    torch.manual_seed(seed)
+    np.random.seed(seed)
+
+
 @pytest.mark.parametrize("R", [1, 4, 16])
 @pytest.mark.parametrize("C", [1, 4, 16])
 @pytest.mark.parametrize("M", [64, 128, 256])
@@ -80,7 +86,10 @@ def test_block_sparse_attention(
 ):
     if num_qo_heads % num_kv_heads != 0:
         pytest.skip("num_qo_heads must be divisible by num_kv_heads")
+
+    set_seed(33)
     rng = np.random.default_rng()
+
     MB = M // R
     NB = N // C
     S = sp.sparse.random(MB, NB, density=0.25, random_state=rng).tocsr()
@@ -182,6 +191,8 @@ def test_variable_block_sparse_attention_wrapper(
     if seq_len // num_blocks_col < 1:
         pytest.skip("seq_len must be greater than num_blocks_col")

+    set_seed(330)
+
     def random_partition_batch(
         seq_len: int,
         num_blocks: int,
@@ -209,7 +220,7 @@ def random_partition_batch(
         assert sizes.max() <= seq_len
         assert torch.all(sizes.sum(dim=-1) == seq_len)

-        return sizes
+        return sizes.to(device=device)

     def _test_variable_block_sparse_attention(
         num_qo_heads: int,
@@ -260,12 +271,15 @@ def _test_variable_block_sparse_attention(
         )
         torch.testing.assert_close(o[kv_head_idx], o_ref, atol=1e-2, rtol=1e-2)

-    block_row_sz = random_partition_batch(seq_len, num_blocks_row, num_kv_heads)
-    block_col_sz = random_partition_batch(seq_len, num_blocks_col, num_kv_heads)
+    block_row_sz = random_partition_batch(
+        seq_len, num_blocks_row, num_kv_heads, device="cuda:0"
+    )
+    block_col_sz = random_partition_batch(
+        seq_len, num_blocks_col, num_kv_heads, device="cuda:0"
+    )
     block_mask_map = (
         torch.rand(num_kv_heads, num_blocks_row, num_blocks_col) > block_density
-    )
-    block_mask_map = block_mask_map.to(dtype=torch.bool, device="cpu")
+    ).to(device="cuda:0")

     _test_variable_block_sparse_attention(
         num_qo_heads,
@@ -278,5 +292,6 @@ def _test_variable_block_sparse_attention(


 if __name__ == "__main__":
-    test_block_sparse_attention(1, 1, 64, 64, 1, 1, 128, False)
-    test_block_sparse_attention(16, 16, 256, 256, 16, 16, 256, True)
+    # This test verifies the INT32_T overflow issue.
+    for seq_len in [16 * 1024, 32 * 1024, 40 * 1024, 48 * 1024, 64 * 1024]:
+        test_block_sparse_attention(128, 128, seq_len, seq_len, 1, 1, 128, False)
```
