
Commit 0305341

fix: remove redundant zero_init reverted by #1459 (#1463)
<!-- .github/pull_request_template.md -->

## 📌 Description

The duplicate zero_init should be removed, but crashes were reported from DLFW, so the removal was reverted in #1459 and shipped as 0.2.11.post1. After this fix, **the workspace buffer passed into any trtllm-gen attn interface must be zero-initialized**. This PR re-enables the optimization. It should be merged and released only after these two are tested:

- sgl-project/sglang#9065
- vllm-project/vllm#22603

## 🔍 Related Issues

<!-- Link any related issues here -->

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [ ] I have installed the hooks with `pre-commit install`.
- [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [ ] Tests have been added or updated as needed.
- [ ] All tests are passing (`unittest`, etc.).

## Reviewer Notes

<!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->
1 parent 7ce448b commit 0305341
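
With the in-kernel zero_init gone, zero-filling the workspace is now the caller's responsibility. A minimal sketch of the updated calling convention, using only the allocation and wrapper construction shown in the docstrings changed below (buffer size follows the documented 128 MB recommendation):

```python
import torch
import flashinfer

# After this change the trtllm-gen attention kernels no longer clear the semaphore
# region themselves, so allocate the workspace with torch.zeros, not torch.empty.
workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")

# The same buffer can be reused across wrappers and calls; only its first use
# requires the zero fill.
decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(workspace_buffer, "NHD")
```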

File tree

6 files changed: +38 -67 lines changed


csrc/trtllm_fmha_kernel_launcher.cu

Lines changed: 3 additions & 4 deletions
@@ -20,7 +20,6 @@
 #include <flashinfer/trtllm/fmha/fmhaRunnerParams.h>
 #include <nvrtc.h>
 
-#include <flashinfer/semaphore_utils.cuh>
 #include <flashinfer/trtllm/fmha/fmhaRunner.cuh>
 #include <flashinfer/trtllm/fmha/gen_kernel_launcher.cuh>
 #include <flashinfer/utils.cuh>
@@ -146,13 +145,13 @@ void trtllm_paged_attention_launcher(
         use_multi_block ? TileScheduler::Static : TileScheduler::Persistent;
     runner_params.mMultiCtasKvMode = use_multi_block;
 
+    size_t max_batch_size = 8192;   // todo(Yingyi): get from dlfw
+    size_t max_num_qo_heads = 256;  // todo(Yingyi): get from dlfw, in total 8MB
     size_t num_semaphores =
-        round_up(batch_size * num_qo_heads, 8);  // align multiCtasKvScratchPtr to 16 bytes
+        round_up(max_batch_size * max_num_qo_heads, 8);  // max 8MB, should align to 16 bytes
     runner_params.multiCtasKvScratchPtr = reinterpret_cast<void*>(
         static_cast<char*>(workspace_buffer) + num_semaphores * sizeof(uint32_t));
     runner_params.multiCtasKvCounterPtr = reinterpret_cast<int32_t*>(workspace_buffer);
-    zero_gmem_semaphore_launcher(runner_params.multiCtasKvCounterPtr, num_semaphores,
-                                 /*enable_pdl=*/true, stream);
   }
 
   auto [foundKernels, kinfo] = fmha_runner->isSupportedWithInfo(runner_params);
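
The launcher now sizes the semaphore region for a fixed worst case (`max_batch_size * max_num_qo_heads`) instead of the current batch, with the multi-CTA KV scratch region starting immediately after it. A quick sanity check of the 8 MB figure mentioned in the comments, as a plain-Python sketch (the `round_up` here is a stand-in for the launcher's helper, not the actual C++ function):

```python
def round_up(x: int, multiple: int) -> int:
    # Stand-in for the round_up helper used in the launcher above.
    return (x + multiple - 1) // multiple * multiple

max_batch_size = 8192    # placeholder in the launcher: todo, get from DLFW
max_num_qo_heads = 256   # placeholder in the launcher: todo, get from DLFW
sizeof_uint32 = 4

num_semaphores = round_up(max_batch_size * max_num_qo_heads, 8)  # 2,097,152 counters
counter_bytes = num_semaphores * sizeof_uint32                   # 8,388,608 bytes = 8 MB

# multiCtasKvCounterPtr aliases the start of the workspace buffer;
# multiCtasKvScratchPtr begins right after the counter region.
scratch_offset = counter_bytes
print(num_semaphores, counter_bytes, scratch_offset)  # 2097152 8388608 8388608
```

Since the dropped `zero_gmem_semaphore_launcher` call used to clear exactly this counter region, a workspace that the caller does not zero-initialize would leave those counters holding garbage.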

flashinfer/decode.py

Lines changed: 4 additions & 4 deletions
@@ -593,7 +593,7 @@ class BatchDecodeWithPagedKVCacheWrapper:
     >>> max_num_pages = 128
     >>> page_size = 16
     >>> # allocate 128MB workspace buffer
-    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
+    >>> workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
     >>> decode_wrapper = flashinfer.BatchDecodeWithPagedKVCacheWrapper(
     ...     workspace_buffer, "NHD"
     ... )
@@ -658,7 +658,7 @@ def __init__(
 
         Parameters
         ----------
-        float_workspace_buffer : torch.Tensor
+        float_workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
            The user reserved float workspace buffer used to store intermediate attention results
            in the split-k algorithm. The recommended size is 128MB, the device of the workspace
            buffer should be the same as the device of the input tensors.
@@ -2000,7 +2000,7 @@ def trtllm_batch_decode_with_kv_cache(
        If kv_cache is a single tensor, it should be a tensor with shape [num_pages, 1 or 2, num_kv_heads, page_size, head_dim]
        If kv_cache is a tuple of two tensors, it should be a tuple of two tensors with shape [num_pages, num_kv_heads, page_size, head_dim]
 
-    workspace_buffer : torch.Tensor
+    workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
        workspace
 
    block_tables : torch.Tensor
@@ -2198,7 +2198,7 @@ def trtllm_batch_decode_with_kv_cache_mla(
    Parameters:
        query: [batch_size, q_len_per_request, num_heads, head_dim_qk], head_dim_qk = qk_nope_head_dim (kv_lora_rank) + qk_rope_head_dim, should be concated q_nope + q_rope; q_len_per_request is the MTP query length.
        kv_cache: [num_pages, page_size, head_dim_ckv + head_dim_kpe], should be concated ckv_cache + kpe_cache
-       workspace_buffer: [num_semaphores, 4], used for multi_block mode
+       workspace_buffer: [num_semaphores, 4], used for multi_block mode. Must be initialized to 0 for its first use.
        qk_nope_head_dim: qk_nope_head_dim, must be 128
        kv_lora_rank: kv_lora_rank, must be 512
        qk_rope_head_dim: qk_rope_head_dim, must be 64

flashinfer/prefill.py

Lines changed: 2 additions & 2 deletions
@@ -1215,7 +1215,7 @@ class BatchPrefillWithPagedKVCacheWrapper:
     >>> max_num_pages = 128
     >>> page_size = 16
     >>> # allocate 128MB workspace buffer
-    >>> workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
+    >>> workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.uint8, device="cuda:0")
     >>> prefill_wrapper = flashinfer.BatchPrefillWithPagedKVCacheWrapper(
     ...     workspace_buffer, "NHD"
     ... )
@@ -3144,7 +3144,7 @@ def trtllm_batch_context_with_kv_cache(
    kv_cache : Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]
        If kv_cache is a single tensor, it should be a tensor with shape [num_pages, 1 or 2, num_kv_heads, page_size, head_dim]
        If kv_cache is a tuple of two tensors, it should be a tuple of two tensors with shape [num_pages, num_kv_heads, page_size, head_dim]
-    workspace_buffer : torch.Tensor
+    workspace_buffer : torch.Tensor. Must be initialized to 0 for its first use.
        workspace
    block_tables : torch.Tensor
        page_table of kv cache, [batch_size, num_pages]

include/flashinfer/semaphore_utils.cuh

Lines changed: 0 additions & 53 deletions
This file was deleted.

tests/test_trtllm_gen_context.py

Lines changed: 15 additions & 2 deletions
@@ -7,6 +7,8 @@
 import flashinfer
 from flashinfer.utils import FP4Tensor
 
+global_workspace_buffer = None
+
 
 def flip_coin(*args, **kwargs):
     # Use any test parameters to deterministically decide branch
@@ -97,7 +99,12 @@ def test_trtllm_batch_context_wrapper(
     kv_last_page_len_cpu = torch.full(
         (batch_size,), (kv_len - 1) % page_size + 1, dtype=torch.int32
     )
-    workspace_buffer = torch.empty(256 * 1024 * 1024, dtype=torch.int8, device="cuda:0")
+    global global_workspace_buffer
+    if global_workspace_buffer is None:
+        global_workspace_buffer = torch.zeros(
+            256 * 1024 * 1024, dtype=torch.int8, device="cuda:0"
+        )
+    workspace_buffer = global_workspace_buffer
 
     # reference
     q_indptr_gpu = q_indptr_cpu.to(device)
@@ -337,7 +344,13 @@ def test_trtllm_batch_prefill(
     o_sf_vec_size = 16 if o_dtype == "nvfp4" else None
     sm_scale = float(1.0 / (head_dim**0.5))
 
-    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device)
+    global global_workspace_buffer
+    if global_workspace_buffer is None:
+        global_workspace_buffer = torch.zeros(
+            128 * 1024 * 1024, dtype=torch.int8, device="cuda:0"
+        )
+    workspace_buffer = global_workspace_buffer
+
     q_indptr = torch.cat(
         [
             torch.tensor([0], dtype=torch.int32, device=device),

tests/test_trtllm_gen_decode.py

Lines changed: 14 additions & 2 deletions
@@ -8,6 +8,8 @@
 import flashinfer
 from flashinfer.utils import FP4Tensor
 
+global_workspace_buffer = None
+
 
 def flip_coin(*args, **kwargs):
     # Use any test parameters to deterministically decide branch
@@ -235,7 +237,12 @@ def test_trtllm_batch_decode_fmha(
 
     sm_scale = float(1.0 / (head_dim**0.5))
 
-    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device)
+    global global_workspace_buffer
+    if global_workspace_buffer is None:
+        global_workspace_buffer = torch.zeros(
+            128 * 1024 * 1024, dtype=torch.int8, device="cuda:0"
+        )
+    workspace_buffer = global_workspace_buffer
 
     # Compute kv_indptr as cumulative sum of blocks per sequence
     kv_indptr = torch.cat(
@@ -469,7 +476,12 @@ def test_trtllm_batch_decode_mla(
 
     # Allocate workspace buffer
     # todo(Yingyi): calculate the actual size of workspace buffer
-    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device)
+    global global_workspace_buffer
+    if global_workspace_buffer is None:
+        global_workspace_buffer = torch.zeros(
+            128 * 1024 * 1024, dtype=torch.int8, device="cuda:0"
+        )
+    workspace_buffer = global_workspace_buffer
 
     bmm1_log2_scale_tensor = (
         torch.tensor(
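
Both test files adopt the same pattern: allocate one zero-filled workspace at module scope and reuse it across tests, so every kernel still sees a zeroed semaphore region on its first use without paying for a fresh 128 or 256 MB allocation (and fill) per test case. A sketch of the same idea factored into a helper (the name `get_global_workspace` is illustrative, not part of the test code):

```python
import torch

_global_workspace_buffer = None  # module-level cache, mirroring the tests above


def get_global_workspace(num_bytes: int = 128 * 1024 * 1024,
                         device: str = "cuda:0") -> torch.Tensor:
    """Return a shared zero-initialized workspace, allocating it on first call.

    Note: the size is fixed by whichever call happens first, just like the
    module-level globals in the tests.
    """
    global _global_workspace_buffer
    if _global_workspace_buffer is None:
        # torch.zeros, not torch.empty: the trtllm-gen kernels require the
        # semaphore region to be zero on its first use.
        _global_workspace_buffer = torch.zeros(num_bytes, dtype=torch.int8, device=device)
    return _global_workspace_buffer
```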
