Skip to content

Commit df306f6

Browse files
authored
Revert "fix: remote redundant zero_init from trtllm-gen attn (#1444)" (#1459)
This reverts commit 5451029. <!-- .github/pull_request_template.md --> ## 📌 Description <!-- What does this PR do? Briefly describe the changes and why they’re needed. --> ## 🔍 Related Issues <!-- Link any related issues here --> ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [ ] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [ ] I have installed the hooks with `pre-commit install`. - [ ] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [ ] Tests have been added or updated as needed. - [ ] All tests are passing (`unittest`, etc.). ## Reviewer Notes <!-- Optional: anything you'd like reviewers to focus on, concerns, etc. -->
1 parent 5cd9805 commit df306f6

File tree

3 files changed

+19
-18
lines changed

3 files changed

+19
-18
lines changed

csrc/trtllm_fmha_kernel_launcher.cu

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,6 +151,8 @@ void trtllm_paged_attention_launcher(
151151
runner_params.multiCtasKvScratchPtr = reinterpret_cast<void*>(
152152
static_cast<char*>(workspace_buffer) + num_semaphores * sizeof(uint32_t));
153153
runner_params.multiCtasKvCounterPtr = reinterpret_cast<int32_t*>(workspace_buffer);
154+
zero_gmem_semaphore_launcher(runner_params.multiCtasKvCounterPtr, num_semaphores,
155+
/*enable_pdl=*/true, stream);
154156
}
155157

156158
auto [foundKernels, kinfo] = fmha_runner->isSupportedWithInfo(runner_params);

tests/test_trtllm_gen_decode.py

Lines changed: 16 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -469,7 +469,7 @@ def test_trtllm_batch_decode_mla(
469469

470470
# Allocate workspace buffer
471471
# todo(Yingyi): calculate the actual size of workspace buffer
472-
workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8, device=device)
472+
workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device)
473473

474474
bmm1_log2_scale_tensor = (
475475
torch.tensor(
@@ -487,22 +487,21 @@ def test_trtllm_batch_decode_mla(
487487
)
488488

489489
# Run decode-MLA
490-
for _ in range(3):
491-
output = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
492-
query=query,
493-
kv_cache=kv_cache.unsqueeze(1),
494-
workspace_buffer=workspace_buffer,
495-
qk_nope_head_dim=qk_nope_head_dim,
496-
kv_lora_rank=kv_lora_rank,
497-
qk_rope_head_dim=qk_rope_head_dim,
498-
block_tables=block_tables,
499-
seq_lens=seq_lens_tensor,
500-
max_seq_len=max_seq_len,
501-
bmm1_scale=scale / ((128 + 64) ** 0.5),
502-
bmm2_scale=1.0,
503-
bmm1_scale_log2_tensor=bmm1_log2_scale_tensor,
504-
bmm2_scale_tensor=bmm2_scale_tensor,
505-
)
490+
output = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
491+
query=query,
492+
kv_cache=kv_cache.unsqueeze(1),
493+
workspace_buffer=workspace_buffer,
494+
qk_nope_head_dim=qk_nope_head_dim,
495+
kv_lora_rank=kv_lora_rank,
496+
qk_rope_head_dim=qk_rope_head_dim,
497+
block_tables=block_tables,
498+
seq_lens=seq_lens_tensor,
499+
max_seq_len=max_seq_len,
500+
bmm1_scale=scale / ((128 + 64) ** 0.5),
501+
bmm2_scale=1.0,
502+
bmm1_scale_log2_tensor=bmm1_log2_scale_tensor,
503+
bmm2_scale_tensor=bmm2_scale_tensor,
504+
)
506505

507506
# Run reference attention and align output
508507
sm_scale = scale / (

version.txt

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
0.2.11
1+
0.2.11.post1

0 commit comments

Comments (0)