
Commit 5451029

fix: remove redundant zero_init from trtllm-gen attn (#1444)
1 parent bfccf68 commit 5451029

File tree

2 files changed: +17 −18 lines


csrc/trtllm_fmha_kernel_launcher.cu

Lines changed: 0 additions & 2 deletions
```diff
@@ -151,8 +151,6 @@ void trtllm_paged_attention_launcher(
     runner_params.multiCtasKvScratchPtr = reinterpret_cast<void*>(
         static_cast<char*>(workspace_buffer) + num_semaphores * sizeof(uint32_t));
     runner_params.multiCtasKvCounterPtr = reinterpret_cast<int32_t*>(workspace_buffer);
-    zero_gmem_semaphore_launcher(runner_params.multiCtasKvCounterPtr, num_semaphores,
-                                 /*enable_pdl=*/true, stream);
   }

   auto [foundKernels, kinfo] = fmha_runner->isSupportedWithInfo(runner_params);
```
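
The deleted zero_gmem_semaphore_launcher call re-zeroed the multi-CTA KV counter semaphores on every launch. The surviving lines show the layout: the int32 counters sit at the front of workspace_buffer, and the scratch region begins num_semaphores * sizeof(uint32_t) bytes in. A minimal Python sketch of that layout and of the caller-side contract the removal implies (num_semaphores and the buffer size here are illustrative assumptions, not values taken from the launcher):

```python
import torch

# Illustrative sizes only; the real launcher computes num_semaphores itself.
num_semaphores = 512
workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8)

# Layout implied by the C++ above: int32 counters at the front of the
# buffer, multi-CTA KV scratch space immediately after them.
counters = workspace_buffer[: num_semaphores * 4].view(torch.int32)
scratch = workspace_buffer[num_semaphores * 4 :]

# With the device-side zeroing removed, the counters start at zero only
# because the buffer was allocated with torch.zeros rather than torch.empty.
assert counters.eq(0).all()
```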

tests/test_trtllm_gen_decode.py

Lines changed: 17 additions & 16 deletions
```diff
@@ -465,7 +465,7 @@ def test_trtllm_batch_decode_mla(

     # Allocate workspace buffer
     # todo(Yingyi): calculate the actual size of workspace buffer
-    workspace_buffer = torch.empty(128 * 1024 * 1024, dtype=torch.int8, device=device)
+    workspace_buffer = torch.zeros(128 * 1024 * 1024, dtype=torch.int8, device=device)

     bmm1_log2_scale_tensor = (
         torch.tensor(
@@ -483,21 +483,22 @@ def test_trtllm_batch_decode_mla(
     )

     # Run decode-MLA
-    output = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
-        query=query,
-        kv_cache=kv_cache.unsqueeze(1),
-        workspace_buffer=workspace_buffer,
-        qk_nope_head_dim=qk_nope_head_dim,
-        kv_lora_rank=kv_lora_rank,
-        qk_rope_head_dim=qk_rope_head_dim,
-        block_tables=block_tables,
-        seq_lens=seq_lens_tensor,
-        max_seq_len=max_seq_len,
-        bmm1_scale=scale / ((128 + 64) ** 0.5),
-        bmm2_scale=1.0,
-        bmm1_scale_log2_tensor=bmm1_log2_scale_tensor,
-        bmm2_scale_tensor=bmm2_scale_tensor,
-    )
+    for _ in range(3):
+        output = flashinfer.decode.trtllm_batch_decode_with_kv_cache_mla(
+            query=query,
+            kv_cache=kv_cache.unsqueeze(1),
+            workspace_buffer=workspace_buffer,
+            qk_nope_head_dim=qk_nope_head_dim,
+            kv_lora_rank=kv_lora_rank,
+            qk_rope_head_dim=qk_rope_head_dim,
+            block_tables=block_tables,
+            seq_lens=seq_lens_tensor,
+            max_seq_len=max_seq_len,
+            bmm1_scale=scale / ((128 + 64) ** 0.5),
+            bmm2_scale=1.0,
+            bmm1_scale_log2_tensor=bmm1_log2_scale_tensor,
+            bmm2_scale_tensor=bmm2_scale_tensor,
+        )

     # Run reference attention and align output
     sm_scale = scale / (
```
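
The test now allocates the workspace with torch.zeros instead of torch.empty and drives the decode three times over the same buffer. That pattern only holds together if the kernel leaves the counter region zeroed when it completes, so no host-side re-initialization is needed between calls. A self-contained sketch of that invariant, with fake_decode_kernel as a hypothetical stand-in for the real launcher:

```python
import torch

# Illustrative value; the real launcher derives num_semaphores internally.
NUM_SEMAPHORES = 512

def fake_decode_kernel(workspace: torch.Tensor) -> None:
    """Hypothetical stand-in for the real kernel: it bumps the counters
    while 'running', then hands the buffer back with them re-zeroed."""
    counters = workspace[: NUM_SEMAPHORES * 4].view(torch.int32)
    counters += 1      # simulate CTAs incrementing the semaphores
    counters.zero_()   # kernel restores the zeroed state on completion

workspace = torch.zeros(1024 * 1024, dtype=torch.int8)

for _ in range(3):  # mirrors the test's reuse loop
    # No re-zeroing between iterations: each call relies on the previous
    # call having left the counters at zero.
    assert workspace[: NUM_SEMAPHORES * 4].view(torch.int32).eq(0).all()
    fake_decode_kernel(workspace)
```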
