
Commit 920c82a

Remove cudaMalloc/Free in GDN prefill kernel (#2415)
## 📌 Description

In the GDN prefill kernel, the line of code below causes a redundant cudaMalloc/cudaFree on every kernel execution, which hurts runtime performance. The workspace buffer it allocates is used for the TMA store output.

https://github.com/flashinfer-ai/flashinfer/blob/a49b45336e56e4615eae102cf29d5110293d9130/csrc/flat/prefill/prefill_kernel_delta_rule_sm90.cuh#L132

This PR replaces it with a workspace buffer created by torch with the same size (number of SMs × 128 B), removing the redundant cudaMalloc/cudaFree calls.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes

## Summary by CodeRabbit

* **Refactor**
  * Launchers and kernels now accept an external per-SM workspace buffer; internal workspace allocation removed.
  * Native launchers and Python prefill functions updated to accept, validate, and forward the workspace buffer.
  * Runtime checks added for the provided workspace; call sites updated to construct and pass a per-SM workspace where needed.
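For context, a minimal sketch of the allocation strategy described above, written against the public PyTorch API only. The actual implementation uses FlashInfer's cached-buffer helper shown in the `flashinfer/gdn_prefill.py` diff further down; the snippet below is illustrative, not the code added by this PR.

```python
import torch

device = torch.device("cuda")

# One 128 B TMA-store tensormap slot per SM, as described in this PR.
sm_count = torch.cuda.get_device_properties(device).multi_processor_count
workspace_buffer = torch.empty(sm_count * 128, dtype=torch.uint8, device=device)

# The buffer is then forwarded to the prefill launch instead of letting the
# kernel cudaMalloc/cudaFree its own scratch space on every call.
```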
1 parent c8d76d3 commit 920c82a

File tree

5 files changed (+49, -36 lines)


csrc/flat/prefill/prefill_kernel.hpp

Lines changed: 4 additions & 3 deletions
@@ -32,8 +32,9 @@ void launch_delta_rule_prefill_kernel(cudaStream_t stream, TO* output, TState* o
                                       TQKV const* q, TQKV const* k, TQKV const* v,
                                       TState const* input_state, float const* alpha,
                                       float const* beta, int64_t const* cu_seqlens,
-                                      int32_t num_seqs, int32_t num_q_heads, int32_t num_k_heads,
-                                      int32_t num_v_heads, int32_t num_o_heads, int32_t head_size,
-                                      int64_t total_seqlen, float scale, int32_t sm_count = 0);
+                                      uint8_t* workspace_buffer, int32_t num_seqs,
+                                      int32_t num_q_heads, int32_t num_k_heads, int32_t num_v_heads,
+                                      int32_t num_o_heads, int32_t head_size, int64_t total_seqlen,
+                                      float scale, int32_t sm_count = 0);
 
 }  // namespace flat

csrc/flat/prefill/prefill_kernel_delta_rule_sm90.cu

Lines changed: 15 additions & 13 deletions
@@ -27,19 +27,20 @@ void launch_delta_rule_prefill_kernel(cudaStream_t stream, TO* output, TState* o
                                       TQKV const* q, TQKV const* k, TQKV const* v,
                                       TState const* input_state, float const* alpha,
                                       float const* beta, int64_t const* cu_seqlens,
-                                      int32_t num_seqs, int32_t num_q_heads, int32_t num_k_heads,
-                                      int32_t num_v_heads, int32_t num_o_heads, int32_t head_size,
-                                      int64_t total_seqlen, float scale, int32_t sm_count) {
+                                      uint8_t* workspace_buffer, int32_t num_seqs,
+                                      int32_t num_q_heads, int32_t num_k_heads, int32_t num_v_heads,
+                                      int32_t num_o_heads, int32_t head_size, int64_t total_seqlen,
+                                      float scale, int32_t sm_count) {
   bool is_gva = num_v_heads > num_q_heads;
   bool needs_beta = beta != nullptr;
   bool needs_alpha = alpha != nullptr;
   bool init_state = input_state != nullptr;
 
-#define LAUNCH(is_gva, needs_beta, needs_alpha, init_state)                                     \
-  launch_delta_rule_prefill_kernel_gbai<is_gva, needs_beta, needs_alpha, init_state, ArchTag>(  \
-      stream, output, output_state, q, k, v, input_state, alpha, beta, cu_seqlens, num_seqs,    \
-      num_q_heads, num_k_heads, num_v_heads, num_o_heads, head_size, total_seqlen, scale,       \
-      sm_count);
+#define LAUNCH(is_gva, needs_beta, needs_alpha, init_state)                                        \
+  launch_delta_rule_prefill_kernel_gbai<is_gva, needs_beta, needs_alpha, init_state, ArchTag>(     \
+      stream, output, output_state, q, k, v, input_state, alpha, beta, cu_seqlens,                 \
+      workspace_buffer, num_seqs, num_q_heads, num_k_heads, num_v_heads, num_o_heads, head_size,   \
+      total_seqlen, scale, sm_count);
 
   if (init_state) {
     if (is_gva && needs_beta && needs_alpha) {
@@ -89,15 +90,16 @@ void launch_delta_rule_prefill_kernel(cudaStream_t stream, TO* output, TState* o
 template void launch_delta_rule_prefill_kernel<cutlass::arch::Sm90, half, half, float>(
     cudaStream_t stream, half* output, float* state, half const* q, half const* k, half const* v,
     float const* input_state, float const* alpha, float const* beta, int64_t const* cu_seqlens,
-    int32_t num_seqs, int32_t num_q_heads, int32_t num_k_heads, int32_t num_v_heads,
-    int32_t num_o_heads, int32_t head_size, int64_t total_seqlen, float scale, int32_t sm_count);
+    uint8_t* workspace_buffer, int32_t num_seqs, int32_t num_q_heads, int32_t num_k_heads,
+    int32_t num_v_heads, int32_t num_o_heads, int32_t head_size, int64_t total_seqlen, float scale,
+    int32_t sm_count);
 
 template void
 launch_delta_rule_prefill_kernel<cutlass::arch::Sm90, nv_bfloat16, nv_bfloat16, float>(
     cudaStream_t stream, nv_bfloat16* output, float* state, nv_bfloat16 const* q,
     nv_bfloat16 const* k, nv_bfloat16 const* v, float const* input_state, float const* alpha,
-    float const* beta, int64_t const* cu_seqlens, int32_t num_seqs, int32_t num_q_heads,
-    int32_t num_k_heads, int32_t num_v_heads, int32_t num_o_heads, int32_t head_size,
-    int64_t total_seqlen, float scale, int32_t sm_count);
+    float const* beta, int64_t const* cu_seqlens, uint8_t* workspace_buffer, int32_t num_seqs,
+    int32_t num_q_heads, int32_t num_k_heads, int32_t num_v_heads, int32_t num_o_heads,
+    int32_t head_size, int64_t total_seqlen, float scale, int32_t sm_count);
 
 }  // namespace flat

csrc/flat/prefill/prefill_kernel_delta_rule_sm90.cuh

Lines changed: 7 additions & 12 deletions
@@ -31,14 +31,12 @@ using namespace cute;
 
 template <bool IsGVA, bool NeedsBeta, bool NeedsAlpha, bool InitStateFromInput, typename ArchTag,
           typename TO, typename TQKV, typename TState>
-void launch_delta_rule_prefill_kernel_gbai(cudaStream_t stream, TO* output, TState* output_state,
-                                           TQKV const* q, TQKV const* k, TQKV const* v,
-                                           TState const* input_state, float const* alpha,
-                                           float const* beta, int64_t const* cu_seqlens,
-                                           int32_t num_seqs, int32_t num_q_heads,
-                                           int32_t num_k_heads, int32_t num_v_heads,
-                                           int32_t num_o_heads, int32_t head_size,
-                                           int64_t total_seqlen, float scale, int32_t sm_count) {
+void launch_delta_rule_prefill_kernel_gbai(
+    cudaStream_t stream, TO* output, TState* output_state, TQKV const* q, TQKV const* k,
+    TQKV const* v, TState const* input_state, float const* alpha, float const* beta,
+    int64_t const* cu_seqlens, uint8_t* workspace_buffer, int32_t num_seqs, int32_t num_q_heads,
+    int32_t num_k_heads, int32_t num_v_heads, int32_t num_o_heads, int32_t head_size,
+    int64_t total_seqlen, float scale, int32_t sm_count) {
 #if defined(FLAT_SM90A_ENABLED)
   constexpr bool HopperSupported = true;
 #else
@@ -128,16 +126,13 @@ void launch_delta_rule_prefill_kernel_gbai(cudaStream_t stream, TO* output, TSta
       },  // clang-format on
       .hw_info = hw_info};
 
-  size_t workspace_size = op.get_workspace_size(arguments);
-  cutlass::device_memory::allocation<uint8_t> workspace(workspace_size);
-
   cutlass::Status status;
   status = op.can_implement(arguments);
   if (status != cutlass::Status::kSuccess) {
     throw std::runtime_error("can_implement failed");
   }
 
-  status = op.initialize(arguments, workspace.get(), stream);
+  status = op.initialize(arguments, workspace_buffer, stream);
   if (status != cutlass::Status::kSuccess) {
     throw std::runtime_error("initialize failed");
   }

csrc/gdn_prefill_launcher.cu

Lines changed: 12 additions & 8 deletions
@@ -35,10 +35,10 @@ namespace flashinfer {
 
 void gdn_prefill_launcher(void* output, void* output_state, void* q, void* k, void* v,
                           void* input_state, void* alpha, void* beta, int64_t* cu_seqlens,
-                          int64_t num_seqs, int64_t num_q_heads, int64_t num_k_heads,
-                          int64_t num_v_heads, int64_t num_o_heads, int64_t head_size,
-                          int64_t packed_seq, float scale, int64_t sm_count, DLDataType dtype,
-                          cudaStream_t stream) {
+                          uint8_t* workspace_buffer, int64_t num_seqs, int64_t num_q_heads,
+                          int64_t num_k_heads, int64_t num_v_heads, int64_t num_o_heads,
+                          int64_t head_size, int64_t packed_seq, float scale, int64_t sm_count,
+                          DLDataType dtype, cudaStream_t stream) {
   DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(dtype, DType, [&] {
     int dev_id;
     cudaGetDevice(&dev_id);
@@ -51,8 +51,8 @@ void gdn_prefill_launcher(void* output, void* output_state, void* q, void* k, vo
         stream, static_cast<DType*>(output), static_cast<float*>(output_state),
         static_cast<DType const*>(q), static_cast<DType const*>(k), static_cast<DType const*>(v),
         static_cast<float const*>(input_state), static_cast<float const*>(alpha),
-        static_cast<float const*>(beta), cu_seqlens, num_seqs, num_q_heads, num_k_heads,
-        num_v_heads, num_o_heads, head_size, packed_seq, scale, sm_count);
+        static_cast<float const*>(beta), cu_seqlens, workspace_buffer, num_seqs, num_q_heads,
+        num_k_heads, num_v_heads, num_o_heads, head_size, packed_seq, scale, sm_count);
       return true;
     } else {
       std::ostringstream err_msg;
@@ -70,7 +70,8 @@ void gdn_prefill_launcher(void* output, void* output_state, void* q, void* k, vo
 
 void gdn_prefill(TensorView output, TensorView output_state, TensorView q, TensorView k,
                  TensorView v, TensorView cu_seqlens, Optional<TensorView> input_state,
-                 Optional<TensorView> alpha, Optional<TensorView> beta, double scale) {
+                 Optional<TensorView> alpha, Optional<TensorView> beta, double scale,
+                 TensorView workspace_buffer) {
   int64_t num_seqs = cu_seqlens.size(0) - 1;
   int64_t packed_seq = q.size(0);
   int64_t head_size = q.size(2);
@@ -109,13 +110,15 @@ void gdn_prefill(TensorView output, TensorView output_state, TensorView q, Tenso
   CHECK_INPUT(k);
   CHECK_INPUT(v);
   CHECK_INPUT(cu_seqlens);
+  CHECK_INPUT(workspace_buffer);
 
   TVM_FFI_ICHECK(output.dtype() == dl_float16 || output.dtype() == dl_bfloat16);
   TVM_FFI_ICHECK_EQ(output_state.dtype(), dl_float32);
   TVM_FFI_ICHECK_EQ(output.dtype(), q.dtype());
   TVM_FFI_ICHECK_EQ(output.dtype(), k.dtype());
   TVM_FFI_ICHECK_EQ(output.dtype(), v.dtype());
   TVM_FFI_ICHECK_EQ(cu_seqlens.dtype(), dl_int64);
+  TVM_FFI_ICHECK_EQ(workspace_buffer.dtype(), dl_uint8);
 
   TVM_FFI_ICHECK_EQ(packed_seq, k.size(0));
   TVM_FFI_ICHECK_EQ(packed_seq, v.size(0));
@@ -164,7 +167,8 @@ void gdn_prefill(TensorView output, TensorView output_state, TensorView q, Tenso
 
   gdn_prefill_launcher(output.data_ptr(), output_state.data_ptr(), q.data_ptr(), k.data_ptr(),
                        v.data_ptr(), input_state_ptr, alpha_ptr, beta_ptr,
-                       static_cast<int64_t*>(cu_seqlens.data_ptr()), num_seqs, num_q_heads,
+                       static_cast<int64_t*>(cu_seqlens.data_ptr()),
+                       static_cast<uint8_t*>(workspace_buffer.data_ptr()), num_seqs, num_q_heads,
                        num_k_heads, num_v_heads, num_o_heads, head_size, packed_seq,
                        static_cast<float>(scale), sm_count, q.dtype(), stream);
 }

flashinfer/gdn_prefill.py

Lines changed: 11 additions & 0 deletions
@@ -24,6 +24,8 @@
 from .utils import (
     register_custom_op,
     register_fake_op,
+    get_device_sm_count,
+    _get_cache_buf,
 )
 
 
@@ -45,6 +47,7 @@ def gdn_prefill(
     g: Optional[torch.Tensor],
     beta: Optional[torch.Tensor],
     scale: float,
+    workspace_buffer: torch.Tensor,
 ) -> None:
     module.gdn_prefill(
         output,
@@ -57,6 +60,7 @@ def gdn_prefill(
         g,
         beta,
         scale,
+        workspace_buffer,
     )
 
 @register_fake_op("flashinfer::gdn_prefill")
@@ -71,6 +75,7 @@ def _fake_gdn_prefill(
     g: Optional[torch.Tensor],
     beta: Optional[torch.Tensor],
     scale: float,
+    workspace_buffer: torch.Tensor,
 ) -> None:
     pass
 
@@ -183,6 +188,11 @@ def chunk_gated_delta_rule(
         device=q.device,
     )
 
+    # Prepare workspace buffer for TMA Store in kernel
+    # 128B tensormap for each SM on Hopper architecture
+    workspace_size = get_device_sm_count(q.device) * 128
+    workspace_buffer = _get_cache_buf("gdn_prefill_workspace", workspace_size, q.device)
+
     get_gdn_prefill_module().gdn_prefill(
         output,
         output_state,
@@ -194,6 +204,7 @@ def chunk_gated_delta_rule(
         g,
         beta,
         scale if scale is not None else 0.0,
+        workspace_buffer,
     )
 
     if output_final_state:
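The `_get_cache_buf` helper imported above is what keeps the workspace alive across calls, which is the mechanism that removes the per-call allocation. A rough sketch of that caching pattern follows; the names and details here are hypothetical, as the real utility lives in `flashinfer.utils` and may differ.

```python
from typing import Dict, Tuple

import torch

# Hypothetical re-implementation of the caching pattern used for the workspace.
_cache_buf: Dict[Tuple[str, torch.device], torch.Tensor] = {}


def get_cache_buf(name: str, bytes_needed: int, device: torch.device) -> torch.Tensor:
    key = (name, device)
    buf = _cache_buf.get(key)
    if buf is None or buf.numel() < bytes_needed:
        # Allocated once via the PyTorch caching allocator, so repeated
        # gdn_prefill calls do not pay for cudaMalloc/cudaFree each time.
        buf = torch.empty(bytes_needed, dtype=torch.uint8, device=device)
        _cache_buf[key] = buf
    return buf
```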

0 commit comments
