
Commit ec4fc2c

happierpig authored
feat: add warp-level persistent qk norm (#1843)
## 📌 Description

Recent models apply QK normalization right before RoPE and core self-attention (e.g., Qwen-3, Wan). The existing RMSNorm implementation in FlashInfer falls short of optimal for this pattern:

1. It performs an extra shared-memory reduction step.
2. It does not support a non-contiguous layout on the middle dimension, e.g., q may be `[batch_size, :num_qo_heads, head_dim]`.

This PR implements a persistent version of RMSNorm in which each head is handled by a single warp.

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

---

Co-authored-by: happierpig <[email protected]>
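For context, a minimal usage sketch of the new 3-D path, assuming the public `flashinfer.norm.rmsnorm` API exercised by the test added below; shapes and values are illustrative, not from the PR:

```python
import torch
import flashinfer

# One normalization job per (batch, head) pair; each job maps to a warp.
q = torch.randn(8, 16, 128, dtype=torch.float16, device="cuda")
w = torch.randn(128, dtype=torch.float16, device="cuda")

# A 3-D input dispatches to the warp-level persistent QK RMSNorm kernel.
q_normed = flashinfer.norm.rmsnorm(q, w)
```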
1 parent be130a7 · commit ec4fc2c

File tree: 4 files changed, +204 −19 lines.

csrc/norm.cu

Lines changed: 49 additions & 17 deletions

```diff
@@ -23,27 +23,59 @@ using tvm::ffi::Tensor;
 
 void rmsnorm(Tensor output, Tensor input, Tensor weight, double eps, bool enable_pdl) {
   CHECK_LAST_DIM_CONTIGUOUS_INPUT(input);
+  CHECK_LAST_DIM_CONTIGUOUS_INPUT(output);
   CHECK_LAST_DIM_CONTIGUOUS_INPUT(weight);
   CHECK_DEVICE(input, weight);
-  CHECK_DIM(2, input);   // input: (batch_size, hidden_size)
   CHECK_DIM(1, weight);  // weight: (hidden_size)
-  TVM_FFI_ICHECK_EQ(input->shape[1], weight->shape[0]);
-  unsigned int batch_size = input->shape[0];
-  unsigned int hidden_size = input->shape[1];
-  TVM_FFI_ICHECK_EQ(output->shape[0], batch_size);
-  TVM_FFI_ICHECK_EQ(output->shape[1], hidden_size);
-  cudaSetDevice(input->device.device_id);
-  const cudaStream_t stream = get_stream(input->device);
 
-  DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(input->dtype, c_type, [&] {
-    cudaError_t status =
-        norm::RMSNorm(static_cast<c_type*>(input->data), static_cast<c_type*>(weight->data),
-                      static_cast<c_type*>(output->data), batch_size, hidden_size,
-                      input->strides[0], output->strides[0], eps, enable_pdl, stream);
-    TVM_FFI_ICHECK(status == cudaSuccess)
-        << "RMSNorm failed with error code " << cudaGetErrorString(status);
-    return true;
-  });
+  auto input_ndim = input->ndim;
+  if (input_ndim == 2) {
+    // Normal RMSNorm: [batch_size, hidden_size]
+    // Use CTA parallelization for better parallelism
+    CHECK_DIM(2, output);
+    TVM_FFI_ICHECK_EQ(input->shape[1], weight->shape[0]);
+    unsigned int batch_size = input->shape[0];
+    unsigned int hidden_size = input->shape[1];
+    TVM_FFI_ICHECK_EQ(output->shape[0], batch_size);
+    TVM_FFI_ICHECK_EQ(output->shape[1], hidden_size);
+    cudaSetDevice(input->device.device_id);
+    const cudaStream_t stream = get_stream(input->device);
+
+    DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(input->dtype, c_type, [&] {
+      cudaError_t status =
+          norm::RMSNorm(static_cast<c_type*>(input->data), static_cast<c_type*>(weight->data),
+                        static_cast<c_type*>(output->data), batch_size, hidden_size,
+                        input->strides[0], output->strides[0], eps, enable_pdl, stream);
+      TVM_FFI_ICHECK(status == cudaSuccess)
+          << "RMSNorm failed with error code " << cudaGetErrorString(status);
+      return true;
+    });
+  } else if (input_ndim == 3) {
+    // QK RMSNorm: [batch_size, num_heads, head_dim]
+    // Use warp-level parallelization
+    CHECK_DIM(3, output);  // output: (batch_size, num_heads, hidden_size)
+    TVM_FFI_ICHECK_EQ(input->shape[2], weight->shape[0]);
+    unsigned int batch_size = input->shape[0];
+    unsigned int num_heads = input->shape[1];
+    unsigned int hidden_size = input->shape[2];
+    TVM_FFI_ICHECK_EQ(output->shape[0], batch_size);
+    TVM_FFI_ICHECK_EQ(output->shape[1], num_heads);
+    TVM_FFI_ICHECK_EQ(output->shape[2], hidden_size);
+
+    cudaSetDevice(input->device.device_id);
+    const cudaStream_t stream = get_stream(input->device);
+    DISPATCH_DLPACK_DTYPE_TO_CTYPE_FP16(input->dtype, c_type, [&] {
+      cudaError_t status = norm::QKRMSNorm(
+          static_cast<c_type*>(input->data), static_cast<c_type*>(weight->data),
+          static_cast<c_type*>(output->data), batch_size, num_heads, hidden_size,
+          input->strides[0], input->strides[1], output->strides[0], output->strides[1],
+          eps, enable_pdl, stream);
+      TVM_FFI_ICHECK(status == cudaSuccess)
+          << "QKRMSNorm failed with error code " << cudaGetErrorString(status);
+      return true;
+    });
+  } else {
+    TVM_FFI_ICHECK(false) << "Unsupported input dimension: " << input_ndim;
+  }
 }
 
 void fused_add_rmsnorm(Tensor input, Tensor residual, Tensor weight, double eps, bool enable_pdl) {
```
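The 3-D branch above forwards both `input->strides[0]` and `input->strides[1]` (and the matching output strides) to `norm::QKRMSNorm`, which is what makes the non-contiguous middle dimension from the PR description work. A hedged sketch of that case, with illustrative tensor names:

```python
import torch
import flashinfer

# A fused QKV projection output whose q slice is non-contiguous in the
# head dimension: the batch stride is 3 * num_heads * head_dim elements.
num_heads, head_dim = 8, 128
qkv = torch.randn(4, 3 * num_heads, head_dim, dtype=torch.float16, device="cuda")
q = qkv[:, :num_heads, :]  # [batch_size, :num_qo_heads, head_dim] view
w = torch.randn(head_dim, dtype=torch.float16, device="cuda")

# No .contiguous() copy needed; the kernel consumes the strides directly.
q_normed = flashinfer.norm.rmsnorm(q, w)
```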

flashinfer/norm.py

Lines changed: 2 additions & 2 deletions

```diff
@@ -54,7 +54,7 @@ def rmsnorm(
     Parameters
     ----------
     input: torch.Tensor
-        Input tensor, shape (batch_size, hidden_size).
+        Input tensor, 2D shape (batch_size, hidden_size) or 3D shape (batch_size, num_heads, hidden_size).
     weight: torch.Tensor
         Weight tensor, shape (hidden_size,).
     eps: float
@@ -68,7 +68,7 @@ def rmsnorm(
     Returns
     -------
     output: torch.Tensor
-        Normalized tensor, shape (batch_size, hidden_size).
+        Normalized tensor, 2D shape (batch_size, hidden_size) or 3D shape (batch_size, num_heads, hidden_size).
     """
     if enable_pdl is None:
         enable_pdl = device_support_pdl(input.device)
```

include/flashinfer/norm.cuh

Lines changed: 122 additions & 0 deletions

```diff
@@ -139,6 +139,128 @@ cudaError_t RMSNorm(T* input, T* weight, T* output, uint32_t batch_size, uint32_
   return cudaSuccess;
 }
 
+template <uint32_t VEC_SIZE, typename T>
+__global__ void QKRMSNormKernel(T* __restrict__ input, T* __restrict__ weight,
+                                T* __restrict__ output, const uint32_t d, const uint32_t batch_size,
+                                const uint32_t num_heads, const uint32_t stride_input_n,
+                                const uint32_t stride_input_h, const uint32_t stride_output_n,
+                                const uint32_t stride_output_h, float weight_bias, float eps) {
+  const uint32_t num_blks = gridDim.x, num_warps = blockDim.y;
+  const uint32_t num_workers = num_blks * num_warps;  // unroll on warp-dim
+  const uint32_t num_jobs = batch_size * num_heads;
+
+  const uint32_t bx = blockIdx.x;
+  const uint32_t tx = threadIdx.x, ty = threadIdx.y;
+  const uint32_t worker_idx = bx * num_warps + ty;
+
+  constexpr uint32_t warp_size = 32;
+  const uint32_t num_threads = warp_size;
+  const uint32_t thread_id = tx;
+  const uint32_t rounds = ceil_div(d, VEC_SIZE * num_threads);
+
+#if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.wait;");
+#endif
+
+  for (uint32_t job_idx = worker_idx; job_idx < num_jobs; job_idx += num_workers) {
+    // clear buffer
+    float sum_sq = 0.f;
+
+    // map back to batch-idx and head-idx; layout [batch_size, num_heads, head_dim]
+    const uint32_t batch_idx = job_idx / num_heads;
+    const uint32_t head_idx = job_idx % num_heads;
+
+    for (uint32_t i = 0; i < rounds; i++) {
+      vec_t<T, VEC_SIZE> input_vec;
+      input_vec.fill(0.f);
+      if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+        input_vec.load(input + batch_idx * stride_input_n + head_idx * stride_input_h +
+                       i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      }
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; j++) {
+        sum_sq += float(input_vec[j]) * float(input_vec[j]);
+      }
+    }
+
+    // warp-level reduction is sufficient here;
+    // no need for __syncwarp() as shfl already synchronizes
+#pragma unroll
+    for (uint32_t offset = warp_size / 2; offset > 0; offset /= 2) {
+      sum_sq += math::shfl_xor_sync(sum_sq, offset);
+    }
+
+    float rms_rcp = math::rsqrt(sum_sq / float(d) + eps);
+
+    for (uint32_t i = 0; i < rounds; i++) {
+      vec_t<T, VEC_SIZE> input_vec;
+      vec_t<T, VEC_SIZE> weight_vec;
+      vec_t<T, VEC_SIZE> output_vec;
+      input_vec.fill(0.f);
+      weight_vec.fill(0.f);
+      if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+        input_vec.load(input + batch_idx * stride_input_n + head_idx * stride_input_h +
+                       i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+        weight_vec.load(weight + i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      }
+#pragma unroll
+      for (uint32_t j = 0; j < VEC_SIZE; j++) {
+        output_vec[j] = float(input_vec[j]) * rms_rcp * (weight_bias + float(weight_vec[j]));
+      }
+      if ((i * num_threads + thread_id) * VEC_SIZE < d) {
+        output_vec.store(output + batch_idx * stride_output_n + head_idx * stride_output_h +
+                         i * num_threads * VEC_SIZE + thread_id * VEC_SIZE);
+      }
+    }
+  }
+#if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900))
+  asm volatile("griddepcontrol.launch_dependents;");
+#endif
+}
+
+template <typename T>
+cudaError_t QKRMSNorm(T* input, T* weight, T* output, uint32_t batch_size, uint32_t num_heads,
+                      uint32_t d, uint32_t stride_input_n, uint32_t stride_input_h,
+                      uint32_t stride_output_n, uint32_t stride_output_h, float eps = 1e-5,
+                      bool enable_pdl = false, cudaStream_t stream = 0) {
+  const uint32_t vec_size = std::gcd(16 / sizeof(T), d);
+  const uint32_t num_warps = 4;
+  const uint32_t smem_size = 0;
+
+  float weight_bias = 0.f;
+
+  cudaLaunchConfig_t config;
+  config.dynamicSmemBytes = smem_size;
+  config.stream = stream;
+  cudaLaunchAttribute attrs[1];
+  attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization;
+  attrs[0].val.programmaticStreamSerializationAllowed = enable_pdl;
+  config.numAttrs = 1;
+  config.attrs = attrs;
+
+  DISPATCH_ALIGNED_VEC_SIZE(vec_size, VEC_SIZE, {
+    auto kernel = QKRMSNormKernel<VEC_SIZE, T>;
+
+    // calculate launching blocks
+    int num_blocks_per_sm = 0, num_sms = 0, dev_id = 0;
+    FLASHINFER_CUDA_CALL(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_blocks_per_sm, kernel,
+                                                                       num_warps * 32, smem_size));
+    FLASHINFER_CUDA_CALL(cudaGetDevice(&dev_id));
+    FLASHINFER_CUDA_CALL(cudaDeviceGetAttribute(&num_sms, cudaDevAttrMultiProcessorCount, dev_id));
+
+    dim3 nblks(num_blocks_per_sm * num_sms);
+    dim3 nthrs(32, num_warps);
+    config.gridDim = nblks;
+    config.blockDim = nthrs;
+
+    // execute kernel
+    FLASHINFER_CUDA_CALL(cudaLaunchKernelEx(&config, kernel, input, weight, output, d, batch_size,
+                                            num_heads, stride_input_n, stride_input_h,
+                                            stride_output_n, stride_output_h, weight_bias, eps));
+  });
+  return cudaSuccess;
+}
+
 template <uint32_t VEC_SIZE, typename T>
 __global__ void FusedAddRMSNormKernel(T* __restrict__ input, T* __restrict__ residual,
                                       T* __restrict__ weight, const uint32_t d,
```
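Each warp owns one `(batch_idx, head_idx)` job (`job_idx = batch_idx * num_heads + head_idx`) and strides through jobs persistently across a grid sized to fill the device. The per-job math, restated as a NumPy reference under the same conventions (fp32 accumulation, `weight_bias = 0` for plain RMSNorm), is a sketch of the computation, not the kernel itself:

```python
import numpy as np

def qk_rmsnorm_reference(x, w, eps=1e-5, weight_bias=0.0):
    """x: [batch_size, num_heads, head_dim], w: [head_dim]."""
    xf = x.astype(np.float32)
    # Per-(batch, head) sum of squares; in the kernel this is a
    # shfl_xor tree reduction across the 32 lanes of one warp.
    sum_sq = (xf * xf).sum(axis=-1, keepdims=True)
    rms_rcp = 1.0 / np.sqrt(sum_sq / x.shape[-1] + eps)
    return (xf * rms_rcp * (weight_bias + w.astype(np.float32))).astype(x.dtype)
```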

tests/utils/test_norm.py

Lines changed: 31 additions & 0 deletions

```diff
@@ -93,6 +93,37 @@ def test_norm(batch_size, hidden_size, dtype, specify_out, enable_pdl, contiguou
     torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
 
 
+@pytest.mark.parametrize("batch_size", [1, 19, 99, 989])
+@pytest.mark.parametrize("num_heads", [4, 7, 16])
+@pytest.mark.parametrize("head_dim", [64, 128, 256, 512])
+@pytest.mark.parametrize("dtype", [torch.float16])
+@pytest.mark.parametrize("specify_out", [True, False])
+@pytest.mark.parametrize("enable_pdl", [True, False])
+@pytest.mark.parametrize("contiguous", [True, False])
+def test_qknorm(
+    batch_size, num_heads, head_dim, dtype, specify_out, enable_pdl, contiguous
+):
+    if contiguous:
+        x = torch.randn(batch_size, num_heads, head_dim).to(0).to(dtype)
+    else:
+        x = torch.randn(batch_size, num_heads * 2, head_dim, device="cuda").to(dtype)
+        x = x[:, :num_heads, :head_dim]
+
+    if enable_pdl and not device_support_pdl(x.device):
+        pytest.skip("PDL is only available for Hopper and later GPUs")
+
+    w = torch.randn(head_dim).to(0).to(dtype)
+
+    y_ref = llama_rms_norm(x, w)
+    if specify_out:
+        y = torch.empty_like(x)
+        flashinfer.norm.rmsnorm(x, w, out=y, enable_pdl=enable_pdl)
+    else:
+        y = flashinfer.norm.rmsnorm(x, w, enable_pdl=enable_pdl)
+
+    torch.testing.assert_close(y_ref, y, rtol=1e-3, atol=1e-3)
+
+
 @pytest.mark.parametrize("batch_size", [1, 19, 99, 989])
 @pytest.mark.parametrize("hidden_size", [111, 500, 1024, 3072, 3584, 4096, 8192, 16384])
 @pytest.mark.parametrize("dtype", [torch.float16])
```
