Commit a4ddf26

Browse files
authored
PDL patch for TGV GEMM (#1877)
## 📌 Description

Add the missing nvcc flags for TGV GEMM, including the one that enables the PDL feature. Also add safeguarding checks to TGV GEMM to prevent tensor sizes that are too large for TMA.

## 🔍 Related Issues

## 🚀 Pull Request Checklist

Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete.

### ✅ Pre-commit Checks

- [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method).
- [x] I have installed the hooks with `pre-commit install`.
- [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues.

> If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/).

## 🧪 Tests

- [x] Tests have been added or updated as needed.
- [x] All tests are passing (`unittest`, etc.).

## Reviewer Notes
1 parent 792dcb1 · commit a4ddf26

3 files changed: +16 −2 lines changed


csrc/tgv_gemm.cu

Lines changed: 8 additions & 0 deletions

```diff
@@ -148,6 +148,14 @@ Tensor tgv_gemm(Tensor const& mat1, Tensor const& mat2, Optional<Tensor> bias, i
   int K = mat1->shape[1];
   int N = mat2->shape[1];
 
+  int64_t element_size = get_element_size(mat1);
+  TVM_FFI_ICHECK(int64_t(M) * N * element_size < std::numeric_limits<int32_t>::max())
+      << "TMA plane stride (M * N * element_size) exceeds INT32_MAX; tensor too large for TMA";
+  TVM_FFI_ICHECK(int64_t(M) * K * element_size < std::numeric_limits<int32_t>::max())
+      << "TMA plane stride (M * K * element_size) exceeds INT32_MAX; mat1 too large for TMA";
+  TVM_FFI_ICHECK(int64_t(N) * K * element_size < std::numeric_limits<int32_t>::max())
+      << "TMA plane stride (N * K * element_size) exceeds INT32_MAX; mat2 too large for TMA";
+
   // validity check for bias
   if (bias.has_value()) {
     TVM_FFI_ICHECK_EQ(bias.value()->device.device_type, kDLCUDA) << "Bias tensor must be on CUDA";
```
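
The bound these checks enforce can be illustrated with a small Python sketch; this is a mock-up of the C++ condition, not FlashInfer code, and the 16384/65536 sizes are illustrative assumptions:

```python
# Python mock-up of the TVM_FFI_ICHECK condition added above (illustrative only).
INT32_MAX = 2**31 - 1

def tma_plane_stride_ok(rows: int, cols: int, element_size: int) -> bool:
    """The TMA plane stride rows * cols * element_size must fit in a signed 32-bit int."""
    return rows * cols * element_size < INT32_MAX

# With 2-byte bfloat16 elements, a 16384 x 16384 plane (512 MiB) passes,
# while a 65536 x 65536 plane (8 GiB) would trip the new check.
assert tma_plane_stride_ok(16384, 16384, element_size=2)
assert not tma_plane_stride_ok(65536, 65536, element_size=2)
```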

flashinfer/gemm.py

Lines changed: 1 addition & 1 deletion

```diff
@@ -584,7 +584,7 @@ def tgv_gemm_sm100(
         pdl: Whether to use PDL (persistent data loader), defaults to False
 
     Returns:
-        Output tensor of shape (M, N)
+        Output tensor of shape (M, N) in row-major layout
 
     Supported dtypes:
     - torch.bfloat16
```
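
For reference, a minimal usage sketch of the documented entry point. Only the `pdl` flag, bfloat16 support, and the (M, N) row-major output are confirmed by this diff; the positional `mat1`/`mat2` arguments, their shapes, and layout requirements are assumptions taken from the C++ binding above:

```python
# Hedged usage sketch; argument order mirrors the C++ tgv_gemm(mat1, mat2, bias, ...) binding,
# and the operand shapes/layouts are assumed rather than taken from this diff.
import torch
from flashinfer.gemm import tgv_gemm_sm100

M, K, N = 128, 4096, 4096
mat1 = torch.randn(M, K, dtype=torch.bfloat16, device="cuda")
mat2 = torch.randn(K, N, dtype=torch.bfloat16, device="cuda")

# pdl=True requests the PDL launch path enabled by this patch.
out = tgv_gemm_sm100(mat1, mat2, pdl=True)
assert out.shape == (M, N)  # row-major (M, N) output, per the updated docstring
```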

flashinfer/jit/gemm/core.py

Lines changed: 7 additions & 1 deletion

```diff
@@ -445,7 +445,13 @@ def gen_tgv_gemm_sm10x_module(
     return gen_jit_spec(
         module_name,
         source_paths,
-        extra_cuda_cflags=sm100f_nvcc_flags if use_sm_100f else sm100a_nvcc_flags,
+        extra_cuda_cflags=[
+            "--expt-relaxed-constexpr",
+            "-DCUTLASS_ENABLE_GDC_FOR_SM100=1",
+        ]
+        + sm100f_nvcc_flags
+        if use_sm_100f
+        else sm100a_nvcc_flags,
         extra_include_paths=[
             jit_env.FLASHINFER_INCLUDE_DIR,
             jit_env.FLASHINFER_CSRC_DIR,
```
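
One parsing note: Python's conditional expression binds more loosely than `+`, so the expression above groups as `([extra flags] + sm100f_nvcc_flags) if use_sm_100f else sm100a_nvcc_flags`. A small standalone sketch with placeholder flag lists (the real contents of `sm100f_nvcc_flags` and `sm100a_nvcc_flags` are not shown here and are assumed):

```python
# Placeholder flag lists; the real sm100f/sm100a contents are assumptions.
sm100f_nvcc_flags = ["<sm_100f arch flags>"]
sm100a_nvcc_flags = ["<sm_100a arch flags>"]

def build_cflags(use_sm_100f: bool) -> list:
    # Same shape as the diff: it parses as
    # ([extra flags] + sm100f_nvcc_flags) if use_sm_100f else sm100a_nvcc_flags
    return (
        [
            "--expt-relaxed-constexpr",
            "-DCUTLASS_ENABLE_GDC_FOR_SM100=1",
        ]
        + sm100f_nvcc_flags
        if use_sm_100f
        else sm100a_nvcc_flags
    )

assert build_cflags(True)[0] == "--expt-relaxed-constexpr"
assert build_cflags(False) == sm100a_nvcc_flags
```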
