TVM: support TVM binding for GroupedGemm (#1725)

neurusL · Anrui Liu · web-flow · commit a6e9c9f8759e · 2025-09-20T15:50:01.000-07:00
## 📌 Description  This PR adds support for the GroupedGemm tvm_binding on the FlashInfer side. - ```flashinfer/tvm_binding/grouped_gemm_fp8.cu``` contains the implementation that dispatches to the ```group_gemm::CutlassFP8GroupwiseScaledGroupGEMMSM100``` templates, supporting JIT compilation - ```flashinfer/tvm_binding/grouped_gemm_fp8_jit_tvm_binding.cu``` contains the declaration of the above function - ```flashinfer/flashinfer/jit/gemm``` contains the interface exposed to TVM ## 🔍 Related Issues  ## 🚀 Pull Request Checklist Thank you for contributing to FlashInfer! Before we review your pull request, please make sure the following items are complete. ### ✅ Pre-commit Checks - [x] I have installed `pre-commit` by running `pip install pre-commit` (or used your preferred method). - [x] I have installed the hooks with `pre-commit install`. - [x] I have run the hooks manually with `pre-commit run --all-files` and fixed any reported issues. > If you are unsure about how to set up `pre-commit`, see [the pre-commit documentation](https://pre-commit.com/). ## 🧪 Tests - [x] Tests have been added or updated as needed. - [x] All tests are passing (`unittest`, etc.). ## Reviewer Notes  --------- Co-authored-by: Anrui Liu <anruil@catalyst-fleet1.cs.cmu.edu>
diff --git a/.gitmodules b/.gitmodules
@@ -1,9 +1,6 @@
 [submodule "3rdparty/cutlass"]
 	path = 3rdparty/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
-[submodule "3rdparty/composable_kernels"]
-	path = 3rdparty/composable_kernels
-	url = https://github.com/ROCm/composable_kernel.git
 [submodule "3rdparty/spdlog"]
 	path = 3rdparty/spdlog
 	url = https://github.com/gabime/spdlog.git
diff --git a/flashinfer/jit/__init__.py b/flashinfer/jit/__init__.py
@@ -78,6 +78,9 @@
 from .core import current_compilation_context as current_compilation_context
 from .cubin_loader import setup_cubin_loader
 
+from .gemm import gen_grouped_gemm_fp8_tvm_binding as gen_grouped_gemm_fp8_tvm_binding
+from .gemm import get_grouped_gemm_fp8_uri as get_grouped_gemm_fp8_uri
+
 
 @functools.cache
 def get_cudnn_fmha_gen_module():
diff --git a/flashinfer/jit/gemm/__init__.py b/flashinfer/jit/gemm/__init__.py
@@ -0,0 +1,18 @@
+"""
+Copyright (c) 2025 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+from .tvm import gen_grouped_gemm_fp8_tvm_binding as gen_grouped_gemm_fp8_tvm_binding
+from .tvm import get_grouped_gemm_fp8_uri as get_grouped_gemm_fp8_uri
diff --git a/flashinfer/jit/gemm/tvm.py b/flashinfer/jit/gemm/tvm.py
@@ -0,0 +1,148 @@
+"""
+Copyright (c) 2025 by FlashInfer team.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+  http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+"""
+
+import os
+from typing import Tuple
+
+import torch
+
+from .. import env as jit_env
+from ..utils import write_if_different
+
+
+def gen_grouped_gemm_fp8_tvm_binding(
+    uri: str,
+    dtype_a: torch.dtype,
+    dtype_b: torch.dtype,
+    dtype_out: torch.dtype,
+    scale_granularity_m: int,
+    scale_granularity_n: int,
+    scale_granularity_k: int,
+    scale_major_mode: str,  # "K" or "MN"
+    mma_sm: int,
+) -> Tuple[str, list]:
+    """Generate TVM binding for FP8 grouped GEMM.
+
+    Parameters
+    ----------
+    uri : str
+        Unique identifier for this kernel configuration
+    dtype_a : torch.dtype
+        Data type of matrix A
+    dtype_b : torch.dtype
+        Data type of matrix B
+    dtype_out : torch.dtype
+        Data type of output matrix
+    scale_granularity_m : int
+        Scaling granularity in M dimension
+    scale_granularity_n : int
+        Scaling granularity in N dimension
+    scale_granularity_k : int
+        Scaling granularity in K dimension
+    scale_major_mode : str
+        Scale storage mode ("K" or "MN")
+    mma_sm : int
+        MMA scheduling mode (1 or 2)
+
+    Returns
+    -------
+    Tuple[str, list]
+        URI and list of generated source file paths
+    """
+    gen_directory = jit_env.FLASHINFER_GEN_SRC_DIR / uri
+    os.makedirs(gen_directory, exist_ok=True)
+
+    source_paths = []
+
+    # Copy the base implementation file unchanged
+    src_path = jit_env.FLASHINFER_TVM_BINDING_DIR / "grouped_gemm_fp8.cu"
+    dest_path = gen_directory / "grouped_gemm_fp8.cu"
+    source_paths.append(dest_path)
+    with open(src_path, "r") as f:
+        source = f.read()
+    write_if_different(dest_path, source)
+
+    # Read the base TVM binding file and create specialized version
+    tvm_binding_src = (
+        jit_env.FLASHINFER_TVM_BINDING_DIR / "grouped_gemm_fp8_jit_tvm_binding.cu"
+    )
+    with open(tvm_binding_src, "r") as f:
+        base_content = f.read()
+
+    # Convert scale_major_mode to integer
+    scale_major_mode_val = 0 if scale_major_mode == "K" else 1
+
+    # Create specialized version by modifying the function export
+    # Replace the direct export with a specialized wrapper
+    specialized_content = base_content.replace(
+        "TVM_FFI_DLL_EXPORT_TYPED_FUNC(grouped_gemm_fp8_run, GroupedGemmFp8Run);",
+        f"""// Specialized wrapper for this configuration
+int GroupedGemmFp8RunSpecialized(
+    DLTensor* int_workspace_buffer,
+    DLTensor* float_workspace_buffer,
+    DLTensor* A,
+    DLTensor* B,
+    DLTensor* SFA,
+    DLTensor* SFB,
+    DLTensor* D,
+    DLTensor* m_indptr,
+    int64_t n, int64_t k,
+    TVMStreamHandle cuda_stream
+) {{
+    return GroupedGemmFp8Run(
+        int_workspace_buffer,
+        float_workspace_buffer,
+        A, B, SFA, SFB, D, m_indptr,
+        n, k,
+        {scale_granularity_m},  // scale_granularity_m
+        {scale_granularity_n},  // scale_granularity_n
+        {scale_granularity_k},  // scale_granularity_k
+        {scale_major_mode_val}, // scale_major_mode
+        {mma_sm},               // mma_sm
+        cuda_stream
+    );
+}}
+
+TVM_FFI_DLL_EXPORT_TYPED_FUNC(grouped_gemm_fp8_run, GroupedGemmFp8RunSpecialized);""",
+    )
+
+    binding_dest_path = gen_directory / "grouped_gemm_fp8_jit_tvm_binding.cu"
+    source_paths.append(binding_dest_path)
+    write_if_different(binding_dest_path, specialized_content)
+
+    return uri, source_paths
+
+
+def get_grouped_gemm_fp8_uri(
+    dtype_a: torch.dtype,
+    dtype_b: torch.dtype,
+    dtype_out: torch.dtype,
+    scale_granularity_m: int,
+    scale_granularity_n: int,
+    scale_granularity_k: int,
+    scale_major_mode: str,
+    mma_sm: int,
+) -> str:
+    """Generate URI for FP8 grouped GEMM configuration."""
+    dtype_a_str = str(dtype_a).split(".")[-1]
+    dtype_b_str = str(dtype_b).split(".")[-1]
+    dtype_out_str = str(dtype_out).split(".")[-1]
+
+    return (
+        f"group_gemm_fp8_{dtype_a_str}_{dtype_b_str}_{dtype_out_str}_"
+        f"sg_{scale_granularity_m}_{scale_granularity_n}_{scale_granularity_k}_"
+        f"sm_{scale_major_mode}_mma_{mma_sm}"
+    )
diff --git a/pyproject.toml b/pyproject.toml
@@ -53,6 +53,7 @@ packages = [
     "flashinfer.jit.attention",
     "flashinfer.jit.cutlass_gemm",
     "flashinfer.testing",
+    "flashinfer.jit.gemm",
     "flashinfer.triton",
     "flashinfer.tuning_configs",
     "flashinfer.profiler",
diff --git a/tvm_binding/grouped_gemm_fp8.cu b/tvm_binding/grouped_gemm_fp8.cu
@@ -0,0 +1,161 @@
+#include <dlpack/dlpack.h>
+
+#include <flashinfer/cutlass_utils.cuh>
+#include <flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh>
+
+#include "tvm_binding_utils.h"
+
+// Debug helper: thread 0 of block 0 prints the first element of `data`.
+//
+// `dtype_code` selects how the element is read: kDLBfloat reads 16 bits and
+// widens them to float; any other code reads the element as a float.
+__global__ void simple_print_kernel(void* data, int dtype_code) {
+  if (threadIdx.x != 0 || blockIdx.x != 0) {
+    return;
+  }
+  if (dtype_code == kDLBfloat) {
+    // bfloat16: the 16 stored bits are the upper half of a float's bit pattern.
+    const uint16_t* bf16_data = static_cast<const uint16_t*>(data);
+    uint32_t full = static_cast<uint32_t>(bf16_data[0]) << 16;
+    // Reinterpret the bits through the intrinsic instead of pointer punning.
+    float val = __uint_as_float(full);
+    printf("GPU: D[0] = %.6f\n", val);
+  } else {
+    // float32
+    const float* f32_data = static_cast<const float*>(data);
+    printf("GPU: D[0] = %.6f\n", f32_data[0]);
+  }
+}
+
+// The dispatch macros below are duplicated from
+// flashinfer/csrc/group_gemm_fp8_groupwise_sm100.cu.
+//
+// DISPATCH_TVM_DTYPE_TO_CTYPE maps a runtime (input, output) DLDataType pair to
+// compile-time type aliases named by `c_type_in` / `c_type_out` and invokes the
+// trailing callable with those aliases in scope. Supported: float8 e4m3fn input
+// with a 16-bit float or 16-bit bfloat output. Anything else trips CHECK.
+// The dtype fields are cast to int for the message because `code` and `bits`
+// are uint8_t and would otherwise be streamed as characters.
+#define DISPATCH_TVM_DTYPE_TO_CTYPE(tvm_dtype_in, tvm_dtype_out, c_type_in, c_type_out, ...) \
+  [&]() -> bool {                                                                            \
+    if (tvm_dtype_in.code == kDLFloat8_e4m3fn && tvm_dtype_in.bits == 8) {                   \
+      using c_type_in = cutlass::float_e4m3_t;                                               \
+      if (tvm_dtype_out.code == kDLFloat && tvm_dtype_out.bits == 16) {                      \
+        using c_type_out = cutlass::half_t;                                                  \
+        return __VA_ARGS__();                                                                \
+      }                                                                                      \
+      if (tvm_dtype_out.code == kDLBfloat && tvm_dtype_out.bits == 16) {                     \
+        using c_type_out = cutlass::bfloat16_t;                                              \
+        return __VA_ARGS__();                                                                \
+      }                                                                                      \
+    }                                                                                        \
+    CHECK(false) << "Unsupported TVM dtype combination: input("                              \
+                 << static_cast<int>(tvm_dtype_in.code) << ","                               \
+                 << static_cast<int>(tvm_dtype_in.bits) << ") output("                       \
+                 << static_cast<int>(tvm_dtype_out.code) << ","                              \
+                 << static_cast<int>(tvm_dtype_out.bits) << ")";                             \
+    return false;                                                                            \
+  }()
+
+// Turns the runtime value `mma_sm` (1 or 2) into a constexpr int named by
+// `MMA_SM` and invokes the trailing callable with it in scope. Any other value
+// trips CHECK.
+#define DISPATCH_MMA_SM(mma_sm, MMA_SM, ...)          \
+  [&]() -> bool {                                     \
+    if (mma_sm == 1) {                                \
+      constexpr int MMA_SM = 1;                       \
+      return __VA_ARGS__();                           \
+    } else if (mma_sm == 2) {                         \
+      constexpr int MMA_SM = 2;                       \
+      return __VA_ARGS__();                           \
+    }                                                 \
+    CHECK(false) << "Unsupported MMA SM: " << mma_sm; \
+    return false;                                     \
+  }()
+
+// Turns the runtime scale granularity triple into three constexpr ints named by
+// the SCALE_GRANULARITY_* arguments and invokes the trailing callable with them
+// in scope. Only (1, 128, 128) and (128, 128, 128) are accepted; any other
+// triple trips CHECK.
+#define DISPATCH_SCALE_GRANULARITY(scale_granularity_m, scale_granularity_n, scale_granularity_k, \
+                                   SCALE_GRANULARITY_M, SCALE_GRANULARITY_N, SCALE_GRANULARITY_K, \
+                                   ...)                                                           \
+  [&]() -> bool {                                                                                 \
+    if (scale_granularity_m == 1 && scale_granularity_n == 128 && scale_granularity_k == 128) {   \
+      constexpr int SCALE_GRANULARITY_M = 1;                                                      \
+      constexpr int SCALE_GRANULARITY_N = 128;                                                    \
+      constexpr int SCALE_GRANULARITY_K = 128;                                                    \
+      return __VA_ARGS__();                                                                       \
+    } else if (scale_granularity_m == 128 && scale_granularity_n == 128 &&                        \
+               scale_granularity_k == 128) {                                                      \
+      constexpr int SCALE_GRANULARITY_M = 128;                                                    \
+      constexpr int SCALE_GRANULARITY_N = 128;                                                    \
+      constexpr int SCALE_GRANULARITY_K = 128;                                                    \
+      return __VA_ARGS__();                                                                       \
+    }                                                                                             \
+    CHECK(false) << "Unsupported scale granularity: (" << scale_granularity_m << ","              \
+                 << scale_granularity_n << "," << scale_granularity_k << ")";                     \
+    return false;                                                                                 \
+  }()
+
+// Turns the runtime `scale_major_mode` into a constexpr bool named by
+// `SCALE_MAJOR_K` (0 -> true, 1 -> false) and invokes the trailing callable with
+// it in scope. Any other value trips CHECK.
+#define DISPATCH_SCALE_MAJOR_K(scale_major_mode, SCALE_MAJOR_K, ...)      \
+  [&]() -> bool {                                                         \
+    if (scale_major_mode == 0) {                                          \
+      constexpr bool SCALE_MAJOR_K = true;                                \
+      return __VA_ARGS__();                                               \
+    } else if (scale_major_mode == 1) {                                   \
+      constexpr bool SCALE_MAJOR_K = false;                               \
+      return __VA_ARGS__();                                               \
+    }                                                                     \
+    CHECK(false) << "Unsupported Scale Major Mode: " << scale_major_mode; \
+    return false;                                                         \
+  }()
+
+namespace flashinfer {
+namespace group_gemm {
+
+// Declaration only: no body is provided in this file.
+// NOTE(review): the definition presumably comes from
+// flashinfer/gemm/group_gemm_fp8_groupwise_sm100.cuh, included above — confirm.
+template <int ScaleGranularityM, int ScaleGranularityN, int ScaleGranularityK, bool ScaleMajorK,
+          int MmaSM, typename DTypeIn, typename DTypeOut>
+cudaError_t CutlassFP8GroupwiseScaledGroupGEMMSM100(
+    void* int_buffer, size_t int_buffer_size_in_bytes, void* float_buffer,
+    size_t float_buffer_size_in_bytes, DTypeIn* A, DTypeIn* B, float* SFA, float* SFB, DTypeOut* D,
+    int* m_indptr, int max_m, int n, int k, int num_groups, cudaStream_t stream);
+
+}  // namespace group_gemm
+}  // namespace flashinfer
+
+// FP8 groupwise-scaled grouped GEMM using CUTLASS for SM100A (Blackwell).
+//
+// Dispatches the runtime configuration (dtypes of A and D, scale major mode, MMA SM and scale
+// granularity) to the matching CutlassFP8GroupwiseScaledGroupGEMMSM100 instantiation and runs it
+// on `cuda_stream`.
+//
+//   int_workspace_buffer / float_workspace_buffer: scratch buffers; their sizes in bytes are
+//       shape[0] times the byte width of the buffer's own dtype.
+//   A, B:      inputs, reinterpreted as the dispatched input type.
+//   SFA, SFB:  scale factors, read as float.
+//   D:         output, reinterpreted as the dispatched output type.
+//   m_indptr:  read as int32; shape[0] - 1 is used as the number of groups.
+//   n, k:      forwarded to the kernel.
+//   scale_major_mode: 0 selects SCALE_MAJOR_K = true, 1 selects false.
+//
+// Failures (unsupported configuration, CUTLASS or CUDA error) are reported through CHECK instead
+// of being logged and dropped: the function returns void, so a caller has no other way to learn
+// that D was not written.
+void GroupedGemmFp8Run(DLTensor* int_workspace_buffer, DLTensor* float_workspace_buffer,
+                       DLTensor* A, DLTensor* B, DLTensor* SFA, DLTensor* SFB, DLTensor* D,
+                       DLTensor* m_indptr, int64_t n, int64_t k, int64_t scale_granularity_m,
+                       int64_t scale_granularity_n, int64_t scale_granularity_k,
+                       int64_t scale_major_mode, int64_t mma_sm, TVMStreamHandle cuda_stream) {
+  cudaStream_t stream = reinterpret_cast<cudaStream_t>(cuda_stream);
+
+  // Use the buffers' real element width; assuming 4-byte elements would overstate the size of a
+  // byte-typed workspace.
+  size_t float_workspace_size =
+      float_workspace_buffer->shape[0] * DataType(float_workspace_buffer->dtype).bytes();
+  size_t int_workspace_size =
+      int_workspace_buffer->shape[0] * DataType(int_workspace_buffer->dtype).bytes();
+
+  int64_t num_groups = m_indptr->shape[0] - 1;
+  // NOTE(review): max_m is read from SFA->shape[1]; confirm this holds for both scale major modes.
+  int64_t max_m = SFA->shape[1];
+
+  bool dispatched = DISPATCH_TVM_DTYPE_TO_CTYPE(A->dtype, D->dtype, c_type_in, c_type_out, [&] {
+    return DISPATCH_SCALE_MAJOR_K(scale_major_mode, SCALE_MAJOR_K, [&] {
+      return DISPATCH_MMA_SM(mma_sm, MMA_SM, [&] {
+        return DISPATCH_SCALE_GRANULARITY(
+            scale_granularity_m, scale_granularity_n, scale_granularity_k, SCALE_GRANULARITY_M,
+            SCALE_GRANULARITY_N, SCALE_GRANULARITY_K, [&] {
+              using cutlass_t_in = flashinfer::cutlass_dtype_t<c_type_in>;
+              using cutlass_t_out = flashinfer::cutlass_dtype_t<c_type_out>;
+
+              // Workspace pointers are advanced in bytes so byte_offset is honoured exactly.
+              cudaError_t status =
+                  flashinfer::group_gemm::CutlassFP8GroupwiseScaledGroupGEMMSM100<
+                      SCALE_GRANULARITY_M, SCALE_GRANULARITY_N, SCALE_GRANULARITY_K,
+                      SCALE_MAJOR_K, MMA_SM>(
+                      static_cast<char*>(int_workspace_buffer->data) +
+                          int_workspace_buffer->byte_offset,
+                      int_workspace_size,
+                      static_cast<char*>(float_workspace_buffer->data) +
+                          float_workspace_buffer->byte_offset,
+                      float_workspace_size,
+                      static_cast<cutlass_t_in*>(A->data) + A->byte_offset / sizeof(cutlass_t_in),
+                      static_cast<cutlass_t_in*>(B->data) + B->byte_offset / sizeof(cutlass_t_in),
+                      static_cast<float*>(SFA->data) + SFA->byte_offset / sizeof(float),
+                      static_cast<float*>(SFB->data) + SFB->byte_offset / sizeof(float),
+                      static_cast<cutlass_t_out*>(D->data) +
+                          D->byte_offset / sizeof(cutlass_t_out),
+                      static_cast<int32_t*>(m_indptr->data) +
+                          m_indptr->byte_offset / sizeof(int32_t),
+                      static_cast<int>(max_m), static_cast<int>(n), static_cast<int>(k),
+                      static_cast<int>(num_groups), stream);
+              CHECK(status == cudaSuccess)
+                  << "FP8 grouped GEMM failed: " << cudaGetErrorString(status);
+
+              // Also surface any error recorded by the CUDA runtime during the call.
+              cudaError_t launch_error = cudaGetLastError();
+              CHECK(launch_error == cudaSuccess)
+                  << "FP8 grouped GEMM launch failed: " << cudaGetErrorString(launch_error);
+              return true;
+            });
+      });
+    });
+  });
+  CHECK(dispatched) << "FP8 grouped GEMM was not dispatched";
+}
diff --git a/tvm_binding/grouped_gemm_fp8_jit_tvm_binding.cu b/tvm_binding/grouped_gemm_fp8_jit_tvm_binding.cu
@@ -0,0 +1,11 @@
+#include "tvm_binding_utils.h"
+
+// Function declarations.
+// NOTE(review): GroupedGemmGetWorkspaceSize has no definition in grouped_gemm_fp8.cu as
+// added by this change — confirm where it is implemented before it is exported or called.
+IntTuple GroupedGemmGetWorkspaceSize(int64_t batch_size, int64_t max_m, int64_t max_n,
+                                     int64_t max_k);
+
+// FP8 grouped GEMM entry point; implemented in grouped_gemm_fp8.cu. It returns void, so any
+// wrapper generated around it must not forward a return value.
+void GroupedGemmFp8Run(DLTensor* int_workspace_buffer, DLTensor* float_workspace_buffer,
+                       DLTensor* A, DLTensor* B, DLTensor* SFA, DLTensor* SFB, DLTensor* D,
+                       DLTensor* m_indptr, int64_t n, int64_t k, int64_t scale_granularity_m,
+                       int64_t scale_granularity_n, int64_t scale_granularity_k,
+                       int64_t scale_major_mode, int64_t mma_sm, TVMStreamHandle cuda_stream);