
Commit 1d99b61

[NVIDIA] Expose cublas.gemm (#7656)
Useful for testing the performance of a GEMM implementation.
1 parent 5a87bde commit 1d99b61

3 files changed (+111 / -54 lines)


python/tutorials/09-persistent-matmul.py

Lines changed: 7 additions & 6 deletions
@@ -76,12 +76,13 @@ def _matmul_launch_metadata(grid, kernel, args):
 
 def matmul_get_configs(pre_hook=None):
     return [
-        triton.Config({'BLOCK_SIZE_M': BM, 'BLOCK_SIZE_N': BN, "BLOCK_SIZE_K" : BK, "GROUP_SIZE_M" : 8}, num_stages=s, num_warps=w, pre_hook=pre_hook) \
-        for BM in [128] \
-        for BN in [128, 256] \
-        for BK in [64,128] \
-        for s in ([3,4]) \
-        for w in [4,8] \
+        triton.Config({'BLOCK_SIZE_M': BM, 'BLOCK_SIZE_N': BN, "BLOCK_SIZE_K": BK, "GROUP_SIZE_M": 8}, num_stages=s,
+                      num_warps=w, pre_hook=pre_hook)
+        for BM in [128]
+        for BN in [128, 256]
+        for BK in [64, 128]
+        for s in ([2, 3, 4])
+        for w in [4, 8]
     ]
 
 
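The wider num_stages sweep slightly enlarges the autotuning space. A quick count of the configs generated by the comprehension above (a sketch; the lists simply mirror the new code):

# Count the triton.Config objects produced by matmul_get_configs() after this change.
block_m = [128]
block_n = [128, 256]
block_k = [64, 128]
num_stages = [2, 3, 4]   # previously [3, 4]
num_warps = [4, 8]

n_configs = len(block_m) * len(block_n) * len(block_k) * len(num_stages) * len(num_warps)
print(n_configs)  # 24 configs, up from 16 before this change
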
third_party/nvidia/include/cublas_instance.h

Lines changed: 9 additions & 6 deletions
@@ -120,8 +120,8 @@ class CublasLtInstance {
   }
 
   // Simple wrapper around the cublasLtMatmul function
-  void matmul_impl(int m, int n, int k, uint64_t A, uint64_t B, uint64_t D,
-                   cudaDataType_t dtype) {
+  void gemm_impl(int m, int n, int k, uint64_t A, uint64_t B, uint64_t C,
+                 uint64_t D, cudaDataType_t dtype, float alpha, float beta) {
     cublasLtMatmulDesc_t matmulDesc = NULL;
 
     cublasOperation_t transa = CUBLAS_OP_T;
@@ -160,10 +160,8 @@ class CublasLtInstance {
           "No valid algorithm found by cublasLtMatmulAlgoGetHeuristic");
     }
 
-    float alpha = 1.0f;
-    float beta = 0.0f;
     successOrExit(cublasLtMatmul(ltHandle, matmulDesc, &alpha, (void *)A, Adesc,
-                                 (void *)B, Bdesc, &beta, nullptr, Cdesc,
+                                 (void *)B, Bdesc, &beta, (void *)C, Cdesc,
                                  (void *)D, Ddesc, &heuristicResult.algo,
                                  (void *)workspace, workspaceSize, 0));
     if (Ddesc)
@@ -206,7 +204,12 @@ class CublasLtInstance {
              cudaDataType_t dtype) {
     // CUDA is column-major, while triton is row-major, therefore we need to
     // reverse the order of the matrices ( A * B = (B^T * A^T)^T ).
-    matmul_impl(n, m, k, B, A, C, dtype);
+    gemm_impl(n, m, k, B, A, 0, C, dtype, 1.0f, 0.0f);
+  }
+
+  void gemm(int m, int n, int k, uint64_t A, uint64_t B, uint64_t C, uint64_t D,
+            cudaDataType_t dtype, float alpha, float beta) {
+    gemm_impl(n, m, k, B, A, C, D, dtype, alpha, beta);
   }
 };
 
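The argument swap in matmul and gemm follows from the comment above: cuBLAS is column-major while Triton tensors are row-major, so cuBLAS reads a row-major buffer as its transpose, and the product can be formed as A * B = (B^T * A^T)^T with m and n exchanged. A minimal PyTorch sketch of that identity, extended with the new alpha/beta epilogue (the shapes below are arbitrary examples):

# Verify the identity behind the m/n and A/B swap: computing the product with
# swapped, transposed operands and transposing the result matches the direct form.
import torch

M, N, K = 64, 32, 16
A = torch.randn(M, K)
B = torch.randn(K, N)
C = torch.randn(M, N)
alpha, beta = 2.0, 0.5

lhs = alpha * (A @ B) + beta * C            # row-major view of the problem
rhs = (alpha * (B.T @ A.T) + beta * C.T).T  # swapped-operand, transposed view
assert torch.allclose(lhs, rhs, atol=1e-4)
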
third_party/nvidia/triton_nvidia.cc

Lines changed: 95 additions & 42 deletions
@@ -84,6 +84,53 @@ void init_triton_hopper_passes(py::module &&m) {
       mlir::createNVGPUWarpSpecialization, int, bool);
 }
 
+static void checkMatmulConstraints(const std::string &A_dtype,
+                                   const std::string &B_dtype,
+                                   const std::string &C_dtype,
+                                   const std::vector<int> &A_shape,
+                                   const std::vector<int> &B_shape,
+                                   const std::vector<int> &C_shape) {
+  if (A_dtype != B_dtype || A_dtype != C_dtype) {
+    throw std::runtime_error("Data types do not match.");
+  }
+  if (A_dtype != "torch.float8_e4m3fn" && A_dtype != "torch.float16") {
+    throw std::runtime_error("Unsupported data type.");
+  }
+
+  if (A_shape.size() != 2 || B_shape.size() != 2 || C_shape.size() != 2) {
+    throw std::runtime_error("Only 2D matrices are supported.");
+  }
+
+  int k = A_shape[1];
+  if (k != B_shape[1]) {
+    throw std::runtime_error(
+        "Matrix dimensions do not match. A is [" + std::to_string(A_shape[0]) +
+        ", " + std::to_string(A_shape[1]) + "], B is [" +
+        std::to_string(B_shape[0]) + ", " + std::to_string(B_shape[1]) +
+        "]. Expected A.shape[1] == B.shape[1]. Note "
+        "that B needs to be transposed.");
+  }
+
+  int m = A_shape[0];
+  if (m != C_shape[0]) {
+    throw std::runtime_error(
+        "Matrix dimensions do not match. A is [" + std::to_string(A_shape[0]) +
+        ", " + std::to_string(A_shape[1]) + "], C is [" +
+        std::to_string(C_shape[0]) + ", " + std::to_string(C_shape[1]) +
+        "]. Expected A.shape[0] == C.shape[0].");
+  }
+
+  int n = B_shape[0];
+  if (n != C_shape[1]) {
+    throw std::runtime_error(
+        "Matrix dimensions do not match. B is [" + std::to_string(B_shape[0]) +
+        ", " + std::to_string(B_shape[1]) + "], C is [" +
+        std::to_string(C_shape[0]) + ", " + std::to_string(C_shape[1]) +
+        "]. Expected B.shape[0] == C.shape[1]. Note "
+        "that B needs to be transposed.");
+  }
+}
+
 void init_triton_nvidia(py::module &&m) {
   auto passes = m.def_submodule("passes");
   init_triton_nvidia_passes_nvws(passes.def_submodule("nvws"));
@@ -155,22 +202,64 @@ void init_triton_nvidia(py::module &&m) {
                 workspace.attr("element_size")().cast<size_t>();
             return new CublasLtInstance(wrk_ptr, wrk_size);
           }))
-      .def("matmul", [](CublasLtInstance &self, py::object &A, py::object &B,
-                        py::object &C) {
+      .def("matmul",
+           [](CublasLtInstance &self, py::object &A, py::object &B,
+              py::object &C) {
+             auto A_ptr = A.attr("data_ptr")().cast<uint64_t>();
+             auto B_ptr = B.attr("data_ptr")().cast<uint64_t>();
+             auto C_ptr = C.attr("data_ptr")().cast<uint64_t>();
+
+             auto A_shape = A.attr("shape").cast<std::vector<int>>();
+             auto B_shape = B.attr("shape").cast<std::vector<int>>();
+             auto C_shape = C.attr("shape").cast<std::vector<int>>();
+
+             auto A_dtype =
+                 A.attr("dtype").attr("__str__")().cast<std::string>();
+             auto B_dtype =
+                 B.attr("dtype").attr("__str__")().cast<std::string>();
+             auto C_dtype =
+                 C.attr("dtype").attr("__str__")().cast<std::string>();
+
+             checkMatmulConstraints(A_dtype, B_dtype, C_dtype, A_shape, B_shape,
+                                    C_shape);
+
+             std::string dtype_str =
+                 A_dtype.substr(A_dtype.find_last_of('.') + 1);
+             cudaDataType_t dtype;
+             if (dtype_str == "float8_e4m3fn") {
+               dtype = CUDA_R_8F_E4M3;
+             } else if (dtype_str == "float16") {
+               dtype = CUDA_R_16F;
+             }
+
+             self.matmul(A_shape[0], B_shape[0], A_shape[1], A_ptr, B_ptr,
+                         C_ptr, dtype);
+           })
+      .def("gemm", [](CublasLtInstance &self, py::object &A, py::object &B,
+                      py::object &C, py::object &D, float alpha, float beta) {
         auto A_ptr = A.attr("data_ptr")().cast<uint64_t>();
         auto B_ptr = B.attr("data_ptr")().cast<uint64_t>();
         auto C_ptr = C.attr("data_ptr")().cast<uint64_t>();
+        auto D_ptr = D.attr("data_ptr")().cast<uint64_t>();
 
         auto A_shape = A.attr("shape").cast<std::vector<int>>();
         auto B_shape = B.attr("shape").cast<std::vector<int>>();
         auto C_shape = C.attr("shape").cast<std::vector<int>>();
+        auto D_shape = D.attr("shape").cast<std::vector<int>>();
 
         auto A_dtype = A.attr("dtype").attr("__str__")().cast<std::string>();
         auto B_dtype = B.attr("dtype").attr("__str__")().cast<std::string>();
         auto C_dtype = C.attr("dtype").attr("__str__")().cast<std::string>();
+        auto D_dtype = D.attr("dtype").attr("__str__")().cast<std::string>();
 
-        assert(A_dtype == B_dtype && A_dtype == C_dtype);
-        assert(A_dtype == "torch.float8_e4m3fn" || A_dtype == "torch.float16");
+        checkMatmulConstraints(A_dtype, B_dtype, D_dtype, A_shape, B_shape,
+                               D_shape);
+        if (C_dtype != "torch.float16") {
+          throw std::runtime_error("C dtype must be float16, got " + C_dtype);
+        }
+        if (C_shape != D_shape) {
+          throw std::runtime_error("C and D shapes must match");
+        }
 
         std::string dtype_str = A_dtype.substr(A_dtype.find_last_of('.') + 1);
         cudaDataType_t dtype;
@@ -180,43 +269,7 @@ void init_triton_nvidia(py::module &&m) {
          dtype = CUDA_R_16F;
        }
 
-        if (A_shape.size() != 2 || B_shape.size() != 2 || C_shape.size() != 2) {
-          throw std::runtime_error("Only 2D matrices are supported.");
-        }
-
-        int k = A_shape[1];
-        if (k != B_shape[1]) {
-          throw std::runtime_error("Matrix dimensions do not match. A is [" +
-                                   std::to_string(A_shape[0]) + ", " +
-                                   std::to_string(A_shape[1]) + "], B is [" +
-                                   std::to_string(B_shape[0]) + ", " +
-                                   std::to_string(B_shape[1]) +
-                                   "]. Expected A.shape[1] == B.shape[1]. Note "
-                                   "that B needs to be transposed.");
-        }
-
-        int m = A_shape[0];
-        if (m != C_shape[0]) {
-          throw std::runtime_error("Matrix dimensions do not match. A is [" +
-                                   std::to_string(A_shape[0]) + ", " +
-                                   std::to_string(A_shape[1]) + "], C is [" +
-                                   std::to_string(C_shape[0]) + ", " +
-                                   std::to_string(C_shape[1]) +
-                                   "]. Expected A.shape[0] == C.shape[0].");
-        }
-
-        int n = B_shape[0];
-        if (n != C_shape[1]) {
-          throw std::runtime_error("Matrix dimensions do not match. B is [" +
-                                   std::to_string(B_shape[0]) + ", " +
-                                   std::to_string(B_shape[1]) + "], C is [" +
-                                   std::to_string(C_shape[0]) + ", " +
-                                   std::to_string(C_shape[1]) +
-                                   "]. Expected B.shape[0] == C.shape[1]. Note "
-                                   "that B needs to be transposed.");
-        }
-
-        self.matmul(A_shape[0], B_shape[0], A_shape[1], A_ptr, B_ptr, C_ptr,
-                    dtype);
+        self.gemm(A_shape[0], B_shape[0], A_shape[1], A_ptr, B_ptr, C_ptr,
+                  D_ptr, dtype, alpha, beta);
       });
 }
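For reference, a usage sketch of the newly exposed gemm binding. Assumptions not shown in this diff: the class is reachable as nvidia.cublas.CublasLt (as in python/tutorials/09-persistent-matmul.py), a 32 MiB workspace is sufficient, and a CUDA device with cuBLASLt support is available. Per the checks above, A is [M, K], B is [N, K] (already transposed), C and D are [M, N], A/B/D share one dtype (float16 or float8_e4m3fn), C must be float16, and the call is intended to compute D = alpha * A @ B^T + beta * C.

# Sketch only: exercises cublas.gemm with float16 inputs and compares against
# a torch reference. Import path and workspace size follow the existing tutorial.
import torch
from triton._C.libtriton import nvidia

M, N, K = 512, 256, 128
a = torch.randn(M, K, device="cuda", dtype=torch.float16)
b = torch.randn(N, K, device="cuda", dtype=torch.float16)  # B is [N, K]: pre-transposed
c = torch.randn(M, N, device="cuda", dtype=torch.float16)
d = torch.empty(M, N, device="cuda", dtype=torch.float16)

workspace = torch.empty(32 * 1024 * 1024, device="cuda", dtype=torch.uint8)
cublas = nvidia.cublas.CublasLt(workspace)

alpha, beta = 1.0, 1.0
cublas.gemm(a, b, c, d, alpha, beta)  # expected: d = alpha * a @ b.T + beta * c

ref = alpha * (a @ b.T) + beta * c
print(torch.allclose(d, ref, atol=1e-2, rtol=1e-2))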
