
Commit 0c29631

[HGEMM] update HGEMM benchmark option (#95)
* update hgemm benchmark option
1 parent ce095b5 commit 0c29631

File tree: 4 files changed (+18, -13 lines)


hgemm/README.md

Lines changed: 2 additions & 0 deletions
````diff
@@ -238,12 +238,14 @@ export TORCH_CUDA_ARCH_LIST=Ada
 python3 hgemm.py # default, test some wmma kernels for all MNK
 python3 hgemm.py --wmma # test all wmma kernels for all MNK
 python3 hgemm.py --M 16384 --N 16384 --K 8192 --i 10 --wmma # test all wmma kernels for specific MNK
+python3 hgemm.py --wmma --no-default # test all wmma kernels, but exclude the default part.
 ```
 
 Output:
 
 - NVIDIA L20
 ```bash
+python3 hgemm.py
 ----------------------------------------------------------------------------------------------------------------------------------
 M=4096, N=4096, K=2048
   f16x8pack(t8x8+dbuf): ['1.59863281', '-1.5263671'], time:1.404404ms, swizzle: NOOP, TFLOPS: 48.93 (+0.00%)
````

hgemm/hgemm.cu

Lines changed: 2 additions & 2 deletions
```diff
@@ -1211,7 +1211,7 @@ void hgemm_t_8x8_sliced_k32_f16x8_pack_dbuf_async(torch::Tensor a, torch::Tensor
 void hgemm_t_16x8_sliced_k32_f16x8_pack_dbuf(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_t_16x8_sliced_k32_f16x8_pack_dbuf_async(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 // from hgemm_cublas.cu
-void hgemm_cublas_tensor_op(torch::Tensor a, torch::Tensor b, torch::Tensor c);
+void hgemm_cublas_tensor_op_row_major(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 // from hgemm_wmma.cu
 void hgemm_wmma_m16n16k16_naive(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_wmma_m16n16k16_mma4x2(torch::Tensor a, torch::Tensor b, torch::Tensor c);
@@ -1266,7 +1266,7 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   TORCH_BINDING_COMMON_EXTENSION(hgemm_t_16x8_sliced_k32_f16x8_pack_dbuf)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_t_16x8_sliced_k32_f16x8_pack_dbuf_async)
   // cuBLAS Tensor Cores
-  TORCH_BINDING_COMMON_EXTENSION(hgemm_cublas_tensor_op)
+  TORCH_BINDING_COMMON_EXTENSION(hgemm_cublas_tensor_op_row_major)
   // WMMA API Tensor Cores
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_naive)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2)
```
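The forward declaration and the pybind11 binding are renamed together, so the compiled extension exposes the kernel to `hgemm.py` as `lib.hgemm_cublas_tensor_op_row_major`.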

hgemm/hgemm.py

Lines changed: 4 additions & 4 deletions
```diff
@@ -18,8 +18,8 @@ def get_args():
     parser.add_argument("--enable-wmma-all", "--wmma", action="store_true", help="Enable all WMMA kernel tests")
     parser.add_argument("--enable-cuda-all", "--cuda", action="store_true", help="Enable all CUDA kernel tests")
     parser.add_argument("--enable-torch", "--torch", action="store_true", help="Enable torch matmul")
-    parser.add_argument("--enable-cublas", "--cublas", action="store_true", default=True, help="Enable cublas hgemm")
-    parser.add_argument("--disable-default", "--no-default", action="store_true", default=False, help="Disable default tests")
+    parser.add_argument("--disable-cublas", "--no-cublas", action="store_true", help="Disable cublas hgemm")
+    parser.add_argument("--disable-default", "--no-default", action="store_true", help="Disable default tests")
     return parser.parse_args()
 
 args = get_args()
@@ -205,8 +205,8 @@ def run_benchmark(perf_func: callable,
     if args.enable_mma_all: # more mma kernel tests.
         print("-" * 68 + "MMA" + "-" * 59)
         pass
-    if args.enable_cublas:
-        run_benchmark(lib.hgemm_cublas_tensor_op, a, b, "(cublas)", c)
+    if not args.disable_cublas:
+        run_benchmark(lib.hgemm_cublas_tensor_op_row_major, a, b, "(cublas)", c)
     if args.enable_torch:
         run_benchmark(partial(torch.matmul, out=c), a, b, "(torch)")
     torch.cuda.synchronize()
```
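Note the polarity flip: the old `--enable-cublas` flag combined `action="store_true"` with `default=True`, so passing it changed nothing and there was no way to turn cuBLAS off. The new `--no-cublas` flag keeps cuBLAS enabled by default but makes it genuinely optional, mirroring the new `--no-default` switch.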

hgemm/hgemm_cublas.cu

Lines changed: 10 additions & 7 deletions
```diff
@@ -14,11 +14,10 @@
 
 #include "cublas_v2.h"
 
+void cublas_tensor_op_row_major(half *A, half *B, half *C, size_t M,
+                                size_t N, size_t K) {
 
-void cublas_tensor_op(half *A, half *B, half *C, size_t M,
-                      size_t N, size_t K) {
-
-  cublasHandle_t handle = nullptr;
+  static cublasHandle_t handle = nullptr;
   cublasCreate(&handle);
   cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
 
```
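Making the handle `static` lets it persist across calls instead of being a fresh local each time, although as written `cublasCreate` still runs on every invocation. A minimal sketch (not this file's actual code) of the create-once pattern that a `static` handle enables, with `get_cublas_handle` as a hypothetical helper name:

```cpp
#include <cublas_v2.h>

// Sketch: lazily create one long-lived cuBLAS handle and reuse it,
// so repeated benchmark calls don't pay handle-creation cost each time.
static cublasHandle_t get_cublas_handle() {
  static cublasHandle_t handle = nullptr;
  if (handle == nullptr) {  // create only on first use
    cublasCreate(&handle);
    cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
  }
  return handle;
}
```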

```diff
@@ -41,6 +40,8 @@ void cublas_tensor_op(half *A, half *B, half *C, size_t M,
   // cublasDestroy(handle);
 }
 
+// TODO: add cublas_tensor_op_col_major
+
 // --------------------- PyTorch bindings for custom kernel -----------------------
 #define STRINGFY(str) #str
 #define TORCH_BINDING_COMMON_EXTENSION(func) \
@@ -57,8 +58,8 @@ if (((T).size(0) != (S0)) || ((T).size(1) != (S1))) { \
   throw std::runtime_error("Tensor size mismatch!"); \
 }
 
-// cublas tensor op
-void hgemm_cublas_tensor_op(
+// cublas tensor op with row major B matrix
+void hgemm_cublas_tensor_op_row_major(
   torch::Tensor a, torch::Tensor b, torch::Tensor c) {
   CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf)
   CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf)
@@ -70,10 +71,12 @@ void hgemm_cublas_tensor_op(
   CHECK_TORCH_TENSOR_SHAPE(b, K, N)
   CHECK_TORCH_TENSOR_SHAPE(c, M, N)
 
-  cublas_tensor_op(
+  cublas_tensor_op_row_major(
     reinterpret_cast<half*>(a.data_ptr()),
     reinterpret_cast<half*>(b.data_ptr()),
     reinterpret_cast<half*>(c.data_ptr()),
     M, N, K
   );
 }
+
+// TODO: add cublas_tensor_op_col_major
```
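The `_row_major` suffix refers to the usual trick for running a row-major GEMM on cuBLAS, whose API is column-major: a row-major matrix reinterpreted as column-major is its transpose, so C = A*B can be obtained by asking cuBLAS for C^T = B^T * A^T with no explicit transposes. The `cublasHgemm` call itself is outside these hunks; a minimal sketch of that mapping (`hgemm_row_major_sketch` is a hypothetical name, not this file's code):

```cpp
#include <cublas_v2.h>
#include <cuda_fp16.h>

// Sketch: row-major C[M,N] = A[M,K] * B[K,N] on column-major cuBLAS.
// Swapping the operand order yields C^T = B^T * A^T, which is exactly
// the row-major C when read back with leading dimension N.
void hgemm_row_major_sketch(cublasHandle_t handle, const half *A,
                            const half *B, half *C, int M, int N, int K) {
  const half alpha = __float2half(1.0f);
  const half beta  = __float2half(0.0f);
  cublasHgemm(handle, CUBLAS_OP_N, CUBLAS_OP_N,
              N, M, K,   // dimensions of the transposed product
              &alpha,
              B, N,      // B viewed column-major: N x K, ld = N
              A, K,      // A viewed column-major: K x M, ld = K
              &beta,
              C, N);     // C viewed column-major: N x M, ld = N
}
```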
