Skip to content

Commit 91f7b10

Browse files
authored
[HGEMM] fix cublas hgemm handle error (#138)
* Update hgemm_cublas.cu
* Update hgemm_mma_stage_tn_cute.cu
* Update hgemm_mma_stage_tn_cute.cu
* Update utils.h
* Update utils.h
* Update hgemm_mma_stage_tn_cute.cu
* Update hgemm_cublas.cu
* Update hgemm_mma_stage_tn_cute.cu
1 parent ed1d100 commit 91f7b10

File tree

3 files changed

+42
-103
lines changed

3 files changed

+42
-103
lines changed

hgemm/hgemm_cublas.cu

Lines changed: 31 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,27 @@ void cublas_tensor_op_tn(half *A, half *B, half *C, size_t M, size_t N, size_t
6363
// build cpp binary
6464
#ifndef NO_CUBLAS_HGEMM_BIN
6565

66-
float perf_cublas(int M, int N, int K, int repeat) {
66+
// pass the cuBLAS handle from outside to avoid error.
67+
void cublas_tensor_op_tn_v2(cublasHandle_t handle,
68+
half *A, half *B, half *C,
69+
size_t M, size_t N, size_t K) {
70+
half alpha = 1.0;
71+
half beta = 0.0;
72+
73+
cublasGemmEx(handle,
74+
CUBLAS_OP_T,
75+
CUBLAS_OP_N,
76+
N, M, K,
77+
&alpha,
78+
B, CUDA_R_16F, K,
79+
A, CUDA_R_16F, K,
80+
&beta,
81+
C, CUDA_R_16F, N,
82+
CUBLAS_COMPUTE_16F,
83+
CUBLAS_GEMM_DEFAULT_TENSOR_OP);
84+
}
85+
86+
float perf_cublas_tn(int M, int N, int K, int repeat) {
6787
size_t size_a = M * K * sizeof(half);
6888
size_t size_b = K * N * sizeof(half);
6989
size_t size_c = M * N * sizeof(half);
@@ -74,9 +94,13 @@ float perf_cublas(int M, int N, int K, int repeat) {
7494
cudaMalloc(&d_b, size_b);
7595
cudaMalloc(&d_c, size_c);
7696

97+
cublasHandle_t handle = nullptr;
98+
cublasCreate(&handle);
99+
cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
100+
77101
// warmup
78102
for (int i = 0; i < 10; ++i) {
79-
cublas_tensor_op_tn(d_a, d_b, d_c, M, N, K);
103+
cublas_tensor_op_tn_v2(handle, d_a, d_b, d_c, M, N, K);
80104
}
81105
cudaDeviceSynchronize();
82106

@@ -86,7 +110,7 @@ float perf_cublas(int M, int N, int K, int repeat) {
86110
cudaEventRecord(start);
87111

88112
for (int i = 0; i < repeat; i++) {
89-
cublas_tensor_op_tn(d_a, d_b, d_c, M, N, K);
113+
cublas_tensor_op_tn_v2(handle, d_a, d_b, d_c, M, N, K);
90114
}
91115

92116
cudaEventRecord(end);
@@ -102,12 +126,13 @@ float perf_cublas(int M, int N, int K, int repeat) {
102126
cudaFree(d_c);
103127
cudaEventDestroy(start);
104128
cudaEventDestroy(end);
129+
cublasDestroy(handle);
105130

106131
return sec;
107132
}
108133

109134
int main(int argc, char *argv[]) {
110-
const int test_num = 50;
135+
const int test_num = 64;
111136
int M_list[test_num];
112137
int N_list[test_num];
113138
int K_list[test_num];
@@ -120,7 +145,7 @@ int main(int argc, char *argv[]) {
120145

121146
const int outer_repeat = 10, inner_repeat = 1;
122147

123-
printf("\nalgo = Cublas TN\n");
148+
printf("ALGO = cuBLAS CUBLAS_GEMM_DEFAULT_TENSOR_OP TN\n");
124149

125150
for (int j = 0; j < test_num; j++) {
126151
int M = M_list[j], N = N_list[j], K = K_list[j];
@@ -130,7 +155,7 @@ int main(int argc, char *argv[]) {
130155
double total_sec = 0.0;
131156

132157
for (int k = 0; k < outer_repeat; k++) {
133-
double this_sec = perf_cublas(M, N, K, inner_repeat);
158+
double this_sec = perf_cublas_tn(M, N, K, inner_repeat);
134159
max_sec = max(max_sec, this_sec);
135160
min_sec = min(min_sec, this_sec);
136161
total_sec += this_sec;

hgemm/hgemm_mma_stage_tn_cute.cu

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include <stdlib.h>
44
#include <cute/tensor.hpp>
55
#include <float.h>
6-
// modifide from: https://github.com/weishengying/cute_gemm/blob/main/gemm_4/gemm.cu
76

87
// TODO: thread block swizzle, cute hgemm nn
98
template <
@@ -349,7 +348,7 @@ int main() {
349348
using T = cute::half_t;
350349
using namespace cute;
351350

352-
const int test_num = 50;
351+
const int test_num = 64;
353352
int M_list[test_num];
354353
int N_list[test_num];
355354
int K_list[test_num];
@@ -362,10 +361,10 @@ int main() {
362361

363362
const int outer_repeat = 10, inner_repeat = 1;
364363

365-
printf("\nalgo = CuTe HGEMM Stages 2\n");
364+
printf("ALGO = CuTe HGEMM Stages 2\n");
366365
for (int j = 0; j < 5; j++) {
367366
int M = M_list[j], N = N_list[j], K = K_list[j];
368-
float max_error = gemm_error_check_v2<T>(
367+
float max_error = gemm_error_check<T>(
369368
launch_hgemm_mma_stages_tn_cute, M, N, K);
370369
printf("M N K = %6d %6d %6d, ", M, N, K);
371370
printf("Max Error = %f\n", max_error);

hgemm/utils.h

Lines changed: 8 additions & 93 deletions
Original file line numberDiff line numberDiff line change
@@ -2,38 +2,6 @@
22
#include <cstdlib>
33
#include <cuda.h>
44
#include <cublas_v2.h>
5-
// modified from: https://github.com/weishengying/cute_gemm/blob/main/utils.h
6-
7-
#define OFFSET(row_idx, col_idx, stride_0, stride_1) \
8-
row_idx*stride_0 + col_idx*stride_1
9-
10-
#define PRINT(name, content) \
11-
print(name); \
12-
print(" : "); \
13-
print(content); \
14-
print("\n");
15-
16-
#define PRINTTENSOR(name, content) \
17-
print(name); \
18-
print(" : "); \
19-
print_tensor(content); \
20-
print("\n");
21-
22-
template<class T>
23-
void cpu_hgemm(const T* A, const T* B, T* C,
24-
const int M, const int N, const int K) {
25-
// A(M,K):(K,1) B(K,N):(1,K)
26-
for(int m = 0; m < M; m++) {
27-
for(int n = 0; n < N; n++) {
28-
float tmp = 0.0;
29-
for(int k = 0; k < K; k++) {
30-
tmp += float(A[OFFSET(m, k, K, 1)]) * float(B[OFFSET(k, n, 1, K)]);
31-
}
32-
C[OFFSET(m, n, N, 1)] = T(tmp);
33-
}
34-
}
35-
return;
36-
}
375

386
template <typename T>
397
float perf_gemm(
@@ -89,59 +57,6 @@ float gemm_error_check(
8957
size_t size_b = K * N * sizeof(T);
9058
size_t size_c = M * N * sizeof(T);
9159

92-
T *h_a, *h_b, *d_a, *d_b;
93-
T *h_c, *d_c, *h_d_c;
94-
95-
h_a = (T *)malloc(size_a);
96-
h_b = (T *)malloc(size_b);
97-
h_c = (T *)malloc(size_c);
98-
cudaMalloc(&d_a, size_a);
99-
cudaMalloc(&d_b, size_b);
100-
cudaMalloc(&d_c, size_c);
101-
102-
h_d_c = (T *)malloc(size_c);
103-
104-
srand(time(0));
105-
for (int i = 0; i < M * K; i++)
106-
h_a[i] = (T)((rand() % 200 - 100) * 0.01); // -1 ~ 1
107-
for (int i = 0; i < K * N; i++)
108-
h_b[i] = (T)((rand() % 200 - 100) * 0.01);
109-
110-
cpu_hgemm(h_a, h_b, h_c, M, N, K);
111-
112-
cudaMemcpy(d_a, h_a, size_a, cudaMemcpyHostToDevice);
113-
cudaMemcpy(d_b, h_b, size_b, cudaMemcpyHostToDevice);
114-
115-
gpu_hgemm(d_a, d_b, d_c, M, N, K);
116-
117-
cudaMemcpy(h_d_c, d_c, size_c, cudaMemcpyDeviceToHost);
118-
119-
float max_error = 0.0;
120-
for (int i = 0; i < M * N; i++) {
121-
float this_error = abs((float)h_d_c[i] - (float)h_c[i]);
122-
max_error = max(max_error, this_error);
123-
}
124-
125-
free(h_a);
126-
free(h_b);
127-
free(h_c);
128-
cudaFree(d_a);
129-
cudaFree(d_b);
130-
cudaFree(d_c);
131-
free(h_d_c);
132-
133-
return max_error;
134-
}
135-
136-
template <typename T>
137-
float gemm_error_check_v2(
138-
void (*gpu_hgemm) (const T *, const T *, T *, int, int, int),
139-
int M, int N, int K) {
140-
141-
size_t size_a = M * K * sizeof(T);
142-
size_t size_b = K * N * sizeof(T);
143-
size_t size_c = M * N * sizeof(T);
144-
14560
T *h_a, *h_b, *h_c, *h_c_ref;
14661
T *d_a, *d_b, *d_c, *d_c_ref;
14762

@@ -170,14 +85,14 @@ float gemm_error_check_v2(
17085
cudaMemcpy(d_b, h_b, size_b, cudaMemcpyHostToDevice);
17186

17287
cublasHgemm(handle,
173-
CUBLAS_OP_T,
174-
CUBLAS_OP_N,
175-
N, M, K,
176-
&alpha,
177-
(half *)d_b, K,
178-
(half *)d_a, K,
179-
&beta,
180-
(half *)d_c_ref, N);
88+
CUBLAS_OP_T,
89+
CUBLAS_OP_N,
90+
N, M, K,
91+
&alpha,
92+
(half *)d_b, K,
93+
(half *)d_a, K,
94+
&beta,
95+
(half *)d_c_ref, N);
18196

18297
gpu_hgemm(d_a, d_b, d_c, M, N, K);
18398

0 commit comments

Comments (0)