
Commit 3acd5e2

[HGEMM][Bugfix] fix HGEMM Stage cp.async error (#75)
* [HGEMM] update HGEMM Stage kernels
* [HGEMM] update HGEMM Stage kernels
* [HGEMM] update HGEMM Stage kernels
1 parent 11d7072 commit 3acd5e2

7 files changed: +405 −339 lines


README.md

Lines changed: 3 additions & 2 deletions
@@ -91,6 +91,7 @@
 | ✔️ [safe_softmax_f16x2_f32](./softmax/softmax.cu)|f16|f32|[link](./softmax/)|⭐️⭐️|
 | ✔️ [safe_softmax_f16x8_pack_f32](./softmax/softmax.cu)|f16|f32|[link](./softmax/)|⭐️⭐️|
 | ✔️ [online_safe_softmax_f32](./softmax/softmax.cu)|f32|f32|[link](./softmax/)|⭐️⭐️|
+| ✔️ [online_safe_softmax_f32x4_pack](./softmax/softmax.cu)|f32|f32|[link](./softmax/)|⭐️⭐️|
 | ✔️ [layer_norm_f32](./layer-norm/layer_norm.cu)|f32|f32|[link](./layer-norm/)|⭐️⭐️|
 | ✔️ [layer_norm_f32x4](./layer-norm/layer_norm.cu)|f32|f32|[link](./layer-norm/)|⭐️⭐️|
 | ✔️ [layer_norm_f16_f16](./layer-norm/layer_norm.cu)|f16|f16|[link](./layer-norm/)|⭐️⭐️|
@@ -131,9 +132,9 @@
 | ✔️ [hgemm_wmma_m16n16k16...async*](./hgemm/hgemm_wmma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [hgemm_wmma_m16n16k16...offset*](./hgemm/hgemm_wmma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [hgemm_wmma_m16n16k16...dbuf*](./hgemm/hgemm_wmma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
-| ✔️ [hgemm_wmma_m32n8k16...dbuf*](./hgemm/hgemm_wmma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [hgemm_wmma_m16n16k16...rbuf*](./hgemm/hgemm_wmma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
-| ✔️ [hgemm_wmma_m16n16k16...stage3/4*](./hgemm/hgemm_wmma_stage.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
+| ✔️ [hgemm_wmma_m16n16k16...stage2/3/4*](./hgemm/hgemm_wmma_stage.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
+| ✔️ [hgemm_wmma_m32n8k16...dbuf*](./hgemm/hgemm_wmma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [sgemv_k32_f32](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️|
 | ✔️ [sgemv_k128_f32x4](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️|
 | ✔️ [sgemv_k16_f32](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️|

hgemm/README.md

Lines changed: 182 additions & 3 deletions
Large diffs are not rendered by default.

hgemm/hgemm.cu

Lines changed: 7 additions & 0 deletions
@@ -1229,10 +1229,14 @@ void hgemm_wmma_m32n8k16_mma2x4_warp2x4_dbuf_async_offset(torch::Tensor a, torch
 void hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_rbuf_async(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_rbuf_async_offset(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 // from hgemm_wmma_stage.cu
+void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage2(torch::Tensor a, torch::Tensor b, torch::Tensor c);
+void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage2_offset(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage3(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage3_offset(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 void hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4_offset(torch::Tensor a, torch::Tensor b, torch::Tensor c);
+// from hgemm_cublas.cu
+void hgemm_cublas_tensor_op(torch::Tensor a, torch::Tensor b, torch::Tensor c);
 
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
@@ -1272,8 +1276,11 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m32n8k16_mma2x4_warp2x4_dbuf_async_offset)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_rbuf_async)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4x2_rbuf_async_offset)
+  TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage2)
+  TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage2_offset)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage3)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage3_offset)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4)
   TORCH_BINDING_COMMON_EXTENSION(hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4_offset)
+  TORCH_BINDING_COMMON_EXTENSION(hgemm_cublas_tensor_op)
 }

hgemm/hgemm.py

Lines changed: 10 additions & 6 deletions
@@ -9,7 +9,7 @@
 # Load the CUDA kernel as a python module
 lib = load(name='hgemm_lib',
            sources=['hgemm.cu', 'hgemm_async.cu', 'hgemm_wmma.cu',
-                    'hgemm_wmma_stage.cu'],
+                    'hgemm_wmma_stage.cu', 'hgemm_cublas.cu'],
            extra_cuda_cflags=[
                "-O3",
                "-U__CUDA_NO_HALF_OPERATORS__",
@@ -98,7 +98,7 @@ def run_benchmark(perf_func: callable,
                   a, b, "f16x8pack(bcf+offset)", c)
     run_benchmark(lib.hgemm_t_8x8_sliced_k_f16x8_pack_bcf_dbuf,
                   a, b, "f16x8pack(bcf+dbuf)", c)
-    print("-" * 57 + "Async" + "-" * 58)
+    print("-" * 58 + "Async" + "-" * 57)
     run_benchmark(lib.hgemm_t_8x8_sliced_k16_f16x8_pack_dbuf,
                   a, b, "f16x8pack(k16+dbuf)", c)
     run_benchmark(lib.hgemm_t_8x8_sliced_k16_f16x8_pack_dbuf_offset,
@@ -138,6 +138,8 @@ def run_benchmark(perf_func: callable,
                   a, b, "f16wmma(mma4x2+warp2x4+dbuf)", c)
     run_benchmark(lib.hgemm_wmma_m32n8k16_mma2x4_warp2x4_dbuf_async,
                   a, b, "f16wmma(m32n8k16+mma2x4+warp2x4+dbuf)", c)
+    run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage2,
+                  a, b, "f16wmma(mma2x4+warp2x4+stage2)", c)
     run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage3,
                   a, b, "f16wmma(mma2x4+warp2x4+stage3)", c)
     run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4,
@@ -150,13 +152,15 @@ def run_benchmark(perf_func: callable,
                   a, b, "f16wmma(mma4x4+warp2x2x2+dbuf+offset)", c)
     run_benchmark(lib.hgemm_wmma_m32n8k16_mma2x4_warp2x4_dbuf_async_offset,
                   a, b, "f16wmma(m32n8k16+mma2x4+warp2x4+dbuf+offset)", c)
-    run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4_offset,
-                  a, b, "f16wmma(mma4x2+warp2x4+stage4+offset)", c)
     run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_dbuf_async_offset,
                   a, b, "f16wmma(mma4x2+warp2x4+dbuf+offset)", c)
+    run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage4_offset,
+                  a, b, "f16wmma(mma4x2+warp2x4+stage4+offset)", c)
+    run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage2_offset,
+                  a, b, "f16wmma(mma4x2+warp2x4+stage2+offset)", c)
     run_benchmark(lib.hgemm_wmma_m16n16k16_mma4x2_warp2x4_stage3_offset,
                   a, b, "f16wmma(mma4x2+warp2x4+stage3+offset)", c)
-    run_benchmark(partial(torch.matmul, out=c),
-                  a, b, "f16_th")
+    run_benchmark(partial(torch.matmul, out=c), a, b, "f16_th")
+    run_benchmark(lib.hgemm_cublas_tensor_op, a, b, "f16(cublas)", c)
     print("-" * 120)

hgemm/hgemm_cublas.cu

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <float.h>
+#include <vector>
+#include <algorithm>
+#include <cuda_runtime.h>
+#include <cuda_fp16.h>
+#include <cuda_bf16.h>
+#include <cuda_fp8.h>
+#include <mma.h>
+
+#include <torch/types.h>
+#include <torch/extension.h>
+
+#include "cublas_v2.h"
+
+
+void cublas_tensor_op(half *A, half *B, half *C, size_t M,
+                      size_t N, size_t K) {
+
+  cublasHandle_t handle = nullptr;
+  cublasCreate(&handle);
+  cublasSetMathMode(handle, CUBLAS_TENSOR_OP_MATH);
+
+  static half alpha = 1.0;
+  static half beta = 0.0;
+
+  cublasGemmEx(handle,
+               CUBLAS_OP_N,
+               CUBLAS_OP_N,
+               N, M, K,
+               &alpha,
+               B, CUDA_R_16F, N,
+               A, CUDA_R_16F, K,
+               &beta,
+               C, CUDA_R_16F, N,
+               CUBLAS_COMPUTE_16F,
+               CUBLAS_GEMM_DEFAULT_TENSOR_OP);
+}
+
+// --------------------- PyTorch bindings for custom kernel -----------------------
+#define STRINGFY(str) #str
+#define TORCH_BINDING_COMMON_EXTENSION(func) \
+  m.def(STRINGFY(func), &func, STRINGFY(func));
+
+#define CHECK_TORCH_TENSOR_DTYPE(T, th_type)                 \
+if(((T).options().dtype() != (th_type))) {                   \
+  std::cout << "Tensor Info:" << (T).options() << std::endl; \
+  throw std::runtime_error("values must be "#th_type);       \
+}
+
+#define CHECK_TORCH_TENSOR_SHAPE(T, S0, S1)           \
+if (((T).size(0) != (S0)) || ((T).size(1) != (S1))) { \
+  throw std::runtime_error("Tensor size mismatch!");  \
+}
+
+// cublas tensor op
+void hgemm_cublas_tensor_op(
+  torch::Tensor a, torch::Tensor b, torch::Tensor c) {
+  CHECK_TORCH_TENSOR_DTYPE(a, torch::kHalf)
+  CHECK_TORCH_TENSOR_DTYPE(b, torch::kHalf)
+  CHECK_TORCH_TENSOR_DTYPE(c, torch::kHalf)
+  const int M = a.size(0);
+  const int K = a.size(1);
+  const int N = b.size(1);
+  CHECK_TORCH_TENSOR_SHAPE(a, M, K)
+  CHECK_TORCH_TENSOR_SHAPE(b, K, N)
+  CHECK_TORCH_TENSOR_SHAPE(c, M, N)
+
+  cublas_tensor_op(
+    reinterpret_cast<half*>(a.data_ptr()),
+    reinterpret_cast<half*>(b.data_ptr()),
+    reinterpret_cast<half*>(c.data_ptr()),
+    M, N, K
+  );
+}
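This new cuBLAS path is what the updated hgemm.py benchmark invokes as "f16(cublas)". Because cublasGemmEx assumes column-major storage, the wrapper passes B before A and swaps M and N, computing Cᵀ = Bᵀ·Aᵀ so the row-major c tensor ends up holding A·B. A minimal standalone sketch of calling the new binding from Python (the extension name hgemm_lib and source list mirror hgemm.py; the matrix sizes and tolerances are illustrative assumptions, not taken from the commit):

```python
import torch
from torch.utils.cpp_extension import load

# Build the extension from the same sources hgemm.py now lists
# (paths assumed relative to the hgemm/ directory).
lib = load(name='hgemm_lib',
           sources=['hgemm.cu', 'hgemm_async.cu', 'hgemm_wmma.cu',
                    'hgemm_wmma_stage.cu', 'hgemm_cublas.cu'],
           extra_cuda_cflags=['-O3'])

M, N, K = 4096, 4096, 1024  # example shapes only
a = torch.randn((M, K), dtype=torch.half, device='cuda')
b = torch.randn((K, N), dtype=torch.half, device='cuda')
c = torch.empty((M, N), dtype=torch.half, device='cuda')

# c = a @ b via the cuBLAS Tensor Core path added in this commit.
lib.hgemm_cublas_tensor_op(a, b, c)

# Loose tolerance: this path accumulates in half (CUBLAS_COMPUTE_16F).
torch.testing.assert_close(c, a @ b, rtol=3e-2, atol=3e-2)
```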
