Commit f75d8f6

[HGEMM] Add CuTe HGEMM with SMEM Swizzle (#134)
* Update hgemm.py
* Update hgemm.cu
* Create utils.h
* Create hgemm_mma_stage_tn_cute.cu
* Create makefile
* Update hgemm_mma_stage_tn_cute.cu
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update README.md
* Update .gitignore
* Update README.md
1 parent aabee15 commit f75d8f6

File tree: 8 files changed, +746 −25 lines

README.md

Lines changed: 6 additions & 4 deletions
@@ -25,7 +25,7 @@
@@ -25,7 +25,7 @@
 <img src='https://github.com/user-attachments/assets/c7d65fe5-9fb9-49a8-b962-a6c09bcc030a' height="225px" width="403px">
 </div>

-Currently, on NVIDIA L20, RTX 4090 and RTX 3090 Laptop, compared with cuBLAS's default Tensor Cores math algorithm `CUBLAS_GEMM_DEFAULT_TENSOR_OP`, the `HGEMM (WMMA/MMA)` implemented in this repo (`sky blue`🔵) can achieve `95%~99%` of its (`orange`🟠) performance. Please check [hgemm benchmark](./hgemm) for more details.
+Currently, on NVIDIA L20, RTX 4090 and RTX 3090 Laptop, compared with cuBLAS's default Tensor Cores math algorithm `CUBLAS_GEMM_DEFAULT_TENSOR_OP`, the `HGEMM (WMMA/MMA)` implemented in this repo (`blue`🔵) can achieve `95%~99%` of its (`orange`🟠) performance. Please check [hgemm benchmark](./hgemm) for more details.

 |CUDA Cores|Sliced K(Loop over K)|Tile Block|Tile Thread|
 |:---:|:---:|:---:|:---:|
@@ -36,8 +36,8 @@ Currently, on NVIDIA L20, RTX 4090 and RTX 3090 Laptop, compared with cuBLAS's d
 |✔️|✔️|✔️|✔️|
 |Reg Double Buffers|Block Swizzle|Warp Swizzle|Collective Store(Warp Shfl)|
 |✔️|✔️|✔️|✔️|
-|Row Major(NN)|Col Major(TN)|SGEMM TF32|SMEM Swizzle(Permuted)|
-|✔️|✔️|✔️|...|
+|Row Major(NN)|Col Major(TN)|SGEMM TF32|SMEM Swizzle(CuTe)|
+|✔️|✔️|✔️|✔️|



@@ -201,6 +201,7 @@ Currently, on NVIDIA L20, RTX 4090 and RTX 3090 Laptop, compared with cuBLAS's d
 | ✔️ [hgemm_mma_m16n8k16...mma2x4*](./hgemm/hgemm_mma.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [hgemm_mma_m16n8k16...stages*](./hgemm/hgemm_mma_stage.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [hgemm_mma_m16n8k16...swizzle*](./hgemm/hgemm_mma_stage.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
+| ✔️ [hgemm_mma_stages_tn_cute*](./hgemm/hgemm_mma_stage_tn_cute.cu)|f16|f16|[link](./hgemm/)|⭐️⭐️⭐️|
 | ✔️ [sgemv_k32_f32](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️|
 | ✔️ [sgemv_k128_f32x4](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️|
 | ✔️ [sgemv_k16_f32](./sgemv/sgemv.cu)|f32|f32|[link](./sgemv/)|⭐️⭐️⭐️|
@@ -397,5 +398,6 @@ How to contribute? please check [🌤🌤CONTRIBUTE🎉🎉](https://github.com/
 - [cuda_hgemm](https://github.com/Bruce-Lee-LY/cuda_hgemm)
 - [cuda-tensorcore-hgemm](https://github.com/nicolaswilde/cuda-tensorcore-hgemm)
 - [How_to_optimize_in_GPU](https://github.com/Liu-xiandong/How_to_optimize_in_GPU/tree/master/sgemv)
-
+- [cute_gemm](https://github.com/weishengying/cute_gemm)
+
 </details>

hgemm/.gitignore

Lines changed: 2 additions & 0 deletions
@@ -16,3 +16,5 @@ __pycache__
 *.ncu*
 *.sqlite*
 *.engine
+*.bin
+*.out

hgemm/README.md

Lines changed: 40 additions & 5 deletions
@@ -11,8 +11,8 @@
 |✔️|✔️|✔️|✔️|
 |**Reg Double Buffers**|**Block Swizzle**|**Warp Swizzle**|**Collective Store(Reg Reuse&Warp Shfl)**|
 |✔️|✔️|✔️|✔️|
-|**Row Major(NN)**|**Col Major(TN)**|**SGEMM TF32**|**SMEM Swizzle/Permuted**|
-|✔️|✔️|✔️||
+|**Row Major(NN)**|**Col Major(TN)**|**SGEMM TF32**|**SMEM Swizzle(CuTe)**|
+|✔️|✔️|✔️|✔️|

 <details>
 <summary> 🔑️ Click to see all supported HGEMM Kernels! </summary>
@@ -46,7 +46,10 @@
 
 ## Test Commands
 
+**Python**: testing directly from the python script is supported.
+
 ```bash
+git submodule update --init --recursive --force
 # Only test the Ada architecture. If unset, all architectures (Volta, Ampere, Ada, Hopper, ...) are compiled by default, which takes much longer.
 export TORCH_CUDA_ARCH_LIST=Ada
 python3 hgemm.py --wmma # test default wmma kernels for all MNK
@@ -56,19 +59,51 @@ python3 hgemm.py --M 16384 --N 16384 --K 8192 --i 10 --mma # test default mma ke
 python3 hgemm.py --wmma-all # test all wmma kernels for all MNK
 python3 hgemm.py --mma-all # test all mma kernels for all MNK
 python3 hgemm.py --cuda-all --wmma-all --mma-all # test all kernels for all MNK
+python3 hgemm.py --cute-tn --no-default # test cute hgemm with smem swizzle for all MNK
 ```
 To plot TFLOPS curves, install matplotlib first and pass the --plot-flops (or --plot) option:
 ```bash
 python3 -m pip install matplotlib
 # topk: only plot the top-k best-performing kernels
-python3 hgemm.py --mma-all --plot --topk 8
+python3 hgemm.py --mma-all --plot --topk 8
+python3 hgemm.py --cute-tn --no-default --plot # test cute hgemm with smem swizzle for all MNK
+```
+
+**C++**: the C++ tests currently only cover CuTe HGEMM. The numbers measured from the C++ binary are slightly better than those from the python tests, probably because the torch binding introduces some overhead.
+```bash
+make
+./hgemm_cute.bin
+# NVIDIA L20
+algo = CUTE HGEMM Stages 2
+M N K = 256 256 256, Time = 0.00001946 0.00002007 0.00002048 s, AVG Performance = 1.6718 Tflops
+M N K = 512 512 512, Time = 0.00003174 0.00003277 0.00003379 s, AVG Performance = 8.1920 Tflops
+M N K = 768 768 768, Time = 0.00004506 0.00004608 0.00004710 s, AVG Performance = 19.6608 Tflops
+M N K = 1024 1024 1024, Time = 0.00005837 0.00005929 0.00006042 s, AVG Performance = 36.2202 Tflops
+M N K = 9216 9216 9216, Time = 0.01371546 0.01371679 0.01371853 s, AVG Performance = 114.1314 Tflops
+M N K = 9472 9472 9472, Time = 0.01458586 0.01458924 0.01460531 s, AVG Performance = 116.4991 Tflops
+M N K = 9728 9728 9728, Time = 0.01597747 0.01597931 0.01598157 s, AVG Performance = 115.2239 Tflops
+M N K = 9984 9984 9984, Time = 0.01741721 0.01742008 0.01743462 s, AVG Performance = 114.2598 Tflops
+M N K = 10240 10240 10240, Time = 0.01839923 0.01840046 0.01840230 s, AVG Performance = 116.7081 Tflops
+M N K = 10496 10496 10496, Time = 0.01993421 0.01993523 0.01993728 s, AVG Performance = 116.0059 Tflops
+M N K = 10752 10752 10752, Time = 0.02151629 0.02151956 0.02153472 s, AVG Performance = 115.5219 Tflops
+M N K = 11008 11008 11008, Time = 0.02315571 0.02315663 0.02315878 s, AVG Performance = 115.2073 Tflops
+M N K = 11264 11264 11264, Time = 0.02484634 0.02484808 0.02484941 s, AVG Performance = 115.0311 Tflops
+M N K = 11520 11520 11520, Time = 0.02659226 0.02659430 0.02659840 s, AVG Performance = 114.9738 Tflops
+M N K = 11776 11776 11776, Time = 0.02780057 0.02780426 0.02781082 s, AVG Performance = 117.4660 Tflops
+M N K = 12032 12032 12032, Time = 0.03024179 0.03024701 0.03025818 s, AVG Performance = 115.1757 Tflops
+M N K = 12288 12288 12288, Time = 0.03214848 0.03215698 0.03217306 s, AVG Performance = 115.3980 Tflops
+M N K = 12544 12544 12544, Time = 0.03410842 0.03411661 0.03412173 s, AVG Performance = 115.7104 Tflops
+M N K = 12800 12800 12800, Time = 0.03612979 0.03613184 0.03613491 s, AVG Performance = 116.0833 Tflops
+M N K = 13056 13056 13056, Time = 0.03820134 0.03820769 0.03821671 s, AVG Performance = 116.4956 Tflops
+M N K = 15872 15872 15872, Time = 0.06917632 0.06927145 0.06936883 s, AVG Performance = 115.4438 Tflops
+M N K = 16128 16128 16128, Time = 0.07299379 0.07302472 0.07304806 s, AVG Performance = 114.8951 Tflops
 ```
 
 ## Current Performance
 
 ### NVIDIA L20
 
-With the current best implementation on L20 (theoretical Tensor Cores FP16 throughput 119.5 TFLOPS), the WMMA API reaches roughly 95%~98% of cuBLAS performance (105-113 TFLOPS vs 105-115 TFLOPS) and the MMA API reaches 115 TFLOPS, beating cuBLAS in some cases. A known issue is that bank conflicts are not fully eliminated: mitigating them via padding wastes shared memory and also hurts SM occupancy, and smem swizzle/permute has not yet been implemented by hand (limited by the flexibility of the WMMA API and the row major layout); a later attempt will implement smem swizzle/permute via MMA PTX.
+With the current best implementation on L20 (theoretical Tensor Cores FP16 throughput 119.5 TFLOPS), the WMMA API reaches roughly 95%~98% of cuBLAS performance (105-113 TFLOPS vs 105-115 TFLOPS) and the MMA API reaches 115 TFLOPS, beating cuBLAS in some cases. Bank conflicts are now mitigated via padding and smem swizzle: the NN layout uses smem padding to mitigate bank conflicts, while the TN layout eliminates them with the smem swizzle/permuted layout from cutlass cute.
 
 <div id="NV-L20"></div>

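For orientation, the `AVG Performance` column in the C++ benchmark output above is just the standard GEMM throughput formula, 2·M·N·K floating-point operations divided by the averaged runtime; the minimal sketch below (not the benchmark's actual code) reproduces the figures:

```c++
#include <cstdio>

// An MxNxK GEMM performs M*N*K fused multiply-adds, i.e. 2*M*N*K FLOPs.
// Dividing by the measured average time (seconds) and 1e12 yields TFLOPS.
double gemm_tflops(int M, int N, int K, double avg_seconds) {
  double flops = 2.0 * static_cast<double>(M) * N * K;
  return flops / avg_seconds / 1e12;
}

int main() {
  // M=N=K=256 at an average of 0.00002007 s -> ~1.67 TFLOPS,
  // in line with the first row of the table above.
  std::printf("%.4f Tflops\n", gemm_tflops(256, 256, 256, 0.00002007));
  return 0;
}
```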
@@ -227,7 +262,7 @@ NVIDIA's [article](https://developer.nvidia.com/blog/using-shared-memory-cuda-cc/
 ```C
 cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeEightByte);
 ```
-This project currently mitigates bank conflicts via padding, which wastes shared memory and also hurts SM occupancy; smem swizzle/permute has not yet been implemented by hand (limited by the flexibility of the WMMA API and the row major layout), and a later attempt will implement smem swizzle/permute via MMA PTX.
+Bank conflicts are now mitigated via padding and smem swizzle: the NN layout uses smem padding to mitigate bank conflicts, while the TN layout eliminates them with the smem swizzle/permuted layout from cutlass cute.
 
 ### Double Buffers

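To make the swizzle change above concrete, the sketch below shows how a swizzled shared-memory layout is typically declared with CuTe, in the style of the CUTLASS CuTe GEMM examples this kernel follows. The tile sizes (BM=128, BK=32, 2 stages) and the `Swizzle<3, 3, 3>` parameters are illustrative assumptions, not necessarily the exact values used in `hgemm_mma_stage_tn_cute.cu`:

```c++
#include <cute/tensor.hpp>
using namespace cute;

// 8x32 half-precision layout atom: Swizzle<3,3,3> XORs row bits into the
// column index, so rows that would otherwise land in the same shared-memory
// banks are spread across different banks -- no padding, no wasted SMEM.
using SmemLayoutAtom = decltype(composition(
    Swizzle<3, 3, 3>{},
    make_layout(make_shape(Int<8>{}, Int<32>{}),
                make_stride(Int<32>{}, Int<1>{}))));

// Replicate the atom over the whole (BM, BK, Stages) staging buffer.
using SmemLayoutA = decltype(tile_to_shape(
    SmemLayoutAtom{},
    make_shape(Int<128>{}, Int<32>{}, Int<2>{})));

// Inside the kernel the swizzled view is built over raw dynamic SMEM, e.g.:
//   extern __shared__ half_t smem[];
//   auto sA = make_tensor(make_smem_ptr(smem), SmemLayoutA{});
```

Because the permutation lives in the layout itself, both the global-to-shared copies and the `ldmatrix` loads go through the same conflict-free mapping, which is how the TN path avoids the padding still used on the NN path.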
hgemm/hgemm.cu

Lines changed: 8 additions & 1 deletion
@@ -1023,7 +1023,10 @@ void hgemm_mma_m16n8k16_mma2x4_warp4x4x2_stages_dsmem_x4(torch::Tensor a, torch:
 void hgemm_mma_m16n8k16_mma2x4_warp4x4x2_stages_dsmem_rr(torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride);
 // from hgemm_mma_stage_tn.cu
 void hgemm_mma_m16n8k16_mma2x4_warp4x4_stages_dsmem_tn(torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride);
-
+#ifdef ENBLE_CUTE_HGEMM
+// from hgemm_mma_stage_tn_cute.cu
+void hgemm_mma_stages_tn_cute(torch::Tensor a, torch::Tensor b, torch::Tensor c, int stages, bool swizzle, int swizzle_stride);
+#endif
 
 PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   // CUDA Cores FP16
@@ -1067,5 +1070,9 @@ PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
   TORCH_BINDING_COMMON_EXTENSION(hgemm_mma_m16n8k16_mma2x4_warp4x4x2_stages_dsmem_rr)
   // TN: A row major MxK, B col major NxK, C row major MxN
   TORCH_BINDING_COMMON_EXTENSION(hgemm_mma_m16n8k16_mma2x4_warp4x4_stages_dsmem_tn)
+  // cute hgemm
+#ifdef ENBLE_CUTE_HGEMM
+  TORCH_BINDING_COMMON_EXTENSION(hgemm_mma_stages_tn_cute)
+#endif
 }

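The `ENBLE_CUTE_HGEMM` guard above is switched on from `hgemm.py` (via `-DENBLE_CUTE_HGEMM` in `get_build_cuda_cflags()`), so the binding only exists when the CuTe sources are actually compiled in. `TORCH_BINDING_COMMON_EXTENSION` is a repo-local macro; with plain pybind11 the same conditional registration would look roughly like the sketch below, where the macro and kernel names are hypothetical:

```c++
#include <torch/extension.h>

// Hypothetical kernel, only declared/bound when the macro is defined,
// e.g. by passing "-DENABLE_MY_CUTE_HGEMM" through extra_cuda_cflags.
#ifdef ENABLE_MY_CUTE_HGEMM
void my_cute_hgemm(torch::Tensor a, torch::Tensor b, torch::Tensor c,
                   int stages, bool swizzle, int swizzle_stride);
#endif

PYBIND11_MODULE(TORCH_EXTENSION_NAME, m) {
#ifdef ENABLE_MY_CUTE_HGEMM
  m.def("my_cute_hgemm", &my_cute_hgemm, "optional CuTe HGEMM binding");
#endif
}
```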
hgemm/hgemm.py

Lines changed: 52 additions & 15 deletions
@@ -1,3 +1,4 @@
+import os
 import torch
 import time
 from torch.utils.cpp_extension import load
@@ -30,6 +31,8 @@ def get_args():
     parser.add_argument("--enable-wmma-all", "--wmma-all", action="store_true", help="Enable all WMMA kernel tests")
     parser.add_argument("--enable-cuda-all", "--cuda-all", action="store_true", help="Enable all CUDA kernel tests")
     parser.add_argument("--enable-torch", "--torch", action="store_true", help="Enable torch matmul")
+    parser.add_argument("--enable-cute-tn", "--cute-tn", action="store_true", help="Enable cute hgemm matmul")
+    parser.add_argument("--enable-cute", "--cute", action="store_true", help="Enable cute hgemm matmul")
     parser.add_argument("--disable-cublas", "--no-cublas", action="store_true", help="Disable cublas hgemm")
     parser.add_argument("--disable-cublas-tn", "--no-cublas-tn", action="store_true", help="Disable cublas TN hgemm")
     parser.add_argument("--sleep-duration", "--sleep", type=float, default=0.1, help="Sleep duration")
@@ -42,6 +45,7 @@ def get_args():
     parser.add_argument("--save-dir", "--dir", type=str, default="./", help="Save dir for plot")
     return parser.parse_args()
 
+
 args = get_args()
 print(args)
 
@@ -58,15 +62,25 @@ def get_device_capability():
     return torch.cuda.get_device_capability(torch.cuda.current_device())
 
 
-# Load the CUDA kernel as a python module
-print(f"Loading hgemm lib on device: {get_device_name()}, capability: {get_device_capability()} ...")
+def get_build_sources():
+    build_sources = [
+        'hgemm.cu', 'hgemm_async.cu', 'hgemm_wmma.cu',
+        'hgemm_wmma_stage.cu', 'hgemm_cublas.cu',
+        'hgemm_mma.cu', 'hgemm_mma_stage.cu',
+        'hgemm_mma_stage_tn.cu'
+    ]
+    # if args.enable_cute_tn:
+    #     build_sources.append('hgemm_mma_stage_tn_cute.cu')
+    build_sources.append('hgemm_mma_stage_tn_cute.cu')
+    return build_sources
 
-lib = load(name='hgemm_lib',
-           sources=['hgemm.cu', 'hgemm_async.cu', 'hgemm_wmma.cu',
-                    'hgemm_wmma_stage.cu', 'hgemm_cublas.cu',
-                    'hgemm_mma.cu', 'hgemm_mma_stage.cu',
-                    'hgemm_mma_stage_tn.cu'],
-           extra_cuda_cflags=[
+
+def get_project_dir():
+    return os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+
+
+def get_build_cuda_cflags():
+    extra_cuda_cflags=[
         "-O3",
         "-U__CUDA_NO_HALF_OPERATORS__",
         "-U__CUDA_NO_HALF_CONVERSIONS__",
@@ -94,7 +108,23 @@ def get_device_capability():
         # spill loads: reload data previously spilled to the stack back into registers.
         "-Xptxas -v",
         # "-maxrregcount=128 -Xptxas -dlcm=cg" if args.reduce_reg else ""
-    ],
+    ]
+    # extra cuda flags for cute hgemm
+    project_dir = get_project_dir()
+    extra_cuda_cflags.append('-DNO_CUTE_HGEMM_BIN')
+    extra_cuda_cflags.append('-DENBLE_CUTE_HGEMM')
+    extra_cuda_cflags.append(f'-I {project_dir}')
+    extra_cuda_cflags.append(f'-I {project_dir}/third-party/cutlass/include')
+    extra_cuda_cflags.append(f'-I {project_dir}/third-party/cutlass/tools/util/include')
+
+    return extra_cuda_cflags
+
+# Load the CUDA kernel as a python module
+print(f"Loading hgemm lib on device: {get_device_name()}, capability: {get_device_capability()} ...")
+
+lib = load(name='hgemm_lib',
+           sources=get_build_sources(),
+           extra_cuda_cflags=get_build_cuda_cflags(),
            extra_cflags=['-std=c++17'],
            verbose=args.verbose)

@@ -254,6 +284,7 @@ def plot_tflops():
     STATIS_INFO["(best)"] = get_best_tflops()
     draw_tags = topk_tflops
     draw_tags.append("(cublas)")
+    draw_tags.append("tn(cublas)")
     draw_tags.append("(best)")
 
     def skip_it(tag: str) -> bool:
@@ -269,10 +300,10 @@ def skip_it(tag: str) -> bool:
         if skip_it(tag):
             continue
         if "cublas" in tag:
-            ax.plot(tflops, label=tag, linewidth=3)
+            ax.plot(tflops, label=tag, linewidth=3, color='orange')
         else:
             if "best" in tag and not args.no_plot_best:
-                ax.plot(tflops, label=tag, linewidth=4)
+                ax.plot(tflops, label=tag, linewidth=4, color='blue')
             else:
                 ax.plot(tflops, label=tag, linestyle='--')
 
@@ -400,15 +431,21 @@ def get_mnk(sep: int = args.SEP):
     run_benchmark(lib.hgemm_cublas_tensor_op_nn, a, b, "(cublas)", c)
     if args.enable_torch:
         run_benchmark(partial(torch.matmul, out=c), a, b, "(torch)")
-    if args.enable_mma_tn:
+    # TN layout: A row major with shape [M,K], B col major with shape [N,K]
+    if any((args.enable_mma_tn, args.enable_cute_tn)):
         MAX_TFLOPS = -1
-        print("-" * 68 + "MMA(TN)" + "-" * 55)
+        print("-" * 68 + "TN" + "-" * 60)
+    if args.enable_mma_tn:
         run_benchmark(lib.hgemm_mma_m16n8k16_mma2x4_warp4x4_stages_dsmem_tn, a, b.transpose(1, 0), "tn(mma2x4+warp4x4+stage3+dsmem)", c, stages=3)
         run_benchmark(lib.hgemm_mma_m16n8k16_mma2x4_warp4x4_stages_dsmem_tn, a, b.transpose(1, 0), "tn(mma2x4+warp4x4+stage2+dsmem)", c, stages=2)
         run_benchmark(lib.hgemm_mma_m16n8k16_mma2x4_warp4x4_stages_dsmem_tn, a, b.transpose(1, 0), "tn(mma2x4+warp4x4+stage3+dsmem+swizzle)", c, stages=3, swizzle=True)
         run_benchmark(lib.hgemm_mma_m16n8k16_mma2x4_warp4x4_stages_dsmem_tn, a, b.transpose(1, 0), "tn(mma2x4+warp4x4+stage2+dsmem+swizzle)", c, stages=2, swizzle=True)
-        if not args.disable_cublas_tn:
-            run_benchmark(lib.hgemm_cublas_tensor_op_tn, a, b.transpose(1, 0), "tn(cublas)", c)
+    if args.enable_cute_tn:
+        run_benchmark(lib.hgemm_mma_stages_tn_cute, a, b.transpose(1, 0), "tn(cute+swizzle<smem>+stage4)", c, stages=4)
+        run_benchmark(lib.hgemm_mma_stages_tn_cute, a, b.transpose(1, 0), "tn(cute+swizzle<smem>+stage3)", c, stages=3)
+        run_benchmark(lib.hgemm_mma_stages_tn_cute, a, b.transpose(1, 0), "tn(cute+swizzle<smem>+stage2)", c, stages=2)
+    if not args.disable_cublas_tn and any((args.enable_mma_tn, args.enable_cute_tn)):
+        run_benchmark(lib.hgemm_cublas_tensor_op_tn, a, b.transpose(1, 0), "tn(cublas)", c)
     torch.cuda.synchronize()
     print("-" * 130)

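The TN convention noted in the comment above (A row major `[M,K]`, B passed as `b.transpose(1, 0)`, i.e. stored as `[N,K]`) means both operands are traversed along their contiguous K dimension. A naive CUDA kernel with that indexing, for illustration only and not the repo's tensor-core implementation:

```c++
#include <cuda_fp16.h>

// Illustrative TN HGEMM: A row major [M,K], B stored as [N,K] (column major
// from the GEMM's point of view), C row major [M,N]. One thread per element;
// the real kernels tile this with MMA/CuTe, stages and swizzled SMEM.
__global__ void naive_hgemm_tn(const half* A, const half* B, half* C,
                               int M, int N, int K) {
  int m = blockIdx.y * blockDim.y + threadIdx.y;
  int n = blockIdx.x * blockDim.x + threadIdx.x;
  if (m >= M || n >= N) return;
  float acc = 0.0f;
  for (int k = 0; k < K; ++k) {
    // Both A and B are read along their innermost, contiguous K dimension.
    acc += __half2float(A[m * K + k]) * __half2float(B[n * K + k]);
  }
  C[m * N + n] = __float2half(acc);
}
```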