xlite-dev
diff --git a/README.md b/README.md
Lines changed: 3 additions & 3 deletions
diff --git a/flash-attn/README.md b/flash-attn/README.md
Lines changed: 15 additions & 52 deletions
diff --git a/flash-attn/flash_attn.cc b/flash-attn/flash_attn.cc
Lines changed: 0 additions & 9 deletions
diff --git a/flash-attn/flash_attn.py b/flash-attn/flash_attn.py
Lines changed: 64 additions & 49 deletions
@@ -87,9 +87,9 @@
 | ✔️ [hgemv_k16_f16_kernel](./hgemv)|f16|f16|[link](./hgemv/)|⭐️⭐️⭐️|  
 | ✔️ [flash_attn_1_fwd_f32_kernel](./flash-attn/flash_attn_1_fwd_f32.cu)|f32|f32|[link](./flash-attn)|⭐️⭐️⭐️|  
 | ❔ [flash_attn_2_fwd_f32_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f32|f32|[link](./flash-attn)|⭐️⭐️⭐️|  
-| ❔ [flash_attn_2_fwd_f16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f16|f32|[link](./flash-attn)|⭐️⭐️⭐️|  
-| ❔ [flash_attn_2_fwd_bf16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|bf16|f32|[link](./flash-attn)|⭐️⭐️⭐️|  
-| ✔️ [hard_nms cpp only](./nms/nms.cc)|f32|/|❔|⭐️|  
+| ❔ [flash_attn_2_fwd_f16_kernel](./flash-attn/flash_attn_2_fwd_f32.cu)|f16|f16|[link](./flash-attn)|⭐️⭐️⭐️|  
+| ✔️ [flash_attn_2_fwd_f16_mma_m16n8k16](./flash-attn/flash_attn_2_fwd_f16_mma_m16n8k16.cu)|f16|f16|[link](./flash-attn)|⭐️⭐️⭐️|  
+| ✔️ [hard_nms cpp only](./nms/nms.cc)|f32|/|/|⭐️|  
 | ✔️ [notes v1(deprecated)](./notes-v1.cu)|f32|f32|/|⭐️|  
 
 ## 0x01 📖 博客目录
 
@@ -1,61 +1,24 @@
-## FlashAttention 测试  
+# FlashAttention
 
-### 前置依赖  
-- PyTorch >= 2.2.1  
-- CUDA >= 12.2
+## 0x00 说明
 
-```bash
-python3 -m pip install torch
-```
+包含以下内容：
+
+- [X] flash_attn_1_fwd_f32_kernel 
+- [ ] flash_attn_2_fwd_f32_kernel
+- [ ] flash_attn_2_fwd_f16_kernel
+- [x] flash_attn_2_fwd_f16_mma_m16n8k16_kernel
+- [X] PyTorch bindings
 
 ### 运行测试   
 ```bash
 python3 flash_attn.py
 ```
-日志如下：（RTX 3080 Ti）
+日志如下：
 ```bash
-python3 flash_attn.py
-=== profiling manual attention ===
-STAGE:2024-03-25 08:47:18 3818250:3818250 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
-STAGE:2024-03-25 08:47:18 3818250:3818250 ActivityProfilerController.cpp:320] Completed Stage: Collection
-STAGE:2024-03-25 08:47:18 3818250:3818250 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
--------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
-                     Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  Total KFLOPs
--------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
-              manual_attn        45.60%     513.000us        98.31%       1.106ms       1.106ms     489.000us        43.82%       1.116ms       1.116ms             1            --
-             aten::matmul        14.31%     161.000us        40.27%     453.000us     226.500us     131.000us        11.74%     496.000us     248.000us             2            --
-                aten::bmm         5.78%      65.000us         7.82%      88.000us      44.000us     166.000us        14.87%     166.000us      83.000us             2    201326.592
-            aten::reshape         4.98%      56.000us         7.38%      83.000us      20.750us      74.000us         6.63%     105.000us      26.250us             4            --
-             aten::expand         4.62%      52.000us         6.13%      69.000us      17.250us      65.000us         5.82%      90.000us      22.500us             4            --
-          aten::transpose         3.47%      39.000us         4.27%      48.000us      48.000us      44.000us         3.94%      54.000us      54.000us             1            --
-            aten::softmax         1.16%      13.000us         3.47%      39.000us      39.000us      17.000us         1.52%      44.000us      44.000us             1            --
-         aten::as_strided         0.53%       6.000us         0.53%       6.000us       1.200us      35.000us         3.14%      35.000us       7.000us             5            --
-                aten::mul         2.22%      25.000us         2.84%      32.000us      32.000us      33.000us         2.96%      33.000us      33.000us             1       786.432
-           aten::_softmax         1.42%      16.000us         1.96%      22.000us      22.000us      27.000us         2.42%      27.000us      27.000us             1            --
--------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
-Self CPU time total: 1.125ms
-Self CUDA time total: 1.116ms
-
-=== profiling flash_attn_1_fwd_f32 attention ===
-STAGE:2024-03-25 08:47:18 3818250:3818250 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
-STAGE:2024-03-25 08:47:18 3818250:3818250 ActivityProfilerController.cpp:320] Completed Stage: Collection
-STAGE:2024-03-25 08:47:18 3818250:3818250 ActivityProfilerController.cpp:324] Completed Stage: Post Processing
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
-                      Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
-      flash_attn_1_fwd_f32         5.76%     148.000us        15.72%     404.000us     404.000us       1.804ms        96.37%       1.872ms       1.872ms             1
-          aten::zeros_like         1.21%      31.000us         5.21%     134.000us     134.000us       8.000us         0.43%      31.000us      31.000us             1
-               aten::zero_         1.44%      37.000us         2.96%      76.000us      38.000us      11.000us         0.59%      25.000us      12.500us             2
-               aten::zeros         0.78%      20.000us         2.41%      62.000us      62.000us       8.000us         0.43%      21.000us      21.000us             1
-               aten::fill_         0.89%      23.000us         1.60%      41.000us      13.667us      19.000us         1.01%      19.000us       6.333us             3
-                aten::full         0.74%      19.000us         1.71%      44.000us      44.000us       9.000us         0.48%      16.000us      16.000us             1
-          aten::empty_like         1.01%      26.000us         1.71%      44.000us      44.000us       6.000us         0.32%       8.000us       8.000us             1
-               aten::empty         0.62%      16.000us         0.62%      16.000us       8.000us       5.000us         0.27%       5.000us       2.500us             2
-       aten::empty_strided         0.54%      14.000us         0.54%      14.000us      14.000us       2.000us         0.11%       2.000us       2.000us             1
-           cudaEventRecord         2.18%      56.000us         2.18%      56.000us       2.154us       0.000us         0.00%       0.000us       0.000us            26
---------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------
-Self CPU time total: 2.570ms
-Self CUDA time total: 1.872ms
-
-attn values sanity check: True
+--------------------------------------------------------------------------------
+    out_fa1fwdf32: [0.11064263, 0.08648866, -0.07250906], time:2.32403278ms
+out_fa1fwdf32(v2): [0.11064263, 0.08648866, -0.07250906], time:2.22899675ms
+   out_attnf32_th: [0.11064263, 0.08648865, -0.07250906], time:0.11474848ms
+--------------------------------------------------------------------------------
 ```
@@ -4,67 +4,82 @@
 import torch
 from torch.nn import functional as F
 from torch.utils.cpp_extension import load
+from functools import partial
+from typing import Optional
 
 torch.set_grad_enabled(False)
 # Load the CUDA kernel as a python module
-custom_flash_attn = load(name='custom_flash_attn', 
-                         sources=[
-                            'flash_attn.cc',
-                            'flash_attn_1_fwd_f32.cu',
-                            'flash_attn_2_fwd_f32.cu'
-                         ], 
-                         extra_cuda_cflags=['-O2'])
+lib = load(name='flash_attn_lib',  # JIT-build the extension; the benchmark calls below go through `lib`
+           sources=['flash_attn_1_fwd_f32.cu'],  # single source file; presumably it now carries the PyTorch bindings as well - TODO confirm
+           extra_cuda_cflags=[
+               "-O3",
+                "-U__CUDA_NO_HALF_OPERATORS__",  # -U undefines a macro; presumably re-enables half/bf16 operators and conversions that the default extension build disables - TODO confirm
+                "-U__CUDA_NO_HALF_CONVERSIONS__",
+                "-U__CUDA_NO_HALF2_OPERATORS__",
+                "-U__CUDA_NO_BFLOAT16_CONVERSIONS__",
+                "--expt-relaxed-constexpr",
+                "--expt-extended-lambda",
+                "--use_fast_math"  # nvcc fast-math: trades float accuracy for speed; keep in mind when comparing against the PyTorch reference
+            ], 
+           extra_cflags=['-std=c++17'])  # flags for the host C++ compiler
 
 # Use small model params, otherwise slower than manual attention. See caveats in README.
-batch_size = 16
-n_head = 12
-seq_len = 64
-head_embd = 64
 
-q = torch.randn(batch_size, n_head, seq_len, head_embd).float().cuda()
-k = torch.randn(batch_size, n_head, seq_len, head_embd).float().cuda()
-v = torch.randn(batch_size, n_head, seq_len, head_embd).float().cuda()
-q.requires_grad = False
-k.requires_grad = False
-v.requires_grad = False
-print('=== profiling manual attention ===')
-
-def manual_attn(q, k, v):
+# Un-fused reference attention built from stock PyTorch ops: softmax(q @ k^T / sqrt(k.size(-1))) @ v, with the full score tensor `att` materialized.
+def manual_attn(q: torch.Tensor, k: torch.Tensor, v: torch.Tensor):
     att = (q @ k.transpose(-2, -1) * (1.0 / math.sqrt(k.size(-1))))
     att = F.softmax(att, dim=-1)
     y = att @ v
     return y
 
-for _ in range(2): 
-    manual_result = manual_attn(q, k, v) # warmup
-
-torch.cuda.synchronize()
-with torch.autograd.profiler.profile(use_cuda=True, with_flops=True) as prof:
-    with torch.autograd.profiler.record_function("manual_attn"):
-        manual_result = manual_attn(q, k, v)
-print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10))
 
-for _ in range(2): 
-    custom_result = custom_flash_attn.flash_attn_1_fwd_f32(q, k, v) # warmup
-print('=== profiling flash_attn_1_fwd_f32 attention === ')
-with torch.autograd.profiler.profile(use_cuda=True, with_flops=True) as prof:
-     with torch.autograd.profiler.record_function("flash_attn_1_fwd_f32"):
-        custom_result = custom_flash_attn.flash_attn_1_fwd_f32(q, k, v)
-print(prof.key_averages().table(sort_by='cuda_time_total', row_limit=10))
-print('attn values sanity check:', torch.allclose(custom_result, manual_result, rtol=0, atol=1e-02))
-
-# Why custom flash attn is slow than naive attn in for loop test ?
-REPEAT = 10
-manual_result = manual_attn(q, k, v) # warmup
-st = time.time()
-for _ in range(REPEAT):
-    manual_result = manual_attn(q, k, v)
+def run_benchmark(perf_func: callable,  # function under test; called as perf_func(q, k, v) or perf_func(q, k, v, out)
+                  q: torch.Tensor, k: torch.Tensor, v: torch.Tensor,
+                  tag: str, out: Optional[torch.Tensor] = None,  # tag labels the printed row; out, when given, is passed to perf_func as a 4th argument
+                  warmup: int = 10, iters: int = 200,  # untimed warmup calls, then timed calls
+                  show_all: bool = False):  # also print out[0, 0, 0, :] when True
+    if out is not None: 
+        out.fill_(0)      # zero the caller's buffer in place before the first call
+    if out is not None:  # NOTE(review): same condition as the check above; the two branches could be merged
+        for i in range(warmup):
+            perf_func(q, k, v, out)
+    else:
+        for i in range(warmup):
+            _ = perf_func(q, k, v)
+    
     torch.cuda.synchronize()
-print(f"manual attention mean time(ms): {((time.time() - st) * 1000) / REPEAT}")
-custom_result = custom_flash_attn.flash_attn_1_fwd_f32(q, k, v)  # warmup
-st = time.time()
-for _ in range(REPEAT):
-    custom_result = custom_flash_attn.flash_attn_1_fwd_f32(q, k, v)
+    start = time.time()  # wall-clock start, taken after the synchronize so pending warmup work is not timed
+    # Timed loop: call perf_func `iters` times.
+    if out is not None:
+        for i in range(iters):
+            perf_func(q, k, v, out)
+    else:
+        for i in range(iters):
+            out = perf_func(q, k, v)  # keep the last returned tensor for the printout and return value
     torch.cuda.synchronize()
-print(f"flash_attn_1_fwd_f32 mean time(ms): {((time.time() - st) * 1000) / REPEAT}")
+    end = time.time()  # taken after the second synchronize so queued GPU work is included
+    total_time = (end - start) * 1000 # ms
+    mean_time = total_time / iters  # mean ms per call; NOTE(review): raises ZeroDivisionError when iters == 0
+    out_info = f"out_{tag}"
+    out_val = out.flatten().detach().cpu().numpy().tolist()[:3]  # first three output elements, copied to host for printing
+    out_val = [round(v, 8) for v in out_val]  # `v` here is the comprehension variable, not the v tensor argument
+    print(f"{out_info:>17}: {out_val}, time:{mean_time:.8f}ms")
+    if show_all: print(out[0, 0, 0, :])  # four-index access, so out is treated as a 4-D tensor here
+    return out.clone(), mean_time  # returns a copy of the output together with the mean time in ms
+
 
+print("-" * 80)
+# batch_size, n_head, seq_len, head_dim (B,nh,N,d)
+B, nh, N, d = 16, 12, 64, 64
+q = torch.randn(B, nh, N, d).float().cuda().contiguous()
+k = torch.randn(B, nh, N, d).float().cuda().contiguous()
+v = torch.randn(B, nh, N, d).float().cuda().contiguous()
+o = torch.randn(B, nh, N, d).float().cuda().contiguous()  # output buffer for the v2 call below; run_benchmark zero-fills it before the first call
+q.requires_grad = False
+k.requires_grad = False
+v.requires_grad = False
+o.requires_grad = False
+run_benchmark(lib.flash_attn_1_fwd_f32,    q, k, v, "fa1fwdf32")  # called as f(q, k, v); the result is the return value
+run_benchmark(lib.flash_attn_1_fwd_f32_v2, q, k, v, "fa1fwdf32(v2)", o)  # called as f(q, k, v, o) with the caller-provided buffer
+run_benchmark(manual_attn,                 q, k, v, "attnf32_th")  # un-fused PyTorch baseline
+print("-" * 80)