Commit 15d2334

enable autotune for all attn
1 parent f213106 commit 15d2334

1 file changed

benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 16 additions & 34 deletions
@@ -156,7 +156,7 @@ def _attn_fwd(Q, K, V, sm_scale, M, Out, #
     triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w) \
     for BM in [128, 256] \
     for BN in [32, 64] \
-    for s in [3, 4] \
+    for s in [2, 3, 4] \
     for w in [8, 16, 32] \
 ]

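Note: the only change in this hunk is the pipeline-stage sweep, which grows from [3, 4] to [2, 3, 4]. A quick sanity check of what that does to the autotuner's search space (a throwaway sketch; the values are copied straight from the comprehension above):

    from itertools import product

    # Configs per autotune key: BLOCK_M x BLOCK_N x num_stages x num_warps
    before = len(list(product([128, 256], [32, 64], [3, 4], [8, 16, 32])))    # 2*2*2*3 = 24
    after = len(list(product([128, 256], [32, 64], [2, 3, 4], [8, 16, 32])))  # 2*2*3*3 = 36
    print(before, after)  # 24 36

So the first tuning run per key now benchmarks 36 candidate configs instead of 24.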
@@ -170,43 +170,25 @@ def forward(q, k, v, causal, sm_scale):
     assert Lq == Lk and Lk == Lv
     assert Lk in {16, 32, 64, 128}
     o = torch.empty_like(q, dtype=torch.float32)
-    BLOCK_M = 128
-    BLOCK_N = 64 if Lk <= 64 else 32
-    num_stages = 3
-    num_warps = 8 if Lq == 64 else 16
+    #BLOCK_M = 128
+    #BLOCK_N = 64 if Lk <= 64 else 32
+    #num_stages = 3
+    #num_warps = 8 if Lq == 64 else 16
     stage = 3 if causal else 1
     grid = lambda args: (q.shape[0], q.shape[1], triton.cdiv(q.shape[2], args['BLOCK_M']))
     M = torch.empty((q.shape[0], q.shape[1], q.shape[2]), device=q.device, dtype=torch.float32)

-    if os.getenv('TRITON_INTEL_ADVANCED_PATH', '0') == '0':
-        # default pipeline
-        tune_attn_fwd[grid](
-            q, k, v, sm_scale, M, o, #
-            q.stride(0), q.stride(1), q.stride(2), q.stride(3), #
-            k.stride(0), k.stride(1), k.stride(2), k.stride(3), #
-            v.stride(0), v.stride(1), v.stride(2), v.stride(3), #
-            o.stride(0), o.stride(1), o.stride(2), o.stride(3), #
-            q.shape[0], q.shape[1], #
-            N_CTX=q.shape[2], #
-            BLOCK_DMODEL=Lk, #
-            STAGE=stage, #
-        )
-    else:
-        _attn_fwd[grid](
-            q, k, v, sm_scale, M, o, #
-            q.stride(0), q.stride(1), q.stride(2), q.stride(3), #
-            k.stride(0), k.stride(1), k.stride(2), k.stride(3), #
-            v.stride(0), v.stride(1), v.stride(2), v.stride(3), #
-            o.stride(0), o.stride(1), o.stride(2), o.stride(3), #
-            q.shape[0], q.shape[1], #
-            N_CTX=q.shape[2], #
-            BLOCK_M=BLOCK_M, #
-            BLOCK_N=BLOCK_N, #
-            BLOCK_DMODEL=Lk, #
-            STAGE=stage, #
-            num_warps=num_warps, #
-            num_stages=num_stages #
-        )
+    tune_attn_fwd[grid](
+        q, k, v, sm_scale, M, o, #
+        q.stride(0), q.stride(1), q.stride(2), q.stride(3), #
+        k.stride(0), k.stride(1), k.stride(2), k.stride(3), #
+        v.stride(0), v.stride(1), v.stride(2), v.stride(3), #
+        o.stride(0), o.stride(1), o.stride(2), o.stride(3), #
+        q.shape[0], q.shape[1], #
+        N_CTX=q.shape[2], #
+        BLOCK_DMODEL=Lk, #
+        STAGE=stage, #
+    )
     return o

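With the TRITON_INTEL_ADVANCED_PATH branch removed, every call goes through the autotuned tune_attn_fwd, so BLOCK_M, BLOCK_N, num_warps, and num_stages come from the autotuner rather than the hand-picked constants that are now commented out. For context, a minimal sketch of how such a kernel is typically declared, assuming tune_attn_fwd shares _attn_fwd's body; the decorator arguments, key list, and parameter names below are illustrative assumptions, not taken from this commit:

    import triton
    import triton.language as tl

    configs = [
        triton.Config({'BLOCK_M': BM, 'BLOCK_N': BN, 'grf_mode': 'large'}, num_stages=s, num_warps=w)
        for BM in [128, 256]
        for BN in [32, 64]
        for s in [2, 3, 4]
        for w in [8, 16, 32]
    ]

    # triton.autotune benchmarks every config on first launch and caches the
    # winner per distinct value of the `key` arguments (key list assumed here).
    @triton.autotune(configs=configs, key=['N_CTX', 'BLOCK_DMODEL'])
    @triton.jit
    def tune_attn_fwd(Q, K, V, sm_scale, M, Out,  #
                      stride_qz, stride_qh, stride_qm, stride_qk,  #
                      stride_kz, stride_kh, stride_kn, stride_kk,  #
                      stride_vz, stride_vh, stride_vk, stride_vn,  #
                      stride_oz, stride_oh, stride_om, stride_on,  #
                      Z, H, N_CTX,  #
                      BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr,  #
                      BLOCK_DMODEL: tl.constexpr, STAGE: tl.constexpr):
        # Same body as _attn_fwd; BLOCK_M/BLOCK_N arrive from the winning
        # triton.Config instead of the call site.
        ...

Because the grid lambda reads args['BLOCK_M'], the launch grid is recomputed for whichever block size the autotuner ends up selecting.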