Commit f2b4b6d

[Benchmarks] Add more variants in XeTLA FA implementation (#2309)
More shapes and causal support.
1 parent 3f81c35 commit f2b4b6d

3 files changed: 28 additions, 72 deletions


benchmarks/triton_kernels_benchmark/flash_attention_fwd_benchmark.py

Lines changed: 10 additions & 3 deletions
@@ -188,13 +188,19 @@ def forward(q, k, v, causal, sm_scale):
         # argument names to use as an x-axis for the plot
         x_names=['Z', 'H', 'N_CTX', 'D_HEAD'],
         x_vals=[  #
+            [1, 16, 16384, 128],  #
             [1, 32, 16384, 64],  #
+            [2, 16, 8192, 128],  #
             [2, 32, 8192, 64],  #
+            [4, 16, 4096, 128],  #
             [4, 32, 4096, 64],  #
             [4, 48, 1024, 64],  #
+            [8, 16, 2048, 128],  #
             [8, 32, 2048, 64],  #
+            [16, 16, 1024, 128],  #
             [16, 32, 1024, 64],  #
-            [32, 32, 512, 64]  #
+            [32, 16, 512, 128],  #
+            [32, 32, 512, 64],  #
         ],
         line_arg='provider',
         # argument name whose value corresponds to a different line in the plot
@@ -238,7 +244,8 @@ def benchmark(Z, H, N_CTX, D_HEAD, provider):
                                                               fast_flush=False)

     elif provider == 'xetla':
-        func = getattr(xetla_kernel, 'flash_attn')
+        module_name = f'flash_attn_causal_{causal}'.lower()
+        func = getattr(xetla_kernel, module_name)
         out = torch.empty_like(q, device='xpu', dtype=dtype)
         size_score = Z * H * N_CTX * N_CTX
         size_attn_mask = Z * N_CTX * N_CTX
@@ -248,7 +255,7 @@ def benchmark(Z, H, N_CTX, D_HEAD, provider):
         m = torch.empty((size_ml, ), device='xpu', dtype=torch.float)
         l = torch.empty((size_ml, ), device='xpu', dtype=torch.float)

-        xetla_fn = lambda: func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX)
+        xetla_fn = lambda: func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX, sm_scale)
         _, min_ms, max_ms, mean, cv = benchmark_suit.do_bench(xetla_fn, warmup=10, rep=10, quantiles=quantiles,
                                                               fast_flush=False)
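
For reference, a minimal sketch of what the new dispatch in the xetla branch resolves to. The shape row and the 1/sqrt(D_HEAD) choice of sm_scale are illustrative assumptions; the compiled xetla_kernel extension is not needed to follow the name resolution:

import math

# One of the newly added benchmark shapes (Z, H, N_CTX, D_HEAD); illustrative pick.
Z, H, N_CTX, D_HEAD = 1, 16, 16384, 128
causal = True

# A Python bool formats as 'True'/'False'; lower() matches the binding names
# registered in python_main.cpp (flash_attn_causal_true / flash_attn_causal_false).
module_name = f'flash_attn_causal_{causal}'.lower()
assert module_name == 'flash_attn_causal_true'

# sm_scale is now an explicit argument; the header previously derived it as
# rsqrt(head_size), so 1/sqrt(D_HEAD) is the natural value to pass (assumption).
sm_scale = 1.0 / math.sqrt(D_HEAD)
print(module_name, sm_scale)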

benchmarks/xetla_kernel/flash_attention/fmha_forward_v5.h

Lines changed: 6 additions & 61 deletions
@@ -29,37 +29,6 @@ namespace gpu::xetla {

 namespace fmha {

-struct Shape {
-  Shape(int B, int N, int F, int T, int H)
-      : num_batches(B), num_heads(N), num_queries(F), num_keys(T),
-        head_size(H) {}
-  const int num_batches;
-  const int num_heads;
-  const int num_queries;
-  const int num_keys;
-  const int head_size;
-
-  inline uint32_t get_query_size() const {
-    return num_batches * num_heads * num_queries * head_size;
-  }
-  inline uint32_t get_key_size() const {
-    return num_batches * num_heads * num_keys * head_size;
-  }
-  inline uint32_t get_score_size() const {
-    return num_batches * num_heads * num_queries * num_keys;
-  }
-  inline uint32_t get_ml_size() const {
-    return num_batches * num_heads * num_queries;
-  }
-  inline uint32_t get_attn_mask_size() const {
-#if _BIAS_AS_INPUT
-    return num_batches * num_heads * num_queries * num_keys;
-#else
-    return num_batches * num_queries * num_keys;
-#endif
-  }
-};
-
 template <typename fmha_policy, typename scalar_t, bool kUseBias,
           bool kIsCausal, bool kIsTraining>
 class fmha_forward_t {
@@ -620,46 +589,28 @@ class FmhaForwardKernel;
 // The launcher of fmha forward kernel
 template <typename fmha_policy, typename T, bool kUseBias = false,
           bool kIsCausal = false, bool kIsTraining = false>
-sycl::event fmha_forward_impl(sycl::queue &q, void *_q, void *_k, void *_v,
-                              void *_out, void *_dropout_mask, void *_bias,
-                              void *_m, void *_l, uint32_t num_batches,
-                              uint32_t num_heads, uint32_t head_size,
-                              uint32_t num_queries, uint32_t num_keys,
-                              uint64_t seed = 0, uint64_t offset = 123) {
-
-  Shape shape(num_batches, num_heads, num_queries, num_keys, head_size);
+sycl::event
+fmha_forward_impl(sycl::queue &q, void *_q, void *_k, void *_v, void *_out,
+                  void *_dropout_mask, void *_bias, void *_m, void *_l,
+                  uint32_t num_batches, uint32_t num_heads, uint32_t head_size,
+                  uint32_t num_queries, uint32_t num_keys, float head_scale,
+                  uint64_t seed = 0, uint64_t offset = 123) {

   constexpr bool use_mask = false;
   constexpr bool use_dropout = false;
   float dropout_prob = 0.0f;
   if constexpr (use_dropout)
     dropout_prob = 0.5f;
-  const float scale = 1 / (1 - dropout_prob);
-  const float head_scale = sycl::rsqrt(float(head_size));
-
-  uint32_t size_query = shape.get_query_size();
-  uint32_t size_key = shape.get_key_size();
-  uint32_t size_score = shape.get_score_size();
-  uint32_t size_attn_mask = shape.get_attn_mask_size();
-  uint32_t size_ml = shape.get_ml_size();

   // forward
-  // T *query = sycl::malloc_shared<T>(size_query, q);
-  // T *key = sycl::malloc_shared<T>(size_key, q);
-  // T *value = sycl::malloc_shared<T>(size_key, q);
   T *query = static_cast<T *>(_q);
   T *key = static_cast<T *>(_k);
   T *value = static_cast<T *>(_v);

-  // T *bias = sycl::malloc_shared<T>(size_attn_mask, q);
   T *bias = static_cast<T *>(_bias);
-  // uint8_t *dropout_mask = sycl::malloc_shared<uint8_t>(size_score, q);
   uint8_t *dropout_mask = static_cast<uint8_t *>(_dropout_mask);
-  // T *out = sycl::malloc_shared<T>(size_query, q);
   T *out = static_cast<T *>(_out);
-  // float *m = sycl::malloc_shared<float>(size_ml, q);
   float *m = static_cast<float *>(_m);
-  // float *l = sycl::malloc_shared<float>(size_ml, q);
   float *l = static_cast<float *>(_l);

   // fmha forward kernel
@@ -687,12 +638,6 @@ sycl::event fmha_forward_impl(sycl::queue &q, void *_q, void *_k, void *_v,
       fmha_fwd_op(ei, args);
     });
   });
-  // sycl::free(query, q);
-  // sycl::free(key, q);
-  // sycl::free(value, q);
-  // sycl::free(bias, q);
-  // sycl::free(dropout_mask, q);
-  // sycl::free(out, q);
   return event;
 }
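
With the Shape helper removed, the launcher no longer derives buffer sizes itself; the caller (here, the Python benchmark) sizes out, dropout_mask, bias, m and l up front. A minimal Python sketch of the equivalent bookkeeping, with the element-count formulas copied from the deleted get_*_size() methods (the function name and dict layout are illustrative, and bias_as_input mirrors the _BIAS_AS_INPUT branch):

def fmha_buffer_sizes(num_batches, num_heads, num_queries, num_keys, head_size,
                      bias_as_input=False):
    # Element counts matching the removed Shape::get_*_size() helpers.
    return {
        'query': num_batches * num_heads * num_queries * head_size,   # also out
        'key': num_batches * num_heads * num_keys * head_size,        # also value
        'score': num_batches * num_heads * num_queries * num_keys,    # dropout_mask
        'ml': num_batches * num_heads * num_queries,                  # m and l
        'attn_mask': (num_batches * num_heads * num_queries * num_keys
                      if bias_as_input
                      else num_batches * num_queries * num_keys),     # bias
    }

# Example with one of the new benchmark shapes: Z=1, H=16, N_CTX=16384, D_HEAD=128.
print(fmha_buffer_sizes(1, 16, 16384, 16384, 128))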

benchmarks/xetla_kernel/python_main.cpp

Lines changed: 12 additions & 8 deletions
@@ -94,11 +94,11 @@ at::Tensor bf16_stream_k_gemm(const at::Tensor &a, const at::Tensor &b,
   return acc;
 }

-#define CALL_IMPL_ATTENTION_FUNC(P) \
+#define CALL_IMPL_ATTENTION_FWD_FUNC(P) \
   fmha::fmha_forward_impl<P, T, use_mask, IsCausal, use_dropout>( \
       queue, q.data_ptr(), k.data_ptr(), v.data_ptr(), out.data_ptr(), \
       dropout_mask.data_ptr(), bias.data_ptr(), m.data_ptr(), l.data_ptr(), \
-      num_batches, num_heads, head_size, num_queries, num_keys)
+      num_batches, num_heads, head_size, num_queries, num_keys, head_scale)

 template <bool use_mask = false, bool IsCausal = false,
           bool use_dropout = false>
@@ -107,7 +107,8 @@ void flash_attn(const at::Tensor &q, const at::Tensor &k, const at::Tensor &v,
                 const at::Tensor &bias, const at::Tensor &m,
                 const at::Tensor &l, const int64_t num_batches,
                 const int64_t num_heads, const int64_t head_size,
-                const int64_t num_queries, const int64_t num_keys) {
+                const int64_t num_queries, const int64_t num_keys,
+                float head_scale) {

   CHECK_INPUT(q);
   CHECK_INPUT(k);
@@ -126,14 +127,14 @@ void flash_attn(const at::Tensor &q, const at::Tensor &k, const at::Tensor &v,

   sycl::event evt;
   if (head_size <= 64) {
-    evt = CALL_IMPL_ATTENTION_FUNC(fmha_policy_64x128x64);
+    evt = CALL_IMPL_ATTENTION_FWD_FUNC(fmha_policy_64x128x64);
   } else if (head_size <= 128) {
-    evt = CALL_IMPL_ATTENTION_FUNC(fmha_policy_64x128x128);
+    evt = CALL_IMPL_ATTENTION_FWD_FUNC(fmha_policy_64x128x128);
   } else if (head_size <= 25) {
     if (num_keys <= 256) {
-      evt = CALL_IMPL_ATTENTION_FUNC(fmha_policy_32x256x256);
+      evt = CALL_IMPL_ATTENTION_FWD_FUNC(fmha_policy_32x256x256);
     } else {
-      evt = CALL_IMPL_ATTENTION_FUNC(fmha_policy_64x512x256);
+      evt = CALL_IMPL_ATTENTION_FWD_FUNC(fmha_policy_64x512x256);
     }
   } else {
     std::cout << "No policy available for current head_size " << head_size
@@ -213,5 +214,8 @@ PYBIND11_MODULE(xetla_kernel, m) {
   m.def("gemm_shape_4096_8_16384_128",
         &bf16_gemm<Test_4096x8x16384x128_row_row>, "bf16_gemm (XeTLA)");
   // flash_attn
-  m.def("flash_attn", &flash_attn<false, false, false>, "flash attn (XeTLA)");
+  m.def("flash_attn_causal_false", &flash_attn<false, false, false>,
+        "flash attn fwd (XeTLA)");
+  m.def("flash_attn_causal_true", &flash_attn<false, true, false>,
+        "flash attn fwd (XeTLA)");
 }
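
Taken together, a hedged end-to-end sketch of how a caller might use the two new bindings. It mirrors the benchmark's xetla branch; the xetla_kernel extension and an XPU device are required at runtime, so the import is guarded, and the q layout plus the dropout_mask/bias dtypes are assumptions based on the casts in fmha_forward_impl:

import torch

def run_xetla_flash_attn(q, k, v, causal, sm_scale):
    """Illustrative wrapper around the new flash_attn_causal_{true,false} bindings."""
    try:
        import xetla_kernel  # pybind11 module built from python_main.cpp
    except ImportError:
        return None  # extension not built; this is a sketch, not the benchmark itself
    Z, H, N_CTX, D_HEAD = q.shape  # assumed [batch, heads, seq_len, head_dim] layout
    out = torch.empty_like(q, device='xpu', dtype=q.dtype)
    size_score = Z * H * N_CTX * N_CTX
    size_attn_mask = Z * N_CTX * N_CTX
    size_ml = Z * H * N_CTX
    dropout_mask = torch.empty((size_score, ), device='xpu', dtype=torch.uint8)  # uint8_t* in the kernel
    bias = torch.empty((size_attn_mask, ), device='xpu', dtype=q.dtype)          # T* in the kernel
    m = torch.empty((size_ml, ), device='xpu', dtype=torch.float)
    l = torch.empty((size_ml, ), device='xpu', dtype=torch.float)
    func = getattr(xetla_kernel, f'flash_attn_causal_{causal}'.lower())
    func(q, k, v, out, dropout_mask, bias, m, l, Z, H, D_HEAD, N_CTX, N_CTX, sm_scale)
    return out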
