
Commit ebd0368

Make Flash Attention work on Windows (#21015)
### Description

Previously, Flash Attention only worked on Linux systems. This PR enables it to be built and run on Windows as well.

Limitation of Flash Attention on Windows: requires CUDA 12.

### Motivation and Context

This significantly increases the performance of Windows-based LLMs on hardware with sm >= 80. To illustrate the improvement of Flash Attention over Memory Efficient Attention, here are average benchmark numbers for the GQA operator, run with configurations based on several recent models (Llama, Mixtral, Phi-3). The benchmarks were obtained on an RTX 4090 GPU using the test script at onnxruntime/test/python/transformers/benchmark_gqa_windows.py.

* Clarifying note: these benchmarks cover only the GQA operator, not the entire model.

### Memory Efficient Attention Kernel Benchmarks

| Model Name | Max Sequence Length | Inference Interval (ms) | Throughput (samples/second) |
|-------------------------------------|--------|-------------|-------------|
| Llama3-8B (Average Prompt)          | 8192   | 0.19790525  | 13105.63425 |
| Llama3-8B (Average Token)           | 8192   | 0.207775538 | 12025.10172 |
| Llama3-70B (Average Prompt)         | 8192   | 0.216049167 | 11563.31185 |
| Llama3-70B (Average Token)          | 8192   | 0.209730731 | 12284.38149 |
| Mixtral-8x22B-v0.1 (Average Prompt) | 32768  | 0.371928785 | 7031.440056 |
| Mixtral-8x22B-v0.1 (Average Token)  | 32768  | 0.2996659   | 7607.947159 |
| Phi-3-mini-128k (Average Prompt)    | 131072 | 0.183195867 | 15542.0852  |
| Phi-3-mini-128k (Average Token)     | 131072 | 0.198215688 | 12874.53494 |
| Phi-3-small-128k (Average Prompt)   | 65536  | 2.9884929   | 2332.584142 |
| Phi-3-small-128k (Average Token)    | 65536  | 0.845072406 | 2877.85822  |
| Phi-3-medium-128K (Average Prompt)  | 32768  | 0.324974429 | 8094.909517 |
| Phi-3-medium-128K (Average Token)   | 32768  | 0.263662567 | 8978.463687 |

### Flash Attention Kernel Benchmarks

| Model Name | Max Sequence Length | Inference Interval (ms) | Throughput (samples/second) |
|-------------------------------------|--------|-------------|-------------|
| Llama3-8B (Average Prompt)          | 8192   | 0.163566292 | 16213.69057 |
| Llama3-8B (Average Token)           | 8192   | 0.161643692 | 16196.14715 |
| Llama3-70B (Average Prompt)         | 8192   | 0.160510375 | 17448.67753 |
| Llama3-70B (Average Token)          | 8192   | 0.169427308 | 14702.62043 |
| Mixtral-8x22B-v0.1 (Average Prompt) | 32768  | 0.164121964 | 15618.51301 |
| Mixtral-8x22B-v0.1 (Average Token)  | 32768  | 0.1715865   | 14524.32273 |
| Phi-3-mini-128k (Average Prompt)    | 131072 | 0.167527167 | 14576.725   |
| Phi-3-mini-128k (Average Token)     | 131072 | 0.175940594 | 15762.051   |
| Phi-3-small-128k (Average Prompt)   | 65536  | 0.162719733 | 17824.494   |
| Phi-3-small-128k (Average Token)    | 65536  | 0.14977525  | 16749.19858 |
| Phi-3-medium-128K (Average Prompt)  | 32768  | 0.156490786 | 17679.2513  |
| Phi-3-medium-128K (Average Token)   | 32768  | 0.165333833 | 14932.26079 |

Flash Attention is consistently faster for every configuration we benchmarked, with improvements in our trials ranging from ~20% to ~650% (see the short recomputation after this description). In addition to these performance gains, Flash Attention has better memory usage: for example, Memory Efficient Attention cannot handle a max sequence length higher than 32,768, while Flash Attention handles max sequence lengths at least as high as 131,072.

---------

Co-authored-by: Tianlei Wu <[email protected]>
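As a quick sanity check of the quoted ~20% to ~650% range, the per-configuration improvement can be recomputed from the throughput columns of the two tables above. Below is a minimal Python sketch using two representative rows; the dictionaries simply restate table values and nothing here is part of the benchmark script.

```python
# Recompute Flash Attention's improvement over Memory Efficient Attention
# from the throughput columns of the tables above (two representative rows).
mea_throughput = {
    "Llama3-8B (Average Prompt)": 13105.63425,
    "Phi-3-small-128k (Average Prompt)": 2332.584142,
}
flash_throughput = {
    "Llama3-8B (Average Prompt)": 16213.69057,
    "Phi-3-small-128k (Average Prompt)": 17824.494,
}

for name, baseline in mea_throughput.items():
    gain = flash_throughput[name] / baseline - 1.0
    print(f"{name}: {gain:.0%} higher throughput with Flash Attention")
# Llama3-8B prompts: ~24% higher; Phi-3-small-128k prompts: ~664% higher,
# roughly the low and high ends of the range quoted above.
```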
1 parent 269d9b0 commit ebd0368

34 files changed: +1397 −862 lines

.github/workflows/lint.yml

Lines changed: 1 addition & 0 deletions

@@ -97,6 +97,7 @@ jobs:
           --exclude=java/src/main/native/*.c
           --exclude=onnxruntime/core/mlas/inc/*
           --exclude=onnxruntime/core/mlas/lib/*
+          --exclude=onnxruntime/contrib_ops/cuda/bert/flash_attention/*
         filter: "-runtime/references"

   lint-js:

.lintrunner.toml

Lines changed: 1 addition & 0 deletions

@@ -136,6 +136,7 @@ exclude_patterns = [
     'onnxruntime/core/mickey/cutlass_ext/**',  # CUTLASS based libs recommends NO automatic code formatting
     'onnxruntime/core/mickey/gemm/**',  # CUTLASS based libs recommends NO automatic code formatting
     'winml/lib/Api.Image/shaders/**',  # Contains data chunks
+    'onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h',  # Bool Switches hang Clang
 ]
 command = [
     'python',

cmake/CMakeLists.txt

Lines changed: 4 additions & 1 deletion

@@ -102,7 +102,7 @@ option(onnxruntime_USE_PREINSTALLED_EIGEN "Use pre-installed EIGEN. Need to prov
 option(onnxruntime_BUILD_BENCHMARKS "Build ONNXRuntime micro-benchmarks" OFF)
 option(onnxruntime_USE_LLVM "Build TVM with LLVM" OFF)

-cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "NOT WIN32; onnxruntime_USE_CUDA" OFF)
+cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON)

 option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)

@@ -734,6 +734,9 @@ if (onnxruntime_USE_CUDA)
     message( STATUS "Turn off flash attention since CUDA compiler version < 11.6")
     set(onnxruntime_USE_FLASH_ATTENTION OFF)
     set(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION OFF)
+  elseif(WIN32 AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 12)
+    message( STATUS "Flash-Attention unsupported in Windows with CUDA compiler version < 12.0")
+    set(onnxruntime_USE_FLASH_ATTENTION OFF)
   endif()
   if (CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.4)
     message( FATAL_ERROR "Failed build due to CUDA compiler version < 11.4")
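Taken together, the two CMake changes stop disabling flash attention outright on Windows and instead gate it purely on the CUDA toolkit version: the pre-existing < 11.6 check plus a new Windows-only < 12 check. A minimal Python sketch of the resulting decision logic, illustrative only and not part of the build system:

```python
def flash_attention_enabled(use_cuda: bool, is_windows: bool, cuda_version: tuple) -> bool:
    """Mirror of the CMake gating above: CUDA is required, CUDA >= 11.6 in general,
    and CUDA >= 12 when building on Windows."""
    if not use_cuda:
        return False  # cmake_dependent_option depends on onnxruntime_USE_CUDA
    if cuda_version < (11, 6):
        return False  # existing check (also turns off memory efficient attention)
    if is_windows and cuda_version < (12, 0):
        return False  # the new check added by this PR
    return True

assert flash_attention_enabled(True, is_windows=False, cuda_version=(11, 8))
assert not flash_attention_enabled(True, is_windows=True, cuda_version=(11, 8))
assert flash_attention_enabled(True, is_windows=True, cuda_version=(12, 2))
```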

onnxruntime/contrib_ops/cuda/bert/attention.cc

Lines changed: 1 addition & 1 deletion

@@ -145,7 +145,7 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
     auto [num_splits, slse_accum_bytes, o_accum_bytes] = onnxruntime::flash::get_num_splits_and_buffer_sizes(
         parameters.batch_size, parameters.sequence_length, parameters.kv_sequence_length, parameters.num_heads,
         parameters.head_size, device_prop.multiProcessorCount);
-    parameters.num_splits = num_splits;
+    parameters.num_splits = static_cast<int>(num_splits);
     softmax_lse_accum_bytes = slse_accum_bytes;
     out_accum_bytes = o_accum_bytes;
   }

onnxruntime/contrib_ops/cuda/bert/attention_impl.cu

Lines changed: 5 additions & 0 deletions

@@ -334,6 +334,11 @@ Status FlashAttention(
     contrib::AttentionParameters& parameters,
     AttentionData<float>& data,
     float scale) {
+  ORT_UNUSED_PARAMETER(device_prop);
+  ORT_UNUSED_PARAMETER(stream);
+  ORT_UNUSED_PARAMETER(parameters);
+  ORT_UNUSED_PARAMETER(data);
+  ORT_UNUSED_PARAMETER(scale);
   return ORT_MAKE_STATUS(ONNXRUNTIME, StatusCode::NOT_IMPLEMENTED, "flash attention does not support float tensor");
 }
 #endif

Lines changed: 67 additions & 0 deletions

@@ -0,0 +1,67 @@
+#include <cmath>
+#include <cute/tensor.hpp>
+#include <cutlass/cutlass.h>
+#include <cutlass/array.h>
+#include "utils.h"
+
+namespace onnxruntime {
+namespace flash {
+
+using namespace cute;
+
+////////////////////////////////////////////////////////////////////////////////////////////////////
+
+template <bool Is_causal>
+struct Alibi {
+  const float alibi_slope;
+  const int max_seqlen_k, max_seqlen_q;
+
+  __forceinline__ __device__ Alibi(const float alibi_slope, const int max_seqlen_k, const int max_seqlen_q)
+      : alibi_slope(alibi_slope), max_seqlen_k(max_seqlen_k), max_seqlen_q(max_seqlen_q){};
+
+  template <typename Engine, typename Layout>
+  __forceinline__ __device__ void apply_alibi(Tensor<Engine, Layout>& tensor,
+                                              const int col_idx_offset_,
+                                              const int row_idx_offset,
+                                              const int warp_row_stride) {
+    // tensor has shape (ncol=(2, MMA_M), nrow=(2, MMA_N))
+    static_assert(Layout::rank == 2, "Only support 2D Tensor");
+    const int lane_id = threadIdx.x % 32;
+    const int col_idx_offset = col_idx_offset_ + (lane_id % 4) * 2;
+    if constexpr (Is_causal) {  // Simpler, we add the same bias vector to all rows
+      #pragma unroll
+      for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+        const int col_idx_base = col_idx_offset + nj * 8;
+        #pragma unroll
+        for (int j = 0; j < size<1, 0>(tensor); ++j) {
+          const int col_idx = col_idx_base + j;
+          #pragma unroll
+          for (int mi = 0; mi < size<0>(tensor); ++mi) {
+            tensor(mi, make_coord(j, nj)) += alibi_slope * col_idx;
+          }
+        }
+      }
+    } else {  // Bias depends on both row_idx and col_idx
+      #pragma unroll
+      for (int mi = 0; mi < size<0, 1>(tensor); ++mi) {
+        const int row_idx_base = row_idx_offset + mi * warp_row_stride;
+        #pragma unroll
+        for (int i = 0; i < size<0, 0>(tensor); ++i) {
+          const int row_idx = row_idx_base + i * 8;
+          #pragma unroll
+          for (int nj = 0; nj < size<1, 1>(tensor); ++nj) {
+            const int col_idx_base = col_idx_offset + nj * 8;
+            #pragma unroll
+            for (int j = 0; j < size<1, 0>(tensor); ++j) {
+              const int col_idx = col_idx_base + j;
+              tensor(make_coord(i, mi), make_coord(j, nj)) -= alibi_slope * abs(row_idx + max_seqlen_k - max_seqlen_q - col_idx);
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+}  // namespace flash
+}  // namespace onnxruntime
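The new file above adds ALiBi positional bias inside the tiled kernel, where the per-thread MMA indexing hides a simple formula. Here is a hedged NumPy reference of the same bias at whole-matrix granularity; alibi_bias is an illustrative helper, not part of the kernel, and for causal attention the kernel's additive form differs from the non-causal one only by a per-row constant, which softmax ignores.

```python
import numpy as np

def alibi_bias(slope: float, seqlen_q: int, seqlen_k: int, causal: bool) -> np.ndarray:
    """Bias the Alibi struct adds to attention scores, written densely.

    causal=True:  bias[r, c] = slope * c                          (same vector for every row)
    causal=False: bias[r, c] = -slope * |r + seqlen_k - seqlen_q - c|
    """
    rows = np.arange(seqlen_q)[:, None]
    cols = np.arange(seqlen_k)[None, :]
    if causal:
        return (slope * np.broadcast_to(cols, (seqlen_q, seqlen_k))).astype(np.float32)
    return (-slope * np.abs(rows + seqlen_k - seqlen_q - cols)).astype(np.float32)

# Example: 4 query rows attending over 6 key columns with slope 0.5.
print(alibi_bias(0.5, seqlen_q=4, seqlen_k=6, causal=False))
```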

onnxruntime/contrib_ops/cuda/bert/flash_attention/block_info.h

Lines changed: 20 additions & 7 deletions

@@ -12,22 +12,36 @@ struct BlockInfo {
   template <typename Params>
   __device__ BlockInfo(const Params& params, const int bidb)
       : sum_s_q(!Varlen || params.cu_seqlens_q == nullptr ? -1 : params.cu_seqlens_q[bidb]),
-        sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative ? -1 : params.cu_seqlens_k[bidb]),
-        actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr ? params.seqlen_q : params.cu_seqlens_q[bidb + 1] - sum_s_q)
+        sum_s_k(!Varlen || params.cu_seqlens_k == nullptr || !params.is_seqlens_k_cumulative
+                    ? -1
+                    : params.cu_seqlens_k[bidb]),
+        actual_seqlen_q(!Varlen || params.cu_seqlens_q == nullptr
+                            ? params.seqlen_q
+                            : params.cu_seqlens_q[bidb + 1] - sum_s_q)
         // If is_seqlens_k_cumulative, then seqlen_k is cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb].
         // Otherwise it's cu_seqlens_k[bidb], i.e., we use cu_seqlens_k to store the sequence lengths of K.
         ,
-        seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr ? params.seqlen_k : (params.is_seqlens_k_cumulative ? params.cu_seqlens_k[bidb + 1] - sum_s_k : params.cu_seqlens_k[bidb])),
-        actual_seqlen_k(seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew)) {
+        seqlen_k_cache(!Varlen || params.cu_seqlens_k == nullptr
+                           ? params.seqlen_k
+                           : (params.is_seqlens_k_cumulative
+                                  ? params.cu_seqlens_k[bidb + 1] - sum_s_k
+                                  : params.cu_seqlens_k[bidb])),
+        actual_seqlen_k(params.seqused_k
+                            ? params.seqused_k[bidb]
+                            : seqlen_k_cache + (params.knew_ptr == nullptr ? 0 : params.seqlen_knew)) {
   }

   template <typename index_t>
-  inline __device__ index_t q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
+  __forceinline__ __device__
+      index_t
+      q_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
     return sum_s_q == -1 ? bidb * batch_stride : uint32_t(sum_s_q) * row_stride;
   }

   template <typename index_t>
-  inline __device__ index_t k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
+  __forceinline__ __device__
+      index_t
+      k_offset(const index_t batch_stride, const index_t row_stride, const int bidb) const {
     return sum_s_k == -1 ? bidb * batch_stride : uint32_t(sum_s_k) * row_stride;
   }


@@ -41,6 +55,5 @@ struct BlockInfo {

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-////////////////////////////////////////////////////////////////////////////////////////////////////
 }  // namespace flash
 }  // namespace onnxruntime
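The behavioral change in BlockInfo is the new seqused_k path: when the caller supplies per-batch "actually used" key lengths, they take precedence over both the cumulative-length and cached-length computations. A hedged Python sketch of the resolution order follows; the field names mirror the C++ above, but the function itself is illustrative.

```python
def resolve_actual_seqlen_k(
    varlen: bool,
    seqlen_k: int,                  # params.seqlen_k (fixed length when not varlen)
    cu_seqlens_k,                   # cumulative (or per-batch) lengths, or None
    is_seqlens_k_cumulative: bool,
    seqused_k,                      # optional per-batch "actually used" lengths, or None
    seqlen_knew: int,               # length of appended K_new; 0 if no new keys
    bidb: int,                      # batch index
) -> int:
    """Mirrors the seqlen_k_cache / actual_seqlen_k initializers above."""
    if not varlen or cu_seqlens_k is None:
        seqlen_k_cache = seqlen_k
    elif is_seqlens_k_cumulative:
        seqlen_k_cache = cu_seqlens_k[bidb + 1] - cu_seqlens_k[bidb]
    else:
        seqlen_k_cache = cu_seqlens_k[bidb]
    # New in this change: seqused_k, when supplied, overrides everything else.
    if seqused_k is not None:
        return seqused_k[bidb]
    return seqlen_k_cache + seqlen_knew

# Example: batch 0 has 7 cached keys and 1 appended key, but only 5 are marked as used.
print(resolve_actual_seqlen_k(True, 0, [0, 7, 12], True, [5, 9], 1, bidb=0))  # -> 5
```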

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash.h

Lines changed: 12 additions & 1 deletion

@@ -16,7 +16,7 @@ constexpr int D_DIM = 2;
 ////////////////////////////////////////////////////////////////////////////////////////////////////

 struct Qkv_params {
-  using index_t = uint32_t;
+  using index_t = int64_t;
   // The QKV matrices.
   void* __restrict__ q_ptr = nullptr;
   void* __restrict__ k_ptr = nullptr;

@@ -79,6 +79,9 @@ struct Flash_fwd_params : public Qkv_params {
   int* __restrict__ cu_seqlens_q = nullptr;
   int* __restrict__ cu_seqlens_k = nullptr;

+  // If provided, the actual length of each k sequence.
+  int* __restrict__ seqused_k = nullptr;
+
   int* __restrict__ blockmask = nullptr;

   // The K_new and V_new matrices.

@@ -100,6 +103,11 @@ struct Flash_fwd_params : public Qkv_params {
   // The indices to index into the KV cache.
   int* __restrict__ cache_batch_idx = nullptr;

+  // Paged KV cache
+  int* __restrict__ block_table = nullptr;
+  index_t block_table_batch_stride = 0;
+  int page_block_size = 0;
+
   // Local window size
   int window_size_left = -1;
   int window_size_right = -1;

@@ -115,6 +123,9 @@ struct Flash_fwd_params : public Qkv_params {

   int num_splits = 0;  // For split-KV version

+  void* __restrict__ alibi_slopes_ptr = nullptr;
+  index_t alibi_slopes_batch_stride = 0;
+
   const cudaDeviceProp* dprops = nullptr;
 };

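Besides widening index_t from uint32_t to int64_t (presumably so batch and row strides cannot overflow 32 bits for long sequences), the new fields above describe an optional paged KV cache: block_table holds one row per batch entry, spaced block_table_batch_stride apart, mapping each logical KV page of page_block_size tokens to a physical page. A hedged Python sketch of the addressing these fields imply, illustrative only and not the kernel's actual code:

```python
def paged_kv_location(block_table, block_table_batch_stride: int,
                      page_block_size: int, bidb: int, token_idx: int):
    """Map (batch index, logical token position) to (physical page, offset within page).

    block_table is a flat int array; row `bidb` starts at
    bidb * block_table_batch_stride and lists the physical page id
    for each logical page of that sequence's KV cache.
    """
    logical_page, offset_in_page = divmod(token_idx, page_block_size)
    physical_page = block_table[bidb * block_table_batch_stride + logical_page]
    return physical_page, offset_in_page

# Example: batch 1's pages are [7, 2, 9]; with 16-token pages, token 35 of that
# sequence lives at offset 3 inside physical page 9.
table = [4, 0, 5,   # batch 0
         7, 2, 9]   # batch 1
print(paged_kv_location(table, block_table_batch_stride=3, page_block_size=16,
                        bidb=1, token_idx=35))  # -> (9, 3)
```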