Commit 8301eea

Attention CUDA BFloat16 Support (#25974)
### Description

Attention BFloat16 support for CUDA: extends the kernel implementations to accept BF16 input/output tensors.

### Motivation and Context

We already have BFloat16 support for GQA (Group Query Attention), but not for the regular Attention operator, which many models (e.g. the visual encoder of Gemma 3) need for inference, since BF16 provides FP32-like numerical stability at lower memory and compute cost.

---------

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
1 parent d530b29 commit 8301eea

23 files changed: +529 -34 lines changed
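As a quick usage sketch (not part of this commit): the snippet below shows how an application might feed bfloat16 inputs to a model that contains the com.microsoft Attention operator, running on the CUDA execution provider. The model path, input shape, and tensor names are placeholders; the calls are the public ONNX Runtime C++ API.

```cpp
// Hypothetical usage sketch: run a model with BF16 Attention on the CUDA EP.
// Model path, shape, and tensor names below are placeholders.
#include <onnxruntime_cxx_api.h>
#include <cstdint>
#include <vector>

int main() {
  Ort::Env env(ORT_LOGGING_LEVEL_WARNING, "bf16_attention_demo");
  Ort::SessionOptions options;
  OrtCUDAProviderOptions cuda_options{};               // device 0, default settings
  options.AppendExecutionProvider_CUDA(cuda_options);  // BF16 Attention runs on the CUDA EP
  Ort::Session session(env, "model_with_bf16_attention.onnx", options);

  // A [batch, sequence, hidden] input holding raw bfloat16 bit patterns.
  const std::vector<int64_t> shape{1, 128, 768};
  std::vector<uint16_t> data(1 * 128 * 768, 0);
  Ort::MemoryInfo mem_info = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
  Ort::Value input = Ort::Value::CreateTensor(
      mem_info, data.data(), data.size() * sizeof(uint16_t),
      shape.data(), shape.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_BFLOAT16);

  const char* input_names[] = {"input"};    // placeholder names
  const char* output_names[] = {"output"};
  auto outputs = session.Run(Ort::RunOptions{nullptr}, input_names, &input, 1, output_names, 1);
  return outputs.empty() ? 1 : 0;
}
```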

docs/ContribOperators.md
Lines changed: 1 addition & 1 deletion

@@ -199,7 +199,7 @@ This version of the operator has been available since version 1 of the 'com.micr
 #### Type Constraints
 
 <dl>
-<dt><tt>T</tt> : tensor(float), tensor(float16)</dt>
+<dt><tt>T</tt> : tensor(float), tensor(float16), tensor(bfloat16)</dt>
 <dd>Constrain input and output types to float tensors.</dd>
 <dt><tt>M</tt> : tensor(int32)</dt>
 <dd>Constrain mask index to integer types</dd>
docs/OperatorKernels.md
Lines changed: 1 addition & 1 deletion

@@ -940,7 +940,7 @@ Do not modify directly.*
 | |
 | |
 |**Operator Domain:** *com.microsoft*||||
-|Attention|*in* input:**T**<br> *in* weights:**T**<br> *in* bias:**T**<br> *in* mask_index:**M**<br> *in* past:**T**<br> *in* attention_bias:**T**<br> *in* past_sequence_length:**M**<br> *out* output:**T**<br> *out* present:**T**|1+|**T** = tensor(float), tensor(float16)|
+|Attention|*in* input:**T**<br> *in* weights:**T**<br> *in* bias:**T**<br> *in* mask_index:**M**<br> *in* past:**T**<br> *in* attention_bias:**T**<br> *in* past_sequence_length:**M**<br> *out* output:**T**<br> *out* present:**T**|1+|**T** = tensor(bfloat16), tensor(float), tensor(float16)|
 |BeamSearch|*in* input_ids:**F**<br> *in* max_length:**I**<br> *in* min_length:**I**<br> *in* num_beams:**I**<br> *in* num_return_sequences:**I**<br> *in* length_penalty:**T**<br> *in* repetition_penalty:**T**<br> *in* vocab_mask:**M**<br> *in* prefix_vocab_mask:**M**<br> *in* attention_mask:**I**<br> *in* decoder_input_ids:**I**<br> *in* logits_processor:**I**<br> *out* sequences:**I**<br> *out* sequences_scores:**T**<br> *out* scores:**T**|1+|**T** = tensor(float), tensor(float16)|
 |BiasAdd|*in* X:**T**<br> *in* bias:**T**<br> *in* skip:**T**<br> *out* Y:**T**|1+|**T** = tensor(float), tensor(float16)|
 |BiasDropout|*in* data:**T**<br> *in* bias:**T**<br> *in* residual:**T**<br> *in* ratio:**T1**<br> *in* training_mode:**T2**<br> *out* output:**T**<br> *out* mask:**T2**|1+|**T** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **T1** = tensor(bfloat16), tensor(double), tensor(float), tensor(float16)<br/> **T2** = tensor(bool)|
onnxruntime/contrib_ops/cuda/bert/add_bias_transpose.cu
Lines changed: 104 additions & 0 deletions

@@ -794,6 +794,39 @@ void LaunchAddBiasTranspose(
   }
 }
 
+template <>
+void LaunchAddBiasTranspose<BFloat16>(
+    cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block,
+    const int batch_size, const int sequence_length, const int num_heads, const int qk_head_size,
+    const BFloat16* input, const BFloat16* biases, BFloat16* output,
+    bool /*enable_half4*/, const int v_head_size,
+    BFloat16* qkv_add_bias, int total_matrix_count,
+    bool do_rotary, int rotary_embedding, int past_sequence_length) {
+  total_matrix_count = std::max(num_matrices, total_matrix_count);
+  if (0 == (qk_head_size & 1) && (v_head_size == -1 || 0 == (v_head_size & 1)) && !do_rotary) {
+    const int H = qk_head_size / 2;
+    const int H_v = v_head_size / 2;
+
+    const __nv_bfloat162* input2 = reinterpret_cast<const __nv_bfloat162*>(input);
+    const __nv_bfloat162* biases2 = reinterpret_cast<const __nv_bfloat162*>(biases);
+    __nv_bfloat162* output2 = reinterpret_cast<__nv_bfloat162*>(output);
+    __nv_bfloat162* qkv_add_bias2 = reinterpret_cast<__nv_bfloat162*>(qkv_add_bias);
+
+    InvokeAddBiasTranspose<__nv_bfloat162>(
+        stream, num_matrices, format, max_threads_per_block,
+        batch_size, sequence_length, num_heads, H,
+        input2, biases2, output2, qkv_add_bias2,
+        H_v, total_matrix_count);
+  } else {
+    InvokeAddBiasTranspose<BFloat16>(
+        stream, num_matrices, format, max_threads_per_block,
+        batch_size, sequence_length, num_heads, qk_head_size,
+        input, biases, output,
+        qkv_add_bias, v_head_size, total_matrix_count,
+        do_rotary, rotary_embedding, past_sequence_length);
+  }
+}
+
 template <>
 void LaunchAddBiasTranspose(
     cudaStream_t stream, const int num_matrices, const int format, const int max_threads_per_block,

@@ -888,6 +921,20 @@ void LaunchAddBiasTransposeTrt(
   ORT_ENFORCE(false, "Shall not call this since fused kernel does not support float input.");
 }
 
+template <>
+void LaunchAddBiasTransposeTrt<BFloat16>(
+    cudaStream_t /*stream*/, const int /*max_threads_per_block*/,
+    const int /*batch_size*/, const int /*sequence_length*/,
+    const int /*num_heads*/, const int /*head_size*/,
+    const BFloat16* /*biases*/,
+    const BFloat16* /*query*/,
+    const BFloat16* /*key*/,
+    const BFloat16* /*value*/,
+    BFloat16* /*output*/,
+    bool /*is_cross_attention*/, int /*kv_sequence_length*/) {
+  ORT_ENFORCE(false, "BF16 not supported for LaunchAddBiasTransposeTrt.");
+}
+
 template <>
 void LaunchAddBiasTransposeTrt(
     cudaStream_t stream, const int max_threads_per_block,

@@ -1049,6 +1096,38 @@ void LaunchAddBias(
   }
 }
 
+template <>
+void LaunchAddBias<BFloat16>(
+    cudaStream_t stream, const int max_threads_per_block,
+    const int batch_size, const int sequence_length, const int kv_sequence_length,
+    const int num_heads, const int head_size, const int v_head_size,
+    const BFloat16* biases, const BFloat16* query, const BFloat16* key, const BFloat16* value,
+    BFloat16* q, BFloat16* k, BFloat16* v) {
+  if (0 == (head_size & 1) && 0 == (v_head_size & 1)) {
+    const int H = head_size / 2;
+    const int H_v = v_head_size / 2;
+    const __nv_bfloat162* query2 = reinterpret_cast<const __nv_bfloat162*>(query);
+    const __nv_bfloat162* key2 = reinterpret_cast<const __nv_bfloat162*>(key);
+    const __nv_bfloat162* value2 = reinterpret_cast<const __nv_bfloat162*>(value);
+    const __nv_bfloat162* biases2 = reinterpret_cast<const __nv_bfloat162*>(biases);
+    __nv_bfloat162* q2 = reinterpret_cast<__nv_bfloat162*>(q);
+    __nv_bfloat162* k2 = reinterpret_cast<__nv_bfloat162*>(k);
+    __nv_bfloat162* v2 = reinterpret_cast<__nv_bfloat162*>(v);
+
+    InvokeAddBias<__nv_bfloat162>(
+        stream, max_threads_per_block,
+        batch_size, sequence_length, kv_sequence_length, num_heads, H, H_v,
+        biases2, query2, key2, value2, q2, k2, v2);
+
+  } else {
+    InvokeAddBias<BFloat16>(
+        stream, max_threads_per_block,
+        batch_size, sequence_length, kv_sequence_length, num_heads,
+        head_size, v_head_size,
+        biases, query, key, value, q, k, v);
+  }
+}
+
 template <typename T>
 void InvokeAddBias(
     cudaStream_t stream, const int max_threads_per_block,

@@ -1125,6 +1204,31 @@ void LaunchAddBias(
   }
 }
 
+template <>
+void LaunchAddBias<BFloat16>(
+    cudaStream_t stream, const int max_threads_per_block,
+    const int batch_size, const int sequence_length,
+    const int num_heads, const int head_size,
+    const BFloat16* biases, const BFloat16* query, BFloat16* q) {
+  if (0 == (head_size & 1)) {
+    const int H = head_size / 2;
+    const __nv_bfloat162* query2 = reinterpret_cast<const __nv_bfloat162*>(query);
+    const __nv_bfloat162* biases2 = reinterpret_cast<const __nv_bfloat162*>(biases);
+    __nv_bfloat162* q2 = reinterpret_cast<__nv_bfloat162*>(q);
+
+    InvokeAddBias<__nv_bfloat162>(
+        stream, max_threads_per_block,
+        batch_size, sequence_length, num_heads, H,
+        biases2, query2, q2);
+
+  } else {
+    InvokeAddBias<BFloat16>(
+        stream, max_threads_per_block,
+        batch_size, sequence_length, num_heads, head_size,
+        biases, query, q);
+  }
+}
+
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
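The specializations above all follow the same pattern: when the head size is even, the BFloat16 buffers are reinterpreted as packed __nv_bfloat162 pairs so each thread handles two values at once; otherwise the scalar path is used. Below is a minimal, standalone sketch of that idea (illustrative kernel and launcher names, not the InvokeAddBiasTranspose / InvokeAddBias kernels from this file).

```cuda
// Minimal sketch of the BF16x2 vectorization pattern (illustrative names, not ORT code).
// Assumes sm_80+ and width <= 2048 so a single block covers one row.
#include <cuda_bf16.h>

// Adds a per-column bias to a [rows, 2 * half_width] BF16 matrix, two elements per thread.
__global__ void AddBiasBf16x2(const __nv_bfloat162* input,
                              const __nv_bfloat162* bias,
                              __nv_bfloat162* output,
                              int half_width) {
  const int row = blockIdx.x;
  const int col = threadIdx.x;  // indexes a pair of adjacent bfloat16 values
  if (col < half_width) {
    const int idx = row * half_width + col;
    output[idx] = __hadd2(input[idx], bias[col]);  // packed bfloat16x2 add
  }
}

// Host-side dispatch mirroring the "even head size" check used above.
void LaunchAddBiasExample(cudaStream_t stream, const __nv_bfloat16* input,
                          const __nv_bfloat16* bias, __nv_bfloat16* output,
                          int rows, int width) {
  if ((width & 1) == 0) {
    const int half_width = width / 2;
    AddBiasBf16x2<<<rows, half_width, 0, stream>>>(
        reinterpret_cast<const __nv_bfloat162*>(input),
        reinterpret_cast<const __nv_bfloat162*>(bias),
        reinterpret_cast<__nv_bfloat162*>(output),
        half_width);
  }
  // An odd width would fall back to an element-wise (__nv_bfloat16) kernel, as above.
}
```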

onnxruntime/contrib_ops/cuda/bert/attention.cc
Lines changed: 10 additions & 6 deletions

@@ -36,20 +36,24 @@ constexpr int kPresentOutputIndex = 1;
 
 REGISTER_KERNEL_TYPED(float)
 REGISTER_KERNEL_TYPED(MLFloat16)
+REGISTER_KERNEL_TYPED(BFloat16)
 
 template <typename T>
 Attention<T>::Attention(const OpKernelInfo& info) : CudaKernel(info), AttentionBase(info, false) {
   kernel_options_ = this->GetAttentionKernelOptions();
 
-  disable_fused_self_attention_ = sizeof(T) != 2 || !kernel_options_->UseTrtFusedAttention();
+  constexpr bool kIsFp16 = std::is_same<T, MLFloat16>::value;
+  constexpr bool kIsBf16 = std::is_same<T, BFloat16>::value;
+  constexpr bool kIs16bit = kIsFp16 || kIsBf16;
 
-  enable_trt_flash_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtFlashAttention();
+  // We only support FP16 for TRT fused/flash/causal attention.
+  disable_fused_self_attention_ = !kIsFp16 || !kernel_options_->UseTrtFusedAttention();
+  enable_trt_flash_attention_ = kIsFp16 && kernel_options_->UseTrtFlashAttention();
+  enable_fused_causal_attention_ = kIsFp16 && kernel_options_->UseTrtCausalAttention();
 
-  enable_fused_causal_attention_ = sizeof(T) == 2 && kernel_options_->UseTrtCausalAttention();
+  disable_memory_efficient_attention_ = kIsBf16 || !kernel_options_->UseEfficientAttention();
 
-  disable_memory_efficient_attention_ = !kernel_options_->UseEfficientAttention();
-
-  disable_flash_attention_ = sizeof(T) != 2 || !kernel_options_->UseFlashAttention();
+  disable_flash_attention_ = !kIs16bit || !kernel_options_->UseFlashAttention();
 }
 
 template <typename T>
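The net effect of the constructor changes: TRT fused, TRT flash, and fused causal attention remain FP16-only, the memory-efficient path is disabled for BF16, and flash attention accepts either 16-bit float type. A self-contained sketch of that compile-time gating follows (the MLFloat16/BFloat16 structs below are stand-ins, not the ORT types):

```cpp
// Self-contained sketch of the type-based gating above (stand-in types, not ORT's).
#include <cstdint>
#include <type_traits>

struct MLFloat16 { uint16_t val; };  // stand-in for onnxruntime::MLFloat16
struct BFloat16 { uint16_t val; };   // stand-in for onnxruntime::BFloat16

template <typename T>
struct AttentionKernelSupport {
  static constexpr bool kIsFp16 = std::is_same<T, MLFloat16>::value;
  static constexpr bool kIsBf16 = std::is_same<T, BFloat16>::value;
  static constexpr bool kIs16bit = kIsFp16 || kIsBf16;

  static constexpr bool trt_fused = kIsFp16;          // TRT fused/flash/causal: FP16 only
  static constexpr bool memory_efficient = !kIsBf16;  // memory-efficient attention: no BF16
  static constexpr bool flash = kIs16bit;             // flash attention: FP16 or BF16
};

static_assert(!AttentionKernelSupport<BFloat16>::trt_fused, "BF16 never uses TRT fused kernels");
static_assert(!AttentionKernelSupport<BFloat16>::memory_efficient, "BF16 skips memory-efficient path");
static_assert(AttentionKernelSupport<BFloat16>::flash, "BF16 may use flash attention");
static_assert(!AttentionKernelSupport<float>::flash, "FP32 never uses flash attention");
```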

onnxruntime/contrib_ops/cuda/bert/attention_impl.cu
Lines changed: 17 additions & 0 deletions

@@ -952,6 +952,13 @@ Status QkvToContext(
     Stream* ort_stream,
     contrib::AttentionParameters& parameters,
     AttentionData<T>& data) {
+  if constexpr (std::is_same<T, BFloat16>::value || std::is_same<QK, BFloat16>::value) {
+    if (device_prop.major < 8) {
+      ORT_THROW("BF16 Attention requires Ampere (sm_80)+ with BF16 support. This GPU (",
+                device_prop.name, ", cc ", device_prop.major, ".", device_prop.minor, ") is not supported.");
+    }
+  }
+
   auto stream = static_cast<cudaStream_t>(ort_stream->GetHandle());
   const int max_threads_per_block = device_prop.maxThreadsPerBlock;
   const int batch_size = parameters.batch_size;

@@ -1040,6 +1047,8 @@ template struct AttentionData<float>;
 
 template struct AttentionData<half>;
 
+template struct AttentionData<BFloat16>;
+
 template Status QkvToContext<float>(
     const cudaDeviceProp& device_prop,
     cublasHandle_t& cublas,

@@ -1056,6 +1065,14 @@ template Status QkvToContext<half>(
     contrib::AttentionParameters& parameters,
     AttentionData<half>& data);
 
+template Status QkvToContext<BFloat16>(
+    const cudaDeviceProp& device_prop,
+    cublasHandle_t& cublas,
+    cudnnHandle_t& cudnn,
+    Stream* ort_stream,
+    contrib::AttentionParameters& parameters,
+    AttentionData<BFloat16>& data);
+
 template Status QkvToContext<float, half>(
     const cudaDeviceProp& device_prop,
     cublasHandle_t& cublas,
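The new guard at the top of QkvToContext rejects BF16 on pre-Ampere GPUs. A standalone version of the same check using the CUDA runtime API directly (ORT already has the cudaDeviceProp in hand, so this helper is illustrative only):

```cpp
// Standalone sketch of the sm_80 guard above, using the CUDA runtime API directly.
#include <cuda_runtime.h>
#include <cstdio>

bool DeviceSupportsBf16Attention(int device_id) {
  cudaDeviceProp prop{};
  if (cudaGetDeviceProperties(&prop, device_id) != cudaSuccess) {
    return false;
  }
  // Packed __nv_bfloat162 math and BF16 tensor cores require compute capability 8.0+.
  return prop.major >= 8;
}

int main() {
  std::printf("BF16 Attention supported on device 0: %s\n",
              DeviceSupportsBf16Attention(0) ? "yes" : "no");
  return 0;
}
```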

onnxruntime/contrib_ops/cuda/bert/attention_impl.h
Lines changed: 13 additions & 0 deletions

@@ -3,6 +3,7 @@
 
 #pragma once
 
+#include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include <cublas_v2.h>
 #include <gsl/gsl>

@@ -96,6 +97,10 @@ Status LaunchTransCtx(cudaStream_t stream,
                       const int sequence_length, const int batch_size, const int head_size, const int num_heads,
                       const int max_threads_per_block, const bool reversed_bs, const half* input, half* output);
 
+Status LaunchTransCtx(cudaStream_t stream,
+                      const int sequence_length, const int batch_size, const int head_size, const int num_heads,
+                      const int max_threads_per_block, const bool reversed_bs, const BFloat16* input, BFloat16* output);
+
 // BxSxMxNxH or SxBxMxNxH (reversed_bs is true) => MxBxNxSxH
 Status LaunchTransQkv(cudaStream_t stream, const int matrix_num,
                       const int sequence_length, const int batch_size, const int head_size, const int num_heads,

@@ -107,12 +112,20 @@ Status LaunchTransQkv(cudaStream_t stream, const int matrix_num,
                       const int max_threads_per_block, const bool reversed_bs, const half* input, half* output,
                       int total_matrix_count = -1);
 
+Status LaunchTransQkv(cudaStream_t stream, const int matrix_num,
+                      const int sequence_length, const int batch_size, const int head_size, const int num_heads,
+                      const int max_threads_per_block, const bool reversed_bs, const BFloat16* input, BFloat16* output,
+                      int total_matrix_count = -1);
+
 Status Transpose_BSNH_to_BNSH(const int batch_size, const int sequence_length, const int num_heads, const int head_size,
                               const float* input, float* output, cudaStream_t stream, const int max_threads_per_block);
 
 Status Transpose_BSNH_to_BNSH(const int batch_size, const int sequence_length, const int num_heads, const int head_size,
                               const half* input, half* output, cudaStream_t stream, const int max_threads_per_block);
 
+Status Transpose_BSNH_to_BNSH(const int batch_size, const int sequence_length, const int num_heads, const int head_size,
+                              const BFloat16* input, BFloat16* output, cudaStream_t stream, const int max_threads_per_block);
+
 template <typename T>
 Status ConcatPastToPresent(int batch_size, int num_heads, int qk_head_size, int v_head_size,
                            int sequence_length, int total_sequence_length,
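The new BFloat16 overloads mirror the existing float/half declarations. For reference, Transpose_BSNH_to_BNSH reorders a [batch, sequence, num_heads, head_size] buffer into [batch, num_heads, sequence, head_size]; here is a CPU sketch of that index mapping (for clarity only; the CUDA kernels perform the same remapping in parallel):

```cpp
// CPU reference for the BSNH -> BNSH layout change performed by Transpose_BSNH_to_BNSH.
#include <cstddef>
#include <vector>

template <typename T>
void TransposeBsnhToBnsh(int batch, int seq, int num_heads, int head_size,
                         const std::vector<T>& input, std::vector<T>& output) {
  output.resize(input.size());
  for (int b = 0; b < batch; ++b) {
    for (int s = 0; s < seq; ++s) {
      for (int n = 0; n < num_heads; ++n) {
        for (int h = 0; h < head_size; ++h) {
          const std::size_t src = ((static_cast<std::size_t>(b) * seq + s) * num_heads + n) * head_size + h;
          const std::size_t dst = ((static_cast<std::size_t>(b) * num_heads + n) * seq + s) * head_size + h;
          output[dst] = input[src];
        }
      }
    }
  }
}
```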

onnxruntime/contrib_ops/cuda/bert/attention_kv_cache.cu
Lines changed: 65 additions & 0 deletions

@@ -197,6 +197,59 @@ Status LaunchConcatTensorToTensor(cudaStream_t stream,
   return CUDA_CALL(cudaGetLastError());
 }
 
+Status LaunchConcatTensorToTensor(cudaStream_t stream,
+                                  const int all_sequence_length,
+                                  const int sequence_length,
+                                  const int batch_size,
+                                  const int head_size,
+                                  const int num_heads,
+                                  const int max_threads_per_block,
+                                  const int matrix_num,
+                                  const BFloat16* tensor_in,
+                                  const BFloat16* tensor_add,
+                                  BFloat16* tensor_out) {
+  assert(num_heads <= max_threads_per_block);
+  const dim3 grid(all_sequence_length, batch_size, matrix_num);
+  if (0 == (head_size & 1)) {
+    const int H = head_size / 2;
+    if (H * num_heads <= max_threads_per_block) {
+      const dim3 block(H, num_heads, 1);
+      ConcatTensorToTensor<__nv_bfloat162><<<grid, block, 0, stream>>>(
+          sequence_length,
+          reinterpret_cast<const __nv_bfloat162*>(tensor_in),
+          reinterpret_cast<const __nv_bfloat162*>(tensor_add),
+          reinterpret_cast<__nv_bfloat162*>(tensor_out));
+    } else {
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<__nv_bfloat162><<<grid, block, 0, stream>>>(
+          sequence_length,
+          H,
+          reinterpret_cast<const __nv_bfloat162*>(tensor_in),
+          reinterpret_cast<const __nv_bfloat162*>(tensor_add),
+          reinterpret_cast<__nv_bfloat162*>(tensor_out));
+    }
+  } else {
+    if (head_size * num_heads <= max_threads_per_block) {
+      const dim3 block(head_size, num_heads, 1);
+      ConcatTensorToTensor<__nv_bfloat16><<<grid, block, 0, stream>>>(
+          sequence_length,
+          reinterpret_cast<const __nv_bfloat16*>(tensor_in),
+          reinterpret_cast<const __nv_bfloat16*>(tensor_add),
+          reinterpret_cast<__nv_bfloat16*>(tensor_out));
+    } else {
+      const dim3 block(max_threads_per_block / num_heads, num_heads, 1);
+      ConcatTensorToTensorLarge<__nv_bfloat16><<<grid, block, 0, stream>>>(
+          sequence_length,
+          head_size,
+          reinterpret_cast<const __nv_bfloat16*>(tensor_in),
+          reinterpret_cast<const __nv_bfloat16*>(tensor_add),
+          reinterpret_cast<__nv_bfloat16*>(tensor_out));
+    }
+  }
+
+  return CUDA_CALL(cudaGetLastError());
+}
+
 #ifndef USE_ROCM  // exclude the following from hipify since they are not used in ROCM EP
 
 // ----------------------------------------------------------------------------------

@@ -332,6 +385,18 @@ template Status LaunchAddBiasTransAppendKvToPresent(cudaStream_t stream,
                                                     const half* bias,
                                                     const half* qkv_buffer,
                                                     half* present);
+
+template Status LaunchAddBiasTransAppendKvToPresent(cudaStream_t stream,
+                                                    const int max_sequence_length,
+                                                    const int total_sequence_length,
+                                                    const int sequence_length,
+                                                    const int batch_size,
+                                                    const int head_size,
+                                                    const int num_heads,
+                                                    const int max_threads_per_block,
+                                                    const BFloat16* bias,
+                                                    const BFloat16* qkv_buffer,
+                                                    BFloat16* present);
 #endif
 
 // Kernel to append new and past kv in either BSNH or BNSH format
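The BF16 overload of LaunchConcatTensorToTensor keeps the existing launch strategy: a grid over (all_sequence_length, batch_size, matrix_num), one block covering a sequence position across all heads, 2x vectorization when head_size is even, and the "Large" kernel variant when head_size times num_heads exceeds max_threads_per_block. That host-side selection logic, isolated as a sketch (names are illustrative, not ORT code):

```cpp
// Host-side sketch of the block-size selection above (illustrative names, no ORT types).
struct ConcatLaunchChoice {
  int block_x;         // threads along the head dimension (element pairs when vectorized)
  int block_y;         // one thread row per head
  bool vectorized;     // reinterpret BF16 pairs as __nv_bfloat162
  bool large_variant;  // each thread loops when one block cannot cover head_size * num_heads
};

ConcatLaunchChoice ChooseConcatLaunch(int head_size, int num_heads, int max_threads_per_block) {
  ConcatLaunchChoice c{};
  c.vectorized = (head_size % 2 == 0);
  const int width = c.vectorized ? head_size / 2 : head_size;
  c.block_y = num_heads;
  if (width * num_heads <= max_threads_per_block) {
    c.block_x = width;
    c.large_variant = false;
  } else {
    c.block_x = max_threads_per_block / num_heads;
    c.large_variant = true;
  }
  return c;
}
```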

onnxruntime/contrib_ops/cuda/bert/attention_kv_cache.h
Lines changed: 13 additions & 0 deletions

@@ -3,6 +3,7 @@
 
 #pragma once
 #include "core/providers/cuda/shared_inc/cuda_utils.h"
+#include <cuda_bf16.h>
 #include <cuda_fp16.h>
 #include "core/framework/allocator.h"
 #include "core/providers/cuda/cuda_common.h"

@@ -38,6 +39,18 @@ Status LaunchConcatTensorToTensor(cudaStream_t stream,
                                   const half* tensor_add,
                                   half* tensor_out);
 
+Status LaunchConcatTensorToTensor(cudaStream_t stream,
+                                  const int all_sequence_length,
+                                  const int sequence_length,
+                                  const int batch_size,
+                                  const int head_size,
+                                  const int num_heads,
+                                  const int max_threads_per_block,
+                                  const int matrix_num,
+                                  const BFloat16* tensor_in,
+                                  const BFloat16* tensor_add,
+                                  BFloat16* tensor_out);
+
 template <typename T>
 Status LaunchAddBiasTransAppendKvToPresent(cudaStream_t stream,
                                            const int max_sequence_length,

onnxruntime/contrib_ops/cuda/bert/attention_prepare_qkv.cu
Lines changed: 10 additions & 2 deletions

@@ -744,9 +744,11 @@ Status PrepareQkv(contrib::AttentionParameters& parameters,
 #endif
 
   if (nullptr != data.gemm_buffer) {  // Attention operator
-    ORT_RETURN_IF_ERROR(PrepareQkv_Attention<T>(parameters, data, stream, max_threads_per_block));
+    ORT_RETURN_IF_ERROR(PrepareQkv_Attention<T>(
+        parameters, data, stream, max_threads_per_block));
   } else {  // MultiHeadAttention operator
-    ORT_RETURN_IF_ERROR(PrepareQkv_MultiHeadAttention<T>(parameters, data, stream, max_threads_per_block));
+    ORT_RETURN_IF_ERROR(PrepareQkv_MultiHeadAttention<T>(
+        parameters, data, stream, max_threads_per_block));
   }
 
   assert(data.qkv_format != AttentionQkvFormat::UNKNOWN);

@@ -776,6 +778,12 @@ template Status PrepareQkv<half>(
     cudaStream_t stream,
     int max_threads_per_block);
 
+template Status PrepareQkv<BFloat16>(
+    contrib::AttentionParameters& parameters,
+    AttentionData<BFloat16>& data,
+    cudaStream_t stream,
+    int max_threads_per_block);
+
 }  // namespace cuda
 }  // namespace contrib
 }  // namespace onnxruntime
