| 1 | +#include <ATen/NestedTensorImpl.h> |
| 2 | +#include <ATen/core/Tensor.h> |
| 3 | +#include <ATen/native/nested/NestedTensorUtils.h> |
| 4 | +#include <ATen/native/transformers/attention.h> |
| 5 | +#include <ATen/native/transformers/sdp_utils_cpp.h> |
| 6 | + |
| 7 | +#ifndef AT_PER_OPERATOR_HEADERS |
| 8 | +#include <ATen/Functions.h> |
| 9 | +#include <ATen/NativeFunctions.h> |
| 10 | +#else |
| 11 | +#include <ATen/ops/empty_like.h> |
| 12 | +#include <ATen/ops/linear.h> |
| 13 | +#include <ATen/ops/scaled_dot_product_attention.h> |
| 14 | +#include <ATen/ops/split_native.h> |
| 15 | +#endif |
| 16 | + |
| 17 | +#include <ATen/native/transformers/SDPUtils.h> |
| 18 | +#include <ATen/native/transformers/sycl/AttentionKernels.h> |
| 19 | + |
| 20 | +#include <comm/SYCLContext.h> |
| 21 | + |
| 22 | +namespace at { |
| 23 | +namespace native { |
| 24 | + |
| 25 | +// Compute q = (q + q_bias) / sqrt(dim_per_head), k = k + k_bias, v = v + v_bias. |
| 26 | +// Note: currently only contiguous indexing is supported, since nested tensors |
| 27 | +// are always contiguous here. |
| 28 | +std::tuple<Tensor, Tensor, Tensor> transform_bias_rescale_qkv_xpu( |
| 29 | + const Tensor& qkv, |
| 30 | + const Tensor& qkv_bias, |
| 31 | + const int64_t num_head) { |
| 32 | +  // For a nested tensor, B is the outermost size, but T is irregular, so take |
| 33 | +  // the maximum size along dim 1. |
| 34 | + auto B = qkv.is_nested() |
| 35 | + ? native::get_nested_tensor_impl(qkv)->get_nested_sizes().size(0) |
| 36 | + : qkv.size(0); |
| 37 | + |
| 38 | + auto T = qkv.is_nested() ? native::NestedTensor_get_max_size( |
| 39 | + *native::get_nested_tensor_impl(qkv))[0] |
| 40 | + : qkv.size(1); |
| 41 | + if (qkv.is_nested()) { |
| 42 | + // Don't mess with non-nested case for now since it's not set up to fiddle |
| 43 | + // with mask size. |
| 44 | + |
| 45 | + // Round T up to next multiple of 8 so as to be able to utilize Tensor |
| 46 | + // cores. Otherwise, sometimes with padding, *no* row will have the maximum |
| 47 | + // sequence length and so we'll have a non-divisible-by-8 dimension even if |
| 48 | + // the model author chose a multiple of 8. |
| 49 | + T = T + (8 - (T % 8)) % 8; |
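|  | +    // e.g. T = 13 is rounded up to 16, while T = 16 is left unchanged. |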
| 50 | + } |
| 51 | + auto _3D = qkv_bias.size(0); |
| 52 | + auto D = _3D / 3; |
| 53 | +  TORCH_CHECK(D % num_head == 0, "`embed_dim` must be divisible by `num_head`"); |
| 54 | + const auto dim_per_head = D / num_head; |
| 55 | + |
| 56 | +  // q_k_v: [B, T, 3*D] -> [3, B, num_head, T, dim_per_head] |
| 57 | + auto q_k_v = at::empty({3, B, num_head, T, dim_per_head}, qkv_bias.options()); |
| 58 | + |
| 59 | + xpu::_transform_bias_rescale_qkv_kernel( |
| 60 | + qkv, qkv_bias, num_head, q_k_v, B, T, D, dim_per_head); |
| 61 | + |
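|  | +  // View as [3*B, num_head, T, dim_per_head] and split dim 0 into three |
|  | +  // [B, num_head, T, dim_per_head] tensors: q, k, v. |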
| 62 | + auto q_k_v_s = |
| 63 | + at::native::split(q_k_v.view({3 * B, num_head, T, dim_per_head}), B, 0); |
| 64 | + return std::make_tuple(q_k_v_s[0], q_k_v_s[1], q_k_v_s[2]); |
| 65 | +} |
| 66 | + |
| 67 | +static bool check_for_seq_len_1_nested_tensor( |
| 68 | + sdp::sdp_params params, |
| 69 | + bool debug) { |
| 70 | +  // When this function is called, the nested tensor is guaranteed to be dim == 4. |
| 71 | + if (!params.query.is_nested()) { |
| 72 | + return true; |
| 73 | + } |
| 74 | + |
| 75 | + const auto nt_q_tensor_impl = |
| 76 | + at::native::get_nested_tensor_impl(params.query); |
| 77 | + const at::Tensor& sizes = nt_q_tensor_impl->get_nested_sizes(); |
| 78 | + auto* sizes_ptr = sizes.data_ptr<int64_t>(); |
| 79 | + const int64_t n_tensors = params.query.size(0); |
| 80 | + const int64_t size_tensor_stride = sizes.stride(0); |
| 81 | + |
| 82 | + // This is being called inside sdp with shape [batch, heads, {seq_len}, dim] |
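|  | +  // Each row of `sizes` is [num_heads, seq_len, head_dim]; index 1 is seq_len. |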
| 83 | + for (const auto i : c10::irange(n_tensors)) { |
| 84 | + if (sizes_ptr[(i * size_tensor_stride) + 1] <= 1) { |
| 85 | + if (debug) { |
| 86 | + TORCH_WARN( |
| 87 | + "Packed projection for fused kernels does not support sequence_length <= 1"); |
| 88 | + } |
| 89 | + return false; |
| 90 | + } |
| 91 | + } |
| 92 | + |
| 93 | + return true; |
| 94 | +} |
| 95 | + |
| 96 | +int64_t _fused_sdp_choice_xpu( |
| 97 | + const Tensor& query, |
| 98 | + const Tensor& key, |
| 99 | + const Tensor& value, |
| 100 | + const std::optional<Tensor>& attn_mask_, |
| 101 | + double dropout_p, |
| 102 | + bool is_causal, |
| 103 | + std::optional<double> scale, |
| 104 | + bool enable_gqa) { |
| 105 | +  // The efficient_attention backend is implemented with xetla; the |
| 106 | +  // flash_attention backend is not supported yet and will be implemented in |
| 107 | +  // the future. So only two backends are offered here: efficient_attention and math. |
| 108 | + sdp::sdp_params kernel_params{ |
| 109 | + query, key, value, attn_mask_, dropout_p, is_causal, enable_gqa}; |
| 110 | +  // can_use_mem_efficient_attention only prints the reasons a fused kernel is |
| 111 | +  // rejected when print_debug is true; keep it false for the normal path. |
| 112 | + bool print_debug = false; |
| 113 | + sdp::SDPBackend backend = |
| 114 | + sdp::can_use_mem_efficient_attention(kernel_params, print_debug) |
| 115 | + ? sdp::SDPBackend::efficient_attention |
| 116 | + : sdp::SDPBackend::math; |
| 117 | + if (backend == sdp::SDPBackend::error) { |
| 118 | + TORCH_CHECK( |
| 119 | + false, |
| 120 | + "No viable backend for scaled_dot_product_attention was found. ", |
| 121 | + "This is likely due to turning off both the math kernel and the fused kernels."); |
| 122 | + } |
| 123 | + return static_cast<int64_t>(backend); |
| 124 | +} |
| 125 | + |
| 126 | +std::tuple<Tensor, Tensor> native_multi_head_attention_xpu( |
| 127 | + const Tensor& query, |
| 128 | + const Tensor& key, |
| 129 | + const Tensor& value, |
| 130 | + const int64_t embed_dim, |
| 131 | + const int64_t num_head, |
| 132 | + const Tensor& qkv_weight, |
| 133 | + const Tensor& qkv_bias, |
| 134 | + const Tensor& proj_weight, |
| 135 | + const Tensor& proj_bias, |
| 136 | + const std::optional<Tensor>& mask, |
| 137 | + bool need_weights, |
| 138 | + bool average_attn_weights, |
| 139 | + const std::optional<int64_t> mask_type) { |
| 140 | + // query shape: [B, T, D] |
| 141 | + // qkv_weight shape: [3 * D, D] |
| 142 | + |
| 143 | + TORCH_CHECK( |
| 144 | + !mask || !query.is_nested(), |
| 145 | + "NestedTensor with mask is not supported yet"); |
| 146 | + const auto D = embed_dim; |
| 147 | + TORCH_CHECK( |
| 148 | + query.dim() == 3, "expected 3-D `query`, got ", query.dim(), "-D tensor"); |
| 149 | + TORCH_CHECK( |
| 150 | + query.is_nested() || query.sizes()[2] == embed_dim, |
| 151 | + "passed-in embed_dim ", |
| 152 | + embed_dim, |
| 153 | + " didn't match last dim of query ", |
| 154 | + query.sizes()[2]); |
| 155 | + TORCH_CHECK( |
| 156 | + key.dim() == 3, "expected 3-D `key`, got ", key.dim(), "-D tensor"); |
| 157 | + TORCH_CHECK( |
| 158 | + value.dim() == 3, "expected 3-D `value`, got ", value.dim(), "-D tensor"); |
| 159 | + TORCH_CHECK( |
| 160 | + query.is_nested() || key.is_nested() || value.is_nested() || |
| 161 | + (query.sizes() == key.sizes() && key.sizes() == value.sizes()), |
| 162 | + "expected `query`/`key`/`value` shapes to match"); |
| 163 | + TORCH_CHECK( |
| 164 | + qkv_weight.dim() == 2, |
| 165 | + "expected 2-D `qkv_weight`, got ", |
| 166 | + qkv_weight.dim(), |
| 167 | + "-D tensor"); |
| 168 | + TORCH_CHECK( |
| 169 | + D * 3 == qkv_weight.sizes()[0], |
| 170 | + "expected `qkv_weight` first dim to be 3x embed_dim"); |
| 171 | + TORCH_CHECK( |
| 172 | + D == qkv_weight.sizes()[1], |
| 173 | + "expected `qkv_weight` second dim to be embed_Dim"); |
| 174 | + TORCH_CHECK( |
| 175 | + qkv_bias.dim() == 1, |
| 176 | + "expected 1-D `qkv_bias`, got ", |
| 177 | + qkv_bias.dim(), |
| 178 | + "-D tensor"); |
| 179 | + TORCH_CHECK( |
| 180 | + qkv_bias.sizes()[0] == 3 * D, |
| 181 | + "expected `qkv_bias` first dim and first dim of query to be equal"); |
| 182 | + TORCH_CHECK( |
| 183 | + D % num_head == 0, "`embed_dim` must divide evenly by `num_heads`"); |
| 184 | + |
| 185 | +#ifndef NDEBUG |
| 186 | + const auto B = query.is_nested() |
| 187 | + ? native::get_nested_tensor_impl(query)->get_nested_sizes().size(0) |
| 188 | + : query.sizes()[0]; |
| 189 | + auto T = query.is_nested() ? 0 : query.sizes()[1]; |
| 190 | + |
| 191 | +#endif |
| 192 | + const auto dim_per_head = D / num_head; |
| 193 | + if ((query.is_same(key) && key.is_same(value)) && dim_per_head % 8 == 0 && |
| 194 | + !need_weights) { |
| 195 | +    // We have not done the linear projection yet, but the input for SDP is |
| 196 | +    // expected to be 4-dimensional. We "cheaply" create view tensors that |
| 197 | +    // are then used for checking the hot-path conditions with |
| 198 | +    // select_sd_backend. |
| 199 | + auto q = |
| 200 | + query.view({query.size(0), -1, num_head, dim_per_head}).transpose(1, 2); |
| 201 | + auto k = |
| 202 | + key.view({key.size(0), -1, num_head, dim_per_head}).transpose(1, 2); |
| 203 | + auto v = |
| 204 | + value.view({value.size(0), -1, num_head, dim_per_head}).transpose(1, 2); |
| 205 | + |
| 206 | + sdp::sdp_params kernel_params{q, k, v, mask, 0.0, false, false}; |
| 207 | + auto backend = static_cast<sdp::SDPBackend>( |
| 208 | + _fused_sdp_choice_xpu(q, k, v, mask, 0.0, false, {}, false)); |
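|  | +    // On XPU this resolves to either efficient_attention or math; |
|  | +    // flash_attention is not supported yet (see _fused_sdp_choice_xpu above). |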
| 209 | + |
| 210 | +    // For nested tensors with seq_len == 1, the strides produced by the packed |
| 211 | +    // projection would trigger a contiguous call in the kernel, so we prevent this. |
| 212 | + bool no_seq_len_1_nested = query.is_nested() |
| 213 | + ? check_for_seq_len_1_nested_tensor(kernel_params, false) |
| 214 | + : true; |
| 215 | +    // The transformer_encoder API passes a mask of shape (Batch_Size, |
| 216 | +    // Seq_len_q). For mem-efficient attention this would cause the expand call |
| 217 | +    // to error out, so for now that path is turned off rather than dealing |
| 218 | +    // with all the annoying mask-type/shape handling. |
| 219 | + if (!mask.has_value() && no_seq_len_1_nested && |
| 220 | + (backend == sdp::SDPBackend::flash_attention || |
| 221 | + backend == sdp::SDPBackend::efficient_attention)) { |
| 222 | + auto x = at::linear(query, qkv_weight, qkv_bias); |
| 223 | + auto chunks = x.chunk(3, -1); |
| 224 | + auto x_size_0 = x.size(0); |
| 225 | + |
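|  | +    // Reshape each [B, T, D] chunk to [B, num_head, T, dim_per_head] as |
|  | +    // expected by scaled_dot_product_attention. |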
| 226 | + chunks[0] = (chunks[0].view({x_size_0, -1, num_head, dim_per_head})) |
| 227 | + .transpose(1, 2); |
| 228 | + chunks[1] = (chunks[1].view({x_size_0, -1, num_head, dim_per_head})) |
| 229 | + .transpose(1, 2); |
| 230 | + chunks[2] = (chunks[2].view({x_size_0, -1, num_head, dim_per_head})) |
| 231 | + .transpose(1, 2); |
| 232 | + auto y = at::scaled_dot_product_attention( |
| 233 | + chunks[0], chunks[1], chunks[2], mask, 0.0, false, std::nullopt); |
| 234 | + |
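|  | +    // Merge heads back: [B, num_head, T, dim_per_head] -> [B, T, embed_dim], |
|  | +    // then apply the output projection. |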
| 235 | + auto past_sdp = y.transpose(1, 2).reshape({x_size_0, -1, embed_dim}); |
| 236 | + return std::make_tuple( |
| 237 | + at::linear(past_sdp, proj_weight, proj_bias), Tensor()); |
| 238 | + } |
| 239 | +    // The chosen backend was math or error, so don't take the fused path. |
| 240 | + } |
| 241 | + |
| 242 | + // shape: [B, T, 3 x D] |
| 243 | + auto qkv = native::qkv_projection(query, key, value, embed_dim, qkv_weight); |
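|  | +  // Note: qkv_bias is not applied here; it is added later in |
|  | +  // transform_bias_rescale_qkv_xpu. |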
| 244 | + |
| 245 | + if (!qkv.is_nested() && qkv.numel() == 0) { |
| 246 | + if (query.is_nested()) { |
| 247 | + return std::make_tuple(Tensor(), Tensor()); |
| 248 | + } |
| 249 | + return std::make_tuple(at::empty_like(query), Tensor()); |
| 250 | + } |
| 251 | + |
| 252 | +#ifndef NDEBUG |
| 253 | + if (!query.is_nested() || !qkv.is_nested()) { |
| 254 | + if (query.is_nested()) { |
| 255 | + T = qkv.size(1); |
| 256 | + } |
| 257 | + native::debug_assert_shape(__LINE__, qkv, {B, T, 3 * D}); |
| 258 | + } |
| 259 | +#endif |
| 260 | + |
| 261 | +#ifdef DEBUG_PRINT_EACH_STEP |
| 262 | + if (!qkv.is_nested()) { |
| 263 | + std::cerr << "qkv: " << qkv << std::endl; |
| 264 | + } |
| 265 | +#endif |
| 266 | + // shape: 3 x [B, num_head, T, dim_per_head] |
| 267 | + auto q_k_v = transform_bias_rescale_qkv_xpu(qkv, qkv_bias, num_head); |
| 268 | + qkv = Tensor(); // Not used any more, allow free |
| 269 | + auto& q = std::get<0>(q_k_v); |
| 270 | + const auto& k = std::get<1>(q_k_v); |
| 271 | + const auto& v = std::get<2>(q_k_v); |
| 272 | +#ifndef NDEBUG |
| 273 | + native::debug_assert_shape(__LINE__, q, {B, num_head, T, dim_per_head}); |
| 274 | + native::debug_assert_shape(__LINE__, k, {B, num_head, T, dim_per_head}); |
| 275 | + native::debug_assert_shape(__LINE__, v, {B, num_head, T, dim_per_head}); |
| 276 | +#endif |
| 277 | +#ifdef DEBUG_PRINT_EACH_STEP |
| 278 | + std::cerr << "q: " << q << std::endl; |
| 279 | + std::cerr << "k: " << k << std::endl; |
| 280 | + std::cerr << "v: " << v << std::endl; |
| 281 | +#endif |
| 282 | + |
| 283 | + // shape: [B, num_head, T, T] |
| 284 | + auto qkt = native::bmm_nt(q, k); |
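|  | +  // bmm_nt computes q @ k^T; the 1/sqrt(dim_per_head) scaling was already |
|  | +  // folded into q by transform_bias_rescale_qkv_xpu. |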
| 285 | + // q & k are dead but cannot be freed because they were packed with v |
| 286 | +#ifndef NDEBUG |
| 287 | + native::debug_assert_shape(__LINE__, qkt, {B, num_head, T, T}); |
| 288 | +#endif |
| 289 | +#ifdef DEBUG_PRINT_EACH_STEP |
| 290 | + std::cerr << "qkt: " << qkt << std::endl; |
| 291 | +#endif |
| 292 | + |
| 293 | + // shape: [B, num_head, T, T] |
| 294 | + // TODO: long-term, have a kernel that works with |
| 295 | + // NestedTensor directly if there is no mask passed |
| 296 | + qkt = native::masked_softmax(qkt, mask, query, mask_type); |
| 297 | +#ifdef DEBUG_PRINT_EACH_STEP |
| 298 | + std::cerr << "qkt after softmax: " << qkt << std::endl; |
| 299 | +#endif |
| 300 | + |
| 301 | + // shape: [B, num_head, T, dim_per_head] |
| 302 | + // reuse storage for q; we're done with it |
| 303 | + auto attn_ctx = native::bmm_nn(q, qkt, v); |
| 304 | + // qkv is not dead; we just reused storage for q! |
| 305 | + if (!need_weights) { |
| 306 | + qkt = Tensor(); |
| 307 | + } |
| 308 | +#ifndef NDEBUG |
| 309 | + native::debug_assert_shape( |
| 310 | + __LINE__, attn_ctx, {B, num_head, T, dim_per_head}); |
| 311 | +#endif |
| 312 | +#ifdef DEBUG_PRINT_EACH_STEP |
| 313 | + std::cerr << "attn_ctx: " << attn_ctx << std::endl; |
| 314 | +#endif |
| 315 | + |
| 316 | + // shape: [B, T, D] |
| 317 | + // Fuse transform_0213 inside |
| 318 | + auto proj = native::transform0213_gemm_nt_bias( |
| 319 | + attn_ctx, proj_weight, proj_bias, query); |
| 320 | +#ifndef NDEBUG |
| 321 | + native::debug_assert_shape(__LINE__, proj, {B, T, D}); |
| 322 | +#endif |
| 323 | + if (need_weights && average_attn_weights) { |
| 324 | +    // Weights are not needed for the full transformer, so don't worry too much |
| 325 | +    // about performance -- this is implemented just so that use cases which do |
| 326 | +    // not disable need_weights still get some speedup. |
| 327 | + qkt = qkt.sum(1); |
| 328 | + qkt /= num_head; |
| 329 | + } |
| 330 | + return std::make_tuple(std::move(proj), std::move(qkt)); |
| 331 | +} |
| 332 | + |
| 333 | +} // namespace native |
| 334 | +} // namespace at |