Commit 0f10df7

LuFinch authored and pytorchmergebot committed
[Intel GPU] Make SDPA output have the same stride as Query. (pytorch#154340)
Fixes pytorch#153903. Currently the output tensor of SDPA on XPU is always allocated with contiguous strides, while the CPU/CUDA flash_attention and cudnn_attention backends allocate the output tensor with the same strides as Query. This PR aligns XPU's behavior with CUDA/CPU so that XPU works with modeling code written against CPU/CUDA. The function `alloc_with_matching_layout` is copied from the cudnn backend: https://github.com/pytorch/pytorch/blob/8c16d0e4047a8ac5885baf52e8779fb3e36f2987/aten/src/ATen/native/cudnn/MHA.cpp#L874

Pull Request resolved: pytorch#154340
Approved by: https://github.com/Skylion007, https://github.com/EikanWang, https://github.com/guangyey
1 parent 1e20745 commit 0f10df7
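For context, here is a minimal, hypothetical sketch of the behavior this commit targets. It is not part of the diff; the device string and tensor shapes are illustrative. On an Intel GPU build, the output of scaled_dot_product_attention should now inherit the query's memory layout rather than always being contiguous, matching CPU/CUDA.

import torch
import torch.nn.functional as F

# Illustrative only: assumes an XPU (Intel GPU) build of PyTorch.
device = "xpu"
# Query/key/value stored in (B, S, H, D) memory order but viewed as (B, H, S, D).
q = torch.randn(4, 256, 16, 64, device=device, dtype=torch.bfloat16).transpose(1, 2)
k = torch.randn(4, 512, 16, 64, device=device, dtype=torch.bfloat16).transpose(1, 2)
v = torch.randn(4, 512, 16, 64, device=device, dtype=torch.bfloat16).transpose(1, 2)

out = F.scaled_dot_product_attention(q, k, v)
# Before this change XPU always returned a contiguous (B, H, S, D) tensor;
# with it, the output strides follow the query's layout, as on CPU/CUDA.
print(q.stride(), out.stride())
print(out.transpose(1, 2).is_contiguous())  # expected: True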

File tree: 4 files changed (+68 −4 lines)

aten/src/ATen/native/mkldnn/xpu/Attention.cpp

Lines changed: 1 addition & 2 deletions
@@ -190,8 +190,7 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
   const int64_t seq_len_q = query.size(2);
   const int64_t seq_len_kv = key.size(2);
 
-  auto opts = query.options();
-  auto output = at::empty({batch_size, num_head, seq_len_q, head_dim}, opts);
+  at::Tensor output;
   at::Tensor logsumexp, debug_attn_mask; // not supported
 
   at::native::onednn::gpu_float_sdpa(

aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp

Lines changed: 41 additions & 1 deletion
@@ -330,6 +330,39 @@ partition& find_or_create_graph_partition(
   }
   return *partition_;
 }
+
+void alloc_with_matching_layout(
+    const at::Tensor& q,
+    at::Tensor& output,
+    const std::vector<int64_t>& shape) {
+  TORCH_INTERNAL_ASSERT(
+      shape.size() == q.sizes().size(),
+      "OneDNN SDPA alloc_with_matching_layout got requested shape ndim != q ndim");
+
+  if (std::equal(q.sizes().begin(), q.sizes().end(), shape.begin())) {
+    output = at::empty_like(q);
+    return;
+  }
+
+  // get the "fill order," which is just an argsort on the strides
+  std::vector<int> fill_order(shape.size());
+  std::iota(fill_order.begin(), fill_order.end(), 0);
+  const auto q_strides = q.strides();
+  std::stable_sort(
+      fill_order.begin(), fill_order.end(), [&q_strides](int idx1, int idx2) {
+        return q_strides[idx1] < q_strides[idx2];
+      });
+  std::vector<int64_t> ordered_strides(shape.size());
+  int64_t current_stride = 1;
+  for (const int dim_idx : fill_order) {
+    ordered_strides[dim_idx] = current_stride;
+    current_stride *= shape[dim_idx];
+  }
+  output = at::empty(at::IntArrayRef(shape), q.options())
+               .as_strided(
+                   at::IntArrayRef(shape), at::IntArrayRef(ordered_strides), 0);
+}
+
 } // namespace
 
 namespace at::native::onednn {
@@ -347,7 +380,14 @@ void gpu_float_sdpa(
     std::optional<at::Tensor> attn_mask,
     bool is_causal,
     float softmax_scale,
-    const Tensor& output) {
+    Tensor& output) {
+  if (!output.defined()) {
+    // allocate output tensor with layout matched to query
+    std::vector<int64_t> output_shape = {
+        batch_size, num_head, seq_len_q, head_dim_v};
+    alloc_with_matching_layout(query, output, output_shape);
+  }
+
   auto& eng = GpuEngineManager::Instance().get_engine();
   auto& strm = GpuStreamManager::Instance().get_stream();
 

aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h

Lines changed: 1 addition & 1 deletion
@@ -178,5 +178,5 @@ void gpu_float_sdpa(
     std::optional<at::Tensor> attn_mask,
     bool is_causal,
     float softmax_scale,
-    const Tensor& output);
+    Tensor& output);
 } // namespace at::native::onednn

test/test_transformers.py

Lines changed: 25 additions & 0 deletions
@@ -4059,6 +4059,31 @@ def test_fused_attention_broadcasted_input(self, device):
 
         self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2)
 
+    def test_attention_preserves_query_layout(self, device):
+
+        def test_attention(permute_order: list[list[int]]):
+            BHSqD = [4, 16, 256, 64]
+            BHSkvD = [4, 16, 512, 64]
+
+            shape_q = [BHSqD[idx] for idx in permute_order]
+            shape_kv = [BHSkvD[idx] for idx in permute_order]
+            reverse = [permute_order.index(idx) for idx in range(4)]
+            q = torch.randn(*shape_q, dtype=torch.bfloat16, device=device, requires_grad=False).permute(reverse)
+            k = torch.randn(*shape_kv, dtype=torch.bfloat16, device=device, requires_grad=False).permute(reverse)
+            v = torch.randn(*shape_kv, dtype=torch.bfloat16, device=device, requires_grad=False).permute(reverse)
+            self.assertEqual(q.shape, BHSqD)
+            self.assertEqual(k.shape, BHSkvD)
+            self.assertEqual(v.shape, BHSkvD)
+
+            out = F.scaled_dot_product_attention(q, k, v)
+            self.assertTrue(out.permute(permute_order).is_contiguous())
+
+        permutable = [0, 1, 2]
+        permute_orders = itertools.permutations(permutable)
+
+        for permute_order in permute_orders:
+            test_attention(list(permute_order) + [3])
+
     @parametrize("type", ["dense"])
     @parametrize("is_contiguous", [True, False])
     def test_scaled_dot_product_attention_fused_kernels_packed(self, device, type: str, is_contiguous: bool):