Commit d3c8f36

Revert "[Intel GPU] Make SDPA output has the same stride as Query. (pytorch#154340)"
This reverts commit 0f10df7. Reverted pytorch#154340 on behalf of https://github.com/etaf because the PR breaks the Hugging Face E2E run on XPU. ([comment](pytorch#154340 (comment)))
1 parent bb43ced commit d3c8f36

4 files changed: 4 additions, 68 deletions

aten/src/ATen/native/mkldnn/xpu/Attention.cpp (2 additions, 1 deletion)

@@ -190,7 +190,8 @@ _scaled_dot_product_fused_attention_overrideable_xpu(
   const int64_t seq_len_q = query.size(2);
   const int64_t seq_len_kv = key.size(2);

-  at::Tensor output;
+  auto opts = query.options();
+  auto output = at::empty({batch_size, num_head, seq_len_q, head_dim}, opts);
   at::Tensor logsumexp, debug_attn_mask; // not supported

   at::native::onednn::gpu_float_sdpa(
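
With this revert, the XPU fused SDPA entry point allocates a plain contiguous output up front instead of leaving it undefined for gpu_float_sdpa to allocate with a query-matched layout. A minimal sketch of the stride difference being discussed, assuming a query that is a (B, H, S, D) view over (B, S, H, D) storage; shapes and names below are illustrative, not part of the diff:

    import torch
    import torch.nn.functional as F

    B, H, S, D = 2, 4, 8, 16

    # Query/key/value stored as (B, S, H, D) but viewed as (B, H, S, D).
    q = torch.randn(B, S, H, D).transpose(1, 2)
    k = torch.randn(B, S, H, D).transpose(1, 2)
    v = torch.randn(B, S, H, D).transpose(1, 2)

    out = F.scaled_dot_product_attention(q, k, v)

    print(q.stride())    # permuted strides, e.g. (S*H*D, D, H*D, 1)
    print(out.stride())  # on the XPU fused path this revert yields a plain
                         # contiguous (B, H, S, D) output rather than one that
                         # mirrors the query's strides; other backends vary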

aten/src/ATen/native/mkldnn/xpu/detail/Attention.cpp (1 addition, 41 deletions)

@@ -330,39 +330,6 @@ partition& find_or_create_graph_partition(
   }
   return *partition_;
 }
-
-void alloc_with_matching_layout(
-    const at::Tensor& q,
-    at::Tensor& output,
-    const std::vector<int64_t>& shape) {
-  TORCH_INTERNAL_ASSERT(
-      shape.size() == q.sizes().size(),
-      "OneDNN SDPA alloc_with_matching_layout got requested shape ndim != q ndim");
-
-  if (std::equal(q.sizes().begin(), q.sizes().end(), shape.begin())) {
-    output = at::empty_like(q);
-    return;
-  }
-
-  // get the "fill order," which is just an argsort on the strides
-  std::vector<int> fill_order(shape.size());
-  std::iota(fill_order.begin(), fill_order.end(), 0);
-  const auto q_strides = q.strides();
-  std::stable_sort(
-      fill_order.begin(), fill_order.end(), [&q_strides](int idx1, int idx2) {
-        return q_strides[idx1] < q_strides[idx2];
-      });
-  std::vector<int64_t> ordered_strides(shape.size());
-  int64_t current_stride = 1;
-  for (const int dim_idx : fill_order) {
-    ordered_strides[dim_idx] = current_stride;
-    current_stride *= shape[dim_idx];
-  }
-  output = at::empty(at::IntArrayRef(shape), q.options())
-               .as_strided(
-                   at::IntArrayRef(shape), at::IntArrayRef(ordered_strides), 0);
-}
-
 } // namespace

 namespace at::native::onednn {

@@ -380,14 +347,7 @@ void gpu_float_sdpa(
     std::optional<at::Tensor> attn_mask,
     bool is_causal,
     float softmax_scale,
-    Tensor& output) {
-  if (!output.defined()) {
-    // allocate output tensor with layout matched to query
-    std::vector<int64_t> output_shape = {
-        batch_size, num_head, seq_len_q, head_dim_v};
-    alloc_with_matching_layout(query, output, output_shape);
-  }
-
+    const Tensor& output) {
   auto& eng = GpuEngineManager::Instance().get_engine();
   auto& strm = GpuStreamManager::Instance().get_stream();
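
The removed alloc_with_matching_layout helper derived the output strides from the query's "fill order": an argsort of the query's strides, then filling strides innermost-first using the requested shape. A short Python sketch of that idea, kept only as a reference for the reverted behavior (the helper name below is illustrative, not from this commit):

    import torch

    def strides_matching_layout(ref: torch.Tensor, shape: list[int]) -> list[int]:
        # Fill order = dimensions sorted from smallest to largest stride of `ref`
        # (an argsort on the strides), mirroring the removed C++ helper.
        fill_order = sorted(range(len(shape)), key=lambda d: ref.stride(d))
        strides = [0] * len(shape)
        current = 1
        for d in fill_order:
            strides[d] = current
            current *= shape[d]
        return strides

    # A (B, H, S, D) view over (B, S, H, D) storage.
    q = torch.randn(2, 8, 4, 16).transpose(1, 2)    # shape (2, 4, 8, 16)
    out = torch.empty_strided([2, 4, 8, 16], strides_matching_layout(q, [2, 4, 8, 16]))
    print(out.stride() == q.stride())               # True: output shares q's fill order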

aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h (1 addition, 1 deletion)

@@ -178,5 +178,5 @@ void gpu_float_sdpa(
     std::optional<at::Tensor> attn_mask,
     bool is_causal,
     float softmax_scale,
-    Tensor& output);
+    const Tensor& output);
 } // namespace at::native::onednn

test/test_transformers.py (0 additions, 25 deletions)

@@ -4059,31 +4059,6 @@ def test_fused_attention_broadcasted_input(self, device):

         self.assertEqual(actual.contiguous(), math_ref.contiguous().to(dtype), atol=1e-3, rtol=1e-2)

-    def test_attention_preserves_query_layout(self, device):
-
-        def test_attention(permute_order: list[list[int]]):
-            BHSqD = [4, 16, 256, 64]
-            BHSkvD = [4, 16, 512, 64]
-
-            shape_q = [BHSqD[idx] for idx in permute_order]
-            shape_kv = [BHSkvD[idx] for idx in permute_order]
-            reverse = [permute_order.index(idx) for idx in range(4)]
-            q = torch.randn(*shape_q, dtype=torch.bfloat16, device=device, requires_grad=False).permute(reverse)
-            k = torch.randn(*shape_kv, dtype=torch.bfloat16, device=device, requires_grad=False).permute(reverse)
-            v = torch.randn(*shape_kv, dtype=torch.bfloat16, device=device, requires_grad=False).permute(reverse)
-            self.assertEqual(q.shape, BHSqD)
-            self.assertEqual(k.shape, BHSkvD)
-            self.assertEqual(v.shape, BHSkvD)
-
-            out = F.scaled_dot_product_attention(q, k, v)
-            self.assertTrue(out.permute(permute_order).is_contiguous())
-
-        permutable = [0, 1, 2]
-        permute_orders = itertools.permutations(permutable)
-
-        for permute_order in permute_orders:
-            test_attention(list(permute_order) + [3])
-
     @parametrize("type", ["dense"])
     @parametrize("is_contiguous", [True, False])
     def test_scaled_dot_product_attention_fused_kernels_packed(self, device, type: str, is_contiguous: bool):
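
The deleted test's double permutation is easy to misread: q/k/v are allocated contiguously in a permuted dimension order and then permuted back to (B, H, Sq, D) / (B, H, Skv, D), so q.permute(permute_order).is_contiguous() holds by construction, and the assertion checked that the output inherited the same property. A small device-agnostic sketch of that invariant, with one permutation picked arbitrarily for illustration:

    import torch

    BHSqD = [4, 16, 256, 64]
    order = [2, 0, 1, 3]                        # one of the orders the test iterated over
    reverse = [order.index(i) for i in range(4)]

    # Contiguous in the permuted order, then viewed back as (B, H, Sq, D).
    q = torch.randn([BHSqD[i] for i in order]).permute(reverse)

    print(list(q.shape) == BHSqD)               # True: logical shape is (B, H, Sq, D)
    print(q.is_contiguous())                    # False: storage follows the permuted order
    print(q.permute(order).is_contiguous())     # True: the property the test asserted for `out`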
