[FlexAttention] explicitly create grad_q w/ strides (pytorch#153641)

pytorchbot · drisspg · web-flow · commit 6f2f41c85b25 · 2025-05-21T12:43:38.000-04:00
[FlexAttention] explicitly create grad_q w/ strides (pytorch#152641) Fixes: pytorch#147463 Inductor's lowering for empty_like does not match the behavior of eager: the strides it produces do not match preserve format (see pytorch#144699). This change allocates grad_query explicitly with strides derived from query instead of relying on empty_like. Pull Request resolved: pytorch#152641 Approved by: https://github.com/xmfan (cherry picked from commit a6ea63a) Co-authored-by: drisspg <drisspguessous@gmail.com>
diff --git a/torch/_higher_order_ops/flex_attention.py b/torch/_higher_order_ops/flex_attention.py
@@ -780,11 +780,12 @@ def sdpa_dense_backward(
 ]:
     from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
 
-    Bq, _, _, qk_head_dim = query.shape
+    Bq, Hq, seq_len_q, qk_head_dim = query.shape
     Bkv, Hkv, seq_len_kv, v_head_dim = value.shape
 
     # Get outputs before calling repeat interleave and permute to input stride orders
-    actual_grad_query = torch.empty_like(query)
+    actual_grad_query = query.new_empty((Bq, Hq, seq_len_q, qk_head_dim))
+    actual_grad_query = _permute_strides(actual_grad_query, query.stride())
 
     actual_grad_key = key.new_empty((Bq, Hkv, seq_len_kv, qk_head_dim))
     actual_grad_key = _permute_strides(actual_grad_key, key.stride())
diff --git a/torch/_inductor/kernel/flex_attention.py b/torch/_inductor/kernel/flex_attention.py
@@ -38,7 +38,6 @@
     _full,
     check_and_broadcast_indices,
     empty,
-    empty_like,
     empty_strided,
     expand,
     index_output_size_and_inner_fn,
@@ -2524,7 +2523,14 @@ def flex_attention_backward(*args, **kwargs):
     grad_lse_exp2, delta = maybe_realize([grad_lse_exp2, delta])
 
     # # see NOTE:[TritonTemplates with multiple outputs]
-    grad_query = empty_like(query)
+    query_size = [Bq, Hq, seq_len_q, qk_head_dim]
+    grad_query_strides = infer_dense_strides(query_size, query.get_stride())
+    grad_query = empty_strided(
+        query_size,
+        stride=[sympy.sympify(s) for s in grad_query_strides],
+        dtype=query.get_dtype(),
+        device=query.get_device(),
+    )
 
     # Construct output layout with stride order matching value
     value_size = [Bq, Hkv, seq_len_kv, v_head_dim]