
Commit 10b501f

pytorchbot and drisspg authored
[Flex] Fix silent correctness w/ backpropping grads (pytorch#164366)
[Flex] Fix silent correctness w/ backpropping grads (pytorch#163677)

Fixes pytorch#162228

# Summary

The majority of our tests compile flex-attention in isolation. For fake tensor propagation, this means the input primals and all captured buffers don't go through any intermediate computation below autograd, so by happenstance they match the `requires_grad`-ness of the eager implementation and this check passes. However, if score_mod captures the result of some other intermediate fake tensor propagation, that result is not guaranteed to have accurate `requires_grad`-ness, which is what was happening here.

TL;DR: this check was belt-and-suspenders that turned out to be actively harmful; we should just let joint-graph tracing produce the correct joint graph.

Pull Request resolved: pytorch#163677
Approved by: https://github.com/ydwu4

(cherry picked from commit e2ce79e)

Co-authored-by: drisspg <[email protected]>
1 parent 31c72b8 commit 10b501f
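To make the failure mode concrete, here is a minimal repro sketch of the pattern described in the summary. It is illustrative, not taken from the test suite: the names `attend` and `bias_source` are made up, and a CUDA device is assumed. The key point is that `score_mod` closes over an intermediate tensor derived from a leaf, so its `requires_grad`-ness is only known accurately after tracing through that intermediate computation.

```python
# Hypothetical repro sketch: score_mod captures an intermediate tensor,
# so fake tensor prop cannot rely on its requires_grad flag matching eager.
import torch
from torch.nn.attention.flex_attention import flex_attention


@torch.compile
def attend(q, k, v, bias_source):
    # bias_mat is an intermediate tensor derived from a leaf that requires grad.
    bias_mat = bias_source[:, :, None] + bias_source[:, None, :]  # (B, L, L)

    def score_mod(score, b, h, q_idx, kv_idx):
        return score + bias_mat[b, q_idx, kv_idx]

    return flex_attention(q, k, v, score_mod=score_mod)


B, H, L, D = 2, 4, 16, 64
q = torch.randn(B, H, L, D, device="cuda", requires_grad=True)
bias_source = torch.randn(B, L, device="cuda", requires_grad=True)

attend(q, q, q, bias_source).sum().backward()
# Before this fix, the gradient flowing back into bias_source could be
# silently dropped; with the fix, bias_source.grad is populated.
assert bias_source.grad is not None
```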

File tree

2 files changed: +30 -1 lines changed


test/inductor/test_flex_attention.py

Lines changed: 29 additions & 0 deletions
@@ -6546,6 +6546,35 @@ def bias_mod(score, b, h, q_idx, kv_idx):
         assert bias.grad, "No gradient computed for bias"
         assert torch.any(bias.grad != 0), "Gradient for bias is 0"
 
+    @skip_on_cpu
+    def test_backprop_error_case(self, device):
+        @torch.compile()
+        def test(x, y):
+            # Materialize a bias matrix
+            B, L, device = x.shape[0], x.shape[1], x.device
+            b = torch.arange(B, device=device, dtype=torch.long).view(B, 1, 1)
+            q_idx = torch.arange(L, device=device, dtype=torch.long).view(1, L, 1)
+            kv_idx = torch.arange(L, device=device, dtype=torch.long).view(1, 1, L)
+            bias_mat = y[b, q_idx] + y[b, kv_idx]  # (B, L, L)
+
+            # Dummy score_mod retrieving bias values
+            def score_mod(score, b, h, q_idx, kv_idx):
+                return score + bias_mat[b, q_idx, kv_idx]
+
+            x_ = x[:, :, None].repeat(1, 1, 16, 1)
+            # torch._dynamo.graph_break()
+            return flex_attention(x_, x_, x_, score_mod=score_mod)
+
+        B, L, D = 2, 16, 64
+
+        x = torch.randn(B, L, D, device=device, requires_grad=True)
+        y = torch.randn(B, L, device=device, requires_grad=True)
+
+        _ = test(x, y).mean().backward()
+
+        assert x.grad.norm() > 0
+        assert y.grad.norm() > 0
+
     @skip_on_cpu
     @common_utils.parametrize(
         "params", get_params(device_configs["cuda"].dtypes), name_fn=lambda x: f"{x}"

torch/_higher_order_ops/flex_attention.py

Lines changed: 1 addition & 1 deletion
@@ -1266,7 +1266,7 @@ def flex_attention_backward_fake_tensor_mode(
             [
                 (
                     torch.empty_like(buffer, memory_format=torch.contiguous_format)
-                    if isinstance(buffer, torch.Tensor) and buffer.requires_grad
+                    if isinstance(buffer, torch.Tensor)
                     else None
                 )
                 for buffer in score_mod_other_buffers
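For context on what the one-line change does, here is a small illustrative sketch (not the HOP code itself) of the comprehension after the fix: every tensor in `score_mod_other_buffers` now gets an empty gradient placeholder regardless of its `requires_grad` flag, non-tensor captures still map to `None`, and pruning of unneeded gradients is left to joint-graph tracing. The example buffers below are made up.

```python
# Illustrative sketch of the post-fix behavior of the comprehension above.
import torch

score_mod_other_buffers = (
    torch.randn(8, 8, requires_grad=True),   # trainable captured buffer
    torch.randn(8, 8, requires_grad=False),  # non-trainable buffer: previously mapped to None
    3.14,                                     # non-tensor capture
)

grads = [
    (
        torch.empty_like(buffer, memory_format=torch.contiguous_format)
        if isinstance(buffer, torch.Tensor)
        else None
    )
    for buffer in score_mod_other_buffers
]

assert grads[0] is not None and grads[1] is not None and grads[2] is None
```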
