Revert "Remove test since it ooms on CI (pytorch#161644)"

pytorchmergebot · pytorchmergebot · commit 5432966253ce · 2025-08-28T13:41:58.000Z
This reverts commit 443452c. Reverted pytorch#161644 on behalf of https://github.com/atalman due to need to revert pytorch#157767 internal tests ([comment](pytorch#161644 (comment)))
diff --git a/test/inductor/test_flex_attention.py b/test/inductor/test_flex_attention.py
@@ -48,6 +48,7 @@
     skipCPUIf,
     skipCUDAIf,
 )
+from torch.testing._internal.common_utils import IS_FBCODE
 from torch.utils._triton import has_triton, has_triton_tma_device
 
 
@@ -4339,6 +4340,41 @@ def simple_score_mod(score, b, h, q_idx, kv_idx):
             fa._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = original_flag
             fa._WARNINGS_SHOWN = original_warnings_shown
 
+    @largeTensorTest("38GB", "cuda")  # emperically
+    @skip_on_cpu
+    @unittest.skipIf(IS_FBCODE, "Skip large tensor test in fbcode")
+    def test_int64_indexing_large_stride(self, device):
+        B = 1
+        H = 64
+        S = 2**20
+        D = 64
+        dtype = torch.float16
+
+        def _simple_causal(b, h, q_idx, kv_idx):
+            return q_idx >= kv_idx
+
+        BLOCK_M = 1024
+        BLOCK_N = 1024
+
+        block_mask = torch.compile(create_block_mask)(
+            _simple_causal, B, H, S, S, device=device, BLOCK_SIZE=(BLOCK_M, BLOCK_N)
+        )
+
+        q = torch.randn(B, H, S, D, device=device, dtype=dtype, requires_grad=True)
+        k = torch.randn(B, H, S, D, device=device, dtype=dtype, requires_grad=True)
+        v = torch.randn(B, H, S, D, device=device, dtype=dtype, requires_grad=True)
+
+        # Test forward and backward pass
+        out = torch.compile(flex_attention)(q, k, v, block_mask=block_mask)
+        loss = out.sum()
+        loss.backward()
+
+        # Basic correctness checks, doing full comapre consumes too much memory :/
+        self.assertEqual(out.shape, (B, H, S, D))
+        self.assertTrue(q.grad is not None)
+        self.assertTrue(k.grad is not None)
+        self.assertTrue(v.grad is not None)
+
 
 class TestBlockMask(InductorTestCase):
     def setUp(self):