
Commit ef0483d

Revert "Ensure large tensor int32 -> int64 indexing is enabled (pytorch#157767)"
This reverts commit b36a20d. Reverted pytorch#157767 on behalf of https://github.com/atalman due to internal test failures ([comment](pytorch#157767 (comment))).
1 parent 5432966 · commit ef0483d

6 files changed: +120 -100 lines changed

test/inductor/test_flex_attention.py
Lines changed: 0 additions & 36 deletions

@@ -48,7 +48,6 @@
     skipCPUIf,
     skipCUDAIf,
 )
-from torch.testing._internal.common_utils import IS_FBCODE
 from torch.utils._triton import has_triton, has_triton_tma_device


@@ -4340,41 +4339,6 @@ def simple_score_mod(score, b, h, q_idx, kv_idx):
         fa._FLEX_ATTENTION_DISABLE_COMPILE_DEBUG = original_flag
         fa._WARNINGS_SHOWN = original_warnings_shown

-    @largeTensorTest("38GB", "cuda") # emperically
-    @skip_on_cpu
-    @unittest.skipIf(IS_FBCODE, "Skip large tensor test in fbcode")
-    def test_int64_indexing_large_stride(self, device):
-        B = 1
-        H = 64
-        S = 2**20
-        D = 64
-        dtype = torch.float16
-
-        def _simple_causal(b, h, q_idx, kv_idx):
-            return q_idx >= kv_idx
-
-        BLOCK_M = 1024
-        BLOCK_N = 1024
-
-        block_mask = torch.compile(create_block_mask)(
-            _simple_causal, B, H, S, S, device=device, BLOCK_SIZE=(BLOCK_M, BLOCK_N)
-        )
-
-        q = torch.randn(B, H, S, D, device=device, dtype=dtype, requires_grad=True)
-        k = torch.randn(B, H, S, D, device=device, dtype=dtype, requires_grad=True)
-        v = torch.randn(B, H, S, D, device=device, dtype=dtype, requires_grad=True)
-
-        # Test forward and backward pass
-        out = torch.compile(flex_attention)(q, k, v, block_mask=block_mask)
-        loss = out.sum()
-        loss.backward()
-
-        # Basic correctness checks, doing full comapre consumes too much memory :/
-        self.assertEqual(out.shape, (B, H, S, D))
-        self.assertTrue(q.grad is not None)
-        self.assertTrue(k.grad is not None)
-        self.assertTrue(v.grad is not None)
-

 class TestBlockMask(InductorTestCase):
     def setUp(self):
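The deleted test compiled both create_block_mask and flex_attention at S = 2**20, a size at which q/k/v hold 2**32 elements each and int32 offsets would overflow. For reference, a scaled-down sketch of the same usage pattern is shown below; the shapes are illustrative stand-ins, and the snippet is not part of this commit.

import torch
from torch.nn.attention.flex_attention import create_block_mask, flex_attention

# Illustrative shapes only; the removed test used B=1, H=64, S=2**20, D=64 (~38GB).
B, H, S, D = 1, 4, 4096, 64

def simple_causal(b, h, q_idx, kv_idx):
    return q_idx >= kv_idx

block_mask = torch.compile(create_block_mask)(
    simple_causal, B, H, S, S, device="cuda", BLOCK_SIZE=(128, 128)
)

q = torch.randn(B, H, S, D, device="cuda", dtype=torch.float16, requires_grad=True)
k = torch.randn(B, H, S, D, device="cuda", dtype=torch.float16, requires_grad=True)
v = torch.randn(B, H, S, D, device="cuda", dtype=torch.float16, requires_grad=True)

# Forward and backward, mirroring the structure of the removed test
out = torch.compile(flex_attention)(q, k, v, block_mask=block_mask)
out.sum().backward()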

torch/_inductor/kernel/flex/templates/common.py.jinja
Lines changed: 13 additions & 24 deletions

@@ -4,7 +4,7 @@
 @triton.jit
 def forward_block_mn(
     {{gen_argdefs()}},
-    q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
+    q, K_block_ptr, V_block_ptr, desc_k, desc_v, Q_LEN, KV_LEN,
     # accumulated values
     acc, l_i, m_i,
     # Offsets
@@ -13,8 +13,6 @@ def forward_block_mn(
     kv_start,
     kv_offset,
     MATMUL_PRECISION, RCP_LN2,
-    # Strides for K and V
-    stride_kk, stride_kn, stride_vn, stride_vk,
     IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=False,

 ):
@@ -23,21 +21,17 @@ def forward_block_mn(

     # -- load k --
     # NB reversed order to since K is transposed
-    kv_base_offset = kv_start + kv_offset
     {%- if USE_TMA %}
     k = tl.load_tensor_descriptor(
         desc_k,
-        [kv_base_offset, 0],
+        [kv_start + kv_offset, 0],
     )
     {%- else %}
-
-    # Load K as [BLOCK_N, QK_HEAD_DIM_ROUNDED] then transpose to [QK_HEAD_DIM_ROUNDED, BLOCK_N]
-    offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
-    offs_n_load = kv_base_offset + tl.arange(0, BLOCK_N)
-    k = load_checked_2d(K, offs_n_load, offs_k, stride_kn, stride_kk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, QK_HEAD_DIM)
+    k = load_checked_block(K_block_ptr, SAFE_HEAD_DIM, IS_DIVISIBLE)
     {%- endif %}

-    k = tl.trans(k)
+    if USE_TMA:
+        k = tl.trans(k)
     # -- compute qk ---
     qk = tl.dot(q, k, input_precision=FLOAT32_PRECISION) # TODO: use cuda matmul when q_len <= 2.
     if not PRESCALE_QK:
@@ -104,12 +98,10 @@ def forward_block_mn(
     {%- if USE_TMA %}
     v = tl.load_tensor_descriptor(
         desc_v,
-        [kv_base_offset, 0],
+        [kv_start + kv_offset, 0],
     )
     {%- else %}
-    # Calculate offsets for V loading - reuse kv_base_offset from K loading
-    offs_v = tl.arange(0, V_HEAD_DIM_ROUNDED)
-    v = load_checked_2d(V, offs_n_load, offs_v, stride_vn, stride_vk, IS_DIVISIBLE, SAFE_HEAD_DIM, KV_LEN, V_HEAD_DIM)
+    v = load_checked_block(V_block_ptr, IS_DIVISIBLE, SAFE_HEAD_DIM)
     {%- endif %}
     acc = tl.dot(p.to(MATMUL_PRECISION), v, acc, input_precision=FLOAT32_PRECISION)

@@ -121,7 +113,7 @@ def forward_block_mn(
 @triton.jit
 def forward_inner(
     {{gen_argdefs()}},
-    q, K, V,
+    q, K_block_ptr, V_block_ptr,
     desc_k, desc_v, Q_LEN, KV_LEN,
     # accumulated values
     acc, l_i, m_i,
@@ -135,8 +127,6 @@ def forward_inner(
     # start kv and end kv block
     block_n_start, block_n_end,
     MATMUL_PRECISION,
-    # Strides for K and V
-    stride_kk, stride_kn, stride_vn, stride_vk,
     IS_FULL_BLOCKS,
 ):
     # Redefines all kernel parameters (BLOCK_M, etc.) so we don't need to plumb them all through
@@ -156,7 +146,7 @@ def forward_inner(
     if IS_DIVISIBLE:
         acc, l_i, m_i = forward_block_mn(
             {{gen_argdefs()}},
-            q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
+            q, K_block_ptr, V_block_ptr, desc_k, desc_v, Q_LEN, KV_LEN,
             # accumulated values
             acc, l_i, m_i,
             # Offsets
@@ -165,8 +155,6 @@ def forward_inner(
             kv_start,
             kv_offset,
             MATMUL_PRECISION, RCP_LN2,
-            # Strides for K and V
-            stride_kk, stride_kn, stride_vn, stride_vk,
             IS_FULL_BLOCKS,
         )
     else:
@@ -176,7 +164,7 @@ def forward_inner(
         # to the last block because it's faster a lot.
         acc, l_i, m_i = forward_block_mn(
             {{gen_argdefs()}},
-            q, K, V, desc_k, desc_v, Q_LEN, KV_LEN,
+            q, K_block_ptr, V_block_ptr, desc_k, desc_v, Q_LEN, KV_LEN,
             # accumulated values
             acc, l_i, m_i,
             # Offsets
@@ -185,8 +173,6 @@ def forward_inner(
             kv_start,
             kv_offset,
             MATMUL_PRECISION, RCP_LN2,
-            # Strides for K and V
-            stride_kk, stride_kn, stride_vn, stride_vk,
             IS_FULL_BLOCKS, CHECK_BLOCK_BOUNDARY=True,
         )

@@ -199,6 +185,9 @@ def forward_inner(

         offs_n = offs_n + offset
         kv_offset += offset
+        if not USE_TMA:
+            K_block_ptr = tl.advance(K_block_ptr, (0, offset))
+            V_block_ptr = tl.advance(V_block_ptr, (offset, 0))


     return acc, l_i, m_i
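The non-TMA path restored above loads K and V through Triton block pointers (built with tl.make_block_ptr in flex_attention.py.jinja below) and walks them with tl.advance, instead of computing raw stride-based offsets with load_checked_2d. A minimal, self-contained sketch of that block-pointer pattern follows; the kernel, names, and shapes are hypothetical and not part of this commit, and the boundary-checked tl.load stands in for the template's load_checked_block helper.

import torch
import triton
import triton.language as tl

@triton.jit
def row_sum_kernel(X, Out, M, N, stride_m, stride_n,
                   BLOCK_M: tl.constexpr, BLOCK_N: tl.constexpr):
    pid_m = tl.program_id(0)
    # Block pointer over a [BLOCK_M, BLOCK_N] tile of X, analogous to K_block_ptr/V_block_ptr
    x_block_ptr = tl.make_block_ptr(
        base=X,
        shape=(M, N),
        strides=(stride_m, stride_n),
        offsets=(pid_m * BLOCK_M, 0),
        block_shape=(BLOCK_M, BLOCK_N),
        order=(1, 0),
    )
    acc = tl.zeros((BLOCK_M,), dtype=tl.float32)
    for _ in range(0, tl.cdiv(N, BLOCK_N)):
        # Boundary-checked load with zero padding, in the spirit of load_checked_block
        x = tl.load(x_block_ptr, boundary_check=(0, 1), padding_option="zero")
        acc += tl.sum(x, axis=1)
        # Advance the block pointer to the next column block, as the revert does for K/V
        x_block_ptr = tl.advance(x_block_ptr, (0, BLOCK_N))
    offs_m = pid_m * BLOCK_M + tl.arange(0, BLOCK_M)
    tl.store(Out + offs_m, acc, mask=offs_m < M)

x = torch.randn(1000, 777, device="cuda")
out = torch.empty(1000, device="cuda")
grid = (triton.cdiv(1000, 64),)
row_sum_kernel[grid](x, out, 1000, 777, x.stride(0), x.stride(1), BLOCK_M=64, BLOCK_N=128)
assert torch.allclose(out, x.sum(dim=1), atol=1e-3)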

torch/_inductor/kernel/flex/templates/flex_attention.py.jinja
Lines changed: 58 additions & 16 deletions

@@ -45,9 +45,9 @@

     MATMUL_PRECISION = Q.dtype.element_ty

-    q_start = tl.program_id(0).to(INDEX_DTYPE)
-    off_zq = tl.program_id(1).to(INDEX_DTYPE)
-    off_hq = tl.program_id(2).to(INDEX_DTYPE)
+    q_start = tl.program_id(0)
+    off_zq = tl.program_id(1)
+    off_hq = tl.program_id(2)

     # We support two cases for batch dimension. a) (ZKV == ZQ) where off_zkv = off_zq.
     # b) (ZKV == 1 and ZQ > 1) where KV is broadcasted along the batch dimension and off_zkv=0.
@@ -114,16 +114,27 @@
     sparse_hz_offset = sparse_idx_z * SPARSE_HQ + sparse_idx_hq
     sparse_kv_num_blks_offset = sparse_hz_offset * stride_kv_num_blks_h + q_start // SPARSE_Q_MULTIPLE
     sparse_kv_idx_offset = sparse_hz_offset * stride_kv_idx_h + (q_start // SPARSE_Q_MULTIPLE) * stride_kv_idx_m  # noqa: B950
+    K_block_ptr = None
+    V_block_ptr = None
+    Q_block_ptr = None
+
+    if not USE_TMA:
+        Q_block_ptr = tl.make_block_ptr(
+            base=Q,
+            shape=(Q_LEN, QK_HEAD_DIM),
+            strides=(stride_qm, stride_qk),
+            offsets=(q_start * BLOCK_M, 0),
+            block_shape=(BLOCK_M, QK_HEAD_DIM_ROUNDED),
+            order=(1, 0)
+        )

     {%- if USE_TMA %}
     q = tl.load_tensor_descriptor(
         desc_q,
         [(q_start * BLOCK_M).to(tl.int32), 0],
     )
     {%- else %}
-    offs_m = q_start * BLOCK_M + tl.arange(0, BLOCK_M)
-    offs_k = tl.arange(0, QK_HEAD_DIM_ROUNDED)
-    q = load_checked_2d(Q, offs_m, offs_k, stride_qm, stride_qk, IS_DIVISIBLE, SAFE_HEAD_DIM, Q_LEN, QK_HEAD_DIM)
+    q = load_checked_block(Q_block_ptr, IS_DIVISIBLE, SAFE_HEAD_DIM)
     {%- endif %}

     # ~~~~~~~~~~~~~~ normal blocks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -135,22 +146,38 @@
     block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))


-    # K and V pointers will be passed directly to forward_inner
+    if not USE_TMA:
+        K_block_ptr = tl.make_block_ptr(
+            base=K,
+            shape=(QK_HEAD_DIM, KV_LEN),
+            strides=(stride_kk, stride_kn),
+            offsets=(0, kv_start),
+            block_shape=(QK_HEAD_DIM_ROUNDED, BLOCK_N),
+            order=(0, 1)
+        )
+
+        V_block_ptr = tl.make_block_ptr(
+            base=V,
+            shape=(KV_LEN, V_HEAD_DIM),
+            strides=(stride_vn, stride_vk),
+            offsets=(kv_start, 0),
+            block_shape=(BLOCK_N, V_HEAD_DIM_ROUNDED),
+            order=(1, 0)
+        )

     offs_n = kv_start + tl.arange(0, BLOCK_N)


     acc, l_i, m_i = forward_inner(
         {{gen_argdefs()}},
-        q, K, V,
+        q, K_block_ptr, V_block_ptr,
         desc_k, desc_v, Q_LEN, KV_LEN,
         acc, l_i, m_i,
         off_zq, off_hq, offs_m[:, None], offs_n[None, :],
         kv_start,
         kv_indices, kv_num_blocks,
         0, block_n_end,
         MATMUL_PRECISION,
-        stride_kk, stride_kn, stride_vn, stride_vk,
         IS_FULL_BLOCKS=False,
     )

@@ -163,20 +190,35 @@
     kv_start = tl.load(kv_indices) * SPARSE_KV_BLOCK_SIZE # first kv block we're loading
     kv_num_blocks = tl.load(FULL_KV_NUM_BLKS + sparse_kv_num_blks_offset)
     block_n_end = tl.minimum(kv_num_blocks * SPARSE_KV_MULTIPLE, tl.maximum(tl.cdiv(KV_LEN, BLOCK_N), 1))
-    # K and V pointers will be passed directly to forward_inner
+    if not USE_TMA:
+        K_block_ptr = tl.make_block_ptr(
+            base=K,
+            shape=(QK_HEAD_DIM, KV_LEN),
+            strides=(stride_kk, stride_kn),
+            offsets=(0, kv_start),
+            block_shape=(QK_HEAD_DIM_ROUNDED, BLOCK_N),
+            order=(0, 1)
+        )
+        V_block_ptr = tl.make_block_ptr(
+            base=V,
+            shape=(KV_LEN, V_HEAD_DIM),
+            strides=(stride_vn, stride_vk),
+            offsets=(kv_start, 0),
+            block_shape=(BLOCK_N, V_HEAD_DIM_ROUNDED),
+            order=(1, 0)
+        )
     offs_n = kv_start + tl.arange(0, BLOCK_N)

     acc, l_i, m_i = forward_inner(
         {{gen_argdefs()}},
-        q, K, V,
+        q, K_block_ptr, V_block_ptr,
         desc_k, desc_v, Q_LEN, KV_LEN,
         acc, l_i, m_i,
         off_zq, off_hq, offs_m[:, None], offs_n[None, :],
         kv_start,
         kv_indices, kv_num_blocks,
         0, block_n_end,
         MATMUL_PRECISION,
-        stride_kk, stride_kn, stride_vn, stride_vk,
         IS_FULL_BLOCKS=True,
     )

@@ -187,10 +229,10 @@
     l_i = tl.where(l_i == 0.0, 1, l_i)

     acc = acc / l_i[:, None]
-    idx_zq = tl.program_id(1).to(INDEX_DTYPE)
-    idx_hq = tl.program_id(2).to(INDEX_DTYPE)
-    idx_m = offs_m[:, None].to(INDEX_DTYPE)
-    idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :].to(INDEX_DTYPE)
+    idx_zq = tl.program_id(1)
+    idx_hq = tl.program_id(2)
+    idx_m = offs_m[:, None]
+    idx_d = tl.arange(0, V_HEAD_DIM_ROUNDED)[None, :]

     mask = (idx_m < Q_LEN) & (idx_d < V_HEAD_DIM)
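The deletions above (and the matching ones in flex_backwards.py.jinja below) drop the .to(INDEX_DTYPE) casts that pytorch#157767 added so program ids and offsets participate in int64 arithmetic. A quick back-of-the-envelope check, reusing the shapes from the deleted test, shows why int32 offsets are too small at that size; this snippet is an illustrative sketch, not part of the commit.

import torch

B, H, S, D = 1, 64, 2**20, 64          # shapes used by the deleted test
max_linear_offset = B * H * S * D - 1  # largest element offset into q/k/v: 2**32 - 1
print(max_linear_offset > torch.iinfo(torch.int32).max)  # True -> int32 indexing overflows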

torch/_inductor/kernel/flex/templates/flex_backwards.py.jinja
Lines changed: 3 additions & 3 deletions

@@ -51,12 +51,12 @@

     MATMUL_PRECISION = Q.dtype.element_ty

-    pid = tl.program_id(0).to(INDEX_DTYPE)
+    pid = tl.program_id(0)
     NUM_KV_BLOCKS = tl.cdiv(KV_LEN, BLOCK_N1)
     NUM_Q_BLOCKS = tl.cdiv(Q_LEN, BLOCK_M2)

-    off_zq = tl.program_id(1).to(INDEX_DTYPE) # q batch idx
-    off_hkv = tl.program_id(2).to(INDEX_DTYPE) # kv head idx
+    off_zq = tl.program_id(1) # q batch idx
+    off_hkv = tl.program_id(2) # kv head idx
     off_zkv = off_zq % ZKV # kv batch idx

     SPARSE_Z = {{size("KV_NUM_BLKS", 0)}}
