Only use Tensor Descriptor indexing with appropriate shapes (#360)

PaulZhang12 · web-flow · commit 3288b245dc78 · 2025-07-24T14:37:04.000-04:00
diff --git a/helion/_compiler/indexing_strategy.py b/helion/_compiler/indexing_strategy.py
@@ -8,7 +8,6 @@
 
 import sympy
 import torch
-import triton
 
 from .. import exc
 from .._compat import get_tensor_descriptor_fn_name
@@ -186,10 +185,23 @@ def is_supported(
             return False
 
         def valid_block_size(
-            block_size: int | torch.SymInt | None, stride: int | torch.SymInt
+            block_size: int | torch.SymInt | None, stride: int | torch.SymInt, idx: int
         ) -> bool:
             if not isinstance(block_size, int):
                 return False
+
+            if (
+                get_tensor_descriptor_fn_name()
+                == "tl._experimental_make_tensor_descriptor"
+            ):
+                # https://github.com/triton-lang/triton/blob/d654e0f2d91f07496454e0fcbec2a9b97df37d47/python/triton/language/semantic.py#L1162
+                threshold = 32 // fake_tensor.dtype.itemsize
+                if idx == 0:
+                    threshold = min(8, threshold)
+
+                if fake_tensor.ndim == 2 and block_size < threshold:
+                    return False
+
             # was getting some IMAs with small block sizes even in non-stride 1 dims
             return block_size * element_size >= 16 or (block_size == 1 and stride != 1)
 
@@ -198,34 +210,22 @@ def valid_block_size(
         strides = fake_tensor.stride()
         size_stride = collections.deque(zip(sizes, strides, strict=True))
         config = DeviceFunction.current().config
-        for k in subscript:
+        for i, k in enumerate(subscript):
             if k is None:
                 continue
             size, stride = size_stride.popleft()
             if str(k) == "slice(None, None, None)":
                 block_size = env.allocate_reduction_dimension(size).from_config(config)
-                if not valid_block_size(block_size, stride):
+                if not valid_block_size(block_size, stride, i):
                     return False
             elif isinstance(k, torch.SymInt):
                 block_id = env.get_block_id(k)
                 if block_id is None:
                     return False
                 block_size = env.block_sizes[block_id].from_config(config)
-                if not valid_block_size(block_size, stride):
+                if not valid_block_size(block_size, stride, i):
                     return False
 
-        # 5) Extra requirement for experimental version
-        if get_tensor_descriptor_fn_name() == "tl._experimental_make_tensor_descriptor":
-            # NOTE: There's no clean way to convert a torch.dtype to triton.dtype
-            # This is improved in triton 3.4 but tl._experimental_make_tensor_descriptor
-            # is only available on <= triton 3.3
-            primitive_bitwidth = getattr(
-                triton.language, str(fake_tensor.dtype).split(".")[-1]
-            ).primitive_bitwidth
-            if env.size_hint(sizes[1]) < (32 // primitive_bitwidth) * 8:
-                # https://github.com/triton-lang/triton/blob/d654e0f2d91f07496454e0fcbec2a9b97df37d47/python/triton/language/semantic.py#L1162
-                return False
-
         return True
 
     def codegen_load(
diff --git a/test/test_indexing.expected b/test/test_indexing.expected
@@ -261,3 +261,65 @@ def pairwise_add(x: torch.Tensor, *, _launcher=_default_launcher):
     _BLOCK_SIZE_0 = 32
     _launcher(_pairwise_add_kernel, (triton.cdiv(out.size(0), _BLOCK_SIZE_0),), out, x, out.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
     return out
+
+--- assertExpectedJournal(TestIndexing.test_reduction_tensor_descriptor_indexing_block_size)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _reduction_sum_kernel(x, out, out_stride_0, x_stride_0, x_stride_1, m, _, _BLOCK_SIZE_0: tl.constexpr, _RDIM_SIZE_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    indices_1 = tl.arange(0, _RDIM_SIZE_1).to(tl.int32)
+    mask_1 = indices_1 < _
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    sum_1 = tl.sum(load, 1)
+    tl.store(out + indices_0 * out_stride_0, sum_1, mask_0)
+
+def reduction_sum(x: torch.Tensor, *, _launcher=_default_launcher):
+    m, _ = x.size()
+    out = torch.empty([m], device=x.device, dtype=x.dtype)
+    _BLOCK_SIZE_0 = 4
+    _RDIM_SIZE_1 = triton.next_power_of_2(_)
+    _launcher(_reduction_sum_kernel, (triton.cdiv(m, _BLOCK_SIZE_0),), x, out, out.stride(0), x.stride(0), x.stride(1), m, _, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=3)
+    return out
+
+--- assertExpectedJournal(TestIndexing.test_reduction_tensor_descriptor_indexing_reduction_loop)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _reduction_sum_kernel(x, out, out_stride_0, x_stride_0, x_stride_1, m, _, _BLOCK_SIZE_0: tl.constexpr, _REDUCTION_BLOCK_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    sum_1_acc = tl.full([_BLOCK_SIZE_0, _REDUCTION_BLOCK_1], 0, tl.float32)
+    for roffset_1 in tl.range(0, _, _REDUCTION_BLOCK_1):
+        rindex_1 = roffset_1 + tl.arange(0, _REDUCTION_BLOCK_1).to(tl.int32)
+        mask_1 = rindex_1 < _
+        load = tl.load(x + (indices_0[:, None] * x_stride_0 + rindex_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+        v_0 = load.to(tl.float32)
+        v_1 = sum_1_acc + v_0
+        sum_1_acc = v_1
+    sum_1 = tl.sum(sum_1_acc, 1)
+    v_2 = sum_1.to(tl.float16)
+    tl.store(out + indices_0 * out_stride_0, v_2, mask_0)
+
+def reduction_sum(x: torch.Tensor, *, _launcher=_default_launcher):
+    m, _ = x.size()
+    out = torch.empty([m], device=x.device, dtype=x.dtype)
+    _BLOCK_SIZE_0 = 8
+    _REDUCTION_BLOCK_1 = 8
+    _launcher(_reduction_sum_kernel, (triton.cdiv(m, _BLOCK_SIZE_0),), x, out, out.stride(0), x.stride(0), x.stride(1), m, _, _BLOCK_SIZE_0, _REDUCTION_BLOCK_1, num_warps=4, num_stages=3)
+    return out
diff --git a/test/test_indexing.py b/test/test_indexing.py
@@ -29,6 +29,16 @@ def broadcast_add_3d(
     return out
 
 
+@helion.kernel
+def reduction_sum(x: torch.Tensor) -> torch.Tensor:
+    m, _ = x.size()
+    out = torch.empty([m], device=x.device, dtype=x.dtype)
+    for tile in hl.tile(x.size(0)):
+        out[tile] = x[tile, :].to(torch.float32).sum(-1).to(x.dtype)
+
+    return out
+
+
 class TestIndexing(TestCase):
     def test_arange(self):
         @helion.kernel
@@ -385,6 +395,49 @@ def test_broadcasting_tensor_descriptor_indexing(self):
         torch.testing.assert_close(result, expected)
         self.assertExpectedJournal(code)
 
+    @unittest.skipIf(not supports_tensor_descriptor(), "TensorDescriptor not supported")
+    @unittest.skipIf(
+        get_tensor_descriptor_fn_name() != "tl._experimental_make_tensor_descriptor",
+        "Not using experimental tensor descriptor",
+    )
+    def test_reduction_tensor_descriptor_indexing_block_size(self):
+        x = torch.randn([64, 64], dtype=torch.float32, device=DEVICE)
+
+        # block_size=[4] is below the minimum block size tensor_descriptor accepts here,
+        # so codegen should fall back to default pointer indexing
+        code, result = code_and_output(
+            reduction_sum,
+            (x,),
+            indexing="tensor_descriptor",
+            block_size=[4],
+        )
+
+        expected = torch.sum(x, dim=1)
+        torch.testing.assert_close(result, expected)
+        self.assertExpectedJournal(code)
+
+    @unittest.skipIf(not supports_tensor_descriptor(), "TensorDescriptor not supported")
+    @unittest.skipIf(
+        get_tensor_descriptor_fn_name() != "tl._experimental_make_tensor_descriptor",
+        "Not using experimental tensor descriptor",
+    )
+    def test_reduction_tensor_descriptor_indexing_reduction_loop(self):
+        x = torch.randn([64, 256], dtype=torch.float16, device=DEVICE)
+
+        # Given reduction_loops=[8] on float16, the reduction block is too small for tensor_descriptor
+        # so codegen should fall back to default pointer indexing
+        code, result = code_and_output(
+            reduction_sum,
+            (x,),
+            indexing="tensor_descriptor",
+            block_size=[8],
+            reduction_loops=[8],
+        )
+
+        expected = torch.sum(x, dim=1)
+        torch.testing.assert_close(result, expected)
+        self.assertExpectedJournal(code)
+
 
 if __name__ == "__main__":
     unittest.main()