Commit 9554e7e

Add support for torch.arange (pytorch#215)
1 parent f0c5176 commit 9554e7e

6 files changed: +204 -3 lines changed


helion/_compiler/compile_environment.py

Lines changed: 25 additions & 0 deletions
@@ -71,6 +71,7 @@ def __init__(self, device: torch.device, settings: Settings) -> None:
         )
         self.specialized_vars: set[sympy.Symbol] = set()
         self.loop_dependency_checker = LoopDependencyChecker()
+        self._symint_cache: dict[object, torch.SymInt] = {}
 
     def add_kernel_tensor_size(self, sizes: Sequence[int | torch.SymInt]) -> None:
         for size in sizes:
@@ -174,6 +175,30 @@ def create_unbacked_symint(self, hint: int = 8192) -> torch.SymInt:
         self.shape_env.var_to_val[sym._sympy_()] = sympy.sympify(hint)
         return sym
 
+    def cached_create_unbacked_symint(
+        self, key: Sequence[object], hint: int = 8192
+    ) -> torch.SymInt:
+        """Create an unbacked symint with caching based on a key.
+
+        This ensures that the same key always returns the same unbacked
+        symint, which is crucial to allow simplification of expressions
+        for things like tile_begin.
+
+        Args:
+            key: The cache key (should be sequence of hashables and unique for the desired symint)
+            hint: Hint value for the symint
+
+        Returns:
+            A consistent unbacked symint for the given key
+        """
+        # pyre-ignore[16]
+        key = tuple([x._sympy_() if hasattr(x, "_sympy_") else x for x in key])
+        result = self._symint_cache.get(key)
+        if result is None:
+            result = self.create_unbacked_symint(hint)
+            self._symint_cache[key] = result
+        return result
+
     def to_fake(self, obj: object, origin: Origin) -> object:
         if isinstance(obj, torch.Tensor):
             return self._to_fake_tensor(obj, origin.to_source())
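
The cache's contract is easiest to see in isolation. A minimal sketch of the intended behavior, assuming an active compile environment; `tile` is a placeholder for whatever hashable object the caller keys on:

    # Illustrative only: assumes a CompileEnvironment is current and `tile` is
    # some hashable key component (e.g. a tile's SymInt).
    env = CompileEnvironment.current()

    # Same key -> the same cached unbacked SymInt object is returned again, so
    # sympy expressions built from repeated tile_begin lookups can simplify
    # instead of accumulating fresh unknowns.
    a = env.cached_create_unbacked_symint(("tile_begin", tile))
    b = env.cached_create_unbacked_symint(("tile_begin", tile))
    assert a is b

    # A different key still gets its own fresh unbacked SymInt.
    c = env.cached_create_unbacked_symint(("tile_end", tile))
    assert c is not a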

helion/_compiler/inductor_lowering.py

Lines changed: 40 additions & 0 deletions
@@ -871,6 +871,20 @@ def __init__(self, graph: torch.fx.Graph, cg: GenerateAST) -> None:
         super().__init__(_LazyGraphModule({}, graph), garbage_collect_values=False)
         self.cg = cg
 
+    def to_ast(self, value: object) -> ast.AST:
+        """
+        Convert a value to an AST expression.
+        """
+        if isinstance(value, torch.fx.Node):
+            result = self.env[value]
+            assert isinstance(result, ast.AST)
+            return result
+        if isinstance(value, (int, float, bool)):
+            return create(ast.Constant, value=value)
+        if isinstance(value, ast.AST):
+            return value
+        raise TypeError(f"Unsupported value type for AST conversion: {type(value)}")
+
     def _collect_multi_outputs(
         self, node: Node, last_node_result: object
     ) -> tuple[object, ...]:
@@ -1018,3 +1032,29 @@ def add_statement(self, statement: ast.AST | str) -> None:
 
     def sympy_expr(self, expr: sympy.Expr) -> str:
         return self.codegen.device_function.sympy_expr(expr)
+
+
+# pyre-fixme[56]
+@register_lowering(torch.ops.prims.iota.default)
+def codegen_iota(ctx: GraphInterpreter, node: torch.fx.Node) -> object:
+    """Generate tl.arange for torch.ops.prims.iota.default operations."""
+    start = node.kwargs.get("start", 0)
+    step = node.kwargs.get("step", 1)
+    dtype = (
+        node.kwargs.get("dtype") or CompileEnvironment.current().settings.index_dtype
+    )
+    assert isinstance(dtype, torch.dtype)
+    (length_arg,) = node.args  # expecting a single argument for length
+    expr = "tl.arange(0, length)"
+    if step != 1:
+        expr = f"step * {expr}"
+    if start != 0:
+        expr = f"start + {expr}"
+    if dtype != torch.int32:
+        expr = f"({expr}).to({triton_type(dtype)})"
+    return expr_from_string(
+        expr,
+        start=ctx.to_ast(start),
+        step=ctx.to_ast(step),
+        length=ctx.to_ast(length_arg),
+    )
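
The lowering only assembles a template string and splices the operands in as AST nodes via to_ast. A stand-alone sketch of that template logic (the helper name build_iota_template is illustrative; the real code resolves the cast through triton_type and builds the AST with expr_from_string):

    import torch

    def build_iota_template(start, step, dtype):
        # Mirrors the string-building in codegen_iota above.
        expr = "tl.arange(0, length)"
        if step != 1:
            expr = f"step * {expr}"
        if start != 0:
            expr = f"start + {expr}"
        if dtype != torch.int32:
            expr = f"({expr}).to(tl.int64)"  # stand-in for triton_type(dtype)
        return expr

    # The three-argument test later in this commit (non-zero start, step=2,
    # int64 iota) ends up with exactly this shape:
    print(build_iota_template(start="mul", step=2, dtype=torch.int64))
    # -> "(start + step * tl.arange(0, length)).to(tl.int64)"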

helion/language/__init__.py

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@
 
 from .constexpr import ConstExpr as constexpr  # noqa: F401
 from .constexpr import specialize as specialize
+from .creation_ops import arange as arange
 from .creation_ops import full as full
 from .creation_ops import zeros as zeros
 from .device_print import device_print as device_print

helion/language/creation_ops.py

Lines changed: 24 additions & 0 deletions
@@ -78,3 +78,27 @@ def _(
     value = node.args[1]
     assert isinstance(value, (int, float, bool))
     return value
+
+
+def arange(
+    *args: int,
+    dtype: torch.dtype | None = None,
+    **kwargs: object,
+) -> torch.Tensor:
+    """
+    Same as `torch.arange()`, but defaults to same device as the current kernel.
+
+    Example usage:
+        hl.arange(tile.block_size)  # [0, 1, ..., tile.block_size - 1]
+        hl.arange(tile.begin, tile.begin + tile.block_size)  # same as tile.index
+    """
+    env = CompileEnvironment.current()
+    if dtype is None:
+        dtype = env.settings.index_dtype
+    return torch.arange(
+        *args,
+        # pyre-ignore[6]
+        **kwargs,
+        dtype=dtype,
+        device=env.device,
+    )
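
A minimal kernel sketch of the new helper, modeled on the tests added below; the kernel name and sizes are illustrative, not part of the library:

    import torch
    import helion
    import helion.language as hl

    @helion.kernel(use_default_config=True)
    def iota_rows(x: torch.Tensor) -> torch.Tensor:
        out = torch.zeros([x.size(0)], dtype=torch.int32, device=x.device)
        for tile in hl.tile(x.size(0)):
            # device and index dtype default to the current kernel's settings
            out[tile] = hl.arange(tile.begin, tile.begin + tile.block_size)
        return out

    # Expected result (per the two-argument test below):
    # out == torch.arange(x.size(0), dtype=torch.int32, device=x.device)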

helion/language/tile_ops.py

Lines changed: 7 additions & 3 deletions
@@ -60,7 +60,9 @@ def tile_begin(tile: Tile) -> int:
 @_decorators.register_fake(tile_begin)
 def _(tile: torch.SymInt) -> torch.SymInt:
     _disable_flatten_get_tile(tile)  # update config spec if needed
-    return CompileEnvironment.current().create_unbacked_symint()
+    return CompileEnvironment.current().cached_create_unbacked_symint(
+        ("tile_begin", tile)
+    )
 
 
 def _disable_flatten_get_tile(tile: object) -> int:
@@ -94,7 +96,9 @@ def tile_end(tile: Tile) -> int:
 @_decorators.register_fake(tile_end)
 def _(tile: torch.SymInt) -> torch.SymInt:
     _disable_flatten_get_tile(tile)  # update config spec if needed
-    return CompileEnvironment.current().create_unbacked_symint()
+    return CompileEnvironment.current().cached_create_unbacked_symint(
+        ("tile_end", tile)
+    )
 
 
 @_decorators.codegen(tile_end)
@@ -148,7 +152,7 @@ def tile_id(tile: Tile) -> int:
 @_decorators.register_fake(tile_id)
 def _(tile: torch.SymInt) -> torch.SymInt:
     assert isinstance(tile, torch.SymInt)
-    return CompileEnvironment.current().create_unbacked_symint()
+    return CompileEnvironment.current().cached_create_unbacked_symint(("tile_id", tile))
 
 
 @_decorators.codegen(tile_id)

test/test_indexing.py

Lines changed: 107 additions & 0 deletions
@@ -392,6 +392,113 @@ def fn(x: torch.Tensor) -> torch.Tensor:
         )
         torch.testing.assert_close(result, expected)
 
+    def test_arange_tile_block_size(self):
+        @helion.kernel(use_default_config=True)
+        def arange_from_block_size(x: torch.Tensor) -> torch.Tensor:
+            out = torch.zeros([x.size(0)], dtype=torch.int32, device=x.device)
+            for tile in hl.tile(x.size(0)):
+                # Test the exact pattern requested: torch.arange(tile.block_size, device=x.device)
+                out[tile] = torch.arange(tile.block_size, device=x.device)
+            return out
+
+        x = torch.randn([64], device=DEVICE)
+        code, result = code_and_output(
+            arange_from_block_size,
+            (x,),
+            block_size=16,
+        )
+        expected = torch.arange(16, dtype=torch.int32, device=DEVICE).repeat(4)
+        torch.testing.assert_close(result, expected)
+
+    def test_arange_two_args(self):
+        @helion.kernel(use_default_config=True)
+        def arange_two_args(x: torch.Tensor) -> torch.Tensor:
+            out = torch.zeros([x.size(0)], dtype=torch.int32, device=x.device)
+            for tile in hl.tile(x.size(0)):
+                # Test the exact pattern requested: torch.arange(tile.begin, tile.begin+tile.block_size, device=x.device)
+                out[tile] = torch.arange(
+                    tile.begin, tile.begin + tile.block_size, device=x.device
+                )
+            return out
+
+        x = torch.randn([64], device=DEVICE)
+        code, result = code_and_output(
+            arange_two_args,
+            (x,),
+            block_size=16,
+        )
+        expected = torch.arange(64, dtype=torch.int32, device=DEVICE)
+        torch.testing.assert_close(result, expected)
+
+    def test_arange_three_args_step(self):
+        @helion.kernel(config={"block_size": 8})
+        def arange_three_args_step(x: torch.Tensor) -> torch.Tensor:
+            out = torch.zeros([x.size(0) // 2], dtype=torch.int32, device=x.device)
+            for tile in hl.tile(x.size(0) // 2):
+                # Test the exact pattern requested: torch.arange(start, end, step=2, device=x.device)
+                start_idx = tile.begin * 2
+                end_idx = (tile.begin + tile.block_size) * 2
+                out[tile] = torch.arange(start_idx, end_idx, step=2, device=x.device)
+            return out
+
+        x = torch.randn([64], device=DEVICE)
+        code, result = code_and_output(
+            arange_three_args_step,
+            (x,),
+        )
+        expected = torch.arange(0, 64, step=2, dtype=torch.int32, device=DEVICE)
+        torch.testing.assert_close(result, expected)
+        self.assertExpectedInline(
+            code,
+            """\
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _arange_three_args_step_kernel(out, out_size_0, out_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < out_size_0
+    mul = 2 * offset_0
+    iota = (mul + 2 * tl.arange(0, _BLOCK_SIZE_0)).to(tl.int64)
+    v_0 = iota.to(tl.int32)
+    tl.store(out + indices_0 * out_stride_0, v_0, mask_0)
+
+def arange_three_args_step(x: torch.Tensor):
+    out = torch.zeros([x.size(0) // 2], dtype=torch.int32, device=x.device)
+    _BLOCK_SIZE_0 = 8
+    _arange_three_args_step_kernel[triton.cdiv(out.size(0), _BLOCK_SIZE_0),](out, out.size(0), out.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+def _arange_three_args_step_make_precompiler(x: torch.Tensor):
+    out = torch.zeros([x.size(0) // 2], dtype=torch.int32, device=x.device)
+    _BLOCK_SIZE_0 = 8
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_arange_three_args_step_kernel)(out, out.size(0), out.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
+        )
+
+    def test_arange_hl_alias(self):
+        @helion.kernel(config={"block_size": 8})
+        def arange_three_args_step(x: torch.Tensor) -> torch.Tensor:
+            out = torch.zeros([x.size(0) // 2], dtype=torch.int32, device=x.device)
+            for tile in hl.tile(x.size(0) // 2):
+                start_idx = tile.begin * 2
+                end_idx = (tile.begin + tile.block_size) * 2
+                out[tile] = hl.arange(start_idx, end_idx, step=2)
+            return out
+
+        x = torch.randn([64], device=DEVICE)
+        code, result = code_and_output(
+            arange_three_args_step,
+            (x,),
+        )
+        expected = torch.arange(0, 64, step=2, dtype=torch.int32, device=DEVICE)
+        torch.testing.assert_close(result, expected)
+
 
 if __name__ == "__main__":
     unittest.main()
