Commit f0c5176

[RFC] Add static loop unrolling (#216)
1 parent 6fe440c commit f0c5176

File tree

4 files changed (+211, -1 lines)
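
Before the per-file diffs, here is the transformation in a nutshell. The following before/after pair is illustrative only (it is not taken from the commit); it shows the source-level rewrite the pass performs when a loop iterates over a static list or tuple literal:

# Illustrative only, not from the diff: the source-level effect of the pass.
def before(x: int) -> int:
    for i in (1, 2, 3):  # iterator is a static tuple literal
        x = x + i
    return x

# After unrolling: the loop variable becomes explicit assignments and the
# body is duplicated once per element, with no loop left behind.
def after(x: int) -> int:
    i = 1
    x = x + i
    i = 2
    x = x + i
    i = 3
    x = x + i
    return x

assert before(0) == after(0) == 6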

helion/_compiler/host_function.py

Lines changed: 2 additions & 0 deletions
@@ -100,8 +100,10 @@ def __init__(
         HostFunction.validate_ast(root)
 
         from .device_ir import lower_to_device_ir
+        from .static_loop_unroller import unroll_static_loops
         from .type_propagation import propagate_types
 
+        unroll_static_loops(self)
         propagate_types(self, fake_args)
         env.finalize_config_spec()
         self.device_ir = lower_to_device_ir(self)
helion/_compiler/static_loop_unroller.py

Lines changed: 81 additions & 0 deletions

@@ -0,0 +1,81 @@
+from __future__ import annotations
+
+import ast
+from typing import TYPE_CHECKING
+from typing import NoReturn
+
+from .ast_extension import create
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+
+    from .host_function import HostFunction
+
+
+class CannotUnrollLoop(Exception):
+    pass
+
+
+class StaticLoopUnroller(ast.NodeTransformer):
+    """
+    A compiler optimization pass that unrolls static for loops.
+
+    TODO(oulgen): This pass is primitive; it does not handle for.orelse, break, continue, etc.
+    """
+
+    def visit_For(self, node: ast.For) -> ast.AST | list[ast.AST]:
+        # Generic visit to handle nested loops
+        node = self.generic_visit(node)  # pyre-ignore[9]
+
+        # Check if this is a static loop that can be unrolled
+        if static_values := self._extract_static_values(node.iter):
+            return self._unroll_loop(node, static_values)
+
+        return node
+
+    def visit_Break(self, node: ast.Break) -> NoReturn:
+        raise CannotUnrollLoop
+
+    def visit_Continue(self, node: ast.Continue) -> NoReturn:
+        raise CannotUnrollLoop
+
+    def _extract_static_values(self, iter_node: ast.expr) -> list[ast.expr] | None:
+        """
+        Check if the iterator is static, and if so, extract those values.
+        """
+        if isinstance(iter_node, (ast.List, ast.Tuple)):
+            return iter_node.elts
+        return None
+
+    def _unroll_loop(
+        self, loop_node: ast.For, static_values: Sequence[ast.AST]
+    ) -> ast.AST | list[ast.AST]:
+        unrolled_statements = []
+
+        for value in static_values:
+            assignment = create(
+                ast.Assign,
+                targets=[loop_node.target],
+                value=value,
+            )
+            unrolled_statements.append(assignment)
+
+            # TODO(oulgen): Should we deepcopy these to avoid reference issues?
+            unrolled_statements.extend(loop_node.body)
+
+        if loop_node.orelse:
+            raise CannotUnrollLoop
+        return unrolled_statements
+
+
+def unroll_static_loops(func: HostFunction) -> None:
+    new_body = []
+    for stmt in func.body:
+        try:
+            unrolled_stmts = StaticLoopUnroller().visit(stmt)
+        except CannotUnrollLoop:
+            new_body.append(stmt)
+        else:
+            assert isinstance(unrolled_stmts, ast.stmt)
+            new_body.append(unrolled_stmts)
+    func.body = new_body
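
The committed pass relies on Helion internals (the `create` AST helper and `HostFunction`), so it cannot run in isolation. Below is a minimal standalone approximation of the same technique using only the standard library; the name `ToyUnroller` is invented for illustration, and unlike the committed code it deep-copies the loop body, sidestepping the shared-reference question raised in the TODO:

from __future__ import annotations

import ast
import copy


class ToyUnroller(ast.NodeTransformer):
    """Standalone approximation of StaticLoopUnroller; not Helion's API."""

    def visit_For(self, node: ast.For) -> ast.AST | list[ast.stmt]:
        node = self.generic_visit(node)  # handle nested loops first
        if node.orelse or not isinstance(node.iter, (ast.List, ast.Tuple)):
            return node  # not statically unrollable; leave the loop alone
        unrolled: list[ast.stmt] = []
        for value in node.iter.elts:
            # Emit `target = <element>`, then a private copy of the body.
            unrolled.append(
                ast.Assign(targets=[copy.deepcopy(node.target)], value=value)
            )
            unrolled.extend(copy.deepcopy(node.body))
        return unrolled


tree = ast.parse("def f(x):\n    for i in [1, 2, 3]:\n        x += i\n    return x")
tree = ast.fix_missing_locations(ToyUnroller().visit(tree))
print(ast.unparse(tree))
# def f(x):
#     i = 1
#     x += i
#     i = 2
#     x += i
#     i = 3
#     x += i
#     return x

The real pass's driver adds one safeguard this sketch omits: raising CannotUnrollLoop from visit_Break and visit_Continue and catching it per top-level statement lets unroll_static_loops fall back to the original statement wholesale, rather than emit incorrect straight-line code for loops with early exits.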

test/test_errors.py

Lines changed: 7 additions & 1 deletion
@@ -1,5 +1,7 @@
 from __future__ import annotations
 
+import unittest
+
 from expecttest import TestCase
 import torch
 
@@ -118,9 +120,13 @@ def fn(x: torch.Tensor) -> torch.Tensor:
             batch = x.size(0)
             out = x.new_empty(batch)
             for tile_batch in hl.tile(batch):
-                for i in [1, 2, 3]:
+                for i in {1: None, 2: None, 3: None}:
                     out[tile_batch] = x[tile_batch] + i
             return out
 
         with self.assertRaises(helion.exc.InvalidDeviceForLoop):
             code_and_output(fn, (torch.randn(8, device=DEVICE),))
+
+
+if __name__ == "__main__":
+    unittest.main()
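
The fixture swap from `[1, 2, 3]` to a dict literal looks odd without context: the unroller now rewrites loops over literal lists and tuples before type propagation runs, so the old fixture would presumably be flattened away and never reach the `helion.exc.InvalidDeviceForLoop` check. A dict iterator does not match `_extract_static_values`, which keeps the error path exercised. A standalone sketch of the classification rule (assumed equivalent to the committed check):

import ast

samples = [
    "for i in [1, 2, 3]: pass",
    "for i in (a, b, c): pass",
    "for i in {1: None, 2: None, 3: None}: pass",
    "for i in range(3): pass",
]
for src in samples:
    iter_node = ast.parse(src).body[0].iter
    static = isinstance(iter_node, (ast.List, ast.Tuple))
    print(f"{src:48} -> {'unrolled' if static else 'left as a loop'}")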

test/test_loops.py

Lines changed: 121 additions & 0 deletions
@@ -1378,6 +1378,127 @@ def _chebyshev_kernel_make_precompiler(x: torch.Tensor, w: torch.Tensor):
     return make_precompiler(_chebyshev_kernel_kernel)(x, w, out, out.stride(0), out.stride(1), w.stride(0), w.stride(1), x.stride(0), x.stride(1), B, C, _BLOCK_SIZE_0, _BLOCK_SIZE_1, num_warps=4, num_stages=3)""",
     )
 
+    def test_loop_unroll1(self):
+        @helion.kernel()
+        def fn(x: torch.Tensor) -> torch.Tensor:
+            out = torch.zeros_like(x)
+            for tile in hl.tile(x.size()):
+                out[tile] = x[tile]
+                for i in [1, 2, 3]:
+                    out[tile] += i
+            return out
+
+        x = torch.randn(4, device=DEVICE)
+        code, output = code_and_output(fn, (x,))
+        torch.testing.assert_close(output, x + 6)
+        self.assertExpectedInline(
+            code,
+            """\
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _fn_kernel(x, out, x_size_0, out_stride_0, x_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    load = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    tl.store(out + indices_0 * out_stride_0, load, mask_0)
+    load_1 = tl.load(out + indices_0 * out_stride_0, mask_0, other=0)
+    v_0 = 1.0
+    v_1 = load_1 + v_0
+    tl.store(out + indices_0 * out_stride_0, v_1, mask_0)
+    load_2 = tl.load(out + indices_0 * out_stride_0, mask_0, other=0)
+    v_2 = 2.0
+    v_3 = load_2 + v_2
+    tl.store(out + indices_0 * out_stride_0, v_3, mask_0)
+    load_3 = tl.load(out + indices_0 * out_stride_0, mask_0, other=0)
+    v_4 = 3.0
+    v_5 = load_3 + v_4
+    tl.store(out + indices_0 * out_stride_0, v_5, mask_0)
+
+def fn(x: torch.Tensor):
+    out = torch.zeros_like(x)
+    _BLOCK_SIZE_0 = 4
+    _fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, out, x.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+def _fn_make_precompiler(x: torch.Tensor):
+    out = torch.zeros_like(x)
+    _BLOCK_SIZE_0 = 4
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_fn_kernel)(x, out, x.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
+        )
+
+    def test_loop_unroll2(self):
+        @helion.kernel()
+        def fn(x: torch.Tensor) -> torch.Tensor:
+            out = torch.zeros_like(x)
+            a = 1
+            b = 2
+            c = 3
+            for tile in hl.tile(x.size()):
+                out[tile] = x[tile]
+                for i in (a, b, c):
+                    out[tile] += i
+            return out
+
+        x = torch.randn(4, device=DEVICE)
+        code, output = code_and_output(fn, (x,))
+        torch.testing.assert_close(output, x + 6)
+        self.assertExpectedInline(
+            code,
+            """\
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+
+@triton.jit
+def _fn_kernel(x, out, x_size_0, out_stride_0, x_stride_0, _BLOCK_SIZE_0: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < x_size_0
+    load = tl.load(x + indices_0 * x_stride_0, mask_0, other=0)
+    tl.store(out + indices_0 * out_stride_0, load, mask_0)
+    load_1 = tl.load(out + indices_0 * out_stride_0, mask_0, other=0)
+    v_0 = 1.0
+    v_1 = load_1 + v_0
+    tl.store(out + indices_0 * out_stride_0, v_1, mask_0)
+    load_2 = tl.load(out + indices_0 * out_stride_0, mask_0, other=0)
+    v_2 = 2.0
+    v_3 = load_2 + v_2
+    tl.store(out + indices_0 * out_stride_0, v_3, mask_0)
+    load_3 = tl.load(out + indices_0 * out_stride_0, mask_0, other=0)
+    v_4 = 3.0
+    v_5 = load_3 + v_4
+    tl.store(out + indices_0 * out_stride_0, v_5, mask_0)
+
+def fn(x: torch.Tensor):
+    out = torch.zeros_like(x)
+    a = 1
+    b = 2
+    c = 3
+    _BLOCK_SIZE_0 = 4
+    _fn_kernel[triton.cdiv(x.size(0), _BLOCK_SIZE_0),](x, out, x.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)
+    return out
+
+def _fn_make_precompiler(x: torch.Tensor):
+    out = torch.zeros_like(x)
+    a = 1
+    b = 2
+    c = 3
+    _BLOCK_SIZE_0 = 4
+    from helion.runtime.precompile_shim import make_precompiler
+    return make_precompiler(_fn_kernel)(x, out, x.size(0), out.stride(0), x.stride(0), _BLOCK_SIZE_0, num_warps=4, num_stages=3)""",
+        )
+
 
 if __name__ == "__main__":
     unittest.main()
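
Note that both tests pin byte-identical Triton output: three straight-line load/add/store rounds with the constants 1.0, 2.0, 3.0 folded in and no loop emitted, whether the values appear as literals or as the host variables `a`, `b`, `c` (presumably resolved to constants during later type propagation, since the unroller only requires the iterator itself to be a literal tuple). An eager-mode restatement of what the kernels compute and the tests assert:

import torch

x = torch.randn(4)
out = torch.zeros_like(x)
out[:] = x             # out[tile] = x[tile]
for i in (1, 2, 3):    # unrolled by the compiler; run eagerly here
    out += i           # out[tile] += i
torch.testing.assert_close(out, x + 6)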
