Fix assertion failure when _get_symnode codegen sees a plain int in a rolled reduction (#354)

jansel · web-flow · commit d593642c1a7b · 2025-07-23T07:20:01.000-07:00
Fixes #345
diff --git a/helion/language/_tracing_ops.py b/helion/language/_tracing_ops.py
@@ -42,6 +42,11 @@ def _get_symnode(debug_name: str) -> int:
 @_decorators.codegen(_get_symnode)
 def _(state: CodegenState) -> ast.AST:
     val = state.fx_node.meta["val"]  # pyright: ignore[reportOptionalMemberAccess]
+
+    # Handle the case where val is a regular integer (e.g., from reduction_loops config)
+    if isinstance(val, int):
+        return expr_from_string(str(val))
+
     assert isinstance(val, (torch.SymInt, torch.SymFloat, torch.SymBool)), val
     if (block_idx := CompileEnvironment.current().get_block_id(val)) is not None:  # pyright: ignore[reportArgumentType]
         block_size_var = state.device_function.block_size_var(block_idx)
diff --git a/test/test_reductions.expected b/test/test_reductions.expected
@@ -153,6 +153,75 @@ def reduce_kernel(x: torch.Tensor, fn: Callable[[torch.Tensor], torch.Tensor], o
     _launcher(_reduce_kernel_kernel, (triton.cdiv(n, _BLOCK_SIZE_0),), x, out, out.size(0), x.size(0), x.size(1), out.stride(0), x.stride(0), x.stride(1), _m, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=3)
     return out
 
+--- assertExpectedJournal(TestReductions.test_reduction_loops_integer_values)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_compat import libdevice
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _layer_norm_reduction_kernel(bias, x, weight, out, bias_size_0, bias_stride_0, out_stride_0, out_stride_1, weight_stride_0, x_stride_0, x_stride_1, m, eps, _BLOCK_SIZE_0: tl.constexpr, _REDUCTION_BLOCK_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    var_mean_extra_acc = tl.full([_BLOCK_SIZE_0, _REDUCTION_BLOCK_1], 0, tl.float32)
+    for roffset_1 in tl.range(0, bias_size_0, _REDUCTION_BLOCK_1):
+        rindex_1 = roffset_1 + tl.arange(0, _REDUCTION_BLOCK_1).to(tl.int32)
+        mask_1 = rindex_1 < bias_size_0
+        load = tl.load(x + (indices_0[:, None] * x_stride_0 + rindex_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+        v_0 = load.to(tl.float32)
+        v_1 = var_mean_extra_acc + v_0
+        var_mean_extra_acc = v_1
+    var_mean_extra = tl.reshape(tl.sum(var_mean_extra_acc, 1), [_BLOCK_SIZE_0, 1])
+    v_2 = var_mean_extra / bias_size_0.to(tl.float32)
+    _mask_to_1 = tl.where(tl.broadcast_to(mask_0[:, None], [_BLOCK_SIZE_0, 1]), v_2, 0)
+    var_mean_extra_2_acc = tl.full([_BLOCK_SIZE_0, _REDUCTION_BLOCK_1], 0, tl.float32)
+    for roffset_1 in tl.range(0, bias_size_0, _REDUCTION_BLOCK_1):
+        rindex_1 = roffset_1 + tl.arange(0, _REDUCTION_BLOCK_1).to(tl.int32)
+        mask_1 = rindex_1 < bias_size_0
+        _mask_to_1_copy = _mask_to_1
+        load_1 = tl.load(x + (indices_0[:, None] * x_stride_0 + rindex_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+        v_3 = load_1.to(tl.float32)
+        v_4 = v_3 - _mask_to_1_copy
+        v_5 = v_4 * v_4
+        v_6 = var_mean_extra_2_acc + v_5
+        var_mean_extra_2_acc = v_6
+    var_mean_extra_2 = tl.reshape(tl.sum(var_mean_extra_2_acc, 1), [_BLOCK_SIZE_0, 1])
+    v_7 = var_mean_extra_2 / bias_size_0.to(tl.float32)
+    v_8 = v_7 + eps
+    v_9 = libdevice.rsqrt(v_8)
+    for roffset_1 in tl.range(0, bias_size_0, _REDUCTION_BLOCK_1):
+        rindex_1 = roffset_1 + tl.arange(0, _REDUCTION_BLOCK_1).to(tl.int32)
+        mask_1 = rindex_1 < bias_size_0
+        v_2_copy = v_2
+        v_9_copy = v_9
+        load_2 = tl.load(x + (indices_0[:, None] * x_stride_0 + rindex_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+        v_10 = load_2.to(tl.float32)
+        v_11 = v_10 - v_2_copy
+        v_12 = v_11 * v_9_copy
+        load_3 = tl.load(weight + rindex_1 * weight_stride_0, mask_1, other=0)
+        v_13 = load_3.to(tl.float32)
+        v_14 = v_13[None, :]
+        v_15 = v_12 * v_14
+        load_4 = tl.load(bias + rindex_1 * bias_stride_0, mask_1, other=0)
+        v_16 = load_4.to(tl.float32)
+        v_17 = v_16[None, :]
+        v_18 = v_15 + v_17
+        v_19 = v_18.to(tl.float16)
+        tl.store(out + (indices_0[:, None] * out_stride_0 + rindex_1[None, :] * out_stride_1), v_19, mask_0[:, None] & mask_1[None, :])
+
+def layer_norm_reduction(x: torch.Tensor, weight: torch.Tensor, bias: torch.Tensor, eps: float=1e-05, *, _launcher=_default_launcher):
+    m, n = x.size()
+    out = torch.empty([m, n], dtype=torch.float16, device=x.device)
+    _BLOCK_SIZE_0 = 32
+    _REDUCTION_BLOCK_1 = 4
+    _launcher(_layer_norm_reduction_kernel, (triton.cdiv(m, _BLOCK_SIZE_0),), bias, x, weight, out, bias.size(0), bias.stride(0), out.stride(0), out.stride(1), weight.stride(0), x.stride(0), x.stride(1), m, eps, _BLOCK_SIZE_0, _REDUCTION_BLOCK_1, num_warps=4, num_stages=3)
+    return out
+
 --- assertExpectedJournal(TestReductions.test_sum)
 from __future__ import annotations
 
diff --git a/test/test_reductions.py b/test/test_reductions.py
@@ -135,6 +135,59 @@ def test_argmin_argmax_looped(self):
             torch.testing.assert_close(output, args[1](args[0], dim=-1))
         self.assertExpectedJournal(code)
 
+    def test_reduction_loops_integer_values(self):
+        """Check layer norm compiles and matches PyTorch for integer reduction_loop values (issue #345)."""
+
+        @helion.kernel(use_default_config=True)
+        def layer_norm_reduction(
+            x: torch.Tensor,
+            weight: torch.Tensor,
+            bias: torch.Tensor,
+            eps: float = 1e-5,
+        ) -> torch.Tensor:
+            m, n = x.size()
+            out = torch.empty([m, n], dtype=torch.float16, device=x.device)
+
+            for tile_m in hl.tile(m):
+                acc = x[tile_m, :].to(torch.float32)
+                var, mean = torch.var_mean(acc, dim=-1, keepdim=True, correction=0)
+                normalized = (acc - mean) * torch.rsqrt(var + eps)
+                result = normalized * (weight[:].to(torch.float32)) + (
+                    bias[:].to(torch.float32)
+                )
+                out[tile_m, :] = result
+            return out
+
+        x = torch.randn([32, 64], device=DEVICE, dtype=torch.float16)
+        weight = torch.randn([64], device=DEVICE, dtype=torch.float16)
+        bias = torch.randn([64], device=DEVICE, dtype=torch.float16)
+        eps = 1e-4
+
+        args = (x, weight, bias, eps)
+
+        # Reference from PyTorch's layer_norm; independent of reduction_loop, so compute it once up front
+        expected = torch.nn.functional.layer_norm(
+            x.float(), [64], weight.float(), bias.float(), eps
+        ).half()
+
+        # Test various reduction_loops configurations that previously failed
+        for reduction_loop_value in [2, 4, 8]:
+            with self.subTest(reduction_loop=reduction_loop_value):
+                _, output = code_and_output(
+                    layer_norm_reduction,
+                    args,
+                    block_size=32,
+                    reduction_loop=reduction_loop_value,
+                )
+
+                torch.testing.assert_close(output, expected, rtol=1e-2, atol=1e-2)
+
+        # Only check the generated code for one configuration to avoid redundant expected outputs
+        code, _ = code_and_output(
+            layer_norm_reduction, args, block_size=32, reduction_loop=4
+        )
+        self.assertExpectedJournal(code)
+
 
 if __name__ == "__main__":
     unittest.main()