@@ -883,6 +883,59 @@ def jagged_mean_kernel(x_data: torch.Tensor, x_offsets: torch.Tensor, x_feature_
     _launcher(_jagged_mean_kernel_kernel, (triton.cdiv(num_rows, _BLOCK_SIZE_0),), x_offsets, x_feature_counts, x_flat, out, out.stride(0), out.stride(1), x_feature_counts.stride(0), x_flat.stride(0), x_offsets.stride(0), num_rows, max_M, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=4, num_stages=3)
     return out

+--- assertExpectedJournal(TestExamples.test_layernorm)
+from __future__ import annotations
+
+import torch
+import triton
+import triton.language as tl
+from torch._inductor.runtime.triton_compat import libdevice
+from helion.runtime import default_launcher as _default_launcher
+
+@triton.jit
+def _layer_norm_fwd_kernel(bias, x, weight, out, bias_size_0, bias_stride_0, out_stride_0, out_stride_1, weight_stride_0, x_stride_0, x_stride_1, m, eps, _BLOCK_SIZE_0: tl.constexpr, _RDIM_SIZE_1: tl.constexpr):
+    pid_0 = tl.program_id(0)
+    offset_0 = pid_0 * _BLOCK_SIZE_0
+    indices_0 = (offset_0 + tl.arange(0, _BLOCK_SIZE_0)).to(tl.int32)
+    mask_0 = indices_0 < m
+    indices_1 = tl.arange(0, _RDIM_SIZE_1).to(tl.int32)
+    mask_1 = indices_1 < bias_size_0
+    load = tl.load(x + (indices_0[:, None] * x_stride_0 + indices_1[None, :] * x_stride_1), mask_0[:, None] & mask_1[None, :], other=0)
+    v_0 = load.to(tl.float32)
+    var_mean_extra = tl.reshape(tl.sum(v_0, 1), [_BLOCK_SIZE_0, 1])
+    v_1 = var_mean_extra / bias_size_0.to(tl.float32)
+    _mask_to_1 = tl.where(tl.broadcast_to(mask_0[:, None], [_BLOCK_SIZE_0, 1]), v_1, 0)
+    v_2 = v_0 - _mask_to_1
+    v_3 = v_2 * v_2
+    var_mean_extra_2 = tl.reshape(tl.sum(v_3, 1), [_BLOCK_SIZE_0, 1])
+    v_4 = var_mean_extra_2 / bias_size_0.to(tl.float32)
+    v_5 = v_0 - v_1
+    v_6 = v_4 + eps
+    v_7 = libdevice.rsqrt(v_6)
+    v_8 = v_5 * v_7
+    load_1 = tl.load(weight + indices_1 * weight_stride_0, mask_1, other=0)
+    v_9 = load_1.to(tl.float32)
+    v_10 = v_9[None, :]
+    v_11 = v_8 * v_10
+    load_2 = tl.load(bias + indices_1 * bias_stride_0, mask_1, other=0)
+    v_12 = load_2.to(tl.float32)
+    v_13 = v_12[None, :]
+    v_14 = v_11 + v_13
+    v_15 = v_14.to(tl.float16)
+    tl.store(out + (indices_0[:, None] * out_stride_0 + indices_1[None, :] * out_stride_1), v_15, mask_0[:, None] & mask_1[None, :])
+
+def layer_norm_fwd(x: torch.Tensor, normalized_shape: list[int], weight: torch.Tensor, bias: torch.Tensor, eps: float=1e-05, *, _launcher=_default_launcher):
+    m, n = x.size()
+    assert weight.size(0) == n, f'weight size mismatch {weight.size(0)} != {n}'
+    assert bias.size(0) == n, f'bias size mismatch {bias.size(0)} != {n}'
+    assert len(normalized_shape) == 1, 'Helion layer norm only supports 1D layer norm currently'
+    assert normalized_shape[0] == n, f'normalized shape mismatch {normalized_shape[0]} != {n}'
+    out = torch.empty([m, n], dtype=torch.float16, device=x.device)
+    _BLOCK_SIZE_0 = 32
+    _RDIM_SIZE_1 = triton.next_power_of_2(bias.size(0))
+    _launcher(_layer_norm_fwd_kernel, (triton.cdiv(m, _BLOCK_SIZE_0),), bias, x, weight, out, bias.size(0), bias.stride(0), out.stride(0), out.stride(1), weight.stride(0), x.stride(0), x.stride(1), m, eps, _BLOCK_SIZE_0, _RDIM_SIZE_1, num_warps=4, num_stages=3)
+    return out
+
--- assertExpectedJournal(TestExamples.test_matmul)
from __future__ import annotations
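For orientation, the generated `layer_norm_fwd` wrapper above implements standard row-wise layer norm, `y = (x - mean(x)) * rsqrt(var(x) + eps) * weight + bias`, accumulating the reduction in fp32 and storing the result as fp16. A minimal smoke test might look like the sketch below; it assumes a CUDA device and that `layer_norm_fwd` from the journal entry is importable, and the shapes and tolerances are purely illustrative:

```python
import torch

# Illustrative inputs; any (m, n) works as long as weight/bias have length n.
m, n = 64, 128
x = torch.randn(m, n, device="cuda", dtype=torch.float16)
weight = torch.randn(n, device="cuda", dtype=torch.float16)
bias = torch.randn(n, device="cuda", dtype=torch.float16)

# Call the generated wrapper from the journal entry above (assumed importable).
out = layer_norm_fwd(x, [n], weight, bias)

# Check against the eager reference; the kernel accumulates in fp32 but
# stores fp16, so a loose tolerance is appropriate.
ref = torch.nn.functional.layer_norm(x, [n], weight, bias, eps=1e-05)
torch.testing.assert_close(out, ref, rtol=1e-2, atol=1e-2)
```

Note that `_RDIM_SIZE_1` is rounded up to the next power of two via `triton.next_power_of_2`, and the kernel masks the feature dimension with `mask_1`, so `n` itself does not need to be a power of two.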