Commit 8fd5a4b

[Examples] Add matmul variants with bias support and tests (#379)
- Add wrapper functions for tritonbench dispatch in matmul.py and matmul_split_k.py
- Implement bias handling in both matmul and matmul_split_k
- Add comprehensive tests in test_examples.py for all matmul variants
1 parent d66e5c3 commit 8fd5a4b

File tree

7 files changed: +154, −143 lines


examples/matmul.py

Lines changed: 56 additions & 4 deletions
@@ -1,15 +1,27 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import torch
+from torch import Tensor
 
 import helion
 from helion._testing import run_example
 import helion.language as hl
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 
-# static_shapes=True gives a performance boost for matmuls
-@helion.kernel(static_shapes=True)
-def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+@helion.kernel(
+    # static_shapes=True gives a performance boost for matmuls
+    static_shapes=True,
+)
+def matmul(
+    x: Tensor,
+    y: Tensor,
+    epilogue: Callable[[Tensor, list[Tensor]], Tensor] = lambda acc, tile: acc,
+) -> Tensor:
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
@@ -20,17 +32,57 @@ def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
         for tile_k in hl.tile(k):
             acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
-        out[tile_m, tile_n] = acc
+        out[tile_m, tile_n] = epilogue(acc, [tile_m, tile_n])
     return out
 
 
+def autotune(m: int, k: int, n: int) -> None:
+    x = torch.randn([m, k], device="cuda", dtype=torch.float16)
+    y = torch.randn([k, n], device="cuda", dtype=torch.float16)
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+    args = (x, y, lambda acc, tile: torch.relu(acc + bias[tile[1]]))
+    best_config = matmul.autotune(args, force=True)
+    print(f"Best config: {best_config}")
+    best_config.save("best_config.json")
+
+
 def check(m: int, k: int, n: int) -> None:
     x = torch.randn([m, k], device="cuda", dtype=torch.float16)
     y = torch.randn([k, n], device="cuda", dtype=torch.float16)
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+
+    # Test without bias
     run_example(matmul, torch.matmul, (x, y))
 
+    # Test with bias
+    def helion_linear(x: Tensor, y: Tensor, bias: Tensor) -> Tensor:
+        return matmul(x, y, lambda acc, tile: acc + bias[tile[1]])
+
+    def baseline_linear(x: Tensor, y: Tensor, bias: Tensor) -> Tensor:
+        return torch.nn.functional.linear(x, y.T, bias)
+
+    run_example(helion_linear, baseline_linear, (x, y, bias))
+
+    # Test more complex epilogue
+    def epilogue(acc: Tensor, tile: list[Tensor]) -> Tensor:
+        # The epilogue can use the captured bias tensor that is implicitly lifted to a kernel arg
+        return torch.relu(acc + bias[tile[1]])
+
+    def kernel_wrapper(x: Tensor, y: Tensor) -> Tensor:
+        return matmul(x, y, epilogue)
+
+    def baseline_wrapper(x: Tensor, y: Tensor) -> Tensor:
+        return torch.relu(x @ y + bias)
+
+    run_example(
+        kernel_wrapper,
+        baseline_wrapper,
+        (x, y),
+    )
+
 
 def main() -> None:
+    # autotune(1024, 1024, 1024)
     check(1024, 1024, 1024)
 
examples/matmul_split_k.py

Lines changed: 27 additions & 2 deletions
@@ -1,16 +1,26 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import torch
 
 import helion
 from helion._testing import run_example
 from helion.autotuner import PowerOfTwoFragment
 import helion.language as hl
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 
 # static_shapes=True gives a performance boost for matmuls
 @helion.kernel(static_shapes=True)
-def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+def matmul_split_k(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor] = lambda acc,
+    tile: acc,
+) -> torch.Tensor:
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
@@ -23,14 +33,29 @@ def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
         for inner_k in hl.tile(outer_k.begin, outer_k.end):
             acc = torch.addmm(acc, x[tile_m, inner_k], y[inner_k, tile_n])
+        # Apply epilogue only on the first k-split iteration
+        if outer_k.begin == 0:
+            acc = epilogue(acc, [tile_m, tile_n])
         hl.atomic_add(out, [tile_m, tile_n], acc)
     return out
 
 
 def check(m: int, k: int, n: int) -> None:
     x = torch.randn([m, k], device="cuda", dtype=torch.float16)
     y = torch.randn([k, n], device="cuda", dtype=torch.float16)
-    run_example(matmul_split_k, torch.matmul, (x, y), atol=1)
+
+    # Test without bias
+    kernel_no_bias = lambda x, y: matmul_split_k(x, y)  # noqa: E731
+    expected_no_bias = lambda x, y: torch.matmul(x, y)  # noqa: E731
+    run_example(kernel_no_bias, expected_no_bias, (x, y), atol=1)
+
+    # Test with bias using closure approach
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+    kernel_with_bias = lambda x, y: matmul_split_k(  # noqa: E731
+        x, y, epilogue=lambda acc, tile: acc + bias[tile[1]]
+    )
+    expected_with_bias = lambda x, y: torch.nn.functional.linear(x, y.T, bias)  # noqa: E731
+    run_example(kernel_with_bias, expected_with_bias, (x, y), atol=1)
 
 
 def main() -> None:
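The first-split guard is what keeps the bias correct under split-k: every split atomically adds its partial product into out, so an additive epilogue must run exactly once or the bias would be accumulated once per split. A plain-PyTorch emulation of the scheme (shapes and split size chosen arbitrarily for illustration):

    import torch

    x, y = torch.randn(64, 128), torch.randn(128, 32)
    bias = torch.randn(32)

    out = torch.zeros(64, 32)
    for i, (xs, ys) in enumerate(zip(x.split(32, dim=1), y.split(32, dim=0))):
        acc = xs @ ys          # partial product for this k-split
        if i == 0:             # epilogue on the first split only
            acc = acc + bias
        out += acc             # stands in for hl.atomic_add

    torch.testing.assert_close(out, x @ y + bias, rtol=1e-4, atol=1e-5)

Note that this trick only works for epilogues that commute with the later accumulation, such as a bias add; a nonlinearity like relu would have to wait until all splits have landed, which is presumably why check() exercises only the bias case here.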

examples/template_via_closure.py

Lines changed: 0 additions & 75 deletions
This file was deleted. Its matmul_with_epilogue kernel is superseded by the epilogue parameter on matmul in examples/matmul.py; the template_via_closure tests below now target that example instead.

test/test_examples.expected

Lines changed: 15 additions & 11 deletions
@@ -964,7 +964,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     tl.store(out + (indices_0[:, None] * 128 + indices_1[None, :] * 1), acc, None)
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -1131,9 +1131,13 @@ def _matmul_split_k_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1
         load = tl.load(x + (indices_0[:, None] * 1024 + indices_3[None, :] * 1), mask_3[None, :], other=0)
         load_1 = tl.load(y + (indices_3[:, None] * 64 + indices_1[None, :] * 1), mask_3[:, None], other=0)
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
+    eq = offset_2 == 0
+    if eq:
+        acc_copy_1 = acc
+        acc = acc_copy_1
     tl.atomic_add(out + (indices_0[:, None] * 64 + indices_1[None, :] * 1), acc, mask=None, sem='relaxed')
 
-def matmul_split_k(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul_split_k(x: torch.Tensor, y: torch.Tensor, epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -1610,7 +1614,7 @@ from helion.runtime import default_launcher as _default_launcher
 import test.test_examples as _global_source0
 
 @triton.jit
-def _matmul_with_epilogue_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+def _matmul_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
     num_pid_m = tl.cdiv(1024, _BLOCK_SIZE_0)
     num_pid_n = tl.cdiv(1024, _BLOCK_SIZE_1)
     inner_2d_pid = tl.program_id(0)
@@ -1640,15 +1644,15 @@ def _matmul_with_epilogue_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: t
     v_4 = v_3.to(tl.float16)
     tl.store(out + (indices_0[:, None] * 1024 + indices_1[None, :] * 1), v_4, None)
 
-def matmul_with_epilogue(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor], *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
     out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
     _BLOCK_SIZE_0 = 64
     _BLOCK_SIZE_1 = 64
     _BLOCK_SIZE_2 = 16
-    _launcher(_matmul_with_epilogue_kernel, (triton.cdiv(1024, _BLOCK_SIZE_0) * triton.cdiv(1024, _BLOCK_SIZE_1),), x, y, epilogue.__closure__[0].cell_contents, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)
+    _launcher(_matmul_kernel, (triton.cdiv(1024, _BLOCK_SIZE_0) * triton.cdiv(1024, _BLOCK_SIZE_1),), x, y, epilogue.__closure__[0].cell_contents, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)
     return out
 
 --- assertExpectedJournal(TestExamples.test_template_via_closure1)
@@ -1663,7 +1667,7 @@ from helion.runtime import default_launcher as _default_launcher
 import test.test_examples as _global_source0
 
 @triton.jit
-def _matmul_with_epilogue_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+def _matmul_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
     num_pid_m = tl.cdiv(1024, _BLOCK_SIZE_0)
     num_pid_n = tl.cdiv(1024, _BLOCK_SIZE_1)
     inner_2d_pid = tl.program_id(0)
@@ -1690,15 +1694,15 @@ def _matmul_with_epilogue_kernel(x, y, epilogue_closure_0, out, _BLOCK_SIZE_0: t
     v_4 = v_3.to(tl.float16)
     tl.store(tl.make_block_ptr(out, [1024, 1024], [1024, 1], [offset_0, offset_1], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), v_4, boundary_check=[0, 1])
 
-def matmul_with_epilogue(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor], *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
     out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
     _BLOCK_SIZE_0 = 64
     _BLOCK_SIZE_1 = 64
     _BLOCK_SIZE_2 = 16
-    _launcher(_matmul_with_epilogue_kernel, (triton.cdiv(1024, _BLOCK_SIZE_0) * triton.cdiv(1024, _BLOCK_SIZE_1),), x, y, epilogue.__closure__[0].cell_contents, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)
+    _launcher(_matmul_kernel, (triton.cdiv(1024, _BLOCK_SIZE_0) * triton.cdiv(1024, _BLOCK_SIZE_1),), x, y, epilogue.__closure__[0].cell_contents, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)
     return out
 
 --- assertExpectedJournal(TestExamples.test_template_via_closure2)
@@ -1713,7 +1717,7 @@ from helion.runtime import default_launcher as _default_launcher
 import test.test_examples as _global_source0
 
 @triton.jit
-def _matmul_with_epilogue_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
+def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_2: tl.constexpr):
     num_pid_m = tl.cdiv(1024, _BLOCK_SIZE_0)
     num_pid_n = tl.cdiv(1024, _BLOCK_SIZE_1)
     inner_2d_pid = tl.program_id(0)
@@ -1737,13 +1741,13 @@ def _matmul_with_epilogue_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_
     v_2 = v_1.to(tl.float16)
     tl.store(tl.make_block_ptr(out, [1024, 1024], [1024, 1], [offset_0, offset_1], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), v_2, boundary_check=[0, 1])
 
-def matmul_with_epilogue(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor], *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
     out = torch.empty([m, n], dtype=torch.promote_types(x.dtype, y.dtype), device=x.device)
     _BLOCK_SIZE_0 = 64
     _BLOCK_SIZE_1 = 64
     _BLOCK_SIZE_2 = 16
-    _launcher(_matmul_with_epilogue_kernel, (triton.cdiv(1024, _BLOCK_SIZE_0) * triton.cdiv(1024, _BLOCK_SIZE_1),), x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)
+    _launcher(_matmul_kernel, (triton.cdiv(1024, _BLOCK_SIZE_0) * triton.cdiv(1024, _BLOCK_SIZE_1),), x, y, out, _BLOCK_SIZE_0, _BLOCK_SIZE_1, _BLOCK_SIZE_2, num_warps=2, num_stages=4)
     return out
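The epilogue.__closure__[0].cell_contents expression in the journals above is ordinary Python closure introspection: the bias captured by the epilogue lambda lives in a closure cell, which the generated launcher reads and forwards to the kernel as epilogue_closure_0. A tiny sketch of that mechanic (make_epilogue is a hypothetical helper, not part of this commit):

    import torch

    def make_epilogue(bias: torch.Tensor):
        # bias becomes a free variable of the lambda, stored in a closure cell
        return lambda acc, tile: acc + bias[tile[1]]

    bias = torch.randn(4)
    epilogue = make_epilogue(bias)

    # This is exactly what the generated launcher reads:
    assert epilogue.__closure__[0].cell_contents is bias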

test/test_examples.py

Lines changed: 6 additions & 6 deletions
@@ -143,10 +143,10 @@ def test_template_via_closure0(self):
         )
         self.assertExpectedJournal(
             check_example(
-                "template_via_closure",
+                "matmul",
                 args,
                 torch.relu(args[0] @ args[1] + bias),
-                fn_name="matmul_with_epilogue",
+                fn_name="matmul",
                 block_sizes=[64, 64, 16],
                 loop_orders=[[0, 1]],
                 num_warps=2,
@@ -165,10 +165,10 @@ def test_template_via_closure1(self):
         )
         self.assertExpectedJournal(
             check_example(
-                "template_via_closure",
+                "matmul",
                 args,
                 torch.relu(args[0] @ args[1] + bias),
-                fn_name="matmul_with_epilogue",
+                fn_name="matmul",
                 block_sizes=[64, 64, 16],
                 loop_orders=[[0, 1]],
                 num_warps=2,
@@ -186,10 +186,10 @@ def test_template_via_closure2(self):
         )
         self.assertExpectedJournal(
             check_example(
-                "template_via_closure",
+                "matmul",
                 args,
                 torch.relu(args[0] @ args[1]),
-                fn_name="matmul_with_epilogue",
+                fn_name="matmul",
                 block_sizes=[64, 64, 16],
                 loop_orders=[[0, 1]],
                 num_warps=2,

test/test_matmul.expected

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     tl.store(out + (indices_0[:, None] * 128 + indices_1[None, :] * 1), acc, None)
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -162,7 +162,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     tl.store(tl.make_block_ptr(out, [128, 128], [128, 1], [offset_0, offset_1], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), acc, boundary_check=[0, 1])
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -435,7 +435,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     out_desc.store([offset_0, offset_1], acc)
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: Tensor, y: Tensor, epilogue: Callable[[Tensor, list[Tensor]], Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
