Commit 90e2d69

[Examples] Add matmul variants with bias support and tests
- Add wrapper functions for tritonbench dispatch in matmul.py and matmul_split_k.py
- Implement bias handling in both matmul and matmul_split_k
- Add comprehensive tests in test_examples.py for all matmul variants

stack-info: PR: #379, branch: yf225/stack/41
1 parent 9064eda commit 90e2d69
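
For orientation before the diffs: the bias support below is carried by a new optional `epilogue` callback on both kernels. A minimal usage sketch follows; the shapes and the `bias` tensor are illustrative, and running it assumes the helion package and a CUDA device, neither of which this page guarantees:

    import torch

    from examples.matmul import matmul  # module edited in this commit

    x = torch.randn(128, 256, device="cuda", dtype=torch.float16)
    y = torch.randn(256, 64, device="cuda", dtype=torch.float16)
    bias = torch.randn(64, device="cuda", dtype=torch.float16)

    # The epilogue receives the fp32 accumulator for one output tile plus its
    # [tile_m, tile_n] indices; indexing bias by tile[1] broadcasts one bias
    # value per output column, matching the tests added in this commit.
    out = matmul(x, y, lambda acc, tile: acc + bias[tile[1]])
    torch.testing.assert_close(out, (x @ y + bias).to(out.dtype), rtol=1e-2, atol=1e-2)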

5 files changed, +106 -52 lines

examples/matmul.py

Lines changed: 23 additions & 3 deletions
@@ -1,15 +1,25 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import torch
 
 import helion
 from helion._testing import run_example
 import helion.language as hl
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 
 # static_shapes=True gives a performance boost for matmuls
 @helion.kernel(static_shapes=True)
-def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+def matmul(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor] = lambda acc,
+    tile: acc,
+) -> torch.Tensor:
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
@@ -20,14 +30,24 @@ def matmul(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
         for tile_k in hl.tile(k):
             acc = torch.addmm(acc, x[tile_m, tile_k], y[tile_k, tile_n])
-        out[tile_m, tile_n] = acc
+        out[tile_m, tile_n] = epilogue(acc, [tile_m, tile_n])
     return out
 
 
 def check(m: int, k: int, n: int) -> None:
     x = torch.randn([m, k], device="cuda", dtype=torch.float16)
     y = torch.randn([k, n], device="cuda", dtype=torch.float16)
-    run_example(matmul, torch.matmul, (x, y))
+
+    # Test without bias
+    kernel_no_bias = lambda x, y: matmul(x, y)  # noqa: E731
+    expected_no_bias = lambda x, y: torch.matmul(x, y)  # noqa: E731
+    run_example(kernel_no_bias, expected_no_bias, (x, y))
+
+    # Test with bias
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+    kernel_with_bias = lambda x, y: matmul(x, y, lambda acc, tile: acc + bias[tile[1]])  # noqa: E731
+    expected_with_bias = lambda x, y: torch.matmul(x, y) + bias  # noqa: E731
+    run_example(kernel_with_bias, expected_with_bias, (x, y))
 
 
 def main() -> None:
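
The contract of the new `epilogue` argument, restated as an eager PyTorch sketch; treating the whole output as a single tile is an illustration here, not how the kernel actually tiles:

    import torch

    # epilogue(acc, tile) gets the accumulator for one output tile and its
    # [tile_m, tile_n] indices, and returns the values to store.  With the
    # default identity epilogue this reduces to a plain matmul.
    def matmul_reference(x, y, epilogue=lambda acc, tile: acc):
        acc = x @ y
        tile = [slice(None), slice(None)]  # one "tile" covering the whole output
        return epilogue(acc, tile)

    x, y = torch.randn(64, 32), torch.randn(32, 16)
    bias = torch.randn(16)

    # Same call shape as the diff's bias test: index bias by the column tile.
    out = matmul_reference(x, y, lambda acc, tile: acc + bias[tile[1]])
    torch.testing.assert_close(out, x @ y + bias)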

examples/matmul_split_k.py

Lines changed: 27 additions & 2 deletions
@@ -1,16 +1,26 @@
 from __future__ import annotations
 
+from typing import TYPE_CHECKING
+
 import torch
 
 import helion
 from helion._testing import run_example
 from helion.autotuner import PowerOfTwoFragment
 import helion.language as hl
 
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
 
 # static_shapes=True gives a performance boost for matmuls
 @helion.kernel(static_shapes=True)
-def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
+def matmul_split_k(
+    x: torch.Tensor,
+    y: torch.Tensor,
+    epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor] = lambda acc,
+    tile: acc,
+) -> torch.Tensor:
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f"size mismatch {k} != {k2}"
@@ -23,14 +33,29 @@ def matmul_split_k(x: torch.Tensor, y: torch.Tensor) -> torch.Tensor:
         acc = hl.zeros([tile_m, tile_n], dtype=torch.float32)
         for inner_k in hl.tile(outer_k.begin, outer_k.end):
             acc = torch.addmm(acc, x[tile_m, inner_k], y[inner_k, tile_n])
+        # Apply epilogue only on the first k-split iteration
+        if outer_k.begin == 0:
+            acc = epilogue(acc, [tile_m, tile_n])
         hl.atomic_add(out, [tile_m, tile_n], acc)
     return out
 
 
 def check(m: int, k: int, n: int) -> None:
     x = torch.randn([m, k], device="cuda", dtype=torch.float16)
     y = torch.randn([k, n], device="cuda", dtype=torch.float16)
-    run_example(matmul_split_k, torch.matmul, (x, y), atol=1)
+
+    # Test without bias
+    kernel_no_bias = lambda x, y: matmul_split_k(x, y)  # noqa: E731
+    expected_no_bias = lambda x, y: torch.matmul(x, y)  # noqa: E731
+    run_example(kernel_no_bias, expected_no_bias, (x, y), atol=1)
+
+    # Test with bias using closure approach
+    bias = torch.randn([n], device="cuda", dtype=torch.float16)
+    kernel_with_bias = lambda x, y: matmul_split_k(  # noqa: E731
+        x, y, epilogue=lambda acc, tile: acc + bias[tile[1]]
+    )
+    expected_with_bias = lambda x, y: torch.matmul(x, y) + bias  # noqa: E731
+    run_example(kernel_with_bias, expected_with_bias, (x, y), atol=1)
 
 
 def main() -> None:
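
Why the guard on `outer_k.begin == 0` matters: in split-k, every k-slice adds a partial product into `out` via `hl.atomic_add`, so an additive epilogue such as a bias must be contributed by exactly one slice, or it would be applied once per split. A self-contained sketch of that invariant; the split count and shapes are made up for illustration:

    import torch

    def split_k_reference(x, y, bias, num_splits=4):
        m, k = x.shape
        out = torch.zeros(m, y.shape[1])
        for s in range(num_splits):
            lo, hi = s * k // num_splits, (s + 1) * k // num_splits
            partial = x[:, lo:hi] @ y[lo:hi, :]
            if lo == 0:  # only the first split owns the epilogue
                partial = partial + bias
            out += partial  # stands in for hl.atomic_add
        return out

    x, y, bias = torch.randn(8, 32), torch.randn(32, 4), torch.randn(4)
    torch.testing.assert_close(split_k_reference(x, y, bias), x @ y + bias)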

test/test_examples.expected

Lines changed: 6 additions & 2 deletions
@@ -969,7 +969,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     tl.store(out + (indices_0[:, None] * 128 + indices_1[None, :] * 1), acc, None)
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: torch.Tensor, y: torch.Tensor, epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -1136,9 +1136,13 @@ def _matmul_split_k_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1
         load = tl.load(x + (indices_0[:, None] * 1024 + indices_3[None, :] * 1), mask_3[None, :], other=0)
         load_1 = tl.load(y + (indices_3[:, None] * 64 + indices_1[None, :] * 1), mask_3[:, None], other=0)
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
+    eq = offset_2 == 0
+    if eq:
+        acc_copy_1 = acc
+        acc = acc_copy_1
     tl.atomic_add(out + (indices_0[:, None] * 64 + indices_1[None, :] * 1), acc, mask=None, sem='relaxed')
 
-def matmul_split_k(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul_split_k(x: torch.Tensor, y: torch.Tensor, epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'

test/test_matmul.expected

Lines changed: 3 additions & 3 deletions
@@ -75,7 +75,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_1: tl.constexpr, _BLOCK_SIZE_0: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     tl.store(out + (indices_0[:, None] * 128 + indices_1[None, :] * 1), acc, None)
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: torch.Tensor, y: torch.Tensor, epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -162,7 +162,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     tl.store(tl.make_block_ptr(out, [128, 128], [128, 1], [offset_0, offset_1], [_BLOCK_SIZE_0, _BLOCK_SIZE_1], [1, 0]), acc, boundary_check=[0, 1])
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: torch.Tensor, y: torch.Tensor, epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
@@ -435,7 +435,7 @@ def _matmul_kernel(x, y, out, _BLOCK_SIZE_0: tl.constexpr, _BLOCK_SIZE_1: tl.con
         acc = tl.dot(load, load_1, acc=acc_copy_0, input_precision='tf32')
     out_desc.store([offset_0, offset_1], acc)
 
-def matmul(x: torch.Tensor, y: torch.Tensor, *, _launcher=_default_launcher):
+def matmul(x: torch.Tensor, y: torch.Tensor, epilogue: Callable[[torch.Tensor, list[torch.Tensor]], torch.Tensor]=lambda acc, tile: acc, *, _launcher=_default_launcher):
     m, k = x.size()
     k2, n = y.size()
     assert k == k2, f'size mismatch {k} != {k2}'
