Commit adad9c3

[tilelang] Add gemm and rms_norm kernels (#514)
1 parent: 2093c4a

5 files changed: +231 lines, -0 lines

tritonbench/operators/gemm/operator.py

Lines changed: 13 additions & 0 deletions
@@ -26,9 +26,16 @@ def _tlx_matmul(*args, **kwargs):
         raise RuntimeError("TLX not available in this Triton version")
 
 
+from tritonbench.utils.python_utils import try_import
+
+with try_import("HAS_TILELANG"):
+    from .tilelang import tilelang_matmul_func
+
+
 from tritonbench.utils.data_utils import get_production_shapes
 from tritonbench.utils.env_utils import (
     get_nvidia_gpu_model,
+    is_cu130,
     is_cuda,
     is_fbcode,
     supports_tma,
@@ -472,6 +479,12 @@ def tlx_matmul(self, a, b, bias) -> Callable:
         else:
             return lambda: _tlx_matmul(a, b)
 
+    @register_benchmark(enabled=HAS_TILELANG and is_cu130())
+    def tilelang_blackwell_matmul(self, a, b, bias) -> Callable:
+        assert bias is None, "Tilelang does not support bias"
+        assert a.dtype == torch.bfloat16, "Tilelang only supports bf16"
+        return tilelang_matmul_func(a, b)
+
     @register_x_val(label="(M, N, K)")
     def get_x_val(self, example_inputs) -> Tuple[int, int, int]:
         # x-value: computation intensity
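
The `with try_import("HAS_TILELANG"):` guard makes the TileLang import optional, so the operator module still loads when tilelang is not installed and the benchmark is only registered when `HAS_TILELANG` is true. The helper itself is not part of this diff; the sketch below is a minimal, assumed stand-in for `tritonbench.utils.python_utils.try_import` (the real implementation may differ) illustrating the behavior the call sites rely on.

import sys


class try_import:
    """Assumed stand-in: set a flag in the caller's module indicating whether
    the imports inside the `with` block succeeded, and swallow ImportError."""

    def __init__(self, flag_name: str):
        self.flag_name = flag_name

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        # __exit__ runs in the frame that executed the `with` statement, so
        # _getframe(1) is the importing module's global namespace.
        caller_globals = sys._getframe(1).f_globals
        caller_globals[self.flag_name] = exc_type is None
        # Suppress only import failures; let other exceptions propagate.
        return exc_type is not None and issubclass(exc_type, ImportError)

With this behavior, `@register_benchmark(enabled=HAS_TILELANG and is_cu130())` cleanly disables `tilelang_blackwell_matmul` on machines without tilelang or without CUDA 13.0.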
tritonbench/operators/gemm/tilelang.py (new file)

Lines changed: 110 additions & 0 deletions
@@ -0,0 +1,110 @@
# Original source: https://github.com/tile-ai/tilelang/blob/main/examples/gemm_sm100/gemm_tcgen5mma.py
import tilelang
import tilelang.language as T
import torch

tilelang.disable_cache()


def matmul(
    M,
    N,
    K,
    block_M,
    block_N,
    block_K,
    trans_A,
    trans_B,
    in_dtype,
    out_dtype,
    accum_dtype,
    num_stages,
    threads,
):
    A_shape = (K, M) if trans_A else (M, K)
    B_shape = (N, K) if trans_B else (K, N)
    A_shared_shape = (block_K, block_M) if trans_A else (block_M, block_K)
    B_shared_shape = (block_N, block_K) if trans_B else (block_K, block_N)

    @T.prim_func
    def main(
        A: T.Tensor(A_shape, in_dtype),
        B: T.Tensor(B_shape, in_dtype),
        C: T.Tensor((M, N), out_dtype),
    ):
        with T.Kernel(
            T.ceildiv(N, block_N), T.ceildiv(M, block_M), threads=threads
        ) as (bx, by):
            A_shared = T.alloc_shared(A_shared_shape, in_dtype)
            B_shared = T.alloc_shared(B_shared_shape, in_dtype)
            C_tmem = T.alloc_tmem([block_M, block_N], accum_dtype)
            mbar = T.alloc_barrier(1)
            C_local = T.alloc_fragment((block_M, block_N), accum_dtype)
            C_shared = T.alloc_shared((block_M, block_N), out_dtype)

            for k in T.Pipelined(T.ceildiv(K, block_K), num_stages=num_stages):
                T.copy(A[by * block_M, k * block_K], A_shared)
                T.copy(B[bx * block_N, k * block_K], B_shared)
                T.gemm(
                    A_shared,
                    B_shared,
                    C_tmem,
                    trans_A,
                    trans_B,
                    mbar=mbar,
                    wg_wait=-1,
                    clear_accum=k == 0,
                )
                T.mbarrier_wait_parity(mbar, k % 2)

            T.copy(C_tmem, C_local)
            T.copy(C_local, C_shared)

            T.copy(C_shared, C[by * block_M, bx * block_N])

    return main


TILELANG_DTYPE_MAP = {
    torch.bfloat16: "bfloat16",
    torch.float16: "float16",
    torch.float32: "float",
}


def tilelang_matmul_func(a, b):
    M, K = a.size()
    K, N = b.size()
    b_T = b.T.contiguous()
    block_M, block_N, block_K = 128, 256, 128
    trans_A, trans_B = False, True
    in_dtype = TILELANG_DTYPE_MAP[a.dtype]
    out_dtype = TILELANG_DTYPE_MAP[a.dtype]
    accum_dtype = "float"
    num_stages = 2
    threads = 256
    func = matmul(
        M,
        N,
        K,
        block_M,
        block_N,
        block_K,
        trans_A,
        trans_B,
        in_dtype,
        out_dtype,
        accum_dtype,
        num_stages,
        threads,
    )
    jit_kernel = tilelang.compile(
        func,
        out_idx=[2],
        target="cuda",
        pass_configs={
            tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
            tilelang.PassConfigKey.TL_DISABLE_WARP_SPECIALIZED: True,
        },
    )
    return lambda: jit_kernel(a, b_T)
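
A hedged usage sketch (not part of the commit) for `tilelang_matmul_func`, assuming a Blackwell-class GPU with CUDA 13.0 and tilelang installed (matching the `HAS_TILELANG and is_cu130()` gate in the operator) and assuming the module path `tritonbench.operators.gemm.tilelang`. The function compiles the kernel up front and returns a zero-argument callable; `out_idx=[2]` makes the compiled kernel allocate and return `C`.

import torch

from tritonbench.operators.gemm.tilelang import tilelang_matmul_func  # path assumed

M, N, K = 4096, 4096, 4096
a = torch.randn(M, K, device="cuda", dtype=torch.bfloat16)
b = torch.randn(K, N, device="cuda", dtype=torch.bfloat16)

fn = tilelang_matmul_func(a, b)  # compiles once, captures a and b.T.contiguous()
c = fn()                         # each call runs jit_kernel(a, b_T) and returns C

# Loose tolerances: bf16 inputs with float32 accumulation over K = 4096.
torch.testing.assert_close(c, torch.matmul(a, b), rtol=2e-2, atol=2e-2)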

tritonbench/operators/rms_norm/operator.py

Lines changed: 10 additions & 0 deletions
@@ -4,6 +4,7 @@
 import torch
 
 from tritonbench.utils.env_utils import is_hip
+from tritonbench.utils.python_utils import try_import
 
 from tritonbench.utils.triton_op import (
     BenchmarkOperator,
@@ -31,6 +32,9 @@
 except ModuleNotFoundError:
     QuackRMSNorm = None
 
+with try_import("HAS_TILELANG"):
+    from .tilelang import TileLangRMSNorm
+
 
 def parse_op_args(args: List[str]):
     parser = argparse.ArgumentParser()
@@ -153,6 +157,12 @@ def aiter(self, H, input, weight) -> Callable:
         self.aiter_rms_op = module
         return lambda: module(input)
 
+    @register_benchmark(enabled=HAS_TILELANG)
+    def tilelang(self, H, input, weight) -> Callable:
+        module = TileLangRMSNorm(hidden_size=H, eps=self.eps).to(self.device)
+        module.weight = weight
+        return module(input)
+
     @register_x_val(label="(M, H)")
     def get_x_val(self, example_inputs) -> Tuple[int, int]:
         H = example_inputs[0]
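
Note that the new `tilelang` benchmark returns `module(input)` directly rather than wrapping it in a lambda as the `aiter` benchmark does: `TileLangRMSNorm.forward` (see the new file below) already returns a zero-argument lambda that launches the compiled kernel, so `module(input)` is itself the Callable that `@register_benchmark` expects. A hedged illustration, using the `H` and `input` arguments of the benchmark method and assuming a CUDA device:

# Illustration only (not part of the commit).
module = TileLangRMSNorm(hidden_size=H, eps=1e-6).to("cuda")
fn = module(input)   # forward() compiles the kernel and returns `lambda: jit_kernel(input)`
out = fn()           # launches the TileLang RMSNorm kernel and returns the output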
tritonbench/operators/rms_norm/tilelang.py (new file)

Lines changed: 94 additions & 0 deletions
@@ -0,0 +1,94 @@
# Original source:
# https://github.com/tile-ai/tilelang/blob/main/examples/norm/test_rms_norm.py
import tilelang
import tilelang.language as T
import torch

tilelang.disable_cache()


def rms_norm_splitk(M, N, blk_m, blk_k):
    dtype = "float"

    @T.prim_func
    def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
        with T.Kernel(T.ceildiv(M, blk_m), threads=128) as bx:
            A_shared = T.alloc_shared((blk_m, blk_k), dtype)
            A_local = T.alloc_fragment((blk_m, blk_k), dtype)
            A_powsum = T.alloc_fragment((blk_m,), dtype)

            num_k_step = T.ceildiv(N, blk_k)
            T.clear(A_local)
            for k in range(num_k_step):
                T.copy(A[bx * blk_m, k * blk_k], A_shared)
                for i, j in T.Parallel(blk_m, blk_k):
                    A_local[i, j] += A_shared[i, j] * A_shared[i, j]
            T.reduce_sum(A_local, A_powsum, dim=1)
            for i in T.Parallel(blk_m):
                A_powsum[i] = T.rsqrt(A_powsum[i] / N) + 1e-12

            for k in range(num_k_step):
                # reverse, better cache hit rate
                T.copy(A[bx * blk_m, (num_k_step - 1 - k) * blk_k], A_shared)
                for i, j in T.Parallel(blk_m, blk_k):
                    A_shared[i, j] *= A_powsum[i]
                T.copy(A_shared, B[bx * blk_m, (num_k_step - 1 - k) * blk_k])

    return main


def rms_norm(M, N, blk_m, dtype, variance_epsilon=1e-12):
    @T.prim_func
    def main(A: T.Tensor((M, N), dtype), B: T.Tensor((M, N), dtype)):
        with T.Kernel(T.ceildiv(M, blk_m), threads=128) as bx:
            A_shared = T.alloc_shared((blk_m, N), dtype)
            A_pow_local = T.alloc_fragment((blk_m, N), dtype)
            A_local = T.alloc_fragment((blk_m, N), dtype)
            A_powsum = T.alloc_fragment((blk_m,), dtype)

            T.copy(A[bx * blk_m : (bx + 1) * blk_m, :], A_shared)
            T.copy(A_shared, A_local)
            for i, j in T.Parallel(blk_m, N):
                A_pow_local[i, j] = A_local[i, j] * A_local[i, j]
            T.reduce_sum(A_pow_local, A_powsum, dim=1)
            for i in T.Parallel(blk_m):
                A_powsum[i] = T.rsqrt(A_powsum[i] / N) + variance_epsilon
            for i, j in T.Parallel(blk_m, N):
                A_local[i, j] *= A_powsum[i]
            T.copy(A_local, B[bx * blk_m : (bx + 1) * blk_m, :])

    return main


TILELANG_DTYPE_MAP = {
    torch.bfloat16: "bfloat16",
    torch.float16: "float16",
    torch.float32: "float",
}


class TileLangRMSNorm(torch.nn.Module):
    def __init__(self, hidden_size, eps=1e-6):
        """
        TileLangRMSNorm
        """
        super().__init__()
        self.weight = torch.nn.Parameter(torch.ones(hidden_size))
        self.variance_epsilon = eps

    def forward(self, hidden_states):
        M, N = hidden_states.size()
        dtype = TILELANG_DTYPE_MAP[hidden_states.dtype]
        blk_m = 1
        blk_k = 512

        kernel = rms_norm(M, N, blk_m, dtype, self.variance_epsilon)
        jit_kernel = tilelang.compile(
            kernel,
            out_idx=[-1],
            target="cuda",
            pass_configs={
                tilelang.PassConfigKey.TL_DISABLE_TMA_LOWER: True,
            },
        )
        return lambda: jit_kernel(hidden_states)
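
A hedged verification sketch (not part of the commit) for `TileLangRMSNorm`, assuming a CUDA machine with tilelang installed and the module path `tritonbench.operators.rms_norm.tilelang`. The eager reference mirrors the kernel's arithmetic, `x * (rsqrt(mean(x^2)) + eps)`; note the module's `weight` parameter is not applied inside the kernel, and the tolerances are loose because the kernel computes in the input dtype.

import torch

from tritonbench.operators.rms_norm.tilelang import TileLangRMSNorm  # path assumed

M, H = 4096, 4096
x = torch.randn(M, H, device="cuda", dtype=torch.float16)

module = TileLangRMSNorm(hidden_size=H, eps=1e-6).to("cuda")
out = module(x)()  # forward() returns a zero-arg lambda that launches the kernel

# Eager reference matching the kernel: normalize by rsqrt(mean(x^2)), with eps
# added outside the rsqrt and the (unused) weight skipped.
ref = x * (torch.rsqrt(x.pow(2).mean(-1, keepdim=True)) + 1e-6)
torch.testing.assert_close(out, ref, rtol=2e-2, atol=2e-2)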

tritonbench/utils/env_utils.py

Lines changed: 4 additions & 0 deletions
@@ -75,6 +75,10 @@ def supports_tma():
     return is_cuda() and torch.cuda.get_device_capability()[0] >= 9
 
 
+def is_cu130():
+    return is_cuda() and torch.version.cuda == "13.0"
+
+
 def set_env():
     # set cutlass dir
     # by default we use the cutlass version built with pytorch
