Added compiler hints to enable buffer loads (#729)

azaidy · web-flow · commit 752d83c05041 · 2025-02-21T15:31:15.000-05:00
diff --git a/python/perf-kernels/gemm.py b/python/perf-kernels/gemm.py
@@ -71,6 +71,14 @@ def matmul_kernel(
     """Kernel for computing the matmul C = A x B.
     A has shape (M, K), B has shape (K, N) and C has shape (M, N)
     """
+
+    tl.assume(stride_am > 0)
+    tl.assume(stride_ak > 0)
+    tl.assume(stride_bk > 0)
+    tl.assume(stride_bn > 0)
+    tl.assume(stride_cm > 0)
+    tl.assume(stride_cn > 0)
+
     # -----------------------------------------------------------
     # Map program ids `pid` to the block of C it should compute.
     # This is done in a grouped ordering to promote L2 data reuse.
@@ -89,6 +97,9 @@ def matmul_kernel(
         pid_m = first_pid_m + (pid % group_size_m)
         pid_n = (pid % num_pid_in_group) // group_size_m
 
+    tl.assume(pid_m >= 0)
+    tl.assume(pid_n >= 0)
+
     # Create pointers for first block of A and B input matrices
     offs_k = tl.arange(0, BLOCK_SIZE_K)
     offs_am = (pid_m * BLOCK_SIZE_M + tl.arange(0, BLOCK_SIZE_M)) % M