)
from mlir.extras.dialects.ext import arith, memref, gpu, scf
from mlir.extras.dialects.ext.gpu import (
-     block_id,
-     thread_id,
+     block_idx,
+     thread_idx,
    block_dim,
    get_compile_object_bytes,
)
_ = memref


- def build_cuda_func(compiled_module, kernel_name="mat_product_kernel"):
+ def build_cuda_func(compiled_module, kernel_name="naive"):
    ptx = get_compile_object_bytes(compiled_module)
    mod = Module()
    mod.load(ptx)
    return mod.get_function(kernel_name)


+ def print_ptx(compiled_module):
+     ptx = get_compile_object_bytes(compiled_module)
+     print(ptx.decode())
+
+
+ def compile_module(module, enable_ir_printing=False, print_ptx_=False):
+     if enable_ir_printing:
+         print_ptx_ = True
+     mod = run_pipeline(
+         module,
+         Pipeline().add_pass(
+             "gpu-lower-to-nvvm-pipeline",
+             # https://github.com/llvm/llvm-project/blob/ace69e6b942b8fa7e610d70be2a92e801ceea481/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h#L18
+             **{
+                 "cubin-chip": "sm_80",
+                 "cubin-features": "+ptx83",
+                 "cubin-format": "isa",
+                 "kernel-bare-ptr-calling-convention": "1",
+                 "opt-level": "2",
+                 # "cubin-format": "fatbin",
+                 # "cubin-format": "bin",
+             },
+         ),
+         enable_ir_printing=enable_ir_printing,
+     )
+     if print_ptx_:
+         print_ptx(mod)
+
+     return mod
+
+
@contextlib.contextmanager
def time_cuda():
    start_gpu = cp.cuda.Event()
@@ -50,80 +81,254 @@ def time_cuda():

@gpu.func
@canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
- def mat_product_kernel[
+ def sgemm_naive[
+     M, K, N, dtype
+ ](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
+     one = arith.constant(1.0, type=dtype)
+     tmp = arith.constant(0, type=dtype)
+
+     # this is from the example and it's basically a mistake
+     # it increments the row for each adjacent thread id
+     # uncomment the print to see
+     r = block_dim.x * block_idx.x + thread_idx.x
+     c = block_dim.y * block_idx.y + thread_idx.y
+     # tid = gpu.thread_id()
+     # gpu.printf("tid: %ld: (%ld, %ld)\n", tid, r, c)
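+     # e.g. with 32x32 blocks (and the usual x-fastest thread linearization), threads
+     # 0..31 of a warp share c but get r = 0..31, so each warp access to A and C is
+     # strided across 32 rows instead of one contiguous row segment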
+
+     for k, tmp in range_(K, iter_args=[tmp]):
+         tmp += A[r, k] * B[k, c]
+         tmp = yield tmp
+     C[r, c] = tmp + one
+
+
+ @gpu.func
+ @canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
+ def sgemm_naive_row_order[
    M, K, N, dtype
](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
-     x = block_dim.x * block_id.x + thread_id.x
-     y = block_dim.y * block_id.y + thread_id.y
+     one = arith.constant(1.0, type=dtype)
+     tmp = arith.constant(0, type=dtype)
+
+     # increment along the cols (ie preserve row-order access)
+     c = block_dim.x * block_idx.x + thread_idx.x
+     r = block_dim.y * block_idx.y + thread_idx.y
+     # tid = gpu.thread_id()
+     # gpu.printf("tid: %ld: (%ld, %ld)\n", tid, r, c)
+
+     for k, tmp in range_(K, iter_args=[tmp]):
+         tmp += A[r, k] * B[k, c]
+         tmp = yield tmp
+     C[r, c] = tmp + one
+
+
+ @gpu.func
+ @canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
+ def sgemm_coalesce[
+     M, K, N, dtype, BLOCK_SIZE
+ ](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
+
+     tid = gpu.thread_id()
+     # this is actually floordiv
+     r = block_idx.x * BLOCK_SIZE + (tid / BLOCK_SIZE)
+     c = block_idx.y * BLOCK_SIZE + (tid % BLOCK_SIZE)
+     # gpu.printf("tid: %ld: (%ld, %ld)\n", tid, r, c)
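+     # consecutive tids now share r and get consecutive c, so a warp reads a contiguous
+     # run of B[k, :] and writes a contiguous run of C[r, :]
+     # (e.g. with BLOCK_SIZE=32 in block (0, 0): tid 0..31 -> r=0, c=0..31)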
+
+     one = arith.constant(1.0, type=dtype)
+     tmp = arith.constant(0, type=dtype)
+
+     for k, tmp in range_(K, iter_args=[tmp]):
+         # k varies per loop iteration (same across the warp) while c varies with tid
+         # apparently that's fine? i guess all the loads can happen
+         # because there's enough scratch per SM to prefetch all the data each thread needs?
+         tmp += A[r, k] * B[k, c]
+         tmp = yield tmp
+     C[r, c] = tmp + one
+
+
+ # So if you try to load something like:
+ #
+ # B.T:
+ #
+ # 0 0 0 0 0 0 0 0
+ # 1 1 1 1 1 1 1 1
+ # 2 2 2 2 2 2 2 2
+ #
+ # vs
+ #
+ # B:
+ # 0 1 2 3 4 5 6 7 8
+ # 0 1 2 3 4 5 6 7 8
+ # 0 1 2 3 4 5 6 7 8
+ #
+ # In B, you are feeding all threads with a single load (say a warp can load 8 elements at a time) and then you increment k
+ #
+ # in B.T, a single load is feeding only a single thread, so the others are probably waiting for their load to happen
+ # these are the issues by thread:
+ #
+ # 0: (0, 0), (1, 0), (2, 0)
+ # 1: (0, 1), (1, 1), (2, 1)
+ # 2: (0, 2), (1, 2), (2, 2)
+ #
+ # warp receives these issues:
+ #
+ # (0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2), (2, 0), (2, 1), (2, 2)
+ #
+ # warp issues coalesced reads:
+ #
+ # (0, 0:2), (1, 0:2), (2, 0:2)
+ # so even though the threads have a bad memory access pattern
+ # the warp has a good memory access pattern
+ # and since the actual load happens at warp level
+ # it's good
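+ # (roughly: 32 threads x 4-byte floats is one 128-byte transaction when the addresses are
+ # contiguous, vs. up to 32 separate transactions when each thread hits a different row)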
+ @gpu.func
+ @canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
+ def sgemm_coalesce_transpose_B[
+     M, K, N, dtype, BLOCK_SIZE
+ ](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
+
+     tid = gpu.thread_id()
+     r = block_idx.x * BLOCK_SIZE + (tid / BLOCK_SIZE)
+     c = block_idx.y * BLOCK_SIZE + (tid % BLOCK_SIZE)

    one = arith.constant(1.0, type=dtype)
    tmp = arith.constant(0, type=dtype)
+
    for k, tmp in range_(K, iter_args=[tmp]):
-         tmp += A[x, k] * B[k, y]
+         # this is slower because c is incremented with each tid
+         # so you break memory coalescing
+         # but k now being on the row order dim doesn't help?
+         tmp += A[r, k] * B[c, k]
+         tmp = yield tmp
+     C[r, c] = tmp + one
+
+
+ @gpu.func
+ @canonicalize(using=(arith.canonicalizer, scf.canonicalizer))
+ def sgemm_shared_mem_block[
+     M, K, N, dtype, BLOCK_SIZE
+ ](A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)):
+     # allocate buffer for current block in fast shared mem
+     # shared mem is shared between all threads in a block
+     base = gpu.dynamic_shared_memory()
+     A_shared = memref.view(base, (BLOCK_SIZE, BLOCK_SIZE), dtype=dtype)
+     B_shared = memref.view(
+         base, (BLOCK_SIZE, BLOCK_SIZE), dtype=dtype, shift=BLOCK_SIZE * BLOCK_SIZE
+     )
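+     # A_shared and B_shared are two BLOCK_SIZE x BLOCK_SIZE views into the same dynamic
+     # shared-memory buffer (B_shared offset by BLOCK_SIZE*BLOCK_SIZE elements), so the
+     # launch below has to request at least 2 * BLOCK_SIZE**2 * sizeof(dtype) bytes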
+
+     # the inner row & col that we're accessing in this thread
+     tid = gpu.thread_id()
+     thread_row = tid / BLOCK_SIZE
+     thread_col = tid % BLOCK_SIZE
+
+     # the output block that we want to compute in this threadblock
+     c_row = block_idx.x * BLOCK_SIZE
+     c_col = block_idx.y * BLOCK_SIZE
+
+     one = arith.constant(1.0, type=dtype)
+     tmp = arith.constant(0, type=dtype)
+
+     for bk_idx, tmp in range_(0, K, BLOCK_SIZE, iter_args=[tmp]):
+         A_ = A[c_row : c_row + BLOCK_SIZE, bk_idx : bk_idx + BLOCK_SIZE]
+         B_ = B[bk_idx : bk_idx + BLOCK_SIZE, c_col : c_col + BLOCK_SIZE]
+
+         # Have each thread load one of the elements in A & B
+         # Make the threadCol (=threadIdx.x) the consecutive index
+         # to allow global memory access coalescing
+         A_shared[thread_row, thread_col] = A_[thread_row, thread_col]
+         B_shared[thread_row, thread_col] = B_[thread_row, thread_col]
+
+         # block threads in this block until cache is fully populated
+         gpu.barrier()
+
+         # execute the dotproduct on the currently cached block
+         for k, tmp in range_(BLOCK_SIZE, iter_args=[tmp]):
+             tmp += A_shared[thread_row, k] * B_shared[k, thread_col]
+             tmp = yield tmp
+
+         # need to sync again at the end, to avoid faster threads
+         # fetching the next block into the cache before slower threads are done
+         gpu.barrier()
+
        tmp = yield tmp
-     C[x, y] = tmp + one
+
+     C_ = C[c_row : c_row + BLOCK_SIZE, c_col : c_col + BLOCK_SIZE]
+     C_[thread_row, thread_col] = tmp + one


- def main(ctx: MLIRContext, M, K, N, BLOCK_SIZE=32, repeat_times=50):
+ def main(ctx: MLIRContext, M, K, N, BLOCK_SIZE=32, repeat_times=None):
+     if repeat_times is None:
+         repeat_times = 50
    dtype = T.f32()
    npy_dtype = np.float32

    gpu.set_container_module(ctx.module)

-     @gpu.module("naive", ["#nvvm.target"])
-     def _():
-         mat_product_kernel[M, K, N, dtype].emit()
+     @gpu.module("matmul", ["#nvvm.target"])
+     def matmul_mod():
+         sgemm_shared_mem_block[M, K, N, dtype, BLOCK_SIZE].emit()
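+         # emit one of the other kernels here instead (e.g. sgemm_naive[M, K, N, dtype] or
+         # sgemm_coalesce[M, K, N, dtype, BLOCK_SIZE]) to benchmark that variant; the host
+         # code below keys off the kernel name for the transpose_B / shared-mem handling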

    # print(ctx.module)
-     ctx.module.operation.verify()
+     # print(ctx.module.operation.verify())
+     # exit()

-     compiled_module = run_pipeline(
-         ctx.module,
-         Pipeline().add_pass(
-             "gpu-lower-to-nvvm-pipeline",
-             # https://github.com/llvm/llvm-project/blob/ace69e6b942b8fa7e610d70be2a92e801ceea481/mlir/include/mlir/Dialect/GPU/Pipelines/Passes.h#L18
-             **{
-                 "cubin-chip": "sm_80",
-                 "cubin-features": "+ptx83",
-                 "cubin-format": "isa",
-                 "kernel-bare-ptr-calling-convention": "1",
-                 # "cubin-format": "fatbin",
-                 # "cubin-format": "bin",
-             },
-         ),
-     )
-     cuda_func = build_cuda_func(compiled_module)
-     # print(compiled_module)
+     kernel_name = matmul_mod.opview.body.operations[0].attributes["sym_name"].value
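+     # grabs the sym_name of the first op in the gpu.module; fine here since exactly one
+     # kernel is emitted above, but with several kernels you'd have to pick the right one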
+     compiled_module = compile_module(ctx.module)
+     cuda_func = build_cuda_func(compiled_module, kernel_name)
    # print_ptx(compiled_module)

    A = np.random.randint(0, 10, (M, K)).astype(npy_dtype)
    B = np.random.randint(0, 10, (K, N)).astype(npy_dtype)
    C = np.zeros((M, N)).astype(npy_dtype)

    dA = cp.asarray(A)
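+     # the transpose_B kernel indexes B as B[c, k], i.e. it expects B laid out transposed,
+     # so hand it a contiguous copy of B.T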
-     dB = cp.asarray(B)
+     if "transpose_B" in kernel_name:
+         dB = cp.asarray(np.ascontiguousarray(B.T))
+     else:
+         dB = cp.asarray(B)
    dC = cp.asarray(C)

+     grid_dims = (math.ceil(M / BLOCK_SIZE), math.ceil(N / BLOCK_SIZE))
+     block_dims = (BLOCK_SIZE, BLOCK_SIZE)
+
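+     # one BLOCK_SIZE x BLOCK_SIZE thread block per output tile of C; the shared-mem kernel
+     # additionally needs dynamic shared memory for its two BLOCK_SIZE x BLOCK_SIZE tiles
+     # (with f32 and BLOCK_SIZE=32 that's 2 * 32 * 32 * 4 B = 8 KiB per block)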
+     if "shared" in kernel_name:
+         shared_mem = 2 * BLOCK_SIZE * BLOCK_SIZE * npy_dtype().nbytes
+     else:
+         shared_mem = None
+
+     cuda_func(
+         grid_dims,
+         block_dims,
+         (dA.data.ptr, dB.data.ptr, dC.data.ptr),
+         shared_mem=shared_mem,
+     )
+     C = cp.asnumpy(dC)
+     if not np.array_equal(C, A @ B + 1):
+         print(A @ B + 1)
+         print(C)
+         assert False
+     if repeat_times < 1:
+         return
+
    with time_cuda() as (start_gpu, end_gpu):
        for _ in range(repeat_times):
            cuda_func(
-                 (math.ceil(M / BLOCK_SIZE), math.ceil(N / BLOCK_SIZE), 1),
-                 (BLOCK_SIZE, BLOCK_SIZE, 1),
+                 grid_dims,
+                 block_dims,
                (dA.data.ptr, dB.data.ptr, dC.data.ptr),
+                 shared_mem=shared_mem,
            )

    t_gpu = cp.cuda.get_elapsed_time(start_gpu, end_gpu)

    print(f"t_gpu={t_gpu / repeat_times:.6f} ms")

-     if not cp.array_equal(dC, dA @ dB + 1):
-         print(dA @ dB + 1)
-         print(dC)

+ sizes = [128, 256, 512, 1024]
+ repeats = None

- for s in [128, 256, 512, 1024]:
+ for s in sizes:
    with (
        mlir_mod_ctx() as ctx,
        # enable_debug()
    ):
-         main(ctx, s, s, s)
+         main(ctx, s, s, s, repeat_times=repeats)