
Commit 8274554: working demo

1 parent 46a7793

8 files changed, +89 -51 lines changed

examples/att.txt

Lines changed: 0 additions & 5 deletions
This file was deleted.

examples/requirements.txt

Lines changed: 0 additions & 2 deletions
This file was deleted.

examples/rocprof.sh

Lines changed: 1 addition & 2 deletions
@@ -8,13 +8,12 @@ echo "Script directory: $SCRIPT_DIR"
 
 export PATH=/opt/rocm-6.5.0/bin:$PATH
 export PYTHONPATH=$SCRIPT_DIR/..
-export OUTPUT_PATH=$SCRIPT_DIR
 export ROCPROF_ATT_LIBRARY_PATH=/opt/rocm-6.5.0/att-decoder-v3-3.0.0-Linux/lib
 export ATT_VIEWER=../../ROCProfiler-ATT-Viewer-amd-staging/cmake-build-debug/ATTViewer
 
 
 rm -rf traces
-/opt/rocm-6.5.0/bin/rocprofv3 -i att.json -d traces -o demo_trace -- $SCRIPT_DIR/demo.py
+rocprofv3 -i att.json -d traces -o demo_trace -- $SCRIPT_DIR/schedule_barriers.py
 
 for ui in $(ls $SCRIPT_DIR/traces) ; do
   if [ -d $SCRIPT_DIR/traces/$ui ]; then

examples/demo.py renamed to examples/schedule_barriers.py

Lines changed: 32 additions & 19 deletions
@@ -58,7 +58,7 @@ def gpu_module():
 set_container_module(ctx.module)
 
 v_len = 16
-M, K, N = 16, 16, 16
+M, K, N = 512, 512, 512
 TILE_SIZE = BK = 16
 dtype = T.f16()
 np_dtype = np.float16
@@ -78,23 +78,27 @@ def kernel(
 
     row = block_idx.y * TILE_SIZE + thread_idx.y
     col = block_idx.x * TILE_SIZE + thread_idx.x
+    lane = thread_idx.x % v_len
     # gpu.printf("(%ld, %ld)\n", row, col)
     # vector.print_(source=row)
 
     sum = arith.constant(np.full([v_len], 0.0, np_dtype), v16)
-    for t, sum, _ in scf.range_(0, N, BK, iter_args=[sum]):
-        Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + t]
-        As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + t]
 
+    Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + 0]
+    As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + 0]
+
+    for t, sum, _ in scf.range_(BK, N + BK, BK, iter_args=[sum]):
        gpu.barrier()
 
-        lane = thread_idx.x % v_len
        a_frag = As @ vector.load(v16) @ [lane, 0]
        b_frag = Bs @ vector.load(v16) @ [lane, 0]
 
-        # call the WMMA intrinsic
-        false = arith.constant(False, T.bool())
-        sum = rocdl.wmma_f16_16x16x16_f16(v16, [a_frag, b_frag, sum, false])
+        sum = rocdl.wmma_f16_16x16x16_f16(a_frag, b_frag, sum)
+
+        if arith.index_cast(t, T.i32()) < N:
+            Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + t]
+            As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + t]
+
        sum = yield sum
 
    C[row, col] = sum[2 * (row // 2)]
@@ -142,18 +146,25 @@ def gpu_module():
 hip_module = hip_check(hip.hipModuleLoadData(hsaco))
 function = hip_check(hip.hipModuleGetFunction(hip_module, kernel.__name__.encode()))
 
-a_h = np.random.randint(0, 10, (M, K)).astype(dtype=np_dtype)
-b_h = np.random.randint(0, 10, (K, N)).astype(dtype=np_dtype)
-# a_h = np.ones((M, K)).astype(dtype=np_dtype)
-# b_h = np.ones((K, N)).astype(dtype=np_dtype)
-c_h = 0 * np.ones((M, N), dtype=np_dtype)
+# a_h = np.random.randint(1, 5, (M, K)).astype(dtype=np_dtype)
+# b_h = np.random.randint(1, 5, (K, N)).astype(dtype=np_dtype)
 
+# a_h = np.random.rand(M, K).astype(np_dtype)
+# b_h = np.random.rand(K, N).astype(np_dtype)
+
+a_h = 3 * np.ones((M, K)).astype(dtype=np_dtype)
+a_h[0 : M // 2, 0 : K // 2] = 0
+a_h[M // 2 : M, K // 2 : K] = 1
+b_h = 2 * np.ones((K, N)).astype(dtype=np_dtype)
+b_h[0 : K // 2, 0 : N // 2] = 2
+b_h[K // 2 : K, N // 2 : N] = 3
+
+c_h = 0 * np.ones((M, N), dtype=np.float32)
 for k in range(K):
-    a = a_h[:, k]
-    b = b_h[k, :]
+    a = a_h.astype(np.float32)[:, k]
+    b = b_h.astype(np.float32)[k, :]
     c_h += np.outer(a, b)
-
-assert np.allclose(a_h @ b_h, c_h)
+assert np.allclose(a_h.astype(np.float32) @ b_h.astype(np.float32), c_h)
 
 c_h = -3 * np.ones((M, N), dtype=np_dtype)
 a_num_bytes = a_h.size * a_h.itemsize
@@ -210,10 +221,12 @@ def gpu_module():
 
 if not np.allclose(c_h, correct):
     with np.printoptions(threshold=np.inf, linewidth=np.inf):
-        print("correct\n", correct)
-        print("c_h\n", c_h)
+        # print("correct\n", correct)
+        # print("c_h\n", c_h)
        print("off by atol", np.max(np.abs(correct - c_h)))
        print("off by rtol", np.max(np.abs(correct - c_h) / correct))
+        print("num incorrect", np.sum(np.abs(correct - c_h) != 0))
+        print("fraction incorrect", np.sum(np.abs(correct - c_h) != 0) / (M * N))
 
 
 hip_check(hip.hipFree(a_d))
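
Note: the renamed example restructures the tile loop so the first As/Bs tile is staged before the loop, each iteration runs the WMMA on the already-staged tile behind a single gpu.barrier(), and the next tile is staged only while there is one left. Below is a minimal NumPy sketch of that schedule (hypothetical sizes, plain host code, not the GPU kernel itself):

import numpy as np

M = K = N = 64  # assumed small sizes, for illustration only
TILE = BK = 16

A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)

for bi in range(0, M, TILE):
    for bj in range(0, N, TILE):
        acc = np.zeros((TILE, TILE), dtype=np.float32)
        # stage tile 0 into the "shared" buffers before the loop
        As = A[bi : bi + TILE, 0:BK].copy()
        Bs = B[0:BK, bj : bj + TILE].copy()
        for t in range(BK, K + BK, BK):
            # gpu.barrier() sits here in the kernel: the staged tile is complete
            acc += As @ Bs  # compute on the staged tile (the WMMA step in the kernel)
            if t < K:
                # stage the next tile only if one remains
                As = A[bi : bi + TILE, t : t + BK].copy()
                Bs = B[t : t + BK, bj : bj + TILE].copy()
        C[bi : bi + TILE, bj : bj + TILE] = acc

assert np.allclose(C, A @ B)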

mlir/extras/dialects/ext/gpu.py

Lines changed: 34 additions & 17 deletions
@@ -49,43 +49,43 @@ def __get__(self, owner_self, owner_cls):
 class block_idx:
     @classproperty
     def x(cls):
-        return _block_id("x")
+        return _block_id("x", loc=get_user_code_loc())
 
     @classproperty
     def y(cls):
-        return _block_id("y")
+        return _block_id("y", loc=get_user_code_loc())
 
     @classproperty
     def z(cls):
-        return _block_id("z")
+        return _block_id("z", loc=get_user_code_loc())
 
 
 class block_dim:
     @classproperty
     def x(cls):
-        return _block_dim("x")
+        return _block_dim("x", loc=get_user_code_loc())
 
     @classproperty
     def y(cls):
-        return _block_dim("y")
+        return _block_dim("y", loc=get_user_code_loc())
 
     @classproperty
     def z(cls):
-        return _block_dim("z")
+        return _block_dim("z", loc=get_user_code_loc())
 
 
 class thread_idx:
     @classproperty
     def x(cls):
-        return _thread_id("x")
+        return _thread_id("x", loc=get_user_code_loc())
 
     @classproperty
     def y(cls):
-        return _thread_id("y")
+        return _thread_id("y", loc=get_user_code_loc())
 
     @classproperty
     def z(cls):
-        return _thread_id("z")
+        return _thread_id("z", loc=get_user_code_loc())
 
 
 def thread_id():
@@ -222,6 +222,8 @@ def __init__(
         loc=None,
         ip=None,
     ):
+        if loc is None:
+            loc = get_user_code_loc()
        super().__init__(
            function_type=function_type,
            arg_attrs=arg_attrs,
@@ -301,10 +303,10 @@ def launch_(
 ):
     if loc is None:
         loc = get_user_code_loc()
-        for size in [grid_size, block_size]:
-            for i, s in enumerate(size):
-                if isinstance(s, int):
-                    size[i] = constant(s, index=True)
+    for size in [grid_size, block_size]:
+        for i, s in enumerate(size):
+            if isinstance(s, int):
+                size[i] = constant(s, index=True)
    launch_op = LaunchOp(
        grid_size,
        block_size,
@@ -371,13 +373,16 @@ def __call__(
         async_dependencies=None,
         dynamic_shared_memory_size: Optional[Value] = None,
         stream=None,
+        loc=None,
+        ip=None,
     ):
        for size in [grid_size, block_size]:
            for i, s in enumerate(size):
                if isinstance(s, int):
                    size[i] = constant(s, index=True)
 
-        loc = get_user_code_loc()
+        if loc is None:
+            loc = get_user_code_loc()
        return get_op_result_or_op_results(
            LaunchFuncOp(
                (
@@ -469,6 +474,8 @@ def all_reduce__(value: Value, *, op=None, uniform=None, loc=None, ip=None):
 
 
 def all_reduce_(value: Value, *, op=None, uniform=None, loc=None, ip=None):
+    if loc is None:
+        loc = get_user_code_loc()
    return get_op_result_or_op_results(
        all_reduce__(value, op=op, uniform=uniform, loc=loc, ip=ip)
    )
@@ -577,15 +584,18 @@ def get_compile_object_bytes(compiled_module):
 _printf = printf
 
 
-def printf(format, *args):
-    loc = get_user_code_loc()
-    return _printf(format=format, args=args, loc=loc)
+def printf(format, *args, loc=None, ip=None):
+    if loc is None:
+        loc = get_user_code_loc()
+    return _printf(format=format, args=args, loc=loc, ip=ip)
 
 
 _dynamic_shared_memory = dynamic_shared_memory
 
 
 def dynamic_shared_memory(*, int=False, loc=None, ip=None):
+    if loc is None:
+        loc = get_user_code_loc()
    return _dynamic_shared_memory(
        T.memref(
            ShapedType.get_dynamic_size(),
@@ -611,3 +621,10 @@ def memset(dst, value, async_dependencies=None, *, loc=None, ip=None):
     if isinstance(value, (int, float, bool)):
         value = constant(value, type=dst.type.element_type)
     return _memset(async_token, async_dependencies, dst, value, loc=loc, ip=ip)
+
+
+def barrier(*, loc=None, ip=None):
+    if loc is None:
+        loc = get_user_code_loc()
+
+    return BarrierOp(loc=loc, ip=ip)
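
Note: nearly every hunk in this file applies the same convention: wrappers accept loc/ip keywords and default loc to the user's call site via get_user_code_loc(), so diagnostics point at user code rather than at the wrapper. Below is a self-contained mock of that convention; the get_user_code_loc here is a stand-in for the library helper, not its implementation:

import inspect


def get_user_code_loc():
    # stand-in: report the frame two levels up, i.e. the wrapper's caller
    frame = inspect.stack()[2]
    return f"{frame.filename}:{frame.lineno}"


def barrier(*, loc=None, ip=None):
    # same shape as the new gpu.barrier wrapper; the real one returns BarrierOp(loc=loc, ip=ip)
    if loc is None:
        loc = get_user_code_loc()
    return f"gpu.barrier at {loc}"


print(barrier())                     # loc defaults to this call site
print(barrier(loc="explicit-loc"))   # an explicit loc passes through untouched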

mlir/extras/dialects/ext/memref.py

Lines changed: 11 additions & 1 deletion
@@ -281,6 +281,8 @@ def _canonicalize_start_stop(start, stop, step):
     elif isinstance(start, int) and isinstance(stop, int):
         return stop - start
 
+    raise NotImplementedError
+
 
 def _subview(
     mem: MemRef,
@@ -362,6 +364,8 @@ def _copy_to_subview(
 
 
 def dim(source, index, *, loc=None, ip=None):
+    if loc is None:
+        loc = get_user_code_loc()
    if isinstance(index, int):
        index = constant(index, index=True)
    return _dim(source=source, index=index, loc=loc, ip=ip)
@@ -412,7 +416,9 @@ def global_(
 ).opview
 
 
-def view(source, shape, dtype=None, shift=0, memory_space=None):
+def view(source, shape, dtype=None, shift=0, memory_space=None, loc=None, ip=None):
+    if loc is None:
+        loc = get_user_code_loc()
    if dtype is None:
        dtype = source.type.element_type
    byte_width_dtype = dtype.width // 8
@@ -425,6 +431,8 @@ def view(source, shape, dtype=None, shift=0, memory_space=None):
         source,
         byte_shift,
         [],
+        loc=loc,
+        ip=ip,
     )
 
 
@@ -434,6 +442,8 @@
 def get_global(
     name_or_global, *, name=None, global_=None, result=None, loc=None, ip=None
 ):
+    if loc is None:
+        loc = get_user_code_loc()
    if isinstance(name_or_global, GlobalOp):
        global_ = name_or_global
    elif isinstance(name_or_global, str):

mlir/extras/dialects/ext/rocdl.py

Lines changed: 10 additions & 2 deletions
@@ -24,6 +24,8 @@ class WMMA_F16_16X16X16_F16(ir.OpView):
     _ODS_REGIONS = (0, True)
 
     def __init__(self, res, args, *, loc=None, ip=None):
+        if loc is None:
+            loc = get_user_code_loc()
        operands = []
        results = []
        attributes = {}
@@ -56,5 +58,11 @@ def res(self):
         return self.operation.results[0]
 
 
-def wmma_f16_16x16x16_f16(res, args, *, loc=None, ip=None) -> ir.Value:
-    return WMMA_F16_16X16X16_F16(res=res, args=args, loc=loc, ip=ip).result
+def wmma_f16_16x16x16_f16(A, B, C, *, OPSEL=False, loc=None, ip=None) -> ir.Value:
+    if loc is None:
+        loc = get_user_code_loc()
+
+    opsel = arith.constant(OPSEL, ir.IntegerType.get_signless(1))
+    args = [A, B, C, opsel]
+    v16 = ir.VectorType.get((16,), ir.F16Type.get())
+    return WMMA_F16_16X16X16_F16(res=v16, args=args, loc=loc, ip=ip).result
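
Note: this signature change is why the example and test call sites shrink: the vector<16xf16> result type, the OPSEL i1 constant, and the argument packing now live inside the helper. Below is a runnable mock of the call-site difference, with MLIR values replaced by a placeholder dataclass; only the shape of the call mirrors the real wrapper:

from dataclasses import dataclass


@dataclass
class Vec16F16:  # stand-in for an MLIR vector<16xf16> value
    name: str


def wmma_f16_16x16x16_f16(A, B, C, *, OPSEL=False):
    # the real wrapper builds opsel = arith.constant(OPSEL, i1),
    # args = [A, B, C, opsel], and res = vector<16xf16>
    return Vec16F16(f"wmma({A.name}, {B.name}, {C.name}, opsel={OPSEL})")


a_frag, b_frag, c_frag = Vec16F16("a"), Vec16F16("b"), Vec16F16("c")

# before this commit, callers wrote:
#   false = arith.constant(False, T.bool())
#   c_frag = rocdl.wmma_f16_16x16x16_f16(v16f16, [a_frag, b_frag, c_frag, false])
# now the same call is just:
c_frag = wmma_f16_16x16x16_f16(a_frag, b_frag, c_frag)
print(c_frag)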

tests/test_gpu.py

Lines changed: 1 addition & 3 deletions
@@ -1228,9 +1228,7 @@ def smol_matmul(
         a_frag[ele] = a[lane, ele]
         a_frag, b_frag = yield a_frag, b_frag
 
-    # call the WMMA intrinsic
-    false = arith.constant(False, T.bool())
-    c_frag = rocdl.wmma_f16_16x16x16_f16(v16f16, [a_frag, b_frag, c_frag, false])
+    c_frag = rocdl.wmma_f16_16x16x16_f16(a_frag, b_frag, c_frag)
 
    for i in scf.range_(v_len):
        gpu.printf("(%02ld, %02ld, %02ld), %f\n", lIdx, lane, i, c_frag[i])
