@@ -58,7 +58,7 @@ def gpu_module():
     set_container_module(ctx.module)
 
     v_len = 16
-    M, K, N = 16, 16, 16
+    M, K, N = 512, 512, 512
     TILE_SIZE = BK = 16
     dtype = T.f16()
     np_dtype = np.float16
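For orientation (editorial note, not part of the commit): the problem size grows from a single 16x16x16 tile to a full 512x512x512 matmul, so each block now computes one 16x16 output tile and must iterate over K. A back-of-the-envelope sketch; the launch configuration is outside this hunk, so `grid`/`block`/`k_steps` are illustrative names, not the script's:

TILE_SIZE = BK = 16
M, K, N = 512, 512, 512
grid = (N // TILE_SIZE, M // TILE_SIZE)  # 32 x 32 blocks, one 16x16 output tile each
block = (TILE_SIZE, TILE_SIZE)           # 256 threads per block
k_steps = K // BK                        # each block now makes 32 trips over K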
@@ -78,23 +78,27 @@ def kernel(
 
         row = block_idx.y * TILE_SIZE + thread_idx.y
         col = block_idx.x * TILE_SIZE + thread_idx.x
+        lane = thread_idx.x % v_len
         # gpu.printf("(%ld, %ld)\n", row, col)
         # vector.print_(source=row)
 
         sum = arith.constant(np.full([v_len], 0.0, np_dtype), v16)
-        for t, sum, _ in scf.range_(0, N, BK, iter_args=[sum]):
-            Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + t]
-            As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + t]
 
+        Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + 0]
+        As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + 0]
+
+        for t, sum, _ in scf.range_(BK, N + BK, BK, iter_args=[sum]):
             gpu.barrier()
 
-            lane = thread_idx.x % v_len
             a_frag = As @ vector.load(v16) @ [lane, 0]
             b_frag = Bs @ vector.load(v16) @ [lane, 0]
 
-            # call the WMMA intrinsic
-            false = arith.constant(False, T.bool())
-            sum = rocdl.wmma_f16_16x16x16_f16(v16, [a_frag, b_frag, sum, false])
+            sum = rocdl.wmma_f16_16x16x16_f16(a_frag, b_frag, sum)
+
+            if arith.index_cast(t, T.i32()) < N:
+                Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + t]
+                As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + t]
+
             sum = yield sum
 
         C[row, col] = sum[2 * (row // 2)]
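This hunk restructures the K-loop into a simple software pipeline: the first As/Bs tiles are loaded before the loop (the `+ 0` offsets), each iteration issues the WMMA on the previously loaded tiles and then prefetches the next pair, and the `arith.index_cast(t, T.i32()) < N` guard skips the prefetch on the final trip. The bounds shift from `(0, N)` to `(BK, N + BK)` so the trip count is unchanged, and the loop-invariant `lane` computation is hoisted above the loop. The `wmma_f16_16x16x16_f16` call site also drops the explicit result type and the trailing boolean operand, presumably now supplied by the wrapper. Below is a plain-NumPy sketch of the same loop structure (editorial illustration only; the function name and float32 accumulation are mine, and it ignores shared memory and the transposed Bs layout):

import numpy as np

def pipelined_matmul(A, B, BK=16):
    # Structure-only sketch of the pipelined loop introduced above.
    M, K = A.shape
    _, N = B.shape
    C = np.zeros((M, N), dtype=np.float32)
    # Preload tile 0 before the loop (mirrors the `+ 0` loads).
    As = A[:, 0:BK].astype(np.float32)
    Bs = B[0:BK, :].astype(np.float32)
    for t in range(BK, K + BK, BK):  # mirrors scf.range_(BK, N + BK, BK)
        C += As @ Bs                 # compute on the previously loaded tiles
        if t < K:                    # mirrors the index_cast(t) < N guard
            As = A[:, t:t + BK].astype(np.float32)
            Bs = B[t:t + BK, :].astype(np.float32)
    return C

# Sanity check of the loop structure:
A = np.random.rand(64, 64).astype(np.float16)
B = np.random.rand(64, 64).astype(np.float16)
assert np.allclose(pipelined_matmul(A, B), A.astype(np.float32) @ B.astype(np.float32))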
@@ -142,18 +146,25 @@ def gpu_module():
     hip_module = hip_check(hip.hipModuleLoadData(hsaco))
     function = hip_check(hip.hipModuleGetFunction(hip_module, kernel.__name__.encode()))
 
-    a_h = np.random.randint(0, 10, (M, K)).astype(dtype=np_dtype)
-    b_h = np.random.randint(0, 10, (K, N)).astype(dtype=np_dtype)
-    # a_h = np.ones((M, K)).astype(dtype=np_dtype)
-    # b_h = np.ones((K, N)).astype(dtype=np_dtype)
-    c_h = 0 * np.ones((M, N), dtype=np_dtype)
+    # a_h = np.random.randint(1, 5, (M, K)).astype(dtype=np_dtype)
+    # b_h = np.random.randint(1, 5, (K, N)).astype(dtype=np_dtype)
 
+    # a_h = np.random.rand(M, K).astype(np_dtype)
+    # b_h = np.random.rand(K, N).astype(np_dtype)
+
+    a_h = 3 * np.ones((M, K)).astype(dtype=np_dtype)
+    a_h[0 : M // 2, 0 : K // 2] = 0
+    a_h[M // 2 : M, K // 2 : K] = 1
+    b_h = 2 * np.ones((K, N)).astype(dtype=np_dtype)
+    b_h[0 : K // 2, 0 : N // 2] = 2
+    b_h[K // 2 : K, N // 2 : N] = 3
+
+    c_h = 0 * np.ones((M, N), dtype=np.float32)
     for k in range(K):
-        a = a_h[:, k]
-        b = b_h[k, :]
+        a = a_h.astype(np.float32)[:, k]
+        b = b_h.astype(np.float32)[k, :]
         c_h += np.outer(a, b)
-
-    assert np.allclose(a_h @ b_h, c_h)
+    assert np.allclose(a_h.astype(np.float32) @ b_h.astype(np.float32), c_h)
 
     c_h = -3 * np.ones((M, N), dtype=np_dtype)
     a_num_bytes = a_h.size * a_h.itemsize
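The host-side reference now accumulates in float32 rather than f16. float16 represents integers exactly only up to 2048, so a 512-term running sum of products drifts (and can overflow past the f16 maximum of 65504), which would make the reference itself wrong at this problem size. A small self-contained demonstration (editorial, not from the commit):

import numpy as np

rng = np.random.default_rng(0)
a = rng.integers(0, 10, 512).astype(np.float16)
b = rng.integers(0, 10, 512).astype(np.float16)

acc16 = np.float16(0.0)
for x, y in zip(a, b):
    acc16 = np.float16(acc16 + x * y)  # running sum kept in half precision
acc32 = np.dot(a.astype(np.float32), b.astype(np.float32))  # exact for these small integers
print(acc16, acc32)  # the f16 sum drifts once it exceeds 2048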
@@ -210,10 +221,12 @@ def gpu_module():
 
     if not np.allclose(c_h, correct):
         with np.printoptions(threshold=np.inf, linewidth=np.inf):
-            print("correct\n", correct)
-            print("c_h\n", c_h)
+            # print("correct\n", correct)
+            # print("c_h\n", c_h)
             print("off by atol", np.max(np.abs(correct - c_h)))
             print("off by rtol", np.max(np.abs(correct - c_h) / correct))
+            print("num incorrect", np.sum(np.abs(correct - c_h) != 0))
+            print("fraction incorrect", np.sum(np.abs(correct - c_h) != 0) / (M * N))
 
 
    hip_check(hip.hipFree(a_d))
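With 512x512 outputs, full matrix dumps are no longer readable, so the commit comments them out in favor of aggregate error statistics. A hypothetical helper (not in this commit) for localizing mismatches, taking the script's `c_h` and `correct` arrays as inputs:

import numpy as np

def report_first_mismatch(c_h, correct):
    # Locate mismatching entries, e.g. to check whether errors
    # cluster in particular 16x16 tiles.
    bad = np.argwhere(~np.isclose(c_h, correct))
    if bad.size:
        r, c = bad[0]
        print(f"first mismatch at ({r}, {c}): got {c_h[r, c]}, expected {correct[r, c]}")
    print("num mismatched", len(bad))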