diff --git a/examples/att.json b/examples/att.json new file mode 100644 index 0000000..80bfdee --- /dev/null +++ b/examples/att.json @@ -0,0 +1,12 @@ +{ + "jobs": [ + { + "advanced_thread_trace": true, + "att_parse" : "trace", + "att_target_cu" : 0, + "att_shader_engine_mask" : "0xF", + "att_simd_select": "0x0", + "att_buffer_size": "0x60000000" + } + ] +} diff --git a/examples/rocprof.sh b/examples/rocprof.sh new file mode 100755 index 0000000..765277e --- /dev/null +++ b/examples/rocprof.sh @@ -0,0 +1,27 @@ +#!/bin/bash + +#set -eux + +cd "$(dirname "$0")" +SCRIPT_DIR="$(pwd)" +echo "Script directory: $SCRIPT_DIR" + +export PATH=/opt/rocm-6.5.0/bin:$PATH +export PYTHONPATH=$SCRIPT_DIR/.. +export ROCPROF_ATT_LIBRARY_PATH=/opt/rocm-6.5.0/att-decoder-v3-3.0.0-Linux/lib +export ATT_VIEWER=../../ROCProfiler-ATT-Viewer-amd-staging/cmake-build-debug/ATTViewer + + +rm -rf traces +rocprofv3 -i att.json -d traces -o demo_trace -- $SCRIPT_DIR/schedule_barriers.py + +for ui in $(ls $SCRIPT_DIR/traces) ; do + if [ -d $SCRIPT_DIR/traces/$ui ]; then + ls $SCRIPT_DIR/traces/$ui | grep se > /dev/null + if [ $? == 0 ]; then + UI_PATH=$SCRIPT_DIR/traces/$ui + fi + fi +done + +$ATT_VIEWER $UI_PATH \ No newline at end of file diff --git a/examples/schedule_barriers.py b/examples/schedule_barriers.py new file mode 100755 index 0000000..402920d --- /dev/null +++ b/examples/schedule_barriers.py @@ -0,0 +1,236 @@ +#!/usr/bin/env python +from pathlib import Path + +import mlir.extras.types as T +import numpy as np +from hip import hip +from mlir.ir import InsertionPoint + +from mlir.extras.ast.canonicalize import canonicalize +from mlir.extras.context import RAIIMLIRContextModule +from mlir.extras.dialects.ext import memref, scf, arith, rocdl, gpu, llvm, vector + +# noinspection PyUnresolvedReferences +from mlir.extras.dialects.ext.gpu import ( + all_reduce, + wait, + thread_attr as thread, + block_idx, + thread_idx, + block_dim, + GPUModuleMeta, + func as gpu_func, + set_container_module, + launch, + all_reduce_, + module, + get_compile_object_bytes, + lds_space, + dynamic_shared_memory, +) +from mlir.extras.runtime.passes import run_pipeline, Pipeline + +# noinspection PyUnresolvedReferences +from util import hip_check, launch_kernel, hip_synchronize + + +def time_to_gflops(time_ms, N): + return 1e-6 * (N * N * N * 2 + 3 * N * N) // time_ms + + +# just so it doesn't get DCE'd by black/reformat +# TypeError: 'mlir._mlir_libs._mlir.ir.BlockArgument' object is not subscriptable +_ = memref + +ctx = RAIIMLIRContextModule() +set_container_module(ctx.module) + + +# just a default attr - actual target is set blow +@module("kernels", [f'#rocdl.target']) +def gpu_module(): + pass + + +ip = InsertionPoint.at_block_begin(gpu_module.regions[0].blocks[0]) +ip.__enter__() + +set_container_module(ctx.module) + +v_len = 16 +M, K, N = 512, 512, 512 +TILE_SIZE = BK = 16 +dtype = T.f16() +np_dtype = np.float16 +v16 = T.vector(v_len, dtype) + + +@gpu_func +@canonicalize(using=scf.canonicalizer) +def kernel( + A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype) +): + base = dynamic_shared_memory() + As = memref.view(base, (TILE_SIZE, TILE_SIZE), dtype=dtype) + Bs = memref.view( + base, (TILE_SIZE, TILE_SIZE), dtype=dtype, shift=TILE_SIZE * TILE_SIZE + ) + + row = block_idx.y * TILE_SIZE + thread_idx.y + col = block_idx.x * TILE_SIZE + thread_idx.x + lane = thread_idx.x % v_len + # gpu.printf("(%ld, %ld)\n", row, col) + # vector.print_(source=row) + + sum = arith.constant(np.full([v_len], 0.0, np_dtype), v16) + + Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + 0] + As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + 0] + + for t, sum, _ in scf.range_(BK, N + BK, BK, iter_args=[sum]): + gpu.barrier() + + a_frag = As @ vector.load(v16) @ [lane, 0] + b_frag = Bs @ vector.load(v16) @ [lane, 0] + + sum = rocdl.wmma_f16_16x16x16_f16(a_frag, b_frag, sum) + + if arith.index_cast(t, T.i32()) < N: + Bs[thread_idx.y, thread_idx.x] = B[col, thread_idx.y + t] + As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + t] + + sum = yield sum + + C[row, col] = sum[2 * (row // 2)] + + +props = hip.hipDeviceProp_t() +hip_check(hip.hipGetDeviceProperties(props, 0)) +arch = props.gcnArchName.decode().split(":")[0] + + +@module("naive", [f'#rocdl.target']) +def gpu_module(): + kernel.emit() + + +ip.__exit__(None, None, None) + +# gpu_module = run_pipeline(gpu_module, Pipeline().cse()) +# print(gpu_module) + +O = 3 +output_format = "binary" + +lowered_module = run_pipeline( + gpu_module, + Pipeline() + .Gpu( + Pipeline().convert_gpu_to_rocdl( + use_bare_ptr_memref_call_conv=True, + runtime="HIP", + ) + ) + .rocdl_attach_target(chip=arch, abi="500", O=O) + .gpu_to_llvm() + .lower_to_llvm() + .ensure_debug_info_scope_on_llvm_func(emission_kind="Full") + .gpu_module_to_binary(format=output_format), +) + +hsaco = get_compile_object_bytes(lowered_module) +if output_format == "assembly": + with open(Path(__file__).parent / f"hsacoO{O}.txt", "wb") as f: + f.write(hsaco) + exit() +hip_module = hip_check(hip.hipModuleLoadData(hsaco)) +function = hip_check(hip.hipModuleGetFunction(hip_module, kernel.__name__.encode())) + +# a_h = np.random.randint(1, 5, (M, K)).astype(dtype=np_dtype) +# b_h = np.random.randint(1, 5, (K, N)).astype(dtype=np_dtype) + +# a_h = np.random.rand(M, K).astype(np_dtype) +# b_h = np.random.rand(K, N).astype(np_dtype) + +a_h = 3 * np.ones((M, K)).astype(dtype=np_dtype) +a_h[0 : M // 2, 0 : K // 2] = 0 +a_h[M // 2 : M, K // 2 : K] = 1 +b_h = 2 * np.ones((K, N)).astype(dtype=np_dtype) +b_h[0 : K // 2, 0 : N // 2] = 2 +b_h[K // 2 : K, N // 2 : N] = 3 + +c_h = 0 * np.ones((M, N), dtype=np.float32) +for k in range(K): + a = a_h.astype(np.float32)[:, k] + b = b_h.astype(np.float32)[k, :] + c_h += np.outer(a, b) +assert np.allclose(a_h.astype(np.float32) @ b_h.astype(np.float32), c_h) + +c_h = -3 * np.ones((M, N), dtype=np_dtype) +a_num_bytes = a_h.size * a_h.itemsize +b_num_bytes = b_h.size * b_h.itemsize +c_num_bytes = c_h.size * c_h.itemsize + +a_d = hip_check(hip.hipMalloc(a_num_bytes)) +b_d = hip_check(hip.hipMalloc(b_num_bytes)) +c_d = hip_check(hip.hipMalloc(c_num_bytes)) + +hip_check(hip.hipMemcpy(a_d, a_h, a_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)) +hip_check(hip.hipMemcpy(b_d, b_h, b_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)) +hip_check(hip.hipMemcpy(c_d, c_h, c_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice)) + +( + ( + blocks_per_grid_x, + blocks_per_grid_y, + blocks_per_grid_z, + ), + ( + threads_per_block_x, + threads_per_block_y, + threads_per_block_z, + ), + shared_memory, +) = ( + (N // TILE_SIZE, N // TILE_SIZE, 1), + (TILE_SIZE, TILE_SIZE, 1), + 2 * TILE_SIZE * TILE_SIZE * dtype.width // 8, +) + +stream = 0 + +launch_kernel( + function.as_c_void_p(), + blocks_per_grid_x, + blocks_per_grid_y, + blocks_per_grid_z, + threads_per_block_x, + threads_per_block_y, + threads_per_block_z, + stream, + shared_memory, + a_d, + b_d, + c_d, +) + +correct = a_h @ b_h +assert np.allclose(c_h, -3.0) +assert not np.allclose(correct, c_h) +hip_check(hip.hipMemcpy(c_h, c_d, c_num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost)) + +if not np.allclose(c_h, correct): + with np.printoptions(threshold=np.inf, linewidth=np.inf): + # print("correct\n", correct) + # print("c_h\n", c_h) + print("off by atol", np.max(np.abs(correct - c_h))) + print("off by rtol", np.max(np.abs(correct - c_h) / correct)) + print("num incorrect", np.sum(np.abs(correct - c_h) != 0)) + print("fraction incorrect", np.sum(np.abs(correct - c_h) != 0) / (M * N)) + + +hip_check(hip.hipFree(a_d)) +hip_check(hip.hipFree(b_d)) +hip_check(hip.hipFree(c_d)) + +hip_check(hip.hipModuleUnload(hip_module))