Skip to content

Commit fa1ee7d

Browse files
committed
fix rocprof.sh
1 parent 9b55540 commit fa1ee7d

File tree

4 files changed

+115
-67
lines changed

4 files changed

+115
-67
lines changed

examples/att.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
"att_parse" : "trace",
66
"att_target_cu" : 0,
77
"att_shader_engine_mask" : "0xF",
8-
"att_simd_select": "0xF",
8+
"att_simd_select": "0x0",
99
"att_buffer_size": "0x60000000"
1010
}
1111
]

examples/demo.py

Lines changed: 83 additions & 60 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
#!/usr/bin/env python
2+
from pathlib import Path
23

34
import mlir.extras.types as T
45
import numpy as np
@@ -7,7 +8,7 @@
78

89
from mlir.extras.ast.canonicalize import canonicalize
910
from mlir.extras.context import RAIIMLIRContextModule
10-
from mlir.extras.dialects.ext import memref, scf, arith, rocdl
11+
from mlir.extras.dialects.ext import memref, scf, arith, rocdl, gpu, llvm, vector
1112

1213
# noinspection PyUnresolvedReferences
1314
from mlir.extras.dialects.ext.gpu import (
@@ -25,6 +26,7 @@
2526
module,
2627
get_compile_object_bytes,
2728
lds_space,
29+
dynamic_shared_memory,
2830
)
2931
from mlir.extras.runtime.passes import run_pipeline, Pipeline
3032

@@ -43,10 +45,6 @@ def time_to_gflops(time_ms, N):
4345
ctx = RAIIMLIRContextModule()
4446
set_container_module(ctx.module)
4547

46-
props = hip.hipDeviceProp_t()
47-
hip_check(hip.hipGetDeviceProperties(props, 0))
48-
arch = props.gcnArchName.decode()
49-
5048

5149
# just a default attr - actual target is set blow
5250
@module("kernels", [f'#rocdl.target<abi = "500">'])
@@ -60,40 +58,44 @@ def gpu_module():
6058
set_container_module(ctx.module)
6159

6260
v_len = 16
63-
M, K, N = 1024, 1024, 1024
64-
v16f16 = T.vector(v_len, T.f16())
61+
M, K, N = 512, 512, 512
62+
TILE_SIZE = BK = 16
63+
dtype = T.f16()
64+
np_dtype = np.float16
65+
v16 = T.vector(v_len, dtype)
6566

6667

6768
@gpu_func
6869
@canonicalize(using=scf.canonicalizer)
69-
def smol_matmul(
70-
a: T.memref(M, K, T.f16()),
71-
b: T.memref(K, N, T.f16()),
72-
c: T.memref(M, N, T.f16()),
70+
def kernel(
71+
A: T.memref(M, K, dtype), B: T.memref(K, N, dtype), C: T.memref(M, N, dtype)
7372
):
74-
lIdx = thread_idx.x
75-
# a and b fragments are stored in 8 VGPRs each, in packed format, so 16 elements each for a and b
76-
# a_frag will store one column of the 16x16 matrix A tile
77-
# b_frag will store one row of the 16x16 matrix B tile
78-
a_frag = arith.constant(np.full([v_len], 0.0, np.float16), v16f16)
79-
b_frag = arith.constant(np.full([v_len], 0.0, np.float16), v16f16)
80-
c_frag = arith.constant(np.full([v_len], 0.0, np.float16), v16f16)
81-
82-
# lane is (0-31) mod 16 instead of 0-31 due to matrix replication in RDNA 3
83-
lane = lIdx % v_len
84-
for ele in range(v_len):
85-
b_frag[ele] = b[ele, lane]
86-
a_frag[ele] = a[lane, ele]
87-
# a_frag, b_frag = yield a_frag, b_frag
88-
89-
# call the WMMA intrinsic
90-
false = arith.constant(False, T.bool())
91-
c_frag = rocdl.wmma_f16_16x16x16_f16(v16f16, [a_frag, b_frag, c_frag, false])
92-
93-
for ele in range(v_len // 2):
94-
r = ele * 2 + (lIdx // v_len)
95-
# store results from unpacked c_frag output
96-
c[r, lane] = c_frag[ele * 2]
73+
base = dynamic_shared_memory()
74+
As = memref.view(base, (TILE_SIZE, TILE_SIZE), dtype=dtype)
75+
Bs = memref.view(
76+
base, (TILE_SIZE, TILE_SIZE), dtype=dtype, shift=TILE_SIZE * TILE_SIZE
77+
)
78+
79+
row = block_idx.y * TILE_SIZE + thread_idx.y
80+
col = block_idx.x * TILE_SIZE + thread_idx.x
81+
82+
sum = arith.constant(np.full([v_len], 0.0, np_dtype), v16)
83+
for t, sum, _ in scf.range_(0, N, BK, iter_args=[sum]):
84+
Bs[thread_idx.y, thread_idx.x] = B[thread_idx.y + t, col]
85+
As[thread_idx.y, thread_idx.x] = A[row, thread_idx.x + t]
86+
87+
gpu.barrier()
88+
89+
a_frag = As @ vector.load(v16) @ [thread_idx.y, 0]
90+
b_frag = Bs @ vector.load(v16) @ [0, thread_idx.x]
91+
false = arith.constant(False, T.bool())
92+
sum = rocdl.wmma_f16_16x16x16_f16(v16, [a_frag, b_frag, sum, false])
93+
94+
gpu.barrier()
95+
96+
sum = yield sum
97+
98+
C[row, col] = sum
9799

98100

99101
props = hip.hipDeviceProp_t()
@@ -103,31 +105,38 @@ def smol_matmul(
103105

104106
@module("naive", [f'#rocdl.target<chip = "{arch}", abi = "500">'])
105107
def gpu_module():
106-
smol_matmul.emit()
108+
kernel.emit()
107109

108110

109111
ip.__exit__(None, None, None)
110112

113+
O = 3
114+
output_format = "binary"
115+
111116
lowered_module = run_pipeline(
112117
gpu_module,
113118
Pipeline()
114119
.Gpu(Pipeline().convert_gpu_to_rocdl(use_bare_ptr_memref_call_conv=True))
115-
.rocdl_attach_target(chip=arch, abi="500", O=0)
120+
.rocdl_attach_target(chip=arch, abi="500", O=O)
116121
.gpu_to_llvm()
117122
.lower_to_llvm()
118123
.ensure_debug_info_scope_on_llvm_func(emission_kind="Full")
119-
.gpu_module_to_binary(),
124+
.gpu_module_to_binary(format=output_format),
120125
)
121126

122127
hsaco = get_compile_object_bytes(lowered_module)
128+
if output_format == "assembly":
129+
with open(Path(__file__).parent / f"hsacoO{O}.txt", "wb") as f:
130+
f.write(hsaco)
131+
exit()
123132
hip_module = hip_check(hip.hipModuleLoadData(hsaco))
124-
function = hip_check(
125-
hip.hipModuleGetFunction(hip_module, smol_matmul.__name__.encode())
126-
)
133+
function = hip_check(hip.hipModuleGetFunction(hip_module, kernel.__name__.encode()))
127134

128-
a_h = np.random.randint(0, 10, (M, K)).astype(dtype=np.float16)
129-
b_h = np.random.randint(0, 10, (K, N)).astype(dtype=np.float16)
130-
c_h = -3 * np.ones((M, N), dtype=np.float16)
135+
# a_h = np.random.randint(0, 10, (M, K)).astype(dtype=np_dtype)
136+
# b_h = np.random.randint(0, 10, (K, N)).astype(dtype=np_dtype)
137+
a_h = np.ones((M, K)).astype(dtype=np_dtype)
138+
b_h = np.ones((K, N)).astype(dtype=np_dtype)
139+
c_h = -3 * np.ones((M, N), dtype=np_dtype)
131140

132141
a_num_bytes = a_h.size * a_h.itemsize
133142
b_num_bytes = b_h.size * b_h.itemsize
@@ -141,22 +150,34 @@ def gpu_module():
141150
hip_check(hip.hipMemcpy(b_d, b_h, b_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice))
142151
hip_check(hip.hipMemcpy(c_d, c_h, c_num_bytes, hip.hipMemcpyKind.hipMemcpyHostToDevice))
143152

144-
gridX = 32
145-
gridY = 32
146-
gridZ = 1
147-
warp_size = 32
148-
num_warps = 1
153+
(
154+
(
155+
blocks_per_grid_x,
156+
blocks_per_grid_y,
157+
blocks_per_grid_z,
158+
),
159+
(
160+
threads_per_block_x,
161+
threads_per_block_y,
162+
threads_per_block_z,
163+
),
164+
shared_memory,
165+
) = (
166+
(N // TILE_SIZE, N // TILE_SIZE, 1),
167+
(TILE_SIZE, TILE_SIZE, 1),
168+
2 * TILE_SIZE * TILE_SIZE * dtype.width // 8,
169+
)
170+
149171
stream = 0
150-
shared_memory = 0
151172

152173
launch_kernel(
153174
function.as_c_void_p(),
154-
gridX,
155-
gridY,
156-
gridZ,
157-
warp_size,
158-
num_warps,
159-
1,
175+
blocks_per_grid_x,
176+
blocks_per_grid_y,
177+
blocks_per_grid_z,
178+
threads_per_block_x,
179+
threads_per_block_y,
180+
threads_per_block_z,
160181
stream,
161182
shared_memory,
162183
a_d,
@@ -169,11 +190,13 @@ def gpu_module():
169190
assert not np.allclose(correct, c_h)
170191
hip_check(hip.hipMemcpy(c_h, c_d, c_num_bytes, hip.hipMemcpyKind.hipMemcpyDeviceToHost))
171192

172-
# if not np.allclose(c_h, correct):
173-
# with np.printoptions(threshold=np.inf, linewidth=200):
174-
# print(correct)
175-
# print(c_h)
176-
# assert False
193+
194+
if not np.allclose(c_h, correct):
195+
with np.printoptions(threshold=np.inf, linewidth=np.inf):
196+
# print("correct", correct)
197+
# print("c_h", c_h)
198+
print("off by atol", np.max(np.abs(correct - c_h)))
199+
print("off by rtol", np.max(np.abs(correct - c_h) / correct))
177200

178201
hip_check(hip.hipFree(a_d))
179202
hip_check(hip.hipFree(b_d))

examples/rocprof.sh

Lines changed: 20 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -2,13 +2,27 @@
22

33
#set -eux
44

5+
cd "$(dirname "$0")"
6+
SCRIPT_DIR="$(pwd)"
7+
echo "Script directory: $SCRIPT_DIR"
8+
59
export PATH=/opt/rocm-6.5.0/bin:$PATH
6-
export PYTHONPATH=/home/mlevental/dev_projects/mlir-python-extras
7-
export OUTPUT_PATH=$PWD
10+
export PYTHONPATH=$SCRIPT_DIR/..
11+
export OUTPUT_PATH=$SCRIPT_DIR
812
export ROCPROF_ATT_LIBRARY_PATH=/opt/rocm-6.5.0/att-decoder-v3-3.0.0-Linux/lib
13+
export ATT_VIEWER=../../ROCProfiler-ATT-Viewer-amd-staging/cmake-build-debug/ATTViewer
14+
915

1016
rm -rf traces
11-
#rocprofv2 --kernel-trace /home/mlevental/dev_projects/mlir-python-extras/examples/demo.py
12-
#rocprofv2 -i att.txt --kernel-trace --plugin att auto --mode file,csv -d traces/ /home/mlevental/dev_projects/mlir-python-extras/examples/demo.py
13-
/opt/rocm-6.5.0/bin/rocprofv3 -i att.json -d traces -- /home/mlevental/dev_projects/mlir-python-extras/examples/demo.py
14-
../../ROCProfiler-ATT-Viewer-amd-staging/cmake-build-debug/ATTViewer traces/ui*
17+
/opt/rocm-6.5.0/bin/rocprofv3 -i att.json -d traces -o demo_trace -- $SCRIPT_DIR/demo.py
18+
19+
for ui in $(ls $SCRIPT_DIR/traces) ; do
20+
if [ -d $SCRIPT_DIR/traces/$ui ]; then
21+
ls $SCRIPT_DIR/traces/$ui | grep se > /dev/null
22+
if [ $? == 0 ]; then
23+
UI_PATH=$SCRIPT_DIR/traces/$ui
24+
fi
25+
fi
26+
done
27+
28+
$ATT_VIEWER $UI_PATH

mlir/extras/dialects/ext/vector.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -251,6 +251,8 @@ def extract_strided_slice(vector, offsets, sizes, strides, *, loc=None, ip=None)
251251

252252

253253
def outerproduct(lhs, rhs, acc=None, *, kind=None, loc=None, ip=None):
254+
if loc is None:
255+
loc = get_user_code_loc()
254256
if kind is None:
255257
kind = CombiningKind.ADD
256258
result_shape = [lhs.shape[0], rhs.shape[0]]
@@ -262,6 +264,8 @@ def outerproduct(lhs, rhs, acc=None, *, kind=None, loc=None, ip=None):
262264

263265
@Infix
264266
def outer(lhs, rhs, acc=None, *, kind=None, loc=None, ip=None):
267+
if loc is None:
268+
loc = get_user_code_loc()
265269
return outerproduct(lhs, rhs, acc, kind=kind, loc=loc, ip=ip)
266270

267271

@@ -270,6 +274,8 @@ def outer(lhs, rhs, acc=None, *, kind=None, loc=None, ip=None):
270274

271275
@Infix
272276
def shuffle(v1, v2, mask, *, loc=None, ip=None):
277+
if loc is None:
278+
loc = get_user_code_loc()
273279
return ShuffleOp(v1=v1, v2=v2, mask=mask, loc=loc, ip=ip).result
274280

275281

@@ -278,6 +284,11 @@ def shuffle(v1, v2, mask, *, loc=None, ip=None):
278284

279285
@Infix
280286
def load(base, indices, result, *, nontemporal=None, loc=None, ip=None):
287+
if loc is None:
288+
loc = get_user_code_loc()
289+
for j, i in enumerate(indices):
290+
if isinstance(i, int):
291+
indices[j] = constant(i, index=True)
281292
return LoadOp(
282293
result=result,
283294
base=base,

0 commit comments

Comments
 (0)