We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 7adc012, commit 0d975d4. Copy full SHA for 0d975d4
python/perf-kernels/streamk/streamk_kernel.py
@@ -162,6 +162,7 @@ def streamk_gemm(
162
rn1 = tl.max_contiguous(tl.multiple_of(rn1, BLOCK_SIZE_N), BLOCK_SIZE_N)
163
P_ = P + pid * BLOCK_SIZE_M * BLOCK_SIZE_N + rm1[:, None] * BLOCK_SIZE_N + rn1[None, :]
164
tl.store(P_, acc, cache_modifier=".wt")
165
+ tl.debug_barrier()
166
tl.store(locks + pid, 1, cache_modifier=".wt")
167
# tl.store(P_, acc)
168
# tl.debug_barrier()
0 commit comments