Skip to content

Commit d78b4f9

Browse files
[BACKEND] Add workaround for TMA device side descriptor race condition (#7293)
A fence is currently missing after TMA update. Adding extra instruction to force the fence to be materialized. Patch provided by @mbrookhart Co-authored-by: Matthew Brookhart <[email protected]>
1 parent e56afa6 commit d78b4f9

File tree

2 files changed

+10
-0
lines changed

2 files changed

+10
-0
lines changed

test/Conversion/tritonnvidiagpu_to_llvm.mlir

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -182,6 +182,9 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
182182
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
183183
tt.func public @tensormap_fenceproxy_acquire(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32}) attributes {noinline = false} {
184184
// CHECK: fence.proxy.tensormap::generic.acquire.gpu [ $0 + 0 ], 0x80;
185+
// ptxas missing fence workaround:
186+
// CHECK: cp.async.bulk.commit_group
187+
// CHECK: cp.async.bulk.wait_group.read 0
185188
ttng.tensormap_fenceproxy_acquire %arg0 : !tt.ptr<i8>
186189
tt.return
187190
}

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/TMAToLLVM.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -206,6 +206,13 @@ struct TensormapFenceproxyAcquireOpConversion
206206
*ptxBuilder.create<>("fence.proxy.tensormap::generic.acquire.gpu");
207207
fence(descAddrOpr, sizeOpr).predicate(pred);
208208

209+
// Workaround for a ptxas bug missing a fence after generic.acquire.gpu.
210+
// TODO: remove the workaround once ptxas is fixed.
211+
auto &commit = *ptxBuilder.create<>("cp.async.bulk.commit_group");
212+
commit().predicate(pred);
213+
auto &wait = *ptxBuilder.create<>("cp.async.bulk.wait_group.read 0");
214+
wait().predicate(pred);
215+
209216
ptxBuilder.launch(rewriter, loc, getVoidType());
210217

211218
// We run the fence on a single warp, then use a barrier to synchronize the

0 commit comments

Comments
 (0)