Skip to content

Commit 9c2cefd

Browse files
authored
[AMD][NFC] Use ROCDL ops for WaitAsyncCntOp/WaitTensorCntOp (#8820)
Replace the `WaitAsyncCntOp`/`WaitTensorCntOp` LLVM intrinsic call ops in the AMDGPU backend with the corresponding ROCDL ops.
1 parent dad2ba0 commit 9c2cefd

File tree

2 files changed

+4
-8
lines changed

2 files changed

+4
-8
lines changed

test/Conversion/amd/tritongpu_tdm_to_llvm.mlir

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -17,7 +17,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
1717
// CHECK-COUNT-8: llvm.insertelement{{.*}} : vector<8xi32>
1818
// CHECK: llvm.amdgcn.tensor.load.to.lds.d2{{.*}} : (vector<4xi32>, vector<8xi32>, i32) -> ()
1919
%2 = amdg.async_tdm_copy_global_to_local %0[%c_offset, %c_offset] into %1, %c_pred : !tt.tensordesc<tensor<64x64xf16, #shared>> -> !ttg.memdesc<64x64xf16, #shared, #smem, mutable>
20-
// CHECK: llvm.amdgcn.s.wait.tensorcnt{{.*}} : (i16) -> ()
20+
// CHECK: rocdl.s.wait.tensorcnt 0
2121
%3 = amdg.async_tdm_wait {num = 0 : i32}
2222
%4 = ttg.local_load %1 : !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> tensor<64x64xf16, #blocked>
2323
tt.return
@@ -44,7 +44,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.thr
4444
// CHECK-COUNT-8: llvm.insertelement{{.*}} : vector<8xi32>
4545
// CHECK: llvm.amdgcn.tensor.store.from.lds.d2{{.*}} : (vector<4xi32>, vector<8xi32>, i32) -> ()
4646
amdg.async_tdm_copy_local_to_global %0[%c_offset, %c_offset] from %1: !ttg.memdesc<64x64xf16, #shared, #smem, mutable> -> !tt.tensordesc<tensor<64x64xf16, #shared>>
47-
// CHECK: llvm.amdgcn.s.wait.tensorcnt{{.*}} : (i16) -> ()
47+
// CHECK: rocdl.s.wait.tensorcnt 0
4848
%3 = amdg.async_tdm_wait {num = 0 : i32}
4949
tt.return
5050
}

third_party/amd/lib/TritonAMDGPUToLLVM/LoadStoreOpToLLVM.cpp

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -1996,9 +1996,7 @@ struct AsyncWaitOpConversion
19961996
case ISAFamily::GFX1250: {
19971997
// Clamp asyncCnt to 6 bits (hw limit); lower means conservative
19981998
unsigned asyncCnt = std::min(63u, op.getNumInst());
1999-
LLVM::createLLVMIntrinsicCallOp(rewriter, loc,
2000-
"llvm.amdgcn.s.wait.asynccnt", {},
2001-
{b.i16_val(asyncCnt)});
1999+
ROCDL::WaitAsynccntOp::create(rewriter, loc, asyncCnt);
20022000
break;
20032001
}
20042002
default:
@@ -2025,9 +2023,7 @@ struct AsyncTDMWaitConversion
20252023
ConversionPatternRewriter &rewriter) const override {
20262024
auto loc = op.getLoc();
20272025
auto b = TritonLLVMOpBuilder(loc, rewriter);
2028-
LLVM::createLLVMIntrinsicCallOp(rewriter, loc,
2029-
"llvm.amdgcn.s.wait.tensorcnt", {},
2030-
{b.i16_val(op.getNum())});
2026+
ROCDL::WaitTensorcntOp::create(rewriter, loc, op.getNum());
20312027
rewriter.eraseOp(op);
20322028
return success();
20332029
}

0 commit comments

Comments
 (0)