Skip to content

Commit 15a4d66

Browse files
authored
[AMD] NFC: Replace intrinsics for ballot and readlane with ROCDL ops (triton-lang#6325)
Just a small cleanup to replace intrinsics with ROCDL ops.
1 parent 575bed8 commit 15a4d66

File tree

3 files changed

+11
-15
lines changed

3 files changed

+11
-15
lines changed

test/Conversion/amd/tritongpu_to_llvm.mlir

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -164,7 +164,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
164164
// CHECK-SAME: with 323, 15, 15, true : f32
165165
// CHECK-NEXT: llvm.intr.maxnum
166166

167-
// CHECK: llvm.amdgcn.readlane
167+
// CHECK: rocdl.readlane
168168
%0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
169169
^bb0(%arg1: f32, %arg2: f32):
170170
%1 = arith.maxnumf %arg1, %arg2 : f32
@@ -269,15 +269,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
269269
// CHECK-COUNT-2: llvm.amdgcn.ds.permute
270270
// CHECK: llvm.bitcast
271271
// CHECK: llvm.inttoptr
272-
// CHECK: llvm.amdgcn.ballot
272+
// CHECK: rocdl.ballot
273273
// CHECK: llvm.ptrtoint
274-
// CHECK: llvm.amdgcn.ballot
274+
// CHECK: rocdl.ballot
275275

276276
// loop body:
277277
// CHECK: llvm.bitcast
278278
// CHECK-COUNT-2: llvm.amdgcn.readfirstlane
279279
// CHECK: llvm.bitcast
280-
// CHECK: llvm.amdgcn.ballot
280+
// CHECK: rocdl.ballot
281281
// CHECK: rocdl.mbcnt.lo
282282
// CHECK: rocdl.mbcnt.hi
283283

@@ -294,7 +294,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
294294
// CHECK: llvm.amdgcn.ds.permute
295295
// CHECK: llvm.bitcast
296296

297-
// CHECK: llvm.amdgcn.ballot
297+
// CHECK: rocdl.ballot
298298

299299
// reduction:
300300
// CHECK-COUNT-6: llvm.amdgcn.ds.bpermute

test/Conversion/amd/tritongpu_to_llvm_rdna.mlir

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
2222

2323
// CHECK: llvm.amdgcn.permlanex16
2424
// CHECK: llvm.intr.maxnum
25-
// CHECK: llvm.amdgcn.readlane
25+
// CHECK: rocdl.readlane
2626
%0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
2727
^bb0(%arg1: f32, %arg2: f32):
2828
%1 = arith.maxnumf %arg1, %arg2 : f32

third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp

Lines changed: 5 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -110,9 +110,7 @@ Value TargetInfo::getClusterCTAId(RewriterBase &rewriter, Location loc) const {
110110

111111
Value TargetInfo::ballot(RewriterBase &rewriter, Location loc, Type type,
112112
Value cmp) const {
113-
return LLVM::createLLVMIntrinsicCallOp(rewriter, loc, "llvm.amdgcn.ballot",
114-
type, cmp)
115-
->getResult(0);
113+
return rewriter.create<ROCDL::BallotOp>(loc, type, cmp);
116114
}
117115

118116
void TargetInfo::storeDShared(RewriterBase &rewriter, Location loc, Value ptr,
@@ -366,12 +364,10 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
366364
// Similarly, we need to cast data types for readlane instruction.
367365
Type actualType = castToAndSExtInt(rewriter, loc, buf, valType, 16);
368366

369-
// Get reduction result from lane 63/31
370-
std::string intrinsic = "llvm.amdgcn.readlane";
371-
Value result = LLVM::createLLVMIntrinsicCallOp(
372-
rewriter, loc, intrinsic, actualType,
373-
ValueRange{buf, b.i32_val(isCDNA() ? 63 : 31)})
374-
->getResult(0);
367+
// Get reduction result from the last lane of the warp
368+
Value lastLaneId = b.i32_val(gpu::lookupThreadsPerWarp(rewriter) - 1);
369+
Value result =
370+
rewriter.create<ROCDL::ReadlaneOp>(loc, actualType, buf, lastLaneId);
375371

376372
result = truncAndCastFromInt(rewriter, loc, result, valType, 16);
377373

0 commit comments

Comments
 (0)