[AMD] NFC: Replace intrinsics for ballot and readlane with ROCDL ops (triton-lang#6325)

AlexAUT · web-flow · commit 15a4d66a6eb3 · 2025-03-28T12:26:20.000-07:00
Just a small cleanup that replaces the `llvm.amdgcn.ballot` and `llvm.amdgcn.readlane` intrinsic calls with the equivalent ROCDL ops.
diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir
@@ -164,7 +164,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     // CHECK-SAME: with 323, 15, 15, true : f32
     // CHECK-NEXT: llvm.intr.maxnum
 
-    // CHECK: llvm.amdgcn.readlane
+    // CHECK: rocdl.readlane
     %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
       %1 = arith.maxnumf %arg1, %arg2 : f32
@@ -269,15 +269,15 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     // CHECK-COUNT-2: llvm.amdgcn.ds.permute
     // CHECK: llvm.bitcast
     // CHECK: llvm.inttoptr
-    // CHECK: llvm.amdgcn.ballot
+    // CHECK: rocdl.ballot
     // CHECK: llvm.ptrtoint
-    // CHECK: llvm.amdgcn.ballot
+    // CHECK: rocdl.ballot
 
     // loop body:
     // CHECK: llvm.bitcast
     // CHECK-COUNT-2: llvm.amdgcn.readfirstlane
     // CHECK: llvm.bitcast
-    // CHECK: llvm.amdgcn.ballot
+    // CHECK: rocdl.ballot
     // CHECK: rocdl.mbcnt.lo
     // CHECK: rocdl.mbcnt.hi
 
@@ -294,7 +294,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     // CHECK: llvm.amdgcn.ds.permute
     // CHECK: llvm.bitcast
 
-    // CHECK: llvm.amdgcn.ballot
+    // CHECK: rocdl.ballot
 
     // reduction:
     // CHECK-COUNT-6: llvm.amdgcn.ds.bpermute
diff --git a/test/Conversion/amd/tritongpu_to_llvm_rdna.mlir b/test/Conversion/amd/tritongpu_to_llvm_rdna.mlir
@@ -22,7 +22,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
 
     // CHECK: llvm.amdgcn.permlanex16
     // CHECK: llvm.intr.maxnum
-    // CHECK: llvm.amdgcn.readlane
+    // CHECK: rocdl.readlane
     %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
     ^bb0(%arg1: f32, %arg2: f32):
       %1 = arith.maxnumf %arg1, %arg2 : f32
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -110,9 +110,7 @@ Value TargetInfo::getClusterCTAId(RewriterBase &rewriter, Location loc) const {
 
 Value TargetInfo::ballot(RewriterBase &rewriter, Location loc, Type type,
                          Value cmp) const {
-  return LLVM::createLLVMIntrinsicCallOp(rewriter, loc, "llvm.amdgcn.ballot",
-                                         type, cmp)
-      ->getResult(0);
+  return rewriter.create<ROCDL::BallotOp>(loc, type, cmp);
 }
 
 void TargetInfo::storeDShared(RewriterBase &rewriter, Location loc, Value ptr,
@@ -366,12 +364,10 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
     // Similarly, we need to cast data types for readlane instruction.
     Type actualType = castToAndSExtInt(rewriter, loc, buf, valType, 16);
 
-    // Get reduction result from lane 63/31
-    std::string intrinsic = "llvm.amdgcn.readlane";
-    Value result = LLVM::createLLVMIntrinsicCallOp(
-                       rewriter, loc, intrinsic, actualType,
-                       ValueRange{buf, b.i32_val(isCDNA() ? 63 : 31)})
-                       ->getResult(0);
+    // Get reduction result from the last lane of the warp
+    Value lastLaneId = b.i32_val(gpu::lookupThreadsPerWarp(rewriter) - 1);
+    Value result =
+        rewriter.create<ROCDL::ReadlaneOp>(loc, actualType, buf, lastLaneId);
 
     result = truncAndCastFromInt(rewriter, loc, result, valType, 16);