[AMD] Use permlanex16 for shuffleXor on rdna (#7269)

ptrojahn · paultrojahnamd · web-flow · commit 3a17b5c695de · 2025-06-26T11:41:53.000-05:00
On RDNA, permlanex16 works similarly to DPP operations, but has more
flexible lane selection. Each lane in the upper/lower block of 16
contiguous lanes can select an arbitrary lane in the other block to read
from. With 4 bits per lane, we construct the identity mapping
0xfedcba9876543210 so that lane i in the upper 16 lanes reads data from
lane i in the lower 16 lanes and vice versa.
This does not require a round trip to LDS, as was necessary with the
previously used ds_swizzle instruction.

Co-authored-by: Paul Trojahn <paul.trojahn@amd.com>
diff --git a/test/Conversion/amd/tritongpu_to_llvm_rdna.mlir b/test/Conversion/amd/tritongpu_to_llvm_rdna.mlir
@@ -20,7 +20,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     // CHECK-SAME: with 273, 15, 15, true : f32
     // CHECK-NEXT: llvm.intr.maxnum
 
-    // CHECK: llvm.amdgcn.permlanex16
+    // CHECK: rocdl.permlanex16
     // CHECK: llvm.intr.maxnum
     // CHECK: rocdl.readlane
     %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
@@ -31,3 +31,33 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.thr
     tt.return
   }
 }
+
+#linear = #ttg.linear<{register = [[16, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1]], warp = [], block = []}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
+// CHECK-LABEL: @reduce_linear_layout
+tt.func private @reduce_linear_layout(%arg0: tensor<32x2xi32, #linear>) -> tensor<32xi32, #ttg.slice<{dim = 1, parent = #linear}>> {
+  // This tensor has 64 elements with the last dimension across the lower and upper 16 lanes.
+  // Therefore, we can reduce it with a 16 element butterfly shuffle.
+
+  // CHECK-DAG: [[result0:%.*]] = llvm.mlir.undef
+  // CHECK-DAG: [[select_lo:%.*]] = llvm.mlir.constant(1985229328 : i32)
+  // CHECK-DAG: [[select_hi:%.*]] = llvm.mlir.constant(-19088744 : i32)
+  // CHECK-DAG: [[reg0:%.*]] = llvm.extractvalue %arg0[0]
+  // CHECK-DAG: [[reg1:%.*]] = llvm.extractvalue %arg0[1]
+  // CHECK: [[permlane0:%.*]] = rocdl.permlanex16 [[reg0]], [[reg0]], [[select_lo]], [[select_hi]], true, false
+  // CHECK: [[sum0:%.*]] = llvm.add [[reg0]], [[permlane0]]
+  // CHECK: [[permlane1:%.*]] = rocdl.permlanex16 [[reg1]], [[reg1]], [[select_lo]], [[select_hi]], true, false
+  // CHECK: [[sum1:%.*]] = llvm.add [[reg1]], [[permlane1]]
+  // CHECK: [[result1:%.*]] = llvm.insertvalue [[sum0]], [[result0]][0]
+  // CHECK: [[result2:%.*]] = llvm.insertvalue [[sum1]], [[result1]][1]
+
+  %0 = "tt.reduce"(%arg0) ({
+  ^bb0(%arg1: i32, %arg2: i32):
+    %1 = arith.addi %arg1, %arg2 : i32
+    tt.reduce.return %1 : i32
+  }) {axis = 1 : i32} : (tensor<32x2xi32, #linear>) -> tensor<32xi32, #ttg.slice<{dim = 1, parent = #linear}>>
+
+  // CHECK: llvm.return [[result2]]
+  tt.return %0 : tensor<32xi32, #ttg.slice<{dim = 1, parent = #linear}>>
+}
+}
diff --git a/third_party/amd/include/TritonAMDGPUToLLVM/TargetUtils.h b/third_party/amd/include/TritonAMDGPUToLLVM/TargetUtils.h
@@ -23,6 +23,10 @@ ISAFamily deduceISAFamily(llvm::StringRef arch);
 // Retursn true if given architecture support V_DOT instruction.
 bool supportsVDot(llvm::StringRef arch);
 
+bool isCDNA(ISAFamily isaFamily);
+
+bool isRDNA(ISAFamily isaFamily);
+
 // Here is a partial definition of DppCtrl enums. For the complete definition,
 // please check:
 // https://github.com/llvm/llvm-project/blob/8c75290/llvm/lib/Target/AMDGPU/SIDefines.h#L939
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -66,34 +66,7 @@ llvm::AMDGPU::GPUKind TargetInfo::getGPUKind() const {
   return llvm::AMDGPU::parseArchAMDGCN(arch);
 }
 
-bool TargetInfo::isCDNA() const {
-  switch (getISAFamily()) {
-  case ISAFamily::CDNA1:
-  case ISAFamily::CDNA2:
-  case ISAFamily::CDNA3:
-  case ISAFamily::CDNA4:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
-bool TargetInfo::isRDNA() const {
-  switch (getISAFamily()) {
-  case ISAFamily::RDNA1:
-  case ISAFamily::RDNA2:
-  case ISAFamily::RDNA3:
-    return true;
-  default:
-    break;
-  }
-
-  return false;
-}
-
-int TargetInfo::getWarpSize() const { return isCDNA() ? 64 : 32; }
+int TargetInfo::getWarpSize() const { return isCDNA(getISAFamily()) ? 64 : 32; }
 
 int TargetInfo::getSharedMemorySize() const {
   int kbytes = getISAFamily() == ISAFamily::CDNA4 ? 160 : 64;
@@ -317,9 +290,9 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
     return true;
   if (numLaneToReduce != getWarpSize())
     return false;
-  if (isCDNA() && getISAFamily() == ISAFamily::CDNA1)
+  if (isCDNA(getISAFamily()) && getISAFamily() == ISAFamily::CDNA1)
     return false;
-  if (isRDNA() && getISAFamily() != ISAFamily::RDNA3)
+  if (isRDNA(getISAFamily()) && getISAFamily() != ISAFamily::RDNA3)
     return false;
 
   Operation *reduxOp = op.getSingleCombiner();
@@ -420,7 +393,7 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
     buf = createDppReduxOpWithBoundCtrl(valType, buf, 1 + dppCtrlRowShr,
                                         allRows, allBanks);
 
-    if (isCDNA()) {
+    if (isCDNA(getISAFamily())) {
       // row_bcast:15 row_mask:0xa
       buf = createDppReduxOpWithBoundCtrl(
           valType, buf, static_cast<uint32_t>(DppCtrl::BCAST15), 0xa, allBanks);
@@ -433,12 +406,12 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
       // RDNA doesn't have broadcast dpp mode
       Type actualType = castToAndSExtInt(rewriter, loc, buf, valType, 32);
 
-      Value permlaneResult =
-          LLVM::createLLVMIntrinsicCallOp(
-              rewriter, loc, "llvm.amdgcn.permlanex16", actualType,
-              ValueRange{buf, buf, b.i32_val(-1), b.i32_val(-1), b.true_val(),
-                         b.false_val()})
-              ->getResult(0);
+      // Lanes 0-15 read from lane 31 and lanes 16-31 read from lane 15.
+      Value permlaneResult = rewriter
+                                 .create<ROCDL::PermlaneX16Op>(
+                                     loc, actualType, buf, buf, b.i32_val(-1),
+                                     b.i32_val(-1), true, false)
+                                 .getRes();
       buf = truncAndCastFromInt(rewriter, loc, buf, valType, 32);
       permlaneResult =
           truncAndCastFromInt(rewriter, loc, permlaneResult, valType, 32);
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h
@@ -15,10 +15,6 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
 
   llvm::AMDGPU::GPUKind getGPUKind() const;
 
-  bool isCDNA() const;
-
-  bool isRDNA() const;
-
   int getWarpSize() const;
 
   int getSharedMemorySize() const;
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetUtils.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetUtils.cpp
@@ -49,4 +49,31 @@ bool supportsVDot(llvm::StringRef arch) {
   return false;
 }
 
+bool isCDNA(ISAFamily isaFamily) {
+  switch (isaFamily) {
+  case ISAFamily::CDNA1:
+  case ISAFamily::CDNA2:
+  case ISAFamily::CDNA3:
+  case ISAFamily::CDNA4:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
+bool isRDNA(ISAFamily isaFamily) {
+  switch (isaFamily) {
+  case ISAFamily::RDNA1:
+  case ISAFamily::RDNA2:
+  case ISAFamily::RDNA3:
+    return true;
+  default:
+    break;
+  }
+
+  return false;
+}
+
 } // namespace mlir::triton::AMD
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -1,6 +1,7 @@
 #include "Utility.h"
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
 #include "TritonAMDGPUToLLVM/GCNAsmFormat.h"
+#include "TritonAMDGPUToLLVM/TargetUtils.h"
 #include "mlir/Dialect/LLVMIR/LLVMTypes.h"
 #include "mlir/Dialect/LLVMIR/ROCDLDialect.h"
 #include "mlir/IR/PatternMatch.h"
@@ -137,8 +138,17 @@ static Value shuffleCommonImpl(Location loc, RewriterBase &rewriter,
       Value lineId = b.xor_(threadId, stride);
       return bpermute(lineId);
     } else if (strideInt == 16) {
-      Value offset = b.i32_val(0x401F);
-      return rewriter.create<ROCDL::DsSwizzleOp>(loc, valType, val, offset);
+      if (isRDNA(isaFamily)) {
+        // Lane i in the upper 16 lanes reads the value from lane i in the lower
+        // 16 lanes and vice versa.
+        Value select_lo = b.i32_val(0x76543210);
+        Value select_hi = b.i32_val(0xfedcba98);
+        return rewriter.create<ROCDL::PermlaneX16Op>(
+            loc, valType, val, val, select_lo, select_hi, true, false);
+      } else {
+        Value offset = b.i32_val(0x401F);
+        return rewriter.create<ROCDL::DsSwizzleOp>(loc, valType, val, offset);
+      }
     } else {
       if (!llvm::is_contained({ISAFamily::CDNA2, ISAFamily::CDNA3,
                                ISAFamily::CDNA4, ISAFamily::RDNA3},
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/UpdateAsyncWaitCount.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/UpdateAsyncWaitCount.cpp
@@ -119,7 +119,7 @@ struct TritonAMDGPUUpdateAsyncWaitCountPass
 
   void runOnOperation() override {
     tt::AMD::TargetInfo targetInfo(archGenerationName);
-    if (!targetInfo.isCDNA()) {
+    if (!isCDNA(targetInfo.getISAFamily())) {
       return;
     }
 

Original file line number	Diff line number	Diff line change
`@@ -119,7 +119,7 @@ struct TritonAMDGPUUpdateAsyncWaitCountPass`
`119`	`119`
`120`	`120`	`void runOnOperation() override {`
`121`	`121`	`tt::AMD::TargetInfo targetInfo(archGenerationName);`
`122`		`- if (!targetInfo.isCDNA()) {`
	`122`	`+ if (!isCDNA(targetInfo.getISAFamily())) {`
`123`	`123`	`return;`
`124`	`124`	`}`
`125`	`125`