[AMD] Add support for dpp instructions on RDNA (#6250)

ptrojahn · paultrojahnamd · web-flow · commit a39389aac290 · 2025-03-25T16:22:27.000-07:00
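Enables the DPP reduction path from CDNA on RDNA3 with small
modifications to handle the missing broadcast DPP modes (BCAST15 and
BCAST31): on RDNA those two steps are replaced by a single
llvm.amdgcn.permlanex16 call followed by the reduction combiner.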

---------

Co-authored-by: paul <paul.trojahn@amd.com>
diff --git a/test/Conversion/amd/tritongpu_to_llvm_rdna.mlir b/test/Conversion/amd/tritongpu_to_llvm_rdna.mlir
@@ -0,0 +1,33 @@
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx1100 --convert-builtin-func-to-llvm | FileCheck %s
+// Lowering test for a 32-lane f32 max reduction on gfx1100; the expected output is four rocdl.update.dpp steps, then permlanex16 and readlane.
+#blocked3 = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [1], order = [0]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: reduce_dpp_max
+  tt.func @reduce_dpp_max(%arg0: tensor<32xf32, #blocked3>) {
+    // CHECK: rocdl.update.dpp
+    // CHECK-SAME: with 280, 15, 15, true : f32
+    // CHECK-NEXT: llvm.intr.maxnum
+
+    // CHECK-NEXT: rocdl.update.dpp
+    // CHECK-SAME: with 276, 15, 15, true : f32
+    // CHECK-NEXT: llvm.intr.maxnum
+
+    // CHECK-NEXT: rocdl.update.dpp
+    // CHECK-SAME: with 274, 15, 15, true : f32
+    // CHECK-NEXT: llvm.intr.maxnum
+
+    // CHECK-NEXT: rocdl.update.dpp
+    // CHECK-SAME: with 273, 15, 15, true : f32
+    // CHECK-NEXT: llvm.intr.maxnum
+    // The RDNA path has no broadcast dpp mode, so the remaining steps go through permlanex16 and the combiner before the final readlane.
+    // CHECK: llvm.amdgcn.permlanex16
+    // CHECK: llvm.intr.maxnum
+    // CHECK: llvm.amdgcn.readlane
+    %0 = "tt.reduce"(%arg0) <{axis = 0 : i32}> ({
+    ^bb0(%arg1: f32, %arg2: f32):
+      %1 = arith.maxnumf %arg1, %arg2 : f32
+      tt.reduce.return %1 : f32
+    }) : (tensor<32xf32, #blocked3>) -> f32
+    tt.return
+  }
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -65,6 +65,35 @@ llvm::AMDGPU::GPUKind TargetInfo::getGPUKind() const {
   return llvm::AMDGPU::parseArchAMDGCN(arch);
 }
 
+// Reports whether the target's ISA family is one of the CDNA generations
+// (CDNA1 through CDNA4); every other family yields false.
+bool TargetInfo::isCDNA() const {
+  switch (getISAFamily()) {
+  case ISAFamily::CDNA1:
+  case ISAFamily::CDNA2:
+  case ISAFamily::CDNA3:
+  case ISAFamily::CDNA4:
+    return true;
+  default:
+    return false;
+  }
+}
+
+// Reports whether the target's ISA family is one of the RDNA generations
+// (RDNA1 through RDNA3); every other family yields false.
+bool TargetInfo::isRDNA() const {
+  switch (getISAFamily()) {
+  case ISAFamily::RDNA1:
+  case ISAFamily::RDNA2:
+  case ISAFamily::RDNA3:
+    return true;
+  default:
+    return false;
+  }
+}
+// Returns 64 when isCDNA() holds and 32 for every other target.
+int TargetInfo::getWarpSize() const { return isCDNA() ? 64 : 32; }
+
 int TargetInfo::getSharedMemorySize() const {
   int kbytes = getISAFamily() == ISAFamily::CDNA4 ? 160 : 64;
   return kbytes * 1024;
@@ -200,14 +229,13 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
                             unsigned numLaneToReduce,
                             unsigned interleave) const {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  if (numLaneToReduce != 64)
-    return false;
 
-  if (!llvm::is_contained(
-          {ISAFamily::CDNA2, ISAFamily::CDNA3, ISAFamily::CDNA4},
-          getISAFamily())) {
+  if (numLaneToReduce != getWarpSize())
+    return false;
+  if (isCDNA() && getISAFamily() == ISAFamily::CDNA1)
+    return false;
+  if (isRDNA() && getISAFamily() != ISAFamily::RDNA3)
     return false;
-  }
 
   Operation *reduxOp = op.getSingleCombiner();
   if (!reduxOp)
@@ -307,24 +335,43 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
     buf = createDppReduxOpWithBoundCtrl(valType, buf, 1 + dppCtrlRowShr,
                                         allRows, allBanks);
 
-    // row_bcast:15 row_mask:0xa
-    buf = createDppReduxOpWithBoundCtrl(
-        valType, buf, static_cast<uint32_t>(DppCtrl::BCAST15), 0xa, allBanks);
+    if (isCDNA()) {
+      // row_bcast:15 row_mask:0xa
+      buf = createDppReduxOpWithBoundCtrl(
+          valType, buf, static_cast<uint32_t>(DppCtrl::BCAST15), 0xa, allBanks);
 
-    // row_bcast:31
-    buf = createDppReduxOpWithBoundCtrl(valType, buf,
-                                        static_cast<uint32_t>(DppCtrl::BCAST31),
-                                        allRows, allBanks);
+      // row_bcast:31
+      buf = createDppReduxOpWithBoundCtrl(
+          valType, buf, static_cast<uint32_t>(DppCtrl::BCAST31), allRows,
+          allBanks);
+    } else {
+      // RDNA doesn't have broadcast dpp mode
+      Type actualType = castToAndSExtInt(rewriter, loc, buf, valType, 32);
+
+      Value permlaneResult =
+          LLVM::createLLVMIntrinsicCallOp(
+              rewriter, loc, "llvm.amdgcn.permlanex16", actualType,
+              ValueRange{buf, buf, b.i32_val(-1), b.i32_val(-1), b.true_val(),
+                         b.false_val()})
+              ->getResult(0);
+      buf = truncAndCastFromInt(rewriter, loc, buf, valType, 32);
+      permlaneResult =
+          truncAndCastFromInt(rewriter, loc, permlaneResult, valType, 32);
+      IRMapping mapping;
+      mapping.map(reduxOp->getOperand(0), buf);
+      mapping.map(reduxOp->getOperand(1), permlaneResult);
+      buf = rewriter.clone(*reduxOp, mapping)->getResult(0);
+    }
 
     // Similarly, we need to cast data types for readlane instruction.
     Type actualType = castToAndSExtInt(rewriter, loc, buf, valType, 16);
 
-    // Get reduction result from lane 63
+    // Get reduction result from lane 63/31
     std::string intrinsic = "llvm.amdgcn.readlane";
-    Value result =
-        LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsic, actualType,
-                                        ValueRange{buf, b.i32_val(63)})
-            ->getResult(0);
+    Value result = LLVM::createLLVMIntrinsicCallOp(
+                       rewriter, loc, intrinsic, actualType,
+                       ValueRange{buf, b.i32_val(isCDNA() ? 63 : 31)})
+                       ->getResult(0);
 
     result = truncAndCastFromInt(rewriter, loc, result, valType, 16);
 
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.h
@@ -15,6 +15,12 @@ class TargetInfo : public mlir::triton::TargetInfoBase {
 
   llvm::AMDGPU::GPUKind getGPUKind() const;
 
+  bool isCDNA() const;
+
+  bool isRDNA() const;
+
+  int getWarpSize() const;
+
   int getSharedMemorySize() const;
 
   bool supportMaximumMinimum() const override;
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -139,11 +139,11 @@ static Value shuffleCommonImpl(Location loc, RewriterBase &rewriter,
       Value offset = b.i32_val(0x401F);
       return rewriter.create<ROCDL::DsSwizzleOp>(loc, valType, val, offset);
     } else {
-      if (!llvm::is_contained(
-              {ISAFamily::CDNA2, ISAFamily::CDNA3, ISAFamily::CDNA4},
-              isaFamily)) {
-        // DPP is only supported for CDNA2/CDNA3/CDNA4 right now, so we fallback
-        // to ds_swizzle for other architectures.
+      if (!llvm::is_contained({ISAFamily::CDNA2, ISAFamily::CDNA3,
+                               ISAFamily::CDNA4, ISAFamily::RDNA3},
+                              isaFamily)) {
+        // DPP is only supported for CDNA2/CDNA3/CDNA4/RDNA3 right now, so we
+        // fallback to ds_swizzle for other architectures.
         //
         // This map facilates the butterfly shuffle pattern for a stride less
         // than 16. The pattern stride is the key of the map.