[AMD] Optimize reduction with v_permlane intrinsics in GFX950 (#6594)

yiqian1 · web-flow · commit d52153ca184f · 2025-04-25T14:23:58.000-07:00
This helps to improve attention performance on gfx950.
diff --git a/test/Conversion/amd/tritongpu_to_llvm.mlir b/test/Conversion/amd/tritongpu_to_llvm.mlir
@@ -1,4 +1,5 @@
 // RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx942 --convert-builtin-func-to-llvm | FileCheck %s
+// RUN: triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch=gfx950 | FileCheck %s --check-prefix=GFX950
 
 module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
   // CHECK-LABEL: atomic_add_f32_scalar
@@ -380,3 +381,33 @@ module attributes {"ttg.target" = "hip:gfx942", "ttg.num-ctas" = 1 : i32, "ttg.n
     tt.return
   }
 }
+
+// -----
+// GFX950-LABEL: reduce_32x32
+// GFX950: llvm.call_intrinsic "llvm.amdgcn.permlane32.swap"
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @reduce_32x32(%arg0: tensor<64x32xf32, #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>>) {
+    %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
+    ^bb0(%lhs: f32, %rhs: f32):
+      %max = "arith.maxnumf"(%lhs, %rhs) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
+      "tt.reduce.return"(%max) : (f32) -> ()
+    }) : (tensor<64x32xf32, #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>}>>
+    tt.return
+  }
+}
+
+// -----
+
+// GFX950-LABEL: reduce_16x16
+// GFX950: llvm.call_intrinsic "llvm.amdgcn.permlane32.swap"
+// GFX950: llvm.call_intrinsic "llvm.amdgcn.permlane16.swap"
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func @reduce_16x16(%arg0: tensor<64x16xf32, #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>>) {
+    %0 = "tt.reduce"(%arg0) <{axis = 1 : i32}> ({
+    ^bb0(%lhs: f32, %rhs: f32):
+      %max = "arith.maxnumf"(%lhs, %rhs) <{fastmath = #arith.fastmath<none>}> : (f32, f32) -> f32
+      "tt.reduce.return"(%max) : (f32) -> ()
+    }) : (tensor<64x16xf32, #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>>) -> tensor<64xf32, #ttg.slice<{dim = 1, parent = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>}>>
+    tt.return
+  }
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -224,12 +224,96 @@ static inline Value truncAndCastFromInt(RewriterBase &rewriter, Location loc,
   return toVal;
 }
 
+// Runs the permlane swap `intrinsic` with `val` as both data operands, then
+// clones `reduxOp` onto the two swapped results and returns the clone's result.
+// `val` must be an int/float of at most 32 bits (it is passed through i32).
+static Value permuteAndReduce(RewriterBase &rewriter, Location loc,
+                              StringRef intrinsic, Value val,
+                              Operation *reduxOp) {
+  const Type valType = val.getType();
+  assert(valType.getIntOrFloatBitWidth() <= 32);
+  const bool needsCast = !valType.isInteger(32);
+  if (needsCast)
+    castToAndSExtInt(rewriter, loc, val, valType, 32); // returned type unused
+
+  auto b = TritonLLVMOpBuilder(loc, rewriter);
+  Value falseVal = b.false_val();
+  MLIRContext *ctx = rewriter.getContext(); // presumably used by struct_ty
+  Type retType = struct_ty({i32_ty, i32_ty});
+  Value perm =
+      LLVM::createLLVMIntrinsicCallOp(rewriter, loc, intrinsic, retType,
+                                      ValueRange{val, val, falseVal, falseVal})
+          ->getResult(0);
+  Value v0 = b.extract_val(i32_ty, perm, 0);
+  Value v1 = b.extract_val(i32_ty, perm, 1);
+  if (needsCast) {
+    v0 = truncAndCastFromInt(rewriter, loc, v0, valType, 32);
+    v1 = truncAndCastFromInt(rewriter, loc, v1, valType, 32);
+  }
+
+  IRMapping mapping;
+  mapping.map(reduxOp->getOperand(0), v0);
+  mapping.map(reduxOp->getOperand(1), v1);
+  return rewriter.clone(*reduxOp, mapping)->getResult(0);
+}
+
+// Warp reduction across lanes using the GFX950 permlane swap intrinsics.
+// On entry acc holds each thread's partial results; on success every element
+// is overwritten in place with the fully reduced value.
+// Returns false, leaving acc untouched, unless op has a single combiner, every
+// acc value is at most 32 bits wide, and one of these two cases applies:
+//
+// When numLaneToReduce == 2 && interleave == 32:
+//   step 1: use permlane32_swap() to swap the row 2 and 3 of acc and
+//           the row 0 and 1 of the copy of acc
+//   step 2: apply reduction to the result values to get final result
+// When numLaneToReduce == 4 && interleave == 16:
+//   step 1: use permlane32_swap() to swap the row 2 and 3 of acc and
+//           the row 0 and 1 of the copy of acc
+//   step 2: apply reduction to the result values to get the partial result
+//   step 3: use permlane16_swap() to swap the odd and even rows of
+//           the partial results
+//   step 4: apply reduction to get the final results
+static bool warpReduceSwap16or32(RewriterBase &rewriter, Location loc,
+                                 SmallVector<Value> &acc, triton::ReduceOp op,
+                                 unsigned numLaneToReduce,
+                                 unsigned interleave) {
+  Operation *reduxOp = op.getSingleCombiner();
+  if (!reduxOp)
+    return false;
+
+  const bool mfma32Case = numLaneToReduce == 2 && interleave == 32;
+  const bool mfma16Case = numLaneToReduce == 4 && interleave == 16;
+  if (!mfma32Case && !mfma16Case)
+    return false;
+
+  // permuteAndReduce only handles values of at most 32 bits; check them all.
+  for (Value v : acc)
+    if (v.getType().getIntOrFloatBitWidth() > 32)
+      return false;
+
+  // Each element starts with the permlane32 swap; the names stay constant so
+  // a later element can never inherit the permlane16 name by mistake.
+  const StringRef swap32 = "llvm.amdgcn.permlane32.swap";
+  const StringRef swap16 = "llvm.amdgcn.permlane16.swap";
+  for (Value &partial : acc) {
+    Value redx = permuteAndReduce(rewriter, loc, swap32, partial, reduxOp);
+    if (mfma16Case)
+      redx = permuteAndReduce(rewriter, loc, swap16, redx, reduxOp);
+    partial = redx;
+  }
+  return true;
+}
+
 bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
                             SmallVector<Value> &acc, triton::ReduceOp op,
                             unsigned numLaneToReduce,
                             unsigned interleave) const {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
+  if (isCDNA() && getISAFamily() == ISAFamily::CDNA4 &&
+      warpReduceSwap16or32(rewriter, loc, acc, op, numLaneToReduce, interleave))
+    return true;
   if (numLaneToReduce != getWarpSize())
     return false;
   if (isCDNA() && getISAFamily() == ISAFamily::CDNA1)