[AMD] Enable v_permlane16_swap for convert_layout and reduceOp on GFX1250 (#8724)

yiqian1 · zhanglx13 · web-flow · commit 06c0b203e489 · 2025-11-24T10:31:53.000-08:00
---------

Co-authored-by: Lixun Zhang <lixun.zhang@amd.com>
diff --git a/test/Conversion/amd/tritongpu_to_llvm_gfx1250.mlir b/test/Conversion/amd/tritongpu_to_llvm_gfx1250.mlir
@@ -0,0 +1,29 @@
+// RUN:  triton-opt %s -split-input-file --allocate-shared-memory --convert-triton-amdgpu-to-llvm=arch="gfx1250" | FileCheck %s --check-prefix=GFX1250
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 8], [0, 16]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 4]], warp = [[16, 0]], block = []}>
+#mma = #ttg.amd_wmma<{version = 3, warpsPerCTA = [2, 1], isTranspose = true, instrShape = [16, 16, 32]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 2 : i32, "ttg.threads-per-warp" = 32 : i32} { // 1 CTA, 2 warps, 32 threads per warp
+  // GFX1250-LABEL: wmma_permlane16_swap
+  tt.func @wmma_permlane16_swap(%arg0: tensor<32x32xf16, #mma>) { // input uses the amd_wmma layout declared above
+    // GFX1250-NOT: store
+    // GFX1250-NOT: load
+    // GFX1250-COUNT-4: llvm.call_intrinsic "llvm.amdgcn.permlane16.swap"
+    // GFX1250-NOT: llvm.call_intrinsic "llvm.amdgcn.permlane16.swap"
+    %0 = ttg.convert_layout %arg0 : tensor<32x32xf16, #mma> -> tensor<32x32xf16, #linear> // op under test; the checks above require no "store"/"load" text before the swaps and exactly four swap calls
+    tt.return
+  }
+}
+
+// -----
+
+#mma = #ttg.amd_wmma<{version = 3, warpsPerCTA = [4, 1], isTranspose = true, instrShape = [16, 16, 32]}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 32 : i32} { // 1 CTA, 4 warps, 32 threads per warp
+  // GFX1250-LABEL: reduce_16x16
+  tt.func @reduce_16x16(%input: tensor<128x128xf32, #mma>) { // the check below expects two consecutive permlane16.swap calls (no upper bound is enforced)
+    // GFX1250-COUNT-2: llvm.call_intrinsic "llvm.amdgcn.permlane16.swap"
+    %0 = "tt.reduce"(%input) <{axis = 1 : i32}> ({
+      ^bb0(%arg1: f32, %arg2: f32):
+      %2 = "arith.maxnumf"(%arg1, %arg2) : (f32, f32) -> f32
+      tt.reduce.return %2 : f32 }) : (tensor<128x128xf32, #mma>) -> tensor<128xf32, #ttg.slice<{dim = 1, parent = #mma}>>
+    tt.return
+  }
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp
@@ -24,7 +24,8 @@ class ConvertLayoutOpPermlaneSwap
                   ConversionPatternRewriter &rewriter) const override {
     auto &amdTargInfo =
         static_cast<const mlir::triton::AMD::TargetInfo &>(targetInfo);
-    if (amdTargInfo.getISAFamily() != AMD::ISAFamily::CDNA4)
+    if (!(amdTargInfo.getISAFamily() == AMD::ISAFamily::CDNA4 ||
+          amdTargInfo.getISAFamily() == AMD::ISAFamily::GFX1250))
       return failure();
 
     auto srcTy = cast<RankedTensorType>(op.getSrc().getType());
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/TargetInfo.cpp
@@ -324,6 +324,29 @@ static bool warpReduceSwap16or32(RewriterBase &rewriter, Location loc,
   return true;
 }
 
+/// Cross-lane reduction step using the permlane16.swap intrinsic.
+/// Handles only numLaneToReduce == 2 with interleave == 16, a single combiner
+/// op, and operands of at most 32 bits; otherwise returns false with `acc`
+/// unmodified so that the caller can try another lowering.
+static bool warpReduceSwap16(RewriterBase &rewriter, Location loc,
+                             SmallVector<Value> &acc, triton::ReduceOp op,
+                             unsigned numLaneToReduce, unsigned interleave) {
+  Operation *reduxOp = op.getSingleCombiner();
+  if (!reduxOp || numLaneToReduce != 2 || interleave != 16)
+    return false;
+
+  // Check every operand before emitting anything, so a rejection never
+  // leaves `acc` partially rewritten.
+  for (Value val : acc)
+    if (val.getType().getIntOrFloatBitWidth() > 32)
+      return false;
+
+  StringRef intrinsic = "llvm.amdgcn.permlane16.swap";
+  for (Value &val : acc)
+    val = permuteAndReduce(rewriter, loc, intrinsic, val, reduxOp);
+  return true;
+}
+
 bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
                             SmallVector<Value> &acc, triton::ReduceOp op,
                             unsigned numLaneToReduce,
@@ -333,6 +356,9 @@ bool TargetInfo::warpReduce(RewriterBase &rewriter, Location loc,
   if (getISAFamily() == ISAFamily::CDNA4 &&
       warpReduceSwap16or32(rewriter, loc, acc, op, numLaneToReduce, interleave))
     return true;
+  if ((getISAFamily() == ISAFamily::GFX1250) &&
+      warpReduceSwap16(rewriter, loc, acc, op, numLaneToReduce, interleave))
+    return true;
   if (numLaneToReduce != getWarpSize())
     return false;
   if (isCDNA(getISAFamily()) && getISAFamily() == ISAFamily::CDNA1)