[AMD] Use permlane_swap for layout conversions between two dot operations (triton-lang#7947)

yiqian1 · web-flow · commit 6f06595a5885 · 2025-09-03T19:52:33.000-07:00
diff --git a/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp b/lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp
@@ -691,18 +691,19 @@ LinearLayout mfmaDotToLinearLayout(DotOperandEncodingAttr dotMfmaLayout,
   int mIndex = 0 + hasBatchDim;
 
   int32_t kWidth = dotMfmaLayout.getKWidth();
-  auto kDimIndex = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
+  auto nonKDimIndex = dotMfmaLayout.getOpIdx() == 0 ? rank - 2 : rank - 1;
 
   auto warpsPerCTA = mfmaLayout.getWarpsPerCTA();
   auto tilesPerWarp = mfmaLayout.getTilesPerWarp();
-  auto tilePerWarpNonK = tilesPerWarp[kDimIndex];
+  auto tilePerWarpNonK = tilesPerWarp[nonKDimIndex];
 
   auto mDim = mfmaLayout.getMDim();
   auto nDim = mfmaLayout.getNDim();
   auto opIdx = dotMfmaLayout.getOpIdx();
   auto nonKDim = opIdx == 0 ? mDim : nDim;
   constexpr int warpSize = 64;
 
+  auto kDimIndex = dotMfmaLayout.getOpIdx() == 0 ? rank - 1 : rank - 2;
   int32_t kSize = shape[kDimIndex];
 
   MLIRContext *ctx = dotMfmaLayout.getContext();
diff --git a/test/Conversion/amd/mfma-shortcut.mlir b/test/Conversion/amd/mfma-shortcut.mlir
@@ -94,3 +94,17 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32}
     tt.return
   }
 }
+
+// -----
+
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], tilesPerWarp = [2, 1], instrShape = [16, 16], isTransposed = true}>
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  // GFX950-LABEL: mfma_dotop_permlane_swap
+  tt.func public @mfma_dotop_permlane_swap(%arg0: tensor<128x16xf16, #mma1>) {
+  // GFX950-NOT: load
+  // GFX950-COUNT-2: llvm.call_intrinsic "llvm.amdgcn.permlane16.swap"
+    %1 = ttg.convert_layout %arg0: tensor<128x16xf16, #mma1> -> tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    tt.return
+  }
+}
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp
@@ -686,7 +686,7 @@ bool isUsedByDotScaledOp(Operation *op) {
       });
 }
 
-bool isChainDotHead(tt::DotOpInterface dotOp) {
+bool isChainDotHead(tt::DotOpInterface dotOp, unsigned opIdx) {
   auto isInSameRegion = [&dotOp](Operation *op) {
     return op->getParentRegion() == dotOp->getParentRegion();
   };
@@ -697,8 +697,9 @@ bool isChainDotHead(tt::DotOpInterface dotOp) {
   for (Operation *op : fwdSlices) {
     if (auto dOp = dyn_cast<tt::DotOpInterface>(op)) {
       assert(dOp != dotOp);
-      auto opA = dOp.getA().getDefiningOp();
-      if (opA && fwdSlices.contains(opA)) {
+      Operation *dotOperand = (opIdx == 0) ? dOp.getA().getDefiningOp()
+                                           : dOp.getB().getDefiningOp();
+      if (dotOperand && fwdSlices.contains(dotOperand)) {
         return true;
       }
     }
diff --git a/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h b/third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h
@@ -111,9 +111,9 @@ bool doesSwizzleInsideWarp(RewriterBase &rewriter,
 // Return true if op is used by DotScaledOp or UpcastMXFPOp ops.
 bool isUsedByDotScaledOp(Operation *op);
 
-// Check if the result of this tl.dot is used as opA of another tl.dot
+// Check if the result of this tl.dot is used as opA or opB of another tl.dot
 // in the same region
-bool isChainDotHead(mlir::triton::DotOpInterface dotOp);
+bool isChainDotHead(mlir::triton::DotOpInterface dotOp, unsigned opIdx = 0);
 
 // Check if given operand of this tt.dot is the result of a tt.trans
 // in the same region
diff --git a/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp b/third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp
@@ -475,9 +475,36 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
     // requires to broadcast the operand A.
     bool isTransposed = !(mDim == 4 && nDim == 64);
     auto aElemTy = mfmaInstr->aElementType;
+    auto is16BitElemTy = (aElemTy.isF16() || aElemTy.isBF16());
+
+    unsigned rank = oldRetType.getRank();
+    SmallVector<unsigned, 2> tilesPerWarp = {1, 1};
+
+    // Set tilesPerWarp and isTransposed to enable intra-warp conversion for the
+    // mfma16x16 layout of a dot op, depending on whether its result is used as
+    // operand 0 or operand 1 of another dot op.
+    if (mfmaVersion == 4 && is16BitElemTy && mDim == 16 && nDim == 16 &&
+        rank == 2) {
+      if (isChainDotHead(dotOp, 0u) &&
+          retShape.front() >= 16 * 2 * warpsPerTile.front() &&
+          retShape.back() == 16 && warpsPerTile.back() == 1) {
+        isTransposed = true;
+        tilesPerWarp = {2, 1};
+      } else if (isChainDotHead(dotOp, 1u) && retShape.front() == 16 &&
+                 retShape.back() >= 16 * 2 * warpsPerTile.back() &&
+                 warpsPerTile.front() == 1) {
+        isTransposed = false;
+        tilesPerWarp = {1, 2};
+      }
+    }
+
+    if (rank == 3) {
+      tilesPerWarp.insert(tilesPerWarp.begin(), 1);
+    }
+
     ttg::AMDMfmaEncodingAttr mfmaEnc = ttg::AMDMfmaEncodingAttr::get(
         oldRetType.getContext(),
-        /*version*/ mfmaVersion, warpsPerTile,
+        /*version*/ mfmaVersion, warpsPerTile, tilesPerWarp,
         /*instrShape*/ mDim, nDim, /*isTransposed=*/isTransposed, CTALayout,
         mfmaAccType);
 
@@ -524,7 +551,6 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
     // kWidth = 4 so that the conversion from #mma (result of 1st dot)
     // to #dotOp (operand 0 of 2nd dot) is a no-op.
     // TODO (lixun): relax the condition for 8-bit elementTy.
-    auto is16BitElemTy = (aElemTy.isF16() || aElemTy.isBF16());
     if (is16BitElemTy && isDotChainTail) {
       kWidth = 4;
     }

Original file line number	Diff line number	Diff line change
`@@ -686,7 +686,7 @@ bool isUsedByDotScaledOp(Operation *op) {`
`686`	`686`	`});`
`687`	`687`	`}`
`688`	`688`
`689`		`-bool isChainDotHead(tt::DotOpInterface dotOp) {`
	`689`	`+bool isChainDotHead(tt::DotOpInterface dotOp, unsigned opIdx) {`
`690`	`690`	`auto isInSameRegion = [&dotOp](Operation *op) {`
`691`	`691`	`return op->getParentRegion() == dotOp->getParentRegion();`
`692`	`692`	`};`
`@@ -697,8 +697,9 @@ bool isChainDotHead(tt::DotOpInterface dotOp) {`
`697`	`697`	`for (Operation *op : fwdSlices) {`
`698`	`698`	`if (auto dOp = dyn_cast<tt::DotOpInterface>(op)) {`
`699`	`699`	`assert(dOp != dotOp);`
`700`		`- auto opA = dOp.getA().getDefiningOp();`
`701`		`- if (opA && fwdSlices.contains(opA)) {`
	`700`	`+ Operation *dotOperand = (opIdx == 0) ? dOp.getA().getDefiningOp()`
	`701`	`+ : dOp.getB().getDefiningOp();`
	`702`	`+ if (dotOperand && fwdSlices.contains(dotOperand)) {`
`702`	`703`	`return true;`
`703`	`704`	`}`
`704`	`705`	`}`