
Commit e6b9efd

antiagainst and yiqian1 authored
[AMD] Use composition to swap columns for mfma like store layout (#6844)
This commit improves how we create the mfma-like layout for optimizing global stores by using linear layout composition. Along the way it fixes a few implementation issues.

Co-authored-by: Yi Qian <[email protected]>
1 parent 49a72f5 commit e6b9efd
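For intuition on the approach: a linear layout maps the bits of a hardware index (register, lane, warp) to tensor coordinates by XOR-ing one basis vector per set bit, so composing with a layout whose bases are permuted permutes coordinates. The standalone C++ sketch below (illustrative only, not Triton's LinearLayout API) shows why swapping the 2nd and 3rd bases of an identity layout exchanges columns 4-7 with columns 8-11 in every 16-column block, which is the swap this commit composes onto the MFMA layout:

// A minimal standalone sketch (not Triton's actual LinearLayout class) of
// how a 1-D linear layout acts: an index maps to the XOR of the bases
// selected by its set bits, so swapping two bases permutes coordinates.
#include <cstdint>
#include <cstdio>
#include <utility>
#include <vector>

static int32_t applyLayout(const std::vector<int32_t> &bases, int32_t idx) {
  int32_t out = 0;
  for (size_t bit = 0; bit < bases.size(); ++bit)
    if (idx & (1 << bit))
      out ^= bases[bit];
  return out;
}

int main() {
  constexpr int nLog2 = 4; // 16 columns are enough to show the pattern
  std::vector<int32_t> bases(nLog2);
  for (int i = 0; i < nLog2; ++i)
    bases[i] = 1 << i;           // identity layout: column i -> i
  std::swap(bases[2], bases[3]); // the swap composed in this commit

  for (int col = 0; col < 16; ++col)
    std::printf("%2d -> %2d\n", col, applyLayout(bases, col));
  // Columns 4-7 map to 8-11 and vice versa; 0-3 and 12-15 stay fixed.
  return 0;
}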

File tree

4 files changed (+65, -65)

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 22 additions & 27 deletions
@@ -1535,40 +1535,35 @@ std::optional<LinearLayout>
 chooseMfmaLikeStoreLayout(RankedTensorType valType) {
   auto mfmaLayout = cast<AMDMfmaEncodingAttr>(valType.getEncoding());
-  // Currently support transposed [B]F16 MFMA32x32 on CDNA4
+  // We currently only support transposed [B]F16 MFMA32x32 on CDNA4.
   bool isMfma32 = mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32;
   Type elemType = valType.getElementType();
   if (!(valType.getRank() == 2 && (elemType.isF16() || elemType.isBF16()) &&
         mfmaLayout.getVersionMajor() == 4 && mfmaLayout.getIsTransposed() &&
         isMfma32))
     return {};

-  MLIRContext *ctx = mfmaLayout.getContext();
-  StringAttr kRegister = S("register");
-  StringAttr kLane = S("lane");
-  StringAttr kWarp = S("warp");
-  StringAttr kBlock = S("block");
-
-  SmallVector<unsigned> order = getDefaultMmaOrder(mfmaLayout);
-  auto standardOutDims = standardOutDimNames(ctx, 2);
-  // We make each thread handle 8 consecutive elements to enable 128-bit
-  // global stores for [b]f16 types and keep the thread pattern in each lane
-  // similar to the canonical mfmaLayout.
-  LinearLayout mfma8Layout = LinearLayout::empty();
-  mfma8Layout =
-      LinearLayout({{kRegister, {{1, 0}, {2, 0}, {4, 0}}},
-                    {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
-                    {kWarp, {}},
-                    {kBlock, {}}},
-                   {standardOutDims[order[0]], standardOutDims[order[1]]});
-
-  LinearLayout warpLayout =
-      identityStandardND(kWarp, mfmaLayout.getWarpsPerCTA(), order);
-  LinearLayout ctaLayout = mfma8Layout.transposeOuts(standardOutDims) *
-                           warpLayout.transposeOuts(standardOutDims);
-  mfma8Layout = combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(),
-                                       valType.getShape());
-  return mfma8Layout;
+  auto valShape = valType.getShape();
+  LinearLayout mfmaLL = mfmaLayout.toLinearLayout(valShape);
+  auto mfmaOutDims = llvm::to_vector(mfmaLL.getOutDimNames());
+  StringAttr dimM = mfmaOutDims[0];
+  StringAttr dimN = mfmaOutDims[1];
+
+  auto swapLL = LinearLayout::empty();
+  // The rows are kept as-is with an identity linear layout.
+  swapLL *= LinearLayout::identity1D(valShape[0], dimM, dimM);
+  // In the transposed mfma32 layout, each thread holds 4 consecutive values
+  // along the N dim. We want to exchange columns 4-7 (owned by threads 32-63)
+  // and columns 8-11 (owned by threads 0-31) every 16 columns so that each
+  // thread holds 8 elements. This amounts to exchanging the 2nd and 3rd basis
+  // vectors of an identity linear layout.
+  std::vector<std::vector<int32_t>> dimNBases(mfmaLL.getOutDimSizeLog2(dimN));
+  std::generate(dimNBases.begin(), dimNBases.end(),
+                [i = 0]() mutable { return std::vector<int32_t>{1 << i++}; });
+  std::swap(dimNBases[2], dimNBases[3]);
+  swapLL *= LinearLayout({{dimN, dimNBases}}, {dimN});
+
+  return mfmaLL.compose(swapLL);
 }

 LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
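Because both maps are linear over GF(2), composing with swapLL amounts to rewriting the N component of every basis vector of the MFMA layout through the swap. A small hypothetical helper (mine, not part of the patch) makes the bit-level action explicit:

#include <cstdint>
#include <cstdio>
#include <initializer_list>

// Exchanging the 2nd and 3rd identity bases is the same as swapping bits 2
// and 3 of the N coordinate.
static int32_t swapBits23(int32_t n) {
  int32_t b2 = (n >> 2) & 1;
  int32_t b3 = (n >> 3) & 1;
  return (n & ~0b1100) | (b3 << 2) | (b2 << 3);
}

int main() {
  // An N basis of 4 becomes 8 and vice versa; 1, 2, and 16 are untouched.
  for (int32_t basis : {1, 2, 4, 8, 16})
    std::printf("%2d -> %2d\n", basis, swapBits23(basis));
  return 0;
}

Assuming the canonical transposed mfma32 bases, this trades the lane basis [0, 4] for [0, 8] and the register basis [0, 8] for [0, 4]; the resulting register bases [0, 1], [0, 2], [0, 4] give each thread 8 contiguous [b]f16 values, exactly one 128-bit store. The #linear attributes checked in the test below match this.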

test/TritonGPU/amd/amd-optimize-epilogue.mlir

Lines changed: 25 additions & 2 deletions
@@ -43,15 +43,15 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32}

 // -----
 // CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 16], [0, 32], [0, 64]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[32, 0], [64, 0]], block = []}>
-// CHECK-LABEL: store_dword
+// CHECK-LABEL: store_dword_128x128
 // CHECK-NOT: ttg.convert_layout %{{.*}} : tensor<128x128xf32, #mma> -> tensor<128x128xf32, #blocked>
 // CHECK-DAG: %[[PTR:.+]] = ttg.convert_layout %{{.*}} : tensor<128x128x!tt.ptr<f16>, #mma> -> tensor<128x128x!tt.ptr<f16>, #linear>
 // CHECK-DAG: %[[VAL:.+]] = ttg.convert_layout %{{.*}} : tensor<128x128xf16, #mma> -> tensor<128x128xf16, #linear>
 // CHECK: tt.store %[[PTR]], %[[VAL]] : tensor<128x128x!tt.ptr<f16>, #linear>
 #blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [0, 1]}>
 #mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
 module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
-  tt.func public @store_dword(%arg0: !tt.ptr<f16>) attributes {noinline = false} {
+  tt.func public @store_dword_128x128(%arg0: !tt.ptr<f16>) attributes {noinline = false} {
     %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
     %cst_0 = arith.constant dense<1.230000e+02> : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
     %cst_1 = arith.constant dense<1.230000e+02> : tensor<128x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
@@ -63,3 +63,26 @@ module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32}
     tt.return
   }
 }
+
+// -----
+// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 16], [0, 128], [64, 0], [128, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 32], [0, 64], [32, 0]], block = []}>
+// CHECK-LABEL: store_dword_256x256
+// CHECK-NOT: ttg.convert_layout %{{.*}} : tensor<256x256xf32, #mma> -> tensor<256x256xf32, #blocked>
+// CHECK-DAG: %[[PTR:.+]] = ttg.convert_layout %{{.*}} : tensor<256x256x!tt.ptr<f16>, #mma> -> tensor<256x256x!tt.ptr<f16>, #linear>
+// CHECK-DAG: %[[VAL:.+]] = ttg.convert_layout %{{.*}} : tensor<256x256xf16, #mma> -> tensor<256x256xf16, #linear>
+// CHECK: tt.store %[[PTR]], %[[VAL]] : tensor<256x256x!tt.ptr<f16>, #linear>
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 32], warpsPerCTA = [8, 1], order = [1, 0]}>
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [2, 4], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @store_dword_256x256(%arg0: !tt.ptr<f16>) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x256xf32, #mma>
+    %cst_0 = arith.constant dense<1.230000e+02> : tensor<256x256xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    %cst_1 = arith.constant dense<1.230000e+02> : tensor<256x256xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
+    %0 = tt.dot %cst_0, %cst_1, %cst : tensor<256x256xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<256x256xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<256x256xf32, #mma>
+    %1 = ttg.convert_layout %0 : tensor<256x256xf32, #mma> -> tensor<256x256xf32, #blocked>
+    %2 = arith.truncf %1 : tensor<256x256xf32, #blocked> to tensor<256x256xf16, #blocked>
+    %3 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<256x256x!tt.ptr<f16>, #blocked>
+    tt.store %3, %2 : tensor<256x256x!tt.ptr<f16>, #blocked>
+    tt.return
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/ConvertLayoutOpToLLVM.cpp

Lines changed: 3 additions & 14 deletions
@@ -189,21 +189,10 @@ static bool matchMFMAAndLinearLayoutCase(RankedTensorType srcTy,
   if (!mfmaLayout || !linearLayout)
     return false;

-  std::optional<LinearLayout> srcLL =
+  std::optional<LinearLayout> storeLL =
       mlir::triton::gpu::chooseMfmaLikeStoreLayout(srcTy);
-  if (!srcLL)
-    return false;
-
-  MLIRContext *ctx = linearLayout.getContext();
-  StringAttr kLane = StringAttr::get(ctx, "lane");
-  StringAttr kRegister = StringAttr::get(ctx, "register");
-  auto srcBase = srcLL.value().getBases();
-  auto srcReg = srcBase.lookup(kRegister);
-  auto srcLane = srcBase.lookup(kLane);
-  auto dstBases = linearLayout.getLinearLayout().getBases();
-  auto dstReg = dstBases.lookup(kRegister);
-  auto dstLane = dstBases.lookup(kLane);
-  return dstReg == srcReg && dstLane == srcLane;
+  return linearLayout.getLinearLayout() ==
+         storeLL.value_or(LinearLayout::empty());
 };

 struct ConvertLayoutOpMFMAToLinearConversion
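Two things make the one-line comparison above work: LinearLayout supports full equality (so warp and block bases are now compared too, not just register and lane), and the empty layout acts as a never-matching sentinel when chooseMfmaLikeStoreLayout bails out. The same value_or idiom in miniature, with std::string standing in for LinearLayout (a sketch, not code from the patch):

#include <cassert>
#include <optional>
#include <string>

int main() {
  std::string actual = "layout";
  std::optional<std::string> candidate; // e.g. a chooser that bailed out
  // No candidate: value_or yields the sentinel, so the match fails...
  assert(actual != candidate.value_or(std::string{}));
  candidate = "layout";
  // ...with a candidate, it is a plain equality check.
  assert(actual == candidate.value_or(std::string{}));
  return 0;
}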

third_party/amd/lib/TritonAMDGPUTransforms/OptimizeEpilogue.cpp

Lines changed: 15 additions & 22 deletions
@@ -59,27 +59,23 @@ bool isOneOperandElementwiseOp(Operation *op) {
   return false;
 }

-static triton::StoreOp convertMfmaLayoutForCDNA4(PatternRewriter &rewriter,
-                                                 Value ptr, Value val,
-                                                 Value mask,
-                                                 triton::StoreOp oldStOp) {
+// Tries to optimize oldStoreOp with v_permlane*_swap instructions when possible.
+// Returns a null store op if not suitable.
+static triton::StoreOp
+usePermlaneSwapToOptimizeStore(PatternRewriter &rewriter, Value ptr, Value val,
+                               Value mask, triton::StoreOp oldStoreOp) {
   auto ptrType = cast<RankedTensorType>(ptr.getType());
   auto valType = cast<RankedTensorType>(val.getType());

-  auto mfmaLayout =
-      cast<triton::gpu::AMDMfmaEncodingAttr>(valType.getEncoding());
-
   // Create a new layout where each thread holds 8 consecutive elements, in
   // order to enable wide 128-bit global stores.
-  std::optional<triton::LinearLayout> mfma8Layout =
+  std::optional<triton::LinearLayout> storeLL =
       triton::gpu::chooseMfmaLikeStoreLayout(valType);
+  if (!storeLL)
+    return nullptr;

-  if (!mfma8Layout)
-    return rewriter.create<triton::StoreOp>(oldStOp.getLoc(), ptr, val, mask,
-                                            oldStOp.getCache(),
-                                            oldStOp.getEvict());
   Attribute newEncoding = triton::gpu::LinearEncodingAttr::get(
-      mfmaLayout.getContext(), mfma8Layout.value());
+      oldStoreOp.getContext(), storeLL.value());
   auto newPtrType = RankedTensorType::get(
       ptrType.getShape(), ptrType.getElementType(), newEncoding);
   Value newPtr = rewriter.create<triton::gpu::ConvertLayoutOp>(ptr.getLoc(),
@@ -99,9 +95,9 @@ static triton::StoreOp convertMfmaLayoutForCDNA4(PatternRewriter &rewriter,
                                                             newMaskType, mask);
   }

-  return rewriter.create<triton::StoreOp>(oldStOp.getLoc(), newPtr, newVal,
-                                          newMask, oldStOp.getCache(),
-                                          oldStOp.getEvict());
+  return rewriter.create<triton::StoreOp>(oldStoreOp.getLoc(), newPtr, newVal,
+                                          newMask, oldStoreOp.getCache(),
+                                          oldStoreOp.getEvict());
 }

 // convert(val) : xmma -> blocked
@@ -195,12 +191,9 @@ class BypassEpilogueSMEM : public mlir::OpRewritePattern<triton::StoreOp> {
       newMask = rewriter.create<triton::gpu::ConvertLayoutOp>(
          mask.getLoc(), newMaskType, mask);
    }
-    triton::StoreOp newStoreOp;
-    if (auto mfmaLayout =
-            dyn_cast<triton::gpu::AMDMfmaEncodingAttr>(newEncoding)) {
-      newStoreOp =
-          convertMfmaLayoutForCDNA4(rewriter, newPtr, newVal, newMask, stOp);
-    } else {
+    triton::StoreOp newStoreOp =
+        usePermlaneSwapToOptimizeStore(rewriter, newPtr, newVal, newMask, stOp);
+    if (!newStoreOp) {
      newStoreOp = rewriter.create<triton::StoreOp>(
          stOp.getLoc(), newPtr, newVal, newMask, stOp.getCache(),
          stOp.getEvict());
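A trivial sanity check (mine, not from the patch) of the width arithmetic behind the comment in usePermlaneSwapToOptimizeStore: eight consecutive 16-bit elements per thread fill exactly one 128-bit (dwordx4) global store.

// Eight 16-bit elements per thread make one 128-bit global store.
static_assert(8 * 16 == 128, "eight [b]f16 values fill one 128-bit store");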
