Commit e32c3b1
[AMD] Optimize to use 128-bit stores in epilogue for CDNA4 (triton-lang#6688)
Convert the mfmaLayout to a linear layout where each thread holds 8 consecutive elements to enable dwordx4 stores in the epilogue.
1 parent ca1ce1b commit e32c3b1
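For context: an f16/bf16 element is 16 bits, so a thread holding 8 consecutive elements can issue a single 128-bit store, i.e. a global_store_dwordx4 (4 × 32-bit dwords). A trivial standalone check of that arithmetic (not part of the commit):

```cpp
// Width arithmetic behind the commit message; compiles as-is.
static_assert(8 * 16 == 128, "8 x [b]f16 elements fill one 128-bit store");
static_assert(128 / 32 == 4, "128 bits == 4 dwords, hence dwordx4");
```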

4 files changed: +131 -5
include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 8 additions & 1 deletion
```diff
@@ -282,6 +282,13 @@ LinearLayout chooseScaledMfmaScaleLayout(
     MLIRContext *ctx, int dotOperandIdx,
     const std::vector<std::vector<int32_t>> &dotOperandWarpBasis,
     ArrayRef<int64_t> dotOperandShape, unsigned mfmaMDim);
-} // namespace mlir::triton::gpu
 
+// Create a LinearLayout similar to mfmaLayout, but changing each thread to hold
+// 8 elements. This layout is useful for emitting the widest 128-bit global
+// store instructions. Since it closely resembles mfmaLayout, conversion between
+// the two can be done using transferWithinWarp, without involving LDS.
+LinearLayout chooseMfmaLikeStoreLayout(AMDMfmaEncodingAttr mfmaLayout,
+                                       ArrayRef<int64_t> shape);
+
+} // namespace mlir::triton::gpu
 #endif // TRITON_DIALECT_TRITONGPU_IR_LINEARLAYOUTCONVERSIONS_H
```
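A sketch of how this API is meant to be consumed, distilled from the OptimizeEpilogue.cpp change further below (the helper name here is hypothetical; every other symbol appears in this patch): wrap the returned layout in a LinearEncodingAttr and retype the store's tensors with it.

```cpp
// Sketch only, not code from the commit: retype a value with the wide-store
// layout. Mirrors convertMfmaLayoutForCDNA4 in OptimizeEpilogue.cpp below.
static Value retypeForWideStore(PatternRewriter &rewriter, Value val,
                                triton::gpu::AMDMfmaEncodingAttr mfmaLayout) {
  auto valType = cast<RankedTensorType>(val.getType());
  triton::LinearLayout mfma8Layout =
      chooseMfmaLikeStoreLayout(mfmaLayout, valType.getShape());
  Attribute newEncoding = triton::gpu::LinearEncodingAttr::get(
      mfmaLayout.getContext(), mfma8Layout);
  auto newType = RankedTensorType::get(valType.getShape(),
                                       valType.getElementType(), newEncoding);
  return rewriter.create<triton::gpu::ConvertLayoutOp>(val.getLoc(), newType,
                                                       val);
}
```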

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 33 additions & 0 deletions
```diff
@@ -1542,6 +1542,39 @@ LinearLayout chooseScaledMfmaScaleLayout(
   return newLL;
 }
 
+LinearLayout chooseMfmaLikeStoreLayout(AMDMfmaEncodingAttr mfmaLayout,
+                                       ArrayRef<int64_t> shape) {
+  assert(shape.size() == 2 && mfmaLayout.getMDim() == 32 &&
+         mfmaLayout.getNDim() == 32 && mfmaLayout.getIsTransposed());
+
+  MLIRContext *ctx = mfmaLayout.getContext();
+  StringAttr kRegister = S("register");
+  StringAttr kLane = S("lane");
+  StringAttr kWarp = S("warp");
+  StringAttr kBlock = S("block");
+
+  SmallVector<unsigned> order = getDefaultMmaOrder(mfmaLayout);
+  auto standardOutDims = standardOutDimNames(ctx, 2);
+  // We make each thread handle 8 consecutive elements to enable 128-bit
+  // global stores for [b]f16 types and keep the thread pattern in each lane
+  // similar to the canonical mfmaLayout.
+  LinearLayout mfma8Layout = LinearLayout::empty();
+  mfma8Layout =
+      LinearLayout({{kRegister, {{1, 0}, {2, 0}, {4, 0}}},
+                    {kLane, {{0, 1}, {0, 2}, {0, 4}, {0, 8}, {0, 16}, {8, 0}}},
+                    {kWarp, {}},
+                    {kBlock, {}}},
+                   {standardOutDims[order[0]], standardOutDims[order[1]]});
+
+  LinearLayout warpLayout =
+      identityStandardND(kWarp, mfmaLayout.getWarpsPerCTA(), order);
+  LinearLayout ctaLayout = mfma8Layout.transposeOuts(standardOutDims) *
+                           warpLayout.transposeOuts(standardOutDims);
+  mfma8Layout =
+      combineCtaCgaWithShape(ctaLayout, mfmaLayout.getCTALayout(), shape);
+  return mfma8Layout;
+}
+
 LinearLayout getScaleTMEMStoreLinearLayout(RankedTensorType scaleType,
                                            int numWarps) {
   assert(numWarps == 4 || numWarps == 8);
```
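To make the basis vectors concrete: a LinearLayout is linear over GF(2), so an input index maps to the XOR of the bases of its set bits. The standalone sketch below (plain C++, no Triton dependency; bases copied from the #linear attribute CHECK'd in the lit test that follows) evaluates the intra-warp part of the resulting layout.

```cpp
// Evaluates the intra-warp portion of the wide-store layout: first three
// register bits plus all six lane bits; warp/CTA tiling omitted. Each basis
// is a (dim0, dim1) vector; outputs combine by XOR, per LinearLayout.
#include <cstdio>

static const int kRegBases[3][2]  = {{0, 1}, {0, 2}, {0, 4}};
static const int kLaneBases[6][2] = {{1, 0}, {2, 0}, {4, 0},
                                     {8, 0}, {16, 0}, {0, 8}};

static void apply(const int bases[][2], int nbits, int idx, int out[2]) {
  for (int b = 0; b < nbits; ++b)
    if (idx & (1 << b)) {
      out[0] ^= bases[b][0];
      out[1] ^= bases[b][1];
    }
}

int main() {
  for (int lane : {0, 1, 32}) {
    std::printf("lane %2d:", lane);
    for (int reg = 0; reg < 8; ++reg) {
      int out[2] = {0, 0};
      apply(kRegBases, 3, reg, out);
      apply(kLaneBases, 6, lane, out);
      std::printf(" (%d,%d)", out[0], out[1]);
    }
    std::printf("\n");
  }
  // Prints: lane 0 -> (0,0)..(0,7), lane 1 -> (1,0)..(1,7),
  // lane 32 -> (0,8)..(0,15).
  return 0;
}
```

Each thread's 8 × 2-byte elements are contiguous along the minor dimension, so the epilogue can emit one 16-byte (dwordx4) store per lane; lanes 0-31 cover 32 rows at columns 0-7 and lanes 32-63 cover the next 8 columns.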

test/TritonGPU/amd/amd-optimize-epilogue.mlir

Lines changed: 23 additions & 0 deletions
```diff
@@ -40,3 +40,26 @@ module attributes {"ttg.num-warps" = 1 : i32, "ttg.threads-per-warp" = 64 : i32}
     tt.return
   }
 }
+
+// -----
+// CHECK{LITERAL}: #linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [0, 16], [0, 32], [0, 64]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[32, 0], [64, 0]], block = []}>
+// CHECK-LABEL: store_dword
+// CHECK-NOT: ttg.convert_layout %{{.*}} : tensor<128x128xf32, #mma> -> tensor<128x128xf32, #blocked>
+// CHECK-DAG: %[[PTR:.+]] = ttg.convert_layout %{{.*}} : tensor<128x128x!tt.ptr<f16>, #mma> -> tensor<128x128x!tt.ptr<f16>, #linear>
+// CHECK-DAG: %[[VAL:.+]] = ttg.convert_layout %{{.*}} : tensor<128x128xf16, #mma> -> tensor<128x128xf16, #linear>
+// CHECK: tt.store %[[PTR]], %[[VAL]] : tensor<128x128x!tt.ptr<f16>, #linear>
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [16, 4], warpsPerCTA = [4, 1], order = [0, 1]}>
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @store_dword(%arg0: !tt.ptr<f16>) attributes {noinline = false} {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x128xf32, #mma>
+    %cst_0 = arith.constant dense<1.230000e+02> : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>>
+    %cst_1 = arith.constant dense<1.230000e+02> : tensor<128x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>>
+    %0 = tt.dot %cst_0, %cst_1, %cst : tensor<128x128xf32, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 2}>> * tensor<128x128xf32, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 2}>> -> tensor<128x128xf32, #mma>
+    %1 = ttg.convert_layout %0 : tensor<128x128xf32, #mma> -> tensor<128x128xf32, #blocked>
+    %2 = arith.truncf %1 : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked>
+    %3 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<128x128x!tt.ptr<f16>, #blocked>
+    tt.store %3, %2 : tensor<128x128x!tt.ptr<f16>, #blocked>
+    tt.return
+  }
+}
```

third_party/amd/lib/TritonAMDGPUTransforms/OptimizeEpilogue.cpp

Lines changed: 67 additions & 4 deletions
```diff
@@ -59,6 +59,59 @@ bool isOneOperandElementwiseOp(Operation *op) {
   return false;
 }
 
+static triton::StoreOp convertMfmaLayoutForCDNA4(PatternRewriter &rewriter,
+                                                 Value ptr, Value val,
+                                                 Value mask,
+                                                 triton::StoreOp oldStOp) {
+  auto ptrType = cast<RankedTensorType>(ptr.getType());
+  auto valType = cast<RankedTensorType>(val.getType());
+
+  auto mfmaLayout =
+      cast<triton::gpu::AMDMfmaEncodingAttr>(valType.getEncoding());
+
+  bool mfma32 = mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32;
+
+  if (valType.getRank() != 2 ||
+      (!valType.getElementType().isF16() &&
+       !valType.getElementType().isBF16()) ||
+      mfmaLayout.getVersionMajor() != 4 || !mfmaLayout.getIsTransposed() ||
+      !mfma32) {
+    return rewriter.create<triton::StoreOp>(oldStOp.getLoc(), ptr, val, mask,
+                                            oldStOp.getCache(),
+                                            oldStOp.getEvict());
+  }
+
+  // Create a new layout where each thread holds 8 consecutive elements, in
+  // order to enable wide 128-bit global stores.
+  triton::LinearLayout mfma8Layout =
+      chooseMfmaLikeStoreLayout(mfmaLayout, valType.getShape());
+
+  Attribute newEncoding = triton::gpu::LinearEncodingAttr::get(
+      mfmaLayout.getContext(), mfma8Layout);
+  auto newPtrType = RankedTensorType::get(
+      ptrType.getShape(), ptrType.getElementType(), newEncoding);
+  Value newPtr = rewriter.create<triton::gpu::ConvertLayoutOp>(ptr.getLoc(),
+                                                               newPtrType, ptr);
+
+  auto newValType = RankedTensorType::get(
+      valType.getShape(), valType.getElementType(), newEncoding);
+  Value newVal = rewriter.create<triton::gpu::ConvertLayoutOp>(val.getLoc(),
+                                                               newValType, val);
+
+  Value newMask = mask;
+  if (mask) {
+    auto maskType = dyn_cast<RankedTensorType>(mask.getType());
+    auto newMaskType = RankedTensorType::get(
+        maskType.getShape(), maskType.getElementType(), newEncoding);
+    newMask = rewriter.create<triton::gpu::ConvertLayoutOp>(mask.getLoc(),
+                                                            newMaskType, mask);
+  }
+
+  return rewriter.create<triton::StoreOp>(oldStOp.getLoc(), newPtr, newVal,
+                                          newMask, oldStOp.getCache(),
+                                          oldStOp.getEvict());
+}
+
 // convert(val) : xmma -> blocked
 // elementWiseOp(val) : blocked
 // ...
```
```diff
@@ -126,19 +179,20 @@ class BypassEpilogueSMEM : public mlir::RewritePattern {
     auto newEncoding =
         cast<RankedTensorType>(cvtOp.getSrc().getType()).getEncoding();
 
-    auto newVal = cvtOp.getSrc();
-
     auto newPtrType = RankedTensorType::get(
         ptrType.getShape(), ptrType.getElementType(), newEncoding);
     Value newPtr = rewriter.create<triton::gpu::ConvertLayoutOp>(
         ptr.getLoc(), newPtrType, ptr);
 
+    auto newVal = cvtOp.getSrc();
+
     for (auto chainedOp : llvm::reverse(chainedOps)) {
       auto oldType =
           cast<mlir::RankedTensorType>(chainedOp->getResult(0).getType());
       chainedOp->setOperand(0, newVal);
       newVal = llvm::cast<mlir::TypedValue<RankedTensorType>>(
           chainedOp->getResult(0));
+
       auto newType = mlir::RankedTensorType::get(
           oldType.getShape(), oldType.getElementType(), newEncoding);
       newVal.setType(newType);
```
```diff
@@ -152,9 +206,18 @@ class BypassEpilogueSMEM : public mlir::RewritePattern {
       newMask = rewriter.create<triton::gpu::ConvertLayoutOp>(
           mask.getLoc(), newMaskType, mask);
     }
+    triton::StoreOp newStoreOp;
+    if (auto mfmaLayout =
+            dyn_cast<triton::gpu::AMDMfmaEncodingAttr>(newEncoding)) {
+      newStoreOp =
+          convertMfmaLayoutForCDNA4(rewriter, newPtr, newVal, newMask, stOp);
+    } else {
+      newStoreOp = rewriter.create<triton::StoreOp>(
+          stOp.getLoc(), newPtr, newVal, newMask, stOp.getCache(),
+          stOp.getEvict());
+    }
 
-    rewriter.replaceOpWithNewOp<triton::StoreOp>(
-        stOp, newPtr, newVal, newMask, stOp.getCache(), stOp.getEvict());
+    rewriter.replaceOp(stOp, newStoreOp);
     return mlir::success();
   }
 };
```
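The gating condition in convertMfmaLayoutForCDNA4 reads more clearly when pulled out as a predicate. This is a hypothetical refactoring sketch with the same conditions as the patch, not code from the commit:

```cpp
// Hypothetical helper: the wide-store layout is only applied to 2-D f16/bf16
// results of transposed 32x32 mfma on CDNA4 (versionMajor == 4); everything
// else falls back to a plain store with the original layout.
static bool canUseWideStoreLayout(RankedTensorType valType,
                                  triton::gpu::AMDMfmaEncodingAttr mfmaLayout) {
  bool mfma32 = mfmaLayout.getMDim() == 32 && mfmaLayout.getNDim() == 32;
  bool fp16Like = valType.getElementType().isF16() ||
                  valType.getElementType().isBF16();
  return valType.getRank() == 2 && fp16Like &&
         mfmaLayout.getVersionMajor() == 4 && mfmaLayout.getIsTransposed() &&
         mfma32;
}
```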
