
Commit 88b6833

[AMD] Add pattern to enforce coalesced write for async load (#6255)
Adds `TritonAMDGPUCoalesceAsyncCopyPass` to convert the blocked layout of `AsyncCopy` ops if they would produce non-coalesced writes on `GFX9`, which is a hardware requirement. The pass ensures the `sizePerThread` of the blocked layout is not greater than the contiguity of the source and mask elements or than the supported load size. Support for swizzled shared encodings will be added in a separate PR, so the pass skips those `AsyncCopies` for now. This pass will be required when we add `AsyncCopy` pipelining support to the AMD backend in a later PR.
1 parent dee0846 commit 88b6833
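At its core, the new pass clips the number of elements each thread writes to the largest direct-to-LDS load width the target supports, starting from the source/mask contiguity reported by AxisAnalysis. A minimal standalone sketch of that clipping rule (the helper and the set of supported widths are assumptions for illustration, mirroring the behaviour exercised by the lit tests below, which run with arch-generation-name=gfx950):

#include <cstdio>

// Hypothetical stand-in for TargetInfo::supportsDirectToLdsLoadBitWidth:
// the lit tests below rely on 32-bit and 128-bit direct-to-LDS loads being
// supported while 64-bit is not.
static bool supportsDirectToLdsLoadBitWidth(unsigned bits) {
  return bits == 32 || bits == 128;
}

// Halve the per-thread element count until the resulting load width is
// supported, as the pattern in CoalesceAsyncCopy.cpp does.
static unsigned clipLoadContig(unsigned loadContig, unsigned elemBitWidth) {
  while (loadContig > 0 &&
         !supportsDirectToLdsLoadBitWidth(loadContig * elemBitWidth))
    loadContig /= 2;
  return loadContig;
}

int main() {
  printf("%u\n", clipLoadContig(16, 16)); // f16, contiguity 16: 256 bit -> 8 (128 bit)
  printf("%u\n", clipLoadContig(4, 16));  // f16, contiguity 4: 64 bit -> 2 (32 bit)
}

The clipped value becomes sizePerThread along the fastest-varying dimension of the rewritten blocked encoding, which is what the lit tests below check.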

7 files changed, +367 -0 lines changed

bin/RegisterTritonDialects.h

Lines changed: 1 addition & 0 deletions
@@ -70,6 +70,7 @@ inline void registerTritonDialects(mlir::DialectRegistry &registry) {
   mlir::registerTritonAMDGPUCanonicalizePointers();
   mlir::registerTritonAMDGPUConvertToBufferOps();
   mlir::registerTritonAMDGPUInThreadTranspose();
+  mlir::registerTritonAMDGPUCoalesceAsyncCopy();
   mlir::triton::registerTritonAMDGPUInsertInstructionSchedHints();
   mlir::triton::registerTritonAMDGPULowerInstructionSchedHints();

Lines changed: 145 additions & 0 deletions
@@ -0,0 +1,145 @@
// RUN: triton-opt %s -split-input-file --tritonamdgpu-coalesce-async-copy=arch-generation-name=gfx950 | FileCheck %s

#blocked = #ttg.blocked<{sizePerThread = [4], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
  // sizePerThread = [1] because we have no information about contiguity of src pointers
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1], threadsPerWarp = [32], warpsPerCTA = [4], order = [0]}>
  tt.func @async_copy_1d(%input: tensor<1024x!tt.ptr<f16>, #blocked>,
                         %view: !ttg.memdesc<1024xf16, #shared, #smem, mutable>) {
    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<1024x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<1024x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %token = ttg.async_copy_global_to_local %input, %view: tensor<1024x!tt.ptr<f16>, #blocked> -> <1024xf16, #shared, #smem, mutable>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
  // sizePerThread = [1, 1] because we have no information about contiguity of src pointers
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
  tt.func @async_copy_2d(%input: tensor<64x64x!tt.ptr<f16>, #blocked>,
                         %view: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>) {
    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<64x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<64x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %token = ttg.async_copy_global_to_local %input, %view: tensor<64x64x!tt.ptr<f16>, #blocked> -> <64x64xf16, #shared, #smem, mutable>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [8, 1, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [1,2,2], order = [0,1,2]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0,1,2]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
  // sizePerThread = [1, 1, 1] because we have no information about contiguity of src pointers
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1, 1, 1], threadsPerWarp = [32, 1, 1], warpsPerCTA = [4, 1, 1], order = [0, 1, 2]}>
  tt.func @async_copy_3d(%input: tensor<1024x1024x1024x!tt.ptr<f16>, #blocked>,
                         %view: !ttg.memdesc<1024x1024x1024xf16, #shared, #smem, mutable>) {
    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<1024x1024x1024x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<1024x1024x1024x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %token = ttg.async_copy_global_to_local %input, %view: tensor<1024x1024x1024x!tt.ptr<f16>, #blocked> -> <1024x1024x1024xf16, #shared, #smem, mutable>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32} {
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [2, 2], order = [1, 0]}>
  tt.func @async_copy_with_mask_and_other(%input: tensor<64x64x!tt.ptr<f16>, #blocked>,
                                          %view: !ttg.memdesc<64x64xf16, #shared, #smem, mutable>,
                                          %mask: tensor<64x64xi1, #blocked>,
                                          %other: tensor<64x64xf16, #blocked>) {
    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<64x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<64x64xi1, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<64x64xf16, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<64x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %token = ttg.async_copy_global_to_local %input, %view mask %mask other %other: tensor<64x64x!tt.ptr<f16>, #blocked> -> <64x64xf16, #shared, #smem, mutable>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 16], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  // Clip to vector size 2 (32bit) because we do not support 64 bit loads to lds
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [2, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
  tt.func public @async_copy_vector_size_2(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
                                           %arg1: i32 {tt.divisibility = 16 : i32},
                                           %arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
    // We need the index calculation so AxisAnalysis sees that we can vectorize the load
    %1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
    %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
    %3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
    %4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
    %5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>

    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<32x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<32x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [1, 0]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  // Clip to vector size 8 (128bit) which is the largest supported load width
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
  tt.func public @async_copy_vector_size_8(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
                                           %arg1: i32 {tt.divisibility = 16 : i32},
                                           %arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
    // We need the index calculation so AxisAnalysis sees that we can vectorize the load based on the src contiguity
    %1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
    %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
    %3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
    %4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
    %5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>

    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<32x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<32x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
    tt.return
  }
}

// -----

#blocked = #ttg.blocked<{sizePerThread = [1, 16], threadsPerWarp = [32, 2], warpsPerCTA = [4, 1], order = [1, 0]}>
#shared = #ttg.swizzled_shared<{vec = 1, perPhase = 1, maxPhase = 1, order = [0, 1]}>
#smem = #ttg.shared_memory
module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.shared = 8192 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
  // The orders of #blocked and #shared differ, so we need to clip to 1 element
  // CHECK: #[[NEW_BLOCKED:.*]] = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 64], warpsPerCTA = [4, 1], order = [1, 0]}>
  tt.func public @async_copy_different_order(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32, tt.pointer_range = 32 : i32},
                                             %arg1: i32 {tt.divisibility = 16 : i32},
                                             %arg2: !ttg.memdesc<32x64xf16, #shared, #smem, mutable>) {
    // We need the index calculation so AxisAnalysis sees that we can vectorize the load based on the src contiguity
    %1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
    %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
    %3 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<32x64xi32, #blocked>
    %4 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<32x64x!tt.ptr<f16>, #blocked>
    %5 = tt.addptr %4, %3 : tensor<32x64x!tt.ptr<f16>, #blocked>, tensor<32x64xi32, #blocked>

    // CHECK: %{{.*}} = ttg.convert_layout %{{.*}} : {{.*}} -> tensor<32x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    // CHECK: %{{.*}} = ttg.async_copy_global_to_local %{{.*}}: tensor<32x64x!tt.ptr<f16>, #[[NEW_BLOCKED]]>
    %6 = ttg.async_copy_global_to_local %5, %arg2 : tensor<32x64x!tt.ptr<f16>, #blocked> -> <32x64xf16, #shared, #smem, mutable>
    tt.return
  }
}

third_party/amd/include/TritonAMDGPUTransforms/Passes.h

Lines changed: 3 additions & 0 deletions
@@ -37,6 +37,9 @@ createTritonAMDGPUBlockPingpongPass(int32_t numStages = 2);
 
 std::unique_ptr<Pass> createTritonAMDGPUInThreadTransposePass();
 
+std::unique_ptr<Pass>
+createTritonAMDGPUCoalesceAsyncCopyPass(std::string archGenName = {});
+
 /// Generate the code for registering passes.
 #define GEN_PASS_REGISTRATION
 #include "TritonAMDGPUTransforms/Passes.h.inc"

third_party/amd/include/TritonAMDGPUTransforms/Passes.td

Lines changed: 20 additions & 0 deletions
@@ -224,6 +224,26 @@ def TritonAMDGPUInThreadTranspose: Pass<"tritonamdgpu-in-thread-transpose", "mli
   let dependentDialects = ["mlir::triton::amdgpu::TritonAMDGPUDialect", "mlir::triton::gpu::TritonGPUDialect"];
 }
 
+def TritonAMDGPUCoalesceAsyncCopy: Pass<"tritonamdgpu-coalesce-async-copy", "mlir::ModuleOp"> {
+  let summary = "Improve coalescing for async global to local copies";
+
+  let description = [{
+    GFX9:
+    For AsyncCopyGlobalToLocal ops where the blocked encoding's sizePerThread is larger than the contiguity of the
+    source or the supported load vector size, clip it to the largest supported size. This ensures we get coalesced
+    writes to shared memory, as required by the hardware. This only works for non-swizzled shared memory layouts.
+  }];
+
+  let constructor = "mlir::createTritonAMDGPUCoalesceAsyncCopyPass()";
+
+  let dependentDialects = [];
+
+  let options = [
+    Option<"archGenerationName", "arch-generation-name",
+           "std::string", /*default=*/"std::string{}",
+           "GFX generation name of target device.">,
+  ];
+}
 
 
 #endif
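For reference, the arch-generation-name option declared above is the same flag the lit test drives through triton-opt; a standalone invocation could look like this (the input file name is illustrative):

triton-opt --tritonamdgpu-coalesce-async-copy=arch-generation-name=gfx950 input.mlir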

third_party/amd/lib/TritonAMDGPUTransforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -2,6 +2,7 @@ add_triton_library(TritonAMDGPUTransforms
   AccelerateAMDMatmul.cpp
   BlockPingpong.cpp
   CanonicalizePointers.cpp
+  CoalesceAsyncCopy.cpp
   ConvertToBufferOps.cpp
   OptimizeEpilogue.cpp
   HoistLayoutConversions.cpp
third_party/amd/lib/TritonAMDGPUTransforms/CoalesceAsyncCopy.cpp

Lines changed: 194 additions & 0 deletions
@@ -0,0 +1,194 @@
#include "TritonAMDGPUToLLVM/TargetUtils.h"
#include "amd/lib/TritonAMDGPUToLLVM/Utility.h"
#include "mlir/Transforms/GreedyPatternRewriteDriver.h"
#include "triton/Analysis/AxisInfo.h"
#include "triton/Conversion/TritonGPUToLLVM/Utility.h"

#define GEN_PASS_CLASSES
#include "TritonAMDGPUTransforms/Passes.h"

#undef DEBUG_TYPE
#define DEBUG_TYPE "tritonamdgpu-coalesce-async-copy"
#define DBGS() (llvm::dbgs() << "[" DEBUG_TYPE "]: ")
#define LDBG(X) LLVM_DEBUG(DBGS() << X << "\n")

using namespace mlir;
namespace ttg = triton::gpu;

// On gfx9, global and buffer loads directly to shared memory need to write
// coalesced. This pattern converts the layout of the src, mask and other to
// ensure the data owned per thread is contiguous and does not exceed the
// supported load vector size, so the writes are coalesced.
struct CoalesceAsyncCopyWrites
    : public OpRewritePattern<ttg::AsyncCopyGlobalToLocalOp> {
  CoalesceAsyncCopyWrites(const triton::AMD::TargetInfo &targetInfo,
                          const DenseMap<ttg::AsyncCopyGlobalToLocalOp,
                                         unsigned> &asyncCopyContiguity,
                          MLIRContext *ctx)
      : OpRewritePattern(ctx), targetInfo{targetInfo},
        asyncCopyContiguity{std::move(asyncCopyContiguity)} {}

  LogicalResult matchAndRewrite(ttg::AsyncCopyGlobalToLocalOp copyOp,
                                PatternRewriter &rewriter) const override {
    auto src = copyOp.getSrc();
    auto dst = copyOp.getResult();
    Value mask = copyOp.getMask();
    Value other = copyOp.getOther();

    auto srcTy = cast<RankedTensorType>(src.getType());
    auto dstTy = cast<ttg::MemDescType>(dst.getType());

    auto blockedEnc = dyn_cast<ttg::BlockedEncodingAttr>(srcTy.getEncoding());
    if (!blockedEnc)
      return rewriter.notifyMatchFailure(copyOp,
                                         "src encoding must be #blocked");

    auto sharedEnc =
        dyn_cast<ttg::SwizzledSharedEncodingAttr>(dstTy.getEncoding());
    if (!sharedEnc)
      return rewriter.notifyMatchFailure(
          copyOp, "destination encoding must be #SwizzledShared");
    if (sharedEnc.getMaxPhase() > 1)
      return rewriter.notifyMatchFailure(
          copyOp, "swizzled shared encoding not supported");

    // We start from the precomputed contiguity we got from AxisAnalysis.
    unsigned loadContig = 0;
    if (auto it = asyncCopyContiguity.find(copyOp);
        it != asyncCopyContiguity.end())
      loadContig = it->second;
    else
      return copyOp->emitError()
             << "No contiguity information about the copy op";
    assert(loadContig > 0);

    // Further restrict the contiguity based on the contiguity of the src to
    // dst layout, e.g. if the order of the blocked and shared encoding is
    // different we can only load one element at a time, or if the shared
    // encoding is swizzled we cannot exceed the vector size of the swizzling
    // pattern.
    LinearLayout regLayout =
        triton::gpu::toLinearLayout(srcTy.getShape(), blockedEnc);
    LinearLayout sharedLayout =
        triton::gpu::toLinearLayout(srcTy.getShape(), sharedEnc);
    auto regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
    loadContig = std::min<unsigned>(loadContig,
                                    regToSharedLayout.getNumConsecutiveInOut());

    // Select the largest supported load width equal or smaller than loadContig.
    auto elemBitWidth = dstTy.getElementTypeBitWidth();
    while (loadContig > 0 && !targetInfo.supportsDirectToLdsLoadBitWidth(
                                 loadContig * elemBitWidth)) {
      loadContig /= 2;
    }

    if (loadContig == 0) {
      return rewriter.notifyMatchFailure(
          copyOp, "could not find layout config to create coalesced writes");
    }

    // Do not rewrite if we already use the correct contiguity (could be from a
    // previous rewrite).
    auto contigPerThread = ttg::getContigPerThread(srcTy);
    auto blockedContig = contigPerThread[blockedEnc.getOrder()[0]];
    if (blockedContig == loadContig) {
      return rewriter.notifyMatchFailure(copyOp,
                                         "already using the correct layout");
    }

    // Get a new blocked encoding with loadContig as sizePerThread in the
    // fastest dim.
    assert(blockedContig >= loadContig);
    contigPerThread[blockedEnc.getOrder()[0]] = loadContig;
    int numWarps = triton::gpu::lookupNumWarps(copyOp);
    auto mod = copyOp->getParentOfType<ModuleOp>();
    int threadsPerWarp = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
    auto newBlockEnc = BlockedEncodingAttr::get(
        copyOp.getContext(), srcTy.getShape(), contigPerThread,
        blockedEnc.getOrder(), numWarps, threadsPerWarp,
        blockedEnc.getCTALayout());

    // Convert the layout of src, mask and other to the new encoding.
    auto convertLayout = [&rewriter](auto loc, Value old, auto newEnc) {
      auto oldTy = cast<RankedTensorType>(old.getType());
      RankedTensorType newSrcTy = RankedTensorType::get(
          oldTy.getShape(), oldTy.getElementType(), newEnc);
      return rewriter.create<ttg::ConvertLayoutOp>(loc, newSrcTy, old);
    };

    auto loc = copyOp->getLoc();
    Value cvtSrc = convertLayout(loc, src, newBlockEnc);

    if (mask)
      mask = convertLayout(loc, mask, newBlockEnc);
    if (other)
      other = convertLayout(loc, other, newBlockEnc);

    rewriter.modifyOpInPlace(copyOp, [&]() {
      copyOp.getSrcMutable().assign(cvtSrc);
      if (mask)
        copyOp.getMaskMutable().assign(mask);
      if (other)
        copyOp.getOtherMutable().assign(other);
    });
    return success();
  }

private:
  const triton::AMD::TargetInfo &targetInfo;
  const DenseMap<ttg::AsyncCopyGlobalToLocalOp, unsigned> &asyncCopyContiguity;
};

class TritonAMDGPUCoalesceAsyncCopyPass
    : public TritonAMDGPUCoalesceAsyncCopyBase<
          TritonAMDGPUCoalesceAsyncCopyPass> {
public:
  TritonAMDGPUCoalesceAsyncCopyPass(StringRef archGenName) {
    this->archGenerationName = archGenName.str();
  }

  void runOnOperation() override {
    ModuleOp m = getOperation();
    MLIRContext *context = &getContext();

    triton::AMD::TargetInfo targetInfo(archGenerationName);

    mlir::RewritePatternSet patterns(context);

    switch (targetInfo.getISAFamily()) {
    case triton::AMD::ISAFamily::CDNA1:
    case triton::AMD::ISAFamily::CDNA2:
    case triton::AMD::ISAFamily::CDNA3:
    case triton::AMD::ISAFamily::CDNA4: {
      break;
    }
    default:
      return;
    }

    // Precompute the contiguity of all AsyncCopy ops based on the src and
    // mask contiguity/alignment to avoid rebuilding ModuleAxisInfoAnalysis
    // after every IR change.
    triton::ModuleAxisInfoAnalysis axisAnalysis(m);
    DenseMap<ttg::AsyncCopyGlobalToLocalOp, unsigned> asyncCopyContiguity;
    m->walk([&](ttg::AsyncCopyGlobalToLocalOp copyOp) {
      unsigned contiguity =
          mlir::LLVM::AMD::getContiguity(copyOp.getSrc(), axisAnalysis);
      if (auto mask = copyOp.getMask()) {
        contiguity =
            std::min<unsigned>(contiguity, axisAnalysis.getMaskAlignment(mask));
      }
      asyncCopyContiguity.insert({copyOp, contiguity});
    });
    patterns.add<CoalesceAsyncCopyWrites>(targetInfo, asyncCopyContiguity,
                                          context);

    if (applyPatternsGreedily(m, std::move(patterns)).failed())
      signalPassFailure();
  }
};

std::unique_ptr<Pass>
mlir::createTritonAMDGPUCoalesceAsyncCopyPass(std::string archGenName) {
  return std::make_unique<TritonAMDGPUCoalesceAsyncCopyPass>(
      std::move(archGenName));
}
