
Commit eda3535

[AMD] Do not pipeline via AsyncCopy for unsupported vec sizes (#7676)
This PR makes `canBeConvertedToAsyncLoad` more general and strict so that it only allows cases where there is a supported direct-to-lds `vecSize` smaller than or equal to the `vecSize` derived from contiguity. This catches cases where we load less than 32 bits, which was already rejected before this PR. Additionally, it catches cases where we cannot lower the `vecSize` to a supported size. In such cases we also cannot use `ttg.async_copy_global_to_local`, since we cannot split the contiguous elements owned by a thread into multiple load instructions; e.g. fp64 with `vecSize == 1` does not work with `ttg.async_copy_global_to_local` on GFX9.
1 parent 239e17f commit eda3535
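
To make the new gating concrete, here is a small, self-contained C++ sketch. `DirectToLdsStub`, `fitVecSize`, and the supported widths are assumptions for illustration only (gfx942 being limited to 32-bit direct-to-lds loads comes from the commit message; 128-bit support on gfx950 is inferred from the new test); the real query is `TargetInfo::supportsDirectToLdsLoadBitWidth`, and the halving loop mirrors the `fitToValidDirectToLdsVecSize` helper added by this commit.

```cpp
#include <cstdio>

// Stand-in for triton::AMD::TargetInfo, for illustration only: gfx942 offers
// 32-bit direct-to-lds loads, while gfx950 additionally offers 128-bit ones
// (wide enough for two packed fp64 values).
struct DirectToLdsStub {
  bool isGfx950;
  bool supportsDirectToLdsLoadBitWidth(unsigned bits) const {
    return bits == 32 || (isGfx950 && bits == 128);
  }
};

// Mirrors fitToValidDirectToLdsVecSize: halve the vector size until the load
// width is supported by the direct-to-lds path; return 0 if nothing fits.
unsigned fitVecSize(unsigned maxVecSize, unsigned elemBitwidth,
                    const DirectToLdsStub &target) {
  while (maxVecSize > 0 &&
         !target.supportsDirectToLdsLoadBitWidth(maxVecSize * elemBitwidth))
    maxVecSize /= 2;
  return maxVecSize;
}

int main() {
  DirectToLdsStub gfx942{false}, gfx950{true};
  // fp64 with vecSize == 1 on gfx942: 64 bits is unsupported and halving
  // reaches 0, so pipelining falls back to registers instead of
  // ttg.async_copy_global_to_local.
  std::printf("gfx942 fp64 vecSize=1 -> %u\n", fitVecSize(1, 64, gfx942));
  // fp64 with two contiguous elements per thread on gfx950: a 128-bit
  // direct-to-lds load exists, so AsyncCopy stays enabled with vecSize == 2.
  std::printf("gfx950 fp64 vecSize=2 -> %u\n", fitVecSize(2, 64, gfx950));
  return 0;
}
```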

5 files changed: +99 additions, −20 deletions


test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 59 additions & 0 deletions
@@ -738,3 +738,62 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return
   }
 }
+
+// -----
+
+#AL = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#C = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
+#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
+#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
+  // Verify that we do not get AsyncCopies because we cannot lower it on gfx942 since we only have 32bit wide loads to lds
+  // COMMON-LABEL: @reject_fp64_pipelining_with_async_copy_gfx942
+  // ASYNC-NOT: ttg.async_copy_global_to_local
+  tt.func @reject_fp64_pipelining_with_async_copy_gfx942(
+    %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32},
+    %B : tensor<32x128xf64, #B>, %lb: i32, %ub: i32, %step: i32) -> tensor<128x128xf64, #C> {
+    %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf64, #C>
+    %loop = scf.for %iv = %lb to %ub step %step iter_args(%prev_c = %c_init) -> (tensor<128x128xf64, #C>) : i32 {
+      %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL>
+      %a = ttg.convert_layout %a_ : tensor<128x32xf64, #AL> -> tensor<128x32xf64, #A>
+      %c = tt.dot %a, %B, %prev_c : tensor<128x32xf64, #A> * tensor<32x128xf64, #B> -> tensor<128x128xf64, #C>
+      scf.yield %c : tensor<128x128xf64, #C>
+    }
+    tt.return %loop: tensor<128x128xf64, #C>
+  }
+}
+
+// -----
+
+#AL = #ttg.blocked<{sizePerThread = [1, 2], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#BL = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#C = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
+#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
+#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // On GFX950 we can use AsyncCopy if sizePerThread >= 2 and it's contiguous because we can load 2 fp64 with one direct to lds instruction
+  // COMMON-LABEL: @pipeline_fp64_with_async_copy_gfx950
+  // ASYNC: ttg.async_copy_global_to_local
+  // ASYNC: tt.load
+  // ASYNC: ttg.async_copy_global_to_local
+  // ASYNC: tt.load
+  tt.func @pipeline_fp64_with_async_copy_gfx950(
+    %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL> {tt.divisibility = 16 : i32, tt.contiguity = 16 : i32},
+    %b_ptr : tensor<32x128x!tt.ptr<f64>, #BL> {tt.divisibility = 16 : i32, tt.contiguity = 2 : i32},
+    %lb: i32, %ub: i32, %step: i32) -> tensor<128x128xf64, #C> {
+    %c_init = arith.constant dense<0.00e+00> : tensor<128x128xf64, #C>
+    %loop = scf.for %iv = %lb to %ub step %step iter_args(%prev_c = %c_init) -> (tensor<128x128xf64, #C>) : i32 {
+      %a_ = tt.load %a_ptr : tensor<128x32x!tt.ptr<f64>, #AL>
+      %a = ttg.convert_layout %a_ : tensor<128x32xf64, #AL> -> tensor<128x32xf64, #A>
+      %b_ = tt.load %b_ptr : tensor<32x128x!tt.ptr<f64>, #BL>
+      %b = ttg.convert_layout %b_ : tensor<32x128xf64, #BL> -> tensor<32x128xf64, #B>
+      %c = tt.dot %a, %b, %prev_c : tensor<128x32xf64, #A> * tensor<32x128xf64, #B> -> tensor<128x128xf64, #C>
+      scf.yield %c : tensor<128x128xf64, #C>
+    }
+    tt.return %loop: tensor<128x128xf64, #C>
+  }
+}

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.cpp

Lines changed: 11 additions & 0 deletions
@@ -1,6 +1,7 @@
 #include "AsyncUtility.h"
 
 #include "Dialect/TritonAMDGPU/IR/Dialect.h"
+#include "TargetInfo.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
@@ -128,4 +129,14 @@ void addLocalLoadNoAliasScope(LLVM::AliasAnalysisOpInterface llLoadOp) {
   llLoadOp.setAliasScopes(aliasScopes);
 }
 
+unsigned
+fitToValidDirectToLdsVecSize(unsigned maxVecSize, unsigned elemBitwidth,
+                             const triton::AMD::TargetInfo &targetInfo) {
+  while (maxVecSize > 0 && !targetInfo.supportsDirectToLdsLoadBitWidth(
+                               maxVecSize * elemBitwidth)) {
+    maxVecSize /= 2;
+  }
+  return maxVecSize;
+}
+
 } // namespace mlir::triton::AMD

third_party/amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h

Lines changed: 8 additions & 0 deletions
@@ -7,6 +7,8 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 
 namespace mlir::triton::AMD {
+class TargetInfo;
+
 // Annotates LocalLoadOps with ttg.amdgpu.syncedByAsyncWait=true if they are
 // synced by an AsyncWait.
 void annotateLocalLoadsSyncedViaAsyncWait(ModuleOp mod);
@@ -39,6 +41,12 @@ void addLocalLoadNoAliasScope(LLVM::AliasAnalysisOpInterface llLoadOp);
 // Attaches the "AsyncCopies" alias scope to llLoadDirectToLdsOp
 void addAsyncCopyAliasScope(LLVM::AliasAnalysisOpInterface llLoadDirectToLdsOp);
 
+// Finds the largest supported vecSize smaller than maxVecSize. Returns 0 if
+// there is none
+unsigned
+fitToValidDirectToLdsVecSize(unsigned maxVecSize, unsigned elemBitwidth,
+                             const triton::AMD::TargetInfo &targetInfo);
+
 } // namespace mlir::triton::AMD
 
 #endif
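
For reference, here is a condensed, hypothetical view of how the two transforms below consume this helper. The wrapper name `canUseDirectToLds` and the variable names are for illustration only; this is a sketch of the contract, not code from the commit.

```cpp
#include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
#include "amd/lib/TritonAMDGPUToLLVM/TargetInfo.h"

// Hypothetical wrapper illustrating the contract: a result of 0 means no
// supported direct-to-lds width fits within the contiguous elements owned by
// a thread, so callers must bail out (register pipelining in StreamPipeline,
// a match failure in CoalesceAsyncCopy).
static bool canUseDirectToLds(unsigned vecSize, unsigned elemBitWidth,
                              const mlir::triton::AMD::TargetInfo &targetInfo) {
  return mlir::triton::AMD::fitToValidDirectToLdsVecSize(
             vecSize, elemBitWidth, targetInfo) != 0;
}
```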

third_party/amd/lib/TritonAMDGPUTransforms/CoalesceAsyncCopy.cpp

Lines changed: 5 additions & 13 deletions
@@ -1,5 +1,6 @@
 #include "TritonAMDGPUToLLVM/TargetUtils.h"
 #include "TritonAMDGPUTransforms/Passes.h"
+#include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
 #include "amd/lib/TritonAMDGPUToLLVM/Utility.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "third_party/amd/include/Analysis/AxisInfoExt.h"
@@ -22,9 +23,8 @@ namespace {
 
 // On gfx9 global and buffer loads directly to shared memory need to write
 // coalesced. This pattern converts the layout of the src, mask and other to
-// ensure the owned data per thread is contigious and does no exceed the
-// supported load vector size. The swizzle pattern is ignored here and is
-// handled when lowering to LLVMIR
+// ensure the owned data per thread is contiguous and does no exceed the
+// supported load vector size.
 struct CoalesceAsyncCopyWrites
     : public OpRewritePattern<ttg::AsyncCopyGlobalToLocalOp> {
   CoalesceAsyncCopyWrites(const triton::AMD::TargetInfo &targetInfo,
@@ -49,12 +49,6 @@ struct CoalesceAsyncCopyWrites
       return rewriter.notifyMatchFailure(copyOp,
                                          "src encoding must be #blocked");
 
-    auto sharedEnc =
-        dyn_cast<ttg::SwizzledSharedEncodingAttr>(dstTy.getEncoding());
-    if (!sharedEnc)
-      return rewriter.notifyMatchFailure(
-          copyOp, "destination encoding must be #SwizzledShared");
-
     // We start from the precomputed contiguity we got from AxisAnalysis.
     unsigned loadContig = 0;
@@ -77,10 +71,8 @@ struct CoalesceAsyncCopyWrites
 
     // Select the largest supported load width equal or smaller than loadContig
     auto elemBitWidth = dstTy.getElementTypeBitWidth();
-    while (loadContig > 0 && !targetInfo.supportsDirectToLdsLoadBitWidth(
-                                 loadContig * elemBitWidth)) {
-      loadContig /= 2;
-    }
+    loadContig =
+        fitToValidDirectToLdsVecSize(loadContig, elemBitWidth, targetInfo);
 
     if (loadContig == 0) {
       return rewriter.notifyMatchFailure(

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 16 additions & 7 deletions
@@ -1,4 +1,5 @@
 #include "TritonAMDGPUTransforms/Passes.h"
+#include "amd/lib/TritonAMDGPUToLLVM/AsyncUtility.h"
 #include "amd/lib/TritonAMDGPUToLLVM/TargetInfo.h"
 #include "third_party/amd/include/Analysis/AxisInfoExt.h"
 #include "triton/Analysis/AxisInfo.h"
@@ -280,7 +281,8 @@ getSharedEncIfAllUsersAreDotEnc(Value loadedValue) {
 
 bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
                                Value alloc,
-                               tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
+                               tt::ModuleAxisInfoAnalysis &axisInfoAnalysis,
+                               const tt::AMD::TargetInfo &targetInfo) {
   // If we have a single buffer we would require another barrier after the
   // local_reads so instead we fall back to pipeline with registers
   // Removing this check will create incorrect IR, see
@@ -289,7 +291,9 @@ bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
     return false;
 
   // Compute the final vecSize we can use for the combination of sourceEncoding
-  // and sharedEncoding. We can only use AsyncCopy if the width is >= 32 bit
+  // and sharedEncoding. We can only use AsyncCopy if the target supports the
+  // requested or a smaller vecSize because we cannot stride when loading
+  // directly to lds
   auto srcTy = cast<RankedTensorType>(loadOp.getPtr().getType());
   auto dstTy = cast<ttg::MemDescType>(alloc.getType());
   auto regLayout = triton::gpu::toLinearLayout(srcTy);
@@ -298,9 +302,11 @@ bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
   auto sharedLayout =
       triton::gpu::toLinearLayout(srcShape, dstTy.getEncoding(), srcShape);
   auto regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
-  unsigned loadContig = regToSharedLayout.getNumConsecutiveInOut();
-  unsigned width = loadContig * dstTy.getElementTypeBitWidth();
-  if (width < 32)
+
+  unsigned vecSize = regToSharedLayout.getNumConsecutiveInOut();
+  unsigned elemBitWidth = dstTy.getElementTypeBitWidth();
+
+  if (fitToValidDirectToLdsVecSize(vecSize, elemBitWidth, targetInfo) == 0)
     return false;
 
   // Checks whether the global pointer's contiguity and mask alignment allows
@@ -354,10 +360,13 @@ createStreamOps(const LoadToInfoMap &loadToInfo, scf::ForOp &forOp,
     Value alloc = triton::createAlloc(forOp, ty, loadOp->getLoc(),
                                       info.sharedEncoding, numBuffers);
     assert(alloc && "Failed to create alloc for the async load.");
+    auto arch = getAMDArch(loadOp->getParentOfType<ModuleOp>());
+    triton::AMD::TargetInfo targetInfo(arch ? arch->str() : "");
 
     // Replace the old load with multi-buffered loads
-    if (useAsyncCopy && canBeConvertedToAsyncLoad(numBuffers, loadOp, alloc,
-                                                  axisInfoAnalysis)) {
+    if (useAsyncCopy &&
+        canBeConvertedToAsyncLoad(numBuffers, loadOp, alloc, axisInfoAnalysis,
+                                  targetInfo)) {
       loadToStreamOp[loadOp] = createAsyncCopy(loadOp, alloc, extractIdx);
     } else {
       loadToStreamOp[loadOp] = createStreamCopy(loadOp, alloc, extractIdx);
