
Commit 7562a29

[AMD] Avoid async load to pipeline for less than 32bit load (triton-lang#7250)
We can only use AsyncCopy if the final load width is >= 4 bytes. `triton::canBeConvertedToAsyncLoad` checks that the vecSize of the source is large enough. Additionally, we need to ensure that the register-to-shared layout (blocked + shared) has enough contiguous elements, since we cannot scatter into LDS. Before this PR we would abort compilation instead of falling back to pipelining through registers.
1 parent 5e56853 · commit 7562a29
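For orientation, here is a minimal, self-contained sketch of the eligibility gate this commit introduces. The helper name `asyncCopyEligible` and its parameters are invented for this illustration; the real logic lives in `canBeConvertedToAsyncLoad` in the StreamPipeline.cpp diff below, which derives the contiguous-element count from the register-to-shared linear layout.

// Illustrative sketch only (not Triton's API): mirrors the new gate in
// canBeConvertedToAsyncLoad. Invented names: asyncCopyEligible,
// contiguousElems, elemBitWidth.
#include <cstdio>

// Async copy directly into LDS is only worthwhile when we have more than one
// buffer (a single buffer would require an extra barrier after the
// local_reads) and when each write covers at least 32 contiguous bits,
// because we cannot scatter into LDS.
bool asyncCopyEligible(unsigned numBuffers, unsigned contiguousElems,
                       unsigned elemBitWidth) {
  if (numBuffers <= 1)
    return false;
  return contiguousElems * elemBitWidth >= 32;
}

int main() {
  // i8 tensor whose register and shared orders disagree: 1 contiguous element
  // -> 8 bits -> fall back to pipelining through registers.
  std::printf("%d\n", asyncCopyEligible(2, /*contiguousElems=*/1,
                                        /*elemBitWidth=*/8));
  // 4 contiguous f16 elements -> 64 bits -> async copy is allowed.
  std::printf("%d\n", asyncCopyEligible(2, /*contiguousElems=*/4,
                                        /*elemBitWidth=*/16));
  return 0;
}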

File tree: 2 files changed, +74 −7 lines

test/TritonGPU/loop-pipeline-hip.mlir

Lines changed: 39 additions & 0 deletions
@@ -700,3 +700,42 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.targ
     tt.return %75#0 : tensor<128x256xf32, #blocked3>
   }
 }
+
+// -----
+
+// Check we do not get AsyncCopyGlobalToLocal because the vec width will be < 32bit.
+// The order of the shared memory will be getMemoryOrder(#linear1) == [0, 1]
+// which differs from the order [1, 0] of the blocked layout. Since we have to
+// gather into lds with AsyncCopyGlobalToLocal we have to fallback to registers
+
+// COMMON-LABEL: pipeline_scale_memory_order
+// COMMON-NOT: ttg.async_copy_global_to_local
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [64, 1], warpsPerCTA = [8, 1], order = [1, 0]}>
+#linear = #ttg.linear<{register = [[0, 4], [16, 0], [32, 0], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[0, 0], [0, 0], [0, 0]], block = []}>
+#linear1 = #ttg.linear<{register = [[0, 4], [128, 0], [256, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [0, 1], [0, 2]], warp = [[16, 0], [32, 0], [64, 0]], block = []}>
+#mma = #ttg.amd_mfma<{versionMajor = 4, versionMinor = 0, warpsPerCTA = [1, 8], instrShape = [16, 16], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @pipeline_scale_memory_order(%arg0: !tt.ptr<i8> {tt.divisibility = 16 : i32}, %arg1: i64 {tt.divisibility = 16 : i32}, %arg2: tensor<128x256xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, %arg3: tensor<128x512xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, %arg4: tensor<128x512x!tt.ptr<f32>, #mma>, %arg5: tensor<512x8x!tt.ptr<i8>, #blocked>) attributes {noinline = false} {
+    %cst = arith.constant dense<127> : tensor<128x8xi8, #linear>
+    %cst_0 = arith.constant dense<8> : tensor<512x8xi32, #blocked>
+    %c256_i64 = arith.constant 256 : i64
+    %c0_i64 = arith.constant 0 : i64
+    %cst_1 = arith.constant dense<0.000000e+00> : tensor<128x512xf32, #mma>
+    %0 = tt.make_range {end = 8 : i32, start = 0 : i32} : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %1 = arith.extsi %0 : tensor<8xi32, #ttg.slice<{dim = 0, parent = #blocked}>> to tensor<8xi64, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<8xi64, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x8xi64, #blocked>
+    %3 = tt.splat %arg0 : !tt.ptr<i8> -> tensor<1x8x!tt.ptr<i8>, #blocked>
+    %4 = tt.addptr %3, %2 : tensor<1x8x!tt.ptr<i8>, #blocked>, tensor<1x8xi64, #blocked>
+    %5 = tt.broadcast %4 : tensor<1x8x!tt.ptr<i8>, #blocked> -> tensor<512x8x!tt.ptr<i8>, #blocked>
+    %6:2 = scf.for %arg6 = %c0_i64 to %arg1 step %c256_i64 iter_args(%arg7 = %cst_1, %arg8 = %5) -> (tensor<128x512xf32, #mma>, tensor<512x8x!tt.ptr<i8>, #blocked>) : i64 {
+      %7 = tt.load %arg8 : tensor<512x8x!tt.ptr<i8>, #blocked>
+      %8 = ttg.convert_layout %7 : tensor<512x8xi8, #blocked> -> tensor<512x8xi8, #linear1>
+      %9 = tt.dot_scaled %arg2 scale %cst, %arg3 scale %8, %arg7 lhs = e4m3 rhs = e2m1 {fastMath = true} : tensor<128x256xf8E4M3FN, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 16}>>, tensor<128x8xi8, #linear> * tensor<128x512xi8, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 16}>>, tensor<512x8xi8, #linear1> -> tensor<128x512xf32, #mma>
+      %10 = tt.addptr %arg8, %cst_0 : tensor<512x8x!tt.ptr<i8>, #blocked>, tensor<512x8xi32, #blocked>
+      scf.yield %9, %10 : tensor<128x512xf32, #mma>, tensor<512x8x!tt.ptr<i8>, #blocked>
+    }
+    tt.store %arg4, %6#0 : tensor<128x512x!tt.ptr<f32>, #mma>
+    tt.return
+  }
+}
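To connect this test to the new check: the loaded scale tensor has i8 elements, and because the blocked layout's order [1, 0] disagrees with the shared memory order [0, 1], a thread cannot write 4 consecutive i8 elements into LDS in one go. The resulting width (contiguous elements × 8 bits) therefore stays below 32 bits, so the pipeliner now keeps this load in registers instead of emitting ttg.async_copy_global_to_local. The exact contiguous-element count is inferred from the test comment above, not recomputed here.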

third_party/amd/lib/TritonAMDGPUTransforms/StreamPipeline.cpp

Lines changed: 35 additions & 7 deletions
@@ -706,13 +706,44 @@ void scheduleRemainingToLastStage(int numStages,
     schedule.insert(op, lastStage, cluster);
 }
 
+namespace {
+bool canBeConvertedToAsyncLoad(unsigned numBuffers, tt::LoadOp loadOp,
+                               Value alloc,
+                               tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
+  // If we have a single buffer we would require another barrier after the
+  // local_reads so instead we fall back to pipeline with registers
+  // Removing this check will create incorrect IR, see
+  // MembarUtility.h:membarFilter
+  if (numBuffers <= 1)
+    return false;
+
+  // Compute the final vecSize we can use for the combination of sourceEncoding
+  // and sharedEncoding. We can only use AsyncCopy if the width is >= 32 bit
+  auto srcTy = cast<RankedTensorType>(loadOp.getPtr().getType());
+  auto dstTy = cast<ttg::MemDescType>(alloc.getType());
+  auto shape = srcTy.getShape();
+  auto regLayout = triton::gpu::toLinearLayout(shape, srcTy.getEncoding());
+  auto sharedLayout = triton::gpu::toLinearLayout(shape, dstTy.getEncoding());
+  auto regToSharedLayout = regLayout.invertAndCompose(sharedLayout);
+  unsigned loadContig = regToSharedLayout.getNumConsecutiveInOut();
+  unsigned width = loadContig * dstTy.getElementTypeBitWidth();
+  if (width < 32)
+    return false;
+
+  // Checks whether the global pointer's contiguity and mask alignment allows
+  // for at least 32 bit wide loads
+  return triton::canBeConvertedToAsyncLoad(loadOp, axisInfoAnalysis);
+}
+} // namespace
+
 // Convert load ops into shared memory allocation loads and apply
 // multi-buffering based on the required number of buffers.
 SmallVector<std::pair<Operation *, Value>> createAndScheduleStreamOps(
     const llvm::MapVector<Operation *, LoadInfo> &loadToInfo, scf::ForOp &forOp,
     const int &numBuffers, bool useAsyncCopy, tt::CoarseSchedule &schedule,
     const int stages[SCHED_SIZE],
-    const std::array<tt::CoarseSchedule::Cluster, SCHED_SIZE> &clusters) {
+    const std::array<tt::CoarseSchedule::Cluster, SCHED_SIZE> &clusters,
+    tt::ModuleAxisInfoAnalysis &axisInfoAnalysis) {
   IRRewriter builder(forOp.getContext());
   Attribute sharedMemorySpace =
       ttg::SharedMemorySpaceAttr::get(forOp.getContext());
@@ -762,11 +793,8 @@ SmallVector<std::pair<Operation *, Value>> createAndScheduleStreamOps(
   // Replace tt.loads with async copies or stream copies
   for (auto &[op, alloc] : loadToAllocs) {
     if (auto loadOp = dyn_cast<tt::LoadOp>(op)) {
-      // If we have a single buffer we would require another barrier after the
-      // local_reads so instead we fall back to pipeline with registers
-      // Removing this check will create incorrect IR, see
-      // MembarUtility.h:membarFilter
-      if (useAsyncCopy && numBuffers > 1) {
+      if (useAsyncCopy && canBeConvertedToAsyncLoad(numBuffers, loadOp, alloc,
+                                                    axisInfoAnalysis)) {
         createAndScheduleAsyncCopy(loadOp, alloc, extractIdx, forOp, schedule,
                                    stages, clusters);
       } else {
@@ -820,7 +848,7 @@ LogicalResult preprocessLoopAndBuildSchedule(scf::ForOp &forOp, int numStages,
   // Convert the loads into shared memory allocations and loads from them.
   SmallVector<std::pair<Operation *, Value>> sharedMemAllocs =
       createAndScheduleStreamOps(*loadToInfo, forOp, numBuffers, useAsyncCopy,
-                                 schedule, stages, clusters);
+                                 schedule, stages, clusters, axisInfoAnalysis);
 
   scheduleDependencies(schedule, forOp, numStages);
   LLVM_DEBUG({
