
Commit 8b792c8

[AMD] Redesign stream pipeliner LDS layout selection logic (#8053)
This commit adapts the LDS layout selection logic in the stream pipeliner so that we pick a common swizzled shared memory layout whose vecSize equals the maximum kWidth across all users.
1 parent fb68aea commit 8b792c8
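The selection rule described in the commit message can be summarized with a small sketch. This is a minimal, hypothetical illustration in plain C++, not the actual pass code: the `LdsUser` struct and `pickCommonVecSize` helper are made-up names for this example only; in the real pass the kWidth values come from the `#ttg.dot_op` layouts of the shared-memory users.

```cpp
#include <algorithm>
#include <vector>

// Hypothetical description of one consumer of the pipelined load: a tt.dot
// operand (reached directly or through tt.trans) with its own kWidth.
struct LdsUser {
  unsigned kWidth; // contiguous elements per thread along the K dimension
};

// Sketch of the rule: all users read from the same LDS buffer, so the common
// swizzled layout uses vecSize = max kWidth across users. For the tests below
// (users with kWidth = 4 and kWidth = 8) this yields vec = 8.
unsigned pickCommonVecSize(const std::vector<LdsUser> &users) {
  unsigned vecSize = 1;
  for (const LdsUser &user : users)
    vecSize = std::max(vecSize, user.kWidth);
  return vecSize;
}
```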

File tree

7 files changed: +180 −63 lines changed

Lines changed: 116 additions & 0 deletions
@@ -0,0 +1,116 @@
+// RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize | FileCheck %s
+
+// Pick a common shared memory layout with vec = max kWidth of all users.
+// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
+// CHECK-NOT: #ttg.swizzled_shared
+// CHECK{LITERAL}: #smem = #ttg.shared_memory
+// CHECK-LABEL: test_lds_layout_selection
+
+// CHECK: %[[ALLOC:.+]] = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
+// CHECK: %[[MEMDESC_IDX:.+]] = ttg.memdesc_index %[[ALLOC]]
+
+// CHECK: scf.for {{.+}} iter_args({{.*}}, %[[MEMDESC_IDX_ITER:.+]] = %[[MEMDESC_IDX]]) -> ({{.+}})
+// CHECK: %[[LOAD:.+]] = tt.load {{.+}} : tensor<64x16x!tt.ptr<f16>, #blocked>
+// CHECK: %[[LOCAL_LOAD_TRANS:.+]] = ttg.local_load %[[MEMDESC_IDX_ITER]] : {{.+}} -> tensor<64x16xf16, #linear>
+// CHECK: %[[LOCAL_LOAD_DIRECT:.+]] = ttg.local_load %[[MEMDESC_IDX_ITER]] : {{.+}} -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
+// CHECK: tt.dot {{.+}}, %[[LOCAL_LOAD_DIRECT]], {{.+}}
+// CHECK: %[[TRANS:.+]] = tt.trans %[[LOCAL_LOAD_TRANS]] {{.+}} : {{.+}} -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
+// CHECK: tt.dot {{.+}}, %[[TRANS]], {{.+}}
+// CHECK: %[[MEMDESC_IDX:.+]] = ttg.memdesc_index %[[ALLOC]]
+// CHECK: ttg.local_store %[[LOAD]], %[[MEMDESC_IDX]]
+// CHECK: scf.yield
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @test_lds_layout_selection(
+      %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
+      %out0 : tensor<128x16x!tt.ptr<f32>, #blocked>,
+      %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
+  ) {
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x16xf32, #mma1>
+    %cst_1 = arith.constant dense<0.693147182> : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
+    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c8_i32 = arith.constant 8 : i32
+
+    %0:2 = scf.for %arg1 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg2 = %cst_0, %arg3 = %cst_3) -> (tensor<128x16xf32, #mma1>, tensor<128x64xf32, #mma>) : i32 {
+      %1 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
+      %3 = ttg.convert_layout %1 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
+      %4 = tt.dot %cst_1, %3, %arg2 : tensor<128x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<64x16xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<128x16xf32, #mma1>
+      %5 = tt.trans %2 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+      %6 = tt.dot %cst_2, %5, %arg3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
+      scf.yield %4, %6 : tensor<128x16xf32, #mma1>, tensor<128x64xf32, #mma>
+    }
+
+    %7 = ttg.convert_layout %0#0 : tensor<128x16xf32, #mma1> -> tensor<128x16xf32, #blocked>
+    %8 = ttg.convert_layout %0#1 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
+    tt.store %out0, %7 : tensor<128x16x!tt.ptr<f32>, #blocked>
+    tt.store %out1, %8 : tensor<128x64x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}
+// -----
+
+// Verify that a common shared memory layout is chosen for users with different kWidth and opIdx.
+// CHECK{LITERAL}: #shared = #ttg.swizzled_shared<{vec = 8, perPhase = 2, maxPhase = 8, order = [0, 1]}>
+// CHECK-NOT: #ttg.swizzled_shared
+// CHECK{LITERAL}: #smem = #ttg.shared_memory
+// CHECK-LABEL: test_lds_layout_selection_different_opIdx
+
+// CHECK: %[[ALLOC:.+]] = ttg.local_alloc : () -> !ttg.memdesc<1x64x16xf16, #shared, #smem, mutable>
+// CHECK: %[[MEMDESC_IDX:.+]] = ttg.memdesc_index %[[ALLOC]]
+
+// CHECK: scf.for {{.+}} iter_args({{.*}}, %[[MEMDESC_IDX_ITER:.+]] = %[[MEMDESC_IDX]]) -> ({{.+}})
+// CHECK: %[[LOAD:.+]] = tt.load {{.+}} : tensor<64x16x!tt.ptr<f16>, #blocked>
+// CHECK: %[[LOCAL_LOAD_TRANS:.+]] = ttg.local_load %[[MEMDESC_IDX_ITER]] : {{.+}} -> tensor<64x16xf16, #linear>
+// CHECK: %[[LOCAL_LOAD_DIRECT:.+]] = ttg.local_load %[[MEMDESC_IDX_ITER]] : {{.+}} -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+// CHECK: tt.dot %[[LOCAL_LOAD_DIRECT]], {{.+}}
+// CHECK: %[[TRANS:.+]] = tt.trans %[[LOCAL_LOAD_TRANS]] {{.+}} : {{.+}} -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 8}>>
+// CHECK: tt.dot {{.+}}, %[[TRANS]], {{.+}}
+// CHECK: %[[MEMDESC_IDX:.+]] = ttg.memdesc_index %[[ALLOC]]
+// CHECK: ttg.local_store %[[LOAD]], %[[MEMDESC_IDX]]
+// CHECK: scf.yield
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [64, 1], warpsPerCTA = [1, 4], order = [0, 1]}>
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [0, 4], [32, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0], [0, 8]], warp = [[0, 0], [0, 0]], block = []}>
+#mma = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#mma1 = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [16, 16], isTransposed = true}>
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  tt.func public @test_lds_layout_selection_different_opIdx(
+      %arg0: tensor<64x16x!tt.ptr<f16>, #blocked>,
+      %out0 : tensor<64x64x!tt.ptr<f32>, #blocked>,
+      %out1 : tensor<128x64x!tt.ptr<f32>, #blocked>
+  ) {
+    %cst_0 = arith.constant dense<0.000000e+00> : tensor<64x64xf32, #mma1>
+    %cst_1 = arith.constant dense<0.693147182> : tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>>
+    %cst_2 = arith.constant dense<0.581374812> : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>>
+    %cst_3 = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c8_i32 = arith.constant 8 : i32
+
+    %0:2 = scf.for %arg1 = %c0_i32 to %c8_i32 step %c1_i32 iter_args(%arg2 = %cst_0, %arg3 = %cst_3) -> (tensor<64x64xf32, #mma1>, tensor<128x64xf32, #mma>) : i32 {
+      %1 = tt.load %arg0 : tensor<64x16x!tt.ptr<f16>, #blocked>
+      %2 = ttg.convert_layout %1 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #linear>
+      %3 = ttg.convert_layout %1 : tensor<64x16xf16, #blocked> -> tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>>
+      %4 = tt.dot %3, %cst_1, %arg2 : tensor<64x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma1, kWidth = 4}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma1, kWidth = 4}>> -> tensor<64x64xf32, #mma1>
+      %5 = tt.trans %2 {order = array<i32: 1, 0>} : tensor<64x16xf16, #linear> -> tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>>
+      %6 = tt.dot %cst_2, %5, %arg3 : tensor<128x16xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 8}>> * tensor<16x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 8}>> -> tensor<128x64xf32, #mma>
+      scf.yield %4, %6 : tensor<64x64xf32, #mma1>, tensor<128x64xf32, #mma>
+    }
+
+    %7 = ttg.convert_layout %0#0 : tensor<64x64xf32, #mma1> -> tensor<64x64xf32, #blocked>
+    %8 = ttg.convert_layout %0#1 : tensor<128x64xf32, #mma> -> tensor<128x64xf32, #blocked>
+    tt.store %out0, %7 : tensor<64x64x!tt.ptr<f32>, #blocked>
+    tt.store %out1, %8 : tensor<128x64x!tt.ptr<f32>, #blocked>
+    tt.return
+  }
+}

test/TritonGPU/amd/amd-stream-loop-assume.mlir

Lines changed: 6 additions & 6 deletions
@@ -1,14 +1,14 @@
 // RUN: triton-opt %s -split-input-file -tritonamdgpu-stream-pipeline="num_stages=2" -canonicalize | FileCheck %s

 // matmul: 128x32 @ 32x128 -> 128x128
-#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
-#BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #ALs0 = #ttg.slice<{parent=#AL, dim=0}>
 #BLs0 = #ttg.slice<{parent=#BL, dim=0}>
 #BLs1 = #ttg.slice<{parent=#BL, dim=1}>
-#C = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
-#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
-#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>
+#C = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth = 4}>
+#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth = 4}>

 // CHECK-LABEL: tt.func @assume_matmul
 // CHECK-COUNT-2: tt.load
@@ -27,7 +27,7 @@
 // CHECK: tt.dot
 // CHECK-NOT: tt.dot

-module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
   tt.func @assume_matmul(%lb : index, %ub : index, %step : index,
                          %A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
                          %B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> {

test/TritonGPU/amd/amd-stream-prefetch.mlir

Lines changed: 6 additions & 6 deletions
@@ -4,14 +4,14 @@
 // RUN: triton-opt %s -tritonamdgpu-stream-pipeline="num_stages=2 local_prefetch=1" -canonicalize | FileCheck %s --check-prefixes=LOCAL_1

 // matmul: 128x32 @ 32x128 -> 128x128
-#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
-#BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [1, 32], warpsPerCTA = [4, 1], order = [1, 0]}>
+#AL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#BL = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
 #ALs0 = #ttg.slice<{parent=#AL, dim=0}>
 #BLs0 = #ttg.slice<{parent=#BL, dim=0}>
 #BLs1 = #ttg.slice<{parent=#BL, dim=1}>
-#C = #ttg.nvidia_mma<{versionMajor = 2, warpsPerCTA = [4, 1], instrShape = [16, 8]}>
-#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth=2}>
-#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth=2}>
+#C = #ttg.amd_mfma<{version = 4, warpsPerCTA = [4, 1], instrShape = [32, 32], isTransposed = true}>
+#A = #ttg.dot_op<{opIdx = 0, parent = #C, kWidth = 4}>
+#B = #ttg.dot_op<{opIdx = 1, parent = #C, kWidth = 4}>

 // An extra register buffer for global loads.
 // GLOBAL_1-LABEL: tt.func @matmul_loop
@@ -74,7 +74,7 @@
 // LOCAL_1: tt.dot
 // LOCAL_1-NOT: tt.dot

-module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32} {
+module attributes {"ttg.num-warps" = 4 : i32, "ttg.num-ctas" = 1 : i32, ttg.target = "hip:gfx942", "ttg.threads-per-warp" = 64 : i32} {
   tt.func @matmul_loop(%lb : index, %ub : index, %step : index,
                        %A : !tt.ptr<f16> {tt.divisibility = 16 : i32},
                        %B : !tt.ptr<f16> {tt.divisibility = 16 : i32}) -> tensor<128x128xf32, #C> {

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 22 deletions
@@ -707,28 +707,6 @@ bool isChainDotHead(tt::DotOpInterface dotOp, unsigned opIdx) {
   return false;
 }

-bool hasTransInDefChain(tt::DotOpInterface dotOp, unsigned opIdx) {
-  auto isInSameRegion = [&dotOp](Operation *op) {
-    return op->getParentRegion() == dotOp->getParentRegion();
-  };
-
-  BackwardSliceOptions bwdOpt;
-  bwdOpt.omitBlockArguments = true;
-  bwdOpt.filter = isInSameRegion;
-  SetVector<Operation *> bwdSlices;
-  Operation *dotOperand = (opIdx == 0) ? dotOp.getA().getDefiningOp()
-                                       : dotOp.getB().getDefiningOp();
-
-  if (!dotOperand)
-    return false;
-  (void)getBackwardSlice(dotOperand, &bwdSlices, bwdOpt);
-  if (llvm::find_if(bwdSlices, [](Operation *op) {
-        return isa<tt::TransOp>(op);
-      }) != bwdSlices.end())
-    return true;
-  return false;
-}
-
 bool isChainDotTail(tt::DotOpInterface dotOp) {
   auto isInSameRegion = [&dotOp](Operation *op) {
     return op->getParentRegion() == dotOp->getParentRegion();

third_party/amd/lib/TritonAMDGPUToLLVM/Utility.h

Lines changed: 0 additions & 4 deletions
@@ -115,10 +115,6 @@ bool isUsedByDotScaledOp(Operation *op);
 // in the same region
 bool isChainDotHead(mlir::triton::DotOpInterface dotOp, unsigned opIdx = 0);

-// Check if given operand of this tt.dot is the result of a tt.trans
-// in the same region
-bool hasTransInDefChain(mlir::triton::DotOpInterface dotOp, unsigned opIdx);
-
 // Check if the opA of this tl.dot is the result of another tl.dot
 // in the same region
 bool isChainDotTail(mlir::triton::DotOpInterface dotOp);

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 0 additions & 13 deletions
@@ -15,7 +15,6 @@

 namespace tt = mlir::triton;
 namespace ttg = mlir::triton::gpu;
-using ::mlir::LLVM::AMD::hasTransInDefChain;
 using ::mlir::LLVM::AMD::isChainDotHead;
 using ::mlir::LLVM::AMD::isChainDotTail;
 using ::mlir::LLVM::AMD::scaleDotElemTypeToMLIRType;
@@ -563,18 +562,6 @@ class BlockedToMFMA : public OpRewritePattern<tt::DotOp> {
     if (is16BitElemTy && isDotChainTail) {
       kWidth = 4;
     }
-    // For FA bwd kernel (detected using hasTransInDefChain), depending on
-    // whether the dot is a head or tail in the chain, we adjust the kWidth
-    // accordingly. This will enable us to create the same shared encoding per
-    // pair of tt.dot ops that both use the same tt.load result, one directly
-    // and one via tt.trans, later in the pass pipeline.
-    if (is16BitElemTy && hasTransInDefChain(dotOp, 1u)) {
-      if (isChainDotHead(dotOp)) {
-        kWidth = 4;
-      } else if (isDotChainTail) {
-        kWidth = 8;
-      }
-    }

     Value newDot;
     if (withScale) {
