
Commit 2a6bc7e

leonling-ll authored and anmyachev committed
[AMD] Fix lowerLoops: only erase load ops which are converted (#8737)
This change fixes a crash in the AMDGPU pipeline pass: when a LoadOp feeding an AddfOp sits between two dots in a loop, that LoadOp is not streamable, and the pass crashed because it erased a LoadOp that still had uses. The fix iterates over `loadToStreamOps` instead of `loadToInfo`, so only LoadOps that were actually converted to stream ops are erased.
1 parent 9fb66fb · commit 2a6bc7e
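To make the crash scenario concrete, here is a minimal, self-contained C++ sketch (not the actual LowerLoops.cpp code): FakeOp, StreamOpInfo, and the map contents are invented stand-ins for MLIR's Operation and the pass's internal maps; only the shape of the erase loop mirrors the one-line diff below.

#include <cassert>
#include <map>
#include <string>

// Stand-in for an IR op: it only tracks how many other ops still use its result.
struct FakeOp {
  std::string name;
  int numUses = 0;
  void erase() {
    // Mirrors MLIR's invariant: erasing an op that still has uses is an error.
    assert(numUses == 0 && "cannot erase an op that still has uses");
  }
};

struct StreamOpInfo {}; // placeholder for the stream/async ops a load was lowered to

int main() {
  FakeOp streamableLoad{"streamable_load", /*numUses=*/0}; // users already rewired to stream ops
  FakeOp biasLoad{"bias_load", /*numUses=*/1};             // still feeds an arith.addf between the dots

  // Every load the scheduler considered ...
  std::map<FakeOp *, int> loadToInfo = {{&streamableLoad, 0}, {&biasLoad, 0}};
  // ... but only the streamable one was actually converted.
  std::map<FakeOp *, StreamOpInfo> loadToStreamOps = {{&streamableLoad, {}}};
  (void)loadToInfo; // the buggy loop iterated this map and would hit biasLoad

  // The fixed loop erases only converted loads, so the bias load survives untouched.
  for (auto &[load, streamOps] : loadToStreamOps) {
    (void)streamOps;
    load->erase();
  }
  return 0;
}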

2 files changed (+55, -1 lines)

test/TritonGPU/amd/amd-pipeline-chained-dots.mlir

Lines changed: 54 additions & 0 deletions
@@ -160,3 +160,57 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
     tt.return %6 : tensor<128x16xf32, #mma>
   }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [8, 8], warpsPerCTA = [1, 8], order = [0, 1]}>
+#mma = #ttg.amd_mfma<{version = 3, warpsPerCTA = [8, 1], instrShape = [16, 16, 16], isTransposed = true}>
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, ttg.target = "hip:gfx950", "ttg.threads-per-warp" = 64 : i32} {
+  // CHECK-LABEL: tt.func @chained_dots_with_load_bias_in_between
+
+  // Similar to the previous test, but loads a bias tensor between the 2 dots.
+  // We expect the unstreamable load to be kept after pipelining.
+
+  // CHECK: scf.for
+  // CHECK: tt.dot
+  // CHECK: ttg.async_copy_global_to_local
+  // CHECK: tt.dot
+  // CHECK: ttg.async_wait
+  // CHECK: ttg.local_load
+  // CHECK: tt.load
+  // CHECK: scf.yield
+
+  tt.func @chained_dots_with_load_bias_in_between(%arg0: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg1: tensor<256x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>, %arg2: i64 {tt.divisibility = 16 : i32}, %arg3: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg4: i32) -> tensor<256x64xf32, #mma> {
+    %c0_i32 = arith.constant 0 : i32
+    %c1_i32 = arith.constant 1 : i32
+    %c64_i32 = arith.constant 64 : i32
+    %cst = arith.constant dense<0.000000e+00> : tensor<256x64xf32, #mma>
+    %0 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>>
+    %1 = tt.expand_dims %0 {axis = 1 : i32} : tensor<64xi32, #ttg.slice<{dim = 1, parent = #blocked}>> -> tensor<64x1xi32, #blocked>
+    %2 = tt.splat %arg0 : !tt.ptr<f16> -> tensor<64x64x!tt.ptr<f16>, #blocked>
+    %3 = tt.broadcast %1 : tensor<64x1xi32, #blocked> -> tensor<64x64xi32, #blocked>
+    %4 = tt.addptr %2, %3 : tensor<64x64x!tt.ptr<f16>, #blocked>, tensor<64x64xi32, #blocked>
+    %5 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+    %6 = tt.splat %arg3 : !tt.ptr<f16> -> tensor<256x64x!tt.ptr<f16>, #blocked>
+    %7 = scf.for %arg5 = %c0_i32 to %arg4 step %c1_i32 iter_args(%arg6 = %cst) -> (tensor<256x64xf32, #mma>) : i32 {
+      %8 = tt.load %4 : tensor<64x64x!tt.ptr<f16>, #blocked>
+      %9 = ttg.convert_layout %8 : tensor<64x64xf16, #blocked> -> tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>>
+      %10 = tt.dot %arg1, %9, %cst : tensor<256x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<256x64xf32, #mma>
+      %11 = arith.muli %arg5, %c64_i32 : i32
+      %12 = tt.splat %11 : i32 -> tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+      %13 = arith.addi %12, %5 : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+      %14 = tt.expand_dims %13 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
+      %15 = tt.broadcast %14 : tensor<1x64xi32, #blocked> -> tensor<256x64xi32, #blocked>
+      %bias_ptr = tt.addptr %6, %15 : tensor<256x64x!tt.ptr<f16>, #blocked>, tensor<256x64xi32, #blocked>
+      %bias = tt.load %bias_ptr : tensor<256x64x!tt.ptr<f16>, #blocked>
+      %bias_mma = ttg.convert_layout %bias : tensor<256x64xf16, #blocked> -> tensor<256x64xf16, #mma>
+      %bias_f32 = arith.extf %bias_mma : tensor<256x64xf16, #mma> to tensor<256x64xf32, #mma>
+      %dot_bias = arith.addf %10, %bias_f32 : tensor<256x64xf32, #mma>
+      %21 = arith.truncf %dot_bias : tensor<256x64xf32, #mma> to tensor<256x64xf16, #mma>
+      %22 = ttg.convert_layout %21 : tensor<256x64xf16, #mma> -> tensor<256x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>>
+      %23 = tt.dot %22, %9, %arg6 : tensor<256x64xf16, #ttg.dot_op<{opIdx = 0, parent = #mma, kWidth = 4}>> * tensor<64x64xf16, #ttg.dot_op<{opIdx = 1, parent = #mma, kWidth = 4}>> -> tensor<256x64xf32, #mma>
+      scf.yield %23 : tensor<256x64xf32, #mma>
+    }
+    tt.return %7 : tensor<256x64xf32, #mma>
+  }
+}

third_party/amd/lib/TritonAMDGPUTransforms/LowerLoops.cpp

Lines changed: 1 addition & 1 deletion
@@ -718,7 +718,7 @@ void updateSchedule(scf::ForOp &forOp, const LoadToInfoMap &loadToInfo,
   useAsyncCopy, axisInfoAnalysis);
   scheduleStreamOps(loadToStreamOps, schedule, clusters);

-  for (auto [l, _] : loadToInfo) {
+  for (auto [l, _] : loadToStreamOps) {
     schedule.erase(l);
     l->erase();
   }
