
Commit e7072a3

[Pipeliner] Make loop flattening and scheduler more robust (#5902)
Loop flattening: don't forward all outer-loop results through the generated epilogue, to avoid creating dependencies on the prologue. Scheduling: overwrite the stage and cluster of an op if it is requested to be scheduled earlier than previously recorded. This can sometimes happen due to the order in which certain ops have their dependencies scheduled; without the overwrite, the generated schedule is guaranteed to be invalid.
1 parent 28396a7 · commit e7072a3
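As a reading aid for the scheduler half of this change, here is a minimal, self-contained sketch of the "earlier wins" insertion rule. It models the schedule with a plain std::map keyed by op name and an integer cluster index; MiniSchedule and its members are illustrative names, not the Triton API, and the real CoarseSchedule::insertMinimum decides which cluster is earlier by walking its cluster list rather than comparing integers.

// Illustrative sketch only: the "earlier wins" rule, with a std::map and an
// integer cluster index standing in for CoarseSchedule's cluster list.
#include <iostream>
#include <map>
#include <string>
#include <utility>

struct MiniSchedule {
  // op name -> (stage, cluster index); smaller means earlier.
  std::map<std::string, std::pair<int, int>> opToStageAndCluster;

  // Returns true if the op was newly inserted or moved earlier.
  bool insertMinimum(const std::string &op, int stage, int cluster) {
    auto res = opToStageAndCluster.insert({op, {stage, cluster}});
    if (res.second)
      return true; // first request for this op
    auto &[existingStage, existingCluster] = res.first->second;
    if (stage < existingStage) { // an earlier stage always overwrites
      existingStage = stage;
      existingCluster = cluster;
      return true;
    }
    if (stage > existingStage) // a later stage never does
      return false;
    if (cluster < existingCluster) { // same stage: earlier cluster wins
      existingCluster = cluster;
      return true;
    }
    return false;
  }
};

int main() {
  MiniSchedule sched;
  sched.insertMinimum("load", /*stage=*/2, /*cluster=*/1);
  // A later dependency walk asks for an earlier stage; the entry moves.
  bool moved = sched.insertMinimum("load", /*stage=*/0, /*cluster=*/3);
  std::cout << std::boolalpha << moved << '\n'; // prints: true
}

In the diff below, insertDepsOfOp keeps its old insert-if-absent behavior when insertIfEarlier is false; passing true lets a dependency walk pull an already-scheduled op to an earlier stage or cluster, which is what keeps the generated schedule valid.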

File tree

6 files changed: +180, -30 lines changed

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 2 additions & 19 deletions
@@ -106,27 +106,10 @@ class CoarseSchedule {
     return true;
   }
 
-  void insertMinimum(Operation *op, int stage, Cluster cluster) {
-    auto res = opToStageAndCluster.insert({op, {stage, cluster}});
-    if (res.second) {
-      return;
-    }
-    auto &[existingStage, existingCluster] = res.first->second;
-    existingStage = std::min(stage, existingStage);
-
-    // If existingCluster is reachable from cluster,
-    // then cluster is earlier in the list
-    auto it = cluster;
-    for (auto it = cluster; it != clusters.end(); ++it) {
-      if (it == existingCluster) {
-        existingCluster = cluster;
-        return;
-      }
-    }
-  }
+  bool insertMinimum(Operation *op, int stage, Cluster cluster);
 
   bool insertDepsOfOp(Operation *op, int stage, CoarseSchedule::Cluster cluster,
-                      bool includeArg);
+                      bool includeArg, bool insertIfEarlier = false);
 
   void erase(Operation *op) { opToStageAndCluster.erase(op); }
 
lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp

Lines changed: 20 additions & 7 deletions
@@ -800,25 +800,38 @@ static void fuseOneLevel(LoopNestNode *parent, mlir::DominanceInfo &domInfo) {
   //   if T == len_j0 + len_j1 + ... + len_jN - N - 1:
   //     epilogue(i)
   Logue &epilogue = logues.back();
+
+  // The only possible use of an epilogue output is the yield.
+  auto outerYield = cast<scf::YieldOp>(outer.getBody()->getTerminator());
+  SmallVector<Value> usedIterArgs;
+  for (Value output : epilogue.getOutputs()) {
+    for (OpOperand &use : output.getUses()) {
+      if (use.getOwner() == outerYield) {
+        usedIterArgs.push_back(fused.getRegionIterArgs().drop_front(
+            outerArgsStartIdx)[use.getOperandNumber()]);
+      }
+    }
+  }
+
   auto epilogueCond =
       b.create<arith::CmpIOp>(arith::CmpIPredicate::eq, T,
                               b.create<arith::SubIOp>(innerLen, intTyCst(1)));
   auto epilogueIf =
-      b.create<scf::IfOp>(outer.getYieldedValues().getTypes(), epilogueCond);
+      b.create<scf::IfOp>(epilogue.getOutputTypes(), epilogueCond);
 
   Block *thenBlock = b.createBlock(&epilogueIf.getThenRegion());
   epilogue.moveBefore(thenBlock, thenBlock->end());
 
   b.setInsertionPointToEnd(thenBlock);
-  b.create<scf::YieldOp>(outer.getYieldedValues());
-
+  b.create<scf::YieldOp>(epilogue.getOutputs());
   b.createBlock(&epilogueIf.getElseRegion());
-  b.create<scf::YieldOp>(fused.getRegionIterArgs().slice(
-      outerArgsStartIdx, outer.getNumRegionIterArgs()));
+  b.create<scf::YieldOp>(usedIterArgs);
+  epilogue.replaceAllUsesWith(epilogueIf.getResults(),
+                              epilogueIf.getThenRegion());
 
   // Finally, create the yield of the fused loop.
   SmallVector<Value> outerOuts{T, i};
-  llvm::append_range(outerOuts, epilogueIf.getResults());
+  llvm::append_range(outerOuts, outerYield.getOperands());
   for (scf::IfOp bodyIf : bodyIfs)
     outerOuts.push_back(/*jk=*/bodyIf.getResult(0));
   for (auto [bodyIf, loop] : llvm::zip(bodyIfs, innerLoops)) {
@@ -831,7 +844,7 @@ static void fuseOneLevel(LoopNestNode *parent, mlir::DominanceInfo &domInfo) {
   }
 
   b.setInsertionPointToEnd(fused.getBody());
-  auto outerYield = b.create<scf::YieldOp>(outerOuts);
+  b.create<scf::YieldOp>(outerOuts);
   outer.replaceAllUsesWith(
       fused.getResults().slice(outerArgsStartIdx, outer.getNumResults()));
 
lib/Dialect/TritonGPU/Transforms/LoopScheduling.cpp

Lines changed: 5 additions & 2 deletions
@@ -174,14 +174,17 @@ void scheduleDistanceOneDependencies(scf::ForOp forOp,
             // Exception: Schedule loads with a distance of 1 together
             // with the current op.
             schedule.insertIfAbsent(defOp, stage, cluster);
-            schedule.insertDepsOfOp(defOp, stage, cluster, true);
+            schedule.insertDepsOfOp(defOp, stage, cluster,
+                                    /*includeArg=*/true,
+                                    /*insertIfEarlier=*/true);
           } else {
             if (dist1Cluster.count(&cluster) == 0) {
              dist1Cluster[&cluster] = schedule.clusters.newBefore(cluster);
            }
            schedule.insertIfAbsent(defOp, stage + 1, dist1Cluster[&cluster]);
            schedule.insertDepsOfOp(defOp, stage + 1, dist1Cluster[&cluster],
-                                   true);
+                                   /*includeArg=*/true,
+                                   /*insertIfEarlier=*/true);
          }
        }
      }

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 44 additions & 2 deletions
@@ -14,9 +14,51 @@ using namespace mlir;
 namespace tt = mlir::triton;
 namespace ttg = mlir::triton::gpu;
 
+bool tt::CoarseSchedule::insertMinimum(Operation *op, int stage,
+                                       Cluster cluster) {
+  auto res = opToStageAndCluster.insert({op, {stage, cluster}});
+  if (res.second) {
+    return true;
+  }
+
+  auto &[existingStage, existingCluster] = res.first->second;
+
+  // Always insert if the stage is earlier.
+  if (stage < existingStage) {
+    existingStage = stage;
+    existingCluster = cluster;
+    return true;
+  }
+
+  // If the stage is later, no change.
+  if (stage > existingStage) {
+    return false;
+  }
+
+  // If existingCluster is reachable from cluster,
+  // then cluster is earlier in the list
+  auto it = cluster;
+  for (auto it = cluster; it != clusters.end(); ++it) {
+    if (it == existingCluster) {
+      existingCluster = cluster;
+      return true;
+    }
+  }
+
+  // Didn't change the cluster.
+  return false;
+}
+
 bool tt::CoarseSchedule::insertDepsOfOp(Operation *op, int stage,
                                         tt::CoarseSchedule::Cluster cluster,
-                                        bool includeArg) {
+                                        bool includeArg, bool insertIfEarlier) {
+  auto tryInsert = [&](Operation *op, int stage,
+                       tt::CoarseSchedule::Cluster cluster) {
+    if (!insertIfEarlier)
+      return insertIfAbsent(op, stage, cluster);
+    return insertMinimum(op, stage, cluster);
+  };
+
   bool inserted = false;
   for (Value operand : getNestedOperands(op)) {
     Value v = operand;
@@ -35,7 +77,7 @@ bool tt::CoarseSchedule::insertDepsOfOp(Operation *op, int stage,
     }
     Operation *defOp = v.getDefiningOp();
     if (defOp && defOp->getBlock() == op->getBlock()) {
-      if (insertIfAbsent(defOp, stage, cluster)) {
+      if (tryInsert(defOp, stage, cluster)) {
        inserted = true;
        insertDepsOfOp(defOp, stage, cluster, includeArg);
      }

test/TritonGPU/fuse-nested-loops.mlir

Lines changed: 27 additions & 0 deletions
@@ -540,3 +540,30 @@ tt.func @sink_prologue_to_epilogue(%ub: i32) {
 
   tt.return
 }
+
+// -----
+
+// CHECK-LABEL: @prologue_output
+tt.func @prologue_output(%ub: i32) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+
+  // CHECK: scf.for
+  %0 = scf.for %i = %c0_i32 to %ub step %c1_i32 iter_args(%k = %c0_i32) -> i32 : i32 {
+    // CHECK: scf.if
+    // CHECK: {increment}
+    %next = arith.addi %k, %c1_i32 {increment} : i32
+    // CHECK: scf.if
+    scf.for %j = %c0_i32 to %ub step %c1_i32 : i32 {
+      // CHECK-NEXT: "body"
+      "body"(%i, %j) : (i32, i32) -> ()
+    }
+    // CHECK: scf.if {{%[0-9]+}} {
+    // CHECK-NEXT: "epilogue"
+    "epilogue"(%i) : (i32) -> ()
+    // CHECK-NEXT: } else {
+    scf.yield %next : i32
+  } {"ttg.always-fuse"}
+
+  tt.return
+}

test/TritonGPU/loop-schedule.mlir

Lines changed: 82 additions & 0 deletions
@@ -62,6 +62,88 @@ tt.func @matmul_loop_load_acc(%lb : index, %ub : index, %step : index,
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [4, 8], warpsPerCTA = [4, 1], order = [1, 0]}>
+#blocked1 = #ttg.blocked<{sizePerThread = [1, 1], threadsPerWarp = [1, 32], warpsPerCTA = [1, 4], order = [1, 0]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [4, 1], instrShape = [16, 256, 16]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+
+module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.target = "cuda:90", "ttg.threads-per-warp" = 32 : i32} {
+
+// CHECK-LABEL: @fused_loop
+tt.func public @fused_loop(%arg5: !tt.ptr<f16> {tt.divisibility = 16 : i32}, %arg7: i32 {tt.divisibility = 16 : i32}) {
+  %c10_i32 = arith.constant 10 : i32
+  %false = arith.constant false
+  %0 = ub.poison : !tt.tensordesc<tensor<64x256xf16>>
+  %cst = arith.constant dense<0> : tensor<128x1xi64, #blocked>
+  %c-1_i32 = arith.constant -1 : i32
+  %c1_i32 = arith.constant 1 : i32
+  %c0_i32 = arith.constant 0 : i32
+  %c64_i32 = arith.constant 64 : i32
+  %c1_i64 = arith.constant 1 : i64
+  %cst_0 = arith.constant dense<0.000000e+00> : tensor<128x256xf32, #mma>
+
+  %1 = tt.make_range {end = 64 : i32, start = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>>
+  %2 = tt.expand_dims %1 {axis = 0 : i32} : tensor<64xi32, #ttg.slice<{dim = 0, parent = #blocked}>> -> tensor<1x64xi32, #blocked>
+  %3 = arith.extsi %arg7 : i32 to i64
+  %4 = tt.make_tensor_descriptor %arg5, [%arg7, %arg7], [%3, %c1_i64] : <f16>, <tensor<64x256xf16>>
+  %5 = tt.broadcast %2 : tensor<1x64xi32, #blocked> -> tensor<128x64xi32, #blocked>
+  %7 = tt.splat %3 : i64 -> tensor<128x1xi64, #blocked>
+
+  // CHECK: scf.for
+  %8:9 = scf.for %arg29 = %c0_i32 to %arg7 step %c1_i32 iter_args(%arg30 = %c-1_i32, %arg31 = %4, %arg32 = %c0_i32, %arg33 = %arg5, %arg34 = %cst_0, %arg35 = %c0_i32, %arg36 = %cst, %arg37 = %0, %arg38 = %false) -> (i32, !tt.tensordesc<tensor<64x256xf16>>, i32, !tt.ptr<f16>, tensor<128x256xf32, #mma>, i32, tensor<128x1xi64, #blocked>, !tt.tensordesc<tensor<64x256xf16>>, i1) : i32 {
+    %9 = arith.addi %arg30, %c1_i32 : i32
+    %10 = arith.cmpi eq, %arg30, %c10_i32 : i32
+    %11 = arith.select %10, %c0_i32, %9 : i32
+    %12 = arith.cmpi eq, %11, %c0_i32 : i32
+
+    // This op is a distance 1 dependency of itself.
+    // CHECK: {_test_marker_0, loop.cluster = 4 : i32, loop.stage = 0 : i32}
+    %13 = arith.select %12, %c0_i32, %arg32 {_test_marker_0} : i32
+
+    %14 = arith.select %12, %arg31, %arg37 : !tt.tensordesc<tensor<64x256xf16>>
+    %15 = arith.select %12, %c10_i32, %arg35 : i32
+    %16 = scf.if %12 -> (tensor<128x1xi64, #blocked>) {
+      %32 = arith.muli %cst, %7 : tensor<128x1xi64, #blocked>
+      scf.yield %32 : tensor<128x1xi64, #blocked>
+    } else {
+      scf.yield %arg36 : tensor<128x1xi64, #blocked>
+    }
+    %17 = tt.splat %arg33 : !tt.ptr<f16> -> tensor<128x1x!tt.ptr<f16>, #blocked>
+    %18 = tt.addptr %17, %16 : tensor<128x1x!tt.ptr<f16>, #blocked>, tensor<128x1xi64, #blocked>
+    %19 = tt.broadcast %18 : tensor<128x1x!tt.ptr<f16>, #blocked> -> tensor<128x64x!tt.ptr<f16>, #blocked>
+    %20 = tt.addptr %19, %5 : tensor<128x64x!tt.ptr<f16>, #blocked>, tensor<128x64xi32, #blocked>
+    %21 = tt.addptr %arg33, %c64_i32 : !tt.ptr<f16>, i32
+    %22 = tt.load %20 : tensor<128x64x!tt.ptr<f16>, #blocked>
+    %23 = ttg.local_alloc %22 : (tensor<128x64xf16, #blocked>) -> !ttg.memdesc<128x64xf16, #shared, #smem>
+    %24 = arith.muli %13, %c64_i32 : i32
+    %25 = tt.experimental_descriptor_load %14[%24, %15] : !tt.tensordesc<tensor<64x256xf16>> -> tensor<64x256xf16, #blocked1>
+    %26 = ttg.local_alloc %25 : (tensor<64x256xf16, #blocked1>) -> !ttg.memdesc<64x256xf16, #shared, #smem>
+    %27 = ttng.warp_group_dot %23, %26, %arg34, %arg38 {inputPrecision = 0 : i32} : !ttg.memdesc<128x64xf16, #shared, #smem> * !ttg.memdesc<64x256xf16, #shared, #smem> -> tensor<128x256xf32, #mma>
+    %28 = arith.addi %13, %c1_i32 : i32
+
+    // This op is in the backward slice of `_test_marker_2` and the epilogue.
+    // CHECK: {_test_marker_1, loop.cluster = 3 : i32, loop.stage = 1 : i32}
+    %29 = arith.cmpi eq, %11, %c10_i32 {_test_marker_1} : i32
+
+    // CHECK: {_test_marker_2, loop.cluster = 3 : i32, loop.stage = 1 : i32}
+    %30 = arith.select %29, %arg5, %21 {_test_marker_2} : !tt.ptr<f16>
+
+    %31 = arith.cmpi ne, %11, %c10_i32 : i32
+
+    scf.if %29 {
+      "use"(%27) : (tensor<128x256xf32, #mma>) -> ()
+    // CHECK: {_test_marker_3, loop.cluster = 5 : i32, loop.stage = 2 : i32}
+    } {_test_marker_3}
+    scf.yield %11, %14, %28, %30, %27, %15, %16, %14, %31 : i32, !tt.tensordesc<tensor<64x256xf16>>, i32, !tt.ptr<f16>, tensor<128x256xf32, #mma>, i32, tensor<128x1xi64, #blocked>, !tt.tensordesc<tensor<64x256xf16>>, i1
+  }
+  tt.return
+}
+
+}
+
+// -----
+
 // CHECK-LABEL: @prologue_backward_slice
 tt.func @prologue_backward_slice(%ub: i32, %cond: i1) {
   %c0_i32 = arith.constant 0 : i32
