Commit 7a342f2

[Pipeliner] Fix backward scheduling over ttg.local_load (#7194)
A small bug was caused because backward stage propagation after warp specialization assumed that all latency ops have stages assigned. This is the correct thing to assume, but right now the stage can sometimes be dropped for various reasons. Work around the problem (for now) by ignoring those ops. This PR also tightens the `const` API of `CoarseSchedule` to help catch bugs like this.
1 parent b655ab7 commit 7a342f2

File tree

4 files changed: +56 -26 lines
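
The failure mode comes from default insertion: CoarseSchedule::operator[] forwards to DenseMap::operator[], so looking up a latency op whose stage was dropped silently creates a <stage 0, invalid cluster> entry instead of reporting "not scheduled". Below is a minimal stand-in sketch of that footgun and of the guarded lookup used as the workaround (std::map with string keys instead of the real DenseMap<Operation *, std::pair<int, Cluster>>; all names invented):

#include <iostream>
#include <map>
#include <string>

int main() {
  // Stand-in for opToStageAndCluster: op name -> assigned stage.
  std::map<std::string, int> stageOf;
  stageOf["tt.load"] = 3;

  // Unguarded operator[]: default-inserts stage 0 for an op that was never
  // scheduled, silently growing the map and faking an assignment.
  int bogus = stageOf["ttg.local_load"];
  std::cout << "unguarded lookup reports stage " << bogus << ", map now has "
            << stageOf.size() << " entries\n";

  // The workaround in this commit: check membership first and skip ops
  // whose stage was dropped.
  if (stageOf.count("ttg.local_alloc") == 0)
    std::cout << "ttg.local_alloc has no stage; skipping\n";
  return 0;
}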

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 9 additions & 9 deletions
@@ -45,7 +45,7 @@ class CoarseSchedule {
   const_iterator begin() const { return orderClusters.begin(); }
   iterator end() { return orderClusters.end(); }
   const_iterator end() const { return orderClusters.end(); }
-  size_t size() { return orderClusters.size(); }
+  size_t size() const { return orderClusters.size(); }
   iterator newAtBack() {
     orderClusters.push_back(orderClusters.size());
     return std::prev(orderClusters.end());
@@ -88,7 +88,7 @@ class CoarseSchedule {
   DenseMap<Operation *, std::pair<int, Cluster>> opToStageAndCluster;

   void setNumStages(int numStages) { this->numStages = numStages; }
-  int getNumStages() { return numStages; }
+  int getNumStages() const { return numStages; }

   void insert(Operation *op, int stage, Cluster cluster) {
     if (stage >= numStages) {
@@ -115,7 +115,7 @@ class CoarseSchedule {

   void erase(Operation *op) { opToStageAndCluster.erase(op); }

-  int count(Operation *op) { return opToStageAndCluster.count(op); }
+  int count(Operation *op) const { return opToStageAndCluster.count(op); }

   std::pair<int, Cluster> operator[](Operation *op) {
     return opToStageAndCluster[op];
@@ -129,25 +129,25 @@ class CoarseSchedule {
   Cluster splitClusterBefore(Operation *op, scf::ForOp forOp);

   // Check if op a will show up before op b in the final unrolled code.
-  bool isOpBefore(Operation *a, Operation *b);
+  bool isOpBefore(Operation *a, Operation *b) const;

   // Check if op a is in earlier cluster than op b.
-  bool isOpInEarlierCluster(Operation *a, Operation *b);
+  bool isOpInEarlierCluster(Operation *a, Operation *b) const;

   // Check if op a is in the same cluster as op b.
-  bool isOpInSameCluster(Operation *a, Operation *b);
+  bool isOpInSameCluster(Operation *a, Operation *b) const;

   SmallVector<std::tuple<Operation *, int, Cluster>>
-  getOpsInOrder(scf::ForOp forOp);
+  getOpsInOrder(scf::ForOp forOp) const;
   std::vector<std::pair<Operation *, unsigned>>
-  createFinalSchedule(scf::ForOp forOp);
+  createFinalSchedule(scf::ForOp forOp) const;

   bool empty() const { return opToStageAndCluster.size() == 0; }
   auto end() const { return opToStageAndCluster.end(); }
   auto begin() const { return opToStageAndCluster.begin(); }

   // Set <stage, cluster> based on CoarseSchedule.
-  void serialize(scf::ForOp &forOp);
+  void serialize(scf::ForOp &forOp) const;
   // Create a CoarseSchedule based on forOp's <stage, cluster>.
   LogicalResult deSerialize(scf::ForOp &forOp);

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 15 additions & 15 deletions
@@ -123,11 +123,11 @@ tt::CoarseSchedule::splitClusterBefore(Operation *op, scf::ForOp forOp) {
 }

 // Check if op a will show up before op b in the final unrolled code.
-bool tt::CoarseSchedule::isOpBefore(Operation *a, Operation *b) {
+bool tt::CoarseSchedule::isOpBefore(Operation *a, Operation *b) const {
   assert(opToStageAndCluster.count(a) && opToStageAndCluster.count(b) &&
          "Operations must be in the schedule");
-  auto [aStage, aCluster] = opToStageAndCluster[a];
-  auto [bStage, bCluster] = opToStageAndCluster[b];
+  auto [aStage, aCluster] = opToStageAndCluster.at(a);
+  auto [bStage, bCluster] = opToStageAndCluster.at(b);
   if (aStage != bStage) {
     return aStage < bStage;
   }
@@ -137,21 +137,22 @@ bool tt::CoarseSchedule::isOpBefore(Operation *a, Operation *b) {
   return a->isBeforeInBlock(b);
 }

-bool tt::CoarseSchedule::isOpInEarlierCluster(Operation *a, Operation *b) {
+bool tt::CoarseSchedule::isOpInEarlierCluster(Operation *a,
+                                              Operation *b) const {
   assert(opToStageAndCluster.count(a) && opToStageAndCluster.count(b) &&
          "Operations must be in the schedule");
-  return clusters.isBefore(opToStageAndCluster[a].second,
-                           opToStageAndCluster[b].second);
+  return clusters.isBefore(opToStageAndCluster.at(a).second,
+                           opToStageAndCluster.at(b).second);
 }

-bool tt::CoarseSchedule::isOpInSameCluster(Operation *a, Operation *b) {
+bool tt::CoarseSchedule::isOpInSameCluster(Operation *a, Operation *b) const {
   assert(opToStageAndCluster.count(a) && opToStageAndCluster.count(b) &&
          "Operations must be in the schedule");
-  return opToStageAndCluster[a].second == opToStageAndCluster[b].second;
+  return opToStageAndCluster.at(a).second == opToStageAndCluster.at(b).second;
 }

 SmallVector<std::tuple<Operation *, int, tt::CoarseSchedule::Cluster>>
-tt::CoarseSchedule::getOpsInOrder(scf::ForOp forOp) {
+tt::CoarseSchedule::getOpsInOrder(scf::ForOp forOp) const {
   SmallVector<SmallVector<std::tuple<Operation *, int, Cluster>>, 8>
       orderClusters(clusters.size());
   for (auto &op : forOp.getBody()->without_terminator()) {
@@ -160,12 +161,11 @@ tt::CoarseSchedule::getOpsInOrder(scf::ForOp forOp) {
       continue;
     }
     auto [stage, cluster] = it->second;
-    if (cluster == Cluster{}) {
-      continue;
-    }
+    assert(cluster != Cluster{} && "Op with invalid cluster!");
     assert(stage < numStages && "Op with invalid stage!");
     int clusterId = *cluster;
-    assert(clusterId == std::distance(clusters.begin(), cluster) &&
+    assert(clusterId == std::distance(clusters.begin(),
+                                      ClusterList::const_iterator(cluster)) &&
            "Cluster ID mismatch!");
     orderClusters[clusterId].push_back(make_tuple(&op, stage, cluster));
   }
@@ -180,7 +180,7 @@ tt::CoarseSchedule::getOpsInOrder(scf::ForOp forOp) {
 }

 std::vector<std::pair<Operation *, unsigned>>
-tt::CoarseSchedule::createFinalSchedule(scf::ForOp forOp) {
+tt::CoarseSchedule::createFinalSchedule(scf::ForOp forOp) const {
   SmallVector<std::tuple<Operation *, int, tt::CoarseSchedule::Cluster>>
       opsInOrder = getOpsInOrder(forOp);
   std::vector<std::pair<Operation *, unsigned>> schedule;
@@ -248,7 +248,7 @@ static std::optional<int> tryGetMaxStage(scf::ForOp &forOp) {
 }

 // Set <stage, cluster> based on CoarseSchedule.
-void tt::CoarseSchedule::serialize(scf::ForOp &forOp) {
+void tt::CoarseSchedule::serialize(scf::ForOp &forOp) const {
   for (auto [op, stage, cluster] : getOpsInOrder(forOp)) {
     setStageCluster(op, stage, *cluster);
   }
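
Two details of the .cpp change are worth noting: map reads switch from operator[] to DenseMap::at(), which asserts on a missing key instead of inserting one, and the cluster-ID assert in getOpsInOrder now converts the stored cluster iterator explicitly because, in a const member function, clusters.begin() yields a const_iterator and std::distance requires both arguments to have the same iterator type. A small sketch of the latter, with std::list standing in for the pipeliner's cluster list (names invented):

#include <cassert>
#include <iterator>
#include <list>

using ClusterList = std::list<int>;
using Cluster = ClusterList::iterator; // clusters are held as mutable iterators

int clusterIndex(const ClusterList &clusters, Cluster cluster) {
  // Through a const view of the list, begin() yields a const_iterator, while
  // the stored Cluster is a plain iterator; std::distance needs both
  // arguments to have the same type, hence the explicit conversion.
  return std::distance(clusters.begin(), ClusterList::const_iterator(cluster));
}

int main() {
  ClusterList clusters = {0, 1, 2};
  Cluster c = std::next(clusters.begin(), 2);
  assert(clusterIndex(clusters, c) == 2);
  return 0;
}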

lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp

Lines changed: 5 additions & 2 deletions
@@ -184,8 +184,11 @@ CoarseSchedule getInitialSchedule(scf::ForOp forOp,
   // assigned to the same stage.
   DenseSet<int> latencyStages;
   auto ops = forOp.getBody()->without_terminator();
-  for (Operation &op : llvm::make_filter_range(ops, isLatencyOp))
-    latencyStages.insert(schedule[&op].first);
+  for (Operation &op : llvm::make_filter_range(ops, isLatencyOp)) {
+    // FIXME: This should assert all latency ops have an assigned stage.
+    if (schedule.count(&op))
+      latencyStages.insert(schedule[&op].first);
+  }
   if (latencyStages.size() <= 1) {
     CoarseSchedule normalized(/*numStages=*/1);
     auto cluster = normalized.clusters.newAtFront();
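
This loop is the actual workaround from the commit message: latency ops whose stage was dropped are skipped rather than read through the default-inserting lookup, with a FIXME noting that an assert is the eventual goal. A self-contained approximation using standard containers (std::map/std::set instead of the schedule and DenseSet; the op names are only illustrative, taken from the test below):

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

int main() {
  // Stand-ins: stages assigned to latency ops, one of which was dropped.
  std::map<std::string, int> stageOf = {{"tt.load", 3}, {"ttng.tmem_alloc", 3}};
  std::vector<std::string> latencyOps = {"tt.load", "ttng.tmem_alloc",
                                         "ttg.local_load"};

  std::set<int> latencyStages;
  for (const std::string &op : latencyOps) {
    // FIXME (mirroring the commit): ideally every latency op has a stage;
    // for now, skip the ones whose stage was dropped instead of letting a
    // default-inserted stage 0 leak into the set.
    auto it = stageOf.find(op);
    if (it != stageOf.end())
      latencyStages.insert(it->second);
  }
  // With the guard, only the genuinely assigned stage(s) are counted.
  std::cout << "distinct latency stages: " << latencyStages.size() << "\n";
  return 0;
}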

test/TritonGPU/pipeline-schedule-loop.mlir

Lines changed: 27 additions & 0 deletions
@@ -841,3 +841,30 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
   tt.return %2 : tensor<128x128xf16, #blocked1>
 }
 }
+
+// -----
+
+#blocked = #ttg.blocked<{sizePerThread = [1, 4], threadsPerWarp = [32, 1], warpsPerCTA = [4, 1], order = [1, 0]}>
+#linear = #ttg.linear<{register = [[0, 1], [0, 2], [32, 0], [64, 0]], lane = [[1, 0], [2, 0], [4, 0], [8, 0], [16, 0]], warp = [[0, 0], [0, 0]], block = []}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 0, transposed = false, elementBitWidth = 8}>
+#smem = #ttg.shared_memory
+#tmem_scales = #ttng.tensor_memory_scales_encoding<>
+
+module attributes {"ttg.num-warps" = 4 : i32} {
+
+// CHECK-LABEL: @backwards_prop_existing
+tt.func public @backwards_prop_existing(%arg0: i32, %arg1: tensor<128x4x!tt.ptr<i8>, #blocked>) {
+  %c0_i32 = arith.constant 0 : i32
+  %c1_i32 = arith.constant 1 : i32
+  scf.for %arg2 = %c0_i32 to %arg0 step %c1_i32 : i32 {
+    %0 = tt.load %arg1 {loop.cluster = 2 : i32, loop.stage = 3 : i32} : tensor<128x4x!tt.ptr<i8>, #blocked>
+    %1 = ttg.local_alloc %0 : (tensor<128x4xi8, #blocked>) -> !ttg.memdesc<128x4xi8, #shared, #smem>
+    // CHECK: ttg.local_load %{{.*}} {loop.cluster = 0 : i32, loop.stage = 0 : i32}
+    %2 = ttg.local_load %1 : !ttg.memdesc<128x4xi8, #shared, #smem> -> tensor<128x4xi8, #linear>
+    %result = ttng.tmem_alloc %2 {loop.cluster = 2 : i32, loop.stage = 3 : i32} : (tensor<128x4xi8, #linear>) -> !ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>
+    "use"(%result) {loop.cluster = 2 : i32, loop.stage = 3 : i32} : (!ttg.memdesc<128x4xi8, #tmem_scales, #ttng.tensor_memory>) -> ()
+  } {tt.scheduled_max_stage = 3 : i32, tt.warp_specialize}
+  tt.return
+}
+
+}
