Skip to content

Commit 04159ed

Browse files
[PIPELINER] Adding a marker for loop scheduling serialization (#6037)
Mark loops whose schedule data has been serialized with an attribute, so that retrieving the maximum scheduling stage is both faster and more reliable.
1 parent c7a9f29 commit 04159ed

File tree

5 files changed

+32
-27
lines changed

5 files changed

+32
-27
lines changed

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ static const char *kDisallowAccMultiBufferAttrName =
1515
"tt.disallow_acc_multi_buffer";
1616
static const char *kLoopStageAttrName = "loop.stage";
1717
static const char *kLoopClusterAttrName = "loop.cluster";
18+
static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
1819
static const char *kLatencyAttrName = "tt.latency";
1920

2021
bool loopHasDistGreaterThanOne(scf::ForOp forOp);

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -172,12 +172,11 @@ static std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp) {
172172

173173
static std::optional<int> tryGetMaxStage(scf::ForOp &forOp) {
174174
std::optional<int> maxStage = std::nullopt;
175-
for (auto &op : forOp.getBody()->without_terminator()) {
176-
if (!op.hasAttr(mlir::triton::kLoopStageAttrName) ||
177-
!op.hasAttr(mlir::triton::kLoopClusterAttrName))
178-
continue;
179-
auto [stage, _] = getStageCluster(&op);
180-
maxStage = maxStage ? (stage > *maxStage ? stage : *maxStage) : stage;
175+
if (forOp->hasAttr(mlir::triton::kScheduledMaxStageAttrName)) {
176+
return forOp
177+
->getAttrOfType<IntegerAttr>(mlir::triton::kScheduledMaxStageAttrName)
178+
.getValue()
179+
.getSExtValue();
181180
}
182181
return maxStage;
183182
}
@@ -187,6 +186,9 @@ void tt::CoarseSchedule::serialize(scf::ForOp &forOp) {
187186
for (auto [op, stage, cluster] : getOpsInOrder(forOp)) {
188187
setStageCluster(op, stage, *cluster);
189188
}
189+
forOp->setAttr(mlir::triton::kScheduledMaxStageAttrName,
190+
IntegerAttr::get(IntegerType::get(forOp.getContext(), 32),
191+
numStages - 1));
190192
}
191193

192194
// Create a CoarseSchedule based on forOp's <stage, cluster>.

lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,7 @@ static void removeAttributes(ModuleOp moduleOp) {
7272
moduleOp->walk([&](Operation *op) {
7373
op->removeAttr(mlir::triton::kLoopStageAttrName);
7474
op->removeAttr(mlir::triton::kLoopClusterAttrName);
75+
op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
7576
});
7677
}
7778

test/TritonGPU/pipeline-lower-loop.mlir

Lines changed: 21 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -51,7 +51,7 @@ tt.func @one_dep_async(%lb : index, %ub : index, %step : index,
5151
scf.for %iv = %lb to %ub step %step : index {
5252
%a = tt.load %a_ptr_init {loop.cluster = 2 : i32, loop.stage = 0 : i32} : tensor<128x32x!tt.ptr<f16>, #A>
5353
"use"(%a) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
54-
}
54+
} {tt.scheduled_max_stage = 2 : i32}
5555
tt.return
5656
}
5757
}
@@ -75,7 +75,7 @@ tt.func @different_use_stages(%lb : index, %ub : index, %step : index,
7575
%a = tt.load %a_ptr_init {loop.cluster = 2 : i32, loop.stage = 0 : i32} : tensor<128x32x!tt.ptr<f16>, #A>
7676
"use1"(%a) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
7777
"use2"(%a) {loop.cluster = 0 : i32, loop.stage = 3 : i32} : (tensor<128x32xf16, #A>) -> ()
78-
}
78+
} {tt.scheduled_max_stage = 3 : i32}
7979
tt.return
8080
}
8181
}
@@ -106,7 +106,7 @@ tt.func @used_by_if_yield(%lb : index, %ub : index, %step : index,
106106
scf.yield %init_a : tensor<128x32xf16, #A>
107107
} {loop.cluster = 0 : i32, loop.stage = 2 : i32}
108108
"use"(%a_if) {loop.cluster = 0 : i32, loop.stage = 3 : i32} : (tensor<128x32xf16, #A>) -> ()
109-
}
109+
} {tt.scheduled_max_stage = 3 : i32}
110110
tt.return
111111
}
112112
}
@@ -124,7 +124,7 @@ tt.func @dist1_load(%lb : index, %ub : index, %step : index,
124124
%a = tt.load %a_ptr_init {loop.cluster = 2 : i32, loop.stage = 0 : i32} : tensor<128x32x!tt.ptr<f16>, #A>
125125
"use"(%a) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
126126
scf.yield %a : tensor<128x32xf16, #A>
127-
}
127+
} {tt.scheduled_max_stage = 2 : i32}
128128
tt.return
129129
}
130130
}
@@ -142,7 +142,7 @@ tt.func @one_dep_sync(%lb : index, %ub : index, %step : index,
142142
scf.for %iv = %lb to %ub step %step : index {
143143
%a = tt.load %a_ptr_init {loop.cluster = 2 : i32, loop.stage = 0 : i32} : tensor<1x!tt.ptr<f16>, #A>
144144
"use"(%a) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<1xf16, #A>) -> ()
145-
}
145+
} {tt.scheduled_max_stage = 2 : i32}
146146
tt.return
147147
}
148148
}
@@ -183,7 +183,7 @@ tt.func @one_dep_local_alloc(%lb : index, %ub : index, %step : index,
183183
%a_alloc = ttg.local_alloc %a {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> !ttg.memdesc<128x32xf16, #shared, #ttg.shared_memory, mutable>
184184
%a_load = ttg.local_load %a_alloc {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x32xf16, #shared, #ttg.shared_memory, mutable> -> tensor<128x32xf16, #A>
185185
"use"(%a_load) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
186-
}
186+
} {tt.scheduled_max_stage = 2 : i32}
187187
tt.return
188188
}
189189
}
@@ -214,7 +214,7 @@ tt.func @one_load_group(%lb : index, %ub : index, %step : index,
214214
%b = tt.load %a_ptr_init {loop.cluster = 2 : i32, loop.stage = 0 : i32} : tensor<128x32x!tt.ptr<f32>, #A>
215215
"use1"(%a){loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf32, #A>) -> ()
216216
"use2"(%b){loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf32, #A>) -> ()
217-
}
217+
} {tt.scheduled_max_stage = 2 : i32}
218218
tt.return
219219
}
220220
}
@@ -255,7 +255,7 @@ tt.func @two_load_groups(%lb : index, %ub : index, %step : index,
255255
"use1"(%a){loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf32, #A>) -> ()
256256
"use2"(%b){loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf32, #A>) -> ()
257257
"use3"(%c){loop.cluster = 0 : i32, loop.stage = 3 : i32} : (tensor<128x32xf32, #A>) -> ()
258-
}
258+
} {tt.scheduled_max_stage = 3 : i32}
259259
tt.return
260260
}
261261
}
@@ -304,7 +304,7 @@ tt.func @dependent_loads(%lb : index, %ub : index, %step : index,
304304
%b = "pointerize"(%a) {loop.cluster = 2 : i32, loop.stage = 2 : i32} : (tensor<128x32xf32, #A>) -> tensor<128x32x!tt.ptr<f32>, #A>
305305
%c = tt.load %b {loop.cluster = 2 : i32, loop.stage = 2 : i32} : tensor<128x32x!tt.ptr<f32>, #A>
306306
"use1"(%c){loop.cluster = 0 : i32, loop.stage = 4 : i32} : (tensor<128x32xf32, #A>) -> ()
307-
}
307+
} {tt.scheduled_max_stage = 4 : i32}
308308
tt.return
309309
}
310310
}
@@ -361,7 +361,7 @@ tt.func @dependent_loads_asymmetric(%lb : index, %ub : index, %step : index,
361361
%b = "pointerize"(%a) {loop.cluster = 2 : i32, loop.stage = 2 : i32} : (tensor<128x32xf32, #A>) -> tensor<128x32x!tt.ptr<f32>, #A>
362362
%c = tt.load %b {loop.cluster = 2 : i32, loop.stage = 2 : i32} : tensor<128x32x!tt.ptr<f32>, #A>
363363
"use1"(%c){loop.cluster = 0 : i32, loop.stage = 5 : i32} : (tensor<128x32xf32, #A>) -> ()
364-
}
364+
} {tt.scheduled_max_stage = 5 : i32}
365365
tt.return
366366
}
367367
}
@@ -379,7 +379,7 @@ tt.func @unused_load(%lb : index, %ub : index, %step : index,
379379
// CHECK: dummy
380380
%a = tt.load %a_ptr_init {loop.cluster = 0 : i32, loop.stage = 1 : i32} : tensor<128x32x!tt.ptr<f32>, #A>
381381
"dummy"() : () -> ()
382-
}
382+
} {tt.scheduled_max_stage = 1 : i32}
383383
tt.return
384384
}
385385
}
@@ -434,7 +434,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
434434
%B_sh = ttg.local_alloc %B {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x128xf16, #blocked1>) -> !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>
435435
%acc_res = ttng.warp_group_dot %A_sh, %B_sh, %acc {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> * !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> -> tensor<128x128xf32, #mma>
436436
scf.yield %acc_res : tensor<128x128xf32, #mma>
437-
}
437+
} {tt.scheduled_max_stage = 2 : i32}
438438
%res_f16 = arith.truncf %res : tensor<128x128xf32, #mma> to tensor<128x128xf16, #mma>
439439
tt.return %res_f16 : tensor<128x128xf16, #mma>
440440
}
@@ -489,7 +489,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
489489
%B_sh = ttg.local_alloc %B {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x128xf16, #blocked1>) -> !ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory>
490490
%acc_res = ttng.warp_group_dot %A_sh, %B_sh, %acc {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> * !ttg.memdesc<128x128xf16, #shared1, #ttg.shared_memory> -> tensor<128x128xf32, #mma>
491491
scf.yield %acc_res : tensor<128x128xf32, #mma>
492-
}
492+
} {tt.scheduled_max_stage = 2 : i32}
493493
tt.return %res : tensor<128x128xf32, #mma>
494494
}
495495
}
@@ -555,7 +555,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
555555
%B_sh = ttg.local_alloc %B {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x128xf16, #blocked1>) -> !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>
556556
%acc_res = ttng.warp_group_dot %A_sh, %B_sh, %acc {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> * !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> -> tensor<128x128xf32, #mma>
557557
scf.yield %acc_res : tensor<128x128xf32, #mma>
558-
}
558+
} {tt.scheduled_max_stage = 2 : i32}
559559
%res_f16 = arith.truncf %res : tensor<128x128xf32, #mma> to tensor<128x128xf16, #mma>
560560
tt.return %res_f16 : tensor<128x128xf16, #mma>
561561
}
@@ -614,7 +614,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
614614
ttng.tc_gen5_mma %A_sh, %B_sh, %acc_tm, %true, %true {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (!ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory>, i1, i1) -> ()
615615
%acc_res = ttng.tmem_load %acc_tm {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory> -> tensor<128x128xf32, #blocked>
616616
scf.yield %acc_res : tensor<128x128xf32, #blocked>
617-
}
617+
} {tt.scheduled_max_stage = 2 : i32}
618618
%res_f16 = arith.truncf %res : tensor<128x128xf32, #blocked> to tensor<128x128xf16, #blocked>
619619
tt.return %res_f16 : tensor<128x128xf16, #blocked>
620620
}
@@ -669,7 +669,7 @@ tt.func @tma_load_lowering(%lb : index, %ub : index, %step : index,
669669
scf.for %iv = %lb to %ub step %step : index {
670670
%a = tt.experimental_descriptor_load %desc[%offs, %offs] {loop.cluster = 2 : i32, loop.stage = 0 : i32} : !tt.tensordesc<tensor<128x32xf16>> -> tensor<128x32xf16, #A>
671671
"use"(%a) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
672-
}
672+
} {tt.scheduled_max_stage = 2 : i32}
673673
tt.return
674674
}
675675
}
@@ -725,7 +725,7 @@ tt.func @tma_gather_lowering(%lb : index, %ub : index, %step : index,
725725
scf.for %iv = %lb to %ub step %step : index {
726726
%a = tt.experimental_descriptor_gather %desc[%x, %y] {loop.cluster = 2 : i32, loop.stage = 0 : i32} : (!tt.tensordesc<tensor<1x128xf32>>, tensor<32xi32, #offsets>, i32) -> tensor<32x128xf32, #A>
727727
"use"(%a) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<32x128xf32, #A>) -> ()
728-
}
728+
} {tt.scheduled_max_stage = 2 : i32}
729729
tt.return
730730
}
731731
}
@@ -760,7 +760,7 @@ tt.func @tma_reuse_barrier(%lb : index, %ub : index, %step : index,
760760
"use2"(%b) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
761761
%c = tt.experimental_descriptor_load %descC[%offs, %offs] {loop.cluster = 2 : i32, loop.stage = 0 : i32} : !tt.tensordesc<tensor<128x32xf16>> -> tensor<128x32xf16, #A>
762762
"use3"(%c) {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x32xf16, #A>) -> ()
763-
}
763+
} {tt.scheduled_max_stage = 2 : i32}
764764
tt.return
765765
}
766766
}
@@ -798,7 +798,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
798798
%B_sh = ttg.local_alloc %B {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (tensor<128x128xf16, #blocked1>) -> !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory>
799799
%acc_res = ttng.warp_group_dot %A_sh, %B_sh, %acc {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> * !ttg.memdesc<128x128xf16, #shared, #ttg.shared_memory> -> tensor<128x128xf32, #mma>
800800
scf.yield %acc_res : tensor<128x128xf32, #mma>
801-
}
801+
} {tt.scheduled_max_stage = 2 : i32}
802802
%res_f16 = arith.truncf %res : tensor<128x128xf32, #mma> to tensor<128x128xf16, #mma>
803803
tt.return %res_f16 : tensor<128x128xf16, #mma>
804804
}
@@ -833,7 +833,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
833833
scf.for %iv = %lb to %ub step %step : index {
834834
%desc = tt.make_tensor_descriptor %A, [%shape_x, %shape_y], [%strides_x, %strides_y] {loop.cluster = 0 : i32, loop.stage = 1 : i32} : <f16>, <tensor<128x128xf16>>
835835
"use"(%desc) {loop.cluster = 0 : i32, loop.stage = 1 : i32} : (!tt.tensordesc<tensor<128x128xf16>>) -> ()
836-
}
836+
} {tt.scheduled_max_stage = 1 : i32}
837837
tt.return
838838
}
839839
}
@@ -879,7 +879,7 @@ module attributes {"ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 4 : i32, ttg.targ
879879
ttng.tc_gen5_mma_scaled %A_sh, %B_sh, %acc_tm, %A_sc_sh, %B_sc_sh, %true, %true lhs = e5m2 rhs = e5m2 {loop.cluster = 0 : i32, loop.stage = 2 : i32} : (!ttg.memdesc<128x128xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf8E5M2, #shared, #ttg.shared_memory>, !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory>, !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, !ttg.memdesc<1x2x32x4x4xi8, #shared1, #smem>, i1, i1) -> ()
880880
%acc_res = ttng.tmem_load %acc_tm {loop.cluster = 0 : i32, loop.stage = 2 : i32} : !ttg.memdesc<128x128xf32, #tmem, #ttng.tensor_memory> -> tensor<128x128xf32, #blocked>
881881
scf.yield %acc_res : tensor<128x128xf32, #blocked>
882-
}
882+
} {tt.scheduled_max_stage = 2 : i32}
883883
tt.return %res : tensor<128x128xf32, #blocked>
884884
}
885885
}

test/TritonGPU/pipeline-schedule-loop.mlir

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ tt.func @one_dep(%lb : index, %ub : index, %step : index,
2121
%res = arith.addf %acc, %a : tensor<128x32xf16, #A>
2222
scf.yield %res : tensor<128x32xf16, #A>
2323
}
24+
// CHECK: tt.scheduled_max_stage
2425
tt.return %loop#0 : tensor<128x32xf16, #A>
2526
}
2627

0 commit comments

Comments
 (0)