@@ -226,11 +226,17 @@ static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
     return std::nullopt;
 
   // Propagate defs of exp.
-  for (auto expOp : loop.getOps<math::Exp2Op>()) {
-    auto tensorTy = dyn_cast<RankedTensorType>(expOp.getType());
-    if (tensorTy && tensorTy.getNumElements() > 256) {
-      schedule.trySchedule(defaultPartition, expOp);
-      scheduleDependencies(loop, schedule, defaultPartition, expOp);
+  for (Operation &op : loop.getOps()) {
+    if (!isa<math::Exp2Op, ElementwiseInlineAsmOp>(op))
+      continue;
+    int elementCount = 0;
+    for (Type type : op.getResultTypes()) {
+      if (auto tensorTy = dyn_cast<RankedTensorType>(type))
+        elementCount += tensorTy.getNumElements();
+    }
+    if (elementCount > 256) {
+      schedule.trySchedule(defaultPartition, &op);
+      scheduleDependencies(loop, schedule, defaultPartition, &op);
     }
   }
 
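The new gate generalizes the old single-op check: instead of inspecting only a math::Exp2Op's single result, it sums getNumElements() across every ranked-tensor result, so multi-result ops such as ElementwiseInlineAsmOp also qualify when their combined output exceeds 256 elements. A standalone sketch of that logic, using a hypothetical TensorShape stand-in for MLIR's RankedTensorType rather than the pass's real types:

#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

// Hypothetical stand-in for the shape of one ranked-tensor result.
struct TensorShape {
  std::vector<int64_t> dims;
  int64_t numElements() const {
    return std::accumulate(dims.begin(), dims.end(), int64_t{1},
                           std::multiplies<int64_t>());
  }
};

// Mirrors the new gate: sum element counts over all tensor results and
// qualify the op only when the combined count exceeds 256.
bool exceedsElementThreshold(const std::vector<TensorShape> &results) {
  int64_t elementCount = 0;
  for (const TensorShape &shape : results)
    elementCount += shape.numElements();
  return elementCount > 256;
}

int main() {
  // E.g. a two-result op with 128x2 and 64x1 tensors: 256 + 64 = 320 > 256.
  assert(exceedsElementThreshold({{{128, 2}}, {{64, 1}}}));
  return 0;
}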
@@ -242,7 +248,8 @@ static std::optional<WarpSchedule> getInitialSchedule(scf::ForOp loop) {
   while (userPartitions.size() < mmas.size()) {
     userPartitions.push_back(schedule.addPartition(userPartitions.size()));
   }
-  for (auto [mmaOp, userPartition] : llvm::zip(mmas, userPartitions)) {
+  for (auto [mmaOp, userPartition] :
+       llvm::reverse(llvm::zip(mmas, userPartitions))) {
     scheduleUsers(loop, schedule, userPartition, mmaOp);
   }
 
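The second hunk changes only the iteration order: llvm::reverse(llvm::zip(...)) walks the (MMA, partition) pairs from last to first, so users of later MMAs are scheduled into their partitions before users of earlier ones. A minimal sketch of that iteration behavior with stand-in values, assuming LLVM's ADT headers (zip over random-access containers yields bidirectional iterators, which llvm::reverse requires, as the patch itself relies on):

#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdio>

int main() {
  llvm::SmallVector<int> mmas = {0, 1, 2};       // stand-ins for MMA ops
  llvm::SmallVector<int> partitions = {4, 5, 6}; // stand-ins for partitions
  // zip pairs mmas[i] with partitions[i]; reverse visits pairs last-to-first.
  for (auto [mma, partition] : llvm::reverse(llvm::zip(mmas, partitions)))
    std::printf("schedule users of mma%d into partition %d\n", mma, partition);
  // Prints mma2, then mma1, then mma0: later MMAs' users are claimed first.
  return 0;
}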