Skip to content

Commit 30c4c3c

Browse files
Merge OpenAI Triton commit 27c8363 (#3434)
This PR changes the Triton base from ff77e98 to 27c8363 (Feb 12). Pass rate: 98.09% -> 98.11%. Please do not squash and merge this PR.
2 parents fff08ef + 380d700 commit 30c4c3c

File tree

18 files changed

+506
-204
lines changed

18 files changed

+506
-204
lines changed

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 7 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -43,6 +43,13 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
4343
// `tt.disallow_acc_multi_buffer` set to true.
4444
bool getDisallowAccMultiBuffer(scf::ForOp forOp);
4545

46+
/// Visit the operands of `op` and the operands of any nested ops defined
47+
/// outside of `op`.
48+
void visitNestedOperands(Operation *op, function_ref<void(Value)> visitor);
49+
/// Get the operands of `op` and the operands of any nested ops defined outside
50+
/// of `op`.
51+
SetVector<Value> getNestedOperands(Operation *op);
52+
4653
// Return the minClusterId and maxClusterId for the given ForOp.
4754
std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp);
4855
std::pair<int, int> getStageCluster(Operation *op);

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 8 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -142,11 +142,17 @@ class CoarseSchedule {
142142
getOpsInOrder(scf::ForOp forOp);
143143
std::vector<std::pair<Operation *, unsigned>>
144144
createFinalSchedule(scf::ForOp forOp);
145-
void dump();
146-
bool empty() { return opToStageAndCluster.size() == 0; }
145+
146+
bool empty() const { return opToStageAndCluster.size() == 0; }
147+
auto end() const { return opToStageAndCluster.end(); }
148+
auto begin() const { return opToStageAndCluster.begin(); }
149+
150+
// Set <stage, cluster> based on CoarseSchedule.
147151
void serialize(scf::ForOp &forOp);
148152
// Create a CoarseSchedule based on forOp's <stage, cluster>.
149153
void deSerialize(scf::ForOp &forOp);
154+
155+
LLVM_DUMP_METHOD void dump();
150156
};
151157

152158
// Add dependencies of anchor ops to the coarse schedule. Schedule them to

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 1 addition & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1445,6 +1445,7 @@ Attribute AMDMfmaEncodingAttr::parse(AsmParser &parser, Type type) {
14451445
if (parseIntArrayAttr(parser, attr, instrShape, "instrShape").failed())
14461446
return {};
14471447
}
1448+
14481449
if (attr.getName() == "isTransposed") {
14491450
if (parseBool(parser, attr, isTransposed, "isTransposed").failed())
14501451
return {};

lib/Dialect/TritonGPU/Transforms/LoopScheduling.cpp

Lines changed: 28 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -104,9 +104,7 @@ CoarseSchedule scheduleKeyOps(scf::ForOp forOp,
104104
}
105105

106106
// Assign stage to each op reachable from a latency op
107-
for (auto &kv : distance) {
108-
Operation *op = kv.first;
109-
int dist = kv.second;
107+
for (auto [op, dist] : distance) {
110108
// We only schedule ops that are downstream of a latency op
111109
// (had a non-negative distance due to a latency op).
112110
if (dist >= 0)
@@ -120,16 +118,31 @@ CoarseSchedule scheduleKeyOps(scf::ForOp forOp,
120118
for (int i = 0; i <= maxStage; i++) {
121119
clusters[i] = schedule.clusters.newAtBack();
122120
}
123-
CoarseSchedule::Cluster epilogue = schedule.clusters.newAtBack();
124121
// Assign ops to the clusters in reverse-stage order;
125122
// ops with higher stage numbers are assigned first. This way we will
126123
// end up with roughly reverse program order in the clusters.
124+
for (auto [op, stage] : opToStage)
125+
schedule.insert(op, stage, clusters[maxStage - stage]);
126+
127+
// Move `scf.if` ops in the current schedule (forward slice of the latency
128+
// ops) into a new epilogue cluster at the end of the schedule, pushing them
129+
// as close to the end of the loop body as possible.
130+
CoarseSchedule::Cluster epilogue = schedule.clusters.newAtBack();
127131
for (auto [op, stage] : opToStage) {
128-
if (isa<scf::IfOp>(op)) {
129-
schedule.insert(op, stage, epilogue);
132+
auto ifOp = dyn_cast<scf::IfOp>(op);
133+
if (!ifOp)
130134
continue;
131-
}
132-
schedule.insert(op, stage, clusters[maxStage - stage]);
135+
// If the `scf.if` op itself is a latency op, skip it.
136+
if (opLatency.contains(ifOp))
137+
continue;
138+
// Ensure this does not create scheduling conflicts by ensuring the forward
139+
// slice of the `scf.if` does not contain ops that are already scheduled, as
140+
// this will cause the `scf.if` to be scheduled after its dependents.
141+
SetVector<Operation *> slice;
142+
getForwardSlice(ifOp, &slice);
143+
if (llvm::any_of(slice, [&](Operation *op) { return opToStage.count(op); }))
144+
continue;
145+
schedule.insert(ifOp, stage, epilogue);
133146
}
134147

135148
return schedule;
@@ -140,16 +153,6 @@ CoarseSchedule scheduleKeyOps(scf::ForOp forOp,
140153
void scheduleDistanceOneDependencies(scf::ForOp forOp,
141154
CoarseSchedule &schedule) {
142155
int numStages = schedule.numStages;
143-
auto getNestedOperands = [](Operation *op) -> SmallVector<Value> {
144-
SmallVector<Value> operands;
145-
op->walk([&](Operation *nestedOp) {
146-
for (Value operand : nestedOp->getOperands()) {
147-
if (operand.getParentBlock()->getParentOp()->isAncestor(nestedOp))
148-
operands.push_back(operand);
149-
}
150-
});
151-
return operands;
152-
};
153156

154157
// Mapping from the cluster to the cluster before it.
155158
DenseMap<CoarseSchedule::Cluster *, CoarseSchedule::Cluster> dist1Cluster;
@@ -206,6 +209,7 @@ CoarseSchedule::Cluster schedulePrologueAndEpilogue(scf::ForOp forOp,
206209
SetVector<Operation *> backwardSlice;
207210
BackwardSliceOptions opt;
208211
opt.omitBlockArguments = true;
212+
opt.omitUsesFromAbove = false;
209213
getBackwardSlice((Operation *)op, &backwardSlice, opt);
210214

211215
for (auto op : backwardSlice) {
@@ -218,7 +222,7 @@ CoarseSchedule::Cluster schedulePrologueAndEpilogue(scf::ForOp forOp,
218222
if (!ifsToStage.empty()) {
219223
CoarseSchedule::Cluster prologueCluster = schedule.clusters.newAtFront();
220224
for (auto [ifOp, stage] : ifsToStage) {
221-
schedule.insert(ifOp, stage, prologueCluster);
225+
schedule.insertIfAbsent(ifOp, stage, prologueCluster);
222226
}
223227
}
224228

@@ -341,6 +345,11 @@ class TritonGPULoopSchedulingPass
341345
// only for loops missing the latency information.
342346
DenseMap<Operation *, int> opLatency =
343347
assignLatencies(getOperation(), numStages);
348+
LLVM_DEBUG({
349+
LDBG("Assigned latencies:\n");
350+
for (auto [op, latency] : opLatency)
351+
LDBG(" " << latency << " : " << *op);
352+
});
344353
// numStages should not be used below this point. We should know everything
345354
// based on the assigned stages
346355

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 1 addition & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -164,7 +164,7 @@ loadOpsToIndirectionLevel(scf::ForOp forOp, bool pipelineWithoutDot,
164164
finalUser = op;
165165
distance++;
166166
}
167-
for (Value operand : op->getOperands()) {
167+
for (Value operand : getNestedOperands(op)) {
168168
if (isa<mlir::triton::DotOpInterface>(op)) {
169169
// Heuristic: only pipeline A and B operands of the dot op.
170170
if (operand == op->getOperand(2))

0 commit comments

Comments
 (0)