Skip to content

Commit f3fb638

Browse files
Mogballzwu-2025
authored and committed
[Pipeliner] Merge warp specialization and pipeliner scheduling (triton-lang#6887)
This PR refactors warp specialization to share the same scheduling as software pipelining. What this means is that the pipeliner's loop scheduler is used to set the stages and clusters of the ops; then, on top of that, warp specialization performs partition assignment and splits the loop into multiple loops — introducing synchronization between them — that are then individually software pipelined.
1 parent 1aba27a commit f3fb638

Some content is hidden

Large commits have some content hidden by default. Use the searchbox below for content that may be hidden.

42 files changed

+359
-366
lines changed

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 26 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
#include "mlir/Support/LLVM.h"
55
#include "llvm/ADT/ArrayRef.h"
66
#include "llvm/ADT/DenseMap.h"
7-
#include "llvm/ADT/GraphTraits.h"
8-
#include "llvm/ADT/MapVector.h"
97
#include "llvm/ADT/SmallVector.h"
108

119
namespace mlir {
@@ -26,39 +24,39 @@ static constexpr char kPartitionStagesAttrName[] = "ttg.partition.stages";
2624
//===----------------------------------------------------------------------===//
2725

2826
namespace mlir::triton::gpu {
27+
// A partition has a stage and contains some operation. The stage of a
28+
// partition determines how many cycles the partition's outputs are buffered
29+
// relative to its consumers.
30+
class Partition {
31+
public:
32+
Partition(int idx, int stage) : idx(idx), stage(stage) {}
33+
34+
int getIndex() const { return idx; }
35+
int getStage() const { return stage; }
36+
ArrayRef<Operation *> getOps() const { return ops; }
37+
38+
void insert(Operation *op) { ops.push_back(op); }
39+
void remove(Operation *op) { ops.erase(llvm::find(ops, op)); }
40+
41+
private:
42+
void setIndex(int idx) { this->idx = idx; }
43+
friend class WarpSchedule;
44+
45+
// The partition number.
46+
int idx;
47+
// The stage of the partition.
48+
int stage;
49+
// The ops in the partition.
50+
SmallVector<Operation *> ops;
51+
};
52+
2953
// A warp schedule divides a loop into multiple partitions. Ops in a loop are
3054
// assigned at most one partition. A warp schedule represents asynchronous
3155
// execution of the loop body, where partitions may execute simultaneously.
3256
class WarpSchedule {
3357
static constexpr int kSentinel = -1;
3458

3559
public:
36-
// A partition has a stage and contains some operation. The stage of a
37-
// partition determines how many cycles the partition's outputs are buffered
38-
// relative to its consumers.
39-
class Partition {
40-
public:
41-
Partition(int idx, int stage) : idx(idx), stage(stage) {}
42-
43-
int getIndex() const { return idx; }
44-
int getStage() const { return stage; }
45-
ArrayRef<Operation *> getOps() const { return ops; }
46-
47-
void insert(Operation *op) { ops.push_back(op); }
48-
void remove(Operation *op) { ops.erase(llvm::find(ops, op)); }
49-
50-
private:
51-
void setIndex(int idx) { this->idx = idx; }
52-
friend class WarpSchedule;
53-
54-
// The partition number.
55-
int idx;
56-
// The stage of the partition.
57-
int stage;
58-
// The ops in the partition.
59-
SmallVector<Operation *> ops;
60-
};
61-
6260
// Create a new partition with a stage.
6361
Partition *addPartition(unsigned stage);
6462
// Update the op to partition mapping.

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,36 +26,27 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
2626
];
2727
}
2828

29-
def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
30-
let summary = "test assigning latencies to interesting ops ahead of pipelining";
29+
def TritonGPUAssignLatencies : Pass<"tritongpu-assign-latencies", "mlir::ModuleOp"> {
30+
let summary = "assign latencies to interesting ops ahead of pipelining";
3131

3232
let description = [{
33-
This is a test pass that tests `assignLatencies` method of `TritonGPUPipeline`.
33+
The `tritongpu-assign-latencies` pass assigns latencies to latency ops based
34+
on the number of stages.
3435
}];
3536

36-
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
37-
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
38-
"mlir::scf::SCFDialect",
39-
"mlir::arith::ArithDialect"];
40-
4137
let options = [
42-
Option<"numStages", "num-stages",
43-
"int32_t", /*default*/"3",
38+
Option<"numStages", "num-stages", "int32_t", /*default*/"3",
4439
"number of pipeline stages">
4540
];
4641
}
4742

48-
def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-loop", "mlir::ModuleOp"> {
49-
let summary = "test scheduling a loop for software pipelining";
43+
def TritonGPUScheduleLoops : Pass<"tritongpu-schedule-loops", "mlir::ModuleOp"> {
44+
let summary = "software pipeline loop scheduling";
5045

5146
let description = [{
52-
This is a test pass that tests `scheduleLoop` method of `TritonGPUPipeline`.
47+
The `tritongpu-schedule-loops` pass performs scheduling for loop pipelining
48+
for loops with latency ops.
5349
}];
54-
55-
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
56-
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
57-
"mlir::scf::SCFDialect",
58-
"mlir::arith::ArithDialect"];
5950
}
6051

6152
def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp"> {

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
1919
static const char *kLoopStageAttrName = "loop.stage";
2020
static const char *kLoopClusterAttrName = "loop.cluster";
2121
static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
22-
static const char *kAssignedStageAttrName = "ttg.assigned_stage";
23-
static const char *kAssignedClusterAttrName = "ttg.assigned_cluster";
2422

2523
//===----------------------------------------------------------------------===//
2624
// Hoisting Utilities

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,6 @@ namespace triton {
1313

1414
namespace gpu {
1515

16-
/// Discover operations that should become async and assign latencies to them
17-
/// based on the numStages value provided by the user.
18-
void assignLatencies(ModuleOp moduleOp, int numStages);
19-
20-
/// Schedule the loops based on the latencies assigned to the operations.
21-
void scheduleLoops(ModuleOp moduleOp);
22-
2316
/// Lower the loops to prepare them for pipeline expansion.
2417
void lowerLoops(ModuleOp moduleOp);
2518

@@ -115,6 +108,10 @@ class CoarseSchedule {
115108
bool insertDepsOfOp(Operation *op, int stage, CoarseSchedule::Cluster cluster,
116109
bool includeArg, bool insertIfEarlier = false);
117110

111+
// Remove empty stages and clusters from the schedule, adjusting the maximum
112+
// number of stages as appropriate.
113+
void shrinkToFit();
114+
118115
void erase(Operation *op) { opToStageAndCluster.erase(op); }
119116

120117
int count(Operation *op) { return opToStageAndCluster.count(op); }

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,6 @@ add_triton_library(TritonGPUTransforms
1616
Pipeliner/ScheduleLoops.cpp
1717
Pipeliner/WGMMAPipeline.cpp
1818
Pipeliner/PipelineExpander.cpp
19-
Pipeliner/TestPipelineAssignLatencies.cpp
20-
Pipeliner/TestPipelineScheduleLoop.cpp
2119
Pipeliner/TestPipelineLowerLoop.cpp
2220
Pipeliner/SoftwarePipeliner.cpp
2321
Pipeliner/TMAStoresPipeline.cpp
@@ -33,6 +31,7 @@ add_triton_library(TritonGPUTransforms
3331
WarpSpecialization/LoadMMASpecialization.cpp
3432
WarpSpecialization/Partition.cpp
3533
WarpSpecialization/OptimizePartitionWarps.cpp
34+
WarpSpecialization/PartitionBuilder.cpp
3635
WarpSpecialization/PartitionLoops.cpp
3736
WarpSpecialization/PartitionScheduling.cpp
3837
WarpSpecialization/RewritePartitionDependencies.cpp

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -18,12 +18,13 @@ namespace tt = mlir::triton;
1818
namespace ttg = mlir::triton::gpu;
1919
namespace ttng = mlir::triton::nvidia_gpu;
2020

21-
namespace mlir {
22-
namespace triton {
23-
namespace gpu {
24-
21+
namespace mlir::triton::gpu {
2522
namespace {
2623

24+
//===----------------------------------------------------------------------===//
25+
// assignLatencies
26+
//===----------------------------------------------------------------------===//
27+
2728
// Return true if the preconditions for pipelining the loop are met.
2829
bool preCondition(scf::ForOp forOp) {
2930
// Skip loop with distance > 1 for now.
@@ -293,6 +294,15 @@ class AssignMMALatencies {
293294
// MMA's users can be pushed to the next stage
294295
opLatency[&op] = 1;
295296
}
297+
// HACK: A pipelined MMA's latency should equal the number of buffers
298+
// for the accumulator, but when the user is in an `scf.if` in SWP,
299+
// the `scf.if` is pushed to the end of the loop rather than peeled
300+
// before the MMA op, requiring an extra buffer due to liverange
301+
// overlap. WS does not have this problem because the MMA is placed in
302+
// a different partition than the MMA's user, so we can correctly set the
303+
// latency.
304+
if (forOp->hasAttr(kWarpSpecializeAttrName))
305+
opLatency[&op] += 1;
296306
}
297307
}
298308
}
@@ -312,12 +322,13 @@ class AssignMMALatencies {
312322
}
313323
};
314324

315-
} // namespace
316-
317-
// Look for load ops that directly or indirectly feed into dot ops. Based
318-
// on the requested number of stages assign the latencies in a way that
319-
// cover all the stages with the sum of latencies in the chain from the first
320-
// load to the final dot op.
325+
// Discover operations that should become async and assign latencies to them
326+
// based on the numStages value provided by the user.
327+
//
328+
// Look for load ops that directly or indirectly feed into dot ops. Based on the
329+
// requested number of stages assign the latencies in a way that cover all the
330+
// stages with the sum of latencies in the chain from the first load to the
331+
// final dot op.
321332
void assignLatencies(ModuleOp moduleOp, int defaultNumStages) {
322333
SmallVector<scf::ForOp> loops;
323334
moduleOp->walk([&](scf::ForOp forOp) {
@@ -341,6 +352,21 @@ void assignLatencies(ModuleOp moduleOp, int defaultNumStages) {
341352
}
342353
serializeLatencies(moduleOp, opLatency);
343354
}
344-
} // namespace gpu
345-
} // namespace triton
346-
} // namespace mlir
355+
356+
} // namespace
357+
358+
//===----------------------------------------------------------------------===//
359+
// Pass Definition
360+
//===----------------------------------------------------------------------===//
361+
362+
#define GEN_PASS_DEF_TRITONGPUASSIGNLATENCIES
363+
#include "triton/Dialect/TritonGPU/Transforms/Passes.h.inc"
364+
365+
struct AssignLatencies
366+
: public impl::TritonGPUAssignLatenciesBase<AssignLatencies> {
367+
using TritonGPUAssignLatenciesBase::TritonGPUAssignLatenciesBase;
368+
369+
void runOnOperation() override { assignLatencies(getOperation(), numStages); }
370+
};
371+
372+
} // namespace mlir::triton::gpu

lib/Dialect/TritonGPU/Transforms/Pipeliner/PipeliningUtility.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -161,7 +161,7 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
161161
OpBuilder::InsertionGuard guard(rewriter);
162162
if (mlir::isMemoryEffectFree(op))
163163
return op;
164-
if (isa<LLVM::AssumeOp>(op))
164+
if (isa<LLVM::AssumeOp, ttng::FenceAsyncSharedOp>(op))
165165
return op;
166166
if (isa<ttg::AsyncCommitGroupOp, ttg::AsyncWaitOp>(op))
167167
return op;
@@ -264,7 +264,7 @@ Operation *mlir::triton::predicateOp(RewriterBase &rewriter, Operation *op,
264264
return op;
265265
}
266266

267-
op->emitError("pipeliner doesn't know how to predicate this op.");
267+
op->emitOpError("pipeliner doesn't know how to predicate this op.");
268268
llvm::report_fatal_error("Fatal pipeliner error");
269269
return op;
270270
}

lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,6 +87,19 @@ bool tt::CoarseSchedule::insertDepsOfOp(Operation *op, int stage,
8787
return inserted;
8888
}
8989

90+
void tt::CoarseSchedule::shrinkToFit() {
91+
int minStage = std::numeric_limits<int>::max();
92+
int maxStage = std::numeric_limits<int>::min();
93+
for (auto &[op, stageAndCluster] : opToStageAndCluster) {
94+
auto [stage, cluster] = stageAndCluster;
95+
minStage = std::min(minStage, stage);
96+
maxStage = std::max(maxStage, stage);
97+
}
98+
for (auto &[op, stageAndCluster] : opToStageAndCluster)
99+
stageAndCluster.first -= minStage;
100+
numStages = maxStage - minStage + 1;
101+
}
102+
90103
// Split the cluster containing op into two clusters, one containing all
91104
// operations before the op and one containing op and all operations after the
92105
// op. Return the cluster containing op and all operations after the op. Do not
@@ -282,7 +295,8 @@ void tt::scheduleDependencies(scf::ForOp forOp, tt::CoarseSchedule &schedule) {
282295
for (auto [op, stage_, cluster] : opsInOrder) {
283296
if (stage_ != stage)
284297
continue;
285-
schedule.insertDepsOfOp(op, stage, cluster, false);
298+
schedule.insertDepsOfOp(op, stage, cluster, /*includeArg=*/false,
299+
/*insertIfEarlier=*/true);
286300
}
287301
}
288302
}

0 commit comments

Comments
 (0)