openxla
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Passes.td b/include/triton/Dialect/TritonGPU/Transforms/Passes.td
Lines changed: 13 additions & 0 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h b/include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h
Lines changed: 7 additions & 0 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Schedule.h b/include/triton/Dialect/TritonGPU/Transforms/Schedule.h
Lines changed: 8 additions & 0 deletions
diff --git a/include/triton/Dialect/TritonGPU/Transforms/Utility.h b/include/triton/Dialect/TritonGPU/Transforms/Utility.h
Lines changed: 4 additions & 0 deletions
diff --git a/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt b/lib/Dialect/TritonGPU/Transforms/CMakeLists.txt
Lines changed: 1 addition & 0 deletions
@@ -179,4 +179,17 @@ def TritonGPUOptimizeAccumulatorInit: Pass<"tritongpu-optimize-accumulator-init"
                            "mlir::triton::TritonDialect"];
 }
 
+def TritonGPULoopScheduling: Pass<"tritongpu-loop-scheduling", "mlir::ModuleOp"> {
+  let summary = "Generate loop scheduling for SWP"; // SWP: software pipelining
+
+  let description = "This pass sets up stages and clustering for software pipelining.";
+
+  let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
+                           "mlir::triton::TritonDialect"];
+  let options = [ // Option<C++ variable, CLI argument, type, default, description>
+    Option<"numStages", "num-stages",
+           "int32_t", /*default*/"3",
+           "number of pipeline stages">
+  ];
+}
 #endif
@@ -8,6 +8,8 @@ namespace mlir {
 namespace triton {
 
 static const char *kNumStagesAttrName = "tt.num_stages";
+static const char *kLoopStageAttrName = "loop.stage"; // NOTE(review): presumably the per-op pipeline stage attribute — confirm
+static const char *kLoopClusterAttrName = "loop.cluster"; // NOTE(review): presumably the per-op ordering cluster attribute — confirm
 
 /// Function to mask operations during scheduling.
 Operation *predicateOp(RewriterBase &rewriter, Operation *op, Value pred);
@@ -29,6 +31,11 @@ void addOps(scf::ForOp forOp, int stage,
 /// mutable.
 void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
                                  Value val);
+
+// getMinMaxCluster returns {minClusterId, maxClusterId} for the given ForOp.
+std::pair<int, int> getMinMaxCluster(scf::ForOp &forOp);
+std::pair<int, int> getStageCluster(Operation *op); // NOTE(review): presumably {stage, cluster} of op — confirm
+void setStageCluster(scf::ForOp &forOp, Operation *op, int stage, int cluster);
 } // namespace triton
 } // namespace mlir
 
 
@@ -101,8 +101,16 @@ class CoarseSchedule {
   createFinalSchedule(scf::ForOp forOp);
   void dump();
   bool empty() { return opToStageAndCluster.size() == 0; }
+  void serialize(scf::ForOp &forOp); // NOTE(review): presumably the inverse of deSerialize — confirm
+  // deSerialize: create a CoarseSchedule based on forOp's <stage, cluster>.
+  void deSerialize(scf::ForOp &forOp);
 };
 
+// Add dependencies of anchor ops to the coarse schedule `schedule`, placing
+// them in the same stage and ordering cluster as the anchor op.
+void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule,
+                          int numStages); // NOTE(review): role of numStages is not documented here
+
 } // namespace triton
 } // namespace mlir
 #endif // TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_
@@ -192,6 +192,10 @@ bool isPureUnaryInlineAsm(Operation *op);
 // read the compute capability from the module attributes
 int getNVIDIAComputeCapability(Operation *module);
 
+std::optional<mlir::triton::gpu::SharedEncodingAttr>
+getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible); // NOTE(review): `incompatible` is a mutable reference — presumably an output flag; confirm
+
+bool loadIsMMAv3(Operation *loadOp);
 } // namespace mlir
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
@@ -3,6 +3,7 @@ add_triton_library(TritonGPUTransforms
   Coalesce.cpp
   F32DotTC.cpp
   CombineTensorSelectAndIf.cpp
+  LoopScheduling.cpp
   ReduceDataDuplication.cpp
   OptimizeAccumulatorInit.cpp
   OptimizeDotOperands.cpp