Commit 44ecbec

[PIPELINE] Re-enabling mmav5 pipelining after fixing some performance regressions (#6761)
The previous attempt at reworking `ttng.wait_barrier` placement (triton-lang/triton#6613) and enabling mmav5 pipelining (triton-lang/triton#6732) was overly conservative about overlapping an mmav5 op with itself when the conditional `tmem_load` was placed in the same stage as the mma op. We can still overlap such cases; we just need to add the wait to the conditional block containing the `tmem_load`. This PR addresses that by adding a second "latency" kind to the mma op, `tt.self_latency`, which expresses to which stage the `wait_barrier` should be pushed. It also paves the way for letting the user independently control in which stage the users of the mma are placed (controlling the tmem buffer count) and how the mma overlaps with itself.
1 parent 8557148 commit 44ecbec
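With this change an MMA op can carry two independent latencies: the existing latency controls how many stages later the op's users (e.g. the conditional `tmem_load`) are scheduled, which drives the tmem buffer count, while the new self latency controls how many stages the op's own `wait_barrier` may be pushed back, i.e. how deeply the MMA overlaps with itself. A minimal sketch of reading the two values back during scheduling; the helper below is hypothetical, and the attribute spellings `tt.latency`/`tt.self_latency` follow the dialect definition and the commit message:

```cpp
#include "mlir/IR/BuiltinAttributes.h"
#include "mlir/IR/Operation.h"

using namespace mlir;

// Hypothetical helper: read an integer latency attribute, defaulting to 0
// when the op does not carry it.
static int getLatencyAttr(Operation *op, llvm::StringRef name) {
  if (auto attr = op->getAttrOfType<IntegerAttr>(name))
    return attr.getInt();
  return 0;
}

// Usage (mma is the MMAv5 op inside the loop being pipelined):
//   int latency     = getLatencyAttr(mma, "tt.latency");      // users -> later stage
//   int selfLatency = getLatencyAttr(mma, "tt.self_latency"); // wait_barrier -> later stage
```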

File tree

16 files changed: +657 −378 lines

include/triton/Dialect/Triton/IR/TritonDialect.td

Lines changed: 2 additions & 1 deletion

@@ -45,7 +45,8 @@ def Triton_Dialect : Dialect {
 
   let discardableAttrs = (ins
     "::mlir::IntegerAttr":$num_stages,
-    "::mlir::IntegerAttr":$latency
+    "::mlir::IntegerAttr":$latency,
+    "::mlir::IntegerAttr":$self_latency
   );
 
   let hasConstantMaterializer = 1;

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 31 additions & 11 deletions

@@ -20,11 +20,37 @@ namespace triton::nvidia_gpu {
 // Given an MMAv5 operation in a loop, determine if its accumulator can be
 // multibuffered.
 bool isAccMultibufferingPossible(MMAv5OpInterface mma, scf::ForOp forOp);
-// Only pipeline the loops where the MMA happens before the tmem_load, or is in
-// the same stage as the tmem_load. Lowering does not support the case where the
-// MMA is in a different stage as the tmem_load and happens after it.
-bool mmav5DominatesTmemLoads(
-    scf::ForOp forOp, function_ref<bool(MMAv5OpInterface)> isMmaPipelineable);
+
+// Returns true if the MMA operation requires acc multi-buffering when
+// pipelined.
+bool requiresAccMultiBuffering(MMAv5OpInterface mma, scf::ForOp forOp);
+
+// Returns true if there are loads from tmem after the MMA operation.
+bool hasLoadsAfterMMA(MMAv5OpInterface mma, scf::ForOp forOp);
+
+// Helper class to determine if the operands of an MMA operation are
+// pipelineable.
+class MMAv5PipelineableOperandsHelper {
+public:
+  MMAv5PipelineableOperandsHelper(
+      MMAv5OpInterface mmaOp, scf::ForOp forOp,
+      std::function<bool(Operation *)> isLoadToBePipelined)
+      : mmaOp(mmaOp), forOp(forOp), isLoadToBePipelined(isLoadToBePipelined) {
+    run();
+  }
+  bool isPipelineable = false;
+  // If true, the existing operand loads have all been found and their
+  // pipelineability has been determined.
+  bool isOperandsStateDetermined = false;
+  SmallVector<Operation *> unpipelineableOperandLoads;
+
+private:
+  MMAv5OpInterface mmaOp;
+  scf::ForOp forOp;
+  std::function<bool(Operation *)> isLoadToBePipelined;
+  bool comesFromLoadOrOutsideLoop(Value v, Operation *&foundLoad);
+  void run();
+};
 
 //===----------------------------------------------------------------------===//
 // MMA Pipeline Rewriters
@@ -35,12 +61,6 @@ bool mmav5DominatesTmemLoads(
 TMEMAllocOp createTMemAlloc(OpBuilder &builder, TMEMAllocOp oldTMemAllocOp,
                             bool multiBufferred, int numStages);
 
-// Return true if operands of the MMA operation are/are going to be pipelined
-// and multibuffered, enabling the MMA operation to be pipelined.
-bool mmaHasPipelineableOperands(
-    MMAv5OpInterface mma, scf::ForOp forOp,
-    std::function<bool(Operation *)> isLoadPipelineable);
-
 // Return true if the accumulator of an mma in subsequent iterations is either
 // independent from the previous iteration (overwritten) or completely reused,
 // without read-modify-write.
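The deleted `mmaHasPipelineableOperands` free function is superseded by this helper: callers construct it and inspect the result fields. A rough usage fragment, assuming `mma`, `forOp`, and an `opLatency` map are in scope (the lambda mirrors the one in AssignLatencies.cpp further down):

```cpp
// Illustrative fragment only, not part of the diff.
auto isLoadToBePipelined = [&](Operation *op) {
  return opLatency.count(op) && opLatency[op] > 0;
};
auto pipeHelper =
    ttng::MMAv5PipelineableOperandsHelper(mma, forOp, isLoadToBePipelined);
if (pipeHelper.isPipelineable) {
  // The MMA's operands are (or will be) pipelined and multibuffered, so the
  // MMA itself can be pipelined.
} else if (pipeHelper.isOperandsStateDetermined) {
  // The operand loads are known, but the ones listed in
  // pipeHelper.unpipelineableOperandLoads will not be pipelined.
}
```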

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 8 additions & 0 deletions

@@ -91,6 +91,11 @@ int getCopyVecBytes(RankedTensorType registerTy,
 // attribute.
 void serializeLatencies(ModuleOp module, DenseMap<Operation *, int> &opLatency);
 
+// Serialize the self latencies of the operations in the loops into the
+// self_latency attribute.
+void serializeSelfLatencies(ModuleOp module,
+                            DenseMap<Operation *, int> &opSelfLatency);
+
 // Deserialize the latencies of the operations in the loops from the attribute.
 DenseMap<Operation *, int> deserializeLatencies(Operation *op);
 
@@ -107,6 +112,9 @@ Value createAlloc(scf::ForOp forOp, RankedTensorType ty, Location loc,
 // Determine if the operation is a TMA load.
 bool isTMALoad(Operation *op);
 
+// Determine if the operation can be lowered to an async load.
+bool canBeAsyncLoad(Operation *op);
+
 // Look for consecutive wait ops and combine them into a single wait op.
 void combineRedundantWaitOps(
     llvm::SmallSetVector<gpu::AsyncWaitOp, 8> &waitOps);
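`serializeSelfLatencies` presumably mirrors the existing `serializeLatencies`, writing each map entry onto its op as an integer attribute. A hedged sketch of what such an implementation could look like (not the actual body from this commit; the `tt.self_latency` spelling comes from the commit message):

```cpp
#include "mlir/IR/Builders.h"
#include "mlir/IR/BuiltinOps.h"
#include "llvm/ADT/DenseMap.h"

using namespace mlir;

// Hedged sketch: attach each computed self latency to its op as the
// tt.self_latency integer attribute, mirroring serializeLatencies.
void serializeSelfLatencies(ModuleOp module,
                            DenseMap<Operation *, int> &opSelfLatency) {
  Builder b(module.getContext());
  for (auto &[op, selfLatency] : opSelfLatency)
    op->setAttr("tt.self_latency", b.getI32IntegerAttr(selfLatency));
}
```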

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 14 additions & 0 deletions

@@ -125,6 +125,20 @@ class CoarseSchedule {
 
   auto find(Operation *op) const { return opToStageAndCluster.find(op); }
 
+  // Split the cluster containing op into two clusters, one containing all
+  // operations before the op and one containing op and all operations after the
+  // op. Return the cluster containing op and all operations after the op.
+  Cluster splitClusterBefore(Operation *op, scf::ForOp forOp);
+
+  // Check if op a will show up before op b in the final unrolled code.
+  bool isOpBefore(Operation *a, Operation *b);
+
+  // Check if op a is in an earlier cluster than op b.
+  bool isOpInEarlierCluster(Operation *a, Operation *b);
+
+  // Check if op a is in the same cluster as op b.
+  bool isOpInSameCluster(Operation *a, Operation *b);
+
   SmallVector<std::tuple<Operation *, int, Cluster>>
   getOpsInOrder(scf::ForOp forOp);
   std::vector<std::pair<Operation *, unsigned>>
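These predicates and `splitClusterBefore` give the MMA pipelining rewriter a way to reason about relative op placement when deciding where a `wait_barrier` can live. A hedged usage fragment; `schedule`, `forOp`, `mma`, and `tmemLoad` are assumed to be in scope and are not part of this diff:

```cpp
// Illustrative fragment only: if the MMA is emitted before the tmem_load but
// they share a cluster, split that cluster so a wait inserted at the head of
// the new cluster runs right before the load.
if (schedule.isOpBefore(mma, tmemLoad) &&
    schedule.isOpInSameCluster(mma, tmemLoad)) {
  CoarseSchedule::Cluster tail = schedule.splitClusterBefore(tmemLoad, forOp);
  // `tail` now contains tmemLoad and everything scheduled after it.
  (void)tail;
}
```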

include/triton/Tools/Sys/GetEnv.hpp

Lines changed: 1 addition & 2 deletions

@@ -43,8 +43,7 @@ inline const std::set<std::string> CACHE_INVALIDATING_ENV_VARS = {
     "NVPTX_ENABLE_DUMP",
     "STORE_TMEM_TO_GLOBAL_BYPASS_SMEM",
     "ALLOW_LHS_TMEM_LAYOUT_CONVERSION",
-    "TRITON_F32_DEFAULT",
-    "ENABLE_MMA_V5_ATT_PIPELINE"
+    "TRITON_F32_DEFAULT"
     // clang-format on
 };
 

lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp

Lines changed: 35 additions & 16 deletions

@@ -260,37 +260,56 @@ class AssignMMALatencies {
       : forOp(forOp), opLatency(opLatency) {};
 
   void run() {
-    if (!triton::tools::getBoolEnv("ENABLE_MMA_V5_ATT_PIPELINE")) {
-      int mmav5Count = 0;
-      for (auto &op : forOp.getBody()->without_terminator()) {
-        if (isa<ttng::MMAv5OpInterface>(&op)) {
-          mmav5Count++;
-        }
-      }
-      if (mmav5Count > 1)
-        return;
-    }
+    DenseMap<Operation *, int> mmaSelfLatency;
     // Check if the load op (mma operand) is pipelineable.
-    auto isLoadPipelineable = [&](Operation *op) {
+    auto isLoadToBePipelined = [&](Operation *op) {
       return opLatency.count(op) && opLatency[op] > 0;
     };
     for (auto &op : forOp.getBody()->without_terminator()) {
       // If the acc can not be multibuffered, do not pipeline the uses of
       // the MMA to later stages.
       if (auto mma = dyn_cast<ttng::MMAv5OpInterface>(&op)) {
-        if (ttng::mmaHasPipelineableOperands(mma, forOp, isLoadPipelineable) &&
-            !ttng::hasAccReadModifyWrite(mma, forOp) &&
-            ttng::isAccMultibufferingPossible(mma, forOp) &&
-            !getDisallowAccMultiBuffer(forOp)) {
-          opLatency[&op] = 1;
+        // Try to push out the wait by one stage even if the operands are not
+        // pipelineable, as long as we know where the loads are scheduled, so
+        // we can place the wait right before the loads.
+
+        if (hasSyncDots(forOp)) {
+          // Skip pipelining MMAs in loops where sync dots are used. This is a
+          // dirty heuristic for performance drops in kernels where we would
+          // rather have the last iteration peeled instead of a full iteration
+          // of masked operations just to execute a single wait.
+          continue;
+        }
+        auto pipeHelper = ttng::MMAv5PipelineableOperandsHelper(
+            mma, forOp, isLoadToBePipelined);
+        if (pipeHelper.isPipelineable ||
+            (pipeHelper.isOperandsStateDetermined &&
+             !ttng::hasLoadsAfterMMA(mma, forOp))) {
+          // MMA can be overlapped with itself.
+          mmaSelfLatency[mma] = 1;
+          if (!ttng::requiresAccMultiBuffering(mma, forOp) ||
+              (ttng::isAccMultibufferingPossible(mma, forOp) &&
+               !getDisallowAccMultiBuffer(forOp))) {
+            // The MMA's users can be pushed to the next stage.
+            opLatency[&op] = 1;
+          }
         }
       }
     }
+    serializeSelfLatencies(forOp->getParentOfType<ModuleOp>(), mmaSelfLatency);
   }
 
 private:
   scf::ForOp forOp;
   DenseMap<Operation *, int> &opLatency;
+
+  bool hasSyncDots(scf::ForOp forOp) {
+    for (auto &op : forOp.getBody()->without_terminator()) {
+      if (isa<mlir::triton::DotOp>(op))
+        return true;
+    }
+    return false;
+  }
 };
 
 } // namespace
