
Commit 7af8cad

[TritonGPU] Support persistent matmul in warp specialization (#6239)
This PR extends the "pattern" for load-MMA warp specialization to support persistent kernels with MMAv5. It leverages more of the existing MMAv5 pipelining code in `TC05MMAPipeline.cpp`, primarily the analysis step, which determines whether the op can be pipelined and finds the accumulator override point; the analysis is therefore performed over the flattened loop. However, because warp specialization is asynchronous and cannot rely on execution order, a few cases accepted by the analysis cannot be codegen'd at the moment. (Likewise, some cases that could be codegen'd are not accepted by the analysis; these can be ironed out on an as-needed basis.)

At a high level, the extended "pattern" now looks for users of the accumulator other than the MMA op itself in the next iteration. If it finds any, it places those users in a new partition and adds additional synchronization, multi-buffering the accumulator if needed. This allows the epilogue, which is a conditional user of the accumulator, to be placed in its own partition, overlapping the epilogue with the load<->MMA loop. The accumulator can also be multi-buffered, enabling the next MMA to start running before the TMEM load completes in the user partition.

This PR has lots of code motion due to refactoring utilities to be more widely available:

* Move `MMAInfo` and the analysis that computes it into `MMAv5PipelineUtility.h`
* Move some utilities from `PipeliningUtility.h` to `Utility.h`
* Miscellaneous code cleanup and bugfixes along the way
* Fix lowering of tensordesc ops to insert an addrspacecast when the pointer types actually differ. Grid-constant tensordescs have to be in the generic address space due to an NVPTX backend restriction/bug, but we treat them as addrspace=1 pointers internally.
Performance results for `matmul_kernel_tma_persistent` with `M, N, K = 8192, 8192, 512` in `09-persistent-matmul.py`:

* With SWP, the best config is `BLOCK_{M,N,K} = (128, 256, 64)`, 4 stages and 4 warps, at 1088 TFLOPS
* With WS, the best config is `BLOCK_{M,N,K} = (128, 256, 128)`, 4 stages and 4 warps, at 1140 TFLOPS

That's a ~5% increase in performance.
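For context, the relationship between the reported TFLOPS numbers and kernel runtime follows from the matmul flop count. A quick sketch of the arithmetic (illustrative only; the timings themselves come from the benchmark, not this snippet):

```python
# Rough arithmetic behind the reported TFLOPS numbers.
# A matmul of shape (M, N, K) performs 2 * M * N * K floating-point ops
# (one multiply and one add per inner-product element).
M, N, K = 8192, 8192, 512

flops = 2 * M * N * K  # total floating-point operations

# TFLOPS = flops / (elapsed_seconds * 1e12); inverting the reported
# numbers gives approximate per-kernel runtimes in milliseconds.
ms_swp = flops / 1088e12 * 1e3  # software-pipelined config
ms_ws = flops / 1140e12 * 1e3   # warp-specialized config

speedup = 1140 / 1088  # the ~5% improvement quoted in the message
```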
1 parent fcf33a3 · commit 7af8cad

29 files changed: +1700 −695 lines

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 74 additions & 9 deletions
```diff
@@ -1,40 +1,105 @@
 #ifndef TRITON_TRITONGPU_TRANSFORMS_MMAV5PIPELINEUTILITY_H_
 #define TRITON_TRITONGPU_TRANSFORMS_MMAV5PIPELINEUTILITY_H_

-#include <functional>
-#include <optional>
-#include <tuple>
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

 namespace mlir {
+
 class OpBuilder;
-class Operation;
+class DominanceInfo;

 namespace scf {
 class ForOp;
-}
+} // namespace scf

 namespace triton::nvidia_gpu {
-class MMAv5OpInterface;
-class TMEMAllocOp;
-class TMEMLoadOp;
+
+//===----------------------------------------------------------------------===//
+// MMAInfo
+//===----------------------------------------------------------------------===//
+
+// This struct contains analysis information about an MMAv5 operation inside a
+// loop used for pipelining MMA ops.
+struct MMAInfo {
+  // This struct contains information about when the MMA's accumulator is
+  // overridden in the loop, if it is at all.
+  struct AccOverridePoint {
+    // The operation which overrides the accumulator.
+    Operation *op;
+    // The condition on which the accumulator is reset.
+    Value condition = nullptr;
+    // The initial value of the accumulator and the value after a reset.
+    Value initValue = nullptr;
+    // The number of loop iterations ago the accumulator was reset.
+    int distance = 0;
+    // Whether the accumulator is reset via setting the `useAcc` flag to false
+    // or by clearing the accumulator tensor value.
+    bool isFlag = false;
+  };
+
+  // The TMEM allocation of the accumulator, which directly precedes the dot op.
+  TMEMAllocOp accAlloc;
+  // The TMEM load of the accumulator value out of TMEM, which directly follows
+  // the dot op.
+  TMEMLoadOp accLoad;
+  // The override point of the accumulator value, if it is overridden in the
+  // loop. E.g. this is typically present for persistent kernels.
+  std::optional<AccOverridePoint> accDef;
+  // If the accumulator is used in future iterations of the loop, this is the
+  // iter arg number.
+  std::optional<int> yieldArgNo;
+  // Whether the accumulator needs to be multibuffered.
+  bool accIsMultiBuffered;
+
+  Value phase = nullptr;
+  Value barrierIdx = nullptr;
+  Value accInsertIdx = nullptr;
+  Value accExtractIdx = nullptr;
+  Value barrierAlloc = nullptr;
+};
+
+//===----------------------------------------------------------------------===//
+// MMA Pipeline Analysis
+//===----------------------------------------------------------------------===//

 // Returns the TMEMAllocOp and TMEMLoadOp that are used to allocate and load the
 // accumulator for the given MMA operation. The TMEMAllocOp and TMEMLoadOp must
 // be in the same region as the MMA operation.
 std::optional<std::pair<TMEMAllocOp, TMEMLoadOp>>
 getTMemAllocAndLoad(MMAv5OpInterface mmaOp);
+// Get immediate users of the accumulator within the current loop iteration.
+SmallVector<Operation *> getDirectAccUses(TMEMLoadOp accDef);
+// Analyze an MMA op inside a loop to determine information about how it can be
+// pipelined. Returns `std::nullopt` if it cannot be pipelined.
+std::optional<MMAInfo> getMMAInfo(scf::ForOp forOp, MMAv5OpInterface mmaOp,
+                                  DominanceInfo &domInfo);
+
+//===----------------------------------------------------------------------===//
+// MMA Pipeline Rewriters
+//===----------------------------------------------------------------------===//

 // Create a new TMEMAllocOp to use for the pipelined MMA operation. It is
 // optionally multi-buffered based on the number of stages.
 TMEMAllocOp createTMemAlloc(OpBuilder &builder, TMEMAllocOp oldTMemAllocOp,
                             bool multiBufferred, int numStages);

+// Create a store op of the initial value of the accumulator into the
+// potentially multi-buffered accumulator.
+void createInitStore(OpBuilder &builder, TMEMAllocOp allocOp, Value initVal,
+                     bool multiBufferred);
+
 // Return true if operands of the MMA operation are/are going to be pipelined
 // and multibuffered, enabling the MMA operation to be pipelined.
 bool mmaHasPipelineableOperands(
     MMAv5OpInterface mma, scf::ForOp forOp,
     std::function<bool(Operation *)> isLoadPipelineable);

-// Return true if the loop has a read-modify-write access to the accumulator.
+// Return true if the accumulator of an mma in subsequent iterations is either
+// independent from the previous iteration (overwritten) or completely reused,
+// without read-modify-write.
+// Otherwise, we can not pipeline the MMA, as we need to insert a wait after the
+// mma to read back the accumulator for RMW.
 bool hasAccReadModifyWrite(MMAv5OpInterface mma, scf::ForOp forOp);
+
 } // namespace triton::nvidia_gpu
 } // namespace mlir
```

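The `phase`/`accInsertIdx`/`accExtractIdx` values in `MMAInfo` follow the circular-buffer convention used throughout the pipeliner: an index wraps modulo the number of buffers, and a phase bit flips on each wraparound so barrier waits can distinguish consecutive passes over the same slot. A minimal sketch of that update (Python, illustrative only; the real code materializes this as arith ops on loop-carried values):

```python
def advance(index: int, phase: int, num_buffers: int):
    """Advance a circular-buffer index, flipping the phase on wraparound.

    Illustrative model of the index/phase arithmetic the pipeliner
    carries through the loop; not the actual implementation.
    """
    index += 1
    if index == num_buffers:
        return 0, phase ^ 1  # wrapped: restart at slot 0, flip parity
    return index, phase

# Walking 2 buffers for 5 iterations visits slots 0,1,0,1,0, with the
# phase flipping each time the walk wraps back to slot 0.
idx, ph = 0, 0
trace = []
for _ in range(5):
    trace.append((idx, ph))
    idx, ph = advance(idx, ph, num_buffers=2)
```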
include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 5 additions & 0 deletions
```diff
@@ -47,6 +47,9 @@ class WarpSchedule {
   void insert(Operation *op) { ops.push_back(op); }

 private:
+  void setIndex(int idx) { this->idx = idx; }
+  friend class WarpSchedule;
+
   // The partition number.
   int idx;
   // The stage of the partition.
@@ -57,6 +60,8 @@ class WarpSchedule {

   // Create a new partition with a stage.
   Partition *addPartition(unsigned stage);
+  // Give each partition a new index and order. The indices must be unique.
+  void reorderPartitions(ArrayRef<unsigned> order);

   // Get the partition the op belongs to.
   Partition *getPartition(Operation *op);
```

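The documented contract of `reorderPartitions` — each partition gets a fresh, unique index according to a permutation — can be modeled in a few lines. A Python sketch with a hypothetical dict stand-in for `Partition` (not the actual class):

```python
def reorder_partitions(partitions, order):
    """Give each partition a new index and position per `order`.

    `order[i]` is the new index of partition i; indices must be unique,
    mirroring the documented contract of WarpSchedule::reorderPartitions.
    Hypothetical model: partitions are plain dicts here.
    """
    assert sorted(order) == list(range(len(partitions))), "indices must be unique"
    reordered = [None] * len(partitions)
    for old_idx, new_idx in enumerate(order):
        partitions[old_idx]["idx"] = new_idx
        reordered[new_idx] = partitions[old_idx]
    return reordered

parts = [{"name": "load", "idx": 0},
         {"name": "mma", "idx": 1},
         {"name": "epilogue", "idx": 2}]
new_parts = reorder_partitions(parts, [2, 0, 1])
```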
include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 1 addition & 15 deletions
```diff
@@ -14,6 +14,7 @@ namespace triton {
 static const char *kNumStagesAttrName = "tt.num_stages";
 static const char *kDisallowAccMultiBufferAttrName =
     "tt.disallow_acc_multi_buffer";
+static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
@@ -38,17 +39,6 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
 // `tt.disallow_acc_multi_buffer` set to true.
 bool getDisallowAccMultiBuffer(scf::ForOp forOp);

-/// Visit the operands of `op` and the operands of any nested ops defined
-/// outside of `op`.
-void visitNestedOperands(Operation *op,
-                         function_ref<void(OpOperand &)> visitor);
-/// Visit the operands of `op` and the operands of any nested ops defined
-/// outside of `op`.
-void visitNestedOperands(Operation *op, function_ref<void(Value)> visitor);
-/// Get the operands of `op` and the operands of any nested ops defined outside
-/// of `op`.
-SetVector<Value> getNestedOperands(Operation *op);
-
 // Return the definition of the given value. If the value is a loop-carried
 // dependency, return the definition and the distance to it.
 std::pair<OpResult, int64_t> getDefinitionAndDistance(scf::ForOp forOp,
@@ -90,10 +80,6 @@ gpu::SharedEncodingTrait getSharedEncoding(RankedTensorType ty);
 // Get a shared encoding for a tensor based on its uses.
 gpu::SharedEncodingTrait getSharedEncoding(Operation *loadOp);

-// Erase the given loop carried values from the loop, where `loop` is replaced
-// with a new loop.
-void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
-
 // Get the number of stages to pipeline the loop with, if it is explicitly
 // specified.
 int getNumStagesOrDefault(scf::ForOp forOp, int defaultNumStages);
```

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 24 additions & 0 deletions
```diff
@@ -10,6 +10,7 @@
 #include <numeric>

 namespace mlir {
+class DominanceInfo;

 namespace triton {
 class ModuleAxisInfoAnalysis;
@@ -135,6 +136,8 @@ scf::ForOp replaceForOpWithNewSignature(
     SmallVectorImpl<std::tuple<Value, Value>> &replacements);
 scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
                                         ValueRange newIterOperands);
+Block::BlockArgListType addIterArgsToLoop(OpBuilder &rewriter, scf::ForOp &loop,
+                                          ValueRange newIterOperands);

 // Replace WhileOp with a new WhileOp with extra operands. The YieldOp is not
 // updated and needs to be updated separately for the loop to be correct.
@@ -213,6 +216,27 @@ triton::gpu::LocalAllocOp findShmemAlloc(Value operand);
 SmallVector<Operation *>
 getMMAsWithMultiBufferredOperands(scf::ForOp forOp,
                                   SmallVector<Operation *> &mmaOps);
+
+// Given a list of ops, find the nearest common dominator of all ops or return
+// null if one could not be found. The ops are allowed to be in different
+// regions. The result op is not necessarily one of the ops in the list.
+Operation *findNearestCommonDominator(ArrayRef<Operation *> ops,
+                                      DominanceInfo &domInfo);
+
+/// Visit the operands of `op` and the operands of any nested ops defined
+/// outside of `op`.
+void visitNestedOperands(Operation *op,
+                         function_ref<void(OpOperand &)> visitor);
+/// Visit the operands of `op` and the operands of any nested ops defined
+/// outside of `op`.
+void visitNestedOperands(Operation *op, function_ref<void(Value)> visitor);
+/// Get the operands of `op` and the operands of any nested ops defined outside
+/// of `op`.
+SetVector<Value> getNestedOperands(Operation *op);
+
+// Erase the given loop carried values from the loop, where `loop` is replaced
+// with a new loop.
+void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
 } // namespace mlir

 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
```

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
```diff
@@ -848,7 +848,7 @@ LogicalResult WarpYieldOp::verify() {
 static size_t getSharedMemorySize(Type type) {
   if (isa<IntegerType, FloatType>(type))
     return llvm::divideCeil(type.getIntOrFloatBitWidth(), 8);
-  if (isa<PointerType>(type))
+  if (isa<PointerType, TensorDescType>(type))
     return 8;
   if (auto desc = dyn_cast<MemDescType>(type)) {
     if (!isa<SharedMemorySpaceAttr>(desc.getMemorySpace()))
```

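The sizing rule above rounds a scalar's bit width up to whole bytes and treats pointers (and, after this change, tensor descriptors) as 8-byte handles. A Python rendition of that rule, under the assumption that descriptors are passed as pointer-like 8-byte values:

```python
def shared_memory_size(bit_width=None, is_pointer_like=False):
    """Byte size of a value when staged through shared memory.

    Sketch of the scalar/pointer cases of getSharedMemorySize:
    scalars round up to whole bytes (llvm::divideCeil(bitWidth, 8));
    pointers and tensor descriptors occupy 8 bytes. Illustrative only.
    """
    if is_pointer_like:
        return 8
    return (bit_width + 7) // 8  # ceiling division by 8

# An i1 still occupies a full byte; an i33 needs five bytes.
```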
lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
```diff
@@ -12,6 +12,7 @@ add_triton_library(TritonGPUTransforms
   OptimizeThreadLocality.cpp
   Pipeliner/AssignLatencies.cpp
   Pipeliner/LowerLoops.cpp
+  Pipeliner/MMAv5PipelineUtility.cpp
   Pipeliner/ScheduleLoops.cpp
   Pipeliner/WGMMAPipeline.cpp
   Pipeliner/PipelineExpander.cpp
```

lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp

Lines changed: 7 additions & 0 deletions
```diff
@@ -900,6 +900,13 @@ static void fuseOneLevel(LoopNestNode *parent, mlir::DominanceInfo &domInfo) {
     epilogueIf.erase();
   }

+  // Propagate warp specialization flags.
+  if (outer->hasAttr(kWarpSpecializeAttrName) ||
+      llvm::any_of(innerLoops, [](scf::ForOp loop) {
+        return loop->hasAttr(kWarpSpecializeAttrName);
+      }))
+    fused->setAttr(kWarpSpecializeAttrName, b.getUnitAttr());
+
   // Propagate the `tt.disallow_acc_multi_buffer` attribute to the parent loop.
   bool disallowAccMultiBuffer = getDisallowAccMultiBuffer(outer);
   for (scf::ForOp loop : innerLoops) {
```

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 0 additions & 29 deletions
```diff
@@ -46,35 +46,6 @@ bool aliasingStoresBetween(Operation *op, ttng::TMEMStoreOp store) {
   return false;
 }

-Operation *findNearestCommonDominator(ArrayRef<Operation *> ops,
-                                      DominanceInfo &domInfo) {
-  if (ops.size() == 0) {
-    return nullptr;
-  }
-  if (ops.size() == 1) {
-    return ops[0];
-  }
-  llvm::SmallPtrSet<Block *, 16> blocks;
-  for (auto op : ops) {
-    blocks.insert(op->getBlock());
-  }
-  Block *domBlock = domInfo.findNearestCommonDominator(blocks);
-  if (domBlock == nullptr) {
-    return nullptr;
-  }
-  SmallVector<Operation *> ancestorOps;
-  for (auto op : ops) {
-    ancestorOps.push_back(domBlock->findAncestorOpInBlock(*op));
-  }
-  Operation *dom = ancestorOps[0];
-  for (unsigned i = 1; i < ops.size(); i++) {
-    if (ancestorOps[i]->isBeforeInBlock(dom)) {
-      dom = ancestorOps[i];
-    }
-  }
-  return dom;
-}
-
 class CombineTMEMStoreAndSelect : public OpRewritePattern<ttng::TMEMStoreOp> {
 using OpRewritePattern::OpRewritePattern;
```

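The relocated `findNearestCommonDominator` is essentially a lowest-common-ancestor query on the dominator tree (followed by picking the earliest ancestor op inside the dominating block). The tree part can be sketched as a standard walk-up-to-equal-depth LCA; Python, with a hypothetical `idom` map in place of MLIR's `DominanceInfo`:

```python
def nearest_common_dominator(nodes, idom):
    """LCA over a dominator tree given an immediate-dominator map.

    `idom[n]` is n's immediate dominator; the root maps to itself.
    Illustrative of the block-level step of findNearestCommonDominator,
    not the MLIR implementation itself.
    """
    def depth(n):
        d = 0
        while idom[n] != n:
            n, d = idom[n], d + 1
        return d

    def lca(a, b):
        da, db = depth(a), depth(b)
        while da > db:            # lift the deeper node first
            a, da = idom[a], da - 1
        while db > da:
            b, db = idom[b], db - 1
        while a != b:             # then lift both until they meet
            a, b = idom[a], idom[b]
        return a

    if not nodes:
        return None
    result = nodes[0]
    for n in nodes[1:]:
        result = lca(result, n)
    return result

# entry dominates A and B; A dominates C.
idom = {"entry": "entry", "A": "entry", "B": "entry", "C": "A"}
```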
lib/Dialect/TritonGPU/Transforms/OptimizeAccumulatorInit.cpp

Lines changed: 1 addition & 4 deletions
```diff
@@ -211,10 +211,7 @@ class OptimizeAccumulatorInitPass
     }

     Value loopArgFlagValue = loopArgIsZero ? vFalse : vTrue;
-    scf::ForOp newForOp =
-        replaceForOpWithNewSignature(rewriter, forOp, {loopArgFlagValue});
-    forOp.erase();
-    forOp = newForOp;
+    (void)addIterArgsToLoop(rewriter, forOp, {loopArgFlagValue});
     loopArgFlagValue =
         forOp.getRegionIterArg(forOp.getNumRegionIterArgs() - 1);
```

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 10 additions & 16 deletions
```diff
@@ -590,13 +590,10 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
   }

   // Patch the loop to add the new loop carried dependencies.
-  scf::ForOp newForOp =
-      replaceForOpWithNewSignature(builder, forOp, newOperands);
-  forOp.erase();
-  forOp = newForOp;
+  (void)addIterArgsToLoop(builder, forOp, newOperands);

   // Update yield op with temporary yield values
-  auto forYield = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
+  auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
   for (unsigned i = 0; i < newOperands.size(); ++i) {
     forYield.getResultsMutable().append(newOperands[i]);
   }
@@ -605,13 +602,13 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
   loc = forOp.getLoc();
   int argIdx = newOperandIndex;
   for (auto &[numBuffers, loadGroup] : loadGroups) {
-    Value insertIdx = newForOp.getBody()->getArgument(argIdx);
+    Value insertIdx = forOp.getBody()->getArgument(argIdx);
     argIdx++;
-    Value extractIdx = newForOp.getBody()->getArgument(argIdx);
+    Value extractIdx = forOp.getBody()->getArgument(argIdx);
     argIdx++;
     Value phase = nullptr;
     if (loadGroup.hasTMALoad) {
-      phase = newForOp.getBody()->getArgument(argIdx);
+      phase = forOp.getBody()->getArgument(argIdx);
      argIdx++;
    }

@@ -821,25 +818,22 @@ scf::ForOp lowerTMADescriptors(scf::ForOp forOp, CoarseSchedule &schedule) {
     newOperands.push_back(zero);
   }

-  scf::ForOp newForOp =
-      replaceForOpWithNewSignature(builder, forOp, newOperands);
-  forOp.erase();
-  forOp = newForOp;
+  (void)addIterArgsToLoop(builder, forOp, newOperands);

-  auto tmaCounters = ArrayRef<BlockArgument>(newForOp.getBody()->getArguments())
+  auto tmaCounters = ArrayRef<BlockArgument>(forOp.getBody()->getArguments())
                          .slice(tmaCounterArgsStartIdx);

   // Update yield op with temporary yield values
-  auto forYield = cast<scf::YieldOp>(newForOp.getBody()->getTerminator());
+  auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
   for (unsigned i = 0; i < newOperands.size(); ++i) {
     forYield.getResultsMutable().append(newOperands[i]);
   }

-  if (failed(rewriteTMABufferUpdates(newForOp, tmaBufferMapping, tmaCounters,
+  if (failed(rewriteTMABufferUpdates(forOp, tmaBufferMapping, tmaCounters,
                                      maxStage, one, zero, schedule))) {
     llvm_unreachable("Failed to rewrite TMA ops");
   }
-  return newForOp;
+  return forOp;
 }

 /////////////////////////////
```
