intel
diff --git a/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 0 additions & 6 deletions b/‎include/triton/Conversion/TritonGPUToLLVM/Utility.h‎
Lines changed: 0 additions & 6 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h‎
Lines changed: 8 additions & 55 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h‎
Lines changed: 8 additions & 55 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h‎
Lines changed: 6 additions & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Utility.h‎
Lines changed: 6 additions & 0 deletions
diff --git a/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp‎
Lines changed: 0 additions & 12 deletions b/‎lib/Conversion/TritonGPUToLLVM/Utility.cpp‎
Lines changed: 0 additions & 12 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp‎
Lines changed: 2 additions & 51 deletions b/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/AssignLatencies.cpp‎
Lines changed: 2 additions & 51 deletions
@@ -53,9 +53,6 @@ createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic,
                           TypeRange types, ValueRange args);
 } // namespace mlir::LLVM
 
-// Is v an integer or floating-point scalar constant equal to 0?
-bool isConstantZero(Value v);
-
 namespace mlir::triton {
 
 struct TritonLLVMOpBuilder {
@@ -348,9 +345,6 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
 namespace LLVM {
 using namespace mlir::triton;
 
-// Is v an integer or floating-point scalar constant equal to 0?
-bool isConstantZero(Value v);
-
 class SharedMemoryObject {
 public:
   SharedMemoryObject(Value base, Type baseElemType, ArrayRef<Value> offsets)
 
@@ -13,50 +13,6 @@ class ForOp;
 } // namespace scf
 namespace triton::nvidia_gpu {
 
-//===----------------------------------------------------------------------===//
-// MMAInfo
-//===----------------------------------------------------------------------===//
-
-// This struct contains analysis information about an MMAv5 operation inside a
-// loop used for pipelining MMA ops.
-struct MMAInfo {
-  // This struct contains information about when the MMA's accumulator is
-  // overridden in the loop, if it is at all.
-  struct AccOverridePoint {
-    // The operation which overrides the accumulator.
-    Operation *op;
-    // The condition on which the accumulator is reset.
-    Value condition = nullptr;
-    // The initial value of the accumulator and the value after a reset.
-    Value initValue = nullptr;
-    // The number of loop iterations ago the accumulator was reset.
-    int distance = 0;
-    // Whether the accumulator is reset via setting the `useAcc` flag to false
-    // or by clearing the accumulator tensor value.
-    bool isFlag = false;
-  };
-
-  // The TMEM allocation of the accumuator, which directly precedes the dot op.
-  TMEMAllocOp accAlloc;
-  // The TMEM load of the accumulator value out of TMEM, which directly follows
-  // the dot op.
-  TMEMLoadOp accLoad;
-  // The override point of the accumulator value, if it is overriden in the
-  // loop. E.g. this is typically present for persistent kernels.
-  std::optional<AccOverridePoint> accDef;
-  // If the accumulator is used in future iterations of the loop, this is the
-  // iter arg number.
-  std::optional<int> yieldArgNo;
-  // Whether the accumulator needs to be multibuffered.
-  bool accIsMultiBuffered;
-
-  Value phase = nullptr;
-  Value barrierIdx = nullptr;
-  Value accInsertIdx = nullptr;
-  Value accExtractIdx = nullptr;
-  Value barrierAlloc = nullptr;
-};
-
 //===----------------------------------------------------------------------===//
 // MMA Pipeline Analysis
 //===----------------------------------------------------------------------===//
@@ -66,12 +22,14 @@ struct MMAInfo {
 // be in the same region as the MMA operation.
 std::optional<std::pair<TMEMAllocOp, TMEMLoadOp>>
 getTMemAllocAndLoad(MMAv5OpInterface mmaOp);
-// Get immediate users of the accumulator within the current loop iteration.
-SmallVector<Operation *> getDirectAccUses(TMEMLoadOp accDef);
-// Analyze an MMA op inside a loop to determine information about how it can be
-// pipelined. Returns `std::nullopt` if it cannot be pipelined.
-std::optional<MMAInfo> getMMAInfo(scf::ForOp forOp, MMAv5OpInterface mmaOp,
-                                  DominanceInfo &domInfo);
+// Given an MMAv5 operation in a loop, determine if its accumulator can be
+// multibuffered.
+bool isAccMultibufferingPossible(MMAv5OpInterface mma, scf::ForOp forOp);
+// Only pipeline the loops where the MMA happens before the tmem_load, or is in
+// the same stage as the tmem_load. Lowering does not support the case where the
+// MMA is in a different stage than the tmem_load and happens after it.
+bool mmav5DominatesTmemLoads(
+    scf::ForOp forOp, function_ref<bool(MMAv5OpInterface)> isMmaPipelineable);
 
 //===----------------------------------------------------------------------===//
 // MMA Pipeline Rewriters
@@ -82,11 +40,6 @@ std::optional<MMAInfo> getMMAInfo(scf::ForOp forOp, MMAv5OpInterface mmaOp,
 TMEMAllocOp createTMemAlloc(OpBuilder &builder, TMEMAllocOp oldTMemAllocOp,
                             bool multiBufferred, int numStages);
 
-// Create a store op of the initial value of the accumulator into the
-// potentially multi-buffered accumulator.
-void createInitStore(OpBuilder &builder, TMEMAllocOp allocOp, Value initVal,
-                     bool multiBufferred);
-
 // Return true if operands of the MMA operation are/are going to be pipelined
 // and multibuffered, enabling the MMA operation to be pipelined.
 bool mmaHasPipelineableOperands(
 
@@ -11,6 +11,7 @@
 
 namespace mlir {
 class DominanceInfo;
+class PostDominanceInfo;
 
 namespace triton {
 class ModuleAxisInfoAnalysis;
@@ -222,6 +223,11 @@ getMMAsWithMultiBufferredOperands(scf::ForOp forOp,
 // regions. The result op is not necessarily one of the ops in the list.
 Operation *findNearestCommonDominator(ArrayRef<Operation *> ops,
                                       DominanceInfo &domInfo);
+// Given a list of ops, find the nearest common postdominator of all ops or
+// return null if one could not be found. The ops are allowed to be in different
+// regions. The result op is not necessarily one of the ops in the list.
+Operation *findNearestCommonPostDominator(ArrayRef<Operation *> ops,
+                                          PostDominanceInfo &postDomInfo);
 
 /// Visit the operands of `op` and the operands of any nested ops defined
 /// outside of `op`.
 
@@ -667,18 +667,6 @@ createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic,
   return op;
 }
 
-bool isConstantZero(Value v) {
-  if (auto constantOp = v.getDefiningOp<arith::ConstantOp>()) {
-    if (auto attr = dyn_cast<IntegerAttr>(constantOp.getValue())) {
-      return attr.getValue().isZero();
-    }
-    if (auto attr = dyn_cast<FloatAttr>(constantOp.getValue())) {
-      return attr.getValue().isZero();
-    }
-  }
-  return false;
-}
-
 Value getStructFromSharedMemoryObject(Location loc,
                                       const SharedMemoryObject &smemObj,
                                       RewriterBase &rewriter) {
 
@@ -278,8 +278,8 @@ class AssignMMALatencies {
       if (auto mma = dyn_cast<ttng::MMAv5OpInterface>(&op)) {
         if (ttng::mmaHasPipelineableOperands(mma, forOp, isLoadPipelineable) &&
             !ttng::hasAccReadModifyWrite(mma, forOp) &&
-            !getDisallowAccMultiBuffer(forOp) &&
-            isAccMultibufferingPossible(mma, forOp)) {
+            ttng::isAccMultibufferingPossible(mma, forOp) &&
+            !getDisallowAccMultiBuffer(forOp)) {
           opLatency[&op] = 1;
         }
       }
@@ -289,55 +289,6 @@ class AssignMMALatencies {
 private:
   scf::ForOp forOp;
   DenseMap<Operation *, int> &opLatency;
-
-  bool isConstantZero(Value v) {
-    if (auto constantOp = v.getDefiningOp<arith::ConstantOp>()) {
-      if (auto attr = dyn_cast<IntegerAttr>(constantOp.getValue())) {
-        return attr.getValue().isZero();
-      }
-      if (auto attr = dyn_cast<FloatAttr>(constantOp.getValue())) {
-        return attr.getValue().isZero();
-      }
-    }
-    return false;
-  }
-
-  bool accUseFlagSetToFalse(ttng::MMAv5OpInterface mma, scf::ForOp forOp) {
-    Value accUseFlag = mma.useAccumulator();
-    if (isConstantZero(accUseFlag)) {
-      return true;
-    }
-    auto yieldOp = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
-    while (auto blockArg = dyn_cast<BlockArgument>(accUseFlag)) {
-      accUseFlag = yieldOp.getOperand(blockArg.getArgNumber() - 1);
-    }
-    // If the accUseFlag is overwritten in the loop, we treat it as a 'false'
-    // with condition being ~accUseFlag.
-    return accUseFlag.getDefiningOp() &&
-           forOp->isAncestor(accUseFlag.getDefiningOp());
-  }
-
-  bool accOverwrittenInLoop(ttng::MMAv5OpInterface mma, scf::ForOp forOp) {
-    auto tmemAlloc = mma.getAccumulator().getDefiningOp<ttng::TMEMAllocOp>();
-    if (!tmemAlloc || !forOp.isDefinedOutsideOfLoop(tmemAlloc)) {
-      return false;
-    }
-    for (auto user : tmemAlloc->getUsers()) {
-      if (isa<ttng::TMEMStoreOp>(user) &&
-          forOp->isAncestor(user->getParentOp())) {
-        return true;
-      }
-    }
-    return false;
-  }
-
-  bool isAccMultibufferingPossible(ttng::MMAv5OpInterface mma,
-                                   scf::ForOp forOp) {
-    // If the accumulator is never overwritten in the loop, we can't multibuffer
-    // it, as the overwrite point is the only place where we can swap the
-    // buffer.
-    return accUseFlagSetToFalse(mma, forOp) || accOverwrittenInLoop(mma, forOp);
-  }
 };
 
 } // namespace