
Commit e461a5b

Pawel/hoist unpipelineable operands (#7082)
Hoist tmem and smem allocations for MMAv5 operands whose wait may be pushed to the next stage even though the operands themselves are not pipelined. This avoids a bug where tmem/smem allocations get clobbered by the allocator because their live ranges are implicitly longer than the allocator can see. To support proper wait placement in the presence of hoisted allocs (which leaves stores instead of allocs in the loop), `MMAv5PipelineableOperandsHelper` was generalized a bit to look for any ops that overwrite operand memory, not only loads. The fence insertion pass also had to be generalized a bit to detect shared memory being overwritten by stores.
1 parent e3ac59c commit e461a5b

File tree: 7 files changed (+183 -52 lines)

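For context, the hoisting described in the commit message roughly turns an in-loop shared-memory allocation feeding an MMA operand into a loop-invariant allocation plus an in-loop store, so the buffer's true live range is visible to the allocator. The sketch below is illustrative pseudo-IR only (op syntax follows the tests in this commit, but types, encodings, and the surrounding pipeliner state are simplified; it is not actual pass output):

Before (alloc inside the loop; its live range extends past what the allocator sees once the wait is pushed to the next stage):

  scf.for %i = %lb to %ub step %c1 : i32 {
    %a = ttg.local_alloc %tile : (tensor<128x128xf16, #blocked>) -> !ttg.memdesc<128x128xf16, #shared, #smem>
    // ... MMA consuming %a ...
  }

After (alloc hoisted out of the loop and made mutable; a store is left in the loop in its place):

  %a = ttg.local_alloc : () -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
  scf.for %i = %lb to %ub step %c1 : i32 {
    ttg.local_store %tile, %a : tensor<128x128xf16, #blocked> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
    // ... MMA consuming %a ...
  }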

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 3 additions & 2 deletions
@@ -38,18 +38,19 @@ class MMAv5PipelineableOperandsHelper
       : mmaOp(mmaOp), forOp(forOp), isLoadToBePipelined(isLoadToBePipelined) {
     run();
   }
+
   bool isPipelineable = false;
   // If true, the existing operand loads are all been found and their
   // pipelineability has been determined.
   bool isOperandsStateDetermined = false;
-  SmallVector<Operation *> unpipelineableOperandLoads;
+  SmallVector<Operation *> unpipelineableOperandDefs;
 
 private:
   MMAv5OpInterface mmaOp;
   scf::ForOp forOp;
   std::function<bool(Operation *)> isLoadToBePipelined;
-  bool comesFromLoadOrOutsideLoop(Value v, Operation *&foundLoad);
   void run();
+  bool isOperandPipelineable(Value v, Operation *&foundDef);
 };
 
 //===----------------------------------------------------------------------===//

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 52 additions & 3 deletions
@@ -636,6 +636,9 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
 
   // Make sure all ops have attributes.
   for (Operation &op : forOp.getBody()->without_terminator()) {
+    if (!schedule.count(&op)) {
+      op.emitError() << "op not found in the schedule";
+    }
     assert(schedule.count(&op) && "op not found in the schedule");
   }
   return forOp;
@@ -796,6 +799,41 @@ getTmemUseStageBoundOps(ttng::TMEMAllocOp alloc, scf::ForOp forOp,
   return bounds;
 }
 
+Operation *hoistBufferOutOfLoop(scf::ForOp forOp, Operation *op,
+                                CoarseSchedule &schedule) {
+  Operation *newStore = nullptr;
+  if (!isa<ttng::TMEMAllocOp, ttg::LocalAllocOp>(op))
+    return nullptr;
+  // If the alloc is already out of the loop, there is nothing to do.
+  if (!forOp->isAncestor(op))
+    return nullptr;
+  OpBuilderForStage builder(op->getLoc(), forOp, schedule);
+  auto allocType = dyn_cast<MemDescType>(op->getResult(0).getType());
+  auto newType = triton::gpu::MemDescType::get(
+      allocType.getShape(), allocType.getElementType(), allocType.getEncoding(),
+      allocType.getMemorySpace(),
+      /*mutableMemory=*/true);
+  auto newAlloc = builder.clone(*op);
+  newAlloc->getResult(0).setType(newType);
+  builder.setStageCluster(schedule[op]);
+  if (auto tmemAlloc = dyn_cast<ttng::TMEMAllocOp>(newAlloc)) {
+    tmemAlloc.getSrcMutable().clear();
+    builder.setInsertionPointAfter(op);
+    Value trueVal = builder.create<arith::ConstantIntOp>(1, 1);
+    newStore = builder.create<ttng::TMEMStoreOp>(tmemAlloc.getResult(),
+                                                 op->getOperand(0), trueVal);
+  } else {
+    auto localAlloc = cast<ttg::LocalAllocOp>(newAlloc);
+    localAlloc.getSrcMutable().clear();
+    builder.setInsertionPointAfter(op);
+    newStore = builder.create<ttg::LocalStoreOp>(op->getOperand(0),
+                                                 localAlloc.getResult());
+  }
+  op->replaceAllUsesWith(newAlloc);
+  op->erase();
+  return newStore;
+}
+
 void createBarrierAndWaitOps(scf::ForOp forOp, CoarseSchedule &schedule,
                              ttng::MMAv5OpInterface mma, int mmaSelfLatency,
                              ttng::TMEMAllocOp alloc, int phaseArgIdx,
@@ -818,13 +856,24 @@ void createBarrierAndWaitOps(scf::ForOp forOp, CoarseSchedule &schedule,
 
   ttng::MMAv5PipelineableOperandsHelper mmaPipeHelper(mma, forOp,
                                                       isLoadToBePipelined);
+
+  SmallVector<Operation *> updatedDefs;
+  for (auto def : mmaPipeHelper.unpipelineableOperandDefs) {
+    auto newStore = hoistBufferOutOfLoop(forOp, def, schedule);
+    if (newStore) {
+      updatedDefs.push_back(newStore);
+    } else {
+      updatedDefs.push_back(def);
+    }
+  }
+
   if (!mmaPipeHelper.isPipelineable &&
       mmaPipeHelper.isOperandsStateDetermined) {
     // If the operands are not pipelineable, we need to insert a sync point
     // before the earliest operand load
-    for (auto load : mmaPipeHelper.unpipelineableOperandLoads) {
-      if (!latestSyncPoint || schedule.isOpBefore(load, *latestSyncPoint)) {
-        latestSyncPoint = load;
+    for (auto def : updatedDefs) {
+      if (!latestSyncPoint || schedule.isOpBefore(def, *latestSyncPoint)) {
+        latestSyncPoint = def;
       }
     }
   }

lib/Dialect/TritonGPU/Transforms/Pipeliner/MMAv5PipelineUtility.cpp

Lines changed: 25 additions & 22 deletions
@@ -14,8 +14,8 @@ namespace ttng = mlir::triton::nvidia_gpu;
 // MMA Pipeline Analysis
 //===----------------------------------------------------------------------===//
 
-bool ttng::MMAv5PipelineableOperandsHelper::comesFromLoadOrOutsideLoop(
-    Value v, Operation *&foundLoad) {
+bool ttng::MMAv5PipelineableOperandsHelper::isOperandPipelineable(
+    Value v, Operation *&foundDef) {
   if (forOp.isDefinedOutsideOfLoop(v)) {
     return true;
   }
@@ -25,14 +25,16 @@ bool ttng::MMAv5PipelineableOperandsHelper::comesFromLoadOrOutsideLoop(
   while (isa<ttg::MemDescTransOp, ttg::MemDescReshapeOp>(v.getDefiningOp())) {
     v = v.getDefiningOp()->getOperand(0);
   }
-  if (auto tmemAlloc = dyn_cast<ttng::TMEMAllocOp>(v.getDefiningOp())) {
-    foundLoad = tmemAlloc;
+  if (isa<ttg::LocalStoreOp, ttng::TMEMStoreOp, ttng::TMEMAllocOp>(
+          v.getDefiningOp())) {
+    foundDef = v.getDefiningOp();
     return false;
   }
   auto localAlloc = dyn_cast<ttg::LocalAllocOp>(v.getDefiningOp());
   if (!localAlloc) {
     return false;
   }
+  foundDef = localAlloc;
   if (!localAlloc.getSrc()) {
     return false;
   }
@@ -44,17 +46,18 @@ bool ttng::MMAv5PipelineableOperandsHelper::comesFromLoadOrOutsideLoop(
           localAllocSrc)) {
     return false;
   }
-  foundLoad = localAllocSrc;
-  if (!isLoadToBePipelined(foundLoad)) {
+  foundDef = localAllocSrc;
+  if (!isLoadToBePipelined(localAllocSrc)) {
     return false;
   }
-  if (canBeAsyncLoad(foundLoad)) {
+  if (canBeAsyncLoad(localAllocSrc)) {
    return true;
  }
  return false;
 }
 
 void ttng::MMAv5PipelineableOperandsHelper::run() {
+  unpipelineableOperandDefs.clear();
   isOperandsStateDetermined = true;
   // Accumulator alloc must be outside the loop.
   auto tmemAlloc = mmaOp.getAccumulator().getDefiningOp<ttng::TMEMAllocOp>();
@@ -65,17 +68,17 @@ void ttng::MMAv5PipelineableOperandsHelper::run() {
     return;
   }
   if (auto dotOp = dyn_cast<tt::DotOpInterface>(mmaOp.getOperation())) {
-    Operation *foundLoad = nullptr;
-    if (!comesFromLoadOrOutsideLoop(dotOp.getA(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    Operation *foundDef = nullptr;
+    if (!isOperandPipelineable(dotOp.getA(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
     }
-    if (!comesFromLoadOrOutsideLoop(dotOp.getB(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    if (!isOperandPipelineable(dotOp.getB(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
@@ -95,24 +98,24 @@ void ttng::MMAv5PipelineableOperandsHelper::run() {
       isOperandsStateDetermined = false;
       return;
     }
-    Operation *foundLoad = nullptr;
-    if (!comesFromLoadOrOutsideLoop(scaledOp.getAScale(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    Operation *foundDef = nullptr;
+    if (!isOperandPipelineable(scaledOp.getAScale(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
     }
-    if (!comesFromLoadOrOutsideLoop(scaledOp.getBScale(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    if (!isOperandPipelineable(scaledOp.getBScale(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
     }
   }
   isPipelineable =
-      isOperandsStateDetermined && unpipelineableOperandLoads.empty();
+      isOperandsStateDetermined && unpipelineableOperandDefs.empty();
 }
 
 bool ttng::hasAccReadModifyWrite(ttng::MMAv5OpInterface mma, scf::ForOp forOp) {

lib/Dialect/TritonNvidiaGPU/Transforms/FenceInsertion.cpp

Lines changed: 42 additions & 23 deletions
@@ -39,9 +39,9 @@ struct FenceInsertionPass
     mod.walk([&](DotOpInterface dotOp) {
       Value a = dotOp.getA();
       Value b = dotOp.getB();
-      bool aDependsOnShared = dependOnCopyRegToShared(a);
-      bool bDependsOnShared = dependOnCopyRegToShared(b);
-      if (!aDependsOnShared && !bDependsOnShared)
+      SmallVector<Operation *> copyRegToSharedOpsA = findCopyRegToSharedOps(a);
+      SmallVector<Operation *> copyRegToSharedOpsB = findCopyRegToSharedOps(b);
+      if (copyRegToSharedOpsA.empty() && copyRegToSharedOpsB.empty())
         return WalkResult::advance();
 
       OpBuilder builder(dotOp);
@@ -50,11 +50,13 @@ struct FenceInsertionPass
       // If there is all the dependencies are outside of the loop try to hoist
       // the fence.
       while (auto loopOp = fence->getParentOfType<LoopLikeOpInterface>()) {
-        if (aDependsOnShared &&
-            loopOp->isAncestor(a.getParentBlock()->getParentOp()))
+        if (!copyRegToSharedOpsA.empty() &&
+            llvm::any_of(copyRegToSharedOpsA,
+                         [&](Operation *op) { return loopOp->isAncestor(op); }))
           break;
-        if (bDependsOnShared &&
-            loopOp->isAncestor(b.getParentBlock()->getParentOp()))
+        if (!copyRegToSharedOpsB.empty() &&
+            llvm::any_of(copyRegToSharedOpsB,
+                         [&](Operation *op) { return loopOp->isAncestor(op); }))
           break;
         loopOp.moveOutOfLoop(fence);
       }
@@ -72,31 +74,47 @@ struct FenceInsertionPass
 
 private:
   // Return true if the operand depends on a copy from register to shared.
-  bool dependOnCopyRegToShared(Value operand) {
+  SmallVector<Operation *> findCopyRegToSharedOps(Value operand) {
    DenseSet<Value> visited;
-    return dependOnCopyRegToShared(operand, visited);
+    llvm::SetVector<Operation *> result;
+    findCopyRegToSharedOps(operand, visited, result);
+    return result.takeVector();
   }
 
-  bool dependOnCopyRegToShared(Value operand, DenseSet<Value> &visited) {
+  void findCopyRegToSharedOps(Value operand, DenseSet<Value> &visited,
+                              llvm::SetVector<Operation *> &result) {
    // If the value has already been visited we can safely return false as we
    // would early return when true.
    if (visited.count(operand))
-      return false;
+      return;
    visited.insert(operand);
    if (!isa<triton::gpu::MemDescType>(operand.getType()))
-      return false;
+      return;
 
    auto op = operand.getDefiningOp();
    if (op) {
      // reach an alloc copying from register, we need a fence.
-      if (isa<ttg::LocalAllocOp>(op) && cast<ttg::LocalAllocOp>(op).getSrc())
-        return true;
+      if (auto localAlloc = dyn_cast<ttg::LocalAllocOp>(op)) {
+        if (localAlloc.getSrc()) {
+          result.insert(op);
+        }
+        // Check if there are local_store ops that write to that buffer.
+        for (auto user : localAlloc.getResult().getUsers()) {
+          while (user->hasOneUse() &&
+                 user->hasTrait<OpTrait::MemDescViewTrait>()) {
+            user = *user->getUsers().begin();
+          }
+          if (isa<ttg::LocalStoreOp>(user)) {
+            result.insert(user);
+            return;
+          }
+        }
+      }
      // if it is not an alloc, iterate over the operands.
      for (auto v : op->getOperands()) {
-        if (dependOnCopyRegToShared(v))
-          return true;
+        findCopyRegToSharedOps(v, visited, result);
      }
-      return false;
+      return;
    }
 
    // reach BlockArgument
@@ -108,22 +126,23 @@ struct FenceInsertionPass
      assert(argNum != 0 && "induction var cannot be memdesc type");
      --argNum;
      // prologue
-      if (dependOnCopyRegToShared(forOp.getInitArgs()[argNum], visited))
-        return true;
+      findCopyRegToSharedOps(forOp.getInitArgs()[argNum], visited, result);
      // yield
      auto yieldOp = forOp.getBody()->getTerminator();
      Value v = yieldOp->getOperand(argNum);
-      return dependOnCopyRegToShared(v, visited);
+      findCopyRegToSharedOps(v, visited, result);
+      return;
    }
 
    // look through `ttg.warp_specialize`.
    if (auto wsOp = dyn_cast<ttg::WarpSpecializePartitionsOp>(argOwner)) {
-      return dependOnCopyRegToShared(
-          wsOp.getParentOp().getExplicitCaptures()[argNum]);
+      findCopyRegToSharedOps(wsOp.getParentOp().getExplicitCaptures()[argNum],
+                             visited, result);
+      return;
    }
 
    // Conservatively return true for other ops
-    return true;
+    result.insert(argOwner);
  }
 };
 
test/TritonGPU/fence-inserstion.mlir

Lines changed: 52 additions & 0 deletions
@@ -20,6 +20,27 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: matmul_like_fence_local_store
+  tt.func public @matmul_like_fence_local_store(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x64xf16, #blocked2>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
+    %0 = ttg.local_alloc : () -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
+    %1 = ttg.local_alloc : () -> !ttg.memdesc<128x64xf16, #shared1, #smem, mutable>
+    ttg.local_store %arg0, %0 : tensor<128x128xf16, #blocked> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
+    // CHECK: ttng.fence_async_shared
+    %2 = ttng.warp_group_dot %0, %1, %cst : !ttg.memdesc<128x128xf16, #shared, #smem, mutable> * !ttg.memdesc<128x64xf16, #shared1, #smem, mutable> -> tensor<128x64xf32, #mma>
+    tt.return
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked1 = #ttg.blocked<{sizePerThread = [1, 64], threadsPerWarp = [32, 1], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
@@ -74,6 +95,37 @@ module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-
 
 // -----
 
+#blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
+#blocked2 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
+#mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>
+#shared = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = false, elementBitWidth = 16}>
+#shared1 = #ttg.nvmma_shared<{swizzlingByteWidth = 128, transposed = true, elementBitWidth = 16}>
+#smem = #ttg.shared_memory
+module attributes {"ttg.target" = "cuda:90", "ttg.num-ctas" = 1 : i32, "ttg.num-warps" = 8 : i32, "ttg.threads-per-warp" = 32 : i32} {
+  // CHECK-LABEL: fence_store_in_loop
+  tt.func public @fence_store_in_loop(%arg0: tensor<128x128xf16, #blocked>, %arg1: tensor<128x64xf16, #blocked>) {
+    %cst = arith.constant dense<0.000000e+00> : tensor<128x64xf32, #mma>
+    %c64_i32 = arith.constant 64 : i32
+    %c0_i32 = arith.constant 0 : i32
+    %c32_i32 = arith.constant 32 : i32
+    %0 = ttg.local_alloc %arg0 : (tensor<128x128xf16, #blocked>) -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
+    %1 = ttg.local_alloc %arg1 : (tensor<128x64xf16, #blocked>) -> !ttg.memdesc<128x64xf16, #shared1, #smem>
+    // CHECK-NOT: ttng.fence_async_shared
+    // CHECK: scf.for
+    // CHECK: ttng.fence_async_shared
+    // CHECK: ttng.warp_group_dot
+    scf.for %iv0 = %c0_i32 to %c64_i32 step %c32_i32 : i32 {
+      scf.for %iv1 = %c0_i32 to %c64_i32 step %c32_i32 : i32 {
+        ttg.local_store %arg0, %0 : tensor<128x128xf16, #blocked> -> !ttg.memdesc<128x128xf16, #shared, #smem, mutable>
+        %2 = ttng.warp_group_dot %0, %1, %cst : !ttg.memdesc<128x128xf16, #shared, #smem, mutable> * !ttg.memdesc<128x64xf16, #shared1, #smem> -> tensor<128x64xf32, #mma>
+      }
+    }
+    tt.return
+  }
+}
+
+// -----
+
 #blocked = #ttg.blocked<{sizePerThread = [1, 8], threadsPerWarp = [2, 16], warpsPerCTA = [8, 1], order = [1, 0]}>
 #blocked2 = #ttg.blocked<{sizePerThread = [8, 1], threadsPerWarp = [16, 2], warpsPerCTA = [1, 8], order = [0, 1]}>
 #mma = #ttg.nvidia_mma<{versionMajor = 3, versionMinor = 0, warpsPerCTA = [8, 1], instrShape = [16, 64, 16]}>

0 commit comments

Comments
 (0)