
Commit 61a327a

Merge commit 'eb73b0373a7fb4cd2e563f68e3488a96525562eb'
2 parents: d8f73fe + eb73b03

File tree: 19 files changed, +319 −83 lines

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 3 additions & 2 deletions
@@ -38,18 +38,19 @@ class MMAv5PipelineableOperandsHelper {
       : mmaOp(mmaOp), forOp(forOp), isLoadToBePipelined(isLoadToBePipelined) {
     run();
   }
+
   bool isPipelineable = false;
   // If true, the existing operand loads are all been found and their
   // pipelineability has been determined.
   bool isOperandsStateDetermined = false;
-  SmallVector<Operation *> unpipelineableOperandLoads;
+  SmallVector<Operation *> unpipelineableOperandDefs;

 private:
   MMAv5OpInterface mmaOp;
   scf::ForOp forOp;
   std::function<bool(Operation *)> isLoadToBePipelined;
-  bool comesFromLoadOrOutsideLoop(Value v, Operation *&foundLoad);
   void run();
+  bool isOperandPipelineable(Value v, Operation *&foundDef);
 };

 //===----------------------------------------------------------------------===//
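With the rename from unpipelineableOperandLoads / comesFromLoadOrOutsideLoop to unpipelineableOperandDefs / isOperandPipelineable, the helper now reports whichever operation defines an unpipelineable operand (a load-backed local_alloc, but also local_store, tmem_store, or tmem_alloc producers), not only loads. A minimal caller sketch under those assumptions; the predicate and the diagnostics below are illustrative and not part of this commit:

```cpp
#include "triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h"

using namespace mlir;
namespace ttng = triton::nvidia_gpu;

// Illustrative use of the helper's public state after this change.
static bool reportMMAPipelineability(ttng::MMAv5OpInterface mma,
                                     scf::ForOp forOp) {
  // Example predicate only: treat every candidate load as one we intend to
  // pipeline; real callers pass their scheduling decision here.
  auto isLoadToBePipelined = [](Operation *) { return true; };
  ttng::MMAv5PipelineableOperandsHelper helper(mma, forOp, isLoadToBePipelined);
  if (!helper.isOperandsStateDetermined)
    return false; // some operand producers could not be traced
  for (Operation *def : helper.unpipelineableOperandDefs)
    def->emitRemark() << "operand definition blocks MMA operand pipelining";
  return helper.isPipelineable;
}
```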

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 52 additions & 3 deletions
@@ -636,6 +636,9 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {

   // Make sure all ops have attributes.
   for (Operation &op : forOp.getBody()->without_terminator()) {
+    if (!schedule.count(&op)) {
+      op.emitError() << "op not found in the schedule";
+    }
     assert(schedule.count(&op) && "op not found in the schedule");
   }
   return forOp;
@@ -796,6 +799,41 @@ getTmemUseStageBoundOps(ttng::TMEMAllocOp alloc, scf::ForOp forOp,
   return bounds;
 }

+Operation *hoistBufferOutOfLoop(scf::ForOp forOp, Operation *op,
+                                CoarseSchedule &schedule) {
+  Operation *newStore = nullptr;
+  if (!isa<ttng::TMEMAllocOp, ttg::LocalAllocOp>(op))
+    return nullptr;
+  // If the alloc is already out of the loop, there is nothing to do.
+  if (!forOp->isAncestor(op))
+    return nullptr;
+  OpBuilderForStage builder(op->getLoc(), forOp, schedule);
+  auto allocType = dyn_cast<MemDescType>(op->getResult(0).getType());
+  auto newType = triton::gpu::MemDescType::get(
+      allocType.getShape(), allocType.getElementType(), allocType.getEncoding(),
+      allocType.getMemorySpace(),
+      /*mutableMemory=*/true);
+  auto newAlloc = builder.clone(*op);
+  newAlloc->getResult(0).setType(newType);
+  builder.setStageCluster(schedule[op]);
+  if (auto tmemAlloc = dyn_cast<ttng::TMEMAllocOp>(newAlloc)) {
+    tmemAlloc.getSrcMutable().clear();
+    builder.setInsertionPointAfter(op);
+    Value trueVal = builder.create<arith::ConstantIntOp>(1, 1);
+    newStore = builder.create<ttng::TMEMStoreOp>(tmemAlloc.getResult(),
+                                                 op->getOperand(0), trueVal);
+  } else {
+    auto localAlloc = cast<ttg::LocalAllocOp>(newAlloc);
+    localAlloc.getSrcMutable().clear();
+    builder.setInsertionPointAfter(op);
+    newStore = builder.create<ttg::LocalStoreOp>(op->getOperand(0),
+                                                 localAlloc.getResult());
+  }
+  op->replaceAllUsesWith(newAlloc);
+  op->erase();
+  return newStore;
+}
+
 void createBarrierAndWaitOps(scf::ForOp forOp, CoarseSchedule &schedule,
                              ttng::MMAv5OpInterface mma, int mmaSelfLatency,
                              ttng::TMEMAllocOp alloc, int phaseArgIdx,
@@ -818,13 +856,24 @@ void createBarrierAndWaitOps(scf::ForOp forOp, CoarseSchedule &schedule,

   ttng::MMAv5PipelineableOperandsHelper mmaPipeHelper(mma, forOp,
                                                       isLoadToBePipelined);
+
+  SmallVector<Operation *> updatedDefs;
+  for (auto def : mmaPipeHelper.unpipelineableOperandDefs) {
+    auto newStore = hoistBufferOutOfLoop(forOp, def, schedule);
+    if (newStore) {
+      updatedDefs.push_back(newStore);
+    } else {
+      updatedDefs.push_back(def);
+    }
+  }
+
   if (!mmaPipeHelper.isPipelineable &&
       mmaPipeHelper.isOperandsStateDetermined) {
     // If the operands are not pipelineable, we need to insert a sync point
     // before the earliest operand load
-    for (auto load : mmaPipeHelper.unpipelineableOperandLoads) {
-      if (!latestSyncPoint || schedule.isOpBefore(load, *latestSyncPoint)) {
-        latestSyncPoint = load;
+    for (auto def : updatedDefs) {
+      if (!latestSyncPoint || schedule.isOpBefore(def, *latestSyncPoint)) {
+        latestSyncPoint = def;
       }
     }
   }
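In effect, hoistBufferOutOfLoop rewrites an in-loop tmem_alloc/local_alloc into a mutable allocation hoisted out of the loop plus an explicit tmem_store/local_store of the original source value left at the alloc's position; the returned store is what the caller then treats as the sync-point candidate. A small illustrative wrapper (not part of the commit) that mirrors the loop in the last hunk:

```cpp
// Illustrative wrapper: hoist the buffer if possible and return the op that
// should act as the MMA sync-point candidate afterwards.
static Operation *hoistAndPickSyncCandidate(scf::ForOp forOp, Operation *def,
                                            CoarseSchedule &schedule) {
  if (Operation *newStore = hoistBufferOutOfLoop(forOp, def, schedule))
    return newStore; // the in-loop store now stands in for the hoisted alloc
  return def; // def was not an in-loop alloc, keep it as the candidate
}
```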

lib/Dialect/TritonGPU/Transforms/Pipeliner/MMAv5PipelineUtility.cpp

Lines changed: 25 additions & 22 deletions
@@ -14,8 +14,8 @@ namespace ttng = mlir::triton::nvidia_gpu;
 // MMA Pipeline Analysis
 //===----------------------------------------------------------------------===//

-bool ttng::MMAv5PipelineableOperandsHelper::comesFromLoadOrOutsideLoop(
-    Value v, Operation *&foundLoad) {
+bool ttng::MMAv5PipelineableOperandsHelper::isOperandPipelineable(
+    Value v, Operation *&foundDef) {
   if (forOp.isDefinedOutsideOfLoop(v)) {
     return true;
   }
@@ -25,14 +25,16 @@ bool ttng::MMAv5PipelineableOperandsHelper::comesFromLoadOrOutsideLoop(
   while (isa<ttg::MemDescTransOp, ttg::MemDescReshapeOp>(v.getDefiningOp())) {
     v = v.getDefiningOp()->getOperand(0);
   }
-  if (auto tmemAlloc = dyn_cast<ttng::TMEMAllocOp>(v.getDefiningOp())) {
-    foundLoad = tmemAlloc;
+  if (isa<ttg::LocalStoreOp, ttng::TMEMStoreOp, ttng::TMEMAllocOp>(
+          v.getDefiningOp())) {
+    foundDef = v.getDefiningOp();
     return false;
   }
   auto localAlloc = dyn_cast<ttg::LocalAllocOp>(v.getDefiningOp());
   if (!localAlloc) {
     return false;
   }
+  foundDef = localAlloc;
   if (!localAlloc.getSrc()) {
     return false;
   }
@@ -44,17 +46,18 @@ bool ttng::MMAv5PipelineableOperandsHelper::comesFromLoadOrOutsideLoop(
           localAllocSrc)) {
     return false;
   }
-  foundLoad = localAllocSrc;
-  if (!isLoadToBePipelined(foundLoad)) {
+  foundDef = localAllocSrc;
+  if (!isLoadToBePipelined(localAllocSrc)) {
     return false;
   }
-  if (canBeAsyncLoad(foundLoad)) {
+  if (canBeAsyncLoad(localAllocSrc)) {
     return true;
   }
   return false;
 }

 void ttng::MMAv5PipelineableOperandsHelper::run() {
+  unpipelineableOperandDefs.clear();
   isOperandsStateDetermined = true;
   // Accumulator alloc must be outside the loop.
   auto tmemAlloc = mmaOp.getAccumulator().getDefiningOp<ttng::TMEMAllocOp>();
@@ -65,17 +68,17 @@ void ttng::MMAv5PipelineableOperandsHelper::run() {
     return;
   }
   if (auto dotOp = dyn_cast<tt::DotOpInterface>(mmaOp.getOperation())) {
-    Operation *foundLoad = nullptr;
-    if (!comesFromLoadOrOutsideLoop(dotOp.getA(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    Operation *foundDef = nullptr;
+    if (!isOperandPipelineable(dotOp.getA(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
     }
-    if (!comesFromLoadOrOutsideLoop(dotOp.getB(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    if (!isOperandPipelineable(dotOp.getB(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
@@ -95,24 +98,24 @@ void ttng::MMAv5PipelineableOperandsHelper::run() {
       isOperandsStateDetermined = false;
       return;
     }
-    Operation *foundLoad = nullptr;
-    if (!comesFromLoadOrOutsideLoop(scaledOp.getAScale(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    Operation *foundDef = nullptr;
+    if (!isOperandPipelineable(scaledOp.getAScale(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
     }
-    if (!comesFromLoadOrOutsideLoop(scaledOp.getBScale(), foundLoad)) {
-      if (foundLoad) {
-        unpipelineableOperandLoads.push_back(foundLoad);
+    if (!isOperandPipelineable(scaledOp.getBScale(), foundDef)) {
+      if (foundDef) {
+        unpipelineableOperandDefs.push_back(foundDef);
       } else {
         isOperandsStateDetermined = false;
       }
     }
   }
   isPipelineable =
-      isOperandsStateDetermined && unpipelineableOperandLoads.empty();
+      isOperandsStateDetermined && unpipelineableOperandDefs.empty();
 }

 bool ttng::hasAccReadModifyWrite(ttng::MMAv5OpInterface mma, scf::ForOp forOp) {

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp

Lines changed: 13 additions & 0 deletions
@@ -1399,6 +1399,19 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast(
     return;
   LogicalResult result = getRematerializableSlice(
       op->getOpOperand(0), srcEncoding, tempSlice, tempLayout);
+
+  // If a value is already assigned to a _different_ layout,
+  // we cannot propagate past this op (as it would conflict with
+  // an already-assigned layout).
+  for (auto [val, enc] : tempLayout) {
+    auto preexistingLayout = layout.find(val);
+    if (preexistingLayout != layout.end() &&
+        preexistingLayout->second != enc) {
+      result = failure();
+      break;
+    }
+  }
+
   // If we can rematerialize the rest of the ext slice we can ignore this
   // ext as it won't need a convert.
   if (result.succeeded()) {
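The added loop guards rematerialization against requiring two different encodings for the same value. A standalone restatement of that rule (illustrative helper, not in the commit), assuming layout and tempLayout are DenseMap<Value, Attribute> maps from a value to its chosen encoding, as used elsewhere in this pass:

```cpp
#include "mlir/IR/Attributes.h"
#include "mlir/IR/Value.h"
#include "llvm/ADT/DenseMap.h"

// Returns true when the new slice would re-assign a value to a different
// encoding than the one it already has; this corresponds to forcing
// `result = failure()` in the hunk above.
static bool conflictsWithAssignedLayout(
    const llvm::DenseMap<mlir::Value, mlir::Attribute> &layout,
    const llvm::DenseMap<mlir::Value, mlir::Attribute> &tempLayout) {
  for (auto [val, enc] : tempLayout) {
    auto it = layout.find(val);
    if (it != layout.end() && it->second != enc)
      return true; // same value would need two conflicting layouts
  }
  return false;
}
```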

lib/Dialect/TritonNvidiaGPU/Transforms/FenceInsertion.cpp

Lines changed: 42 additions & 23 deletions
@@ -39,9 +39,9 @@ struct FenceInsertionPass
     mod.walk([&](DotOpInterface dotOp) {
       Value a = dotOp.getA();
       Value b = dotOp.getB();
-      bool aDependsOnShared = dependOnCopyRegToShared(a);
-      bool bDependsOnShared = dependOnCopyRegToShared(b);
-      if (!aDependsOnShared && !bDependsOnShared)
+      SmallVector<Operation *> copyRegToSharedOpsA = findCopyRegToSharedOps(a);
+      SmallVector<Operation *> copyRegToSharedOpsB = findCopyRegToSharedOps(b);
+      if (copyRegToSharedOpsA.empty() && copyRegToSharedOpsB.empty())
         return WalkResult::advance();

       OpBuilder builder(dotOp);
@@ -50,11 +50,13 @@ struct FenceInsertionPass
       // If there is all the dependencies are outside of the loop try to hoist
       // the fence.
       while (auto loopOp = fence->getParentOfType<LoopLikeOpInterface>()) {
-        if (aDependsOnShared &&
-            loopOp->isAncestor(a.getParentBlock()->getParentOp()))
+        if (!copyRegToSharedOpsA.empty() &&
+            llvm::any_of(copyRegToSharedOpsA,
+                         [&](Operation *op) { return loopOp->isAncestor(op); }))
           break;
-        if (bDependsOnShared &&
-            loopOp->isAncestor(b.getParentBlock()->getParentOp()))
+        if (!copyRegToSharedOpsB.empty() &&
+            llvm::any_of(copyRegToSharedOpsB,
+                         [&](Operation *op) { return loopOp->isAncestor(op); }))
           break;
         loopOp.moveOutOfLoop(fence);
       }
@@ -72,31 +74,47 @@ struct FenceInsertionPass

 private:
   // Return true if the operand depends on a copy from register to shared.
-  bool dependOnCopyRegToShared(Value operand) {
+  SmallVector<Operation *> findCopyRegToSharedOps(Value operand) {
     DenseSet<Value> visited;
-    return dependOnCopyRegToShared(operand, visited);
+    llvm::SetVector<Operation *> result;
+    findCopyRegToSharedOps(operand, visited, result);
+    return result.takeVector();
   }

-  bool dependOnCopyRegToShared(Value operand, DenseSet<Value> &visited) {
+  void findCopyRegToSharedOps(Value operand, DenseSet<Value> &visited,
+                              llvm::SetVector<Operation *> &result) {
     // If the value has already been visited we can safely return false as we
     // would early return when true.
     if (visited.count(operand))
-      return false;
+      return;
     visited.insert(operand);
     if (!isa<triton::gpu::MemDescType>(operand.getType()))
-      return false;
+      return;

     auto op = operand.getDefiningOp();
     if (op) {
       // reach an alloc copying from register, we need a fence.
-      if (isa<ttg::LocalAllocOp>(op) && cast<ttg::LocalAllocOp>(op).getSrc())
-        return true;
+      if (auto localAlloc = dyn_cast<ttg::LocalAllocOp>(op)) {
+        if (localAlloc.getSrc()) {
+          result.insert(op);
+        }
+        // Check if there are local_store ops that write to that buffer.
+        for (auto user : localAlloc.getResult().getUsers()) {
+          while (user->hasOneUse() &&
+                 user->hasTrait<OpTrait::MemDescViewTrait>()) {
+            user = *user->getUsers().begin();
+          }
+          if (isa<ttg::LocalStoreOp>(user)) {
+            result.insert(user);
+            return;
+          }
+        }
+      }
       // if it is not an alloc, iterate over the operands.
       for (auto v : op->getOperands()) {
-        if (dependOnCopyRegToShared(v))
-          return true;
+        findCopyRegToSharedOps(v, visited, result);
       }
-      return false;
+      return;
     }

     // reach BlockArgument
@@ -108,22 +126,23 @@ struct FenceInsertionPass
       assert(argNum != 0 && "induction var cannot be memdesc type");
       --argNum;
       // prologue
-      if (dependOnCopyRegToShared(forOp.getInitArgs()[argNum], visited))
-        return true;
+      findCopyRegToSharedOps(forOp.getInitArgs()[argNum], visited, result);
       // yield
       auto yieldOp = forOp.getBody()->getTerminator();
       Value v = yieldOp->getOperand(argNum);
-      return dependOnCopyRegToShared(v, visited);
+      findCopyRegToSharedOps(v, visited, result);
+      return;
     }

     // look through `ttg.warp_specialize`.
     if (auto wsOp = dyn_cast<ttg::WarpSpecializePartitionsOp>(argOwner)) {
-      return dependOnCopyRegToShared(
-          wsOp.getParentOp().getExplicitCaptures()[argNum]);
+      findCopyRegToSharedOps(wsOp.getParentOp().getExplicitCaptures()[argNum],
+                             visited, result);
+      return;
     }

     // Conservatively return true for other ops
-    return true;
+    result.insert(argOwner);
   }
 };

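The refactor from a boolean query to findCopyRegToSharedOps relies on llvm::SetVector so that each fence-relevant op is recorded once, in discovery order, and can then be handed back as a plain vector. A tiny self-contained sketch of that idiom (illustrative only, using int instead of Operation *):

```cpp
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"

// SetVector deduplicates while keeping first-insertion order; takeVector()
// releases the elements as an ordinary vector, as the new helper does above.
llvm::SmallVector<int> collectUnique(llvm::ArrayRef<int> in) {
  llvm::SetVector<int> seen;
  for (int v : in)
    seen.insert(v); // duplicates are silently ignored
  return seen.takeVector();
}
```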

python/src/gluon_ir.cc

Lines changed: 5 additions & 0 deletions
@@ -299,6 +299,11 @@ void init_gluon_ir(py::module &&m) {
             self.create<ttng::AsyncTMAScatterOp>(descPtr, xOffsets, yOffset,
                                                  src);
           })
+      .def("create_fence_async_shared",
+           [](GluonOpBuilder &self, bool bCluster) -> OpState {
+             return self.create<ttng::FenceAsyncSharedOp>(bCluster);
+           })
+
       .def("create_broadcast",
            [](TritonOpBuilder &self, Value &arg, Type retTy) -> Value {
              return self.create<tt::BroadcastOp>(retTy, arg);
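The new Gluon binding forwards to the ttng::FenceAsyncSharedOp builder, with bCluster requesting a cluster-wide fence. A minimal C++-side sketch of the equivalent op creation (illustrative only; the insertion point and location handling shown here are plain OpBuilder usage, not GluonOpBuilder internals, and the include path is assumed):

```cpp
#include "mlir/IR/Builders.h"
#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"

namespace ttng = mlir::triton::nvidia_gpu;

// Illustrative only: insert a fence_async_shared op right before `op`.
static void insertFenceBefore(mlir::Operation *op, bool bCluster = false) {
  mlir::OpBuilder builder(op);
  builder.create<ttng::FenceAsyncSharedOp>(op->getLoc(), bCluster);
}
```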

python/src/ir.cc

Lines changed: 1 addition & 6 deletions
@@ -1425,12 +1425,7 @@ void init_triton_ir(py::module &&m) {
           })
       .def("create_expand_dims",
            [](TritonOpBuilder &self, Value &arg, int axis) -> Value {
-             auto argType = dyn_cast<RankedTensorType>(arg.getType());
-             auto argEltType = argType.getElementType();
-             std::vector<int64_t> retShape = argType.getShape();
-             retShape.insert(retShape.begin() + axis, 1);
-             return self.create<ExpandDimsOp>(
-                 RankedTensorType::get(retShape, argEltType), arg, axis);
+             return self.create<ExpandDimsOp>(arg, axis);
           })
       .def("create_cat",
            [](TritonOpBuilder &self, Value &lhs, Value &rhs) -> Value {
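The simplified binding leaves the result type to ExpandDimsOp itself (its builder/type inference) rather than computing it by hand, judging from the one-argument create call above. For reference, the shape rule the removed lines implemented is simply "insert a unit dimension at axis"; a standalone sketch:

```cpp
#include <cstdint>
#include <vector>

// Illustrative only: the expand_dims result shape inserts a 1 at `axis`,
// e.g. expandDimsShape({4, 8}, 1) -> {4, 1, 8}.
std::vector<int64_t> expandDimsShape(const std::vector<int64_t> &shape,
                                     int axis) {
  std::vector<int64_t> ret = shape;
  ret.insert(ret.begin() + axis, 1);
  return ret;
}
```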
