
Commit b489f34

Mogball authored and FindHao committed
[Blackwell] Optimize MMA warp specialization to allow multiple consumers of MMAv5 result (triton-lang#6487)
This is a redo of triton-lang#6457, but without affecting existing kernels. It adds a list of (barrier, pred) pairs to the MMAv5 ops, but does not alter codegen for the current MMAv5 ops -- they still keep a single barrier argument. This enables warp specialization to handle multiple consumers of MMA partition results without a separate "waiter" partition, which reduces the signalling latency between the MMA and load partitions and yields a small but consistent performance increase of up to 2.2% in dense fp8 matmul. Importantly, it also reduces the number of required warps by 1 and simplifies the codegen for warp specialization, which will be important for FMHA.
1 parent 0a70a66 commit b489f34

28 files changed: +574 -454 lines
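
To make the mechanism concrete, here is a minimal sketch of the new builder surface declared in the .td diff below (this is not code from the commit; loadBarrier, epilogueBarrier, loadPred, and epiloguePred are invented placeholders):

    // Hedged sketch: create an asynchronous MMAv5 op that signals two
    // consumer partitions directly, one (barrier, pred) pair per consumer.
    auto mma = rewriter.create<triton::nvidia_gpu::TCGen5MMAOp>(
        loc, a, b, acc, /*useD=*/vTrue, /*pred=*/vTrue, /*two_ctas=*/false,
        /*barriers=*/ValueRange{loadBarrier, epilogueBarrier},
        /*barrier_preds=*/ValueRange{loadPred, epiloguePred});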

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 33 additions & 0 deletions
@@ -8,6 +8,7 @@
 #include <vector>

 namespace mlir {
+class DominanceInfo;
 class ImplicitLocOpBuilder;
 namespace triton {

@@ -20,6 +21,38 @@ static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
 static const char *kLatencyAttrName = "tt.latency";

+//===----------------------------------------------------------------------===//
+// Hoisting Utilities
+//===----------------------------------------------------------------------===//
+
+// By default, an operation can be hoisted if it is pure scalar operation.
+bool isPureScalarOp(Operation *op);
+
+// Given a set of values and a reference operation, return true if all of the
+// values dominate the reference operation OR a set of "trivial" operations can
+// be moved before the reference operation such that the value set dominates the
+// reference operation.
+//
+// Returns false if it is not possible to make the values dominate the reference
+// operation. The function determines "trivial"-ness with the given callback.
+// By default, it determines that memory-effect-free and scalar operations are
+// trivial.
+bool getDominatingValueSetOpsToHoist(
+    DominanceInfo &domInfo, Operation *refOp, ArrayRef<Value> valueSet,
+    llvm::SetVector<Operation *> &toHoist,
+    function_ref<bool(Operation *)> canHoist = isPureScalarOp);
+
+// Hoist the given set of operations above the reference operation.
+void hoistOpsBefore(Operation *refOp,
+                    const llvm::SetVector<Operation *> &toHoist);
+// Hoist the given set of operations before the iterator.
+void hoistOpsBefore(Block *block, Block::iterator it,
+                    const llvm::SetVector<Operation *> &toHoist);
+
+//===----------------------------------------------------------------------===//
+// Loop Pipelining Utilities
+//===----------------------------------------------------------------------===//
+
 bool loopHasDistGreaterThanOne(scf::ForOp forOp);
 bool isOuterLoop(scf::ForOp forOp);
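
The intended usage pattern of these two utilities is presumably query-then-hoist: collect the ops first, and move them only if the whole value set turns out to be hoistable. A sketch under that assumption (outerLoop and bounds are placeholder names, not identifiers from this commit):

    llvm::SetVector<Operation *> toHoist;
    // Collect the trivial defining ops that would have to move for `bounds`
    // to dominate `outerLoop`; returns false without modifying the IR.
    if (getDominatingValueSetOpsToHoist(domInfo, outerLoop, bounds, toHoist))
      hoistOpsBefore(outerLoop, toHoist); // move them, defs before uses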

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td

Lines changed: 3 additions & 3 deletions
@@ -19,10 +19,10 @@ def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
       "void",
       "setUseAccumulator",
       (ins "::mlir::Value":$flag)>,
-    InterfaceMethod<"Associate a new barrier to this MMAv5 op.",
+    InterfaceMethod<"Associate a new completion barrier to this MMAv5 op.",
       "void",
-      "setBarrier",
-      (ins "::mlir::Value":$barrier)>,
+      "addCompletionBarrier",
+      (ins "::mlir::Value":$barrier, "::mlir::Value":$pred)>,
     InterfaceMethod<"Return the accumulator.",
       "::mlir::Value",
       "getAccumulator">,

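The rename from setBarrier to addCompletionBarrier reflects the semantic shift: an MMAv5 op no longer holds at most one barrier but accumulates a list of them, each paired with a predicate that gates whether the op arrives on that barrier.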
include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 89 additions & 42 deletions
@@ -389,55 +389,102 @@ def TTNG_TMAStoreWaitOp : TTNG_Op<"async_tma_store_wait"> {
   let assemblyFormat = "attr-dict";
 }

-def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>, DeclareOpInterfaceMethods<MMAv5OpInterface>]> {
-  let summary = "block level op mapping to tensorcore gen5 mma";
+def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [
+  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+  DeclareOpInterfaceMethods<DotOpInterface>,
+  DeclareOpInterfaceMethods<MMAv5OpInterface>,
+  SameVariadicOperandSize
+]> {
+  let summary = "block level op mapping to tensorcore gen5 mma";

-  let description = [{
-    $d += matrix_multiply($a, $b).
-    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
-    If there is a barrier the result will be safe to read after a barrier wait.
-    If $two_ctas is set the op will execute a matmul across two contiguous CTAs, it will read the data distributed across the two CTAs.
-    and syncronize both CTAs if the op is synchronous.
-  }];
+  let description = [{
+    $d += matrix_multiply($a, $b).
+    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
+    If there is a barrier the result will be safe to read after a barrier wait.
+    If $two_ctas is set the op will execute a matmul across two contiguous CTAs, it will read the data distributed across the two CTAs.
+    and syncronize both CTAs if the op is synchronous.
+  }];

-  let arguments = (ins TTG_MemDescType:$a,
-                       TTG_MemDescType:$b,
-                       TTG_MemDescType:$d,
-                       I1:$useD,
-                       I1:$pred,
-                       Optional<TTG_MemDescType>:$barrier,
-                       OptionalAttr<UnitAttr>:$two_ctas);
+  let arguments = (ins
+    TTG_MemDescType:$a,
+    TTG_MemDescType:$b,
+    TTG_MemDescType:$d,
+    I1:$useD,
+    I1:$pred,
+    Variadic<TTG_MemDescType>:$barriers,
+    Variadic<I1>:$barrier_preds,
+    OptionalAttr<UnitAttr>:$two_ctas
+  );

-  // TODO: improve printing format.
-  let assemblyFormat = "$a`,` $b`,` $d`,` $useD`,` $pred (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
+  let builders = [
+    OpBuilder<(ins
+      "Value":$a, "Value":$b, "Value":$d, "Value":$useD, "Value":$pred,
+      CArg<"bool", "false">:$two_ctas, CArg<"ValueRange", "{}">:$barriers,
+      CArg<"ValueRange", "{}">:$barrier_preds)>
+  ];
+
+  let assemblyFormat = [{
+    $a`,` $b`,` $d`,` $useD`,` $pred
+    `` custom<BarriersAndPreds>($barriers, $barrier_preds)
+    attr-dict `:` qualified(type($a)) `,` qualified(type($b)) `,`
+    qualified(type($d)) (`,` qualified(type($barriers))^)?
+  }];
 }

-def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface, ["verifyDims", "verifyOutputDims"]>, DeclareOpInterfaceMethods<MMAv5OpInterface>]> {
-  let summary = "block level op mapping to tensorcore gen5 mma";
+def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [
+  DeclareOpInterfaceMethods<MemoryEffectsOpInterface>,
+  DeclareOpInterfaceMethods<DotOpInterface, ["verifyDims", "verifyOutputDims"]>,
+  DeclareOpInterfaceMethods<MMAv5OpInterface>,
+  SameVariadicOperandSize
+]> {
+  let summary = "block level op mapping to tensorcore gen5 mma";

-  let description = [{
-    $d += matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale))
-    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
-    If there is a barrier the result will be safe to read after a barrier wait.
-  }];
+  let description = [{
+    $d += matrix_multiply(scale($lhs, $lhs_scale), scale(rlhs, $rhs_scale))
+    If no barrier is given the op is assumed to be synchronous otherwise the op will trigger a commit/arrive on the given barrier.
+    If there is a barrier the result will be safe to read after a barrier wait.
+  }];

-  let arguments = (ins TTG_MemDescType:$a,
-                       TTG_MemDescType:$b,
-                       TTG_MemDescType:$d,
-                       TTG_MemDescType:$a_scale,
-                       TTG_MemDescType:$b_scale,
-                       TT_ScaleDotElemTypeAttr:$a_type,
-                       TT_ScaleDotElemTypeAttr:$b_type,
-                       I1:$useD,
-                       I1:$pred,
-                       Optional<TTG_MemDescType>:$barrier);
-  let extraClassDeclaration = [{
-    int64_t getBlockM();
-    int64_t getBlockN();
-    int64_t getBlockK();
-  }];
-  // TODO: improve printing format.
-  let assemblyFormat = "$a `,` $b `,` $d `,` $a_scale `,` $b_scale `,` $useD`,` $pred `lhs` `=` $a_type `rhs` `=` $b_type (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
+  let arguments = (ins
+    TTG_MemDescType:$a,
+    TTG_MemDescType:$b,
+    TTG_MemDescType:$d,
+    TTG_MemDescType:$a_scale,
+    TTG_MemDescType:$b_scale,
+    TT_ScaleDotElemTypeAttr:$a_type,
+    TT_ScaleDotElemTypeAttr:$b_type,
+    I1:$useD,
+    I1:$pred,
+    Variadic<TTG_MemDescType>:$barriers,
+    Variadic<I1>:$barrier_preds
+  );
+  let extraClassDeclaration = [{
+    int64_t getBlockM();
+    int64_t getBlockN();
+    int64_t getBlockK();
+  }];
+
+  let builders = [
+    // Namespaces need to be prefixed so ODS prefers our
+    // custom builder signature over the default-generated one.
+    OpBuilder<(ins
+      "::mlir::Value":$a, "::mlir::Value":$b, "::mlir::Value":$d,
+      "::mlir::Value":$a_scale, "::mlir::Value":$b_scale,
+      "::mlir::triton::ScaleDotElemType":$a_type,
+      "::mlir::triton::ScaleDotElemType":$b_type,
+      "::mlir::Value":$useD, "::mlir::Value":$pred,
+      CArg<"::mlir::ValueRange", "{}">:$barriers,
+      CArg<"::mlir::ValueRange", "{}">:$barrier_preds)>
+  ];
+
+  let assemblyFormat = [{
+    $a `,` $b `,` $d `,` $a_scale `,` $b_scale `,` $useD`,` $pred
+    `lhs` `=` $a_type `rhs` `=` $b_type
+    `` custom<BarriersAndPreds>($barriers, $barrier_preds)
+    attr-dict `:` qualified(type($a)) `,` qualified(type($b)) `,`
+    qualified(type($d)) `,` qualified(type($a_scale)) `,`
+    qualified(type($b_scale)) (`,` qualified(type($barriers))^)?
+  }];
 }

 def TTNG_TMEMLoadOp : TTNG_Op<"tmem_load"> {
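
The custom<BarriersAndPreds> directive requires matching parse/print hooks in the dialect's C++ sources, which are not part of this excerpt. By MLIR's ODS conventions they would look roughly like the following; treat these signatures as an assumption, not the commit's actual code:

    // Assumed hook signatures for the custom<BarriersAndPreds> directive;
    // ODS emits calls to parseBarriersAndPreds/printBarriersAndPreds.
    static mlir::ParseResult parseBarriersAndPreds(
        mlir::OpAsmParser &parser,
        llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &barriers,
        llvm::SmallVectorImpl<mlir::OpAsmParser::UnresolvedOperand> &preds);
    static void printBarriersAndPreds(mlir::OpAsmPrinter &p,
                                      mlir::Operation *op,
                                      mlir::OperandRange barriers,
                                      mlir::OperandRange preds);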

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 2 additions & 2 deletions
@@ -548,7 +548,7 @@ class BlockedToMMAv5 : public mlir::OpRewritePattern<DotOp> {
         loc, accMemDescType, cvtAcc);
     auto vTrue = rewriter.create<arith::ConstantIntOp>(dotOp.getLoc(), 1, 1);
     auto mma = rewriter.create<triton::nvidia_gpu::TCGen5MMAOp>(
-        loc, a, b, acc, vTrue, vTrue, Value(), UnitAttr());
+        loc, a, b, acc, /*useD=*/vTrue, /*pred=*/vTrue);
     mma.setTwoCtas(useTwoCTAs);

     auto ld =
@@ -735,7 +735,7 @@ class ScaledBlockedToMMAv5
     auto vTrue = rewriter.create<arith::ConstantIntOp>(dotOp.getLoc(), 1, 1);
     rewriter.create<triton::nvidia_gpu::TCGen5MMAScaledOp>(
         loc, a, b, acc, scaleA, scaleB, dotOp.getAElemType(),
-        dotOp.getBElemType(), vTrue, vTrue, Value());
+        dotOp.getBElemType(), /*useD=*/vTrue, /*pred=*/vTrue);

     auto ld =
         rewriter.create<triton::nvidia_gpu::TMEMLoadOp>(loc, newAccType, acc);

lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp

Lines changed: 7 additions & 55 deletions
@@ -1,4 +1,3 @@
-#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/Dominance.h"
 #include "mlir/IR/ImplicitLocOpBuilder.h"
@@ -238,7 +237,8 @@ static Logue createLogueFrom(llvm::iterator_range<Block::iterator> ops,
 // Only hoist operations that are side-effect free and "cheap" (i.e. only scalar
 // operands). Importantly, we need to be able to hoist code generated by fusing
 // children loops into their parents so the algorithm can be applied
-// recursively.
+// recursively. This includes integer division, which are not speculatable, but
+// we know they will never divide by zero.
 static bool canHoistLoopBoundComputation(Operation *op) {
   auto isScalar = [](Type type) { return type.isIntOrIndexOrFloat(); };
   return isMemoryEffectFree(op) &&
@@ -251,50 +251,8 @@ static bool canHoistLoopBoundComputation(Operation *op) {
 static bool isOuterLoopInvariant(mlir::DominanceInfo &domInfo, scf::ForOp outer,
                                  ArrayRef<Value> values,
                                  llvm::SetVector<Operation *> &toHoist) {
-  // The set of operations within `outer` that are being checked if they can be
-  // hoisted. This set prevents checking operations twice but also if the
-  // computation can be hoisted, this becomes the set of operations to hoist.
-  llvm::SetVector<Operation *> visited;
-
-  // Climb the use-def chain breadth-first so that operations can be hoisted in
-  // the reverse visitation order.
-  std::queue<Value> queue;
-  for (Value value : values)
-    queue.push(value);
-
-  while (!queue.empty()) {
-    Value value = queue.front();
-    queue.pop();
-
-    // If the value properly dominates the outer loop, then it must be invariant
-    // to it.
-    if (domInfo.properlyDominates(value, outer))
-      continue;
-    // If the value is a block argument, it cannot be hoisted.
-    if (auto arg = dyn_cast<BlockArgument>(value))
-      return false;
-
-    Operation *op = value.getDefiningOp();
-    // Check if the op was already visited.
-    if (visited.contains(op))
-      continue;
-    // If the defining op cannot be hoisted, then the value cannot be made loop
-    // invariant.
-    if (!canHoistLoopBoundComputation(op))
-      return false;
-    visited.insert(op);
-    // Recurse on the operands of the op.
-    for (Value operand : op->getOperands())
-      queue.push(operand);
-  }
-
-  // The operations in `visited` must be hoisted. Note that operations are not
-  // added to `toHoist` unless all of `values` can be hoisted. This is to avoid
-  // hoisting operations for loops that don't end up getting fused if one of
-  // their bounds operands cannot be hoisted.
-  toHoist.insert(visited.begin(), visited.end());
-
-  return true;
+  return getDominatingValueSetOpsToHoist(domInfo, outer, values, toHoist,
+                                         canHoistLoopBoundComputation);
 }

 // Pessimistically assume the internal storage bitwidth for index types.
@@ -545,9 +503,7 @@ static void fuseOneLevel(LoopNestNode *parent, mlir::DominanceInfo &domInfo) {
   // The transformation will definitely succeed on `childrenToFuse`. `toHoist`
   // only contains the operations that must be hoisted for `childrenToFuse` to
   // be fusible.
-  toHoist = topologicalSort(toHoist);
-  for (Operation *op : toHoist)
-    op->moveBefore(outer);
+  hoistOpsBefore(outer, toHoist);

   // Determine the integer type to use for the length computations. Use an
   // integer bitwidth twice the size of the largest integer, up to 64 bits, to
@@ -993,9 +949,7 @@ static void sinkOps(Region &limit, Block *sinkBlock, Block::iterator sinkBefore,
   if (sunkOps.empty())
     return;

-  sunkOps = topologicalSort(sunkOps);
-  for (Operation *op : sunkOps)
-    op->moveBefore(sinkBlock, sinkBefore);
+  hoistOpsBefore(sinkBlock, sinkBefore, sunkOps);
 }

 // Sink ops from the prologue into the epilogue when possible.
@@ -1028,9 +982,7 @@ static LogicalResult speculateInnerLoopLength(scf::ForOp outerLoop,
     return failure();

   // Hoist the inner loop bounds computations if necessary.
-  toHoist = topologicalSort(toHoist);
-  for (Operation *op : toHoist)
-    op->moveBefore(outerLoop);
+  hoistOpsBefore(outerLoop, toHoist);

   // Mark the inner loop.
   ImplicitLocOpBuilder b(loc, outerLoop);
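
Since hoistOpsBefore replaces the same inline topologicalSort-then-moveBefore pattern in three places, its definition (in PipeliningUtility.cpp, outside this diff) presumably amounts to the following sketch, inferred from the code it replaces rather than taken from the commit:

    // Assumed implementation of the shared utility.
    void hoistOpsBefore(Operation *refOp,
                        const llvm::SetVector<Operation *> &toHoist) {
      // Sort so that defs are moved before their uses, then hoist each op.
      for (Operation *op : mlir::topologicalSort(toHoist))
        op->moveBefore(refOp);
    }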

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 2 additions & 1 deletion
@@ -913,7 +913,8 @@ scf::ForOp createBarrierAndWaitOps(scf::ForOp forOp, CoarseSchedule &schedule,
     barrierSlice =
         triton::createSingleBufferView(builder, barrierAlloc, barrierIdx);
   }
-  mma.setBarrier(barrierSlice);
+  mma.addCompletionBarrier(barrierSlice,
+                           builder.create<arith::ConstantIntOp>(loc, 1, 1));

   // List of buffers that may be used until wait completes
   SmallVector<Value> waitBuffers;
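
This pass still registers a single barrier with a constant-true predicate, matching the old setBarrier semantics. The new interface is what lets warp specialization fan the completion signal out to several consumers; a hypothetical continuation of the code above (peerBarrierSlice is an invented name for a second consumer's barrier):

    // With per-consumer barriers, a second consumer partition waits on its
    // own barrier instead of going through a dedicated "waiter" partition.
    Value vTrue = builder.create<arith::ConstantIntOp>(loc, 1, 1);
    mma.addCompletionBarrier(peerBarrierSlice, vTrue);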
