Commit 9f918cd

Merge OpenAI Triton commit 75eed88 (#4809)
This PR changes the Triton base from 167ed28 to 75eed88 (Jul 22). Pass rate: 98.62%.
2 parents (9f1ebab + a3ddde6), commit 9f918cd

File tree: 34 files changed (+1031 / −1229 lines)

34 files changed

+1031
-1229
lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 8 additions & 2 deletions
@@ -221,13 +221,19 @@ SmallVector<unsigned> getCTAOrder(Attribute layout);
 // [FIXME LL] Kill this function
 SmallVector<unsigned> getShapePerCTATile(RankedTensorType layout);
 
-// Returns the "logical" shape per CTA
+// Returns the "logical" shape per CTA.
+// When shape and CTASplitNum have different number of dimensions, we assume
+// only the last N between common dimensions are split.
+// Example1: shape = [2, 4, 8], CTASplitNum = [2, 2], ret = [2, 2, 4].
+// It can be caused by pipelining.
+// Example2: shape = [2, 4], CTASplitNum = [2, 2, 2], ret = [1, 2].
+// It can be caused by memory slicing.
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape);
 SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape);
 SmallVector<int64_t> getShapePerCTA(Type type);
 
-// Returns the shape per CTA, which is "physically" allocated
+// Returns the shape per CTA, which is "physically" allocated.
 // Such shapes may be bigger than the logical one due to, for example, padding
 // in shared memory.
 SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
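
The new comment fixes the semantics of the ArrayRef overload when the ranks of shape and CTASplitNum differ. As a minimal illustration (not part of the commit, and assuming the Triton headers are on the include path), the two documented cases look like this at a call site:

#include "triton/Dialect/TritonGPU/IR/Dialect.h"
#include "llvm/ADT/SmallVector.h"
#include <cstdint>

void shapePerCTAExamples() {
  using namespace mlir::triton::gpu;
  // Example1 (pipelining): CTASplitNum is shorter than shape, so the extra
  // leading dimension is left unsplit.
  llvm::SmallVector<int64_t> a =
      getShapePerCTA(/*CTASplitNum=*/{2, 2}, /*shape=*/{2, 4, 8}); // {2, 2, 4}
  // Example2 (memory slicing): CTASplitNum is longer than shape, so its extra
  // leading entries are ignored.
  llvm::SmallVector<int64_t> b =
      getShapePerCTA(/*CTASplitNum=*/{2, 2, 2}, /*shape=*/{2, 4}); // {1, 2}
  (void)a;
  (void)b;
}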

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 2 additions & 1 deletion
@@ -130,7 +130,8 @@ def TritonGPURewritePartitionDependencies : Pass<"tritongpu-rewrite-partition-de
     "mlir::triton::gpu::TritonGPUDialect",
     "mlir::scf::SCFDialect",
     "mlir::arith::ArithDialect",
-    "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect"
+    "mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
+    "mlir::triton::nvws::NVWSDialect"
   ];
 }

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 9 additions & 21 deletions
@@ -293,34 +293,22 @@ SmallVector<unsigned> getCTAOrder(Attribute layout) {
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape) {
   unsigned rank = shape.size();
+  auto splitNum = llvm::to_vector(CTASplitNum);
+  if (splitNum.size() <= rank) { // pipelining
+    splitNum.insert(splitNum.begin(), rank - splitNum.size(), 1);
+  } else { // memory slicing
+    splitNum =
+        llvm::to_vector(llvm::drop_begin(splitNum, splitNum.size() - rank));
+  }
   SmallVector<int64_t> shapePerCTA(rank);
   for (unsigned i = 0; i < rank; ++i) {
-    unsigned splitNum = std::min<unsigned>(shape[i], CTASplitNum[i]);
-    shapePerCTA[i] = shape[i] / splitNum;
+    shapePerCTA[i] = shape[i] / std::min<unsigned>(shape[i], splitNum[i]);
   }
   return shapePerCTA;
 }
 
 SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape) {
-  if (mlir::isa<SharedEncodingTrait>(layout)) {
-    // Special logic for pipeline pass, where shape is 3D and CTALayout is 2D.
-    // The first dim of shape is numStages. This is a work around, otherwise
-    // too many places would have to be modified in pipeline pass. Maybe we
-    // need to refactor this logic in the future.
-    auto CTASplitNum = cast<LayoutEncodingTrait>(layout).getCTASplitNum();
-    if (shape.size() == CTASplitNum.size() + 1) {
-      auto res = getShapePerCTA(CTASplitNum, shape.drop_front());
-      res.insert(res.begin(), shape.front());
-      return res;
-    }
-  }
-  SmallVector<unsigned> splitNum = getCTASplitNum(layout);
-  if (auto tmem = dyn_cast<nvidia_gpu::TensorMemoryEncodingAttr>(layout)) {
-    if (shape.size() > splitNum.size()) {
-      splitNum.insert(splitNum.begin(), shape.size() - splitNum.size(), 1);
-    }
-  }
-  return getShapePerCTA(splitNum, shape);
+  return getShapePerCTA(getCTASplitNum(layout), shape);
 }
 
 SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
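
For readers who want to check the new rank-normalization rule without building Triton, the following standalone sketch (added for illustration here, not code from the commit) mirrors the logic above with plain STL containers and asserts the two examples documented in Dialect.h:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

// Mirrors getShapePerCTA's normalization: pad splitNum with leading 1s when it
// is shorter than shape (pipelining), or drop its leading entries when it is
// longer (memory slicing), then divide each dimension by its split factor.
static std::vector<int64_t> shapePerCTASketch(std::vector<unsigned> splitNum,
                                              const std::vector<int64_t> &shape) {
  size_t rank = shape.size();
  if (splitNum.size() <= rank)
    splitNum.insert(splitNum.begin(), rank - splitNum.size(), 1u);
  else
    splitNum.erase(splitNum.begin(),
                   splitNum.begin() + (splitNum.size() - rank));
  std::vector<int64_t> result(rank);
  for (size_t i = 0; i < rank; ++i)
    result[i] = shape[i] / std::min<int64_t>(shape[i], splitNum[i]);
  return result;
}

int main() {
  // Example1 from Dialect.h: pipelining.
  assert((shapePerCTASketch({2, 2}, {2, 4, 8}) ==
          std::vector<int64_t>{2, 2, 4}));
  // Example2 from Dialect.h: memory slicing.
  assert((shapePerCTASketch({2, 2, 2}, {2, 4}) == std::vector<int64_t>{1, 2}));
  return 0;
}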

lib/Dialect/TritonGPU/Transforms/HoistTMEMAlloc.cpp

Lines changed: 1 addition & 107 deletions
@@ -148,94 +148,6 @@ class SinkTMEMLoad : public OpRewritePattern<TMEMTokenLoadOp> {
   }
 };
 
-// Combine back TMEM alloc and store. This is equivalent but gives us a more
-// canonical form to do further optimizations.
-class CombineTMEMStoreAndAlloc : public OpRewritePattern<TMEMTokenStoreOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(TMEMTokenStoreOp store,
-                                PatternRewriter &rewriter) const override {
-    if (!matchPattern(store.getPred(), m_One()))
-      return failure();
-    auto alloc = store.getDep().getDefiningOp<TMEMTokenAllocOp>();
-    if (!alloc)
-      return failure();
-    if (alloc->getBlock() != store->getBlock())
-      return failure();
-    alloc.getSrcMutable().assign(store.getSrc());
-    rewriter.replaceOp(store, alloc.getToken());
-    return success();
-  }
-};
-
-// Hoists a tmem alloc outside an if op like this:
-// %0 = scf.if {
-//   %1, %token0 = tmem.alloc %init
-//   ...
-//   %2 = tmem.load %1, %token1
-//   scf.yield %2
-// } else {
-//   scf.yield %init
-// }
-// ->
-// %a, %token0 = tmem.alloc %init
-// %token2 = scf.if {
-//
-//   ...
-//   scf.yield %token1
-// } else {
-//   scf.yield %token0
-// }
-// %2 = tmem.load %a, %token2
-class HoistTMEMAllocOutOfIf : public OpRewritePattern<ttng::TMEMAllocOp> {
-public:
-  using OpRewritePattern::OpRewritePattern;
-
-  LogicalResult matchAndRewrite(ttng::TMEMAllocOp alloc,
-                                PatternRewriter &rewriter) const override {
-    if (!alloc.getToken())
-      return failure();
-    Value init = alloc.getSrc();
-    if (!init)
-      return failure();
-    auto ifOp = dyn_cast<scf::IfOp>(alloc->getParentOp());
-    if (!ifOp)
-      return failure();
-    auto thenOp = ifOp.thenBlock()->getTerminator();
-    auto elseOp = ifOp.elseBlock()->getTerminator();
-    SmallVector<int> yieldArgs;
-    for (auto [thenOperand, elseOperand] :
-         llvm::zip(thenOp->getOpOperands(), elseOp->getOpOperands())) {
-      auto load = thenOperand.get().getDefiningOp<TMEMTokenLoadOp>();
-      if (!load || load.getSrc() != alloc.getResult())
-        continue;
-      if (elseOperand.get() != init)
-        continue;
-      yieldArgs.push_back(thenOperand.getOperandNumber());
-    }
-    if (yieldArgs.empty())
-      return failure();
-    // Since init is used in the else terminator we know that it dominates the
-    // if op.
-    alloc->moveBefore(ifOp);
-    rewriter.setInsertionPointAfter(ifOp);
-    for (int argNo : yieldArgs) {
-      auto load =
-          cast<TMEMTokenLoadOp>(thenOp->getOperand(argNo).getDefiningOp());
-      auto newLoad = cast<TMEMTokenLoadOp>(rewriter.clone(*load));
-      rewriter.modifyOpInPlace(ifOp, [&] {
-        ifOp->getResult(argNo).replaceAllUsesWith(newLoad.getResult());
-        newLoad.getDepMutable().assign(ifOp->getResult(argNo));
-        thenOp->setOperand(argNo, load.getToken());
-        elseOp->setOperand(argNo, alloc.getToken());
-        ifOp->getResult(argNo).setType(newLoad.getToken().getType());
-      });
-    }
-    return success();
-  }
-};
-
 // Remove loop-carried tensor dependencies if they are fed immediately into a
 // TMEM store by pulling the store into the previous iteration.
 class RotateTMEMStoreInLoop : public OpRewritePattern<TMEMTokenStoreOp> {

@@ -500,29 +412,11 @@ struct HoistTMEMAlloc
     mlir::RewritePatternSet patterns(&getContext());
     patterns.add<RotateTMEMStoreInLoop, RotateTMEMLoadInLoop,
                  CombineTMEMLoadAndStore, CombineTMEMStoreAndSelect,
-                 SinkTMEMLoad, RemoveUnusedTMEMLoad, CombineTMEMStoreAndAlloc,
-                 HoistTMEMAllocOutOfIf>(&getContext());
+                 SinkTMEMLoad, RemoveUnusedTMEMLoad>(&getContext());
     scf::ForOp::getCanonicalizationPatterns(patterns, &getContext());
     if (failed(applyPatternsGreedily(getOperation(), std::move(patterns)))) {
       llvm_unreachable("Failed to hoist tmem_store");
     }
-
-    // TODO: currently some code assumes that a mutable tmem alloc doesn't have
-    // an initial value. As a workaround we break up the op in order to keep
-    // this form for the downstream passes. We should remove this once the
-    // downstread passes are fixed.
-    m.walk([&](ttng::TMEMAllocOp alloc) {
-      if (alloc.getType().getMutableMemory() && alloc.getSrc()) {
-        OpBuilder builder(alloc);
-        builder.setInsertionPointAfter(alloc);
-        auto store = builder.create<ttng::TMEMStoreOp>(
-            alloc.getLoc(), builder.getType<AsyncTokenType>(),
-            alloc.getResult(), alloc.getToken(), alloc.getSrc(),
-            builder.create<arith::ConstantIntOp>(alloc.getLoc(), 1, 1));
-        alloc.getToken().replaceAllUsesExcept(store.getToken(), store);
-        alloc.getSrcMutable().clear();
-      }
-    });
   }
 };

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/AutomaticWarpSpecialization.cpp

Lines changed: 3 additions & 0 deletions
@@ -3,6 +3,7 @@
 #include "mlir/Pass/Pass.h"
 #include "mlir/Pass/PassManager.h"
 #include "mlir/Transforms/Passes.h"
+#include "third_party/nvidia/include/Dialect/NVWS/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 
@@ -42,6 +43,8 @@ void AutomaticWarpSpecialization::runOnOperation() {
   pm.addPass(createSCCPPass());
   pm.addPass(createCSEPass());
   pm.addPass(createTritonGPUPartitionLoops());
+  pm.addPass(createNVWSLowerAref());
+  pm.addPass(createNVWSLowerWarpGroup());
   if (failed(runPipeline(pm, getOperation())))
     return signalPassFailure();

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/Partition.cpp

Lines changed: 0 additions & 51 deletions
@@ -191,57 +191,6 @@ LogicalResult WarpSchedule::verify(scf::ForOp loop) const {
   if (failed)
     return failure();
 
-  // Within a loop iteration, the partitions must form a DAG. For example, the
-  // following is invalid:
-  //
-  //   scf.for %i = %lb to %ub step %step
-  //     %0 = op_a() {ttg.partition = 0}
-  //     %1 = op_b(%0) {ttg.partition = 1}
-  //     op_c(%1) {ttg.partition = 0}
-  //
-  PartitionGraph graph(loop, *this);
-  for (auto it = llvm::scc_begin(graph); !it.isAtEnd(); ++it) {
-    if (!it.hasCycle())
-      continue;
-    InFlightDiagnostic diag =
-        mlir::emitWarning(loop.getLoc(), "warp schedule contains a cycle");
-    for (auto [node, use] : *it) {
-      assert(use && "already checked that the root partition has no ancestors");
-      diag.attachNote(use->getOwner()->getLoc())
-          << "operation in partition #" << node->partition->getIndex()
-          << " uses value defined in partition #"
-          << opToPartition.at(use->get().getDefiningOp())->getIndex();
-    }
-    return failure();
-  }
-
-  // Each partition's stage must be strictly less than all of its consumers plus
-  // the distance.
-  for (Partition &partition : getPartitions()) {
-    bool failed = false;
-    auto callback = [&](OpResult output, OpOperand &use, unsigned distance) {
-      Operation *user = loop.getBody()->findAncestorOpInBlock(*use.getOwner());
-      const Partition *consumer = opToPartition.at(user);
-      if (partition.getStage() < consumer->getStage() + distance)
-        return;
-      InFlightDiagnostic diag =
-          mlir::emitWarning(loop.getLoc(), "partition #")
-          << partition.getIndex() << " has stage " << partition.getStage()
-          << " but is consumed by partition #" << consumer->getIndex()
-          << " with stage " << consumer->getStage() << " at distance "
-          << distance;
-      diag.attachNote(use.getOwner()->getLoc())
-          << "use of value defined in partition #" << partition.getIndex()
-          << " at " << distance << " iterations in the future";
-      diag.attachNote(output.getLoc())
-          << "value defined here in partition #" << partition.getIndex();
-      failed = true;
-    };
-    iterateUses(loop, &partition, callback);
-    if (failed)
-      return failure();
-  }
-
   return success();
 }

lib/Dialect/TritonGPU/Transforms/WarpSpecialization/PartitionLoops.cpp

Lines changed: 0 additions & 6 deletions
@@ -499,10 +499,4 @@ void PartitionLoops::runOnOperation() {
     if (failed(partitionLoop(loop)))
      return signalPassFailure();
   }
-
-  OpPassManager pm;
-  pm.addPass(mlir::triton::createNVWSLowerWarpGroup());
-
-  if (failed(runPipeline(pm, getOperation())))
-    return signalPassFailure();
 }
