
Commit 1572ee6

[Pipelining] Fix TMA descriptor pipelining when only the TMA store is pipelined (#7141)
When only the TMA store is pipelined, we need to make sure we double-buffer the TMA descriptor. We also need to make sure the store wait op waits until the store is done, not only until the smem is read.
1 parent 9695bae commit 1572ee6
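For intuition, the descriptor double buffering amounts to rotating through a small ring of descriptor slots with a wrap-around counter, so a still-in-flight TMA store can keep reading one slot while the next iteration writes another. A minimal, self-contained C++ sketch of that counter arithmetic (it mirrors the createIncrementModulo helper this commit promotes to a shared utility below; SlotCounter and its names are illustrative, not Triton APIs):

    #include <cstdint>

    // Ring-buffer index arithmetic behind the descriptor double buffering:
    // advance the slot index by one and wrap back to zero at the buffer count.
    struct SlotCounter {
      int32_t counter = 0;

      // Returns the next slot index modulo numBuffers; optionally reports
      // whether the index wrapped (the analogue of createIncrementModulo's
      // outWrapCond output parameter).
      int32_t next(int32_t numBuffers, bool *wrapped = nullptr) {
        int32_t addOne = counter + 1;
        bool outOfRange = addOne >= numBuffers;
        if (wrapped)
          *wrapped = outOfRange;
        counter = outOfRange ? 0 : addOne;
        return counter;
      }
    };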

File tree

15 files changed: +365 -265 lines changed


Makefile

Lines changed: 1 addition & 1 deletion
@@ -107,7 +107,7 @@ dev-install-llvm:
 
 .PHONY: golden-samples
 golden-samples: triton-opt
-	$(TRITON_OPT) test/TritonGPU/samples/simulated-grouped-gemm.mlir.in -tritongpu-assign-latencies -tritongpu-schedule-loops -tritongpu-pipeline -canonicalize | \
+	$(TRITON_OPT) test/TritonGPU/samples/simulated-grouped-gemm.mlir.in -tritongpu-pipeline -canonicalize | \
 	$(PYTHON) utils/generate-test-checks.py --source test/TritonGPU/samples/simulated-grouped-gemm.mlir.in --source_delim_regex="\bmodule" \
 		-o test/TritonGPU/samples/simulated-grouped-gemm.mlir
 	$(TRITON_OPT) test/TritonGPU/samples/descriptor-matmul-pipeline.mlir.in -tritongpu-assign-latencies -tritongpu-schedule-loops -tritongpu-pipeline -canonicalize | \

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 4 deletions
@@ -423,10 +423,6 @@ size_t linearize(ArrayRef<unsigned> multiDim, ArrayRef<unsigned> shape,
 Value addStringToModule(Location loc, RewriterBase &rewriter, StringRef key,
                         StringRef content);
 
-inline bool isKernel(FunctionOpInterface funcOp) {
-  return funcOp.getVisibility() == SymbolTable::Visibility::Public;
-}
-
 Value getStackPointer(RewriterBase &rewriter, FunctionOpInterface funcOp);
 
 Value getGlobalScratchPtr(Location loc, RewriterBase &rewriter,

include/triton/Dialect/Triton/IR/Utility.h

Lines changed: 4 additions & 0 deletions
@@ -182,6 +182,10 @@ Value getLastInductionValue(OpBuilder &b, scf::ForOp loop);
 
 MakeTensorPtrOp getMakeTensorPtrOp(Value v);
 
+bool isHostSideDescriptor(Value v);
+
+bool isKernel(FunctionOpInterface funcOp);
+
 } // namespace triton
 } // namespace mlir

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 7 additions & 0 deletions
@@ -19,6 +19,7 @@ static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
+class CoarseSchedule;
 
 //===----------------------------------------------------------------------===//
 // Hoisting Utilities
@@ -138,6 +139,12 @@ createSingleBufferView(OpBuilder &builder, Value alloc, Value idx);
 TypedValue<triton::gpu::MemDescType>
 createSingleBufferView(OpBuilder &builder, Value alloc, int idx);
 
+Value createIncrementModulo(OpBuilder &builder, Location loc, Value counter,
+                            Value modulus, Value zero, Value one,
+                            Value *outWrapCond = nullptr);
+
+scf::ForOp lowerTMADescriptors(scf::ForOp forOp, CoarseSchedule &schedule);
+
 } // namespace triton
 } // namespace mlir
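The createIncrementModulo declaration above is the ring-buffer step used for the descriptor slots; its body (visible in the LowerLoops.cpp deletion further down) selects between counter + 1 and zero. A hedged usage sketch, where builder, loc, counter, numBuffersVal, zero, and one stand in for i32 Values the calling pass already has in scope:

    // Sketch: advance a per-descriptor slot counter inside a lowering pass.
    // All operands are assumed to be existing Values in the enclosing pass.
    Value wrapCond; // receives the i1 "index wrapped back to zero" condition
    Value nextCounter = triton::createIncrementModulo(
        builder, loc, counter, numBuffersVal, zero, one, &wrapCond);
    // nextCounter == (counter + 1 >= numBuffers) ? 0 : counter + 1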

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 27 additions & 0 deletions
@@ -2,6 +2,7 @@
 #define TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_
 
 #include "mlir/Dialect/SCF/IR/SCF.h"
+#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Support/LLVM.h"
 #include "triton/Dialect/TritonGPU/Transforms/PipelineExpander.h"
 #include "llvm/ADT/ArrayRef.h"
@@ -164,6 +165,32 @@ class CoarseSchedule {
 // the same stage and ordering cluster as the anchor op.
 void scheduleDependencies(scf::ForOp forOp, CoarseSchedule &schedule);
 
+class OpBuilderForStage : public mlir::ImplicitLocOpBuilder,
+                          public OpBuilder::Listener {
+public:
+  explicit OpBuilderForStage(Location loc, Operation *op,
+                             CoarseSchedule &schedule)
+      : ImplicitLocOpBuilder(loc, op, this), schedule(schedule) {
+    if (auto it = schedule.find(op); it != schedule.end())
+      std::tie(stage, cluster) = it->second;
+  }
+
+  void setStageCluster(std::pair<int, CoarseSchedule::Cluster> stageCluster) {
+    stage = stageCluster.first;
+    cluster = stageCluster.second;
+  }
+
+  void notifyOperationInserted(Operation *op, InsertPoint previous) {
+    if (stage && cluster)
+      schedule.insert(op, *stage, *cluster);
+  }
+
+private:
+  std::optional<int> stage;
+  std::optional<CoarseSchedule::Cluster> cluster;
+  CoarseSchedule &schedule;
+};
+
 } // namespace triton
 } // namespace mlir
 #endif // TRITON_TRITONGPU_TRANSFORM_PIPELINE_SCHEDULE_H_
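OpBuilderForStage, now exposed from this header, couples op creation with scheduling: because it installs itself as an OpBuilder::Listener, every op created through it is automatically inserted into the CoarseSchedule at the stage and cluster of the anchor op. A hedged sketch of how a pass uses it (makeDescOp, someOtherOp, and schedule are assumed to come from the surrounding pass, as in the TMA-descriptor lowering below):

    // Anchor the builder on makeDescOp; new ops inherit its stage/cluster.
    OpBuilderForStage builder(makeDescOp.getLoc(), makeDescOp, schedule);
    Value zero = builder.create<arith::ConstantIntOp>(0, 32); // auto-scheduled

    // Retarget later ops to another op's stage/cluster when needed.
    if (auto it = schedule.find(someOtherOp); it != schedule.end())
      builder.setStageCluster(it->second);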

lib/Conversion/TritonGPUToLLVM/FuncOpToLLVM.cpp

Lines changed: 3 additions & 3 deletions
@@ -66,7 +66,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
     // 1. Modify the function type to add the new arguments.
     auto funcTy = funcOp.getFunctionType();
     auto amendedInputTy = llvm::to_vector<4>(funcTy.getInputs());
-    bool isKernel = LLVM::isKernel(funcOp);
+    bool isKernel = triton::isKernel(funcOp);
     if (isKernel) {
       for (auto i : llvm::seq(amendedInputTy.size())) {
         if (isa<TensorDescType>(amendedInputTy[i])) {
@@ -111,7 +111,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
   // Map the MLIR attribute `tt.nv_tma_desc` to the appropriate LLVM and NVVM
   // attributes.
   static void handleByvalTmaDescArgs(LLVM::LLVMFuncOp &llvmFuncOp) {
-    const bool isKernel = LLVM::isKernel(llvmFuncOp);
+    const bool isKernel = triton::isKernel(llvmFuncOp);
     for (unsigned i = 0; i < llvmFuncOp.getNumArguments(); ++i) {
       const auto attrs = llvmFuncOp.getArgAttrDict(i);
       if (!attrs) {
@@ -161,7 +161,7 @@ struct FuncOpConversion : public ConvertOpToLLVMPattern<triton::FuncOp> {
 
     auto ctx = funcOp->getContext();
 
-    if (LLVM::isKernel(funcOp)) {
+    if (triton::isKernel(funcOp)) {
       // Set an attribute to indicate this function is a kernel entry.
       newFuncOp->setAttr(NVVM::NVVMDialect::getKernelFuncAttrName(),
                          rewriter.getIntegerAttr(type::u1Ty(ctx), 1));

lib/Dialect/Triton/IR/Utility.cpp

Lines changed: 14 additions & 0 deletions
@@ -103,3 +103,17 @@ Value tt::getLastInductionValue(OpBuilder &b, scf::ForOp loop) {
       loc, b.create<arith::DivSIOp>(loc, diff, loop.getStep()), loop.getStep());
   return b.create<arith::AddIOp>(loc, ceilStep, loop.getLowerBound());
 }
+
+bool tt::isKernel(FunctionOpInterface funcOp) {
+  return funcOp.getVisibility() == SymbolTable::Visibility::Public;
+}
+
+bool tt::isHostSideDescriptor(Value v) {
+  auto arg = dyn_cast<BlockArgument>(v);
+  if (!arg)
+    return false;
+  auto funcOp = dyn_cast<FunctionOpInterface>(arg.getOwner()->getParentOp());
+  if (!funcOp)
+    return false;
+  return tt::isKernel(funcOp);
+}
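The new isHostSideDescriptor predicate classifies where a TMA descriptor came from: per the implementation above, it returns true exactly when the value is a block argument of a public (kernel entry) function, i.e. a descriptor built on the host and passed in as a kernel parameter, and false for descriptors materialized inside the kernel. A hedged sketch of the distinction (desc is a hypothetical Value):

    // desc: some Value feeding a descriptor load/store op.
    if (triton::isHostSideDescriptor(desc)) {
      // Kernel argument: the descriptor bytes were written by the host, so a
      // pass would typically leave it untouched.
    } else {
      // Defined in the kernel (e.g. by tt.make_tensor_descriptor) or inside a
      // non-kernel function; a candidate for the double-buffered update.
    }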

lib/Dialect/TritonGPU/Transforms/Pipeliner/LowerLoops.cpp

Lines changed: 0 additions & 164 deletions
@@ -1,6 +1,5 @@
 #include "mlir/Dialect/UB/IR/UBOps.h"
 #include "mlir/IR/Dominance.h"
-#include "mlir/IR/ImplicitLocOpBuilder.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 #include "triton/Analysis/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
@@ -38,32 +37,6 @@ namespace {
 // UTILS
 /////////////////////////////
 
-class OpBuilderForStage : public ImplicitLocOpBuilder,
-                          public OpBuilder::Listener {
-public:
-  explicit OpBuilderForStage(Location loc, Operation *op,
-                             CoarseSchedule &schedule)
-      : ImplicitLocOpBuilder(loc, op, this), schedule(schedule) {
-    if (auto it = schedule.find(op); it != schedule.end())
-      std::tie(stage, cluster) = it->second;
-  }
-
-  void setStageCluster(std::pair<int, CoarseSchedule::Cluster> stageCluster) {
-    stage = stageCluster.first;
-    cluster = stageCluster.second;
-  }
-
-  void notifyOperationInserted(Operation *op, InsertPoint previous) {
-    if (stage && cluster)
-      schedule.insert(op, *stage, *cluster);
-  }
-
-private:
-  std::optional<int> stage;
-  std::optional<CoarseSchedule::Cluster> cluster;
-  CoarseSchedule &schedule;
-};
-
 int getSelfLatencyFromAttr(Operation *op) {
   auto module = op->getParentOfType<ModuleOp>();
   auto helper = TritonDialect::getLoaded(module)->getSelfLatencyAttrHelper();
@@ -207,17 +180,6 @@ int getDefUseStageDiff(Operation *op, scf::ForOp forOp,
   return useStage.value() - defStage;
 }
 
-Value createIncrementModulo(OpBuilder &builder, Location loc, Value counter,
-                            Value modulus, Value zero, Value one,
-                            Value *outWrapCond = nullptr) {
-  Value addOne = builder.create<arith::AddIOp>(loc, counter, one);
-  Value outOfRangeCond = builder.create<arith::CmpIOp>(
-      loc, arith::CmpIPredicate::sge, addOne, modulus);
-  if (outWrapCond)
-    *outWrapCond = outOfRangeCond;
-  return builder.create<arith::SelectOp>(loc, outOfRangeCond, zero, addOne);
-}
-
 void replaceAllUsesDominatedBy(Operation *domOp, Value newValue, Value oldValue,
                                DominanceInfo &domInfo) {
   if (newValue == oldValue)
@@ -644,132 +606,6 @@ scf::ForOp lowerLoads(scf::ForOp forOp, CoarseSchedule &schedule) {
   return forOp;
 }
 
-/////////////////////////////
-// LOWER TMA DESCRIPTORS
-/////////////////////////////
-
-void allocTMABuffers(scf::ForOp forOp,
-                     llvm::MapVector<Operation *, Value> &tmaBufferMapping,
-                     int maxStage) {
-  IRRewriter rewriter(forOp);
-
-  // Create a multi-buffered allocation for each MakeTensorDescOp call in the
-  // loop
-  forOp.walk([&](tt::MakeTensorDescOp op) {
-    // TODO peter: walk to loop yield to find the init value if this is a
-    // loop-carried value. That would save us from allocating another buffer
-    // just for the init value
-    auto loc = op.getLoc();
-    Value alloc = rewriter.create<triton::gpu::GlobalScratchAllocOp>(
-        loc, triton::getPointerType(rewriter.getI8Type()),
-        maxStage * ttng::TMA_SIZE_BYTES, ttng::TMA_ALIGN);
-    tmaBufferMapping[op.getOperation()] = alloc;
-  });
-}
-
-Value subviewTMADescriptor(OpBuilder &builder, Location loc, Value alloc,
-                           Value counter) {
-  Value tmaSizeVal =
-      builder.create<arith::ConstantIntOp>(loc, ttng::TMA_SIZE_BYTES, 32);
-  Value offset = builder.create<arith::MulIOp>(loc, tmaSizeVal, counter);
-  return builder.create<triton::AddPtrOp>(loc, alloc.getType(), alloc, offset);
-}
-
-LogicalResult rewriteTMABufferUpdates(
-    scf::ForOp forOp,
-    const llvm::MapVector<Operation *, Value> &tmaBufferMapping,
-    ArrayRef<BlockArgument> tmaCounters, int numBuffers, Value one, Value zero,
-    CoarseSchedule &schedule) {
-  assert(tmaBufferMapping.size() == tmaCounters.size());
-
-  Value numBuffersVal = mlir::OpBuilder(forOp).create<arith::ConstantIntOp>(
-      forOp.getLoc(), numBuffers, 32);
-
-  for (auto [iOp, pair] : llvm::enumerate(tmaBufferMapping)) {
-    auto &[op, alloc] = pair;
-
-    // Rewriter MakeTensorDescOp as writing a TMA descriptor
-    auto makeDescOp = cast<tt::MakeTensorDescOp>(op);
-
-    OpBuilderForStage builder(makeDescOp.getLoc(), makeDescOp, schedule);
-
-    BlockArgument counter = tmaCounters[iOp];
-    Value nextBuf =
-        subviewTMADescriptor(builder, builder.getLoc(), alloc, counter);
-    if (failed(ttng::createTMADesc(nextBuf, makeDescOp, builder))) {
-      return failure();
-    }
-    builder.create<ttng::TensormapFenceproxyAcquireOp>(nextBuf);
-    Value nextDesc = builder.create<ttng::ReinterpretTensorDescOp>(
-        makeDescOp.getType(), nextBuf);
-
-    makeDescOp.getResult().replaceAllUsesWith(nextDesc);
-
-    // Increment the buffer index counter
-    Value nextCounter = createIncrementModulo(
-        builder, builder.getLoc(), counter, numBuffersVal, zero, one);
-
-    // If we are in a (potentially nested) if region, propagate the counter
-    // up to the main for op body scope
-    IRRewriter rewriter(forOp);
-    nextCounter =
-        sinkValueRedefinition(rewriter, counter, nextCounter, op->getBlock());
-
-    // Finally, rewrite the loop level yield
-    auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
-    forYield.setOperand(counter.getArgNumber() - 1, nextCounter);
-  }
-  return success();
-}
-
-scf::ForOp lowerTMADescriptors(scf::ForOp forOp, CoarseSchedule &schedule) {
-  llvm::MapVector<Operation *, Value> tmaBufferMapping;
-  int maxStage = schedule.getNumStages() - 1;
-  for (auto &op : forOp.getBody()->without_terminator()) {
-    if (auto wgMmaOp = dyn_cast<ttng::WarpGroupDotOp>(&op)) {
-      // Hopper only: Add one more buffer slice if there is a WarpGroupDotOp,
-      // as if it will be pipelined, we will effectively make the pipeline
-      // one stage longer.
-      maxStage += 1;
-      break;
-    }
-  }
-  allocTMABuffers(forOp, tmaBufferMapping, maxStage);
-  if (tmaBufferMapping.empty())
-    return forOp;
-
-  IRRewriter builder(forOp);
-  Location loc = forOp.getLoc();
-  Value zero = builder.create<arith::ConstantIntOp>(loc, 0, 32);
-  Value one = builder.create<arith::ConstantIntOp>(loc, 1, 32);
-  SmallVector<Value> newOperands;
-  unsigned newOperandIndex = forOp.getBody()->getNumArguments();
-  // Create one counter per TMA buffer. This allows the descriptors to be
-  // updated independently without needing to write duplicate of existing tma
-  // descriptors.
-  unsigned tmaCounterArgsStartIdx = newOperandIndex + newOperands.size();
-  for (int i = 0; i < tmaBufferMapping.size(); ++i) {
-    newOperands.push_back(zero);
-  }
-
-  forOp = addIterArgsToLoop(builder, forOp, newOperands);
-
-  auto tmaCounters = ArrayRef<BlockArgument>(forOp.getBody()->getArguments())
-                         .slice(tmaCounterArgsStartIdx);
-
-  // Update yield op with temporary yield values
-  auto forYield = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
-  for (unsigned i = 0; i < newOperands.size(); ++i) {
-    forYield.getResultsMutable().append(newOperands[i]);
-  }
-
-  if (failed(rewriteTMABufferUpdates(forOp, tmaBufferMapping, tmaCounters,
-                                     maxStage, one, zero, schedule))) {
-    llvm_unreachable("Failed to rewrite TMA ops");
-  }
-  return forOp;
-}
-
 /////////////////////////////
 // LOWER MMA
 /////////////////////////////
