intel
diff --git a/‎bench/triton_bench/swiglu.py‎
Lines changed: 1 addition & 1 deletion b/‎bench/triton_bench/swiglu.py‎
Lines changed: 1 addition & 1 deletion
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/Partition.h‎
Lines changed: 10 additions & 2 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/Partition.h‎
Lines changed: 10 additions & 2 deletions
diff --git a/‎include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h‎
Lines changed: 1 addition & 0 deletions b/‎include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/Analysis/Membar.cpp‎
Lines changed: 8 additions & 0 deletions b/‎lib/Analysis/Membar.cpp‎
Lines changed: 8 additions & 0 deletions
diff --git a/‎lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp‎
Lines changed: 5 additions & 0 deletions b/‎lib/Conversion/TritonToTritonGPU/RelayoutTritonGPU.cpp‎
Lines changed: 5 additions & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp‎
Lines changed: 6 additions & 2 deletions b/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/Schedule.cpp‎
Lines changed: 6 additions & 2 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp‎
Lines changed: 11 additions & 1 deletion b/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/ScheduleLoops.cpp‎
Lines changed: 11 additions & 1 deletion
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp‎
Lines changed: 1 addition & 0 deletions b/‎lib/Dialect/TritonGPU/Transforms/Pipeliner/SoftwarePipeliner.cpp‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎lib/Dialect/TritonGPU/Transforms/Utility.cpp‎
Lines changed: 16 additions & 11 deletions b/‎lib/Dialect/TritonGPU/Transforms/Utility.cpp‎
Lines changed: 16 additions & 11 deletions
@@ -69,7 +69,7 @@ def forward(ctx, a, alpha, precision_config, routing_data):
             n_tokens,
             BLOCK_M=BLOCK_M,
             BLOCK_N=BLOCK_N,
-            EVEN_N=(N // 2) % 2 == 0,
+            EVEN_N=(N // 2) % BLOCK_N == 0,
             M_BLOCKS=M_BLOCKS,
             N_BLOCKS=N_BLOCKS,
             flexpoint_saturate_inf=flex_ctx.saturate_inf,
 
@@ -61,8 +61,6 @@ class WarpSchedule {
 
   // Create a new partition with a stage.
   Partition *addPartition(unsigned stage);
-  // Give each partition a new index and order. The indices must be unique.
-  void reorderPartitions(ArrayRef<unsigned> order);
   // Update the op to partition mapping.
   void updatePartitions();
 
@@ -74,15 +72,25 @@ class WarpSchedule {
   Partition *getPartition(unsigned idx);
   // Get the partition at the index.
   const Partition *getPartition(unsigned idx) const;
+  // Insert an operation into a partition.
+  void insert(Partition *partition, Operation *op);
   // Return an iterator range over the partitions.
   auto getPartitions() { return llvm::make_pointee_range(partitions); }
   // Return an iterator range over the partitions.
   auto getPartitions() const { return llvm::make_pointee_range(partitions); }
+  // Get the number of partitions.
+  unsigned getNumPartitions() const { return partitions.size(); }
   // Get the root partition.
   Partition *getRootPartition() { return rootPartition.get(); }
   // Get the root partition.
   const Partition *getRootPartition() const { return rootPartition.get(); }
 
+  // Return true if an operation is assigned to a partition.
+  bool isScheduled(Operation *op) const;
+  // Schedule an operation to a partition if it is not already scheduled. Return
+  // true if the operation was scheduled.
+  bool trySchedule(Partition *partition, Operation *op);
+
   // Deserialize a warp schedule from an `scf.for` op using the attributes
   // tagged on operations in its body.
   static FailureOr<WarpSchedule> deserialize(scf::ForOp loop);
 
@@ -19,6 +19,7 @@ static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
+static const char *kAssignedStageAttrName = "ttg.assigned_stage";
 
 //===----------------------------------------------------------------------===//
 // Hoisting Utilities
 
@@ -1,6 +1,7 @@
 #include "triton/Analysis/Membar.h"
 #include "triton/Analysis/Alias.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/GPU/IR/GPUDialect.h"
@@ -216,6 +217,13 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,
         }
       }
     }
+    // If this op may be signalling other threads asynchronously, make sure
+    // all shared memory transactions are complete beforehand.
+    if (isa<triton::nvidia_gpu::ArriveBarrierOp>(op)) {
+      Interval<size_t> allIntervals(0, std::numeric_limits<size_t>::max());
+      curBlockInfo.syncWriteIntervals[allIntervals].insert(op);
+      curBlockInfo.syncReadIntervals[allIntervals].insert(op);
+    }
     scratchBufferId = allocation->getBufferId(op);
   }
 
 
@@ -44,6 +44,11 @@ struct TMEMLoadOpPattern : public OpConversionPattern<ttng::TMEMLoadOp> {
     RankedTensorType type = getTMEMTensorLayout(
         typeConverter, op.getType(), op.getSrc().getType(), lookupNumWarps(op));
     rewriter.modifyOpInPlace(op, [&] { op.getResult().setType(type); });
+    Type resultType = getTypeConverter()->convertType(op.getType());
+    rewriter.setInsertionPointAfter(op);
+    auto cvt = rewriter.create<ConvertLayoutOp>(op.getLoc(), resultType,
+                                                op.getResult());
+    rewriter.replaceAllUsesExcept(op.getResult(), cvt, cvt);
     return success();
   }
 };
 
@@ -189,9 +189,13 @@ void tt::CoarseSchedule::serialize(scf::ForOp &forOp) {
   for (auto [op, stage, cluster] : getOpsInOrder(forOp)) {
     setStageCluster(op, stage, *cluster);
   }
+
+  Builder b(forOp.getContext());
+  int maxStages = numStages - 1;
+  if (auto maxStageAttr = tryGetMaxStage(forOp))
+    maxStages = std::max(maxStages, *maxStageAttr);
   forOp->setAttr(mlir::triton::kScheduledMaxStageAttrName,
-                 IntegerAttr::get(IntegerType::get(forOp.getContext(), 32),
-                                  numStages - 1));
+                 b.getI32IntegerAttr(maxStages));
 }
 
 // Create a CoarseSchedule based on forOp's <stage, cluster>.
 
@@ -59,6 +59,8 @@ bool hasLatenciesAssigned(scf::ForOp forOp,
   for (auto &op : forOp.getBody()->without_terminator()) {
     if (opLatency.count(&op))
       return true;
+    if (op.getAttr(kAssignedStageAttrName))
+      return true;
   }
   return false;
 }
@@ -70,12 +72,15 @@ CoarseSchedule scheduleKeyOps(scf::ForOp forOp,
   auto terminator = cast<scf::YieldOp>(forOp.getBody()->getTerminator());
   // Determine all operations that have a non-zero latency
   SmallVector<Operation *> latOps;
+  SmallVector<Operation *> stagedOps;
   for (auto &op : forOp.getBody()->without_terminator()) {
     if (opLatency.count(&op))
       latOps.push_back(&op);
+    if (op.getAttr(kAssignedStageAttrName))
+      stagedOps.push_back(&op);
   }
   // If no latency ops, nothing to schedule
-  if (latOps.empty())
+  if (latOps.empty() && stagedOps.empty())
     return CoarseSchedule(0);
 
   DominanceInfo domInfo(forOp);
@@ -123,6 +128,11 @@ CoarseSchedule scheduleKeyOps(scf::ForOp forOp,
       opToStage[op] = maxDistance - dist;
   }
 
+  for (Operation *op : stagedOps) {
+    auto stageAttr = op->getAttrOfType<IntegerAttr>(kAssignedStageAttrName);
+    opToStage[op] = stageAttr.getInt();
+  }
+
   auto stages = llvm::make_second_range(opToStage);
   int maxStage = *llvm::max_element(stages);
   CoarseSchedule schedule(maxStage + 1);
 
@@ -73,6 +73,7 @@ static void removeAttributes(ModuleOp moduleOp) {
     op->removeAttr(mlir::triton::kLoopStageAttrName);
     op->removeAttr(mlir::triton::kLoopClusterAttrName);
     op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);
+    op->removeAttr(mlir::triton::kAssignedStageAttrName);
   });
 }
 
 
@@ -1402,30 +1402,35 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
   // Save the operand to replace / delete later (avoid iterator invalidation).
   // TODO: can we use an early_inc iterator?
   for (OpOperand &use : oldUse->getUses()) {
+    // Propagate through `ttg.warp_specialize`.
+    if (auto wsOp = dyn_cast<ttg::WarpSpecializeOp>(use.getOwner())) {
+      for (Region *region : wsOp.getPartitionRegions())
+        region->getArgument(use.getOperandNumber()).setType(val.getType());
+    }
+
     // Non-subview/trans ops will be replaced by `val`.
-    if (!isa<triton::gpu::MemDescTransOp, triton::gpu::MemDescSubviewOp>(
-            use.getOwner())) {
+    if (!isa<ttg::MemDescTransOp, ttg::MemDescSubviewOp>(use.getOwner())) {
       operandsToReplace.push_back(&use);
       continue;
     }
+
     Operation *user = use.getOwner();
     // `subview(old_op)` is replaced by a new `subview(val)`.
     OpBuilder::InsertionGuard g(builder);
     builder.setInsertionPoint(user);
     Value newVal;
-    if (auto subview = dyn_cast<triton::gpu::MemDescSubviewOp>(user)) {
-      triton::gpu::MemDescType oldType = subview.getType();
-      bool isMutable =
-          cast<triton::gpu::MemDescType>(val.getType()).getMutableMemory();
-      Type newDstType = triton::gpu::MemDescType::get(
+    if (auto subview = dyn_cast<ttg::MemDescSubviewOp>(user)) {
+      ttg::MemDescType oldType = subview.getType();
+      bool isMutable = cast<ttg::MemDescType>(val.getType()).getMutableMemory();
+      Type newDstType = ttg::MemDescType::get(
           oldType.getShape(), oldType.getElementType(), oldType.getEncoding(),
           oldType.getMemorySpace(), isMutable);
-      newVal = builder.create<triton::gpu::MemDescSubviewOp>(
+      newVal = builder.create<ttg::MemDescSubviewOp>(
           subview.getLoc(), newDstType, val, subview.getOffsets());
       newVal.getDefiningOp()->setAttrs(user->getAttrs());
-    } else if (auto trans = dyn_cast<triton::gpu::MemDescTransOp>(user)) {
-      newVal = builder.create<triton::gpu::MemDescTransOp>(trans.getLoc(), val,
-                                                           trans.getOrder());
+    } else if (auto trans = dyn_cast<ttg::MemDescTransOp>(user)) {
+      newVal = builder.create<ttg::MemDescTransOp>(trans.getLoc(), val,
+                                                   trans.getOrder());
       newVal.getDefiningOp()->setAttrs(user->getAttrs());
     }
     assert(newVal);
Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`#include "triton/Analysis/Membar.h"`
`2`	`2`	`#include "triton/Analysis/Alias.h"`
`3`	`3`	`#include "triton/Dialect/TritonGPU/IR/Dialect.h"`
	`4`	`+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"`
`4`	`5`
`5`	`6`	`#include "mlir/Dialect/Func/IR/FuncOps.h"`
`6`	`7`	`#include "mlir/Dialect/GPU/IR/GPUDialect.h"`
`@@ -216,6 +217,13 @@ void MembarAnalysis::update(Operation *op, BlockInfo *blockInfo,`
`216`	`217`	`}`
`217`	`218`	`}`
`218`	`219`	`}`
	`220`	`+ // If this op may be signalling other threads asynchronously, make sure`
	`221`	`+ // all shared memory transactions are complete beforehand.`
	`222`	`+ if (isa<triton::nvidia_gpu::ArriveBarrierOp>(op)) {`
	`223`	`+ Interval<size_t> allIntervals(0, std::numeric_limits<size_t>::max());`
	`224`	`+ curBlockInfo.syncWriteIntervals[allIntervals].insert(op);`
	`225`	`+ curBlockInfo.syncReadIntervals[allIntervals].insert(op);`
	`226`	`+ }`
`219`	`227`	`scratchBufferId = allocation->getBufferId(op);`
`220`	`228`	`}`
`221`	`229`
Original file line number	Diff line number	Diff line change
`@@ -73,6 +73,7 @@ static void removeAttributes(ModuleOp moduleOp) {`
`73`	`73`	`op->removeAttr(mlir::triton::kLoopStageAttrName);`
`74`	`74`	`op->removeAttr(mlir::triton::kLoopClusterAttrName);`
`75`	`75`	`op->removeAttr(mlir::triton::kScheduledMaxStageAttrName);`
	`76`	`+ op->removeAttr(mlir::triton::kAssignedStageAttrName);`
`76`	`77`	`});`
`77`	`78`	`}`
`78`	`79`