Commit bf46a53

Merge OpenAI Triton commit 882a02e (#3776)

This PR changes the Triton base from a39389a to 882a02e (Mar 27). Pass rate: 89.99%.

2 parents fed3b87 + 6c196eb; commit bf46a53

File tree: 58 files changed, +2025 −795 lines


.github/workflows/integration-tests.yml

Lines changed: 1 addition & 1 deletion
@@ -56,7 +56,7 @@ jobs:
       - name: Detect if build deps (e.g. LLVM hash) changed
         id: detect-change
         if: github.event_name == 'push'
-        uses: tj-actions/changed-files@v45
+        uses: tj-actions/changed-files@v46
         with:
           files: |
             cmake/*.txt

.github/workflows/integration-tests.yml.in

Lines changed: 1 addition & 1 deletion
@@ -62,7 +62,7 @@ jobs:
       - name: Detect if build deps (e.g. LLVM hash) changed
         id: detect-change
         if: github.event_name == 'push'
-        uses: tj-actions/changed-files@v45
+        uses: tj-actions/changed-files@v46
         with:
           files: |
             cmake/*.txt

README.md

Lines changed: 1 addition & 1 deletion
@@ -4,7 +4,7 @@
 
 | **`Documentation`** | **`Nightly Wheels`** |
 |-------------------- | -------------------- |
-| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg?branch=release/2.0.x)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) |
+| [![Documentation](https://github.com/triton-lang/triton/actions/workflows/documentation.yml/badge.svg)](https://triton-lang.org/) | [![Wheels](https://github.com/triton-lang/triton/actions/workflows/wheels.yml/badge.svg)](https://github.com/triton-lang/triton/actions/workflows/wheels.yml) |
 
 # Triton

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 74 additions & 9 deletions
@@ -1,40 +1,105 @@
 #ifndef TRITON_TRITONGPU_TRANSFORMS_MMAV5PIPELINEUTILITY_H_
 #define TRITON_TRITONGPU_TRANSFORMS_MMAV5PIPELINEUTILITY_H_
 
-#include <functional>
-#include <optional>
-#include <tuple>
+#include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
 
 namespace mlir {
+
 class OpBuilder;
-class Operation;
+class DominanceInfo;
 
 namespace scf {
 class ForOp;
-}
+} // namespace scf
 namespace triton::nvidia_gpu {
-class MMAv5OpInterface;
-class TMEMAllocOp;
-class TMEMLoadOp;
+
+//===----------------------------------------------------------------------===//
+// MMAInfo
+//===----------------------------------------------------------------------===//
+
+// This struct contains analysis information about an MMAv5 operation inside a
+// loop used for pipelining MMA ops.
+struct MMAInfo {
+  // This struct contains information about when the MMA's accumulator is
+  // overridden in the loop, if it is at all.
+  struct AccOverridePoint {
+    // The operation which overrides the accumulator.
+    Operation *op;
+    // The condition on which the accumulator is reset.
+    Value condition = nullptr;
+    // The initial value of the accumulator and the value after a reset.
+    Value initValue = nullptr;
+    // The number of loop iterations ago the accumulator was reset.
+    int distance = 0;
+    // Whether the accumulator is reset via setting the `useAcc` flag to false
+    // or by clearing the accumulator tensor value.
+    bool isFlag = false;
+  };
+
+  // The TMEM allocation of the accumulator, which directly precedes the dot op.
+  TMEMAllocOp accAlloc;
+  // The TMEM load of the accumulator value out of TMEM, which directly follows
+  // the dot op.
+  TMEMLoadOp accLoad;
+  // The override point of the accumulator value, if it is overridden in the
+  // loop. E.g. this is typically present for persistent kernels.
+  std::optional<AccOverridePoint> accDef;
+  // If the accumulator is used in future iterations of the loop, this is the
+  // iter arg number.
+  std::optional<int> yieldArgNo;
+  // Whether the accumulator needs to be multibuffered.
+  bool accIsMultiBuffered;
+
+  Value phase = nullptr;
+  Value barrierIdx = nullptr;
+  Value accInsertIdx = nullptr;
+  Value accExtractIdx = nullptr;
+  Value barrierAlloc = nullptr;
+};
+
+//===----------------------------------------------------------------------===//
+// MMA Pipeline Analysis
+//===----------------------------------------------------------------------===//
 
 // Returns the TMEMAllocOp and TMEMLoadOp that are used to allocate and load the
 // accumulator for the given MMA operation. The TMEMAllocOp and TMEMLoadOp must
 // be in the same region as the MMA operation.
 std::optional<std::pair<TMEMAllocOp, TMEMLoadOp>>
 getTMemAllocAndLoad(MMAv5OpInterface mmaOp);
+// Get immediate users of the accumulator within the current loop iteration.
+SmallVector<Operation *> getDirectAccUses(TMEMLoadOp accDef);
+// Analyze an MMA op inside a loop to determine information about how it can be
+// pipelined. Returns `std::nullopt` if it cannot be pipelined.
+std::optional<MMAInfo> getMMAInfo(scf::ForOp forOp, MMAv5OpInterface mmaOp,
+                                  DominanceInfo &domInfo);
+
+//===----------------------------------------------------------------------===//
+// MMA Pipeline Rewriters
+//===----------------------------------------------------------------------===//
+
 // Create a new TMEMAllocOp to use for the pipelined MMA operation. It is
 // optionally multi-buffered based on the number of stages.
 TMEMAllocOp createTMemAlloc(OpBuilder &builder, TMEMAllocOp oldTMemAllocOp,
                             bool multiBufferred, int numStages);
 
+// Create a store op of the initial value of the accumulator into the
+// potentially multi-buffered accumulator.
+void createInitStore(OpBuilder &builder, TMEMAllocOp allocOp, Value initVal,
+                     bool multiBufferred);
+
 // Return true if operands of the MMA operation are/are going to be pipelined
 // and multibuffered, enabling the MMA operation to be pipelined.
 bool mmaHasPipelineableOperands(
     MMAv5OpInterface mma, scf::ForOp forOp,
     std::function<bool(Operation *)> isLoadPipelineable);
 
-// Return true if the loop has a read-modify-write access to the accumulator.
+// Return true if the accumulator of an mma in subsequent iterations is either
+// independent from the previous iteration (overwritten) or completely reused,
+// without read-modify-write.
+// Otherwise, we can not pipeline the MMA, as we need to insert a wait after the
+// mma to read back the accumulator for RMW.
 bool hasAccReadModifyWrite(MMAv5OpInterface mma, scf::ForOp forOp);
+
 } // namespace triton::nvidia_gpu
 } // namespace mlir
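Taken together, the analysis and rewrite entry points suggest a pipelining flow along the following lines. This is a minimal sketch, not code from the commit: the driver function and the `numStages` plumbing are assumptions; only the helper signatures come from the header above.

// Hypothetical driver composing the helpers declared in
// MMAv5PipelineUtility.h; the scaffolding around them is illustrative.
#include "triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h"

using namespace mlir;
using namespace mlir::triton::nvidia_gpu;

static void tryPipelineMMA(scf::ForOp forOp, MMAv5OpInterface mmaOp,
                           DominanceInfo &domInfo, int numStages) {
  // Analyze the MMA op inside the loop; bail out if it is not pipelineable.
  std::optional<MMAInfo> info = getMMAInfo(forOp, mmaOp, domInfo);
  if (!info)
    return;
  // A read-modify-write of the accumulator would force a wait right after
  // the MMA, defeating the pipeline.
  if (hasAccReadModifyWrite(mmaOp, forOp))
    return;

  OpBuilder builder(info->accAlloc);
  // Replace the accumulator allocation, multi-buffering it if needed.
  TMEMAllocOp newAlloc = createTMemAlloc(builder, info->accAlloc,
                                         info->accIsMultiBuffered, numStages);
  // Seed the (possibly multi-buffered) accumulator with its initial value.
  if (info->accDef && info->accDef->initValue)
    createInitStore(builder, newAlloc, info->accDef->initValue,
                    info->accIsMultiBuffered);
  // ...rewrite the loop body to issue the MMA asynchronously, rotating
  // phase/barrierIdx and accInsertIdx/accExtractIdx across iterations.
}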

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 5 additions & 0 deletions
@@ -47,6 +47,9 @@ class WarpSchedule {
   void insert(Operation *op) { ops.push_back(op); }
 
 private:
+  void setIndex(int idx) { this->idx = idx; }
+  friend class WarpSchedule;
+
   // The partition number.
   int idx;
   // The stage of the partition.
@@ -57,6 +60,8 @@ class WarpSchedule {
 
   // Create a new partition with a stage.
   Partition *addPartition(unsigned stage);
+  // Give each partition a new index and order. The indices must be unique.
+  void reorderPartitions(ArrayRef<unsigned> order);
 
   // Get the partition the op belongs to.
   Partition *getPartition(Operation *op);
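For intuition, the new `reorderPartitions` declaration, together with the private `setIndex` and the friend declaration, suggests an implementation shaped roughly like the sketch below. The `partitions` container is a guess at the surrounding class state, not something visible in this diff.

// Hypothetical shape of the new method; `partitions` is an assumed member.
void WarpSchedule::reorderPartitions(ArrayRef<unsigned> order) {
  assert(order.size() == partitions.size() &&
         "need one unique index per partition");
  // The friend declaration lets WarpSchedule call Partition::setIndex.
  for (auto [partition, newIdx] : llvm::zip(partitions, order))
    partition->setIndex(newIdx);
}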

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 1 addition & 15 deletions
@@ -14,6 +14,7 @@ namespace triton {
 static const char *kNumStagesAttrName = "tt.num_stages";
 static const char *kDisallowAccMultiBufferAttrName =
     "tt.disallow_acc_multi_buffer";
+static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
 static const char *kLoopStageAttrName = "loop.stage";
 static const char *kLoopClusterAttrName = "loop.cluster";
 static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
@@ -38,17 +39,6 @@ void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
 // `tt.disallow_acc_multi_buffer` set to true.
 bool getDisallowAccMultiBuffer(scf::ForOp forOp);
 
-/// Visit the operands of `op` and the operands of any nested ops defined
-/// outside of `op`.
-void visitNestedOperands(Operation *op,
-                         function_ref<void(OpOperand &)> visitor);
-/// Visit the operands of `op` and the operands of any nested ops defined
-/// outside of `op`.
-void visitNestedOperands(Operation *op, function_ref<void(Value)> visitor);
-/// Get the operands of `op` and the operands of any nested ops defined outside
-/// of `op`.
-SetVector<Value> getNestedOperands(Operation *op);
-
 // Return the definition of the given value. If the value is a loop-carried
 // dependency, return the definition and the distance to it.
 std::pair<OpResult, int64_t> getDefinitionAndDistance(scf::ForOp forOp,
@@ -90,10 +80,6 @@ gpu::SharedEncodingTrait getSharedEncoding(RankedTensorType ty);
 // Get a shared encoding for a tensor based on its uses.
 gpu::SharedEncodingTrait getSharedEncoding(Operation *loadOp);
 
-// Erase the given loop carried values from the loop, where `loop` is replaced
-// with a new loop.
-void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
-
 // Get the number of stages to pipeline the loop with, if it is explicitly
 // specified.
 int getNumStagesOrDefault(scf::ForOp forOp, int defaultNumStages);
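The new `kWarpSpecializeAttrName` constant names a unit attribute placed on loops. A minimal sketch of tagging and querying it follows; the two helper functions are illustrative, not part of the commit, while `setAttr`/`hasAttr` are standard MLIR Operation APIs.

// Hypothetical helpers around the new attribute name; only
// kWarpSpecializeAttrName comes from this header.
#include "triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h"

static void markWarpSpecialized(mlir::scf::ForOp forOp, mlir::OpBuilder &b) {
  // Tag the loop with a unit attribute requesting warp specialization.
  forOp->setAttr(mlir::triton::kWarpSpecializeAttrName, b.getUnitAttr());
}

static bool isWarpSpecialized(mlir::scf::ForOp forOp) {
  // Later passes can key their behavior off the same attribute name.
  return forOp->hasAttr(mlir::triton::kWarpSpecializeAttrName);
}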

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 24 additions & 0 deletions
@@ -10,6 +10,7 @@
 #include <numeric>
 
 namespace mlir {
+class DominanceInfo;
 
 namespace triton {
 class ModuleAxisInfoAnalysis;
@@ -135,6 +136,8 @@ scf::ForOp replaceForOpWithNewSignature(
     SmallVectorImpl<std::tuple<Value, Value>> &replacements);
 scf::ForOp replaceForOpWithNewSignature(OpBuilder &rewriter, scf::ForOp loop,
                                         ValueRange newIterOperands);
+Block::BlockArgListType addIterArgsToLoop(OpBuilder &rewriter, scf::ForOp &loop,
+                                          ValueRange newIterOperands);
 
 // Replace WhileOp with a new WhileOp with extra operands. The YieldOp is not
 // updated and needs to be updated separately for the loop to be correct.
@@ -213,6 +216,27 @@ triton::gpu::LocalAllocOp findShmemAlloc(Value operand);
 SmallVector<Operation *>
 getMMAsWithMultiBufferredOperands(scf::ForOp forOp,
                                   SmallVector<Operation *> &mmaOps);
+
+// Given a list of ops, find the nearest common dominator of all ops or return
+// null if one could not be found. The ops are allowed to be in different
+// regions. The result op is not necessarily one of the ops in the list.
+Operation *findNearestCommonDominator(ArrayRef<Operation *> ops,
+                                      DominanceInfo &domInfo);
+
+/// Visit the operands of `op` and the operands of any nested ops defined
+/// outside of `op`.
+void visitNestedOperands(Operation *op,
+                         function_ref<void(OpOperand &)> visitor);
+/// Visit the operands of `op` and the operands of any nested ops defined
+/// outside of `op`.
+void visitNestedOperands(Operation *op, function_ref<void(Value)> visitor);
+/// Get the operands of `op` and the operands of any nested ops defined outside
+/// of `op`.
+SetVector<Value> getNestedOperands(Operation *op);
+
+// Erase the given loop carried values from the loop, where `loop` is replaced
+// with a new loop.
+void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
 } // namespace mlir
 
 #endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_
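As a usage sketch of the relocated helpers: the two functions below are hypothetical, but they call only declarations visible in the header above.

// Hypothetical: pick one insertion point dominating a set of MMA ops and
// count the values an op captures from enclosing regions.
#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include "mlir/IR/Dominance.h"

static mlir::Operation *
getCommonInsertPoint(llvm::ArrayRef<mlir::Operation *> mmaOps,
                     mlir::DominanceInfo &domInfo) {
  // May return an op in an enclosing region; null if no dominator exists.
  return mlir::findNearestCommonDominator(mmaOps, domInfo);
}

static unsigned countCapturedValues(mlir::Operation *op) {
  // Operands of `op` and of its nested ops whose values are defined outside.
  return mlir::getNestedOperands(op).size();
}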

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 1 addition & 1 deletion
@@ -850,7 +850,7 @@ LogicalResult WarpYieldOp::verify() {
 static size_t getSharedMemorySize(Type type) {
   if (isa<IntegerType, FloatType>(type))
     return llvm::divideCeil(type.getIntOrFloatBitWidth(), 8);
-  if (isa<PointerType>(type))
+  if (isa<PointerType, TensorDescType>(type))
     return 8;
   if (auto desc = dyn_cast<MemDescType>(type)) {
     if (!isa<SharedMemorySpaceAttr>(desc.getMemorySpace()))

lib/Dialect/TritonGPU/Transforms/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
@@ -12,6 +12,7 @@ add_triton_library(TritonGPUTransforms
   OptimizeThreadLocality.cpp
   Pipeliner/AssignLatencies.cpp
   Pipeliner/LowerLoops.cpp
+  Pipeliner/MMAv5PipelineUtility.cpp
   Pipeliner/ScheduleLoops.cpp
   Pipeliner/WGMMAPipeline.cpp
   Pipeliner/PipelineExpander.cpp

lib/Dialect/TritonGPU/Transforms/FuseNestedLoops.cpp

Lines changed: 7 additions & 0 deletions
@@ -900,6 +900,13 @@ static void fuseOneLevel(LoopNestNode *parent, mlir::DominanceInfo &domInfo) {
     epilogueIf.erase();
   }
 
+  // Propagate warp specialization flags.
+  if (outer->hasAttr(kWarpSpecializeAttrName) ||
+      llvm::any_of(innerLoops, [](scf::ForOp loop) {
+        return loop->hasAttr(kWarpSpecializeAttrName);
+      }))
+    fused->setAttr(kWarpSpecializeAttrName, b.getUnitAttr());
+
   // Propagate the `tt.disallow_acc_multi_buffer` attribute to the parent loop.
   bool disallowAccMultiBuffer = getDisallowAccMultiBuffer(outer);
   for (scf::ForOp loop : innerLoops) {
