Skip to content

Commit 40fd289

Browse files
Merge OpenAI Triton commit 75fe113 (#4390)
This PR changes the Triton base from 2d6fb76 to 75fe113 (May 23). Pass rate: 97.23% -> 97.23% Note: * didn't apply changes from triton-lang/triton@747e205#diff-f0e7e61833e691ba711378a49219d2c4392e7ca9aed419b2d26a47d4abc1e6da to `06-fused-attention.py` (seems related to #4283) * reverted the llvm pin update because it is incompatible with the llvm-spirv translator: e7d0e43. The issue tracking this: #4391
2 parents 99065a2 + e7d0e43 commit 40fd289

File tree

56 files changed

+920
-795
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

56 files changed

+920
-795
lines changed

.github/workflows/test-backends.yml

Lines changed: 0 additions & 83 deletions
This file was deleted.

docs/python-api/triton.language.rst

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ Programming Model
1212
:nosignatures:
1313

1414
tensor
15+
tensor_descriptor
1516
program_id
1617
num_programs
1718

@@ -71,6 +72,9 @@ Memory/Pointer Ops
7172

7273
load
7374
store
75+
make_tensor_descriptor
76+
load_tensor_descriptor
77+
store_tensor_descriptor
7478
make_block_ptr
7579
advance
7680

include/triton/Dialect/TritonGPU/Transforms/Partition.h

Lines changed: 23 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,6 @@
44
#include "mlir/Support/LLVM.h"
55
#include "llvm/ADT/ArrayRef.h"
66
#include "llvm/ADT/DenseMap.h"
7-
#include "llvm/ADT/GraphTraits.h"
8-
#include "llvm/ADT/MapVector.h"
97
#include "llvm/ADT/SmallVector.h"
108

119
namespace mlir {
@@ -26,43 +24,38 @@ static constexpr char kPartitionStagesAttrName[] = "ttg.partition.stages";
2624
//===----------------------------------------------------------------------===//
2725

2826
namespace mlir::triton::gpu {
27+
// A partition has a stage and contains some operation. The stage of a
28+
// partition determines how many cycles the partition's outputs are buffered
29+
// relative to its consumers.
30+
class Partition {
31+
public:
32+
Partition(int idx, int stage) : idx(idx), stage(stage) {}
33+
34+
int getIndex() const { return idx; }
35+
int getStage() const { return stage; }
36+
ArrayRef<Operation *> getOps() const { return ops; }
37+
38+
private:
39+
void setIndex(int idx) { this->idx = idx; }
40+
friend class WarpSchedule;
41+
42+
// The partition number.
43+
int idx;
44+
// The stage of the partition.
45+
int stage;
46+
// The ops in the partition.
47+
SmallVector<Operation *> ops;
48+
};
49+
2950
// A warp schedule divides a loop into multiple partitions. Ops in a loop are
3051
// assigned at most one partition. A warp schedule represents asynchronous
3152
// execution of the loop body, where partitions may execute simultaneously.
3253
class WarpSchedule {
3354
static constexpr int kSentinel = -1;
3455

3556
public:
36-
// A partition has a stage and contains some operation. The stage of a
37-
// partition determines how many cycles the partition's outputs are buffered
38-
// relative to its consumers.
39-
class Partition {
40-
public:
41-
Partition(int idx, int stage) : idx(idx), stage(stage) {}
42-
43-
int getIndex() const { return idx; }
44-
int getStage() const { return stage; }
45-
ArrayRef<Operation *> getOps() const { return ops; }
46-
47-
void insert(Operation *op) { ops.push_back(op); }
48-
void remove(Operation *op) { ops.erase(llvm::find(ops, op)); }
49-
50-
private:
51-
void setIndex(int idx) { this->idx = idx; }
52-
friend class WarpSchedule;
53-
54-
// The partition number.
55-
int idx;
56-
// The stage of the partition.
57-
int stage;
58-
// The ops in the partition.
59-
SmallVector<Operation *> ops;
60-
};
61-
6257
// Create a new partition with a stage.
6358
Partition *addPartition(unsigned stage);
64-
// Update the op to partition mapping.
65-
void updatePartitions();
6659

6760
// Get the partition the op belongs to.
6861
Partition *getPartition(Operation *op);

include/triton/Dialect/TritonGPU/Transforms/Passes.td

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -26,36 +26,27 @@ def TritonGPUPipeline : Pass<"tritongpu-pipeline", "mlir::ModuleOp"> {
2626
];
2727
}
2828

29-
def TritonGPUTestPipelineAssignLatencies : Pass<"tritongpu-test-pipeline-assign-latencies", "mlir::ModuleOp"> {
30-
let summary = "test assigning latencies to interesting ops ahead of pipelining";
29+
def TritonGPUAssignLatencies : Pass<"tritongpu-assign-latencies", "mlir::ModuleOp"> {
30+
let summary = "assign latencies to interesting ops ahead of pipelining";
3131

3232
let description = [{
33-
This is a test pass that tests `assignLatencies` method of `TritonGPUPipeline`.
33+
The `tritongpu-assign-latencies` pass assigns latencies to latency ops based
34+
on the number of stages.
3435
}];
3536

36-
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
37-
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
38-
"mlir::scf::SCFDialect",
39-
"mlir::arith::ArithDialect"];
40-
4137
let options = [
42-
Option<"numStages", "num-stages",
43-
"int32_t", /*default*/"3",
38+
Option<"numStages", "num-stages", "int32_t", /*default*/"3",
4439
"number of pipeline stages">
4540
];
4641
}
4742

48-
def TritonGPUTestPipelineScheduleLoop : Pass<"tritongpu-test-pipeline-schedule-loop", "mlir::ModuleOp"> {
49-
let summary = "test scheduling a loop for software pipelining";
43+
def TritonGPUScheduleLoops : Pass<"tritongpu-schedule-loops", "mlir::ModuleOp"> {
44+
let summary = "software pipeline loop scheduling";
5045

5146
let description = [{
52-
This is a test pass that tests `scheduleLoop` method of `TritonGPUPipeline`.
47+
The `tritongpu-schedule-loops` pass performs scheduling for loop pipelining
48+
for loops with latency ops.
5349
}];
54-
55-
let dependentDialects = ["mlir::triton::gpu::TritonGPUDialect",
56-
"mlir::triton::nvidia_gpu::TritonNvidiaGPUDialect",
57-
"mlir::scf::SCFDialect",
58-
"mlir::arith::ArithDialect"];
5950
}
6051

6152
def TritonGPUHoistTMEMAlloc : Pass<"tritongpu-hoist-tmem-alloc", "mlir::ModuleOp"> {

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 4 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -19,8 +19,6 @@ static const char *kWarpSpecializeAttrName = "tt.warp_specialize";
1919
static const char *kLoopStageAttrName = "loop.stage";
2020
static const char *kLoopClusterAttrName = "loop.cluster";
2121
static const char *kScheduledMaxStageAttrName = "tt.scheduled_max_stage";
22-
static const char *kAssignedStageAttrName = "ttg.assigned_stage";
23-
static const char *kAssignedClusterAttrName = "ttg.assigned_cluster";
2422

2523
//===----------------------------------------------------------------------===//
2624
// Hoisting Utilities
@@ -133,42 +131,12 @@ int getNumStagesOrDefault(scf::ForOp forOp, int defaultNumStages);
133131

134132
// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
135133
// single buffer slice (leading dimension equal to 1), at the given index.
136-
template <typename TBuilder>
137134
TypedValue<triton::gpu::MemDescType>
138-
createSingleBufferView(TBuilder &builder, Value alloc, Value idx) {
139-
assert(isa<triton::gpu::MemDescType>(alloc.getType()) &&
140-
"Expected MemDescType");
141-
auto allocDescType = cast<triton::gpu::MemDescType>(alloc.getType());
142-
SmallVector<int64_t> shape;
143-
if (allocDescType.getShape().size() > 1) {
144-
shape.insert(shape.end(), allocDescType.getShape().begin() + 1,
145-
allocDescType.getShape().end());
146-
} else {
147-
shape.push_back(1);
148-
}
149-
auto viewDescType = triton::gpu::MemDescType::get(
150-
shape, allocDescType.getElementType(), allocDescType.getEncoding(),
151-
allocDescType.getMemorySpace(), allocDescType.getMutableMemory(),
152-
/*allocShape=*/allocDescType.getAllocShape());
153-
SmallVector<Value> idxs = {idx};
154-
if (allocDescType.getShape().size() > 1) {
155-
Value zero =
156-
builder.template create<arith::ConstantIntOp>(alloc.getLoc(), 0, 32);
157-
for (unsigned i = 1; i < allocDescType.getShape().size(); i++) {
158-
idxs.push_back(zero);
159-
}
160-
}
161-
return builder.template create<triton::gpu::MemDescSubviewOp>(
162-
alloc.getLoc(), viewDescType, alloc, idxs);
163-
}
164-
165-
template <typename TBuilder>
135+
createSingleBufferView(OpBuilder &builder, Value alloc, Value idx);
136+
// Given a result of MemDescSubview, or Alloca, create a MemDescSubview with a
137+
// single buffer slice (leading dimension equal to 1), at the given index.
166138
TypedValue<triton::gpu::MemDescType>
167-
createSingleBufferView(TBuilder &builder, Value alloc, int idx) {
168-
return createSingleBufferView(
169-
builder, alloc,
170-
builder.template create<arith::ConstantIntOp>(alloc.getLoc(), idx, 32));
171-
}
139+
createSingleBufferView(OpBuilder &builder, Value alloc, int idx);
172140

173141
} // namespace triton
174142
} // namespace mlir

include/triton/Dialect/TritonGPU/Transforms/Schedule.h

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -13,13 +13,6 @@ namespace triton {
1313

1414
namespace gpu {
1515

16-
/// Discover operations that should become async and assign latencies to them
17-
/// based on the numStages value provided by the user.
18-
void assignLatencies(ModuleOp moduleOp, int numStages);
19-
20-
/// Schedule the loops based on the latencies assigned to the operations.
21-
void scheduleLoops(ModuleOp moduleOp);
22-
2316
/// Lower the loops to prepare them for pipeline expansion.
2417
void lowerLoops(ModuleOp moduleOp);
2518

@@ -115,6 +108,10 @@ class CoarseSchedule {
115108
bool insertDepsOfOp(Operation *op, int stage, CoarseSchedule::Cluster cluster,
116109
bool includeArg, bool insertIfEarlier = false);
117110

111+
// Remove empty stages and clusters from the schedule, adjusting the maximum
112+
// number of stages as appropriate.
113+
void shrinkToFit();
114+
118115
void erase(Operation *op) { opToStageAndCluster.erase(op); }
119116

120117
int count(Operation *op) { return opToStageAndCluster.count(op); }

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 9 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -247,47 +247,25 @@ SetVector<Value> getNestedOperands(Operation *op);
247247
// Erase the given loop carried values from the loop, where `loop` is replaced
248248
// with a new loop.
249249
void eraseLoopCarriedValues(scf::ForOp &loop, llvm::BitVector indices);
250+
251+
// Get a boolean if the Value is an arith::ConstantOp
252+
std::optional<bool> getBoolFromConstant(Value cst);
250253
} // namespace mlir
251254

252255
namespace mlir::triton {
253-
254256
/// Replace all uses of `oldUse` with `val` and propagate the type if needed.
255257
/// This is useful when we need to change a memory descriptor from immutable to
256258
/// mutable.
257259
void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
258260
Value val);
259261

260-
template <typename BuilderT>
262+
/// Replace all uses of `old` with a local load from `alloc` unless the use is a
263+
/// `ttg.local_alloc` with a matching shared encoding, in which case the shared
264+
/// memory is forwarded directly into the use.
261265
void replaceUsesWithLocalLoad(
262-
BuilderT &builder, OpResult old, TypedValue<triton::gpu::MemDescType> alloc,
263-
TypedValue<triton::gpu::AsyncTokenType> token = {}) {
264-
// Remove redundant local_load -> local_alloc
265-
namespace ttg = triton::gpu;
266-
using triton::gpu::LocalAllocOp;
267-
auto allocTy = alloc.getType();
268-
SmallVector<LocalAllocOp> allocsToErase;
269-
for (Operation *user : old.getUsers()) {
270-
if (auto userAlloc = dyn_cast<LocalAllocOp>(user)) {
271-
if (allocTy.getEncoding() == userAlloc.getType().getEncoding()) {
272-
replaceUsesAndPropagateType(builder, userAlloc, alloc);
273-
allocsToErase.push_back(userAlloc);
274-
}
275-
}
276-
}
277-
278-
// If there are some uses that were not local_allocs, we need to create a
279-
// local_load for them.
280-
if (std::distance(old.getUsers().begin(), old.getUsers().end()) >
281-
allocsToErase.size()) {
282-
auto loc = old.getOwner()->getLoc();
283-
auto sharedLoad = builder.template create<ttg::LocalLoadOp>(
284-
loc, old.getType(), alloc, token);
285-
old.replaceAllUsesWith(sharedLoad.getResult());
286-
}
287-
for (auto alloc : allocsToErase) {
288-
alloc.erase();
289-
}
290-
}
266+
OpBuilder &builder, OpResult old,
267+
TypedValue<triton::gpu::MemDescType> alloc,
268+
TypedValue<triton::gpu::AsyncTokenType> token = {});
291269
} // namespace mlir::triton
292270

293271
#endif // TRITON_DIALECT_TRITONGPU_TRANSFORMS_UTILITY_H_

include/triton/Dialect/TritonGPU/Transforms/WarpSpecialization.h

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,9 +8,6 @@ namespace scf {
88
class ForOp;
99
} // namespace scf
1010
namespace triton::gpu {
11-
// Identify load-mma dependencies and specialize them to different partitions.
12-
LogicalResult specializeLoadMMADependencies(scf::ForOp &loop,
13-
int defaultNumStages);
1411
// This is the final step to prepare a loop for warp specialization. This takes
1512
// a loop with a partition schedule and rewrites the loop such that all SSA
1613
// dependencies between partitions are passed through shared memory and

0 commit comments

Comments
 (0)