
Commit 5cf00e1

Merge OpenAI Triton commit 4bcdbde (#4903)
This PR changes the Triton base from cf0db92 to 4bcdbde (Jul 31). Pass rate: 98.83%
2 parents 3f2cc86 + 097a106 commit 5cf00e1

59 files changed, +2394 −1448 lines changed

include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 11 deletions
@@ -357,9 +357,6 @@ class SharedMemoryObject {
 
   SmallVector<Type> getTypes() const;
 
-  SmallVector<Value> getStrides(triton::gpu::MemDescType memDesc, Location loc,
-                                RewriterBase &rewriter) const;
-
   // Returns a mask representing all the bits of the memdesc offsets that
   // may be modified by an affine offset coming from a memdesc_subslice.
   // The offsets are considered to be in the type of the memdesc.
@@ -385,14 +382,6 @@ class SharedMemoryObject {
   Value getBaseBeforeSlice(int dim, Location loc, RewriterBase &rewriter) const;
 
 private:
-  static SmallVector<unsigned> getOrderForShape(ArrayRef<int64_t> shape,
-                                                ArrayRef<unsigned> layoutOrder);
-
-  static SmallVector<Value> getStridesForShape(ArrayRef<int64_t> shape,
-                                               ArrayRef<unsigned> layoutOrder,
-                                               Location loc,
-                                               RewriterBase &rewriter);
-
   Value base; // i32 ptr. The start address of the shared memory object.
   Type baseElemType;
   SmallVector<Value>
include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 0 additions & 168 deletions
@@ -99,174 +99,6 @@ LinearLayout chooseShemLayoutForRegToRegConversion(
     MLIRContext *ctx, ArrayRef<unsigned> tensorShape,
     ArrayRef<unsigned> repShape, ArrayRef<unsigned> order);
 
-// This function constructs a linear layout that maps
-// <register, lane, warp> to <shared memory offset, iteration>.
-// The primary goal is to efficiently store 2D tiles of a tensor into shared
-// memory using the `stmatrix` instruction, with each thread responsible for
-// storing `N` elements. If `stmatrix` cannot be used for the given tensor
-// encoding, this function returns `std::nullopt`.
-//
-// Unlike standard vectorized stores, such as `st.shared.v4 [%offset],
-// %vec_reg`, where `%vec_reg` contains four consecutive data elements, the
-// `stmatrix` instruction allows `N` registers to point to non-contiguous
-// locations within a tensor tile.
-//
-// For instance, the `stmatrix [%offset], %mat_reg` instruction on NVIDIA GPUs
-// enables `%mat_reg` to store `N` elements that do not need to be consecutive.
-// However, it is crucial that the address (`%offset`) of each row in a tensor
-// tile should be aligned to `N` * `elemBitWidth`. The `%offset` of each thread
-// is calculated based on the provided tensor encoding.
-//
-// Currently, we support only the NVIDIA MMAv3 encoding and the `stmatrix.x4`
-// instruction. Each `stmatrix.x4` instruction stores eight 16-bit elements per
-// thread, resulting in a total of 8 * 32 = 256 elements per warp, or 16 * 16
-// elements per warp when distributed across four 8x8 tiles. Each thread's
-// `%offset` points to an address aligned with 8 * 16 bits, denoting a row in
-// the 8x8 tile. The values in `%mat_reg` are non-consecutive elements,
-// composed of 4 pairs of consecutive elements. These matrix addresses are
-// distributed as follows:
-//
-//            col[0-7]     col[8-15]
-// row[0-7]   lane[0-7]    lane[16-23]
-// row[8-15]  lane[8-15]   lane[24-31]
-//
-// The matrix elements of thread 0 are distributed in the following pattern:
-//
-//        col0       col8
-// row0   reg[0-1]   reg[4-5]
-// row8   reg[2-3]   reg[6-7]
-//
-// When `swizzleByteSize` is non-zero, the layout is constructed
-// differently due to leading dimension offset and swizzling.
-// There are two key concepts to understand:
-//
-// 1. Chunks: The leading dimension (i.e., the column dimension) is divided
-//    into chunks, where each chunk's size is determined by `swizzleByteSize`.
-// 2. Swizzling within tiles: Each tile applies a swizzling pattern to its
-//    rows to optimize memory access.
-//
-// - Concept 1: Chunks
-//
-// In the swizzled layout, the leading dimension is strided by
-// `swizzleByteSize`. This introduces the concept of a "chunk", where each chunk
-// spans a certain number of columns.
-//
-// For a tile size of `stmatrix.x4` (16x16 elements), with each element being 16
-// bits (2 bytes), each tile occupies 16 rows and 32 bytes per row (since 16
-// elements * 2 bytes per element = 32 bytes per row).
-//
-// Given a `swizzleByteSize` of 128 bytes, the number of tiles per chunk can be
-// calculated as:
-//
-//   Number of tiles per chunk = swizzleByteSize / (bytes per row) = 128 bytes /
-//                               32 bytes = 4 tiles
-//
-// Therefore, each chunk contains 4 tiles horizontally, spanning 64 columns
-// (since each tile is 16 columns):
-//
-//           col0-15   col16-31   col32-47   col48-63
-// row0-15   tile0     tile1      tile2      tile3
-//
-// For a tensor of size 128x128 elements (#rows x #columns), and each element
-// being 16 bits, the tensor can be divided into multiple chunks both
-// horizontally and vertically. Chunks are stored in memory in a "column-major"
-// order based on chunks, meaning chunk1's address follows chunk0's.
-//
-// Assuming we have 8 warps, and we assign each warp to process a chunk of 16
-// rows (rows per tile) and 128 columns (the width of two chunks). This results
-// in each warp handling one horizontal slice of the tensor.
-//
-// The overall layout can be visualized as:
-//
-//                        |<- 128 * 128 bytes ->|<- 128 * 128 bytes ->|
-//                          columns 0-63          columns 64-127
-// warp0 | rows 0-15        chunk0                 chunk8
-// warp1 | rows 16-31       chunk1                 chunk9
-// warp2 | rows 32-47       chunk2                 chunk10
-// warp3 | rows 48-63       chunk3                 chunk11
-// warp4 | rows 64-79       chunk4                 chunk12
-// warp5 | rows 80-95       chunk5                 chunk13
-// warp6 | rows 96-111      chunk6                 chunk14
-// warp7 | rows 112-127     chunk7                 chunk15
-//
-// - Concept 2: Swizzling within tiles
-//
-// Within each 16x16 tile, rows are swizzled to optimize memory access patterns.
-// This swizzling is similar to what's defined in `TritonGPUAttrDefs.td`. at the
-// level of each 16x16 tile rather than the entire tensor.
-//
-// Key parameters for swizzling:
-//
-// - `perPhase`: The number of rows over which to apply a XOR operation at
-//   each phase.
-// - `maxPhase`: The total number of phases.
-// - `vectorWidth`: The number of elements per vector, which is 8 in this case
-//   because `stmatrix` stores 8 contiguous elements per thread.
-//
-// The offset of each element within a tile is calculated using the formula:
-//
-//   offset = row * swizzleByteSize + (vectorWidth * ((row / perPhase) %
-//            maxPhase)) * elementSize
-//
-// where `elementSize` is the size of each element in bytes (2 bytes for 16-bit
-// elements).
-//
-// For example, consider the element at index `(row=1, col=0)` in chunk0:
-//
-// Without swizzling:
-//
-//   offset = row * swizzleByteSize + col * elementSize
-//          = 1 * 128 bytes + 0 * 2 bytes
-//          = 128 bytes
-//
-// With swizzling (assuming `perPhase=1`, `maxPhase=8`, `vectorWidth=8`):
-//
-//   offset = row * swizzleByteSize + (vectorWidth * ((row / perPhase) %
-//            maxPhase)) * elementSize
-//          = 1 * 128 bytes + (8 * ((1 / 1) % 8)) * 2 bytes
-//          = 128 bytes + (8 * (1 % 8)) * 2 bytes
-//          = 128 bytes + 8 * 2 bytes
-//          = 128 bytes + 16 bytes
-//          = 144 bytes
-//
-// This swizzling ensures that elements are stored in a way that optimizes for
-// memory bandwidth and reduces bank conflicts.
-//
-// - Verification through Linear Layout
-//
-// We can verify the offsets with the following outputs of the corresponding
-// linear layout, where each element is 16 bits (2 bytes):
-//
-// - register=1 -> offset=1
-//   register=2 -> offset=2
-//   register=4 -> offset=4
-//   register=8 -> offset=16
-//   register=16 -> offset=32
-//   register=32 -> offset=8192
-// - lane=1 -> offset=72
-//   lane=2 -> offset=144
-//   lane=4 -> offset=288
-//   lane=8 -> offset=512
-//   lane=16 -> offset=8
-// - warp=1 -> offset=1024
-//   warp=2 -> offset=2048
-//   warp=4 -> offset=4096
-//
-// For index `(row=1, col=0)`, which corresponds to `reg=0` and `lane=1` in
-// `warp=0`, the offset is calculated as 72 * 2 bytes = 144 bytes. The result
-// matches our earlier calculation.
-//
-// TODO(Keren): We should replace tensorTy with a LinearLayout and the element
-// bit width of the tensor in the future to support more flexible tensor
-// encodings
-LinearLayout chooseStMatrixLayout(MLIRContext *ctx, RankedTensorType tensorTy,
-                                  int swizzleByteSize);
-
-// The primary goal of this function is to efficiently store 2D tiles of a
-// tensor into shared memory using the `ldmatrix` instruction.
-LinearLayout chooseLdMatrixLayout(Attribute enc, ArrayRef<int64_t> shape,
-                                  bool needTrans, int32_t elemBitWidth);
-
 // The primary goal of this function is to efficiently load 2D tiles of a
 // tensor from shared memory using the `ds_read_tr` instruction for AMD GPUs.
 LinearLayout chooseDsReadB64TrLayout(Attribute enc, ArrayRef<int64_t> shape,
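
Side note on the arithmetic in the comment removed above: the quoted offset formula and its worked example at `(row=1, col=0)` can be checked with a few lines of standalone C++. This is only an illustration of the formula as written in the comment, not Triton code; every name below is local to the sketch.

#include <cassert>
#include <cstdint>

// Reproduces the swizzled-offset formula quoted in the deleted comment.
// All identifiers are local to this sketch; they are not Triton APIs.
int64_t swizzledRowOffsetBytes(int64_t row, int64_t swizzleByteSize,
                               int64_t perPhase, int64_t maxPhase,
                               int64_t vectorWidth, int64_t elementSize) {
  return row * swizzleByteSize +
         (vectorWidth * ((row / perPhase) % maxPhase)) * elementSize;
}

int main() {
  // Values from the worked example: 16-bit elements, swizzleByteSize = 128,
  // perPhase = 1, maxPhase = 8, vectorWidth = 8 -> 144 bytes for row 1.
  assert(swizzledRowOffsetBytes(/*row=*/1, 128, 1, 8, 8, /*elementSize=*/2) == 144);
  // Concept 1 arithmetic: 128-byte swizzle / (16 elements * 2 bytes per row) = 4 tiles per chunk.
  int64_t bytesPerTileRow = 16 * 2;
  assert(128 / bytesPerTileRow == 4);
  return 0;
}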

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 1 addition & 3 deletions
@@ -64,9 +64,7 @@ def TTG_AsyncCommitGroupOp : TTG_Op<"async_commit_group"> {
   let results = (outs TTG_AsyncToken:$asyncToken);
   let arguments = (ins Variadic<TTG_AsyncToken>:$inputTokens);
 
-  let assemblyFormat = [{
-    $inputTokens attr-dict
-  }];
+  let assemblyFormat = [{($inputTokens ^)?attr-dict}];
 
   let extraClassDeclaration = [{
     static bool isSupported(int computeCapability) {

include/triton/Dialect/TritonGPU/Transforms/PartitionBuilder.h

Lines changed: 3 additions & 3 deletions
@@ -1,5 +1,5 @@
-#ifndef TRITONGPU_WARPSPECIALIZATION_PARTITIONBUILDER_H
-#define TRITONGPU_WARPSPECIALIZATION_PARTITIONBUILDER_H
+#ifndef TRITON_TRITONGPU_TRANSFORMS_PARTITIONBUILDER_H
+#define TRITON_TRITONGPU_TRANSFORMS_PARTITIONBUILDER_H
 
 #include "mlir/IR/ImplicitLocOpBuilder.h"
 
@@ -33,4 +33,4 @@ StageCluster getStageCluster(Operation *op);
 
 } // namespace mlir::triton::gpu
 
-#endif // TRITONGPU_WARPSPECIALIZATION_PARTITIONBUILDER_H
+#endif // TRITON_TRITONGPU_TRANSFORMS_PARTITIONBUILDER_H

include/triton/Dialect/TritonGPU/Transforms/PipeliningUtility.h

Lines changed: 24 additions & 1 deletion
@@ -132,7 +132,14 @@ void combineRedundantWaitOps(
     llvm::SmallSetVector<gpu::AsyncWaitOp, 8> &waitOps);
 
 // Get the type of the view of a multi-buffered tensor value.
-gpu::MemDescType getBufferViewType(gpu::MemDescType allocTy);
+gpu::MemDescType getBufferViewType(gpu::MemDescType allocTy,
+                                   bool mutableMemory = true);
+
+// Get a mutable, multi-buffered version of the given memdesc type, with
+// multiplicity "depth".
+gpu::MemDescType getMultiBufferedType(gpu::MemDescType memDescType,
+                                      int32_t depth);
+
 // Get a generic shared encoding for a tensor.
 gpu::SharedEncodingTrait getSharedEncoding(RankedTensorType ty);
 // Get a shared encoding for a tensor based on its uses.
@@ -157,6 +164,22 @@ Value createIncrementModulo(OpBuilder &builder, Location loc, Value counter,
 
 scf::ForOp lowerTMADescriptors(scf::ForOp forOp, CoarseSchedule &schedule);
 
+DenseSet<Operation *>
+getTopLevelUsersInLoop(Operation *op, scf::ForOp forOp,
+                       std::function<bool(Operation *)> filter = nullptr);
+
+// Return the "first" op in terms of the stage and cluster ordering.
+Operation *
+getFirstUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
+                         CoarseSchedule &schedule,
+                         std::function<bool(Operation *)> filterUse = nullptr);
+
+// Return the "last" op in terms of the stage and cluster ordering.
+Operation *
+getLastUseOfPipelinedOp(ArrayRef<Operation *> ops, scf::ForOp forOp,
+                        CoarseSchedule &schedule,
+                        std::function<bool(Operation *)> filterUse = nullptr);
+
 } // namespace triton
 } // namespace mlir
 
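For the two new declarations above, "first" and "last" are with respect to the (stage, cluster) order assigned by the CoarseSchedule. A hypothetical, self-contained sketch of that selection rule follows; the struct and function names are invented for illustration and this is not the Triton implementation, which is not shown in this diff.

#include <algorithm>
#include <utility>
#include <vector>

// Hypothetical sketch: pick the use that executes earliest under a
// (stage, cluster) ordering. `op` is a stand-in for an operation handle.
struct Use { int op; int stage; int cluster; };

const Use *firstUseByStageCluster(const std::vector<Use> &uses) {
  if (uses.empty())
    return nullptr;
  return &*std::min_element(uses.begin(), uses.end(),
                            [](const Use &a, const Use &b) {
                              // Earlier stage wins; ties are broken by cluster.
                              return std::make_pair(a.stage, a.cluster) <
                                     std::make_pair(b.stage, b.cluster);
                            });
}
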
include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 5 additions & 2 deletions
@@ -255,8 +255,11 @@ namespace mlir::triton {
 /// Replace all uses of `oldUse` with `val` and propagate the type if needed.
 /// This is useful when we need to change a memory descriptor from immutable to
 /// mutable.
-void replaceUsesAndPropagateType(OpBuilder &builder, Operation *oldUse,
-                                 Value val);
+/// The callback is invoked for each pair of an old and a cloned memdesc op
+/// as the type is propagated.
+void replaceUsesAndPropagateType(
+    OpBuilder &builder, Operation *oldUse, Value val,
+    std::function<void(Operation *, Operation *)> callback = nullptr);
 
 /// Replace all uses of `old` with a local load from `alloc` unless the use is a
 /// `ttg.local_alloc` with a matching shared encoding, in which case the shared
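
A hypothetical call site for the extended signature, showing one way the new callback could be used (tracking which cloned op replaced which original). Only the `replaceUsesAndPropagateType` declaration comes from this diff; the helper wrapped around it is an assumption.

#include "triton/Dialect/TritonGPU/Transforms/Utility.h"
#include "llvm/ADT/DenseMap.h"

// Hypothetical helper: propagate the new memdesc type from `val` through the
// users of `oldUse`, remembering which cloned op replaced which original.
static llvm::DenseMap<mlir::Operation *, mlir::Operation *>
propagateAndTrack(mlir::OpBuilder &builder, mlir::Operation *oldUse,
                  mlir::Value val) {
  llvm::DenseMap<mlir::Operation *, mlir::Operation *> replacements;
  mlir::triton::replaceUsesAndPropagateType(
      builder, oldUse, val,
      [&](mlir::Operation *oldOp, mlir::Operation *newOp) {
        // Record the old -> cloned mapping as propagation proceeds.
        replacements[oldOp] = newOp;
      });
  return replacements;
}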

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 2 additions & 2 deletions
@@ -87,7 +87,7 @@ def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [
 
   let arguments = (ins
     TTG_TensorOrMemDesc:$a,
-    TTG_TensorOrMemDesc:$b,
+    TTG_MemDescType:$b,
     TT_FpIntTensor:$c,
     Optional<I1>:$useC,
     DefaultValuedAttr<TT_InputPrecisionAttr, "::mlir::triton::InputPrecision::IEEE">:$inputPrecision,
@@ -99,7 +99,7 @@ def TTNG_WarpGroupDotOp : TTNG_Op<"warp_group_dot", [
 
   let assemblyFormat = [{
     $a`,` $b`,` $c (`,` $useC^)? attr-dict
-    `:` type($a) `*` type($b) `->` type($d)
+    `:` type($a) `*` qualified(type($b)) `->` type($d)
   }];
 
   let extraClassDeclaration = [{

include/triton/Tools/LayoutUtils.h

Lines changed: 0 additions & 8 deletions
@@ -10,14 +10,6 @@ namespace mlir::triton {
 bool squareSublayoutIsIdentity(const LinearLayout &ll,
                                ArrayRef<StringAttr> dimNames);
 
-// Is the sublayout defined from dimNames to dimNames a subpermutation matrix?
-// I.e. the layout matrix is formed by selecting unique columns from the
-// identity matrix and adding zero columns. A zero column in the layout means
-// that changing a bit in the inputs does not change the bits of the outputs
-// (broadcasting).
-bool squareSublayoutIsPermutation(const LinearLayout &ll,
-                                  ArrayRef<StringAttr> dimNames);
-
 // For each output dimension d, ensure that the layout's output size (i.e., its
 // codomain) does not exceed shape[d]. Do this without changing the size of the
 // layout's inputs (i.e., leave its domain unchanged).
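
The removed comment describes the subpermutation property: every column of the layout matrix is either a zero column (a broadcast input bit) or a distinct column of the identity. A small standalone sketch of that predicate on packed bit-columns, independent of `LinearLayout`; the names below are illustrative only.

#include <cstdint>
#include <unordered_set>
#include <vector>

// Illustrative only: each entry of `columns` is one column of the layout
// matrix over GF(2), packed into a uint64_t. The matrix is a subpermutation
// iff every column is zero or a single identity column, and the nonzero
// columns are pairwise distinct.
bool isSubpermutation(const std::vector<uint64_t> &columns) {
  std::unordered_set<uint64_t> seen;
  for (uint64_t col : columns) {
    if (col == 0)
      continue; // zero column: this input bit does not affect the output (broadcast)
    if ((col & (col - 1)) != 0)
      return false; // more than one bit set: not a column of the identity
    if (!seen.insert(col).second)
      return false; // duplicate identity column: columns are not unique
  }
  return true;
}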

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 49 deletions
@@ -1079,19 +1079,6 @@ SmallVector<Type> SharedMemoryObject::getTypes() const {
   return types;
 }
 
-SmallVector<Value>
-SharedMemoryObject::getStrides(triton::gpu::MemDescType memDesc, Location loc,
-                               RewriterBase &rewriter) const {
-  auto allocShape = memDesc.getAllocShape();
-  auto allocShapePerCTA =
-      triton::gpu::getAllocationShapePerCTA(memDesc.getEncoding(), allocShape);
-  auto layoutOrder = triton::gpu::getOrder(memDesc);
-  auto allocStrides = SharedMemoryObject::getStridesForShape(
-      allocShapePerCTA, layoutOrder, loc, rewriter);
-  return SmallVector<Value>(allocStrides.end() - offsets.size(),
-                            allocStrides.end());
-}
-
 Value SharedMemoryObject::getBaseBeforeSlice(int dim, Location loc,
                                              RewriterBase &rewriter) const {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
@@ -1101,42 +1088,6 @@ Value SharedMemoryObject::getBaseBeforeSlice(int dim, Location loc,
   return b.gep(type, baseElemType, base, offset);
 }
 
-SmallVector<unsigned>
-SharedMemoryObject::getOrderForShape(ArrayRef<int64_t> shape,
-                                     ArrayRef<unsigned> layoutOrder) {
-  SmallVector<unsigned> order(shape.size());
-  // Default minor-to-major order
-  std::iota(order.rbegin(), order.rend(), 0);
-  if (layoutOrder.size() > 0) {
-    // If a layout order is provided, we assume it specifies the order in
-    // which the dimensions are first accessed, and unspecified dimensions
-    // retain the minor-to-major order. For example, if order = [2, 1, 0] and
-    // layoutOrder = [0, 1], we need to shift `layoutOrder`
-    // by -1 (move them right). The resulting order will then be [1, 2, 0].
-    int rankDiff = layoutOrder.size() - shape.size();
-    auto minRank = std::min<size_t>(shape.size(), layoutOrder.size());
-    for (size_t i = 0; i < minRank; ++i)
-      order[i] = layoutOrder[i] - rankDiff;
-    assert(isPermutationOfIota(order) && "Invalid order");
-  }
-  return order;
-}
-
-SmallVector<Value>
-SharedMemoryObject::getStridesForShape(ArrayRef<int64_t> shape,
-                                       ArrayRef<unsigned> layoutOrder,
-                                       Location loc, RewriterBase &rewriter) {
-  SmallVector<Value> strides(shape.size());
-  auto order = SharedMemoryObject::getOrderForShape(shape, layoutOrder);
-  int64_t stride = 1;
-  auto b = TritonLLVMOpBuilder(loc, rewriter);
-  for (auto idx : order) {
-    strides[idx] = b.i32_val(stride);
-    stride *= shape[idx];
-  }
-  return strides;
-}
-
 uint64_t
 SharedMemoryObject::getMaskSpanOffsets(triton::gpu::MemDescType srcTy) {
   auto ctx = srcTy.getContext();
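
The deleted `getStridesForShape` walks the dimensions in the given order, fastest-varying first, and assigns the running product of sizes as each dimension's stride (materialized as i32 constants via the op builder). The same arithmetic on plain integers, as a rough MLIR-free sketch; the function name is illustrative.

#include <cstdint>
#include <vector>

// Illustrative reconstruction of the arithmetic in the deleted helper:
// visit dimensions from fastest- to slowest-varying, assigning the running
// product of dimension sizes as each dimension's stride.
std::vector<int64_t> stridesForShape(const std::vector<int64_t> &shape,
                                     const std::vector<unsigned> &order) {
  std::vector<int64_t> strides(shape.size());
  int64_t stride = 1;
  for (unsigned idx : order) { // order[0] is the fastest-varying dimension
    strides[idx] = stride;
    stride *= shape[idx];
  }
  return strides;
}

// Example: shape = [4, 8, 16] with order = [2, 1, 0] (last dimension fastest)
// yields strides [128, 16, 1].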
