
Commit d702cad

Merge OpenAI Triton commit 72ec661 (#5052)
This PR changes the Triton base from f804bbc to 72ec661 (Sep 3). Pass rate: 98.73% -> 98.74%.
2 parents 548d1eb + eb617ae commit d702cad

File tree: 41 files changed, +1420 -517 lines


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 1 addition & 0 deletions

@@ -569,6 +569,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
                 std::function<Value(Value)> calcPaddedOffset,
                 Value affineOffset, uint64_t maskSpanAffineOffset,
                 RewriterBase &rewriter, const TargetInfoBase &targetInfo,
+                std::optional<int> maybeMaxVecElems = {},
                 Operation *localLoadOp = nullptr);
 
 // Lower an ld/st-like operation given a layout and a callback that creates the

include/triton/Dialect/TritonGPU/IR/LayoutUtility.h

Lines changed: 0 additions & 4 deletions

@@ -5,8 +5,4 @@ namespace mlir::triton::gpu {
 
 CTALayoutAttr permuteCTALayout(MLIRContext *ctx, CTALayoutAttr layout,
                                ArrayRef<int> order);
-
-LinearLayout getPaddedRegToSharedLayout(const LinearLayout &regLayout,
-                                        PaddedSharedEncodingAttr paddedEnc);
-
 } // namespace mlir::triton::gpu

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 11 additions & 2 deletions

@@ -15,10 +15,9 @@ enum class ScaleDotElemType : uint32_t;
 namespace mlir::triton::gpu {
 class SwizzledSharedEncodingAttr;
 class NVMMASharedEncodingAttr;
-class AMDRotatingSharedEncodingAttr;
-class AMDMfmaEncodingAttr;
 class TensorOrMemDesc;
 class MemDescType;
+class CTALayoutAttr;
 
 // - BlockedEncodingAttrs have the following input dimensions.
 //
@@ -73,6 +72,16 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
 // `inDimNames`. The latter does not modify the output sizes.
 LinearLayout getLayoutWithinBlock(const LinearLayout &layout);
 
+// Combines the layout of a CTA (input dims [register, lane, warp]) with the
+// layout of a CGA (i.e. a block), and ensures that the resulting layout has the
+// given shape.
+//
+// See the nomenclature note at the top of LinearLayoutConversions.cpp for why
+// the variable with type CTALayoutAttr is called cgaLayoutAttr.
+LinearLayout combineCtaCgaWithShape(LinearLayout ctaLayout,
+                                    CTALayoutAttr cgaLayoutAttr,
+                                    ArrayRef<int64_t> shape);
+
 // In this function, we construct a linear layout representing the
 // <shared memory offset, iteration, block> -> <tensor element index> mapping
 // for entire `src` and `dst` tensors. We determine the shape of the

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 69 additions & 23 deletions

@@ -6,7 +6,7 @@ include "triton/Dialect/Triton/IR/TritonInterfaces.td"
 include "triton/Dialect/TritonGPU/IR/TritonGPUDialect.td"
 
 //===----------------------------------------------------------------------===//
-// Traits and Interfaces
+// Traits, Interfaces and shared Parameters
 //===----------------------------------------------------------------------===//
 
 def MemDescViewTrait : NativeOpTrait<"MemDescViewTrait">;
@@ -55,6 +55,11 @@ def SharedEncodingTrait : AttrInterface<"SharedEncodingTrait"> {
 def DeclareSharedEncodingMethods : DeclareAttrInterfaceMethods<
     SharedEncodingTrait, ["getAlignment"]>;
 
+def LinearLayoutParam : AttrOrTypeParameter<"LinearLayout",
+                                            "linear layout"> {
+  let cppAccessorType = "const LinearLayout &";
+}
+
 //===----------------------------------------------------------------------===//
 // Base Attribute
 //===----------------------------------------------------------------------===//
@@ -369,14 +374,15 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
 
 def PaddedSharedEncodingAttr
     : TritonGPU_Attr<"PaddedSharedEncoding", "padded_shared_encoding",
-                     [SharedEncodingTrait, LayoutEncodingTrait]> {
+                     [SharedEncodingTrait, DeclareLayoutEncodingMethods]> {
   let mnemonic = "padded_shared";
 
   let description = [{
     An encoding for tensors whose elements may be simultaneously accessed by
     different GPU threads in the programs, via shared memory. In other words,
     for all indices i \in Z^d, \mathcal{L}(i) = {0, 1, ..., 32*num_warps - 1}.
-    Compared to SwizzledSharedEncodingAttr, this encoding uses padding to avoid
+    Compared to SwizzledSharedEncodingAttr, this encoding combines padding with
+    element reordering via linear transformation (e.g. row permutation) to avoid
     shared memory bank conflicts.
 
     Formally, given a layout:
@@ -388,48 +394,93 @@ at index i, the corresponding shared memory location index is
     i + \sum_{k} (i / interval_k) * pad_k
     `<interval_i>` and `<pad_i>` all need to be power of two.
 
-    Some concrete examples, using `eM` to mean tensor elements and `pN` to mean
-    padding:
+    Some concrete examples ignoring the linear component, using `eM` to mean tensor
+    elements and `pN` to mean padding:
 
     1. Single interval-padding pair:
 
-       #ttg.padded_shared<[2:+2]>
+       #ttg.padded_shared<[2:+2], {...}>
       [e0, e1, p0, p1,
        e2, e3, p2, p3,
        ...]
 
     2. Double interval-padding pairs:
 
-       #ttg.padded_shared<[2:+1, 4:+2]>
+       #ttg.padded_shared<[2:+1, 4:+2], {...}>
       [e0, e1, p0,
        e2, e3, p1, p2, p3,
        e4, e5, p4,
        e6, e7, p5, p6, p7,
        ...]
 
-    In addition to interval-padding pairs, this encoding requires an `order` to
-    specify the logical tensor dimenions from the fastest-to slowest-varying.
-    It may optionally support CGA level organization like other encoding
-    attributes too, for example,
-      #ttg.padded_shared<[2:+1, 4:+2] {
-        order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1],
-        CTAOrder = [0, 1]}>
+    Furthermore this encoding allows for a linear remapping from the 1-D shared
+    memory offset to logical n-D tensor elements. The remapping is given in the form
+    of linear bases mapping from offset to [dim0, dim1...dimN-1].
+    See LinearLayout.h for more details how linear layouts are applied to remap
+    elements.
+    Some concrete examples using `xN` and `yN` to mean the logical n-D tensor elements
+    and `pN` to mean padding:
+
+    1. 1D Single interval-padding with strided elements
+
+       #ttg.padded_shared<[2:+2] {offset = [[2], [1]], block = []}>
+      [x0, x2, p0 p1,
+       x1, x3, p2, p3
+       ...]
+
+    2. 2D single interval-padding with rearanged rows.
+
+       #ttg.padded_shared<[16:+1] {offset = [[0, 1], [0, 2], /*gap, stride by 2 rows*/[2, 0], [4, 0], [1, 0]]], block = []}>
+      [
+       x0y0, x0y1, x0y2, x0y3,
+       x2y0, x2y1, x2y2, x2y3,
+       x4y0, x4y1, x4y2, x4y3,
+       x6y0, x6y1, x6y2, x6y3,
+       p0,
+       x1y0, x1y1, x1y2, x1y3,
+       x3y0, x3y1, x3y2, x3y3,
+       x5y0, x5y1, x5y2, x5y3,
+       x7y0, x7y1, x7y2, x7y3,
+       p1,
+      ]
+
+    For identity mappings a short form based on order and shape is used to increase readability. The following two encodings are the same:
+
+       #ttg.padded_shared<[2:+2] {order = [1, 0], shape = [16, 32]}>
+       #ttg.padded_shared<[2:+2] {offset = [[0, 1], [0, 2], [0, 4], [0, 8], [0, 16], [1, 0], [2, 0], [4, 0], [8, 0]], block = []}>
+
   }];
 
   let parameters = (ins
     ArrayRefParameter<"unsigned">:$intervals,
     ArrayRefParameter<"unsigned">:$paddings,
-    // Order of logical tensor dimensions; fastest-varying first.
-    ArrayRefParameter<"unsigned">:$order,
-    "CTALayoutAttr":$CTALayout
+    LinearLayoutParam:$linearComponent
   );
 
   let builders = [
     AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
-                     "ArrayRef<unsigned>":$order, "CTALayoutAttr":$ctaLayout)>,
+                     "LinearLayout":$linearComponent)>,
+
+    // Builder to create an identity mapping as the linear component
+    AttrBuilder<(ins "ArrayRef<std::pair<unsigned, unsigned>>":$intervalPads,
+                     "ArrayRef<unsigned>":$order, "ArrayRef<int64_t>":$shape,
+                     "CTALayoutAttr":$ctaLayout)>,
   ];
 
   let extraClassDeclaration = extraBaseClassDeclaration # [{
+    // Returns the order of the dimensions `dimName` of the layout.
+    // If more than dimension is of size one, it uses defaultOrder to determine
+    // the order of the dimensions of size one.
+    SmallVector<unsigned> orderPerDim(StringAttr dimName,
                                       ArrayRef<unsigned> defaultOrder) const;
+    SmallVector<unsigned> getOrder() const;
+
+    // Returns the bases of the dimensions `dimName` of the linear_component.
+    // If skipBroadcast is false, we count a base zero
+    SmallVector<unsigned> basesPerDim(StringAttr dimName,
                                       bool skipBroadcast = true) const;
+
     unsigned getMinInterval() const {
       return *llvm::min_element(getIntervals());
     }
@@ -708,11 +759,6 @@ L(T) = [ {0,8} , {1,9} , {2,10}, {3,11}, {0,8} , {1, 9} , {2, 10}, {3, 11},
 // Linear Layout Encoding
 //===----------------------------------------------------------------------===//
 
-def LinearLayoutParam : AttrOrTypeParameter<"LinearLayout",
-                                            "linear layout"> {
-  let cppAccessorType = "const LinearLayout &";
-}
-
 def LinearEncodingAttr : DistributedEncoding<"LinearEncoding", "linear_encoding", [DeclareLayoutEncodingMethods]> {
   let mnemonic = "linear";
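Note: the padding formula and the offset bases in the description above can be checked numerically. The following standalone C++ sketch is illustrative only and not part of this commit; it assumes plain integer pairs for the interval-padding list and plain vectors for the linear bases.

    #include <cstdint>
    #include <utility>
    #include <vector>

    // Padding: offset(u) = u + sum_k (u / interval_k) * pad_k (integer division).
    // For [2:+1, 4:+2] this yields 0, 1, 3, 4, 8, 9, 11, 12 for u = 0..7,
    // matching the second interval-padding example above.
    int64_t paddedOffset(int64_t u,
                         const std::vector<std::pair<int64_t, int64_t>> &intervalPads) {
      int64_t result = u;
      for (const auto &ip : intervalPads)
        result += (u / ip.first) * ip.second;
      return result;
    }

    // Linear component: bases[b] is the n-D coordinate contributed by bit b of the
    // unpadded offset; contributions of all set bits combine with XOR, as in
    // LinearLayout. For offset bases [[2], [1]] this maps offsets 0, 1, 2, 3 to
    // elements x0, x2, x1, x3, matching the 1-D strided example above.
    std::vector<int64_t>
    applyOffsetBases(int64_t u, const std::vector<std::vector<int64_t>> &bases) {
      std::vector<int64_t> coords(bases.empty() ? 0 : bases[0].size(), 0);
      for (size_t b = 0; b < bases.size(); ++b)
        if ((u >> b) & 1)
          for (size_t d = 0; d < coords.size(); ++d)
            coords[d] ^= bases[b][d];
      return coords;
    }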

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 3 additions & 2 deletions

@@ -213,8 +213,9 @@ std::optional<StringRef> getAMDArch(Operation *module);
 std::optional<mlir::triton::gpu::SwizzledSharedEncodingAttr>
 getSharedEncIfAllUsersAreDotEnc(Value val, bool &incompatible);
 
-// Convert \param op operands and results to layout \param encoding.
-void convertOpEncoding(Attribute encoding, Operation *op);
+// Convert \param op to use \param encoding attribute.
+// Skips operands if they're in shared encoding.
+Operation *convertDistributedOpEncoding(Attribute encoding, Operation *op);
 
 // Returns the original memory allocation for a memdesc value
 triton::gpu::LocalAllocOp findShmemAlloc(Value operand);

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOpInterfaces.td

Lines changed: 3 additions & 0 deletions

@@ -53,6 +53,9 @@ def MMAv5OpInterface : OpInterface<"MMAv5OpInterface"> {
                     "void",
                     "setIsAsync",
                     (ins "bool":$isAsync)>,
+    InterfaceMethod<"Return true if this MMA op executes asynchronously.",
+                    "bool",
+                    "isAsync">
   ];
 
   let verify = [{

lib/Conversion/TritonGPUToLLVM/MemoryOpToLLVM.cpp

Lines changed: 14 additions & 12 deletions

@@ -31,15 +31,16 @@ LogicalResult lowerLocalStore(Location loc, MLIRContext *ctx, Value regVal,
       dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(memDescTy.getEncoding());
   LinearLayout cvt = LinearLayout::empty();
   if (paddedEnc) {
-    cvt = getPaddedRegToSharedLayout(regLayout, paddedEnc);
+    const auto &sharedLL = paddedEnc.getLinearComponent();
+    cvt = regLayout.invertAndCompose(sharedLL);
   } else {
     auto sharedLayout = toLinearLayout(memDescTy);
     cvt = regLayout.invertAndCompose(sharedLayout);
-    auto kBlock = str_attr("block");
-    // NYI. We would need to emit a map.shared::cluster instruction.
-    if (!cvt.isTrivialOver({kBlock})) {
-      return failure();
-    }
+  }
+  auto kBlock = str_attr("block");
+  // NYI. We would need to emit a map.shared::cluster instruction.
+  if (!cvt.isTrivialOver({kBlock})) {
+    return failure();
   }
   cvt = cvt.sublayout({kReg, kLane, kWarp}, {kOffset});
   lowerLocalLdSt(loc, ctx, cvt, inVals, llvmElemTy, memDescTy, smemObj,
@@ -167,15 +168,16 @@ struct LocalLoadOpConversion : public ConvertOpToLLVMPattern<LocalLoadOp> {
     auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedEnc);
     LinearLayout cvt = LinearLayout::empty();
     if (paddedEnc) {
-      cvt = getPaddedRegToSharedLayout(regLayout, paddedEnc);
+      const auto &sharedLL = paddedEnc.getLinearComponent();
+      cvt = regLayout.invertAndCompose(sharedLL);
     } else {
       auto sharedLayout = toLinearLayout(memDescTy);
       cvt = regLayout.invertAndCompose(sharedLayout);
-      auto kBlock = str_attr("block");
-      // NYI. We would need to emit a map.shared::cluster instruction.
-      if (!cvt.isTrivialOver({kBlock})) {
-        return failure();
-      }
+    }
+    auto kBlock = str_attr("block");
+    // NYI. We would need to emit a map.shared::cluster instruction.
+    if (!cvt.isTrivialOver({kBlock})) {
+      return failure();
     }
     cvt = cvt.sublayout({kReg, kLane, kWarp}, {kOffset});
 
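In both hunks the padded path now goes through the encoding's linear component instead of a dedicated helper. A minimal sketch of the direction of that composition, with an illustrative wrapper name (this wrapper is not part of the commit):

    // regLayout maps hardware indices [register, lane, warp, block] to logical
    // tensor coordinates, and the encoding's linear component maps shared memory
    // offsets to the same coordinates. A.invertAndCompose(B) yields a layout C
    // with B(C(x)) == A(x), so the result consumes hardware indices and produces
    // shared memory offsets, which is the conversion the store/load lowering needs.
    LinearLayout regToSharedCvt(const LinearLayout &regLayout,
                                triton::gpu::PaddedSharedEncodingAttr paddedEnc) {
      const LinearLayout &sharedLL = paddedEnc.getLinearComponent();
      return regLayout.invertAndCompose(sharedLL);
    }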

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 15 additions & 7 deletions

@@ -567,7 +567,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
                 std::function<Value(Value)> calcPaddedOffset,
                 Value affineOffset, uint64_t maskSpanAffineOffset,
                 RewriterBase &rewriter, const TargetInfoBase &targetInfo,
-                Operation *localLoadOp) {
+                std::optional<int> maybeMaxVecElems, Operation *localLoadOp) {
 
   bool isStore = !valsArray.empty();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
@@ -593,7 +593,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
   auto [laneId, warpId] = getLaneAndWarpId(rewriter, loc);
   return lowerLdSt(loc, ctx, cvt, valsArray, llvmElemTy, smemBase,
                    calcPaddedOffset, affineOffset, maskSpanAffineOffset, laneId,
-                   warpId, rewriter, targetInfo, {}, emitLdSt);
+                   warpId, rewriter, targetInfo, maybeMaxVecElems, emitLdSt);
 }
 
 SmallVector<Value> lowerLdSt(
@@ -728,9 +728,17 @@ lowerLocalLdSt(Location loc, MLIRContext *ctx,
   }
   auto affineOffset = smemObj.getShmemOffset(loc, rewriter, srcTy);
   auto maskSpanAffineOffset = smemObj.getMaskSpanOffsets(srcTy);
-  return lowerLdStShared(
-      loc, ctx, cvt, valsArray, llvmElemTy, smemObj.getBase(), calcPaddedOffset,
-      affineOffset, maskSpanAffineOffset, rewriter, targetInfo, localLoadOp);
+
+  std::optional<int> maybeMaxVecElems;
+  if (auto paddedEnc = dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(
+          srcTy.getEncoding())) {
+    maybeMaxVecElems = paddedEnc.getMinInterval();
+  }
+
+  return lowerLdStShared(loc, ctx, cvt, valsArray, llvmElemTy,
+                         smemObj.getBase(), calcPaddedOffset, affineOffset,
+                         maskSpanAffineOffset, rewriter, targetInfo,
+                         maybeMaxVecElems, localLoadOp);
 }
 
 bool emitTransferBetweenRegistersAndShared(
@@ -753,8 +761,8 @@ bool emitTransferBetweenRegistersAndShared(
       dyn_cast<triton::gpu::PaddedSharedEncodingAttr>(sharedTy.getEncoding());
   LinearLayout regToSharedLayout = LinearLayout::empty();
   if (paddedEnc) {
-    regToSharedLayout =
-        triton::gpu::getPaddedRegToSharedLayout(regLayout, paddedEnc);
+    const auto &sharedLL = paddedEnc.getLinearComponent();
+    regToSharedLayout = regLayout.invertAndCompose(sharedLL);
   } else {
     auto sharedLL = triton::gpu::toLinearLayout(sharedTy);
     regToSharedLayout = regLayout.invertAndCompose(sharedLL);
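The new maybeMaxVecElems plumbing caps ld/st vectorization for padded encodings at the smallest padding interval. A small standalone check, illustrative only and not part of this commit, shows why that bound is safe:

    #include <cstdint>
    #include <utility>
    #include <vector>

    // A run of `vec` consecutive unpadded offsets stays contiguous in shared
    // memory iff no (u / interval_k) term in the padding formula changes inside
    // the run. If `vec` divides every interval_k -- which holds whenever
    // vec <= min_k interval_k and all values are powers of two -- an aligned run
    // never crosses a padding insertion point.
    bool runStaysContiguous(int64_t start, int64_t vec,
                            const std::vector<std::pair<int64_t, int64_t>> &intervalPads) {
      for (const auto &ip : intervalPads)
        if (start / ip.first != (start + vec - 1) / ip.first)
          return false;
      return true;
    }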
