@@ -15,6 +15,9 @@ def TritonGPU_AttrTrait : AttrInterface<"TritonGPU_AttrTrait"> {
   ];
 }

+def MemDescViewTrait : NativeOpTrait<"MemDescViewTrait">;
+
+
 class TritonGPU_Attr<string name, string attrMnemonic, list<Trait> traits = [],
                      Dialect dialect = TritonGPU_Dialect,
                      string baseCppClass = "::mlir::Attribute">
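
A note on the new trait declaration above: `NativeOpTrait<"MemDescViewTrait">` only names a C++ trait class, which TableGen resolves to `::mlir::OpTrait::MemDescViewTrait` by default. Below is a minimal sketch of the C++ counterpart such a declaration expects to exist; the marker-only body is an assumption for illustration and is not shown in this diff.

```cpp
#include "mlir/IR/OpDefinition.h"

namespace mlir {
namespace OpTrait {

// Marker trait with no verification hooks: it only tags ops whose result
// memdesc is a view of an existing allocation, so passes can test for it
// with op->hasTrait<MemDescViewTrait>().
template <typename ConcreteType>
class MemDescViewTrait : public TraitBase<ConcreteType, MemDescViewTrait> {};

} // namespace OpTrait
} // namespace mlir
```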
@@ -309,46 +312,54 @@ When vec=2, elements are swizzled in pairs of 2. In other words, the element at
       if(!mmaEnc)
         return get(context, 1, 1, 1, order, CTALayout);

+      int opIdx = dotOpEnc.getOpIdx();
+      auto shapePerCTA = getShapePerCTA(CTALayout.getCTASplitNum(), shape);
+
+      // index of the inner dimension in `order`
+      unsigned inner = (opIdx == 0) ? 0 : 1;
+
       // ---- begin Ampere & Hopper ----
       if (mmaEnc.isAmpere() || mmaEnc.isHopper()) {
-        return get(context, dotOpEnc.getOpIdx(), dotOpEnc.getKWidth(), shape, order, CTALayout, typeWidthInBit, needTrans);
+        // number of rows per phase
+        int perPhase = 128 / (shapePerCTA[order[0]] * 4 / dotOpEnc.getKWidth());
+        perPhase = std::max<int>(perPhase, 1);
+        // {m, n, k} shape of a single mma instruction tile
+        std::vector<size_t> matShape = {8, 8, 4 * dotOpEnc.getKWidth()};
+        int vecWidth = 32 / typeWidthInBit;
+        if (vecWidth != dotOpEnc.getKWidth() && order[0] == inner) {
+          perPhase = std::max<int>(perPhase, 2 * vecWidth);
+        }
+        int rank = order.size();
+        // --- handle A operand ---
+        if (opIdx == 0) { // compute swizzling for A operand
+          int m = needTrans ? matShape[2] : matShape[0];
+          int k = needTrans ? matShape[0] : matShape[2];
+          int vec = (order[0] == rank-1) ? k : m;
+          int mmaStride = (order[0] == rank-1) ? m : k;
+          int maxPhase = std::max(mmaStride / perPhase, 1);
+          return get(context, vec, perPhase, maxPhase, order, CTALayout);
+        }
+
+        // --- handle B operand ---
+        if (opIdx == 1) {
+          // We compute vec and maxPhase from the m, n and k sizes of the
+          // mma instruction. When the matmul operand is transposed, we take
+          // that into account to recover m, n and k.
+          int n = needTrans ? matShape[2] : matShape[1];
+          int k = needTrans ? matShape[1] : matShape[2];
+          int vec = (order[0] == rank-1) ? n : k;
+          int mmaStride = (order[0] == rank-1) ? k : n;
+          int maxPhase = std::max(mmaStride / perPhase, 1);
+          return get(context, vec, perPhase, maxPhase, order, CTALayout);
+        }
+
+        llvm_unreachable("invalid operand index");
       }

       // ---- not implemented ----
       llvm_unreachable("unsupported swizzling for provided MMA version");
     }]>,

-    // NVIDIA constructor!
-    // TODO(lezcano): We should totally get rid of all these constructors...
-    AttrBuilder<(ins "int":$opIdx,
-                     "unsigned":$kWidth,
-                     "ArrayRef<int64_t>":$shape,
-                     "ArrayRef<unsigned>":$order,
-                     "CTALayoutAttr":$CTALayout,
-                     "unsigned":$bitwidth,
-                     "bool":$needTrans), [{
-      int K = getShapePerCTA(CTALayout.getCTASplitNum(), shape)[order[0]];
-      // Elems necessary to cover all the banks divided by the inner dimension
-      // This packs a few rows together for small K
-      int perPhase = std::max<int>(1024 / (bitwidth * K), 1);
-
-      int mmaStride = 8;
-      int vec = 4 * kWidth;
-      // needsTrans is equiv. to flipping the opIdx
-      if (needTrans)
-        std::swap(vec, mmaStride);
-      assert(opIdx == 0 || opIdx == 1);
-      int rank = order.size();
-      int kDim = opIdx == 0 ? rank-1 : rank-2;
-      if (order[0] != kDim)
-        std::swap(vec, mmaStride);
-      // Count how many vec elements are needed to cover all the banks
-      int maxPhase = std::max(std::min<int>(mmaStride, 1024 / (vec * bitwidth)), 1);
-      // Account for the row packing from perPhase: mmaStride / perPhase
-      maxPhase = std::max(maxPhase / perPhase, 1);
-      return get(context, vec, perPhase, maxPhase, order, CTALayout);
-    }]>,
-
     AttrBuilder<(ins "DotOperandEncodingAttr":$dotOpEnc,
                      "ArrayRef<int64_t>":$shape,
                      "ArrayRef<unsigned>":$order,
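
To sanity-check the arithmetic restored in the hunk above, here is a small standalone replay of the builder's computation for one assumed configuration (fp16 A operand, kWidth = 2, 64 elements per CTA along the contiguous dimension, row-major order, no transpose). All inputs are illustrative and not taken from this diff.

```cpp
#include <algorithm>
#include <cstdio>
#include <vector>

// Standalone replay of the swizzle arithmetic restored above, for one
// assumed configuration (all inputs are illustrative, not from this diff):
// fp16 A operand (opIdx = 0), kWidth = 2, 64 elements per CTA along the
// contiguous dimension, row-major order = {1, 0}, needTrans = false.
int main() {
  const int typeWidthInBit = 16, kWidth = 2, opIdx = 0;
  const int innerShapePerCTA = 64; // stands in for shapePerCTA[order[0]]
  const std::vector<unsigned> order = {1, 0};
  const bool needTrans = false;
  const unsigned inner = (opIdx == 0) ? 0 : 1;

  // number of rows per phase: 128 / (64 * 4 / 2) = 1
  int perPhase = std::max(128 / (innerShapePerCTA * 4 / kWidth), 1);

  // {m, n, k} shape of one mma instruction tile: {8, 8, 8}
  const int matShape[3] = {8, 8, 4 * kWidth};
  const int vecWidth = 32 / typeWidthInBit; // 2, equal to kWidth here
  if (vecWidth != kWidth && order[0] == inner)
    perPhase = std::max(perPhase, 2 * vecWidth); // not taken in this example

  const int rank = static_cast<int>(order.size());
  // A operand: pick (vec, mmaStride) based on which dimension is contiguous.
  const int m = needTrans ? matShape[2] : matShape[0];            // 8
  const int k = needTrans ? matShape[0] : matShape[2];            // 8
  const int vec = (order[0] == unsigned(rank - 1)) ? k : m;       // 8
  const int mmaStride = (order[0] == unsigned(rank - 1)) ? m : k; // 8
  const int maxPhase = std::max(mmaStride / perPhase, 1);         // 8

  std::printf("vec=%d perPhase=%d maxPhase=%d\n", vec, perPhase, maxPhase);
  return 0;
}
```

For this configuration the builder would return vec = 8, perPhase = 1, maxPhase = 8.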
@@ -387,6 +398,8 @@ def NVMMASharedEncodingAttr :
     This is meant to represent 2d tiled blocked layout.
     The full layout representation is described here:
     https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-shared-memory-layout
+    When the memdesc has more than 2 dimensions, the tiling is applied to 8 rows even if the outermost dimension is smaller than 8.
+    In this case `transposed` means that the contiguous dimension is the outermost dimension of the memdesc.
   }];


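
To make the new doc wording concrete, here is a hedged sketch of how the `transposed` flag could map to element order for a rank-3 memdesc of shape {S0, S1, S2}; the helper and the exact stride convention (full dimension reversal in the transposed case) are assumptions for exposition, not code from this change.

```cpp
#include <array>
#include <cstdio>

// Illustration only: one reading of the `transposed` wording added above,
// for a rank-3 memdesc of shape {S0, S1, S2}. The helper and the stride
// convention are assumptions for exposition, not code from this change.
std::array<long, 3> strides(const std::array<long, 3> &s, bool transposed) {
  if (!transposed)
    return {s[1] * s[2], s[2], 1}; // innermost dimension S2 is contiguous
  return {1, s[0], s[0] * s[1]};   // outermost dimension S0 is contiguous
}

int main() {
  const auto a = strides({2, 16, 64}, false); // {1024, 64, 1}
  const auto b = strides({2, 16, 64}, true);  // {1, 2, 32}
  std::printf("%ld %ld %ld / %ld %ld %ld\n",
              a[0], a[1], a[2], b[0], b[1], b[2]);
  return 0;
}
```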