Commit 55cea61

[LAYOUTS] Fix memdesc_subviews when we don't slice along the swizzling pattern (#7480)
The previous way of handling `memdesc_subviews` in the context of LinearLayouts was wrong. Consider a 1D shared-memory layout over `64` elements of the form

```
A = {offset = [1, 2, 5, 10, 32, 16], {dim0=64}}
```

If we take a `subview` of `A` that yields a `tensor<32xtype>`, we should get the layout

```
A_sub = {offset = [1, 2, 5, 10, 0, 16], {dim0=32}}
```

which maps the full shared memory onto a tensor with `32` elements. When we compute `A_sub^{-1}B` for `B` a distributed layout on a tensor of `32` elements, loading through this layout gives the correct mapping on the offsets, as expected.

This PR fixes the issue by passing the initial shape of the shared memory and resizing the layout when the `LinearLayout` is created. This makes explicit what we had already realised at the IR level: subviews depend not only on the shape of the layout, but also on the initial shape of the shared memory. The PR also removes a number of hacks that worked around the issue above.

To do this, we generalise `lstsq` to compute a left inverse of `A` when `A` is injective but its image is not a subset of that of `B`.

The case where we split along a dimension that is within the swizzling pattern will be fixed in a follow-up PR.
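As a minimal, standalone sketch (illustrative code, not Triton's `LinearLayout` API) of the 1-D resizing step described above: every offset basis that lands outside the new logical size is mapped to `0` rather than dropped, so the offset in-dimension keeps the size of the full allocation while the out-dimension shrinks to the view.

```cpp
// Restrict a 1-D shared-memory linear layout, given as XOR bases over the
// offset bits, to a smaller power-of-two view. Bases outside the view map
// to 0 instead of being removed, so the number of offset bits is unchanged.
#include <cstdint>
#include <iostream>
#include <vector>

std::vector<int32_t> subviewBases(const std::vector<int32_t> &bases,
                                  int32_t newSize) {
  std::vector<int32_t> result;
  for (int32_t b : bases)
    result.push_back(b < newSize ? b : 0); // outside the view -> map to 0
  return result;
}

int main() {
  // A = {offset = [1, 2, 5, 10, 32, 16]} over dim0 = 64.
  std::vector<int32_t> A = {1, 2, 5, 10, 32, 16};
  // Subview yielding a 32-element tensor: expect [1, 2, 5, 10, 0, 16].
  for (int32_t b : subviewBases(A, 32))
    std::cout << b << " ";
  std::cout << "\n";
}
```

Running it prints `1 2 5 10 0 16`, the `A_sub` bases from the example.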
1 parent 322cd5b commit 55cea61

File tree: 17 files changed, +183 −114 lines changed

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 2 additions & 1 deletion

@@ -15,7 +15,8 @@
 #include <unordered_map>
 
 // LinearLayoutCache Utils
-using CacheKey = std::tuple<std::vector<int64_t>, mlir::Attribute>;
+using CacheKey =
+    std::tuple<std::vector<int64_t>, mlir::Attribute, std::vector<int64_t>>;
 
 namespace llvm {
 template <typename T> size_t hash_value(const std::vector<T> &vec) {
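A hedged illustration (standalone code, not Triton's cache) of why the allocation shape now has to be part of the cache key above: two layouts with the same logical shape and encoding but different allocation shapes lower to different LinearLayouts, so caching on `{shape, attribute}` alone could return a stale entry. The string stands in for `mlir::Attribute`.

```cpp
#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <vector>

// Stand-in for CacheKey: {logical shape, encoding, allocation shape}.
using Key = std::tuple<std::vector<int64_t>, std::string, std::vector<int64_t>>;

int main() {
  std::map<Key, int> cache;
  std::vector<int64_t> shape{32};
  // Same logical shape, same encoding, different allocation shapes.
  cache[std::make_tuple(shape, std::string("swizzled"),
                        std::vector<int64_t>{32})] = 1; // built for a 32-elem alloc
  cache[std::make_tuple(shape, std::string("swizzled"),
                        std::vector<int64_t>{64})] = 2; // subview of a 64-elem alloc
  return cache.size() == 2 ? 0 : 1; // two distinct entries, no collision
}
```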

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 2 additions & 1 deletion

@@ -47,7 +47,8 @@ class MemDescType;
 // elemBitWidth is the bit width of one element in the layout. This is required
 // to compute the linear layout for MMAv3 (i.e. Hopper) shared layouts (i.e.
 // shared layouts with nvmma_shared layout) but is otherwise unused.
-LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
+LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
+                            ArrayRef<int64_t> allocationShape);
 LinearLayout toLinearLayout(RankedTensorType type);
 LinearLayout toLinearLayout(MemDescType type);
 LinearLayout toLinearLayout(TensorOrMemDesc type);

include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td

Lines changed: 1 addition & 1 deletion

@@ -22,7 +22,7 @@ def TritonGPU_Dialect : Dialect {
   let extraClassDeclaration = [{
     void registerTypes();
 
-    LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
+    LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout, ArrayRef<int64_t> allocationShape);
     LinearEncodingAttr toLinearEncoding(ArrayRef<int64_t> shape, Attribute layout);
 
     static int getNumCTAs(ModuleOp mod);

include/triton/Tools/LinearLayout.h

Lines changed: 4 additions & 3 deletions

@@ -325,7 +325,7 @@ class LinearLayout {
       bases;
 
   llvm::MapVector<StringAttr, int32_t /*size*/> outDims;
-  bool surjective = true;
+  int32_t rank = 0;
 
 public:
   using BasesT = decltype(bases);
@@ -425,10 +425,11 @@
       ArrayRef<std::pair<StringAttr, std::vector<std::vector<int32_t>>>> bases,
       ArrayRef<std::pair<StringAttr, int32_t>> outDims, bool requireSurjective);
 
-  bool isSurjective() const { return surjective; }
+  bool isSurjective() const { return rank == getTotalOutDimSizeLog2(); }
+  bool isInjective() const { return rank == getTotalInDimSizeLog2(); }
 
   bool isInvertible() const {
-    return surjective && getTotalInDimSize() == getTotalOutDimSize();
+    return isSurjective() && getTotalInDimSize() == getTotalOutDimSize();
   }
 
   const BasesT &getBases() const { return bases; }
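Reading the diff above: instead of storing a `surjective` flag, the layout now stores the GF(2) rank of its basis vectors, and both surjectivity and injectivity fall out of comparing that rank with the log2 sizes of the out- and in-dimensions. A hedged, standalone sketch (not the Triton implementation) of computing such a rank by Gaussian elimination over bit-vectors:

```cpp
#include <cstdint>
#include <utility>
#include <vector>

// Each row is one basis vector, packed as a bitmask of output bits.
// A layout with 2^m inputs and 2^n outputs is surjective iff rank == n
// and injective iff rank == m.
int rankGF2(std::vector<uint64_t> rows) {
  int rank = 0;
  for (int bit = 63; bit >= 0; --bit) {
    // Find a not-yet-used row with a 1 in this column.
    size_t pivot = rank;
    while (pivot < rows.size() && !((rows[pivot] >> bit) & 1))
      ++pivot;
    if (pivot == rows.size())
      continue;
    std::swap(rows[rank], rows[pivot]);
    // Eliminate this bit from every other row.
    for (size_t r = 0; r < rows.size(); ++r)
      if (r != static_cast<size_t>(rank) && ((rows[r] >> bit) & 1))
        rows[r] ^= rows[rank];
    ++rank;
  }
  return rank;
}
```

For the `A_sub` example from the commit message, the bases `{1, 2, 5, 10, 0, 16}` have rank 5, so the subview layout is surjective onto the 32-element view (5 == log2 32) but no longer injective from the 64-offset in-dimension (5 < 6).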

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 18 additions & 14 deletions

@@ -40,11 +40,13 @@ namespace {
 LinearLayout getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
                                   LinearLayout regLayout,
                                   triton::gpu::SharedEncodingTrait dstEnc,
-                                  int elemBitWidth) {
+                                  int elemBitWidth,
+                                  ArrayRef<int64_t> allocShape) {
   StringAttr kBlock = StringAttr::get(ctx, ("block"));
   int rank = shape.size();
 
-  LinearLayout sharedLayout = triton::gpu::toLinearLayout(shape, dstEnc);
+  LinearLayout sharedLayout =
+      triton::gpu::toLinearLayout(shape, dstEnc, allocShape);
   auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);
 
   // sharedLayout's in-dims are currently (offset, block). Reshape to
@@ -378,7 +380,7 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
   MLIRContext *ctx = rewriter.getContext();
   auto shape = type.getShape();
 
-  LinearLayout ll = triton::gpu::toLinearLayout(shape, layout);
+  LinearLayout ll = triton::gpu::toLinearLayout(shape, layout, {});
 
   StringAttr kRegister = str_attr("register");
   StringAttr kLane = str_attr("lane");
@@ -503,7 +505,7 @@ SmallVector<Value> getSmemVecAddrVec(
           sharedEnc)) {
     auto regToSharedSwizzledLayout =
         getRegToSharedLayout(ctx, shape, regLayout, swizzledSharedEnc,
-                             elemLlvmTy.getIntOrFloatBitWidth());
+                             elemLlvmTy.getIntOrFloatBitWidth(), allocShape);
     auto smemOrder = swizzledSharedEnc.getOrder();
 
     auto swizzledIndicesVec =
@@ -659,9 +661,9 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
   bool isStore = !valsArray.empty();
   auto b = TritonLLVMOpBuilder(loc, rewriter);
 
-  auto emitCpAsync = [&](ConversionPatternRewriter &rewriter, Location loc,
-                         ArrayRef<Value> vals, Value shmemAddr, int idx,
-                         VectorType vecTy) -> SmallVector<Value> {
+  auto emitLdSt = [&](ConversionPatternRewriter &rewriter, Location loc,
+                      ArrayRef<Value> vals, Value shmemAddr, int idx,
+                      VectorType vecTy) -> SmallVector<Value> {
     auto length = vecTy.getNumElements();
     if (isStore) {
       Value valsVec =
@@ -677,7 +679,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
     }
   };
   return lowerLdSt(loc, ctx, cvt, valsArray, llvmElemTy, smemBase, rewriter,
-                   targetInfo, {}, emitCpAsync);
+                   targetInfo, {}, emitLdSt);
 }
 
 SmallVector<Value> lowerLdSt(
@@ -859,11 +861,13 @@ bool emitTransferBetweenRegistersAndShared(
   auto allocShape = sharedTy.getAllocShape();
   auto invertAllocSharedLayout = LinearLayout::empty();
   if (!paddedLayout) {
-    // For now this is only needed for the cases where we have swizzling.
-    invertAllocSharedLayout =
-        triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
-                                    sharedTy.getEncoding())
-            .pseudoinvert();
+    // This is the legacy way of doing things that's much more ad-hoc
+    // For generic shared layouts it may or may not be correct
+    auto allocShape = sharedTy.getAllocShape();
+    auto trimShape = allocShape.take_back(sharedTy.getRank());
+    invertAllocSharedLayout = triton::gpu::toLinearLayout(
+                                  trimShape, sharedTy.getEncoding(), trimShape)
+                                  .pseudoinvert();
   }
 
   int numElems = regToSharedLayout.getInDimSize(kRegister);
@@ -1473,7 +1477,7 @@ delinearize(RewriterBase &rewriter, Location loc,
             triton::gpu::DistributedEncodingTrait layout,
             ArrayRef<int64_t> shape, StringAttr dimName, Value linear) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto ll = triton::gpu::toLinearLayout(shape, layout);
+  auto ll = triton::gpu::toLinearLayout(shape, layout, {});
   auto linearLayout =
       triton::gpu::LinearEncodingAttr::get(rewriter.getContext(), ll);
   assert(ll.hasInDim(dimName));
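The `invertAllocSharedLayout ... .pseudoinvert()` path above, together with the `lstsq` generalisation mentioned in the commit message, boils down to inverting an injective GF(2)-linear map on its image. A standalone sketch of one way to compute such a left inverse (illustrative, assumed code; this is not Triton's `lstsq` or `pseudoinvert`):

```cpp
#include <cassert>
#include <cstdint>
#include <vector>

// cols[j] = image of input bit j, packed as a bitmask of output bits.
// Returns result[j] = bitmask of output bits whose XOR recovers input bit j.
std::vector<uint64_t> leftInverseGF2(const std::vector<uint64_t> &cols,
                                     int numOutBits) {
  int numInBits = static_cast<int>(cols.size());
  struct Row {
    uint64_t value = 0; // current combination, expressed over input bits
    uint64_t tag = 0;   // which original output rows were XOR-ed together
  };
  // Row i of the matrix: bit j is set iff output bit i appears in column j.
  std::vector<Row> rows(numOutBits);
  for (int i = 0; i < numOutBits; ++i) {
    rows[i].tag = uint64_t(1) << i;
    for (int j = 0; j < numInBits; ++j)
      if ((cols[j] >> i) & 1)
        rows[i].value |= uint64_t(1) << j;
  }
  // Gauss-Jordan: give each input bit its own pivot row.
  std::vector<int> pivotOf(numInBits, -1);
  std::vector<bool> used(numOutBits, false);
  for (int j = 0; j < numInBits; ++j) {
    for (int i = 0; i < numOutBits; ++i)
      if (!used[i] && ((rows[i].value >> j) & 1)) {
        pivotOf[j] = i;
        break;
      }
    assert(pivotOf[j] != -1 && "map is not injective");
    used[pivotOf[j]] = true;
    for (int i = 0; i < numOutBits; ++i)
      if (i != pivotOf[j] && ((rows[i].value >> j) & 1)) {
        rows[i].value ^= rows[pivotOf[j]].value;
        rows[i].tag ^= rows[pivotOf[j]].tag;
      }
  }
  // After full elimination, pivot row j represents exactly input bit j.
  std::vector<uint64_t> result(numInBits);
  for (int j = 0; j < numInBits; ++j)
    result[j] = rows[pivotOf[j]].tag;
  return result;
}
```

Applying `result[j]` as an XOR-parity mask over the output coordinates recovers offset bit `j`, which is conceptually how a register-to-shared transfer walks back from tensor coordinates to shared-memory offsets.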

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 11 additions & 17 deletions

@@ -471,35 +471,24 @@ struct MemDescSubviewOpConversion
     // newBase = base + offset
     auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(),
                                                    llvmElemTy, rewriter);
-    auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
     SmallVector<Value> opOffsetVals = op.getOffsets();
     // We assume we always create a subview of the last dimensions
-    SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
-                                     smemStrides.end());
     // Compute total offset
-    SmallVector<Value> offsetVals;
-    auto destRank = op.getResult().getType().getRank();
-    auto rankReduced = srcTy.getRank() - destRank;
-    for (int i = rankReduced; i < opOffsetVals.size(); i++) {
-      offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
-    }
+    auto rankReduced = srcTy.getRank() - destTy.getRank();
 
     Value offset;
     if (rankReduced || (destTy.getRank() == 1 && destTy.getDimSize(0) == 1)) {
+      auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
+      SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
+                                       smemStrides.end());
       // We are splitting the pipelining dimension which may not be a power of 2
       // so we can't use LinearLayouts
       offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
     } else {
       auto dimNames = standardOutDimNames(ctx, opOffsetVals.size());
       SmallVector<std::pair<StringAttr, Value>> logicalOffsets;
-      // This assumes the subviews are additive, in the sense that we can
-      // compute the offset of one and an add it to the offset of the previous
-      // one we computed. We check for this in the verifier.
-      for (int i = 0; i < rankReduced; i++) {
-        logicalOffsets.push_back({dimNames[i], b.i32_val(0)});
-      }
-      for (int i = rankReduced; i < opOffsetVals.size(); i++) {
-        logicalOffsets.push_back({dimNames[i], offsetVals[i - rankReduced]});
+      for (auto [dim, offset] : llvm::zip(dimNames, opOffsetVals)) {
+        logicalOffsets.push_back({dim, offset});
       }
       auto ll = toLinearLayout(srcTy);
       // Checked in the verifier.
@@ -517,6 +506,11 @@
       offset = b.add(offset, padOffset);
     }
 
+    SmallVector<Value> offsetVals;
+    for (int i = rankReduced; i < opOffsetVals.size(); i++) {
+      offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
+    }
+
     auto base = smemObj.getBase();
     auto elemPtrTy = base.getType();
     smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),
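In the LinearLayout branch above, the logical offsets are fed through the source layout `ll` to obtain the shared-memory offset of the subview. As a scalar sketch of what applying an XOR-based linear layout to an index means (illustrative code, not the lowering itself, which emits the equivalent arithmetic on dynamic `Value`s):

```cpp
#include <cstdint>
#include <vector>

// Apply a linear layout given by its bases to a logical index: the result is
// the XOR of the basis vectors selected by the set bits of the index.
int32_t applyLinearLayout(const std::vector<int32_t> &bases, int32_t index) {
  int32_t out = 0;
  for (size_t i = 0; i < bases.size(); ++i)
    if ((index >> i) & 1)
      out ^= bases[i]; // contribution of input bit i
  return out;
}
```

For instance, with the `A` bases from the commit message, `applyLinearLayout({1, 2, 5, 10, 32, 16}, 3)` returns `1 ^ 2 = 3`.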

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 18 additions & 13 deletions

@@ -39,11 +39,14 @@ namespace gpu {
 
 LinearEncodingAttr TritonGPUDialect::toLinearEncoding(ArrayRef<int64_t> shape,
                                                       Attribute layout) {
-  CacheKey key{std::vector<int64_t>(shape.begin(), shape.end()), layout};
+  // LinearEncoding is a DistributedLayout
+  std::vector<int64_t> allocationShape;
+  CacheKey key{std::vector<int64_t>(shape.begin(), shape.end()), layout,
+               allocationShape};
   if (auto result = leCache.get(key)) {
     return *result;
   }
-  auto linearLayout = toLinearLayout(shape, layout);
+  auto linearLayout = toLinearLayout(shape, layout, {});
   auto linearEncoding =
       LinearEncodingAttr::get(layout.getContext(), std::move(linearLayout));
   leCache.set(key, linearEncoding);
@@ -2369,7 +2372,7 @@ struct TritonGPUInferLayoutInterface
       return success();
     }
 
-    auto ll = toLinearLayout(shape, operandEncoding);
+    auto ll = toLinearLayout(shape, operandEncoding, {});
     auto transposedLl = transposeLinearLayout(ll, order);
     resultEncoding = LinearEncodingAttr::get(ctx, std::move(transposedLl));
     return success();
@@ -2491,8 +2494,9 @@
                              CTALayout);
     // Big guns, check linear layouts are equivalent
     // We disallow reshaping memdesc_subviews in the verifier
-    auto srcLL = toLinearLayout(srcShape, srcEnc);
-    auto dstLL = toLinearLayout(dstShape, dstEnc);
+    // We disallow reshaping memdesc_subviews in the verifier
+    auto srcLL = toLinearLayout(srcShape, srcEnc, srcShape);
+    auto dstLL = toLinearLayout(dstShape, dstEnc, dstShape);
     if (reshapeLayout(ctx, srcLL, dstShape) != dstLL) {
       return failure();
     }
@@ -2774,7 +2778,7 @@
     SmallVector<int64_t> joinedShape(shape);
     joinedShape.push_back(2);
     auto parent = enc.getParent();
-    auto parentLL = toLinearLayout(joinedShape, parent);
+    auto parentLL = toLinearLayout(joinedShape, parent, {});
 
     Attribute splitEnc;
     auto result = inferSplitOpEncoding(parent, splitEnc, joinedShape, loc);
@@ -2810,7 +2814,7 @@
     }
 
     // Append dim to shape
-    auto ll = toLinearLayout(shape, srcEnc);
+    auto ll = toLinearLayout(shape, srcEnc, {});
     SmallVector<int64_t> dstShape(shape.begin(), shape.end());
     dstShape.push_back(1);
     ll = ll.reshapeOuts(standardOutDimPairs(ctx, dstShape));
@@ -2866,7 +2870,7 @@
     auto ctx = getContext();
 
     // Split on last dim
-    auto ll = toLinearLayout(shape, srcEnc);
+    auto ll = toLinearLayout(shape, srcEnc, {});
     auto newLl = LinearLayout::empty();
     auto result =
         tryJoinOnAxis(ctx, ll, newLl, /*fwdInference=*/false, axis, loc);
@@ -2935,7 +2939,7 @@
       }
     }
 
-    auto ll = toLinearLayout(shape, inEnc);
+    auto ll = toLinearLayout(shape, inEnc, {});
     auto newLl = LinearLayout::empty();
     auto result = tryJoinOnAxis(ctx, ll, newLl, fwdInference, axis, loc);
     if (!result.succeeded())
@@ -3042,15 +3046,16 @@ std::string getSharedLayoutStr(RankedTensorType type, bool useHWPointOfView) {
     return "";
 
   // This RankedTensorType is a MemDescType (?!)
-  LinearLayout ll = triton::gpu::toLinearLayout(type);
+  auto shape = type.getShape();
+  auto layout = type.getEncoding();
+  LinearLayout ll = triton::gpu::toLinearLayout(shape, layout, shape);
 
   StringAttr kOffset = StringAttr::get(type.getContext(), "offset");
   StringAttr kBlock = StringAttr::get(type.getContext(), "block");
   int64_t tensorSize = product(type.getShape());
   auto enc = type.getEncoding();
   unsigned numBlocks = getNumCTAs(enc);
   int32_t blockSize = tensorSize / numBlocks;
-  auto shape = type.getShape();
 
   // elementMapping is for the non-hw layout, offsetMapping for hw-layout
   std::vector<std::string> elementMapping(tensorSize);
@@ -3463,8 +3468,8 @@ int triton::gpu::lookupThreadsPerWarp(OpBuilder &rewriter) {
 bool triton::gpu::areLayoutsEquivalent(ArrayRef<int64_t> shape,
                                        DistributedEncodingTrait lhs,
                                        DistributedEncodingTrait rhs) {
-  auto lhsLL = triton::gpu::toLinearLayout(shape, lhs);
-  auto rhsLL = triton::gpu::toLinearLayout(shape, rhs);
+  auto lhsLL = triton::gpu::toLinearLayout(shape, lhs, {});
+  auto rhsLL = triton::gpu::toLinearLayout(shape, rhs, {});
   return lhsLL == rhsLL;
 }
