
Commit 1af120b

Merge commit '0560390b3b04286515b34c060188b9d77cb5e1b1'
2 parents: cd4527d + 0560390

File tree

35 files changed: +375 -426 lines changed


include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 2 additions & 1 deletion
@@ -15,7 +15,8 @@
 #include <unordered_map>

 // LinearLayoutCache Utils
-using CacheKey = std::tuple<std::vector<int64_t>, mlir::Attribute>;
+using CacheKey =
+    std::tuple<std::vector<int64_t>, mlir::Attribute, std::vector<int64_t>>;

 namespace llvm {
 template <typename T> size_t hash_value(const std::vector<T> &vec) {

include/triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h

Lines changed: 2 additions & 1 deletion
@@ -47,7 +47,8 @@ class MemDescType;
 // elemBitWidth is the bit width of one element in the layout. This is required
 // to compute the linear layout for MMAv3 (i.e. Hopper) shared layouts (i.e.
 // shared layouts with nvmma_shared layout) but is otherwise unused.
-LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
+LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout,
+                            ArrayRef<int64_t> allocationShape);
 LinearLayout toLinearLayout(RankedTensorType type);
 LinearLayout toLinearLayout(MemDescType type);
 LinearLayout toLinearLayout(TensorOrMemDesc type);

include/triton/Dialect/TritonGPU/IR/TritonGPUDialect.td

Lines changed: 1 addition & 1 deletion
@@ -22,7 +22,7 @@ def TritonGPU_Dialect : Dialect {
   let extraClassDeclaration = [{
     void registerTypes();

-    LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout);
+    LinearLayout toLinearLayout(ArrayRef<int64_t> shape, Attribute layout, ArrayRef<int64_t> allocationShape);
     LinearEncodingAttr toLinearEncoding(ArrayRef<int64_t> shape, Attribute layout);

     static int getNumCTAs(ModuleOp mod);

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 21 additions & 0 deletions
@@ -273,6 +273,27 @@ def TTG_MemDescReshapeOp : TTG_Op<"memdesc_reshape", [Pure,
   }];

   let arguments = (ins TTG_MemDescType:$src);
+
+  let builders = [
+    OpBuilder<(ins "Value":$src, "ArrayRef<int64_t>":$shape),
+    [{
+      MemDescType dstTy;
+      auto srcTy = cast<MemDescType>(src.getType());
+      auto result = inferReturnTypes($_builder.getContext(),
+                                     $_builder.getUnknownLoc(),
+                                     srcTy, shape, dstTy);
+      assert(succeeded(result) && "failed to infer return types");
+      build($_builder, $_state, dstTy, src);
+    }]>
+  ];
+  let extraClassDeclaration = [{
+    static LogicalResult inferReturnTypes(MLIRContext *context,
+                                          std::optional<Location> loc,
+                                          MemDescType srcTy,
+                                          ArrayRef<int64_t> dstShape,
+                                          MemDescType &inferredReturnType);
+  }];
+
   let results = (outs TTG_MemDescType:$result);

   let assemblyFormat = "$src attr-dict `:` qualified(type($src)) `->` qualified(type($result))";

include/triton/Tools/LinearLayout.h

Lines changed: 4 additions & 3 deletions
@@ -325,7 +325,7 @@ class LinearLayout {
       bases;

   llvm::MapVector<StringAttr, int32_t /*size*/> outDims;
-  bool surjective = true;
+  int32_t rank = 0;

 public:
   using BasesT = decltype(bases);
@@ -425,10 +425,11 @@ class LinearLayout {
       ArrayRef<std::pair<StringAttr, std::vector<std::vector<int32_t>>>> bases,
       ArrayRef<std::pair<StringAttr, int32_t>> outDims, bool requireSurjective);

-  bool isSurjective() const { return surjective; }
+  bool isSurjective() const { return rank == getTotalOutDimSizeLog2(); }
+  bool isInjective() const { return rank == getTotalInDimSizeLog2(); }

   bool isInvertible() const {
-    return surjective && getTotalInDimSize() == getTotalOutDimSize();
+    return isSurjective() && getTotalInDimSize() == getTotalOutDimSize();
   }

   const BasesT &getBases() const { return bases; }
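
The LinearLayout.h change replaces the cached surjective flag with the stored GF(2) rank of the basis matrix: the layout is surjective exactly when that rank equals the log2 of the total output size, and injective exactly when it equals the log2 of the total input size. A minimal standalone sketch of that rank-based check, using plain bitmask Gaussian elimination rather than the Triton data structures (illustrative only):

// Sketch: rank over GF(2) decides surjectivity/injectivity of a linear layout.
// Standalone illustration; not the Triton implementation.
#include <cstdint>
#include <iostream>
#include <vector>

// Rank of a set of basis vectors, each encoded as a bitmask of output bits,
// computed by Gaussian elimination with XOR.
static int rankGF2(std::vector<uint32_t> basis) {
  int rank = 0;
  for (size_t i = 0; i < basis.size(); ++i) {
    if (basis[i] == 0)
      continue; // a zero basis vector (broadcasting) adds nothing to the rank
    uint32_t pivot = basis[i] & (~basis[i] + 1u); // lowest set bit
    ++rank;
    for (size_t j = i + 1; j < basis.size(); ++j)
      if (basis[j] & pivot)
        basis[j] ^= basis[i]; // clear the pivot bit in the remaining rows
  }
  return rank;
}

int main() {
  // 4 input bits (16 registers) mapping onto 3 output bits (8 elements),
  // with one zero basis vector, i.e. a broadcasted layout.
  std::vector<uint32_t> bases = {1, 2, 0, 4};
  int totalInDimSizeLog2 = static_cast<int>(bases.size()); // one vector per input bit
  int totalOutDimSizeLog2 = 3;                             // 8 output elements
  int rank = rankGF2(bases);
  std::cout << std::boolalpha;
  std::cout << "surjective: " << (rank == totalOutDimSizeLog2) << "\n"; // true
  std::cout << "injective:  " << (rank == totalInDimSizeLog2) << "\n";  // false
}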

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 18 additions & 14 deletions
@@ -40,11 +40,13 @@ namespace {
 LinearLayout getRegToSharedLayout(MLIRContext *ctx, ArrayRef<int64_t> shape,
                                   LinearLayout regLayout,
                                   triton::gpu::SharedEncodingTrait dstEnc,
-                                  int elemBitWidth) {
+                                  int elemBitWidth,
+                                  ArrayRef<int64_t> allocShape) {
   StringAttr kBlock = StringAttr::get(ctx, ("block"));
   int rank = shape.size();

-  LinearLayout sharedLayout = triton::gpu::toLinearLayout(shape, dstEnc);
+  LinearLayout sharedLayout =
+      triton::gpu::toLinearLayout(shape, dstEnc, allocShape);
   auto sharedOrder = triton::gpu::getOrder(dstEnc, shape);

   // sharedLayout's in-dims are currently (offset, block). Reshape to
@@ -399,7 +401,7 @@ emitIndices(Location loc, RewriterBase &rewriter, const TargetInfoBase &target,
   MLIRContext *ctx = rewriter.getContext();
   auto shape = type.getShape();

-  LinearLayout ll = triton::gpu::toLinearLayout(shape, layout);
+  LinearLayout ll = triton::gpu::toLinearLayout(shape, layout, {});

   StringAttr kRegister = str_attr("register");
   StringAttr kLane = str_attr("lane");
@@ -524,7 +526,7 @@ SmallVector<Value> getSmemVecAddrVec(
           sharedEnc)) {
     auto regToSharedSwizzledLayout =
         getRegToSharedLayout(ctx, shape, regLayout, swizzledSharedEnc,
-                             elemLlvmTy.getIntOrFloatBitWidth());
+                             elemLlvmTy.getIntOrFloatBitWidth(), allocShape);
     auto smemOrder = swizzledSharedEnc.getOrder();

     auto swizzledIndicesVec =
@@ -680,9 +682,9 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
   bool isStore = !valsArray.empty();
   auto b = TritonLLVMOpBuilder(loc, rewriter);

-  auto emitCpAsync = [&](ConversionPatternRewriter &rewriter, Location loc,
-                         ArrayRef<Value> vals, Value shmemAddr, int idx,
-                         VectorType vecTy) -> SmallVector<Value> {
+  auto emitLdSt = [&](ConversionPatternRewriter &rewriter, Location loc,
+                      ArrayRef<Value> vals, Value shmemAddr, int idx,
+                      VectorType vecTy) -> SmallVector<Value> {
     auto length = vecTy.getNumElements();
     if (isStore) {
       Value valsVec =
@@ -698,7 +700,7 @@ lowerLdStShared(Location loc, MLIRContext *ctx, LinearLayout cvt,
     }
   };
   return lowerLdSt(loc, ctx, cvt, valsArray, llvmElemTy, smemBase, rewriter,
-                   targetInfo, {}, emitCpAsync);
+                   targetInfo, {}, emitLdSt);
 }

 SmallVector<Value> lowerLdSt(
@@ -880,11 +882,13 @@ bool emitTransferBetweenRegistersAndShared(
   auto allocShape = sharedTy.getAllocShape();
   auto invertAllocSharedLayout = LinearLayout::empty();
   if (!paddedLayout) {
-    // For now this is only needed for the cases where we have swizzling.
-    invertAllocSharedLayout =
-        triton::gpu::toLinearLayout(allocShape.take_back(sharedTy.getRank()),
-                                    sharedTy.getEncoding())
-            .pseudoinvert();
+    // This is the legacy way of doing things that's much more ad-hoc
+    // For generic shared layouts it may or may not be correct
+    auto allocShape = sharedTy.getAllocShape();
+    auto trimShape = allocShape.take_back(sharedTy.getRank());
+    invertAllocSharedLayout = triton::gpu::toLinearLayout(
+                                  trimShape, sharedTy.getEncoding(), trimShape)
+                                  .pseudoinvert();
   }

   int numElems = regToSharedLayout.getInDimSize(kRegister);
@@ -1494,7 +1498,7 @@ delinearize(RewriterBase &rewriter, Location loc,
             triton::gpu::DistributedEncodingTrait layout,
             ArrayRef<int64_t> shape, StringAttr dimName, Value linear) {
   auto b = TritonLLVMOpBuilder(loc, rewriter);
-  auto ll = triton::gpu::toLinearLayout(shape, layout);
+  auto ll = triton::gpu::toLinearLayout(shape, layout, {});
   auto linearLayout =
       triton::gpu::LinearEncodingAttr::get(rewriter.getContext(), ll);
   assert(ll.hasInDim(dimName));

lib/Conversion/TritonGPUToLLVM/ViewOpToLLVM.cpp

Lines changed: 11 additions & 17 deletions
@@ -471,35 +471,24 @@ struct MemDescSubviewOpConversion
     // newBase = base + offset
     auto smemObj = getSharedMemoryObjectFromStruct(loc, adaptor.getSrc(),
                                                    llvmElemTy, rewriter);
-    auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
     SmallVector<Value> opOffsetVals = op.getOffsets();
     // We assume we always create a subview of the last dimensions
-    SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
-                                     smemStrides.end());
     // Compute total offset
-    SmallVector<Value> offsetVals;
-    auto destRank = op.getResult().getType().getRank();
-    auto rankReduced = srcTy.getRank() - destRank;
-    for (int i = rankReduced; i < opOffsetVals.size(); i++) {
-      offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
-    }
+    auto rankReduced = srcTy.getRank() - destTy.getRank();

     Value offset;
     if (rankReduced || (destTy.getRank() == 1 && destTy.getDimSize(0) == 1)) {
+      auto smemStrides = smemObj.getStrides(srcTy, loc, rewriter);
+      SmallVector<Value> opSmemStrides(smemStrides.end() - opOffsetVals.size(),
+                                       smemStrides.end());
       // We are splitting the pipelining dimension which may not be a power of 2
       // so we can't use LinearLayouts
       offset = dot(rewriter, loc, opOffsetVals, opSmemStrides);
     } else {
       auto dimNames = standardOutDimNames(ctx, opOffsetVals.size());
       SmallVector<std::pair<StringAttr, Value>> logicalOffsets;
-      // This assumes the subviews are additive, in the sense that we can
-      // compute the offset of one and an add it to the offset of the previous
-      // one we computed. We check for this in the verifier.
-      for (int i = 0; i < rankReduced; i++) {
-        logicalOffsets.push_back({dimNames[i], b.i32_val(0)});
-      }
-      for (int i = rankReduced; i < opOffsetVals.size(); i++) {
-        logicalOffsets.push_back({dimNames[i], offsetVals[i - rankReduced]});
+      for (auto [dim, offset] : llvm::zip(dimNames, opOffsetVals)) {
+        logicalOffsets.push_back({dim, offset});
       }
       auto ll = toLinearLayout(srcTy);
       // Checked in the verifier.
@@ -517,6 +506,11 @@ struct MemDescSubviewOpConversion
       offset = b.add(offset, padOffset);
     }

+    SmallVector<Value> offsetVals;
+    for (int i = rankReduced; i < opOffsetVals.size(); i++) {
+      offsetVals.push_back(b.add(opOffsetVals[i], smemObj.getOffsets()[i]));
+    }
+
     auto base = smemObj.getBase();
     auto elemPtrTy = base.getType();
     smemObj = SharedMemoryObject(b.gep(elemPtrTy, llvmElemTy, base, offset),
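
In the rewritten MemDescSubviewOp lowering above, the rank-reducing (pipelining) branch keeps computing the flat shared-memory offset as a dot product of the subview offsets with the memdesc strides, since that dimension need not be a power of two. A tiny standalone sketch of that arithmetic with hypothetical constant offsets and strides (the real lowering emits LLVM IR values instead):

// Sketch: flat offset = dot(offsets, strides), as in the non-power-of-2 branch.
// Standalone illustration with constants; not the Triton code.
#include <cstdint>
#include <iostream>
#include <vector>

static int64_t dotOffset(const std::vector<int64_t> &offsets,
                         const std::vector<int64_t> &strides) {
  int64_t acc = 0;
  for (size_t i = 0; i < offsets.size(); ++i)
    acc += offsets[i] * strides[i]; // accumulate per-dimension contributions
  return acc;
}

int main() {
  // Hypothetical 3x64x64 row-major buffer: strides {4096, 64, 1}.
  // Taking the subview at stage 1 of the pipelining dimension:
  std::cout << dotOffset({1, 0, 0}, {64 * 64, 64, 1}) << "\n"; // prints 4096
}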

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 52 additions & 11 deletions
@@ -42,11 +42,14 @@ namespace gpu {

 LinearEncodingAttr TritonGPUDialect::toLinearEncoding(ArrayRef<int64_t> shape,
                                                       Attribute layout) {
-  CacheKey key{std::vector<int64_t>(shape.begin(), shape.end()), layout};
+  // LinearEncoding is a DistributedLayout
+  std::vector<int64_t> allocationShape;
+  CacheKey key{std::vector<int64_t>(shape.begin(), shape.end()), layout,
+               allocationShape};
   if (auto result = leCache.get(key)) {
     return *result;
   }
-  auto linearLayout = toLinearLayout(shape, layout);
+  auto linearLayout = toLinearLayout(shape, layout, {});
   auto linearEncoding =
       LinearEncodingAttr::get(layout.getContext(), std::move(linearLayout));
   leCache.set(key, linearEncoding);
@@ -2386,7 +2389,7 @@ struct TritonGPUInferLayoutInterface
       return success();
     }

-    auto ll = toLinearLayout(shape, operandEncoding);
+    auto ll = toLinearLayout(shape, operandEncoding, {});
     auto transposedLl = transposeLinearLayout(ll, order);
     resultEncoding = LinearEncodingAttr::get(ctx, std::move(transposedLl));
     return success();
@@ -2483,6 +2486,39 @@ struct TritonGPUInferLayoutInterface
                                       Attribute srcEnc,
                                       ArrayRef<int64_t> dstShape,
                                       Attribute &dstEnc) const {
+    if (auto mmaEncoding = dyn_cast<NVMMASharedEncodingAttr>(srcEnc)) {
+      // TODO: supporting reshape of CTA layouts is non-trivial.
+      if (getNumCTAs(mmaEncoding) > 1)
+        return failure();
+      int innerDimDst =
+          mmaEncoding.getTransposed() ? dstShape.front() : dstShape.back();
+      int innerDimSrc =
+          mmaEncoding.getTransposed() ? srcShape.front() : srcShape.back();
+      // For now disallow reshape of the inner dimension.
+      if (innerDimDst != innerDimSrc)
+        return failure();
+      auto *ctx = srcEnc.getContext();
+
+      // CTALayout can be all 1's because we bailed on multi-CTA layouts above.
+      auto CTALayout = CTALayoutAttr::get(
+          ctx,
+          /*CTAsPerCGA=*/SmallVector<unsigned>(dstShape.size(), 1),
+          /*CTASplitNum=*/SmallVector<unsigned>(dstShape.size(), 1),
+          /*CTAOrder=*/llvm::to_vector(llvm::seq<unsigned>(dstShape.size())));
+      dstEnc = NVMMASharedEncodingAttr::get(
+          ctx, mmaEncoding.getSwizzlingByteWidth(), mmaEncoding.getTransposed(),
+          mmaEncoding.getElementBitWidth(), mmaEncoding.getFp4Padded(),
+          CTALayout);
+      // Big guns, check linear layouts are equivalent
+      // We disallow reshaping memdesc_subviews in the verifier
+      // We disallow reshaping memdesc_subviews in the verifier
+      auto srcLL = toLinearLayout(srcShape, srcEnc, srcShape);
+      auto dstLL = toLinearLayout(dstShape, dstEnc, dstShape);
+      if (reshapeLayout(ctx, srcLL, dstShape) != dstLL) {
+        return failure();
+      }
+      return success();
+    }
     auto src = mlir::dyn_cast<BlockedEncodingAttr>(srcEnc);
     if (!src) {
       return failure();
@@ -2730,6 +2766,10 @@ struct TritonGPUInferLayoutInterface
     if (succeeded(result)) {
       return result;
     }
+    if (!isa<DistributedEncodingTrait>(srcEnc)) {
+      return emitOptionalError(loc,
+                               "Failed MemDescReshapeOp encoding inference");
+    }
     // If the legacy encoding failed use LinearLayouts.
     // Once LinearLayouts are more widely used, we can remove
     // inferReshapeOpLegacyEncoding and simply use LLs.
@@ -2755,7 +2795,7 @@ struct TritonGPUInferLayoutInterface
     SmallVector<int64_t> joinedShape(shape);
     joinedShape.push_back(2);
     auto parent = enc.getParent();
-    auto parentLL = toLinearLayout(joinedShape, parent);
+    auto parentLL = toLinearLayout(joinedShape, parent, {});

     Attribute splitEnc;
     auto result = inferSplitOpEncoding(parent, splitEnc, joinedShape, loc);
@@ -2791,7 +2831,7 @@ struct TritonGPUInferLayoutInterface
     }

     // Append dim to shape
-    auto ll = toLinearLayout(shape, srcEnc);
+    auto ll = toLinearLayout(shape, srcEnc, {});
     SmallVector<int64_t> dstShape(shape.begin(), shape.end());
     dstShape.push_back(1);
     ll = ll.reshapeOuts(standardOutDimPairs(ctx, dstShape));
@@ -2847,7 +2887,7 @@ struct TritonGPUInferLayoutInterface
     auto ctx = getContext();

     // Split on last dim
-    auto ll = toLinearLayout(shape, srcEnc);
+    auto ll = toLinearLayout(shape, srcEnc, {});
     auto newLl = LinearLayout::empty();
     auto result =
         tryJoinOnAxis(ctx, ll, newLl, /*fwdInference=*/false, axis, loc);
@@ -2916,7 +2956,7 @@ struct TritonGPUInferLayoutInterface
       }
     }

-    auto ll = toLinearLayout(shape, inEnc);
+    auto ll = toLinearLayout(shape, inEnc, {});
     auto newLl = LinearLayout::empty();
     auto result = tryJoinOnAxis(ctx, ll, newLl, fwdInference, axis, loc);
     if (!result.succeeded())
@@ -3027,15 +3067,16 @@ std::string getSharedLayoutStr(RankedTensorType type, bool useHWPointOfView) {
     return "";

   // This RankedTensorType is a MemDescType (?!)
-  LinearLayout ll = triton::gpu::toLinearLayout(type);
+  auto shape = type.getShape();
+  auto layout = type.getEncoding();
+  LinearLayout ll = triton::gpu::toLinearLayout(shape, layout, shape);

   StringAttr kOffset = StringAttr::get(type.getContext(), "offset");
   StringAttr kBlock = StringAttr::get(type.getContext(), "block");
   int64_t tensorSize = product(type.getShape());
   auto enc = type.getEncoding();
   unsigned numBlocks = getNumCTAs(enc);
   int32_t blockSize = tensorSize / numBlocks;
-  auto shape = type.getShape();

   // elementMapping is for the non-hw layout, offsetMapping for hw-layout
   std::vector<std::string> elementMapping(tensorSize);
@@ -3448,8 +3489,8 @@ int triton::gpu::lookupThreadsPerWarp(OpBuilder &rewriter) {
 bool triton::gpu::areLayoutsEquivalent(ArrayRef<int64_t> shape,
                                        DistributedEncodingTrait lhs,
                                        DistributedEncodingTrait rhs) {
-  auto lhsLL = triton::gpu::toLinearLayout(shape, lhs);
-  auto rhsLL = triton::gpu::toLinearLayout(shape, rhs);
+  auto lhsLL = triton::gpu::toLinearLayout(shape, lhs, {});
+  auto rhsLL = triton::gpu::toLinearLayout(shape, rhs, {});
   return lhsLL == rhsLL;
 }
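
The new NVMMASharedEncodingAttr branch of the reshape-encoding inference above bails out on multi-CTA layouts, refuses to change the inner (swizzled) dimension, and then double-checks the result by comparing linear layouts. A standalone sketch of the two shape-level preconditions, folding in the equal-element-count invariant that the reshape op itself guarantees elsewhere (illustrative only, not the Triton code):

// Sketch: shape preconditions for reshaping an NVMMA shared encoding.
// Standalone illustration; the real inference also rebuilds the encoding
// and compares linear layouts via reshapeLayout.
#include <cstdint>
#include <functional>
#include <iostream>
#include <numeric>
#include <vector>

static bool reshapePreconditionsHold(const std::vector<int64_t> &srcShape,
                                     const std::vector<int64_t> &dstShape,
                                     bool transposed) {
  auto product = [](const std::vector<int64_t> &s) {
    return std::accumulate(s.begin(), s.end(), int64_t{1},
                           std::multiplies<int64_t>());
  };
  // A reshape must preserve the number of elements (checked by the op itself).
  if (product(srcShape) != product(dstShape))
    return false;
  // The inner (contiguous, swizzled) dimension must stay the same.
  int64_t innerSrc = transposed ? srcShape.front() : srcShape.back();
  int64_t innerDst = transposed ? dstShape.front() : dstShape.back();
  return innerSrc == innerDst;
}

int main() {
  std::cout << std::boolalpha;
  // Folding the two leading dims is a candidate: 2x64x64 -> 128x64.
  std::cout << reshapePreconditionsHold({2, 64, 64}, {128, 64}, false) << "\n";
  // Changing the inner dim is rejected: 128x64 -> 64x128.
  std::cout << reshapePreconditionsHold({128, 64}, {64, 128}, false) << "\n";
}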
