
Commit b9b7dc0

[TMA] Split large dimensions up into chunks of 256 elements (#6776)
This works around the hardware limit that no dimension of the TMA descriptor's block (box) size may exceed 256 elements.
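As a rough, standalone sketch of the idea (not the in-tree implementation, which lives in getTMABlockShape and the TMA lowering below; the helper names here are hypothetical): every block dimension is clamped to 256, and the lowering then issues ceil(dim / 256) copies along each clamped dimension to cover the full tile.

#include <algorithm>
#include <cstdint>
#include <vector>

// Clamp each block dimension to the TMA hardware maximum of 256 elements.
std::vector<int64_t> clampBlockShape(const std::vector<int64_t> &shapePerCTA) {
  constexpr int64_t dimMax = 256;
  std::vector<int64_t> blockShape = shapePerCTA;
  for (auto &size : blockShape)
    size = std::min(size, dimMax);
  return blockShape;
}

// Number of TMA copies needed along one dimension after clamping.
int64_t copiesPerDim(int64_t dimSize, int64_t blockSize) {
  return (dimSize + blockSize - 1) / blockSize; // ceil(dimSize / blockSize)
}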
1 parent 62cf173 commit b9b7dc0

16 files changed (+255 / -157 lines)


include/triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h

Lines changed: 27 additions & 9 deletions
@@ -39,14 +39,30 @@ triton::gpu::SharedEncodingTrait
 getEncodingFromDescriptor(Operation *op, RankedTensorType tensorType,
                           Value desc);
 
-int64_t getTMAContigDim(Attribute encoding, ArrayRef<int64_t> shape);
+SmallVector<int64_t> getTMABlockShape(ArrayRef<int64_t> shapePerCTA,
+                                      int elementBitWidth, int swizzleBytes,
+                                      bool fp4Padded, bool transposed,
+                                      bool packedSize);
+
+inline SmallVector<int64_t> getTMABlockShape(Attribute encoding,
+                                             ArrayRef<int64_t> shapePerCTA,
+                                             bool packedSize) {
+  auto mmaEnc = cast<gpu::NVMMASharedEncodingAttr>(encoding);
+  return getTMABlockShape(shapePerCTA, mmaEnc.getElementBitWidth(),
+                          mmaEnc.getSwizzlingByteWidth(), mmaEnc.getFp4Padded(),
+                          mmaEnc.getTransposed(), packedSize);
+}
 
-inline int64_t getTMAContigDim(RankedTensorType tensorType) {
-  return getTMAContigDim(tensorType.getEncoding(), tensorType.getShape());
+inline SmallVector<int64_t> getTMABlockShape(RankedTensorType ty,
+                                             bool packedSize) {
+  auto shapePerCTA = gpu::getShapePerCTA(ty);
+  return getTMABlockShape(ty.getEncoding(), shapePerCTA, packedSize);
 }
 
-inline int64_t getTMAContigDim(gpu::MemDescType memDescType) {
-  return getTMAContigDim(memDescType.getEncoding(), memDescType.getShape());
+inline SmallVector<int64_t> getTMABlockShape(triton::gpu::MemDescType ty,
+                                             bool packedSize) {
+  auto shapePerCTA = gpu::getShapePerCTA(ty);
+  return getTMABlockShape(ty.getEncoding(), shapePerCTA, packedSize);
 }
 
 std::optional<int> getTMASwizzleMode(Operation *op, TensorDescType ty);
@@ -74,16 +90,18 @@ mlir::LogicalResult createTMADesc(mlir::Value tmaPtr,
 
   int paddingScale = fp4Padded ? 2 : 1;
   auto shapePerCTA = gpu::getShapePerCTA(encoding, op.getTensorShape());
-  int32_t contig_dim_size = getTMAContigDim(encoding, op.getTensorShape());
+  auto blockShape =
+      getTMABlockShape(encoding, shapePerCTA, /*packedSize=*/false);
+  auto contigDimSize = blockShape.back();
 
   llvm::SmallVector<Value> boxDim;
-  if (fp4Padded && contig_dim_size != 128) {
+  if (fp4Padded && contigDimSize != 128) {
     return op->emitError(
         "FP4 padded loads require 128 elements or more in the last dim");
   }
-  boxDim.push_back(mkI32Constant(contig_dim_size));
+  boxDim.push_back(mkI32Constant(contigDimSize));
   for (int k = shapePerCTA.size() - 2; k >= 0; --k)
-    boxDim.push_back(mkI32Constant(shapePerCTA[k]));
+    boxDim.push_back(mkI32Constant(blockShape[k]));
 
   unsigned swizzleBytes = mmaEncoding ? mmaEncoding.getSwizzlingByteWidth() : 0;
   if (!mmaEncoding) {
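A quick note on the boxDim construction above: the contiguous block dimension is pushed first and the remaining dimensions are walked in reverse, i.e. the box is described innermost-dimension-first. A minimal sketch of the resulting order for a hypothetical 3-D block shape (plain std::vector stands in for llvm::SmallVector):

#include <cstdint>
#include <vector>

int main() {
  // Hypothetical block shape [dim0, dim1, dim2] = [4, 64, 128],
  // where dim2 (128) is the contiguous dimension.
  std::vector<int64_t> blockShape = {4, 64, 128};
  std::vector<int64_t> boxDim;
  boxDim.push_back(blockShape.back());               // innermost first: 128
  for (int k = static_cast<int>(blockShape.size()) - 2; k >= 0; --k)
    boxDim.push_back(blockShape[k]);                 // then 64, then 4
  // boxDim == {128, 64, 4}
}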

include/triton/Tools/LayoutUtils.h

Lines changed: 11 additions & 0 deletions
@@ -83,6 +83,17 @@ LinearLayout ensureLayoutNotSmallerThan(
     const LinearLayout &layout,
     const llvm::SmallDenseMap<StringAttr, int64_t> &shape);
 
+inline LinearLayout
+ensureLayoutNotSmallerThan(const LinearLayout &layout,
+                           const llvm::ArrayRef<StringAttr> dimNames,
+                           const llvm::ArrayRef<int64_t> shape) {
+  llvm::SmallDenseMap<StringAttr, int64_t> namedDims;
+  for (auto [dimName, length] : llvm::zip_equal(dimNames, shape))
+    namedDims[dimName] = length;
+  assert(namedDims.size() == shape.size() && "duplicate dimension names given");
+  return ensureLayoutNotSmallerThan(layout, namedDims);
+}
+
 // Return a vector of the standard out dimension names for tensor layouts. These
 // are "dim0", "dim1", etc.
 SmallVector<StringAttr> standardOutDimNames(MLIRContext *ctx, int rank);
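For reference, a hedged sketch of how this new overload is invoked at the call sites added in LinearLayoutConversions.cpp (assuming an MLIRContext *ctx, a LinearLayout layout, and a SmallVector<int64_t> shapePerCTA are already in scope):

// Grow `layout` so each named out dimension covers at least the
// corresponding entry of `shapePerCTA`.
auto outDimNames = standardOutDimNames(ctx, shapePerCTA.size());
layout = ensureLayoutNotSmallerThan(layout, outDimNames, shapePerCTA);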

include/triton/Tools/LinearLayout.h

Lines changed: 12 additions & 3 deletions
@@ -325,23 +325,32 @@ class LinearLayout {
       bases;
 
   llvm::MapVector<StringAttr, int32_t /*size*/> outDims;
-  bool surjective;
+  bool surjective = true;
 
 public:
   using BasesT = decltype(bases);
 
+  LinearLayout() = default;
+
   // The 0-dimensional layout that maps everything to 0. This is useful as a
   // starting point when doing something like
   //
   //   LinearLayout ret = LinearLayout::empty();
   //   for (...) ret *= ...;
   //   return ret;
-  static LinearLayout empty() { return LinearLayout(BasesT{}, {}); }
+  static LinearLayout empty() { return {}; }
+
+  // Creates a 1D -> 1D layout that's the function L(x) = stride * x
+  // for x in [0, size).
+  static LinearLayout strided1D(int32_t size, int32_t stride, StringAttr inDim,
+                                StringAttr outDim);
 
   // Creates a 1D -> 1D layout that's the identity function, i.e. L(x) = x
   // for x in [0, size).
   static LinearLayout identity1D(int32_t size, StringAttr inDim,
-                                 StringAttr outDim);
+                                 StringAttr outDim) {
+    return strided1D(size, /*stride=*/1, inDim, outDim);
+  }
 
   // Creates a 1D -> 1D layout that maps every input value to 0, i.e. L(x) = 0
   // for x in [0, size). By default this creates a surjective layout where
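The new strided1D generalizes identity1D, which now simply forwards with stride 1. A small illustrative sketch (S(...) is assumed to be the usual shorthand for building a StringAttr, as used elsewhere in the layout code):

// L(x) = x for x in [0, 16)
auto id = LinearLayout::identity1D(16, S("offset"), S("dim0"));
// L(x) = 4 * x for x in [0, 16)
auto st = LinearLayout::strided1D(16, /*stride=*/4, S("offset"), S("dim0"));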

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 27 additions & 29 deletions
@@ -6,6 +6,7 @@
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/LinearLayoutConversions.h"
 #include "triton/Dialect/TritonGPU/IR/TritonGPUInterfaces.h"
+#include "triton/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.h"
 #include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/LinearLayout.h"
 #include "triton/Tools/StrUtil.h"
@@ -241,7 +242,6 @@ LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
   int tileRows = 8;
   int tileCols = 8 * tileWidthBytes / elemBitWidth;
   bool isFp4Padded = shared.getFp4Padded();
-  int packingFactor = isFp4Padded ? 2 : 1;
 
   std::vector<std::vector<int>> bases2D;
   for (int col = 1; col < tileCols; col *= 2) {
@@ -269,11 +269,7 @@ LinearLayout getCoreMatrixLinearLayout(NVMMASharedEncodingAttr shared,
     }
   }
   auto outDimNames = standardOutDimNames(ctx, 2);
-  auto kRow = outDimNames[1];
-  auto kCol = outDimNames[0];
-  LinearLayout tileLayout =
-      LinearLayout({{S("offset"), bases2D}}, {kRow, kCol});
-  return tileLayout;
+  return LinearLayout({{S("offset"), bases2D}}, outDimNames);
 }
 
 } // namespace
@@ -285,63 +281,62 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
   int rank = shape.size();
   auto shapePerCTA = getShapePerCTA(shared, shape);
   auto kOffset = S("offset");
+  auto tmaShape = triton::nvidia_gpu::getTMABlockShape(shared, shapePerCTA,
+                                                       /*packedSize=*/true);
   if (shared.getSwizzlingByteWidth() == 0) {
     auto outDimNames = standardOutDimNames(ctx, rank);
-    LinearLayout layout = LinearLayout::identity1D(
-        shapePerCTA[rank - 1], kOffset, outDimNames[rank - 1]);
+    LinearLayout layout = LinearLayout::identity1D(tmaShape[rank - 1], kOffset,
+                                                   outDimNames[rank - 1]);
     for (int i = rank - 2; i >= 0; --i) {
-      layout *=
-          LinearLayout::identity1D(shapePerCTA[i], kOffset, outDimNames[i]);
+      layout *= LinearLayout::identity1D(tmaShape[i], kOffset, outDimNames[i]);
     }
+    layout = ensureLayoutNotSmallerThan(layout, outDimNames, shapePerCTA);
     return combineCtaCgaWithShape(layout, shared.getCTALayout(), shape);
   }
   assert(rank >= 2);
 
   // Collapse all the outer dim into one. We will then create a layout for this
   // shape and reshape it to the original shape.
-  std::array<int64_t, 2> collapsedShapePerCTA{1, shapePerCTA.back()};
+  std::array<int64_t, 2> collapsedTmaShape{1, tmaShape.back()};
   for (int i = 0; i + 1 < rank; i++)
-    collapsedShapePerCTA[0] *= shapePerCTA[i];
+    collapsedTmaShape[0] *= tmaShape[i];
   if (shared.getTransposed()) {
-    std::swap(collapsedShapePerCTA[0], collapsedShapePerCTA[1]);
+    std::swap(collapsedTmaShape[0], collapsedTmaShape[1]);
   }
 
   auto tileLayout = getCoreMatrixLinearLayout(shared, disableSwizzle);
   auto outDimNames = standardOutDimNames(ctx, 2);
-  auto kRow = outDimNames[1];
-  auto kCol = outDimNames[0];
+  auto kRow = outDimNames[0];
+  auto kCol = outDimNames[1];
   auto tileRows = tileLayout.getOutDimSize(kRow);
   auto tileCols = tileLayout.getOutDimSize(kCol);
 
   int packingFactor = shared.getFp4Padded() ? 2 : 1;
-  if (collapsedShapePerCTA[1] * packingFactor < tileCols ||
-      collapsedShapePerCTA[0] < tileRows) {
+  if (collapsedTmaShape[1] * packingFactor < tileCols ||
+      collapsedTmaShape[0] < tileRows) {
    llvm::errs() << "Illegal shared layout; expected collapsed shapePerCTA to "
                    "be at least ["
                 << tileRows << ", " << (tileCols / packingFactor)
-                << "], collapsedShapePerCTA: [" << collapsedShapePerCTA[0]
-                << ", " << collapsedShapePerCTA[1] << "]\n";
+                << "], collapsedTmaShape: [" << collapsedTmaShape[0] << ", "
+                << collapsedTmaShape[1] << "]\n";
    llvm::report_fatal_error("Illegal shared layout");
   }
 
   // Distribute the remaining rows and cols.
-  auto layout = tileLayout;
-  layout *= LinearLayout::identity1D(collapsedShapePerCTA[0] / tileRows,
-                                     kOffset, kRow);
-  layout *= LinearLayout::identity1D(collapsedShapePerCTA[1] / tileCols,
-                                     kOffset, kCol);
+  auto layout =
+      ensureLayoutNotSmallerThan(tileLayout, outDimNames, collapsedTmaShape);
 
   // Reshape the layout to the N-D pre-transposed shape per CTA.
-  SmallVector<int64_t> maybeTransposedShapePerCTA = shapePerCTA;
+  SmallVector<int64_t> maybeTransposedTmaShape = tmaShape;
   if (shared.getTransposed()) {
     // Move the outer dim to the inner position.
     // TODO: we should move back to using `order` instead of transposed to make
     // the order more explicit.
-    std::rotate(maybeTransposedShapePerCTA.begin(),
-                maybeTransposedShapePerCTA.begin() + 1,
-                maybeTransposedShapePerCTA.end());
+    std::rotate(maybeTransposedTmaShape.begin(),
+                maybeTransposedTmaShape.begin() + 1,
+                maybeTransposedTmaShape.end());
   }
-  auto reshapedLayout = reshapeLayout(ctx, layout, maybeTransposedShapePerCTA);
+  auto reshapedLayout = reshapeLayout(ctx, layout, maybeTransposedTmaShape);
 
   if (shared.getTransposed()) {
     SmallVector<int> order = {rank - 1};
@@ -351,6 +346,9 @@ LinearLayout nvmmaSharedToLinearLayout(ArrayRef<int64_t> shape,
     reshapedLayout = transposeLinearLayout(reshapedLayout, order);
   }
 
+  reshapedLayout = ensureLayoutNotSmallerThan(
+      reshapedLayout, standardOutDimNames(ctx, shapePerCTA.size()),
+      shapePerCTA);
   return combineCtaCgaWithShape(reshapedLayout, shared.getCTALayout(), shape);
 }
 
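To summarize the resulting flow in nvmmaSharedToLinearLayout, here is a hedged reading of the diff (comments only, not authoritative documentation of the pass):

// 1. tmaShape = getTMABlockShape(shared, shapePerCTA, /*packedSize=*/true):
//    the per-CTA shape clamped to what a single TMA box may cover.
// 2. Build the (possibly swizzled) core-tile layout for tmaShape only.
// 3. ensureLayoutNotSmallerThan(..., collapsedTmaShape) grows that tile
//    layout to the collapsed TMA block shape.
// 4. Reshape/transpose back to N-D, then ensureLayoutNotSmallerThan(...,
//    shapePerCTA) grows the result so it still spans the full per-CTA shape
//    before combineCtaCgaWithShape adds the CTA/CGA dimensions.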

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 13 additions & 11 deletions
@@ -11,6 +11,7 @@
 #include "triton/Dialect/TritonGPU/Transforms/Passes.h"
 #include "triton/Dialect/TritonGPU/Transforms/Utility.h"
 #include "triton/Dialect/TritonNvidiaGPU/IR/Dialect.h"
+#include "triton/Tools/LayoutUtils.h"
 #include "triton/Tools/LinearLayout.h"
 #include <memory>
 
@@ -149,29 +150,36 @@ static Attribute inferSrcEncodingMemDescReshape(Attribute dstEncoding,
                                                 ArrayRef<int64_t> dstShape) {
   auto mmaEncoding = dyn_cast<NVMMASharedEncodingAttr>(dstEncoding);
   if (!mmaEncoding)
-    return Attribute();
+    return {};
   // TODO: supporting reshape of CTA layouts is non-trivial.
   if (getNumCTAs(mmaEncoding) > 1)
-    return Attribute();
+    return {};
   int innerDimDst =
       mmaEncoding.getTransposed() ? dstShape.front() : dstShape.back();
   int innerDimSrc =
       mmaEncoding.getTransposed() ? srcShape.front() : srcShape.back();
   // For now disallow reshape of the inner dimension.
   if (innerDimDst != innerDimSrc)
-    return Attribute();
+    return {};
 
   // CTALayout can be all 1's because we bailed on multi-CTA layouts above.
   auto CTALayout = CTALayoutAttr::get(
       dstEncoding.getContext(),
       /*CTAsPerCGA=*/SmallVector<unsigned>(srcShape.size(), 1),
       /*CTASplitNum=*/SmallVector<unsigned>(srcShape.size(), 1),
       /*CTAOrder=*/llvm::to_vector(llvm::seq<unsigned>(srcShape.size())));
-  // Check that the second dim is big enough to contain a full swizzle.
-  return NVMMASharedEncodingAttr::get(
+  auto srcEncoding = NVMMASharedEncodingAttr::get(
       dstEncoding.getContext(), mmaEncoding.getSwizzlingByteWidth(),
       mmaEncoding.getTransposed(), mmaEncoding.getElementBitWidth(),
       mmaEncoding.getFp4Padded(), CTALayout);
+  // Big guns, check linear layouts are equivalent
+  auto srcLL = toLinearLayout(srcShape, srcEncoding);
+  auto dstLL = toLinearLayout(dstShape, dstEncoding);
+  auto ctx = dstEncoding.getContext();
+  if (reshapeLayout(ctx, srcLL, dstShape) != dstLL) {
+    return {};
+  }
+  return srcEncoding;
 }
 
 // Rewrite
@@ -315,12 +323,6 @@ class UseShmemForScales
     if (!isInnermostContiguous(scaleType, 512))
      return false;
 
-    auto sharedEnc =
-        dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(scaleType.getEncoding());
-    if (!sharedEnc || sharedEnc.getTransposed() || sharedEnc.getFp4Padded() ||
-        sharedEnc.getSwizzlingByteWidth() != 0)
-      return false;
-
     if (usesTMAload) {
       return true;
     }

lib/Dialect/TritonNvidiaGPU/Transforms/OptimizeDescriptorEncoding.cpp

Lines changed: 0 additions & 7 deletions
@@ -57,13 +57,6 @@ Attribute findLoadEncodingFromUsers(Operation *op) {
   return {};
 }
 
-ttg::CTALayoutAttr getCtaLayoutFromEncoding(Attribute encoding) {
-  auto layout = cast<ttg::LayoutEncodingTrait>(encoding);
-  auto ctx = encoding.getContext();
-  return ttg::CTALayoutAttr::get(ctx, layout.getCTAsPerCGA(),
-                                 layout.getCTASplitNum(), layout.getCTAOrder());
-}
-
 SmallVector<int64_t> expandToRank(ArrayRef<int64_t> shape, int rank) {
   SmallVector<int64_t> result(rank, 1);
   assert(shape.size() <= rank);

lib/Dialect/TritonNvidiaGPU/Transforms/TMAUtilities.cpp

Lines changed: 23 additions & 16 deletions
@@ -99,23 +99,30 @@ ttg::SharedEncodingTrait getEncodingFromDescriptor(Operation *op,
   return updateEncodingForShape(op, sharedEnc, tensorType);
 }
 
-int64_t getTMAContigDim(Attribute encoding, ArrayRef<int64_t> shape) {
-  assert(encoding);
-  auto mmaEncoding =
-      llvm::dyn_cast_or_null<ttg::NVMMASharedEncodingAttr>(encoding);
-
-  // The bounding box inner dimension must be less than or equal to the
-  // swizzle size.
-  // https://docs.nvidia.com/cuda/cuda-driver-api/group__CUDA__TENSOR__MEMORY.html#group__CUDA__TENSOR__MEMORY_1ga7c7d2aaac9e49294304e755e6f341d7
-  // We clamp the block size and the codegen will emit multiple copy
-  // operations.
-  if (mmaEncoding && mmaEncoding.getSwizzlingByteWidth() != 0) {
-    auto elemSize = mmaEncoding.getElementBitWidth() / 8;
-    return mmaEncoding.getSwizzlingByteWidth() / elemSize;
+SmallVector<int64_t> getTMABlockShape(ArrayRef<int64_t> shapePerCTA,
+                                      int elementBitWidth, int swizzleBytes,
+                                      bool fp4Padded, bool isTransposed,
+                                      bool packedSize) {
+  SmallVector<int64_t> blockShape(shapePerCTA);
+  int contigDim = isTransposed ? 0 : blockShape.size() - 1;
+  if (fp4Padded) {
+    blockShape[contigDim] *= 2;
   }
-
-  auto shapePerCTA = ttg::getShapePerCTA(encoding, shape);
-  return shapePerCTA.back();
+  // All dimensions must be at most 256
+  constexpr int64_t dimMax = 256;
+  for (auto &size : blockShape) {
+    size = std::min(size, dimMax);
+  }
+  // Last dim must equal the swizzle byte size
+  if (swizzleBytes != 0) {
+    auto contigDimSize = (8 * swizzleBytes) / elementBitWidth;
+    assert(blockShape[contigDim] >= contigDimSize);
+    blockShape[contigDim] = contigDimSize;
+  }
+  if (fp4Padded && packedSize) {
+    blockShape[contigDim] /= 2;
+  }
+  return blockShape;
 }
 
 std::optional<int> getTMASwizzleMode(Operation *op, TensorDescType ty);
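A hedged, worked example of the new clamping rules with hypothetical inputs (not taken from the tests):

// shapePerCTA = [1024, 512], elementBitWidth = 16, swizzleBytes = 128,
// fp4Padded = false, isTransposed = false, packedSize = false.
//
//   1. Clamp every dimension to dimMax = 256            -> [256, 256]
//   2. Force the contiguous dim to the swizzle width:
//      (8 * 128) / 16 = 64 elements                     -> [256, 64]
//
// getTMABlockShape returns [256, 64]; as the old getTMAContigDim comment
// noted, the codegen then emits multiple copy operations to cover the full
// [1024, 512] tile.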
