
Commit 5d2a1d2

masahi authored and makslevental committed
[Blackwell] Add support for mixed precision scaled dot (triton-lang#5799)
Building on triton-lang#5786. The main change is the representation of the RHS in `mxfp8 x mxfp4`, which needs to be in the special layout for Blackwell described in
https://docs.nvidia.com/cuda/parallel-thread-execution/#packing-format-used-for-matrix-a-and-b-by-kind-mxf8f6f4-in-shared-memory

A new feature in TMA can automatically store into such a layout, but this PR does not rely on TMA. Instead, the layout is represented via LL, and `ld.shared` or `cp.async` is used to manually create it in SMEM.

Integration of this layout into the lowering pipeline turned out to be very simple. After adding the 64-bit padding described above, we need to apply swizzling on top of it. To support the new "padded and swizzled" layout, we just need to add a few steps that take the padding into account in `sharedToLinearLayoutLeadingOffset`. This function can then be seen as going through the steps `padded, swizzled offset` -> `(row, unswizzled but padded column)` -> `(row, unswizzled and packed column)`.

Unlike the `mxfp4 x mxfp4` case, Blackwell mixed precision supports a row-major RHS. In this case, the HW expects the N axis to be packed - packing is always done on the contiguous axis. This was experimentally confirmed in my TMA-based branch, but it is obvious in hindsight because TMA is not aware of the K or N axis; it simply supports automatic padding on the packed axis. However, Triton requires that padding is always done on the K axis. This PR supports a row-major RHS functionally, by forcing the RHS SMEM order to be column-major and doing a transpose before the SMEM store if the register layout is row-major. I also needed to disable pipelining of the RHS load in that case, because `cp.async` requires at least 4 bytes of contiguity, which is not satisfied when the on-the-fly transpose is needed.

@ThomasRaoux @lezcano

---------

Co-authored-by: Masahiro Masuda <[email protected]>
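To make the `padded, swizzled offset` -> `(row, unswizzled but padded column)` -> `(row, unswizzled and packed column)` chain concrete, here is a small self-contained walkthrough in plain C++. The tile geometry and the XOR-based unswizzling formula are illustrative assumptions, not code from this PR; only the padding arithmetic (8 payload bytes followed by 8 padding bytes per 16-byte group, i.e. 64 bits of padding) follows the layout described above.

```cpp
// Walkthrough of the three layout steps for one SMEM byte offset:
//   padded, swizzled offset
//     -> (row, unswizzled but padded column)
//     -> (row, unswizzled and packed column)
// rowBytes, vec, perPhase, maxPhase are illustrative assumptions.
#include <cassert>
#include <cstdio>

int main() {
  const int rowBytes = 128; // padded row size in bytes
  const int vec = 16, perPhase = 1, maxPhase = 8;

  int offset = 3 * rowBytes + 52; // some padded, swizzled SMEM byte offset

  // Step 1: split into (row, swizzled padded column).
  int row = offset / rowBytes;
  int swizzledCol = offset % rowBytes;

  // Step 2: undo the XOR swizzle -> unswizzled but still padded column.
  int phase = (row / perPhase) % maxPhase;
  int paddedCol = swizzledCol ^ (vec * phase);

  // Step 3: drop the padding -> packed column. Each 16-byte group holds
  // 8 payload bytes followed by 8 bytes of padding (64 bits).
  int packedCol = paddedCol / 16 * 8 + paddedCol % 8;

  std::printf("offset %d -> row %d, padded col %d, packed col %d\n", offset,
              row, paddedCol, packedCol);
  assert(row == 3 && paddedCol == 4 && packedCol == 4);
  return 0;
}
```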
1 parent e1bb2cc commit 5d2a1d2

File tree: 20 files changed, +432 −73 lines changed


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 2 additions & 2 deletions
@@ -427,8 +427,8 @@ class SharedMemoryObject {
   SmallVector<Value> getStrides(triton::gpu::MemDescType memDesc, Location loc,
                                 RewriterBase &rewriter) const {
     auto allocShape = memDesc.getAllocShape();
-    auto allocShapePerCTA =
-        triton::gpu::getShapePerCTA(memDesc.getEncoding(), allocShape);
+    auto allocShapePerCTA = triton::gpu::getAllocationShapePerCTA(
+        memDesc.getEncoding(), allocShape);
     auto layoutOrder = triton::gpu::getOrder(memDesc.getEncoding());
     auto allocStrides = SharedMemoryObject::getStridesForShape(
         allocShapePerCTA, layoutOrder, loc, rewriter);
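For context, a minimal standalone sketch of how strides follow from an allocation shape and a layout order (a simplified stand-in for `getStridesForShape`, which is not shown in this diff): using the allocation shape rather than the logical one makes the strides step over the fp4 padding.

```cpp
// Minimal sketch (not the Triton implementation): strides derived from an
// allocation shape and a layout order, where order[0] is the fastest dim.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> stridesForShape(const std::vector<int64_t> &shape,
                                     const std::vector<unsigned> &order) {
  std::vector<int64_t> strides(shape.size());
  int64_t stride = 1;
  for (unsigned dim : order) { // fastest-varying dimension first
    strides[dim] = stride;
    stride *= shape[dim];
  }
  return strides;
}

int main() {
  // Logical 128x64 fp4 RHS tile; the packed (contiguous) axis is doubled to
  // 128 in the allocation shape (hypothetical numbers for illustration).
  std::vector<int64_t> allocShape = {128, 128};
  std::vector<unsigned> order = {1, 0}; // column index is contiguous
  auto strides = stridesForShape(allocShape, order);
  assert(strides[1] == 1 && strides[0] == 128);
  return 0;
}
```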

include/triton/Dialect/TritonGPU/IR/Dialect.h

Lines changed: 8 additions & 0 deletions
@@ -165,11 +165,19 @@ SmallVector<unsigned> getCTAOrder(Attribute layout);
 */
 SmallVector<unsigned> getShapePerCTATile(Attribute layout);

+// Returns the "logical" shape per CTA
 SmallVector<int64_t> getShapePerCTA(ArrayRef<unsigned> CTASplitNum,
                                     ArrayRef<int64_t> shape);
 SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape);
 SmallVector<int64_t> getShapePerCTA(Type type);

+// Returns the shape per CTA, which is "physically" allocated
+// Such shapes may be bigger than the logical one due to, for example, padding
+// in shared memory.
+SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
+                                              ArrayRef<int64_t> shape);
+SmallVector<int64_t> getAllocationShapePerCTA(Type type);
+
 unsigned getNumWarpsPerCTA(Attribute layout);

 unsigned getNumCTAs(Attribute layout);

include/triton/Dialect/TritonGPU/IR/TritonGPUAttrDefs.td

Lines changed: 9 additions & 3 deletions
@@ -419,26 +419,32 @@ def NVMMASharedEncodingAttr :
     https://docs.nvidia.com/cuda/parallel-thread-execution/#asynchronous-warpgroup-level-matrix-shared-memory-layout
   }];

+
+  // fp4Padded: Indicates that this encoding represents a mixed-precision fp4 operand in MMAv5 scaled dot, which needs
+  // to be in the special padded layout as described in https://docs.nvidia.com/cuda/parallel-thread-execution/#packing-format-used-for-matrix-a-and-b-by-kind-mxf8f6f4-in-shared-memory
   let parameters = (
     ins
     "unsigned":$swizzlingByteWidth,
     "bool":$transposed,
     "unsigned":$elementBitWidth,
+    "bool":$fp4Padded,
     "CTALayoutAttr":$CTALayout
   );

   let builders = [
     AttrBuilder<(ins "ArrayRef<int64_t>":$shape,
                      "ArrayRef<unsigned>":$order,
                      "CTALayoutAttr":$CTALayout,
-                     "Type":$eltTy), [{
+                     "Type":$eltTy,
+                     "bool": $fp4Padded), [{
       auto shapePerCTA = getShapePerCTA(CTALayout.getCTASplitNum(), shape);
       int32_t swizzlingByteWidth = 0;
       unsigned eleBitWidth = eltTy.getIntOrFloatBitWidth();
+      int packingFactor = fp4Padded ? 2 : 1;

       // get proper shared memory swizzling mode from the contiguous dimension
       // size of the origin blocked layout.
-      auto contigDimSizeInByte = shapePerCTA[order[0]] * eleBitWidth / 8;
+      auto contigDimSizeInByte = shapePerCTA[order[0]] * packingFactor * eleBitWidth / 8;
       if (contigDimSizeInByte >= 128 && contigDimSizeInByte % 128 == 0) {
         swizzlingByteWidth = 128;
       } else if (contigDimSizeInByte >= 64 && contigDimSizeInByte % 64 == 0) {
@@ -449,7 +455,7 @@ def NVMMASharedEncodingAttr :
         llvm_unreachable("unsupported shared memory layout for MMAv3");
       }
       bool transposed = order[0] == 0;
-      return $_get(context, swizzlingByteWidth, transposed, eleBitWidth, CTALayout);
+      return $_get(context, swizzlingByteWidth, transposed, eleBitWidth, fp4Padded, CTALayout);
     }]>
   ];

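A rough standalone illustration of the builder logic above (plain C++, not the TableGen builder): the swizzling mode is picked from the byte size of the contiguous dimension, and `fp4Padded` doubles that size because each 16-byte group carries only 8 bytes of payload. The 32-byte fallback branch is an assumption, since the hunk above is truncated.

```cpp
// Sketch of the swizzling-width selection, with the fp4Padded packing factor
// applied to the contiguous-dimension byte size (illustrative only).
#include <cassert>
#include <cstdint>

unsigned pickSwizzlingByteWidth(int64_t contigDimSize, unsigned elemBitWidth,
                                bool fp4Padded) {
  int packingFactor = fp4Padded ? 2 : 1;
  int64_t contigDimSizeInByte = contigDimSize * packingFactor * elemBitWidth / 8;
  if (contigDimSizeInByte >= 128 && contigDimSizeInByte % 128 == 0)
    return 128;
  if (contigDimSizeInByte >= 64 && contigDimSizeInByte % 64 == 0)
    return 64;
  if (contigDimSizeInByte >= 32 && contigDimSizeInByte % 32 == 0)
    return 32; // assumed fallback branch
  return 0;    // unsupported in this sketch
}

int main() {
  // 64 packed-fp4 bytes per row: without padding this selects 64B swizzling;
  // with the padded layout the physical row is 128 bytes and selects 128B.
  assert(pickSwizzlingByteWidth(64, 8, /*fp4Padded=*/false) == 64);
  assert(pickSwizzlingByteWidth(64, 8, /*fp4Padded=*/true) == 128);
  return 0;
}
```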

include/triton/Tools/LinearLayout.h

Lines changed: 2 additions & 3 deletions
@@ -683,9 +683,8 @@ class LinearLayout {
   // Otherwise, R could map some tensor index that is not stored in S.
   //
   // One requirement we *don't* have is that S is injective; we allow two shmem
-  // offsets to hold the same 2D index. If S is not injective, there's
-  // ambiguity in which offset we choose for a given (lane, warp). For now we
-  // don't place any guarantees on the choices made by this function.
+  // offsets to hold the same 2D index. If S is not injective,
+  // the algorithm chooses the smallest offset for a given (lane, warp).
   [[nodiscard]] LinearLayout invertAndCompose(const LinearLayout &outer) const;

   // Get the layout that is the inverse of this layout.
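A toy model of the guarantee this comment now documents, independent of the `LinearLayout` class: when several offsets hold the same coordinate, inverting the map by scanning offsets in increasing order keeps the smallest one, which is what lets the fp4-padded layout ignore the padding offsets.

```cpp
// Toy model of inverting a non-injective offset -> column map by keeping the
// smallest offset per column (the behavior documented above).
#include <cassert>
#include <map>
#include <vector>

int main() {
  // Offsets 0 and 2 both hold column 0; offsets 1 and 3 both hold column 1.
  std::vector<int> offsetToCol = {0, 1, 0, 1};

  std::map<int, int> colToOffset;
  for (int off = 0; off < (int)offsetToCol.size(); ++off)
    colToOffset.try_emplace(offsetToCol[off], off); // first (smallest) wins

  assert(colToOffset[0] == 0);
  assert(colToOffset[1] == 1);
  return 0;
}
```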

lib/Analysis/Allocation.cpp

Lines changed: 1 addition & 1 deletion
@@ -234,7 +234,7 @@ class AllocationAnalysis {
     // Bytes could be a different value once we support padding or other
     // allocation policies.
     auto allocType = alloc.getType();
-    auto shapePerCTA = gpu::getShapePerCTA(allocType);
+    auto shapePerCTA = gpu::getAllocationShapePerCTA(allocType);
     auto bytes = product<int64_t>(shapePerCTA) *
                  allocType.getElementTypeBitWidth() / 8;


lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 34 additions & 3 deletions
@@ -388,11 +388,34 @@ SmallVector<int64_t> getShapePerCTA(Attribute layout, ArrayRef<int64_t> shape) {
   return getShapePerCTA(splitNum, shape);
 }

+SmallVector<int64_t> getAllocationShapePerCTA(Attribute layout,
+                                              ArrayRef<int64_t> shapeLogical) {
+  SmallVector<int64_t> shape(shapeLogical);
+  if (auto sharedMMALayout = mlir::dyn_cast<NVMMASharedEncodingAttr>(layout)) {
+    if (sharedMMALayout.getFp4Padded()) {
+      auto packedAxis = getOrder(sharedMMALayout)[0];
+      if (shape.size() == 3) {
+        // Take into account multi buffering
+        shape[1 + packedAxis] *= 2;
+      } else {
+        shape[packedAxis] *= 2;
+      }
+    }
+  }
+  return getShapePerCTA(layout, shape);
+}
+
 SmallVector<int64_t> getShapePerCTA(Type type) {
   auto tensorType = cast<TensorOrMemDesc>(type);
   return getShapePerCTA(tensorType.getEncoding(), tensorType.getShape());
 }

+SmallVector<int64_t> getAllocationShapePerCTA(Type type) {
+  auto tensorType = cast<TensorOrMemDesc>(type);
+  return getAllocationShapePerCTA(tensorType.getEncoding(),
+                                  tensorType.getShape());
+}
+
 unsigned getNumWarpsPerCTA(Attribute layout) {
   SmallVector<unsigned> warpsPerCTA;
   if (auto blockedLayout = dyn_cast<BlockedEncodingAttr>(layout))
@@ -1913,7 +1936,8 @@ Attribute NVMMASharedEncodingAttr::parse(AsmParser &parser, Type type) {
     return {};

   unsigned swizzlingByteWidth;
-  bool transposed;
+  bool transposed = false;
+  bool fp4Padded = false;
   unsigned elementBitWidth;
   std::optional<SmallVector<unsigned>> CTAsPerCGA;
   std::optional<SmallVector<unsigned>> CTASplitNum;
@@ -1929,6 +1953,9 @@ Attribute NVMMASharedEncodingAttr::parse(AsmParser &parser, Type type) {
     } else if (attr.getName() == "elementBitWidth") {
       if (parseUInt(parser, attr, elementBitWidth, "elementBitWidth").failed())
         return {};
+    } else if (attr.getName() == "fp4Padded") {
+      if (parseBool(parser, attr, fp4Padded, "fp4Padded").failed())
+        return {};
     } else if (attr.getName() == "CTAsPerCGA") {
       if (parseIntArrayAttr(parser, attr, CTAsPerCGA.emplace(), "CTAsPerCGA")
               .failed())
@@ -1955,14 +1982,18 @@ Attribute NVMMASharedEncodingAttr::parse(AsmParser &parser, Type type) {

   return parser.getChecked<NVMMASharedEncodingAttr>(
       parser.getContext(), swizzlingByteWidth, transposed, elementBitWidth,
-      *CTALayout);
+      fp4Padded, *CTALayout);
 }

 void NVMMASharedEncodingAttr::print(AsmPrinter &printer) const {
   printer << "<{"
           << "swizzlingByteWidth = " << getSwizzlingByteWidth() //
           << ", transposed = " << getTransposed() //
           << ", elementBitWidth = " << getElementBitWidth();
+  if (getFp4Padded()) {
+    // Print only in this case to reduce the noise for the more common case.
+    printer << ", fp4Padded = true";
+  }
   maybePrintCTALayout(getContext(), printer, getCTALayout(),
                       /*rank=*/2);
   printer << "}>";
@@ -2602,7 +2633,7 @@ struct TritonGPUInferLayoutInterface
     }
     resultEncoding = NVMMASharedEncodingAttr::get(
         ctx, enc.getSwizzlingByteWidth(), !enc.getTransposed(),
-        enc.getElementBitWidth(), *ctaLayout);
+        enc.getElementBitWidth(), enc.getFp4Padded(), *ctaLayout);
     return success();
   }

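A self-contained sketch of the shape adjustment implemented by `getAllocationShapePerCTA` above, with the encoding reduced to a flag and a packed-axis index for illustration (CTA splitting is omitted):

```cpp
// Sketch of the allocation-shape computation for the fp4-padded layout:
// double the packed (contiguous) axis, skipping the multi-buffering dim.
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> allocationShape(std::vector<int64_t> shape,
                                     bool fp4Padded, unsigned packedAxis) {
  if (fp4Padded) {
    if (shape.size() == 3)
      shape[1 + packedAxis] *= 2; // dim 0 is the multi-buffering dimension
    else
      shape[packedAxis] *= 2;
  }
  return shape;
}

int main() {
  // 2D case: logical [128, 64] i8 tile with the column axis packed.
  auto s2 = allocationShape({128, 64}, /*fp4Padded=*/true, /*packedAxis=*/1);
  assert(s2[0] == 128 && s2[1] == 128);

  // 3D case: 3-stage pipelined (multi-buffered) memdesc [3, 128, 64].
  auto s3 = allocationShape({3, 128, 64}, true, 1);
  assert(s3[0] == 3 && s3[1] == 128 && s3[2] == 128);

  // The physically allocated size (what AllocationAnalysis now accounts for)
  // doubles: 128 * 128 * 8 / 8 = 16384 bytes instead of the logical 8192.
  return 0;
}
```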

lib/Dialect/TritonGPU/IR/LinearLayoutConversions.cpp

Lines changed: 28 additions & 3 deletions
@@ -202,8 +202,16 @@ LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,

   int tileRows = 8;
   int tileCols = 8 * tileWidthBytes / elemBitWidth;
+  bool isFp4Padded = false;
+  if (auto sharedMMALayout =
+          dyn_cast<triton::gpu::NVMMASharedEncodingAttr>(shared)) {
+    if (sharedMMALayout.getFp4Padded()) {
+      isFp4Padded = true;
+    }
+  }
+  int packingFactor = isFp4Padded ? 2 : 1;

-  if (shape[colDim] < tileCols || shape[rowDim] < tileRows) {
+  if (shape[colDim] * packingFactor < tileCols || shape[rowDim] < tileRows) {
     llvm::errs() << "Illegal shared layout; expected shape to be at least ["
                  << tileRows << ", " << tileCols << "], shape: ["
                  << shape[rowDim] << ", " << shape[colDim] << "]\n";
@@ -215,15 +223,32 @@ LinearLayout sharedToLinearLayoutLeadingOffset(ArrayRef<int64_t> shape,

   std::vector<std::vector<int>> bases2D;
   for (int logCol = 0; logCol < llvm::Log2_32(tileCols); logCol++) {
-    bases2D.push_back({0, 1 << logCol});
+    if (isFp4Padded) {
+      int colPadded = 1 << logCol;
+      // Each group of 16 offsets consists of 8 "real" and 8 "padded" offsets.
+      // We represent the padded layout by mapping 8 padded offsets to the same
+      // coordinates as the real ones. When computing the inverse of this LL,
+      // the offsets corresponding to the real ones are picked in the image by
+      // invertAndCompose.
+      int colPacked = colPadded / 16 * 8 + colPadded % 8;
+      bases2D.push_back({0, colPacked});
+    } else {
+      bases2D.push_back({0, 1 << logCol});
+    }
   }
   for (int logRow = 0; logRow < llvm::Log2_32(tileRows); logRow++) {
     int row = 1 << logRow;
     if (disableSwizzle) {
       bases2D.push_back({row, 0});
       continue;
     }
-    bases2D.push_back({row, vec * ((row / perPhase) % maxPhase)});
+    if (isFp4Padded) {
+      int colPadded = vec * ((row / perPhase) % maxPhase);
+      int colPacked = colPadded / 16 * 8 + colPadded % 8;
+      bases2D.push_back({row, colPacked});
+    } else {
+      bases2D.push_back({row, vec * ((row / perPhase) % maxPhase)});
+    }
   }
   LinearLayout tileLayout =
       LinearLayout({{S("offset"), bases2D}}, {rowDimName, colDimName});
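A minimal numeric model of the remapped column bases built above: `colPacked = colPadded / 16 * 8 + colPadded % 8` sends offsets in the padding half of each 16-byte group back onto the packed columns of the payload half (the values below are illustrative):

```cpp
// Model of the padded -> packed column remapping used for the fp4-padded
// offset bases: offsets in the padding half of each 16-byte group contribute
// no new columns, and higher offset bits address the next packed columns.
#include <cassert>
#include <cstdio>

int packedCol(int colPadded) { return colPadded / 16 * 8 + colPadded % 8; }

int main() {
  // Power-of-two offset bases within a tile row (one packed fp4 pair per
  // byte, so elemBitWidth is 8; numbers are illustrative).
  assert(packedCol(1) == 1);
  assert(packedCol(2) == 2);
  assert(packedCol(4) == 4);
  assert(packedCol(8) == 0);  // offset 8 starts the padding half of the group
  assert(packedCol(16) == 8); // next 16-byte group -> next 8 packed columns
  assert(packedCol(32) == 16);

  // A full 16-offset group: offsets 0..7 are "real", 8..15 alias them.
  for (int off = 0; off < 16; ++off)
    std::printf("offset %2d -> packed column %d\n", off, packedCol(off));
  return 0;
}
```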

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 33 additions & 10 deletions
@@ -141,8 +141,10 @@ warpsPerTileV3(DotOp dotOp, const ArrayRef<int64_t> shape, int numWarps,

 // Returns a shared memory allocation that can be used by a dotMMA op for the
 // given value.
-static Value getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter,
-                                       int opIdx, bool allowTranspose) {
+static Value
+getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter, int opIdx,
+                          bool allowTranspose, bool isMMAv5Fp4Padded = false,
+                          Operation *op = nullptr /*only for diagnostic*/) {
   OpBuilder::InsertionGuard g(rewriter);
   Value arg = v;
   if (auto cvtOp = v.getDefiningOp<ConvertLayoutOp>())
@@ -161,12 +163,21 @@ static Value getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter,
     }
   }

+  if (newOrder != getOrder(argType.getEncoding()) && op) {
+    op->emitWarning("Warning: Forcing a different order [")
+        << newOrder[0] << ", " << newOrder[1]
+        << "] on SMEM than the register order for the operand " << opIdx
+        << ". Registers will be transposed before SMEM store and the pipelined "
+           "load for this operand will be disabled, so poor performance is "
+           "expected.";
+  }
+
   Attribute SharedMemorySpace =
       SharedMemorySpaceAttr::get(argType.getContext());
   auto CTALayout = getCTALayout(argType.getEncoding());
   auto newLayout = NVMMASharedEncodingAttr::get(
       argType.getContext(), argType.getShape(), newOrder, CTALayout,
-      argType.getElementType());
+      argType.getElementType(), isMMAv5Fp4Padded);
   auto newType = MemDescType::get(argType.getShape(), argType.getElementType(),
                                   newLayout, SharedMemorySpace);
   rewriter.setInsertionPointAfterValue(arg);
@@ -582,11 +593,6 @@ class ScaledBlockedToMMAv5
         mlir::isa<NvidiaMmaEncodingAttr>(oldRetType.getEncoding()))
       return failure();

-    if (dotOp.getLhsType() != dotOp.getRhsType()) {
-      // Mixed precision is not supported yet.
-      return failure();
-    }
-
     if (dotOp.getLhsScale() == nullptr || dotOp.getRhsScale() == nullptr) {
       return failure();
     }
@@ -607,8 +613,25 @@ class ScaledBlockedToMMAv5
     auto oldAType = dotOp.getLhs().getType();
     auto oldBType = dotOp.getRhs().getType();

-    a = getSharedMemoryMMAOperand(a, rewriter, 0, /*allowTranspose=*/true);
-    b = getSharedMemoryMMAOperand(b, rewriter, 1, /*allowTranspose=*/true);
+    bool IsAMixedPrecFp4 = false;
+    bool IsBMixedPrecFp4 = false;
+
+    if (dotOp.getLhsType() != dotOp.getRhsType()) {
+      if (dotOp.getLhsType() == ScaleDotElemType::E2M1)
+        IsAMixedPrecFp4 = true;
+      else if (dotOp.getRhsType() == ScaleDotElemType::E2M1)
+        IsBMixedPrecFp4 = true;
+    }
+
+    // For mixed-precision fp4 operands, set allowTranspose = false, to force
+    // the packed axis, K, to be contiguous in SMEM
+    a = getSharedMemoryMMAOperand(a, rewriter, 0,
+                                  /*allowTranspose=*/!IsAMixedPrecFp4,
+                                  IsAMixedPrecFp4, dotOp);
+    b = getSharedMemoryMMAOperand(b, rewriter, 1,
+                                  /*allowTranspose=*/!IsBMixedPrecFp4,
+                                  IsBMixedPrecFp4, dotOp);
+
     MLIRContext *context = dotOp->getContext();
     unsigned m = 128;
     unsigned n = retShapePerCTA[1] >= 256 ? 256 : retShapePerCTA[1];
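A condensed standalone sketch of the operand classification above, using a local enum as a stand-in for `ScaleDotElemType`: only in the mixed-precision case is the fp4 (E2M1) operand flagged, and that operand then disallows transposition so its packed K axis stays contiguous in SMEM.

```cpp
// Sketch of the mixed-precision fp4 operand classification (stand-in enum,
// not the Triton ScaleDotElemType).
#include <cassert>

enum class ElemType { E2M1 /*fp4*/, E4M3 /*fp8*/, E5M2 /*fp8*/ };

struct OperandFlags {
  bool aMixedPrecFp4 = false;
  bool bMixedPrecFp4 = false;
};

OperandFlags classify(ElemType lhs, ElemType rhs) {
  OperandFlags f;
  if (lhs != rhs) { // only the mixed-precision case gets the padded layout
    if (lhs == ElemType::E2M1)
      f.aMixedPrecFp4 = true;
    else if (rhs == ElemType::E2M1)
      f.bMixedPrecFp4 = true;
  }
  return f;
}

int main() {
  // mxfp8 x mxfp4: the RHS is the fp4 operand, so it gets the padded layout
  // and allowTranspose = false (K must stay contiguous in SMEM).
  auto f = classify(ElemType::E4M3, ElemType::E2M1);
  assert(!f.aMixedPrecFp4 && f.bMixedPrecFp4);

  // mxfp4 x mxfp4: not mixed precision, no padded layout is forced here.
  f = classify(ElemType::E2M1, ElemType::E2M1);
  assert(!f.aMixedPrecFp4 && !f.bMixedPrecFp4);
  return 0;
}
```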

lib/Dialect/TritonGPU/Transforms/OptimizeDotOperands.cpp

Lines changed: 2 additions & 1 deletion
@@ -125,7 +125,8 @@ class FuseTransMMAV3Plus : public OpRewritePattern<LocalAllocOp> {
     // all CTALayouts are the same.
     auto newInnerEnc = NVMMASharedEncodingAttr::get(
         getContext(), srcTy.getShape(), newInnerCvtOrder,
-        allocEncoding.getCTALayout(), srcTy.getElementType());
+        allocEncoding.getCTALayout(), srcTy.getElementType(),
+        allocEncoding.getFp4Padded());

     MemDescType innerTy =
         MemDescType::get(srcTy.getShape(), srcTy.getElementType(), newInnerEnc,
