
Commit 306051e

Merge OpenAI Triton commit 413b521 (#3902)
This PR changes the Triton base from 6f0ae97 to 413b521 (Apr 8). Pass rate: 90.77% -> 88.4%. Please do not squash and merge this PR.
2 parents 55a2172 + 3910d9e commit 306051e

File tree

41 files changed (+805 / -834 lines)


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 0 additions & 6 deletions
@@ -53,9 +53,6 @@ createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic,
                           TypeRange types, ValueRange args);
 } // namespace mlir::LLVM
 
-// Is v an integer or floating-point scalar constant equal to 0?
-bool isConstantZero(Value v);
-
 namespace mlir::triton {
 
 struct TritonLLVMOpBuilder {
@@ -348,9 +345,6 @@ LLVM::LLVMFuncOp appendOrGetExternFuncOp(RewriterBase &rewriter, Operation *op,
 namespace LLVM {
 using namespace mlir::triton;
 
-// Is v an integer or floating-point scalar constant equal to 0?
-bool isConstantZero(Value v);
-
 class SharedMemoryObject {
 public:
   SharedMemoryObject(Value base, Type baseElemType, ArrayRef<Value> offsets)

include/triton/Dialect/Triton/IR/TritonOpInterfaces.td

Lines changed: 17 additions & 1 deletion
@@ -53,7 +53,23 @@ def DotOpInterface : OpInterface<"DotOpInterface"> {
         /*desc=*/"Verify the dimensions of the A and B DotOp operands.",
         /*retType=*/"bool",
         /*methodName=*/"verifyDims",
-        /*args=*/(ins)>
+        /*args=*/(ins)>,
+    InterfaceMethod<
+        /*desc=*/"Verify the dimensions of the DotOp output.",
+        /*retType=*/"bool",
+        /*methodName=*/"verifyOutputDims",
+        /*args=*/(ins),
+        /*methodBody=*/[{}],
+        /*defaultImpl=*/ [{
+          auto aTy = cast<ShapedType>($_op.getA().getType());
+          auto bTy = cast<ShapedType>($_op.getB().getType());
+          auto cTy = cast<ShapedType>($_op->getOperand(2).getType());
+          auto aShape = aTy.getShape();
+          auto bShape = bTy.getShape();
+          auto cShape = cTy.getShape();
+          return cShape[cShape.size() - 2] == aShape[aShape.size() - 2] &&
+                 cShape[cShape.size() - 1] == bShape[aShape.size() - 1];
+        }]>
   ];
 
   let verify = [{ return ::mlir::triton::impl::verifyDotOpInterface($_op); }];
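The default implementation above is a plain shape check on the last two dimensions: the result's M must match A's M and the result's N must match B's N (the TableGen body indexes bShape with aShape.size() - 1, which is equivalent when A and B have the same rank). A minimal standalone sketch of that check, using a hypothetical helper name that is not part of the commit:

#include <cstdint>
#include <vector>

// Hypothetical illustration of what the default verifyOutputDims() computes:
// C[..., M, N] is valid iff M matches A[..., M, K] and N matches B[..., K, N].
static bool outputDimsMatch(const std::vector<int64_t> &aShape,
                            const std::vector<int64_t> &bShape,
                            const std::vector<int64_t> &cShape) {
  int64_t m = aShape[aShape.size() - 2];
  int64_t n = bShape[bShape.size() - 1];
  return cShape[cShape.size() - 2] == m && cShape[cShape.size() - 1] == n;
}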

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 4 additions & 2 deletions
@@ -676,7 +676,7 @@ def TT_DotOp : TT_Op<"dot", [Pure,
 //
 def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
                            AttrSizedOperandSegments,
-                           DeclareOpInterfaceMethods<DotOpInterface>,
+                           DeclareOpInterfaceMethods<DotOpInterface, ["verifyDims", "verifyOutputDims"]>,
                            TypesMatchWith<"result's type matches accumulator's type",
                                           "d", "c", "$_self">]> {
   let summary = "dot_scaled";
@@ -697,7 +697,9 @@ def TT_DotScaledOp : TT_Op<"dot_scaled", [Pure,
     Optional<RankedTensorOf<[TT_Float, I8]>>:$b_scale,
     TT_ScaleDotElemTypeAttr:$a_elem_type,
     TT_ScaleDotElemTypeAttr:$b_elem_type,
-    BoolAttr:$fastMath
+    BoolAttr:$fastMath,
+    DefaultValuedAttr<BoolAttr, "true">:$lhs_k_pack,
+    DefaultValuedAttr<BoolAttr, "true">:$rhs_k_pack
   );
 
   let results = (outs TT_FloatTensor:$d);

include/triton/Dialect/TritonGPU/Transforms/MMAv5PipelineUtility.h

Lines changed: 8 additions & 55 deletions
@@ -13,50 +13,6 @@ class ForOp;
 } // namespace scf
 namespace triton::nvidia_gpu {
 
-//===----------------------------------------------------------------------===//
-// MMAInfo
-//===----------------------------------------------------------------------===//
-
-// This struct contains analysis information about an MMAv5 operation inside a
-// loop used for pipelining MMA ops.
-struct MMAInfo {
-  // This struct contains information about when the MMA's accumulator is
-  // overridden in the loop, if it is at all.
-  struct AccOverridePoint {
-    // The operation which overrides the accumulator.
-    Operation *op;
-    // The condition on which the accumulator is reset.
-    Value condition = nullptr;
-    // The initial value of the accumulator and the value after a reset.
-    Value initValue = nullptr;
-    // The number of loop iterations ago the accumulator was reset.
-    int distance = 0;
-    // Whether the accumulator is reset via setting the `useAcc` flag to false
-    // or by clearing the accumulator tensor value.
-    bool isFlag = false;
-  };
-
-  // The TMEM allocation of the accumuator, which directly precedes the dot op.
-  TMEMAllocOp accAlloc;
-  // The TMEM load of the accumulator value out of TMEM, which directly follows
-  // the dot op.
-  TMEMLoadOp accLoad;
-  // The override point of the accumulator value, if it is overriden in the
-  // loop. E.g. this is typically present for persistent kernels.
-  std::optional<AccOverridePoint> accDef;
-  // If the accumulator is used in future iterations of the loop, this is the
-  // iter arg number.
-  std::optional<int> yieldArgNo;
-  // Whether the accumulator needs to be multibuffered.
-  bool accIsMultiBuffered;
-
-  Value phase = nullptr;
-  Value barrierIdx = nullptr;
-  Value accInsertIdx = nullptr;
-  Value accExtractIdx = nullptr;
-  Value barrierAlloc = nullptr;
-};
-
 //===----------------------------------------------------------------------===//
 // MMA Pipeline Analysis
 //===----------------------------------------------------------------------===//
@@ -66,12 +22,14 @@ struct MMAInfo {
 // be in the same region as the MMA operation.
 std::optional<std::pair<TMEMAllocOp, TMEMLoadOp>>
 getTMemAllocAndLoad(MMAv5OpInterface mmaOp);
-// Get immediate users of the accumulator within the current loop iteration.
-SmallVector<Operation *> getDirectAccUses(TMEMLoadOp accDef);
-// Analyze an MMA op inside a loop to determine information about how it can be
-// pipelined. Returns `std::nullopt` if it cannot be pipelined.
-std::optional<MMAInfo> getMMAInfo(scf::ForOp forOp, MMAv5OpInterface mmaOp,
-                                  DominanceInfo &domInfo);
+// Given an MMAv5 operation in a loop, determine if its accumulator can be
+// multibuffered.
+bool isAccMultibufferingPossible(MMAv5OpInterface mma, scf::ForOp forOp);
+// Only pipeline the loops where the MMA happens before the tmem_load, or is in
+// the same stage as the tmem_load. Lowering does not support the case where the
+// MMA is in a different stage as the tmem_load and happens after it.
+bool mmav5DominatesTmemLoads(
+    scf::ForOp forOp, function_ref<bool(MMAv5OpInterface)> isMmaPipelineable);
 
 //===----------------------------------------------------------------------===//
 // MMA Pipeline Rewriters
@@ -82,11 +40,6 @@ std::optional<MMAInfo> getMMAInfo(scf::ForOp forOp, MMAv5OpInterface mmaOp,
 TMEMAllocOp createTMemAlloc(OpBuilder &builder, TMEMAllocOp oldTMemAllocOp,
                             bool multiBufferred, int numStages);
 
-// Create a store op of the initial value of the accumulator into the
-// potentially multi-buffered accumulator.
-void createInitStore(OpBuilder &builder, TMEMAllocOp allocOp, Value initVal,
-                     bool multiBufferred);
-
 // Return true if operands of the MMA operation are/are going to be pipelined
 // and multibuffered, enabling the MMA operation to be pipelined.
 bool mmaHasPipelineableOperands(

include/triton/Dialect/TritonGPU/Transforms/Utility.h

Lines changed: 6 additions & 0 deletions
@@ -11,6 +11,7 @@
 
 namespace mlir {
 class DominanceInfo;
+class PostDominanceInfo;
 
 namespace triton {
 class ModuleAxisInfoAnalysis;
@@ -222,6 +223,11 @@ getMMAsWithMultiBufferredOperands(scf::ForOp forOp,
 // regions. The result op is not necessarily one of the ops in the list.
 Operation *findNearestCommonDominator(ArrayRef<Operation *> ops,
                                       DominanceInfo &domInfo);
+// Given a list of ops, find the naerest common postdominator of all ops or
+// return null if one could not be found. The ops are allowed to be in different
+// regions. The result op is not necessarily one of the ops in the list.
+Operation *findNearestCommonPostDominator(ArrayRef<Operation *> ops,
+                                          PostDominanceInfo &postDomInfo);
 
 /// Visit the operands of `op` and the operands of any nested ops defined
 /// outside of `op`.
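The new findNearestCommonPostDominator helper is only declared here. As a rough intuition, here is a simplified sketch under the assumption that all ops live in one block, which is not the implementation added by this commit: in that restricted case the nearest common postdominator is simply the op that appears last.

#include "llvm/ADT/ArrayRef.h"
#include "mlir/IR/Operation.h"

// Simplified sketch, not the commit's implementation: handles only ops that
// all live in the same block, where the nearest common postdominator is the
// textually last op. The real helper also covers ops in different regions
// via PostDominanceInfo.
static mlir::Operation *
nearestCommonPostDomInBlock(llvm::ArrayRef<mlir::Operation *> ops) {
  if (ops.empty())
    return nullptr;
  mlir::Operation *last = ops.front();
  for (mlir::Operation *op : ops.drop_front()) {
    if (op->getBlock() != last->getBlock())
      return nullptr; // cross-block case is out of scope for this sketch
    if (last->isBeforeInBlock(op))
      last = op;
  }
  return last;
}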

include/triton/Dialect/TritonNvidiaGPU/IR/TritonNvidiaGPUOps.td

Lines changed: 6 additions & 2 deletions
@@ -404,7 +404,7 @@ def TTNG_TCGen5MMAOp : TTNG_Op<"tc_gen5_mma", [DeclareOpInterfaceMethods<MemoryE
   let assemblyFormat = "$a`,` $b`,` $d`,` $useD`,` $pred (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
 }
 
-def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface>, DeclareOpInterfaceMethods<MMAv5OpInterface>]> {
+def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMethods<MemoryEffectsOpInterface>, DeclareOpInterfaceMethods<DotOpInterface, ["verifyDims", "verifyOutputDims"]>, DeclareOpInterfaceMethods<MMAv5OpInterface>]> {
   let summary = "block level op mapping to tensorcore gen5 mma";
 
   let description = [{
@@ -423,7 +423,11 @@ def TTNG_TCGen5MMAScaledOp : TTNG_Op<"tc_gen5_mma_scaled", [DeclareOpInterfaceMe
     I1:$useD,
     I1:$pred,
     Optional<TTG_MemDescType>:$barrier);
-
+  let extraClassDeclaration = [{
+    int64_t getBlockM();
+    int64_t getBlockN();
+    int64_t getBlockK();
+  }];
   // TODO: improve printing format.
   let assemblyFormat = "$a `,` $b `,` $d `,` $a_scale `,` $b_scale `,` $useD`,` $pred `lhs` `=` $a_type `rhs` `=` $b_type (`,` $barrier^)? attr-dict `:` functional-type(operands, results)";
 }

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 12 deletions
@@ -667,18 +667,6 @@ createLLVMIntrinsicCallOp(OpBuilder &builder, Location loc, StringRef intrinsic,
   return op;
 }
 
-bool isConstantZero(Value v) {
-  if (auto constantOp = v.getDefiningOp<arith::ConstantOp>()) {
-    if (auto attr = dyn_cast<IntegerAttr>(constantOp.getValue())) {
-      return attr.getValue().isZero();
-    }
-    if (auto attr = dyn_cast<FloatAttr>(constantOp.getValue())) {
-      return attr.getValue().isZero();
-    }
-  }
-  return false;
-}
-
 Value getStructFromSharedMemoryObject(Location loc,
                                       const SharedMemoryObject &smemObj,
                                       RewriterBase &rewriter) {
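The deleted helper checked whether a Value is a scalar integer or floating-point constant equal to zero. Out-of-tree code that still needs such a check could get roughly the same behavior from MLIR's built-in matchers; this is a hedged suggestion, not something this commit introduces:

#include "mlir/IR/Matchers.h"
#include "mlir/IR/Value.h"

// Roughly equivalent to the removed isConstantZero(): m_Zero() matches an
// integer constant 0, m_AnyZeroFloat() matches a +0.0 or -0.0 float constant.
static bool isZeroConstant(mlir::Value v) {
  return mlir::matchPattern(v, mlir::m_Zero()) ||
         mlir::matchPattern(v, mlir::m_AnyZeroFloat());
}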

lib/Dialect/Triton/IR/OpInterfaces.cpp

Lines changed: 1 addition & 2 deletions
@@ -64,8 +64,7 @@ LogicalResult verifyDotOpInterface(Operation *op) {
         "operand to be equal to the first dimension of "
         "the result");
   // Check the output shape
-  if (cShape[cShape.size() - 2] != aShape[aShape.size() - 2] ||
-      cShape[cShape.size() - 1] != bShape[aShape.size() - 1])
+  if (!dotOp.verifyOutputDims())
     return dotOp->emitOpError(
         "expected the output shape to be the concatenation of the last "
         "dimension of the first operand and the last dimension of the "

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 29 additions & 4 deletions
@@ -323,14 +323,39 @@ bool DotScaledOp::verifyDims() {
 
   auto aKdim = aShape[aShape.size() - 1];
   auto bKdim = bShape[aShape.size() - 2];
-  if (this->getAElemType() == ScaleDotElemType::E2M1)
-    aKdim *= 2;
-  if (this->getBElemType() == ScaleDotElemType::E2M1)
-    bKdim *= 2;
+  if (this->getAElemType() == ScaleDotElemType::E2M1) {
+    if (this->getLhsKPack())
+      aKdim *= 2;
+  }
+  if (this->getBElemType() == ScaleDotElemType::E2M1) {
+    if (this->getRhsKPack())
+      bKdim *= 2;
+  }
 
   return aKdim == bKdim;
 }
 
+bool DotScaledOp::verifyOutputDims() {
+  auto cShape = this->getC().getType().getShape();
+  auto oMdim = cShape[cShape.size() - 2];
+  auto oNdim = cShape[cShape.size() - 1];
+  auto aShape = this->getA().getType().getShape();
+  auto bShape = this->getB().getType().getShape();
+  auto adim = aShape[aShape.size() - 2];
+  auto bdim = bShape[bShape.size() - 1];
+  if (this->getAElemType() == ScaleDotElemType::E2M1) {
+    if (!this->getLhsKPack())
+      adim *= 2;
+  }
+  if (this->getBElemType() == ScaleDotElemType::E2M1) {
+    if (!this->getRhsKPack())
+      bdim *= 2;
+  }
+  if (adim != oMdim || bdim != oNdim)
+    return false;
+  return true;
+}
+
 //-- MakeRangeOp --
 OpFoldResult MakeRangeOp::fold(FoldAdaptor adaptor) {
   // make_range(start, start + 1) -> constant(start)
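To make the packing arithmetic concrete: E2M1 (fp4) operands store two elements per i8, so whichever axis carries the packing counts double when logical shapes are compared. A small hypothetical example, with shapes invented for illustration rather than taken from the commit:

#include <cassert>
#include <cstdint>

int main() {
  // Hypothetical dot_scaled with fp4 (E2M1) operands stored in i8 tensors.
  // A stored as 128x32 with lhs_k_pack = true: the K axis doubles, logical K = 64.
  int64_t aM = 128, aKStored = 32;
  int64_t aK = aKStored * 2;
  // B stored as 32x64 with rhs_k_pack = true: logical K = 64 as well.
  int64_t bKStored = 32, bN = 64;
  int64_t bK = bKStored * 2;
  assert(aK == bK); // the condition verifyDims() enforces
  // verifyOutputDims(): C must then be 128x64. Had lhs_k_pack been false, the
  // packing would sit on A's M axis instead, and that dimension would double.
  int64_t cM = 128, cN = 64;
  assert(cM == aM && cN == bN);
  return 0;
}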

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp

Lines changed: 23 additions & 22 deletions
@@ -149,6 +149,7 @@ warpsPerTileV3(DotOp dotOp, const ArrayRef<int64_t> shape, int numWarps,
 static Value
 getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter, int opIdx,
                           bool allowTranspose, bool isMMAv5Fp4Padded = false,
+                          bool forceTranspose = false,
                           Operation *op = nullptr /*only for diagnostic*/) {
   OpBuilder::InsertionGuard g(rewriter);
   Value arg = v;
@@ -167,6 +168,8 @@ getSharedMemoryMMAOperand(Value v, mlir::PatternRewriter &rewriter, int opIdx,
     } else {
       newOrder = {1, 0};
     }
+    if (forceTranspose)
+      std::swap(newOrder[0], newOrder[1]);
   }
 
   if (newOrder != order && op) {
@@ -648,49 +651,47 @@ class ScaledBlockedToMMAv5
 
     bool IsAMixedPrecFp4 = false;
     bool IsBMixedPrecFp4 = false;
+    bool isAFP4 = dotOp.getAElemType() == ScaleDotElemType::E2M1;
+    bool isBFP4 = dotOp.getBElemType() == ScaleDotElemType::E2M1;
 
     if (dotOp.getAElemType() != dotOp.getBElemType()) {
-      if (dotOp.getAElemType() == ScaleDotElemType::E2M1)
+      if (isAFP4)
         IsAMixedPrecFp4 = true;
-      else if (dotOp.getBElemType() == ScaleDotElemType::E2M1)
+      else if (isBFP4)
        IsBMixedPrecFp4 = true;
     }
-
+    // If we use txgen05.mma.kind.mxf864 we need to padd the fp4 operands:
+    // https://docs.nvidia.com/cuda/parallel-thread-execution/#tcgen05-packing-formats-mxf8f6f4-smem
+    bool isMMAv5Fp4PaddedLhs = IsAMixedPrecFp4 || !dotOp.getLhsKPack();
+    bool isMMAv5Fp4PaddedRhs = IsBMixedPrecFp4 || !dotOp.getRhsKPack();
     // For mixed-precision fp4 operands, set allowTranspose = false, to force
     // the packed axis, K, to be contiguous in SMEM
     a = getSharedMemoryMMAOperand(a, rewriter, 0,
-                                  /*allowTranspose=*/!IsAMixedPrecFp4,
-                                  IsAMixedPrecFp4, dotOp);
+                                  /*allowTranspose=*/!isAFP4,
+                                  /*isMMAv5Fp4Padded=*/isMMAv5Fp4PaddedLhs,
+                                  /*forceTranspose=*/!dotOp.getLhsKPack(),
+                                  dotOp);
     b = getSharedMemoryMMAOperand(b, rewriter, 1,
-                                  /*allowTranspose=*/!IsBMixedPrecFp4,
-                                  IsBMixedPrecFp4, dotOp);
+                                  /*allowTranspose=*/!isBFP4,
+                                  /*isMMAv5Fp4Padded=*/isMMAv5Fp4PaddedRhs,
+                                  /*forceTranspose=*/!dotOp.getRhsKPack(),
+                                  dotOp);
 
     MLIRContext *context = dotOp->getContext();
     unsigned m = 128;
     unsigned n = retShapePerCTA[1] >= 256 ? 256 : retShapePerCTA[1];
-    unsigned k = 32;
-    // If both operands are E2M1, target the FP4 tensor core implicitly.
-    // This may result in a downstream compile-time error if the scaled TC
-    // descriptor requires options that are unavailable to the .kind=mxf4 mma.
-    // This is likely preferable over a silent runtime performance degradation
-    // from running f4xf4 via .kind=mxf8f6f4
-    if (dotOp.getAElemType() == ScaleDotElemType::E2M1 &&
-        dotOp.getBElemType() == ScaleDotElemType::E2M1) {
-      k = 64;
-    }
-    SmallVector<unsigned> instrShape = {m, n, k};
+
     ArrayRef<unsigned> CTASplitNum = CTALayout.getCTASplitNum();
     Attribute accEncoding = triton::nvidia_gpu::TensorMemoryEncodingAttr::get(
-        context, instrShape[0], instrShape[1], /*unpacked=*/true,
-        CTASplitNum[0], CTASplitNum[1]);
+        context, m, n, /*unpacked=*/true, CTASplitNum[0], CTASplitNum[1]);
     Attribute tensorMemorySpace =
         triton::nvidia_gpu::TensorMemorySpaceAttr::get(context);
     Type accMemDescType = triton::gpu::MemDescType::get(
         oldRetType.getShape(), oldRetType.getElementType(), accEncoding,
         tensorMemorySpace,
         /*mutableMemory=*/true);
-    Attribute newDistributedEncoding = nvidia_gpu::getTmemCompatibleLayout(
-        instrShape[0], instrShape[1], oldRetType, numWarps);
+    Attribute newDistributedEncoding =
+        nvidia_gpu::getTmemCompatibleLayout(m, n, oldRetType, numWarps);
     auto newAccType = RankedTensorType::get(oldRetType.getShape(),
                                             oldRetType.getElementType(),
                                             newDistributedEncoding);