
Commit 5eda5e3

Revert "[MXFP] Implement SW emulation of dot_scale as a decomposition (#5475)"
This reverts commit 929142b.
1 parent 250c92d commit 5eda5e3

File tree: 29 files changed, +1356 -1252 lines changed

include/triton/Dialect/Triton/IR/Dialect.h

Lines changed: 0 additions & 5 deletions
@@ -83,11 +83,6 @@ class DialectInferLayoutInterface
   virtual LogicalResult
   verifyDotOpEncodingCompatibility(Operation *op, Attribute operandEncodingA,
                                    Attribute operandEncodingB) const = 0;
-
-  virtual LogicalResult
-  inferFp4ToFpOpEncoding(ArrayRef<int64_t> shape, int axis, Attribute inEnc,
-                         Attribute &outEnc, bool fwdInference,
-                         std::optional<Location> loc) const = 0;
 };
 
 class DialectVerifyTensorLayoutInterface

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 0 additions & 4 deletions
@@ -456,10 +456,6 @@ def TT_ReshapeOp : TT_Op<"reshape", [Pure,
     If efficient_layout is set, this is a hint that the destination layout should be kept for performance reason.
     The compiler is still free to change it for better performance.
   }];
-  let builders = [
-      OpBuilder<(ins "ArrayRef<int64_t>":$shape, "TypedValue<RankedTensorType>":$src)>
-  ];
-
   let arguments = (ins TT_Tensor:$src, UnitAttr:$allow_reorder, UnitAttr:$efficient_layout);
   let results = (outs TT_Tensor:$result);
   let assemblyFormat = "$src (`allow_reorder` $allow_reorder^)? (`efficient_layout` $efficient_layout^)? attr-dict `:` type($src) `->` type($result)";

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 18 additions & 16 deletions
@@ -283,29 +283,31 @@ def TTG_LocalStoreOp : TTG_Op<"local_store", [DeclareOpInterfaceMethods<MemoryEf
   }];
 }
 
-def TTG_Fp4ToFpOp : TTG_Op<"fp4_to_fp", [Pure]> {
-  let summary = "Upcast fp4 (e2m1) to fp";
+def TTG_UpcastMXFPOp : TTG_Op<"upcast_mxfp", [Pure]> {
+  let summary = "Convert an mxfp tensor to bf16/fp16";
 
   let hasVerifier = 1;
 
   let description = [{
-    Upcast fp4 (e2m1) represented packed as i8s to fp.
-
-    The lower 4 bits of the i8s represent the first fp4 element, and the upper 4 bits
-    the second fp4 element.
-
-    The `axis` attribute specifies the axis along which the fp4 elements are packed.
+    Compute the bf16 encoded in the given mxfp number as per
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
   }];
-
-  let builders = [
-    OpBuilder<(ins "TypedValue<RankedTensorType>":$src, "Type":$elemType, "int32_t":$axis)>
-  ];
-
-  let arguments = (ins RankedTensorOf<[I8]>:$src, I32Attr:$axis);
-  let results = (outs TT_FloatTensor:$result);
+  let arguments = (
+    ins
+    TT_Tensor:$src,
+    TT_Tensor:$scale,
+    TT_ScaleDotElemTypeAttr:$fp_type,
+    BoolAttr:$fastMath
+  );
+  let results = (outs TT_Tensor:$result);
 
   let assemblyFormat = [{
-    $src attr-dict `:` type($src) `->` type($result)
+    $src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
+  }];
+
+  let extraClassDeclaration = [{
+    static RankedTensorType deduceOutputType(
+        TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
   }];
 }
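
The op description above only points at the OCP MX spec, so a brief worked decode may help: each packed i8 holds two fp4 (e2m1) values, and each scale byte is interpreted as an e8m0 power-of-two exponent (bias 127) shared by a block of values. Below is a minimal standalone C++ sketch of that arithmetic; the helper names and sample values are illustrative only and are not Triton code.

// Hypothetical sketch of the mxfp decode referenced by the op description:
// unpack two e2m1 values from one byte and apply an e8m0 shared scale.
#include <cmath>
#include <cstdint>
#include <cstdio>

// The eight non-negative e2m1 magnitudes; bit 3 of the nibble is the sign.
static const float kE2M1[8] = {0.0f, 0.5f, 1.0f, 1.5f, 2.0f, 3.0f, 4.0f, 6.0f};

float decodeE2M1(std::uint8_t nibble) {
  float mag = kE2M1[nibble & 0x7];
  return (nibble & 0x8) ? -mag : mag;
}

// e8m0: an exponent-only scale, value = 2^(byte - 127).
float decodeE8M0(std::uint8_t scale) {
  return std::ldexp(1.0f, int(scale) - 127);
}

int main() {
  std::uint8_t packed = 0x26; // low nibble 0x6 -> 4.0, high nibble 0x2 -> 1.0
  std::uint8_t scale = 126;   // 2^-1
  float first = decodeE2M1(packed & 0xF) * decodeE8M0(scale);
  float second = decodeE2M1(packed >> 4) * decodeE8M0(scale);
  std::printf("%g %g\n", first, second); // prints: 2 0.5
  return 0;
}
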
include/triton/Dialect/TritonGPU/Transforms/DecomposeScaledBlocked.h

Lines changed: 0 additions & 8 deletions
This file was deleted.

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 0 additions & 21 deletions
@@ -693,23 +693,6 @@ OpFoldResult ExpandDimsOp::fold(FoldAdaptor adaptor) {
 }
 
 //-- ReshapeOp --
-
-void ReshapeOp::build(OpBuilder &builder, OperationState &state,
-                      ArrayRef<int64_t> shape,
-                      TypedValue<RankedTensorType> src) {
-  auto srcTy = src.getType();
-  auto srcEnc = srcTy.getEncoding();
-  Attribute dstEnc;
-  if (srcEnc) {
-    auto result = cast<DialectInferLayoutInterface>(&srcEnc.getDialect())
-                      ->inferReshapeOpEncoding(srcTy.getShape(), srcEnc, shape,
-                                               dstEnc, state.location);
-    assert(succeeded(result));
-  }
-  auto dstTy = RankedTensorType::get(shape, srcTy.getElementType(), dstEnc);
-  build(builder, state, dstTy, src);
-}
-
 LogicalResult ReshapeOp::canonicalize(ReshapeOp op, PatternRewriter &rewriter) {
   if (op.getEfficientLayout())
     return failure();
@@ -786,10 +769,6 @@ LogicalResult ReshapeOp::verify() {
 OpFoldResult FpToFpOp::fold(FoldAdaptor adaptor) {
   auto srcVal = getSrc();
   auto dstTy = getType();
-  // Fold trivial cast
-  if (srcVal.getType() == dstTy) {
-    return srcVal;
-  }
 
   auto resElemType = cast<FloatType>(getElementTypeOrSelf(getType()));
   const llvm::fltSemantics &semantic = resElemType.getFloatSemantics();

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 0 additions & 92 deletions
@@ -2882,98 +2882,6 @@ struct TritonGPUInferLayoutInterface
                               ArrayRef(enc.getCTAOrder()).drop_front(1)));
     return success();
   }
-
-  LogicalResult
-  inferFp4ToFpOpEncoding(ArrayRef<int64_t> shape, int axis, Attribute inEnc,
-                         Attribute &outEnc, bool fwdInference,
-                         std::optional<Location> loc) const override {
-    // We implement two legacy layout propagations
-    // Once we fully migrate to LinearLayouts, we can remove these.
-    auto *ctx = getContext();
-    auto rank = shape.size();
-    // The output encoding will only be a legacy encoding if the axis is the
-    // fastest running dimension.
-    if (getOrder(inEnc)[axis] == 0) {
-      // Dot operand: double kWidth if kDim == axis.
-      if (auto dotEnc = mlir::dyn_cast<DotOperandEncodingAttr>(inEnc)) {
-        auto kWidth = dotEnc.getKWidth();
-        if (fwdInference) {
-          kWidth *= 2;
-        } else {
-          if (kWidth > 1) {
-            // bwd inference
-            kWidth /= 2;
-          } else {
-            return emitOptionalError(loc,
-                                     "Fp4ToFpOp requires at least 2 elements "
-                                     "per thread in the axis dimension");
-          }
-        }
-        outEnc = DotOperandEncodingAttr::get(ctx, dotEnc.getOpIdx(),
-                                             dotEnc.getParent(), kWidth);
-        return success();
-      }
-
-      // Blocked layout: double elemsPerThread[axis].
-      if (auto blockedEnc = mlir::dyn_cast<BlockedEncodingAttr>(inEnc)) {
-        auto sizePerThread = llvm::to_vector(blockedEnc.getSizePerThread());
-        if (fwdInference) {
-          sizePerThread[axis] *= 2;
-        } else {
-          if (sizePerThread[axis] > 1) {
-            sizePerThread[axis] /= 2;
-          } else {
-            return emitOptionalError(
-                loc, "Fp4ToFpOp requires at least 2 elements per "
-                     "thread in the axis dimension");
-          }
-        }
-        outEnc = BlockedEncodingAttr::get(
-            ctx, sizePerThread, blockedEnc.getThreadsPerWarp(),
-            blockedEnc.getWarpsPerCTA(), blockedEnc.getOrder(),
-            blockedEnc.getCTALayout());
-        return success();
-      }
-    }
-
-    auto ll = toLinearLayout(shape, inEnc);
-
-    auto kRegister = StringAttr::get(ctx, "register");
-    auto outDims = llvm::to_vector(ll.getOutDimNames());
-    LinearLayout newLl = LinearLayout::empty();
-    if (fwdInference) {
-      auto split = LinearLayout::identity1D(2, kRegister, outDims[axis]);
-      newLl = split * ll;
-      // FIXME!!!!
-      // operator* transposes the output dimensions??!! WTF
-      newLl = newLl.transposeOuts(outDims);
-    } else {
-      // TODO This requires a division algorithm!
-      // Implement manually ll.divideLeft(split)
-      auto contiguousElems =
-          LinearEncodingAttr::get(ctx, ll).getContigPerThread();
-      if (contiguousElems[axis] > 1) {
-        LinearLayout::BasesT newBases;
-        for (const auto &basesDim : ll.getBases()) {
-          std::vector<std::vector<int32_t>> newBasesDim;
-          for (auto base : basesDim.second) {
-            if (base[axis] == 1) {
-              continue;
-            }
-            base[axis] /= 2;
-            newBasesDim.push_back(std::move(base));
-          }
-          newBases.insert({basesDim.first, std::move(newBasesDim)});
-        }
-        newLl = LinearLayout(std::move(newBases), std::move(outDims));
-      } else {
-        return emitOptionalError(loc, "Fp4ToFpOp requires at least 2 elements "
-                                      "per thread in the axis dimension");
-      }
-    }
-    outEnc = LinearEncodingAttr::get(ctx, newLl);
-    return success();
-  }
 };
 
 struct TritonGPUVerifyTensorLayoutInterface
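
The legacy-layout branch removed above reduces to a simple doubling/halving rule along the packed axis: the upcast produces two fp values per packed i8, so forward inference doubles the per-thread element count (kWidth or sizePerThread[axis]) and backward inference halves it. A standalone C++ sketch of just that rule, with illustrative names that are not Triton code:

#include <cassert>
#include <cstdint>
#include <optional>
#include <vector>

std::optional<std::vector<std::int64_t>>
inferSizePerThread(std::vector<std::int64_t> sizePerThread, int axis,
                   bool fwdInference) {
  if (fwdInference) {
    sizePerThread[axis] *= 2; // one i8 unpacks into two fp elements
  } else if (sizePerThread[axis] > 1) {
    sizePerThread[axis] /= 2; // undo the doubling when inferring backwards
  } else {
    return std::nullopt; // needs at least 2 elements per thread on `axis`
  }
  return sizePerThread;
}

int main() {
  auto fwd = inferSizePerThread({1, 8}, /*axis=*/1, /*fwdInference=*/true);
  assert(fwd && (*fwd)[1] == 16);
  auto bwd = inferSizePerThread({1, 1}, /*axis=*/1, /*fwdInference=*/false);
  assert(!bwd);
  return 0;
}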

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 110 additions & 53 deletions
@@ -331,64 +331,121 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
   patterns.add<CanonicalizeConvertFromSplit>(context);
 }
 
-LogicalResult Fp4ToFpOp::verify() {
-  auto srcTy = cast<RankedTensorType>(getSrc().getType());
-  auto resTy = cast<RankedTensorType>(getResult().getType());
-  auto rank = srcTy.getRank();
-
-  if (rank != resTy.getRank())
-    return emitError() << "source rank " << rank << " != result rank "
-                       << resTy.getRank();
-
-  auto srcShape = srcTy.getShape();
-  auto resShape = resTy.getShape();
-  auto axis = getAxis();
-
-  if (!(0 <= axis && axis < rank))
-    return emitError() << "axis " << axis << " out of range for rank " << rank;
-
-  auto elemType = resTy.getElementType();
-  if (!(elemType.isBF16() || elemType.isF16()))
-    return emitError() << "only bf16 or f16 is supported for now, got "
-                       << elemType;
-
-  for (int i = 0; i < rank; ++i) {
-    if (i == axis) {
-      if (resShape[i] != srcShape[i] * 2)
-        return emitError() << "axis " << axis
-                           << " dimension must be 2x source dimension (src="
-                           << srcShape[i] << ", dst=" << resShape[i] << ")";
-    } else {
-      if (resShape[i] != srcShape[i])
-        return emitError() << "dimension " << i
-                           << " mismatch (src=" << srcShape[i]
-                           << ", dst=" << resShape[i] << ", axis=" << axis
-                           << ")";
+LogicalResult UpcastMXFPOp::verify() {
+  auto fpType = getFpType();
+
+  auto xTy = getSrc().getType();
+  auto scaleTy = getScale().getType();
+  Builder b(getContext());
+  if (xTy.getElementType() != b.getBF16Type() &&
+      xTy.getElementType() != b.getF16Type() &&
+      xTy.getElementType() != b.getI8Type()) {
+    return emitOpError(
+        "element type of the first operand must be bf16/fp16 or i8");
+  }
+
+  if (scaleTy.getElementType() != b.getI8Type()) {
+    return emitOpError("element type of the second operand must be uint8");
+  }
+
+  auto xShape = xTy.getShape();
+  auto scaleShape = scaleTy.getShape();
+
+  if (xShape.size() != scaleShape.size() || xShape.size() < 2) {
+    return emitOpError(
+        "operands must have the same number of dimensions, at least 2");
+  }
+
+  if (!(fpType == ScaleDotElemType::E2M1 || fpType == ScaleDotElemType::E4M3 ||
+        fpType == ScaleDotElemType::E5M2)) {
+    return emitOpError("NYI: fpType must be E2M1, E4M3, or E5M2");
+  }
+
+  auto layoutX = xTy.getEncoding();
+  auto layoutScale = scaleTy.getEncoding();
+  if (bool(layoutX) != bool(layoutScale)) {
+    return emitOpError(
+        "Expected either both or neither operands to have an encoding");
+  }
+  // Nothing to check if no encoding. This is used to infer the return type in
+  // AccelerateMatmul.cpp
+  if (!layoutX) {
+    return success();
+  }
+
+  auto dotEncoding = dyn_cast<DotOperandEncodingAttr>(layoutX);
+  if (!dotEncoding) {
+    return emitOpError("Expected a DotOperandEncodingAttr for values");
+  }
+  if (!isa<BlockedEncodingAttr, LinearEncodingAttr>(layoutScale)) {
+    return emitOpError(
+        "Expected a BlockOperandEncoding or LinearOperandEncoding "
+        "for scales");
+  }
+
+  if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
+    // Necessary to keep all of the scales of a given block of values in the
+    // same warp
+    auto threadsPerWarp =
+        cast<DistributedEncodingTrait>(layoutScale).getThreadsPerWarp();
+    if (threadsPerWarp != ArrayRef<unsigned>({16, 2})) {
+      return emitOpError("Expected threads per warp to be {16, 2}");
     }
   }
+
+  // Change to support fp8 types
+  const auto elemsPacked = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
+  // Figure out the K dimension for the input A/B. For A/B scale, the K
+  // dimension is always the last dimension.
+  const int opIdx = dotEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+
+  if (xShape[kIdx] != (32 / elemsPacked) * scaleShape.back()) {
+    return emitOpError("K dimension of first operand must be 16 times "
+                       "larger than last/K dimension of the second operand");
+  }
+
+  // Check other dimensions match too. For input A/B, we need to figure out the
+  // index for the M/N dimension. For scale, it's always {(batch), M/N, K}.
+  const int mnIdx = (opIdx == 0 ? 0 : 1) + hasBatch;
+  if (hasBatch && xShape[0] != scaleShape[0])
+    return emitOpError("batch dimension must match between operands");
+  if (xShape[mnIdx] != scaleShape[hasBatch]) {
+    return emitOpError("M/N dimension must match between operands");
+  }
+
   return success();
 }
 
-void Fp4ToFpOp::build(OpBuilder &builder, OperationState &state,
-                      TypedValue<RankedTensorType> src, Type elemType,
-                      int32_t axis) {
-  auto srcTy = src.getType();
-  auto shape = llvm::to_vector(srcTy.getShape());
-  auto rank = srcTy.getRank();
-  assert(0 <= axis && axis < rank);
-  shape[axis] *= 2;
-
-  Attribute inEnc = srcTy.getEncoding();
-  Attribute outEnc;
-  auto result =
-      inEnc.getDialect()
-          .getRegisteredInterface<triton::DialectInferLayoutInterface>()
-          ->inferFp4ToFpOpEncoding(shape, axis, inEnc, outEnc,
-                                   /*fwdInference=*/true, state.location);
-  assert(succeeded(result));
-
-  auto resultTy = RankedTensorType::get(shape, elemType, outEnc);
-  build(builder, state, resultTy, src, axis);
+RankedTensorType
+UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
+                               ScaleDotElemType inputElemType,
+                               Type outputElemType) {
+  MLIRContext *ctx = inputTensor.getContext();
+  auto xTy = inputTensor.getType();
+  if (inputElemType != ScaleDotElemType::E2M1)
+    return xTy;
+
+  auto xShape = xTy.getShape();
+  auto newShape = llvm::to_vector(xShape);
+  auto encoding = xTy.getEncoding();
+  if (!encoding) {
+    newShape.back() *= 2;
+    return RankedTensorType::get(xShape, outputElemType);
+  }
+
+  auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+  auto newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
+                                                  oldEncoding.getParent(),
+                                                  oldEncoding.getKWidth() * 2);
+  // Figure out the K dimension for the input A/B, given that the return
+  // type is upcasted A/B type so we need to update the proper dim size.
+  const int opIdx = oldEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+  newShape[kIdx] *= 2;
+  return RankedTensorType::get(newShape, outputElemType, newVEncoding);
 }
 
 OpFoldResult MemDescTransOp::fold(FoldAdaptor adaptor) {
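
As a worked example of the shape checks above, assuming opIdx == 0 (operand A) and no batch dimension: per the MX formats, one e8m0 scale covers a block of 32 values, and e2m1 packs two values per i8, so the stored K dimension of the values must equal (32 / elemsPacked) times the last dimension of the scale tensor, and upcasting e2m1 then doubles K again. A standalone C++ sketch of just that arithmetic (not Triton code; the shapes are made up):

#include <cassert>
#include <cstdint>

int main() {
  const std::int64_t blockSize = 32; // values covered by one e8m0 scale
  const int elemsPacked = 2;         // 2 for e2m1, 1 for e4m3/e5m2

  std::int64_t scaleShape[2] = {128, 4}; // {M, K / 32}
  std::int64_t xShape[2] = {128, 64};    // {M, K / elemsPacked}

  // verify(): packed K must be (32 / elemsPacked) * scale K.
  assert(xShape[1] == (blockSize / elemsPacked) * scaleShape[1]);

  // deduceOutputType(): upcasting e2m1 doubles the K dimension,
  // recovering K = 32 * scale K unpacked values.
  std::int64_t upcastK = xShape[1] * 2;
  assert(upcastK == blockSize * scaleShape[1]);
  return 0;
}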

0 commit comments
