Skip to content

Commit 929142b

Browse files
authored
[MXFP] Implement SW emulation of dot_scale as a decomposition (#5475)
The goal of this PR is to remove the shuffles that were necessary to distribute the scale within `dot_scale`. Instead, we simply rely on layout propagation to load the scales in the right layout from smem. In this PR we do a number of things:
- Implement a new `fp4_to_fp` op with full forward and backward type inference
- Remove UpcastMXFP
- Remove all the complex layout choices within the `dot_scale` decomposition and instead rely on the accelerate-matmul pass and remove-layout-conversions to do the right thing
- Decompose `dot_scale` into simple Triton ops so that the pass can be shared between all backends

Still to do:
- Split `DecomposeScaledBlocked` into its own file and share the pass between NVIDIA and AMD
1 parent ff77e98 commit 929142b

File tree

29 files changed

+1252
-1356
lines changed

29 files changed

+1252
-1356
lines changed

include/triton/Dialect/Triton/IR/Dialect.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,11 @@ class DialectInferLayoutInterface
8383
virtual LogicalResult
8484
verifyDotOpEncodingCompatibility(Operation *op, Attribute operandEncodingA,
8585
Attribute operandEncodingB) const = 0;
86+
87+
virtual LogicalResult
88+
inferFp4ToFpOpEncoding(ArrayRef<int64_t> shape, int axis, Attribute inEnc,
89+
Attribute &outEnc, bool fwdInference,
90+
std::optional<Location> loc) const = 0;
8691
};
8792

8893
class DialectVerifyTensorLayoutInterface

include/triton/Dialect/Triton/IR/TritonOps.td

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -456,6 +456,10 @@ def TT_ReshapeOp : TT_Op<"reshape", [Pure,
456456
If efficient_layout is set, this is a hint that the destination layout should be kept for performance reason.
457457
The compiler is still free to change it for better performance.
458458
}];
459+
let builders = [
460+
OpBuilder<(ins "ArrayRef<int64_t>":$shape, "TypedValue<RankedTensorType>":$src)>
461+
];
462+
459463
let arguments = (ins TT_Tensor:$src, UnitAttr:$allow_reorder, UnitAttr:$efficient_layout);
460464
let results = (outs TT_Tensor:$result);
461465
let assemblyFormat = "$src (`allow_reorder` $allow_reorder^)? (`efficient_layout` $efficient_layout^)? attr-dict `:` type($src) `->` type($result)";

include/triton/Dialect/TritonGPU/IR/TritonGPUOps.td

Lines changed: 16 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -283,31 +283,29 @@ def TTG_LocalStoreOp : TTG_Op<"local_store", [DeclareOpInterfaceMethods<MemoryEf
283283
}];
284284
}
285285

286-
def TTG_UpcastMXFPOp : TTG_Op<"upcast_mxfp", [Pure]> {
287-
let summary = "Convert an mxfp tensor to bf16/fp16";
286+
def TTG_Fp4ToFpOp : TTG_Op<"fp4_to_fp", [Pure]> {
287+
let summary = "Upcast fp4 (e2m1) to fp";
288288

289289
let hasVerifier = 1;
290290

291291
let description = [{
292-
Compute the bf16 encoded in the given mxfp number as per
293-
https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
294-
}];
295-
let arguments = (
296-
ins
297-
TT_Tensor:$src,
298-
TT_Tensor:$scale,
299-
TT_ScaleDotElemTypeAttr:$fp_type,
300-
BoolAttr:$fastMath
301-
);
302-
let results = (outs TT_Tensor:$result);
292+
Upcast fp4 (e2m1) represented packed as i8s to fp.
303293

304-
let assemblyFormat = [{
305-
$src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
294+
The lower 4 bits of the i8s represent the first fp4 element, and the upper 4 bits
295+
the second fp4 element.
296+
297+
The `axis` attribute specifies the axis along which the fp4 elements are packed.
306298
}];
307299

308-
let extraClassDeclaration = [{
309-
static RankedTensorType deduceOutputType(
310-
TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
300+
let builders = [
301+
OpBuilder<(ins "TypedValue<RankedTensorType>":$src, "Type":$elemType, "int32_t":$axis)>
302+
];
303+
304+
let arguments = (ins RankedTensorOf<[I8]>:$src, I32Attr:$axis);
305+
let results = (outs TT_FloatTensor:$result);
306+
307+
let assemblyFormat = [{
308+
$src attr-dict `:` type($src) `->` type($result)
311309
}];
312310
}
313311

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
#include "mlir/IR/PatternMatch.h"
2+
3+
namespace mlir::triton::gpu {
4+
5+
void populateDecomposeScaledBlockedPatterns(mlir::RewritePatternSet &patterns,
6+
int benefit);
7+
8+
} // namespace mlir::triton::gpu

lib/Dialect/Triton/IR/Ops.cpp

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -693,6 +693,23 @@ OpFoldResult ExpandDimsOp::fold(FoldAdaptor adaptor) {
693693
}
694694

695695
//-- ReshapeOp --
696+
697+
void ReshapeOp::build(OpBuilder &builder, OperationState &state,
698+
ArrayRef<int64_t> shape,
699+
TypedValue<RankedTensorType> src) {
700+
auto srcTy = src.getType();
701+
auto srcEnc = srcTy.getEncoding();
702+
Attribute dstEnc;
703+
if (srcEnc) {
704+
auto result = cast<DialectInferLayoutInterface>(&srcEnc.getDialect())
705+
->inferReshapeOpEncoding(srcTy.getShape(), srcEnc, shape,
706+
dstEnc, state.location);
707+
assert(succeeded(result));
708+
}
709+
auto dstTy = RankedTensorType::get(shape, srcTy.getElementType(), dstEnc);
710+
build(builder, state, dstTy, src);
711+
}
712+
696713
LogicalResult ReshapeOp::canonicalize(ReshapeOp op, PatternRewriter &rewriter) {
697714
if (op.getEfficientLayout())
698715
return failure();
@@ -769,6 +786,10 @@ LogicalResult ReshapeOp::verify() {
769786
OpFoldResult FpToFpOp::fold(FoldAdaptor adaptor) {
770787
auto srcVal = getSrc();
771788
auto dstTy = getType();
789+
// Fold trivial cast
790+
if (srcVal.getType() == dstTy) {
791+
return srcVal;
792+
}
772793

773794
auto resElemType = cast<FloatType>(getElementTypeOrSelf(getType()));
774795
const llvm::fltSemantics &semantic = resElemType.getFloatSemantics();

lib/Dialect/TritonGPU/IR/Dialect.cpp

Lines changed: 92 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2843,6 +2843,98 @@ struct TritonGPUInferLayoutInterface
28432843
ArrayRef(enc.getCTAOrder()).drop_front(1)));
28442844
return success();
28452845
}
2846+
2847+
LogicalResult
2848+
inferFp4ToFpOpEncoding(ArrayRef<int64_t> shape, int axis, Attribute inEnc,
2849+
Attribute &outEnc, bool fwdInference,
2850+
std::optional<Location> loc) const override {
2851+
// We implement two legacy layout propagations
2852+
// Once we fully migrate to LinearLayouts, we can remove these.
2853+
auto *ctx = getContext();
2854+
auto rank = shape.size();
2855+
// The output encoding will only be a legacy encoding if the axis is the
2856+
// fastest running dimension.
2857+
if (getOrder(inEnc)[axis] == 0) {
2858+
// Dot operand: double kWidth if kDim == axis.
2859+
if (auto dotEnc = mlir::dyn_cast<DotOperandEncodingAttr>(inEnc)) {
2860+
auto kWidth = dotEnc.getKWidth();
2861+
if (fwdInference) {
2862+
kWidth *= 2;
2863+
} else {
2864+
if (kWidth > 1) {
2865+
// bwd inference
2866+
kWidth /= 2;
2867+
} else {
2868+
return emitOptionalError(loc,
2869+
"Fp4ToFpOp requires at least 2 elements "
2870+
"per thread in the axis dimension");
2871+
}
2872+
}
2873+
outEnc = DotOperandEncodingAttr::get(ctx, dotEnc.getOpIdx(),
2874+
dotEnc.getParent(), kWidth);
2875+
return success();
2876+
}
2877+
2878+
// Blocked layout: double elemsPerThread[axis].
2879+
if (auto blockedEnc = mlir::dyn_cast<BlockedEncodingAttr>(inEnc)) {
2880+
auto sizePerThread = llvm::to_vector(blockedEnc.getSizePerThread());
2881+
if (fwdInference) {
2882+
sizePerThread[axis] *= 2;
2883+
} else {
2884+
if (sizePerThread[axis] > 1) {
2885+
sizePerThread[axis] /= 2;
2886+
} else {
2887+
return emitOptionalError(
2888+
loc, "Fp4ToFpOp requires at least 2 elements per "
2889+
"thread in the axis dimension");
2890+
}
2891+
}
2892+
outEnc = BlockedEncodingAttr::get(
2893+
ctx, sizePerThread, blockedEnc.getThreadsPerWarp(),
2894+
blockedEnc.getWarpsPerCTA(), blockedEnc.getOrder(),
2895+
blockedEnc.getCTALayout());
2896+
return success();
2897+
}
2898+
}
2899+
2900+
auto ll = toLinearLayout(shape, inEnc);
2901+
2902+
auto kRegister = StringAttr::get(ctx, "register");
2903+
auto outDims = llvm::to_vector(ll.getOutDimNames());
2904+
LinearLayout newLl = LinearLayout::empty();
2905+
if (fwdInference) {
2906+
auto split = LinearLayout::identity1D(2, kRegister, outDims[axis]);
2907+
newLl = split * ll;
2908+
// FIXME!!!!
2909+
// operator* transposes the output dimensions??!! WTF
2910+
newLl = newLl.transposeOuts(outDims);
2911+
} else {
2912+
// TODO This requires a division algorithm!
2913+
// Implement manually ll.divideLeft(split)
2914+
auto contiguousElems =
2915+
LinearEncodingAttr::get(ctx, ll).getContigPerThread();
2916+
if (contiguousElems[axis] > 1) {
2917+
LinearLayout::BasesT newBases;
2918+
for (const auto &basesDim : ll.getBases()) {
2919+
std::vector<std::vector<int32_t>> newBasesDim;
2920+
for (auto base : basesDim.second) {
2921+
if (base[axis] == 1) {
2922+
continue;
2923+
}
2924+
base[axis] /= 2;
2925+
newBasesDim.push_back(std::move(base));
2926+
}
2927+
newBases.insert({basesDim.first, std::move(newBasesDim)});
2928+
}
2929+
newLl = LinearLayout(std::move(newBases), std::move(outDims));
2930+
} else {
2931+
return emitOptionalError(loc, "Fp4ToFpOp requires at least 2 elements "
2932+
"per thread in the axis dimension");
2933+
}
2934+
}
2935+
outEnc = LinearEncodingAttr::get(ctx, newLl);
2936+
return success();
2937+
}
28462938
};
28472939

28482940
struct TritonGPUVerifyTensorLayoutInterface

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 53 additions & 110 deletions
Original file line numberDiff line numberDiff line change
@@ -329,121 +329,64 @@ void ConvertLayoutOp::getCanonicalizationPatterns(RewritePatternSet &patterns,
329329
patterns.add<CanonicalizeConvertFromSplit>(context);
330330
}
331331

332-
LogicalResult UpcastMXFPOp::verify() {
333-
auto fpType = getFpType();
334-
335-
auto xTy = getSrc().getType();
336-
auto scaleTy = getScale().getType();
337-
Builder b(getContext());
338-
if (xTy.getElementType() != b.getBF16Type() &&
339-
xTy.getElementType() != b.getF16Type() &&
340-
xTy.getElementType() != b.getI8Type()) {
341-
return emitOpError(
342-
"element type of the first operand must be bf16/fp16 or i8");
343-
}
344-
345-
if (scaleTy.getElementType() != b.getI8Type()) {
346-
return emitOpError("element type of the second operand must be uint8");
347-
}
348-
349-
auto xShape = xTy.getShape();
350-
auto scaleShape = scaleTy.getShape();
351-
352-
if (xShape.size() != scaleShape.size() || xShape.size() < 2) {
353-
return emitOpError(
354-
"operands must have the same number of dimensions, at least 2");
355-
}
356-
357-
if (!(fpType == ScaleDotElemType::E2M1 || fpType == ScaleDotElemType::E4M3 ||
358-
fpType == ScaleDotElemType::E5M2)) {
359-
return emitOpError("NYI: fpType must be E2M1, E4M3, or E5M2");
360-
}
361-
362-
auto layoutX = xTy.getEncoding();
363-
auto layoutScale = scaleTy.getEncoding();
364-
if (bool(layoutX) != bool(layoutScale)) {
365-
return emitOpError(
366-
"Expected either both or neither operands to have an encoding");
367-
}
368-
// Nothing to check if no encoding. This is used to infer the return type in
369-
// AccelerateMatmul.cpp
370-
if (!layoutX) {
371-
return success();
372-
}
373-
374-
auto dotEncoding = dyn_cast<DotOperandEncodingAttr>(layoutX);
375-
if (!dotEncoding) {
376-
return emitOpError("Expected a DotOperandEncodingAttr for values");
377-
}
378-
if (!isa<BlockedEncodingAttr, LinearEncodingAttr>(layoutScale)) {
379-
return emitOpError(
380-
"Expected a BlockOperandEncoding or LinearOperandEncoding "
381-
"for scales");
382-
}
383-
384-
if (isa<NvidiaMmaEncodingAttr>(dotEncoding.getParent())) {
385-
// Necessary to keep all of the scales of a given block of values in the
386-
// same warp
387-
auto threadsPerWarp =
388-
cast<DistributedEncodingTrait>(layoutScale).getThreadsPerWarp();
389-
if (threadsPerWarp != ArrayRef<unsigned>({16, 2})) {
390-
return emitOpError("Expected threads per warp to be {16, 2}");
332+
LogicalResult Fp4ToFpOp::verify() {
333+
auto srcTy = cast<RankedTensorType>(getSrc().getType());
334+
auto resTy = cast<RankedTensorType>(getResult().getType());
335+
auto rank = srcTy.getRank();
336+
337+
if (rank != resTy.getRank())
338+
return emitError() << "source rank " << rank << " != result rank "
339+
<< resTy.getRank();
340+
341+
auto srcShape = srcTy.getShape();
342+
auto resShape = resTy.getShape();
343+
auto axis = getAxis();
344+
345+
if (!(0 <= axis && axis < rank))
346+
return emitError() << "axis " << axis << " out of range for rank " << rank;
347+
348+
auto elemType = resTy.getElementType();
349+
if (!(elemType.isBF16() || elemType.isF16()))
350+
return emitError() << "only bf16 or f16 is supported for now, got "
351+
<< elemType;
352+
353+
for (int i = 0; i < rank; ++i) {
354+
if (i == axis) {
355+
if (resShape[i] != srcShape[i] * 2)
356+
return emitError() << "axis " << axis
357+
<< " dimension must be 2x source dimension (src="
358+
<< srcShape[i] << ", dst=" << resShape[i] << ")";
359+
} else {
360+
if (resShape[i] != srcShape[i])
361+
return emitError() << "dimension " << i
362+
<< " mismatch (src=" << srcShape[i]
363+
<< ", dst=" << resShape[i] << ", axis=" << axis
364+
<< ")";
391365
}
392366
}
393-
394-
// Change to support fp8 types
395-
const auto elemsPacked = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
396-
// Figure out the K dimension for the input A/B. For A/B scale, the K
397-
// dimension is always the last dimension.
398-
const int opIdx = dotEncoding.getOpIdx();
399-
const bool hasBatch = xShape.size() == 3;
400-
const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
401-
402-
if (xShape[kIdx] != (32 / elemsPacked) * scaleShape.back()) {
403-
return emitOpError("K dimension of first operand must be 16 times "
404-
"larger than last/K dimension of the second operand");
405-
}
406-
407-
// Check other dimensions match too. For input A/B, we need to figure out the
408-
// index for the M/N dimension. For scale, it's always {(batch), M/N, K}.
409-
const int mnIdx = (opIdx == 0 ? 0 : 1) + hasBatch;
410-
if (hasBatch && xShape[0] != scaleShape[0])
411-
return emitOpError("batch dimension must match between operands");
412-
if (xShape[mnIdx] != scaleShape[hasBatch]) {
413-
return emitOpError("M/N dimension must match between operands");
414-
}
415-
416367
return success();
417368
}
418369

419-
RankedTensorType
420-
UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
421-
ScaleDotElemType inputElemType,
422-
Type outputElemType) {
423-
MLIRContext *ctx = inputTensor.getContext();
424-
auto xTy = inputTensor.getType();
425-
if (inputElemType != ScaleDotElemType::E2M1)
426-
return xTy;
427-
428-
auto xShape = xTy.getShape();
429-
auto newShape = llvm::to_vector(xShape);
430-
auto encoding = xTy.getEncoding();
431-
if (!encoding) {
432-
newShape.back() *= 2;
433-
return RankedTensorType::get(xShape, outputElemType);
434-
}
435-
436-
auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
437-
auto newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
438-
oldEncoding.getParent(),
439-
oldEncoding.getKWidth() * 2);
440-
// Figure out the K dimension for the input A/B, given that the return
441-
// type is upcasted A/B type so we need to update the proper dim size.
442-
const int opIdx = oldEncoding.getOpIdx();
443-
const bool hasBatch = xShape.size() == 3;
444-
const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
445-
newShape[kIdx] *= 2;
446-
return RankedTensorType::get(newShape, outputElemType, newVEncoding);
370+
void Fp4ToFpOp::build(OpBuilder &builder, OperationState &state,
371+
TypedValue<RankedTensorType> src, Type elemType,
372+
int32_t axis) {
373+
auto srcTy = src.getType();
374+
auto shape = llvm::to_vector(srcTy.getShape());
375+
auto rank = srcTy.getRank();
376+
assert(0 <= axis && axis < rank);
377+
shape[axis] *= 2;
378+
379+
Attribute inEnc = srcTy.getEncoding();
380+
Attribute outEnc;
381+
auto result =
382+
inEnc.getDialect()
383+
.getRegisteredInterface<triton::DialectInferLayoutInterface>()
384+
->inferFp4ToFpOpEncoding(shape, axis, inEnc, outEnc,
385+
/*fwdInference=*/true, state.location);
386+
assert(succeeded(result));
387+
388+
auto resultTy = RankedTensorType::get(shape, elemType, outEnc);
389+
build(builder, state, resultTy, src, axis);
447390
}
448391

449392
OpFoldResult MemDescTransOp::fold(FoldAdaptor adaptor) {

0 commit comments

Comments
 (0)