
Commit 464d1f1

[AMD] Revert the AMD path of #5475 (#5911)
AMD was seeing big regressions after that PR. There are a couple of things that may need adjustment, so reverting the AMD changes for now until the AMD folks have the bandwidth to investigate.
1 parent 79a8a3b commit 464d1f1

8 files changed (+688 −227 lines)


third_party/amd/include/Dialect/TritonAMDGPU/IR/TritonAMDGPUOps.td

Lines changed: 28 additions & 0 deletions
```diff
@@ -266,6 +266,34 @@ def BufferAtomicRMWOp : TT_AMDGPU_Op<"buffer_atomic_rmw", [
   }];
 }
 
+def TTG_UpcastMXFPOp : TT_AMDGPU_Op<"upcast_mxfp", [Pure]> {
+  let summary = "Convert an mxfp tensor to bf16/fp16";
+
+  let hasVerifier = 1;
+
+  let description = [{
+    Compute the bf16 encoded in the given mxfp number as per
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+  }];
+  let arguments = (
+    ins
+    TT_Tensor:$src,
+    TT_Tensor:$scale,
+    TT_ScaleDotElemTypeAttr:$fp_type,
+    BoolAttr:$fastMath
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
+  }];
+
+  let extraClassDeclaration = [{
+    static RankedTensorType deduceOutputType(
+        TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
+  }];
+}
+
 def BufferStoreOp : TT_AMDGPU_Op<"buffer_store", [
   SameLoadStoreOperandsEncoding,
   MemoryEffects<[MemWrite<GlobalMemory>]>,
```
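Given the `assemblyFormat` above, the op's textual form looks roughly like the following. This is a hypothetical instance, not IR from the commit: the value names and shapes are made up, and the layout encodings that the verifier normally expects on the tensor types are elided for brevity.

```mlir
// Hypothetical amdgpu.upcast_mxfp instance for a packed-fp4 (e2m1) operand:
// each i8 of %src holds two fp4 values, so the 64-wide K dimension of the
// source expands to 128 bf16 elements in the result.
%r = amdgpu.upcast_mxfp %src, %scale fp_type = e2m1 {fastMath = false}
    : tensor<32x64xi8>, tensor<32x4xi8> -> tensor<32x128xbf16>
```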

third_party/amd/lib/Dialect/TritonAMDGPU/IR/Dialect.cpp

Lines changed: 108 additions & 0 deletions
```diff
@@ -133,4 +133,112 @@ LogicalResult ExtractSliceOp::verify() {
 
   return success();
 }
+
+LogicalResult UpcastMXFPOp::verify() {
+  auto fpType = getFpType();
+
+  auto xTy = getSrc().getType();
+  auto scaleTy = getScale().getType();
+  Builder b(getContext());
+  if (xTy.getElementType() != b.getBF16Type() &&
+      xTy.getElementType() != b.getF16Type() &&
+      xTy.getElementType() != b.getI8Type()) {
+    return emitOpError(
+        "element type of the first operand must be bf16/fp16 or i8");
+  }
+
+  if (scaleTy.getElementType() != b.getI8Type()) {
+    return emitOpError("element type of the second operand must be uint8");
+  }
+
+  auto xShape = xTy.getShape();
+  auto scaleShape = scaleTy.getShape();
+
+  if (xShape.size() != scaleShape.size() || xShape.size() < 2) {
+    return emitOpError(
+        "operands must have the same number of dimensions, at least 2");
+  }
+
+  if (!(fpType == ScaleDotElemType::E2M1 || fpType == ScaleDotElemType::E4M3 ||
+        fpType == ScaleDotElemType::E5M2)) {
+    return emitOpError("NYI: fpType must be E2M1, E4M3, or E5M2");
+  }
+
+  auto layoutX = xTy.getEncoding();
+  auto layoutScale = scaleTy.getEncoding();
+  if (bool(layoutX) != bool(layoutScale)) {
+    return emitOpError(
+        "Expected either both or neither operands to have an encoding");
+  }
+  // Nothing to check if no encoding. This is used to infer the return type in
+  // AccelerateMatmul.cpp.
+  if (!layoutX) {
+    return success();
+  }
+
+  auto dotEncoding = dyn_cast<gpu::DotOperandEncodingAttr>(layoutX);
+  if (!dotEncoding) {
+    return emitOpError("Expected a DotOperandEncodingAttr for values");
+  }
+  if (!isa<gpu::BlockedEncodingAttr, gpu::LinearEncodingAttr>(layoutScale)) {
+    return emitOpError(
+        "Expected a BlockedEncodingAttr or LinearEncodingAttr for scales");
+  }
+
+  // TODO: change to support fp8 scale types.
+  const auto elemsPacked = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
+  // Figure out the K dimension for the input A/B. For the A/B scale, the K
+  // dimension is always the last dimension.
+  const int opIdx = dotEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+
+  if (xShape[kIdx] != (32 / elemsPacked) * scaleShape.back()) {
+    return emitOpError("K dimension of first operand must be (32 / elemsPacked) "
+                       "times larger than last/K dimension of the second operand");
+  }
+
+  // Check other dimensions match too. For input A/B, we need to figure out the
+  // index for the M/N dimension. For scale, it's always {(batch), M/N, K}.
+  const int mnIdx = (opIdx == 0 ? 0 : 1) + hasBatch;
+  if (hasBatch && xShape[0] != scaleShape[0])
+    return emitOpError("batch dimension must match between operands");
+  if (xShape[mnIdx] != scaleShape[hasBatch]) {
+    return emitOpError("M/N dimension must match between operands");
+  }
+
+  return success();
+}
+
+RankedTensorType
+UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
+                               ScaleDotElemType inputElemType,
+                               Type outputElemType) {
+  MLIRContext *ctx = inputTensor.getContext();
+  auto xTy = inputTensor.getType();
+  if (inputElemType != ScaleDotElemType::E2M1)
+    return xTy;
+
+  auto xShape = xTy.getShape();
+  auto newShape = llvm::to_vector(xShape);
+  auto encoding = xTy.getEncoding();
+  if (!encoding) {
+    newShape.back() *= 2;
+    return RankedTensorType::get(newShape, outputElemType);
+  }
+
+  auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+  auto newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
+                                                  oldEncoding.getParent(),
+                                                  oldEncoding.getKWidth() * 2);
+  // Figure out the K dimension for the input A/B, given that the return
+  // type is the upcasted A/B type, so we need to update the proper dim size.
+  const int opIdx = oldEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+  newShape[kIdx] *= 2;
+  return RankedTensorType::get(newShape, outputElemType, newVEncoding);
+}
+
 } // namespace mlir::triton::amdgpu
```
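To make the verifier's shape rules concrete: one scale byte covers a block of 32 values, so with e2m1 packing two fp4 values per i8 the src K dimension must be 16 × the scale's last dimension (as in the e2m1 example earlier), while for e4m3/e5m2 it must be 32 ×. For e2m1, `deduceOutputType` doubles the K dimension and the encoding's `kWidth`; for the other formats the input type passes through unchanged. A hypothetical instance under those rules (names and shapes are illustrative; encodings are elided, which skips the K-dimension check, but the shapes are chosen to satisfy it anyway):

```mlir
// e5m2: one value per element and no shape change on upcast, so
// K(src) = 32 * K(scale): 128 = 32 * 4. The result type equals the src
// type, since deduceOutputType returns the input type for non-e2m1.
%b = amdgpu.upcast_mxfp %src2, %scale2 fp_type = e5m2 {fastMath = false}
    : tensor<32x128xbf16>, tensor<32x4xi8> -> tensor<32x128xbf16>
```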

third_party/amd/lib/TritonAMDGPUToLLVM/CMakeLists.txt

Lines changed: 1 addition & 1 deletion
```diff
@@ -22,7 +22,7 @@ add_triton_library(TritonAMDGPUToLLVM
   OptimizeLDSUtility.cpp
   SPMDOpToLLVM.cpp
   SchedInstructions.cpp
-  Fp4ToFpOpToLLVM.cpp
+  UpcastMXFPToLLVM.cpp
 
   DEPENDS
   TritonAMDGPUConversionPassIncGen
```

third_party/amd/lib/TritonAMDGPUToLLVM/Fp4ToFpOpToLLVM.cpp

Lines changed: 0 additions & 212 deletions
This file was deleted.

third_party/amd/lib/TritonAMDGPUToLLVM/PatternTritonGPUOpToLLVM.h

Lines changed: 4 additions & 3 deletions
```diff
@@ -38,9 +38,10 @@ void populateTritonAMDGPUToLLVMPatterns(LLVMTypeConverter &typeConverter,
                                         RewritePatternSet &patterns,
                                         PatternBenefit benefit);
 
-void populateFp4ToFpToLLVMPatterns(LLVMTypeConverter &typeConverter,
-                                   RewritePatternSet &patterns,
-                                   PatternBenefit benefit);
+void populateUpcastMXFPToLLVMPatterns(LLVMTypeConverter &typeConverter,
+                                      RewritePatternSet &patterns,
+                                      const TargetInfo &targetInfo,
+                                      PatternBenefit benefit);
 
 } // namespace mlir::triton::AMD
```

third_party/amd/lib/TritonAMDGPUToLLVM/TritonGPUToLLVM.cpp

Lines changed: 2 additions & 2 deletions
```diff
@@ -201,8 +201,8 @@ struct ConvertTritonAMDGPUToLLVM
 
     mlir::triton::AMD::populateTritonAMDGPUToLLVMPatterns(typeConverter,
                                                           patterns, AMDBenefit);
-    mlir::triton::AMD::populateFp4ToFpToLLVMPatterns(typeConverter, patterns,
-                                                     AMDBenefit);
+    mlir::triton::AMD::populateUpcastMXFPToLLVMPatterns(typeConverter, patterns,
+                                                        targetInfo, AMDBenefit);
 
     // TODO(thomas): this should probably be done in a separate step to not
     // interfere with our own lowering of arith ops. Add arith/math's patterns
```
