Commit f7aaf04

Add UpcastMXFP op to TritonIntelGPU Dialect to reduce common file changes (#3145)
1 parent 7c63a47 commit f7aaf04

6 files changed: +162 -34 lines changed


lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 5 additions & 26 deletions
@@ -392,34 +392,13 @@ UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
     return RankedTensorType::get(xShape, outputElemType);
   }
 
-  Attribute newVEncoding = nullptr;
   auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+  auto newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
+                                                  oldEncoding.getParent(),
+                                                  oldEncoding.getKWidth() * 2);
+  // Figure out the K dimension for the input A/B, given that the return
+  // type is upcasted A/B type so we need to update the proper dim size.
   const int opIdx = oldEncoding.getOpIdx();
-  // Note: For Intel the dot operands layout's kWidth parameter must match
-  // the parent's DPAS layout opsPerChannel so we need to materialize a
-  // new DPAS layout.
-  if (auto dpasEncoding =
-          dyn_cast<intel::DpasEncodingAttr>(oldEncoding.getParent())) {
-    unsigned opsPerChannel =
-        intel::DpasEncodingAttr::getOpsPerChannel(outputElemType);
-    // e2m1 is packed 2 elements per int8, we must handle continuous 2
-    // elements when upcasting to bf16
-    if (xTy.getElementType() == IntegerType::get(ctx, 8))
-      opsPerChannel *= 2;
-    auto newDpasEncoding = intel::DpasEncodingAttr::get(
-        ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(),
-        dpasEncoding.getExecutionSize(), opsPerChannel,
-        dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
-        product<unsigned>(dpasEncoding.getThreadsPerWarp()));
-    newVEncoding = DotOperandEncodingAttr::get(
-        ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
-  } else {
-    // Figure out the K dimension for the input A/B, given that the return
-    // type is upcasted A/B type so we need to update the proper dim size.
-    newVEncoding = DotOperandEncodingAttr::get(ctx, oldEncoding.getOpIdx(),
-                                               oldEncoding.getParent(),
-                                               oldEncoding.getKWidth() * 2);
-  }
   const bool hasBatch = xShape.size() == 3;
   const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
   newShape[kIdx] *= 2;

test/TritonIntelGPU/accelerate-matmul-pvc.mlir

Lines changed: 3 additions & 3 deletions
@@ -222,7 +222,7 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
 // CHECK: [[C:%.*]] = ttg.convert_layout [[CST]] : tensor<128x128xf32, [[BLOCKED2]]> -> tensor<128x128xf32, [[DPAS]]>
 // CHECK: [[CVT_ARG0:%.*]] = ttg.convert_layout [[ARG0]] : tensor<128x32xi8, [[BLOCKED]]> -> tensor<128x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[CVT_ARG1:%.*]] = ttg.convert_layout [[ARG1]] : tensor<128x2xi8, [[BLOCKED1]]> -> tensor<128x2xi8, [[BLOCKED3]]>
-// CHECK: [[UPCAST:%.*]] = ttg.upcast_mxfp [[CVT_ARG0]], [[CVT_ARG1]] fp_type = e2m1 {fastMath = false} : tensor<128x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>, tensor<128x2xi8, [[BLOCKED3]]> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>>
+// CHECK: [[UPCAST:%.*]] = triton_intel_gpu.upcast_mxfp [[CVT_ARG0]], [[CVT_ARG1]] fp_type = e2m1 {fastMath = false} : tensor<128x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>, tensor<128x2xi8, [[BLOCKED3]]> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>>
 // CHECK: [[A:%.*]] = ttg.convert_layout [[UPCAST]] : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[B:%.*]] = ttg.convert_layout [[ARG2]] : tensor<64x128xbf16, [[BLOCKED2]]> -> tensor<64x128xbf16, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[D:%.*]] = tt.dot [[A]], [[B]], [[C]] : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>> * tensor<64x128xbf16, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>> -> tensor<128x128xf32, [[DPAS]]>
@@ -239,7 +239,7 @@ module attributes {"ttg.target" = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warp
 // CHECK: [[C:%.*]] = ttg.convert_layout [[CST]] : tensor<128x128xf32, [[BLOCKED2]]> -> tensor<128x128xf32, [[DPAS]]>
 // CHECK: [[CVT_ARG0:%.*]] = ttg.convert_layout %arg0 : tensor<128x32xi8, [[BLOCKED]]> -> tensor<128x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[CVT_ARG1:%.*]] = ttg.convert_layout %arg1 : tensor<128x2xi8, [[BLOCKED1]]> -> tensor<128x2xi8, [[BLOCKED3]]>
-// CHECK: [[UPCAST:%.*]] = ttg.upcast_mxfp [[CVT_ARG0]], [[CVT_ARG1]] fp_type = e2m1 {fastMath = true} : tensor<128x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>, tensor<128x2xi8, [[BLOCKED3]]> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>>
+// CHECK: [[UPCAST:%.*]] = triton_intel_gpu.upcast_mxfp [[CVT_ARG0]], [[CVT_ARG1]] fp_type = e2m1 {fastMath = true} : tensor<128x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>, tensor<128x2xi8, [[BLOCKED3]]> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>>
 // CHECK: [[A:%.*]] = ttg.convert_layout [[UPCAST]] : tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>> -> tensor<128x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[CVT_ARG2:%.*]] = ttg.convert_layout [[ARG2]] : tensor<64x128xf8E4M3FN, [[BLOCKED2]]> -> tensor<64x128xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[B:%.*]] = tt.fp_to_fp [[CVT_ARG2]] : tensor<64x128xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>> -> tensor<64x128xbf16, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>>
@@ -285,7 +285,7 @@ module attributes {ttg.target = "xpu", "ttg.num-ctas" = 1 : i32, "ttg.num-warps"
 // CHECK: [[C:%.*]] = ttg.convert_layout [[ARG5]] : tensor<32x128xf32, [[BLOCKED4]]> -> tensor<32x128xf32, [[DPAS]]>
 // CHECK: [[CVT_ARG1:%.*]] = ttg.convert_layout [[TRANS_B]] : tensor<32x32xi8, [[BLOCKED4]]> -> tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[CVT_ARG2:%.*]] = ttg.convert_layout [[ARG2]] : tensor<32x2xi8, [[BLOCKED2]]> -> tensor<32x2xi8, [[BLOCKED6]]>
-// CHECK: [[UPCAST:%.*]] = ttg.upcast_mxfp [[CVT_ARG1]], [[CVT_ARG2]] fp_type = e2m1 {fastMath = false} : tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>, tensor<32x2xi8, [[BLOCKED6]]> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>>
+// CHECK: [[UPCAST:%.*]] = triton_intel_gpu.upcast_mxfp [[CVT_ARG1]], [[CVT_ARG2]] fp_type = e2m1 {fastMath = false} : tensor<32x32xi8, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>, tensor<32x2xi8, [[BLOCKED6]]> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>>
 // CHECK: [[A:%.*]] = ttg.convert_layout [[UPCAST]] : tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS1]], kWidth = 4}>> -> tensor<32x64xbf16, #ttg.dot_op<{opIdx = 0, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[CVT_ARG0:%.*]] = ttg.convert_layout [[TRANS_A]] : tensor<64x128xf8E4M3FN, [[BLOCKED5]]> -> tensor<64x128xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>>
 // CHECK: [[B:%.*]] = tt.fp_to_fp [[CVT_ARG0]] : tensor<64x128xf8E4M3FN, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>> -> tensor<64x128xbf16, #ttg.dot_op<{opIdx = 1, parent = [[DPAS]], kWidth = 2}>>

third_party/intel/include/Dialect/TritonIntelGPU/IR/TritonIntelGPUOps.td

Lines changed: 29 additions & 0 deletions
@@ -202,4 +202,33 @@ def TTIG_SubGroupTransposeOp
   let hasVerifier = 1;
 }
 
+// The same as ttg.upcast_mxfp, but we want Dot Layout from Dpas layout for input tensor
+def TTIG_UpcastMXFPOp : TTIG_Op<"upcast_mxfp", [Pure]> {
+  let summary = "Convert an mxfp tensor to bf16/fp16";
+
+  let hasVerifier = 1;
+
+  let description = [{
+    Compute the bf16 encoded in the given mxfp number as per
+    https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+  }];
+  let arguments = (
+    ins
+    TT_Tensor:$src,
+    TT_Tensor:$scale,
+    TT_ScaleDotElemTypeAttr:$fp_type,
+    BoolAttr:$fastMath
+  );
+  let results = (outs TT_Tensor:$result);
+
+  let assemblyFormat = [{
+    $src `,` $scale `fp_type` `=` $fp_type attr-dict `:` type($src) `,` type($scale) `->` type($result)
+  }];
+
+  let extraClassDeclaration = [{
+    static RankedTensorType deduceOutputType(
+        TypedValue<RankedTensorType> inputTensor, ScaleDotElemType inputElemType, Type outputElemType);
+  }];
+}
+
 #endif
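
For reference, a minimal sketch of the textual form this assemblyFormat produces (illustrative only; the shapes mirror the accelerate-matmul-pvc.mlir checks above, and #dot_a, #blocked, and #dot_a_upcast are placeholder names for concrete layout attributes, not aliases defined in this commit):

%r = triton_intel_gpu.upcast_mxfp %src, %scale fp_type = e2m1 {fastMath = false}
    : tensor<128x32xi8, #dot_a>, tensor<128x2xi8, #blocked>
      -> tensor<128x64xbf16, #dot_a_upcast>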

third_party/intel/lib/Dialect/TritonIntelGPU/IR/Ops.cpp

Lines changed: 120 additions & 0 deletions
@@ -209,4 +209,124 @@ LogicalResult SubGroupTransposeOp::verify() {
   return success();
 }
 
+LogicalResult UpcastMXFPOp::verify() {
+  auto fpType = getFpType();
+
+  auto xTy = getSrc().getType();
+  auto scaleTy = getScale().getType();
+  Builder b(getContext());
+  if (xTy.getElementType() != b.getBF16Type() &&
+      xTy.getElementType() != b.getF16Type() &&
+      xTy.getElementType() != b.getI8Type()) {
+    return emitOpError(
+        "element type of the first operand must be bf16/fp16 or i8");
+  }
+
+  if (scaleTy.getElementType() != b.getI8Type()) {
+    return emitOpError("element type of the second operand must be uint8");
+  }
+
+  auto xShape = xTy.getShape();
+  auto scaleShape = scaleTy.getShape();
+
+  if (xShape.size() != scaleShape.size() || xShape.size() < 2) {
+    return emitOpError(
+        "operands must have the same number of dimensions, at least 2");
+  }
+
+  if (!(fpType == ScaleDotElemType::E2M1 || fpType == ScaleDotElemType::E4M3 ||
+        fpType == ScaleDotElemType::E5M2)) {
+    return emitOpError("NYI: fpType must be E2M1, E4M3, or E5M2");
+  }
+
+  auto layoutX = xTy.getEncoding();
+  auto layoutScale = scaleTy.getEncoding();
+  if (bool(layoutX) != bool(layoutScale)) {
+    return emitOpError(
+        "Expected either both or neither operands to have an encoding");
+  }
+  // Nothing to check if no encoding. This is used to infer the return type in
+  // AccelerateMatmul.cpp
+  if (!layoutX) {
+    return success();
+  }
+
+  auto dotEncoding = dyn_cast<DotOperandEncodingAttr>(layoutX);
+  if (!dotEncoding) {
+    return emitOpError("Expected a DotOperandEncodingAttr for values");
+  }
+  if (!isa<BlockedEncodingAttr, LinearEncodingAttr>(layoutScale)) {
+    return emitOpError(
+        "Expected a BlockOperandEncoding or LinearOperandEncoding "
+        "for scales");
+  }
+
+  // Change to support fp8 types
+  const auto elemsPacked = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
+  // Figure out the K dimension for the input A/B. For A/B scale, the K
+  // dimension is always the last dimension.
+  const int opIdx = dotEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+
+  if (xShape[kIdx] != (32 / elemsPacked) * scaleShape.back()) {
+    return emitOpError("K dimension of first operand must be 16 times "
+                       "larger than last/K dimension of the second operand");
+  }
+
+  // Check other dimensions match too. For input A/B, we need to figure out the
+  // index for the M/N dimension. For scale, it's always {(batch), M/N, K}.
+  const int mnIdx = (opIdx == 0 ? 0 : 1) + hasBatch;
+  if (hasBatch && xShape[0] != scaleShape[0])
+    return emitOpError("batch dimension must match between operands");
+  if (xShape[mnIdx] != scaleShape[hasBatch]) {
+    return emitOpError("M/N dimension must match between operands");
+  }
+
+  return success();
+}
+
+RankedTensorType
+UpcastMXFPOp::deduceOutputType(TypedValue<RankedTensorType> inputTensor,
+                               ScaleDotElemType inputElemType,
+                               Type outputElemType) {
+  MLIRContext *ctx = inputTensor.getContext();
+  auto xTy = inputTensor.getType();
+  if (inputElemType != ScaleDotElemType::E2M1)
+    return xTy;
+
+  auto xShape = xTy.getShape();
+  auto newShape = llvm::to_vector(xShape);
+  auto encoding = xTy.getEncoding();
+  if (!encoding) {
+    newShape.back() *= 2;
+    return RankedTensorType::get(xShape, outputElemType);
+  }
+
+  auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
+  const int opIdx = oldEncoding.getOpIdx();
+  // Note: For Intel the dot operands layout's kWidth parameter must match
+  // the parent's DPAS layout opsPerChannel so we need to materialize a
+  // new DPAS layout.
+  auto dpasEncoding = cast<intel::DpasEncodingAttr>(oldEncoding.getParent());
+  unsigned opsPerChannel =
+      intel::DpasEncodingAttr::getOpsPerChannel(outputElemType);
+  // e2m1 is packed 2 elements per int8, we must handle continuous 2
+  // elements when upcasting to bf16
+  if (xTy.getElementType() == IntegerType::get(ctx, 8))
+    opsPerChannel *= 2;
+  auto newDpasEncoding = intel::DpasEncodingAttr::get(
+      ctx, dpasEncoding.getRepeatCount(), dpasEncoding.getSystolicDepth(),
+      dpasEncoding.getExecutionSize(), opsPerChannel,
+      dpasEncoding.getWarpsPerCTA(), dpasEncoding.getRepCluster(),
+      product<unsigned>(dpasEncoding.getThreadsPerWarp()));
+  Attribute newVEncoding = DotOperandEncodingAttr::get(
+      ctx, opIdx, newDpasEncoding, newDpasEncoding.getOpsPerChannel());
+
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+  newShape[kIdx] *= 2;
+  return RankedTensorType::get(newShape, outputElemType, newVEncoding);
+}
+
 } // namespace mlir::triton::gpu::intel
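
As a worked check of the verifier and deduceOutputType arithmetic, using the e2m1 shapes from the test above (a hand-computed example, not part of the commit): for fp_type = e2m1 the source packs two fp4 values per i8, so elemsPacked = 2 and the source K dimension must equal (32 / 2) * scaleK = 16 * scaleK. With src = tensor<128x32xi8> (opIdx = 0, so kIdx = 1) and scale = tensor<128x2xi8>, the check 32 == 16 * 2 passes; deduceOutputType then doubles K and doubles the DPAS opsPerChannel for the i8-packed input, which yields tensor<128x64xbf16> with the kWidth = 4 dot-operand layout seen in the CHECK lines.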

third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 2 additions & 2 deletions
@@ -12,7 +12,7 @@
 
 using namespace mlir;
 using namespace mlir::triton;
-using namespace mlir::triton::gpu;
+using namespace mlir::triton::gpu::intel;
 
 namespace {
 
@@ -80,7 +80,7 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
     // kWidth here is the contiguous number of elements each thread access.
     unsigned kWidth = dpasEnc.getOpsPerChannel() / 2;
     unsigned numMxfp =
-        TritonGPUDialect::TritonGPUDialect::getThreadsPerWarp(mod) / instShapeM;
+        triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod) / instShapeM;
     unsigned mxfpSize = repSize * subTileSize * kWidth;
     constexpr unsigned numScales = 16;

third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 3 additions & 3 deletions
@@ -423,10 +423,10 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     if (!scale)
       return v;
 
-    auto retTy = triton::gpu::UpcastMXFPOp::deduceOutputType(
+    auto retTy = triton::gpu::intel::UpcastMXFPOp::deduceOutputType(
         v, elemType, Builder(v.getContext()).getBF16Type());
-    return rewriter.create<ttg::UpcastMXFPOp>(v.getLoc(), retTy, v, scale,
-                                               elemType, fastMath);
+    return rewriter.create<ttgi::UpcastMXFPOp>(v.getLoc(), retTy, v, scale,
+                                               elemType, fastMath);
   }
 };
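
Here ttgi is presumably the local namespace alias for mlir::triton::gpu::intel used in AccelerateMatmul.cpp (an assumption; the alias definition is outside this diff), so the decomposition now builds the Intel dialect's UpcastMXFPOp added above instead of the common ttg.upcast_mxfp.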
