
Commit 48feaa4

Merge branch 'liyang/upcast_mxfp_and_dot_scaled' of https://github.com/intel/intel-xpu-backend-for-triton into liyang/upcast_mxfp_and_dot_scaled
2 parents: bc32cd2 + b36c35e

File tree: 8 files changed, 55 additions and 48 deletions

  lib/Dialect/TritonGPU/IR/Ops.cpp
  python/test/unit/language/test_core.py
  third_party/intel/include/Analysis/DPAS.h
  third_party/intel/lib/Analysis/DPAS.cpp
  third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp
  third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp
  third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h
  third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 3 additions & 2 deletions
@@ -152,9 +152,10 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
   } else if (auto oldEncoding = dyn_cast<BlockedEncodingAttr>(encoding)) {
     // TODO: Temporary code, remove once upcast_mxfp support dot encoding.
     assert(!tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING"));
-    newShape.back() *= 2;
     SmallVector<unsigned> sizePerThread = oldEncoding.getSizePerThread();
-    sizePerThread.back() *= 2;
+    int opIdx = sizePerThread.back() == 1 ? 1 : 0;
+    sizePerThread[!opIdx] *= 2;
+    newShape[!opIdx] *= 2;
     newVEncoding = BlockedEncodingAttr::get(
         ctx, sizePerThread, oldEncoding.getThreadsPerWarp(),
         oldEncoding.getWarpsPerCTA(), oldEncoding.getCTAOrder(),
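Note on the Ops.cpp change: because two E2M1 values are packed into one i8, upcasting doubles the operand's K extent, and which dimension is K depends on whether the tensor is operand A or B. Below is a minimal host-side sketch of that shape rule, assuming 2-D operands with the usual A = M x K, B = K x N convention; the helper name upcastedShape is illustrative and not part of the Triton code.

#include <array>
#include <cassert>
#include <cstdint>
#include <cstdio>

// For a packed E2M1 operand the K dimension doubles after upcasting:
// operand A (opIdx == 0) is M x K, so dim 1 doubles; operand B (opIdx == 1)
// is K x N, so dim 0 doubles.
std::array<int64_t, 2> upcastedShape(std::array<int64_t, 2> shape, int opIdx) {
  assert(opIdx == 0 || opIdx == 1);
  shape[opIdx == 0 ? 1 : 0] *= 2;
  return shape;
}

int main() {
  auto a = upcastedShape({128, 32}, 0); // A: 128 x 32 packed i8 -> 128 x 64 bf16
  auto b = upcastedShape({32, 128}, 1); // B: 32 x 128 packed i8 -> 64 x 128 bf16
  std::printf("A: %lld x %lld, B: %lld x %lld\n", (long long)a[0],
              (long long)a[1], (long long)b[0], (long long)b[1]);
  return 0;
}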

python/test/unit/language/test_core.py

Lines changed: 0 additions & 3 deletions
@@ -3440,9 +3440,6 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, nu
         pytest.skip(f"scaled_dot({normal_type}, {mxfp_type}) only implemented for MI300")
     if mma == 16 and K == 64:
         pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
-    if is_xpu():
-        if rhs_scale:
-            pytest.skip("scaled_dot with rhs_scale not supported on XPU")
 
 @triton.jit
 def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, stride_b1, b_scale, out,

third_party/intel/include/Analysis/DPAS.h

Lines changed: 2 additions & 2 deletions
@@ -24,6 +24,8 @@ class DPASAnalysis {
     FP32_FP32_TF32_TF32,
     FP16_FP16_FP16_FP16,
     BF16_BF16_BF16_BF16,
+    U32_U32_U8_U8,
+    S32_S32_S8_S8,
     // data types for dot scaled.
     FP32_FP32_BF16_FP8,
     FP32_FP32_BF16_FP4,
@@ -32,8 +34,6 @@ class DPASAnalysis {
     FP32_FP32_FP8_FP4,
     FP32_FP32_FP4_BF16,
     FP32_FP32_FP4_FP8,
-    U32_U32_U8_U8,
-    S32_S32_S8_S8,
     NOT_APPLICABLE
   };
 

third_party/intel/lib/Analysis/DPAS.cpp

Lines changed: 3 additions & 3 deletions
@@ -150,7 +150,8 @@ DPASAnalysis::getDPASType(OpTy op) {
     if (aElemTy.isBF16() &&
         (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2()))
       return DPASEngineType::FP32_FP32_BF16_FP8;
-    if (aElemTy.isBF16() && bElemTy.isFloat4E2M1FN())
+    // 2 E2M1 are packed into 1 int8
+    if (aElemTy.isBF16() && bElemTy.isInteger(8))
       return DPASEngineType::FP32_FP32_BF16_FP4;
     if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) &&
         bElemTy.isBF16())
@@ -159,9 +160,8 @@ DPASAnalysis::getDPASType(OpTy op) {
         (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2()))
       return DPASEngineType::FP32_FP32_FP8_FP8;
     if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) &&
-        bElemTy.isFloat4E2M1FN())
+        bElemTy.isInteger(8))
       return DPASEngineType::FP32_FP32_FP8_FP4;
-    // 2 E2M1 are packed into 1 int8
     if (aElemTy.isInteger(8) && bElemTy.isBF16())
       return DPASEngineType::FP32_FP32_FP4_BF16;
     if (aElemTy.isInteger(8) &&
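Note on the DPAS.cpp change: by the time operands reach getDPASType, a packed E2M1 operand is represented as i8, with two 4-bit values per byte (as the comment in the hunk says), which is why the checks switch from isFloat4E2M1FN() to isInteger(8). Below is a minimal sketch of that nibble packing, assuming low-nibble-first order; packE2M1 and unpackE2M1 are illustrative names, not Triton helpers.

#include <cassert>
#include <cstdint>
#include <utility>

// Pack two 4-bit E2M1 encodings (each already a value in 0..15) into one
// byte, low nibble first, and recover them again.
inline uint8_t packE2M1(uint8_t lo, uint8_t hi) {
  return static_cast<uint8_t>((lo & 0xF) | ((hi & 0xF) << 4));
}

inline std::pair<uint8_t, uint8_t> unpackE2M1(uint8_t byte) {
  return {static_cast<uint8_t>(byte & 0xF), static_cast<uint8_t>(byte >> 4)};
}

int main() {
  uint8_t packed = packE2M1(0x3, 0xA);
  auto [lo, hi] = unpackE2M1(packed);
  assert(lo == 0x3 && hi == 0xA);
  return 0;
}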

third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 20 additions & 2 deletions
@@ -17,6 +17,24 @@ using namespace mlir::triton::gpu;
 
 namespace {
 
+static Value mxfpScaleBf16(ConversionPatternRewriter &rewriter, Location loc,
+                           Value v, Value scale) {
+  Value vBf16 = bitcast(v, bf16_ty);
+  Value nanBf16 = bitcast(i16_val(0x7fff), bf16_ty);
+  Value scaleIsNan = icmp_eq(scale, i8_val(0xff));
+  Value scaleBf16 = bitcast(shl(zext(i16_ty, scale), i16_val(7)), bf16_ty);
+
+  Value v0 = mlir::triton::intel::convertBf16ToFp32(loc, rewriter, vBf16);
+  Value v1 = mlir::triton::intel::convertBf16ToFp32(loc, rewriter, scaleBf16);
+  auto result = rewriter.create<LLVM::FMulOp>(loc, f32_ty, v0, v1);
+  auto undefRounding = static_cast<mlir::triton::RoundingMode>(-1);
+  Value scaledBf16 = mlir::triton::intel::convertFp32ToBf16(
+      loc, rewriter, result, undefRounding);
+  // Value scaledBf16 = fmul(vBf16, scaleBf16);
+  // Account for NaN in the scale as per the mxfp specification.
+  return select(scaleIsNan, nanBf16, scaledBf16);
+};
+
 class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
 private:
   const TargetInfoBase &targetInfo;
@@ -48,8 +66,8 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
 
     for (auto [i, scaleVal] : llvm::enumerate(scaleVals)) {
       for (int j = 0; j < 32; ++j) {
-        xVals[32 * i + j] = LLVM::intel::mxfpScaleBf16(
-            rewriter, loc, xVals[32 * i + j], scaleVal);
+        xVals[32 * i + j] =
+            mxfpScaleBf16(rewriter, loc, xVals[32 * i + j], scaleVal);
       }
     }
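Note on mxfpScaleBf16 above: the 8-bit scale is an E8M0 exponent, so shl(zext(scale), 7) followed by a bitcast places it directly in the bf16 exponent field, giving 2^(scale - 127), with 0xFF treated as NaN per the mxfp specification. Below is a small host-side sketch of the same decoding, done on fp32 bits since bf16 is the upper half of fp32; decodeE8M0Scale is an illustrative name, not a Triton helper.

#include <cassert>
#include <cmath>
#include <cstdint>
#include <cstring>

// The E8M0 scale byte holds only an exponent. Shifting it into the exponent
// field decodes it as 2^(s - 127); the value 0xFF is reserved for NaN.
// bf16 bits (s << 7) and fp32 bits (s << 23) represent the same value.
float decodeE8M0Scale(uint8_t s) {
  if (s == 0xFF)
    return std::nanf("");
  uint32_t fp32Bits = static_cast<uint32_t>(s) << 23;
  float f;
  std::memcpy(&f, &fp32Bits, sizeof(f));
  return f;
}

int main() {
  assert(decodeE8M0Scale(127) == 1.0f); // exponent bias
  assert(decodeE8M0Scale(128) == 2.0f);
  assert(decodeE8M0Scale(126) == 0.5f);
  return 0;
}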

third_party/intel/lib/TritonIntelGPUToLLVM/Utility.cpp

Lines changed: 0 additions & 17 deletions
@@ -159,21 +159,4 @@ LLVM::LLVMFuncOp getSpirvPrintfDeclaration(RewriterBase &rewriter) {
   return printFunc;
 }
 
-Value mxfpScaleBf16(ConversionPatternRewriter &rewriter, Location loc, Value v,
-                    Value scale) {
-  Value vBf16 = bitcast(v, bf16_ty);
-  Value nanBf16 = bitcast(i16_val(0x7fff), bf16_ty);
-  Value scaleIsNan = icmp_eq(scale, i8_val(0xff));
-  Value scaleBf16 = bitcast(shl(zext(i16_ty, scale), i16_val(7)), bf16_ty);
-
-  Value v0 = mlir::triton::intel::convertBf16ToFp32(loc, rewriter, vBf16);
-  Value v1 = mlir::triton::intel::convertBf16ToFp32(loc, rewriter, scaleBf16);
-  auto result = rewriter.create<LLVM::FMulOp>(loc, f32_ty, v0, v1);
-  auto undefRounding = static_cast<mlir::triton::RoundingMode>(-1);
-  Value scaledBf16 = mlir::triton::intel::convertFp32ToBf16(
-      loc, rewriter, result, undefRounding);
-  // Value scaledBf16 = fmul(vBf16, scaleBf16);
-  // Account for NaN in the scale as per the mxfp specification.
-  return select(scaleIsNan, nanBf16, scaledBf16);
-};
 } // namespace mlir::LLVM::intel

third_party/intel/lib/TritonIntelGPUToLLVM/Utility.h

Lines changed: 0 additions & 2 deletions
@@ -127,8 +127,6 @@ static Value getModuleWarpSize(RewriterBase &rewriter, Location loc) {
   return i32_val(triton::gpu::TritonGPUDialect::getThreadsPerWarp(mod));
 }
 
-Value mxfpScaleBf16(ConversionPatternRewriter &rewriter, Location loc, Value v,
-                    Value scale);
 } // namespace mlir::LLVM::intel
 
 // -----------------------------------------------------------------------

third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 27 additions & 17 deletions
@@ -291,13 +291,16 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     static_assert(opIdx == 0 || opIdx == 1, "Illegal operand index");
     assert(opDesc.scale && "Expecting valid operand & scale");
 
-    unsigned opsPerChannel = dpasEnc.getOpsPerChannel();
-
     MLIRContext *ctx = opDesc.op.getContext();
+    unsigned numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
+    unsigned warpSize = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
+    unsigned opsPerChannel = dpasEnc.getOpsPerChannel();
     unsigned rank = retType.getRank();
+
     if (upcastMXFPUseDotOpEnc) {
       if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
         opsPerChannel *= 2;
+
       auto opEncoding = ttg::intel::DpasEncodingAttr::get(
           ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(),
           dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(),
@@ -312,7 +315,6 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
       unsigned instrShapeM = dpasEnc.getDPASInstShapeA()[1];
       SmallVector<unsigned, 2> threadsPerWarp{instrShapeM,
                                               warpSize / instrShapeM};
-      int numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
       SmallVector<unsigned, 2> warpsPerCTA(rank, 1);
       warpsPerCTA[0] = numWarps;
       auto CTALayout = ttg::getCTALayout(retType.getEncoding());
@@ -325,7 +327,6 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
       return createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter);
     }
 
-    // Temporary code: remove once upcast_mxfp support dot encoding.
     auto scaleEncoding = dyn_cast<ttg::BlockedEncodingAttr>(
         opDesc.scale.getType().getEncoding());
     assert(scaleEncoding && "Expecting blocked encoding for scale");
@@ -334,19 +335,28 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
     // the scalingBlockSize should be 32 for E5M2, E4M3 and E2M1
     unsigned scalingBlockSize = 32;
-    // 2 FP4E2M1 are packed in 1 I8
+    // 2 FP4E2M1 are packed in one i8
    if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
      scalingBlockSize = 16;
-    SmallVector<unsigned, 2> sizePerThread(rank, 1);
-    sizePerThread[rank - 1 - opIdx] = scalingBlockSize;
-    auto newOpEncoding = ttg::BlockedEncodingAttr::get(
-        ctx, sizePerThread, scaleEncoding.getThreadsPerWarp(),
-        scaleEncoding.getWarpsPerCTA(), scaleEncoding.getCTAOrder(),
-        scaleEncoding.getCTALayout());
 
+    SmallVector<unsigned> sizePerThread = {1, 1};
+    SmallVector<unsigned> threadsPerWarp = {1, 1};
+    sizePerThread[!opIdx] = scalingBlockSize;
+    threadsPerWarp[opIdx] = warpSize;
+    SmallVector<unsigned> warpsPerCTA = {numWarps, 1};
+
+    auto newOpEncoding = ttg::BlockedEncodingAttr::get(
+        ctx, sizePerThread, threadsPerWarp, warpsPerCTA,
+        scaleEncoding.getCTAOrder(), scaleEncoding.getCTALayout());
     TensorValue op =
         createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter);
-    TensorValue scale = opDesc.scale;
+
+    warpsPerCTA = opIdx ? SmallVector<unsigned>{1, numWarps}
+                        : SmallVector<unsigned>{numWarps, 1};
+    auto newScaleEncoding = ttg::BlockedEncodingAttr::get(
+        ctx, {1, 1}, {warpSize, 1}, warpsPerCTA, scaleEncoding.getCTAOrder(),
+        scaleEncoding.getCTALayout());
+    TensorValue scale = createScale(opDesc.scale, newScaleEncoding, rewriter);
 
     auto retDpasEncoding = ttg::intel::DpasEncodingAttr::get(
         ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(),
@@ -357,11 +367,11 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
 
     auto upcastOp = createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter);
 
-    auto upcastRetType = cast<RankedTensorType>(upcastOp.getType());
-    retType = RankedTensorType::get(retType.getShape(),
-                                    retType.getElementType(), retDotOpEncoding);
-    return rewriter.create<ttg::ConvertLayoutOp>(opDesc.op.getLoc(),
-                                                 upcastRetType, upcastOp);
+    auto resultType = cast<RankedTensorType>(upcastOp.getType());
+    resultType = RankedTensorType::get(
+        resultType.getShape(), resultType.getElementType(), retDotOpEncoding);
+    return rewriter.create<ttg::ConvertLayoutOp>(opDesc.op.getLoc(), resultType,
+                                                 upcastOp);
   }
 
   template <unsigned opIdx>
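Note on the AccelerateMatmul.cpp change: per the OCP microscaling spec referenced in the hunk, each scale covers a block of 32 elements along K, and because two E2M1 values share one i8 the packed operand spans only 16 bytes per scale block, which is what scalingBlockSize encodes. Below is a minimal sketch of that bookkeeping, assuming the packed K extent divides evenly into blocks; packedElemsPerScale is an illustrative name, not a Triton helper.

#include <cassert>
#include <cstdio>

// Packed elements along K covered by one mx scale: 32 logical elements per
// scale, but E2M1 stores two elements per i8, so the packed operand only
// advances 16 bytes per scale block.
unsigned packedElemsPerScale(bool isE2M1) { return isE2M1 ? 16u : 32u; }

int main() {
  unsigned packedK = 128; // packed K extent of the operand
  assert(packedK % packedElemsPerScale(true) == 0);
  std::printf("E5M2/E4M3: %u scales per row, E2M1: %u scales per row\n",
              packedK / packedElemsPerScale(false),
              packedK / packedElemsPerScale(true));
  return 0;
}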
