Commit 90f937a

Fix rhs scaling
1 parent 698a0bd commit 90f937a

5 files changed: +57 additions, -50 deletions

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 3 additions & 2 deletions

@@ -152,9 +152,10 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
   } else if (auto oldEncoding = dyn_cast<BlockedEncodingAttr>(encoding)) {
     // TODO: Temporary code, remove once upcast_mxfp support dot encoding.
     assert(!tools::getBoolEnv("TRITON_INTEL_UPCASTMXFP_DOTOP_ENCODING"));
-    newShape.back() *= 2;
     SmallVector<unsigned> sizePerThread = oldEncoding.getSizePerThread();
-    sizePerThread.back() *= 2;
+    int opIdx = sizePerThread.back() == 1 ? 1 : 0;
+    sizePerThread[!opIdx] *= 2;
+    newShape[!opIdx] *= 2;
     newVEncoding = BlockedEncodingAttr::get(
         ctx, sizePerThread, oldEncoding.getThreadsPerWarp(),
         oldEncoding.getWarpsPerCTA(), oldEncoding.getCTAOrder(),

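Note on the Ops.cpp change: two E2M1 (FP4) values are packed into one i8, so the upcast doubles the logical tensor along the K dimension, which is the last axis for the LHS (opIdx 0) but the first axis for the RHS (opIdx 1). The previous code always doubled the last axis, which was wrong for the RHS; the fix infers the operand index from the blocked layout's sizePerThread (the RHS encoding built in AccelerateMatmul.cpp keeps sizePerThread.back() == 1) and doubles the matching axis. A standalone sketch of the axis selection, assuming rank-2 shapes (the helper and values below are illustrative, not from the patch):

#include <cassert>
#include <cstdint>
#include <vector>

// Doubles the K axis of a packed-i8 operand shape: axis 1 for the LHS
// (opIdx 0), axis 0 for the RHS (opIdx 1).
std::vector<int64_t> upcastedShape(std::vector<int64_t> shape, int opIdx) {
  assert(shape.size() == 2 && (opIdx == 0 || opIdx == 1));
  shape[!opIdx] *= 2;
  return shape;
}
// e.g. upcastedShape({128, 32}, 0) == {128, 64}  (A: [M, K/2] -> [M, K])
//      upcastedShape({32, 128}, 1) == {64, 128}  (B: [K/2, N] -> [K, N])
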
python/test/unit/language/test_core.py

Lines changed: 0 additions & 3 deletions

@@ -3440,9 +3440,6 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, nu
         pytest.skip(f"scaled_dot({normal_type}, {mxfp_type}) only implemented for MI300")
     if mma == 16 and K == 64:
         pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
-    if is_xpu():
-        if rhs_scale:
-            pytest.skip("scaled_dot with rhs_scale not supported on XPU")
 
 @triton.jit
 def dot_scale_kernel(a_base, stride_a0, stride_a1, a_scale, b_base, stride_b0, stride_b1, b_scale, out,

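With rhs scaling fixed, the XPU-specific skip is no longer needed: the rhs_scale variants of test_scaled_dot now run on XPU as well.
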
third_party/intel/include/Analysis/DPAS.h

Lines changed: 2 additions & 2 deletions

@@ -24,6 +24,8 @@ class DPASAnalysis {
     FP32_FP32_TF32_TF32,
     FP16_FP16_FP16_FP16,
     BF16_BF16_BF16_BF16,
+    U32_U32_U8_U8,
+    S32_S32_S8_S8,
     // data types for dot scaled.
     FP32_FP32_BF16_FP8,
     FP32_FP32_BF16_FP4,
@@ -32,8 +34,6 @@ class DPASAnalysis {
     FP32_FP32_FP8_FP4,
     FP32_FP32_FP4_BF16,
     FP32_FP32_FP4_FP8,
-    U32_U32_U8_U8,
-    S32_S32_S8_S8,
     NOT_APPLICABLE
   };

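The two integer DPAS types move above the "data types for dot scaled" comment so that the scaled-dot enumerators form a single contiguous range. One plausible use of that ordering (an assumption, not shown in this commit) is a range check to tell scaled-dot types apart from the regular ones:

// Hypothetical helper, assuming the post-commit enumerator order.
static bool isScaledDotType(DPASAnalysis::DPASEngineType t) {
  return t >= DPASAnalysis::DPASEngineType::FP32_FP32_BF16_FP8 &&
         t <= DPASAnalysis::DPASEngineType::FP32_FP32_FP4_FP8;
}
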
third_party/intel/lib/Analysis/DPAS.cpp

Lines changed: 3 additions & 3 deletions

@@ -140,7 +140,8 @@ DPASAnalysis::DPASEngineType DPASAnalysis::getDPASType(Operation *op) {
     if (aElemTy.isBF16() &&
         (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2()))
       return DPASEngineType::FP32_FP32_BF16_FP8;
-    if (aElemTy.isBF16() && bElemTy.isFloat4E2M1FN())
+    // 2 E2M1 are packed into 1 int8
+    if (aElemTy.isBF16() && bElemTy.isInteger(8))
       return DPASEngineType::FP32_FP32_BF16_FP4;
     if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) &&
         bElemTy.isBF16())
@@ -149,9 +150,8 @@ DPASAnalysis::DPASEngineType DPASAnalysis::getDPASType(Operation *op) {
         (bElemTy.isFloat8E4M3FN() || bElemTy.isFloat8E5M2()))
       return DPASEngineType::FP32_FP32_FP8_FP8;
     if ((aElemTy.isFloat8E4M3FN() || aElemTy.isFloat8E5M2()) &&
-        bElemTy.isFloat4E2M1FN())
+        bElemTy.isInteger(8))
       return DPASEngineType::FP32_FP32_FP8_FP4;
-    // 2 E2M1 are packed into 1 int8
     if (aElemTy.isInteger(8) && bElemTy.isBF16())
       return DPASEngineType::FP32_FP32_FP4_BF16;
     if (aElemTy.isInteger(8) &&

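The isFloat4E2M1FN checks become isInteger(8) because, by the time getDPASType runs, FP4 operands have already been packed: two E2M1 values travel in one i8 (the FP32_FP32_FP4_* cases below already tested isInteger(8) for the same reason, and the explanatory comment moves up to the first packed case). A minimal sketch of that packing; the low-nibble-first order is an assumption, not taken from this commit:

#include <cstdint>

// Pack two 4-bit E2M1 values into one byte (first value in the low nibble).
inline uint8_t packE2M1Pair(uint8_t first, uint8_t second) {
  return static_cast<uint8_t>((first & 0x0F) | ((second & 0x0F) << 4));
}

// Recover the two 4-bit values from the packed byte.
inline uint8_t unpackFirst(uint8_t packed) { return packed & 0x0F; }
inline uint8_t unpackSecond(uint8_t packed) { return packed >> 4; }
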
third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 49 additions & 40 deletions

@@ -291,10 +291,12 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     static_assert(opIdx == 0 || opIdx == 1, "Illegal operand index");
     assert(opDesc.scale && "Expecting valid operand & scale");
 
-    unsigned opsPerChannel = dpasEnc.getOpsPerChannel();
-
     MLIRContext *ctx = opDesc.op.getContext();
+    unsigned numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
+    unsigned warpSize = ttg::TritonGPUDialect::getThreadsPerWarp(mod);
+    unsigned opsPerChannel = dpasEnc.getOpsPerChannel();
     unsigned rank = retType.getRank();
+
     if (upcastMXFPUseDotOpEnc) {
       if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
         opsPerChannel *= 2;
@@ -312,7 +314,6 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
       unsigned instrShapeM = dpasEnc.getDPASInstShapeA()[1];
       SmallVector<unsigned, 2> threadsPerWarp{instrShapeM,
                                               warpSize / instrShapeM};
-      int numWarps = ttg::TritonGPUDialect::getNumWarps(mod);
       SmallVector<unsigned, 2> warpsPerCTA(rank, 1);
       warpsPerCTA[0] = numWarps;
       auto CTALayout = ttg::getCTALayout(retType.getEncoding());
@@ -323,44 +324,52 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
       TensorValue scale = createScale(opDesc.scale, newScaleEncoding, rewriter);
 
       return createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter);
-    } else {
-      auto scaleEncoding = dyn_cast<ttg::BlockedEncodingAttr>(
-          opDesc.scale.getType().getEncoding());
-      assert(scaleEncoding && "Expecting blocked encoding for scale");
-
-      // Referring to
-      // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
-      // the scalingBlockSize should be 32 for E5M2, E4M3 and E2M1
-      unsigned scalingBlockSize = 32;
-      // 2 FP4E2M1 are packed in 1 I8
-      if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
-        scalingBlockSize = 16;
-      SmallVector<unsigned, 2> sizePerThread(rank, 1);
-      sizePerThread[rank - 1 - opIdx] = scalingBlockSize;
-      auto newOpEncoding = ttg::BlockedEncodingAttr::get(
-          ctx, sizePerThread, scaleEncoding.getThreadsPerWarp(),
-          scaleEncoding.getWarpsPerCTA(), scaleEncoding.getCTAOrder(),
-          scaleEncoding.getCTALayout());
-
-      TensorValue op =
-          createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter);
-      TensorValue scale = opDesc.scale;
-
-      auto retDpasEncoding = ttg::intel::DpasEncodingAttr::get(
-          ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(),
-          dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(),
-          dpasEnc.getRepCluster(), dpasEnc.getSubGroupSize());
-      auto retDotOpEncoding = ttg::DotOperandEncodingAttr::get(
-          ctx, opIdx, retDpasEncoding, retDpasEncoding.getOpsPerChannel());
-
-      auto upcastOp = createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter);
-
-      auto retType = cast<RankedTensorType>(upcastOp.getType());
-      retType = RankedTensorType::get(
-          retType.getShape(), retType.getElementType(), retDotOpEncoding);
-      return rewriter.create<ttg::ConvertLayoutOp>(opDesc.op.getLoc(), retType,
-                                                   upcastOp);
     }
+
+    auto scaleEncoding = dyn_cast<ttg::BlockedEncodingAttr>(
+        opDesc.scale.getType().getEncoding());
+    assert(scaleEncoding && "Expecting blocked encoding for scale");
+
+    // Referring to
+    // https://www.opencompute.org/documents/ocp-microscaling-formats-mx-v1-0-spec-final-pdf
+    // the scalingBlockSize should be 32 for E5M2, E4M3 and E2M1
+    unsigned scalingBlockSize = 32;
+    // 2 FP4E2M1 are packed in 1 I8
+    if (opDesc.elemType == tt::ScaleDotElemType::E2M1)
+      scalingBlockSize = 16;
+    SmallVector<unsigned> sizePerThread = {1, 1};
+    SmallVector<unsigned> threadsPerWarp = {1, 1};
+    sizePerThread[!opIdx] = scalingBlockSize;
+    threadsPerWarp[opIdx] = warpSize;
+    SmallVector<unsigned> warpsPerCTA = {numWarps, 1};
+
+    auto newOpEncoding = ttg::BlockedEncodingAttr::get(
+        ctx, sizePerThread, threadsPerWarp, warpsPerCTA,
+        scaleEncoding.getCTAOrder(), scaleEncoding.getCTALayout());
+    TensorValue op =
+        createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter);
+
+    warpsPerCTA = opIdx ? SmallVector<unsigned>{1, numWarps}
+                        : SmallVector<unsigned>{numWarps, 1};
+    auto newScaleEncoding = ttg::BlockedEncodingAttr::get(
+        ctx, {1, 1}, {warpSize, 1}, warpsPerCTA, scaleEncoding.getCTAOrder(),
+        scaleEncoding.getCTALayout());
+    TensorValue scale = createScale(opDesc.scale, newScaleEncoding, rewriter);
+
+    auto retDpasEncoding = ttg::intel::DpasEncodingAttr::get(
+        ctx, dpasEnc.getRepeatCount(), dpasEnc.getSystolicDepth(),
+        dpasEnc.getExecutionSize(), opsPerChannel, dpasEnc.getWarpsPerCTA(),
+        dpasEnc.getRepCluster(), dpasEnc.getSubGroupSize());
+    auto retDotOpEncoding = ttg::DotOperandEncodingAttr::get(
+        ctx, opIdx, retDpasEncoding, retDpasEncoding.getOpsPerChannel());
+
+    auto upcastOp = createUpcastMxfpOp(op, scale, opDesc.elemType, rewriter);
+
+    auto resultType = cast<RankedTensorType>(upcastOp.getType());
+    resultType = RankedTensorType::get(
+        resultType.getShape(), resultType.getElementType(), retDotOpEncoding);
+    return rewriter.create<ttg::ConvertLayoutOp>(opDesc.op.getLoc(), resultType,
+                                                 upcastOp);
   }
 
   template <unsigned opIdx>

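Summary of the AccelerateMatmul.cpp rewrite: the else block is flattened into an early-return structure, numWarps and warpSize are hoisted to the top of the function, and the non-dot-op-encoding path no longer reuses the scale's threadsPerWarp/warpsPerCTA for the operand. Instead, the operand layout is built explicitly (each lane owns one full scaling block along K, and the warp's lanes span the other axis), and the scale, previously passed through unchanged, now gets its own blocked encoding whose warpsPerCTA follows the operand index: warps along M for the LHS, along N for the RHS. A standalone sketch of the operand layout for rank 2 (the struct and the example numbers are illustrative, not from the patch):

#include <cstdio>

struct Layout {
  unsigned sizePerThread[2], threadsPerWarp[2], warpsPerCTA[2];
};

// Mirrors the patch's operand-layout logic for a rank-2 tensor. K is axis 1
// for the LHS (opIdx 0) and axis 0 for the RHS (opIdx 1): each lane holds one
// scaling block along K, and the warp's lanes span the non-K axis.
Layout operandLayout(unsigned opIdx, unsigned warpSize, unsigned numWarps,
                     unsigned scalingBlockSize) {
  Layout l = {{1, 1}, {1, 1}, {numWarps, 1}};
  l.sizePerThread[!opIdx] = scalingBlockSize;
  l.threadsPerWarp[opIdx] = warpSize;
  return l;
}

int main() {
  // E.g. warpSize = 16, numWarps = 4, scalingBlockSize = 32 (E5M2/E4M3).
  Layout a = operandLayout(0, 16, 4, 32); // sizePerThread {1,32}, threads {16,1}
  Layout b = operandLayout(1, 16, 4, 32); // sizePerThread {32,1}, threads {1,16}
  std::printf("A: {%u,%u}  B: {%u,%u}\n", a.sizePerThread[0], a.sizePerThread[1],
              b.sizePerThread[0], b.sizePerThread[1]);
  return 0;
}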