
Commit 915c149

[AMD] Enable B scale for scaled_dot (#5112)
This commit adds direct support for the B (RHS) scale in the AccelerateMatmul and UpcastMXFPOp patterns. Along the way, the UpcastMXFPOp verifier is updated so that it accepts this case.
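
At the user level, this means a Triton kernel can now pass a scale for the RHS operand of tl.dot_scaled on AMD hardware. The sketch below is illustrative only and not taken from this commit; it assumes the tl.dot_scaled(lhs, lhs_scale, lhs_format, rhs, rhs_scale, rhs_format) form, and all pointer and size names are hypothetical.

# Illustrative sketch (not from this commit): a kernel that scales only the
# RHS (B) operand of tl.dot_scaled, the case this change enables on AMD.
import triton
import triton.language as tl


@triton.jit
def rhs_scaled_dot_kernel(a_ptr, b_ptr, b_scale_ptr, c_ptr,
                          M: tl.constexpr, N: tl.constexpr, K: tl.constexpr):
    # A: bf16 [M, K]; B: fp8 e4m3 [K, N]; B scale: e8m0 bytes [N, K // 32].
    a = tl.load(a_ptr + tl.arange(0, M)[:, None] * K + tl.arange(0, K)[None, :])
    b = tl.load(b_ptr + tl.arange(0, K)[:, None] * N + tl.arange(0, N)[None, :])
    b_scale = tl.load(b_scale_ptr + tl.arange(0, N)[:, None] * (K // 32) +
                      tl.arange(0, K // 32)[None, :])
    # lhs_scale is None: only the RHS carries a microscaling factor here.
    c = tl.dot_scaled(a, None, "bf16", b, b_scale, "e4m3")
    tl.store(c_ptr + tl.arange(0, M)[:, None] * N + tl.arange(0, N)[None, :], c)
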
1 parent d556ce9 commit 915c149

File tree: 4 files changed, +93 -66 lines

lib/Dialect/TritonGPU/IR/Ops.cpp

Lines changed: 29 additions & 17 deletions
@@ -1,10 +1,7 @@
 #include "mlir/IR/BuiltinTypes.h"
-#include "triton/Conversion/TritonGPUToLLVM/Utility.h"
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "triton/Dialect/TritonGPU/IR/Attributes.h"
 #include "triton/Dialect/TritonGPU/IR/Dialect.h"
-#include "triton/Dialect/TritonNvidiaGPU/IR/Types.h"
-#include "llvm/Support/raw_ostream.h"
 
 #define GET_OP_CLASSES
 #include "triton/Dialect/TritonGPU/IR/Ops.cpp.inc"
@@ -39,19 +36,6 @@ LogicalResult UpcastMXFPOp::verify() {
     return emitOpError("NYI: fpType must be E2M1, E4M3, or E5M2");
   }
 
-  // Change to support fp8 types
-  const auto elems_packed = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
-
-  if (xShape.back() != (32 / elems_packed) * scaleShape.back()) {
-    return emitOpError("last dimension of first operand must be 16 times "
-                       "larger than that of the second operand");
-  }
-
-  if (!std::equal(xShape.begin(), xShape.end() - 1, scaleShape.begin())) {
-    return emitOpError(
-        "all dimensions except the last must match between operands");
-  }
-
   auto layoutX = xTy.getEncoding();
   auto layoutScale = scaleTy.getEncoding();
   if (bool(layoutX) != bool(layoutScale)) {
@@ -82,6 +66,28 @@ LogicalResult UpcastMXFPOp::verify() {
     }
   }
 
+  // Change to support fp8 types
+  const auto elemsPacked = fpType == ScaleDotElemType::E2M1 ? 2 : 1;
+  // Figure out the K dimension for the input A/B. For A/B scale, the K
+  // dimension is always the last dimension.
+  const int opIdx = dotEncoding.getOpIdx();
+  const bool hasBatch = xShape.size() == 3;
+  const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+
+  if (xShape[kIdx] != (32 / elemsPacked) * scaleShape.back()) {
+    return emitOpError("K dimension of first operand must be 16 times "
+                       "larger than last/K dimension of the second operand");
+  }
+
+  // Check other dimensions match too. For input A/B, we need to figure out the
+  // index for the M/N dimension. For scale, it's always {(batch), M/N, K}.
+  const int mnIdx = (opIdx == 0 ? 0 : 1) + hasBatch;
+  if (hasBatch && xShape[0] != scaleShape[0])
+    return emitOpError("batch dimension must match between operands");
+  if (xShape[mnIdx] != scaleShape[hasBatch]) {
+    return emitOpError("M/N dimension must match between operands");
+  }
+
   return success();
 }
 
@@ -100,14 +106,20 @@ LogicalResult UpcastMXFPOp::inferReturnTypes(
   RankedTensorType retTy;
 
   auto newShape = SmallVector<int64_t>(xShape);
-  newShape.back() *= 2;
   if (!encoding) {
+    newShape.back() *= 2;
     retTy = RankedTensorType::get(xShape, FloatType::getBF16(ctx));
   } else {
     auto oldEncoding = cast<DotOperandEncodingAttr>(encoding);
     auto newVEncoding = DotOperandEncodingAttr::get(
         ctx, oldEncoding.getOpIdx(), oldEncoding.getParent(),
         oldEncoding.getKWidth() * 2);
+    // Figure out the K dimension for the input A/B, given that the return
+    // type is upcasted A/B type so we need to update the proper dim size.
+    const int opIdx = oldEncoding.getOpIdx();
+    const bool hasBatch = xShape.size() == 3;
+    const int kIdx = (opIdx == 0 ? 1 : 0) + hasBatch;
+    newShape[kIdx] *= 2;
     retTy = RankedTensorType::get(newShape, FloatType::getBF16(ctx),
                                   newVEncoding);
   }
python/test/unit/language/test_core.py

Lines changed: 0 additions & 2 deletions
@@ -3386,8 +3386,6 @@ def test_scaled_dot(M, N, K, col_a, col_b, rhs_scale, normal_type, mxfp_type, nu
         if cc < (8, 9):
             pytest.skip("float8e4nv not supported on CUDA < 8.9")
     if is_hip():
-        if rhs_scale:
-            pytest.skip("scales on rhs not yet support for HIP")
         if not is_hip_cdna():
             pytest.skip("scaled_dot only implemented for HIP CDNA")
         if "e4m3" in (normal_type, mxfp_type) and not is_hip_mi300():

third_party/amd/lib/TritonAMDGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 0 additions & 2 deletions
@@ -53,8 +53,6 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
 
     auto dotEncoding =
         cast<DotOperandEncodingAttr>(op.getSrc().getType().getEncoding());
-    if (dotEncoding.getOpIdx() == 1)
-      return rewriter.notifyMatchFailure(op, "NYI: dot RHS");
     auto mfmaEncoding = dyn_cast<AMDMfmaEncodingAttr>(dotEncoding.getParent());
     if (!mfmaEncoding)
       return rewriter.notifyMatchFailure(op, "NYI: non-mfma dot operand");

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 64 additions & 45 deletions
@@ -497,24 +497,27 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     if (!isa_and_nonnull<BlockedEncodingAttr>(oldRetType.getEncoding()))
       return rewriter.notifyMatchFailure(
           dotOp, "expected blocked encoding result tensor");
-
-    if (dotOp.getRhsScale())
-      return rewriter.notifyMatchFailure(dotOp, "NYI: RHS scale");
+    unsigned rank = oldRetType.getRank();
+    if (rank == 3)
+      return rewriter.notifyMatchFailure(dotOp, "NYI: 3d case");
 
     TensorValue a = dotOp.getLhs();
     TensorValue b = dotOp.getRhs();
     TensorValue aScale = dotOp.getLhsScale();
+    TensorValue bScale = dotOp.getRhsScale();
+    if (aScale && bScale)
+      return rewriter.notifyMatchFailure(dotOp, "NYI: both LHS and RHS scale");
+
     ScaleDotElemType aElemType = dotOp.getLhsType();
     ScaleDotElemType bElemType = dotOp.getRhsType();
-
-    if (!(aElemType == ScaleDotElemType::E2M1 ||
-          aElemType == ScaleDotElemType::E4M3 ||
-          aElemType == ScaleDotElemType::E5M2))
-      return rewriter.notifyMatchFailure(dotOp, "NYI: non-mxfp8/mxfp4 LHS");
-    if (!(bElemType == ScaleDotElemType::E4M3 ||
-          bElemType == ScaleDotElemType::E5M2 ||
-          bElemType == ScaleDotElemType::BF16))
-      return rewriter.notifyMatchFailure(dotOp, "NYI: non-fp8/bf16 RHS");
+    auto supportsTypes = [](ScaleDotElemType elemType) {
+      return elemType == ScaleDotElemType::E2M1 ||
+             elemType == ScaleDotElemType::E4M3 ||
+             elemType == ScaleDotElemType::E5M2 ||
+             elemType == ScaleDotElemType::BF16;
+    };
+    if (!supportsTypes(aElemType) || !supportsTypes(bElemType))
+      return rewriter.notifyMatchFailure(dotOp, "NYI: mxfp6 operand");
 
     MLIRContext *ctx = dotOp.getContext();
     auto moduleOp = dotOp->getParentOfType<ModuleOp>();
@@ -534,27 +537,30 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     unsigned kDim = mfmaInstr.value().getKDim();
     unsigned kBase = mfmaInstr.value().getKBase();
 
-    // If A tensor contains mxfp4, we pack every two values into one int8 value
-    // there. For such cases, we have different initial kWidth for LHS and RHS,
-    // which will be "fixed" later by using upcast_mxfp to convert LHS to
-    // unpacked values. For such packed cases, we cannot support flexible kPack
-    // choices from the developer--it just does not apply here. So mandate the
-    // choice here.
-    bool isPacked = aElemType == ScaleDotElemType::E2M1;
-    unsigned kWdiths[] = {isPacked ? 4 : kBase * kPack,
-                          isPacked ? 8 : kBase * kPack};
-
-    // For A tensor, 32 consecutive elements along K dim share the same scale.
+    // For mxfp4 A/B tensor, we pack every two values into one int8 value there.
+    // For such cases, we have different initial kWidth for LHS and RHS, which
+    // will be "fixed" later by using upcast_mxfp to convert LHS to unpacked
+    // values. For such packed cases, we cannot support flexible kPack choices
+    // from the developer--it just does not apply here. So mandate the choice
+    // here.
+    bool isAPacked = aElemType == ScaleDotElemType::E2M1;
+    bool isBPacked = bElemType == ScaleDotElemType::E2M1;
+    bool isPacked = isAPacked || isBPacked;
+    unsigned kWdiths[] = {isPacked ? (isAPacked ? 4 : 8) : kBase * kPack,
+                          isPacked ? (isAPacked ? 8 : 4) : kBase * kPack};
+
+    // For A/B tensor, 32 consecutive elements along K dim share the same scale.
     // We'd like to keep the scale values together with the base values in the
     // same warp to avoid cross-warp data exchange. It means we want warpsPerCTA
-    // = 1 along the N dimension.
-    SmallVector<unsigned, 3> warpsPerCTA(oldRetType.getRank(), 1);
-    warpsPerCTA.front() = numWarps;
+    // = 1 along the N/M dimension for the mxfp A/B case. We achieve that by
+    // setting the M/N dimension as numWarps.
+    SmallVector<unsigned, 2> mfmaWarpsPerCTA(rank, 1);
+    mfmaWarpsPerCTA[aScale ? 0 : 1] = numWarps;
 
     // Always use transposed mfma layout. This enables larger vectorization
     // for global store instructions.
     auto mfmaEnc = ttg::AMDMfmaEncodingAttr::get(
-        ctx, /*versionMajor=*/mfmaVersion, /*versionMinor=*/0, warpsPerCTA,
+        ctx, /*versionMajor=*/mfmaVersion, /*versionMinor=*/0, mfmaWarpsPerCTA,
        /*instrShape=*/mDim, nDim, /*isTransposed=*/true, ctaLayout);
 
     auto newRetType = RankedTensorType::get(
@@ -571,11 +577,9 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
      auto newVType = RankedTensorType::get(
          vType.getShape(), vType.getElementType(), newVEncoding);
      v = rewriter.create<ttg::ConvertLayoutOp>(v.getLoc(), newVType, v);
-      if (type == ScaleDotElemType::BF16)
-        return v;
-      // Don't need to covert int8 holding mxfp4 for A--the upcast_mxfp op can
+      // Don't need to covert int8 holding mxfp4--the upcast_mxfp op can
      // take int8 tensor as input.
-      if (idx == 0 && type == ScaleDotElemType::E2M1)
+      if (type == ScaleDotElemType::BF16 || type == ScaleDotElemType::E2M1)
        return v;
 
      auto vTypeBf16 = RankedTensorType::get(
@@ -586,27 +590,42 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     a = toMMABf16(a, 0, aElemType);
     b = toMMABf16(b, 1, bElemType);
 
-    // We need to have "matching" encoding between the A tensor and A scale
+    // We need to have "matching" encoding between the main tensor and scale
     // tensor to make sure the scale values needed is in the same warp. So we
     // adopt the same CTA layout and warps per CTA. The warp dimensions needs to
-    // match along M dimension too. With in a warp, we have 64 threads. We let
-    // each thread read in one scale value. So we need a threadsPerWarp = mDim
-    // along M dimension.
+    // match along M/N dimension too. With in a warp, we have 64 threads. We let
+    // each thread read in one scale value. So we need a threadsPerWarp =
+    // mDim/nDim along M/N dimension. Note that For MFMA intrinsics, mDim is
+    // always the same as nDim. And for scaled dot scale tensor, we always have
+    // K as the innermost dimension. So we have the same threadsPerWarp in the
+    // below no matter A or B scale. Similarly for warpsPerCTA, the non-K
+    // dimension is always at index 0.
+    assert(mDim == nDim);
     SmallVector<unsigned, 2> threadsPerWarp = {mDim, numThreads / mDim};
+    SmallVector<unsigned, 2> blockWarpsPerCTA(rank, 1);
+    blockWarpsPerCTA[0] = numWarps;
     auto newScaleEncoding = triton::gpu::BlockedEncodingAttr::get(
-        ctx, {1, 1}, threadsPerWarp, warpsPerCTA, {1, 0}, ctaLayout);
+        ctx, {1, 1}, threadsPerWarp, blockWarpsPerCTA, {1, 0}, ctaLayout);
+
+    auto upcastMXFP = [&](TensorValue main, TensorValue scale,
+                          ScaleDotElemType elemType) -> Value {
+      if (!scale)
+        return main;
 
-    auto newScaleType = RankedTensorType::get(aScale.getType().getShape(),
-                                              aScale.getType().getElementType(),
-                                              newScaleEncoding);
-    aScale = rewriter.create<ttg::ConvertLayoutOp>(aScale.getLoc(),
-                                                   newScaleType, aScale);
+      auto newScaleType = RankedTensorType::get(
+          scale.getType().getShape(), scale.getType().getElementType(),
+          newScaleEncoding);
+      auto convOp = rewriter.create<ttg::ConvertLayoutOp>(scale.getLoc(),
+                                                          newScaleType, scale);
 
-    auto scaledA = rewriter.create<triton::gpu::UpcastMXFPOp>(
-        dotOp.getLoc(), a, aScale, dotOp.getLhsType());
+      return rewriter.create<triton::gpu::UpcastMXFPOp>(dotOp.getLoc(), main,
+                                                        convOp, elemType);
+    };
 
-    auto newDot =
-        rewriter.create<DotOp>(dotOp.getLoc(), newRetType, scaledA, b, newAcc);
+    Value scaledA = upcastMXFP(a, aScale, dotOp.getLhsType());
+    Value scaledB = upcastMXFP(b, bScale, dotOp.getRhsType());
+    auto newDot = rewriter.create<DotOp>(dotOp.getLoc(), newRetType, scaledA,
                                         scaledB, newAcc);
     rewriter.replaceOpWithNewOp<ttg::ConvertLayoutOp>(dotOp, oldRetType,
                                                       newDot);
     return success();
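
As a summary of the layout reasoning in the comments above, the following Python sketch (illustrative values only, not code from this commit) shows how the warp and thread distribution is chosen depending on which operand carries the scale: all warps go along M when A is scaled and along N when B is scaled, and each of the 64 lanes holds one scale value along M/N.

# Sketch of the warp/thread distribution choice for the MFMA result layout
# and the scale tensor's blocked layout; num_warps/m_dim values are examples.
def mfma_layout_for_scaled_dot(a_has_scale, num_warps, m_dim, num_threads=64):
    # MFMA result layout: keep warpsPerCTA = 1 along the other dimension so
    # scale and base values stay within the same warp.
    warps_per_cta = [num_warps, 1] if a_has_scale else [1, num_warps]
    # Scale tensor is {M/N, K/32}: one scale per lane along M/N, rest along K.
    scale_threads_per_warp = [m_dim, num_threads // m_dim]
    scale_warps_per_cta = [num_warps, 1]
    return warps_per_cta, scale_threads_per_warp, scale_warps_per_cta

print(mfma_layout_for_scaled_dot(a_has_scale=True, num_warps=4, m_dim=32))
# ([4, 1], [32, 2], [4, 1])
print(mfma_layout_for_scaled_dot(a_has_scale=False, num_warps=4, m_dim=32))
# ([1, 4], [32, 2], [4, 1])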
