
Commit b1064f2

antiagainstguacamoleo authored and committed
[AMD] Add support for scaled_dot(mxfp4, -) (triton-lang#5034)
This commit adds support for an mxfp4-typed A tensor for scaled dot in the AMD backend. We moved the `convertMxfp4x2ToBf16x2` implementation from the NVIDIA side to a common path for reuse.
1 parent 299fed6 commit b1064f2

File tree

7 files changed: +100 -65 lines


include/triton/Conversion/TritonGPUToLLVM/Utility.h

Lines changed: 9 additions & 0 deletions
@@ -391,6 +391,15 @@ inline Value getSharedMemoryBase(Location loc, RewriterBase &rewriter,
   Value base = gep(ptrTy, i8_ty, LLVM::getStackPointer(rewriter, func), offVal);
   return base;
 }
+
+// -----------------------------------------------------------------------
+// MXFP utilities
+// -----------------------------------------------------------------------
+
+// Convert one int8, which contains 2 packed mxfp4 values, into 2 bf16
+// standalone values and returns them as a pair of (high 4 bits, low 4 bits).
+std::pair<Value, Value> convertMxfp4x2ToBf16x2(RewriterBase &rewriter,
+                                               Location loc, Value v);
 } // namespace LLVM

 /* ------------------------------------ */

lib/Conversion/TritonGPUToLLVM/Utility.cpp

Lines changed: 27 additions & 0 deletions
@@ -862,5 +862,32 @@ SmallVector<Value> getWrappedMultiDimOffset(
   return multiDimOffsetWrapped;
 }

+std::pair<Value, Value> convertMxfp4x2ToBf16x2(RewriterBase &rewriter,
+                                               Location loc, Value v) {
+  auto em0 = and_(v, i8_val(0x70));
+  auto em1 = and_(v, i8_val(0x7));
+  Value v0 = or_(shl(zext(i16_ty, em0), i16_val(2)),
+                 shl(zext(i16_ty, and_(v, i8_val(0x80))), i16_val(8)));
+  Value v1 = or_(shl(zext(i16_ty, em1), i16_val(6)),
+                 shl(zext(i16_ty, and_(v, i8_val(0x8))), i16_val(12)));
+
+  // Three cases:
+  // 1) x is normal and non-zero: Correct bias
+  v0 = select(icmp_ne(and_(em0, i8_val(0x60)), i8_val(0)),
+              add(v0, i16_val((127 - 1) << 7)), v0);
+  v1 = select(icmp_ne(and_(em1, i8_val(0x6)), i8_val(0)),
+              add(v1, i16_val((127 - 1) << 7)), v1);
+
+  // 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in
+  // bf16
+  v0 = select(icmp_eq(em0, i8_val(0x10)),
+              or_(i16_val(16128), and_(v0, i16_val(0x8000))), v0);
+  v1 = select(icmp_eq(em1, i8_val(0x1)),
+              or_(i16_val(16128), and_(v1, i16_val(0x8000))), v1);
+  // 3) x is zero, nothing to do
+
+  return {v0, v1};
+}
+
 } // namespace LLVM
 } // namespace mlir
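For reference, the bit manipulation the new helper performs on one packed byte can be sketched in Python as below (not part of the diff; the function name and plain-integer form are ours):

def mxfp4x2_to_bf16x2_bits(v: int) -> tuple[int, int]:
    # One byte holds two e2m1 values; return two bf16 bit patterns,
    # first for the high 4 bits, then for the low 4 bits.
    em0 = v & 0x70                       # exponent + mantissa of the high nibble
    em1 = v & 0x07                       # exponent + mantissa of the low nibble
    v0 = (em0 << 2) | ((v & 0x80) << 8)  # move sign/exp/mantissa into bf16 bit slots
    v1 = (em1 << 6) | ((v & 0x08) << 12)
    if em0 & 0x60:                       # normal: rebias exponent (bf16 bias 127 vs. fp4 bias 1)
        v0 += (127 - 1) << 7
    if em1 & 0x06:
        v1 += (127 - 1) << 7
    if em0 == 0x10:                      # subnormal 0bs001: exactly +-0.5 in bf16 (0x3F00)
        v0 = 0x3F00 | (v0 & 0x8000)
    if em1 == 0x01:
        v1 = 0x3F00 | (v1 & 0x8000)
    return v0, v1                        # a zero input stays zero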

python/test/unit/language/test_core.py

Lines changed: 9 additions & 2 deletions
@@ -29,6 +29,7 @@
     is_cuda,
     is_interpreter,
     is_hip,
+    is_hip_mi200,
     get_arch,
     torch_float8_dtypes,
     torch_dtypes,
@@ -3354,7 +3355,7 @@ def test_scaled_dot(M, N, K, col_a, col_b, type_a, type_b, num_warps, mma, kpack
         if cc < (8, 9):
             pytest.skip("float8e4nv not supported on CUDA < 8.9")
     if is_hip():
-        if type_a != "e5m2" or (type_b != "e5m2" and type_b != "bf16"):
+        if (type_a not in ["e2m1", "e5m2"]) or (type_b not in ["e2m1", "e5m2", "bf16"]):
            pytest.skip(f"scaled_dot({type_a}, {type_b}) not yet implemented for HIP")
        if mma == 16 and K == 64:
            pytest.skip(f"K == {K} too small for mfma {mma} in scaled_dot")
@@ -3530,7 +3531,13 @@ def make_finite(x, dtype):

    z_ref = dot_scale_ref(x, scale_x, y, type_a, type_b)

-   torch.testing.assert_close(z, z_ref, atol=1e-5, rtol=1e-2)
+   # Bigger tolerance for AMD MI200 devices.
+   # MI200 devices use reduced precision fp16 and bf16 and flush input and output denormal values
+   # to zero. Detailed info is at:
+   # https://pytorch.org/docs/stable/notes/numerical_accuracy.html#reduced-precision-fp16-and-bf16-gemms-and-convolutions-on-amd-instinct-mi200-devices
+   atol = 2e-4 if is_hip_mi200() else 1e-5
+   rtol = 2e-2 if is_hip_mi200() else 1e-2
+   torch.testing.assert_close(z, z_ref, atol=atol, rtol=rtol)

    # make sure ld/st are vectorized
    if is_cuda():

python/triton/_internal_testing.py

Lines changed: 5 additions & 0 deletions
@@ -41,6 +41,11 @@ def is_hip():
     return False if target is None else target.backend == "hip"


+def is_hip_mi200():
+    target = get_current_target()
+    return target.backend == 'hip' and target.arch == 'gfx90a'
+
+
 def get_arch():
     target = get_current_target()
     return "" if target is None else str(target.arch)

third_party/amd/lib/TritonAMDGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 23 additions & 3 deletions
@@ -21,10 +21,11 @@ namespace {

 Value mxfpScaleBf16(RewriterBase &rewriter, Location loc, Value v,
                     Value scale) {
+  Value vBf16 = bitcast(v, bf16_ty);
   Value nanBf16 = bitcast(i16_val(0x7fff), bf16_ty);
   Value scaleIsNan = icmp_eq(scale, i8_val(0xff));
   Value scaleBf16 = bitcast(shl(zext(i16_ty, scale), i16_val(7)), bf16_ty);
-  Value scaledBf16 = fmul(v, scaleBf16);
+  Value scaledBf16 = fmul(vBf16, scaleBf16);
   // Account for NaN in the scale as per the mxfp specification.
   return select(scaleIsNan, nanBf16, scaledBf16);
 };
@@ -43,7 +44,9 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
   matchAndRewrite(UpcastMXFPOp op, OpAdaptor adaptor,
                   ConversionPatternRewriter &rewriter) const override {
     auto fpType = op.getFpType();
-    if (!(fpType == ScaleDotElemType::E4M3 || fpType == ScaleDotElemType::E5M2))
+    bool isPacked = fpType == ScaleDotElemType::E2M1;
+    if (!(isPacked || fpType == ScaleDotElemType::E4M3 ||
+          fpType == ScaleDotElemType::E5M2))
       return rewriter.notifyMatchFailure(op, "NYI: non-mxfp8 cases");

     Location loc = op.getLoc();
@@ -56,7 +59,7 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
     // warp. MXFP spec mandates 1 scale value for every 32 onsecutive values
     // along the K dimension. So in total each thread should read 32x main
     // element values.
-    if (xVals.size() != scaleVals.size() * 32)
+    if (xVals.size() != scaleVals.size() * (isPacked ? 16 : 32))
       return rewriter.notifyMatchFailure(op, "unsupported problem size");

     auto dotEncoding =
@@ -79,6 +82,9 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
     Value warpId = udiv(tid, warpSize);
     Value laneId = urem(tid, warpSize);

+    if (isPacked)
+      xVals = unpackFP4Elements(loc, rewriter, xVals);
+
     // Given that MFMA layout for the A tensor arranges thread in a column-major
     // manner, for the current tid, it's at row (tid % mDim). When we set up
     // blocked layout for the A scale tensor, we made sure that it has a
@@ -136,6 +142,20 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
     rewriter.replaceOp(op, result);
     return success();
   }
+
+private:
+  SmallVector<Value> unpackFP4Elements(Location loc, RewriterBase &rewriter,
+                                       ArrayRef<Value> packed) const {
+    // Split every fp4x2 into 2 bf16 values.
+    llvm::SmallVector<Value> unpacked;
+    unpacked.reserve(packed.size() * 2);
+    for (Value v : packed) {
+      auto [e0, e1] = LLVM::convertMxfp4x2ToBf16x2(rewriter, loc, v);
+      unpacked.push_back(e0);
+      unpacked.push_back(e1);
+    }
+    return unpacked;
+  }
 };
 } // anonymous namespace
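Note (our reading of the hunks above): in the packed e2m1 case each thread holds scaleVals.size() * 16 int8 values, and unpackFP4Elements expands every int8 into two bf16 values, so the 32-elements-per-scale relationship mandated by the MXFP spec still holds after unpacking (16 * 2 = 32).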

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp

Lines changed: 18 additions & 26 deletions
@@ -507,9 +507,10 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     ScaleDotElemType aElemType = dotOp.getLhsType();
     ScaleDotElemType bElemType = dotOp.getRhsType();

-    if (!(aElemType == ScaleDotElemType::E4M3 ||
+    if (!(aElemType == ScaleDotElemType::E2M1 ||
+          aElemType == ScaleDotElemType::E4M3 ||
           aElemType == ScaleDotElemType::E5M2))
-      return rewriter.notifyMatchFailure(dotOp, "NYI: non-mxfp8 LHS");
+      return rewriter.notifyMatchFailure(dotOp, "NYI: non-mxfp8/mxfp4 LHS");
     if (!(bElemType == ScaleDotElemType::E4M3 ||
           bElemType == ScaleDotElemType::E5M2 ||
           bElemType == ScaleDotElemType::BF16))
@@ -532,7 +533,16 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     unsigned nDim = mfmaInstr.value().getNDim();
     unsigned kDim = mfmaInstr.value().getKDim();
     unsigned kBase = mfmaInstr.value().getKBase();
-    unsigned kWdith = kBase *= kPack;
+
+    // If A tensor contains mxfp4, we pack every two values into one int8 value
+    // there. For such cases, we have different initial kWidth for LHS and RHS,
+    // which will be "fixed" later by using upcast_mxfp to convert LHS to
+    // unpacked values. For such packed cases, we cannot support flexible kPack
+    // choices from the developer--it just does not apply here. So mandate the
+    // choice here.
+    bool isPacked = aElemType == ScaleDotElemType::E2M1;
+    unsigned kWdiths[] = {isPacked ? 4 : kBase * kPack,
+                          isPacked ? 8 : kBase * kPack};

     // For A tensor, 32 consecutive elements along K dim share the same scale.
     // We'd like to keep the scale values together with the base values in the
@@ -553,38 +563,20 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {
     auto newAcc = rewriter.create<ttg::ConvertLayoutOp>(
         dotOp.getC().getLoc(), newRetType, dotOp.getC());

-    // OCP mxfp8 requires implementations to follow OCP fp8 elements. We are
-    // doing software emulation using bf16 here, so we map to OCP fp8 f8E4M3FN
-    // and f8E5M2.
-    auto enumToType = [&rewriter](ScaleDotElemType type) {
-      switch (type) {
-      case ScaleDotElemType::E4M3:
-        return rewriter.getFloat8E4M3FNType();
-      case ScaleDotElemType::E5M2:
-        return rewriter.getFloat8E5M2Type();
-      default:
-        llvm_unreachable("unexpected fp type");
-      }
-    };
-
     auto toMMABf16 = [&](TensorValue v, int idx,
                          ScaleDotElemType type) -> TensorValue {
-      assert(type == ScaleDotElemType::E5M2 || type == ScaleDotElemType::E4M3 ||
-             type == ScaleDotElemType::BF16);
-
       auto vType = v.getType();
       auto newVEncoding = DotOperandEncodingAttr::get(
-          ctx, idx, newRetType.getEncoding(), kWdith);
+          ctx, idx, newRetType.getEncoding(), kWdiths[idx]);
       auto newVType = RankedTensorType::get(
           vType.getShape(), vType.getElementType(), newVEncoding);
       v = rewriter.create<ttg::ConvertLayoutOp>(v.getLoc(), newVType, v);
       if (type == ScaleDotElemType::BF16)
         return v;
-
-      auto vTypeFp8 = RankedTensorType::get(vType.getShape(), enumToType(type),
-                                            newVEncoding);
-      v = cast<TensorValue>(
-          rewriter.create<BitcastOp>(v.getLoc(), vTypeFp8, v).getResult());
+      // Don't need to convert int8 holding mxfp4 for A--the upcast_mxfp op can
+      // take int8 tensor as input.
+      if (idx == 0 && type == ScaleDotElemType::E2M1)
+        return v;

       auto vTypeBf16 = RankedTensorType::get(
           vType.getShape(), rewriter.getBF16Type(), newVEncoding);
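In other words (our reading of the kWidth comment above): with mxfp4 packed two-per-int8, kWidth = 4 on the A side means 4 int8 values, i.e. 4 * 2 = 8 fp4 elements per thread, which lines up with the kWidth of 8 chosen for the unpacked B operand once upcast_mxfp expands A to bf16.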

third_party/nvidia/lib/TritonNVIDIAGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 9 additions & 34 deletions
@@ -1,6 +1,7 @@
 #include "mlir/Conversion/LLVMCommon/Pattern.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinTypes.h"
+#include "mlir/IR/PatternMatch.h"
 #include "mlir/IR/TypeUtilities.h"

 #include "PatternTritonGPUOpToLLVM.h"
@@ -12,7 +13,6 @@
 #include "triton/Dialect/Triton/IR/Dialect.h"
 #include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/SmallVector.h"
-#include "llvm/Support/raw_ostream.h"
 #include <array>

 using namespace mlir;
@@ -30,42 +30,17 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
       : ConvertOpToLLVMPattern<UpcastMXFPOp>(typeConverter, benefit),
         targetInfo(targetInfo) {}

-  llvm::SmallVector<Value>
-  unpackFP4Elements(Location loc, ConversionPatternRewriter &rewriter,
-                    const llvm::SmallVector<Value> &vals, Value laneId) const {
-    auto fp4x2ToBf16x2 = [&loc, &rewriter](Value v) -> Value {
-      auto em0 = and_(v, i8_val(0x70));
-      auto em1 = and_(v, i8_val(0x7));
-      Value v0 = or_(shl(zext(i16_ty, em0), i16_val(2)),
-                     shl(zext(i16_ty, and_(v, i8_val(0x80))), i16_val(8)));
-      Value v1 = or_(shl(zext(i16_ty, em1), i16_val(6)),
-                     shl(zext(i16_ty, and_(v, i8_val(0x8))), i16_val(12)));
-
-      // Three cases:
-      // 1) x is normal and non-zero: Correct bias
-      v0 = select(icmp_ne(and_(em0, i8_val(0x60)), i8_val(0)),
-                  add(v0, i16_val((127 - 1) << 7)), v0);
-      v1 = select(icmp_ne(and_(em1, i8_val(0x6)), i8_val(0)),
-                  add(v1, i16_val((127 - 1) << 7)), v1);
-
-      // 2) x is subnormal (x == 0bs001 where s is the sign): Map to +-0.5 in
-      // bf16
-      v0 = select(icmp_eq(em0, i8_val(0x10)),
-                  or_(i16_val(16128), and_(v0, i16_val(0x8000))), v0);
-      v1 = select(icmp_eq(em1, i8_val(0x1)),
-                  or_(i16_val(16128), and_(v1, i16_val(0x8000))), v1);
-      // 3) x is zero, nothing to do
-
-      // Swap as they come packed in big endian
-      return or_(zext(i32_ty, v0), shl(zext(i32_ty, v1), i32_val(16)));
-    };
+  llvm::SmallVector<Value> unpackFP4Elements(Location loc,
+                                             RewriterBase &rewriter,
+                                             ArrayRef<Value> vals) const {

-    auto fp4x8ToBf16x2 = [&loc, &rewriter, &fp4x2ToBf16x2](
-                             Value v) -> llvm::SmallVector<Value, 4> {
+    auto fp4x8ToBf16x2 = [&loc, &rewriter](Value v) {
       llvm::SmallVector<Value, 4> results(4);
       for (int i = 0; i < 4; ++i) {
         auto v_i = trunc(i8_ty, lshr(v, i32_val(8 * i)));
-        results[i] = fp4x2ToBf16x2(v_i);
+        auto [e0, e1] = LLVM::convertMxfp4x2ToBf16x2(rewriter, loc, v_i);
+        // Swap as they come packed in big endian
+        results[i] = or_(zext(i32_ty, e0), shl(zext(i32_ty, e1), i32_val(16)));
       }
       return results;
     };
@@ -104,7 +79,7 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
     Value laneId = urem(tid, warpSize);

     if (fpType == ScaleDotElemType::E2M1) {
-      xVals = unpackFP4Elements(loc, rewriter, xVals, laneId);
+      xVals = unpackFP4Elements(loc, rewriter, xVals);
     }

     auto scaleBf16x2 = [&loc, &rewriter](Value v, Value s) -> Value {
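The NVIDIA path then repacks each bf16 pair into one i32, with the swap noted in the comment above. A minimal Python sketch of that repacking (helper name is ours, not part of the commit):

def repack_bf16x2_to_i32(e0: int, e1: int) -> int:
    # e0 (from the high nibble) goes into the low 16 bits, e1 (from the low
    # nibble) into the high 16 bits, matching the big-endian fp4 packing.
    return ((e1 & 0xFFFF) << 16) | (e0 & 0xFFFF)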
