Commit 73b9356
[Intel] Port changes to support fp16 scaled dot (#3153)

These changes are ported from f9d9fad, but they are not enough to make the fp16 scaled dot unit tests pass; more investigation is needed to enable those test cases. Part of #3141.

Signed-off-by: Whitney Tsang <[email protected]>
1 parent f7aaf04 commit 73b9356

2 files changed (+84, -19 lines)

third_party/intel/lib/TritonIntelGPUToLLVM/UpcastMXFPToLLVM.cpp

Lines changed: 57 additions & 4 deletions
@@ -16,6 +16,53 @@ using namespace mlir::triton::gpu::intel;
 
 namespace {
 
+SmallVector<Value> convertMxfp4x2ToFp16x2(RewriterBase &rewriter, Location loc,
+                                          ArrayRef<Value> values) {
+  SmallVector<Value> results;
+  for (auto v : values) {
+    auto em0 = and_(v, i8_val(0x7));
+    auto em1 = and_(v, i8_val(0x70));
+    // FP16 bits: sign = 1, exponent = 5, mantissa = 10
+    Value v0 = or_(shl(zext(i16_ty, em0), i16_val(10 - 1)),
+                   shl(zext(i16_ty, and_(v, i8_val(0x8))), i16_val(12)));
+    Value v1 = or_(shl(zext(i16_ty, em1), i16_val(10 - 1 - 4)),
+                   shl(zext(i16_ty, and_(v, i8_val(0x80))), i16_val(8)));
+
+    // Three cases:
+    // 1) x is normal and non-zero: Correct bias
+    v0 = select(icmp_ne(and_(em0, i8_val(0x6)), i8_val(0)),
+                add(v0, i16_val((15 - 1) << 10)), v0);
+    v1 = select(icmp_ne(and_(em1, i8_val(0x60)), i8_val(0)),
+                add(v1, i16_val((15 - 1) << 10)), v1);
+
+    // 2) x is subnormal (x == 0bs001 where s is the sign): Map to fp16 +-0.5
+    v0 = bitcast(select(icmp_eq(em0, i8_val(0x1)),
+                        or_(i16_val(0x3800), and_(v0, i16_val(0x8000))), v0),
+                 f16_ty);
+    v1 = bitcast(select(icmp_eq(em1, i8_val(0x10)),
+                        or_(i16_val(0x3800), and_(v1, i16_val(0x8000))), v1),
+                 f16_ty);
+    // 3) x is zero, nothing to do
+    results.push_back(v0);
+    results.push_back(v1);
+  }
+  return results;
+}
+
+Value mxfpScaleFp16(ConversionPatternRewriter &rewriter, Location loc, Value v,
+                    Value scale, bool fastMath) {
+  Value scaleF32 = bitcast(shl(zext(i32_ty, scale), i32_val(23)), f32_ty);
+  Value scaleF16 = LLVM::intel::convertFp32ToFp16(loc, rewriter, scaleF32,
+                                                  RoundingMode::RTNE);
+  Value mulF16 = fmul(v, scaleF16);
+  if (fastMath)
+    return mulF16;
+  // Account for NaN in the scale as per the mxfp specification.
+  Value scaleIsNan = icmp_eq(scale, i8_val(0xff));
+  Value nanF16 = bitcast(i16_val(0x7c01), f16_ty);
+  return select(scaleIsNan, nanF16, bitcast(mulF16, f16_ty));
+};
+
 static Value mxfpScaleBf16(ConversionPatternRewriter &rewriter, Location loc,
                            Value v, Value scale, bool fastMath) {
   Value vBf16 = bitcast(v, bf16_ty);
@@ -61,8 +108,11 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
     Value warpId = udiv(tid, warpSize);
     Value laneId = urem(tid, warpSize);
 
-    if (fpType == ScaleDotElemType::E2M1)
-      xVals = LLVM::convertMxfp4x2ToBf16x2(rewriter, loc, xVals);
+    bool useFp16 = op.getType().getElementType().isF16();
+    if (fpType == ScaleDotElemType::E2M1) {
+      xVals = useFp16 ? convertMxfp4x2ToFp16x2(rewriter, loc, xVals)
+                      : LLVM::convertMxfp4x2ToBf16x2(rewriter, loc, xVals);
+    }
 
     auto xType = cast<RankedTensorType>(op->getOperandTypes()[0]);
     auto dotEnc = cast<DotOperandEncodingAttr>(xType.getEncoding());
@@ -106,8 +156,11 @@ class UpcastMXFPOpPattern : public ConvertOpToLLVMPattern<UpcastMXFPOp> {
         for (int k = 0; k < kWidth; ++k) {
           unsigned idx = i * scalingBlockSize + mxfp * mxfpSize +
                          rep * subTileSize * kWidth + subTile * kWidth + k;
-          xVals[idx] = mxfpScaleBf16(rewriter, loc, xVals[idx], si[subTile],
-                                     op.getFastMath());
+          xVals[idx] = useFp16
+                           ? mxfpScaleFp16(rewriter, loc, xVals[idx],
+                                           si[subTile], op.getFastMath())
+                           : mxfpScaleBf16(rewriter, loc, xVals[idx],
+                                           si[subTile], op.getFastMath());
         }
       }
     }
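
For readers tracing the bit manipulation in convertMxfp4x2ToFp16x2: each byte packs two e2m1 values (sign 1 bit, exponent 2 bits with bias 1, mantissa 1 bit). The scalar sketch below is a plain-C++ reading of the same logic, not part of the commit; the function name is hypothetical, and fp16 results are shown as raw uint16_t bit patterns.

#include <cstdint>
#include <utility>

// Decode one packed mxfp4 (e2m1) byte into two fp16 bit patterns,
// mirroring convertMxfp4x2ToFp16x2 above. Hypothetical helper.
std::pair<uint16_t, uint16_t> mxfp4x2ToFp16x2Bits(uint8_t v) {
  uint8_t em0 = v & 0x7;   // exponent+mantissa of the low nibble
  uint8_t em1 = v & 0x70;  // exponent+mantissa of the high nibble
  // Move the e2m1 fields into the fp16 exponent/mantissa slots and the
  // sign bits (0x8 / 0x80) into bit 15.
  uint16_t v0 = (uint16_t(em0) << (10 - 1)) | (uint16_t(v & 0x8) << 12);
  uint16_t v1 = (uint16_t(em1) << (10 - 1 - 4)) | (uint16_t(v & 0x80) << 8);
  // 1) Normal, non-zero value (exponent bits set): rebias from the e2m1
  //    bias (1) to the fp16 bias (15).
  if (em0 & 0x6)
    v0 += (15 - 1) << 10;
  if (em1 & 0x60)
    v1 += (15 - 1) << 10;
  // 2) Subnormal value (0bs001 == +-0.5): fp16 0x3800 with the sign kept.
  if (em0 == 0x1)
    v0 = 0x3800 | (v0 & 0x8000);
  if (em1 == 0x10)
    v1 = 0x3800 | (v1 & 0x8000);
  // 3) Zero: all bits are already zero.
  return {v0, v1};
}

For example, the low nibble 0b0111 (e = 3, m = 1) decodes to 1.5 * 2^(3-1) = 6.0, i.e. fp16 bit pattern 0x4600.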
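mxfpScaleFp16 relies on the e8m0 scale byte being a bare biased exponent: shifting it into the fp32 exponent field (bits 30:23) reconstructs the power of two directly, after which a single fmul applies it. A scalar sketch under the same assumptions (plain C++, hypothetical names, fp32 standing in for fp16):

#include <cstdint>
#include <cstring>
#include <limits>

// Decode an e8m0 scale: the byte is a biased fp32 exponent, so placing it
// in the exponent field yields 2^(scale - 127). Hypothetical helper.
float decodeE8M0(uint8_t scale) {
  uint32_t bits = uint32_t(scale) << 23;
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

// Apply the scale; outside fast-math, scale == 0xff encodes NaN per the
// mxfp specification and must propagate to the result.
float mxfpScale(float v, uint8_t scale, bool fastMath) {
  float scaled = v * decodeE8M0(scale);
  if (fastMath)
    return scaled;
  return scale == 0xff ? std::numeric_limits<float>::quiet_NaN() : scaled;
}

The MLIR version additionally rounds the decoded scale from fp32 to fp16 (RTNE) before the multiply, since the operand vector is already fp16.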

third_party/intel/lib/TritonIntelGPUTransforms/AccelerateMatmul.cpp

Lines changed: 27 additions & 15 deletions
@@ -227,7 +227,8 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
       return elemType == tt::ScaleDotElemType::E2M1 ||
              elemType == tt::ScaleDotElemType::E4M3 ||
              elemType == tt::ScaleDotElemType::E5M2 ||
-             elemType == tt::ScaleDotElemType::BF16;
+             elemType == tt::ScaleDotElemType::BF16 ||
+             elemType == tt::ScaleDotElemType::FP16;
     };
     if (!supportsTypes(aElemType) || !supportsTypes(bElemType))
       return rewriter.notifyMatchFailure(scaledDotOp, "NYI: mxfp6 operand");
@@ -263,27 +264,31 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     assert((aDesc.scale || bDesc.scale) && "No scale provided");
     assert(!(aDesc.scale && bDesc.scale) && "NYI: Both LHS and RHS scale");
 
+    bool useFp16 = aDesc.elemType == tt::ScaleDotElemType::FP16 ||
+                   bDesc.elemType == tt::ScaleDotElemType::FP16;
+
     if (aDesc.scale) {
       TensorValue newA =
           convertScaledOperand<ttgi::DpasEncodingAttr::OpIdx::OperandA>(
-              aDesc, fastMath, dpasEnc, newRetType, mod, rewriter);
+              aDesc, useFp16, fastMath, dpasEnc, newRetType, mod, rewriter);
       TensorValue newB =
           convertUnscaledOperand<ttgi::DpasEncodingAttr::OpIdx::OperandB>(
-              bDesc, dpasEnc, newRetType, rewriter);
+              bDesc, useFp16, dpasEnc, newRetType, rewriter);
       return {newA, newB};
     }
 
     TensorValue newB =
         convertScaledOperand<ttgi::DpasEncodingAttr::OpIdx::OperandB>(
-            bDesc, fastMath, dpasEnc, newRetType, mod, rewriter);
+            bDesc, useFp16, fastMath, dpasEnc, newRetType, mod, rewriter);
     TensorValue newA =
         convertUnscaledOperand<ttgi::DpasEncodingAttr::OpIdx::OperandA>(
-            aDesc, dpasEnc, newRetType, rewriter);
+            aDesc, useFp16, dpasEnc, newRetType, rewriter);
     return {newA, newB};
   }
 
   template <ttgi::DpasEncodingAttr::OpIdx opIdx>
-  TensorValue convertScaledOperand(OpDescriptor opDesc, bool fastMath,
+  TensorValue convertScaledOperand(OpDescriptor opDesc, bool useFp16,
+                                   bool fastMath,
                                    ttg::intel::DpasEncodingAttr dpasEnc,
                                    RankedTensorType retType, ModuleOp mod,
                                    PatternRewriter &rewriter) const {
@@ -304,7 +309,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     auto newOpEncoding = ttg::DotOperandEncodingAttr::get(
         ctx, unsigned(opIdx), opEncoding, opEncoding.getOpsPerChannel());
     TensorValue op =
-        createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter);
+        createArg(opDesc.op, opDesc.elemType, useFp16, newOpEncoding, rewriter);
 
     unsigned instrShapeM = dpasEnc.getDPASInstShapeA()[0];
     SmallVector<unsigned, 2> threadsPerWarp{instrShapeM,
@@ -332,7 +337,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
   }
 
   template <ttgi::DpasEncodingAttr::OpIdx opIdx>
-  TensorValue convertUnscaledOperand(OpDescriptor opDesc,
+  TensorValue convertUnscaledOperand(OpDescriptor opDesc, bool useFp16,
                                      ttg::intel::DpasEncodingAttr dpasEnc,
                                      RankedTensorType retType,
                                      PatternRewriter &rewriter) const {
@@ -341,7 +346,8 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     auto newOpEncoding = ttg::DotOperandEncodingAttr::get(
         opDesc.op.getContext(), unsigned(opIdx), dpasEnc,
         dpasEnc.getOpsPerChannel());
-    return createArg(opDesc.op, opDesc.elemType, newOpEncoding, rewriter);
+    return createArg(opDesc.op, opDesc.elemType, useFp16, newOpEncoding,
+                     rewriter);
   }
 
   ttg::intel::DpasEncodingAttr
@@ -385,7 +391,7 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
                                          oldAcc);
   }
 
-  TensorValue createArg(TensorValue v, tt::ScaleDotElemType type,
+  TensorValue createArg(TensorValue v, tt::ScaleDotElemType type, bool useFp16,
                         Attribute vEncoding, PatternRewriter &rewriter) const {
     RankedTensorType vType = v.getType();
     auto newVType = RankedTensorType::get(vType.getShape(),
@@ -395,13 +401,16 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
 
     // convert to bf16
     if (type != tt::ScaleDotElemType::E2M1 &&
-        type != tt::ScaleDotElemType::BF16) {
+        type != tt::ScaleDotElemType::BF16 &&
+        type != tt::ScaleDotElemType::FP16) {
       assert(type == tt::ScaleDotElemType::E5M2 ||
              type == tt::ScaleDotElemType::E4M3);
-      auto vTypeBf16 = RankedTensorType::get(
-          newVType.getShape(), rewriter.getBF16Type(), newVType.getEncoding());
+      auto upcastedType = RankedTensorType::get(
+          newVType.getShape(),
+          useFp16 ? rewriter.getF16Type() : rewriter.getBF16Type(),
+          newVType.getEncoding());
       ret = cast<TypedValue<RankedTensorType>>(
-          rewriter.create<tt::FpToFpOp>(v.getLoc(), vTypeBf16, ret)
+          rewriter.create<tt::FpToFpOp>(v.getLoc(), upcastedType, ret)
               .getResult());
     }
     return ret;
@@ -423,8 +432,11 @@ class DecomposeScaledBlocked : public OpRewritePattern<tt::DotScaledOp> {
     if (!scale)
       return v;
 
+    Builder b(v.getContext());
+    bool useFp16 = elemType == tt::ScaleDotElemType::FP16;
+    Type outputElemType = useFp16 ? b.getF16Type() : b.getBF16Type();
     auto retTy = triton::gpu::intel::UpcastMXFPOp::deduceOutputType(
-        v, elemType, Builder(v.getContext()).getBF16Type());
+        v, elemType, outputElemType);
     return rewriter.create<ttgi::UpcastMXFPOp>(v.getLoc(), retTy, v, scale,
                                                elemType, fastMath);
   }
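
One reason fp16 is a natural intermediate in the E5M2 branch of createArg: widening e5m2 to fp16 is lossless and amounts to a pure bit shift, since e5m2 (1-5-2) and fp16 (1-5-10) share the same exponent width and bias. A sketch of that observation, not part of the commit (the FpToFpOp above is the actual mechanism):

#include <cstdint>

// Widen an e5m2 byte to an fp16 bit pattern: sign, exponent, and the two
// mantissa bits keep their relative positions; eight zero mantissa bits are
// appended. E4M3 needs a real conversion since its exponent bias (7) differs.
uint16_t e5m2ToFp16Bits(uint8_t x) { return uint16_t(x) << 8; }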
