Commit d556ce9
[BACKEND] Support scalar fp_to_fp (#5132)
Now that fp_to_fp is marked as elementwise, we may have a scalar version of this op.
1 parent: 385671e
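For illustration, a minimal sketch of what tt.fp_to_fp now accepts after this change: both plain floats and float tensors verify, while the rounding rule for downcasts is unchanged. The function names below are hypothetical; the scalar form mirrors the new test added in test/Triton/canonicalize.mlir.

tt.func @fp_to_fp_scalar(%x: f32) -> f8E4M3FNUZ {
  // Scalar downcast; a rounding mode is still required when narrowing.
  %y = tt.fp_to_fp %x, rounding = rtne : f32 -> f8E4M3FNUZ
  tt.return %y : f8E4M3FNUZ
}

tt.func @fp_to_fp_tensor(%t: tensor<128xf8E4M3FNUZ>) -> tensor<128xf32> {
  // Tensor upcast; no rounding attribute is needed when widening.
  %u = tt.fp_to_fp %t : tensor<128xf8E4M3FNUZ> -> tensor<128xf32>
  tt.return %u : tensor<128xf32>
}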

File tree: 6 files changed, +34 -11 lines

include/triton/Dialect/Triton/IR/TritonOps.td (2 additions, 2 deletions)

@@ -100,11 +100,11 @@ def TT_FpToFpOp : TT_Op<"fp_to_fp", [Elementwise,
   }];

   let arguments = (
-    ins TT_FloatTensor:$src,
+    ins TT_FloatLike:$src,
     OptionalAttr<TT_RoundingModeAttr>:$rounding
   );

-  let results = (outs TT_FloatTensor:$result);
+  let results = (outs TT_FloatLike:$result);

   let assemblyFormat = "$src attr-dict (`,` `rounding` `=` $rounding^)? `:` type($src) `->` type($result)";

lib/Dialect/Triton/IR/Ops.cpp (14 additions, 6 deletions)

@@ -734,26 +734,34 @@ OpFoldResult FpToFpOp::fold(FoldAdaptor adaptor) {
   auto srcVal = getSrc();
   auto dstTy = getType();

-  const llvm::fltSemantics &semantic =
-      llvm::cast<FloatType>(dstTy.getElementType()).getFloatSemantics();
+  auto resElemType = cast<FloatType>(getElementTypeOrSelf(getType()));
+  const llvm::fltSemantics &semantic = resElemType.getFloatSemantics();

   if (matchPattern(srcVal, m_PosZeroFloat())) {
     llvm::APFloat posZero =
         llvm::APFloat::getZero(semantic, /*negative=*/false);
-    return DenseFPElementsAttr::get(dstTy, posZero);
+    if (auto tensorTy = dyn_cast<RankedTensorType>(dstTy))
+      return DenseElementsAttr::get(tensorTy, posZero);
+    return Builder(getContext()).getFloatAttr(resElemType, posZero);
   }

   if (matchPattern(srcVal, m_NegZeroFloat())) {
     llvm::APFloat negZero = llvm::APFloat::getZero(semantic, /*negative=*/true);
-    return DenseFPElementsAttr::get(dstTy, negZero);
+    if (auto tensorTy = dyn_cast<RankedTensorType>(dstTy))
+      return DenseElementsAttr::get(tensorTy, negZero);
+    return Builder(getContext()).getFloatAttr(resElemType, negZero);
   }

   return {};
 }

 LogicalResult FpToFpOp::verify() {
-  auto dstType = getType().getElementType();
-  auto srcType = getSrc().getType().getElementType();
+  auto dstType = getType();
+  auto srcType = getSrc().getType();
+  if (auto dstTensorType = dyn_cast<RankedTensorType>(dstType))
+    dstType = dstTensorType.getElementType();
+  if (auto srcTensorType = dyn_cast<RankedTensorType>(srcType))
+    srcType = srcTensorType.getElementType();
   if ((dstType.getIntOrFloatBitWidth() < srcType.getIntOrFloatBitWidth()) &&
       (!getRounding().has_value())) {
     return emitError("Rounding mode is required for FP downcast");
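As a sketch of the fold behavior above: a zero source now folds to a scalar arith.constant when the result is a plain float, and still folds to a dense splat when the result is a tensor. Function names are illustrative; the scalar case matches the test added below.

tt.func @fold_scalar_zero() -> f8E4M3FNUZ {
  // A +0.0 source folds to a scalar arith.constant of the result type.
  %cst = arith.constant 0.00e+00 : f32
  %v = tt.fp_to_fp %cst, rounding = rtne : f32 -> f8E4M3FNUZ
  tt.return %v : f8E4M3FNUZ
}

tt.func @fold_tensor_zero() -> tensor<32xf8E4M3FNUZ> {
  // A splat -0.0 tensor source still folds to a dense splat constant.
  %cst = arith.constant dense<-0.00e+00> : tensor<32xf32>
  %v = tt.fp_to_fp %cst, rounding = rtne : tensor<32xf32> -> tensor<32xf8E4M3FNUZ>
  tt.return %v : tensor<32xf8E4M3FNUZ>
}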

lib/Dialect/TritonGPU/Transforms/AccelerateMatmul.cpp (2 additions, 1 deletion)

@@ -495,7 +495,8 @@ class DecomposeScaledBlocked
       assert(type == ScaleDotElemType::E5M2 || type == ScaleDotElemType::E4M3);
       auto vTypeBf16 = RankedTensorType::get(
           newVType.getShape(), rewriter.getBF16Type(), newVType.getEncoding());
-      ret = rewriter.create<FpToFpOp>(v.getLoc(), vTypeBf16, ret);
+      ret = cast<TypedValue<RankedTensorType>>(
+          rewriter.create<FpToFpOp>(v.getLoc(), vTypeBf16, ret).getResult());
     }
     if (opt_scale.has_value()) {
       auto scale = *opt_scale;

lib/Dialect/TritonGPU/Transforms/RemoveLayoutConversions.cpp (1 addition, 1 deletion)

@@ -1024,7 +1024,7 @@ void LayoutRematerialization::hoistConvertOnTopOfExtOrBroadcast(
     if (auto fpToFpOp = dyn_cast<FpToFpOp>(op)) {
       auto srcType = cast<RankedTensorType>(fpToFpOp.getOperand().getType());
       return getElementBitWidth(srcType) <
-             getElementBitWidth(fpToFpOp.getType());
+             getElementBitWidth(cast<RankedTensorType>(fpToFpOp.getType()));
     }
     return false;
   };

test/Triton/canonicalize.mlir (13 additions, 0 deletions)

@@ -67,6 +67,19 @@ module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 :

 // -----

+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+  tt.func @fp_to_fp_pos_zero_fold_scalar() -> f8E4M3FNUZ {
+    // CHECK-LABEL: fp_to_fp_pos_zero_fold_scalar
+    // CHECK-NEXT: %[[cst_folded:.+]] = arith.constant 0.000000e+00 : f8E4M3FNUZ
+    // CHECK-NEXT: tt.return %[[cst_folded]]
+    %cst = arith.constant 0.00e+00 : f32
+    %cst_converted = tt.fp_to_fp %cst, rounding = rtne : f32 -> f8E4M3FNUZ
+    tt.return %cst_converted : f8E4M3FNUZ
+  }
+} // end module
+
+// -----
+
 #blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
 module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
   tt.func @fp_to_fp_neg_zero_fold() -> tensor<32x128xf8E4M3FN, #blocked> {

third_party/amd/lib/TritonAMDGPUTransforms/AccelerateAMDMatmul.cpp (2 additions, 1 deletion)

@@ -580,7 +580,8 @@ class ScaledBlockedToMFMA final : public OpRewritePattern<triton::DotScaledOp> {

       auto vTypeBf16 = RankedTensorType::get(
           vType.getShape(), rewriter.getBF16Type(), newVEncoding);
-      return rewriter.create<FpToFpOp>(v.getLoc(), vTypeBf16, v);
+      return cast<TensorValue>(
+          rewriter.create<FpToFpOp>(v.getLoc(), vTypeBf16, v).getResult());
     };
     a = toMMABf16(a, 0, aElemType);
     b = toMMABf16(b, 1, bElemType);
