@@ -50,3 +50,74 @@ tt.func @fn(%arg0: tensor<1xf32, #sliced0>) -> (tensor<32x1xf32, #blocked0>){
   tt.return %b : tensor<32x1xf32, #blocked0>
 }
 } // end module
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+tt.func @fp_to_fp_pos_zero_fold() -> tensor<32x128xf8E4M3FNUZ, #blocked> {
+  // CHECK-LABEL: fp_to_fp_pos_zero_fold
+  // CHECK-NEXT: %[[cst_folded:.+]] = arith.constant dense<0.000000e+00> : tensor<32x128xf8E4M3FNUZ, #blocked>
+  // CHECK-NEXT: tt.return %[[cst_folded]]
+  %cst = arith.constant dense<0.00e+00> : tensor<32x128xf32, #blocked>
+  %cst_converted = tt.fp_to_fp %cst, rounding = rtne : tensor<32x128xf32, #blocked> -> tensor<32x128xf8E4M3FNUZ, #blocked>
+  tt.return %cst_converted : tensor<32x128xf8E4M3FNUZ, #blocked>
+}
+} // end module
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+tt.func @fp_to_fp_neg_zero_fold() -> tensor<32x128xf8E4M3FN, #blocked> {
+  // CHECK-LABEL: fp_to_fp_neg_zero_fold
+  // CHECK-NEXT: %[[cst_folded:.+]] = arith.constant dense<-0.000000e+00> : tensor<32x128xf8E4M3FN, #blocked>
+  // CHECK-NEXT: tt.return %[[cst_folded]]
+  %cst = arith.constant dense<-0.00e+00> : tensor<32x128xf32, #blocked>
+  %cst_converted = tt.fp_to_fp %cst, rounding = rtne : tensor<32x128xf32, #blocked> -> tensor<32x128xf8E4M3FN, #blocked>
+  tt.return %cst_converted : tensor<32x128xf8E4M3FN, #blocked>
+}
+} // end module
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+tt.func @fp_to_fp_neg_zero_fold() -> tensor<32x128xf8E4M3FNUZ, #blocked> {
+  // CHECK-LABEL: fp_to_fp_neg_zero_fold
+  // We fold to positive zero here because, by definition, f8E4M3FNUZ has no encoding for negative zero.
+  // CHECK-NEXT: %[[cst_folded:.+]] = arith.constant dense<0.000000e+00> : tensor<32x128xf8E4M3FNUZ, #blocked>
+  // CHECK-NEXT: tt.return %[[cst_folded]]
+  %cst = arith.constant dense<-0.00e+00> : tensor<32x128xf32, #blocked>
+  %cst_converted = tt.fp_to_fp %cst, rounding = rtne : tensor<32x128xf32, #blocked> -> tensor<32x128xf8E4M3FNUZ, #blocked>
+  tt.return %cst_converted : tensor<32x128xf8E4M3FNUZ, #blocked>
+}
+} // end module
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+tt.func @fold_fp_to_fp_non_zero_nofold() -> tensor<32x128xf8E4M3FNUZ, #blocked> {
+  // CHECK-LABEL: fold_fp_to_fp_non_zero_nofold
+  // CHECK-NEXT: %[[cst:.+]] = arith.constant dense<0xFF800000> : tensor<32x128xf32, #blocked>
+  // CHECK-NEXT: %[[cst_cvt:.+]] = tt.fp_to_fp %[[cst]]
+  // CHECK-NEXT: tt.return %[[cst_cvt]]
+  %cst = arith.constant dense<0xFF800000> : tensor<32x128xf32, #blocked>
+  %cst_converted = tt.fp_to_fp %cst, rounding = rtne : tensor<32x128xf32, #blocked> -> tensor<32x128xf8E4M3FNUZ, #blocked>
+  tt.return %cst_converted : tensor<32x128xf8E4M3FNUZ, #blocked>
+}
+} // end module
+
+// -----
+
+#blocked = #triton_gpu.blocked<{sizePerThread = [1, 4], threadsPerWarp = [8, 4], warpsPerCTA = [1, 1], order = [1, 0], CTAsPerCGA = [1, 1], CTASplitNum = [1, 1], CTAOrder = [1, 0]}>
+module attributes {"triton_gpu.num-ctas" = 1 : i32, "triton_gpu.num-warps" = 1 : i32} {
+tt.func @fold_fp_to_fp_non_constant_nofold(%arg0: tensor<32x128xf32, #blocked>) -> tensor<32x128xf8E4M3FNUZ, #blocked> {
+  // CHECK-LABEL: fold_fp_to_fp_non_constant_nofold
+  // CHECK-NEXT: %[[arg_cvt:.+]] = tt.fp_to_fp %arg0
+  // CHECK-NEXT: tt.return %[[arg_cvt]]
+  %cst_converted = tt.fp_to_fp %arg0, rounding = rtne : tensor<32x128xf32, #blocked> -> tensor<32x128xf8E4M3FNUZ, #blocked>
+  tt.return %cst_converted : tensor<32x128xf8E4M3FNUZ, #blocked>
+}
+} // end module
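
A note on the FNUZ negative-zero cases above: f8E4M3FNUZ reserves the sign-bit-only bit pattern for NaN, so the format has no representation of -0.0 and the fold has to produce +0.0, while f8E4M3FN keeps the -0.0. The sketch below is a standalone illustration, not Triton's fold implementation; it assumes LLVM's `llvm::APFloat` conversion models the same zero behavior these tests expect, and the `convertNegZero` helper exists only for this demo.

```cpp
// Standalone illustration (not Triton's fold code): use llvm::APFloat to show
// why -0.0 is expected to survive into f8E4M3FN but become +0.0 in
// f8E4M3FNUZ, whose would-be negative-zero encoding is reserved for NaN.
#include "llvm/ADT/APFloat.h"
#include "llvm/Support/raw_ostream.h"

// Illustrative helper: convert an f32 -0.0 into `dst` with
// round-to-nearest-even (mirroring `rounding = rtne`) and print the sign.
static void convertNegZero(const llvm::fltSemantics &dst, const char *name) {
  bool losesInfo = false;
  llvm::APFloat val =
      llvm::APFloat::getZero(llvm::APFloat::IEEEsingle(), /*Negative=*/true);
  val.convert(dst, llvm::APFloat::rmNearestTiesToEven, &losesInfo);
  llvm::outs() << name << ": " << (val.isNegative() ? "-0.0" : "+0.0") << "\n";
}

int main() {
  convertNegZero(llvm::APFloat::Float8E4M3FN(), "f8E4M3FN");     // expect -0.0
  convertNegZero(llvm::APFloat::Float8E4M3FNUZ(), "f8E4M3FNUZ"); // expect +0.0
  return 0;
}
```

Building this against LLVM's headers (e.g. linking LLVMSupport) should print signs consistent with the CHECK lines in the neg-zero tests above.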