; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS

; Test folding of float->int->float roundtrips into float-only operations.
; The optimization converts patterns like:
;   sitofp(fptosi(x)) -> ftrunc(x)
;   sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation
; in SIMD/FP registers.
; Plain signed roundtrip: by default it stays as fcvtzs+scvtf; only under
; --enable-no-signed-zeros-fp-math does it fold to a single frintz (the fold
; presumably differs on -0.0 — confirm against the transform's guard).
define float @test_signed_basic(float %x) {
; CHECK-LABEL: test_signed_basic:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcvtzs s0, s0
; CHECK-NEXT:    scvtf s0, s0
; CHECK-NEXT:    ret
;
; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
; NO-SIGNED-ZEROS:       // %bb.0: // %entry
; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
; NO-SIGNED-ZEROS-NEXT:    ret
entry:
  %i = fptosi float %x to i32
  %f = sitofp i32 %i to float
  ret float %f
}
| 27 | + |
; Unsigned roundtrip: fcvtzu+ucvtf by default; folds to frintz under
; --enable-no-signed-zeros-fp-math, same as the signed case.
define float @test_unsigned_basic(float %x) {
; CHECK-LABEL: test_unsigned_basic:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcvtzu s0, s0
; CHECK-NEXT:    ucvtf s0, s0
; CHECK-NEXT:    ret
;
; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
; NO-SIGNED-ZEROS:       // %bb.0: // %entry
; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
; NO-SIGNED-ZEROS-NEXT:    ret
entry:
  %i = fptoui float %x to i32
  %f = uitofp i32 %i to float
  ret float %f
}
| 44 | + |
; Signed clamp to [-512, 1023]: both bounds are exactly representable in f32,
; so the no-signed-zeros run folds the whole chain to frintz+fmaxnm+fminnm
; and never leaves the FP/SIMD register file; the default run bounces through
; GPRs (fcvtzs/csel/scvtf).
define float @test_signed_min_max(float %x) {
; CHECK-LABEL: test_signed_min_max:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcvtzs w9, s0
; CHECK-NEXT:    mov w8, #-512 // =0xfffffe00
; CHECK-NEXT:    cmn w9, #512
; CHECK-NEXT:    csel w8, w9, w8, gt
; CHECK-NEXT:    mov w9, #1023 // =0x3ff
; CHECK-NEXT:    cmp w8, #1023
; CHECK-NEXT:    csel w8, w8, w9, lt
; CHECK-NEXT:    scvtf s0, w8
; CHECK-NEXT:    ret
;
; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
; NO-SIGNED-ZEROS:       // %bb.0: // %entry
; NO-SIGNED-ZEROS-NEXT:    movi v1.2s, #196, lsl #24
; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
; NO-SIGNED-ZEROS-NEXT:    mov w8, #49152 // =0xc000
; NO-SIGNED-ZEROS-NEXT:    movk w8, #17535, lsl #16
; NO-SIGNED-ZEROS-NEXT:    fmaxnm s0, s0, s1
; NO-SIGNED-ZEROS-NEXT:    fmov s1, w8
; NO-SIGNED-ZEROS-NEXT:    fminnm s0, s0, s1
; NO-SIGNED-ZEROS-NEXT:    ret
entry:
  %i = fptosi float %x to i32
  %lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
  %clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
  %f = sitofp i32 %clamped to float
  ret float %f
}
| 75 | + |
; Unsigned clamp to [512, 1023]: as in the signed case, the no-signed-zeros
; run folds to frintz+fmaxnm+fminnm with FP immediates; the default run uses
; fcvtzu/csel/ucvtf through GPRs.
define float @test_unsigned_min_max(float %x) {
; CHECK-LABEL: test_unsigned_min_max:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcvtzu w9, s0
; CHECK-NEXT:    mov w8, #512 // =0x200
; CHECK-NEXT:    cmp w9, #512
; CHECK-NEXT:    csel w8, w9, w8, hi
; CHECK-NEXT:    mov w9, #1023 // =0x3ff
; CHECK-NEXT:    cmp w8, #1023
; CHECK-NEXT:    csel w8, w8, w9, lo
; CHECK-NEXT:    ucvtf s0, w8
; CHECK-NEXT:    ret
;
; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
; NO-SIGNED-ZEROS:       // %bb.0: // %entry
; NO-SIGNED-ZEROS-NEXT:    movi v1.2s, #68, lsl #24
; NO-SIGNED-ZEROS-NEXT:    frintz s0, s0
; NO-SIGNED-ZEROS-NEXT:    mov w8, #49152 // =0xc000
; NO-SIGNED-ZEROS-NEXT:    movk w8, #17535, lsl #16
; NO-SIGNED-ZEROS-NEXT:    fmaxnm s0, s0, s1
; NO-SIGNED-ZEROS-NEXT:    fmov s1, w8
; NO-SIGNED-ZEROS-NEXT:    fminnm s0, s0, s1
; NO-SIGNED-ZEROS-NEXT:    ret
entry:
  %i = fptoui float %x to i32
  %lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
  %clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
  %f = uitofp i32 %clamped to float
  ret float %f
}
| 106 | + |
; Negative test: 16777217 (2^24 + 1) is NOT exactly representable in f32, so
; the smin bound cannot become an fminnum constant — both runs keep the
; scalar fcvtzs/csel/scvtf sequence and no frintz fold happens.
define float @test_inexact_16777217(float %x) {
; CHECK-LABEL: test_inexact_16777217:
; CHECK:       // %bb.0: // %entry
; CHECK-NEXT:    fcvtzs w8, s0
; CHECK-NEXT:    mov w9, #16777216 // =0x1000000
; CHECK-NEXT:    cmp w8, w9
; CHECK-NEXT:    mov w9, #1 // =0x1
; CHECK-NEXT:    movk w9, #256, lsl #16
; CHECK-NEXT:    csel w8, w8, w9, le
; CHECK-NEXT:    scvtf s0, w8
; CHECK-NEXT:    ret
;
; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
; NO-SIGNED-ZEROS:       // %bb.0: // %entry
; NO-SIGNED-ZEROS-NEXT:    fcvtzs w8, s0
; NO-SIGNED-ZEROS-NEXT:    mov w9, #16777216 // =0x1000000
; NO-SIGNED-ZEROS-NEXT:    cmp w8, w9
; NO-SIGNED-ZEROS-NEXT:    mov w9, #1 // =0x1
; NO-SIGNED-ZEROS-NEXT:    movk w9, #256, lsl #16
; NO-SIGNED-ZEROS-NEXT:    csel w8, w8, w9, le
; NO-SIGNED-ZEROS-NEXT:    scvtf s0, w8
; NO-SIGNED-ZEROS-NEXT:    ret
entry:
  %i = fptosi float %x to i32
  %clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
  %f = sitofp i32 %clamped to float
  ret float %f
}
| 136 | + |
; Declarations for the clamp intrinsics used above.
declare i32 @llvm.smin.i32(i32, i32)
declare i32 @llvm.smax.i32(i32, i32)
declare i32 @llvm.umin.i32(i32, i32)
declare i32 @llvm.umax.i32(i32, i32)