Skip to content

Commit 5d79dd0

Browse files
committed
[DAGCombiner] Extend fp->int->fp optimizations to include clamping
1 parent 2e7afb1 commit 5d79dd0

File tree

2 files changed

+201
-18
lines changed

2 files changed

+201
-18
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "llvm/ADT/APFloat.h"
1919
#include "llvm/ADT/APInt.h"
20+
#include "llvm/ADT/APSInt.h"
2021
#include "llvm/ADT/ArrayRef.h"
2122
#include "llvm/ADT/DenseMap.h"
2223
#include "llvm/ADT/IntervalMap.h"
@@ -18874,6 +18875,8 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
1887418875
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1887518876
const TargetLowering &TLI) {
1887618877
// We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
18878+
// Additionally, if there are clamps (smin/smax or umin/umax) around
18879+
// the fpto[us]i, we can fold those into fminnum/fmaxnum around the ftrunc.
1887718880
// If NoSignedZerosFPMath is enabled, this is a direct replacement.
1887818881
// Otherwise, for strict math, we must handle edge cases:
1887918882
// 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
@@ -18885,28 +18888,68 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1888518888
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
1888618889
return SDValue();
1888718890

18888-
// fptosi/fptoui round towards zero, so converting from FP to integer and
18889-
// back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
18890-
SDValue N0 = N->getOperand(0);
18891-
if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
18892-
N0.getOperand(0).getValueType() == VT) {
18893-
if (DAG.getTarget().Options.NoSignedZerosFPMath)
18894-
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18895-
}
18891+
bool IsUnsigned = N->getOpcode() == ISD::UINT_TO_FP;
18892+
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
18893+
assert(IsSigned || IsUnsigned);
1889618894

18897-
if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
18898-
N0.getOperand(0).getValueType() == VT) {
18899-
if (DAG.getTarget().Options.NoSignedZerosFPMath)
18900-
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18895+
bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
18896+
// For signed conversions: The optimization changes signed zero behavior.
18897+
if (IsSigned && !IsSignedZeroSafe)
18898+
return SDValue();
18899+
// For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
18900+
// (unless NoSignedZerosFPMath is set).
18901+
if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
18902+
return SDValue();
1890118903

18902-
// Strict math: use FABS to handle negative inputs correctly.
18903-
if (TLI.isFAbsFree(VT)) {
18904-
SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
18905-
return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
18906-
}
18904+
// Collect potential clamp operations (outermost to innermost) while peeling
// them off the integer value.
18905+
struct ClampOp {
18906+
unsigned Opcode;
18907+
SDValue Constant;
18908+
};
18909+
SmallVector<ClampOp, 2> Clamps;
18910+
unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
18911+
unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
18912+
SDValue IntVal = N->getOperand(0);
18913+
constexpr unsigned MaxClampLevels = 2;
18914+
for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
18915+
if (!IntVal.hasOneUse() ||
18916+
(IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
18917+
break;
18918+
unsigned FPClampOp =
18919+
(IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
18920+
if (!TLI.isOperationLegal(FPClampOp, VT))
18921+
return SDValue();
18922+
auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
18923+
if (!IntConstNode)
18924+
return SDValue();
18925+
APFloat FPConst(VT.getFltSemantics());
18926+
APInt IntConst = IntConstNode->getAPIntValue();
18927+
FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
18928+
// Verify roundtrip exactness.
18929+
APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
18930+
bool IsExact;
18931+
if (FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero, &IsExact) !=
18932+
APFloat::opOK ||
18933+
!IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
18934+
return SDValue();
18935+
Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
18936+
IntVal = IntVal.getOperand(0);
1890718937
}
1890818938

18909-
return SDValue();
18939+
// Check that the sequence ends with an fpto[us]i of the right type.
18940+
unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
18941+
if (IntVal.getOpcode() != FPToIntOp ||
18942+
IntVal.getOperand(0).getValueType() != VT)
18943+
return SDValue();
18944+
18945+
SDValue Result = IntVal.getOperand(0);
18946+
if (IsUnsigned && !IsSignedZeroSafe && TLI.isFAbsFree(VT))
18947+
Result = DAG.getNode(ISD::FABS, DL, VT, Result);
18948+
Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
18949+
// Apply clamps, if any, in reverse order (innermost first).
18950+
for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
18951+
Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
18952+
return Result;
1891018953
}
1891118954

1891218955
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
4+
5+
; Test folding of float->int->float roundtrips into float-only operations.
6+
; The optimization converts patterns like:
7+
; sitofp(fptosi(x)) -> ftrunc(x)
8+
; sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
9+
; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
10+
11+
define float @test_signed_basic(float %x) {
12+
; CHECK-LABEL: test_signed_basic:
13+
; CHECK: // %bb.0: // %entry
14+
; CHECK-NEXT: fcvtzs s0, s0
15+
; CHECK-NEXT: scvtf s0, s0
16+
; CHECK-NEXT: ret
17+
;
18+
; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
19+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
20+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
21+
; NO-SIGNED-ZEROS-NEXT: ret
22+
entry:
23+
%i = fptosi float %x to i32
24+
%f = sitofp i32 %i to float
25+
ret float %f
26+
}
27+
28+
define float @test_unsigned_basic(float %x) {
29+
; CHECK-LABEL: test_unsigned_basic:
30+
; CHECK: // %bb.0: // %entry
31+
; CHECK-NEXT: fcvtzu s0, s0
32+
; CHECK-NEXT: ucvtf s0, s0
33+
; CHECK-NEXT: ret
34+
;
35+
; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
36+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
37+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
38+
; NO-SIGNED-ZEROS-NEXT: ret
39+
entry:
40+
%i = fptoui float %x to i32
41+
%f = uitofp i32 %i to float
42+
ret float %f
43+
}
44+
45+
define float @test_signed_min_max(float %x) {
46+
; CHECK-LABEL: test_signed_min_max:
47+
; CHECK: // %bb.0: // %entry
48+
; CHECK-NEXT: fcvtzs w9, s0
49+
; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
50+
; CHECK-NEXT: cmn w9, #512
51+
; CHECK-NEXT: csel w8, w9, w8, gt
52+
; CHECK-NEXT: mov w9, #1023 // =0x3ff
53+
; CHECK-NEXT: cmp w8, #1023
54+
; CHECK-NEXT: csel w8, w8, w9, lt
55+
; CHECK-NEXT: scvtf s0, w8
56+
; CHECK-NEXT: ret
57+
;
58+
; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
59+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
60+
; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #196, lsl #24
61+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
62+
; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
63+
; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
64+
; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
65+
; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
66+
; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
67+
; NO-SIGNED-ZEROS-NEXT: ret
68+
entry:
69+
%i = fptosi float %x to i32
70+
%lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
71+
%clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
72+
%f = sitofp i32 %clamped to float
73+
ret float %f
74+
}
75+
76+
define float @test_unsigned_min_max(float %x) {
77+
; CHECK-LABEL: test_unsigned_min_max:
78+
; CHECK: // %bb.0: // %entry
79+
; CHECK-NEXT: fcvtzu w9, s0
80+
; CHECK-NEXT: mov w8, #512 // =0x200
81+
; CHECK-NEXT: cmp w9, #512
82+
; CHECK-NEXT: csel w8, w9, w8, hi
83+
; CHECK-NEXT: mov w9, #1023 // =0x3ff
84+
; CHECK-NEXT: cmp w8, #1023
85+
; CHECK-NEXT: csel w8, w8, w9, lo
86+
; CHECK-NEXT: ucvtf s0, w8
87+
; CHECK-NEXT: ret
88+
;
89+
; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
90+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
91+
; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #68, lsl #24
92+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
93+
; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
94+
; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
95+
; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
96+
; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
97+
; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
98+
; NO-SIGNED-ZEROS-NEXT: ret
99+
entry:
100+
%i = fptoui float %x to i32
101+
%lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
102+
%clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
103+
%f = uitofp i32 %clamped to float
104+
ret float %f
105+
}
106+
107+
; 16777217 is NOT exactly representable in f32.
108+
define float @test_inexact_16777217(float %x) {
109+
; CHECK-LABEL: test_inexact_16777217:
110+
; CHECK: // %bb.0: // %entry
111+
; CHECK-NEXT: fcvtzs w8, s0
112+
; CHECK-NEXT: mov w9, #16777216 // =0x1000000
113+
; CHECK-NEXT: cmp w8, w9
114+
; CHECK-NEXT: mov w9, #1 // =0x1
115+
; CHECK-NEXT: movk w9, #256, lsl #16
116+
; CHECK-NEXT: csel w8, w8, w9, le
117+
; CHECK-NEXT: scvtf s0, w8
118+
; CHECK-NEXT: ret
119+
;
120+
; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
121+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
122+
; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
123+
; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
124+
; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
125+
; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
126+
; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
127+
; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
128+
; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
129+
; NO-SIGNED-ZEROS-NEXT: ret
130+
entry:
131+
%i = fptosi float %x to i32
132+
%clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
133+
%f = sitofp i32 %clamped to float
134+
ret float %f
135+
}
136+
137+
declare i32 @llvm.smin.i32(i32, i32)
138+
declare i32 @llvm.smax.i32(i32, i32)
139+
declare i32 @llvm.umin.i32(i32, i32)
140+
declare i32 @llvm.umax.i32(i32, i32)

0 commit comments

Comments
 (0)