Skip to content

Commit da43b7e

Browse files
committed
[DAGCombiner] Extend fp->int->fp optimizations to include clamping
1 parent 89b18f0 commit da43b7e

File tree

2 files changed

+201
-18
lines changed

2 files changed

+201
-18
lines changed

llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp

Lines changed: 61 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717

1818
#include "llvm/ADT/APFloat.h"
1919
#include "llvm/ADT/APInt.h"
20+
#include "llvm/ADT/APSInt.h"
2021
#include "llvm/ADT/ArrayRef.h"
2122
#include "llvm/ADT/DenseMap.h"
2223
#include "llvm/ADT/IntervalMap.h"
@@ -18873,6 +18874,8 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) {
1887318874
static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1887418875
const TargetLowering &TLI) {
1887518876
// We can fold the fpto[us]i -> [us]itofp pattern into a single ftrunc.
18877+
// Additionally, if there are clamps ([us]min or [us]max) around
18878+
// the fpto[us]i, we can fold those into fminnum/fmaxnum around the ftrunc.
1887618879
// If NoSignedZerosFPMath is enabled, this is a direct replacement.
1887718880
// Otherwise, for strict math, we must handle edge cases:
1887818881
// 1. For unsigned conversions, use FABS to handle negative cases. Take -0.0
@@ -18884,28 +18887,68 @@ static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG,
1888418887
if (!TLI.isOperationLegal(ISD::FTRUNC, VT))
1888518888
return SDValue();
1888618889

18887-
// fptosi/fptoui round towards zero, so converting from FP to integer and
18888-
// back is the same as an 'ftrunc': [us]itofp (fpto[us]i X) --> ftrunc X
18889-
SDValue N0 = N->getOperand(0);
18890-
if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT &&
18891-
N0.getOperand(0).getValueType() == VT) {
18892-
if (DAG.getTarget().Options.NoSignedZerosFPMath)
18893-
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18894-
}
18890+
bool IsUnsigned = N->getOpcode() == ISD::UINT_TO_FP;
18891+
bool IsSigned = N->getOpcode() == ISD::SINT_TO_FP;
18892+
assert(IsSigned || IsUnsigned);
1889518893

18896-
if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT &&
18897-
N0.getOperand(0).getValueType() == VT) {
18898-
if (DAG.getTarget().Options.NoSignedZerosFPMath)
18899-
return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0));
18894+
bool IsSignedZeroSafe = DAG.getTarget().Options.NoSignedZerosFPMath;
18895+
// For signed conversions: The optimization changes signed zero behavior.
18896+
if (IsSigned && !IsSignedZeroSafe)
18897+
return SDValue();
18898+
// For unsigned conversions, we need FABS to canonicalize -0.0 to +0.0
18899+
// (unless NoSignedZerosFPMath is set).
18900+
if (IsUnsigned && !IsSignedZeroSafe && !TLI.isFAbsFree(VT))
18901+
return SDValue();
1890018902

18901-
// Strict math: use FABS to handle negative inputs correctly.
18902-
if (TLI.isFAbsFree(VT)) {
18903-
SDValue Abs = DAG.getNode(ISD::FABS, DL, VT, N0.getOperand(0));
18904-
return DAG.getNode(ISD::FTRUNC, DL, VT, Abs);
18905-
}
18903+
// Peel potential clamp operations, collecting them outermost to innermost.
18904+
struct ClampOp {
18905+
unsigned Opcode;
18906+
SDValue Constant;
18907+
};
18908+
SmallVector<ClampOp, 2> Clamps;
18909+
unsigned MinOp = IsUnsigned ? ISD::UMIN : ISD::SMIN;
18910+
unsigned MaxOp = IsUnsigned ? ISD::UMAX : ISD::SMAX;
18911+
SDValue IntVal = N->getOperand(0);
18912+
constexpr unsigned MaxClampLevels = 2;
18913+
for (unsigned Level = 0; Level < MaxClampLevels; ++Level) {
18914+
if (!IntVal.hasOneUse() ||
18915+
(IntVal.getOpcode() != MinOp && IntVal.getOpcode() != MaxOp))
18916+
break;
18917+
unsigned FPClampOp =
18918+
(IntVal.getOpcode() == MinOp) ? ISD::FMINNUM : ISD::FMAXNUM;
18919+
if (!TLI.isOperationLegal(FPClampOp, VT))
18920+
return SDValue();
18921+
auto *IntConstNode = dyn_cast<ConstantSDNode>(IntVal.getOperand(1));
18922+
if (!IntConstNode)
18923+
return SDValue();
18924+
APFloat FPConst(VT.getFltSemantics());
18925+
APInt IntConst = IntConstNode->getAPIntValue();
18926+
FPConst.convertFromAPInt(IntConst, IsSigned, APFloat::rmNearestTiesToEven);
18927+
// Verify roundtrip exactness.
18928+
APSInt RoundTrip(IntConst.getBitWidth(), IsUnsigned);
18929+
bool IsExact;
18930+
if (FPConst.convertToInteger(RoundTrip, APFloat::rmTowardZero, &IsExact) !=
18931+
APFloat::opOK ||
18932+
!IsExact || static_cast<const APInt &>(RoundTrip) != IntConst)
18933+
return SDValue();
18934+
Clamps.push_back({FPClampOp, DAG.getConstantFP(FPConst, DL, VT)});
18935+
IntVal = IntVal.getOperand(0);
1890618936
}
1890718937

18908-
return SDValue();
18938+
// Check that the sequence ends with a FPTo[us]i of the right type.
18939+
unsigned FPToIntOp = IsUnsigned ? ISD::FP_TO_UINT : ISD::FP_TO_SINT;
18940+
if (IntVal.getOpcode() != FPToIntOp ||
18941+
IntVal.getOperand(0).getValueType() != VT)
18942+
return SDValue();
18943+
18944+
SDValue Result = IntVal.getOperand(0);
18945+
if (IsUnsigned && !IsSignedZeroSafe && TLI.isFAbsFree(VT))
18946+
Result = DAG.getNode(ISD::FABS, DL, VT, Result);
18947+
Result = DAG.getNode(ISD::FTRUNC, DL, VT, Result);
18948+
// Apply clamps, if any, in reverse order (innermost first).
18949+
for (auto I = Clamps.rbegin(), E = Clamps.rend(); I != E; ++I)
18950+
Result = DAG.getNode(I->Opcode, DL, VT, Result, I->Constant);
18951+
return Result;
1890918952
}
1891018953

1891118954
SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc -mtriple=aarch64 < %s | FileCheck %s
3+
; RUN: llc -mtriple=aarch64 --enable-no-signed-zeros-fp-math < %s | FileCheck %s --check-prefix=NO-SIGNED-ZEROS
4+
5+
; Test folding of float->int->float roundtrips into float-only operations.
6+
; The optimization converts patterns like:
7+
; sitofp(fptosi(x)) -> ftrunc(x)
8+
; sitofp(smin(fptosi(x), C)) -> fminnum(ftrunc(x), (float)C)
9+
; This is relevant for AArch64 as it avoids GPR bouncing and keeps computation in SIMD/FP registers.
10+
11+
define float @test_signed_basic(float %x) {
12+
; CHECK-LABEL: test_signed_basic:
13+
; CHECK: // %bb.0: // %entry
14+
; CHECK-NEXT: fcvtzs s0, s0
15+
; CHECK-NEXT: scvtf s0, s0
16+
; CHECK-NEXT: ret
17+
;
18+
; NO-SIGNED-ZEROS-LABEL: test_signed_basic:
19+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
20+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
21+
; NO-SIGNED-ZEROS-NEXT: ret
22+
entry:
23+
%i = fptosi float %x to i32
24+
%f = sitofp i32 %i to float
25+
ret float %f
26+
}
27+
28+
define float @test_unsigned_basic(float %x) {
29+
; CHECK-LABEL: test_unsigned_basic:
30+
; CHECK: // %bb.0: // %entry
31+
; CHECK-NEXT: fcvtzu s0, s0
32+
; CHECK-NEXT: ucvtf s0, s0
33+
; CHECK-NEXT: ret
34+
;
35+
; NO-SIGNED-ZEROS-LABEL: test_unsigned_basic:
36+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
37+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
38+
; NO-SIGNED-ZEROS-NEXT: ret
39+
entry:
40+
%i = fptoui float %x to i32
41+
%f = uitofp i32 %i to float
42+
ret float %f
43+
}
44+
45+
define float @test_signed_min_max(float %x) {
46+
; CHECK-LABEL: test_signed_min_max:
47+
; CHECK: // %bb.0: // %entry
48+
; CHECK-NEXT: fcvtzs w9, s0
49+
; CHECK-NEXT: mov w8, #-512 // =0xfffffe00
50+
; CHECK-NEXT: cmn w9, #512
51+
; CHECK-NEXT: csel w8, w9, w8, gt
52+
; CHECK-NEXT: mov w9, #1023 // =0x3ff
53+
; CHECK-NEXT: cmp w8, #1023
54+
; CHECK-NEXT: csel w8, w8, w9, lt
55+
; CHECK-NEXT: scvtf s0, w8
56+
; CHECK-NEXT: ret
57+
;
58+
; NO-SIGNED-ZEROS-LABEL: test_signed_min_max:
59+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
60+
; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #196, lsl #24
61+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
62+
; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
63+
; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
64+
; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
65+
; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
66+
; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
67+
; NO-SIGNED-ZEROS-NEXT: ret
68+
entry:
69+
%i = fptosi float %x to i32
70+
%lower = call i32 @llvm.smax.i32(i32 %i, i32 -512)
71+
%clamped = call i32 @llvm.smin.i32(i32 %lower, i32 1023)
72+
%f = sitofp i32 %clamped to float
73+
ret float %f
74+
}
75+
76+
define float @test_unsigned_min_max(float %x) {
77+
; CHECK-LABEL: test_unsigned_min_max:
78+
; CHECK: // %bb.0: // %entry
79+
; CHECK-NEXT: fcvtzu w9, s0
80+
; CHECK-NEXT: mov w8, #512 // =0x200
81+
; CHECK-NEXT: cmp w9, #512
82+
; CHECK-NEXT: csel w8, w9, w8, hi
83+
; CHECK-NEXT: mov w9, #1023 // =0x3ff
84+
; CHECK-NEXT: cmp w8, #1023
85+
; CHECK-NEXT: csel w8, w8, w9, lo
86+
; CHECK-NEXT: ucvtf s0, w8
87+
; CHECK-NEXT: ret
88+
;
89+
; NO-SIGNED-ZEROS-LABEL: test_unsigned_min_max:
90+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
91+
; NO-SIGNED-ZEROS-NEXT: movi v1.2s, #68, lsl #24
92+
; NO-SIGNED-ZEROS-NEXT: frintz s0, s0
93+
; NO-SIGNED-ZEROS-NEXT: mov w8, #49152 // =0xc000
94+
; NO-SIGNED-ZEROS-NEXT: movk w8, #17535, lsl #16
95+
; NO-SIGNED-ZEROS-NEXT: fmaxnm s0, s0, s1
96+
; NO-SIGNED-ZEROS-NEXT: fmov s1, w8
97+
; NO-SIGNED-ZEROS-NEXT: fminnm s0, s0, s1
98+
; NO-SIGNED-ZEROS-NEXT: ret
99+
entry:
100+
%i = fptoui float %x to i32
101+
%lower = call i32 @llvm.umax.i32(i32 %i, i32 512)
102+
%clamped = call i32 @llvm.umin.i32(i32 %lower, i32 1023)
103+
%f = uitofp i32 %clamped to float
104+
ret float %f
105+
}
106+
107+
; 16777217 is NOT exactly representable in f32.
108+
define float @test_inexact_16777217(float %x) {
109+
; CHECK-LABEL: test_inexact_16777217:
110+
; CHECK: // %bb.0: // %entry
111+
; CHECK-NEXT: fcvtzs w8, s0
112+
; CHECK-NEXT: mov w9, #16777216 // =0x1000000
113+
; CHECK-NEXT: cmp w8, w9
114+
; CHECK-NEXT: mov w9, #1 // =0x1
115+
; CHECK-NEXT: movk w9, #256, lsl #16
116+
; CHECK-NEXT: csel w8, w8, w9, le
117+
; CHECK-NEXT: scvtf s0, w8
118+
; CHECK-NEXT: ret
119+
;
120+
; NO-SIGNED-ZEROS-LABEL: test_inexact_16777217:
121+
; NO-SIGNED-ZEROS: // %bb.0: // %entry
122+
; NO-SIGNED-ZEROS-NEXT: fcvtzs w8, s0
123+
; NO-SIGNED-ZEROS-NEXT: mov w9, #16777216 // =0x1000000
124+
; NO-SIGNED-ZEROS-NEXT: cmp w8, w9
125+
; NO-SIGNED-ZEROS-NEXT: mov w9, #1 // =0x1
126+
; NO-SIGNED-ZEROS-NEXT: movk w9, #256, lsl #16
127+
; NO-SIGNED-ZEROS-NEXT: csel w8, w8, w9, le
128+
; NO-SIGNED-ZEROS-NEXT: scvtf s0, w8
129+
; NO-SIGNED-ZEROS-NEXT: ret
130+
entry:
131+
%i = fptosi float %x to i32
132+
%clamped = call i32 @llvm.smin.i32(i32 %i, i32 16777217)
133+
%f = sitofp i32 %clamped to float
134+
ret float %f
135+
}
136+
137+
declare i32 @llvm.smin.i32(i32, i32)
138+
declare i32 @llvm.smax.i32(i32, i32)
139+
declare i32 @llvm.umin.i32(i32, i32)
140+
declare i32 @llvm.umax.i32(i32, i32)

0 commit comments

Comments
 (0)