Skip to content

Commit e010657

Browse files
phoebewang authored and maleadt committed
[X86][FP16] Do not combine fminnum/fmaxnum for FP16 emulation
When FP16 is emulated, we lack native fmin/fmax instruction support, so the fminnum/fmaxnum combine is skipped in that case.

Fixes llvm#59258

Reviewed By: skan, spatel

Differential Revision: https://reviews.llvm.org/D139078
1 parent e42aaf1 commit e010657

File tree

2 files changed

+171
-2
lines changed

2 files changed

+171
-2
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -51111,12 +51111,12 @@ static SDValue combineFMinFMax(SDNode *N, SelectionDAG &DAG) {
5111151111

5111251112
static SDValue combineFMinNumFMaxNum(SDNode *N, SelectionDAG &DAG,
5111351113
const X86Subtarget &Subtarget) {
51114-
if (Subtarget.useSoftFloat())
51114+
EVT VT = N->getValueType(0);
51115+
if (Subtarget.useSoftFloat() || isSoftFP16(VT, Subtarget))
5111551116
return SDValue();
5111651117

5111751118
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
5111851119

51119-
EVT VT = N->getValueType(0);
5112051120
if (!((Subtarget.hasSSE1() && VT == MVT::f32) ||
5112151121
(Subtarget.hasSSE2() && VT == MVT::f64) ||
5112251122
(Subtarget.hasFP16() && VT == MVT::f16) ||

llvm/test/CodeGen/X86/pr59258.ll

Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
2+
; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s
3+
4+
define <8 x half> @cvt_and_clamp2(<8 x float>) nounwind {
5+
; CHECK-LABEL: cvt_and_clamp2:
6+
; CHECK: # %bb.0:
7+
; CHECK-NEXT: subq $120, %rsp
8+
; CHECK-NEXT: movaps %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
9+
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
10+
; CHECK-NEXT: movaps %xmm1, %xmm0
11+
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3],xmm1[3,3]
12+
; CHECK-NEXT: callq __truncsfhf2@PLT
13+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
14+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
15+
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
16+
; CHECK-NEXT: callq __truncsfhf2@PLT
17+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
18+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
19+
; CHECK-NEXT: callq __truncsfhf2@PLT
20+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
21+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
22+
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
23+
; CHECK-NEXT: callq __truncsfhf2@PLT
24+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
25+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
26+
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
27+
; CHECK-NEXT: callq __truncsfhf2@PLT
28+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
29+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
30+
; CHECK-NEXT: movhlps {{.*#+}} xmm0 = xmm0[1,1]
31+
; CHECK-NEXT: callq __truncsfhf2@PLT
32+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
33+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
34+
; CHECK-NEXT: callq __truncsfhf2@PLT
35+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
36+
; CHECK-NEXT: movaps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
37+
; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1,1,1]
38+
; CHECK-NEXT: callq __truncsfhf2@PLT
39+
; CHECK-NEXT: callq __extendhfsf2@PLT
40+
; CHECK-NEXT: xorps %xmm1, %xmm1
41+
; CHECK-NEXT: callq fmaxf@PLT
42+
; CHECK-NEXT: callq __truncsfhf2@PLT
43+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
44+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
45+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
46+
; CHECK-NEXT: callq __extendhfsf2@PLT
47+
; CHECK-NEXT: xorps %xmm1, %xmm1
48+
; CHECK-NEXT: callq fmaxf@PLT
49+
; CHECK-NEXT: callq __truncsfhf2@PLT
50+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
51+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
52+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
53+
; CHECK-NEXT: callq __extendhfsf2@PLT
54+
; CHECK-NEXT: xorps %xmm1, %xmm1
55+
; CHECK-NEXT: callq fmaxf@PLT
56+
; CHECK-NEXT: callq __truncsfhf2@PLT
57+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
58+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
59+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
60+
; CHECK-NEXT: callq __extendhfsf2@PLT
61+
; CHECK-NEXT: xorps %xmm1, %xmm1
62+
; CHECK-NEXT: callq fmaxf@PLT
63+
; CHECK-NEXT: callq __truncsfhf2@PLT
64+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
65+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
66+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
67+
; CHECK-NEXT: callq __extendhfsf2@PLT
68+
; CHECK-NEXT: xorps %xmm1, %xmm1
69+
; CHECK-NEXT: callq fmaxf@PLT
70+
; CHECK-NEXT: callq __truncsfhf2@PLT
71+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
72+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
73+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
74+
; CHECK-NEXT: callq __extendhfsf2@PLT
75+
; CHECK-NEXT: xorps %xmm1, %xmm1
76+
; CHECK-NEXT: callq fmaxf@PLT
77+
; CHECK-NEXT: callq __truncsfhf2@PLT
78+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
79+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
80+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
81+
; CHECK-NEXT: callq __extendhfsf2@PLT
82+
; CHECK-NEXT: xorps %xmm1, %xmm1
83+
; CHECK-NEXT: callq fmaxf@PLT
84+
; CHECK-NEXT: callq __truncsfhf2@PLT
85+
; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
86+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
87+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
88+
; CHECK-NEXT: callq __extendhfsf2@PLT
89+
; CHECK-NEXT: xorps %xmm1, %xmm1
90+
; CHECK-NEXT: callq fmaxf@PLT
91+
; CHECK-NEXT: callq __truncsfhf2@PLT
92+
; CHECK-NEXT: callq __extendhfsf2@PLT
93+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
94+
; CHECK-NEXT: callq fminf@PLT
95+
; CHECK-NEXT: callq __truncsfhf2@PLT
96+
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
97+
; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
98+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
99+
; CHECK-NEXT: callq __extendhfsf2@PLT
100+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
101+
; CHECK-NEXT: callq fminf@PLT
102+
; CHECK-NEXT: callq __truncsfhf2@PLT
103+
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
104+
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
105+
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
106+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
107+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
108+
; CHECK-NEXT: callq __extendhfsf2@PLT
109+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
110+
; CHECK-NEXT: callq fminf@PLT
111+
; CHECK-NEXT: callq __truncsfhf2@PLT
112+
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
113+
; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
114+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
115+
; CHECK-NEXT: callq __extendhfsf2@PLT
116+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
117+
; CHECK-NEXT: callq fminf@PLT
118+
; CHECK-NEXT: callq __truncsfhf2@PLT
119+
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
120+
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
121+
; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
122+
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
123+
; CHECK-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
124+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
125+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
126+
; CHECK-NEXT: callq __extendhfsf2@PLT
127+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
128+
; CHECK-NEXT: callq fminf@PLT
129+
; CHECK-NEXT: callq __truncsfhf2@PLT
130+
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
131+
; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
132+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
133+
; CHECK-NEXT: callq __extendhfsf2@PLT
134+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
135+
; CHECK-NEXT: callq fminf@PLT
136+
; CHECK-NEXT: callq __truncsfhf2@PLT
137+
; CHECK-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
138+
; CHECK-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
139+
; CHECK-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
140+
; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
141+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
142+
; CHECK-NEXT: callq __extendhfsf2@PLT
143+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
144+
; CHECK-NEXT: callq fminf@PLT
145+
; CHECK-NEXT: callq __truncsfhf2@PLT
146+
; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
147+
; CHECK-NEXT: movd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Folded Reload
148+
; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero
149+
; CHECK-NEXT: callq __extendhfsf2@PLT
150+
; CHECK-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
151+
; CHECK-NEXT: callq fminf@PLT
152+
; CHECK-NEXT: callq __truncsfhf2@PLT
153+
; CHECK-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
154+
; CHECK-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
155+
; CHECK-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
156+
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
157+
; CHECK-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
158+
; CHECK-NEXT: # xmm1 = xmm1[0],mem[0]
159+
; CHECK-NEXT: movdqa %xmm1, %xmm0
160+
; CHECK-NEXT: addq $120, %rsp
161+
; CHECK-NEXT: retq
162+
%2 = fptrunc <8 x float> %0 to <8 x half>
163+
%3 = call <8 x half> @llvm.maxnum.v8f16(<8 x half> zeroinitializer, <8 x half> %2)
164+
%4 = call <8 x half> @llvm.minnum.v8f16(<8 x half> %3, <8 x half> <half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0, half 1.0>)
165+
ret <8 x half> %4
166+
}
167+
168+
declare <8 x half> @llvm.maxnum.v8f16(<8 x half>, <8 x half>)
169+
declare <8 x half> @llvm.minnum.v8f16(<8 x half>, <8 x half>)

0 commit comments

Comments
 (0)