@@ -141,56 +141,61 @@ declare <8 x half> @llvm.ldexp.v8f16.v8i16(<8 x half>, <8 x i16>)
141141define <8 x half > @fmul_pow2_8xhalf (<8 x i16 > %i ) {
142142; CHECK-SSE-LABEL: fmul_pow2_8xhalf:
143143; CHECK-SSE: # %bb.0:
144- ; CHECK-SSE-NEXT: subq $88 , %rsp
145- ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 96
144+ ; CHECK-SSE-NEXT: subq $104 , %rsp
145+ ; CHECK-SSE-NEXT: .cfi_def_cfa_offset 112
146146; CHECK-SSE-NEXT: movdqa %xmm0, %xmm1
147147; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
148148; CHECK-SSE-NEXT: pslld $23, %xmm1
149149; CHECK-SSE-NEXT: movdqa {{.*#+}} xmm2 = [1065353216,1065353216,1065353216,1065353216]
150150; CHECK-SSE-NEXT: paddd %xmm2, %xmm1
151151; CHECK-SSE-NEXT: cvttps2dq %xmm1, %xmm1
152- ; CHECK-SSE-NEXT: movaps %xmm1, (%rsp) # 16-byte Spill
152+ ; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
153+ ; CHECK-SSE-NEXT: pslld $16, %xmm1
154+ ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp) # 16-byte Spill
153155; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3]
154156; CHECK-SSE-NEXT: pslld $23, %xmm0
155157; CHECK-SSE-NEXT: paddd %xmm2, %xmm0
156158; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
159+ ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
157160; CHECK-SSE-NEXT: pslld $16, %xmm0
158- ; CHECK-SSE-NEXT: psrld $16, %xmm0
159161; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
160- ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
162+ ; CHECK-SSE-NEXT: psrld $16, % xmm0
161163; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
162164; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
163165; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
164- ; CHECK-SSE-NEXT: cvtdq2ps {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
166+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
167+ ; CHECK-SSE-NEXT: psrlq $48, %xmm0
168+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
165169; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
166170; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
167- ; CHECK-SSE-NEXT: pshufd $238, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
168- ; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
171+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
172+ ; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
169173; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
170174; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
171175; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
172- ; CHECK-SSE-NEXT: pshufd $255, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
173- ; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3]
174- ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
176+ ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
177+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
178+ ; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm1 = xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
179+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
175180; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
176181; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
177182; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
178- ; CHECK-SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
179- ; CHECK-SSE-NEXT: movdqa %xmm0, (%rsp) # 16-byte Spill
180- ; CHECK-SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
183+ ; CHECK-SSE-NEXT: psrld $16, %xmm0
181184; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
182185; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
183186; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
184- ; CHECK-SSE-NEXT: cvtdq2ps (%rsp), %xmm0 # 16-byte Folded Reload
187+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
188+ ; CHECK-SSE-NEXT: psrlq $48, %xmm0
189+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
185190; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
186191; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
187- ; CHECK-SSE-NEXT: pshufd $238, (%rsp), %xmm0 # 16-byte Folded Reload
188- ; CHECK-SSE-NEXT: # xmm0 = mem[2,3,2,3]
192+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm0 # 16-byte Reload
193+ ; CHECK-SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero
189194; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
190195; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
191196; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
192- ; CHECK-SSE-NEXT: pshufd $255, (%rsp ), %xmm0 # 16-byte Folded Reload
193- ; CHECK-SSE-NEXT: # xmm0 = mem[3,3,3,3 ]
197+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p ), %xmm0 # 16-byte Reload
198+ ; CHECK-SSE-NEXT: punpckhwd {{.*#+}} xmm0 = xmm0[4], mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7 ]
194199; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
195200; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
196201; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
@@ -202,39 +207,39 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
202207; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
203208; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
204209; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
205- ; CHECK-SSE-NEXT: punpcklwd (%rsp), %xmm0 # 16-byte Folded Reload
206- ; CHECK-SSE-NEXT: # xmm0 = xmm0 [0],mem [0],xmm0 [1],mem [1],xmm0 [2],mem [2],xmm0 [3],mem [3]
207- ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p ) # 16-byte Spill
210+ ; CHECK-SSE-NEXT: movdqa (%rsp), %xmm1 # 16-byte Reload
211+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1 [0],xmm0 [0],xmm1 [1],xmm0 [1],xmm1 [2],xmm0 [2],xmm1 [3],xmm0 [3]
212+ ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp ) # 16-byte Spill
208213; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
209214; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
210215; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
211216; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
212217; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
213- ; CHECK-SSE-NEXT: movaps %xmm0, (%rsp ) # 16-byte Spill
218+ ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p ) # 16-byte Spill
214219; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
215220; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
216221; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
217222; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
218223; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
219- ; CHECK-SSE-NEXT: movdqa (%rsp ), %xmm1 # 16-byte Reload
220- ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1 [0],xmm0 [0],xmm1 [1],xmm0 [1],xmm1 [2],xmm0 [2],xmm1 [3],xmm0 [3]
221- ; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p ), %xmm1 # 16-byte Folded Reload
222- ; CHECK-SSE-NEXT: # xmm1 = xmm1 [0],mem[0],xmm1 [1],mem[1]
223- ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp ) # 16-byte Spill
224+ ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p ), %xmm0 # 16-byte Folded Reload
225+ ; CHECK-SSE-NEXT: # xmm0 = xmm0 [0],mem [0],xmm0 [1],mem [1],xmm0 [2],mem [2],xmm0 [3],mem [3]
226+ ; CHECK-SSE-NEXT: punpckldq (%rsp ), %xmm0 # 16-byte Folded Reload
227+ ; CHECK-SSE-NEXT: # xmm0 = xmm0 [0],mem[0],xmm0 [1],mem[1]
228+ ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p ) # 16-byte Spill
224229; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
225230; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
226231; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
227232; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
228233; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
229- ; CHECK-SSE-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p ) # 16-byte Spill
234+ ; CHECK-SSE-NEXT: movaps %xmm0, (%rsp ) # 16-byte Spill
230235; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
231236; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
232237; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
233238; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
234239; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
235- ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p ), %xmm0 # 16-byte Folded Reload
236- ; CHECK-SSE-NEXT: # xmm0 = xmm0 [0],mem [0],xmm0 [1],mem [1],xmm0 [2],mem [2],xmm0 [3],mem [3]
237- ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p ) # 16-byte Spill
240+ ; CHECK-SSE-NEXT: movdqa (%rsp ), %xmm1 # 16-byte Reload
241+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1 [0],xmm0 [0],xmm1 [1],xmm0 [1],xmm1 [2],xmm0 [2],xmm1 [3],xmm0 [3]
242+ ; CHECK-SSE-NEXT: movdqa %xmm1, (%rsp ) # 16-byte Spill
238243; CHECK-SSE-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload
239244; CHECK-SSE-NEXT: # xmm0 = mem[0],zero,zero,zero
240245; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
@@ -246,14 +251,13 @@ define <8 x half> @fmul_pow2_8xhalf(<8 x i16> %i) {
246251; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
247252; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
248253; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
249- ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
250- ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
251- ; CHECK-SSE-NEXT: punpckldq {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
252- ; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
253- ; CHECK-SSE-NEXT: punpcklqdq (%rsp), %xmm1 # 16-byte Folded Reload
254- ; CHECK-SSE-NEXT: # xmm1 = xmm1[0],mem[0]
255- ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
256- ; CHECK-SSE-NEXT: addq $88, %rsp
254+ ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
255+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
256+ ; CHECK-SSE-NEXT: punpckldq (%rsp), %xmm0 # 16-byte Folded Reload
257+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
258+ ; CHECK-SSE-NEXT: punpcklqdq {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
259+ ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0]
260+ ; CHECK-SSE-NEXT: addq $104, %rsp
257261; CHECK-SSE-NEXT: .cfi_def_cfa_offset 8
258262; CHECK-SSE-NEXT: retq
259263;
@@ -1028,17 +1032,17 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
10281032; CHECK-SSE-NEXT: pslld $23, %xmm0
10291033; CHECK-SSE-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
10301034; CHECK-SSE-NEXT: cvttps2dq %xmm0, %xmm0
1031- ; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,2,2,3,4,5,6,7]
1032- ; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 # [2,2,u,u,u,u,u,u]
1033- ; CHECK-SSE-NEXT: pxor %xmm0, %xmm0
1034- ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1035- ; CHECK-SSE-NEXT: movdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1036- ; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
1035+ ; CHECK-SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
1036+ ; CHECK-SSE-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 # [2,2,u,u,u,u,u,u]
1037+ ; CHECK-SSE-NEXT: movdqa %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
1038+ ; CHECK-SSE-NEXT: psrld $16, %xmm0
1039+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
10371040; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
10381041; CHECK-SSE-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill
1039- ; CHECK-SSE-NEXT: pshufd $85, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1040- ; CHECK-SSE-NEXT: # xmm0 = mem[1,1,1,1]
1041- ; CHECK-SSE-NEXT: cvtdq2ps %xmm0, %xmm0
1042+ ; CHECK-SSE-NEXT: xorps %xmm0, %xmm0
1043+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1044+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1045+ ; CHECK-SSE-NEXT: cvtdq2ps %xmm1, %xmm0
10421046; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
10431047; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
10441048; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
@@ -1049,8 +1053,9 @@ define <2 x half> @fmul_pow_shl_cnt_vec_fail_to_large(<2 x i16> %cnt) nounwind {
10491053; CHECK-SSE-NEXT: callq __extendhfsf2@PLT
10501054; CHECK-SSE-NEXT: mulss {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
10511055; CHECK-SSE-NEXT: callq __truncsfhf2@PLT
1052- ; CHECK-SSE-NEXT: punpcklwd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload
1053- ; CHECK-SSE-NEXT: # xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
1056+ ; CHECK-SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
1057+ ; CHECK-SSE-NEXT: punpcklwd {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3]
1058+ ; CHECK-SSE-NEXT: movdqa %xmm1, %xmm0
10541059; CHECK-SSE-NEXT: addq $40, %rsp
10551060; CHECK-SSE-NEXT: retq
10561061;
0 commit comments