@@ -162,42 +162,72 @@ define <2 x i32> @var_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind {
162162define <2 x i32 > @splatvar_funnnel_v2i32 (<2 x i32 > %x , <2 x i32 > %amt ) nounwind {
163163; SSE2-LABEL: splatvar_funnnel_v2i32:
164164; SSE2: # %bb.0:
165+ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
166+ ; SSE2-NEXT: pslld $23, %xmm1
165167; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
166- ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
167- ; SSE2-NEXT: psllq %xmm1, %xmm2
168- ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
169- ; SSE2-NEXT: psllq %xmm1, %xmm0
170- ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
168+ ; SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
169+ ; SSE2-NEXT: cvttps2dq %xmm1, %xmm1
170+ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
171+ ; SSE2-NEXT: pmuludq %xmm1, %xmm0
172+ ; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
173+ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
174+ ; SSE2-NEXT: pmuludq %xmm2, %xmm1
175+ ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
176+ ; SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
177+ ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
178+ ; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
179+ ; SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
180+ ; SSE2-NEXT: por %xmm3, %xmm0
171181; SSE2-NEXT: retq
172182;
173183; SSE41-LABEL: splatvar_funnnel_v2i32:
174184; SSE41: # %bb.0:
185+ ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
186+ ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
187+ ; SSE41-NEXT: pslld $23, %xmm1
175188; SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
176- ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
177- ; SSE41-NEXT: psllq %xmm1, %xmm2
178- ; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
179- ; SSE41-NEXT: psllq %xmm1, %xmm0
180- ; SSE41-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
189+ ; SSE41-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
190+ ; SSE41-NEXT: cvttps2dq %xmm1, %xmm1
191+ ; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
192+ ; SSE41-NEXT: pmuludq %xmm2, %xmm3
193+ ; SSE41-NEXT: pmuludq %xmm1, %xmm0
194+ ; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
195+ ; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],xmm3[2,3],xmm1[4,5],xmm3[6,7]
196+ ; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm3[0,0,2,2]
197+ ; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
198+ ; SSE41-NEXT: por %xmm1, %xmm0
181199; SSE41-NEXT: retq
182200;
183201; AVX1-LABEL: splatvar_funnnel_v2i32:
184202; AVX1: # %bb.0:
203+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
204+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
205+ ; AVX1-NEXT: vpslld $23, %xmm1, %xmm1
185206; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
186- ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
187- ; AVX1-NEXT: vpsllq %xmm1, %xmm2, %xmm2
188- ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
189- ; AVX1-NEXT: vpsllq %xmm1, %xmm0, %xmm0
190- ; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
207+ ; AVX1-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
208+ ; AVX1-NEXT: vcvttps2dq %xmm1, %xmm1
209+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm1[1,1,3,3]
210+ ; AVX1-NEXT: vpmuludq %xmm3, %xmm2, %xmm2
211+ ; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm0
212+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,1,3,3]
213+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm2[2,3],xmm1[4,5],xmm2[6,7]
214+ ; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,2,2]
215+ ; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3],xmm0[4,5],xmm2[6,7]
216+ ; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
191217; AVX1-NEXT: retq
192218;
193219; AVX2-LABEL: splatvar_funnnel_v2i32:
194220; AVX2: # %bb.0:
195- ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
196- ; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
197- ; AVX2-NEXT: vpsllq %xmm1, %xmm2, %xmm2
198- ; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
199- ; AVX2-NEXT: vpsllq %xmm1, %xmm0, %xmm0
200- ; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
221+ ; AVX2-NEXT: vpbroadcastd %xmm1, %xmm1
222+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm2 = [31,31,31,31]
223+ ; AVX2-NEXT: vpand %xmm2, %xmm1, %xmm1
224+ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero
225+ ; AVX2-NEXT: vpslld %xmm2, %xmm0, %xmm2
226+ ; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [32,32,32,32]
227+ ; AVX2-NEXT: vpsubd %xmm1, %xmm3, %xmm1
228+ ; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
229+ ; AVX2-NEXT: vpsrld %xmm1, %xmm0, %xmm0
230+ ; AVX2-NEXT: vpor %xmm0, %xmm2, %xmm0
201231; AVX2-NEXT: retq
202232;
203233; AVX512F-LABEL: splatvar_funnnel_v2i32:
@@ -259,12 +289,22 @@ define <2 x i32> @splatvar_funnnel_v2i32(<2 x i32> %x, <2 x i32> %amt) nounwind
259289;
260290; X86-SSE2-LABEL: splatvar_funnnel_v2i32:
261291; X86-SSE2: # %bb.0:
292+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,1,1]
293+ ; X86-SSE2-NEXT: pslld $23, %xmm1
262294; X86-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
263- ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,3,3]
264- ; X86-SSE2-NEXT: psllq %xmm1, %xmm2
265- ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,1,1]
266- ; X86-SSE2-NEXT: psllq %xmm1, %xmm0
267- ; X86-SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,3],xmm2[1,3]
295+ ; X86-SSE2-NEXT: paddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
296+ ; X86-SSE2-NEXT: cvttps2dq %xmm1, %xmm1
297+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[1,1,3,3]
298+ ; X86-SSE2-NEXT: pmuludq %xmm1, %xmm0
299+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[1,3,2,3]
300+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,3,3]
301+ ; X86-SSE2-NEXT: pmuludq %xmm2, %xmm1
302+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,3,2,3]
303+ ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
304+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
305+ ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,2,2,3]
306+ ; X86-SSE2-NEXT: punpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
307+ ; X86-SSE2-NEXT: por %xmm3, %xmm0
268308; X86-SSE2-NEXT: retl
269309 %splat = shufflevector <2 x i32 > %amt , <2 x i32 > undef , <2 x i32 > zeroinitializer
270310 %res = call <2 x i32 > @llvm.fshl.v2i32 (<2 x i32 > %x , <2 x i32 > %x , <2 x i32 > %splat )
0 commit comments