@@ -179,8 +179,8 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
 ; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
 ; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
-; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
+; SSSE3-SLOW-NEXT: movhlps {{.*#+}} xmm6 = xmm2[1],xmm6[1]
+; SSSE3-SLOW-NEXT: movaps %xmm6, %xmm1
 ; SSSE3-SLOW-NEXT: retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8f32_v4f32:
@@ -345,8 +345,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
 ; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
-; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
-; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
+; SSSE3-SLOW-NEXT: punpckhqdq {{.*#+}} xmm1 = xmm1[1],xmm6[1]
 ; SSSE3-SLOW-NEXT: retq
 ;
 ; SSSE3-FAST-LABEL: pair_sum_v8i32_v4i32:
@@ -374,7 +373,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -397,7 +396,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX1-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3],xmm1[4,5,6,7]
 ; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
 ; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
 ; AVX1-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
@@ -422,7 +421,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
 ; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
@@ -445,7 +444,7 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
 ; AVX2-FAST-NEXT: vphaddd %xmm5, %xmm5, %xmm4
 ; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
-; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
+; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1],xmm1[2,3]
 ; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm5
 ; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
 ; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]