@@ -833,34 +833,34 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm5
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm1
-; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm2[1]
-; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm4, %xmm2
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
-; SSSE3-FAST-NEXT: haddps %xmm3, %xmm1
 ; SSSE3-FAST-NEXT: movaps %xmm3, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm3[1]
-; SSSE3-FAST-NEXT: addss %xmm1, %xmm4
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm3 = xmm3[3,3,3,3]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm4 = xmm4[3,3],xmm3[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm3, %xmm5
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm3[1]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm6
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm6 = xmm6[3,3],xmm2[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm7
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm7 = xmm7[1],xmm2[1]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm8
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm8 = xmm8[3,3],xmm1[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm1, %xmm9
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm9 = xmm9[1],xmm1[1]
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm10
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm10 = xmm10[3,3],xmm0[3,3]
+; SSSE3-FAST-NEXT: movaps %xmm0, %xmm11
+; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm11 = xmm11[1],xmm0[1]
+; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
+; SSSE3-FAST-NEXT: addss %xmm11, %xmm0
+; SSSE3-FAST-NEXT: addss %xmm10, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm1
+; SSSE3-FAST-NEXT: addss %xmm9, %xmm1
+; SSSE3-FAST-NEXT: addss %xmm8, %xmm1
+; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
+; SSSE3-FAST-NEXT: addss %xmm7, %xmm2
+; SSSE3-FAST-NEXT: addss %xmm6, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm3
+; SSSE3-FAST-NEXT: addss %xmm5, %xmm3
 ; SSSE3-FAST-NEXT: addss %xmm4, %xmm3
 ; SSSE3-FAST-NEXT: unpcklps {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
 ; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
@@ -899,28 +899,28 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm4 = xmm3[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm3[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm6 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm7 = xmm2[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm8 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm9 = xmm1[1,0]
+; AVX-FAST-NEXT: vshufps {{.*#+}} xmm10 = xmm0[3,3,3,3]
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm11 = xmm0[1,0]
+; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm0
+; AVX-FAST-NEXT: vaddss %xmm0, %xmm11, %xmm0
+; AVX-FAST-NEXT: vaddss %xmm0, %xmm10, %xmm0
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm9, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm8, %xmm1
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
 ; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm7, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm6, %xmm1, %xmm1
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
 ; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm5, %xmm1, %xmm1
+; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
 ; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
 ; AVX-FAST-NEXT: retq
   %5 = call float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
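
Both FAST hunks above only reschedule the shuffles ahead of the scalar adds: without the `reassoc` flag, `llvm.vector.reduce.fadd` is a strictly ordered reduction, so the backend may use a single `haddps`/`vhaddps` for the first pair but must accumulate the remaining lanes with in-order `addss`/`vaddss` chains. A minimal C sketch of the required semantics (illustrative, not taken from the commit):

/* Ordered semantics of llvm.vector.reduce.fadd(-0.0, v) without
 * 'reassoc': the accumulator starts at -0.0 (the identity for IEEE
 * fadd) and the lanes are added strictly left to right, giving
 * ((v[0] + v[1]) + v[2]) + v[3]. */
float reduce_fadd_ordered(const float v[4]) {
    float acc = -0.0f;
    for (int i = 0; i < 4; ++i)
        acc += v[i];
    return acc;
}
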
@@ -964,21 +964,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: movaps %xmm0, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm0[1]
-; SSSE3-FAST-NEXT: addps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm4 = xmm4[1],xmm1[1]
-; SSSE3-FAST-NEXT: addps %xmm1, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm4, %xmm0
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm1 = xmm1[1],xmm2[1]
-; SSSE3-FAST-NEXT: addps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: movaps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm3[1]
-; SSSE3-FAST-NEXT: addps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm1
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX-SLOW-LABEL: reduction_sum_v4f32_v4f32_reassoc:
@@ -1002,17 +990,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32_reassoc(<4 x float> %0, <4 x float
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4f32_v4f32_reassoc:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm0[1,0]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddps %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
 ; AVX-FAST-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddps %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT: vhaddps %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT: vhaddps %xmm2, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
   %5 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %0)
   %6 = call reassoc float @llvm.vector.reduce.fadd.f32.v4f32(float -0.0, <4 x float> %1)
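
With `reassoc` the adds may be reassociated freely, so each pair of inputs collapses into one horizontal add and the whole 4x v4f32 reduction becomes three `haddps`/`vhaddps` instructions. A sketch of the equivalent SSE3 intrinsics, assuming `_mm_hadd_ps(a, b)` returns `{a0+a1, a2+a3, b0+b1, b2+b3}`; the function name is illustrative:

#include <pmmintrin.h> /* SSE3: _mm_hadd_ps */

/* Four independent v4f32 sums in three horizontal adds, matching
 * the SSSE3-FAST/AVX-FAST output above. */
static __m128 reduction_sum_4x4(__m128 v0, __m128 v1, __m128 v2, __m128 v3) {
    __m128 lo = _mm_hadd_ps(v0, v1); /* {v0[0]+v0[1], v0[2]+v0[3], v1[0]+v1[1], v1[2]+v1[3]} */
    __m128 hi = _mm_hadd_ps(v2, v3); /* likewise for v2 and v3 */
    return _mm_hadd_ps(lo, hi);      /* {sum(v0), sum(v1), sum(v2), sum(v3)} */
}
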
@@ -1051,17 +1031,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ;
 ; SSSE3-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; SSSE3-FAST: # %bb.0:
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm1, %xmm4
-; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm0
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: pshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; SSSE3-FAST-NEXT: paddd %xmm3, %xmm2
-; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm1
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
+; SSSE3-FAST-NEXT: phaddd %xmm2, %xmm0
 ; SSSE3-FAST-NEXT: retq
 ;
 ; AVX1-SLOW-LABEL: reduction_sum_v4i32_v4i32:
@@ -1089,17 +1061,9 @@ define <4 x i32> @reduction_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32
 ;
 ; AVX-FAST-LABEL: reduction_sum_v4i32_v4i32:
 ; AVX-FAST: # %bb.0:
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm4, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm1[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vphaddd %xmm3, %xmm2, %xmm2
 ; AVX-FAST-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
-; AVX-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,3,2,3]
-; AVX-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX-FAST-NEXT: vphaddd %xmm2, %xmm1, %xmm1
-; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
+; AVX-FAST-NEXT: vphaddd %xmm2, %xmm0, %xmm0
 ; AVX-FAST-NEXT: retq
 ;
 ; AVX2-SLOW-LABEL: reduction_sum_v4i32_v4i32:
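
Integer addition is associative regardless of fast-math flags, so the v4i32 variant gets the same three-instruction shape via `phaddd`/`vphaddd`. The SSSE3 intrinsic analogue, again as an illustrative sketch with a made-up function name:

#include <tmmintrin.h> /* SSSE3: _mm_hadd_epi32 */

static __m128i reduction_sum_4x4_i32(__m128i v0, __m128i v1, __m128i v2, __m128i v3) {
    __m128i lo = _mm_hadd_epi32(v0, v1); /* pairwise sums of v0, then v1 */
    __m128i hi = _mm_hadd_epi32(v2, v3); /* pairwise sums of v2, then v3 */
    return _mm_hadd_epi32(lo, hi);       /* {sum(v0), sum(v1), sum(v2), sum(v3)} */
}
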