@@ -179,22 +179,53 @@ define <8 x i16> @combine_pmaddubsw_zero_commute(<16 x i8> %a0, <16 x i8> %a1) {
179179 ret <8 x i16 > %1
180180}
181181
182- define <16 x i16 > @combine_pmaddubsw_concat (<16 x i8 > %a0 , <16 x i8 > %a1 , < 16 x i8 > %a2 , < 16 x i8 > %a3 ) {
+ ; Splits two 256-bit <32 x i8> inputs into 128-bit halves, runs the 128-bit
+ ; pmaddubsw intrinsic on the low and high halves separately, and concatenates
+ ; the two <8 x i16> results into one <16 x i16> value.
+ ; Expected lowering per the checks below: SSE keeps two 128-bit pmaddubsw ops,
+ ; AVX1 extracts the upper halves, multiplies per half and re-inserts, and AVX2
+ ; folds the whole pattern into a single 256-bit vpmaddubsw.
182+ define <16 x i16 > @combine_pmaddubsw_concat (<32 x i8 > %a0 , <32 x i8 > %a1 ) {
183183; SSE-LABEL: combine_pmaddubsw_concat:
184184; SSE: # %bb.0:
185+ ; SSE-NEXT: pmaddubsw %xmm2, %xmm0
186+ ; SSE-NEXT: pmaddubsw %xmm3, %xmm1
187+ ; SSE-NEXT: retq
188+ ;
189+ ; AVX1-LABEL: combine_pmaddubsw_concat:
190+ ; AVX1: # %bb.0:
191+ ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
192+ ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
193+ ; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm2
194+ ; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
195+ ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
196+ ; AVX1-NEXT: retq
197+ ;
198+ ; AVX2-LABEL: combine_pmaddubsw_concat:
199+ ; AVX2: # %bb.0:
200+ ; AVX2-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0
201+ ; AVX2-NEXT: retq
+ ; Extract the low (elements 0-15) and high (elements 16-31) halves of each input.
202+ %lo0 = shufflevector <32 x i8 > %a0 , <32 x i8 > undef , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
203+ %lo1 = shufflevector <32 x i8 > %a1 , <32 x i8 > undef , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
204+ %hi0 = shufflevector <32 x i8 > %a0 , <32 x i8 > undef , <16 x i32 > <i32 16 , i32 17 , i32 18 , i32 19 , i32 20 , i32 21 , i32 22 , i32 23 , i32 24 , i32 25 , i32 26 , i32 27 , i32 28 , i32 29 , i32 30 , i32 31 >
205+ %hi1 = shufflevector <32 x i8 > %a1 , <32 x i8 > undef , <16 x i32 > <i32 16 , i32 17 , i32 18 , i32 19 , i32 20 , i32 21 , i32 22 , i32 23 , i32 24 , i32 25 , i32 26 , i32 27 , i32 28 , i32 29 , i32 30 , i32 31 >
+ ; Apply the 128-bit intrinsic to matching halves: low with low, high with high.
206+ %lo = call <8 x i16 > @llvm.x86.ssse3.pmadd.ub.sw.128 (<16 x i8 > %lo0 , <16 x i8 > %lo1 )
207+ %hi = call <8 x i16 > @llvm.x86.ssse3.pmadd.ub.sw.128 (<16 x i8 > %hi0 , <16 x i8 > %hi1 )
+ ; Concatenate the results: mask 0-7 selects %lo, mask 8-15 selects %hi.
208+ %res = shufflevector <8 x i16 > %lo , <8 x i16 > %hi , <16 x i32 > <i32 0 , i32 1 , i32 2 , i32 3 , i32 4 , i32 5 , i32 6 , i32 7 , i32 8 , i32 9 , i32 10 , i32 11 , i32 12 , i32 13 , i32 14 , i32 15 >
209+ ret <16 x i16 > %res
210+ }
211+
212+ ; TODO: It is not beneficial to concatenate the separate 128-bit inputs just to create a single 256-bit pmaddubsw.
213+ define <16 x i16 > @combine_pmaddubsw_concat_unecessary (<16 x i8 > %a0 , <16 x i8 > %a1 , <16 x i8 > %a2 , <16 x i8 > %a3 ) {
214+ ; SSE-LABEL: combine_pmaddubsw_concat_unecessary:
215+ ; SSE: # %bb.0:
185216; SSE-NEXT: pmaddubsw %xmm1, %xmm0
186217; SSE-NEXT: pmaddubsw %xmm3, %xmm2
187218; SSE-NEXT: movdqa %xmm2, %xmm1
188219; SSE-NEXT: retq
189220;
190- ; AVX1-LABEL: combine_pmaddubsw_concat :
221+ ; AVX1-LABEL: combine_pmaddubsw_concat_unecessary :
191222; AVX1: # %bb.0:
192223; AVX1-NEXT: vpmaddubsw %xmm1, %xmm0, %xmm0
193224; AVX1-NEXT: vpmaddubsw %xmm3, %xmm2, %xmm1
194225; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
195226; AVX1-NEXT: retq
196227;
197- ; AVX2-LABEL: combine_pmaddubsw_concat :
228+ ; AVX2-LABEL: combine_pmaddubsw_concat_unecessary :
198229; AVX2: # %bb.0:
199230; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1
200231; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0
0 commit comments