Description
|  |  |
| --- | --- |
| Bugzilla Link | 39768 |
| Version | trunk |
| OS | All |
| CC | @alexey-bataev, @dtemirbulatov, @hfinkel, @RKSimon, @vporpo |
Extended Description
Filing this based on the discussion in https://reviews.llvm.org/D50840, which was abandoned. We still need this capability because nothing else may be able to generate good code for these sequences.

SLP should be able to produce a vector cast op in the following examples; InstCombine can already turn the leftover insert/extract sequence into a shuffle.
```llvm
define <4 x i32> @zext_no_shuffle_needed(<8 x i16> %in) {
  %elt0e = extractelement <8 x i16> %in, i32 0
  %elt1e = extractelement <8 x i16> %in, i32 1
  %elt2e = extractelement <8 x i16> %in, i32 2
  %elt3e = extractelement <8 x i16> %in, i32 3
  %elt0 = zext i16 %elt0e to i32
  %elt1 = zext i16 %elt1e to i32
  %elt2 = zext i16 %elt2e to i32
  %elt3 = zext i16 %elt3e to i32
  %vec.0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
  %vec.1 = insertelement <4 x i32> %vec.0, i32 %elt1, i32 1
  %vec.2 = insertelement <4 x i32> %vec.1, i32 %elt2, i32 2
  %vec.3 = insertelement <4 x i32> %vec.2, i32 %elt3, i32 3
  ret <4 x i32> %vec.3
}
```
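For the first case, the kind of IR I'd hope SLP + InstCombine could get to is roughly the following sketch (the `_expected` function name is just illustrative, and the exact shuffle/fold may come out differently):

```llvm
; Sketch (assumption): take the low 4 lanes with one shuffle, then a single vector zext.
define <4 x i32> @zext_no_shuffle_needed_expected(<8 x i16> %in) {
  %lo = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  %z = zext <4 x i16> %lo to <4 x i32>
  ret <4 x i32> %z
}
```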
```llvm
define <4 x i32> @sext_shuffled(<8 x i16> %in, <8 x i16> %in2) {
  %elt0e = extractelement <8 x i16> %in, i32 1
  %elt1e = extractelement <8 x i16> %in, i32 0
  %elt2e = extractelement <8 x i16> %in, i32 3
  %elt3e = extractelement <8 x i16> %in, i32 2
  %elt0 = sext i16 %elt0e to i32
  %elt1 = sext i16 %elt1e to i32
  %elt2 = sext i16 %elt2e to i32
  %elt3 = sext i16 %elt3e to i32
  %vec.0 = insertelement <4 x i32> undef, i32 %elt0, i32 0
  %vec.1 = insertelement <4 x i32> %vec.0, i32 %elt1, i32 1
  %vec.2 = insertelement <4 x i32> %vec.1, i32 %elt2, i32 2
  %vec.3 = insertelement <4 x i32> %vec.2, i32 %elt3, i32 3
  ret <4 x i32> %vec.3
}
```
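Similarly, a rough sketch of what the second case could become (again, the `_expected` name and exact shuffle placement are just my assumption):

```llvm
; Sketch (assumption): shuffle the wanted lanes into position, then a single vector sext.
define <4 x i32> @sext_shuffled_expected(<8 x i16> %in, <8 x i16> %in2) {
  %shuf = shufflevector <8 x i16> %in, <8 x i16> undef, <4 x i32> <i32 1, i32 0, i32 3, i32 2>
  %s = sext <4 x i16> %shuf to <4 x i32>
  ret <4 x i32> %s
}
```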
As of r347501, the backend will produce the expected 'pmovzxwd' for x86 on the 1st test (but I think we're trying to get the backend out of ad-hoc vectorization). The 2nd example is not good for an AVX target:
```asm
  vpextrw  $1, %xmm0, %eax
  vmovd    %xmm0, %ecx
  vpextrw  $3, %xmm0, %edx
  vpextrw  $2, %xmm0, %esi
  cwtl
  movswl   %cx, %ecx
  movswl   %dx, %edx
  movswl   %si, %esi
  vmovd    %eax, %xmm0
  vpinsrd  $1, %ecx, %xmm0, %xmm0
  vpinsrd  $2, %edx, %xmm0, %xmm0
  vpinsrd  $3, %esi, %xmm0, %xmm0
```