diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5a4ba74bf35fd..9665425b5e7f1 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -13776,6 +13776,14 @@ static SDValue lowerV8I16GeneralSingleInputShuffle(
     DWordPairs.resize(2, std::make_pair(-1, -1));
     int PSHUFHalfMask[4] = {DWordPairs[0].first, DWordPairs[0].second,
                             DWordPairs[1].first, DWordPairs[1].second};
+    // For splat, ensure we widen the PSHUFDMask to allow vXi64 folds.
+    if (ShuffleVectorSDNode::isSplatMask(PSHUFDMask) &&
+        ShuffleVectorSDNode::isSplatMask(PSHUFHalfMask)) {
+      int SplatIdx = ShuffleVectorSDNode::getSplatMaskIndex(PSHUFHalfMask);
+      std::fill(PSHUFHalfMask, PSHUFHalfMask + 4, SplatIdx);
+      PSHUFDMask[0] = PSHUFDMask[2] = DOffset + 0;
+      PSHUFDMask[1] = PSHUFDMask[3] = DOffset + 1;
+    }
     if ((NumHToL + NumHToH) == 0)
       return ShuffleDWordPairs(PSHUFHalfMask, PSHUFDMask, X86ISD::PSHUFLW);
     if ((NumLToL + NumLToH) == 0)
diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
index facdbad50164a..ba37d3ee0d642 100644
--- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
+++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll
@@ -2363,7 +2363,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE2-NEXT: paddb (%rdx), %xmm0
 ; SSE2-NEXT: movdqa 16(%rdx), %xmm2
 ; SSE2-NEXT: paddb %xmm1, %xmm2
@@ -2384,7 +2384,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
 ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7]
-; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0]
+; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1]
 ; SSE42-NEXT: paddb (%rdx), %xmm0
 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2
 ; SSE42-NEXT: paddb %xmm1, %xmm2
@@ -2405,7 +2405,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7]
 ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1
 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2
 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0
@@ -2497,7 +2497,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in
 ; SSE2-NEXT: pand %xmm2, %xmm1
 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7]
-; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1]
 ; SSE2-NEXT: pandn %xmm0,
%xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm2 @@ -2762,7 +2762,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm2 @@ -2783,7 +2783,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; SSE42-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: palignr {{.*#+}} xmm0 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 @@ -2802,7 +2802,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = xmm1[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -3549,7 +3549,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm2 @@ -3568,7 +3568,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: movdqa 16(%rdx), %xmm2 @@ -3586,7 +3586,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 @@ -4963,7 +4963,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in ; SSE-NEXT: movdqa (%rdi), %xmm0 ; SSE-NEXT: paddb (%rsi), %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa 16(%rdx), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rdx), %xmm2 @@ -4982,7 +4982,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb 48(%rdx), %xmm0, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index ca589e63b671a..305509ca7fc3f 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1837,7 +1837,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm1, %xmm2 @@ -1856,7 +1856,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,0,1] ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 @@ -1875,9 +1875,9 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; 
AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 @@ -1954,7 +1954,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm1 @@ -2174,7 +2174,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 @@ -2192,7 +2192,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; SSE42-NEXT: pshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] ; SSE42-NEXT: palignr {{.*#+}} xmm0 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; SSE42-NEXT: pshufb {{.*#+}} xmm0 = xmm0[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 ; SSE42-NEXT: paddb %xmm1, %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm0 @@ -2208,9 +2208,9 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX-NEXT: vpalignr {{.*#+}} xmm1 = mem[1,2,3,4,5,6,7,8,9,10,11,12,13,14,15],xmm0[0] ; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[15,0,1,2,3,4,15,6,7,8,9,10,15,12,13,14] ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 @@ -2835,7 +2835,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm0 @@ -2850,7 +2850,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; SSE42-NEXT: paddb (%rsi), %xmm1 @@ -2865,7 +2865,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 @@ -3913,7 +3913,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in ; SSE-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa 16(%rsi), %xmm1 ; SSE-NEXT: paddb %xmm0, %xmm1 ; SSE-NEXT: movdqa (%rsi), %xmm2 @@ -3930,7 +3930,7 @@ define void @vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32(ptr %in ; AVX-LABEL: vec512_i8_widen_to_i16_factor2_broadcast_to_v32i16_factor32: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb 48(%rsi), %xmm0, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm3 diff --git a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll index b7dea5f4151e2..7d0a5679936da 100644 --- a/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/avx-intrinsics-fast-isel.ll @@ -1899,7 +1899,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { ; X86-NEXT: movzwl {{[0-9]+}}(%esp), %eax ; X86-NEXT: vmovd %eax, %xmm0 ; X86-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X86-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X86-NEXT: retl ; @@ -1907,7 +1907,7 @@ define <4 x i64> @test_mm256_set1_epi16(i16 %a0) nounwind { ; X64: # %bb.0: ; X64-NEXT: vmovd %edi, %xmm0 ; X64-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X64-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X64-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; X64-NEXT: retq %res0 = insertelement <16 x i16> undef, i16 %a0, i32 0 diff --git a/llvm/test/CodeGen/X86/avx-splat.ll b/llvm/test/CodeGen/X86/avx-splat.ll index 5d3a6e3e41217..15c2aab1a82e1 100644 --- a/llvm/test/CodeGen/X86/avx-splat.ll +++ b/llvm/test/CodeGen/X86/avx-splat.ll @@ -17,7 +17,7 @@ define <16 x i16> @funcB(<16 x i16> %a) nounwind uwtable readnone ssp { ; CHECK-LABEL: funcB: ; CHECK: # %bb.0: # %entry ; CHECK-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] -; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; CHECK-NEXT: ret{{[l|q]}} entry: diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll index efb7ba393721b..423f2c49e70e5 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll @@ -97,7 +97,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; 
SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -107,7 +107,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -286,7 +286,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 @@ -300,7 +300,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 @@ -519,7 +519,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 @@ -528,7 +528,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: pand %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 @@ -540,7 +540,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -550,7 +550,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll index 7863de8754fa0..f478fa5a1f6cd 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll +++ 
b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll @@ -122,7 +122,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -133,7 +133,7 @@ define <8 x i16> @ext_i8_8i16(i8 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -360,7 +360,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm2, %xmm0 @@ -376,7 +376,7 @@ define <16 x i16> @ext_i16_16i16(i16 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 @@ -658,7 +658,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm2 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm4 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm0 @@ -669,7 +669,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; SSE2-SSSE3-NEXT: pcmpeqw %xmm5, %xmm1 ; SSE2-SSSE3-NEXT: psrlw $15, %xmm1 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm2[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa %xmm3, %xmm2 ; SSE2-SSSE3-NEXT: pand %xmm4, %xmm2 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm4, %xmm2 @@ -683,7 +683,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0 @@ -695,7 +695,7 @@ define <32 x i16> @ext_i32_32i16(i32 %a0) { ; AVX1-NEXT: vbroadcastss {{.*#+}} ymm3 = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1] ; AVX1-NEXT: vandps %ymm3, %ymm0, %ymm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, 
%ymm1 ; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1 ; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5 diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll index 33a0946c8fbe9..2a79dae43bb2f 100644 --- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll +++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool.ll @@ -94,7 +94,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -105,7 +105,7 @@ define <8 x i1> @bitcast_i8_8i1(i8 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -137,7 +137,7 @@ define <8 x i1> @bitcast_i8_8i1_freeze(i8 zeroext %a0) { ; SSE2-SSSE3: # %bb.0: ; SSE2-SSSE3-NEXT: movd %edi, %xmm0 ; SSE2-SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-SSSE3-NEXT: pand %xmm1, %xmm0 ; SSE2-SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -148,7 +148,7 @@ define <8 x i1> @bitcast_i8_8i1_freeze(i8 zeroext %a0) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/cast-vsel.ll b/llvm/test/CodeGen/X86/cast-vsel.ll index d5ccc152abc23..79513b205933e 100644 --- a/llvm/test/CodeGen/X86/cast-vsel.ll +++ b/llvm/test/CodeGen/X86/cast-vsel.ll @@ -397,10 +397,10 @@ define dso_local void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE2: # %bb.0: # %vector.ph ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movd %esi, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: movq $-4096, %rax # imm = 0xF000 ; SSE2-NEXT: .p2align 4 ; SSE2-NEXT: .LBB6_1: # %vector.body @@ -429,10 +429,10 @@ define dso_local void @example24(i16 signext %x, i16 signext %y) nounwind { ; SSE41: # %bb.0: # %vector.ph ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE41-NEXT: movd %esi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE41-NEXT: movq $-4096, %rax # imm = 0xF000 ; SSE41-NEXT: .p2align 4 ; 
SSE41-NEXT: .LBB6_1: # %vector.body @@ -458,10 +458,10 @@ define dso_local void @example24(i16 signext %x, i16 signext %y) nounwind { ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX1-NEXT: vmovd %esi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; AVX1-NEXT: movq $-4096, %rax # imm = 0xF000 ; AVX1-NEXT: vpmovsxwd %xmm2, %xmm0 ; AVX1-NEXT: vpmovsxwd %xmm3, %xmm1 diff --git a/llvm/test/CodeGen/X86/combine-and.ll b/llvm/test/CodeGen/X86/combine-and.ll index cbdc867ddeb00..e5594dc9c5e3c 100644 --- a/llvm/test/CodeGen/X86/combine-and.ll +++ b/llvm/test/CodeGen/X86/combine-and.ll @@ -849,7 +849,7 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) { ; SSE: # %bb.0: ; SSE-NEXT: movd %edi, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm1, %xmm0 ; SSE-NEXT: retq @@ -858,7 +858,7 @@ define <8 x i16> @neg_scalar_broadcast_v8i16(i16 %a0, <8 x i16> %a1) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpandn %xmm0, %xmm1, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/combine-pavg.ll b/llvm/test/CodeGen/X86/combine-pavg.ll index cb2d426a52b4b..4a75eb182d79d 100644 --- a/llvm/test/CodeGen/X86/combine-pavg.ll +++ b/llvm/test/CodeGen/X86/combine-pavg.ll @@ -86,14 +86,14 @@ define <8 x i16> @combine_pavgw_demandedelts(<8 x i16> %a0, <8 x i16> %a1) { ; SSE: # %bb.0: ; SSE-NEXT: pavgw %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: combine_pavgw_demandedelts: ; AVX1: # %bb.0: ; AVX1-NEXT: vpavgw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: combine_pavgw_demandedelts: diff --git a/llvm/test/CodeGen/X86/half.ll b/llvm/test/CodeGen/X86/half.ll index fb836cd2480a7..1b98886ba24e7 100644 --- a/llvm/test/CodeGen/X86/half.ll +++ b/llvm/test/CodeGen/X86/half.ll @@ -1253,13 +1253,13 @@ define <8 x half> @shuffle(ptr %p) { ; CHECK-LIBCALL: # %bb.0: ; CHECK-LIBCALL-NEXT: movdqu (%rdi), %xmm0 ; CHECK-LIBCALL-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-LIBCALL-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-LIBCALL-NEXT: retq ; ; BWON-F16C-LABEL: shuffle: ; BWON-F16C: # %bb.0: ; BWON-F16C-NEXT: vpshufhw {{.*#+}} xmm0 = mem[0,1,2,3,4,4,4,4] -; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; BWON-F16C-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; BWON-F16C-NEXT: retq ; ; CHECK-I686-LABEL: shuffle: @@ -1267,7 +1267,7 @@ define <8 x half> @shuffle(ptr %p) { ; CHECK-I686-NEXT: movl {{[0-9]+}}(%esp), %eax ; CHECK-I686-NEXT: movdqu (%eax), %xmm0 ; CHECK-I686-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; CHECK-I686-NEXT: pshufd 
{{.*#+}} xmm0 = xmm0[2,2,2,2] +; CHECK-I686-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; CHECK-I686-NEXT: retl %1 = load <8 x half>, ptr %p, align 8 %2 = shufflevector <8 x half> %1, <8 x half> poison, <8 x i32> diff --git a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll index b0fe43a647716..af3b07bc131a9 100644 --- a/llvm/test/CodeGen/X86/icmp-pow2-mask.ll +++ b/llvm/test/CodeGen/X86/icmp-pow2-mask.ll @@ -56,7 +56,7 @@ define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: pand %xmm2, %xmm1 @@ -70,7 +70,7 @@ define <16 x i16> @pow2_mask_v16i16(i16 zeroext %0) { ; SSE41: # %bb.0: ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [128,64,32,16,8,4,2,1] ; SSE41-NEXT: movdqa %xmm0, %xmm1 ; SSE41-NEXT: pand %xmm2, %xmm1 diff --git a/llvm/test/CodeGen/X86/insertelement-var-index.ll b/llvm/test/CodeGen/X86/insertelement-var-index.ll index 16946caf9a328..d151c6f28e51b 100644 --- a/llvm/test/CodeGen/X86/insertelement-var-index.ll +++ b/llvm/test/CodeGen/X86/insertelement-var-index.ll @@ -33,7 +33,7 @@ define <16 x i8> @arg_i8_v16i8_undef(i8 %x, i32 %y) nounwind { ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: arg_i8_v16i8_undef: @@ -80,14 +80,14 @@ define <8 x i16> @arg_i16_v8i16_undef(i16 %x, i32 %y) nounwind { ; SSE: # %bb.0: ; SSE-NEXT: movd %edi, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: arg_i16_v8i16_undef: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: arg_i16_v8i16_undef: @@ -239,7 +239,7 @@ define <16 x i8> @load_i8_v16i8_undef(ptr %p, i32 %y) nounwind { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSE41-LABEL: load_i8_v16i8_undef: @@ -284,7 +284,7 @@ define <8 x i16> @load_i16_v8i16_undef(ptr %p, i32 %y) nounwind { ; SSE-NEXT: movzwl (%rdi), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: load_i16_v8i16_undef: @@ -292,7 +292,7 @@ define <8 x i16> @load_i16_v8i16_undef(ptr %p, i32 %y) nounwind { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = 
xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_i16_v8i16_undef: @@ -465,7 +465,7 @@ define <16 x i16> @arg_i16_v16i16_undef(i16 %x, i32 %y) nounwind { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -690,7 +690,7 @@ define <16 x i16> @load_i16_v16i16_undef(ptr %p, i32 %y) nounwind { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/memset-inline.ll b/llvm/test/CodeGen/X86/memset-inline.ll index 905a0ffda061f..d3999c01a5d71 100644 --- a/llvm/test/CodeGen/X86/memset-inline.ll +++ b/llvm/test/CodeGen/X86/memset-inline.ll @@ -188,7 +188,7 @@ define void @aligned_memset_16(ptr align 16 %a, i8 %value) nounwind { ; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq ; @@ -224,7 +224,7 @@ define void @aligned_memset_32(ptr align 32 %a, i8 %value) nounwind { ; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) ; SSE2-NEXT: movdqa %xmm0, (%rdi) ; SSE2-NEXT: retq @@ -264,7 +264,7 @@ define void @aligned_memset_64(ptr align 64 %a, i8 %value) nounwind { ; SSE2-NEXT: movd %esi, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa %xmm0, 48(%rdi) ; SSE2-NEXT: movdqa %xmm0, 32(%rdi) ; SSE2-NEXT: movdqa %xmm0, 16(%rdi) diff --git a/llvm/test/CodeGen/X86/memset-nonzero.ll b/llvm/test/CodeGen/X86/memset-nonzero.ll index 96ac8dff79530..d07b0f64d68c1 100644 --- a/llvm/test/CodeGen/X86/memset-nonzero.ll +++ b/llvm/test/CodeGen/X86/memset-nonzero.ll @@ -291,7 +291,7 @@ define void @memset_16_nonconst_bytes(ptr %x, i8 %c) { ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) ; SSE2FAST-NEXT: retq ; @@ -337,7 +337,7 @@ define void @memset_32_nonconst_bytes(ptr %x, i8 %c) { ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, (%rdi) ; SSE2FAST-NEXT: retq @@ -391,7 
+391,7 @@ define void @memset_64_nonconst_bytes(ptr %x, i8 %c) { ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2FAST-NEXT: movdqu %xmm0, 48(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 32(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 16(%rdi) @@ -466,7 +466,7 @@ define void @memset_128_nonconst_bytes(ptr %x, i8 %c) { ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2FAST-NEXT: movdqu %xmm0, 112(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 96(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 80(%rdi) @@ -533,7 +533,7 @@ define void @memset_256_nonconst_bytes(ptr %x, i8 %c) { ; SSE2FAST-NEXT: movd %esi, %xmm0 ; SSE2FAST-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2FAST-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2FAST-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2FAST-NEXT: movdqu %xmm0, 240(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 224(%rdi) ; SSE2FAST-NEXT: movdqu %xmm0, 208(%rdi) diff --git a/llvm/test/CodeGen/X86/pr46527.ll b/llvm/test/CodeGen/X86/pr46527.ll index ab454cfe470f3..a624055dc1041 100644 --- a/llvm/test/CodeGen/X86/pr46527.ll +++ b/llvm/test/CodeGen/X86/pr46527.ll @@ -19,7 +19,7 @@ define void @f(ptr %out, <16 x i8> %in, i1 %flag) { ; CHECK-NEXT: movd %edx, %xmm1 ; CHECK-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; CHECK-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; CHECK-NEXT: paddb %xmm1, %xmm1 ; CHECK-NEXT: pxor %xmm0, %xmm1 ; CHECK-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}@GOTOFF(%eax), %xmm1 diff --git a/llvm/test/CodeGen/X86/pr62014.ll b/llvm/test/CodeGen/X86/pr62014.ll index a1ce577c0a024..19a6962731b6a 100644 --- a/llvm/test/CodeGen/X86/pr62014.ll +++ b/llvm/test/CodeGen/X86/pr62014.ll @@ -142,7 +142,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8 ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm2 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; SSE2-NEXT: pand %xmm3, %xmm2 ; SSE2-NEXT: pcmpeqw %xmm3, %xmm2 @@ -157,7 +157,7 @@ define <8 x i16> @select_cast_cond_multiuse_v8i16(<8 x i16> %x, <8 x i16> %y, i8 ; SSE42-NEXT: movdqa %xmm0, %xmm2 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm3 = [1,2,4,8,16,32,64,128] ; SSE42-NEXT: pand %xmm3, %xmm0 ; SSE42-NEXT: pcmpeqw %xmm3, %xmm0 @@ -275,7 +275,7 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; SSE2: # %bb.0: ; SSE2-NEXT: movd %edi, %xmm4 ; SSE2-NEXT: pshuflw {{.*#+}} xmm5 = xmm4[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm5 = xmm5[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm6 
= [1,2,4,8,16,32,64,128] ; SSE2-NEXT: pand %xmm6, %xmm5 ; SSE2-NEXT: pcmpeqw %xmm6, %xmm5 @@ -301,7 +301,7 @@ define <8 x float> @select_cast_cond_multiuse_v8i16_v8f32(<8 x float> %x, <8 x f ; SSE42-NEXT: movaps %xmm0, %xmm4 ; SSE42-NEXT: movd %edi, %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm5 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm6 = xmm5[0,1,0,1] ; SSE42-NEXT: pmovzxbw {{.*#+}} xmm5 = [1,2,4,8,16,32,64,128] ; SSE42-NEXT: pand %xmm5, %xmm6 ; SSE42-NEXT: pcmpeqw %xmm5, %xmm6 diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 9656822d144e4..61e3611dcedc9 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -258,7 +258,7 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind { ; SSE: # %bb.0: # %vector.ph ; SSE-NEXT: movd %edi, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: psubusw %xmm1, %xmm0 ; SSE-NEXT: retq ; @@ -266,7 +266,7 @@ define <8 x i16> @test3(<8 x i16> %x, i16 zeroext %w) nounwind { ; AVX1: # %bb.0: # %vector.ph ; AVX1-NEXT: vmovd %edi, %xmm1 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpsubusw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq ; @@ -331,7 +331,7 @@ define <16 x i8> @test6(<16 x i8> %x, i8 zeroext %w) nounwind { ; SSE2-NEXT: movd %edi, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: psubusb %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -546,7 +546,7 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind { ; SSE: # %bb.0: # %vector.ph ; SSE-NEXT: movd %edi, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: psubusw %xmm2, %xmm0 ; SSE-NEXT: psubusw %xmm2, %xmm1 ; SSE-NEXT: retq @@ -556,7 +556,7 @@ define <16 x i16> @test9(<16 x i16> %x, i16 zeroext %w) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; AVX1-NEXT: vmovd %edi, %xmm2 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX1-NEXT: vpsubusw %xmm2, %xmm1, %xmm1 ; AVX1-NEXT: vpsubusw %xmm2, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 @@ -686,7 +686,7 @@ define <32 x i8> @test12(<32 x i8> %x, i8 zeroext %w) nounwind { ; SSE2-NEXT: movd %edi, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: psubusb %xmm2, %xmm0 ; SSE2-NEXT: psubusb %xmm2, %xmm1 ; SSE2-NEXT: retq @@ -1220,7 +1220,7 @@ define <64 x i8> @test17(<64 x i8> %x, i8 zeroext %w) nounwind { ; SSE2-NEXT: movd %edi, %xmm4 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; SSE2-NEXT: psubusb %xmm4, %xmm0 ; 
SSE2-NEXT: psubusb %xmm4, %xmm1 ; SSE2-NEXT: psubusb %xmm4, %xmm2 @@ -1291,7 +1291,7 @@ define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind { ; SSE: # %bb.0: # %vector.ph ; SSE-NEXT: movd %edi, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; SSE-NEXT: psubusw %xmm4, %xmm0 ; SSE-NEXT: psubusw %xmm4, %xmm1 ; SSE-NEXT: psubusw %xmm4, %xmm2 @@ -1303,7 +1303,7 @@ define <32 x i16> @test18(<32 x i16> %x, i16 zeroext %w) nounwind { ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; AVX1-NEXT: vmovd %edi, %xmm3 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm3 = xmm3[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; AVX1-NEXT: vpsubusw %xmm3, %xmm2, %xmm2 ; AVX1-NEXT: vpsubusw %xmm3, %xmm0, %xmm0 ; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll index ca85afc352a57..dbdc45abb24d6 100644 --- a/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll +++ b/llvm/test/CodeGen/X86/sse2-intrinsics-fast-isel.ll @@ -4343,8 +4343,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind { ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00] ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] -; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] +; X86-SSE-NEXT: # xmm0 = xmm0[0,1,0,1] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: test_mm_set1_epi8: @@ -4369,8 +4369,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind { ; X64-SSE-NEXT: # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X64-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00] ; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] -; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] +; X64-SSE-NEXT: # xmm0 = xmm0[0,1,0,1] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_set1_epi8: @@ -4393,8 +4393,8 @@ define <2 x i64> @test_mm_set1_epi8(i8 %a0) nounwind { ; X32-SSE-NEXT: # xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X32-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00] ; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] -; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] +; X32-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] +; X32-SSE-NEXT: # xmm0 = xmm0[0,1,0,1] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_set1_epi8: @@ -4435,8 +4435,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X86-SSE-NEXT: movd %eax, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc0] ; X86-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00] ; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] -; X86-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] +; X86-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] +; X86-SSE-NEXT: # xmm0 = xmm0[0,1,0,1] ; X86-SSE-NEXT: retl # encoding: [0xc3] ; ; X86-AVX1-LABEL: 
test_mm_set1_epi16: @@ -4445,8 +4445,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X86-AVX1-NEXT: vmovd %eax, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc0] ; X86-AVX1-NEXT: vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00] ; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-AVX1-NEXT: vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00] -; X86-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] +; X86-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44] +; X86-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1] ; X86-AVX1-NEXT: retl # encoding: [0xc3] ; ; X86-AVX512-LABEL: test_mm_set1_epi16: @@ -4460,8 +4460,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X64-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7] ; X64-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00] ; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] -; X64-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] +; X64-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] +; X64-SSE-NEXT: # xmm0 = xmm0[0,1,0,1] ; X64-SSE-NEXT: retq # encoding: [0xc3] ; ; X64-AVX1-LABEL: test_mm_set1_epi16: @@ -4469,8 +4469,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X64-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7] ; X64-AVX1-NEXT: vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00] ; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X64-AVX1-NEXT: vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00] -; X64-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] +; X64-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44] +; X64-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1] ; X64-AVX1-NEXT: retq # encoding: [0xc3] ; ; X64-AVX512-LABEL: test_mm_set1_epi16: @@ -4483,8 +4483,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X32-SSE-NEXT: movd %edi, %xmm0 # encoding: [0x66,0x0f,0x6e,0xc7] ; X32-SSE-NEXT: pshuflw $0, %xmm0, %xmm0 # encoding: [0xf2,0x0f,0x70,0xc0,0x00] ; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X32-SSE-NEXT: pshufd $0, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x00] -; X32-SSE-NEXT: # xmm0 = xmm0[0,0,0,0] +; X32-SSE-NEXT: pshufd $68, %xmm0, %xmm0 # encoding: [0x66,0x0f,0x70,0xc0,0x44] +; X32-SSE-NEXT: # xmm0 = xmm0[0,1,0,1] ; X32-SSE-NEXT: retq # encoding: [0xc3] ; ; X32-AVX1-LABEL: test_mm_set1_epi16: @@ -4492,8 +4492,8 @@ define <2 x i64> @test_mm_set1_epi16(i16 %a0) nounwind { ; X32-AVX1-NEXT: vmovd %edi, %xmm0 # encoding: [0xc5,0xf9,0x6e,0xc7] ; X32-AVX1-NEXT: vpshuflw $0, %xmm0, %xmm0 # encoding: [0xc5,0xfb,0x70,0xc0,0x00] ; X32-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X32-AVX1-NEXT: vpshufd $0, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x00] -; X32-AVX1-NEXT: # xmm0 = xmm0[0,0,0,0] +; X32-AVX1-NEXT: vpshufd $68, %xmm0, %xmm0 # encoding: [0xc5,0xf9,0x70,0xc0,0x44] +; X32-AVX1-NEXT: # xmm0 = xmm0[0,1,0,1] ; X32-AVX1-NEXT: retq # encoding: [0xc3] ; ; X32-AVX512-LABEL: test_mm_set1_epi16: diff --git a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll index df8a85fd07258..5bd624c0697a0 100644 --- a/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll +++ b/llvm/test/CodeGen/X86/subvectorwise-store-of-vector-splat.ll @@ -196,7 +196,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY-NEXT: movd %xmm0, %eax ; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSE2-ONLY-NEXT: retq ; @@ -207,7 +207,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE3-NEXT: movdqa %xmm0, (%rdx) ; SSE3-NEXT: retq ; @@ -218,7 +218,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSSE3-ONLY-NEXT: movd %xmm0, %eax ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSSE3-ONLY-NEXT: retq ; @@ -228,7 +228,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE41-NEXT: pxor (%rdi), %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE41-NEXT: movdqa %xmm0, (%rdx) ; SSE41-NEXT: retq ; @@ -238,7 +238,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE42-NEXT: pxor (%rdi), %xmm0 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; @@ -248,7 +248,7 @@ define void @vec128_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vmovdqa %xmm0, (%rdx) ; AVX1-NEXT: retq ; @@ -759,7 +759,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY-NEXT: movd %xmm0, %eax ; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-ONLY-NEXT: retq @@ -771,7 +771,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE3-NEXT: movdqa %xmm0, (%rdx) ; SSE3-NEXT: movdqa %xmm0, 16(%rdx) ; SSE3-NEXT: retq @@ -783,7 +783,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSSE3-ONLY-NEXT: movd %xmm0, %eax ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) ; SSSE3-ONLY-NEXT: retq @@ -794,7 +794,7 @@ define void @vec256_v2i8(ptr 
%in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE41-NEXT: pxor (%rdi), %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE41-NEXT: movdqa %xmm0, (%rdx) ; SSE41-NEXT: movdqa %xmm0, 16(%rdx) ; SSE41-NEXT: retq @@ -805,7 +805,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE42-NEXT: pxor (%rdi), %xmm0 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq @@ -816,7 +816,7 @@ define void @vec256_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-NEXT: vzeroupper @@ -1877,7 +1877,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY-NEXT: movd %xmm0, %eax ; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx) @@ -1890,7 +1890,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE3-NEXT: movdqa %xmm0, (%rdx) ; SSE3-NEXT: movdqa %xmm0, 16(%rdx) ; SSE3-NEXT: movdqa %xmm0, 32(%rdx) @@ -1903,7 +1903,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSSE3-ONLY-NEXT: movd %xmm0, %eax ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx) @@ -1915,7 +1915,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE41-NEXT: pxor (%rdi), %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE41-NEXT: movdqa %xmm0, (%rdx) ; SSE41-NEXT: movdqa %xmm0, 16(%rdx) ; SSE41-NEXT: movdqa %xmm0, 32(%rdx) @@ -1927,7 +1927,7 @@ define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE42-NEXT: pxor (%rdi), %xmm0 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: movdqa %xmm0, 32(%rdx) @@ -1939,7 +1939,7 @@ 
define void @vec384_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1 ; AVX1-NEXT: vmovaps %ymm1, (%rdx) ; AVX1-NEXT: vmovdqa %xmm0, 32(%rdx) @@ -5076,7 +5076,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE2-ONLY-NEXT: movd %xmm0, %eax ; SSE2-ONLY-NEXT: movw %ax, (%rsi) ; SSE2-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSE2-ONLY-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-ONLY-NEXT: movdqa %xmm0, 32(%rdx) @@ -5090,7 +5090,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE3-NEXT: movd %xmm0, %eax ; SSE3-NEXT: movw %ax, (%rsi) ; SSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE3-NEXT: movdqa %xmm0, (%rdx) ; SSE3-NEXT: movdqa %xmm0, 16(%rdx) ; SSE3-NEXT: movdqa %xmm0, 32(%rdx) @@ -5104,7 +5104,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSSE3-ONLY-NEXT: movd %xmm0, %eax ; SSSE3-ONLY-NEXT: movw %ax, (%rsi) ; SSSE3-ONLY-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-ONLY-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSSE3-ONLY-NEXT: movdqa %xmm0, (%rdx) ; SSSE3-ONLY-NEXT: movdqa %xmm0, 16(%rdx) ; SSSE3-ONLY-NEXT: movdqa %xmm0, 32(%rdx) @@ -5117,7 +5117,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE41-NEXT: pxor (%rdi), %xmm0 ; SSE41-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE41-NEXT: movdqa %xmm0, (%rdx) ; SSE41-NEXT: movdqa %xmm0, 16(%rdx) ; SSE41-NEXT: movdqa %xmm0, 32(%rdx) @@ -5130,7 +5130,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; SSE42-NEXT: pxor (%rdi), %xmm0 ; SSE42-NEXT: pextrw $0, %xmm0, (%rsi) ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: movdqa %xmm0, 32(%rdx) @@ -5143,7 +5143,7 @@ define void @vec512_v2i8(ptr %in.subvec.ptr, ptr %out.subvec.ptr, ptr %out.vec.p ; AVX1-NEXT: vpxor (%rdi), %xmm0, %xmm0 ; AVX1-NEXT: vpextrw $0, %xmm0, (%rsi) ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps %ymm0, (%rdx) ; AVX1-NEXT: vmovaps %ymm0, 32(%rdx) diff --git a/llvm/test/CodeGen/X86/vec_set-H.ll b/llvm/test/CodeGen/X86/vec_set-H.ll index 071f9a162c387..932e899aaa44d 100644 --- a/llvm/test/CodeGen/X86/vec_set-H.ll +++ b/llvm/test/CodeGen/X86/vec_set-H.ll @@ -6,7 +6,7 @@ define <2 x i64> @doload64(i16 signext %x) nounwind { ; CHECK: # %bb.0: ; CHECK-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; CHECK-NEXT: 
pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; CHECK-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; CHECK-NEXT: retl %tmp36 = insertelement <8 x i16> undef, i16 %x, i32 0 %tmp37 = insertelement <8 x i16> %tmp36, i16 %x, i32 1 diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll index 6c3e4a9009975..b763b7bac2432 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-128.ll @@ -1028,7 +1028,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; XOPAVX1-LABEL: splatvar_funnnel_v8i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll index 47524b2062556..c7e430de1af97 100644 --- a/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshl-rot-256.ll @@ -844,7 +844,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index 4e30d706c0e20..399bdb7a248b8 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1067,7 +1067,7 @@ define <8 x i16> @splatvar_funnnel_v8i16(<8 x i16> %x, <8 x i16> %amt) nounwind ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll index bc2ea6839accd..d81d6ef7c9f28 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-256.ll @@ -884,7 +884,7 @@ define <16 x i16> @splatvar_funnnel_v16i16(<16 x i16> %x, <16 x i16> %amt) nounw ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; XOPAVX1-NEXT: vpsubw %xmm1, %xmm2, %xmm1 -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll index ad334cb13ae91..3f5ea542e55f0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll @@ -560,7 +560,7 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pand %xmm3, %xmm7 ; SSE-NEXT: punpcklbw {{.*#+}} xmm2 
= xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm2[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: por %xmm7, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,0,255,255,255,255,255,255,255,255] diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll index 5b59762ae2175..9d82e663c061c 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-7.ll @@ -344,7 +344,7 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] ; SSE-NEXT: pand %xmm7, %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] ; SSE-NEXT: pandn %xmm9, %xmm7 ; SSE-NEXT: por %xmm8, %xmm7 ; SSE-NEXT: pandn %xmm7, %xmm4 @@ -363,7 +363,7 @@ define void @store_i8_stride7_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,255,255,0,0,0,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[3,1,2,3] @@ -701,7 +701,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm0[0] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] ; SSE-NEXT: movdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm7 @@ -737,7 +737,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: packuswb %xmm0, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm11 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm11, %xmm14 ; SSE-NEXT: pandn %xmm0, %xmm14 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm6[0,0,2,3] @@ -780,7 +780,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: pand %xmm12, %xmm9 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm15 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: pandn %xmm2, %xmm12 ; SSE-NEXT: por %xmm9, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] @@ -792,7 +792,7 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: packuswb %xmm13, %xmm2 ; SSE-NEXT: pand 
%xmm0, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm9 = xmm15[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm9 = xmm9[0,1,0,1] ; SSE-NEXT: pandn %xmm9, %xmm0 ; SSE-NEXT: por %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm10[0,1,2,2] @@ -819,12 +819,12 @@ define void @store_i8_stride7_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp ; SSE-NEXT: por %xmm2, %xmm1 ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7] ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm10[0],xmm4[1],xmm10[1],xmm4[2],xmm10[2],xmm4[3],xmm10[3],xmm4[4],xmm10[4],xmm4[5],xmm10[5],xmm4[6],xmm10[6],xmm4[7],xmm10[7] ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,255,255,255,255,255,255,255,255,255,255,255] ; SSE-NEXT: pshufhw {{.*#+}} xmm4 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[2,3,2,3] ; SSE-NEXT: pand %xmm0, %xmm4 ; SSE-NEXT: pandn %xmm2, %xmm0 ; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm5[3,1,2,3] @@ -1378,7 +1378,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: movdqa (%r9), %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: movdqa %xmm5, %xmm1 @@ -1395,7 +1395,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pandn %xmm2, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm2 = xmm12[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm3 ; SSE-NEXT: movdqa %xmm4, %xmm9 @@ -1410,7 +1410,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm1, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm7[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 @@ -1492,11 +1492,11 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: movdqa %xmm5, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm5[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm4 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm8[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: pshufd $230, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -1539,7 +1539,7 @@ define 
void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[0,0,0,0,4,5,6,7] ; SSE-NEXT: movdqa %xmm8, %xmm3 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm5, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -1566,12 +1566,12 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm9[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm7 ; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: movdqa %xmm2, %xmm9 ; SSE-NEXT: pandn %xmm1, %xmm9 ; SSE-NEXT: pand %xmm2, %xmm7 @@ -1590,7 +1590,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm2, %xmm7 ; SSE-NEXT: pandn %xmm0, %xmm7 ; SSE-NEXT: pshuflw $233, (%rsp), %xmm0 # 16-byte Folded Reload @@ -1655,7 +1655,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm7, %xmm9 ; SSE-NEXT: pand %xmm10, %xmm9 ; SSE-NEXT: pshuflw {{.*#+}} xmm7 = xmm8[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm7 = xmm7[0,1,0,1] ; SSE-NEXT: pandn %xmm7, %xmm10 ; SSE-NEXT: por %xmm9, %xmm10 ; SSE-NEXT: movdqa {{.*#+}} xmm7 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] @@ -1680,7 +1680,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: por %xmm11, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm4[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,255,255,255,255,255,255,0,255,255,255,255,255] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm5[0,2,2,3,4,5,6,7] @@ -1689,7 +1689,7 @@ define void @store_i8_stride7_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm8[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pandn %xmm0, %xmm12 ; SSE-NEXT: por %xmm1, %xmm12 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,0,255,255,255,255,0,0,0,255,255,255,255] @@ -2660,7 +2660,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa 16(%r9), %xmm5 ; SSE-NEXT: movdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm15[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm10 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: movdqa %xmm4, %xmm8 @@ -2680,7 +2680,7 @@ define void 
@store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm3[0,1,2,3,6,6,6,6] ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa {{.*#+}} xmm9 = [255,255,255,255,255,255,0,255,255,255,255,255,255,0,255,255] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 @@ -2727,7 +2727,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm10, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm10, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] @@ -2739,14 +2739,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: movdqa %xmm15, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm15[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] ; SSE-NEXT: movdqa %xmm6, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pand %xmm9, %xmm3 @@ -2880,14 +2880,14 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: movdqa %xmm14, %xmm0 @@ -2966,7 +2966,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm14 ; SSE-NEXT: pandn %xmm1, %xmm14 @@ -3001,11 +3001,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm5[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} 
xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: movdqa %xmm15, %xmm4 ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: pand %xmm15, %xmm3 @@ -3031,7 +3031,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm4 ; SSE-NEXT: pandn %xmm0, %xmm4 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3102,11 +3102,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm3, %xmm4 ; SSE-NEXT: movdqa %xmm13, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm3 ; SSE-NEXT: por %xmm3, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm3 = xmm12[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE-NEXT: movdqa %xmm15, %xmm10 ; SSE-NEXT: pandn %xmm3, %xmm10 ; SSE-NEXT: pand %xmm15, %xmm4 @@ -3121,7 +3121,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: por %xmm0, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] @@ -3144,11 +3144,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm13, %xmm10 ; SSE-NEXT: pandn %xmm4, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm5[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; SSE-NEXT: pand %xmm13, %xmm4 ; SSE-NEXT: por %xmm4, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm4 = xmm12[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm4[0,1,0,1] ; SSE-NEXT: movdqa %xmm9, %xmm2 ; SSE-NEXT: pandn %xmm4, %xmm2 ; SSE-NEXT: pand %xmm9, %xmm10 @@ -3171,7 +3171,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm3, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm12 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm12[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm5 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: pshuflw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Folded Reload @@ -3210,7 +3210,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm4[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm3, %xmm1 ; SSE-NEXT: movdqa %xmm3, %xmm4 ; SSE-NEXT: por %xmm1, %xmm2 @@ -3232,11 +3232,11 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm13, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} 
xmm1 = xmm8[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm13, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm10[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: movdqa %xmm9, %xmm3 ; SSE-NEXT: pandn %xmm1, %xmm3 ; SSE-NEXT: pand %xmm9, %xmm2 @@ -3246,7 +3246,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm7 ; SSE-NEXT: movdqa (%rsp), %xmm5 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm13, %xmm0 ; SSE-NEXT: pshufhw $246, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,5,7,7] @@ -3255,7 +3255,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm13 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm6[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm4, %xmm3 ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: pshufhw $232, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload @@ -3270,7 +3270,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm9, %xmm1 ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $233, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload @@ -3320,7 +3320,7 @@ define void @store_i8_stride7_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm4, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm13[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pandn %xmm1, %xmm4 ; SSE-NEXT: por %xmm11, %xmm4 ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] @@ -5352,7 +5352,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm5, %xmm4 ; SSE-NEXT: pandn %xmm3, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm3 = xmm14[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm3 = xmm3[2,3,2,3] ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm3, %xmm5 ; SSE-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] @@ -5360,7 +5360,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm5, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm6[0,1,2,3,6,6,6,6] ; SSE-NEXT: movdqa %xmm6, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: movdqa %xmm10, %xmm5 @@ -5410,7 +5410,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm14, %xmm13 ; SSE-NEXT: movdqa %xmm14, 
{{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm14[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm11 ; SSE-NEXT: pandn %xmm6, %xmm11 ; SSE-NEXT: pshuflw {{.*#+}} xmm6 = xmm0[1,2,2,3,4,5,6,7] @@ -5450,7 +5450,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm11, %xmm6 ; SSE-NEXT: pandn %xmm5, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm5 = xmm15[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm5[2,3,2,3] ; SSE-NEXT: pand %xmm11, %xmm5 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: movdqa {{.*#+}} xmm8 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] @@ -5461,13 +5461,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm4, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm4, %xmm0 ; SSE-NEXT: por %xmm0, %xmm6 ; SSE-NEXT: pand %xmm8, %xmm6 ; SSE-NEXT: por %xmm5, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: pand %xmm7, %xmm6 @@ -5672,7 +5672,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm1 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Folded Reload ; SSE-NEXT: # xmm0 = mem[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm1 ; SSE-NEXT: movdqa {{.*#+}} xmm14 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] @@ -5680,7 +5680,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm6[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -5727,7 +5727,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm12, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm14, %xmm12 @@ -5735,7 +5735,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufhw $170, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5781,13 +5781,13 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; 
SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: pshufhw $170, (%rsp), %xmm1 # 16-byte Folded Reload ; SSE-NEXT: # xmm1 = mem[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: pand %xmm8, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm12, %xmm3 ; SSE-NEXT: pandn %xmm2, %xmm3 ; SSE-NEXT: pshufhw {{.*#+}} xmm1 = xmm9[0,1,2,3,6,6,6,6] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm2 ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload @@ -5828,7 +5828,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm1, %xmm2 ; SSE-NEXT: movdqa %xmm6, %xmm8 ; SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE-NEXT: pand %xmm4, %xmm1 ; SSE-NEXT: por %xmm1, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] @@ -5858,12 +5858,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: pand %xmm0, %xmm2 ; SSE-NEXT: por %xmm2, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: pandn %xmm2, %xmm10 ; SSE-NEXT: pand %xmm7, %xmm15 @@ -5904,11 +5904,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: movdqa %xmm13, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] ; SSE-NEXT: pand %xmm0, %xmm10 ; SSE-NEXT: por %xmm10, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm11[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm13 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm13, %xmm0 ; SSE-NEXT: pandn %xmm10, %xmm0 @@ -5956,7 +5956,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm1, %xmm2 ; SSE-NEXT: por %xmm2, %xmm5 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm11[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm8 @@ -5977,7 +5977,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm5 ; SSE-NEXT: pandn %xmm0, %xmm5 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm9[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload @@ -6014,7 +6014,7 @@ 
define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm12, %xmm2 ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm10[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm12, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,0,0,255,255,255,255,255,0,0,255,255,255] @@ -6026,14 +6026,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm3, %xmm2 ; SSE-NEXT: pandn %xmm6, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm9[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: pand %xmm3, %xmm6 ; SSE-NEXT: movdqa %xmm3, %xmm10 ; SSE-NEXT: por %xmm6, %xmm2 ; SSE-NEXT: pand %xmm5, %xmm2 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm4[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,5,6,6,7] @@ -6062,7 +6062,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm2 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm0, %xmm2 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] @@ -6089,12 +6089,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm13 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: pand %xmm12, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm14 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: pand %xmm7, %xmm6 @@ -6130,11 +6130,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm10, %xmm6 ; SSE-NEXT: pandn %xmm2, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm13[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: pand %xmm10, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa %xmm15, %xmm8 ; SSE-NEXT: pandn %xmm2, %xmm8 ; SSE-NEXT: pand %xmm15, %xmm6 @@ -6179,7 +6179,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm4, %xmm2 ; SSE-NEXT: por %xmm2, %xmm6 ; SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm14[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm2[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa 
%xmm2, %xmm4 ; SSE-NEXT: pandn %xmm8, %xmm4 @@ -6200,7 +6200,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm1[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm9 # 16-byte Reload @@ -6239,7 +6239,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm6, %xmm3 ; SSE-NEXT: pandn %xmm0, %xmm6 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: movdqa %xmm3, %xmm14 ; SSE-NEXT: por %xmm0, %xmm6 @@ -6252,14 +6252,14 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm4, %xmm0 ; SSE-NEXT: pandn %xmm6, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm1[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: pand %xmm4, %xmm6 ; SSE-NEXT: movdqa %xmm4, %xmm11 ; SSE-NEXT: por %xmm6, %xmm0 ; SSE-NEXT: pand %xmm3, %xmm0 ; SSE-NEXT: por %xmm8, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm5[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm6 = xmm6[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm8 ; SSE-NEXT: pandn %xmm6, %xmm8 ; SSE-NEXT: pshufhw {{.*#+}} xmm6 = xmm13[0,1,2,3,5,6,6,7] @@ -6288,7 +6288,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm8 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm0, %xmm8 ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,0,255,255,255,255,255,0,0,255,255,255,255,255] @@ -6315,12 +6315,12 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm8, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] ; SSE-NEXT: pand %xmm14, %xmm8 ; SSE-NEXT: por %xmm8, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm6 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: pandn %xmm8, %xmm15 ; SSE-NEXT: pand %xmm7, %xmm10 @@ -6359,11 +6359,11 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm2, %xmm10 ; SSE-NEXT: pandn %xmm8, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm1[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm8 ; SSE-NEXT: por %xmm8, %xmm10 ; SSE-NEXT: pshuflw {{.*#+}} xmm8 = xmm6[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm8 = xmm8[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm2 = 
[255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm2, %xmm15 ; SSE-NEXT: pandn %xmm8, %xmm15 @@ -6409,7 +6409,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm14, %xmm4 ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm6[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm1 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: movdqa %xmm1, %xmm9 ; SSE-NEXT: pandn %xmm0, %xmm9 @@ -6426,7 +6426,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: por %xmm10, %xmm0 ; SSE-NEXT: pshufhw {{.*#+}} xmm10 = xmm13[0,1,2,3,5,5,5,5] -; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm10 = xmm10[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm15 ; SSE-NEXT: pandn %xmm10, %xmm15 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm3 # 16-byte Reload @@ -6464,7 +6464,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm14, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm11[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm14, %xmm0 ; SSE-NEXT: por %xmm0, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm3[0,1,2,3,4,6,6,7] @@ -6473,7 +6473,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm1, %xmm15 ; SSE-NEXT: pandn %xmm0, %xmm15 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm13[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: pand %xmm1, %xmm0 ; SSE-NEXT: movdqa %xmm1, %xmm2 ; SSE-NEXT: por %xmm0, %xmm15 @@ -6483,7 +6483,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm15, %xmm0 ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm8[0,1,2,3,7,7,7,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm5[0,1,2,3,5,6,6,7] @@ -6510,7 +6510,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm11 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm11[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pand %xmm2, %xmm0 ; SSE-NEXT: movdqa %xmm2, %xmm6 ; SSE-NEXT: por %xmm0, %xmm10 @@ -6532,7 +6532,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa %xmm0, %xmm3 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm15 = [255,255,255,255,255,0,255,255,255,255,255,255,0,255,255,255] ; SSE-NEXT: pand %xmm15, %xmm0 ; SSE-NEXT: movdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm5 # 16-byte Reload @@ -6543,7 +6543,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm15 ; SSE-NEXT: movdqa 
{{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm7, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm10 ; SSE-NEXT: pand %xmm7, %xmm15 @@ -6574,7 +6574,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: pand %xmm4, %xmm3 ; SSE-NEXT: por %xmm10, %xmm3 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm2[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm6, %xmm4 ; SSE-NEXT: pand %xmm6, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm10 = xmm5[0,2,2,3,4,5,6,7] @@ -6583,7 +6583,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: por %xmm0, %xmm4 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[1,1,1,1,4,5,6,7] ; SSE-NEXT: movdqa %xmm1, %xmm15 -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa {{.*#+}} xmm6 = [255,255,255,255,0,255,255,255,255,255,255,0,255,255,255,255] ; SSE-NEXT: movdqa %xmm6, %xmm10 ; SSE-NEXT: pandn %xmm0, %xmm10 @@ -6621,7 +6621,7 @@ define void @store_i8_stride7_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; SSE-NEXT: movdqa {{.*#+}} xmm3 = [255,255,0,255,255,255,255,255,255,0,255,255,255,255,255,255] ; SSE-NEXT: pand %xmm3, %xmm14 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm15[2,2,2,2,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pandn %xmm0, %xmm3 ; SSE-NEXT: por %xmm14, %xmm3 ; SSE-NEXT: movdqa {{.*#+}} xmm0 = [255,255,255,0,0,0,0,255,255,255,0,0,0,0,255,255] diff --git a/llvm/test/CodeGen/X86/vector-rotate-128.ll b/llvm/test/CodeGen/X86/vector-rotate-128.ll index b114cba14cb6c..993e6afc0eaf3 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-128.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-128.ll @@ -898,7 +898,7 @@ define <8 x i16> @splatvar_rotate_v8i16(<8 x i16> %a, <8 x i16> %b) nounwind { ; XOPAVX1-LABEL: splatvar_rotate_v8i16: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-rotate-256.ll b/llvm/test/CodeGen/X86/vector-rotate-256.ll index 86c4d79a28c89..ec45f64bdb0e6 100644 --- a/llvm/test/CodeGen/X86/vector-rotate-256.ll +++ b/llvm/test/CodeGen/X86/vector-rotate-256.ll @@ -735,7 +735,7 @@ define <16 x i16> @splatvar_rotate_v16i16(<16 x i16> %a, <16 x i16> %b) nounwind ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm2 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vprotw %xmm1, %xmm2, %xmm2 ; XOPAVX1-NEXT: vprotw %xmm1, %xmm0, %xmm0 ; XOPAVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll index 8620651c1c199..f57efb40bf0e3 100644 --- a/llvm/test/CodeGen/X86/vector-sext.ll +++ b/llvm/test/CodeGen/X86/vector-sext.ll @@ -2126,7 +2126,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; SSE2-NEXT: movzbl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: 
pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: pcmpeqw %xmm1, %xmm0 @@ -2137,7 +2137,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; SSSE3-NEXT: movzbl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSSE3-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSSE3-NEXT: pand %xmm1, %xmm0 ; SSSE3-NEXT: pcmpeqw %xmm1, %xmm0 @@ -2148,7 +2148,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; SSE41-NEXT: movzbl (%rdi), %eax ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; SSE41-NEXT: pand %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 @@ -2159,7 +2159,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; AVX1-NEXT: movzbl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 @@ -2198,7 +2198,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; X86-SSE2-NEXT: movzbl (%eax), %eax ; X86-SSE2-NEXT: movd %eax, %xmm0 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; X86-SSE2-NEXT: pand %xmm1, %xmm0 ; X86-SSE2-NEXT: pcmpeqw %xmm1, %xmm0 @@ -2210,7 +2210,7 @@ define <8 x i16> @load_sext_8i1_to_8i16(ptr%ptr) { ; X86-SSE41-NEXT: movzbl (%eax), %eax ; X86-SSE41-NEXT: movd %eax, %xmm0 ; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; X86-SSE41-NEXT: pmovzxbw {{.*#+}} xmm1 = [1,2,4,8,16,32,64,128] ; X86-SSE41-NEXT: pand %xmm1, %xmm0 ; X86-SSE41-NEXT: pcmpeqw %xmm1, %xmm0 @@ -2654,7 +2654,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; SSE2-NEXT: movzwl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE2-NEXT: movdqa %xmm1, %xmm0 ; SSE2-NEXT: pand %xmm2, %xmm0 @@ -2669,7 +2669,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; SSSE3-NEXT: movzwl (%rdi), %eax ; SSSE3-NEXT: movd %eax, %xmm0 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSSE3-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSSE3-NEXT: movdqa %xmm1, %xmm0 ; SSSE3-NEXT: pand %xmm2, %xmm0 @@ -2684,7 +2684,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; SSE41-NEXT: movzwl (%rdi), %eax ; SSE41-NEXT: movd %eax, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: 
pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; SSE41-NEXT: movdqa %xmm1, %xmm0 ; SSE41-NEXT: pand %xmm2, %xmm0 @@ -2699,7 +2699,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128,256,512,1024,2048,4096,8192,16384,32768] ; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0 @@ -2737,7 +2737,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; X86-SSE2-NEXT: movzwl (%eax), %eax ; X86-SSE2-NEXT: movd %eax, %xmm0 ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; X86-SSE2-NEXT: movdqa {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; X86-SSE2-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE2-NEXT: pand %xmm2, %xmm0 @@ -2753,7 +2753,7 @@ define <16 x i16> @load_sext_16i1_to_16i16(ptr%ptr) { ; X86-SSE41-NEXT: movzwl (%eax), %eax ; X86-SSE41-NEXT: movd %eax, %xmm0 ; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; X86-SSE41-NEXT: pmovzxbw {{.*#+}} xmm2 = [1,2,4,8,16,32,64,128] ; X86-SSE41-NEXT: movdqa %xmm1, %xmm0 ; X86-SSE41-NEXT: pand %xmm2, %xmm0 @@ -3764,7 +3764,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE2-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -3804,7 +3804,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movd %edi, %xmm0 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -3844,7 +3844,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; SSE41-NEXT: psllq $58, %xmm1 @@ -3879,7 +3879,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpsllw $10, %xmm0, %xmm0 ; AVX1-NEXT: vpsraw $10, %xmm0, %xmm1 @@ -3920,7 +3920,7 @@ define <8 x i64> 
@sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; X86-SSE2: # %bb.0: # %entry ; X86-SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; X86-SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; X86-SSE2-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; X86-SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -3960,7 +3960,7 @@ define <8 x i64> @sext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; X86-SSE41: # %bb.0: # %entry ; X86-SSE41-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X86-SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; X86-SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; X86-SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3 ; X86-SSE41-NEXT: pmovzxwq {{.*#+}} xmm1 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; X86-SSE41-NEXT: psllq $58, %xmm1 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll index 54056461bff8c..60295f1c145a1 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-128.ll @@ -803,7 +803,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; SSE2-NEXT: psrlw %xmm1, %xmm2 @@ -917,7 +917,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 @@ -1107,7 +1107,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; SSE2-NEXT: psrlw %xmm1, %xmm2 @@ -1226,7 +1226,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll index 
e577388d88c2c..4f8cbc07243fd 100644 --- a/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-ashr-sub128.ll @@ -1311,7 +1311,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; SSE2-NEXT: psrlw %xmm1, %xmm2 @@ -1426,7 +1426,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 @@ -1449,7 +1449,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; SSE2-NEXT: psrlw %xmm1, %xmm2 @@ -1564,7 +1564,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 @@ -1587,7 +1587,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; SSE2-NEXT: psrlw %xmm1, %xmm2 @@ -1693,7 +1693,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm2 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; X86-SSE-NEXT: pand %xmm2, %xmm0 ; X86-SSE-NEXT: movdqa {{.*#+}} xmm2 = [32896,32896,32896,32896,32896,32896,32896,32896] ; X86-SSE-NEXT: psrlw %xmm1, %xmm2 diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll index 467c1574180da..1d1697aa38bae 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-128.ll @@ 
-658,7 +658,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -756,7 +756,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer @@ -904,7 +904,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1007,7 +1007,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %mod = and <16 x i8> %b, diff --git a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll index e0d1c256ebecf..79281116665f6 100644 --- a/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-lshr-sub128.ll @@ -1077,7 +1077,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1176,7 +1176,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -1195,7 +1195,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1294,7 +1294,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; 
X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -1313,7 +1313,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-NEXT: psrlw $8, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1403,7 +1403,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; X86-SSE-NEXT: psrlw $8, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll index 4dda9ff09cc62..2b1cf5b671e53 100644 --- a/llvm/test/CodeGen/X86/vector-shift-shl-128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-128.ll @@ -568,7 +568,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; SSE2-NEXT: psllw %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -663,7 +663,7 @@ define <16 x i8> @splatvar_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwind { ; X86-SSE-NEXT: psllw %xmm1, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <16 x i8> %b, <16 x i8> poison, <16 x i32> zeroinitializer @@ -810,7 +810,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; SSE2-NEXT: psllw %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -910,7 +910,7 @@ define <16 x i8> @splatvar_modulo_shift_v16i8(<16 x i8> %a, <16 x i8> %b) nounwi ; X86-SSE-NEXT: psllw %xmm1, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %mod = and <16 x i8> %b, diff --git a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll index f213db0349c31..d245bdca6ee29 100644 --- 
a/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll +++ b/llvm/test/CodeGen/X86/vector-shift-shl-sub128.ll @@ -929,7 +929,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; SSE2-NEXT: psllw %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1024,7 +1024,7 @@ define <8 x i8> @splatvar_shift_v8i8(<8 x i8> %a, <8 x i8> %b) nounwind { ; X86-SSE-NEXT: psllw %xmm1, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <8 x i8> %b, <8 x i8> poison, <8 x i32> zeroinitializer @@ -1042,7 +1042,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; SSE2-NEXT: psllw %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1137,7 +1137,7 @@ define <4 x i8> @splatvar_shift_v4i8(<4 x i8> %a, <4 x i8> %b) nounwind { ; X86-SSE-NEXT: psllw %xmm1, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <4 x i8> %b, <4 x i8> poison, <4 x i32> zeroinitializer @@ -1155,7 +1155,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; SSE2-NEXT: psllw %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm0 ; SSE2-NEXT: retq ; @@ -1243,7 +1243,7 @@ define <2 x i8> @splatvar_shift_v2i8(<2 x i8> %a, <2 x i8> %b) nounwind { ; X86-SSE-NEXT: psllw %xmm1, %xmm2 ; X86-SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; X86-SSE-NEXT: pshuflw {{.*#+}} xmm1 = xmm2[0,0,0,0,4,5,6,7] -; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; X86-SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; X86-SSE-NEXT: pand %xmm1, %xmm0 ; X86-SSE-NEXT: retl %splat = shufflevector <2 x i8> %b, <2 x i8> poison, <2 x i32> zeroinitializer diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll index b1c90aa8021b8..442bfde6a9e2e 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v16.ll @@ -18,7 +18,7 @@ define <16 x i8> @shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00( ; SSE2: # %bb.0: ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: 
shuffle_v16i8_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: @@ -235,13 +235,13 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { ; SSE-LABEL: shuffle_v16i8_0101010101010101: ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v16i8_0101010101010101: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v16i8_0101010101010101: @@ -252,7 +252,7 @@ define <16 x i8> @shuffle_v16i8_0101010101010101(<16 x i8> %a, <16 x i8> %b) { ; XOPAVX1-LABEL: shuffle_v16i8_0101010101010101: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i8_0101010101010101: @@ -2274,7 +2274,7 @@ define <16 x i8> @insert_dup_mem_v16i8_i32(ptr %ptr) { ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v16i8_i32: @@ -2328,7 +2328,7 @@ define <16 x i8> @insert_dup_mem_v16i8_sext_i8(ptr %ptr) { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_mem_v16i8_sext_i8: @@ -2386,7 +2386,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_i32(ptr %ptr) { ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_i32: @@ -2435,7 +2435,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_i32(ptr %ptr) { ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_i32: @@ -2485,7 +2485,7 @@ define <16 x i8> @insert_dup_elt1_mem_v16i8_sext_i8(ptr %ptr) { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt1_mem_v16i8_sext_i8: @@ -2553,7 +2553,7 @@ define <16 x i8> @insert_dup_elt2_mem_v16i8_sext_i8(ptr %ptr) { ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt2_mem_v16i8_sext_i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll index 212cde9fcd6b2..f3659fd934e71 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-128-v8.ll @@ -68,13 +68,13 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) { ; SSE-LABEL: shuffle_v8i16_00000000: ; SSE: # %bb.0: ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: shuffle_v8i16_00000000: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: shuffle_v8i16_00000000: @@ -85,7 +85,7 @@ define <8 x i16> @shuffle_v8i16_00000000(<8 x i16> %a, <8 x i16> %b) { ; XOPAVX1-LABEL: shuffle_v8i16_00000000: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v8i16_00000000: @@ -3191,14 +3191,14 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(ptr %ptr) { ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i32: @@ -3210,7 +3210,7 @@ define <8 x i16> @insert_dup_mem_v8i16_i32(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_mem_v8i16_i32: @@ -3230,7 +3230,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(ptr %ptr) { ; SSE-NEXT: movzwl (%rdi), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16: @@ -3238,7 +3238,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(ptr %ptr) { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16: @@ -3251,7 +3251,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16(ptr %ptr) { ; XOPAVX1-NEXT: movzwl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; 
; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16: @@ -3271,14 +3271,14 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(ptr %ptr) { ; SSE: # %bb.0: ; SSE-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i32: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i32: @@ -3290,7 +3290,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i32(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i32: @@ -3309,7 +3309,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) { ; SSE2: # %bb.0: ; SSE2-NEXT: movd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_i32: @@ -3328,7 +3328,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i32: @@ -3340,7 +3340,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i32(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vbroadcastss (%rdi), %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i32: @@ -3360,7 +3360,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(ptr %ptr) { ; SSE-NEXT: movswl (%rdi), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -3368,7 +3368,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(ptr %ptr) { ; AVX1-NEXT: movswl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -3391,7 +3391,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_sext_i16(ptr %ptr) { ; XOPAVX1-NEXT: movswl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_sext_i16: @@ -3415,7 +3415,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_sext_i16(ptr %ptr) { ; SSE2-NEXT: 
movswl (%rdi), %eax ; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt3_mem_v8i16_sext_i16: @@ -3481,14 +3481,14 @@ define <8 x i16> @insert_dup_mem_v8i16_i64(ptr %ptr) { ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_i64: @@ -3500,7 +3500,7 @@ define <8 x i16> @insert_dup_mem_v8i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_mem_v8i16_i64: @@ -3519,14 +3519,14 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(ptr %ptr) { ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt1_mem_v8i16_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt1_mem_v8i16_i64: @@ -3538,7 +3538,7 @@ define <8 x i16> @insert_dup_elt1_mem_v8i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt1_mem_v8i16_i64: @@ -3557,14 +3557,14 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(ptr %ptr) { ; SSE: # %bb.0: ; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_elt3_mem_v8i16_i64: ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt3_mem_v8i16_i64: @@ -3576,7 +3576,7 @@ define <8 x i16> @insert_dup_elt3_mem_v8i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt3_mem_v8i16_i64: @@ -3596,7 +3596,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(ptr %ptr) { ; SSE2-NEXT: movq {{.*#+}} 
xmm0 = mem[0],zero ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE2-NEXT: retq ; ; SSSE3-LABEL: insert_dup_elt7_mem_v8i16_i64: @@ -3615,7 +3615,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(ptr %ptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_elt7_mem_v8i16_i64: @@ -3627,7 +3627,7 @@ define <8 x i16> @insert_dup_elt7_mem_v8i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovddup {{.*#+}} xmm0 = mem[0,0] ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_elt7_mem_v8i16_i64: @@ -3647,7 +3647,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(ptr %ptr) { ; SSE-NEXT: movzwl (%rdi), %eax ; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: insert_dup_mem_v8i16_sext_i16_i64: @@ -3655,7 +3655,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(ptr %ptr) { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2OR512VL-LABEL: insert_dup_mem_v8i16_sext_i16_i64: @@ -3668,7 +3668,7 @@ define <8 x i16> @insert_dup_mem_v8i16_sext_i16_i64(ptr %ptr) { ; XOPAVX1-NEXT: movzwl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: insert_dup_mem_v8i16_sext_i16_i64: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll index 4ebf4b3c5668a..154533d5ec107 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v16.ll @@ -13,7 +13,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -25,7 +25,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -41,7 +41,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: 
; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -76,7 +76,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_00_01_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,1,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -96,7 +96,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -131,7 +131,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_00_02_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,2,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -151,7 +151,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -186,7 +186,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_00_03_00_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,3,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,0] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -206,7 +206,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -226,7 +226,7 @@ define <16 x i16> 
@shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_00_04_00_00_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq @@ -244,7 +244,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -264,7 +264,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_00_05_00_00_00_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq @@ -282,7 +282,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -302,7 +302,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_00_06_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq @@ -320,7 +320,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 ; AVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq @@ -340,7 +340,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_07_00_00_00_00_00_00_00: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, 
%ymm1, %ymm0 ; XOPAVX1-NEXT: retq @@ -362,7 +362,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -383,7 +383,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1],xmm1[0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -404,7 +404,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,3,0,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -425,7 +425,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_09_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1],xmm1[2,3],xmm0[0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -445,7 +445,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,0,1,10,11,0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -466,7 +466,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_10_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1,0,1],xmm1[4,5],xmm0[0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -486,7 +486,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,0,1,14,15,0,1,0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -507,7 +507,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_11_00_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; 
XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1,0,1],xmm1[6,7],xmm0[0,1,0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -527,7 +527,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,0,1,8,9,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -548,7 +548,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_12_00_00_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1,0,1],xmm1[8,9],xmm0[0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -568,7 +568,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,0,1,10,11,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -589,7 +589,7 @@ define <16 x i16> @shuffle_v16i16_00_00_13_00_00_00_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1,0,1],xmm1[10,11],xmm0[0,1,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -609,7 +609,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[0,1,12,13,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -630,7 +630,7 @@ define <16 x i16> @shuffle_v16i16_00_14_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm0[0,1],xmm1[12,13],xmm0[0,1,0,1,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -650,7 +650,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; AVX1-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] ; AVX1-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[14,15,0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} 
xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; AVX1-NEXT: retq ; @@ -671,7 +671,7 @@ define <16 x i16> @shuffle_v16i16_15_00_00_00_00_00_00_00_00_00_00_00_00_00_00_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm1 = xmm1[14,15],xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -691,13 +691,13 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -708,7 +708,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; AVX512VL-SLOW-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: @@ -722,13 +722,13 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_00_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -741,13 +741,13 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; 
AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -758,7 +758,7 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; AVX512VL-SLOW-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; AVX512VL-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: @@ -772,13 +772,13 @@ define <16 x i16> @shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_1 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; XOPAVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_07_07_07_07_07_07_07_07_15_15_15_15_15_15_15_15: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,3,2,3,6,7,6,7] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -2437,7 +2437,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0 ; AVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2451,7 +2451,7 @@ define <16 x i16> @shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_0 ; XOPAVX1-LABEL: shuffle_v16i16_00_uu_uu_00_00_00_00_00_08_08_uu_uu_08_08_14_08: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,12,13,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 @@ -2531,7 +2531,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1 ; AVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] @@ -2546,7 +2546,7 @@ define <16 x i16> @shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_1 ; XOPAVX1-LABEL: shuffle_v16i16_04_04_04_04_uu_uu_uu_uu_08_08_08_uu_uu_12_12_12: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm1 = xmm0[0,1,2,3,4,4,4,4] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,2,2] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3] ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: 
vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] @@ -3452,7 +3452,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,1,4,5,6,7] ; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,1] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -3473,7 +3473,7 @@ define <16 x i16> @shuffle_v16i16_00_00_00_00_00_00_00_08_08_08_08_08_08_08_08_0 ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm1 ; XOPAVX1-NEXT: vpperm {{.*#+}} xmm0 = xmm0[0,1,0,1,0,1,0,1,0,1,0,1,0,1],xmm1[0,1] ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm1 = xmm1[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -6779,14 +6779,14 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; AVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: @@ -6798,7 +6798,7 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; AVX512VL-SLOW-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %ymm0 +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %ymm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: @@ -6810,14 +6810,14 @@ define <16 x i16> @shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3(<16 x i16> %a, ; XOPAVX1-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_u_u_u_u_u_u_u_u_3_3_3_3_3_3_3_3: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX2-NEXT: vpbroadcastd %xmm0, %ymm0 +; XOPAVX2-NEXT: vpbroadcastq %xmm0, %ymm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -6828,7 +6828,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -6860,7 +6860,7 @@ define <16 x i16> @shuffle_v16i16_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8_8(<16 x i16> %a, ; XOPAVX1: # 
%bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -6886,13 +6886,13 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; AVX1: # %bb.0: ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: @@ -6903,7 +6903,7 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX512VL-SLOW-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: @@ -6914,13 +6914,13 @@ define <16 x i16> @shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u(<16 x i16> %a, ; XOPAVX1-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_3_3_3_3_3_3_3_3_u_u_u_u_u_u_u_u: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -6931,14 +6931,14 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX2-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: @@ -6951,7 +6951,7 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; AVX512VL-SLOW: # %bb.0: ; AVX512VL-SLOW-NEXT: vextracti128 $1, %ymm0, %xmm0 ; AVX512VL-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX512VL-SLOW-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512VL-SLOW-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512VL-SLOW-NEXT: retq ; ; AVX512VL-FAST-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: @@ -6964,14 +6964,14 @@ define <16 x i16> @shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u(<16 x i16> %a, ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 
; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_9_9_9_9_9_9_9_9_u_u_u_u_u_u_u_u: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 ; XOPAVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX2-NEXT: vpbroadcastd %xmm0, %xmm0 +; XOPAVX2-NEXT: vpbroadcastq %xmm0, %xmm0 ; XOPAVX2-NEXT: retq %shuffle = shufflevector <16 x i16> %a, <16 x i16> %b, <16 x i32> ret <16 x i16> %shuffle @@ -7381,7 +7381,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7413,7 +7413,7 @@ define <16 x i16> @shuffle_v16i16_08_08_08_08_08_08_08_08_08_08_08_08_08_08_08_0 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7431,7 +7431,7 @@ define <16 x i16> @shuffle_v16i16_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11_1 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7463,7 +7463,7 @@ define <16 x i16> @shuffle_v16i16_11_11_11_11_11_11_11_11_11_11_11_11_11_11_11_1 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7482,29 +7482,15 @@ define <16 x i16> @shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_1 ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = 
ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: ; AVX512VL: # %bb.0: @@ -7516,15 +7502,14 @@ define <16 x i16> @shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_1 ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v16i16_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15_15: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; XOPAVX2-NEXT: retq %r = shufflevector <16 x i16> %x, <16 x i16> poison, <16 x i32> splat(i32 15) ret <16 x i16> %r @@ -7761,7 +7746,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(ptr %ptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7774,7 +7759,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i32(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7795,7 +7780,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(ptr %ptr) { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7809,7 +7794,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16(ptr %ptr) { ; XOPAVX1-NEXT: movzwl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7830,7 +7815,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(ptr %ptr) #0 { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7843,7 +7828,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i32(ptr %ptr) #0 { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovd {{.*#+}} xmm0 = 
mem[0],zero,zero,zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7863,7 +7848,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(ptr %ptr) #0 { ; AVX1: # %bb.0: ; AVX1-NEXT: vbroadcastss (%rdi), %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7876,7 +7861,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i32(ptr %ptr) #0 { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vbroadcastss (%rdi), %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7896,7 +7881,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i64(ptr %ptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7909,7 +7894,7 @@ define <16 x i16> @insert_dup_mem_v16i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7929,7 +7914,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(ptr %ptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7942,7 +7927,7 @@ define <16 x i16> @insert_dup_elt1_mem_v16i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -7962,7 +7947,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(ptr %ptr) { ; AVX1: # %bb.0: ; AVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -7975,7 +7960,7 @@ define <16 x i16> @insert_dup_elt3_mem_v16i16_i64(ptr %ptr) { ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[3,3,3,3,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; @@ -8027,7 +8012,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) { ; AVX1-NEXT: movzwl (%rdi), %eax ; AVX1-NEXT: vmovd %eax, %xmm0 ; AVX1-NEXT: 
vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -8041,7 +8026,7 @@ define <16 x i16> @insert_dup_mem_v16i16_sext_i16_i64(ptr %ptr) { ; XOPAVX1-NEXT: movzwl (%rdi), %eax ; XOPAVX1-NEXT: vmovd %eax, %xmm0 ; XOPAVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll index d287fb6d5b834..d08bfc6e2d7ea 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v32.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=ALL,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-SLOW -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-ALL -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST,AVX2-FAST-PERLANE +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX2,AVX2-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-SLOW ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512vl,+avx512bw,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=ALL,AVX2OR512VL,AVX512VL,AVX512VLBW,AVX512VLBW-FAST @@ -2416,7 +2416,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX1: # %bb.0: ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -2451,14 +2451,14 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX1-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0 -; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vmovddup {{.*#+}} ymm0 = ymm0[0,0,2,2] ; AVX1-NEXT: retq ; ; AVX2-SLOW-LABEL: 
shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: ; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: @@ -2471,7 +2471,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; AVX512VLBW-SLOW: # %bb.0: ; AVX512VLBW-SLOW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; AVX512VLBW-SLOW-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; AVX512VLBW-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; AVX512VLBW-SLOW-NEXT: retq ; ; AVX512VLBW-FAST-LABEL: shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_16_48_16_48_16_48_16_48_16_48_16_48_16_48_16_48: @@ -2500,7 +2500,7 @@ define <32 x i8> @shuffle_v32i8_00_32_00_32_00_32_00_32_00_32_00_32_00_32_00_32_ ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0],ymm1[0],ymm0[1],ymm1[1],ymm0[2],ymm1[2],ymm0[3],ymm1[3],ymm0[4],ymm1[4],ymm0[5],ymm1[5],ymm0[6],ymm1[6],ymm0[7],ymm1[7],ymm0[16],ymm1[16],ymm0[17],ymm1[17],ymm0[18],ymm1[18],ymm0[19],ymm1[19],ymm0[20],ymm1[20],ymm0[21],ymm1[21],ymm0[22],ymm1[22],ymm0[23],ymm1[23] ; XOPAVX2-NEXT: vpshuflw {{.*#+}} ymm0 = ymm0[0,0,0,0,4,5,6,7,8,8,8,8,12,13,14,15] -; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,0,0,0,4,4,4,4] +; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[0,1,0,1,4,5,4,5] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> %b, <32 x i32> ret <32 x i8> %shuffle @@ -4845,29 +4845,15 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_ ; AVX1: # %bb.0: ; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; AVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; -; AVX2-SLOW-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; AVX2-SLOW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-ALL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-FAST-ALL: # %bb.0: -; AVX2-FAST-ALL-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; AVX2-FAST-ALL-NEXT: vpbroadcastd {{.*#+}} ymm1 = [6,6,6,6,6,6,6,6] -; 
AVX2-FAST-ALL-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-ALL-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,30,31,30,31,30,31,30,31,u,u,u,u,u,u,u,u] -; AVX2-FAST-PERLANE-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] -; AVX2-FAST-PERLANE-NEXT: retq +; AVX2-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: +; AVX2: # %bb.0: +; AVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] +; AVX2-NEXT: retq ; ; AVX512VL-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; AVX512VL: # %bb.0: @@ -4879,15 +4865,14 @@ define <32 x i8> @shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_ ; XOPAVX1: # %bb.0: ; XOPAVX1-NEXT: vextractf128 $1, %ymm0, %xmm0 ; XOPAVX1-NEXT: vpshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,7,7,7,7] -; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; XOPAVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; XOPAVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; XOPAVX1-NEXT: retq ; ; XOPAVX2-LABEL: shuffle_v32i8_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31_30_31: ; XOPAVX2: # %bb.0: ; XOPAVX2-NEXT: vpshufhw {{.*#+}} ymm0 = ymm0[0,1,2,3,7,7,7,7,8,9,10,11,15,15,15,15] -; XOPAVX2-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,2,2,6,6,6,6] -; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,2,2] +; XOPAVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[3,3,3,3] ; XOPAVX2-NEXT: retq %shuffle = shufflevector <32 x i8> %a, <32 x i8> poison, <32 x i32> ret <32 x i8> %shuffle diff --git a/llvm/test/CodeGen/X86/vector-trunc.ll b/llvm/test/CodeGen/X86/vector-trunc.ll index 1a5f5fd5e6db5..732cc445ddcd8 100644 --- a/llvm/test/CodeGen/X86/vector-trunc.ll +++ b/llvm/test/CodeGen/X86/vector-trunc.ll @@ -1838,7 +1838,7 @@ define <8 x i16> @PR32160(<8 x i32> %x) { ; SSE-LABEL: PR32160: ; SSE: # %bb.0: ; SSE-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,4,4,4] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,3,2,3] ; SSE-NEXT: retq ; ; AVX-LABEL: PR32160: @@ -1852,7 +1852,7 @@ define <8 x i16> @PR32160(<8 x i32> %x) { ; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[2,2,2,2,4,5,6,7] -; AVX512F-NEXT: vpbroadcastd %xmm0, %xmm0 +; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll index 44bd58f9d4121..97124f0a9d8d9 100644 --- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll +++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll @@ -571,7 +571,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) { ; SSE2-NEXT: andl $7, %edi ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,1,2,3,4,5,6,7] ; SSE2-NEXT: pcmpgtw %xmm0, %xmm1 ; SSE2-NEXT: pcmpeqd %xmm0, %xmm0 @@ -583,7 +583,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) { ; SSE41-NEXT: andl $7, %edi ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} 
xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] ; SSE41-NEXT: pmovsxbw {{.*#+}} xmm0 = [0,1,2,3,4,5,6,7] ; SSE41-NEXT: pmaxuw %xmm1, %xmm0 ; SSE41-NEXT: pcmpeqw %xmm1, %xmm0 @@ -594,7 +594,7 @@ define <8 x i16> @PR47448_uge(i16 signext %0) { ; AVX1-NEXT: andl $7, %edi ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpmaxuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; AVX1-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 ; AVX1-NEXT: retq @@ -621,7 +621,7 @@ define <8 x i16> @PR47448_ugt(i16 signext %0) { ; SSE-NEXT: andl $7, %edi ; SSE-NEXT: movd %edi, %xmm0 ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: pcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE-NEXT: retq ; @@ -630,7 +630,7 @@ define <8 x i16> @PR47448_ugt(i16 signext %0) { ; AVX1-NEXT: andl $7, %edi ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpcmpgtw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-zext.ll b/llvm/test/CodeGen/X86/vector-zext.ll index 74926f46ffa43..bd1a48ba5d6ec 100644 --- a/llvm/test/CodeGen/X86/vector-zext.ll +++ b/llvm/test/CodeGen/X86/vector-zext.ll @@ -2448,7 +2448,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE2: # %bb.0: # %entry ; SSE2-NEXT: movd %edi, %xmm0 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE2-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -2469,7 +2469,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSSE3: # %bb.0: # %entry ; SSSE3-NEXT: movd %edi, %xmm0 ; SSSE3-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSSE3-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSSE3-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSSE3-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,0,0,0] ; SSSE3-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,5,5,5,5] @@ -2490,7 +2490,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; SSE41: # %bb.0: # %entry ; SSE41-NEXT: movd %edi, %xmm0 ; SSE41-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE41-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE41-NEXT: paddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3 ; SSE41-NEXT: pmovzxwq {{.*#+}} xmm0 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero ; SSE41-NEXT: pmovsxbq {{.*#+}} xmm4 = [63,63] @@ -2510,7 +2510,7 @@ define <8 x i64> @zext_8i6_to_8i64(i32 %x) nounwind uwtable readnone ssp { ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vmovd %edi, %xmm0 ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vpaddw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1 ; 
AVX1-NEXT: vpmovzxwq {{.*#+}} xmm0 = xmm1[0],zero,zero,zero,xmm1[1],zero,zero,zero @@ -2696,9 +2696,8 @@ define <16 x i16> @splatshuf_zext_v16i16(<16 x i8> %x) { ; SSE2: # %bb.0: ; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] ; SSE2-NEXT: pshufhw {{.*#+}} xmm0 = xmm0[0,1,2,3,6,6,6,6] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[2,2,2,2] ; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; SSE2-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15] ; SSE2-NEXT: movdqa %xmm0, %xmm1 ; SSE2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/widened-broadcast.ll b/llvm/test/CodeGen/X86/widened-broadcast.ll index 8e47ed67bdcff..d25512900440e 100644 --- a/llvm/test/CodeGen/X86/widened-broadcast.ll +++ b/llvm/test/CodeGen/X86/widened-broadcast.ll @@ -261,13 +261,13 @@ define <16 x i8> @load_splat_16i8_16i8_0101010101010101(ptr %ptr) nounwind uwtab ; SSE-LABEL: load_splat_16i8_16i8_0101010101010101: ; SSE: # %bb.0: # %entry ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: retq ; ; AVX1-LABEL: load_splat_16i8_16i8_0101010101010101: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: retq ; ; AVX2-LABEL: load_splat_16i8_16i8_0101010101010101: @@ -341,14 +341,14 @@ define <32 x i8> @load_splat_32i8_16i8_01010101010101010101010101010101(ptr %ptr ; SSE-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101: ; SSE: # %bb.0: # %entry ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_splat_32i8_16i8_01010101010101010101010101010101: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; @@ -405,14 +405,14 @@ define <32 x i8> @load_splat_32i8_32i8_01010101010101010101010101010101(ptr %ptr ; SSE-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; SSE: # %bb.0: # %entry ; SSE-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE-NEXT: movdqa %xmm0, %xmm1 ; SSE-NEXT: retq ; ; AVX1-LABEL: load_splat_32i8_32i8_01010101010101010101010101010101: ; AVX1: # %bb.0: # %entry ; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0 ; AVX1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index b5487ad192337..409d889a9cbc3 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -2359,7 +2359,7 @@ define void 
@vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm3 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm3[0,1,0,1] ; SSE2-NEXT: pand %xmm2, %xmm3 ; SSE2-NEXT: pandn %xmm1, %xmm2 ; SSE2-NEXT: por %xmm3, %xmm2 @@ -2383,7 +2383,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE42-NEXT: pblendvb %xmm0, %xmm3, %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero @@ -2403,7 +2403,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero @@ -2507,7 +2507,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0] @@ -2801,7 +2801,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0] @@ -2822,7 +2822,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; SSE42-NEXT: paddb 48(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; SSE42-NEXT: pblendvb %xmm0, %xmm2, %xmm3 ; SSE42-NEXT: movdqa %xmm1, %xmm0 @@ -2843,7 +2843,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.v ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} 
xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vmovdqa {{.*#+}} xmm3 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm3, %xmm1, %xmm2, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero @@ -3685,7 +3685,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,0,65535,65535,0,65535] ; SSE2-NEXT: pand %xmm2, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm2 ; SSE2-NEXT: por %xmm1, %xmm2 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0] @@ -3706,7 +3706,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pxor %xmm3, %xmm3 @@ -3727,7 +3727,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2],xmm0[3],xmm1[4,5],xmm0[6],xmm1[7] ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index 95fde98536818..ab216cafcc923 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -1832,7 +1832,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn 48(%rdi), %xmm1 ; SSE2-NEXT: por %xmm1, %xmm2 @@ -1853,7 +1853,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,0,255,0,255,0,255,0,255,0,255,0,255,0,255] ; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 ; SSE42-NEXT: pshufb {{.*#+}} xmm1 = xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero,xmm1[0],zero @@ -1871,7 +1871,7 @@ define void @vec384_i8_widen_to_i16_factor2_broadcast_to_v24i16_factor24(ptr %in ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpshuflw {{.*#+}} xmm2 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = 
xmm2[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; AVX-NEXT: vbroadcastss {{.*#+}} xmm3 = [255,0,255,0,255,0,255,0,255,0,255,0,255,0,255,0] ; AVX-NEXT: vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero,xmm0[0],zero @@ -1960,7 +1960,7 @@ define void @vec384_i8_widen_to_i24_factor3_broadcast_to_v16i24_factor16(ptr %in ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] ; SSE2-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 ; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,255,0,0,255,0,0,255,0,0,255,0,0,255,0,0] @@ -2213,7 +2213,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,0,0,255,0,0,0,0,0,255,0,0,0,0,0] @@ -2231,7 +2231,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm1 ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = xmm1[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE42-NEXT: movaps {{.*#+}} xmm0 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; SSE42-NEXT: pblendvb %xmm0, 48(%rdi), %xmm2 ; SSE42-NEXT: movdqa %xmm1, %xmm0 @@ -2249,7 +2249,7 @@ define void @vec384_i8_widen_to_i48_factor6_broadcast_to_v8i48_factor8(ptr %in.e ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,0,1] ; AVX-NEXT: vmovdqa {{.*#+}} xmm2 = [0,255,255,255,255,255,0,255,255,255,255,255,0,255,255,255] ; AVX-NEXT: vpblendvb %xmm2, 48(%rdi), %xmm1, %xmm1 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,zero,xmm0[0],zero @@ -2955,7 +2955,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: pand %xmm0, %xmm1 ; SSE2-NEXT: pshuflw {{.*#+}} xmm2 = mem[0,0,0,0,4,5,6,7] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,0,0,0] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[0,1,0,1] ; SSE2-NEXT: pandn %xmm2, %xmm0 ; SSE2-NEXT: por %xmm1, %xmm0 ; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,0,65535,0,0,65535,0,0] @@ -2972,7 +2972,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. ; SSE42-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; SSE42: # %bb.0: ; SSE42-NEXT: pshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm0[2],xmm2[3,4],xmm0[5],xmm2[6,7] @@ -2989,7 +2989,7 @@ define void @vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8(ptr %in. 
; AVX-LABEL: vec384_i16_widen_to_i48_factor3_broadcast_to_v8i48_factor8: ; AVX: # %bb.0: ; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = mem[0,0,0,0,4,5,6,7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2],xmm0[3],mem[4,5],xmm0[6],mem[7] ; AVX-NEXT: vmovdqa (%rdi), %xmm2 ; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero,xmm2[0,1],zero,zero,zero,zero