From f7485059c6ff1623f2195f509932628cfc3c07ad Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 10 Jan 2025 21:13:09 +0700 Subject: [PATCH 1/5] DAG: Avoid forming shufflevector from a single extract_vector_elt This avoids regressions in a future AMDGPU commit. Previously, a build_vector (extract_vector_elt x), undef with free access to the elements would be bloated into a shuffle of one element + undef, which has much worse combine support than the extract. Alternatively, we could check aggressivelyPreferBuildVectorSources, but I'm not sure it's really different from isExtractVecEltCheap. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 +++++++++++++++---- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 ++++---- llvm/test/CodeGen/X86/avx512-build-vector.ll | 8 +++--- .../CodeGen/X86/insertelement-duplicates.ll | 10 +++----- llvm/test/CodeGen/X86/sse-align-12.ll | 4 +-- 5 files changed, 34 insertions(+), 23 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 58ab99e0dcdee..1d27af9151162 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23807,6 +23807,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { SmallVector VecIn; VecIn.push_back(SDValue()); + // If we have a single extract_element with a constant index, track the index + // value. + unsigned OneConstExtractIndex = ~0u; + for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = N->getOperand(i); @@ -23824,16 +23828,18 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // Not an undef or zero. If the input is something other than an // EXTRACT_VECTOR_ELT with an in-range constant index, bail out. - if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT || - !isa<ConstantSDNode>(Op.getOperand(1))) + if (Op.getOpcode() != ISD::EXTRACT_VECTOR_ELT) return SDValue(); - SDValue ExtractedFromVec = Op.getOperand(0); + SDValue ExtractedFromVec = Op.getOperand(0); if (ExtractedFromVec.getValueType().isScalableVector()) return SDValue(); + auto *ExtractIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1)); + if (!ExtractIdx) + return SDValue(); - const APInt &ExtractIdx = Op.getConstantOperandAPInt(1); - if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements())) + if (ExtractIdx->getAsAPIntVal().uge( + ExtractedFromVec.getValueType().getVectorNumElements())) return SDValue(); // All inputs must have the same element type as the output. @@ -23841,6 +23847,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { ExtractedFromVec.getValueType().getVectorElementType()) return SDValue(); + OneConstExtractIndex = ExtractIdx->getZExtValue(); + // Have we seen this input vector before? // The vectors are expected to be tiny (usually 1 or 2 elements), so using // a map back from SDValues to numbers isn't worth it. @@ -23863,6 +23871,13 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // VecIn accordingly. bool DidSplitVec = false; if (VecIn.size() == 2) { + // If we only found a single constant indexed extract_vector_elt feeding the + // build_vector, do not produce a more complicated shuffle if the extract is + // cheap.
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && + TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) + return SDValue(); + unsigned MaxIndex = 0; unsigned NearestPow2 = 0; SDValue Vec = VecIn.back(); diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 7912d1cf8dc0d..add8c0f75bf33 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll index b21a0c4e36c2b..27cb3eb406e9e 100644 --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -14,11 +14,9 @@ define <16 x i32> @test2(<16 x i32> %x) { define <16 x float> @test3(<4 x float> %a) { ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: -; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 -; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 = [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] -; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 -; CHECK-NEXT: vmovaps %zmm1, %zmm0 +; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero +; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 ; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> , float %b, i32 5 diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll index 435ea61412b73..3da53a4ca5f1b 100644 --- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll +++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll @@ -31,18 +31,16 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli ; AVX-32: # %bb.0: # %L.entry ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0 -; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1] ; AVX-32-NEXT: vmovups %ymm0, 608(%eax) ; AVX-32-NEXT: vzeroupper ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: PR15298: ; AVX-64: # %bb.0: # %L.entry -; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 -; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] +; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1] ; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) ; AVX-64-NEXT: vzeroupper ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll index 7b4bd3ffdf00c..0d5bdb0954ce3 100644 --- 
a/llvm/test/CodeGen/X86/sse-align-12.ll +++ b/llvm/test/CodeGen/X86/sse-align-12.ll @@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind { define <2 x double> @c(ptr %y) nounwind { ; CHECK-LABEL: c: ; CHECK: # %bb.0: -; CHECK-NEXT: movups (%rdi), %xmm0 -; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1] +; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] ; CHECK-NEXT: retq %x = load <2 x double>, ptr %y, align 8 %a = extractelement <2 x double> %x, i32 0 From a4823b0ea585d7a96e25870b4589b4086da4f6d8 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Wed, 15 Jan 2025 15:43:54 +0700 Subject: [PATCH 2/5] Check hasOneUse on the source vector. This gives up some of the x86 improvements --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++++- llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 +++++----- llvm/test/CodeGen/X86/avx512-build-vector.ll | 8 +++++--- llvm/test/CodeGen/X86/sse-align-12.ll | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 1d27af9151162..70d2947821c06 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23874,8 +23874,12 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // If we only found a single constant indexed extract_vector_elt feeding the // build_vector, do not produce a more complicated shuffle if the extract is // cheap. + + // TODO: This should be more aggressive about skipping the shuffle formation + // (e.g., always do this for VecIn[1]->hasOneUse()) if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && - TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) + (VecIn[1].hasOneUse() && + TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))) return SDValue(); unsigned MaxIndex = 0; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index add8c0f75bf33..7912d1cf8dc0d 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_mov_b32_e32 v3, s3 -; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GCN-NEXT: v_mov_b32_e32 v3, s1 +; GCN-NEXT: v_mov_b32_e32 v0, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s3 +; GCN-NEXT: v_mov_b32_e32 v2, s0 +; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GCN-NEXT: s_endpgm entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel diff --git a/llvm/test/CodeGen/X86/avx512-build-vector.ll b/llvm/test/CodeGen/X86/avx512-build-vector.ll index 27cb3eb406e9e..b21a0c4e36c2b 100644 --- a/llvm/test/CodeGen/X86/avx512-build-vector.ll +++ b/llvm/test/CodeGen/X86/avx512-build-vector.ll @@ -14,9 +14,11 @@ define <16 x i32> @test2(<16 x i32> %x) { define <16 x float> @test3(<4 x float> %a) { ; CHECK-LABEL: test3: ; CHECK: ## %bb.0: -; CHECK-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[8,9,10,11,0,1,2,3],zero,zero,zero,zero -; CHECK-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vinserti32x4 $1, %xmm0, %zmm1, %zmm0 +; CHECK-NEXT: ## kill: def $xmm0 killed $xmm0 def $zmm0 +; CHECK-NEXT: vpmovsxbd {{.*#+}} zmm2 
= [0,1,2,3,4,18,16,7,8,9,10,11,12,13,14,15] +; CHECK-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1 +; CHECK-NEXT: vmovaps %zmm1, %zmm0 ; CHECK-NEXT: retq %b = extractelement <4 x float> %a, i32 2 %c = insertelement <16 x float> , float %b, i32 5 diff --git a/llvm/test/CodeGen/X86/sse-align-12.ll b/llvm/test/CodeGen/X86/sse-align-12.ll index 0d5bdb0954ce3..7b4bd3ffdf00c 100644 --- a/llvm/test/CodeGen/X86/sse-align-12.ll +++ b/llvm/test/CodeGen/X86/sse-align-12.ll @@ -40,8 +40,8 @@ define <4 x float> @b(ptr %y, <4 x float> %z) nounwind { define <2 x double> @c(ptr %y) nounwind { ; CHECK-LABEL: c: ; CHECK: # %bb.0: -; CHECK-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; CHECK-NEXT: movhps {{.*#+}} xmm0 = xmm0[0,1],mem[0,1] +; CHECK-NEXT: movups (%rdi), %xmm0 +; CHECK-NEXT: shufps {{.*#+}} xmm0 = xmm0[2,3,0,1] ; CHECK-NEXT: retq %x = load <2 x double>, ptr %y, align 8 %a = extractelement <2 x double> %x, i32 0 From e845f03322548ff35a61f4e9033e0de7de22860f Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 16 Jan 2025 13:47:27 +0700 Subject: [PATCH 3/5] Count the number of extract uses --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 14 +- .../CodeGen/AMDGPU/insert_vector_dynelt.ll | 10 +- .../any_extend_vector_inreg_of_broadcast.ll | 240 +++++----- ...d_vector_inreg_of_broadcast_from_memory.ll | 118 +++-- llvm/test/CodeGen/X86/buildvec-extract.ll | 39 +- .../CodeGen/X86/insertelement-duplicates.ll | 10 +- llvm/test/CodeGen/X86/movmsk-bittest.ll | 27 +- .../CodeGen/X86/split-extend-vector-inreg.ll | 46 +- llvm/test/CodeGen/X86/sse41.ll | 92 ++-- llvm/test/CodeGen/X86/vec_extract-avx.ll | 4 +- .../vector-interleaved-store-i32-stride-5.ll | 26 +- llvm/test/CodeGen/X86/vector-narrow-binop.ll | 9 +- .../zero_extend_vector_inreg_of_broadcast.ll | 420 +++++++++--------- ...d_vector_inreg_of_broadcast_from_memory.ll | 325 ++++++-------- 14 files changed, 681 insertions(+), 699 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 70d2947821c06..d5d8336cc2c47 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23811,6 +23811,8 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // value. unsigned OneConstExtractIndex = ~0u; + unsigned NumExtracts = 0; + for (unsigned i = 0; i != NumElems; ++i) { SDValue Op = N->getOperand(i); @@ -23847,7 +23849,10 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { ExtractedFromVec.getValueType().getVectorElementType()) return SDValue(); - OneConstExtractIndex = ExtractIdx->getZExtValue(); + if (OneConstExtractIndex == ~0u) + OneConstExtractIndex = ExtractIdx->getZExtValue(); + + ++NumExtracts; // Have we seen this input vector before? 
// The vectors are expected to be tiny (usually 1 or 2 elements), so using @@ -23878,8 +23883,11 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // TODO: This should be more aggressive about skipping the shuffle formation // (e.g., always do this for VecIn[1]->hasOneUse()) if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && - (VecIn[1].hasOneUse() && - TLI.isExtractVecEltCheap(VT, OneConstExtractIndex))) + TLI.isTypeLegal(VT.getVectorElementType()) && + // VecIn[1].hasOneUse() && + NumExtracts == 1 + //&& TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) + ) return SDValue(); unsigned MaxIndex = 0; diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll index 7912d1cf8dc0d..add8c0f75bf33 100644 --- a/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_dynelt.ll @@ -452,11 +452,11 @@ define amdgpu_kernel void @byte8_inselt(ptr addrspace(1) %out, <8 x i8> %vec, i3 ; GCN-NEXT: s_and_b32 s6, s4, 0x1010101 ; GCN-NEXT: s_andn2_b64 s[2:3], s[2:3], s[4:5] ; GCN-NEXT: s_or_b64 s[2:3], s[6:7], s[2:3] -; GCN-NEXT: v_mov_b32_e32 v3, s1 -; GCN-NEXT: v_mov_b32_e32 v0, s2 -; GCN-NEXT: v_mov_b32_e32 v1, s3 -; GCN-NEXT: v_mov_b32_e32 v2, s0 -; GCN-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: v_mov_b32_e32 v2, s2 +; GCN-NEXT: v_mov_b32_e32 v1, s1 +; GCN-NEXT: v_mov_b32_e32 v3, s3 +; GCN-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GCN-NEXT: s_endpgm entry: %v = insertelement <8 x i8> %vec, i8 1, i32 %sel diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index cad1d09f11d9c..940c1ca2d4d35 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -3817,21 +3817,21 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb 32(%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 16(%rdx), %xmm3 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -3840,16 +3840,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -3858,15 +3858,15 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3875,7 +3875,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 @@ -3985,20 +3985,16 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4079,29 +4075,29 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4339,33 +4335,33 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm3 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm3, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -4375,7 +4371,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -4492,20 +4488,16 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4599,28 +4591,28 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 ; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4750,22 +4742,18 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; 
AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 -; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] -; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 -; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 +; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] +; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) -; AVX-NEXT: vmovdqa %xmm2, (%rcx) -; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4860,14 +4848,14 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4876,12 +4864,12 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 3d72319f59ca9..9572ff3a37fad 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -3129,8 +3129,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) @@ -3141,13 +3141,14 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3233,17 +3234,13 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa (%rdi), %xmm3 -; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm2, (%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -3519,16 +3516,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -3537,8 +3534,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vbroadcastss (%rdi), %xmm2 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) @@ -3549,10 +3546,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] +; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) @@ -3634,19 +3631,15 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -3708,26 +3701,25 @@ define void 
@vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movaps (%rdi), %xmm0 +; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movaps 48(%rdi), %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE2-NEXT: paddb 16(%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3820,19 +3812,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa (%rdi), %xmm2 -; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 -; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm0, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll index 545c57fed4b2c..a10e38512b26c 100644 --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
extract1_i32_zext_insert0_i64_zero: @@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: xorps %xmm1, %xmm1 -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero: @@ -375,7 +375,8 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero: @@ -416,14 +417,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE-LABEL: extract1_i16_zext_insert0_i64_zero: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pextrw $1, %xmm0, %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: extract1_i16_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpextrw $1, %xmm0, %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 1 %z = zext i16 %e to i64 @@ -452,14 +453,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE-LABEL: extract2_i16_zext_insert0_i64_zero: ; SSE: # %bb.0: -; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pextrw $2, %xmm0, %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: extract2_i16_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpextrw $2, %xmm0, %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 2 %z = zext i16 %e to i64 @@ -486,14 +487,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE-LABEL: extract3_i16_zext_insert0_i64_zero: ; SSE: # %bb.0: -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE-NEXT: pextrw $3, %xmm0, %eax +; SSE-NEXT: movd %eax, %xmm0 ; SSE-NEXT: retq ; ; AVX-LABEL: extract3_i16_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpsrldq {{.*#+}} xmm0 
= xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX-NEXT: vpextrw $3, %xmm0, %eax +; AVX-NEXT: vmovd %eax, %xmm0 ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 3 %z = zext i16 %e to i64 diff --git a/llvm/test/CodeGen/X86/insertelement-duplicates.ll b/llvm/test/CodeGen/X86/insertelement-duplicates.ll index 3da53a4ca5f1b..435ea61412b73 100644 --- a/llvm/test/CodeGen/X86/insertelement-duplicates.ll +++ b/llvm/test/CodeGen/X86/insertelement-duplicates.ll @@ -31,16 +31,18 @@ define void @PR15298(ptr nocapture %source, ptr nocapture %dest) nounwind noinli ; AVX-32: # %bb.0: # %L.entry ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax ; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx -; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-32-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1] +; AVX-32-NEXT: vbroadcastss 304(%ecx), %xmm0 +; AVX-32-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-32-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] ; AVX-32-NEXT: vmovups %ymm0, 608(%eax) ; AVX-32-NEXT: vzeroupper ; AVX-32-NEXT: retl ; ; AVX-64-LABEL: PR15298: ; AVX-64: # %bb.0: # %L.entry -; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX-64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[1,0,0,1] +; AVX-64-NEXT: vbroadcastss 304(%rdi), %xmm0 +; AVX-64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; AVX-64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2],ymm1[3,4,5,6,7] ; AVX-64-NEXT: vmovups %ymm0, 608(%rsi) ; AVX-64-NEXT: vzeroupper ; AVX-64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/movmsk-bittest.ll b/llvm/test/CodeGen/X86/movmsk-bittest.ll index b67e70e71c3d5..0bde62f106ae6 100644 --- a/llvm/test/CodeGen/X86/movmsk-bittest.ll +++ b/llvm/test/CodeGen/X86/movmsk-bittest.ll @@ -219,14 +219,23 @@ define i32 @movmsk_sgt_v16i8_15(<16 x i8> %v, i32 %a, i32 %b) { } define i32 @movmsk_eq_v4i64_0(<4 x i64> %v, i32 %a, i32 %b) { -; SSE-LABEL: movmsk_eq_v4i64_0: -; SSE: # %bb.0: -; SSE-NEXT: movl %edi, %eax -; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE-NEXT: movmskps %xmm0, %ecx -; SSE-NEXT: testb $1, %cl -; SSE-NEXT: cmovel %esi, %eax -; SSE-NEXT: retq +; SSE2-LABEL: movmsk_eq_v4i64_0: +; SSE2: # %bb.0: +; SSE2-NEXT: movl %edi, %eax +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE2-NEXT: movmskps %xmm0, %ecx +; SSE2-NEXT: testb $1, %cl +; SSE2-NEXT: cmovel %esi, %eax +; SSE2-NEXT: retq +; +; SSE41-LABEL: movmsk_eq_v4i64_0: +; SSE41: # %bb.0: +; SSE41-NEXT: movl %edi, %eax +; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] +; SSE41-NEXT: movmskps %xmm0, %ecx +; SSE41-NEXT: testb $1, %cl +; SSE41-NEXT: cmovel %esi, %eax +; SSE41-NEXT: retq ; ; AVX-LABEL: movmsk_eq_v4i64_0: ; AVX: # %bb.0: @@ -557,5 +566,3 @@ define i32 @movmsk_sgt_v32i8_31(<32 x i8> %v, i32 %a, i32 %b) { ;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: ; AVX1OR2: {{.*}} -; SSE2: {{.*}} -; SSE41: {{.*}} diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll index 8a6c2f851a6d6..f76e7f64fbf09 100644 --- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll +++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll @@ -1,21 +1,35 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X32 %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X64 %s define <4 x i64> @autogen_SD88863() { -; CHECK-LABEL: autogen_SD88863: -; CHECK: # %bb.0: # %BB -; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] -; CHECK-NEXT: movb $1, %al -; CHECK-NEXT: .p2align 4 -; CHECK-NEXT: .LBB0_1: # %CF -; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: testb %al, %al -; CHECK-NEXT: jne .LBB0_1 -; CHECK-NEXT: # %bb.2: # %CF240 -; CHECK-NEXT: ret{{[l|q]}} +; X32-LABEL: autogen_SD88863: +; X32: # %bb.0: # %BB +; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] +; X32-NEXT: movb $1, %al +; X32-NEXT: .p2align 4 +; X32-NEXT: .LBB0_1: # %CF +; X32-NEXT: # =>This Inner Loop Header: Depth=1 +; X32-NEXT: testb %al, %al +; X32-NEXT: jne .LBB0_1 +; X32-NEXT: # %bb.2: # %CF240 +; X32-NEXT: retl +; +; X64-LABEL: autogen_SD88863: +; X64: # %bb.0: # %BB +; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; X64-NEXT: movb $1, %al +; X64-NEXT: .p2align 4 +; X64-NEXT: .LBB0_1: # %CF +; X64-NEXT: # =>This Inner Loop Header: Depth=1 +; X64-NEXT: testb %al, %al +; X64-NEXT: jne .LBB0_1 +; X64-NEXT: # %bb.2: # %CF240 +; X64-NEXT: retq BB: %I26 = insertelement <4 x i64> undef, i64 undef, i32 2 br label %CF @@ -29,3 +43,5 @@ CF: CF240: ret <4 x i64> %I68 } +;; NOTE: These prefixes are unused and the list is autogenerated. 
Do not add tests below this line: +; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 2d7258a49f5d0..6443bd5cda55f 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1233,31 +1233,47 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { -; SSE-LABEL: i32_shuf_X00X: -; SSE: ## %bb.0: -; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9] -; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00] -; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0] -; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c] -; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] -; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-SSE-LABEL: i32_shuf_X00X: +; X86-SSE: ## %bb.0: +; X86-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A] +; X86-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-SSE-NEXT: retl ## encoding: [0xc3] ; -; AVX1-LABEL: i32_shuf_X00X: -; AVX1: ## %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00] -; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] -; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-AVX1-LABEL: i32_shuf_X00X: +; X86-AVX1: ## %bb.0: +; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; X86-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X86-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX1-NEXT: retl ## encoding: [0xc3] ; -; AVX512-LABEL: i32_shuf_X00X: -; AVX512: ## %bb.0: -; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] -; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] -; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] -; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] +; X86-AVX512-LABEL: i32_shuf_X00X: +; X86-AVX512: ## %bb.0: +; X86-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X86-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 +; X86-AVX512-NEXT: retl ## encoding: [0xc3] +; +; X64-SSE-LABEL: i32_shuf_X00X: +; X64-SSE: ## %bb.0: +; X64-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A] +; X64-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-SSE-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX1-LABEL: i32_shuf_X00X: +; X64-AVX1: ## %bb.0: +; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; X64-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X64-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; 
X64-AVX1-NEXT: retq ## encoding: [0xc3] +; +; X64-AVX512-LABEL: i32_shuf_X00X: +; X64-AVX512: ## %bb.0: +; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] +; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] +; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte +; X64-AVX512-NEXT: retq ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -1269,32 +1285,26 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_X0YC: ; SSE: ## %bb.0: -; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0] -; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa] -; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2] -; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f] -; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] +; SSE-NEXT: pmovzxdq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x35,0xc0] +; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0] +; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_X0YC: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0] ; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa] -; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2] -; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0] -; AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] +; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] +; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_X0YC: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] ; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa] -; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2] -; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] -; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] +; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] +; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -2124,14 +2134,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) { ; AVX1-LABEL: build_vector_to_shuffle_1: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO 
VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] +; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 @@ -2152,14 +2162,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) { ; AVX1-LABEL: build_vector_to_shuffle_2: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_2: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] +; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll index 341a703a21bd5..ff0a68eb5692c 100644 --- a/llvm/test/CodeGen/X86/vec_extract-avx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll @@ -126,9 +126,7 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) { ; ; X64-LABEL: legal_vzmovl_2i32_8i32: ; X64: # %bb.0: -; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] +; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero ; X64-NEXT: vmovaps %ymm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 07d8a370a5f93..1dd015cc516f0 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -18,24 +18,24 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: movaps %xmm2, %xmm5 -; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] -; SSE-NEXT: movaps %xmm0, %xmm6 -; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] -; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] +; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] +; SSE-NEXT: movdqa %xmm2, %xmm6 +; SSE-NEXT: punpckldq {{.*#+}} xmm6 = xmm6[0],xmm3[0],xmm6[1],xmm3[1] +; SSE-NEXT: movdqa %xmm0, %xmm3 +; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] +; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = 
xmm3[0],xmm6[0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] -; SSE-NEXT: movq %xmm2, 32(%r9) -; SSE-NEXT: movaps %xmm6, (%r9) +; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] +; SSE-NEXT: movq %xmm5, 32(%r9) +; SSE-NEXT: movdqa %xmm3, (%r9) ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll index ad345213c1472..6f1948d3bc2a5 100644 --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -107,9 +107,11 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 -; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] +; SSE-NEXT: psrlq $16, %xmm2 +; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 +; SSE-NEXT: packuswb %xmm2, %xmm2 ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -117,9 +119,8 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) { ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 -; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] +; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %sub = sub <2 x i32> , %x %bc = bitcast <2 x i32> %sub to <8 x i8> diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index ce092f9d343fc..2fc1e662d7a7f 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -4115,24 +4115,24 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: pandn %xmm3, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: paddb 32(%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pxor %xmm3, %xmm3 +; SSE2-NEXT: pinsrw $2, %eax, %xmm3 +; SSE2-NEXT: pinsrw $4, %eax, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: pand %xmm4, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE2-NEXT: pandn %xmm0, %xmm4 +; SSE2-NEXT: por %xmm1, %xmm4 +; SSE2-NEXT: paddb (%rdx), %xmm4 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm4, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4141,19 +4141,19 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: paddb 32(%rdx), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pinsrw $2, %eax, %xmm3 +; SSE42-NEXT: pinsrw $4, %eax, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4165,16 +4165,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovd %xmm0, %eax +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm0 +; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm0, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4183,12 +4183,13 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 +; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -4323,36 +4324,37 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 -; SSE2-NEXT: paddb %xmm0, %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movd %eax, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: movdqa 16(%rdx), %xmm0 +; SSE2-NEXT: paddb %xmm2, %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor 
%xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa 16(%rdx), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: paddb %xmm2, %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4362,9 +4364,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -4461,36 +4463,36 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm2, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm2 -; SSE2-NEXT: por %xmm1, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: movaps 32(%rdx), %xmm1 -; SSE2-NEXT: paddb (%rdx), %xmm2 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movaps %xmm1, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pinsrw $4, %eax, %xmm2 +; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm3, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm3 +; SSE2-NEXT: por %xmm1, %xmm3 +; SSE2-NEXT: movaps 32(%rdx), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm3 +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movaps %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm3, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: movd %xmm0, %eax ; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm0 -; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: pinsrw $4, %eax, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: movaps 32(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 ; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: movaps 
%xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movaps %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; @@ -4800,14 +4802,14 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: paddb 32(%rdx), %xmm3 +; SSE2-NEXT: movdqa %xmm3, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq @@ -4818,18 +4820,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] -; SSE42-NEXT: paddb (%rdx), %xmm2 -; SSE42-NEXT: paddb 16(%rdx), %xmm1 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm3 +; SSE42-NEXT: movdqa %xmm3, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4841,10 +4843,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 @@ -4857,20 +4859,19 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm2 +; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6],ymm3[7] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4878,18 +4879,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4897,18 +4898,18 @@ define void 
@vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 -; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] +; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 +; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4997,11 +4998,11 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa 16(%rdx), %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 @@ -5015,18 +5016,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa 16(%rdx), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: paddb %xmm2, %xmm1 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -5036,9 +5037,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -5051,18 +5052,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5150,36 +5151,36 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] -; SSE2-NEXT: movaps 32(%rdx), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: movaps 32(%rdx), %xmm0 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: paddb 16(%rdx), %xmm0 -; SSE2-NEXT: movaps %xmm2, 32(%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: movaps %xmm0, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] -; SSE42-NEXT: movaps 32(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movaps %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: movaps 
32(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movaps %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -5283,17 +5284,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: movdqa 16(%rdx), %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] +; SSE2-NEXT: paddb (%rdx), %xmm0 +; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: paddb %xmm2, %xmm1 +; SSE2-NEXT: paddb 32(%rdx), %xmm2 +; SSE2-NEXT: movdqa %xmm2, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, (%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5302,15 +5303,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: movdqa 16(%rdx), %xmm2 -; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: movdqa 16(%rdx), %xmm1 +; SSE42-NEXT: paddb %xmm2, %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5320,8 +5321,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) @@ -5335,15 +5336,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3] -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 -; 
AVX2-NEXT: vmovdqa %ymm1, (%rcx) -; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) +; AVX2-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3] +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5429,8 +5430,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rdx), %xmm2 @@ -5447,14 +5448,15 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm2 -; SSE42-NEXT: paddb (%rdx), %xmm1 -; SSE42-NEXT: paddb 16(%rdx), %xmm0 -; SSE42-NEXT: movaps %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, 16(%rcx) -; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, %xmm2 +; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: movaps 32(%rdx), %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: paddb 16(%rdx), %xmm2 +; SSE42-NEXT: movaps %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: movdqa %xmm0, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index acedcf4263906..d874ceb3f4736 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -3316,41 +3316,39 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa 48(%rdi), %xmm2 -; SSE2-NEXT: pand %xmm1, %xmm2 -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE2-NEXT: pandn %xmm3, %xmm1 -; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: movdqa %xmm0, %xmm2 -; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11] -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: paddb 16(%rsi), %xmm0 +; SSE2-NEXT: movl (%rdi), %eax +; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: movdqa 48(%rdi), %xmm1 +; SSE2-NEXT: pand %xmm0, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] +; SSE2-NEXT: pandn %xmm2, %xmm0 +; SSE2-NEXT: por %xmm1, %xmm0 +; SSE2-NEXT: pxor %xmm1, %xmm1 +; SSE2-NEXT: pxor %xmm2, %xmm2 +; SSE2-NEXT: pinsrw $2, %eax, %xmm2 +; SSE2-NEXT: pinsrw $4, %eax, %xmm1 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm0, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],mem[1,2,3,4,5],xmm3[6],mem[7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] +; SSE42-NEXT: movl (%rdi), %eax +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pinsrw $2, %eax, %xmm1 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2,3,4,5],xmm2[6],mem[7] +; SSE42-NEXT: pinsrw $4, %eax, %xmm0 ; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm3 +; SSE42-NEXT: paddb (%rsi), %xmm2 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm3, (%rdx) +; SSE42-NEXT: movdqa %xmm2, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq ; @@ -3359,66 +3357,35 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpinsrw $2, (%rdi), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: retq ; -; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-SLOW-NEXT: vzeroupper -; AVX2-SLOW-NEXT: retq -; -; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FAST-PERLANE-NEXT: vzeroupper -; AVX2-FAST-PERLANE-NEXT: retq -; -; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] -; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) -; AVX2-FAST-NEXT: vzeroupper -; AVX2-FAST-NEXT: retq +; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2: # %bb.0: +; AVX2-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-NEXT: vmovd %xmm0, %eax +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: 
vpinsrw $2, %eax, %xmm1, %xmm1 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX2-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: @@ -3481,14 +3448,15 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: pextrw $0, %xmm0, %eax +; SSE2-NEXT: movd %eax, %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -3527,7 +3495,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 @@ -3596,9 +3564,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] -; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: movd %xmm0, %eax +; SSE2-NEXT: pxor %xmm0, %xmm0 +; SSE2-NEXT: pinsrw $4, %eax, %xmm0 ; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 @@ -3612,15 +3580,15 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] -; SSE42-NEXT: movaps 32(%rsi), %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm2 +; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: pxor %xmm0, %xmm0 +; SSE42-NEXT: pinsrw $4, %eax, %xmm0 +; SSE42-NEXT: movaps 32(%rsi), %xmm2 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movaps %xmm0, 32(%rdx) +; SSE42-NEXT: movaps %xmm2, 32(%rdx) ; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: 
vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3862,9 +3830,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 @@ -3879,9 +3847,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 @@ -3895,12 +3863,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vbroadcastss (%rdi), %ymm2 +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] -; AVX-NEXT: vbroadcastss (%rdi), %ymm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rdx) @@ -3911,27 +3879,26 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero -; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 -; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,5,6,0] +; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm3, %ymm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6],ymm1[7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -3945,10 +3912,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 -; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -4029,47 +3995,47 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm0 -; SSE42-NEXT: paddb %xmm2, %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; SSE42-NEXT: movdqa 16(%rsi), %xmm2 +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] -; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero -; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] +; AVX2-NEXT: vmovdqa (%rdi), %ymm0 +; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = 
ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4142,17 +4108,16 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] ; SSE42-NEXT: movaps 32(%rsi), %xmm2 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm1 +; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: movaps %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: movdqa %xmm1, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4233,61 +4198,57 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; SSE2: # %bb.0: -; SSE2-NEXT: movapd (%rdi), %xmm0 -; SSE2-NEXT: movapd 48(%rdi), %xmm1 -; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] -; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 -; SSE2-NEXT: paddb %xmm0, %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm1 -; SSE2-NEXT: paddb 32(%rsi), %xmm0 -; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: paddb %xmm1, %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 32(%rsi), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rdx) +; SSE2-NEXT: movdqa %xmm0, (%rdx) ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE42-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 -; SSE42-NEXT: paddb %xmm0, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: paddb 32(%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, 32(%rdx) -; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: paddb %xmm1, %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 32(%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rdx) +; SSE42-NEXT: movdqa %xmm0, 
(%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] -; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] -; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero +; AVX-NEXT: vmovdqa (%rdi), %xmm0 +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7] ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vzeroupper +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; From 7cd9367b4f9a23c7603fed68d3c5f4f085561eaa Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 16 Jan 2025 14:29:15 +0700 Subject: [PATCH 4/5] Avoid arm regressions --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 +- .../any_extend_vector_inreg_of_broadcast.ll | 240 +++++----- ...d_vector_inreg_of_broadcast_from_memory.ll | 118 ++--- llvm/test/CodeGen/X86/buildvec-extract.ll | 39 +- llvm/test/CodeGen/X86/movmsk-bittest.ll | 27 +- .../CodeGen/X86/split-extend-vector-inreg.ll | 46 +- llvm/test/CodeGen/X86/sse41.ll | 92 ++-- llvm/test/CodeGen/X86/vec_extract-avx.ll | 4 +- .../vector-interleaved-store-i32-stride-5.ll | 26 +- llvm/test/CodeGen/X86/vector-narrow-binop.ll | 9 +- .../zero_extend_vector_inreg_of_broadcast.ll | 420 +++++++++--------- ...d_vector_inreg_of_broadcast_from_memory.ll | 325 ++++++++------ 12 files changed, 688 insertions(+), 662 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index d5d8336cc2c47..8cfd994a21d56 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23885,9 +23885,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && TLI.isTypeLegal(VT.getVectorElementType()) && // VecIn[1].hasOneUse() && - NumExtracts == 1 - //&& TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) - ) + NumExtracts == 1 && TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) return SDValue(); unsigned MaxIndex = 0; diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll index 940c1ca2d4d35..cad1d09f11d9c 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast.ll @@ -3817,21 +3817,21 @@ define void 
@vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 16(%rdx), %xmm3 -; SSE2-NEXT: paddb 32(%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, 16(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -3840,16 +3840,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 16(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -3858,15 +3858,15 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm3[0],xmm1[1,2,3,4,5],xmm3[6],xmm1[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -3875,7 +3875,7 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vmovdqa (%rdi), %xmm0 ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %ymm0 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] ; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 @@ -3985,16 +3985,20 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4075,29 +4079,29 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; 
SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -4335,33 +4339,33 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,1,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm3 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm3, 16(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) +; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 +; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) ; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: retq ; @@ -4371,7 +4375,7 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastd %xmm0, %ymm2 +; AVX2-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] ; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] ; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 @@ -4488,16 +4492,20 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 +; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm3 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -4591,28 +4599,28 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 ; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4742,18 +4750,22 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vmovdqa 48(%rdi), %xmm2 +; 
AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm2 -; AVX-NEXT: vblendps {{.*#+}} ymm1 = ymm2[0,1],ymm1[2,3],ymm2[4,5],ymm1[6,7] -; AVX-NEXT: vextractf128 $1, %ymm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm3 +; AVX-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1],ymm2[2,3],ymm3[4,5],ymm2[6,7] +; AVX-NEXT: vextractf128 $1, %ymm2, %xmm3 +; AVX-NEXT: vpaddb 16(%rdx), %xmm3, %xmm3 +; AVX-NEXT: vpaddb (%rdx), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) -; AVX-NEXT: vmovdqa %xmm1, (%rcx) -; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rcx) +; AVX-NEXT: vmovdqa %xmm2, (%rcx) +; AVX-NEXT: vmovdqa %xmm3, 16(%rcx) ; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; @@ -4848,14 +4860,14 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm0, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: @@ -4864,12 +4876,12 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: diff --git a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll index 9572ff3a37fad..3d72319f59ca9 100644 --- a/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/any_extend_vector_inreg_of_broadcast_from_memory.ll @@ -3129,8 +3129,8 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) @@ -3141,14 +3141,13 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] ; AVX2-NEXT: vpbroadcastw (%rdi), %xmm1 -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0 -; AVX2-NEXT: vpbroadcastw (%rdi), %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -3234,13 +3233,17 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: ; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],mem[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm1 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],mem[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rsi), %xmm2, %xmm2 +; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm3 +; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm2, (%rdx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -3516,16 +3519,16 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; SSE42-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: paddb 16(%rsi), %xmm2 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm2, 16(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1],mem[2,3,4,5],xmm1[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rdx) +; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -3534,8 +3537,8 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vbroadcastss (%rdi), %xmm2 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 16(%rdx) @@ -3546,10 +3549,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = mem[0,0,1,1] ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] -; AVX2-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-NEXT: vpbroadcastd (%rdi), %ymm1 +; AVX2-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) @@ -3631,15 +3634,19 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1],xmm0[2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -3701,25 +3708,26 @@ define void 
@vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movaps (%rdi), %xmm0 ; SSE2-NEXT: movaps 48(%rdi), %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: paddb 16(%rsi), %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa %xmm1, (%rdx) -; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -3812,15 +3820,19 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa 48(%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = mem[0,1,2,3],xmm0[4,5,6,7] -; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa (%rdi), %xmm1 -; AVX-NEXT: vpaddb 32(%rsi), %xmm1, %xmm2 -; AVX-NEXT: vpaddb 16(%rsi), %xmm1, %xmm1 -; AVX-NEXT: vmovdqa %xmm1, 16(%rdx) +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vmovdqa (%rdi), %xmm2 +; AVX-NEXT: vmovdqa 16(%rdi), %xmm3 +; AVX-NEXT: vpaddb 48(%rsi), %xmm3, %xmm3 +; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) -; AVX-NEXT: vmovdqa %xmm0, (%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm3, 48(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: diff --git a/llvm/test/CodeGen/X86/buildvec-extract.ll b/llvm/test/CodeGen/X86/buildvec-extract.ll index a10e38512b26c..545c57fed4b2c 100644 --- a/llvm/test/CodeGen/X86/buildvec-extract.ll +++ b/llvm/test/CodeGen/X86/buildvec-extract.ll @@ -69,9 +69,9 @@ define <2 x i64> @extract1_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract1_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract1_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,0],xmm1[1,0] +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: 
extract1_i32_zext_insert0_i64_zero: @@ -114,9 +114,9 @@ define <2 x i64> @extract2_i32_zext_insert0_i64_undef(<4 x i32> %x) { define <2 x i64> @extract2_i32_zext_insert0_i64_zero(<4 x i32> %x) { ; SSE2-LABEL: extract2_i32_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3] -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: xorps %xmm1, %xmm1 +; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[2,3] ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract2_i32_zext_insert0_i64_zero: @@ -375,8 +375,7 @@ define <2 x i64> @extract0_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract0_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE2-LABEL: extract0_i16_zext_insert0_i64_zero: ; SSE2: # %bb.0: -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: andps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: retq ; ; SSE41-LABEL: extract0_i16_zext_insert0_i64_zero: @@ -417,14 +416,14 @@ define <2 x i64> @extract1_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract1_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE-LABEL: extract1_i16_zext_insert0_i64_zero: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $1, %xmm0, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract1_i16_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $1, %xmm0, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 1 %z = zext i16 %e to i64 @@ -453,14 +452,14 @@ define <2 x i64> @extract2_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract2_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE-LABEL: extract2_i16_zext_insert0_i64_zero: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $2, %xmm0, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract2_i16_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $2, %xmm0, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: vpslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 2 %z = zext i16 %e to i64 @@ -487,14 +486,14 @@ define <2 x i64> @extract3_i16_zext_insert0_i64_undef(<8 x i16> %x) { define <2 x i64> @extract3_i16_zext_insert0_i64_zero(<8 x i16> %x) { ; SSE-LABEL: extract3_i16_zext_insert0_i64_zero: ; SSE: # %bb.0: -; SSE-NEXT: pextrw $3, %xmm0, %eax -; SSE-NEXT: movd %eax, %xmm0 +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; SSE-NEXT: retq ; ; AVX-LABEL: extract3_i16_zext_insert0_i64_zero: ; AVX: # %bb.0: -; AVX-NEXT: vpextrw $3, %xmm0, %eax -; AVX-NEXT: vmovd %eax, %xmm0 +; AVX-NEXT: 
vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero ; AVX-NEXT: retq %e = extractelement <8 x i16> %x, i32 3 %z = zext i16 %e to i64 diff --git a/llvm/test/CodeGen/X86/movmsk-bittest.ll b/llvm/test/CodeGen/X86/movmsk-bittest.ll index 0bde62f106ae6..b67e70e71c3d5 100644 --- a/llvm/test/CodeGen/X86/movmsk-bittest.ll +++ b/llvm/test/CodeGen/X86/movmsk-bittest.ll @@ -219,23 +219,14 @@ define i32 @movmsk_sgt_v16i8_15(<16 x i8> %v, i32 %a, i32 %b) { } define i32 @movmsk_eq_v4i64_0(<4 x i64> %v, i32 %a, i32 %b) { -; SSE2-LABEL: movmsk_eq_v4i64_0: -; SSE2: # %bb.0: -; SSE2-NEXT: movl %edi, %eax -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] -; SSE2-NEXT: movmskps %xmm0, %ecx -; SSE2-NEXT: testb $1, %cl -; SSE2-NEXT: cmovel %esi, %eax -; SSE2-NEXT: retq -; -; SSE41-LABEL: movmsk_eq_v4i64_0: -; SSE41: # %bb.0: -; SSE41-NEXT: movl %edi, %eax -; SSE41-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,1,1] -; SSE41-NEXT: movmskps %xmm0, %ecx -; SSE41-NEXT: testb $1, %cl -; SSE41-NEXT: cmovel %esi, %eax -; SSE41-NEXT: retq +; SSE-LABEL: movmsk_eq_v4i64_0: +; SSE: # %bb.0: +; SSE-NEXT: movl %edi, %eax +; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm0[1,1,3,3] +; SSE-NEXT: movmskps %xmm0, %ecx +; SSE-NEXT: testb $1, %cl +; SSE-NEXT: cmovel %esi, %eax +; SSE-NEXT: retq ; ; AVX-LABEL: movmsk_eq_v4i64_0: ; AVX: # %bb.0: @@ -566,3 +557,5 @@ define i32 @movmsk_sgt_v32i8_31(<32 x i8> %v, i32 %a, i32 %b) { ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; AVX1OR2: {{.*}} +; SSE2: {{.*}} +; SSE41: {{.*}} diff --git a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll index f76e7f64fbf09..8a6c2f851a6d6 100644 --- a/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll +++ b/llvm/test/CodeGen/X86/split-extend-vector-inreg.ll @@ -1,35 +1,21 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X32 %s -; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck -check-prefixes=CHECK,X64 %s +; RUN: llc < %s -mtriple=i686-unknown -mattr=+avx | FileCheck %s +; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s define <4 x i64> @autogen_SD88863() { -; X32-LABEL: autogen_SD88863: -; X32: # %bb.0: # %BB -; X32-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] -; X32-NEXT: vxorpd %xmm1, %xmm1, %xmm1 -; X32-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] -; X32-NEXT: movb $1, %al -; X32-NEXT: .p2align 4 -; X32-NEXT: .LBB0_1: # %CF -; X32-NEXT: # =>This Inner Loop Header: Depth=1 -; X32-NEXT: testb %al, %al -; X32-NEXT: jne .LBB0_1 -; X32-NEXT: # %bb.2: # %CF240 -; X32-NEXT: retl -; -; X64-LABEL: autogen_SD88863: -; X64: # %bb.0: # %BB -; X64-NEXT: vpsrldq {{.*#+}} xmm0 = xmm0[8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,zero,zero,zero -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 -; X64-NEXT: movb $1, %al -; X64-NEXT: .p2align 4 -; X64-NEXT: .LBB0_1: # %CF -; X64-NEXT: # =>This Inner Loop Header: Depth=1 -; X64-NEXT: testb %al, %al -; X64-NEXT: jne .LBB0_1 -; X64-NEXT: # %bb.2: # %CF240 -; X64-NEXT: retq +; CHECK-LABEL: autogen_SD88863: +; CHECK: # %bb.0: # %BB +; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = zero,zero,ymm0[0,1] +; CHECK-NEXT: vxorpd %xmm1, %xmm1, %xmm1 +; CHECK-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0],ymm1[1],ymm0[3],ymm1[3] +; CHECK-NEXT: 
movb $1, %al +; CHECK-NEXT: .p2align 4 +; CHECK-NEXT: .LBB0_1: # %CF +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: testb %al, %al +; CHECK-NEXT: jne .LBB0_1 +; CHECK-NEXT: # %bb.2: # %CF240 +; CHECK-NEXT: ret{{[l|q]}} BB: %I26 = insertelement <4 x i64> undef, i64 undef, i32 2 br label %CF @@ -43,5 +29,3 @@ CF: CF240: ret <4 x i64> %I68 } -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/X86/sse41.ll b/llvm/test/CodeGen/X86/sse41.ll index 6443bd5cda55f..2d7258a49f5d0 100644 --- a/llvm/test/CodeGen/X86/sse41.ll +++ b/llvm/test/CodeGen/X86/sse41.ll @@ -1233,47 +1233,31 @@ define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) { } define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { -; X86-SSE-LABEL: i32_shuf_X00X: -; X86-SSE: ## %bb.0: -; X86-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; X86-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A] -; X86-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-SSE-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX1-LABEL: i32_shuf_X00X: -; X86-AVX1: ## %bb.0: -; X86-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; X86-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] -; X86-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX1-NEXT: retl ## encoding: [0xc3] -; -; X86-AVX512-LABEL: i32_shuf_X00X: -; X86-AVX512: ## %bb.0: -; X86-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; X86-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] -; X86-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}, kind: FK_Data_4 -; X86-AVX512-NEXT: retl ## encoding: [0xc3] -; -; X64-SSE-LABEL: i32_shuf_X00X: -; X64-SSE: ## %bb.0: -; X64-SSE-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; X64-SSE-NEXT: ## encoding: [0x66,0x0f,0x38,0x00,0x05,A,A,A,A] -; X64-SSE-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-SSE-NEXT: retq ## encoding: [0xc3] +; SSE-LABEL: i32_shuf_X00X: +; SSE: ## %bb.0: +; SSE-NEXT: pxor %xmm1, %xmm1 ## encoding: [0x66,0x0f,0xef,0xc9] +; SSE-NEXT: pshufd $0, %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc0,0x00] +; SSE-NEXT: ## xmm0 = xmm0[0,0,0,0] +; SSE-NEXT: pblendw $60, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc1,0x3c] +; SSE-NEXT: ## xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] +; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; -; X64-AVX1-LABEL: i32_shuf_X00X: -; X64-AVX1: ## %bb.0: -; X64-AVX1-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; X64-AVX1-NEXT: ## encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] -; X64-AVX1-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX1-NEXT: retq ## encoding: [0xc3] +; AVX1-LABEL: i32_shuf_X00X: +; AVX1: ## %bb.0: +; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] +; AVX1-NEXT: vshufps $0, %xmm0, %xmm0, %xmm0 ## encoding: [0xc5,0xf8,0xc6,0xc0,0x00] +; AVX1-NEXT: ## xmm0 = xmm0[0,0,0,0] +; AVX1-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] +; AVX1-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX1-NEXT: ret{{[l|q]}} ## 
encoding: [0xc3] ; -; X64-AVX512-LABEL: i32_shuf_X00X: -; X64-AVX512: ## %bb.0: -; X64-AVX512-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3] -; X64-AVX512-NEXT: ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x00,0x05,A,A,A,A] -; X64-AVX512-NEXT: ## fixup A - offset: 5, value: {{\.?LCPI[0-9]+_[0-9]+}}-4, kind: reloc_riprel_4byte -; X64-AVX512-NEXT: retq ## encoding: [0xc3] +; AVX512-LABEL: i32_shuf_X00X: +; AVX512: ## %bb.0: +; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] +; AVX512-NEXT: vbroadcastss %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x18,0xc0] +; AVX512-NEXT: vblendps $6, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x06] +; AVX512-NEXT: ## xmm0 = xmm0[0],xmm1[1,2],xmm0[3] +; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1 @@ -1285,26 +1269,32 @@ define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) { define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) { ; SSE-LABEL: i32_shuf_X0YC: ; SSE: ## %bb.0: -; SSE-NEXT: pmovzxdq %xmm0, %xmm0 ## encoding: [0x66,0x0f,0x38,0x35,0xc0] -; SSE-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; SSE-NEXT: insertps $176, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x21,0xc1,0xb0] -; SSE-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] +; SSE-NEXT: pmovzxdq %xmm0, %xmm2 ## encoding: [0x66,0x0f,0x38,0x35,0xd0] +; SSE-NEXT: ## xmm2 = xmm0[0],zero,xmm0[1],zero +; SSE-NEXT: pshufd $170, %xmm1, %xmm0 ## encoding: [0x66,0x0f,0x70,0xc1,0xaa] +; SSE-NEXT: ## xmm0 = xmm1[2,2,2,2] +; SSE-NEXT: pblendw $63, %xmm2, %xmm0 ## encoding: [0x66,0x0f,0x3a,0x0e,0xc2,0x3f] +; SSE-NEXT: ## xmm0 = xmm2[0,1,2,3,4,5],xmm0[6,7] ; SSE-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX1-LABEL: i32_shuf_X0YC: ; AVX1: ## %bb.0: ; AVX1-NEXT: vpmovzxdq %xmm0, %xmm0 ## encoding: [0xc4,0xe2,0x79,0x35,0xc0] ; AVX1-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX1-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] -; AVX1-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] +; AVX1-NEXT: vpshufd $170, %xmm1, %xmm1 ## encoding: [0xc5,0xf9,0x70,0xc9,0xaa] +; AVX1-NEXT: ## xmm1 = xmm1[2,2,2,2] +; AVX1-NEXT: vpblendw $192, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0e,0xc1,0xc0] +; AVX1-NEXT: ## xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: i32_shuf_X0YC: ; AVX512: ## %bb.0: ; AVX512-NEXT: vpmovzxdq %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe2,0x79,0x35,0xc0] ; AVX512-NEXT: ## xmm0 = xmm0[0],zero,xmm0[1],zero -; AVX512-NEXT: vinsertps $176, %xmm1, %xmm0, %xmm0 ## EVEX TO VEX Compression encoding: [0xc4,0xe3,0x79,0x21,0xc1,0xb0] -; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[2] +; AVX512-NEXT: vpshufd $170, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf9,0x70,0xc9,0xaa] +; AVX512-NEXT: ## xmm1 = xmm1[2,2,2,2] +; AVX512-NEXT: vpblendd $8, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x02,0xc1,0x08] +; AVX512-NEXT: ## xmm0 = xmm0[0,1,2],xmm1[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x i32> %x, i32 0 %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0 @@ -2134,14 +2124,14 @@ define <4 x float> @build_vector_to_shuffle_1(<4 x float> %A) { ; AVX1-LABEL: build_vector_to_shuffle_1: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## 
encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] +; AVX1-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_1: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $5, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x05] +; AVX512-NEXT: vblendps $10, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x0a] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2],xmm0[3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 @@ -2162,14 +2152,14 @@ define <4 x float> @build_vector_to_shuffle_2(<4 x float> %A) { ; AVX1-LABEL: build_vector_to_shuffle_2: ; AVX1: ## %bb.0: ; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## encoding: [0xc5,0xf0,0x57,0xc9] -; AVX1-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] +; AVX1-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] ; AVX1-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX1-NEXT: ret{{[l|q]}} ## encoding: [0xc3] ; ; AVX512-LABEL: build_vector_to_shuffle_2: ; AVX512: ## %bb.0: ; AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1 ## EVEX TO VEX Compression encoding: [0xc5,0xf0,0x57,0xc9] -; AVX512-NEXT: vblendps $13, %xmm1, %xmm0, %xmm0 ## encoding: [0xc4,0xe3,0x79,0x0c,0xc1,0x0d] +; AVX512-NEXT: vblendps $2, %xmm0, %xmm1, %xmm0 ## encoding: [0xc4,0xe3,0x71,0x0c,0xc0,0x02] ; AVX512-NEXT: ## xmm0 = xmm1[0],xmm0[1],xmm1[2,3] ; AVX512-NEXT: ret{{[l|q]}} ## encoding: [0xc3] %vecext = extractelement <4 x float> %A, i32 1 diff --git a/llvm/test/CodeGen/X86/vec_extract-avx.ll b/llvm/test/CodeGen/X86/vec_extract-avx.ll index ff0a68eb5692c..341a703a21bd5 100644 --- a/llvm/test/CodeGen/X86/vec_extract-avx.ll +++ b/llvm/test/CodeGen/X86/vec_extract-avx.ll @@ -126,7 +126,9 @@ define void @legal_vzmovl_2i32_8i32(ptr %in, ptr %out) { ; ; X64-LABEL: legal_vzmovl_2i32_8i32: ; X64: # %bb.0: -; X64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero +; X64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero +; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 +; X64-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3] ; X64-NEXT: vmovaps %ymm0, (%rsi) ; X64-NEXT: vzeroupper ; X64-NEXT: retq diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll index 1dd015cc516f0..07d8a370a5f93 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i32-stride-5.ll @@ -18,24 +18,24 @@ define void @store_i32_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecptr2, ptr %in.vecptr3, ptr %in.vecptr4, ptr %out.vec) nounwind { ; SSE-LABEL: store_i32_stride5_vf2: ; SSE: # %bb.0: -; SSE-NEXT: movq {{.*#+}} xmm0 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm1 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm2 = mem[0],zero -; SSE-NEXT: movq {{.*#+}} xmm3 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm0 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm1 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm2 = mem[0],zero +; SSE-NEXT: movsd {{.*#+}} xmm3 = mem[0],zero ; SSE-NEXT: movsd {{.*#+}} xmm4 = mem[0],zero -; SSE-NEXT: pshufd {{.*#+}} xmm5 = xmm3[1,1,1,1] -; SSE-NEXT: movdqa %xmm2, %xmm6 -; SSE-NEXT: punpckldq {{.*#+}} xmm6 = 
xmm6[0],xmm3[0],xmm6[1],xmm3[1] -; SSE-NEXT: movdqa %xmm0, %xmm3 -; SSE-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1] -; SSE-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm6[0] +; SSE-NEXT: movaps %xmm2, %xmm5 +; SSE-NEXT: unpcklps {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1] +; SSE-NEXT: movaps %xmm0, %xmm6 +; SSE-NEXT: unpcklps {{.*#+}} xmm6 = xmm6[0],xmm1[0],xmm6[1],xmm1[1] +; SSE-NEXT: movlhps {{.*#+}} xmm6 = xmm6[0],xmm5[0] ; SSE-NEXT: shufps {{.*#+}} xmm2 = xmm2[1,1],xmm1[1,1] ; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,0] ; SSE-NEXT: movss {{.*#+}} xmm0 = xmm4[0],xmm0[1,2,3] ; SSE-NEXT: pshufd {{.*#+}} xmm1 = xmm4[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm1[0],xmm5[1],xmm1[1] -; SSE-NEXT: movq %xmm5, 32(%r9) -; SSE-NEXT: movdqa %xmm3, (%r9) +; SSE-NEXT: pshufd {{.*#+}} xmm2 = xmm3[1,1,1,1] +; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1] +; SSE-NEXT: movq %xmm2, 32(%r9) +; SSE-NEXT: movaps %xmm6, (%r9) ; SSE-NEXT: movaps %xmm0, 16(%r9) ; SSE-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/vector-narrow-binop.ll b/llvm/test/CodeGen/X86/vector-narrow-binop.ll index 6f1948d3bc2a5..ad345213c1472 100644 --- a/llvm/test/CodeGen/X86/vector-narrow-binop.ll +++ b/llvm/test/CodeGen/X86/vector-narrow-binop.ll @@ -107,11 +107,9 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) { ; SSE: # %bb.0: ; SSE-NEXT: pxor %xmm2, %xmm2 ; SSE-NEXT: psubd %xmm0, %xmm2 +; SSE-NEXT: psrld $16, %xmm2 ; SSE-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1] -; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1] -; SSE-NEXT: psrlq $16, %xmm2 -; SSE-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2 -; SSE-NEXT: packuswb %xmm2, %xmm2 +; SSE-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm0[0],xmm2[1],xmm0[1],xmm2[2],xmm0[2],xmm2[3],xmm0[3],xmm2[4],xmm0[4],xmm2[5],xmm0[5],xmm2[6],xmm0[6],xmm2[7],xmm0[7] ; SSE-NEXT: movdqa %xmm2, %xmm0 ; SSE-NEXT: retq ; @@ -119,8 +117,9 @@ define <2 x i8> @PR39893(<2 x i32> %x, <8 x i8> %y) { ; AVX: # %bb.0: ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpsubd %xmm0, %xmm2, %xmm0 +; AVX-NEXT: vpsrld $16, %xmm0, %xmm0 +; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1] ; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3],xmm0[4],xmm1[4],xmm0[5],xmm1[5],xmm0[6],xmm1[6],xmm0[7],xmm1[7] -; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[4,9,u,u,u,u,u,u,u,u,u,u,u,u,u,u] ; AVX-NEXT: retq %sub = sub <2 x i32> , %x %bc = bitcast <2 x i32> %sub to <8 x i8> diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll index 2fc1e662d7a7f..ce092f9d343fc 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast.ll @@ -4115,24 +4115,24 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pxor %xmm3, %xmm3 -; SSE2-NEXT: pinsrw $2, %eax, %xmm3 -; SSE2-NEXT: pinsrw $4, %eax, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm4 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: pand %xmm4, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE2-NEXT: pandn %xmm0, %xmm4 -; SSE2-NEXT: por %xmm1, %xmm4 -; SSE2-NEXT: paddb (%rdx), %xmm4 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) -; SSE2-NEXT: movdqa %xmm4, (%rcx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: pandn %xmm3, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4141,19 +4141,19 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] ; SSE42-NEXT: pxor %xmm3, %xmm3 -; SSE42-NEXT: pinsrw $2, %eax, %xmm3 -; SSE42-NEXT: pinsrw $4, %eax, %xmm2 -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm3[0,1],xmm1[2],xmm3[3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm3[0,1,2,3],xmm0[4],xmm3[5,6,7] +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: paddb 32(%rdx), %xmm1 +; SSE42-NEXT: movdqa %xmm1, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4165,16 +4165,16 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vmovd %xmm0, %eax -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm0 -; AVX-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX-NEXT: vpaddb 32(%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,1,1] +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm3[0,1],xmm0[2],xmm3[3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm2, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm0, (%rcx) +; AVX-NEXT: vmovdqa %xmm1, (%rcx) ; AVX-NEXT: vmovdqa %xmm2, 16(%rcx) -; AVX-NEXT: vmovdqa %xmm1, 32(%rcx) +; AVX-NEXT: vmovdqa %xmm0, 32(%rcx) ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: @@ -4183,13 +4183,12 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 ; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX2-NEXT: vpaddb (%rsi), %xmm0, %xmm0 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm2 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm2[0],xmm1[1,2,3,4,5],xmm2[6],xmm1[7] -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 +; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5],xmm0[6],xmm1[7] +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm1 ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrw $2, %eax, %xmm0, %xmm0 +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2],xmm2[3,4,5,6,7] ; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) @@ -4324,37 +4323,36 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: movdqa 16(%rdx), %xmm0 -; SSE2-NEXT: paddb %xmm2, %xmm0 -; SSE2-NEXT: paddb 32(%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) -; SSE2-NEXT: movdqa %xmm0, 16(%rcx) +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: movdqa 16(%rdx), %xmm1 +; SSE2-NEXT: paddb %xmm0, %xmm1 +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] ; SSE42-NEXT: pxor 
%xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 -; SSE42-NEXT: paddb %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa 16(%rdx), %xmm0 +; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -4364,9 +4362,9 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -4463,36 +4461,36 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 ; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pinsrw $4, %eax, %xmm2 -; SSE2-NEXT: movdqa {{.*#+}} xmm3 = [0,65535,65535,65535,65535,65535,65535,65535] -; SSE2-NEXT: pand %xmm3, %xmm1 -; SSE2-NEXT: pandn %xmm0, %xmm3 -; SSE2-NEXT: por %xmm1, %xmm3 -; SSE2-NEXT: movaps 32(%rdx), %xmm0 -; SSE2-NEXT: paddb (%rdx), %xmm3 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: movaps %xmm0, 32(%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) -; SSE2-NEXT: movdqa %xmm3, (%rcx) +; SSE2-NEXT: movdqa {{.*#+}} xmm2 = [0,65535,65535,65535,65535,65535,65535,65535] +; SSE2-NEXT: pand %xmm2, %xmm1 +; SSE2-NEXT: pandn %xmm0, %xmm2 +; SSE2-NEXT: por %xmm1, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: movaps 32(%rdx), %xmm1 +; SSE2-NEXT: paddb (%rdx), %xmm2 +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movaps %xmm1, 32(%rcx) +; SSE2-NEXT: movdqa %xmm2, (%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: movd %xmm0, %eax +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] ; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pinsrw $4, %eax, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm0 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] +; SSE42-NEXT: movaps 32(%rdx), %xmm0 +; SSE42-NEXT: paddb (%rdx), %xmm1 ; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: 
movaps %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: movaps %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; @@ -4802,14 +4800,14 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],xmm1[1,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm2[1,0,1,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] ; SSE2-NEXT: paddb (%rdx), %xmm0 ; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: paddb 32(%rdx), %xmm3 -; SSE2-NEXT: movdqa %xmm3, 32(%rcx) +; SSE2-NEXT: paddb 32(%rdx), %xmm1 +; SSE2-NEXT: movdqa %xmm1, 32(%rcx) ; SSE2-NEXT: movdqa %xmm0, (%rcx) ; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq @@ -4820,18 +4818,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: pxor %xmm2, %xmm2 -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm2[1,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5],xmm0[6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: paddb 32(%rdx), %xmm3 -; SSE42-NEXT: movdqa %xmm3, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1],xmm1[2,3,4,5],xmm2[6,7] +; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,0,1,1] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] +; SSE42-NEXT: paddb (%rdx), %xmm2 +; SSE42-NEXT: paddb 16(%rdx), %xmm1 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm2, (%rcx) +; SSE42-NEXT: movdqa %xmm1, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: @@ -4843,10 +4841,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] ; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,1,0,1] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm2, %xmm2 @@ -4859,19 +4857,20 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; AVX2-SLOW: # %bb.0: ; AVX2-SLOW-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpbroadcastd %xmm0, %xmm2 -; AVX2-SLOW-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = ymm3[0],ymm2[1],ymm3[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] -; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm3[4,5],ymm0[6],ymm3[7] -; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-SLOW-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX2-SLOW-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; @@ -4879,18 +4878,18 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST-PERLANE: # %bb.0: ; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] -; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-PERLANE-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-PERLANE-NEXT: vzeroupper ; AVX2-FAST-PERLANE-NEXT: retq ; @@ -4898,18 +4897,18 @@ define void 
@vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX2-FAST: # %bb.0: ; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm1 -; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5,6,7] -; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm1 = [0,5,6,0] -; AVX2-FAST-NEXT: vpermd %ymm0, %ymm1, %ymm0 -; AVX2-FAST-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm1[4,5],ymm0[6],ymm1[7] -; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; AVX2-FAST-NEXT: vmovdqa %ymm1, 32(%rcx) -; AVX2-FAST-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1,2,3],ymm1[4,5,6,7] +; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-FAST-NEXT: vpermd %ymm1, %ymm2, %ymm1 +; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm2[4,5],ymm1[6],ymm2[7] +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-FAST-NEXT: vzeroupper ; AVX2-FAST-NEXT: retq ; @@ -4998,11 +4997,11 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: paddb (%rdx), %xmm1 ; SSE2-NEXT: movdqa 16(%rdx), %xmm0 ; SSE2-NEXT: paddb %xmm2, %xmm0 @@ -5016,18 +5015,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 -; SSE42-NEXT: paddb %xmm2, %xmm1 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa 16(%rdx), %xmm0 +; SSE42-NEXT: paddb %xmm2, %xmm0 ; SSE42-NEXT: paddb 32(%rdx), %xmm2 ; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: @@ -5037,9 +5036,9 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = 
xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm3, %xmm3 ; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 @@ -5052,18 +5051,18 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-NEXT: vpaddb 48(%rsi), %xmm2, %xmm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 +; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5151,36 +5150,36 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: xorps %xmm2, %xmm2 ; SSE2-NEXT: movss {{.*#+}} xmm2 = xmm0[0],xmm2[1,2,3] -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] -; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] -; SSE2-NEXT: movaps 32(%rdx), %xmm0 +; SSE2-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] +; SSE2-NEXT: movaps 32(%rdx), %xmm2 ; SSE2-NEXT: paddb (%rdx), %xmm1 -; SSE2-NEXT: paddb 16(%rdx), %xmm2 -; SSE2-NEXT: movaps %xmm0, 32(%rcx) +; SSE2-NEXT: paddb 16(%rdx), %xmm0 +; SSE2-NEXT: movaps %xmm2, 32(%rcx) ; SSE2-NEXT: movdqa %xmm1, (%rcx) -; SSE2-NEXT: movdqa %xmm2, 16(%rcx) +; SSE2-NEXT: movdqa %xmm0, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 -; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 48(%rsi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pxor %xmm2, %xmm2 ; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm2[1,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: movaps %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] +; SSE42-NEXT: 
movaps 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -5284,17 +5283,17 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero -; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm1[2,3] -; SSE2-NEXT: paddb (%rdx), %xmm0 -; SSE2-NEXT: movdqa 16(%rdx), %xmm1 -; SSE2-NEXT: paddb %xmm2, %xmm1 -; SSE2-NEXT: paddb 32(%rdx), %xmm2 -; SSE2-NEXT: movdqa %xmm2, 32(%rcx) -; SSE2-NEXT: movdqa %xmm0, (%rcx) -; SSE2-NEXT: movdqa %xmm1, 16(%rcx) +; SSE2-NEXT: paddb 48(%rsi), %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE2-NEXT: paddb (%rdx), %xmm1 +; SSE2-NEXT: movdqa 16(%rdx), %xmm2 +; SSE2-NEXT: paddb %xmm0, %xmm2 +; SSE2-NEXT: paddb 32(%rdx), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 32(%rcx) +; SSE2-NEXT: movdqa %xmm1, (%rcx) +; SSE2-NEXT: movdqa %xmm2, 16(%rcx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5303,15 +5302,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: movq {{.*#+}} xmm2 = xmm0[0],zero -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: movdqa 16(%rdx), %xmm1 -; SSE42-NEXT: paddb %xmm2, %xmm1 -; SSE42-NEXT: paddb 32(%rdx), %xmm2 -; SSE42-NEXT: movdqa %xmm2, 32(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) -; SSE42-NEXT: movdqa %xmm1, 16(%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: movdqa 16(%rdx), %xmm2 +; SSE42-NEXT: paddb %xmm0, %xmm2 +; SSE42-NEXT: paddb 32(%rdx), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) +; SSE42-NEXT: movdqa %xmm2, 16(%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: @@ -5321,8 +5320,8 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX-NEXT: vpaddb 48(%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb (%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX-NEXT: vpaddb (%rdx), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 32(%rdx), %xmm0, %xmm2 ; AVX-NEXT: vpaddb 16(%rdx), %xmm0, %xmm0 ; AVX-NEXT: vmovdqa %xmm1, (%rcx) @@ -5336,15 +5335,15 @@ define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vmovdqa 32(%rdi), %ymm1 ; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm2 = xmm0[0],zero -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3] -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7] -; AVX2-NEXT: vpaddb (%rdx), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rdx), %ymm2, %ymm1 -; 
AVX2-NEXT: vmovdqa %ymm0, (%rcx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rcx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm0[0,1],ymm1[2,3],ymm0[4,5],ymm1[6,7] +; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,3,0,3] +; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm2[6,7] +; AVX2-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero +; AVX2-NEXT: vpaddb (%rdx), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rdx), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm1, (%rcx) +; AVX2-NEXT: vmovdqa %ymm0, 32(%rcx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -5430,8 +5429,8 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE2: # %bb.0: ; SSE2-NEXT: movdqa (%rdi), %xmm0 ; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 +; SSE2-NEXT: paddb 48(%rsi), %xmm1 ; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] ; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rdx), %xmm2 @@ -5448,15 +5447,14 @@ define void @vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2(ptr %i ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: paddb 48(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: movdqa %xmm0, %xmm2 -; SSE42-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],xmm1[4,5,6,7] -; SSE42-NEXT: movaps 32(%rdx), %xmm1 -; SSE42-NEXT: paddb (%rdx), %xmm0 -; SSE42-NEXT: paddb 16(%rdx), %xmm2 -; SSE42-NEXT: movaps %xmm1, 32(%rcx) -; SSE42-NEXT: movdqa %xmm2, 16(%rcx) -; SSE42-NEXT: movdqa %xmm0, (%rcx) +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE42-NEXT: movaps 32(%rdx), %xmm2 +; SSE42-NEXT: paddb (%rdx), %xmm1 +; SSE42-NEXT: paddb 16(%rdx), %xmm0 +; SSE42-NEXT: movaps %xmm2, 32(%rcx) +; SSE42-NEXT: movdqa %xmm0, 16(%rcx) +; SSE42-NEXT: movdqa %xmm1, (%rcx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i192_factor3_broadcast_to_v2i192_factor2: diff --git a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll index d874ceb3f4736..acedcf4263906 100644 --- a/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll +++ b/llvm/test/CodeGen/X86/zero_extend_vector_inreg_of_broadcast_from_memory.ll @@ -3316,39 +3316,41 @@ define void @vec384_i16_widen_to_i64_factor4_broadcast_to_v6i64_factor6(ptr %in. 
define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; SSE2: # %bb.0: -; SSE2-NEXT: movl (%rdi), %eax -; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [0,65535,65535,65535,65535,65535,0,65535] -; SSE2-NEXT: movdqa 48(%rdi), %xmm1 -; SSE2-NEXT: pand %xmm0, %xmm1 -; SSE2-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] -; SSE2-NEXT: pandn %xmm2, %xmm0 -; SSE2-NEXT: por %xmm1, %xmm0 -; SSE2-NEXT: pxor %xmm1, %xmm1 -; SSE2-NEXT: pxor %xmm2, %xmm2 -; SSE2-NEXT: pinsrw $2, %eax, %xmm2 -; SSE2-NEXT: pinsrw $4, %eax, %xmm1 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 16(%rsi), %xmm1 +; SSE2-NEXT: movdqa (%rdi), %xmm0 +; SSE2-NEXT: movdqa {{.*#+}} xmm1 = [0,65535,65535,65535,65535,65535,0,65535] +; SSE2-NEXT: movdqa 48(%rdi), %xmm2 +; SSE2-NEXT: pand %xmm1, %xmm2 +; SSE2-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE2-NEXT: pandn %xmm3, %xmm1 +; SSE2-NEXT: por %xmm2, %xmm1 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: movdqa %xmm0, %xmm2 +; SSE2-NEXT: pslldq {{.*#+}} xmm2 = zero,zero,zero,zero,xmm2[0,1,2,3,4,5,6,7,8,9,10,11] +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 16(%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 ; SSE2-NEXT: movdqa %xmm2, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, 16(%rdx) -; SSE2-NEXT: movdqa %xmm0, (%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) +; SSE2-NEXT: movdqa %xmm0, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; SSE42: # %bb.0: -; SSE42-NEXT: movl (%rdi), %eax -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 -; SSE42-NEXT: pinsrw $2, %eax, %xmm1 -; SSE42-NEXT: pshufd {{.*#+}} xmm2 = mem[0,0,0,0] -; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0],mem[1,2,3,4,5],xmm2[6],mem[7] -; SSE42-NEXT: pinsrw $4, %eax, %xmm0 +; SSE42-NEXT: movdqa (%rdi), %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[0,0,1,1] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm2[0,1],xmm1[2],xmm2[3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm3 = xmm0[0,0,0,0] +; SSE42-NEXT: pblendw {{.*#+}} xmm3 = xmm3[0],mem[1,2,3,4,5],xmm3[6],mem[7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] ; SSE42-NEXT: paddb 16(%rsi), %xmm0 -; SSE42-NEXT: paddb (%rsi), %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm3 ; SSE42-NEXT: paddb 32(%rsi), %xmm1 ; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm2, (%rdx) +; SSE42-NEXT: movdqa %xmm3, (%rdx) ; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq ; @@ -3357,35 +3359,66 @@ define void @vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4(ptr %in. 
; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[0,0,0,0] ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX-NEXT: vpshufd {{.*#+}} xmm2 = xmm0[0,0,1,1] +; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1],xmm2[2],xmm3[3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}+16(%rip), %xmm0, %xmm0 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpinsrw $2, (%rdi), %xmm2, %xmm2 -; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 -; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) ; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: retq ; -; AVX2-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: -; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %xmm0 -; AVX2-NEXT: vmovd %xmm0, %eax -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpinsrw $2, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vpbroadcastw %xmm0, %xmm0 -; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] -; AVX2-NEXT: vpbroadcastw (%rdi), %xmm2 -; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 -; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vzeroupper -; AVX2-NEXT: retq +; AVX2-SLOW-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-SLOW: # %bb.0: +; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm0 +; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0,1],xmm0[2],xmm1[3,4,5,6,7] +; AVX2-SLOW-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0],mem[1,2,3,4,5],xmm0[6],mem[7] +; AVX2-SLOW-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX2-SLOW-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-SLOW-NEXT: vzeroupper +; AVX2-SLOW-NEXT: retq +; +; AVX2-FAST-PERLANE-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-FAST-PERLANE: # %bb.0: +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-FAST-PERLANE-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX2-FAST-PERLANE-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-FAST-PERLANE-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-PERLANE-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-PERLANE-NEXT: vzeroupper +; AVX2-FAST-PERLANE-NEXT: retq +; +; AVX2-FAST-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: +; AVX2-FAST: # %bb.0: +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm0 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,zero,zero,xmm0[0,1],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: 
vpbroadcastw (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],mem[1,2,3,4,5],xmm1[6],mem[7] +; AVX2-FAST-NEXT: vpbroadcastw (%rdi), %xmm2 +; AVX2-FAST-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-FAST-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-FAST-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-FAST-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-FAST-NEXT: vzeroupper +; AVX2-FAST-NEXT: retq ; ; AVX512F-LABEL: vec384_i16_widen_to_i96_factor6_broadcast_to_v4i96_factor4: ; AVX512F: # %bb.0: @@ -3448,15 +3481,14 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: pextrw $0, %xmm0, %eax -; SSE2-NEXT: movd %eax, %xmm0 +; SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 ; SSE2-NEXT: paddb %xmm0, %xmm2 ; SSE2-NEXT: paddb 32(%rsi), %xmm0 ; SSE2-NEXT: movdqa %xmm0, 32(%rdx) -; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3: @@ -3495,7 +3527,7 @@ define void @vec384_i16_widen_to_i128_factor8_broadcast_to_v3i128_factor3(ptr %i ; AVX2-NEXT: vpxor %xmm0, %xmm0, %xmm0 ; AVX2-NEXT: vpblendw {{.*#+}} xmm0 = mem[0],xmm0[1,2,3,4,5,6,7] ; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 -; AVX2-NEXT: vpermq {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] ; AVX2-NEXT: vpblendw {{.*#+}} ymm1 = ymm2[0],ymm1[1,2,3,4,5,6,7],ymm2[8],ymm1[9,10,11,12,13,14,15] ; AVX2-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 @@ -3564,9 +3596,9 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE2-NEXT: pand %xmm1, %xmm2 ; SSE2-NEXT: pandn %xmm0, %xmm1 ; SSE2-NEXT: por %xmm2, %xmm1 -; SSE2-NEXT: movd %xmm0, %eax -; SSE2-NEXT: pxor %xmm0, %xmm0 -; SSE2-NEXT: pinsrw $4, %eax, %xmm0 +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1] +; SSE2-NEXT: psrldq {{.*#+}} xmm0 = xmm0[14,15],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero +; SSE2-NEXT: pslldq {{.*#+}} xmm0 = zero,zero,zero,zero,zero,zero,zero,zero,xmm0[0,1,2,3,4,5,6,7] ; SSE2-NEXT: movaps 32(%rsi), %xmm2 ; SSE2-NEXT: paddb (%rsi), %xmm1 ; SSE2-NEXT: paddb 16(%rsi), %xmm0 @@ -3580,15 +3612,15 @@ define void @vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2(ptr % ; SSE42-NEXT: movdqa (%rdi), %xmm0 ; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3,4,5,6,7] -; SSE42-NEXT: movd %xmm0, %eax -; SSE42-NEXT: pxor %xmm0, %xmm0 -; SSE42-NEXT: pinsrw $4, %eax, %xmm0 -; SSE42-NEXT: movaps 32(%rsi), %xmm2 -; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,1,0,1] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm0[4],xmm2[5,6,7] +; SSE42-NEXT: movaps 32(%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm2 ; SSE42-NEXT: paddb (%rsi), %xmm1 -; SSE42-NEXT: movaps %xmm2, 32(%rdx) +; SSE42-NEXT: movaps %xmm0, 32(%rdx) ; SSE42-NEXT: movdqa %xmm1, (%rdx) -; SSE42-NEXT: movdqa %xmm0, 16(%rdx) +; SSE42-NEXT: movdqa %xmm2, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: 
vec384_i16_widen_to_i192_factor12_broadcast_to_v2i192_factor2: @@ -3830,9 +3862,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE2-NEXT: xorps %xmm1, %xmm1 ; SSE2-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3] ; SSE2-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0],mem[1,2] ; SSE2-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,3,1] +; SSE2-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE2-NEXT: paddb 16(%rsi), %xmm1 ; SSE2-NEXT: paddb (%rsi), %xmm0 ; SSE2-NEXT: paddb 32(%rsi), %xmm2 @@ -3847,9 +3879,9 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; SSE42-NEXT: pxor %xmm1, %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] ; SSE42-NEXT: pshufd {{.*#+}} xmm2 = xmm1[1,0,1,1] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,0,0,0] ; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5],xmm0[6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] ; SSE42-NEXT: paddb 16(%rsi), %xmm1 ; SSE42-NEXT: paddb (%rsi), %xmm0 ; SSE42-NEXT: paddb 32(%rsi), %xmm2 @@ -3863,12 +3895,12 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. ; AVX-NEXT: vmovdqa (%rdi), %xmm0 ; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,1,2,0] -; AVX-NEXT: vbroadcastss (%rdi), %ymm2 -; AVX-NEXT: vpxor %xmm3, %xmm3, %xmm3 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm3[2,3,4,5,6,7] +; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] ; AVX-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,0,1,1] +; AVX-NEXT: vbroadcastss (%rdi), %ymm3 ; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm3[0,1,2,3],xmm2[4,5],xmm3[6,7] +; AVX-NEXT: vpblendw {{.*#+}} xmm2 = xmm2[0,1,2,3],xmm3[4,5],xmm2[6,7] ; AVX-NEXT: vpaddb 16(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vmovdqa %xmm1, (%rdx) @@ -3879,26 +3911,27 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX2-SLOW-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-SLOW: # %bb.0: -; AVX2-SLOW-NEXT: vpbroadcastd (%rdi), %xmm0 -; AVX2-SLOW-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2,3,4,5,6,7] -; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm2 = mem[0,1,2,3],ymm2[4,5,6,7] -; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm3 = [0,5,6,0] -; AVX2-SLOW-NEXT: vpermd %ymm2, %ymm3, %ymm2 -; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm1 = ymm2[0,1,2,3],ymm1[4,5],ymm2[6],ymm1[7] -; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm1, %ymm1 -; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 -; AVX2-SLOW-NEXT: vmovdqa %ymm0, 32(%rdx) -; AVX2-SLOW-NEXT: vmovdqa %ymm1, (%rdx) +; AVX2-SLOW-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-SLOW-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero +; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,0,1,1] +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] +; AVX2-SLOW-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] +; AVX2-SLOW-NEXT: vpermd %ymm0, %ymm2, %ymm0 +; AVX2-SLOW-NEXT: vpxor %xmm2, %xmm2, %xmm2 +; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4,5],ymm0[6],ymm2[7] +; AVX2-SLOW-NEXT: vpaddb (%rsi), %ymm0, %ymm0 +; AVX2-SLOW-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 +; AVX2-SLOW-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-SLOW-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-SLOW-NEXT: vzeroupper ; AVX2-SLOW-NEXT: retq ; ; AVX2-FAST-PERLANE-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-FAST-PERLANE: # %bb.0: -; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-PERLANE-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-PERLANE-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-PERLANE-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-PERLANE-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-PERLANE-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-PERLANE-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-PERLANE-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -3912,9 +3945,10 @@ define void @vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4(ptr %in. 
; ; AVX2-FAST-LABEL: vec384_i32_widen_to_i96_factor3_broadcast_to_v4i96_factor4: ; AVX2-FAST: # %bb.0: -; AVX2-FAST-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-FAST-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,zero,ymm0[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero -; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3],mem[4,5,6,7] +; AVX2-FAST-NEXT: vmovdqa 32(%rdi), %ymm0 +; AVX2-FAST-NEXT: vmovdqa (%rdi), %xmm1 +; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,xmm1[0,1,2,3],zero,zero,zero,zero,zero,zero,zero,zero +; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = mem[0,1,2,3],ymm0[4,5,6,7] ; AVX2-FAST-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,5,6,0] ; AVX2-FAST-NEXT: vpermd %ymm0, %ymm2, %ymm0 ; AVX2-FAST-NEXT: vpxor %xmm2, %xmm2, %xmm2 @@ -3995,47 +4029,47 @@ define void @vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3(ptr %i ; SSE42-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] -; SSE42-NEXT: movdqa 16(%rsi), %xmm2 -; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) -; SSE42-NEXT: movdqa %xmm2, 16(%rdx) +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: movdqa 16(%rsi), %xmm0 +; SSE42-NEXT: paddb %xmm2, %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm2 +; SSE42-NEXT: movdqa %xmm2, 32(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],mem[2,3,4,5,6,7] -; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX-NEXT: vpblendw {{.*#+}} xmm3 = xmm0[0,1],xmm2[2,3,4,5,6,7] -; AVX-NEXT: vpaddb 32(%rsi), %xmm3, %xmm3 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0],mem[1,2,3] +; AVX-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero +; AVX-NEXT: vxorps %xmm3, %xmm3, %xmm3 +; AVX-NEXT: vblendps {{.*#+}} xmm0 = xmm0[0],xmm3[1,2,3] ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 +; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) ; AVX-NEXT: vmovdqa %xmm1, (%rdx) -; AVX-NEXT: vmovdqa %xmm3, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i32_widen_to_i128_factor4_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: -; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; AVX2-NEXT: vmovdqa 48(%rdi), %xmm2 -; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,1,0,1] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0],ymm2[1,2,3],ymm0[4,5,6,7] +; AVX2-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero +; AVX2-NEXT: vmovdqa 48(%rdi), %xmm1 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm2 = mem[0,1,0,1] +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = 
ymm2[0],ymm1[1,2,3],ymm2[4,5,6,7] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5,6,7] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 -; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) -; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vpblendd {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5,6,7] +; AVX2-NEXT: vpaddb (%rsi), %ymm1, %ymm1 +; AVX2-NEXT: vpaddb 32(%rsi), %ymm0, %ymm0 +; AVX2-NEXT: vmovdqa %ymm0, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm1, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; @@ -4108,16 +4142,17 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i ; SSE42-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: pxor %xmm1, %xmm1 +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 ; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1],xmm1[2,3,4,5,6,7] -; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm1[1,1,0,1] -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1],mem[2,3,4,5,6,7] +; SSE42-NEXT: pxor %xmm2, %xmm2 +; SSE42-NEXT: pblendw {{.*#+}} xmm2 = xmm0[0,1],xmm2[2,3,4,5,6,7] +; SSE42-NEXT: pshufd {{.*#+}} xmm0 = xmm2[1,1,0,1] ; SSE42-NEXT: movaps 32(%rsi), %xmm2 -; SSE42-NEXT: paddb 16(%rsi), %xmm1 -; SSE42-NEXT: paddb (%rsi), %xmm0 +; SSE42-NEXT: paddb 16(%rsi), %xmm0 +; SSE42-NEXT: paddb (%rsi), %xmm1 ; SSE42-NEXT: movaps %xmm2, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) -; SSE42-NEXT: movdqa %xmm1, 16(%rdx) +; SSE42-NEXT: movdqa %xmm1, (%rdx) +; SSE42-NEXT: movdqa %xmm0, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2: @@ -4198,57 +4233,61 @@ define void @vec384_i32_widen_to_i192_factor6_broadcast_to_v2i192_factor2(ptr %i define void @vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3(ptr %in.elt.ptr, ptr %out.vec.bias.ptr, ptr %out.vec.ptr) nounwind { ; SSE2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; SSE2: # %bb.0: -; SSE2-NEXT: movdqa (%rdi), %xmm0 -; SSE2-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero -; SSE2-NEXT: shufpd {{.*#+}} xmm0 = xmm0[0],mem[1] +; SSE2-NEXT: movapd (%rdi), %xmm0 +; SSE2-NEXT: movapd 48(%rdi), %xmm1 +; SSE2-NEXT: movsd {{.*#+}} xmm1 = xmm0[0],xmm1[1] +; SSE2-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE2-NEXT: movdqa 16(%rsi), %xmm2 -; SSE2-NEXT: paddb %xmm1, %xmm2 -; SSE2-NEXT: paddb (%rsi), %xmm0 -; SSE2-NEXT: paddb 32(%rsi), %xmm1 -; SSE2-NEXT: movdqa %xmm1, 32(%rdx) -; SSE2-NEXT: movdqa %xmm0, (%rdx) +; SSE2-NEXT: paddb %xmm0, %xmm2 +; SSE2-NEXT: paddb (%rsi), %xmm1 +; SSE2-NEXT: paddb 32(%rsi), %xmm0 +; SSE2-NEXT: movdqa %xmm0, 32(%rdx) +; SSE2-NEXT: movdqa %xmm1, (%rdx) ; SSE2-NEXT: movdqa %xmm2, 16(%rdx) ; SSE2-NEXT: retq ; ; SSE42-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; SSE42: # %bb.0: ; SSE42-NEXT: movdqa (%rdi), %xmm0 -; SSE42-NEXT: movq {{.*#+}} xmm1 = xmm0[0],zero -; SSE42-NEXT: pblendw {{.*#+}} xmm0 = xmm0[0,1,2,3],mem[4,5,6,7] +; SSE42-NEXT: movdqa 48(%rdi), %xmm1 +; SSE42-NEXT: pblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],xmm1[4,5,6,7] +; SSE42-NEXT: movq {{.*#+}} xmm0 = xmm0[0],zero ; SSE42-NEXT: movdqa 16(%rsi), %xmm2 -; SSE42-NEXT: paddb %xmm1, %xmm2 -; SSE42-NEXT: paddb (%rsi), %xmm0 -; SSE42-NEXT: paddb 32(%rsi), %xmm1 -; SSE42-NEXT: movdqa %xmm1, 32(%rdx) -; SSE42-NEXT: movdqa %xmm0, (%rdx) +; SSE42-NEXT: paddb %xmm0, %xmm2 +; SSE42-NEXT: paddb (%rsi), %xmm1 +; SSE42-NEXT: paddb 32(%rsi), %xmm0 +; SSE42-NEXT: movdqa %xmm0, 32(%rdx) +; SSE42-NEXT: movdqa %xmm1, 
(%rdx) ; SSE42-NEXT: movdqa %xmm2, 16(%rdx) ; SSE42-NEXT: retq ; ; AVX-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX: # %bb.0: -; AVX-NEXT: vmovdqa (%rdi), %xmm0 -; AVX-NEXT: vpblendw {{.*#+}} xmm1 = xmm0[0,1,2,3],mem[4,5,6,7] +; AVX-NEXT: vbroadcastf128 {{.*#+}} ymm0 = mem[0,1,0,1] +; AVX-NEXT: vblendps {{.*#+}} xmm1 = xmm0[0,1],mem[2,3] +; AVX-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero ; AVX-NEXT: vmovq {{.*#+}} xmm0 = xmm0[0],zero -; AVX-NEXT: vpaddb 32(%rsi), %xmm0, %xmm2 -; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 ; AVX-NEXT: vpaddb 16(%rsi), %xmm0, %xmm0 -; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) -; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vpaddb (%rsi), %xmm1, %xmm1 +; AVX-NEXT: vpaddb 32(%rsi), %xmm2, %xmm2 ; AVX-NEXT: vmovdqa %xmm2, 32(%rdx) +; AVX-NEXT: vmovdqa %xmm1, (%rdx) +; AVX-NEXT: vmovdqa %xmm0, 16(%rdx) +; AVX-NEXT: vzeroupper ; AVX-NEXT: retq ; ; AVX2-LABEL: vec384_i64_widen_to_i128_factor2_broadcast_to_v3i128_factor3: ; AVX2: # %bb.0: ; AVX2-NEXT: vmovdqa (%rdi), %ymm0 -; AVX2-NEXT: vmovq {{.*#+}} xmm1 = xmm0[0],zero +; AVX2-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1],mem[2,3],ymm0[4,5],mem[6,7] ; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,3,0,3] ; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6,7] -; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vpaddb (%rsi), %ymm0, %ymm0 -; AVX2-NEXT: vmovdqa %ymm0, (%rdx) +; AVX2-NEXT: vpaddb 32(%rsi), %ymm1, %ymm1 ; AVX2-NEXT: vmovdqa %ymm1, 32(%rdx) +; AVX2-NEXT: vmovdqa %ymm0, (%rdx) ; AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; From 0f588c812c2c551cde646963cd926c5047e9a534 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 16 Jan 2025 14:54:00 +0700 Subject: [PATCH 5/5] Cleanup and comment --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 21 ++++++++++--------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8cfd994a21d56..de7fb21f5903e 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23811,6 +23811,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { // value. unsigned OneConstExtractIndex = ~0u; + // Count the number of extract_vector_elt sources (i.e. non-constant or undef) unsigned NumExtracts = 0; for (unsigned i = 0; i != NumElems; ++i) { @@ -23849,9 +23850,7 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { ExtractedFromVec.getValueType().getVectorElementType()) return SDValue(); - if (OneConstExtractIndex == ~0u) - OneConstExtractIndex = ExtractIdx->getZExtValue(); - + OneConstExtractIndex = ExtractIdx->getZExtValue(); ++NumExtracts; // Have we seen this input vector before? @@ -23878,14 +23877,16 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) { if (VecIn.size() == 2) { // If we only found a single constant indexed extract_vector_elt feeding the // build_vector, do not produce a more complicated shuffle if the extract is - // cheap. - - // TODO: This should be more aggressive about skipping the shuffle formation - // (e.g., always do this for VecIn[1]->hasOneUse()) - if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && + // cheap with other constant/undef elements. Skip broadcast patterns with + // multiple uses in the build_vector. 
+ + // TODO: This should be more aggressive about skipping the shuffle + // formation, particularly if VecIn[1].hasOneUse(), and regardless of the + // index. + if (NumExtracts == 1 && + TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, VT) && TLI.isTypeLegal(VT.getVectorElementType()) && - // VecIn[1].hasOneUse() && - NumExtracts == 1 && TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) + TLI.isExtractVecEltCheap(VT, OneConstExtractIndex)) return SDValue(); unsigned MaxIndex = 0;
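[Editor's illustrative note, not part of the patch] For context on the `NumExtracts == 1` guard above: the pattern it targets is a build_vector whose only non-undef lane comes from a single constant-index extract_vector_elt. A minimal IR sketch is below; the function name and the <4 x float> type are made up for illustration and do not come from the patch's test files. Whether the combine actually bails depends on the target hooks in the condition (isOperationLegalOrCustom, isTypeLegal, isExtractVecEltCheap).

; Illustrative sketch only, not taken from the patch. After legalization this
; becomes a build_vector (extract_vector_elt %v, 2), undef, undef, undef.
; With the NumExtracts == 1 check, reduceBuildVecToShuffle returns SDValue()
; on targets that report the extract as cheap, so the node is kept as an
; extract feeding a build_vector instead of being widened into a
; single-source shuffle.
define <4 x float> @single_extract_build_vector(<4 x float> %v) {
  %e = extractelement <4 x float> %v, i32 2
  %b = insertelement <4 x float> poison, float %e, i32 0
  ret <4 x float> %b
}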