diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 564cc372f595c..d6e288a59b2ee 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -23018,18 +23018,33 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) { return NewShuffle; } - // If all insertions are zero value, try to convert to AND mask. - // TODO: Do this for -1 with OR mask? - if (!LegalOperations && llvm::isNullConstant(InVal) && - all_of(Ops, [InVal](SDValue Op) { return !Op || Op == InVal; }) && - count_if(Ops, [InVal](SDValue Op) { return Op == InVal; }) >= 2) { - SDValue Zero = DAG.getConstant(0, DL, MaxEltVT); - SDValue AllOnes = DAG.getAllOnesConstant(DL, MaxEltVT); - SmallVector<SDValue, 8> Mask(NumElts); - for (unsigned I = 0; I != NumElts; ++I) - Mask[I] = Ops[I] ? Zero : AllOnes; - return DAG.getNode(ISD::AND, DL, VT, CurVec, - DAG.getBuildVector(VT, DL, Mask)); + if (!LegalOperations) { + bool IsNull = llvm::isNullConstant(InVal); + // We can convert to AND/OR mask if all insertions are zero or -1 + // respectively. + if ((IsNull || llvm::isAllOnesConstant(InVal)) && + all_of(Ops, [InVal](SDValue Op) { return !Op || Op == InVal; }) && + count_if(Ops, [InVal](SDValue Op) { return Op == InVal; }) >= 2) { + SDValue Zero = DAG.getConstant(0, DL, MaxEltVT); + SDValue AllOnes = DAG.getAllOnesConstant(DL, MaxEltVT); + SmallVector<SDValue, 8> Mask(NumElts); + + // Build the mask and return the corresponding DAG node. + auto BuildMaskAndNode = [&](SDValue TrueVal, SDValue FalseVal, + unsigned MaskOpcode) { + for (unsigned I = 0; I != NumElts; ++I) + Mask[I] = Ops[I] ? TrueVal : FalseVal; + return DAG.getNode(MaskOpcode, DL, VT, CurVec, + DAG.getBuildVector(VT, DL, Mask)); + }; + + // If all elements are zero, we can use AND with all ones. + if (IsNull) + return BuildMaskAndNode(Zero, AllOnes, ISD::AND); + + // If all elements are -1, we can use OR with zero. 
+ return BuildMaskAndNode(AllOnes, Zero, ISD::OR); + } } // Failed to find a match in the chain - bail. diff --git a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll index 7fa416e0dbcd5..d2f16721e6e47 100644 --- a/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll +++ b/llvm/test/CodeGen/AArch64/vecreduce-and-legalization.ll @@ -101,19 +101,13 @@ define i8 @test_v3i8(<3 x i8> %a) nounwind { define i8 @test_v9i8(<9 x i8> %a) nounwind { ; CHECK-LABEL: test_v9i8: ; CHECK: // %bb.0: -; CHECK-NEXT: mov v1.16b, v0.16b -; CHECK-NEXT: mov w8, #-1 // =0xffffffff -; CHECK-NEXT: mov v1.b[9], w8 -; CHECK-NEXT: mov v1.b[10], w8 -; CHECK-NEXT: mov v1.b[11], w8 -; CHECK-NEXT: mov v1.b[12], w8 -; CHECK-NEXT: mov v1.b[13], w8 -; CHECK-NEXT: mov v1.b[14], w8 -; CHECK-NEXT: mov v1.b[15], w8 +; CHECK-NEXT: movi v1.2d, #0xffffff00ffffff00 +; CHECK-NEXT: fmov x8, d0 +; CHECK-NEXT: orr v1.16b, v0.16b, v1.16b ; CHECK-NEXT: ext v1.16b, v1.16b, v1.16b, #8 ; CHECK-NEXT: and v0.8b, v0.8b, v1.8b -; CHECK-NEXT: fmov x8, d0 -; CHECK-NEXT: and x8, x8, x8, lsr #32 +; CHECK-NEXT: fmov x9, d0 +; CHECK-NEXT: and x8, x9, x8, lsr #32 ; CHECK-NEXT: and x8, x8, x8, lsr #16 ; CHECK-NEXT: lsr x9, x8, #8 ; CHECK-NEXT: and w0, w8, w9 diff --git a/llvm/test/CodeGen/X86/avx-cvt-3.ll b/llvm/test/CodeGen/X86/avx-cvt-3.ll index 87eabd9cb5521..760db4af1f1b4 100644 --- a/llvm/test/CodeGen/X86/avx-cvt-3.ll +++ b/llvm/test/CodeGen/X86/avx-cvt-3.ll @@ -48,17 +48,13 @@ define <8 x float> @sitofp_shuffle_zero_v8i32(<8 x i32> %a0) { define <8 x float> @sitofp_insert_allbits_v8i32(<8 x i32> %a0) { ; X86-LABEL: sitofp_insert_allbits_v8i32: ; X86: # %bb.0: -; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X86-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; X86-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] +; X86-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: 
sitofp_insert_allbits_v8i32: ; X64: # %bb.0: -; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; X64-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; X64-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[3],ymm1[4,5],ymm0[6,7] +; X64-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq %1 = insertelement <8 x i32> %a0, i32 -1, i32 0 diff --git a/llvm/test/CodeGen/X86/insertelement-ones.ll b/llvm/test/CodeGen/X86/insertelement-ones.ll index cfcb8798e0b9c..a404bc870b4c4 100644 --- a/llvm/test/CodeGen/X86/insertelement-ones.ll +++ b/llvm/test/CodeGen/X86/insertelement-ones.ll @@ -1,8 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefix=SSE2 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefix=SSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefix=SSSE3 -; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefix=SSE41 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=SSE,SSE2 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse3 | FileCheck %s --check-prefixes=SSE,SSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSE,SSSE3 +; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,avx512bw | FileCheck %s --check-prefixes=AVX,AVX512 @@ -148,98 +148,30 @@ define <4 x i32> @insert_v4i32_01x3(<4 x i32> %a) { } define <8 x i32> @insert_v8i32_x12345x7(<8 x i32> %a) { -; SSE2-LABEL: insert_v8i32_x12345x7: -; 
SSE2: # %bb.0: -; SSE2-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE2-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE2-NEXT: movl $-1, %eax -; SSE2-NEXT: movd %eax, %xmm2 -; SSE2-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE2-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_v8i32_x12345x7: -; SSE3: # %bb.0: -; SSE3-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSE3-NEXT: movl $-1, %eax -; SSE3-NEXT: movd %eax, %xmm2 -; SSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSE3-NEXT: retq +; SSE-LABEL: insert_v8i32_x12345x7: +; SSE: # %bb.0: +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: retq ; -; SSSE3-LABEL: insert_v8i32_x12345x7: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movss {{.*#+}} xmm2 = [NaN,0.0E+0,0.0E+0,0.0E+0] -; SSSE3-NEXT: movss {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3] -; SSSE3-NEXT: movl $-1, %eax -; SSSE3-NEXT: movd %eax, %xmm2 -; SSSE3-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,0],xmm1[3,0] -; SSSE3-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,1],xmm2[0,2] -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_v8i32_x12345x7: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0,1],xmm0[2,3,4,5,6,7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3],xmm2[4,5],xmm1[6,7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: insert_v8i32_x12345x7: -; AVX1: # %bb.0: -; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1 -; AVX1-NEXT: vcmptrueps %ymm1, %ymm1, %ymm1 -; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v8i32_x12345x7: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: 
insert_v8i32_x12345x7: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1 -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0],ymm0[1,2,3,4,5],ymm1[6],ymm0[7] -; AVX512-NEXT: retq +; AVX-LABEL: insert_v8i32_x12345x7: +; AVX: # %bb.0: +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq %1 = insertelement <8 x i32> %a, i32 -1, i32 0 %2 = insertelement <8 x i32> %1, i32 -1, i32 6 ret <8 x i32> %2 } define <8 x i16> @insert_v8i16_x12345x7(<8 x i16> %a) { -; SSE2-LABEL: insert_v8i16_x12345x7: -; SSE2: # %bb.0: -; SSE2-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_v8i16_x12345x7: -; SSE3: # %bb.0: -; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_v8i16_x12345x7: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_v8i16_x12345x7: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm1, %xmm1 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; SSE41-NEXT: retq +; SSE-LABEL: insert_v8i16_x12345x7: +; SSE: # %bb.0: +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: insert_v8i16_x12345x7: ; AVX: # %bb.0: -; AVX-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX-NEXT: vpblendw {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = insertelement <8 x i16> %a, i16 -1, i32 0 %2 = insertelement <8 x i16> %1, i16 -1, i32 6 @@ -247,62 +179,16 @@ define <8 x i16> @insert_v8i16_x12345x7(<8 x i16> %a) { } define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { -; SSE2-LABEL: insert_v16i16_x12345x789ABCDEx: -; SSE2: # %bb.0: -; SSE2-NEXT: movl 
$65535, %eax # imm = 0xFFFF -; SSE2-NEXT: pinsrw $0, %eax, %xmm0 -; SSE2-NEXT: pinsrw $6, %eax, %xmm0 -; SSE2-NEXT: pinsrw $7, %eax, %xmm1 -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_v16i16_x12345x789ABCDEx: -; SSE3: # %bb.0: -; SSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSE3-NEXT: pinsrw $7, %eax, %xmm1 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_v16i16_x12345x789ABCDEx: -; SSSE3: # %bb.0: -; SSSE3-NEXT: movl $65535, %eax # imm = 0xFFFF -; SSSE3-NEXT: pinsrw $0, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $6, %eax, %xmm0 -; SSSE3-NEXT: pinsrw $7, %eax, %xmm1 -; SSSE3-NEXT: retq +; SSE-LABEL: insert_v16i16_x12345x789ABCDEx: +; SSE: # %bb.0: +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: retq ; -; SSE41-LABEL: insert_v16i16_x12345x789ABCDEx: -; SSE41: # %bb.0: -; SSE41-NEXT: pcmpeqd %xmm2, %xmm2 -; SSE41-NEXT: pblendw {{.*#+}} xmm0 = xmm2[0],xmm0[1,2,3,4,5],xmm2[6],xmm0[7] -; SSE41-NEXT: pblendw {{.*#+}} xmm1 = xmm1[0,1,2,3,4,5,6],xmm2[7] -; SSE41-NEXT: retq -; -; AVX1-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [65535,0,0,0] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX2-NEXT: vpblendw {{.*#+}} xmm1 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX2-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] -; AVX2-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX2-NEXT: retq -; -; AVX512-LABEL: insert_v16i16_x12345x789ABCDEx: -; AVX512: # %bb.0: -; AVX512-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1 -; AVX512-NEXT: vpblendw {{.*#+}} 
xmm1 = xmm1[0],xmm0[1,2,3,4,5],xmm1[6],xmm0[7] -; AVX512-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2 -; AVX512-NEXT: vpblendw {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7],ymm0[8,9,10,11,12,13,14],ymm2[15] -; AVX512-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1,2,3],ymm0[4,5,6,7] -; AVX512-NEXT: retq +; AVX-LABEL: insert_v16i16_x12345x789ABCDEx: +; AVX: # %bb.0: +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq %1 = insertelement <16 x i16> %a, i16 -1, i32 0 %2 = insertelement <16 x i16> %1, i16 -1, i32 6 %3 = insertelement <16 x i16> %2, i16 -1, i32 15 @@ -310,36 +196,14 @@ define <16 x i16> @insert_v16i16_x12345x789ABCDEx(<16 x i16> %a) { } define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { -; SSE2-LABEL: insert_v16i8_x123456789ABCDEx: -; SSE2: # %bb.0: -; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_v16i8_x123456789ABCDEx: -; SSE3: # %bb.0: -; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_v16i8_x123456789ABCDEx: -; SSSE3: # %bb.0: -; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_v16i8_x123456789ABCDEx: -; SSE41: # %bb.0: -; SSE41-NEXT: movl $255, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: retq +; SSE-LABEL: insert_v16i8_x123456789ABCDEx: +; SSE: # %bb.0: +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: retq ; ; AVX-LABEL: insert_v16i8_x123456789ABCDEx: ; AVX: # %bb.0: -; AVX-NEXT: movl $255, %eax -; AVX-NEXT: vpinsrb $0, %eax, %xmm0, %xmm0 -; AVX-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 ; AVX-NEXT: retq %1 = insertelement <16 x i8> %a, i8 -1, i32 0 %2 = insertelement <16 x 
i8> %1, i8 -1, i32 15 @@ -347,72 +211,16 @@ define <16 x i8> @insert_v16i8_x123456789ABCDEx(<16 x i8> %a) { } define <32 x i8> @insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx(<32 x i8> %a) { -; SSE2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; SSE2: # %bb.0: -; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE2-NEXT: movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255] -; SSE2-NEXT: orps %xmm2, %xmm0 -; SSE2-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE2-NEXT: orps %xmm2, %xmm1 -; SSE2-NEXT: retq -; -; SSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; SSE3: # %bb.0: -; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSE3-NEXT: movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255] -; SSE3-NEXT: orps %xmm2, %xmm0 -; SSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSE3-NEXT: orps %xmm2, %xmm1 -; SSE3-NEXT: retq -; -; SSSE3-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; SSSE3: # %bb.0: -; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 -; SSSE3-NEXT: movaps {{.*#+}} xmm2 = [0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,255] -; SSSE3-NEXT: orps %xmm2, %xmm0 -; SSSE3-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 -; SSSE3-NEXT: orps %xmm2, %xmm1 -; SSSE3-NEXT: retq -; -; SSE41-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; SSE41: # %bb.0: -; SSE41-NEXT: movl $255, %eax -; SSE41-NEXT: pinsrb $0, %eax, %xmm0 -; SSE41-NEXT: pinsrb $15, %eax, %xmm0 -; SSE41-NEXT: pinsrb $14, %eax, %xmm1 -; SSE41-NEXT: pinsrb $15, %eax, %xmm1 -; SSE41-NEXT: retq -; -; AVX1-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; AVX1: # %bb.0: -; AVX1-NEXT: vmovss {{.*#+}} xmm1 = [255,0,0,0] -; AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 -; AVX1-NEXT: retq -; -; AVX2-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; 
AVX2: # %bb.0: -; AVX2-NEXT: movl $255, %eax -; AVX2-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX2-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX2-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX2-NEXT: retq +; SSE-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: +; SSE: # %bb.0: +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0 +; SSE-NEXT: orps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1 +; SSE-NEXT: retq ; -; AVX512-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: -; AVX512: # %bb.0: -; AVX512-NEXT: movl $255, %eax -; AVX512-NEXT: vpinsrb $0, %eax, %xmm0, %xmm1 -; AVX512-NEXT: vpinsrb $15, %eax, %xmm1, %xmm1 -; AVX512-NEXT: vextracti128 $1, %ymm0, %xmm0 -; AVX512-NEXT: vpinsrb $14, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vpinsrb $15, %eax, %xmm0, %xmm0 -; AVX512-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 -; AVX512-NEXT: retq +; AVX-LABEL: insert_v32i8_x123456789ABCDEzGHIJKLMNOPQRSTxx: +; AVX: # %bb.0: +; AVX-NEXT: vorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; AVX-NEXT: retq %1 = insertelement <32 x i8> %a, i8 -1, i32 0 %2 = insertelement <32 x i8> %1, i8 -1, i32 15 %3 = insertelement <32 x i8> %2, i8 -1, i32 30