From 1548a2db92715e9be1d8f4b05b47914e00292435 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev@redking.me.uk>
Date: Mon, 10 Mar 2025 09:08:52 +0000
Subject: [PATCH] [X86] combineConcatVectorOps - convert ISD::VECTOR_SHUFFLE
 concatenation to use combineConcatVectorOps recursion

Only concatenate ISD::VECTOR_SHUFFLE nodes if at least one operand is
beneficial to concatenate.
---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  38 +--
 llvm/test/CodeGen/X86/gfni-rotates.ll         |   6 +-
 llvm/test/CodeGen/X86/known-bits-vector.ll    |   8 +-
 llvm/test/CodeGen/X86/matrix-multiply.ll      |  88 ++++--
 llvm/test/CodeGen/X86/mulvi32.ll              |   8 +-
 llvm/test/CodeGen/X86/vector-fshr-rot-128.ll  |   6 +-
 .../vector-interleaved-store-i8-stride-3.ll   |  33 ++-
 .../test/CodeGen/X86/vector-shuffle-256-v4.ll |  19 +-
 llvm/test/CodeGen/X86/widen_bitcnt.ll         | 256 ++++++++----------
 .../CodeGen/X86/x86-interleaved-access.ll     |  56 ++--
 10 files changed, 267 insertions(+), 251 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 5e98ef70c578f..5e86b5ad28e6d 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -57971,24 +57971,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
       // TODO: Relax VBMI requirement for repeated shuffle ops - currently
       // limited to targets that should always have good cross lane shuffles.
       if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
-          (EltSizeInBits >= 32 || Subtarget.hasInt256()) &&
-          (IsConcatFree(VT, Ops, 0) || IsConcatFree(VT, Ops, 1) ||
-           (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
-            Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
-            Subtarget.hasVBMI()))) {
-        int NumSubElts = Op0.getValueType().getVectorNumElements();
-        SmallVector<int, 64> NewMask;
-        for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
-          M = M >= NumSubElts ? M + NumSubElts : M;
-          NewMask.push_back(M);
-        }
-        for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
-          if (0 <= M)
-            M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
-          NewMask.push_back(M);
+          (EltSizeInBits >= 32 || Subtarget.hasInt256())) {
+        SDValue Concat0 = CombineSubOperand(VT, Ops, 0);
+        SDValue Concat1 = CombineSubOperand(VT, Ops, 1);
+        if (Concat0 || Concat1 ||
+            (Ops[0].getOperand(0) == Ops[1].getOperand(0) &&
+             Ops[0].getOperand(1) == Ops[1].getOperand(1) &&
+             Subtarget.hasVBMI())) {
+          int NumSubElts = Op0.getValueType().getVectorNumElements();
+          SmallVector<int, 64> NewMask;
+          for (int M : cast<ShuffleVectorSDNode>(Ops[0])->getMask()) {
+            M = M >= NumSubElts ? M + NumSubElts : M;
+            NewMask.push_back(M);
+          }
+          for (int M : cast<ShuffleVectorSDNode>(Ops[1])->getMask()) {
+            if (0 <= M)
+              M = (M >= NumSubElts ? M + NumSubElts : M) + NumSubElts;
+            NewMask.push_back(M);
+          }
+          Concat0 = Concat0 ? Concat0 : ConcatSubOperand(VT, Ops, 0);
+          Concat1 = Concat1 ? 
Concat1 : ConcatSubOperand(VT, Ops, 1); + return DAG.getVectorShuffle(VT, DL, Concat0, Concat1, NewMask); } - return DAG.getVectorShuffle(VT, DL, ConcatSubOperand(VT, Ops, 0), - ConcatSubOperand(VT, Ops, 1), NewMask); } break; } diff --git a/llvm/test/CodeGen/X86/gfni-rotates.ll b/llvm/test/CodeGen/X86/gfni-rotates.ll index 5fd4dfa7cc262..967f26f70946a 100644 --- a/llvm/test/CodeGen/X86/gfni-rotates.ll +++ b/llvm/test/CodeGen/X86/gfni-rotates.ll @@ -255,9 +255,9 @@ define <16 x i8> @splatvar_rotr_v16i8(<16 x i8> %a, <16 x i8> %amt) nounwind { ; ; GFNIAVX512BW-LABEL: splatvar_rotr_v16i8: ; GFNIAVX512BW: # %bb.0: -; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; GFNIAVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; GFNIAVX512BW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; GFNIAVX512BW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; GFNIAVX512BW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; GFNIAVX512BW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; GFNIAVX512BW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; GFNIAVX512BW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; GFNIAVX512BW-NEXT: vpmovwb %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/known-bits-vector.ll b/llvm/test/CodeGen/X86/known-bits-vector.ll index dbf3d6635fb92..261908fafc06e 100644 --- a/llvm/test/CodeGen/X86/known-bits-vector.ll +++ b/llvm/test/CodeGen/X86/known-bits-vector.ll @@ -384,19 +384,19 @@ declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>) define <8 x float> @knownbits_mask_concat_uitofp(<4 x i32> %a0, <4 x i32> %a1) nounwind { ; X86-LABEL: knownbits_mask_concat_uitofp: ; X86: # %bb.0: -; X86-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] -; X86-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] +; X86-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0 +; X86-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7] ; X86-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X86-NEXT: retl ; ; X64-LABEL: knownbits_mask_concat_uitofp: ; X64: # %bb.0: -; X64-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,0,2] -; X64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[1,3,1,3] +; X64-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 ; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 ; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0 +; X64-NEXT: vpermilps {{.*#+}} ymm0 = ymm0[0,2,0,2,5,7,5,7] ; X64-NEXT: vcvtdq2ps %ymm0, %ymm0 ; X64-NEXT: retq %1 = and <4 x i32> %a0, diff --git a/llvm/test/CodeGen/X86/matrix-multiply.ll b/llvm/test/CodeGen/X86/matrix-multiply.ll index 1ee03c5f1223f..d723ec849f328 100644 --- a/llvm/test/CodeGen/X86/matrix-multiply.ll +++ b/llvm/test/CodeGen/X86/matrix-multiply.ll @@ -974,35 +974,65 @@ define <16 x float> @test_mul4x4_f32(<16 x float> %a0, <16 x float> %a1) nounwin ; SSE-NEXT: movaps %xmm5, %xmm2 ; SSE-NEXT: retq ; -; AVX1OR2-LABEL: test_mul4x4_f32: -; AVX1OR2: # %bb.0: # %entry -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3] -; AVX1OR2-NEXT: vmulps %ymm4, %ymm5, %ymm4 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm6 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm2[0,0,0,0,4,4,4,4] -; AVX1OR2-NEXT: vmulps %ymm0, %ymm6, %ymm0 -; AVX1OR2-NEXT: vaddps %ymm4, %ymm0, %ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm4 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm7 = ymm2[2,2,2,2,6,6,6,6] -; AVX1OR2-NEXT: vmulps %ymm7, 
%ymm4, %ymm7 -; AVX1OR2-NEXT: vaddps %ymm7, %ymm0, %ymm0 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] -; AVX1OR2-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] -; AVX1OR2-NEXT: vmulps %ymm2, %ymm1, %ymm2 -; AVX1OR2-NEXT: vaddps %ymm2, %ymm0, %ymm0 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] -; AVX1OR2-NEXT: vmulps %ymm2, %ymm5, %ymm2 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[0,0,0,0,4,4,4,4] -; AVX1OR2-NEXT: vmulps %ymm5, %ymm6, %ymm5 -; AVX1OR2-NEXT: vaddps %ymm2, %ymm5, %ymm2 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm5 = ymm3[2,2,2,2,6,6,6,6] -; AVX1OR2-NEXT: vmulps %ymm5, %ymm4, %ymm4 -; AVX1OR2-NEXT: vaddps %ymm4, %ymm2, %ymm2 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] -; AVX1OR2-NEXT: vmulps %ymm3, %ymm1, %ymm1 -; AVX1OR2-NEXT: vaddps %ymm1, %ymm2, %ymm1 -; AVX1OR2-NEXT: retq +; AVX1-LABEL: test_mul4x4_f32: +; AVX1: # %bb.0: # %entry +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm5 = ymm0[2,3,2,3] +; AVX1-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; AVX1-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm7 +; AVX1-NEXT: vmulps %ymm6, %ymm7, %ymm0 +; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm1, %ymm6 +; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vperm2f128 {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX1-NEXT: vmulps %ymm2, %ymm1, %ymm2 +; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX1-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX1-NEXT: vmulps %ymm2, %ymm5, %ymm2 +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX1-NEXT: vmulps %ymm4, %ymm7, %ymm4 +; AVX1-NEXT: vaddps %ymm2, %ymm4, %ymm2 +; AVX1-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] +; AVX1-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX1-NEXT: vaddps %ymm4, %ymm2, %ymm2 +; AVX1-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX1-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; AVX1-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX1-NEXT: retq +; +; AVX2-LABEL: test_mul4x4_f32: +; AVX2: # %bb.0: # %entry +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vpermpd {{.*#+}} ymm5 = ymm0[2,3,2,3] +; AVX2-NEXT: vmulps %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vshufps {{.*#+}} ymm6 = ymm2[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vpermpd {{.*#+}} ymm7 = ymm0[0,1,0,1] +; AVX2-NEXT: vmulps %ymm6, %ymm7, %ymm0 +; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm2[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vpermpd {{.*#+}} ymm6 = ymm1[0,1,0,1] +; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vaddps %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm2[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vpermpd {{.*#+}} ymm1 = ymm1[2,3,2,3] +; AVX2-NEXT: vmulps %ymm2, %ymm1, %ymm2 +; AVX2-NEXT: vaddps %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm2 = ymm3[1,1,1,1,5,5,5,5] +; AVX2-NEXT: vmulps %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[0,0,0,0,4,4,4,4] +; AVX2-NEXT: vmulps %ymm4, %ymm7, %ymm4 +; AVX2-NEXT: vaddps %ymm2, %ymm4, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm4 = ymm3[2,2,2,2,6,6,6,6] +; AVX2-NEXT: vmulps %ymm4, %ymm6, %ymm4 +; AVX2-NEXT: vaddps %ymm4, %ymm2, %ymm2 +; AVX2-NEXT: vshufps {{.*#+}} ymm3 = ymm3[3,3,3,3,7,7,7,7] +; AVX2-NEXT: vmulps %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vaddps %ymm1, %ymm2, %ymm1 +; AVX2-NEXT: retq ; ; AVX512-LABEL: 
test_mul4x4_f32: ; AVX512: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/mulvi32.ll b/llvm/test/CodeGen/X86/mulvi32.ll index 46d5da54a7482..bbda4d68bb685 100644 --- a/llvm/test/CodeGen/X86/mulvi32.ll +++ b/llvm/test/CodeGen/X86/mulvi32.ll @@ -286,12 +286,8 @@ define <4 x i64> @_mul4xi32toi64c(<4 x i32>, <4 x i32>) { ; ; AVX2-LABEL: _mul4xi32toi64c: ; AVX2: # %bb.0: -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero -; AVX2-NEXT: vpmovzxdq {{.*#+}} xmm3 = xmm1[0],zero,xmm1[1],zero -; AVX2-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[2,2,3,3] -; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,2,3,3] -; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm3, %ymm1 -; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero +; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero ; AVX2-NEXT: vpmuludq %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq %lower0 = shufflevector <4 x i32> %0, <4 x i32> undef, <4 x i32> diff --git a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll index f45405d885377..9ce682306f18b 100644 --- a/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll +++ b/llvm/test/CodeGen/X86/vector-fshr-rot-128.ll @@ -1194,9 +1194,9 @@ define <16 x i8> @splatvar_funnnel_v16i8(<16 x i8> %x, <16 x i8> %amt) nounwind ; ; AVX512VLBW-LABEL: splatvar_funnnel_v16i8: ; AVX512VLBW: # %bb.0: -; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7] -; AVX512VLBW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15] -; AVX512VLBW-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm0 +; AVX512VLBW-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VLBW-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3] +; AVX512VLBW-NEXT: vpunpcklbw {{.*#+}} ymm0 = ymm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7,16,16,17,17,18,18,19,19,20,20,21,21,22,22,23,23] ; AVX512VLBW-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1 ; AVX512VLBW-NEXT: vpsrlw %xmm1, %ymm0, %ymm0 ; AVX512VLBW-NEXT: vpmovwb %ymm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll index ba1621c67f480..7f2210742e7f1 100644 --- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll +++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-3.ll @@ -671,15 +671,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX2-NEXT: vmovdqa %xmm1, (%rcx) +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX2-NEXT: vmovdqa %ymm0, (%rcx) +; 
AVX2-NEXT: vzeroupper ; AVX2-NEXT: retq ; ; AVX2-FP-LABEL: store_i8_stride3_vf16: @@ -693,15 +694,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-FP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-FP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FP-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX2-FP-NEXT: vmovdqa %xmm1, (%rcx) +; AVX2-FP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FP-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX2-FP-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FP-NEXT: vzeroupper ; AVX2-FP-NEXT: retq ; ; AVX2-FCP-LABEL: store_i8_stride3_vf16: @@ -715,15 +717,16 @@ define void @store_i8_stride3_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX2-FCP-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm1, %xmm1 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm0, %xmm0 ; AVX2-FCP-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX2-FCP-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-FCP-NEXT: # ymm3 = mem[0,1,0,1] ; AVX2-FCP-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX2-FCP-NEXT: vmovdqa %xmm0, 16(%rcx) -; AVX2-FCP-NEXT: vmovdqa %xmm1, (%rcx) +; AVX2-FCP-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-FCP-NEXT: vpshufb %ymm3, %ymm0, %ymm0 ; AVX2-FCP-NEXT: vmovdqa %xmm2, 32(%rcx) +; AVX2-FCP-NEXT: vmovdqa %ymm0, (%rcx) +; AVX2-FCP-NEXT: vzeroupper ; AVX2-FCP-NEXT: retq ; ; AVX512-LABEL: store_i8_stride3_vf16: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll index d6208aca3b2b7..fb8618be17f06 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-256-v4.ll @@ -1682,12 +1682,19 @@ define <4 x i64> @shuffle_v4i64_0044_v2i64(<2 x i64> %a, <2 x i64> %b) { } define <4 x i64> @shuffle_v4i64_1032_v2i64(<2 x i64> %a, <2 x i64> %b) { -; AVX1OR2-LABEL: shuffle_v4i64_1032_v2i64: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 -; AVX1OR2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 -; AVX1OR2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] -; AVX1OR2-NEXT: retq +; AVX1-LABEL: shuffle_v4i64_1032_v2i64: +; AVX1: # %bb.0: +; AVX1-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX1-NEXT: vshufpd 
{{.*#+}} ymm0 = ymm0[1,0,3,2] +; AVX1-NEXT: retq +; +; AVX2-LABEL: shuffle_v4i64_1032_v2i64: +; AVX2: # %bb.0: +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0 +; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[2,3,0,1,6,7,4,5] +; AVX2-NEXT: retq ; ; AVX512VL-SLOW-LABEL: shuffle_v4i64_1032_v2i64: ; AVX512VL-SLOW: # %bb.0: diff --git a/llvm/test/CodeGen/X86/widen_bitcnt.ll b/llvm/test/CodeGen/X86/widen_bitcnt.ll index cca9d4aa2a9f0..56001468898e4 100644 --- a/llvm/test/CodeGen/X86/widen_bitcnt.ll +++ b/llvm/test/CodeGen/X86/widen_bitcnt.ll @@ -241,77 +241,59 @@ define <8 x i32> @widen_ctpop_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x i32 ; ; AVX2-LABEL: widen_ctpop_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX2-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm7 -; AVX2-NEXT: vpand %xmm5, %xmm7, %xmm7 -; AVX2-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX2-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm8 -; AVX2-NEXT: vpand %xmm5, %xmm8, %xmm8 -; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpaddb %xmm0, %xmm4, %xmm0 -; AVX2-NEXT: vpsrlw $4, %xmm3, %xmm2 -; AVX2-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0] -; AVX2-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX2-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm5 = mem[0,1,0,1] +; AVX2-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX2-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX2-NEXT: vpsadbw %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} 
ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] +; AVX2-NEXT: vpsadbw %ymm4, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; ; AVX512VL-LABEL: widen_ctpop_v2i32_v8i32: ; AVX512VL: # %bb.0: -; AVX512VL-NEXT: vpsrlw $4, %xmm0, %xmm4 -; AVX512VL-NEXT: vpbroadcastb {{.*#+}} xmm5 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX512VL-NEXT: vpand %xmm5, %xmm4, %xmm4 -; AVX512VL-NEXT: vmovdqa {{.*#+}} xmm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX512VL-NEXT: vpshufb %xmm4, %xmm6, %xmm4 -; AVX512VL-NEXT: vpand %xmm5, %xmm0, %xmm0 -; AVX512VL-NEXT: vpshufb %xmm0, %xmm6, %xmm0 -; AVX512VL-NEXT: vpsrlw $4, %xmm1, %xmm7 -; AVX512VL-NEXT: vpand %xmm5, %xmm7, %xmm7 -; AVX512VL-NEXT: vpshufb %xmm7, %xmm6, %xmm7 -; AVX512VL-NEXT: vpand %xmm5, %xmm1, %xmm1 -; AVX512VL-NEXT: vpshufb %xmm1, %xmm6, %xmm1 -; AVX512VL-NEXT: vpsrlw $4, %xmm2, %xmm8 -; AVX512VL-NEXT: vpand %xmm5, %xmm8, %xmm8 -; AVX512VL-NEXT: vpshufb %xmm8, %xmm6, %xmm8 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm8[0] -; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX512VL-NEXT: vpaddb %xmm0, %xmm4, %xmm0 -; AVX512VL-NEXT: vpsrlw $4, %xmm3, %xmm2 -; AVX512VL-NEXT: vpand %xmm5, %xmm2, %xmm2 -; AVX512VL-NEXT: vpshufb %xmm2, %xmm6, %xmm2 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm7[0],xmm2[0] -; AVX512VL-NEXT: vpand %xmm5, %xmm3, %xmm3 -; AVX512VL-NEXT: vpshufb %xmm3, %xmm6, %xmm3 -; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm3[0] -; AVX512VL-NEXT: vpaddb %xmm1, %xmm2, %xmm1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX512VL-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX512VL-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 -; AVX512VL-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX512VL-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX512VL-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpbroadcastb {{.*#+}} ymm3 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm4 +; AVX512VL-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX512VL-NEXT: # ymm5 = mem[0,1,0,1] +; AVX512VL-NEXT: vpshufb %ymm4, %ymm5, %ymm4 +; AVX512VL-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpand %ymm3, %ymm1, %ymm1 +; AVX512VL-NEXT: vpshufb %ymm1, %ymm5, %ymm1 +; AVX512VL-NEXT: vpaddb %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vpxor %xmm4, %xmm4, %xmm4 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm4[0],ymm1[1],ymm4[1],ymm1[4],ymm4[4],ymm1[5],ymm4[5] +; AVX512VL-NEXT: vpsadbw %ymm4, %ymm1, %ymm1 +; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm2 +; AVX512VL-NEXT: vpshufb %ymm2, %ymm5, %ymm2 +; AVX512VL-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX512VL-NEXT: vpand %ymm3, %ymm0, %ymm0 +; AVX512VL-NEXT: vpshufb %ymm0, %ymm5, %ymm0 +; AVX512VL-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX512VL-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm4[0],ymm0[1],ymm4[1],ymm0[4],ymm4[4],ymm0[5],ymm4[5] +; AVX512VL-NEXT: vpsadbw %ymm4, %ymm0, %ymm0 ; AVX512VL-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -1297,48 +1279,35 @@ define <8 x i32> @widen_cttz_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 x 
i32> ; ; AVX2-LABEL: widen_cttz_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm5 -; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm8 -; AVX2-NEXT: vpandn %xmm8, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm8 -; AVX2-NEXT: vpand %xmm6, %xmm8, %xmm8 -; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm8 -; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm9 -; AVX2-NEXT: vpandn %xmm9, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm9 -; AVX2-NEXT: vpand %xmm6, %xmm9, %xmm9 -; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm9 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] -; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm2 -; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm8[0],xmm3[0] -; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpandn %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-NEXT: vpsadbw %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm2 +; AVX2-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpaddb %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-NEXT: vpsadbw %ymm5, %ymm0, %ymm0 ; 
AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; @@ -1640,48 +1609,35 @@ define <8 x i32> @widen_cttz_undef_v2i32_v8i32(<2 x i32> %a0, <2 x i32> %a1, <2 ; ; AVX2-LABEL: widen_cttz_undef_v2i32_v8i32: ; AVX2: # %bb.0: -; AVX2-NEXT: vpcmpeqd %xmm4, %xmm4, %xmm4 -; AVX2-NEXT: vpaddd %xmm4, %xmm0, %xmm5 -; AVX2-NEXT: vpandn %xmm5, %xmm0, %xmm0 -; AVX2-NEXT: vpsrlw $4, %xmm0, %xmm5 -; AVX2-NEXT: vpbroadcastb {{.*#+}} xmm6 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] -; AVX2-NEXT: vpand %xmm6, %xmm5, %xmm5 -; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] -; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm5 -; AVX2-NEXT: vpand %xmm6, %xmm0, %xmm0 -; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0 -; AVX2-NEXT: vpaddd %xmm4, %xmm1, %xmm8 -; AVX2-NEXT: vpandn %xmm8, %xmm1, %xmm1 -; AVX2-NEXT: vpsrlw $4, %xmm1, %xmm8 -; AVX2-NEXT: vpand %xmm6, %xmm8, %xmm8 -; AVX2-NEXT: vpshufb %xmm8, %xmm7, %xmm8 -; AVX2-NEXT: vpand %xmm6, %xmm1, %xmm1 -; AVX2-NEXT: vpshufb %xmm1, %xmm7, %xmm1 -; AVX2-NEXT: vpaddd %xmm4, %xmm2, %xmm9 -; AVX2-NEXT: vpandn %xmm9, %xmm2, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm9 -; AVX2-NEXT: vpand %xmm6, %xmm9, %xmm9 -; AVX2-NEXT: vpshufb %xmm9, %xmm7, %xmm9 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm9[0] -; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0] -; AVX2-NEXT: vpaddb %xmm0, %xmm5, %xmm0 -; AVX2-NEXT: vpaddd %xmm4, %xmm3, %xmm2 -; AVX2-NEXT: vpandn %xmm2, %xmm3, %xmm2 -; AVX2-NEXT: vpsrlw $4, %xmm2, %xmm3 -; AVX2-NEXT: vpand %xmm6, %xmm3, %xmm3 -; AVX2-NEXT: vpshufb %xmm3, %xmm7, %xmm3 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm8[0],xmm3[0] -; AVX2-NEXT: vpand %xmm6, %xmm2, %xmm2 -; AVX2-NEXT: vpshufb %xmm2, %xmm7, %xmm2 -; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0] -; AVX2-NEXT: vpaddb %xmm1, %xmm3, %xmm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm1 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero -; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2 -; AVX2-NEXT: vpsadbw %ymm2, %ymm1, %ymm1 -; AVX2-NEXT: vpmovzxdq {{.*#+}} ymm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero -; AVX2-NEXT: vpsadbw %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: # kill: def $xmm1 killed $xmm1 def $ymm1 +; AVX2-NEXT: # kill: def $xmm0 killed $xmm0 def $ymm0 +; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm1, %ymm1 +; AVX2-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3 +; AVX2-NEXT: vpaddd %ymm3, %ymm1, %ymm4 +; AVX2-NEXT: vpandn %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpbroadcastb {{.*#+}} ymm4 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15] +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm5 +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4,0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4] +; AVX2-NEXT: # ymm6 = mem[0,1,0,1] +; AVX2-NEXT: vpshufb %ymm5, %ymm6, %ymm5 +; AVX2-NEXT: vpsrlw $4, %ymm1, %ymm1 +; AVX2-NEXT: vpand %ymm4, %ymm1, %ymm1 +; AVX2-NEXT: vpshufb %ymm1, %ymm6, %ymm1 +; AVX2-NEXT: vpaddb %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vpxor %xmm5, %xmm5, %xmm5 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm1 = ymm1[0],ymm5[0],ymm1[1],ymm5[1],ymm1[4],ymm5[4],ymm1[5],ymm5[5] +; AVX2-NEXT: vpsadbw %ymm5, %ymm1, %ymm1 +; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0 +; AVX2-NEXT: vpaddd %ymm3, %ymm0, %ymm2 +; AVX2-NEXT: vpandn %ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm2 +; AVX2-NEXT: vpshufb %ymm2, %ymm6, %ymm2 +; AVX2-NEXT: vpsrlw $4, %ymm0, %ymm0 +; AVX2-NEXT: vpand %ymm4, %ymm0, %ymm0 +; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm0 +; AVX2-NEXT: vpaddb 
%ymm2, %ymm0, %ymm0 +; AVX2-NEXT: vpunpckldq {{.*#+}} ymm0 = ymm0[0],ymm5[0],ymm0[1],ymm5[1],ymm0[4],ymm5[4],ymm0[5],ymm5[5] +; AVX2-NEXT: vpsadbw %ymm5, %ymm0, %ymm0 ; AVX2-NEXT: vpackuswb %ymm1, %ymm0, %ymm0 ; AVX2-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll index 3e76bffb77a66..edadcdadad42e 100644 --- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll +++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll @@ -915,24 +915,44 @@ ret void } define void @interleaved_store_vf16_i8_stride3(<16 x i8> %a, <16 x i8> %b, <16 x i8> %c, ptr %p) nounwind { -; AVX1OR2-LABEL: interleaved_store_vf16_i8_stride3: -; AVX1OR2: # %bb.0: -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] -; AVX1OR2-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] -; AVX1OR2-NEXT: vpshufb %xmm3, %xmm1, %xmm1 -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] -; AVX1OR2-NEXT: vpshufb %xmm3, %xmm0, %xmm0 -; AVX1OR2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] -; AVX1OR2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 -; AVX1OR2-NEXT: vmovdqu %xmm0, 16(%rdi) -; AVX1OR2-NEXT: vmovdqu %xmm1, (%rdi) -; AVX1OR2-NEXT: vmovdqu %xmm2, 32(%rdi) -; AVX1OR2-NEXT: retq +; AVX1-LABEL: interleaved_store_vf16_i8_stride3: +; AVX1: # %bb.0: +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX1-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX1-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX1-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX1-NEXT: vpshufb %xmm3, %xmm1, %xmm1 +; AVX1-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0 +; AVX1-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX1-NEXT: vmovdqu %xmm0, 16(%rdi) +; AVX1-NEXT: vmovdqu %xmm1, (%rdi) +; AVX1-NEXT: vmovdqu %xmm2, 32(%rdi) +; AVX1-NEXT: retq +; +; AVX2-LABEL: interleaved_store_vf16_i8_stride3: +; AVX2: # %bb.0: +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,5] +; AVX2-NEXT: vpalignr {{.*#+}} xmm3 = xmm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10] +; AVX2-NEXT: vpalignr {{.*#+}} xmm4 = xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = xmm3[5,6,7,8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm3[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm1 = xmm4[5,6,7,8,9,10,11,12,13,14,15],xmm1[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm0 = 
xmm0[5,6,7,8,9,10,11,12,13,14,15],xmm2[0,1,2,3,4] +; AVX2-NEXT: vpalignr {{.*#+}} xmm2 = xmm2[5,6,7,8,9,10,11,12,13,14,15],xmm4[0,1,2,3,4] +; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5,0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5] +; AVX2-NEXT: # ymm3 = mem[0,1,0,1] +; AVX2-NEXT: vpshufb %xmm3, %xmm2, %xmm2 +; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0 +; AVX2-NEXT: vpshufb %ymm3, %ymm0, %ymm0 +; AVX2-NEXT: vmovdqu %xmm2, 32(%rdi) +; AVX2-NEXT: vmovdqu %ymm0, (%rdi) +; AVX2-NEXT: vzeroupper +; AVX2-NEXT: retq ; ; AVX512-LABEL: interleaved_store_vf16_i8_stride3: ; AVX512: # %bb.0: